I need help with basic stuff again… I think the question is essentially, why can I not send and receive JSON?
ITT client-side error
192.168.2.250:54636 - "POST /itt HTTP/1.1" 422 Unprocessable Entity
ITT server-side error
{"detail":[{"loc":["body"],"msg":"value is not a valid dict","type":"type_error.dict"}]}
# image-to-text client
response = requests.post(url, json=json.dumps(mesg.dict()), stream=True, headers={"Content-Type": "application/json"})#, headers=headers)
image-to-pygame is confirmed working. just need to add the proxy in the middle now… and tti when that becomes available.
# image-to-text proxy
@app.post("/itt")
#async def itt(context:UserImageContext)->StreamingResponse:
async def itt(data:Dict)->StreamingResponse:
""" (b64-encoded) image to text """
print('itt', file=sys.stderr)
context = json.loads(data)
result :AsyncIterator[str] = _itt(client, context) # TODO await ?
return StreamingResponse(result)
the STT error
{"detail":[{"loc":["body"],"msg":"value is not a valid dict","type":"type_error.dict"}]}^CTraceback (most recent call last):
# speech-to-text client
response = requests.post('http://kali.innovanon.com:5002', json=json.dumps(audio_data.tolist()), headers={"Content-Type": "application/json"})
# speech-to-text proxy
@app.post("/stt")
async def stt(data:Dict)->Response:
print('stt', file=sys.stderr)
response = requests.post(host3 , json=data, headers={"Content-Type": "application/json"})
return Response(response.content)
# speech-to-text server
@app.post("/")
async def stt(data:Dict)->Response:
audio_np = np.array(json.loads(data))
result:Dict[str,str] = audio_model.transcribe(audio_np, fp16=torch.cuda.is_available()) # TODO async ?
text:str = result['text'].strip()
print('stt end', file=sys.stderr)
return Response(text)
I think the ones below here are working, but I’m getting an illegal instruction during encoding. Computer unfixably old probably.
update: switching to np.float32 and pumping it to the speaker process produces the expected audio-output. just gotta connect those proxies in the middle.
# text-to-speech client
response = requests.get(url, params={'prompt':mesg}) # TODO headers ?
print('sent_to_server', file=sys.stderr)
if response.status_code != 200:
print('error', response, file=sys.stderr)
return
print('ok_to_server', file=sys.stderr)
content = json.loads(response.content)
audio:ndarray = np.array(content)
# text-to-speech proxy
@app.get("/tts")
async def tts(prompt:str)->Response:
print('tts', prompt, file=sys.stderr)
response = requests.get(host2, params={'prompt':prompt}) # TODO async ? # TODO headres ?
return Response(response.content)
# text-to-speech server
@app.get("/")
async def tts(prompt:str, name:str="en_speaker_9")->Response:
print('received', file=sys.stderr)
audio_array:ndarray = generate_audio(prompt, history_prompt=name) # TODO no async ?
content = audio_array.tolist()
print('generated', file=sys.stderr)
return Response(json.dumps(audio_array))
UPDATE: minimal example above; reproducible example below
bark server: TTS
os.environ["SUNO_OFFLOAD_CPU"] = "True"
os.environ["SUNO_USE_SMALL_MODELS"] = "True"
from bark import generate_audio, preload_models
def get_app()->FastAPI:
app :FastAPI = FastAPI()
assert app is not None
@app.get("/")
async def tts(prompt:str, name:str="en_speaker_9")->Response:
print('received', file=sys.stderr)
audio_array:ndarray = generate_audio(prompt, history_prompt=name) # TODO no async ?
content = audio_array.tolist()
print('generated', file=sys.stderr)
return Response(json.dumps(audio_array))
return app
def _main()->FastAPI:
app:FastAPI = get_app()
assert app is not None
app.add_middleware(
CORSMiddleware,
allow_origins =['*'],
allow_credentials=True,
allow_methods =["*"],
allow_headers =["*"],
)
return app
def _run(app:FastAPI, host:str, port:int, preload:bool)->None:
assert app is not None
if preload: # download and load all models
preload_models()
urun(app, host=host, port=port)
def main()->None:
load_dotenv ()
host :str = getenv('HOST', 'kali.innovanon.com')
port :int = int(getenv('PORT', 5001))
prld :bool = bool(getenv('PRELOAD', 'False'))
app :FastAPI = _main()
assert app is not None
_run(app, host, port, prld)
if __name__ == '__main__':
main()
whisper server: STT
Entity
INFO: 192.168.2.250:45072 - "POST / HTTP/1.1" 422 Unprocessable Entity
INFO: 192.168.2.250:34990 - "POST / HTTP/1.1" 422 Unprocessable Entity
def get_app(audio_model:Whisper)->FastAPI:
app :FastAPI = FastAPI()
@app.post("/")
async def stt(data:Dict)->Response:
audio_np = np.array(json.loads(data))
print('stt start', file=sys.stderr)
result:Dict[str,str] = audio_model.transcribe(audio_np, fp16=torch.cuda.is_available()) # TODO async ?
text:str = result['text'].strip()
print('stt end', file=sys.stderr)
return Response(text)
return app
def _main(audio_model:Whisper)->FastAPI:
assert audio_model is not None
app:FastAPI = get_app(audio_model)
assert app is not None
app.add_middleware(
CORSMiddleware,
allow_origins =['*'],
allow_credentials=True,
allow_methods =["*"],
allow_headers =["*"],
)
return app
def _run(app:FastAPI, host:str, port:int)->None:
assert app is not None
assert host is not None
assert port is not None
urun(app, host=host, port=port)
def main()->None:
model:str = "small.en"
audio_model:Whisper = whisper.load_model(model)
load_dotenv ()
host :str = getenv('HOST', '0.0.0.0')
port :int = int(getenv('PORT', 5002))
app :FastAPI = _main(audio_model)
assert app is not None
_run(app, host, port)
if __name__ == '__main__':
main()
proxy:
INFO: 192.168.2.250:34146 - "POST /itt HTTP/1.1" 422 Unprocessable Entity
INFO: 192.168.2.250:34152 - "POST /itt HTTP/1.1" 422 Unprocessable Entity
tts testing 1 2 3
INFO: 192.168.2.250:56092 - "GET /tts?prompt=testing+1+2+3 HTTP/1.1" 500 Internal Server Error
async def _llm(client:AsyncClient, context:Context)->AsyncIterator[str]:
print('_llm', context, file=sys.stderr)
model:str = context['model']
async for part in await client.chat(model=model, messages=context.dict(), stream=True):
raw:Dict[str,str] = part['message']['content']
msg:AssistantMessage = AssistantMessage(*raw)
yield msg.content
print('_llm done', context, file=sys.stderr)
async def _itt(client:AsyncClient, context:UserImageContext)->AsyncIterator[str]:
print('_itt', context, file=sys.stderr)
model:str = context['model']
async for part in await client.chat(model=model, messages=context.dict(), stream=True):
raw:Dict[str,str] = part['message']['content']
msg:AssistantMessage = AssistantMessage(*raw)
yield msg.content
print('_itt done', context, file=sys.stderr)
def get_app(host1:str, host2:str, host3:str)->FastAPI:
app :FastAPI = FastAPI()
client:AsyncClient = AsyncClient(host=host1)
@app.get("/llm")
async def llm(context:Context)->StreamingResponse:
""" standard llm """
print('llm', context, file=sys.stderr)
result :AsyncIterator[str] = _llm(client, context) # TODO await ?
return StreamingResponse(result)
@app.post("/itt")
#async def itt(context:UserImageContext)->StreamingResponse:
async def itt(data:Dict)->StreamingResponse:
""" (b64-encoded) image to text """
print('itt', file=sys.stderr)
context = json.loads(data)
result :AsyncIterator[str] = _itt(client, context) # TODO await ?
return StreamingResponse(result)
@app.get("/tts")
async def tts(prompt:str)->Response:
print('tts', prompt, file=sys.stderr)
response = requests.get(host2, params={'prompt':prompt}) # TODO async ? # TODO headres ?
return Response(response.content)
@app.post("/stt")
async def stt(data:Dict)->Response:
print('stt', file=sys.stderr)
response = requests.post(host3 , json=data, headers={"Content-Type": "application/json"})
return Response(response.content)
return app
def _main(addr1:str, addr2:str, addr3:str)->FastAPI:
assert addr1 is not None
app:FastAPI = get_app(addr1, addr2, addr3)
assert app is not None
app.add_middleware(
CORSMiddleware,
allow_origins =['*'],
allow_credentials=True,
allow_methods =["*"],
allow_headers =["*"],
)
return app
def _run(app:FastAPI, host:str, port:int)->None:
assert app is not None
assert host is not None
assert port is not None
urun(app, host=host, port=port)
def main()->None: # TODO configargparse
load_dotenv ()
host :str = getenv('HOST', '0.0.0.0')
port :int = int(getenv('PORT', 5000))
addr1:str = getenv('ADDR', 'http://kali.innovanon.com:11434')
addr2:str = getenv('ADDR', 'http://kali.innovanon.com:5001')
addr3:str = getenv('ADDR', 'http://kali.innovanon.com:5002')
app :FastAPI = _main(addr1, addr2, addr3)
assert app is not None
_run(app, host, port)
if __name__ == '__main__':
main()
These seem to be working when connected to each other. The problem seems to be with the proxies/servers.
cam:
def main()->None:
print('eye start', file=sys.stderr)
cam :int = 0 # 0 Is the built in camera
rate:float = 60.0 # If your camera can achieve 60 fps Else just have this be 1-30 fps
cap :VideoCapture = VideoCapture(cam)
fps :float = cap.get(CAP_PROP_FPS) # Gets fps of your camera
cap.set(CAP_PROP_FPS, rate)
while True:
success :bool
frame :ndarray
success, frame = cap.read()
if not success:
error:EyeError = EyeError()#now)
print(str(error) , file=sys.stderr)
continue
frame = fliplr(frame) #for some reasons the frames appeared inverted
frame = rot90 (frame)
frame = cvtColor(frame, COLOR_BGR2RGB) # The video uses BGR colors and PyGame needs RGB
b64 :bytes = base64.b64encode(frame.tobytes())
assert np.allclose(frame, np.frombuffer(base64.b64decode(b64), frame.dtype).reshape(frame.shape))
utf8:str = b64.decode('utf-8')
assert (b64 == utf8.encode('utf-8'))
print(str(frame.shape))
print(utf8)
if __name__ == "__main__":
main()
mic:
def main()->None:
parser:ArgumentParser = argparse.ArgumentParser()
parser.add_argument("--energy_threshold", default=1000,
help="Energy level for mic to detect.", type=int)
parser.add_argument("--record_timeout", default=2,
help="How real time the recording is in seconds.", type=float)
if 'linux' in platform:
# a bunch of code from the original example goes here
pass
else:
source = Microphone(sample_rate=16000)
assert (source is not None)
record_timeout:int = args.record_timeout
with source:
recorder.adjust_for_ambient_noise(source)
def record_callback(_:Recognizer, audio:sr.AudioData) -> None:
data:bytes = audio.get_raw_data() # Grab the raw bytes and push it into the thread safe queue.
audio_np = np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768.0
b64 :bytes = base64.b64encode(audio_np)
utf8:str = b64.decode('utf-8')
assert (b64 == utf8.encode('utf-8'))
print(utf8)
stop_listening = recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeout)
forever:Event = Event()
forever.wait()
stop_listening(wait_for_stop=False)
if __name__ == "__main__":
main()
speaker:
def play_audio(audio_array:ndarray): # TODO async ?
assert audio_array is not None
play(audio_array, SAMPLE_RATE)
wait()
def main()->None:
while True:
print('mouth loop start', file=sys.stderr)
utf8 :str = input()
b64 :bytes = utf8.encode('utf-8')
assert (utf8 == b64.decode('utf-8'))
data :bytes = base64.b64decode(b64) # ?
assert (b64 == base64.b64encode(data))
audio_data:ndarray = np.frombuffer(data, dtype=np.float32)
play_audio(audio_data) # TODO async ?
if __name__ == "__main__":
main()
2