I am trying to get facial recognition working with a virtual assistant I made, and I have run into some kind of interference or conflict between the face_recognition and speech_recognition modules (I think). My code will explain it better.
When I first began playing around with facial recognition, I followed this tutorial and everything worked fine. I implemented it into my virtual assistant with no issues at all. Here's that code:
import requests, os, subprocess, webbrowser, sys, pickle, random, pyaudio, pvporcupine, struct, cv2, threading, time
import win32gui, win32api, win32con
import speech_recognition as sr
from datetime import *
import numpy as np
from openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain.schema import (
AIMessage,
HumanMessage,
SystemMessage
)
muted = False
tts = OpenAI(api_key = os.getenv("JARVIS_SPEECH"))
porcupine = None
paud = None
audio_stream = None
facial_recognizer = cv2.face.LBPHFaceRecognizer_create()
facial_recognizer.read('trainer/trainer.yml')
cascadePath = "haarcascade_frontalface_default.xml"
faceCascade = cv2.CascadeClassifier(cascadePath)
font = cv2.FONT_HERSHEY_SIMPLEX
def user_text(text):
print(f"33[91mUser: {text}33[0m") #red
def jarvis_text(text):
print(f"33[96mJarvis: {text}33[0m") #cyan
def system_text(text):
print(f"33[92mSystem: {text}33[0m") #green
def transcribe_audio_to_text(filename):
system_text("transcribing")
recognizer = sr.Recognizer()
with sr.AudioFile(filename) as source:
audio = recognizer.record(source)
try:
return recognizer.recognize_google(audio)
except:
return None
p = pyaudio.PyAudio()
stream = p.open(format=8,  # 8 == pyaudio.paInt16
channels=1,
rate=24_000,
output=True)
def speak_text(text):
with tts.audio.speech.with_streaming_response.create(
model="tts-1",
voice="echo",
input=text,
response_format="wav"
) as response:
for chunk in response.iter_bytes(1024):
stream.write(chunk)
def facial_recognition():
cam = cv2.VideoCapture(0)
cam.set(3, 1280)
cam.set(4, 720)
minW = 0.1 * cam.get(3)
minH = 0.1 * cam.get(4)
names = ['Pytho', 'Java Man']
last_seen_data = {name: {'last_seen': datetime.now(), 'greeted': False} for name in names}
def greet_person(name):
greeting_messages = [f"Hello {name}.", f"Hi there, {name}.", f"Good to see you, {name}.", f"Greetings, {name}."]
chosen_greeting_message = random.choice(greeting_messages)
jarvis_text(chosen_greeting_message)
speak_text(chosen_greeting_message)
last_data_read_time = datetime.now()
while True:
ret, img = cam.read()
img = cv2.flip(img, 1)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
faces = faceCascade.detectMultiScale(
gray,
scaleFactor=1.2,
minNeighbors=5,
minSize=(int(minW), int(minH)),
)
for (x, y, w, h) in faces:
cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
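# LBPH predict() returns a distance, not a probability: 0 means a perfect match and larger values are worse, which is why 100 - confidence is treated as a rough percentage below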
id, confidence = facial_recognizer.predict(gray[y:y + h, x:x + w])
if confidence < 100:
confidence_percentage = round(100 - confidence)
if confidence_percentage > 35:
name = names[id]
confidence_text = f" {confidence_percentage}%"
# Update last seen time and greeting flag every 2 seconds
current_time = datetime.now()
if (current_time - last_data_read_time).total_seconds() >= 2:
last_seen_data[name]['last_seen'] = current_time
if not last_seen_data[name]['greeted']:
greet_person(name)
last_seen_data[name]['greeted'] = True
last_data_read_time = current_time
else:
name = "unknown"
confidence_text = f" {confidence_percentage}%"
else:
name = "unknown"
confidence_text = f" {round(100 - confidence)}%"
cv2.putText(
img,
str(name),
(x + 5, y - 5),
font,
1,
(255, 255, 255),
2
)
cv2.putText(
img,
str(confidence_text),
(x + 5, y + h - 5),
font,
1,
(255, 255, 0),
1
)
# Reset greeting flags for people who have been absent for more than 2 minutes
for name in last_seen_data:
if datetime.now() - last_seen_data[name]['last_seen'] > timedelta(minutes=2):
last_seen_data[name]['greeted'] = False
cv2.imshow('Jarvis Facial Recognition', img)
k = cv2.waitKey(10) & 0xff
if k == 27:
break
cam.release()
cv2.destroyAllWindows()
facial_recognition_thread = threading.Thread(target=facial_recognition)
facial_recognition_thread.start()
def execute_prompt(text, time, history, jarvis_setup):
global muted
stop_listening_messages = ["Alright sir.", "My apologies.", "My bad sir.", "My mistake.", "You literally said my name."]
search_index = text.find("search")
on_index = text.find("on")
if text == "are you there" or text == "you there" or text == "are you up":
user_text(text)
jarvis_text("For you sir, always.")
speak_text("For you sir, always.")
elif text == "I wasn't talking to you" or text == "not you" or text == "I'm not talking to you" or text == "stop listening":
user_text(text)
chosen_stop_listening_message = random.choice(stop_listening_messages)
jarvis_text(chosen_stop_listening_message)
speak_text(chosen_stop_listening_message)
elif text == "exit":
user_text(text)
jarvis_text("Good bye sir.")
speak_text("Good bye sir.")
sys.exit(0)
else:
user_text(text)
history.append(HumanMessage(content = text))
response = jarvis_setup.invoke(history)
history.append(AIMessage(content = response.content))
with open('JarvisMemory.txt', 'wb') as file:
pickle.dump(history, file)
jarvis_text(response.content)
speak_text(response.content)
def main():
global muted
jarvis_setup = ChatOpenAI(openai_api_key=os.getenv("JARVIS"),
model="gpt-3.5-turbo",
temperature=0.7,
max_tokens=400)
JarvisMemory_path = r"C:\Users\maste\OneDrive\Documents\Python\Jarvis\JarvisMemory.txt"  # raw string so the backslashes aren't treated as escape sequences
if os.path.exists(JarvisMemory_path):
with open(JarvisMemory_path, "rb") as file:
history = pickle.load(file)
else:
with open(JarvisMemory_path, "wb") as file:
history = [SystemMessage(content="""You are a human-like virtual assistant named Jarvis. Use a conversational tone.
All prompts and questions should be answered in a way that can be verbally said, meaning no lists or bullet points, et cetera.
Answer all questions as shortly as possible, unless a longer, more detailed response is requested. Respond like you have emotions.
Not too often, use personal opinion, or a bit of joking / sarcasm.
Keep in mind that in some prompts, I'm not asking for your assistance, just saying hello or telling you a story.
Don't promote your assistance in every response.""")]
pickle.dump(history, file)
recognizer = sr.Recognizer()
microphone = sr.Microphone()
greeting_messages = ["Greetings sir.", "Hello there.", "How can I help?", "What's up?", "At your service."]
while True:
with microphone as source:
recognizer.adjust_for_ambient_noise(source)
try:
porcupine = pvporcupine.create(access_key=os.getenv("PORCUPINE"), keywords=["jarvis"])
paud = pyaudio.PyAudio()
audio_stream = paud.open(rate=porcupine.sample_rate, channels=1, format=pyaudio.paInt16, input=True, frames_per_buffer=porcupine.frame_length)
while True:
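# read one frame of raw audio, unpack it into 16-bit samples, and let Porcupine check it for the wake word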
keyword = audio_stream.read(porcupine.frame_length)
keyword = struct.unpack_from("h"*porcupine.frame_length, keyword)
keyword_index = porcupine.process(keyword)
if keyword_index >= 0:
filename = "input.wav"
system_text("wake word detected")
chosen_greeting = random.choice(greeting_messages)
jarvis_text(chosen_greeting)
speak_text(chosen_greeting)
system_text("getting audio")
with sr.Microphone() as source:
recognizer.adjust_for_ambient_noise(source)
prompt = recognizer.listen(source, phrase_time_limit=None, timeout=None)
filename = "input.wav"
with open(filename, "wb") as f:
f.write(prompt.get_wav_data())
time = datetime.now()
text = transcribe_audio_to_text(filename)
if text:
if (text == "I wasn't talking to you" or text == "not you" or text == "I'm not talking to you" or text == "stop listening"
or text == "La Musica por favor" or text == "play me some music" or text == "drop the needle"
or text == "play music" or text == "start music"
or text == "pause music" or text == "stop music"
or text == "mute"):
execute_prompt(text, time, history, jarvis_setup)
system_text("going back to main loop")
break
elif muted == False:
execute_prompt(text, time, history, jarvis_setup)
elif muted == True:
if text == "unmute":
user_text(f"You said: unmute")
jarvis_text("Jarvis unmuted.")
speak_text("Jarvis unmuted.")
muted = False
else:
jarvis_text("Jarvis is currently muted.")
speak_text("Jarvis is currently muted.")
system_text("going back to main loop")
break
else:
system_text("going back to main loop")
break
while True:
with sr.Microphone() as source:
recognizer = sr.Recognizer()
system_text("listening")
prompt2 = recognizer.listen(source, phrase_time_limit=None, timeout=None)
with open(filename, "wb") as f:
f.write(prompt2.get_wav_data())
try:
text = transcribe_audio_to_text(filename)
if text:
if (text == "I wasn't talking to you" or text == "not you" or text == "I'm not talking to you" or text == "stop listening"
or text == "La Musica por favor" or text == "play me some music" or text == "drop the needle"
or text == "play music" or text == "start music"
or text == "pause music" or text == "stop music"
or text == "mute"):
execute_prompt(text, time, history, jarvis_setup)
system_text("going back to main loop")
break
elif muted == False:
execute_prompt(text, time, history, jarvis_setup)
elif muted == True:
if text == "unmute":
user_text(text)
jarvis_text("Jarvis unmuted.")
speak_text("Jarvis unmuted.")
muted = False
else:
jarvis_text("Jarvis is currently muted.")
speak_text("Jarvis is currently muted.")
system_text("going back to main loop")
break
else:
system_text("going back to main loop")
break
except Exception as e:
pass # Handle recognition exceptions here
finally:
if porcupine is not None:
porcupine.delete()
if audio_stream is not None:
audio_stream.close()
if paud is not None:
paud.terminate()
if __name__ == "__main__":
print("All systems are online.")
main()
I tried shortening it as much as I could for readability. The full code is almost 900 lines.
At this point I wanted the confidence percentage to be higher and more accurate, along with some other upgrades like using color photos instead of grayscale, so with the help of some tutorials and ChatGPT I made a new, improved version:
import requests, os, subprocess, webbrowser, sys, pickle, random, pyaudio, pvporcupine, struct, cv2, threading, time, face_recognition
import win32gui, win32api, win32con
import speech_recognition as sr
from datetime import *
import numpy as np
from openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain.schema import (
AIMessage,
HumanMessage,
SystemMessage
)
muted = False
tts = OpenAI(api_key = os.getenv("JARVIS_SPEECH"))
porcupine = None
paud = None
audio_stream = None
def user_text(text):
print(f"33[91mUser: {text}33[0m") #red
def jarvis_text(text):
print(f"33[96mJarvis: {text}33[0m") #cyan
def system_text(text):
print(f"33[92mSystem: {text}33[0m") #green
def transcribe_audio_to_text(filename):
system_text("transcribing")
recognizer = sr.Recognizer()
with sr.AudioFile(filename) as source:
audio = recognizer.record(source)
try:
return recognizer.recognize_google(audio)
except:
return None
p = pyaudio.PyAudio()
stream = p.open(format=8,  # 8 == pyaudio.paInt16
channels=1,
rate=24_000,
output=True)
def speak_text(text):
with tts.audio.speech.with_streaming_response.create(
model="tts-1",
voice="echo",
input=text,
response_format="wav"
) as response:
for chunk in response.iter_bytes(1024):
stream.write(chunk)
# Initialize the face recognition model
def load_known_faces(folder_path):
known_face_encodings = []
known_face_names = []
for person_name in os.listdir(folder_path):
person_folder = os.path.join(folder_path, person_name)
if os.path.isdir(person_folder):
for image_name in os.listdir(person_folder):
image_path = os.path.join(person_folder, image_name)
image = face_recognition.load_image_file(image_path)
face_encodings = face_recognition.face_encodings(image)
if face_encodings:
known_face_encodings.append(face_encodings[0])
known_face_names.append(person_name)
return known_face_encodings, known_face_names
def greet_person(name):
greeting_messages = [f"Hello {name}.", f"Hi there, {name}.", f"Good to see you, {name}.", f"Greetings, {name}."]
chosen_greeting_message = random.choice(greeting_messages)
print(chosen_greeting_message)
def facial_recognition():
cam = cv2.VideoCapture(0)
cam.set(3, 1280)
cam.set(4, 720)
known_face_encodings, known_face_names = load_known_faces("known_faces")
last_seen_data = {name: {'last_seen': datetime.now(), 'greeted': False} for name in known_face_names}
last_data_read_time = datetime.now()
while True:
ret, img = cam.read()
img = cv2.flip(img, 1)
rgb_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
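# face_locations (HOG-based by default) and face_encodings (128-dimension embeddings) both run on the CPU and can take a noticeable fraction of a second per 1280x720 frame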
face_locations = face_recognition.face_locations(rgb_img)
face_encodings = face_recognition.face_encodings(rgb_img, face_locations)
for face_encoding, face_location in zip(face_encodings, face_locations):
matches = face_recognition.compare_faces(known_face_encodings, face_encoding)
face_distances = face_recognition.face_distance(known_face_encodings, face_encoding)
best_match_index = np.argmin(face_distances)
if matches[best_match_index]:
confidence_percentage = round((1 - face_distances[best_match_index]) * 100)
if confidence_percentage > 50: # Threshold check
name = known_face_names[best_match_index]
current_time = datetime.now()
if (current_time - last_data_read_time).total_seconds() >= 2:
last_seen_data[name]['last_seen'] = current_time
if not last_seen_data[name]['greeted']:
greet_person(name)
last_seen_data[name]['greeted'] = True
last_data_read_time = current_time
else:
name = "unknown"
confidence_percentage = 0
else:
name = "unknown"
confidence_percentage = 0
top, right, bottom, left = face_location
cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0), 2)
cv2.putText(img, name, (left + 5, bottom + 20), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
cv2.putText(img, f'{confidence_percentage}%', (left + 5, bottom + 45), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 0), 1)
for name in last_seen_data:
if datetime.now() - last_seen_data[name]['last_seen'] > timedelta(minutes=2):
last_seen_data[name]['greeted'] = False
cv2.imshow('Jarvis Facial Recognition', img)
k = cv2.waitKey(10) & 0xff
if k == 27:
break
cam.release()
cv2.destroyAllWindows()
facial_recognition_thread = threading.Thread(target=facial_recognition)
facial_recognition_thread.start()
def execute_prompt(text, time, history, jarvis_setup):
global muted
search_index = text.find("search")
on_index = text.find("on")
if text == "are you there" or text == "you there" or text == "are you up":
user_text(text)
jarvis_text("For you sir, always.")
speak_text("For you sir, always.")
elif text == "exit":
user_text(text)
jarvis_text("Good bye sir.")
speak_text("Good bye sir.")
sys.exit(0)
else:
user_text(text)
history.append(HumanMessage(content = text))
response = jarvis_setup.invoke(history)
history.append(AIMessage(content = response.content))
with open('JarvisMemory.txt', 'wb') as file:
pickle.dump(history, file)
jarvis_text(response.content)
speak_text(response.content)
def main():
global muted
jarvis_setup = ChatOpenAI(openai_api_key=os.getenv("JARVIS"),
model="gpt-3.5-turbo",
temperature=0.7,
max_tokens=400)
JarvisMemory_path = r"C:\Users\maste\OneDrive\Documents\Python\Jarvis\JarvisMemory.txt"  # raw string so the backslashes aren't treated as escape sequences
if os.path.exists(JarvisMemory_path):
with open(JarvisMemory_path, "rb") as file:
history = pickle.load(file)
else:
with open(JarvisMemory_path, "wb") as file:
history = [SystemMessage(content="""You are a human-like virtual assistant named Jarvis. Use a conversational tone.""")]
pickle.dump(history, file)
recognizer = sr.Recognizer()
microphone = sr.Microphone()
greeting_messages = ["Greetings sir.", "Hello there.", "How can I help?", "What's up?", "At your service."]
while True:
with microphone as source:
recognizer.adjust_for_ambient_noise(source)
try:
porcupine = pvporcupine.create(access_key=os.getenv("PORCUPINE"), keywords=["jarvis"])
paud = pyaudio.PyAudio()
audio_stream = paud.open(rate=porcupine.sample_rate, channels=1, format=pyaudio.paInt16, input=True, frames_per_buffer=porcupine.frame_length)
while True:
keyword = audio_stream.read(porcupine.frame_length)
keyword = struct.unpack_from("h"*porcupine.frame_length, keyword)
keyword_index = porcupine.process(keyword)
if keyword_index >= 0:
filename = "input.wav"
system_text("wake word detected")
chosen_greeting = random.choice(greeting_messages)
jarvis_text(chosen_greeting)
speak_text(chosen_greeting)
system_text("getting audio")
with sr.Microphone() as source:
recognizer.adjust_for_ambient_noise(source)
prompt = recognizer.listen(source, phrase_time_limit=None, timeout=None)
filename = "input.wav"
with open(filename, "wb") as f:
f.write(prompt.get_wav_data())
time = datetime.now()
text = transcribe_audio_to_text(filename)
if text:
if (text == "I wasn't talking to you"):
execute_prompt(text, time, history, jarvis_setup)
system_text("going back to main loop")
break
elif muted == False:
execute_prompt(text, time, history, jarvis_setup)
else:
system_text("going back to main loop")
break
while True:
with sr.Microphone() as source:
recognizer = sr.Recognizer()
system_text("listening")
prompt2 = recognizer.listen(source, phrase_time_limit=None, timeout=None)
with open(filename, "wb") as f:
f.write(prompt2.get_wav_data())
try:
text = transcribe_audio_to_text(filename)
if text:
if (text == "I wasn't talking to you"):
execute_prompt(text, time, history, jarvis_setup)
system_text("going back to main loop")
break
elif muted == False:
execute_prompt(text, time, history, jarvis_setup)
else:
system_text("going back to main loop")
break
except Exception as e:
pass # Handle recognition exceptions here
finally:
if porcupine is not None:
porcupine.delete()
if audio_stream is not None:
audio_stream.close()
if paud is not None:
paud.terminate()
if __name__ == "__main__":
print("All systems are online.")
main()
This code is also heavily shortened.
The problem I’m having is that when the camera feed window opens, the microphone keeps stopping, re-initializing, and then coming back again at very fast intervals, and I’m guessing the face_recognition module is the culprit.
Below is a video of exactly what’s happening:
https://drive.google.com/file/d/1032Y-tpjqby8X8BEpHwdIwjLOAthrjp8/view?usp=sharing
I forgot to say this in the video, but I have tried the first, working version while it was recognizing my face and, yes, it works fine.
I have a feeling this isn’t a simple, straightforward issue; it’s certainly uncommon. I searched the internet for a long time for similar issues and couldn’t find any. I really need some help.
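In case it helps narrow things down, here is a stripped-down sketch of just the two pieces I suspect are clashing: the face_recognition camera loop in a thread and a bare speech_recognition listen loop in the main thread, with Porcupine, the OpenAI calls, and everything else removed. The phrase_time_limit and window name here are just placeholders, not my real settings:
import threading
import cv2
import face_recognition
import speech_recognition as sr

def face_loop():
    # Same structure as my facial_recognition() function, minus the greeting logic
    cam = cv2.VideoCapture(0)
    cam.set(3, 1280)
    cam.set(4, 720)
    while True:
        ret, frame = cam.read()
        if not ret:
            break
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # The per-frame heavy lifting: detect faces and compute their encodings
        locations = face_recognition.face_locations(rgb)
        encodings = face_recognition.face_encodings(rgb, locations)
        cv2.imshow('repro', frame)
        if cv2.waitKey(10) & 0xff == 27:  # Esc to quit
            break
    cam.release()
    cv2.destroyAllWindows()

def listen_loop():
    # Bare-bones version of my listening code: no Porcupine, no TTS, no OpenAI
    recognizer = sr.Recognizer()
    while True:
        with sr.Microphone() as source:
            print("adjusting for ambient noise")
            recognizer.adjust_for_ambient_noise(source)
            print("listening")
            audio = recognizer.listen(source, phrase_time_limit=5)
        try:
            print("heard:", recognizer.recognize_google(audio))
        except sr.UnknownValueError:
            print("could not understand audio")
        except sr.RequestError as e:
            print("request error:", e)

threading.Thread(target=face_loop, daemon=True).start()
listen_loop()
This mirrors the structure of my real code, so hopefully it is enough to reproduce or rule out the interference without wading through all 900 lines.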