I am trying to get facial recognition working with a virtual assistant I made, and I have run into some kind of interference or conflict between the face_recognition and speech_recognition modules (I think). My code will explain it better.
When I first began playing around with facial recognition, I followed this tutorial and everything worked fine. I implemented it into my virtual assistant with no issues at all. Here's that code:
import requests, os, subprocess, webbrowser, sys, pickle, random, pyaudio, pvporcupine, struct, cv2, threading, time
import win32gui, win32api, win32con
import speech_recognition as sr
from datetime import *
import numpy as np
from openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain.schema import (
AIMessage,
HumanMessage,
SystemMessage
)
muted = False
tts = OpenAI(api_key = os.getenv("JARVIS_SPEECH"))
porcupine = None
paud = None
audio_stream = None
facial_recognizer = cv2.face.LBPHFaceRecognizer_create()
facial_recognizer.read('trainer/trainer.yml')
cascadePath = "haarcascade_frontalface_default.xml"
faceCascade = cv2.CascadeClassifier(cascadePath)
font = cv2.FONT_HERSHEY_SIMPLEX
def user_text(text):
print(f"33[91mUser: {text}33[0m") #red
def jarvis_text(text):
print(f"33[96mJarvis: {text}33[0m") #cyan
def system_text(text):
print(f"33[92mSystem: {text}33[0m") #green
def transcribe_audio_to_text(filename):
system_text("transcribing")
recognizer = sr.Recognizer()
with sr.AudioFile(filename) as source:
audio = recognizer.record(source)
try:
return recognizer.recognize_google(audio)
except:
return None
p = pyaudio.PyAudio()
stream = p.open(format=8,  # 8 == pyaudio.paInt16
channels=1,
rate=24_000,
output=True)
def speak_text(text):
with tts.audio.speech.with_streaming_response.create(
model="tts-1",
voice="echo",
input=text,
response_format="wav"
) as response:
for chunk in response.iter_bytes(1024):
stream.write(chunk)
def facial_recognition():
cam = cv2.VideoCapture(0)
cam.set(3, 1280)
cam.set(4, 720)
minW = 0.1 * cam.get(3)
minH = 0.1 * cam.get(4)
names = ['Pytho', 'Java Man']
last_seen_data = {name: {'last_seen': datetime.now(), 'greeted': False} for name in names}
def greet_person(name):
greeting_messages = [f"Hello {name}.", f"Hi there, {name}.", f"Good to see you, {name}.", f"Greetings, {name}."]
chosen_greeting_message = random.choice(greeting_messages)
jarvis_text(chosen_greeting_message)
speak_text(chosen_greeting_message)
last_data_read_time = datetime.now()
while True:
ret, img = cam.read()
img = cv2.flip(img, 1)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
faces = faceCascade.detectMultiScale(
gray,
scaleFactor=1.2,
minNeighbors=5,
minSize=(int(minW), int(minH)),
)
for (x, y, w, h) in faces:
cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
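# LBPH predict() returns a distance, not a probability: 0 means a perfect match and larger values are worse, which is why 100 - confidence is treated as a rough percentage below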
id, confidence = facial_recognizer.predict(gray[y:y + h, x:x + w])
if confidence < 100:
confidence_percentage = round(100 - confidence)
if confidence_percentage > 35:
name = names[id]
confidence_text = f" {confidence_percentage}%"
# Update last seen time and greeting flag every 2 seconds
current_time = datetime.now()
if (current_time - last_data_read_time).total_seconds() >= 2:
last_seen_data[name]['last_seen'] = current_time
if not last_seen_data[name]['greeted']:
greet_person(name)
last_seen_data[name]['greeted'] = True
last_data_read_time = current_time
else:
name = "unknown"
confidence_text = f" {confidence_percentage}%"
else:
name = "unknown"
confidence_text = f" {round(100 - confidence)}%"
cv2.putText(
img,
str(name),
(x + 5, y - 5),
font,
1,
(255, 255, 255),
2
)
cv2.putText(
img,
str(confidence_text),
(x + 5, y + h - 5),
font,
1,
(255, 255, 0),
1
)
# Reset greeting flags for people who have been absent for more than 2 minutes
for name in last_seen_data:
if datetime.now() - last_seen_data[name]['last_seen'] > timedelta(minutes=2):
last_seen_data[name]['greeted'] = False
cv2.imshow('Jarvis Facial Recognition', img)
k = cv2.waitKey(10) & 0xff
if k == 27:
break
cam.release()
cv2.destroyAllWindows()
facial_recognition_thread = threading.Thread(target=facial_recognition)
facial_recognition_thread.start()
def execute_prompt(text, time, history, jarvis_setup):
global muted
stop_listening_messages = ["Alright sir.", "My apologies.", "My bad sir.", "My mistake.", "You literally said my name."]
search_index = text.find("search")
on_index = text.find("on")
if text == "are you there" or text == "you there" or text == "are you up":
user_text(text)
jarvis_text("For you sir, always.")
speak_text("For you sir, always.")
elif text == "I wasn't talking to you" or text == "not you" or text == "I'm not talking to you" or text == "stop listening":
user_text(text)
chosen_stop_listening_message = random.choice(stop_listening_messages)
jarvis_text(chosen_stop_listening_message)
speak_text(chosen_stop_listening_message)
elif text == "exit":
user_text(text)
jarvis_text("Good bye sir.")
speak_text("Good bye sir.")
sys.exit(0)
else:
user_text(text)
history.append(HumanMessage(content = text))
response = jarvis_setup.invoke(history)
history.append(AIMessage(content = response.content))
with open('JarvisMemory.txt', 'wb') as file:
pickle.dump(history, file)
jarvis_text(response.content)
speak_text(response.content)
def main():
global muted
jarvis_setup = ChatOpenAI(openai_api_key=os.getenv("JARVIS"),
model="gpt-3.5-turbo",
temperature=0.7,
max_tokens=400)
JarvisMemory_path = r"C:\Users\maste\OneDrive\Documents\Python\Jarvis\JarvisMemory.txt"  # raw string so the backslashes aren't treated as escape sequences
if os.path.exists(JarvisMemory_path):
with open(JarvisMemory_path, "rb") as file:
history = pickle.load(file)
else:
with open(JarvisMemory_path, "wb") as file:
history = [SystemMessage(content="""You are a human-like virtual assistant named Jarvis. Use a conversational tone.
All prompts and questions should be answered in a way that can be verbally said, meaning no lists or bullet points, et cetera.
Answer all questions as shortly as possible, unless a longer, more detailed response is requested. Respond like you have emotions.
Not too often, use personal opinion, or a bit of joking / sarcasm.
Keep in mind that in some prompts, I'm not asking for your assistance, just saying hello or telling you a story.
Don't promote your assistance in every response.""")]
pickle.dump(history, file)
recognizer = sr.Recognizer()
microphone = sr.Microphone()
greeting_messages = ["Greetings sir.", "Hello there.", "How can I help?", "What's up?", "At your service."]
while True:
with microphone as source:
recognizer.adjust_for_ambient_noise(source)
try:
porcupine = pvporcupine.create(access_key=os.getenv("PORCUPINE"), keywords=["jarvis"])
paud = pyaudio.PyAudio()
audio_stream = paud.open(rate=porcupine.sample_rate, channels=1, format=pyaudio.paInt16, input=True, frames_per_buffer=porcupine.frame_length)
while True:
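# read one frame of raw audio, unpack it into 16-bit samples, and let Porcupine check it for the wake word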
keyword = audio_stream.read(porcupine.frame_length)
keyword = struct.unpack_from("h"*porcupine.frame_length, keyword)
keyword_index = porcupine.process(keyword)
if keyword_index >= 0:
filename = "input.wav"
system_text("wake word detected")
chosen_greeting = random.choice(greeting_messages)
jarvis_text(chosen_greeting)
speak_text(chosen_greeting)
system_text("getting audio")
with sr.Microphone() as source:
recognizer.adjust_for_ambient_noise(source)
prompt = recognizer.listen(source, phrase_time_limit=None, timeout=None)
filename = "input.wav"
with open(filename, "wb") as f:
f.write(prompt.get_wav_data())
time = datetime.now()
text = transcribe_audio_to_text(filename)
if text:
if (text == "I wasn't talking to you" or text == "not you" or text == "I'm not talking to you" or text == "stop listening"
or text == "La Musica por favor" or text == "play me some music" or text == "drop the needle"
or text == "play music" or text == "start music"
or text == "pause music" or text == "stop music"
or text == "mute"):
execute_prompt(text, time, history, jarvis_setup)
system_text("going back to main loop")
break
elif muted == False:
execute_prompt(text, time, history, jarvis_setup)
elif muted == True:
if text == "unmute":
user_text(f"You said: unmute")
jarvis_text("Jarvis unmuted.")
speak_text("Jarvis unmuted.")
muted = False
else:
jarvis_text("Jarvis is currently muted.")
speak_text("Jarvis is currently muted.")
system_text("going back to main loop")
break
else:
system_text("going back to main loop")
break
while True:
with sr.Microphone() as source:
recognizer = sr.Recognizer()
system_text("listening")
prompt2 = recognizer.listen(source, phrase_time_limit=None, timeout=None)
with open(filename, "wb") as f:
f.write(prompt2.get_wav_data())
try:
text = transcribe_audio_to_text(filename)
if text:
if (text == "I wasn't talking to you" or text == "not you" or text == "I'm not talking to you" or text == "stop listening"
or text == "La Musica por favor" or text == "play me some music" or text == "drop the needle"
or text == "play music" or text == "start music"
or text == "pause music" or text == "stop music"
or text == "mute"):
execute_prompt(text, time, history, jarvis_setup)
system_text("going back to main loop")
break
elif muted == False:
execute_prompt(text, time, history, jarvis_setup)
elif muted == True:
if text == "unmute":
user_text(text)
jarvis_text("Jarvis unmuted.")
speak_text("Jarvis unmuted.")
muted = False
else:
jarvis_text("Jarvis is currently muted.")
speak_text("Jarvis is currently muted.")
system_text("going back to main loop")
break
else:
system_text("going back to main loop")
break
except Exception as e:
pass # Handle recognition exceptions here
finally:
if porcupine is not None:
porcupine.delete()
if audio_stream is not None:
audio_stream.close()
if paud is not None:
paud.terminate()
if __name__ == "__main__":
print("All systems are online.")
main()
I tried shortening it as much as I could for readability. The full code is almost 900 lines.
At this point I wanted the confidence percentage to be higher and more accurate, along with some other upgrades like using color photos instead of grayscale, so with the help of some tutorials and ChatGPT I made a new, improved version:
import requests, os, subprocess, webbrowser, sys, pickle, random, pyaudio, pvporcupine, struct, cv2, threading, time, face_recognition
import win32gui, win32api, win32con
import speech_recognition as sr
from datetime import *
import numpy as np
from openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain.schema import (
AIMessage,
HumanMessage,
SystemMessage
)
muted = False
tts = OpenAI(api_key = os.getenv("JARVIS_SPEECH"))
porcupine = None
paud = None
audio_stream = None
def user_text(text):
print(f"33[91mUser: {text}33[0m") #red
def jarvis_text(text):
print(f"33[96mJarvis: {text}33[0m") #cyan
def system_text(text):
print(f"33[92mSystem: {text}33[0m") #green
def transcribe_audio_to_text(filename):
system_text("transcribing")
recognizer = sr.Recognizer()
with sr.AudioFile(filename) as source:
audio = recognizer.record(source)
try:
return recognizer.recognize_google(audio)
except:
return None
p = pyaudio.PyAudio()
stream = p.open(format=8,  # 8 == pyaudio.paInt16
channels=1,
rate=24_000,
output=True)
def speak_text(text):
with tts.audio.speech.with_streaming_response.create(
model="tts-1",
voice="echo",
input=text,
response_format="wav"
) as response:
for chunk in response.iter_bytes(1024):
stream.write(chunk)
# Initialize the face recognition model
def load_known_faces(folder_path):
known_face_encodings = []
known_face_names = []
for person_name in os.listdir(folder_path):
person_folder = os.path.join(folder_path, person_name)
if os.path.isdir(person_folder):
for image_name in os.listdir(person_folder):
image_path = os.path.join(person_folder, image_name)
image = face_recognition.load_image_file(image_path)
face_encodings = face_recognition.face_encodings(image)
if face_encodings:
known_face_encodings.append(face_encodings[0])
known_face_names.append(person_name)
return known_face_encodings, known_face_names
def greet_person(name):
greeting_messages = [f"Hello {name}.", f"Hi there, {name}.", f"Good to see you, {name}.", f"Greetings, {name}."]
chosen_greeting_message = random.choice(greeting_messages)
print(chosen_greeting_message)
def facial_recognition():
cam = cv2.VideoCapture(0)
cam.set(3, 1280)
cam.set(4, 720)
known_face_encodings, known_face_names = load_known_faces("known_faces")
last_seen_data = {name: {'last_seen': datetime.now(), 'greeted': False} for name in known_face_names}
last_data_read_time = datetime.now()
while True:
ret, img = cam.read()
img = cv2.flip(img, 1)
rgb_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
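# face_locations (HOG-based by default) and face_encodings (128-dimension embeddings) both run on the CPU and can take a noticeable fraction of a second per 1280x720 frame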
face_locations = face_recognition.face_locations(rgb_img)
face_encodings = face_recognition.face_encodings(rgb_img, face_locations)
for face_encoding, face_location in zip(face_encodings, face_locations):
matches = face_recognition.compare_faces(known_face_encodings, face_encoding)
face_distances = face_recognition.face_distance(known_face_encodings, face_encoding)
best_match_index = np.argmin(face_distances)
if matches[best_match_index]:
confidence_percentage = round((1 - face_distances[best_match_index]) * 100)
if confidence_percentage > 50: # Threshold check
name = known_face_names[best_match_index]
current_time = datetime.now()
if (current_time - last_data_read_time).total_seconds() >= 2:
last_seen_data[name]['last_seen'] = current_time
if not last_seen_data[name]['greeted']:
greet_person(name)
last_seen_data[name]['greeted'] = True
last_data_read_time = current_time
else:
name = "unknown"
confidence_percentage = 0
else:
name = "unknown"
confidence_percentage = 0
top, right, bottom, left = face_location
cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0), 2)
cv2.putText(img, name, (left + 5, bottom + 20), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
cv2.putText(img, f'{confidence_percentage}%', (left + 5, bottom + 45), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 0), 1)
for name in last_seen_data:
if datetime.now() - last_seen_data[name]['last_seen'] > timedelta(minutes=2):
last_seen_data[name]['greeted'] = False
cv2.imshow('Jarvis Facial Recognition', img)
k = cv2.waitKey(10) & 0xff
if k == 27:
break
cam.release()
cv2.destroyAllWindows()
facial_recognition_thread = threading.Thread(target=facial_recognition)
facial_recognition_thread.start()
def execute_prompt(text, time, history, jarvis_setup):
global muted
search_index = text.find("search")
on_index = text.find("on")
if text == "are you there" or text == "you there" or text == "are you up":
user_text(text)
jarvis_text("For you sir, always.")
speak_text("For you sir, always.")
elif text == "exit":
user_text(text)
jarvis_text("Good bye sir.")
speak_text("Good bye sir.")
sys.exit(0)
else:
user_text(text)
history.append(HumanMessage(content = text))
response = jarvis_setup.invoke(history)
history.append(AIMessage(content = response.content))
with open('JarvisMemory.txt', 'wb') as file:
pickle.dump(history, file)
jarvis_text(response.content)
speak_text(response.content)
def main():
global muted
jarvis_setup = ChatOpenAI(openai_api_key=os.getenv("JARVIS"),
model="gpt-3.5-turbo",
temperature=0.7,
max_tokens=400)
JarvisMemory_path = r"C:\Users\maste\OneDrive\Documents\Python\Jarvis\JarvisMemory.txt"  # raw string so the backslashes aren't treated as escape sequences
if os.path.exists(JarvisMemory_path):
with open(JarvisMemory_path, "rb") as file:
history = pickle.load(file)
else:
with open(JarvisMemory_path, "wb") as file:
history = [SystemMessage(content="""You are a human-like virtual assistant named Jarvis. Use a conversational tone.""")]
pickle.dump(history, file)
recognizer = sr.Recognizer()
microphone = sr.Microphone()
greeting_messages = ["Greetings sir.", "Hello there.", "How can I help?", "What's up?", "At your service."]
while True:
with microphone as source:
recognizer.adjust_for_ambient_noise(source)
try:
porcupine = pvporcupine.create(access_key=os.getenv("PORCUPINE"), keywords=["jarvis"])
paud = pyaudio.PyAudio()
audio_stream = paud.open(rate=porcupine.sample_rate, channels=1, format=pyaudio.paInt16, input=True, frames_per_buffer=porcupine.frame_length)
while True:
keyword = audio_stream.read(porcupine.frame_length)
keyword = struct.unpack_from("h"*porcupine.frame_length, keyword)
keyword_index = porcupine.process(keyword)
if keyword_index >= 0:
filename = "input.wav"
system_text("wake word detected")
chosen_greeting = random.choice(greeting_messages)
jarvis_text(chosen_greeting)
speak_text(chosen_greeting)
system_text("getting audio")
with sr.Microphone() as source:
recognizer.adjust_for_ambient_noise(source)
prompt = recognizer.listen(source, phrase_time_limit=None, timeout=None)
filename = "input.wav"
with open(filename, "wb") as f:
f.write(prompt.get_wav_data())
time = datetime.now()
text = transcribe_audio_to_text(filename)
if text:
if (text == "I wasn't talking to you"):
execute_prompt(text, time, history, jarvis_setup)
system_text("going back to main loop")
break
elif muted == False:
execute_prompt(text, time, history, jarvis_setup)
else:
system_text("going back to main loop")
break
while True:
with sr.Microphone() as source:
recognizer = sr.Recognizer()
system_text("listening")
prompt2 = recognizer.listen(source, phrase_time_limit=None, timeout=None)
with open(filename, "wb") as f:
f.write(prompt2.get_wav_data())
try:
text = transcribe_audio_to_text(filename)
if text:
if (text == "I wasn't talking to you"):
execute_prompt(text, time, history, jarvis_setup)
system_text("going back to main loop")
break
elif muted == False:
execute_prompt(text, time, history, jarvis_setup)
else:
system_text("going back to main loop")
break
except Exception as e:
pass # Handle recognition exceptions here
finally:
if porcupine is not None:
porcupine.delete()
if audio_stream is not None:
audio_stream.close()
if paud is not None:
paud.terminate()
if __name__ == "__main__":
print("All systems are online.")
main()
This code is also heavily shortened.
The problem I’m having is that when the camera feed window opens, the microphone keeps stopping, re-initializing, and then coming back again at very fast intervals, and I’m guessing the face_recognition module is the culprit.
Below is a video of exactly what’s happening:
https://drive.google.com/file/d/1032Y-tpjqby8X8BEpHwdIwjLOAthrjp8/view?usp=sharing
I forgot to say this in the video, but I have tried the first, working version while it was recognizing my face and, yes, it works fine.
I have a feeling this isn’t a simple, straightforward issue; it’s certainly uncommon. I searched the internet for a long time for similar issues and couldn’t find any. I really need some help.
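In case it helps narrow things down, here is a stripped-down sketch of just the two pieces I suspect are clashing: the face_recognition camera loop in a thread and a bare speech_recognition listen loop in the main thread, with Porcupine, the OpenAI calls, and everything else removed. The phrase_time_limit and window name here are just placeholders, not my real settings:
import threading
import cv2
import face_recognition
import speech_recognition as sr

def face_loop():
    # Same structure as my facial_recognition() function, minus the greeting logic
    cam = cv2.VideoCapture(0)
    cam.set(3, 1280)
    cam.set(4, 720)
    while True:
        ret, frame = cam.read()
        if not ret:
            break
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # The per-frame heavy lifting: detect faces and compute their encodings
        locations = face_recognition.face_locations(rgb)
        encodings = face_recognition.face_encodings(rgb, locations)
        cv2.imshow('repro', frame)
        if cv2.waitKey(10) & 0xff == 27:  # Esc to quit
            break
    cam.release()
    cv2.destroyAllWindows()

def listen_loop():
    # Bare-bones version of my listening code: no Porcupine, no TTS, no OpenAI
    recognizer = sr.Recognizer()
    while True:
        with sr.Microphone() as source:
            print("adjusting for ambient noise")
            recognizer.adjust_for_ambient_noise(source)
            print("listening")
            audio = recognizer.listen(source, phrase_time_limit=5)
        try:
            print("heard:", recognizer.recognize_google(audio))
        except sr.UnknownValueError:
            print("could not understand audio")
        except sr.RequestError as e:
            print("request error:", e)

threading.Thread(target=face_loop, daemon=True).start()
listen_loop()
This mirrors the structure of my real code, so hopefully it is enough to reproduce or rule out the interference without wading through all 900 lines.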