face_recognition module somehow badly interfering with speech_recognition module (python)

I am trying to get facial recognition working with a virtual assistant I made and have encountered some kind of interference or conflict between the face_recognition and speech_recognition modules (I think). My code will explain it better.

When I first began playing around with facial recognition, I followed this tutorial, and everything worked fine. Implemented that into my virtual assistant with no issues at all. Here’s that code:

import requests, os, subprocess, webbrowser, sys, pickle, random, pyaudio, pvporcupine, struct, cv2, threading, time
import win32gui, win32api, win32con
import speech_recognition as sr
from datetime import *
import numpy as np
from openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

# Global mute flag; main() declares it `global` and reads/updates it.
muted = False

# OpenAI client used by speak_text() for text-to-speech; key comes from the environment.
tts = OpenAI(api_key = os.getenv("JARVIS_SPEECH"))

# Module-level placeholders for the wake-word resources.
# NOTE(review): main() assigns names of the same spelling without declaring
# them global, so these module-level values are never rebound by the visible code.
porcupine = None
paud = None
audio_stream = None

# LBPH face recogniser loaded from a previously trained model file, plus the
# Haar cascade used to locate faces; both are consumed by facial_recognition().
facial_recognizer = cv2.face.LBPHFaceRecognizer_create()
facial_recognizer.read('trainer/trainer.yml')
cascadePath = "haarcascade_frontalface_default.xml"
faceCascade = cv2.CascadeClassifier(cascadePath)
# Font used for the name/confidence overlay drawn on each frame.
font = cv2.FONT_HERSHEY_SIMPLEX

def user_text(text):
    """Print *text* as a ``User:`` line coloured bright red."""
    # "\033[91m" is the ANSI escape for bright red and "\033[0m" resets the
    # colour; without the "\0" prefix the codes are printed as literal text.
    print(f"\033[91mUser: {text}\033[0m") #red
    
def jarvis_text(text):
    """Print *text* as a ``Jarvis:`` line coloured bright cyan."""
    # "\033[96m" is the ANSI escape for bright cyan and "\033[0m" resets the
    # colour; without the "\0" prefix the codes are printed as literal text.
    print(f"\033[96mJarvis: {text}\033[0m") #cyan
    
def system_text(text):
    """Print *text* as a ``System:`` status line coloured bright green."""
    # "\033[92m" is the ANSI escape for bright green and "\033[0m" resets the
    # colour; without the "\0" prefix the codes are printed as literal text.
    print(f"\033[92mSystem: {text}\033[0m") #green

def transcribe_audio_to_text(filename):
    """Transcribe the audio file *filename* with Google's speech recognition.

    Returns:
        The recognised text, or ``None`` when the speech could not be
        understood or the recognition request failed.
    """
    system_text("transcribing")
    recognizer = sr.Recognizer()
    with sr.AudioFile(filename) as source:
        audio = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        # Nothing intelligible was heard; callers treat None as "no prompt".
        return None
    except (sr.RequestError, OSError) as err:
        # Service/network failure: stay best-effort, but say why instead of
        # hiding it (a bare ``except`` would also trap Ctrl+C and sys.exit).
        system_text(f"transcription failed: {err}")
        return None
    
# Shared PyAudio output stream; speak_text() writes the synthesised audio into it.
p = pyaudio.PyAudio()
# NOTE(review): format=8 is presumably pyaudio.paInt16 (16-bit samples) and
# 24 kHz mono presumably matches the TTS output — confirm against both APIs.
stream = p.open(format=8,
                channels=1,
                rate=24_000,
                output=True)

def speak_text(text):
    """Synthesise *text* with the TTS client and play it while it streams in.

    The audio is requested from the module-level ``tts`` client and every
    received chunk is written straight to the module-level output ``stream``.
    """
    request = tts.audio.speech.with_streaming_response.create(
        model="tts-1",
        voice="echo",
        input=text,
        response_format="wav",
    )
    with request as reply:
        play = stream.write
        # Forward the audio in 1 KiB pieces as they arrive.
        for piece in reply.iter_bytes(1024):
            play(piece)
            
def facial_recognition():
    """Show a webcam preview, label detected faces and greet known people.

    Uses the module-level ``faceCascade`` detector, ``facial_recognizer``
    model and ``font``. Runs until Esc is pressed in the preview window or
    the camera stops delivering frames; the camera and window are released
    on the way out, even if an error occurs.
    """
    cam = cv2.VideoCapture(0)
    cam.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
    cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

    # Ignore detections smaller than 10% of the frame in either dimension.
    min_w = 0.1 * cam.get(cv2.CAP_PROP_FRAME_WIDTH)
    min_h = 0.1 * cam.get(cv2.CAP_PROP_FRAME_HEIGHT)

    # Index in this list == label id returned by the recogniser.
    names = ['Pytho', 'Java Man']

    last_seen_data = {name: {'last_seen': datetime.now(), 'greeted': False} for name in names}

    def greet_person(name):
        """Print and speak one randomly chosen greeting for *name*."""
        greeting_messages = [f"Hello {name}.", f"Hi there, {name}.", f"Good to see you, {name}.", f"Greetings, {name}."]
        chosen_greeting_message = random.choice(greeting_messages)
        jarvis_text(chosen_greeting_message)
        speak_text(chosen_greeting_message)

    last_data_read_time = datetime.now()

    try:
        while True:
            ret, img = cam.read()
            if not ret:
                # A failed grab returns no image; flipping it would raise.
                system_text("camera frame could not be read; stopping facial recognition")
                break

            img = cv2.flip(img, 1)
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            faces = faceCascade.detectMultiScale(
                gray,
                scaleFactor=1.2,
                minNeighbors=5,
                minSize=(int(min_w), int(min_h)),
            )

            for (x, y, w, h) in faces:
                cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
                label_id, confidence = facial_recognizer.predict(gray[y:y + h, x:x + w])

                # The recogniser reports a distance (lower is better); show it
                # as a rough percentage. A percentage above 35 already implies
                # a distance below 100, so one threshold check is enough.
                confidence_percentage = round(100 - confidence)
                confidence_text = f"  {confidence_percentage}%"

                if confidence_percentage > 35 and 0 <= label_id < len(names):
                    name = names[label_id]

                    # Update last seen time and greeting flag every 2 seconds
                    current_time = datetime.now()
                    if (current_time - last_data_read_time).total_seconds() >= 2:
                        last_seen_data[name]['last_seen'] = current_time
                        if not last_seen_data[name]['greeted']:
                            greet_person(name)
                            last_seen_data[name]['greeted'] = True
                        last_data_read_time = current_time
                else:
                    name = "unknown"

                cv2.putText(
                    img,
                    str(name),
                    (x + 5, y - 5),
                    font,
                    1,
                    (255, 255, 255),
                    2
                )
                cv2.putText(
                    img,
                    str(confidence_text),
                    (x + 5, y + h - 5),
                    font,
                    1,
                    (255, 255, 0),
                    1
                )

            # Reset greeting flags for people who have been absent for more than 2 minutes
            now = datetime.now()
            for seen in last_seen_data.values():
                if now - seen['last_seen'] > timedelta(minutes=2):
                    seen['greeted'] = False

            cv2.imshow('Jarvis Facial Recognition', img)
            k = cv2.waitKey(10) & 0xff
            if k == 27:  # Esc
                break
    finally:
        cam.release()
        cv2.destroyAllWindows()
    
# Start the camera loop on a background thread as soon as the module is executed,
# before main() is called.
# NOTE(review): the thread is not created with daemon=True, so it presumably
# keeps the process alive after main() exits — confirm that is intended.
facial_recognition_thread = threading.Thread(target=facial_recognition)
facial_recognition_thread.start()
    
def execute_prompt(text, time, history, jarvis_setup):
    """Respond to one transcribed user utterance.

    Args:
        text: The transcribed utterance.
        time: When the utterance was captured (not used by this function).
        history: Chat message list; extended in place and pickled to disk
            when the utterance is forwarded to the chat model.
        jarvis_setup: Chat model; ``invoke(history)`` must return an object
            with a ``content`` attribute.

    Raises:
        SystemExit: When the utterance is ``"exit"``.
    """
    # No branch below assigns ``muted``; the declaration is kept so the
    # function's relationship to the module-level flag stays unchanged.
    global muted

    presence_checks = ("are you there", "you there", "are you up")
    stop_listening_phrases = ("I wasn't talking to you", "not you", "I'm not talking to you", "stop listening")
    stop_listening_messages = ["Alright sir.", "My apologies.", "My bad sir.", "My mistake.", "You literally said my name."]

    # Every branch echoes the utterance first.
    user_text(text)

    if text in presence_checks:
        jarvis_text("For you sir, always.")
        speak_text("For you sir, always.")

    elif text in stop_listening_phrases:
        chosen_stop_listening_message = random.choice(stop_listening_messages)
        jarvis_text(chosen_stop_listening_message)
        speak_text(chosen_stop_listening_message)

    elif text == "exit":
        jarvis_text("Good bye sir.")
        speak_text("Good bye sir.")
        sys.exit(0)

    else:
        history.append(HumanMessage(content = text))
        response = jarvis_setup.invoke(history)
        history.append(AIMessage(content = response.content))
        # NOTE(review): this writes relative to the current working directory,
        # whereas main() loads the history from an absolute path — confirm
        # both refer to the same file.
        with open('JarvisMemory.txt', 'wb') as file:
            pickle.dump(history, file)

        jarvis_text(response.content)
        speak_text(response.content)
        

def main():
    """Run the assistant: wait for the wake word, then converse until dismissed.

    Loads (or creates) the pickled chat history, then loops forever: the
    wake-word engine listens for "jarvis", the next utterance is recorded,
    transcribed and handed to ``execute_prompt``, and follow-up utterances
    are handled until a dismissal phrase is heard or nothing is transcribed.
    """
    global muted

    jarvis_setup = ChatOpenAI(openai_api_key=os.getenv("JARVIS"),
                              model="gpt-3.5-turbo",
                              temperature=0.7,
                              max_tokens=400)

    # Raw string: in a normal literal "\U" starts a unicode escape, so the
    # un-escaped Windows path is a SyntaxError.
    JarvisMemory_path = r"C:\Users\maste\OneDrive\Documents\Python\Jarvis\JarvisMemory.txt"

    if os.path.exists(JarvisMemory_path):
        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only load a memory file this program wrote itself.
        with open(JarvisMemory_path, "rb") as file:
            history = pickle.load(file)
    else:
        with open(JarvisMemory_path, "wb") as file:
            history = [SystemMessage(content="""You are a human-like virtual assistant named Jarvis. Use a conversational tone.
            All prompts and questions should be answered in a way that can be verbally said, meaning no lists or bullet points, et cetera.
            Answer all questions as shortly as possible, unless a longer, more detailed response is requested. Respond like you have emotions.
            Not too often, use personal opinion, or a bit of joking / sarcasm.
            Keep in mind that in some prompts, I'm not asking for your assistance, just saying hello or telling you a story.
            Don't promote your assistance in every response.""")]
            pickle.dump(history, file)

    recognizer = sr.Recognizer()
    microphone = sr.Microphone()

    greeting_messages = ["Greetings sir.", "Hello there.", "How can I help?", "What's up?", "At your service."]

    # Phrases that are executed and then end the current conversation.
    dismissal_phrases = (
        "I wasn't talking to you", "not you", "I'm not talking to you", "stop listening",
        "La Musica por favor", "play me some music", "drop the needle",
        "play music", "start music",
        "pause music", "stop music",
        "mute",
    )

    while True:
        with microphone as source:
            recognizer.adjust_for_ambient_noise(source)

            # Bind the names before the try so the finally block cannot hit
            # an unbound local when wake-word setup fails part-way.
            porcupine = None
            paud = None
            audio_stream = None
            try:
                porcupine = pvporcupine.create(access_key=os.getenv("PORCUPINE"), keywords=["jarvis"])
                paud = pyaudio.PyAudio()
                audio_stream = paud.open(rate=porcupine.sample_rate, channels=1, format=pyaudio.paInt16, input=True, frames_per_buffer=porcupine.frame_length)
                while True:
                    keyword = audio_stream.read(porcupine.frame_length)
                    keyword = struct.unpack_from("h"*porcupine.frame_length, keyword)
                    if porcupine.process(keyword) < 0:
                        continue  # no wake word in this frame

                    filename = "input.wav"

                    system_text("wake word detected")

                    chosen_greeting = random.choice(greeting_messages)
                    jarvis_text(chosen_greeting)
                    speak_text(chosen_greeting)

                    system_text("getting audio")
                    with sr.Microphone() as source:
                        recognizer.adjust_for_ambient_noise(source)
                        prompt = recognizer.listen(source, phrase_time_limit=None, timeout=None)
                        with open(filename, "wb") as f:
                            f.write(prompt.get_wav_data())

                    heard_at = datetime.now()
                    text = transcribe_audio_to_text(filename)

                    if not text:
                        system_text("going back to main loop")
                        break

                    if text in dismissal_phrases:
                        execute_prompt(text, heard_at, history, jarvis_setup)
                        system_text("going back to main loop")
                        break
                    elif not muted:
                        execute_prompt(text, heard_at, history, jarvis_setup)
                    elif text == "unmute":
                        user_text("You said: unmute")
                        jarvis_text("Jarvis unmuted.")
                        speak_text("Jarvis unmuted.")
                        muted = False
                    else:
                        jarvis_text("Jarvis is currently muted.")
                        speak_text("Jarvis is currently muted.")
                        system_text("going back to main loop")
                        break

                    # Follow-up turns: keep listening without the wake word.
                    # Leaving this loop resumes wake-word detection above.
                    while True:
                        with sr.Microphone() as source:
                            recognizer = sr.Recognizer()
                            system_text("listening")
                            prompt2 = recognizer.listen(source, phrase_time_limit=None, timeout=None)
                            with open(filename, "wb") as f:
                                f.write(prompt2.get_wav_data())

                        try:
                            text = transcribe_audio_to_text(filename)

                            if not text:
                                system_text("going back to main loop")
                                break

                            if text in dismissal_phrases:
                                execute_prompt(text, heard_at, history, jarvis_setup)
                                system_text("going back to main loop")
                                break
                            elif not muted:
                                execute_prompt(text, heard_at, history, jarvis_setup)
                            elif text == "unmute":
                                user_text(text)
                                jarvis_text("Jarvis unmuted.")
                                speak_text("Jarvis unmuted.")
                                muted = False
                            else:
                                jarvis_text("Jarvis is currently muted.")
                                speak_text("Jarvis is currently muted.")
                                system_text("going back to main loop")
                                break

                        except Exception as e:
                            # Keep the conversation alive, but report the
                            # failure instead of discarding it silently.
                            system_text(f"error while handling prompt: {e}")

            finally:
                if porcupine is not None:
                    porcupine.delete()
                if audio_stream is not None:
                    audio_stream.close()
                if paud is not None:
                    paud.terminate()
                    
# Script entry point: announce start-up and run the assistant loop.
if __name__ == "__main__":
    print("All systems are online.")
    main()

I tried shortening it as much as I could for readability. The full code is almost 900 lines.

At this point I wanted the confidence percentage to be higher and more accurate, with some other upgrades like using colored photos instead of grayscale, so with the help of some tutorials and ChatGPT, I made a new improved version:

import requests, os, subprocess, webbrowser, sys, pickle, random, pyaudio, pvporcupine, struct, cv2, threading, time, face_recognition
import win32gui, win32api, win32con
import speech_recognition as sr
from datetime import *
import numpy as np
from openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

# Global mute flag; main() declares it `global` and reads it.
muted = False

# OpenAI client used by speak_text() for text-to-speech; key comes from the environment.
tts = OpenAI(api_key = os.getenv("JARVIS_SPEECH"))

# Module-level placeholders for the wake-word resources.
# NOTE(review): main() assigns names of the same spelling without declaring
# them global, so these module-level values are never rebound by the visible code.
porcupine = None
paud = None
audio_stream = None

def user_text(text):
    """Print *text* as a ``User:`` line coloured bright red."""
    # "\033[91m" is the ANSI escape for bright red and "\033[0m" resets the
    # colour; without the "\0" prefix the codes are printed as literal text.
    print(f"\033[91mUser: {text}\033[0m") #red
    
def jarvis_text(text):
    """Print *text* as a ``Jarvis:`` line coloured bright cyan."""
    # "\033[96m" is the ANSI escape for bright cyan and "\033[0m" resets the
    # colour; without the "\0" prefix the codes are printed as literal text.
    print(f"\033[96mJarvis: {text}\033[0m") #cyan
    
def system_text(text):
    """Print *text* as a ``System:`` status line coloured bright green."""
    # "\033[92m" is the ANSI escape for bright green and "\033[0m" resets the
    # colour; without the "\0" prefix the codes are printed as literal text.
    print(f"\033[92mSystem: {text}\033[0m") #green

def transcribe_audio_to_text(filename):
    """Transcribe the audio file *filename* with Google's speech recognition.

    Returns:
        The recognised text, or ``None`` when the speech could not be
        understood or the recognition request failed.
    """
    system_text("transcribing")
    recognizer = sr.Recognizer()
    with sr.AudioFile(filename) as source:
        audio = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        # Nothing intelligible was heard; callers treat None as "no prompt".
        return None
    except (sr.RequestError, OSError) as err:
        # Service/network failure: stay best-effort, but say why instead of
        # hiding it (a bare ``except`` would also trap Ctrl+C and sys.exit).
        system_text(f"transcription failed: {err}")
        return None
    
# Shared PyAudio output stream; speak_text() writes the synthesised audio into it.
p = pyaudio.PyAudio()
# NOTE(review): format=8 is presumably pyaudio.paInt16 (16-bit samples) and
# 24 kHz mono presumably matches the TTS output — confirm against both APIs.
stream = p.open(format=8,
                channels=1,
                rate=24_000,
                output=True)

def speak_text(text):
    """Synthesise *text* with the TTS client and play it while it streams in.

    The audio is requested from the module-level ``tts`` client and every
    received chunk is written straight to the module-level output ``stream``.
    """
    request = tts.audio.speech.with_streaming_response.create(
        model="tts-1",
        voice="echo",
        input=text,
        response_format="wav",
    )
    with request as reply:
        play = stream.write
        # Forward the audio in 1 KiB pieces as they arrive.
        for piece in reply.iter_bytes(1024):
            play(piece)
            
# Initialize the face recognition model
def load_known_faces(folder_path):
    """Build reference face encodings from a folder of per-person sub-folders.

    Each sub-directory of *folder_path* is treated as one person, named after
    the directory. Every file inside it is loaded as an image and the first
    face encoding found in it, if any, is kept.

    Returns:
        A pair of parallel lists ``(encodings, names)``.
    """
    encodings, labels = [], []

    for entry in os.listdir(folder_path):
        subdir = os.path.join(folder_path, entry)
        if not os.path.isdir(subdir):
            continue  # loose files at the top level are ignored

        for file_name in os.listdir(subdir):
            picture = face_recognition.load_image_file(os.path.join(subdir, file_name))
            found = face_recognition.face_encodings(picture)
            if not found:
                continue  # no face detected in this image
            encodings.append(found[0])
            labels.append(entry)

    return encodings, labels

def greet_person(name):
    """Print one randomly chosen greeting addressed to *name*."""
    templates = ("Hello {}.", "Hi there, {}.", "Good to see you, {}.", "Greetings, {}.")
    print(random.choice(templates).format(name))

def facial_recognition():
    """Show a webcam preview, label detected faces and greet known people.

    Reference encodings are loaded from the ``known_faces`` folder. Runs
    until Esc is pressed in the preview window or the camera stops
    delivering frames; the camera and window are released on the way out,
    even if an error occurs.
    """
    cam = cv2.VideoCapture(0)
    cam.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
    cam.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

    known_face_encodings, known_face_names = load_known_faces("known_faces")

    last_seen_data = {name: {'last_seen': datetime.now(), 'greeted': False} for name in known_face_names}
    last_data_read_time = datetime.now()

    try:
        while True:
            ret, img = cam.read()
            if not ret:
                # A failed grab returns no image; flipping it would raise.
                system_text("camera frame could not be read; stopping facial recognition")
                break

            img = cv2.flip(img, 1)
            rgb_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            face_locations = face_recognition.face_locations(rgb_img)
            face_encodings = face_recognition.face_encodings(rgb_img, face_locations)

            for face_encoding, face_location in zip(face_encodings, face_locations):
                name = "unknown"
                confidence_percentage = 0

                # With no reference faces the distance array is empty and
                # np.argmin would raise, so matching is skipped entirely.
                if len(known_face_encodings) > 0:
                    matches = face_recognition.compare_faces(known_face_encodings, face_encoding)
                    face_distances = face_recognition.face_distance(known_face_encodings, face_encoding)
                    best_match_index = np.argmin(face_distances)

                    if matches[best_match_index]:
                        candidate_percentage = round((1 - face_distances[best_match_index]) * 100)

                        if candidate_percentage > 50:  # Threshold check
                            name = known_face_names[best_match_index]
                            confidence_percentage = candidate_percentage

                            # Update last seen time and greeting flag every 2 seconds
                            current_time = datetime.now()
                            if (current_time - last_data_read_time).total_seconds() >= 2:
                                last_seen_data[name]['last_seen'] = current_time
                                if not last_seen_data[name]['greeted']:
                                    greet_person(name)
                                    last_seen_data[name]['greeted'] = True
                                last_data_read_time = current_time

                top, right, bottom, left = face_location
                cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0), 2)
                cv2.putText(img, name, (left + 5, bottom + 20), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
                cv2.putText(img, f'{confidence_percentage}%', (left + 5, bottom + 45), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 0), 1)

            # Reset greeting flags for people who have been absent for more than 2 minutes
            now = datetime.now()
            for seen in last_seen_data.values():
                if now - seen['last_seen'] > timedelta(minutes=2):
                    seen['greeted'] = False

            cv2.imshow('Jarvis Facial Recognition', img)
            k = cv2.waitKey(10) & 0xff
            if k == 27:  # Esc
                break
    finally:
        cam.release()
        cv2.destroyAllWindows()

# Start the camera loop on a background thread as soon as the module is executed,
# before main() is called.
# NOTE(review): the thread is not created with daemon=True, so it presumably
# keeps the process alive after main() exits — confirm that is intended.
facial_recognition_thread = threading.Thread(target=facial_recognition)
facial_recognition_thread.start()
    
def execute_prompt(text, time, history, jarvis_setup):
    """Respond to one transcribed user utterance.

    Args:
        text: The transcribed utterance.
        time: When the utterance was captured (not used by this function).
        history: Chat message list; extended in place and pickled to disk
            when the utterance is forwarded to the chat model.
        jarvis_setup: Chat model; ``invoke(history)`` must return an object
            with a ``content`` attribute.

    Raises:
        SystemExit: When the utterance is ``"exit"``.
    """
    # No branch below assigns ``muted``; the declaration is kept so the
    # function's relationship to the module-level flag stays unchanged.
    global muted

    presence_checks = ("are you there", "you there", "are you up")

    # Every branch echoes the utterance first.
    user_text(text)

    if text in presence_checks:
        jarvis_text("For you sir, always.")
        speak_text("For you sir, always.")

    elif text == "exit":
        jarvis_text("Good bye sir.")
        speak_text("Good bye sir.")
        sys.exit(0)

    else:
        history.append(HumanMessage(content = text))
        response = jarvis_setup.invoke(history)
        history.append(AIMessage(content = response.content))
        # NOTE(review): this writes relative to the current working directory,
        # whereas main() loads the history from an absolute path — confirm
        # both refer to the same file.
        with open('JarvisMemory.txt', 'wb') as file:
            pickle.dump(history, file)

        jarvis_text(response.content)
        speak_text(response.content)
        

def main():
    """Run the assistant: wait for the wake word, then converse until dismissed.

    Loads (or creates) the pickled chat history, then loops forever: the
    wake-word engine listens for "jarvis", the next utterance is recorded,
    transcribed and handed to ``execute_prompt``, and follow-up utterances
    are handled until the dismissal phrase is heard or nothing is transcribed.
    """
    global muted

    jarvis_setup = ChatOpenAI(openai_api_key=os.getenv("JARVIS"),
                              model="gpt-3.5-turbo",
                              temperature=0.7,
                              max_tokens=400)

    # Raw string: in a normal literal "\U" starts a unicode escape, so the
    # un-escaped Windows path is a SyntaxError.
    JarvisMemory_path = r"C:\Users\maste\OneDrive\Documents\Python\Jarvis\JarvisMemory.txt"

    if os.path.exists(JarvisMemory_path):
        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only load a memory file this program wrote itself.
        with open(JarvisMemory_path, "rb") as file:
            history = pickle.load(file)
    else:
        with open(JarvisMemory_path, "wb") as file:
            history = [SystemMessage(content="""You are a human-like virtual assistant named Jarvis. Use a conversational tone.""")]
            pickle.dump(history, file)

    recognizer = sr.Recognizer()
    microphone = sr.Microphone()

    greeting_messages = ["Greetings sir.", "Hello there.", "How can I help?", "What's up?", "At your service."]

    # Phrase that is executed and then ends the current conversation.
    dismissal_phrase = "I wasn't talking to you"

    while True:
        with microphone as source:
            recognizer.adjust_for_ambient_noise(source)

            # Bind the names before the try so the finally block cannot hit
            # an unbound local when wake-word setup fails part-way.
            porcupine = None
            paud = None
            audio_stream = None
            try:
                porcupine = pvporcupine.create(access_key=os.getenv("PORCUPINE"), keywords=["jarvis"])
                paud = pyaudio.PyAudio()
                audio_stream = paud.open(rate=porcupine.sample_rate, channels=1, format=pyaudio.paInt16, input=True, frames_per_buffer=porcupine.frame_length)
                while True:
                    keyword = audio_stream.read(porcupine.frame_length)
                    keyword = struct.unpack_from("h"*porcupine.frame_length, keyword)
                    if porcupine.process(keyword) < 0:
                        continue  # no wake word in this frame

                    filename = "input.wav"

                    system_text("wake word detected")

                    chosen_greeting = random.choice(greeting_messages)
                    jarvis_text(chosen_greeting)
                    speak_text(chosen_greeting)

                    system_text("getting audio")
                    with sr.Microphone() as source:
                        recognizer.adjust_for_ambient_noise(source)
                        prompt = recognizer.listen(source, phrase_time_limit=None, timeout=None)
                        with open(filename, "wb") as f:
                            f.write(prompt.get_wav_data())

                    heard_at = datetime.now()
                    text = transcribe_audio_to_text(filename)

                    if not text:
                        system_text("going back to main loop")
                        break

                    if text == dismissal_phrase:
                        execute_prompt(text, heard_at, history, jarvis_setup)
                        system_text("going back to main loop")
                        break
                    elif not muted:
                        execute_prompt(text, heard_at, history, jarvis_setup)

                    # Follow-up turns: keep listening without the wake word.
                    # Leaving this loop resumes wake-word detection above.
                    while True:
                        with sr.Microphone() as source:
                            recognizer = sr.Recognizer()
                            system_text("listening")
                            prompt2 = recognizer.listen(source, phrase_time_limit=None, timeout=None)
                            with open(filename, "wb") as f:
                                f.write(prompt2.get_wav_data())

                        try:
                            text = transcribe_audio_to_text(filename)

                            if not text:
                                system_text("going back to main loop")
                                break

                            if text == dismissal_phrase:
                                execute_prompt(text, heard_at, history, jarvis_setup)
                                system_text("going back to main loop")
                                break
                            elif not muted:
                                execute_prompt(text, heard_at, history, jarvis_setup)

                        except Exception as e:
                            # Keep the conversation alive, but report the
                            # failure instead of discarding it silently.
                            system_text(f"error while handling prompt: {e}")

            finally:
                if porcupine is not None:
                    porcupine.delete()
                if audio_stream is not None:
                    audio_stream.close()
                if paud is not None:
                    paud.terminate()
                    
# Script entry point: announce start-up and run the assistant loop.
if __name__ == "__main__":
    print("All systems are online.")
    main()

This code is also shortened by a lot.

The problem I’m having is that when the camera feed window opens, the microphone repeatedly stops and then re-initializes at very short intervals, and I’m guessing the face_recognition module is the culprit.

Below is a video of what’s exactly happening:
https://drive.google.com/file/d/1032Y-tpjqby8X8BEpHwdIwjLOAthrjp8/view?usp=sharing
I forgot to say this in the video, but I have tried the working code while it was recognizing my face and yes, it works fine.

I have a feeling this isn’t a simple, straightforward issue. I mean, it’s uncommon for sure. I searched for a long time on the internet for similar issues and couldn’t find any. I really need some help.

Trang chủ Giới thiệu Sinh nhật bé trai Sinh nhật bé gái Tổ chức sự kiện Biểu diễn giải trí Dịch vụ khác Trang trí tiệc cưới Tổ chức khai trương Tư vấn dịch vụ Thư viện ảnh Tin tức - sự kiện Liên hệ Chú hề sinh nhật Trang trí YEAR END PARTY công ty Trang trí tất niên cuối năm Trang trí tất niên xu hướng mới nhất Trang trí sinh nhật bé trai Hải Đăng Trang trí sinh nhật bé Khánh Vân Trang trí sinh nhật Bích Ngân Trang trí sinh nhật bé Thanh Trang Thuê ông già Noel phát quà Biểu diễn xiếc khỉ Xiếc quay đĩa Dịch vụ tổ chức sự kiện 5 sao Thông tin về chúng tôi Dịch vụ sinh nhật bé trai Dịch vụ sinh nhật bé gái Sự kiện trọn gói Các tiết mục giải trí Dịch vụ bổ trợ Tiệc cưới sang trọng Dịch vụ khai trương Tư vấn tổ chức sự kiện Hình ảnh sự kiện Cập nhật tin tức Liên hệ ngay Thuê chú hề chuyên nghiệp Tiệc tất niên cho công ty Trang trí tiệc cuối năm Tiệc tất niên độc đáo Sinh nhật bé Hải Đăng Sinh nhật đáng yêu bé Khánh Vân Sinh nhật sang trọng Bích Ngân Tiệc sinh nhật bé Thanh Trang Dịch vụ ông già Noel Xiếc thú vui nhộn Biểu diễn xiếc quay đĩa Dịch vụ tổ chức tiệc uy tín Khám phá dịch vụ của chúng tôi Tiệc sinh nhật cho bé trai Trang trí tiệc cho bé gái Gói sự kiện chuyên nghiệp Chương trình giải trí hấp dẫn Dịch vụ hỗ trợ sự kiện Trang trí tiệc cưới đẹp Khởi đầu thành công với khai trương Chuyên gia tư vấn sự kiện Xem ảnh các sự kiện đẹp Tin mới về sự kiện Kết nối với đội ngũ chuyên gia Chú hề vui nhộn cho tiệc sinh nhật Ý tưởng tiệc cuối năm Tất niên độc đáo Trang trí tiệc hiện đại Tổ chức sinh nhật cho Hải Đăng Sinh nhật độc quyền Khánh Vân Phong cách tiệc Bích Ngân Trang trí tiệc bé Thanh Trang Thuê dịch vụ ông già Noel chuyên nghiệp Xem xiếc khỉ đặc sắc Xiếc quay đĩa thú vị
Trang chủ Giới thiệu Sinh nhật bé trai Sinh nhật bé gái Tổ chức sự kiện Biểu diễn giải trí Dịch vụ khác Trang trí tiệc cưới Tổ chức khai trương Tư vấn dịch vụ Thư viện ảnh Tin tức - sự kiện Liên hệ Chú hề sinh nhật Trang trí YEAR END PARTY công ty Trang trí tất niên cuối năm Trang trí tất niên xu hướng mới nhất Trang trí sinh nhật bé trai Hải Đăng Trang trí sinh nhật bé Khánh Vân Trang trí sinh nhật Bích Ngân Trang trí sinh nhật bé Thanh Trang Thuê ông già Noel phát quà Biểu diễn xiếc khỉ Xiếc quay đĩa
Thiết kế website Thiết kế website Thiết kế website Cách kháng tài khoản quảng cáo Mua bán Fanpage Facebook Dịch vụ SEO Tổ chức sinh nhật