I have to build a chatbot based on my compnay data. The data is in a pdf file. I want the chatbot to be trained in such a way that it answers any query that has an answer in the document. I dont want any hardcoding in json file where i put the questions and answers myself. The chatbot should be trained to deal with any kind of question and answer from the document. It should be able to carry on conversation as well for example if it answers a question from document and any subsequent question is asked, it should be able to answer it.
I tried embedding and vector approach for this. Here is my code for embedding:
import openai
import json
from config import Config
from pinecone import Pinecone, ServerlessSpec
from langchain_openai import OpenAIEmbeddings # Updated import
# Set API keys and initialize services
openai.api_key = Config.OPENAI_API_KEY
api_key = Config.PINECONE_API_KEY
# Initialize Pinecone with the API key
pc = Pinecone(api_key=api_key)
# Load your data
with open('qa.json', 'r', encoding='utf-8') as f:
company_data = json.load(f)
# Initialize OpenAI embeddings with the API key
embeddings = OpenAIEmbeddings(openai_api_key=Config.OPENAI_API_KEY)
data_embeddings = []
for section, subsections in company_data.items():
for subsection, content in subsections.items():
# Use embed_documents to get embeddings for a list of documents
embed = embeddings.embed_documents([content])[0]
data_embeddings.append({'text': content, 'embedding': embed, 'section': section, 'subsection': subsection})
# Store Embeddings in Pinecone
index_name = 'company-info-index'
# Check if the index exists and create if not
if index_name not in pc.list_indexes().names():
pc.create_index(
name=index_name,
dimension=len(data_embeddings[0]['embedding']),
metric='euclidean',
spec=ServerlessSpec(
cloud='aws', # Use Amazon Web Services
region='us-east-1' # Use the us-east-1 region
)
)
index = pc.Index(index_name)
# Upsert data into the index
for i, data in enumerate(data_embeddings):
index.upsert([{
'id': f'id-{i}',
'values': data['embedding'],
'metadata': {'text': data['text'], 'section': data['section'], 'subsection': data['subsection']}
}])
And i used flask for making the bot. Here is the code for that:
import os
import json
from flask import Flask, request, Response
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
from dotenv import load_dotenv
from config import Config
import openai
from pinecone import Pinecone, ServerlessSpec
from langchain_openai import OpenAIEmbeddings
# Load environment variables from .env file
load_dotenv()
# Set API keys and initialize services
openai.api_key = Config.OPENAI_API_KEY
pinecone_api_key = Config.PINECONE_API_KEY
# Initialize Pinecone with the API key
pc = Pinecone(api_key=pinecone_api_key)
# Ensure the index exists
index_name = 'company-info-index'
if index_name not in pc.list_indexes().names():
pc.create_index(
name=index_name,
dimension=1536, # Adjust dimension based on your embeddings
metric='euclidean',
spec=ServerlessSpec(cloud='aws', region='us-east-1') # Adjust region as needed
)
index = pc.Index(index_name)
# Initialize OpenAI embeddings
embeddings = OpenAIEmbeddings(openai_api_key=Config.OPENAI_API_KEY)
app = Flask(__name__)
slack_token = os.getenv("SLACK_BOT_TOKEN")
slack_client = WebClient(token=slack_token)
# Dictionary to store conversation history and thread timestamps
conversation_history = {}
thread_timestamps = {}
@app.route("/slack/events", methods=["POST"])
def slack_events():
data = request.json
if "challenge" in data:
return Response(data["challenge"], mimetype="text/plain")
if "event" in data:
event = data["event"]
if event.get("type") == "app_mention" or event.get("type") == "message":
text = event.get("text").lower()
channel = event.get("channel")
user = event.get("user")
ts = event.get("ts")
thread_ts = event.get("thread_ts")
# Determine if we need to start a new thread or continue an existing one
if user not in thread_timestamps:
thread_timestamps[user] = ts
if thread_ts is None:
thread_ts = thread_timestamps[user]
# Retrieve embedding for the query
query_embedding = embeddings.embed_query(text)
print(f"Query Embedding: {query_embedding}") # Debug print
if not isinstance(query_embedding, list) or not all(isinstance(i, float) for i in query_embedding):
print(f"Invalid query embedding format: {query_embedding}")
return Response("Invalid query embedding format", status=500)
results = index.query(queries=[query_embedding], top_k=1)
print("Results:", results) # Debug print
response_text = "I'm sorry, I don't have an answer for that."
if results and 'matches' in results[0] and len(results[0]['matches']) > 0:
# Adjust based on the actual structure of `results`
match = results[0]['matches'][0]
if 'metadata' in match and 'text' in match['metadata']:
response_text = match['metadata']['text']
else:
print(f"Expected metadata.text in match but got: {match}")
# Store the conversation history
if user not in conversation_history:
conversation_history[user] = []
conversation_history[user].append({"role": "user", "content": text})
# Add response to the conversation history
conversation_history[user].append({"role": "assistant", "content": response_text})
# If no predefined answer is found, use OpenAI for a dynamic response
if response_text == "I'm sorry, I don't have an answer for that.":
try:
# Use conversation history to provide context
messages = [{"role": "system", "content": "You are a helpful assistant."}]
messages.extend(conversation_history[user])
gpt_response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=messages,
max_tokens=150,
temperature=0.9,
)
response_text = gpt_response['choices'][0]['message']['content'].strip()
# Update the conversation history with the new response
conversation_history[user].append({"role": "assistant", "content": response_text})
except openai.OpenAIError as e:
print(f"Error generating response: {e}")
response_text = "There was an error with the OpenAI API."
try:
slack_client.chat_postMessage(
channel=channel,
text=response_text,
thread_ts=thread_ts
)
except SlackApiError as e:
print(f"Error posting message: {e}")
return Response("OK", status=200)
if __name__ == "__main__":
app.run(port=3000, debug=True)
However, this is not working I am unable to get any response. Please lmk what to do and if there is any better approach for my bot let me know that too.
Thank you!