I’m a beginner in the chatbot-development world, currently building a RAG script for a context-based chatbot, but I keep getting this error. I believe it happens when the text is being split, because even after the splitting function is called, the text still contains the “\n” separators.
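To show what I mean, here is a minimal sketch (my own guess at what ends up reaching the embeddings code, not something taken from the library) that reproduces the exact same error, since a dict has no replace() method:
<code>
# Hypothetical reproduction: the embedder receiving my whole input dict instead of a plain string
query = {"context": "Feedback negativo", "question": "Como você lida com feedback negativo?"}
query.replace("\n", " ")  # AttributeError: 'dict' object has no attribute 'replace'
</code>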
The last line of the traceback occurs in the huggingface library.
The traceback:
<code>Traceback (most recent call last):
  File "c:\Users\sophi\miniconda3\envs\ambiente3.9\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\sophi\miniconda3\envs\ambiente3.9\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "c:\Users\sophi\.vscode\extensions\ms-python.debugpy-2024.6.0-win32-x64\bundled\libs\debugpy\__main__.py", line 39, in <module>
    cli.main()
  File "c:\Users\sophi\.vscode\extensions\ms-python.debugpy-2024.6.0-win32-x64\bundled\libs\debugpy/..\debugpy\server\cli.py", line 430, in main
    run()
  File "c:\Users\sophi\.vscode\extensions\ms-python.debugpy-2024.6.0-win32-x64\bundled\libs\debugpy/..\debugpy\server\cli.py", line 284, in run_file
    runpy.run_path(target, run_name="__main__")
  File "c:\Users\sophi\.vscode\extensions\ms-python.debugpy-2024.6.0-win32-x64\bundled\libs\debugpy\_vendored\pydevd\_pydevd_bundle\pydevd_runpy.py", line 321, in run_path
    return _run_module_code(code, init_globals, run_name,
  File "c:\Users\sophi\.vscode\extensions\ms-python.debugpy-2024.6.0-win32-x64\bundled\libs\debugpy\_vendored\pydevd\_pydevd_bundle\pydevd_runpy.py", line 135, in _run_module_code
    _run_code(code, mod_globals, init_globals,
  File "c:\Users\sophi\.vscode\extensions\ms-python.debugpy-2024.6.0-win32-x64\bundled\libs\debugpy\_vendored\pydevd\_pydevd_bundle\pydevd_runpy.py", line 124, in _run_code
    exec(code, run_globals)
  File "c:\Users\sophi\Documents\ProjetosdePesquisa\Projeto-de-Pesquisa-SOLIRIS\llm_rag _ver4\utils\rag.py", line 130, in <module>
    main()
  File "c:\Users\sophi\Documents\ProjetosdePesquisa\Projeto-de-Pesquisa-SOLIRIS\llm_rag _ver4\utils\rag.py", line 126, in main
    response = qa.invoke({"input": {"context": context, "question": question}})
  File "c:\Users\sophi\miniconda3\envs\ambiente3.9\lib\site-packages\langchain_core\runnables\base.py", line 4588, in invoke
    return self.bound.invoke(
  File "c:\Users\sophi\miniconda3\envs\ambiente3.9\lib\site-packages\langchain_core\runnables\base.py", line 2505, in invoke
    input = step.invoke(input, config, **kwargs)
  File "c:\Users\sophi\miniconda3\envs\ambiente3.9\lib\site-packages\langchain_core\runnables\passthrough.py", line 469, in invoke
    return self._call_with_config(self._invoke, input, config, **kwargs)
  File "c:\Users\sophi\miniconda3\envs\ambiente3.9\lib\site-packages\langchain_core\runnables\base.py", line 1599, in _call_with_config
    context.run(
  File "c:\Users\sophi\miniconda3\envs\ambiente3.9\lib\site-packages\langchain_core\runnables\config.py", line 380, in call_func_with_variable_args
    return func(input, **kwargs) # type: ignore[call-arg]
  File "c:\Users\sophi\miniconda3\envs\ambiente3.9\lib\site-packages\langchain_core\runnables\passthrough.py", line 456, in _invoke
    **self.mapper.invoke(
  File "c:\Users\sophi\miniconda3\envs\ambiente3.9\lib\site-packages\langchain_core\runnables\base.py", line 3152, in invoke
    output = {key: future.result() for key, future in zip(steps, futures)}
  File "c:\Users\sophi\miniconda3\envs\ambiente3.9\lib\site-packages\langchain_core\runnables\base.py", line 3152, in <dictcomp>
    output = {key: future.result() for key, future in zip(steps, futures)}
  File "c:\Users\sophi\miniconda3\envs\ambiente3.9\lib\concurrent\futures\_base.py", line 446, in result
    return self.__get_result()
  File "c:\Users\sophi\miniconda3\envs\ambiente3.9\lib\concurrent\futures\_base.py", line 391, in __get_result
    raise self._exception
  File "c:\Users\sophi\miniconda3\envs\ambiente3.9\lib\concurrent\futures\thread.py", line 58, in run
    result = self.fn(*self.args, **self.kwargs)
  File "c:\Users\sophi\miniconda3\envs\ambiente3.9\lib\site-packages\langchain_core\runnables\base.py", line 4588, in invoke
    return self.bound.invoke(
  File "c:\Users\sophi\miniconda3\envs\ambiente3.9\lib\site-packages\langchain_core\runnables\base.py", line 2507, in invoke
    input = step.invoke(input, config)
  File "c:\Users\sophi\miniconda3\envs\ambiente3.9\lib\site-packages\langchain_core\retrievers.py", line 221, in invoke
    raise e
  File "c:\Users\sophi\miniconda3\envs\ambiente3.9\lib\site-packages\langchain_core\retrievers.py", line 214, in invoke
    result = self._get_relevant_documents(
  File "c:\Users\sophi\miniconda3\envs\ambiente3.9\lib\site-packages\langchain_core\vectorstores.py", line 797, in _get_relevant_documents
    docs = self.vectorstore.similarity_search(query, **self.search_kwargs)
  File "c:\Users\sophi\miniconda3\envs\ambiente3.9\lib\site-packages\langchain_community\vectorstores\chroma.py", line 349, in similarity_search
    docs_and_scores = self.similarity_search_with_score(
  File "c:\Users\sophi\miniconda3\envs\ambiente3.9\lib\site-packages\langchain_community\vectorstores\chroma.py", line 438, in similarity_search_with_score
    query_embedding = self._embedding_function.embed_query(query)
  File "c:\Users\sophi\miniconda3\envs\ambiente3.9\lib\site-packages\langchain_huggingface\embeddings\huggingface.py", line 102, in embed_query
    return self.embed_documents([text])[0]
  File "c:\Users\sophi\miniconda3\envs\ambiente3.9\lib\site-packages\langchain_huggingface\embeddings\huggingface.py", line 81, in embed_documents
    texts = list(map(lambda x: x.replace("\n", " "), texts))
  File "c:\Users\sophi\miniconda3\envs\ambiente3.9\lib\site-packages\langchain_huggingface\embeddings\huggingface.py", line 81, in <lambda>
    texts = list(map(lambda x: x.replace("\n", " "), texts))
AttributeError: 'dict' object has no attribute 'replace'
</code>
This is my entire code (except for the Groq API key):
<code>
import sys
import os
from langchain_core.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain.chains import create_retrieval_chain
from langchain_community.document_loaders import TextLoader
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.vectorstores import Chroma
from langchain.docstore.document import Document
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.llms import CTransformers

# Path to the PDF file
PDF_PATH = 'pdf_handling/entrevistas.pdf'
# Path where the ChromaDB data is persisted
CHROMA_DATA_PATH = "chroma_data"
# Embedding model
EMBED_MODEL = "all-MiniLM-L6-v2"
# Collection name
COLLECTION_NAME = "ruth_docs"

def dict_to_string(input_dict):
    # Convert the dictionary into a string representation.
    # This uses a list comprehension to create a list of "key: value" strings
    # and then joins them with a comma and a space.
    return ', '.join([f"{key}: {value}" for key, value in input_dict.items()])

# Function to extract text from a PDF and return a list of Document objects
def extract_text_from_pdf(file_path):
    try:
        with open(file_path, 'rb') as pdf_file:
            pdf = PyPDF2.PdfReader(pdf_file)
            paginas = len(pdf.pages)
            text = ""
            for i in range(paginas):
                page = pdf.pages[i]
                text += page.extract_text()
            # print(type(text))
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=500,
                chunk_overlap=50,
                length_function=len,
                separators=['\n\n\n', '\n\n', '\n', ' ', '']
            )
            documents = text_splitter.create_documents([text])
            splitted_documents = text_splitter.split_documents(documents)
            # print(documents)
            # print("---------------------- vs ---------------------")
            # print(splitted_documents)
            return splitted_documents
    except FileNotFoundError:
        print("Arquivo não encontrado")
        return []

class criar_vectordb:
    def save_db(self, documents, embeddings, db_path):
        self.db_path = db_path
        self.embeddings = embeddings
        self.documents = documents
        input = self.documents
        vectordb = Chroma.from_documents(input, self.embeddings, persist_directory=self.db_path)
        vectordb = None
        vectordb = Chroma(db_path, embeddings)
        return vectordb

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'})
# Extracting text from the PDF and building the vector database
documents = extract_text_from_pdf(PDF_PATH)
vectordb = criar_vectordb().save_db(documents, embeddings, CHROMA_DATA_PATH)

os.environ["GROQ_API_KEY"] = "-"

ruth_prompt_template = """
Você é um assistente virtual de RH utilizando documentos para embasar sua resposta sempre em fatos,
Use as informações presentes no documento para responder a resposta do candidato,
sua resposta deve ser o mais semelhante possível com a descrição presente nos documentos
contexto: {context}
pergunta: {question}
Apenas retorne as respostas úteis em ajudar na avaliação e seleção de candidatos e nada mais, usando uma linguagem gentil e empática.
Sempre responda em português, uma descrição em texto contínua, além disso adicione
um ou mais emojis às vezes para demonstrar empatia e emoção.
"""

prompt = PromptTemplate(template=ruth_prompt_template, input_variables=['context', 'question'])

'''
llm = CTransformers(
    model="model/llama-2-7b-chat.ggmlv3.q8_0.bin",
    model_type="llama",
    config={'max_new_tokens': 512,
            'temperature': 0.03,
            'context_length': 1000,
            'repetition_penalty': 1.15}
)
'''

llm = ChatGroq(model_name="llama3-70b-8192", api_key=os.environ["GROQ_API_KEY"])
retriever = vectordb.as_retriever(search_kwargs={"k": 2})
combine_docs_chain = create_stuff_documents_chain(
    llm, prompt
)
qa = create_retrieval_chain(retriever, combine_docs_chain)

# Main
def main():
    # Usage example
    context = "Feedback negativo"
    question = "Como você lida com feedback negativo?"
    response = qa.invoke({"input": {"context": context, "question": question}})
    print(response)

if __name__ == "__main__":
    main()
</code>
This is the huggingface embeddings file that the last frames of the traceback point to:
<code>from typing import Any, Dict, List, Optional

from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import BaseModel, Extra, Field

DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"


class HuggingFaceEmbeddings(BaseModel, Embeddings):
    """HuggingFace sentence_transformers embedding models.

    To use, you should have the ``sentence_transformers`` python package installed.

    Example:
        .. code-block:: python

            from langchain_huggingface import HuggingFaceEmbeddings

            model_name = "sentence-transformers/all-mpnet-base-v2"
            model_kwargs = {'device': 'cpu'}
            encode_kwargs = {'normalize_embeddings': False}
            hf = HuggingFaceEmbeddings(
                model_name=model_name,
                model_kwargs=model_kwargs,
                encode_kwargs=encode_kwargs
            )
    """

    client: Any  #: :meta private:
    model_name: str = DEFAULT_MODEL_NAME
    """Model name to use."""
    cache_folder: Optional[str] = None
    """Path to store models.
    Can be also set by SENTENCE_TRANSFORMERS_HOME environment variable."""
    model_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """Keyword arguments to pass to the Sentence Transformer model, such as `device`,
    `prompts`, `default_prompt_name`, `revision`, `trust_remote_code`, or `token`.
    See also the Sentence Transformer documentation: https://sbert.net/docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer"""
    encode_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """Keyword arguments to pass when calling the `encode` method of the Sentence
    Transformer model, such as `prompt_name`, `prompt`, `batch_size`, `precision`,
    `normalize_embeddings`, and more.
    See also the Sentence Transformer documentation: https://sbert.net/docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer.encode"""
    multi_process: bool = False
    """Run encode() on multiple GPUs."""
    show_progress: bool = False
    """Whether to show a progress bar."""

    def __init__(self, **kwargs: Any):
        """Initialize the sentence_transformer."""
        super().__init__(**kwargs)
        try:
            import sentence_transformers  # type: ignore[import]
        except ImportError as exc:
            raise ImportError(
                "Could not import sentence_transformers python package. "
                "Please install it with `pip install sentence-transformers`."
            ) from exc

        self.client = sentence_transformers.SentenceTransformer(
            self.model_name, cache_folder=self.cache_folder, **self.model_kwargs
        )

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Compute doc embeddings using a HuggingFace transformer model.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        import sentence_transformers  # type: ignore[import]

        texts = list(map(lambda x: x.replace("\n", " "), texts))
        if self.multi_process:
            pool = self.client.start_multi_process_pool()
            embeddings = self.client.encode_multi_process(texts, pool)
            sentence_transformers.SentenceTransformer.stop_multi_process_pool(pool)
        else:
            embeddings = self.client.encode(
                texts, show_progress_bar=self.show_progress, **self.encode_kwargs
            )

        return embeddings.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Compute query embeddings using a HuggingFace transformer model.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return self.embed_documents([text])[0]
</code>
While debugging, I tried using split_text() and split_documents() instead of create_documents(), and neither worked: they all give the same error, and the text still contains all of the “\n” characters. I don’t know if it could be something else in the code, as this is the only part that deals with separators.
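To double-check the splitter on its own, I ran a quick test like this (the sample string is made up, not from my actual PDF):
<code>
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Same settings I use in extract_text_from_pdf()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    separators=['\n\n\n', '\n\n', '\n', ' ', '']
)
documents = text_splitter.create_documents(["primeira linha\nsegunda linha\n\noutro parágrafo"])
for doc in documents:
    print(repr(doc.page_content))  # the "\n" characters are still inside the chunk
</code>
From what I can tell, the splitter only cuts the text into chunks at those separators and doesn’t strip the “\n” inside a chunk, so maybe the remaining newlines are expected and the real problem is somewhere else.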
Please help!
Thank you!