I am using the Chroma vector DB to retrieve relevant chunks to answer a user query. I have created two Chroma DB instances: one uses the original chunks and the other uses compressed chunks. I compressed each chunk by removing articles and some stop words. Logically, the token count of the original prompt (built from the original chunks) should be greater than that of the compressed prompt (built from the compressed chunks), but the opposite often happens for the same k value (number of relevant documents retrieved), the same query, and the same chunk size. As a result, the prompt is not actually being compressed.
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Persist directory for the Chroma collection built from the compressed chunks.
CHROMA_PATH_COMPR = 'docs/chroma_22225311123335326326622123155783567999592220111221823527982/'
# Persist directory for the Chroma collection built from the original chunks.
CHROMA_PATH = 'docs/chroma_323333322253371255175333363916217279211239913512622611/'

# Shared splitter used to cut the loaded pages into chunks.
# The separators must be the newline escapes "\n\n" and "\n"; the bare strings
# "nn" and "n" would split the text on the letter "n" instead of on paragraph
# and line breaks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1400,
    chunk_overlap=700,
    separators=["\n\n", "\n", ". ", " ", ""],
)
def input_file():
    """Prompt for a file path and return the pages loaded from it.

    The path is read from standard input and handed to ``PyPDFLoader``;
    whatever ``loader.load()`` produces is returned unchanged.
    """
    pdf_path = input("Enter file path : ")
    return PyPDFLoader(pdf_path).load()
def get_original_chunks(pages):
    """Split the loaded pages into chunks with the module-level splitter.

    Args:
        pages: Documents to split (as returned by ``input_file``).

    Returns:
        The result of ``text_splitter.split_documents(pages)``.
    """
    return text_splitter.split_documents(pages)
def retrieve_relevant_chunks_original(orig_chunks, query):
    """Index the original chunks in Chroma and retrieve context for a query.

    Builds the "OriginalChunks" collection persisted under ``CHROMA_PATH``
    and queries it through an MMR retriever configured with ``k=5`` and
    ``lambda_mult=0.1``.

    Args:
        orig_chunks: Documents to index.
        query: The user query to retrieve context for.

    Returns:
        The documents returned by the retriever for ``query``.
    """
    # NOTE(review): from_documents is invoked on every call with a persist
    # directory -- confirm whether repeated runs add duplicate entries.
    mmr_settings = {'k': 5, 'lambda_mult': 0.1}
    original_store = Chroma.from_documents(
        documents=orig_chunks,
        embedding=getembeddings(),
        persist_directory=CHROMA_PATH,
        collection_name="OriginalChunks",
    )
    retriever = original_store.as_retriever(
        search_type="mmr",
        search_kwargs=mmr_settings,
    )
    return retriever.get_relevant_documents(query)
def get_compressed_chunks(orig_chunks, query):
    """Compress every chunk, index the result in Chroma, and retrieve context.

    Each chunk's text is passed through ``compressprompt``. The compressed
    text is stored on a *copy* of the chunk so that the caller's
    ``orig_chunks`` keep their original ``page_content``; overwriting the
    originals in place would leave the "original" and "compressed" pipelines
    sharing the same (compressed) documents.

    The compressed copies are indexed in the "CompressedChunks" collection
    persisted under ``CHROMA_PATH_COMPR`` and queried through an MMR
    retriever configured with ``k=5`` and ``lambda_mult=0.1``.

    Args:
        orig_chunks: Documents to compress; they are not modified.
        query: The user query to retrieve context for.

    Returns:
        The documents returned by the compressed-collection retriever.
    """
    # Local import keeps this block self-contained within the file.
    import copy

    compressed_chunks = []
    start_time = timeit.default_timer()
    for chunk in orig_chunks:
        # Shallow copy: page_content is rebound on the copy only, so the
        # caller's document is left untouched (metadata is still shared).
        compressed = copy.copy(chunk)
        compressed.page_content = compressprompt(chunk.page_content)
        compressed_chunks.append(compressed)
    # Only the compression loop is timed, not indexing or retrieval.
    time_compr = timeit.default_timer() - start_time
    print("Time taken to compress chunks = t", time_compr)

    db_chroma_compr = Chroma.from_documents(
        documents=compressed_chunks,
        embedding=getembeddings(),
        persist_directory=CHROMA_PATH_COMPR,
        collection_name="CompressedChunks",
    )
    vector_store_retriever_compr = db_chroma_compr.as_retriever(
        search_type="mmr",
        search_kwargs={'k': 5, 'lambda_mult': 0.1},
    )
    return vector_store_retriever_compr.get_relevant_documents(query)
user25185721 is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.