I am trying to run this code, which takes the contents of PDFs, converts them to vectors using an Ollama embedding model, and then saves them to a FAISS vector database.
import time

s = time.time()

from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter

# embedding models I considered: "llama:7b", "mxbai-embed-large"
ollama_emb = OllamaEmbeddings(
    model="mxbai-embed-large",
)
def read_doc(directory):
    # load every PDF in the directory into LangChain documents
    file_loader = PyPDFDirectoryLoader(directory)
    documents = file_loader.load()
    return documents

loaded_docs = read_doc("C:/Users/96399/temp/3eme/3e AR/Cours")

def chunk_data(docs, chunk_size=1000, chunk_overlap=50):
    # split the documents into overlapping chunks before embedding
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    doc_chunks = text_splitter.split_documents(docs)
    return doc_chunks

chunked_doc = chunk_data(docs=loaded_docs)

# embed every chunk with Ollama and build the FAISS index
db = FAISS.from_documents(chunked_doc, ollama_emb)
query = "Cancer Causes"
docs = db.similarity_search(query)
print(dir(db))  # debug: list the methods available on the store

d = time.time()
t = d - s
print("the total time is", t)

db.save_local("3eme_index")
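For completeness, this is roughly how I reload the saved index later (a minimal sketch; newer langchain_community releases require allow_dangerous_deserialization=True because the index metadata is pickled):

# hypothetical reload of the saved index; assumes the same
# embedding model is available at load time
new_db = FAISS.load_local(
    "3eme_index",
    ollama_emb,
    allow_dangerous_deserialization=True,
)
print(new_db.similarity_search("Cancer Causes")[0].page_content[:200])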
I am sometimes getting the following error:

ValueError: Error raised by inference API HTTP code: 500, {"error":"failed to generate embedding"}

The code is also taking too much time. Is there an issue with it, and how can I solve this problem?
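To try to isolate the 500 error, here is a minimal sketch that calls the embedding model directly, outside the FAISS pipeline (it assumes the default Ollama server at http://localhost:11434 and that mxbai-embed-large has already been pulled):

from langchain_community.embeddings import OllamaEmbeddings

# hypothetical standalone check: embed a single string
# outside the indexing pipeline
emb = OllamaEmbeddings(model="mxbai-embed-large")
vector = emb.embed_query("a short test sentence")
print(len(vector))  # mxbai-embed-large returns 1024-dimensional vectors

If this single call also fails intermittently, I assume the problem is on the Ollama server side rather than in the indexing code above.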