I would like to create a ChromaDB with csv in a folder. In each of the csv, each line is a document (text). I am using Gemini embedding model.
My code do run. But the kernel will die after around 100 documents.
It throws the error code as below:
[error] Disposing session as kernel process died ExitCode: 3221225477, Reason:
The The Chroma DB is created. My code as below:
# Set up the embedding model
embed_model = GeminiEmbedding(
model_name="models/embedding-001",
api_key=os.environ["GEMINI_API_KEY"],
)
# Create a ServiceContext
service_context = ServiceContext.from_defaults(
llm=llm,
embed_model=embed_model,
chunk_size=1000,
chunk_overlap=200,
)
def create_chroma_db_from_csv_folder(folder_path, db_path, collection_name):
# Initialize Chroma client
chroma_client = chromadb.PersistentClient(path=db_path)
# Create or get the collection with the correct embedding function
# chroma_collection = chroma_client.get_or_create_collection(
# name=collection_name
# )
chroma_collection = chroma_client.create_collection(name=collection_name)
# Create vector store
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# Initialize text splitter
splitter = SentenceSplitter(chunk_size=1000, chunk_overlap=200)
# Get all CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
for csv_file in csv_files:
file_path = os.path.join(folder_path, csv_file)
try:
df = pd.read_csv(file_path)
except Exception as e:
continue
# Assume the text column is named 'text'. Adjust if it's named differently.
if 'Body' not in df.columns:
continue
# Process each row in the dataframe
for index, row in tqdm(df.iterrows(), total=df.shape[0], desc=f"Processing {csv_file}"):
doc_id = str(uuid.uuid4()) # Generate a unique ID for each document
text = row['Body']
# Create metadata from other columns
metadata = row.drop('Body').to_dict()
# Create a Document object
document = Document(text=text, metadata=metadata, id_=doc_id)
# Insert the document into the index
VectorStoreIndex.from_documents(
[document],
storage_context=storage_context,
service_context=service_context
)
print(f"Chroma database created at {db_path}")
print(f"Total documents in collection: {chroma_collection.count()}")
db_path = "./chroma_db"
collection_name = "csv_documents"
create_chroma_db_from_csv_folder(folder_path, db_path, collection_name)
What I did wrong that cause the kernel to die?