I am trying to run a similarity search to find the documents most similar to my query. However, the response does not include the document ID; it is a list of tuples in the following format:
(Document(page_content="", metadata={"embedding": [], "content": ""}), score)
I know each document has an ID, because adding a document returns its _id, which I then store in my PostgreSQL database.
Ways I found online:
- Include the _id in the metadata, but the _id is only generated after the document has been added, so I am not sure how that would work.
- Include the PostgreSQL row ID in the metadata, but I only add the document to PostgreSQL after adding it to OpenSearch (I would prefer not to change that order because of the application logic).
I need to get the _id of each similar document. Please let me know what else I can try.
This is my add function:
<code>def add_document(opensearch_client, index_name, embedding, content):
document_data = {
"embedding": embedding,
"content": content
}
response = opensearch_client.index(index=index_name, body=document_data)
logger.info("Document added to opensearch")
return response['_id']
</code>
<code>def add_document(opensearch_client, index_name, embedding, content):
document_data = {
"embedding": embedding,
"content": content
}
response = opensearch_client.index(index=index_name, body=document_data)
logger.info("Document added to opensearch")
return response['_id']
</code>
def add_document(opensearch_client, index_name, embedding, content):
    """Index one embedding/content pair and return the new document's ``_id``.

    Args:
        opensearch_client: Client object whose ``index`` method is called.
        index_name: Name of the index the document is written to.
        embedding: Value stored under the ``"embedding"`` field.
        content: Value stored under the ``"content"`` field.

    Returns:
        The ``_id`` entry of the response returned by ``index``.
    """
    payload = {"embedding": embedding, "content": content}
    result = opensearch_client.index(index=index_name, body=payload)
    logger.info("Document added to opensearch")
    return result["_id"]
This is my search function:
<code>def search_vector_db(query, _is_aoss=False):
session = boto3.Session()
credentials = session.get_credentials()
aws_auth = AWS4Auth(credentials.access_key, credentials.secret_key, "ap-southeast-1", 'es', session_token=credentials.token)
opensearch_endpoint = get_opensearch_endpoint("vector-kb", "ap-southeast-1")
docsearch = OpenSearchVectorSearch(
index_name="vector-kb-index",
embedding_function=get_openai_embedding_client(),
opensearch_url=f"https://{opensearch_endpoint}",
http_auth=aws_auth,
timeout=30,
is_aoss=_is_aoss,
connection_class=RequestsHttpConnection,
use_ssl=True,
verify_certs=True,
)
docs = docsearch.similarity_search_with_score(
query,
search_type="script_scoring",
space_type="cosinesimil",
vector_field="embedding",
text_field="content",
score_threshold=1.5
)
contexts = []
for doc in docs:
logger.info(doc)
contexts.append(doc[0].page_content)
logger.info("Similar documents retrieved from Opensearch for context")
return contexts
</code>
<code>def search_vector_db(query, _is_aoss=False):
session = boto3.Session()
credentials = session.get_credentials()
aws_auth = AWS4Auth(credentials.access_key, credentials.secret_key, "ap-southeast-1", 'es', session_token=credentials.token)
opensearch_endpoint = get_opensearch_endpoint("vector-kb", "ap-southeast-1")
docsearch = OpenSearchVectorSearch(
index_name="vector-kb-index",
embedding_function=get_openai_embedding_client(),
opensearch_url=f"https://{opensearch_endpoint}",
http_auth=aws_auth,
timeout=30,
is_aoss=_is_aoss,
connection_class=RequestsHttpConnection,
use_ssl=True,
verify_certs=True,
)
docs = docsearch.similarity_search_with_score(
query,
search_type="script_scoring",
space_type="cosinesimil",
vector_field="embedding",
text_field="content",
score_threshold=1.5
)
contexts = []
for doc in docs:
logger.info(doc)
contexts.append(doc[0].page_content)
logger.info("Similar documents retrieved from Opensearch for context")
return contexts
</code>
def search_vector_db(query, _is_aoss=False):
    """Return the text of the indexed documents most similar to ``query``.

    Args:
        query: Query text passed to ``similarity_search_with_score``.
        _is_aoss: Forwarded to the vector store as ``is_aoss``. It also
            selects the service name used to sign requests.

    Returns:
        A list holding the ``page_content`` of every hit, in the order the
        search returned them. Scores and metadata are logged, not returned.
    """
    region = "ap-southeast-1"
    # SigV4 requests must be signed for the service actually being called:
    # "aoss" for OpenSearch Serverless, "es" for a managed domain. Signing a
    # serverless request as "es" is rejected by the endpoint.
    service = "aoss" if _is_aoss else "es"

    session = boto3.Session()
    credentials = session.get_credentials()
    aws_auth = AWS4Auth(
        credentials.access_key,
        credentials.secret_key,
        region,
        service,
        session_token=credentials.token,
    )

    opensearch_endpoint = get_opensearch_endpoint("vector-kb", region)
    docsearch = OpenSearchVectorSearch(
        index_name="vector-kb-index",
        embedding_function=get_openai_embedding_client(),
        opensearch_url=f"https://{opensearch_endpoint}",
        http_auth=aws_auth,
        timeout=30,
        is_aoss=_is_aoss,
        connection_class=RequestsHttpConnection,
        use_ssl=True,
        verify_certs=True,
    )

    # Each hit is indexed as hit[0] for the Document; the whole hit is logged.
    hits = docsearch.similarity_search_with_score(
        query,
        search_type="script_scoring",
        space_type="cosinesimil",
        vector_field="embedding",
        text_field="content",
        score_threshold=1.5,
    )

    contexts = []
    for hit in hits:
        logger.info(hit)
        contexts.append(hit[0].page_content)

    logger.info("Similar documents retrieved from Opensearch for context")
    return contexts