I need access to the particular documents that have been retrieved by my retriever, so that I can find out their page numbers.
— the function where I create and store the embeddings:
def create_embeddings_for_pdf(pdf_id: str, pdf_path: str) -> None:
    """Load a PDF, split it into page-sized chunks, and index them.

    `load_and_split()` with no explicit splitter yields roughly
    page-sized Documents, so each chunk maps to a single page and the
    "page" metadata key can later be surfaced to the user.

    :param pdf_id: identifier stored on every chunk so retrieval can be
        filtered to this PDF.
    :param pdf_path: filesystem path of the PDF to index.
    """
    loader = PyPDFLoader(pdf_path)
    docs = loader.load_and_split()

    for doc in docs:
        # Merge into the existing metadata instead of replacing it, so
        # loader-provided keys (e.g. "source") are preserved.  Use
        # .get() so a chunk without a "page" key does not crash indexing.
        doc.metadata.update(
            {
                "page": doc.metadata.get("page", 0),
                # NOTE(review): duplicating the full page text into
                # metadata doubles storage in the vector store; keep it
                # only if downstream code reads metadata["text"].
                "text": doc.page_content,
                "pdf_id": pdf_id,
            }
        )

    vector_store.add_documents(docs)
Note that I have commented out the text splitter because another Stack Overflow answer suggested making page-sized chunks instead of chunks of a fixed number of characters (i.e. 500).
— the function where I run the ConversationalRetrievalChain to actually get an answer to my input message:
from flask import Blueprint, g, request, Response, jsonify, stream_with_context
from app.web.hooks import login_required, load_model
from app.web.db.models import Pdf, Conversation
from app.chat import build_chat, ChatArgs
bp = Blueprint("conversation", __name__, url_prefix="/api/conversations")
@bp.route("/", methods=["GET"])
@login_required
@load_model(Pdf, lambda r: r.args.get("pdf_id"))
def list_conversations(pdf):
    """Return every conversation for the given PDF, serialized to dicts."""
    serialized = []
    for conversation in pdf.conversations:
        serialized.append(conversation.as_dict())
    return serialized
@bp.route("/", methods=["POST"])
@login_required
@load_model(Pdf, lambda r: r.args.get("pdf_id"))
def create_conversation(pdf):
    """Create a new conversation linking the current user to this PDF."""
    new_conversation = Conversation.create(
        user_id=g.user.id,
        pdf_id=pdf.id,
    )
    return new_conversation.as_dict()
@bp.route("/<string:conversation_id>/messages", methods=["POST"])
@login_required
@load_model(Conversation)
def create_message(conversation):
    """Answer a user message in a conversation, asking the LLM to cite a page.

    Prepends an instruction to the user's query so the model reports the
    page number of the document it used, then runs the chat chain.

    :param conversation: the Conversation loaded by @load_model from the URL.
    :return: JSON assistant message, or an SSE stream when ?stream=true.
    """
    # Renamed from `input`, which shadowed the builtin of the same name.
    user_input = f"In addition to your answer to the user's query, return the page number of the valid document you have used to arrive at the answer, in the following format - Page: <page_number>, output: <answer_to_user_query> {request.json.get('input')}"

    # Query-string values are strings: the previous
    # `request.args.get("stream", False)` returned the truthy string
    # "false" for ?stream=false.  Parse an explicit boolean instead.
    streaming = request.args.get("stream", "false").lower() in ("1", "true", "yes")

    pdf = conversation.pdf
    chat_args = ChatArgs(
        conversation_id=conversation.id,
        pdf_id=pdf.id,
        streaming=streaming,
        metadata={
            "conversation_id": conversation.id,
            "user_id": g.user.id,
            "pdf_id": pdf.id,
        },
    )

    chat = build_chat(chat_args)
    if not chat:
        return "Chat not yet implemented!"

    if streaming:
        return Response(
            stream_with_context(chat.stream(user_input)), mimetype="text/event-stream"
        )

    answer = chat.run(user_input)
    print(f"AI answer : {answer}")
    return jsonify({"role": "assistant", "content": answer})
Don't worry about the streaming-related code; I have not implemented it yet, so the else block runs each time.
— the definition of the chat chain:
from langchain.chains import ConversationalRetrievalChain
from app.chat.models import ChatArgs
from app.chat.vector_stores.pinecone import build_retriever
from app.chat.llms.chatopenai import build_llm
from app.chat.memories.sql_memory import build_memory
def build_chat(chat_args: ChatArgs):
    """Assemble the ConversationalRetrievalChain for one conversation.

    Builds the retriever, LLM, and SQL-backed memory from the supplied
    ChatArgs and wires them into a single chain.

    NOTE(review): to expose which retrieved documents (and thus page
    numbers) produced an answer, `return_source_documents=True` could be
    passed here — but the caller would then need to invoke the chain as
    `chain({"question": ...})` rather than `.run(...)`, and the memory
    would need an explicit `output_key`.  Confirm before enabling.
    """
    retriever = build_retriever(chat_args)
    llm = build_llm(chat_args)
    memory = build_memory(chat_args)

    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        memory=memory,
        retriever=retriever,
        verbose=True,
    )
My goal is to know the page number that the output has come from. To achieve this, I believed that both of the following are necessary:
- using page-sized chunks for the embeddings, and
- finding a way to access the documents selected by my retriever.
Please correct me if there is a better way of achieving this goal.
Ketan Kunkalikar is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.