We are using pdf to load into our vector db and I am able to get answer from the llm but we have a requirement that we need to have the page content from the pdf.
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.readers.file import PyMuPDFReader
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
from pathlib import Path
from llama_index.core import Settings
import os
import json
from pathlib import Path
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
query = 'What is the topic of the document?' # Hardcoded query
apiKey = 'xxx' # Hardcoded Azure OpenAI API key
apiVersion = '2023-07-01' # Set the appropriate API version
azure_endpoint = 'xxx' # Your Azure OpenAI endpoint
llm = AzureOpenAI(
model="xxx",
deployment_name="xxx",
api_key=apiKey,
azure_endpoint=azure_endpoint,
api_version=apiVersion,
)
embed_model = AzureOpenAIEmbedding(
model="text-embedding-ada-002",
deployment_name="Embedding",
api_key=apiKey,
azure_endpoint=azure_endpoint,
api_version=apiVersion,
)
# Define the LLM
Settings.llm = llm
Settings.embed_model = embed_model
# Load documents using PyMuPDFReader
docs0 = PyMuPDFReader().load(file_path=Path("sample2.pdf")) # Replace with the correct path if downloading is needed
doc_text = "nn".join([d.get_content() for d in docs0])
docs = [Document(text=doc_text)]
# Split documents into chunks
node_parser = SentenceSplitter(chunk_size=1024)
base_nodes = node_parser.get_nodes_from_documents(docs)
# Create the embedding function using Azure OpenAI
# Create the vector store index
index = VectorStoreIndex(base_nodes, embed_model)
retriever = index.as_retriever(search_type="similarity", search_kwargs={"k": 2})
# Build prompt
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. You always HAVE TO say "thanks for asking!" at the end of the answer!
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)
# # Create a query engine
query_engine = index.as_query_engine()
# Get the answer
result = query_engine.query(query)
print(result)
We need reslut like this:
{
"answer": " Parkinson's disease",
"context": [
{
"name": "qna1718774871.255651\10.1038_s41531-018-0058-0.pdf",
"page": 6,
"pageContent": "ADDITIONAL INFORMATIONnSupplementary information accompanies the paper on the npj Parkinson ’s..."
},
],
"status": [
{
"paper_id": "10.1038/s41531-018-0058-0",
"status": "Success"
}
]
}
I am using Python, so want to know if there’s any library or anything which can be used to get this kind of a result.