I am using the LlamaIndex Pandas Query Engine to produce a pandas query that is applied to a dataframe. The query produced is correct; however, the dataframe returned is not. Specifically, a dataframe with only 10 rows is returned: the first 5 and the last 5 rows of the correct dataframe.
Looking at the logs, the correct filter is applied and the resulting dataframe is printed. Because the resulting dataframe has more than 10 rows, only its first 5 and last 5 rows are printed. It seems that this truncated printed representation, rather than the dataframe itself, is what is taken and passed back as the result.
How can I get the full filtered dataframe back? This is what my query pipeline looks like:
def setup_pipeline(self):
    """Assemble the two-stage query pipeline over ``self.df``.

    Stage one is the chain ``input -> pandas_prompt -> llm1 ->
    pandas_output_parser``. Stage two feeds the original query, the
    output of ``llm1`` and the output of ``pandas_output_parser`` into
    ``response_synthesis_prompt``, which is linked to ``llm2`` and then
    to ``output_parser``.

    Returns:
        The configured ``QP`` instance.
    """
    df_prompt = self.create_pandas_prompt(self.df)
    instruction_parser = PandasInstructionParser(self.df)
    structured_parser = PydanticOutputParser(RetrievalResponse)

    # The synthesis template string is passed through the Pydantic
    # parser's ``format`` before being wrapped in a PromptTemplate.
    synthesis_prompt = PromptTemplate(
        structured_parser.format(self.create_response_synthesis_str())
    )

    # The same LLM instance backs both stages.
    pipeline_modules = {
        "input": InputComponent(),
        "pandas_prompt": df_prompt,
        "llm1": self.llm,
        "pandas_output_parser": instruction_parser,
        "response_synthesis_prompt": synthesis_prompt,
        "llm2": self.llm,
        "output_parser": structured_parser,
    }
    pipeline = QP(modules=pipeline_modules, verbose=True)

    # Stage one: turn the user query into pandas code and run it.
    pipeline.add_chain(["input", "pandas_prompt", "llm1", "pandas_output_parser"])

    # Stage two inputs, each routed to a named slot of the synthesis prompt.
    # NOTE(review): whatever "pandas_output_parser" emits is forwarded as-is;
    # if that is a printed rendering of the result, pandas display limits may
    # truncate long frames to head/tail rows -- confirm against the parser.
    synthesis_inputs = [
        Link("input", "response_synthesis_prompt", dest_key="query_str"),
        Link("llm1", "response_synthesis_prompt", dest_key="pandas_instructions"),
        Link(
            "pandas_output_parser",
            "response_synthesis_prompt",
            dest_key="pandas_output",
        ),
    ]
    pipeline.add_links(synthesis_inputs)

    # Tail of stage two: prompt -> LLM -> structured output parser.
    for source, destination in (
        ("response_synthesis_prompt", "llm2"),
        ("llm2", "output_parser"),
    ):
        pipeline.add_link(source, destination)

    return pipeline
def create_pandas_prompt(self, df):
    """Build the prompt that asks the LLM for a pandas expression over ``df``.

    Args:
        df: DataFrame whose first rows are shown to the LLM as a preview.

    Returns:
        The result of ``PromptTemplate.partial_format`` with
        ``instruction_str`` and ``df_str`` already filled in;
        ``query_str`` remains a placeholder in the template.
    """
    # Embed only the head of the frame: the prompt text below labels this
    # preview as `print(df.head())`, and rendering the whole frame would make
    # the prompt grow with the data size.
    df_str = df.head().to_string()

    # Each instruction ends with a real newline ("\n"); a bare "n" would glue
    # the instructions together into a single run-on line.
    instruction_str = (
        "1. Convert the query to executable Python code using Pandas.\n"
        "2. The final line of code should be a Python expression that can be called with the `eval()` function.\n"
        "3. The code should represent a solution to the query.\n"
        "4. PRINT ONLY THE EXPRESSION.\n"
        "5. Do not quote the expression.\n"
    )
    pandas_prompt_str = (
        "You are working with a pandas dataframe in Python.\n"
        "The name of the dataframe is `df`.\n"
        "This is the result of `print(df.head())`:\n"
        "{df_str}\n\n"
        "Follow these instructions:\n"
        "{instruction_str}\n"
        "Query: {query_str}\n\n"
        "Expression:"
    )
    return PromptTemplate(pandas_prompt_str).partial_format(
        instruction_str=instruction_str, df_str=df_str
    )