I have the following custom WebApiSkill:
# Runs of apostrophes, commas, dots and newlines, or runs of digits.
_NOISE_PATTERN = re.compile(r"[',.\n]+|\d+")
# Two or more consecutive whitespace characters.
_MULTI_SPACE_PATTERN = re.compile(r"\s{2,}")
# The literal sequence ". " repeated three or more times (e.g. dot leaders).
_DOT_LEADER_PATTERN = re.compile(r"(\. ){3,}")


def _clean_text(text: str) -> str:
    """Return *text* with punctuation noise, digits and extra whitespace removed.

    The steps run in the same order as the original inline code:

    1. Replace runs of ``'``, ``,``, ``.``, newlines, and runs of digits with
       a single space.
    2. Collapse runs of whitespace into one space and strip both ends.
    3. Drop sequences of ``". "`` repeated three or more times.
    """
    cleaned = _NOISE_PATTERN.sub(' ', text)
    cleaned = _MULTI_SPACE_PATTERN.sub(' ', cleaned).strip()
    # NOTE: step 1 already replaces every dot, so this pattern can no longer
    # match; it is kept so the cleaning pipeline mirrors the documented steps.
    return _DOT_LEADER_PATTERN.sub('', cleaned)


@app.route(route="CustomSplitSkill", auth_level=func.AuthLevel.FUNCTION)
def CustomSplitSkill(req: func.HttpRequest) -> func.HttpResponse:
    """Clean and chunk the text of every record in a custom-skill request.

    The request body must be JSON. Its top-level ``values`` list is iterated;
    for each record the ``recordId`` and ``data.text`` fields are read, the
    text is cleaned with ``_clean_text`` and then passed to
    ``split_text_into_chunks_with_overlap`` (defined elsewhere) with
    ``chunk_size=256`` and ``overlap_size=20``.

    Returns:
        A JSON response of the form
        ``{"values": [{"recordId": ..., "data": {"textItems": chunks}}, ...]}``,
        or a 400 response with a plain-text message when the body is not valid
        JSON or a ``ValueError`` is raised while processing the records.
    """
    logging.info('Python HTTP trigger function processed a request.')

    try:
        req_body = req.get_json()
    except ValueError:
        return func.HttpResponse("Invalid input", status_code=400)

    try:
        # 'values' is the expected top-level key in the request body.
        response_body = {"values": []}
        for value in req_body.get('values', []):
            record_id = value.get('recordId')
            # A missing or null 'data'/'text' is treated as empty text so the
            # regex calls never receive None.
            text = (value.get('data') or {}).get('text') or ''
            chunks = split_text_into_chunks_with_overlap(
                _clean_text(text), chunk_size=256, overlap_size=20
            )
            # One response record per input record, keyed by the same recordId.
            response_body['values'].append(
                {
                    "recordId": record_id,
                    "data": {
                        "textItems": chunks,
                    },
                }
            )
        return func.HttpResponse(json.dumps(response_body), mimetype="application/json")
    except ValueError:
        # Record the traceback so the failure can be diagnosed from the logs.
        logging.exception("CustomSplitSkill failed while processing records.")
        return func.HttpResponse("Function app crashed", status_code=400)
The inputs and outputs of this skill in the skillset are defined like this:
inputs=[
InputFieldMappingEntry(name="text", source="/document/content")
],
outputs=[
OutputFieldMappingEntry(name="textItems", target_name="pages")
],
How should I extract page information for each chunk?