I’m working on a Python script that processes PDF files to extract the introduction section using the OpenAI API. The script reads the first few pages of a PDF, extracts the text, and sends it to GPT-4o to identify the introduction. While the text is correctly printed in the terminal, I’m encountering an error when trying to save it to a file: ‘ChatCompletionMessage’ object is not subscriptable.
2024-06-10 06:56:16,995 – ERROR – Error processing PDF /Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/pdfs/Waiting room time_ An opportunity for parental oral health education.pdf: ‘ChatCompletionMessage’ object is not subscriptable
import re
import os
import json
import logging
import requests
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from lxml import etree as ET
import openai
# Setup logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
# Path to the GROBID service
GROBID_PATH = '/Users/franciscoteixeirabarbosa/projects/test/sections_pdf/grobid'
GROBID_URL = 'http://localhost:8070'
def extract_introduction_from_pdf(pdf_path: str) -> str:
"""
Extracts text from the first few pages of the PDF and uses GPT to find the introduction.
"""
load_dotenv()
INTRODUCTION_OUTPUT_DIR = "/Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/introduction_output"
# Ensure the output directory exists
os.makedirs(INTRODUCTION_OUTPUT_DIR, exist_ok=True)
# Read the first three pages of the PDF
with open(pdf_path, 'rb') as file:
pdf_reader = PdfReader(file)
pdf_text = []
for page_num in range(min(3, len(pdf_reader.pages))):
page = pdf_reader.pages[page_num]
pdf_text.append(page.extract_text())
pdf_text = "n".join(pdf_text)
# Prepare the prompt for the AI
prompt = f"""
Extract the introduction section from the following text, which is typically found between the abstract and the methods section:
{pdf_text}
"""
# Initialize the OpenAI client
openai.api_key = os.getenv("OPENAI_API_KEY")
response = openai.ChatCompletion.create(
model="gpt-4o",
messages=[
{"role": "system", "content": "You are a document analysis assistant. Extract the introduction section."},
{"role": "user", "content": prompt}
],
max_tokens=1500
)
# Properly access the choices from the response
output_text = response.choices[0].message['content'].strip()
# Define the output file path
output_file_path = os.path.join(INTRODUCTION_OUTPUT_DIR, os.path.basename(pdf_path).replace('.pdf', '_introduction.txt'))
# Write the output to a file
with open(output_file_path, 'w', encoding='utf-8') as file:
file.write(output_text)
return output_text
Error message:
2024-06-10 06:56:16,995 – ERROR – Error processing PDF /Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/pdfs/Waiting room time_ An opportunity for parental oral health education.pdf: ‘ChatCompletionMessage’ object is not subscriptable
Initial attempt:
output_text = response.choices[0].message['content'].strip()
Suggested Correction:
Based on OpenAI’s documentation, the response should be accessed using:
output_text = response.choices[0].message['content'].strip() # Incorrect
output_text = response.choices[0].message.content.strip() # Correct
Below the full script:
import re
import os
import json
import logging
import requests
import xml.etree.ElementTree as ET
from dotenv import load_dotenv
from PyPDF2 import PdfFileReader
from io import BytesIO
import openai
# Setup logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
# Path to the GROBID service
GROBID_PATH = '/Users/franciscoteixeirabarbosa/projects/test/sections_pdf/grobid'
GROBID_URL = 'http://localhost:8070'
def start_grobid_service():
try:
response = requests.get(f'{GROBID_URL}/api/isalive', timeout=10)
if response.status_code == 200:
logging.info("GROBID service is already running.")
else:
logging.info("GROBID service is not running. Attempting to start it...")
subprocess.Popen(['./gradlew', 'run', '--stacktrace'], cwd=GROBID_PATH)
time.sleep(10)
response = requests.get(f'{GROBID_URL}/api/isalive', timeout=10)
if response.status_code == 200:
logging.info("GROBID service started successfully.")
else:
logging.error("Failed to start GROBID service.")
except requests.exceptions.RequestException as e:
logging.error(f"Error checking GROBID service: {str(e)}")
logging.info("Attempting to start GROBID service...")
subprocess.Popen(['./gradlew', 'run', '--stacktrace'], cwd=GROBID_PATH)
time.sleep(10)
try:
response = requests.get(f'{GROBID_URL}/api/isalive', timeout=10)
if response.status_code == 200:
logging.info("GROBID service started successfully.")
else:
logging.error("Failed to start GROBID service.")
except requests.exceptions.RequestException as e:
logging.error(f"Failed to start GROBID service: {str(e)}")
from lxml import etree as ET
def extract_introduction_from_xml(xml_root: ET.Element, namespace: dict) -> str:
"""
Extracts the introduction from the XML root using the provided namespace.
"""
# Locate the abstract element
abstract_element = xml_root.find('.//tei:abstract', namespaces=namespace)
if abstract_element is not None:
# Get the next element after the abstract, which should be the introduction
introduction_element = abstract_element.getnext()
# Continue to the next element until 'Methods' is found
introduction_text = []
while introduction_element is not None and 'Methods' not in introduction_element.findtext('.//tei:head', namespaces=namespace, default=''):
introduction_text.append(''.join(introduction_element.itertext()))
introduction_element = introduction_element.getnext()
if introduction_text:
return 'n'.join(introduction_text).strip()
return ""
from PyPDF2 import PdfReader
def extract_introduction_from_pdf(pdf_path: str) -> str:
"""
Extracts text from the first few pages of the PDF and uses GPT to find the introduction.
"""
load_dotenv()
INTRODUCTION_OUTPUT_DIR = "/Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/introduction_output"
# Ensure the output directory exists
os.makedirs(INTRODUCTION_OUTPUT_DIR, exist_ok=True)
# Read the first three pages of the PDF
with open(pdf_path, 'rb') as file:
pdf_reader = PdfReader(file)
pdf_text = []
for page_num in range(min(3, len(pdf_reader.pages))):
page = pdf_reader.pages[page_num]
pdf_text.append(page.extract_text())
pdf_text = "n".join(pdf_text)
# Prepare the prompt for the AI
prompt = f"""
Extract the introduction section from the following text, which is typically found between the abstract and the methods section:
{pdf_text}
"""
# Initialize the OpenAI client
openai.api_key = os.getenv("OPENAI_API_KEY")
response = openai.ChatCompletion.create(
model="gpt-4o",
messages=[
{"role": "system", "content": "You are a document analysis assistant. Extract the introduction section."},
{"role": "user", "content": prompt}
],
max_tokens=1500
)
# Access the response content properly
output_text = response.choices[0].message.content.strip()
# Define the output file path
output_file_path = os.path.join(INTRODUCTION_OUTPUT_DIR, os.path.basename(pdf_path).replace('.pdf', '_introduction.txt'))
# Write the output to a file
with open(output_file_path, 'w', encoding='utf-8') as file:
file.write(output_text)
return output_text
def extract_metadata(xml_root: ET.Element, namespace: dict) -> dict:
"""
Extracts metadata from the XML root, specifically the main article author, publication year, journal, and title.
"""
metadata = {
'title': None,
'authors': [],
'publication_year': None,
'journal': None
}
# Extracting the title
title_element = xml_root.find('.//tei:title[@type="main"]', namespace)
if title_element is not None:
metadata['title'] = title_element.text
# Extracting main article authors
main_article = xml_root.find('.//tei:analytic', namespace)
if main_article is not None:
for author in main_article.findall('.//tei:author/tei:persName', namespace):
forename_element = author.find('.//tei:forename', namespace)
surname_element = author.find('.//tei:surname', namespace)
if forename_element is not None and surname_element is not None:
forename = forename_element.text
surname = surname_element.text
metadata['authors'].append(f"{forename} {surname}")
# Extracting publication year
date_element = xml_root.find('.//tei:imprint/tei:date[@type="published"]', namespace)
if date_element is not None:
metadata['publication_year'] = date_element.get('when')
# Extracting journal title
journal_title_element = xml_root.find('.//tei:monogr/tei:title[@level="j"]', namespace)
if journal_title_element is not None:
metadata['journal'] = journal_title_element.text
return metadata
def generate_metadata(pdf_path: str, output_dir: str, grobid_url: str) -> dict:
"""
Generates metadata for a given PDF file.
Parameters:
pdf_path (str): The path to the PDF file.
output_dir (str): The directory to save the output.
grobid_url (str): The URL of the GROBID service.
Returns:
dict: The extracted metadata.
"""
try:
with open(pdf_path, 'rb') as f:
response = requests.post(f'{grobid_url}/api/processHeaderDocument', files={'input': f}, timeout=10)
response.raise_for_status()
# Save the XML output
xml_output_path = os.path.join(output_dir, 'output.xml')
with open(xml_output_path, 'w', encoding='utf-8') as f:
f.write(response.text)
# Parse the XML output
tree = ET.parse(xml_output_path)
root = tree.getroot()
ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
# Extract the metadata
metadata = extract_metadata(root, ns)
# Define the output file path using the title
sanitized_title = re.sub(r'[^ws-]', '', metadata['title']).strip().replace(' ', '_')
metadata_output_path = os.path.join(output_dir, f'{sanitized_title}_metadata.json')
# Write the metadata to the output file
with open(metadata_output_path, 'w', encoding='utf-8') as f:
json.dump(metadata, f, indent=4)
logging.info(f"Metadata has been written to {metadata_output_path}")
return metadata
except Exception as e:
logging.error(f"Error generating metadata for PDF {pdf_path}: {str(e)}")
return None
def extract_introduction_from_pdf(pdf_path: str) -> str:
"""
Extracts text from the first few pages of the PDF and uses GPT to find the introduction.
"""
load_dotenv()
INTRODUCTION_OUTPUT_DIR = "/Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/introduction_output"
# Read the first three pages of the PDF
with open(pdf_path, 'rb') as file:
pdf_reader = PdfReader(file)
pdf_text = []
for page_num in range(min(3, len(pdf_reader.pages))):
page = pdf_reader.pages[page_num]
pdf_text.append(page.extract_text())
pdf_text = "n".join(pdf_text)
# Prepare the prompt for the AI
prompt = f"""
Extract the introduction section from the following text, which is typically found between the abstract and the methods section:
{pdf_text}
"""
# Initialize the OpenAI client
openai.api_key = os.getenv("OPENAI_API_KEY")
response = openai.chat.completions.create(
model="gpt-4o",
messages=[
{"role": "system", "content": "You are a document analysis assistant. Extract the introduction section."},
{"role": "user", "content": prompt}
],
max_tokens=3000
)
# Properly access the choices from the response
output_text = response.choices[0].message['content'].strip()
output_file_path = os.path.join(INTRODUCTION_OUTPUT_DIR, os.path.basename(pdf_path).replace('.pdf', '_introduction.txt'))
with open(output_file_path, 'w', encoding='utf-8') as file:
file.write(output_text)
return output_text
def process_pdf_for_introduction(pdf_path: str, output_dir: str, grobid_url: str, metadata: dict):
"""
Processes a PDF file to extract and format the introduction, saving the result to the specified output directory.
Parameters:
pdf_path (str): The path to the PDF file.
output_dir (str): The directory to save the output.
grobid_url (str): The URL of the GROBID service.
metadata (dict): The metadata of the article.
"""
try:
title = metadata.get('title', 'Unknown_Title')
# Send the PDF to GROBID
with open(pdf_path, 'rb') as f:
response = requests.post(f'{grobid_url}/api/processFulltextDocument', files={'input': f}, timeout=10)
response.raise_for_status()
# Save the XML output
xml_output_path = os.path.join(output_dir, 'output.xml')
with open(xml_output_path, 'w', encoding='utf-8') as f:
f.write(response.text)
# Parse the XML output
tree = ET.parse(xml_output_path)
root = tree.getroot()
ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
# Extract the introduction text
introduction_text = extract_introduction_from_xml(root, ns)
if not introduction_text:
introduction_text = extract_introduction_from_pdf(pdf_path)
# Define the output file path using the title
sanitized_title = re.sub(r'[^ws-]', '', title).strip().replace(' ', '_')
output_file_path = os.path.join(output_dir, f'{sanitized_title}_introduction.txt')
# Write the formatted introduction to the output file
with open(output_file_path, 'w', encoding='utf-8') as f:
f.write(introduction_text)
logging.info(f"Introduction has been written to {output_file_path}")
except Exception as e:
logging.error(f"Error processing PDF {pdf_path}: {str(e)}")
def process_pdfs(pdf_dir: str, output_dir: str, grobid_url: str, metadata_dir: str):
"""
Processes all PDF files in the given directory to extract metadata and the introduction section,
and saves the extracted information to the specified output directory.
Parameters:
pdf_dir (str): The directory containing PDF files.
output_dir (str): The directory to save the extracted information.
grobid_url (str): The URL of the GROBID service.
metadata_dir (str): The directory to save metadata files.
"""
pdf_files = [os.path.join(pdf_dir, f) for f in os.listdir(pdf_dir) if f.endswith('.pdf')]
for pdf_path in pdf_files:
logging.info(f"Processing file: {pdf_path}")
# Generate or load metadata
metadata_path = os.path.join(metadata_dir, f"{os.path.splitext(os.path.basename(pdf_path))[0]}_metadata.json")
if not os.path.exists(metadata_path):
logging.info(f"Metadata file not found for {pdf_path}. Generating metadata...")
metadata = generate_metadata(pdf_path, metadata_dir, grobid_url)
if metadata is None:
continue
else:
with open(metadata_path, 'r', encoding='utf-8') as f:
metadata = json.load(f)
# Process the PDF to extract the introduction
process_pdf_for_introduction(pdf_path, output_dir, grobid_url, metadata)
if __name__ == "__main__":
PDF_DIR = "/Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/pdfs"
OUTPUT_DIR = "/Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/introduction_output"
METADATA_DIR = "/Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/metadata_output"
# Ensure the output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(METADATA_DIR, exist_ok=True)
# Start the GROBID service
start_grobid_service()
# Process the PDFs
process_pdfs(PDF_DIR, OUTPUT_DIR, GROBID_URL, METADATA_DIR)
Tuminha is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.