pdf: ‘ChatCompletionMessage’ object is not subscriptable

I’m working on a Python script that processes PDF files to extract the introduction section using the OpenAI API. The script reads the first few pages of a PDF, extracts the text, and sends it to GPT-4o to identify the introduction. While the text is correctly printed in the terminal, I’m encountering an error when trying to save it to a file: ‘ChatCompletionMessage’ object is not subscriptable.

2024-06-10 06:56:16,995 – ERROR – Error processing PDF /Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/pdfs/Waiting room time_ An opportunity for parental oral health education.pdf: ‘ChatCompletionMessage’ object is not subscriptable

import re
import os
import json
import logging
import requests
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from lxml import etree as ET
import openai

# Setup logging
# DEBUG level, timestamped "time - level - message" lines on stderr.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Path to the GROBID service
# Local checkout of the GROBID source tree (used to launch the service via gradle).
GROBID_PATH = '/Users/franciscoteixeirabarbosa/projects/test/sections_pdf/grobid'
# Base URL of the locally running GROBID REST API.
GROBID_URL = 'http://localhost:8070'

def extract_introduction_from_pdf(pdf_path: str) -> str:
    """
    Extracts text from the first few pages of the PDF and uses GPT to find the introduction.

    Parameters:
    pdf_path (str): Path to the PDF file to process.

    Returns:
    str: The introduction text returned by the model (also written to a
    ``*_introduction.txt`` file in INTRODUCTION_OUTPUT_DIR).
    """
    load_dotenv()

    INTRODUCTION_OUTPUT_DIR = "/Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/introduction_output"

    # Ensure the output directory exists
    os.makedirs(INTRODUCTION_OUTPUT_DIR, exist_ok=True)

    # Read the first three pages of the PDF
    with open(pdf_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        pdf_text = []
        for page_num in range(min(3, len(pdf_reader.pages))):
            page = pdf_reader.pages[page_num]
            pdf_text.append(page.extract_text())
        # FIX: join pages with a real newline ("\n"), not the letter "n".
        pdf_text = "\n".join(pdf_text)

    # Prepare the prompt for the AI
    prompt = f"""
    Extract the introduction section from the following text, which is typically found between the abstract and the methods section:
    {pdf_text}
    """

    # Initialize the OpenAI client
    openai.api_key = os.getenv("OPENAI_API_KEY")

    # FIX: the legacy openai.ChatCompletion.create entry point was removed in
    # openai>=1.0 (the version in use, given the ChatCompletionMessage error);
    # use openai.chat.completions.create instead.
    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a document analysis assistant. Extract the introduction section."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=1500
    )

    # FIX: in openai>=1.0 the message is a ChatCompletionMessage object, not a
    # dict, so message['content'] raises "not subscriptable" -- use attributes.
    output_text = response.choices[0].message.content.strip()

    # Define the output file path
    output_file_path = os.path.join(INTRODUCTION_OUTPUT_DIR, os.path.basename(pdf_path).replace('.pdf', '_introduction.txt'))

    # Write the output to a file
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write(output_text)

    return output_text

Error message:

2024-06-10 06:56:16,995 – ERROR – Error processing PDF /Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/pdfs/Waiting room time_ An opportunity for parental oral health education.pdf: ‘ChatCompletionMessage’ object is not subscriptable

Initial attempt:

output_text = response.choices[0].message['content'].strip()

Suggested Correction:

Based on OpenAI’s documentation, in the v1.x Python SDK the response message is a `ChatCompletionMessage` (Pydantic model) object rather than a dict, so its content must be read via attribute access, not subscripting:

output_text = response.choices[0].message['content'].strip() # Incorrect
output_text = response.choices[0].message.content.strip()    # Correct

Below the full script:

import json
import logging
import os
import re
import subprocess
import time
import xml.etree.ElementTree as ET
from io import BytesIO

import openai
import requests
from dotenv import load_dotenv
from PyPDF2 import PdfFileReader

# Setup logging
# DEBUG level, timestamped "time - level - message" lines on stderr.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Path to the GROBID service
# Local checkout of the GROBID source tree (used to launch the service via gradle).
GROBID_PATH = '/Users/franciscoteixeirabarbosa/projects/test/sections_pdf/grobid'
# Base URL of the locally running GROBID REST API.
GROBID_URL = 'http://localhost:8070'

def start_grobid_service():
    """
    Ensure the GROBID REST service is running, launching it if necessary.

    Probes the /api/isalive endpoint; when the service is down or
    unreachable, starts GROBID via gradle from GROBID_PATH, waits a few
    seconds, then re-checks that it came up.

    FIX: the original body used ``subprocess`` and ``time`` without
    importing them (NameError); those imports are now at the top of the
    file. The duplicated launch-then-verify logic is factored into helpers.
    """
    def _is_alive() -> bool:
        # True when /api/isalive answers with HTTP 200. May raise
        # requests.exceptions.RequestException when unreachable.
        response = requests.get(f'{GROBID_URL}/api/isalive', timeout=10)
        return response.status_code == 200

    def _launch_and_verify():
        # Start GROBID in the background and confirm it booted.
        subprocess.Popen(['./gradlew', 'run', '--stacktrace'], cwd=GROBID_PATH)
        time.sleep(10)  # give gradle time to bring the service up
        try:
            if _is_alive():
                logging.info("GROBID service started successfully.")
            else:
                logging.error("Failed to start GROBID service.")
        except requests.exceptions.RequestException as e:
            logging.error(f"Failed to start GROBID service: {str(e)}")

    try:
        if _is_alive():
            logging.info("GROBID service is already running.")
        else:
            logging.info("GROBID service is not running. Attempting to start it...")
            _launch_and_verify()
    except requests.exceptions.RequestException as e:
        logging.error(f"Error checking GROBID service: {str(e)}")
        logging.info("Attempting to start GROBID service...")
        _launch_and_verify()

from lxml import etree as ET

def extract_introduction_from_xml(xml_root: ET.Element, namespace: dict) -> str:
    """
    Extracts the introduction from the XML root using the provided namespace.

    Walks the sibling elements that follow the TEI <abstract> (lxml
    ``getnext()``) and concatenates their text until a section whose
    heading contains 'Methods' is reached.

    Parameters:
    xml_root (ET.Element): Root of the parsed TEI document (lxml element).
    namespace (dict): Prefix-to-URI mapping, e.g. {'tei': 'http://www.tei-c.org/ns/1.0'}.

    Returns:
    str: The introduction text, or "" when no abstract/introduction is found.
    """
    # Locate the abstract element
    abstract_element = xml_root.find('.//tei:abstract', namespaces=namespace)
    if abstract_element is not None:
        # Get the next element after the abstract, which should be the introduction
        introduction_element = abstract_element.getnext()
        # Continue to the next element until 'Methods' is found
        introduction_text = []
        while introduction_element is not None and 'Methods' not in introduction_element.findtext('.//tei:head', namespaces=namespace, default=''):
            introduction_text.append(''.join(introduction_element.itertext()))
            introduction_element = introduction_element.getnext()
        if introduction_text:
            # FIX: join sections with a real newline ("\n"), not the letter "n".
            return '\n'.join(introduction_text).strip()
    return ""

from PyPDF2 import PdfReader

def extract_introduction_from_pdf(pdf_path: str) -> str:
    """
    Extracts text from the first few pages of the PDF and uses GPT to find the introduction.

    Parameters:
    pdf_path (str): Path to the PDF file to process.

    Returns:
    str: The introduction text returned by the model (also written to a
    ``*_introduction.txt`` file in INTRODUCTION_OUTPUT_DIR).
    """
    load_dotenv()

    INTRODUCTION_OUTPUT_DIR = "/Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/introduction_output"

    # Ensure the output directory exists
    os.makedirs(INTRODUCTION_OUTPUT_DIR, exist_ok=True)

    # Read the first three pages of the PDF
    with open(pdf_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        pdf_text = []
        for page_num in range(min(3, len(pdf_reader.pages))):
            page = pdf_reader.pages[page_num]
            pdf_text.append(page.extract_text())
        # FIX: join pages with a real newline ("\n"), not the letter "n".
        pdf_text = "\n".join(pdf_text)

    # Prepare the prompt for the AI
    prompt = f"""
    Extract the introduction section from the following text, which is typically found between the abstract and the methods section:
    {pdf_text}
    """

    # Initialize the OpenAI client
    openai.api_key = os.getenv("OPENAI_API_KEY")

    # FIX: the legacy openai.ChatCompletion.create entry point was removed in
    # openai>=1.0; use openai.chat.completions.create instead.
    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a document analysis assistant. Extract the introduction section."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=1500
    )

    # Access the response content properly: attribute access is required on
    # the ChatCompletionMessage object returned by openai>=1.0.
    output_text = response.choices[0].message.content.strip()

    # Define the output file path
    output_file_path = os.path.join(INTRODUCTION_OUTPUT_DIR, os.path.basename(pdf_path).replace('.pdf', '_introduction.txt'))

    # Write the output to a file
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write(output_text)

    return output_text

def extract_metadata(xml_root: ET.Element, namespace: dict) -> dict:
    """
    Extracts metadata from the XML root: the main article title, authors,
    publication year, and journal.

    Parameters:
    xml_root (ET.Element): Root of the parsed TEI header document.
    namespace (dict): Prefix-to-URI mapping, e.g. {'tei': 'http://www.tei-c.org/ns/1.0'}.

    Returns:
    dict: Keys 'title', 'authors', 'publication_year', 'journal'; missing
    fields stay None (authors stays an empty list).
    """
    meta = {
        'title': None,
        'authors': [],
        'publication_year': None,
        'journal': None
    }

    # Main article title
    title_el = xml_root.find('.//tei:title[@type="main"]', namespace)
    if title_el is not None:
        meta['title'] = title_el.text

    # Authors listed under the analytic (article-level) block only
    analytic = xml_root.find('.//tei:analytic', namespace)
    if analytic is not None:
        for pers in analytic.findall('.//tei:author/tei:persName', namespace):
            first = pers.find('.//tei:forename', namespace)
            last = pers.find('.//tei:surname', namespace)
            # Skip incomplete names (guard clause instead of nested ifs)
            if first is None or last is None:
                continue
            meta['authors'].append(f"{first.text} {last.text}")

    # Publication year comes from the 'when' attribute of the imprint date
    pub_date = xml_root.find('.//tei:imprint/tei:date[@type="published"]', namespace)
    if pub_date is not None:
        meta['publication_year'] = pub_date.get('when')

    # Journal title lives in the monograph block at journal level
    journal_el = xml_root.find('.//tei:monogr/tei:title[@level="j"]', namespace)
    if journal_el is not None:
        meta['journal'] = journal_el.text

    return meta

def generate_metadata(pdf_path: str, output_dir: str, grobid_url: str) -> dict:
    """
    Generates metadata for a given PDF file.

    Sends the PDF to GROBID's header-processing endpoint, parses the TEI
    response, and writes the extracted metadata to a JSON file named after
    the (sanitized) article title.

    Parameters:
    pdf_path (str): The path to the PDF file.
    output_dir (str): The directory to save the output.
    grobid_url (str): The URL of the GROBID service.

    Returns:
    dict: The extracted metadata, or None on failure.
    """
    try:
        with open(pdf_path, 'rb') as f:
            response = requests.post(f'{grobid_url}/api/processHeaderDocument', files={'input': f}, timeout=10)
            response.raise_for_status()

        # Save the XML output
        xml_output_path = os.path.join(output_dir, 'output.xml')
        with open(xml_output_path, 'w', encoding='utf-8') as f:
            f.write(response.text)

        # Parse the XML output
        tree = ET.parse(xml_output_path)
        root = tree.getroot()
        ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

        # Extract the metadata
        metadata = extract_metadata(root, ns)

        # FIX: the original pattern r'[^ws-]' lost its backslashes (it kept
        # only the literal characters w, s and -); r'[^\w\s-]' keeps word
        # characters, whitespace and hyphens. Also guard against a missing
        # title so re.sub is never called on None.
        title = metadata['title'] or 'Unknown_Title'
        sanitized_title = re.sub(r'[^\w\s-]', '', title).strip().replace(' ', '_')
        metadata_output_path = os.path.join(output_dir, f'{sanitized_title}_metadata.json')

        # Write the metadata to the output file
        with open(metadata_output_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=4)

        logging.info(f"Metadata has been written to {metadata_output_path}")
        return metadata
    except Exception as e:
        logging.error(f"Error generating metadata for PDF {pdf_path}: {str(e)}")
        return None
    
def extract_introduction_from_pdf(pdf_path: str) -> str:
    """
    Extracts text from the first few pages of the PDF and uses GPT to find the introduction.

    NOTE(review): this redefines extract_introduction_from_pdf from earlier
    in the file; being the last definition, it is the one used at runtime.

    Parameters:
    pdf_path (str): Path to the PDF file to process.

    Returns:
    str: The introduction text returned by the model (also written to a
    ``*_introduction.txt`` file in INTRODUCTION_OUTPUT_DIR).
    """
    load_dotenv()

    INTRODUCTION_OUTPUT_DIR = "/Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/introduction_output"

    # FIX: ensure the output directory exists (the other definitions do
    # this; this one omitted it and would fail on a fresh machine).
    os.makedirs(INTRODUCTION_OUTPUT_DIR, exist_ok=True)

    # Read the first three pages of the PDF
    with open(pdf_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        pdf_text = []
        for page_num in range(min(3, len(pdf_reader.pages))):
            page = pdf_reader.pages[page_num]
            pdf_text.append(page.extract_text())
        # FIX: join pages with a real newline ("\n"), not the letter "n".
        pdf_text = "\n".join(pdf_text)

    # Prepare the prompt for the AI
    prompt = f"""
    Extract the introduction section from the following text, which is typically found between the abstract and the methods section:
    {pdf_text}
    """

    # Initialize the OpenAI client
    openai.api_key = os.getenv("OPENAI_API_KEY")

    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a document analysis assistant. Extract the introduction section."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=3000
    )

    # FIX for the reported error: in openai>=1.0 the message is a
    # ChatCompletionMessage object, not a dict, so message['content'] raises
    # "'ChatCompletionMessage' object is not subscriptable" -- use attributes.
    output_text = response.choices[0].message.content.strip()
    output_file_path = os.path.join(INTRODUCTION_OUTPUT_DIR, os.path.basename(pdf_path).replace('.pdf', '_introduction.txt'))
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write(output_text)
    return output_text



def process_pdf_for_introduction(pdf_path: str, output_dir: str, grobid_url: str, metadata: dict):
    """
    Processes a PDF file to extract and format the introduction, saving the result to the specified output directory.

    Tries GROBID full-text extraction first; falls back to the GPT-based
    extractor when GROBID yields no introduction.

    Parameters:
    pdf_path (str): The path to the PDF file.
    output_dir (str): The directory to save the output.
    grobid_url (str): The URL of the GROBID service.
    metadata (dict): The metadata of the article.
    """
    try:
        title = metadata.get('title', 'Unknown_Title')
        # Send the PDF to GROBID
        with open(pdf_path, 'rb') as f:
            response = requests.post(f'{grobid_url}/api/processFulltextDocument', files={'input': f}, timeout=10)
            response.raise_for_status()
        # Save the XML output
        xml_output_path = os.path.join(output_dir, 'output.xml')
        with open(xml_output_path, 'w', encoding='utf-8') as f:
            f.write(response.text)
        # Parse the XML output
        tree = ET.parse(xml_output_path)
        root = tree.getroot()
        ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
        # Extract the introduction text; fall back to the GPT extractor
        introduction_text = extract_introduction_from_xml(root, ns)
        if not introduction_text:
            introduction_text = extract_introduction_from_pdf(pdf_path)
        # FIX: the original pattern r'[^ws-]' lost its backslashes; use
        # r'[^\w\s-]' to keep word characters, whitespace and hyphens.
        # Also guard against a None title stored in the metadata dict.
        sanitized_title = re.sub(r'[^\w\s-]', '', title or 'Unknown_Title').strip().replace(' ', '_')
        output_file_path = os.path.join(output_dir, f'{sanitized_title}_introduction.txt')
        # Write the formatted introduction to the output file
        with open(output_file_path, 'w', encoding='utf-8') as f:
            f.write(introduction_text)
        logging.info(f"Introduction has been written to {output_file_path}")
    except Exception as e:
        logging.error(f"Error processing PDF {pdf_path}: {str(e)}")

def process_pdfs(pdf_dir: str, output_dir: str, grobid_url: str, metadata_dir: str):
    """
    Processes all PDF files in the given directory to extract metadata and the introduction section,
    and saves the extracted information to the specified output directory.

    Parameters:
    pdf_dir (str): The directory containing PDF files.
    output_dir (str): The directory to save the extracted information.
    grobid_url (str): The URL of the GROBID service.
    metadata_dir (str): The directory to save metadata files.
    """
    for filename in os.listdir(pdf_dir):
        # Only PDF files are of interest
        if not filename.endswith('.pdf'):
            continue
        pdf_path = os.path.join(pdf_dir, filename)
        logging.info(f"Processing file: {pdf_path}")

        # Load cached metadata when present, otherwise generate it now
        stem = os.path.splitext(filename)[0]
        metadata_path = os.path.join(metadata_dir, f"{stem}_metadata.json")
        if os.path.exists(metadata_path):
            with open(metadata_path, 'r', encoding='utf-8') as f:
                metadata = json.load(f)
        else:
            logging.info(f"Metadata file not found for {pdf_path}. Generating metadata...")
            metadata = generate_metadata(pdf_path, metadata_dir, grobid_url)
            if metadata is None:
                # Metadata generation failed; skip this PDF entirely
                continue

        # Process the PDF to extract the introduction
        process_pdf_for_introduction(pdf_path, output_dir, grobid_url, metadata)

if __name__ == "__main__":
    # Input PDFs and output locations for introductions and metadata.
    PDF_DIR = "/Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/pdfs"
    OUTPUT_DIR = "/Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/introduction_output"
    METADATA_DIR = "/Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/metadata_output"

    # Make sure every output location exists before processing begins
    for directory in (OUTPUT_DIR, METADATA_DIR):
        os.makedirs(directory, exist_ok=True)

    # Bring up GROBID (launches it when not already running), then process
    start_grobid_service()
    process_pdfs(PDF_DIR, OUTPUT_DIR, GROBID_URL, METADATA_DIR)

New contributor

Tuminha is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.

Trang chủ Giới thiệu Sinh nhật bé trai Sinh nhật bé gái Tổ chức sự kiện Biểu diễn giải trí Dịch vụ khác Trang trí tiệc cưới Tổ chức khai trương Tư vấn dịch vụ Thư viện ảnh Tin tức - sự kiện Liên hệ Chú hề sinh nhật Trang trí YEAR END PARTY công ty Trang trí tất niên cuối năm Trang trí tất niên xu hướng mới nhất Trang trí sinh nhật bé trai Hải Đăng Trang trí sinh nhật bé Khánh Vân Trang trí sinh nhật Bích Ngân Trang trí sinh nhật bé Thanh Trang Thuê ông già Noel phát quà Biểu diễn xiếc khỉ Xiếc quay đĩa Dịch vụ tổ chức sự kiện 5 sao Thông tin về chúng tôi Dịch vụ sinh nhật bé trai Dịch vụ sinh nhật bé gái Sự kiện trọn gói Các tiết mục giải trí Dịch vụ bổ trợ Tiệc cưới sang trọng Dịch vụ khai trương Tư vấn tổ chức sự kiện Hình ảnh sự kiện Cập nhật tin tức Liên hệ ngay Thuê chú hề chuyên nghiệp Tiệc tất niên cho công ty Trang trí tiệc cuối năm Tiệc tất niên độc đáo Sinh nhật bé Hải Đăng Sinh nhật đáng yêu bé Khánh Vân Sinh nhật sang trọng Bích Ngân Tiệc sinh nhật bé Thanh Trang Dịch vụ ông già Noel Xiếc thú vui nhộn Biểu diễn xiếc quay đĩa Dịch vụ tổ chức tiệc uy tín Khám phá dịch vụ của chúng tôi Tiệc sinh nhật cho bé trai Trang trí tiệc cho bé gái Gói sự kiện chuyên nghiệp Chương trình giải trí hấp dẫn Dịch vụ hỗ trợ sự kiện Trang trí tiệc cưới đẹp Khởi đầu thành công với khai trương Chuyên gia tư vấn sự kiện Xem ảnh các sự kiện đẹp Tin mới về sự kiện Kết nối với đội ngũ chuyên gia Chú hề vui nhộn cho tiệc sinh nhật Ý tưởng tiệc cuối năm Tất niên độc đáo Trang trí tiệc hiện đại Tổ chức sinh nhật cho Hải Đăng Sinh nhật độc quyền Khánh Vân Phong cách tiệc Bích Ngân Trang trí tiệc bé Thanh Trang Thuê dịch vụ ông già Noel chuyên nghiệp Xem xiếc khỉ đặc sắc Xiếc quay đĩa thú vị
Trang chủ Giới thiệu Sinh nhật bé trai Sinh nhật bé gái Tổ chức sự kiện Biểu diễn giải trí Dịch vụ khác Trang trí tiệc cưới Tổ chức khai trương Tư vấn dịch vụ Thư viện ảnh Tin tức - sự kiện Liên hệ Chú hề sinh nhật Trang trí YEAR END PARTY công ty Trang trí tất niên cuối năm Trang trí tất niên xu hướng mới nhất Trang trí sinh nhật bé trai Hải Đăng Trang trí sinh nhật bé Khánh Vân Trang trí sinh nhật Bích Ngân Trang trí sinh nhật bé Thanh Trang Thuê ông già Noel phát quà Biểu diễn xiếc khỉ Xiếc quay đĩa
Thiết kế website Thiết kế website Thiết kế website Cách kháng tài khoản quảng cáo Mua bán Fanpage Facebook Dịch vụ SEO Tổ chức sinh nhật