Thiết kế website giá rẻ

Question

I’m working on a Python script that processes PDF files to extract the introduction section using the OpenAI API. The script reads the first few pages of a PDF, extracts the text, and sends it to GPT-4o to identify the introduction. While the text is correctly printed in the terminal, I’m encountering an error when trying to save it to a file: ‘ChatCompletionMessage’ object is not subscriptable.

2024-06-10 06:56:16,995 – ERROR – Error processing PDF /Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/pdfs/Waiting room time_ An opportunity for parental oral health education.pdf: ‘ChatCompletionMessage’ object is not subscriptable

import re
import os
import json
import logging
import requests
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from lxml import etree as ET
import openai

# Setup logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Path to the GROBID service
GROBID_PATH = '/Users/franciscoteixeirabarbosa/projects/test/sections_pdf/grobid'
GROBID_URL = 'http://localhost:8070'

def extract_introduction_from_pdf(pdf_path: str) -> str:
    """
    Extracts text from the first few pages of the PDF and uses GPT to find the introduction.
    """
    load_dotenv()

    INTRODUCTION_OUTPUT_DIR = "/Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/introduction_output"

    # Ensure the output directory exists
    os.makedirs(INTRODUCTION_OUTPUT_DIR, exist_ok=True)

    # Read the first three pages of the PDF
    with open(pdf_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        pdf_text = []
        for page_num in range(min(3, len(pdf_reader.pages))):
            page = pdf_reader.pages[page_num]
            pdf_text.append(page.extract_text())
        pdf_text = "n".join(pdf_text)

    # Prepare the prompt for the AI
    prompt = f"""
    Extract the introduction section from the following text, which is typically found between the abstract and the methods section:
    {pdf_text}
    """

    # Initialize the OpenAI client
    openai.api_key = os.getenv("OPENAI_API_KEY")

    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a document analysis assistant. Extract the introduction section."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=1500
    )

    # Properly access the choices from the response
    output_text = response.choices[0].message['content'].strip()

    # Define the output file path
    output_file_path = os.path.join(INTRODUCTION_OUTPUT_DIR, os.path.basename(pdf_path).replace('.pdf', '_introduction.txt'))

    # Write the output to a file
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write(output_text)

    return output_text

Error message:

2024-06-10 06:56:16,995 – ERROR – Error processing PDF /Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/pdfs/Waiting room time_ An opportunity for parental oral health education.pdf: ‘ChatCompletionMessage’ object is not subscriptable

Initial attempt:

output_text = response.choices[0].message['content'].strip()

Suggested Correction:

Based on OpenAI’s documentation, the response should be accessed using:

output_text = response.choices[0].message['content'].strip() # Incorrect
output_text = response.choices[0].message.content.strip()    # Correct

Below the full script:

import re
import os
import json
import logging
import requests
import xml.etree.ElementTree as ET
from dotenv import load_dotenv
from PyPDF2 import PdfFileReader
from io import BytesIO
import openai

# Setup logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Path to the GROBID service
GROBID_PATH = '/Users/franciscoteixeirabarbosa/projects/test/sections_pdf/grobid'
GROBID_URL = 'http://localhost:8070'

def start_grobid_service():
    try:
        response = requests.get(f'{GROBID_URL}/api/isalive', timeout=10)
        if response.status_code == 200:
            logging.info("GROBID service is already running.")
        else:
            logging.info("GROBID service is not running. Attempting to start it...")
            subprocess.Popen(['./gradlew', 'run', '--stacktrace'], cwd=GROBID_PATH)
            time.sleep(10)
            response = requests.get(f'{GROBID_URL}/api/isalive', timeout=10)
            if response.status_code == 200:
                logging.info("GROBID service started successfully.")
            else:
                logging.error("Failed to start GROBID service.")
    except requests.exceptions.RequestException as e:
        logging.error(f"Error checking GROBID service: {str(e)}")
        logging.info("Attempting to start GROBID service...")
        subprocess.Popen(['./gradlew', 'run', '--stacktrace'], cwd=GROBID_PATH)
        time.sleep(10)
        try:
            response = requests.get(f'{GROBID_URL}/api/isalive', timeout=10)
            if response.status_code == 200:
                logging.info("GROBID service started successfully.")
            else:
                logging.error("Failed to start GROBID service.")
        except requests.exceptions.RequestException as e:
            logging.error(f"Failed to start GROBID service: {str(e)}")

from lxml import etree as ET

def extract_introduction_from_xml(xml_root: ET.Element, namespace: dict) -> str:
    """
    Extracts the introduction from the XML root using the provided namespace.
    """
    # Locate the abstract element
    abstract_element = xml_root.find('.//tei:abstract', namespaces=namespace)
    if abstract_element is not None:
        # Get the next element after the abstract, which should be the introduction
        introduction_element = abstract_element.getnext()
        # Continue to the next element until 'Methods' is found
        introduction_text = []
        while introduction_element is not None and 'Methods' not in introduction_element.findtext('.//tei:head', namespaces=namespace, default=''):
            introduction_text.append(''.join(introduction_element.itertext()))
            introduction_element = introduction_element.getnext()
        if introduction_text:
            return 'n'.join(introduction_text).strip()
    return ""

from PyPDF2 import PdfReader

def extract_introduction_from_pdf(pdf_path: str) -> str:
    """
    Extracts text from the first few pages of the PDF and uses GPT to find the introduction.
    """
    load_dotenv()

    INTRODUCTION_OUTPUT_DIR = "/Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/introduction_output"

    # Ensure the output directory exists
    os.makedirs(INTRODUCTION_OUTPUT_DIR, exist_ok=True)

    # Read the first three pages of the PDF
    with open(pdf_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        pdf_text = []
        for page_num in range(min(3, len(pdf_reader.pages))):
            page = pdf_reader.pages[page_num]
            pdf_text.append(page.extract_text())
        pdf_text = "n".join(pdf_text)

    # Prepare the prompt for the AI
    prompt = f"""
    Extract the introduction section from the following text, which is typically found between the abstract and the methods section:
    {pdf_text}
    """

    # Initialize the OpenAI client
    openai.api_key = os.getenv("OPENAI_API_KEY")

    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a document analysis assistant. Extract the introduction section."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=1500
    )

    # Access the response content properly
    output_text = response.choices[0].message.content.strip()

    # Define the output file path
    output_file_path = os.path.join(INTRODUCTION_OUTPUT_DIR, os.path.basename(pdf_path).replace('.pdf', '_introduction.txt'))

    # Write the output to a file
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write(output_text)

    return output_text

def extract_metadata(xml_root: ET.Element, namespace: dict) -> dict:
    """
    Extracts metadata from the XML root, specifically the main article author, publication year, journal, and title.
    """
    metadata = {
        'title': None,
        'authors': [],
        'publication_year': None,
        'journal': None
    }

    # Extracting the title
    title_element = xml_root.find('.//tei:title[@type="main"]', namespace)
    if title_element is not None:
        metadata['title'] = title_element.text

    # Extracting main article authors
    main_article = xml_root.find('.//tei:analytic', namespace)
    if main_article is not None:
        for author in main_article.findall('.//tei:author/tei:persName', namespace):
            forename_element = author.find('.//tei:forename', namespace)
            surname_element = author.find('.//tei:surname', namespace)
            if forename_element is not None and surname_element is not None:
                forename = forename_element.text
                surname = surname_element.text
                metadata['authors'].append(f"{forename} {surname}")

    # Extracting publication year
    date_element = xml_root.find('.//tei:imprint/tei:date[@type="published"]', namespace)
    if date_element is not None:
        metadata['publication_year'] = date_element.get('when')

    # Extracting journal title
    journal_title_element = xml_root.find('.//tei:monogr/tei:title[@level="j"]', namespace)
    if journal_title_element is not None:
        metadata['journal'] = journal_title_element.text

    return metadata

def generate_metadata(pdf_path: str, output_dir: str, grobid_url: str) -> dict:
    """
    Generates metadata for a given PDF file.

    Parameters:
    pdf_path (str): The path to the PDF file.
    output_dir (str): The directory to save the output.
    grobid_url (str): The URL of the GROBID service.

    Returns:
    dict: The extracted metadata.
    """
    try:
        with open(pdf_path, 'rb') as f:
            response = requests.post(f'{grobid_url}/api/processHeaderDocument', files={'input': f}, timeout=10)
            response.raise_for_status()

        # Save the XML output
        xml_output_path = os.path.join(output_dir, 'output.xml')
        with open(xml_output_path, 'w', encoding='utf-8') as f:
            f.write(response.text)

        # Parse the XML output
        tree = ET.parse(xml_output_path)
        root = tree.getroot()
        ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

        # Extract the metadata
        metadata = extract_metadata(root, ns)

        # Define the output file path using the title
        sanitized_title = re.sub(r'[^ws-]', '', metadata['title']).strip().replace(' ', '_')
        metadata_output_path = os.path.join(output_dir, f'{sanitized_title}_metadata.json')

        # Write the metadata to the output file
        with open(metadata_output_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=4)

        logging.info(f"Metadata has been written to {metadata_output_path}")
        return metadata
    except Exception as e:
        logging.error(f"Error generating metadata for PDF {pdf_path}: {str(e)}")
        return None
    
def extract_introduction_from_pdf(pdf_path: str) -> str:
    """
    Extracts text from the first few pages of the PDF and uses GPT to find the introduction.
    """
    load_dotenv()

    INTRODUCTION_OUTPUT_DIR = "/Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/introduction_output"

    # Read the first three pages of the PDF
    with open(pdf_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        pdf_text = []
        for page_num in range(min(3, len(pdf_reader.pages))):
            page = pdf_reader.pages[page_num]
            pdf_text.append(page.extract_text())
        pdf_text = "n".join(pdf_text)

    # Prepare the prompt for the AI
    prompt = f"""
    Extract the introduction section from the following text, which is typically found between the abstract and the methods section:
    {pdf_text}
    """

    # Initialize the OpenAI client
    openai.api_key = os.getenv("OPENAI_API_KEY")

    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a document analysis assistant. Extract the introduction section."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=3000
    )

    # Properly access the choices from the response
    output_text = response.choices[0].message['content'].strip()
    output_file_path = os.path.join(INTRODUCTION_OUTPUT_DIR, os.path.basename(pdf_path).replace('.pdf', '_introduction.txt'))
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write(output_text)
    return output_text



def process_pdf_for_introduction(pdf_path: str, output_dir: str, grobid_url: str, metadata: dict):
    """
    Processes a PDF file to extract and format the introduction, saving the result to the specified output directory.

    Parameters:
    pdf_path (str): The path to the PDF file.
    output_dir (str): The directory to save the output.
    grobid_url (str): The URL of the GROBID service.
    metadata (dict): The metadata of the article.
    """
    try:
        title = metadata.get('title', 'Unknown_Title')
        # Send the PDF to GROBID
        with open(pdf_path, 'rb') as f:
            response = requests.post(f'{grobid_url}/api/processFulltextDocument', files={'input': f}, timeout=10)
            response.raise_for_status()
        # Save the XML output
        xml_output_path = os.path.join(output_dir, 'output.xml')
        with open(xml_output_path, 'w', encoding='utf-8') as f:
            f.write(response.text)
        # Parse the XML output
        tree = ET.parse(xml_output_path)
        root = tree.getroot()
        ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
        # Extract the introduction text
        introduction_text = extract_introduction_from_xml(root, ns)
        if not introduction_text:
            introduction_text = extract_introduction_from_pdf(pdf_path)
        # Define the output file path using the title
        sanitized_title = re.sub(r'[^ws-]', '', title).strip().replace(' ', '_')
        output_file_path = os.path.join(output_dir, f'{sanitized_title}_introduction.txt')
        # Write the formatted introduction to the output file
        with open(output_file_path, 'w', encoding='utf-8') as f:
            f.write(introduction_text)
        logging.info(f"Introduction has been written to {output_file_path}")
    except Exception as e:
        logging.error(f"Error processing PDF {pdf_path}: {str(e)}")

def process_pdfs(pdf_dir: str, output_dir: str, grobid_url: str, metadata_dir: str):
    """
    Processes all PDF files in the given directory to extract metadata and the introduction section,
    and saves the extracted information to the specified output directory.

    Parameters:
    pdf_dir (str): The directory containing PDF files.
    output_dir (str): The directory to save the extracted information.
    grobid_url (str): The URL of the GROBID service.
    metadata_dir (str): The directory to save metadata files.
    """
    pdf_files = [os.path.join(pdf_dir, f) for f in os.listdir(pdf_dir) if f.endswith('.pdf')]
    for pdf_path in pdf_files:
        logging.info(f"Processing file: {pdf_path}")
        
        # Generate or load metadata
        metadata_path = os.path.join(metadata_dir, f"{os.path.splitext(os.path.basename(pdf_path))[0]}_metadata.json")
        if not os.path.exists(metadata_path):
            logging.info(f"Metadata file not found for {pdf_path}. Generating metadata...")
            metadata = generate_metadata(pdf_path, metadata_dir, grobid_url)
            if metadata is None:
                continue
        else:
            with open(metadata_path, 'r', encoding='utf-8') as f:
                metadata = json.load(f)
        
        # Process the PDF to extract the introduction
        process_pdf_for_introduction(pdf_path, output_dir, grobid_url, metadata)

if __name__ == "__main__":
    PDF_DIR = "/Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/pdfs"
    OUTPUT_DIR = "/Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/introduction_output"
    METADATA_DIR = "/Users/franciscoteixeirabarbosa/Dropbox/Science in Dentistry APP/pdf_extractor/sections_extractor/metadata_output"
    
    # Ensure the output directory exists
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    os.makedirs(METADATA_DIR, exist_ok=True)
    
    # Start the GROBID service
    start_grobid_service()
    
    # Process the PDFs
    process_pdfs(PDF_DIR, OUTPUT_DIR, GROBID_URL, METADATA_DIR)

Danh mục