Thiết kế website giá rẻ

Question

I have a dataset of sec edgar DEF 14A reports (1994-2022) which includes company’s symbol (Like AAPL for Apple Inc.), date the report is published (usually one a year), and the url of the report. I want my python to extract the Chief executive officer’s information from every report_url which is in the form of textual form, here’s the simple example of the text when I searched for cheif executive officer:
Dr. Scangos has served as Chief Executive Officer and a director of Vir Biotechnology, Inc. since January 2017. From July 2010 to January 2017, Dr. Scangos served as the Chief Executive Officer and a director of Biogen Inc., a biopharmaceutical company. From 1996 to July 2010, Dr. Scangos served as the President and Chief Executive Officer of Exelixis, Inc.
…
And so much more patterns in which they are written.

I want to extract all the CEOs names, and when they started their position and when they left the company. Which is in the textual form but I want python to save those information in the different col like this:
Name, Position started, Position ended
Paul, dec 2018, Jan 2021

Does anyone have any idea how can we do such things?

That’s the code I tried but it just saying No match found.

import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time
import os

# Load the combined DEF 14A data
print("Loading combined DEF 14A data...")
combined_data = pd.read_csv('combined_def14a_data.csv')

# Get the unique symbols
symbols = combined_data['symbol'].unique()

# Create a directory to store the results
output_dir = 'ceo_info_by_symbol'
os.makedirs(output_dir, exist_ok=True)

# Function to download and parse a DEF 14A report URL
def parse_def14a_report(url):
    response = requests.get(url)
    content_type = response.headers['Content-Type']

    if 'html' in content_type:
        soup = BeautifulSoup(response.content, 'html.parser')
        text = soup.get_text(separator=' ').strip()
    else:
        text = response.text

    ceo_info = []

    # Define the patterns to match CEO information
    patterns = [
        r'(?P<name>[ws.]+) has been (?:a|the) Chief Executive Officer of [ws]+ since (?P<start_date>[ws]+).',
        r'(?P<name>[ws.]+) served as Chief Executive Officer from (?P<start_date>[ws]+) to (?P<end_date>[ws]+).',
        r'(?P<name>[ws.]+), (?P<position>Chief Executive Officer) of [ws]+ since (?P<start_date>[ws]+)',
        r'(?P<name>[ws.]+) (?P<position>Chief Executive Officer) from (?P<start_date>[ws]+) to (?P<end_date>[ws]+)',
        r'(?P<name>[ws.]+) served as Chief Executive Officer of [ws]+ from (?P<start_date>[ws]+) to (?P<end_date>[ws]+)',
        r'(?P<name>[ws.]+) (?P<position>Chief Executive Officer) of [ws]+ from (?P<start_date>[ws]+) to (?P<end_date>[ws]+)',
    ]

    for pattern in patterns:
        for match in re.finditer(pattern, text):
            ceo_data = {
                'name': match.group('name'),
                'position': match.group('position') if 'position' in match.groupdict() else 'Chief Executive Officer',
                'start_date': match.group('start_date'),
                'end_date': match.group('end_date') if 'end_date' in match.groupdict() else None
            }
            ceo_info.append(ceo_data)

    # Log the text content for inspection (for debugging purposes)
    if not ceo_info:
        print(f'No matches found in: {url}')
        with open(f'parsed_text_{os.path.basename(url)}.txt', 'w', encoding='utf-8') as f:
            f.write(text[:10000])  # Write the first 10,000 characters for inspection

    return ceo_info

# Process each symbol individually
for symbol in symbols:
    print(f"Processing symbol: {symbol}")
    
    # Filter data for the current symbol
    symbol_data = combined_data[combined_data['symbol'] == symbol]
    
    # Initialize an empty list to hold CEO information for the current symbol
    all_ceo_info = []
    
    # Download and parse each report URL for the current symbol
    for i, row in symbol_data.iterrows():
        url = row['report_url']
        print(f'Parsing {i+1}/{len(symbol_data)}: {url}')
        ceo_info = parse_def14a_report(url)
        for info in ceo_info:
            info['symbol'] = row['symbol']
            info['filed_date'] = row['filed_date']
            all_ceo_info.append(info)
    
    # Save the CEO information for the current symbol to a CSV file
    if all_ceo_info:
        ceo_info_df = pd.DataFrame(all_ceo_info)
        ceo_info_df.to_csv(os.path.join(output_dir, f'{symbol}_ceo_info.csv'), index=False)
        print(f'CEO information for symbol {symbol} saved to {symbol}_ceo_info.csv')
    else:
        print(f'No CEO information found for symbol {symbol}')

    print(f"Completed processing for symbol: {symbol}")

print("All symbols processed.")`

Thiết kế website giá rẻ

Danh mục

Extracting specific keyword from URLs of the .htm or .txt with Python