Thiết kế website giá rẻ

Question

Problem Details

I am working on a personal project to scrape dynamic data from the Small and Medium Enterprise Agency’s database using Python and Selenium. The webpage displays company information in a “card” format, and clicking a button dynamically loads a table containing fee details.

However, I am encountering the following issues:

The button is detected and clicked successfully (verified with print statements).
The dynamic table data does not appear in driver.page_source, even after waiting for the table to load.

Target page: https://ma-shienkikan.go.jp/search

Below is the code that clicks the button and attempts to retrieve the table data:

def get_fee_details_optimized(card, fee_type, side):
    """Click a fee button inside ``card`` and return the page's first table.

    ``fee_type`` must be "FA" or "仲介"; it selects the button by its colour
    class. ``side`` is the text that precedes "側" in the button label.

    Returns the prettified HTML of the first ``<table>`` found in the page
    source after the click, "No table found" when the parsed page has none,
    or "N/A" when any step raises (the error is printed, not re-raised).
    """
    # (fee_type, colour class) pairs used to pick the right button.
    colour_classes = (("FA", "bg-yellow-500"), ("仲介", "bg-green-500"))

    try:
        button_color = next(
            (css for key, css in colour_classes if fee_type == key), None
        )
        if button_color is None:
            raise ValueError(f"Invalid fee_type: {fee_type}")

        # Locate the <a> button for the requested side, relative to the card.
        xpath = f".//a[contains(@class, '{button_color}') and contains(text(), '{side}側')]"
        button = card.find_element(By.XPATH, xpath)

        # Click through JavaScript rather than a native click.
        driver.execute_script("arguments[0].click();", button)

        # Block (up to 10 s) until some <table> exists in the DOM.
        table_locator = (By.TAG_NAME, "table")
        WebDriverWait(driver, 10).until(EC.presence_of_element_located(table_locator))

        # Re-parse the whole page and take its first table.
        table = BeautifulSoup(driver.page_source, "html.parser").find("table")
        return table.prettify() if table else "No table found"

    except Exception as e:
        print(f"Error retrieving fee details for {fee_type} {side}側: {e}")
        return "N/A"

Issues

Clicking the button does not result in the corresponding table data being retrieved.
The driver.page_source does not contain the dynamically loaded table data as expected.

What I Tried

Verified that the button is detected and clicked successfully using print statements.
Manually tested the button click in the browser to ensure the table data loads as expected.
Used WebDriverWait to allow sufficient time for the table to load.
Inspected the network activity in the browser’s developer tools to identify related API calls.

Hypotheses

The page may send additional network requests to fetch the table data, and Selenium may not capture the updated content.
The button click might not be correctly triggering the required event in the browser session.

Questions

How can I reliably retrieve the dynamically loaded table data after clicking the button with Selenium?
If the table data is fetched via an API call, how can I identify and directly retrieve the data?

Full Code

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

# Path to the ChromeDriver executable (machine-specific; adjust for your environment)
chromedriver_path = "/Users/kazatoy/PLEX/chromedriver"

# Chrome options
options = Options()
options.add_argument("--headless")  # Headless mode is enabled; comment this line out to watch the browser
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Set up the ChromeDriver service and start the browser session.
# `driver` is a module-level global that the functions below use directly.
service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service, options=options)

# Accumulates one list per scraped company card; turned into a DataFrame at the end
data = []

# Base URL of the search listing; the crawl appends ?page=N to it
base_url = "https://ma-shienkikan.go.jp/search"

# Format table data
def format_table_data(table_html):
    """Flatten a two-column fee table into newline-separated text.

    Args:
        table_html: HTML markup containing the table to read.

    Returns:
        One line per row that has exactly two ``<td>`` cells, formatted as
        ``"<first cell> <second cell>"`` and joined with newlines. Rows with
        any other cell count (e.g. header rows) are skipped. Returns an
        empty string when no such row exists.
    """
    soup = BeautifulSoup(table_html, "html.parser")

    # Tables written without an explicit <tbody> would make find() return
    # None; fall back to scanning every <tr> in the fragment instead of
    # raising AttributeError.
    body = soup.find("tbody")
    if body is None:
        body = soup

    formatted_rows = []
    for row in body.find_all("tr"):
        cols = row.find_all("td")
        if len(cols) == 2:
            # Extract range data and fee rate
            amount_range = cols[0].get_text(strip=True)
            rate = cols[1].get_text(strip=True)
            formatted_rows.append(f"{amount_range} {rate}")

    # Join with a real newline; the previous "n" separator glued the rows
    # together with a literal letter n.
    return "\n".join(formatted_rows)

# Function to click button and retrieve table data
def get_fee_details_optimized(card, fee_type, side):
    """Click one fee button on a company card and return its fee table as text.

    Args:
        card: Selenium element of the company card that contains the button.
        fee_type: "FA" or "仲介"; selects the button by its colour class.
        side: Text that precedes "側" in the button label (the caller passes
            "譲受" or "譲渡").

    Returns:
        The table formatted by ``format_table_data``, or "N/A" when any step
        fails (invalid ``fee_type``, button not found, no populated visible
        table within 10 seconds). Failures are printed, never raised, so one
        bad button does not stop the crawl.
    """
    try:
        # Determine button color based on fee_type
        if fee_type == "FA":
            button_color = "bg-yellow-500"
        elif fee_type == "仲介":
            button_color = "bg-green-500"
        else:
            raise ValueError(f"Invalid fee_type: {fee_type}")

        # Find the button based on side (either transferor or transferee)
        button = card.find_element(
            By.XPATH,
            f".//a[contains(@class, '{button_color}') and contains(text(), '{side}側')]"
        )

        # Click the button using JavaScript
        driver.execute_script("arguments[0].click();", button)
        print(f"Clicked Button: {fee_type} {side}側 using JavaScript.")

        # Local import keeps this function self-contained; the table may be
        # re-rendered while it is being inspected, which surfaces as a stale
        # element and should simply trigger another poll.
        from selenium.common.exceptions import StaleElementReferenceException

        def visible_table_with_rows(drv):
            """Return the first displayed <table> that has body rows, else False."""
            for candidate in drv.find_elements(By.TAG_NAME, "table"):
                if candidate.is_displayed() and candidate.find_elements(
                    By.CSS_SELECTOR, "tbody tr"
                ):
                    return candidate
            return False

        # Waiting for mere presence of any <table> is satisfied by hidden,
        # empty, or left-over tables. Wait for one that is actually shown and
        # populated, and keep the element so the same table is the one parsed.
        # NOTE(review): if a table opened by an earlier click stays visible,
        # it can still be matched first - confirm how the page closes them.
        table = WebDriverWait(
            driver, 10, ignored_exceptions=[StaleElementReferenceException]
        ).until(
            visible_table_with_rows,
            message=f"no visible, populated table after clicking {fee_type} {side}側",
        )

        # Read the live element's markup instead of re-parsing the whole
        # page source and guessing which <table> is the right one.
        return format_table_data(table.get_attribute("outerHTML"))

    except Exception as e:
        # Deliberate best-effort boundary: report and fall back to "N/A".
        print(f"Error retrieving fee details for {fee_type} {side}側: {e}")
        return "N/A"

# Crawl the listing pages, collect one row per company card, then save a CSV.
try:
    for page in range(1, 142):  # Adjust page range as needed
        print(f"Processing page {page}")

        try:
            driver.get(f"{base_url}?page={page}")

            # Wait for the cards to be present in the DOM
            WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "section-card")))
            cards = driver.find_elements(By.CLASS_NAME, "section-card")
        except Exception as e:
            # A page that times out or fails to load must not abort the whole
            # crawl: an exception escaping here would skip the CSV export
            # below and discard every row collected so far.
            print(f"Error loading page {page}: {e}")
            continue

        for card in cards:
            try:
                # Parse a static snapshot of the card for the text fields
                card_html = card.get_attribute('outerHTML')
                soup = BeautifulSoup(card_html, "html.parser")

                # Extract company name
                company_name = soup.find("h1", class_="text-base sm:text-lg py-4 font-bold bg-main text-white px-4 -mx-4 sm:-mx-8")
                company_name = company_name.find("div", class_="inline-block").get_text(strip=True) if company_name else "N/A"

                # Cards without a heading are skipped entirely
                if company_name == "N/A":
                    continue

                # Extract additional data; `string=` is the current name of
                # the Beautiful Soup text filter (`text=` is deprecated)
                support_type = soup.find("span", string="M&A支援機関の種類")
                support_type = support_type.find_next("span").get_text(strip=True) if support_type else "N/A"

                # Retrieve table data for each button
                mediator_fee_transferee = get_fee_details_optimized(card, "仲介", "譲受")
                mediator_fee_transferor = get_fee_details_optimized(card, "仲介", "譲渡")
                fa_fee_transferee = get_fee_details_optimized(card, "FA", "譲受")
                fa_fee_transferor = get_fee_details_optimized(card, "FA", "譲渡")

                # Save data
                data.append([
                    company_name, support_type, mediator_fee_transferee,
                    mediator_fee_transferor, fa_fee_transferee, fa_fee_transferor
                ])
            except Exception as e:
                # Best effort per card: report the failure and move on
                print(f"Error processing card: {e}")

finally:
    # Always release the browser, even if the crawl is interrupted
    driver.quit()

# Save data to CSV (utf-8-sig writes a BOM)
df = pd.DataFrame(data, columns=["Company Name", "Support Type", "Mediator Fee (Transferee)", "Mediator Fee (Transferor)", "FA Fee (Transferee)", "FA Fee (Transferor)"])
output_file = "scraped_data.csv"
df.to_csv(output_file, index=False, encoding="utf-8-sig")
print(f"Data saved to {output_file}")

The table I want to retrieve is the section shown in the attached image.