Thiết kế website giá rẻ

Question

Being new to Selenium and Python I have been given the task of extracting all the data from three tables on a wikipedia page. In all tests I have been able to get the pertinent data from the fist table but the code is unable to find anything about the 2nd or 3rd table. I know it can’t really be this hard but I have been at it for 3 days straight with no advance. What exactly am I missing in my code? I am able to get the page to open and then it spits back that there are only 2 or sometimes one table on the page but I know for a fact that there are three. The page in question is:
https://es.wikipedia.org/wiki/Anexo:Entidades_federativas_de_M%C3%A9xico_por_superficie,_poblaci%C3%B3n_y_densidad
And the code I have is as follows:

# Libraries
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from io import StringIO

# Add debugging statements
print("Starting script...")

# Initialize Chrome options
options = webdriver.ChromeOptions()
options.add_argument('--start-maximized')
options.add_argument('--disable-extensions')

# Use webdriver-manager to get the appropriate ChromeDriver
service = Service(ChromeDriverManager().install())

# Initialize the WebDriver
driver = webdriver.Chrome(service=service, options=options)

try:
    # Start on 2nd Monitor
    driver.set_window_position(2000, 0)
    driver.maximize_window()
    time.sleep(5)

    # Initiate Browser
    driver.get('https://es.wikipedia.org/wiki/Anexo:Entidades_federativas_de_M%C3%A9xico_por_superficie,_poblaci%C3%B3n_y_densidad')

    # Wait for the page to fully load by waiting for a specific element to appear
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="firstHeading"]'))
    )

    print("Page loaded successfully")

    # Extract data from the tables using specific XPath expressions
    first_table = driver.find_element(By.XPATH, '//table[contains(.//caption, "Entidades federativas de México por superficie, población y densidad")]')
    second_table = driver.find_element(By.XPATH, '(//table[contains(.//caption, "Población histórica de México")])[1]')
    third_table = driver.find_element(By.XPATH, '(//table[contains(.//caption, "Población histórica de México")])[2]')

    print("All tables found")

    # First table extraction
    first_table_html = first_table.get_attribute('outerHTML')
    first_table_df = pd.read_html(first_table_html)[0]
    first_table_df = first_table_df.iloc[2:34, :]  # Remove header rows and ensure 32 rows of data
    first_table_df.columns = first_table_df.iloc[0]  # Set the first row as header
    first_table_df = first_table_df[1:]  # Remove the header row from the data
    print("First table extracted successfully")

    # Second table extraction
    second_table_html = second_table.get_attribute('outerHTML')
    second_table_df = pd.read_html(second_table_html)[0]
    second_table_df.columns = ['Pos', 'Entidad', '2020', '2010', '2000', '1990', '1980', '1970', '1960', '1950', '1940', '1930', '1921', '1910']
    print("Second table extracted successfully")

    # Third table extraction
    third_table_html = third_table.get_attribute('outerHTML')
    third_table_df = pd.read_html(third_table_html)[0]
    third_table_df.columns = ['Pos', 'Entidad', '2010', '2015', '2020', '2025', '2030']
    print("Third table extracted successfully")

    # Save to Excel with each table on a different sheet
    with pd.ExcelWriter('mexico_population_data.xlsx') as writer:
        first_table_df.to_excel(writer, sheet_name='Superficie_Poblacion_Densidad', index=False)
        second_table_df.to_excel(writer, sheet_name='Poblacion_Historica', index=False)
        third_table_df.to_excel(writer, sheet_name='Poblacion_Futura', index=False)

    print("Data extraction and Excel file creation successful")

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    # Close the browser after a delay to see the loaded page
    time.sleep(10)
    driver.quit()

Thiết kế website giá rẻ

Danh mục

Python and Selenium unable to find all tables on a page