Being new to Selenium and Python I have been given the task of extracting all the data from three tables on a wikipedia page. In all tests I have been able to get the pertinent data from the fist table but the code is unable to find anything about the 2nd or 3rd table. I know it can’t really be this hard but I have been at it for 3 days straight with no advance. What exactly am I missing in my code? I am able to get the page to open and then it spits back that there are only 2 or sometimes one table on the page but I know for a fact that there are three. The page in question is:
https://es.wikipedia.org/wiki/Anexo:Entidades_federativas_de_M%C3%A9xico_por_superficie,_poblaci%C3%B3n_y_densidad
And the code I have is as follows:
# Libraries
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from io import StringIO
# Add debugging statements
print("Starting script...")
# Initialize Chrome options
options = webdriver.ChromeOptions()
options.add_argument('--start-maximized')
options.add_argument('--disable-extensions')
# Use webdriver-manager to get the appropriate ChromeDriver
service = Service(ChromeDriverManager().install())
# Initialize the WebDriver
driver = webdriver.Chrome(service=service, options=options)
try:
# Start on 2nd Monitor
driver.set_window_position(2000, 0)
driver.maximize_window()
time.sleep(5)
# Initiate Browser
driver.get('https://es.wikipedia.org/wiki/Anexo:Entidades_federativas_de_M%C3%A9xico_por_superficie,_poblaci%C3%B3n_y_densidad')
# Wait for the page to fully load by waiting for a specific element to appear
WebDriverWait(driver, 30).until(
EC.presence_of_element_located((By.XPATH, '//*[@id="firstHeading"]'))
)
print("Page loaded successfully")
# Extract data from the tables using specific XPath expressions
first_table = driver.find_element(By.XPATH, '//table[contains(.//caption, "Entidades federativas de México por superficie, población y densidad")]')
second_table = driver.find_element(By.XPATH, '(//table[contains(.//caption, "Población histórica de México")])[1]')
third_table = driver.find_element(By.XPATH, '(//table[contains(.//caption, "Población histórica de México")])[2]')
print("All tables found")
# First table extraction
first_table_html = first_table.get_attribute('outerHTML')
first_table_df = pd.read_html(first_table_html)[0]
first_table_df = first_table_df.iloc[2:34, :] # Remove header rows and ensure 32 rows of data
first_table_df.columns = first_table_df.iloc[0] # Set the first row as header
first_table_df = first_table_df[1:] # Remove the header row from the data
print("First table extracted successfully")
# Second table extraction
second_table_html = second_table.get_attribute('outerHTML')
second_table_df = pd.read_html(second_table_html)[0]
second_table_df.columns = ['Pos', 'Entidad', '2020', '2010', '2000', '1990', '1980', '1970', '1960', '1950', '1940', '1930', '1921', '1910']
print("Second table extracted successfully")
# Third table extraction
third_table_html = third_table.get_attribute('outerHTML')
third_table_df = pd.read_html(third_table_html)[0]
third_table_df.columns = ['Pos', 'Entidad', '2010', '2015', '2020', '2025', '2030']
print("Third table extracted successfully")
# Save to Excel with each table on a different sheet
with pd.ExcelWriter('mexico_population_data.xlsx') as writer:
first_table_df.to_excel(writer, sheet_name='Superficie_Poblacion_Densidad', index=False)
second_table_df.to_excel(writer, sheet_name='Poblacion_Historica', index=False)
third_table_df.to_excel(writer, sheet_name='Poblacion_Futura', index=False)
print("Data extraction and Excel file creation successful")
except Exception as e:
print(f"An error occurred: {e}")
finally:
# Close the browser after a delay to see the loaded page
time.sleep(10)
driver.quit()