I’m looking for a bit of advice on how to make this scraping script more robust. The script does run, but it often fails with an exception partway through (hence all the error handling, retries, and periodic saving).
There doesn’t seem to be an easier way of scraping the data: to get the more detailed information for each aircraft, the script double-clicks a table row to open the detail view, parses it with parse_detailed_info, navigates back to the main page, retries up to three times if an error occurs, and logs success or failure for each row.
Basically, what the script does:
- Initialize WebDriver: Sets up the Selenium WebDriver to access the target webpage.
- Configure Data Storage: Initializes an empty list to store scraped data.
- Set Rows per Page: Configures the webpage to display 100 rows per page for efficient data extraction.
- Scrape Data: Iterates through each row of the table on the current page. Performs a double-click on each row to access the detailed information, then extracts and parses it, including the aircraft registration number, serial number, model, manufacturing details, and ownership information (an example record is shown just after this list). Handles potential errors and retries up to three times if necessary.
- Handle Pagination: Navigates to the next page of results and repeats the scraping process until all pages are processed.
- Save Data: Periodically saves the scraped data to a CSV file so progress is not lost.
- Logging: Maintains a detailed log of the scraping process, including successes, failures, and errors for troubleshooting and auditing purposes.
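To give an idea of the output, each scraped row ends up as a dict keyed by the field names used in parse_detailed_info below; the values here are made-up placeholders, not real registry data:

{
    '国籍登记号': 'B-1234',        # registration mark
    '出厂序号': '41234',           # manufacturer serial number
    '航空器型号': 'A320-214',      # aircraft model
    '出厂日期': '2015-06-01',      # manufacture date
    '制造地点': 'Tianjin',         # place of manufacture
    '制造商名称': 'Airbus',        # manufacturer name
    '所有人': 'Example Airlines',  # owner
    '所有权类型': 'placeholder',   # ownership type
    '所有权申请日期': '2016-01-01'  # ownership application date
}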
I guess when trying to scrape 6,000+ aircraft this way, the script has a high chance of failure simply because of how long it has to run, plus the chance of hitting an issue it can’t recover from even after multiple retries.
I’m looking for any tips or advice on how I can improve the stability of the scraping script, or an alternative approach I could try. If I can speed up the scraping, that would be a big help too, as I can see this script potentially taking 17+ hours to finish scraping everything!
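(For context on that estimate: each row needs a double-click, a wait for the detail view, a history.go(-1) and another wait for the table to reappear, so a row seems to take roughly 10 seconds end to end. At that rate, 6,000 rows × 10 s ≈ 60,000 s, or about 17 hours, so any per-row saving would multiply up quickly.)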
Thanks,
Mark
import time
import pandas as pd
import logging
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException, NoSuchElementException, ElementClickInterceptedException
logging.basicConfig(filename='scraping_log.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
def init_driver():
    driver = webdriver.Edge()
    driver.get("https://arr.caac.gov.cn/aircraft-right/sys-content/index/qldjcxList.html")
    return driver
scraped_data = []
def parse_detailed_info(content):
    info = {}
    try:
        info['国籍登记号'] = content.find_element(By.ID, 'nationalitySign').text
        info['出厂序号'] = content.find_element(By.ID, 'serialNum').text
        info['航空器型号'] = content.find_element(By.ID, 'aircraftModel').text
        info['出厂日期'] = content.find_element(By.ID, 'manufactureDate').text
        info['制造地点'] = content.find_element(By.ID, 'madePlace').text
        info['制造商名称'] = content.find_element(By.ID, 'manufacturerName').text
        # Ownership fields are read from the second row of the syq_info table (see XPaths)
        info['所有人'] = content.find_element(By.XPATH, '//table[@id="syq_info"]/tbody/tr[2]/td[1]').text
        info['所有权类型'] = content.find_element(By.XPATH, '//table[@id="syq_info"]/tbody/tr[2]/td[2]').text
        info['所有权申请日期'] = content.find_element(By.XPATH, '//table[@id="syq_info"]/tbody/tr[2]/td[3]').text
    except NoSuchElementException as e:
        logging.warning(f"Error parsing detailed info: {e}")
    return info
def double_click_and_scrape(driver, row_index):
    retry_count = 0
    while retry_count < 3:
        try:
            cell_xpath = f'/html/body/div[2]/div/div[2]/div/table/tbody/tr[{row_index}]/td[2]'
            cell = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, cell_xpath))
            )
            actions = ActionChains(driver)
            actions.double_click(cell).perform()
            # Wait for the detail view to load, then parse it
            content = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, '/html/body/div[1]/div[2]/div[4]'))
            )
            detailed_info = parse_detailed_info(content)
            scraped_data.append(detailed_info)
            logging.info(f"Successfully scraped data for row {row_index}: {detailed_info}")
            print(f"Successfully scraped data for row {row_index}: {detailed_info}")
            # Navigate back to the results table and wait for it to render again
            driver.execute_script("window.history.go(-1)")
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, '//*[@id="dataTable"]/tbody/tr[1]'))
            )
            return
        except (StaleElementReferenceException, NoSuchElementException, TimeoutException, ElementClickInterceptedException) as e:
            logging.warning(f"Failed to scrape data for row {row_index}, attempt {retry_count+1}: {e}")
            print(f"Failed to scrape data for row {row_index}, attempt {retry_count+1}: {e}")
            retry_count += 1
            driver.refresh()
            time.sleep(5)
    # All retries exhausted: record the failure so the row can be revisited later
    logging.error(f"Failed to scrape data for row {row_index} after 3 attempts")
    print(f"Failed to scrape data for row {row_index} after 3 attempts")
    scraped_data.append({"row_index": row_index, "error": "Failed to scrape data"})
def set_rows_per_page(driver):
    try:
        rows_per_page_selector = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="dataTable_length"]/label/select'))
        )
        rows_per_page_selector.click()
        # The fourth option in the length dropdown is the 100-rows-per-page setting
        option = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="dataTable_length"]/label/select/option[4]'))
        )
        option.click()
        # Wait until the table actually shows 100 rows
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="dataTable"]/tbody/tr[100]'))
        )
        logging.info("Set rows per page to 100")
        print("Set rows per page to 100")
    except (TimeoutException, NoSuchElementException) as e:
        logging.error(f"Failed to set rows per page: {e}")
        print(f"Failed to set rows per page: {e}")
def go_to_next_page(driver):
    try:
        next_button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="dataTable_next"]/a'))
        )
        # The "next" control carries a 'disabled' class once the last page is reached
        if 'disabled' in next_button.get_attribute('class'):
            return False
        next_button.click()
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="dataTable"]/tbody/tr[1]'))
        )
        time.sleep(5)
        return True
    except (StaleElementReferenceException, NoSuchElementException, TimeoutException, ElementClickInterceptedException) as e:
        logging.warning(f"Failed to go to the next page: {e}")
        print(f"Failed to go to the next page: {e}")
        # Dump the pagination markup to help diagnose why the click failed
        try:
            pagination_element = driver.find_element(By.XPATH, '//*[@id="dataTable_next"]')
            print("Pagination HTML:", pagination_element.get_attribute('innerHTML'))
        except Exception as inner_e:
            logging.error(f"Failed to print pagination HTML: {inner_e}")
            print(f"Failed to print pagination HTML: {inner_e}")
        return False
def save_progress(output_file_path):
    scraped_df = pd.DataFrame(scraped_data)
    scraped_df.to_csv(output_file_path, index=False)
    logging.info(f"Progress saved to {output_file_path}")
    print(f"Progress saved to {output_file_path}")
driver = init_driver()
output_file_path = "scraped_aircraft_data.csv"

set_rows_per_page(driver)

current_page = 1
while True:
    logging.info(f"Processing page: {current_page}")
    print(f"Processing page: {current_page}")

    row_index = 1
    while True:
        try:
            logging.info(f"Processing row index: {row_index} on page: {current_page}")
            print(f"Processing row index: {row_index} on page: {current_page}")
            double_click_and_scrape(driver, row_index)
            row_index += 1
            next_row_xpath = f'/html/body/div[2]/div/div[2]/div/table/tbody/tr[{row_index}]/td[2]'
            if not driver.find_elements(By.XPATH, next_row_xpath):
                break
        except Exception as e:
            logging.warning(f"Unexpected error processing row {row_index} on page {current_page}: {e}")
            print(f"Unexpected error processing row {row_index} on page {current_page}: {e}")
            break

    save_progress(output_file_path)

    if not go_to_next_page(driver):
        break
    current_page += 1

driver.quit()
save_progress(output_file_path)
logging.info("Scraping complete.")
print("Scraping complete.")