Using Selenium to web scrape is slow, but I need it for the pages I want to scrape. I have thousands of URLs to get through; here's one: https://ariregister.rik.ee/est/company/14024665/O%C3%9C-1000-T%C3%96%C3%96D?search_id=4c068c3&pos=1
What can I improve to make this faster? (I use CSS selectors.)
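For reference, one lever is cutting what each driver.get() has to wait for. A minimal sketch, assuming Selenium 4 (page_load_strategy is a Selenium 4 Options attribute; the image-blocking pref is a Chrome preference, and it assumes the scraped fields don't need images):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless=new')  # current headless mode
# 'eager' makes driver.get() return at DOMContentLoaded instead of
# waiting for every image and stylesheet to finish loading
options.page_load_strategy = 'eager'
# Assumption: the registry pages render the scraped text without images
options.add_experimental_option(
    'prefs', {'profile.managed_default_content_settings.images': 2}
)
driver = webdriver.Chrome(options=options)

Paired with WebDriverWait in place of the fixed time.sleep(3) and time.sleep(2), this can save up to five seconds of dead time per URL, since an explicit wait returns as soon as its condition holds. Here is the current script: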
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from openpyxl import Workbook
from csv import reader
from selenium.webdriver.chrome.options import Options
# CSS selectors
selectors = {
    'nimi': 'body > div.ar__body.min-vh-90 > div.ar__center__bg > div > div:nth-child(1) > div > div.h2.text-primary.mb-2.header-name-print',
    'Registreeritud': 'body > div.ar__body.min-vh-90 > div.ar__center__bg > div > div.card-group.row > div:nth-child(1) > div:nth-child(1) > div:nth-child(6) > div.col.font-weight-bold',
    'Juhatuse liikme nimi': '#representativesTable > tbody > tr > td.align-top.font-weight-bold.pl-2',
    'Isikukood': '#representativesTable > tbody > tr > td:nth-child(2) > div',
    'Aadress': 'body > div.ar__body.min-vh-90 > div.ar__center__bg > div > div.card-group.row > div:nth-child(1) > div:nth-child(2) > div:nth-child(2) > div.col.font-weight-bold',
    'posti aadress': 'body > div.ar__body.min-vh-90 > div.ar__center__bg > div > div.card-group.row > div:nth-child(1) > div:nth-child(2) > div:nth-child(3) > div.col.font-weight-bold',
    'Telefon': 'body > div.ar__body.min-vh-90 > div.ar__center__bg > div > div.card-group.row > div:nth-child(1) > div:nth-child(2) > div:nth-child(4) > div.col.font-weight-bold',
}
# List to store the scraped results
business_info_list = []
# Extractor
def extract_info_from_page(driver):
    business_info = {}
    for key, selector in selectors.items():
        try:
            element = driver.find_element(By.CSS_SELECTOR, selector)
            business_info[key] = element.text.strip()
        except Exception as e:
            business_info[key] = None
            print(f"Error extracting {key}: {e}")
    return business_info
# Function to handle "I am human" verification
def handle_verification(driver):
    try:
        # Example selector for the "I am human" checkbox
        human_checkbox_selector = 'input[type="checkbox"]'
        # Wait for the checkbox to be clickable and click it.
        # NB: when no checkbox is present, this blocks the full 10 s
        # before timing out, which adds up over thousands of URLs.
        checkbox = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, human_checkbox_selector))
        )
        checkbox.click()
        time.sleep(2)  # Adjust sleep time if necessary
        # Wait for the page to reload / the company name to appear
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, selectors['nimi']))
        )
    except Exception as e:
        print(f"Failed to handle verification: {e}")
# Read URLs from the CSV file
urls = []
with open('businesslinks.csv', 'r') as f:
    csv_reader = reader(f)
    for row in csv_reader:
        if row:
            url = row[0].strip()  # Remove any leading/trailing whitespace
            if url.startswith('http://') or url.startswith('https://'):
                urls.append(url)
            else:
                print(f"Invalid URL format: {url}")
# Set up Selenium options
options = Options()
options.add_argument('--headless=new')  # Run headless (the old options.headless flag is deprecated in Selenium 4)
# Set up WebDriver
driver = webdriver.Chrome(options=options)
# timer
start_time = time.time()
# Process each URL
for url in urls:
    try:
        print(f"Fetching URL: {url}")
        driver.get(url)
        time.sleep(3)  # Adjust sleep time if necessary for page to load completely
        # Handle verification if present
        handle_verification(driver)
        business_info = extract_info_from_page(driver)
        business_info_list.append(business_info)
    except Exception as e:
        print(f"Failed to fetch {url}: {e}")
# Stop the timer
end_time = time.time()
# Calculate the total time taken
elapsed_time = end_time - start_time
print(f"Total execution time: {elapsed_time:.2f} seconds")
# Create Excel workbook
wb = Workbook()
ws = wb.active
# Write headers
headers = list(selectors.keys())
ws.append(headers)
# Write data
for business_info in business_info_list:
    row_data = [business_info.get(header, '') for header in headers]
    ws.append(row_data)
# Save the Excel file
wb.save('business_info.xlsx')
print('Complete.')
# Close the browser
driver.quit()
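Since the URLs are independent of each other, the other big lever is running several browser sessions in parallel. A rough sketch, assuming the site tolerates a few concurrent sessions; it reuses options, handle_verification, and extract_info_from_page from the script above, and max_workers=4 is just a starting point:

import threading
from concurrent.futures import ThreadPoolExecutor

from selenium import webdriver

local = threading.local()   # one driver per worker thread
drivers = []                # track drivers so they can be quit at the end
drivers_lock = threading.Lock()

def get_driver():
    # Lazily create a driver the first time each thread needs one
    if not hasattr(local, 'driver'):
        local.driver = webdriver.Chrome(options=options)
        with drivers_lock:
            drivers.append(local.driver)
    return local.driver

def scrape(url):
    driver = get_driver()
    driver.get(url)
    handle_verification(driver)
    return extract_info_from_page(driver)

# pool.map preserves the input order of urls
with ThreadPoolExecutor(max_workers=4) as pool:
    business_info_list = list(pool.map(scrape, urls))

for d in drivers:
    d.quit()

Each worker keeps its own Chrome instance (WebDriver sessions are not safe to share across threads), so memory use grows with max_workers, and heavier concurrency may also trigger the human-verification check more often; ramping up gradually seems safest.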
Just need some help; any pointers appreciated.