Using Selenium to web scrape is slow, but I need it for the pages I want to scrape. I have thousands of URLs to get through; here's one: https://ariregister.rik.ee/est/company/14024665/O%C3%9C-1000-T%C3%96%C3%96D?search_id=4c068c3&pos=1
What can I improve to make this faster? (I use CSS selectors.)
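For reference, one lever is cutting what each driver.get() has to wait for. A minimal sketch, assuming Selenium 4 (page_load_strategy is a Selenium 4 Options attribute; the image-blocking pref is a Chrome preference, and it assumes the scraped fields don't need images):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless=new')  # current headless mode
# 'eager' makes driver.get() return at DOMContentLoaded instead of
# waiting for every image and stylesheet to finish loading
options.page_load_strategy = 'eager'
# Assumption: the registry pages render the scraped text without images
options.add_experimental_option(
    'prefs', {'profile.managed_default_content_settings.images': 2}
)
driver = webdriver.Chrome(options=options)

Paired with WebDriverWait in place of the fixed time.sleep(3) and time.sleep(2), this can save up to five seconds of dead time per URL, since an explicit wait returns as soon as its condition holds. Here is the current script: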
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from openpyxl import Workbook
from csv import reader
from selenium.webdriver.chrome.options import Options
# CSS selectors
selectors = {
    'nimi': 'body > div.ar__body.min-vh-90 > div.ar__center__bg > div > div:nth-child(1) > div > div.h2.text-primary.mb-2.header-name-print',
    'Registreeritud': 'body > div.ar__body.min-vh-90 > div.ar__center__bg > div > div.card-group.row > div:nth-child(1) > div:nth-child(1) > div:nth-child(6) > div.col.font-weight-bold',
    'Juhatuse liikme nimi': '#representativesTable > tbody > tr > td.align-top.font-weight-bold.pl-2',
    'Isikukood': '#representativesTable > tbody > tr > td:nth-child(2) > div',
    'Aadress': 'body > div.ar__body.min-vh-90 > div.ar__center__bg > div > div.card-group.row > div:nth-child(1) > div:nth-child(2) > div:nth-child(2) > div.col.font-weight-bold',
    'posti aadress': 'body > div.ar__body.min-vh-90 > div.ar__center__bg > div > div.card-group.row > div:nth-child(1) > div:nth-child(2) > div:nth-child(3) > div.col.font-weight-bold',
    'Telefon': 'body > div.ar__body.min-vh-90 > div.ar__center__bg > div > div.card-group.row > div:nth-child(1) > div:nth-child(2) > div:nth-child(4) > div.col.font-weight-bold',
}
# List to store the scraped results
business_info_list = []
# Extractor
def extract_info_from_page(driver):
    business_info = {}
    for key, selector in selectors.items():
        try:
            element = driver.find_element(By.CSS_SELECTOR, selector)
            business_info[key] = element.text.strip()
        except Exception as e:
            business_info[key] = None
            print(f"Error extracting {key}: {e}")
    return business_info
# Function to handle "I am human" verification
def handle_verification(driver):
    try:
        # Example selector for the "I am human" checkbox
        human_checkbox_selector = 'input[type="checkbox"]'
        # Wait for the checkbox to be clickable and click it.
        # NB: when no checkbox is present, this blocks the full 10 s
        # before timing out, which adds up over thousands of URLs.
        checkbox = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, human_checkbox_selector))
        )
        checkbox.click()
        time.sleep(2)  # Adjust sleep time if necessary
        # Wait for the page to reload / the company name to appear
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, selectors['nimi']))
        )
    except Exception as e:
        print(f"Failed to handle verification: {e}")
# Read URLs from the CSV file
urls = []
with open('businesslinks.csv', 'r') as f:
    csv_reader = reader(f)
    for row in csv_reader:
        if row:
            url = row[0].strip()  # Remove any leading/trailing whitespace
            if url.startswith('http://') or url.startswith('https://'):
                urls.append(url)
            else:
                print(f"Invalid URL format: {url}")
# Set up Selenium options
options = Options()
options.add_argument('--headless=new')  # Run headless (the old options.headless flag is deprecated in Selenium 4)
# Set up WebDriver
driver = webdriver.Chrome(options=options)
# timer
start_time = time.time()
# Process each URL
for url in urls:
    try:
        print(f"Fetching URL: {url}")
        driver.get(url)
        time.sleep(3)  # Adjust sleep time if necessary for page to load completely
        # Handle verification if present
        handle_verification(driver)
        business_info = extract_info_from_page(driver)
        business_info_list.append(business_info)
    except Exception as e:
        print(f"Failed to fetch {url}: {e}")
# Stop the timer
end_time = time.time()
# Calculate the total time taken
elapsed_time = end_time - start_time
print(f"Total execution time: {elapsed_time:.2f} seconds")
# Create Excel workbook
wb = Workbook()
ws = wb.active
# Write headers
headers = list(selectors.keys())
ws.append(headers)
# Write data
for business_info in business_info_list:
    row_data = [business_info.get(header, '') for header in headers]
    ws.append(row_data)
# Save the Excel file
wb.save('business_info.xlsx')
print('Complete.')
# Close the browser
driver.quit()
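Since the URLs are independent of each other, the other big lever is running several browser sessions in parallel. A rough sketch, assuming the site tolerates a few concurrent sessions; it reuses options, handle_verification, and extract_info_from_page from the script above, and max_workers=4 is just a starting point:

import threading
from concurrent.futures import ThreadPoolExecutor

from selenium import webdriver

local = threading.local()   # one driver per worker thread
drivers = []                # track drivers so they can be quit at the end
drivers_lock = threading.Lock()

def get_driver():
    # Lazily create a driver the first time each thread needs one
    if not hasattr(local, 'driver'):
        local.driver = webdriver.Chrome(options=options)
        with drivers_lock:
            drivers.append(local.driver)
    return local.driver

def scrape(url):
    driver = get_driver()
    driver.get(url)
    handle_verification(driver)
    return extract_info_from_page(driver)

# pool.map preserves the input order of urls
with ThreadPoolExecutor(max_workers=4) as pool:
    business_info_list = list(pool.map(scrape, urls))

for d in drivers:
    d.quit()

Each worker keeps its own Chrome instance (WebDriver sessions are not safe to share across threads), so memory use grows with max_workers, and heavier concurrency may also trigger the human-verification check more often; ramping up gradually seems safest.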
Just need some help; any pointers appreciated.