I’m working on a web scraping project that uses Python and Playwright to collect data from a website. When I access the site without a proxy, my IP gets blocked, so I route requests through a proxy service (ScraperAPI) and rotate user agents. The first page scrapes successfully, but every subsequent page fails to load.
Here is a simplified version of my code:
import asyncio
import sqlite3
import random
from urllib.parse import urlencode
from playwright.async_api import async_playwright
SCRAPER_API_KEY = 'your_scraper_api_key'
SCRAPER_API_URL = 'https://api.scraperapi.com/'
def build_scraperapi_url(url):
    params = {
        'api_key': SCRAPER_API_KEY,
        'url': url,
        'country_code': 'us',
        'device_type': 'desktop',
        'session_number': '123',
        'keep_headers': 'true'
    }
    return f"{SCRAPER_API_URL}?{urlencode(params)}"

def load_user_agents(file_path):
    try:
        with open(file_path, 'r') as file:
            user_agents = [line.strip() for line in file if line.strip()]
        if not user_agents:
            raise ValueError("User agents list is empty.")
        return user_agents
    except Exception as e:
        print(f"Failed to load user agents: {e}")
        return []

headers_template = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
    "Content-Type": "application/json",
    "Origin": "https://www.example.com"
}

async def scrape_product_urls(category_url, user_agents):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        # Pick a random User-Agent for this request and send it with the shared headers.
        headers = headers_template.copy()
        headers["User-Agent"] = random.choice(user_agents)
        await page.set_extra_http_headers(headers)

        # Navigate via the ScraperAPI endpoint instead of hitting the site directly.
        scraperapi_url = build_scraperapi_url(category_url)
        await page.goto(scraperapi_url, timeout=90000)

        product_urls = []
        product_divs = await page.query_selector_all('div.ProductList0__productItemContainer')
        if not product_divs:
            print("No products found")
            await browser.close()
            return product_urls

        # Build absolute product URLs from the first link in each product container.
        for div in product_divs:
            link = await div.query_selector('a')
            if link:
                href = await link.get_attribute('href')
                full_url = f"https://www.example.com{href}"
                product_urls.append(full_url)

        await browser.close()
        return product_urls

async def main():
    user_agents = load_user_agents('user_agents.txt')
    if not user_agents:
        print("No user agents available. Exiting.")
        return

    category_urls = [
        'https://www.example.com/shop/clothing?page=1',
        'https://www.example.com/shop/clothing?page=2',
        'https://www.example.com/shop/clothing?page=3'
    ]

    for category_url in category_urls:
        try:
            product_urls = await scrape_product_urls(category_url, user_agents)
            print(f"Scraped {len(product_urls)} product URLs from {category_url}")
        except Exception as e:
            print(f"Error while scraping {category_url}: {e}")


if __name__ == "__main__":
    asyncio.run(main())
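For context, this is roughly the URL that page.goto ends up visiting for the first category page; it is just the output of build_scraperapi_url above, with the placeholder API key left in:

print(build_scraperapi_url('https://www.example.com/shop/clothing?page=1'))
# https://api.scraperapi.com/?api_key=your_scraper_api_key&url=https%3A%2F%2Fwww.example.com%2Fshop%2Fclothing%3Fpage%3D1&country_code=us&device_type=desktop&session_number=123&keep_headers=true

user_agents.txt is just a plain text file with one user-agent string per line.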