i am trying to scrape the reviews here and its actually working but it only works for 5-4 times after it just start to scrape only the first page and ignores the next pages and i searched for the problem, as it i found 2 mistakes 1.The session is not handled properly and the 2.Due to too many requests i am getting from the Amazon. And if leave it for 12hr without working with the code it again starts perform normally, So whats the reason behinds this and is there any way i can improve this code ? Please help i am doing this for my Project(Sentiment Analysis of product reviews). Thank You
from scrapy import scraping_top_url
from bs4 import BeautifulSoup
import requests
import time
import csv
import pandas as pd
from requests.exceptions import RequestException
reviewlist = []
def extract_reviews(response_content):
soup = BeautifulSoup(response_content, "html.parser")
reviews = soup.findAll("div", {'data-hook': 'review'})
# print(soup)
# print(reviews)
for items in reviews:
try:
review = {
'Review Title': items.find('a', {'data-hook': 'review-title'}).get_text().split("stars")[1].strip(),
'Rating': items.find('i', {'data-hook': 'review-star-rating'}).get_text(),
'Review Body': items.find('span', {'data-hook': 'review-body'}).get_text()
}
reviewlist.append(review)
except Exception as e:
print(e)
def url_sessions(urls, userAgents):
with requests.Session() as session:
session.headers.update({'User-Agent': userAgents[0]})`your text`
for url in urls:
if url == 0:
continue
pg = 1
while True: # Loop to iterate through pages
modified_url = f"{url}&pageNumber={pg}&sortBy=recent"
max_tries = 21
attempts = 0
response = None
while attempts < max_tries and not response:
try:
response = session.get(modified_url)
if response.status_code == 200:
print(f"Successfully fetched data from {modified_url}")
extract_reviews(response.content)
soup = BeautifulSoup(response.content, "html.parser")
next_page = soup.find('li', {'class': 'a-last'}) # Checks for the 'Next' button
if next_page and 'a-disabled' not in next_page['class']: # If 'Next' button is not disabled
pg += 1
else:
raise StopIteration # No more pages, raise exception to break from the inner loop
break
except RequestException as e:
print(f"An error occurred: {e}")
except StopIteration: # Catch the exception to break from the outer loop
break
finally:
if attempts < max_tries - 1:
session.headers.update({'User-Agent': userAgents[attempts % len(userAgents)]})
time.sleep(3)
attempts += 1
try:
if response is None or 'a-disabled' in next_page.get('class', []):
break # Break from the outer loop if no response or last page reached
except AttributeError:
break
def main(user_input):
imageUrl, Stars, fiveStarReviewUrl, fourStarReviewUrl, threeStarReviewUrl, twoStarReviewUrl, oneStarReviewUrl = scraping_top_url(user_input)
# print(Stars, fiveStarReviewUrl, fourStarReviewUrl, threeStarReviewUrl, twoStarReviewUrl, oneStarReviewUrl, sep="n")
# fiveStarReviewUrl = fiveStarReviewUrl.replace("#reviews-filter-bar", "&pageNumber=")
# pg = 1
if fiveStarReviewUrl != 0:
fiveStarReviewUrl = fiveStarReviewUrl.replace("#reviews-filter-bar", "") # + f"&pageNumber={pg}" + f"&sortBy=recent"
else:
fiveStarReviewUrl = 0
# print(fiveStarReviewUrl)
if fourStarReviewUrl != 0:
fourStarReviewUrl = fourStarReviewUrl.replace("#reviews-filter-bar", "") # + f"&pageNumber={pg}" + f"&sortBy=recent"
else:
fourStarReviewUrl = 0
if threeStarReviewUrl != 0:
threeStarReviewUrl = threeStarReviewUrl.replace("#reviews-filter-bar", "") # + f"&pageNumber={pg}" + f"&sortBy=recent"
else:
threeStarReviewUrl = 0
if twoStarReviewUrl != 0:
twoStarReviewUrl = twoStarReviewUrl.replace("#reviews-filter-bar", "") # + f"&pageNumber={pg}" + f"&sortBy=recent"
else:
twoStarReviewUrl = 0
if oneStarReviewUrl != 0:
oneStarReviewUrl= oneStarReviewUrl.replace("#reviews-filter-bar", "") # + f"&pageNumber={pg}" + f"&sortBy=recent"
else:
oneStarReviewUrl = 0
userAgents = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.2420.81',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.4; rv:124.0) Gecko/20100101 Firefox/124.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
'Mozilla/5.0 (X11; Linux i686; rv:124.0) Gecko/20100101 Firefox/124.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0',
'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/116.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393'
]
urls = []
urls.append(fiveStarReviewUrl)
urls.append(fourStarReviewUrl)
urls.append(threeStarReviewUrl)
urls.append(twoStarReviewUrl)
urls.append(oneStarReviewUrl)
# print(urls)
url_sessions(urls, userAgents)
#with open("reviews.csv", 'w', newline="") as file:
# df = pd.DataFrame(reviewlist)
df = pd.DataFrame(reviewlist)
# df.to_json("reviews.json", index=False)
df.to_csv("reviews.csv", index=False)
reviewlist.clear()
return imageUrl, Stars