I have a project that crawls data from a specific website. The code below has worked since 2023, but the website changed in 2024 and now the scraper fails, and I don't know how to fix the error. If someone can show me a working fix, I'd gladly buy them a coffee. Thank you, admin, for approving my post.
1. Code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
import re
def get_soup(url, headers):
    """Fetch `url` and return the parsed page as a BeautifulSoup object.

    Args:
        url: Fully-qualified URL to fetch.
        headers: dict of HTTP headers; must include a browser-like
            User-Agent or TripAdvisor blocks the request.

    Returns:
        BeautifulSoup built from the response body using the built-in
        "html.parser".
    """
    # timeout stops the scraper from hanging forever on a stalled connection
    response = requests.get(url, headers=headers, timeout=30)
    return BeautifulSoup(response.content, "html.parser")
def extract_review_data(row, selectors):
    """Extract one review's fields from a single review row element.

    Args:
        row: bs4 element containing a single review block.
        selectors: mapping of column name -> CSS selector, evaluated
            relative to `row`.

    Returns:
        list of extracted string values, in the same order as
        `selectors` (lines up with list(selectors.keys())).
        A field whose selector matches nothing yields "".
    """
    values = []
    for column, selector in selectors.items():
        element = row.select_one(selector)
        if column == "Rating":
            # The rating lives in an aria-label attribute, not in text.
            # Guard against a missing element: TripAdvisor's obfuscated
            # class names change periodically, and subscripting None here
            # used to crash the whole scrape with a TypeError.
            value = element["aria-label"] if element is not None else ""
        else:
            value = element.text.strip() if element is not None else ""
        values.append(value)
    return values
def scrape_tripadvisor_reviews(base_url, row_selector, selectors, page_range):
    """Scrape TripAdvisor review pages and return them as a DataFrame.

    Args:
        base_url: URL template with one "{}" placeholder for the review
            offset ("orN" pagination).
        row_selector: CSS selector matching one element per review.
        selectors: mapping of column name -> CSS selector, passed to
            extract_review_data for each row.
        page_range: iterable of integer review offsets to fetch.

    Returns:
        pd.DataFrame with one row per review and selectors' keys as columns.
    """
    comments = []
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
    for i in tqdm(page_range):
        url = base_url.format(i)
        soup = get_soup(url, headers)
        rows = soup.select(row_selector)
        if not rows:
            # Loud signal when TripAdvisor changes its markup: nothing
            # matched, so row_selector/selectors probably need re-deriving.
            print(f"Warning: no reviews matched at offset {i} ({url}) - selectors may be stale")
        for row in rows:
            comments.append(extract_review_data(row, selectors))
        time.sleep(1)  # Delay between pages to avoid being blocked
    return pd.DataFrame(comments, columns=list(selectors.keys()))
# CSS selectors for data extraction. These obfuscated class names change
# whenever TripAdvisor redeploys; re-derive them from the live page's HTML
# if the scraper suddenly returns empty results.
selectors = {
    "NameReviewer": ".mwPje .ukgoS",
    "Country": ".XExLl:nth-last-child(2) > .zpDvc > .JINyA",
    "ShortReview": ".FGwzt .yCeTE",
    "FullReview": ".pZUbB .yCeTE",
    "TotalContributions": ".k .IugUm",
    "Like": ".Vonfv .FwFXZ",
    "DateandType": ".RpeCd",
    "Rating": ".UctUV",
}
# Row selector for individual review blocks
row_selector = ".LbPSX .C"
# Base URL for scraping; {} receives the review offset ("orN" pagination)
base_url = "https://www.tripadvisor.com/Attraction_Review-g298082-d4507121-Reviews-or{}-Hoi_An_Ancient_Town-Hoi_An_Quang_Nam_Province.html"
# Review offsets to scrape (each page holds 10 reviews).
# NOTE(review): range(0, 10, 10) yields only offset 0 (the first page);
# widen the stop value, e.g. range(0, 100, 10), to scrape more pages.
page_range = range(0, 10, 10)
# Scrape reviews and collect them into a DataFrame
df_raw = scrape_tripadvisor_reviews(base_url, row_selector, selectors, page_range)
# Function to extract the numeric rating from a label such as "5.0 of 5 bubbles"
def extract_numeric_rating(rating_value):
    """Return the first integer found in `rating_value`, or None if absent."""
    # r"\d+" (the pasted code had lost the backslash, matching literal "d")
    match = re.search(r"\d+", str(rating_value))
    return int(match.group()) if match else None
# Normalise the 'Rating' column to its numeric value in-place
df_raw["Rating"] = df_raw["Rating"].apply(extract_numeric_rating)
# Function to split a "Mar 2024 • Couples" style string into its parts
def extract_date_year_type(date_type_value):
    """Return (month, year, trip_type) parsed from `date_type_value`.

    Falls back to (None, None, None) when the string does not match the
    "Mon YYYY • Type" pattern.
    """
    # r"(\w{3}) (\d{4})" (the pasted code had lost the backslashes)
    match = re.search(r"(\w{3}) (\d{4}) • (.+)", str(date_type_value))
    return match.groups() if match else (None, None, None)
# Split 'DateandType' into separate 'Month', 'Year' and 'Type' columns
df_raw[["Month", "Year", "Type"]] = pd.DataFrame(
    df_raw["DateandType"].apply(extract_date_year_type).tolist(), index=df_raw.index
)
# Collapse each reviewer's years into one comma-separated string
df_raw["Year"] = df_raw.groupby("NameReviewer")["Year"].transform(lambda x: ",".join(x.astype(str)))
# Drop the raw combined column, no longer needed
df_raw = df_raw.drop("DateandType", axis=1)
# Function to extract the count from a "1,234 contributions" style string
def extract_numeric_contributions(value):
    """Return the contribution count in `value` as an int, or None.

    Thousands separators are stripped first so "1,234 contributions"
    yields 1234 rather than 1.
    """
    # r"\d+" (the pasted code had lost the backslash)
    match = re.search(r"\d+", str(value).replace(",", ""))
    return int(match.group()) if match else None
# Normalise the 'TotalContributions' column to integers
df_raw["TotalContributions"] = df_raw["TotalContributions"].apply(extract_numeric_contributions)
# Function to strip a trailing "N contributions" suffix, leaving the location
def extract_city_or_country(value):
    """Return the location part of `value`, or None when nothing matches."""
    # r"(?:\d+)?\s*" (the pasted code had lost the backslashes);
    # "contributions?" is equivalent to "contribution|contributions"
    match = re.search(r"(.+?)(?:\d+)?\s*(?:contributions?)?$", str(value))
    return match.group(1).strip() if match else None
# Replace the combined location/contributions text with the location only
df_raw["Country"] = df_raw["Country"].apply(extract_city_or_country)
# Persist the cleaned DataFrame to CSV and display it
df_raw.to_csv("tripadvisor_HoiAn_full_Output.csv", index=False)
df_raw
2. Here is the website I want to crawl:
https://www.tripadvisor.com/Attraction_Review-g298082-d4507121-Reviews-Hoi_An_Ancient_Town-Hoi_An_Quang_Nam_Province.html
import re
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
def get_soup(url, headers):
    """Fetch `url` and return the parsed page as a BeautifulSoup object.

    Args:
        url: Fully-qualified URL to fetch.
        headers: dict of HTTP headers; must include a browser-like
            User-Agent or TripAdvisor blocks the request.

    Returns:
        BeautifulSoup built from the response body using the built-in
        "html.parser".
    """
    # timeout stops the scraper from hanging forever on a stalled connection
    response = requests.get(url, headers=headers, timeout=30)
    return BeautifulSoup(response.content, "html.parser")
def extract_review_data(row, selectors):
    """Extract one review's fields from a single review row element.

    Args:
        row: bs4 element containing a single review block.
        selectors: mapping of column name -> CSS selector, evaluated
            relative to `row`.

    Returns:
        list of extracted string values, in the same order as
        `selectors` (lines up with list(selectors.keys())).
        A field whose selector matches nothing yields "".
    """
    values = []
    for column, selector in selectors.items():
        element = row.select_one(selector)
        if column == "Rating":
            # The rating lives in an aria-label attribute, not in text.
            # Guard against a missing element: TripAdvisor's obfuscated
            # class names change periodically, and subscripting None here
            # used to crash the whole scrape with a TypeError.
            value = element["aria-label"] if element is not None else ""
        else:
            value = element.text.strip() if element is not None else ""
        values.append(value)
    return values
def scrape_tripadvisor_reviews(base_url, row_selector, selectors, page_range):
    """Scrape TripAdvisor review pages and return them as a DataFrame.

    Args:
        base_url: URL template with one "{}" placeholder for the review
            offset ("orN" pagination).
        row_selector: CSS selector matching one element per review.
        selectors: mapping of column name -> CSS selector, passed to
            extract_review_data for each row.
        page_range: iterable of integer review offsets to fetch.

    Returns:
        pd.DataFrame with one row per review and selectors' keys as columns.
    """
    comments = []
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
    for i in tqdm(page_range):
        url = base_url.format(i)
        soup = get_soup(url, headers)
        rows = soup.select(row_selector)
        if not rows:
            # Loud signal when TripAdvisor changes its markup: nothing
            # matched, so row_selector/selectors probably need re-deriving.
            print(f"Warning: no reviews matched at offset {i} ({url}) - selectors may be stale")
        for row in rows:
            comments.append(extract_review_data(row, selectors))
        time.sleep(1)  # Delay between pages to avoid being blocked
    return pd.DataFrame(comments, columns=list(selectors.keys()))
# CSS selectors for data extraction. These obfuscated class names change
# whenever TripAdvisor redeploys; re-derive them from the live page's HTML
# if the scraper suddenly returns empty results.
selectors = {
    "NameReviewer": ".mwPje .ukgoS",
    "Country": ".XExLl:nth-last-child(2) > .zpDvc > .JINyA",
    "ShortReview": ".FGwzt .yCeTE",
    "FullReview": ".pZUbB .yCeTE",
    "TotalContributions": ".k .IugUm",
    "Like": ".Vonfv .FwFXZ",
    "DateandType": ".RpeCd",
    "Rating": ".UctUV",
}
# Row selector for individual review blocks
row_selector = ".LbPSX .C"
# Base URL for scraping; {} receives the review offset ("orN" pagination)
base_url = "https://www.tripadvisor.com/Attraction_Review-g298082-d4507121-Reviews-or{}-Hoi_An_Ancient_Town-Hoi_An_Quang_Nam_Province.html"
# Review offsets to scrape (each page holds 10 reviews).
# NOTE(review): range(0, 10, 10) yields only offset 0 (the first page);
# widen the stop value, e.g. range(0, 100, 10), to scrape more pages.
page_range = range(0, 10, 10)
# Scrape reviews and collect them into a DataFrame
df_raw = scrape_tripadvisor_reviews(base_url, row_selector, selectors, page_range)
# Function to extract the numeric rating from a label such as "5.0 of 5 bubbles"
def extract_numeric_rating(rating_value):
    """Return the first integer found in `rating_value`, or None if absent."""
    # r"\d+" (the pasted code had lost the backslash, matching literal "d")
    match = re.search(r"\d+", str(rating_value))
    return int(match.group()) if match else None

# Normalise the 'Rating' column to its numeric value in-place
df_raw["Rating"] = df_raw["Rating"].apply(extract_numeric_rating)
# Function to split a "Mar 2024 • Couples" style string into its parts
def extract_date_year_type(date_type_value):
    """Return (month, year, trip_type) parsed from `date_type_value`.

    Falls back to (None, None, None) when the string does not match the
    "Mon YYYY • Type" pattern.
    """
    # r"(\w{3}) (\d{4})" (the pasted code had lost the backslashes)
    match = re.search(r"(\w{3}) (\d{4}) • (.+)", str(date_type_value))
    return match.groups() if match else (None, None, None)

# Split 'DateandType' into separate 'Month', 'Year' and 'Type' columns
df_raw[["Month", "Year", "Type"]] = pd.DataFrame(
    df_raw["DateandType"].apply(extract_date_year_type).tolist(), index=df_raw.index
)
# Collapse each reviewer's years into one comma-separated string
df_raw["Year"] = df_raw.groupby("NameReviewer")["Year"].transform(lambda x: ",".join(x.astype(str)))
# Drop the raw combined column, no longer needed
df_raw = df_raw.drop("DateandType", axis=1)
# Function to extract the count from a "1,234 contributions" style string
def extract_numeric_contributions(value):
    """Return the contribution count in `value` as an int, or None.

    Thousands separators are stripped first so "1,234 contributions"
    yields 1234 rather than 1.
    """
    # r"\d+" (the pasted code had lost the backslash)
    match = re.search(r"\d+", str(value).replace(",", ""))
    return int(match.group()) if match else None

# Normalise the 'TotalContributions' column to integers
df_raw["TotalContributions"] = df_raw["TotalContributions"].apply(extract_numeric_contributions)
# Function to strip a trailing "N contributions" suffix, leaving the location
def extract_city_or_country(value):
    """Return the location part of `value`, or None when nothing matches."""
    # r"(?:\d+)?\s*" (the pasted code had lost the backslashes);
    # "contributions?" is equivalent to "contribution|contributions"
    match = re.search(r"(.+?)(?:\d+)?\s*(?:contributions?)?$", str(value))
    return match.group(1).strip() if match else None

# Replace the combined location/contributions text with the location only
df_raw["Country"] = df_raw["Country"].apply(extract_city_or_country)
# Persist the cleaned DataFrame to CSV and display it
df_raw.to_csv("tripadvisor_HoiAn_full_Output.csv", index=False)
df_raw
2. Here is the website I want to crawl:
https://www.tripadvisor.com/Attraction_Review-g298082-d4507121-Reviews-Hoi_An_Ancient_Town-Hoi_An_Quang_Nam_Province.html
I tried to fix the code myself, but it still doesn't work. I hope someone can help me — thank you.
Như Phú is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.