I have a project that crawls data from a specific website. The code below has worked since 2023, but the website changed in 2024 and now the scraper fails, and I don't know how to fix the error. If someone can show me a working fix, I'd gladly buy them a coffee. Thank you, admin, for approving my post.
1. Code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
import re
def get_soup(url, headers):
    """Fetch `url` and return the parsed page as a BeautifulSoup object.

    Args:
        url: Fully-qualified URL to fetch.
        headers: dict of HTTP headers; must include a browser-like
            User-Agent or TripAdvisor blocks the request.

    Returns:
        BeautifulSoup built from the response body using the built-in
        "html.parser".
    """
    # timeout stops the scraper from hanging forever on a stalled connection
    response = requests.get(url, headers=headers, timeout=30)
    return BeautifulSoup(response.content, "html.parser")
def extract_review_data(row, selectors):
    """Extract one review's fields from a single review row element.

    Args:
        row: bs4 element containing a single review block.
        selectors: mapping of column name -> CSS selector, evaluated
            relative to `row`.

    Returns:
        list of extracted string values, in the same order as
        `selectors` (lines up with list(selectors.keys())).
        A field whose selector matches nothing yields "".
    """
    values = []
    for column, selector in selectors.items():
        element = row.select_one(selector)
        if column == "Rating":
            # The rating lives in an aria-label attribute, not in text.
            # Guard against a missing element: TripAdvisor's obfuscated
            # class names change periodically, and subscripting None here
            # used to crash the whole scrape with a TypeError.
            value = element["aria-label"] if element is not None else ""
        else:
            value = element.text.strip() if element is not None else ""
        values.append(value)
    return values
def scrape_tripadvisor_reviews(base_url, row_selector, selectors, page_range):
    """Scrape TripAdvisor review pages and return them as a DataFrame.

    Args:
        base_url: URL template with one "{}" placeholder for the review
            offset ("orN" pagination).
        row_selector: CSS selector matching one element per review.
        selectors: mapping of column name -> CSS selector, passed to
            extract_review_data for each row.
        page_range: iterable of integer review offsets to fetch.

    Returns:
        pd.DataFrame with one row per review and selectors' keys as columns.
    """
    comments = []
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
    for i in tqdm(page_range):
        url = base_url.format(i)
        soup = get_soup(url, headers)
        rows = soup.select(row_selector)
        if not rows:
            # Loud signal when TripAdvisor changes its markup: nothing
            # matched, so row_selector/selectors probably need re-deriving.
            print(f"Warning: no reviews matched at offset {i} ({url}) - selectors may be stale")
        for row in rows:
            comments.append(extract_review_data(row, selectors))
        time.sleep(1)  # Delay between pages to avoid being blocked
    return pd.DataFrame(comments, columns=list(selectors.keys()))
# CSS selectors for data extraction. These obfuscated class names change
# whenever TripAdvisor redeploys; re-derive them from the live page's HTML
# if the scraper suddenly returns empty results.
selectors = {
    "NameReviewer": ".mwPje .ukgoS",
    "Country": ".XExLl:nth-last-child(2) > .zpDvc > .JINyA",
    "ShortReview": ".FGwzt .yCeTE",
    "FullReview": ".pZUbB .yCeTE",
    "TotalContributions": ".k .IugUm",
    "Like": ".Vonfv .FwFXZ",
    "DateandType": ".RpeCd",
    "Rating": ".UctUV",
}
# Row selector for individual review blocks
row_selector = ".LbPSX .C"
# Base URL for scraping; {} receives the review offset ("orN" pagination)
base_url = "https://www.tripadvisor.com/Attraction_Review-g298082-d4507121-Reviews-or{}-Hoi_An_Ancient_Town-Hoi_An_Quang_Nam_Province.html"
# Review offsets to scrape (each page holds 10 reviews).
# NOTE(review): range(0, 10, 10) yields only offset 0 (the first page);
# widen the stop value, e.g. range(0, 100, 10), to scrape more pages.
page_range = range(0, 10, 10)
# Scrape reviews and collect them into a DataFrame
df_raw = scrape_tripadvisor_reviews(base_url, row_selector, selectors, page_range)
# Function to extract the numeric rating from a label such as "5.0 of 5 bubbles"
def extract_numeric_rating(rating_value):
    """Return the first integer found in `rating_value`, or None if absent."""
    # r"\d+" (the pasted code had lost the backslash, matching literal "d")
    match = re.search(r"\d+", str(rating_value))
    return int(match.group()) if match else None
# Normalise the 'Rating' column to its numeric value in-place
df_raw["Rating"] = df_raw["Rating"].apply(extract_numeric_rating)
# Function to split a "Mar 2024 • Couples" style string into its parts
def extract_date_year_type(date_type_value):
    """Return (month, year, trip_type) parsed from `date_type_value`.

    Falls back to (None, None, None) when the string does not match the
    "Mon YYYY • Type" pattern.
    """
    # r"(\w{3}) (\d{4})" (the pasted code had lost the backslashes)
    match = re.search(r"(\w{3}) (\d{4}) • (.+)", str(date_type_value))
    return match.groups() if match else (None, None, None)
# Split 'DateandType' into separate 'Month', 'Year' and 'Type' columns
df_raw[["Month", "Year", "Type"]] = pd.DataFrame(
    df_raw["DateandType"].apply(extract_date_year_type).tolist(), index=df_raw.index
)
# Collapse each reviewer's years into one comma-separated string
df_raw["Year"] = df_raw.groupby("NameReviewer")["Year"].transform(lambda x: ",".join(x.astype(str)))
# Drop the raw combined column, no longer needed
df_raw = df_raw.drop("DateandType", axis=1)
# Function to extract the count from a "1,234 contributions" style string
def extract_numeric_contributions(value):
    """Return the contribution count in `value` as an int, or None.

    Thousands separators are stripped first so "1,234 contributions"
    yields 1234 rather than 1.
    """
    # r"\d+" (the pasted code had lost the backslash)
    match = re.search(r"\d+", str(value).replace(",", ""))
    return int(match.group()) if match else None
# Normalise the 'TotalContributions' column to integers
df_raw["TotalContributions"] = df_raw["TotalContributions"].apply(extract_numeric_contributions)
# Function to strip a trailing "N contributions" suffix, leaving the location
def extract_city_or_country(value):
    """Return the location part of `value`, or None when nothing matches."""
    # r"(?:\d+)?\s*" (the pasted code had lost the backslashes);
    # "contributions?" is equivalent to "contribution|contributions"
    match = re.search(r"(.+?)(?:\d+)?\s*(?:contributions?)?$", str(value))
    return match.group(1).strip() if match else None
# Replace the combined location/contributions text with the location only
df_raw["Country"] = df_raw["Country"].apply(extract_city_or_country)
# Persist the cleaned DataFrame to CSV and display it
df_raw.to_csv("tripadvisor_HoiAn_full_Output.csv", index=False)
df_raw
2. Here is the website I want to crawl:
https://www.tripadvisor.com/Attraction_Review-g298082-d4507121-Reviews-Hoi_An_Ancient_Town-Hoi_An_Quang_Nam_Province.html
import re
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
def get_soup(url, headers):
    """Fetch `url` and return the parsed page as a BeautifulSoup object.

    Args:
        url: Fully-qualified URL to fetch.
        headers: dict of HTTP headers; must include a browser-like
            User-Agent or TripAdvisor blocks the request.

    Returns:
        BeautifulSoup built from the response body using the built-in
        "html.parser".
    """
    # timeout stops the scraper from hanging forever on a stalled connection
    response = requests.get(url, headers=headers, timeout=30)
    return BeautifulSoup(response.content, "html.parser")
def extract_review_data(row, selectors):
    """Extract one review's fields from a single review row element.

    Args:
        row: bs4 element containing a single review block.
        selectors: mapping of column name -> CSS selector, evaluated
            relative to `row`.

    Returns:
        list of extracted string values, in the same order as
        `selectors` (lines up with list(selectors.keys())).
        A field whose selector matches nothing yields "".
    """
    values = []
    for column, selector in selectors.items():
        element = row.select_one(selector)
        if column == "Rating":
            # The rating lives in an aria-label attribute, not in text.
            # Guard against a missing element: TripAdvisor's obfuscated
            # class names change periodically, and subscripting None here
            # used to crash the whole scrape with a TypeError.
            value = element["aria-label"] if element is not None else ""
        else:
            value = element.text.strip() if element is not None else ""
        values.append(value)
    return values
def scrape_tripadvisor_reviews(base_url, row_selector, selectors, page_range):
    """Scrape TripAdvisor review pages and return them as a DataFrame.

    Args:
        base_url: URL template with one "{}" placeholder for the review
            offset ("orN" pagination).
        row_selector: CSS selector matching one element per review.
        selectors: mapping of column name -> CSS selector, passed to
            extract_review_data for each row.
        page_range: iterable of integer review offsets to fetch.

    Returns:
        pd.DataFrame with one row per review and selectors' keys as columns.
    """
    comments = []
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
    for i in tqdm(page_range):
        url = base_url.format(i)
        soup = get_soup(url, headers)
        rows = soup.select(row_selector)
        if not rows:
            # Loud signal when TripAdvisor changes its markup: nothing
            # matched, so row_selector/selectors probably need re-deriving.
            print(f"Warning: no reviews matched at offset {i} ({url}) - selectors may be stale")
        for row in rows:
            comments.append(extract_review_data(row, selectors))
        time.sleep(1)  # Delay between pages to avoid being blocked
    return pd.DataFrame(comments, columns=list(selectors.keys()))
# CSS selectors for data extraction. These obfuscated class names change
# whenever TripAdvisor redeploys; re-derive them from the live page's HTML
# if the scraper suddenly returns empty results.
selectors = {
    "NameReviewer": ".mwPje .ukgoS",
    "Country": ".XExLl:nth-last-child(2) > .zpDvc > .JINyA",
    "ShortReview": ".FGwzt .yCeTE",
    "FullReview": ".pZUbB .yCeTE",
    "TotalContributions": ".k .IugUm",
    "Like": ".Vonfv .FwFXZ",
    "DateandType": ".RpeCd",
    "Rating": ".UctUV",
}
# Row selector for individual review blocks
row_selector = ".LbPSX .C"
# Base URL for scraping; {} receives the review offset ("orN" pagination)
base_url = "https://www.tripadvisor.com/Attraction_Review-g298082-d4507121-Reviews-or{}-Hoi_An_Ancient_Town-Hoi_An_Quang_Nam_Province.html"
# Review offsets to scrape (each page holds 10 reviews).
# NOTE(review): range(0, 10, 10) yields only offset 0 (the first page);
# widen the stop value, e.g. range(0, 100, 10), to scrape more pages.
page_range = range(0, 10, 10)
# Scrape reviews and collect them into a DataFrame
df_raw = scrape_tripadvisor_reviews(base_url, row_selector, selectors, page_range)
# Function to extract the numeric rating from a label such as "5.0 of 5 bubbles"
def extract_numeric_rating(rating_value):
    """Return the first integer found in `rating_value`, or None if absent."""
    # r"\d+" (the pasted code had lost the backslash, matching literal "d")
    match = re.search(r"\d+", str(rating_value))
    return int(match.group()) if match else None

# Normalise the 'Rating' column to its numeric value in-place
df_raw["Rating"] = df_raw["Rating"].apply(extract_numeric_rating)
# Function to split a "Mar 2024 • Couples" style string into its parts
def extract_date_year_type(date_type_value):
    """Return (month, year, trip_type) parsed from `date_type_value`.

    Falls back to (None, None, None) when the string does not match the
    "Mon YYYY • Type" pattern.
    """
    # r"(\w{3}) (\d{4})" (the pasted code had lost the backslashes)
    match = re.search(r"(\w{3}) (\d{4}) • (.+)", str(date_type_value))
    return match.groups() if match else (None, None, None)

# Split 'DateandType' into separate 'Month', 'Year' and 'Type' columns
df_raw[["Month", "Year", "Type"]] = pd.DataFrame(
    df_raw["DateandType"].apply(extract_date_year_type).tolist(), index=df_raw.index
)
# Collapse each reviewer's years into one comma-separated string
df_raw["Year"] = df_raw.groupby("NameReviewer")["Year"].transform(lambda x: ",".join(x.astype(str)))
# Drop the raw combined column, no longer needed
df_raw = df_raw.drop("DateandType", axis=1)
# Function to extract the count from a "1,234 contributions" style string
def extract_numeric_contributions(value):
    """Return the contribution count in `value` as an int, or None.

    Thousands separators are stripped first so "1,234 contributions"
    yields 1234 rather than 1.
    """
    # r"\d+" (the pasted code had lost the backslash)
    match = re.search(r"\d+", str(value).replace(",", ""))
    return int(match.group()) if match else None

# Normalise the 'TotalContributions' column to integers
df_raw["TotalContributions"] = df_raw["TotalContributions"].apply(extract_numeric_contributions)
# Function to strip a trailing "N contributions" suffix, leaving the location
def extract_city_or_country(value):
    """Return the location part of `value`, or None when nothing matches."""
    # r"(?:\d+)?\s*" (the pasted code had lost the backslashes);
    # "contributions?" is equivalent to "contribution|contributions"
    match = re.search(r"(.+?)(?:\d+)?\s*(?:contributions?)?$", str(value))
    return match.group(1).strip() if match else None

# Replace the combined location/contributions text with the location only
df_raw["Country"] = df_raw["Country"].apply(extract_city_or_country)
# Persist the cleaned DataFrame to CSV and display it
df_raw.to_csv("tripadvisor_HoiAn_full_Output.csv", index=False)
df_raw
2. Here is the website I want to crawl:
https://www.tripadvisor.com/Attraction_Review-g298082-d4507121-Reviews-Hoi_An_Ancient_Town-Hoi_An_Quang_Nam_Province.html
I tried to fix the code myself, but it still doesn't work. I hope someone can help me — thank you.
Như Phú is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.