I’m using the code below to scrape a website and export the data to CSV. It works well except that the title and summary fields aren’t encoded correctly in the CSV: it shows apostrophes as â€™, for instance.
I tried adding the <code>response.encoding</code> lines, but that didn’t fix it, and I’m not sure what else to try. In case it’s relevant, I’m running the code on Replit and opening the CSV in Excel.
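In case it helps narrow things down, I can reproduce the exact garbling in a Python shell by encoding a real apostrophe as UTF-8 and then decoding it as Windows-1252, so my guess is that Excel is picking the wrong encoding rather than the data itself being bad:
<code># U+2019 (right single quotation mark) encoded as UTF-8,
# then decoded as Windows-1252, is exactly what Excel shows me:
print("\u2019".encode("utf-8").decode("windows-1252"))  # prints â€™
</code>
Here’s the full script: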
<code>import requests
from bs4 import BeautifulSoup
import csv

def scrape_report_content(url):
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.content, 'html.parser')
    content_div = soup.find('div', class_='usa-prose pep-prose margin-top-6')
    if content_div:
        # Extract all paragraphs inside the content_div
        paragraphs = content_div.find_all('p')
        content = '\n'.join([p.get_text(strip=True) for p in paragraphs])
        return content.strip()
    else:
        return None

base_url = "https://oig.hhs.gov/reports-and-publications/all-reports-and-publications/"
page_limit = 50
report_data = []

for page in range(1, page_limit + 1):
    page_url = f"{base_url}?page={page}"
    print(f"Scraping page {page}...")
    # Fetch the page content
    response = requests.get(page_url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.content, "html.parser")
    reports = soup.find_all("div", class_="usa-card__container")
    for report in reports:
        title_tag = report.find("a")
        title = title_tag.get_text(strip=True)
        link = "https://oig.hhs.gov" + title_tag['href']
        audit, agency, date = [e.get_text(strip=True) for e in report.find_all("dd")]
        content = scrape_report_content(link)
        report_data.append({
            "Title": title,
            "URL": link,
            "Report Number(s)": audit,
            "Agency": agency,
            "Date": date,
            "Content": content
        })

# Export to CSV
csv_file = "OIG_Reports.csv"
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    fieldnames = ["Title", "URL", "Report Number(s)", "Agency", "Date", "Content"]
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    for data in report_data:
        writer.writerow(data)

print(f"Data exported to {csv_file}")
</code>
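From what I’ve read, Excel assumes a legacy code page (Windows-1252 on my machine) when a CSV has no byte order mark, so my best guess, which I haven’t verified yet, is to write the file with <code>encoding='utf-8-sig'</code> so the BOM tells Excel it’s UTF-8. (I also suspect the <code>response.encoding</code> lines do nothing here, since I hand BeautifulSoup the raw <code>response.content</code> bytes.) Roughly:
<code># Guess: 'utf-8-sig' writes a UTF-8 BOM first, which Excel uses to
# detect the encoding; otherwise identical to the export block above.
with open(csv_file, mode='w', newline='', encoding='utf-8-sig') as file:
    fieldnames = ["Title", "URL", "Report Number(s)", "Agency", "Date", "Content"]
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(report_data)
</code>
Is that the right fix, or is the problem somewhere on the scraping side?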