I’m using the code below to scrape a website and export the data to CSV. It works well except that the title and summary fields aren’t encoded correctly in the CSV: it shows apostrophes as â€™, for instance.
I tried adding the <code>response.encoding</code> lines, but that didn’t fix it, and I’m not sure what else to try. In case it’s relevant, I’m running the code on Replit and opening the CSV in Excel.
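In case it helps narrow things down, I can reproduce the exact garbling in a Python shell by encoding a real apostrophe as UTF-8 and then decoding it as Windows-1252, so my guess is that Excel is picking the wrong encoding rather than the data itself being bad:
<code># U+2019 (right single quotation mark) encoded as UTF-8,
# then decoded as Windows-1252, is exactly what Excel shows me:
print("\u2019".encode("utf-8").decode("windows-1252"))  # prints â€™
</code>
Here’s the full script: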
<code>import requests
from bs4 import BeautifulSoup
import csv

def scrape_report_content(url):
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.content, 'html.parser')
    content_div = soup.find('div', class_='usa-prose pep-prose margin-top-6')
    if content_div:
        # Extract all paragraphs inside the content_div
        paragraphs = content_div.find_all('p')
        content = '\n'.join([p.get_text(strip=True) for p in paragraphs])
        return content.strip()
    else:
        return None

base_url = "https://oig.hhs.gov/reports-and-publications/all-reports-and-publications/"
page_limit = 50
report_data = []

for page in range(1, page_limit + 1):
    page_url = f"{base_url}?page={page}"
    print(f"Scraping page {page}...")
    # Fetch the page content
    response = requests.get(page_url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.content, "html.parser")
    reports = soup.find_all("div", class_="usa-card__container")
    for report in reports:
        title_tag = report.find("a")
        title = title_tag.get_text(strip=True)
        link = "https://oig.hhs.gov" + title_tag['href']
        audit, agency, date = [e.get_text(strip=True) for e in report.find_all("dd")]
        content = scrape_report_content(link)
        report_data.append({
            "Title": title,
            "URL": link,
            "Report Number(s)": audit,
            "Agency": agency,
            "Date": date,
            "Content": content
        })

# Export to CSV
csv_file = "OIG_Reports.csv"
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    fieldnames = ["Title", "URL", "Report Number(s)", "Agency", "Date", "Content"]
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    for data in report_data:
        writer.writerow(data)

print(f"Data exported to {csv_file}")
</code>
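From what I’ve read, Excel assumes a legacy code page (Windows-1252 on my machine) when a CSV has no byte order mark, so my best guess, which I haven’t verified yet, is to write the file with <code>encoding='utf-8-sig'</code> so the BOM tells Excel it’s UTF-8. (I also suspect the <code>response.encoding</code> lines do nothing here, since I hand BeautifulSoup the raw <code>response.content</code> bytes.) Roughly:
<code># Guess: 'utf-8-sig' writes a UTF-8 BOM first, which Excel uses to
# detect the encoding; otherwise identical to the export block above.
with open(csv_file, mode='w', newline='', encoding='utf-8-sig') as file:
    fieldnames = ["Title", "URL", "Report Number(s)", "Agency", "Date", "Content"]
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(report_data)
</code>
Is that the right fix, or is the problem somewhere on the scraping side?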