I am trying to scrape articles from several hyperlinks, get the title of the article & create a summary with the contents of the first 4 paragraphs & save the output in a .png file. But it gives error – UnicodeEncodeError: ‘latin-1’ codec can’t encode character ‘u2019’ in position 4: ordinal not in range(256)
import requests
from bs4 import BeautifulSoup
from PIL import Image, ImageDraw, ImageFont
# Function to get title and summary from a URL
def extract_info(url):
try:
response = requests.get(url)
response.encoding = 'utf-8' # Ensure UTF-8 encoding
soup = BeautifulSoup(response.text, 'html.parser')
# Extract title
title = soup.title.string.strip() if soup.title and soup.title.string else 'No Title Found'
# Extract the first 4 paragraphs as summary
paragraphs = soup.find_all('p')
if paragraphs:
summary_paragraphs = [p.get_text(strip=True) for p in paragraphs[:4] if p.get_text(strip=True)]
summary = 'nn'.join(summary_paragraphs) if summary_paragraphs else 'No Summary Found'
else:
summary = 'No Summary Found'
return title, summary
except Exception as e:
return 'Error fetching details', str(e)
# List of URLs
urls = [
"https://www.cisa.gov/news-events/events",
"https://www.cisa.gov/news-events/cybersecurity-advisories",
"https://www.cisa.gov/news-events/alerts/2024/08/14/adobe-releases-security-
updates-multiple-products",
"https://www.youtube.com/@cisagov",
"https://www.dhs.gov",
"https://www.kaspersky.com/home-security?
icid=gl_securelisheader_acq_ona_smm__onl_b2c_securelist_prodmen_______",
]
# Collect all the data
data = []
for url in urls:
print(f"Processing URL: {url}")
title, summary = extract_info(url)
data.append((title, url, summary))
# Determine the image height dynamically
width = 1200
padding = 50 # Padding around text
current_height = padding
# Use a Unicode-compatible font
try:
font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
font_title = ImageFont.truetype(font_path, 24)
font_url = ImageFont.truetype(font_path, 20)
font_summary = ImageFont.truetype(font_path, 18)
except:
font_title = ImageFont.load_default()
font_url = ImageFont.load_default()
font_summary = ImageFont.load_default()
# Create a dummy image to calculate text size
dummy_img = Image.new('RGB', (width, 1000))
draw = ImageDraw.Draw(dummy_img)
def calculate_text_height(draw, text, font):
# Calculate the bounding box of the text and return its height
bbox = draw.multiline_textbbox((0, 0), text, font=font)
return bbox[3] - bbox[1]
for title, url, summary in data:
# Calculate height for title
title_height = calculate_text_height(draw, title, font_title)
current_height += title_height + 10 # 10 pixels spacing
# Calculate height for url
url_height = calculate_text_height(draw, url, font_url)
current_height += url_height + 10
# Calculate height for summary
summary_height = calculate_text_height(draw, summary, font_summary)
current_height += summary_height + 30 # 30 pixels spacing after summary
# Add some bottom padding
current_height += padding
# Create the final image
image = Image.new('RGB', (width, current_height), color=(255, 255, 255))
draw = ImageDraw.Draw(image)
y_text = padding
for title, url, summary in data:
# Draw title
draw.text((padding, y_text), title, font=font_title, fill="black")
y_text += calculate_text_height(draw, title, font_title) + 10 # 10 pixels spacing
# Draw URL
draw.text((padding, y_text), url, font=font_url, fill="blue")
y_text += calculate_text_height(draw, url, font_url) + 10
# Draw Summary
draw.multiline_text((padding, y_text), summary, font=font_summary, fill="black", spacing=4)
y_text += calculate_text_height(draw, summary, font_summary) + 30 # 30 pixels spacing after summary
# Save the image in the current working directory
image.save('website_info.png')
# Display the image
image.show()
4