I am having trouble parsing the HTML of an email that I have received. I would like to extract each product's name, price and image URL from the email's HTML.
I need to work out the CSS/HTML selectors and then add them to the following function:
def parse_email_content(html_content, order_date=None):
    """Parse the HTML of an order e-mail into a summary dictionary.

    Args:
        html_content: Raw HTML of the e-mail, handed to BeautifulSoup's
            'html.parser'.
        order_date: Optional order date; passed through to the result
            unchanged.

    Returns:
        A dict with the keys 'company_name', 'products', 'order_number'
        and 'order_date'.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')

    # Build the result key by key; the helpers run in the same order as
    # the keys appear.
    summary = {}
    summary['company_name'] = extract_company_name(parsed)
    summary['products'] = extract_product_info(parsed)
    summary['order_number'] = extract_order_number(parsed)
    summary['order_date'] = order_date
    return summary
def extract_company_name(soup):
    """Return the company/shop name found in an order e-mail.

    The selectors are tried in order and the first one that yields a
    non-empty name wins.

    Args:
        soup: Parsed e-mail document supporting ``select_one`` with CSS
            selectors (a BeautifulSoup object).

    Returns:
        The company name, or "Unknown Company" when no selector yields
        any text.
    """
    selectors = [
        'h1.shop-name__text',
        'h1.x_shop-name__text',
        'h1.company-name',
        'td.x_logo img',  # Amazon logo
        # New selector for a different email format
    ]
    for selector in selectors:
        tag = soup.select_one(selector)
        if tag is None:
            continue
        # An <img> (the Amazon logo selector) has no text content, so
        # get_text() is always empty for it; use its alt text instead
        # when the attribute is present.
        name = tag.get_text(strip=True) or (tag.get('alt') or '').strip()
        if name:
            return name
    return "Unknown Company"
def extract_order_number(soup):
    """Return the order number found in an order e-mail.

    The selectors are tried in order and the first one that yields
    non-empty text wins.

    Args:
        soup: Parsed e-mail document supporting ``select_one`` with CSS
            selectors (a BeautifulSoup object).

    Returns:
        The order number text, or "Unknown Order Number" when no selector
        yields any text.
    """
    order_number_selectors = [
        # ':-soup-contains' is the current Soup Sieve spelling; the bare
        # ':contains' alias is deprecated and emits a FutureWarning.
        'h2:-soup-contains("Order:") + a',  # Amazon order number
        'a.x_inline-block',  # Amazon order number
    ]
    for selector in order_number_selectors:
        tag = soup.select_one(selector)
        if tag is None:
            continue
        order_number = tag.get_text(strip=True)
        # Skip matches with no visible text so a later selector can still
        # supply the number.
        if order_number:
            return order_number
    return "Unknown Order Number"
def extract_product_info(soup):
    """Extract name, price and image URL for each product in an order e-mail.

    Each entry of ``product_selectors`` describes one e-mail layout as
    ``(item, name, price, image)`` CSS selectors. The name, price and image
    selectors are evaluated *inside* each matched item, so the item selector
    must be a container that encloses all three pieces. Layouts are tried in
    order; the first one that yields at least one product is used.

    Args:
        soup: Parsed e-mail document supporting ``select`` with CSS
            selectors (a BeautifulSoup object).

    Returns:
        A list of dicts with the keys 'product_name', 'product_price' and
        'product_image'. Missing pieces are reported as "Unknown Product",
        "Unknown Price" and "No Image" respectively. The list is empty when
        no layout matches.
    """
    product_selectors = [
        ('.order-list__item', '.order-list__item-title', '.order-list__item-price', 'img'),
        ('.x_order-list__item', '.x_order-list__item-title', '.x_order-list__item-price', 'img'),
        ('.product-item', '.product-name', '.product-price', 'img'),
        ('table#x_itemDetails tr', 'td.x_name ul li a', 'td.x_price strong', 'td.x_photo img'),  # Amazon Details
        # Amazon product card: the <tr class="x_rio_asin_card"> row encloses
        # the image cell, the title link and the price span, so all three
        # can be found relative to it.
        ('tr.x_rio_asin_card', 'td.x_rio_asin_internal_text a', 'span.x_rio_15_grey', 'td.x_rio_asin_img img'),
        # Same card without the "x_" class prefix.
        # NOTE(review): assumes the prefix is added when the mail is
        # forwarded, mirroring the order-list pair above; confirm against
        # an unforwarded e-mail.
        ('tr.rio_asin_card', 'td.rio_asin_internal_text a', 'span.rio_15_grey', 'td.rio_asin_img img'),
        # New selectors for a different email format
    ]

    product_info = []
    for item_selector, name_selector, price_selector, image_selector in product_selectors:
        for item in soup.select(item_selector):
            # Look each element up once instead of once for the test and
            # again for the value.
            name_tag = item.select_one(name_selector)
            price_tag = item.select_one(price_selector)
            image_tag = item.select_one(image_selector)

            # Header/spacer rows match the item selector but contain none of
            # the product fields; recording them would add junk entries and
            # stop later layouts from being tried.
            if name_tag is None and price_tag is None and image_tag is None:
                continue

            product_name = _product_text(name_tag) or "Unknown Product"
            product_price = _product_text(price_tag) or "Unknown Price"
            # .get() avoids a KeyError on an <img> that has no src attribute.
            product_image = (image_tag.get('src') if image_tag is not None else None) or "No Image"
            # modify_image_url is defined elsewhere in the file (not shown
            # here); as before, it also receives the "No Image" placeholder.
            product_image = modify_image_url(product_image)

            product_info.append({
                'product_name': product_name,
                'product_price': product_price,
                'product_image': product_image,
            })
        if product_info:
            break  # Stop if products were found with current selectors
    return product_info


def _product_text(tag):
    """Return the tag's visible text with whitespace runs collapsed, or ''."""
    if tag is None:
        return ""
    # Titles in HTML e-mails are often wrapped across lines; collapse the
    # embedded newlines and repeated spaces into single spaces.
    text = " ".join(tag.get_text().split())
    if text:
        return text
    # Elements without text (for example an <img>) may still carry the
    # label in a title or alt attribute.
    return " ".join((tag.get('title') or tag.get('alt') or '').split())
The purpose of this is to be able to track product purchases.
Here is an example HTML snippet of what I want to parse:
</tr>
<tr style="margin:0; padding:0; border:0; outline:0; font-weight:inherit; font-style:inherit; font-size:100%; font-family:inherit; vertical-align:baseline; display:block">
<td style="margin:0; padding:0; border:0; outline:0; font-weight:inherit; font-style:inherit; font-size:100%; font-family:inherit; vertical-align:baseline">
<table style="margin:0; padding:0; border:0; outline:0; font-weight:inherit; font-style:inherit; font-size:100%; font-family:inherit; vertical-align:baseline; background:rgb(255,255,255); display:block; border-collapse:collapse">
<tbody style="margin:0; padding:0; border:0; outline:0; font-weight:inherit; font-style:inherit; font-size:100%; font-family:inherit; vertical-align:baseline; display:block">
<tr class="x_rio_asin_card" style="margin:0; padding:0; border:0; outline:0; font-weight:inherit; font-style:inherit; font-size:100%; font-family:inherit; vertical-align:baseline; display:block">
<td class="x_rio_asin_img" style="border-radius:4px; display:table-cell; width:131px; height:131px; vertical-align:middle; text-align:center; isolation:isolate; margin:0; padding:0; border:0; outline:0; font-weight:inherit; font-style:inherit; font-size:100%; font-family:inherit">
<a href="https://www.amazon.co.uk/gp/r.html?C=139N1T2SKF371&K=24P3B4B2QNYP8&M=urn:rtn:msg:2024022217045437e2313c7ab74cb7b1d7a35834e0p0eu&R=1TDDA7WUV1JHP&T=C&U=https%3A%2F%2Fwww.amazon.co.uk%2Fdp%2FB09FY1WF94%2Fref%3Dpe_27063361_485629781_TE_item_image&H=ONMQCUPSXYOXDUDCAEVGGCNOET8A&ref_=pe_27063361_485629781_TE_item_image" title="Hawksbill Paper 160G..." style="background-color:transparent; color:inherit; text-decoration:none; font-size:inherit; font-family:inherit; font-weight:inherit; line-height:inherit; margin:0; padding:0; border:0; outline:0; font-style:inherit; vertical-align:baseline"><img alt="Hawksbill Paper 160G..." style="max-height:115px; max-width:115px; margin:auto; display:block; padding:8px; mix-blend-mode:multiply; border:0; outline:0; font-weight:inherit; font-style:inherit; font-size:100%; font-family:inherit; vertical-align:baseline; height:auto; line-height:100%; text-decoration:none" src="https://m.media-amazon.com/images/I/21JZPXNafXL._SY115_SX115_.jpg">
</a></td>
<td style="width:8px; margin:0; padding:0; border:0; outline:0; font-weight:inherit; font-style:inherit; font-size:100%; font-family:inherit; vertical-align:baseline">
</td>
<td class="x_rio_asin_internal_text" style="vertical-align:middle; margin:0; padding:0; border:0; outline:0; font-weight:inherit; font-style:inherit; font-size:100%; font-family:inherit">
<p style="margin:0; padding:0; border:0; outline:0; font-weight:inherit; font-style:inherit; font-size:100%; font-family:inherit; vertical-align:baseline">
<span style="display:block; font-family:"Amazon Ember",Arial,sans-serif; font-size:15px; color:rgb(15,17,17); font-style:normal; font-weight:400; line-height:20px; margin:0; padding:0; border:0; outline:0; vertical-align:baseline"><a class="x_rio_black_href x_rio_15_black" href="https://www.amazon.co.uk/gp/r.html?C=139N1T2SKF371&K=24P3B4B2QNYP8&M=urn:rtn:msg:2024022217045437e2313c7ab74cb7b1d7a35834e0p0eu&R=3M0V3VL8DIFEU&T=C&U=https%3A%2F%2Fwww.amazon.co.uk%2Fdp%2FB09FY1WF94%2Fref%3Dpe_27063361_485629781_TE_item&H=2HWV0AI5SGMYLFCHRG7RMGRUR8KA&ref_=pe_27063361_485629781_TE_item" style="color:inherit; text-decoration:none; font-size:inherit; font-family:inherit; font-weight:inherit; line-height:inherit; margin:0; padding:0; border:0; outline:0; font-style:inherit; vertical-align:baseline"><span style="display:inline; font-family:"Amazon Ember",Arial,sans-serif; font-size:15px; color:rgb(15,17,17); font-style:normal; font-weight:400; line-height:20px; margin:0; padding:0; border:0; outline:0; vertical-align:baseline">Hawksbill
Paper 160GSM Multi-Purpose... </span></a></span></p>
<p class="x_rio_micro_space" style="margin-top:4px!important; margin:0; padding:0; border:0; outline:0; font-weight:inherit; font-style:inherit; font-size:100%; font-family:inherit; vertical-align:baseline">
<span class="x_rio_13_grey" style="color:rgb(86,89,89); font-size:13px; display:block; font-family:"Amazon Ember",Arial,sans-serif; font-style:normal; font-weight:400; line-height:20px; margin:0; padding:0; border:0; outline:0; vertical-align:baseline">Sold
by: Amazon EU S.a.r.L. </span></p>
<p class="x_rio_micro_space" style="margin-top:4px!important; margin:0; padding:0; border:0; outline:0; font-weight:inherit; font-style:inherit; font-size:100%; font-family:inherit; vertical-align:baseline">
<span class="x_rio_13_grey" style="color:rgb(86,89,89); font-size:13px; display:block; font-family:"Amazon Ember",Arial,sans-serif; font-style:normal; font-weight:400; line-height:20px; margin:0; padding:0; border:0; outline:0; vertical-align:baseline">Qty:
1 </span></p>
<p class="x_rio_micro_space" style="margin-top:4px!important; margin:0; padding:0; border:0; outline:0; font-weight:inherit; font-style:inherit; font-size:100%; font-family:inherit; vertical-align:baseline">
<span class="x_rio_15_grey" style="color:rgb(86,89,89); font-size:15px; display:block; font-family:"Amazon Ember",Arial,sans-serif; font-style:normal; font-weight:400; line-height:20px; margin:0; padding:0; border:0; outline:0; vertical-align:baseline">£4.11
</span></p>
</td>
</tr>
Here, Hawksbill Paper 160G… is the product name, https://m.media-amazon.com/images/I/21JZPXNafXL._SY115_SX115_.jpg is the image URL and £4.11 is the price.
Please note that this is part of a larger HTML document, which is the whole email:
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=Windows-1252">
<style type="text/css" style="display:none;"> P {margin-top:0;margin-bottom:0;} </style>
</head>
<body dir="ltr">
<div class="elementToProof" style="font-family: Aptos, Aptos_EmbeddedFont, Aptos_MSFontService, Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
<br>
</div>
<div id="appendonsend"></div>
<hr style="display:inline-block;width:98%" tabindex="-1">
<div id="divRplyFwdMsg" dir="ltr"><font face="Calibri, sans-serif" style="font-size:11pt" color="#000000"><b>From:</b> Amazon.co.uk <[email protected]><br>
<b>Sent:</b> Thursday, February 22, 2024 9:04 AM<br>
<b>To:</b> [email protected] <[email protected]><br>
<b>Subject:</b> Your Amazon.co.uk order of "Chiltern Arts 8 Tubes of..." and 2 more item(s)</font>
<div> </div>
</div>
<style type="text/css">
<!--
html, div, span, applet, object, iframe, h1, h2, h3, h4, h5, h6, p, blockquote, pre, a, abbr, acronym, address, big, cite, code, del, dfn, em, font, img, ins, kbd, q, s, samp, small, strike, strong, sub, sup, tt, var, dl, dt, dd, ol, ul, li, fieldset, form, label, legend, table, caption, tbody, tfoot, thead, tr, th, td
{margin:0;
padding:0;
border:0;
outline:0;
font-weight:inherit;
font-style:inherit;
font-size:100%;
font-family:inherit;
vertical-align:baseline}
""" More Code for styling"""
.x_fan_funded_metadata a
{color:rgb(0,113,133)!important}
-->
</style>
<div style="background:rgb(255,255,255); display:block; margin:0 auto; padding:0">
""" More Code """
I have tried including selectors such as ‘a[title]’, ‘a img[alt]’, ‘span.x_rio_15_grey’ and ‘img’, but this only captures the image URL; all the other information is left unfilled. Any guidance would be appreciated.