My chart is showing completely wrong values, and I want to know where in my code I can change what is retrieved so that it displays the correct data.
Is the extraction of data off? Or is the displaying of data off?
If there isn't anything wrong in the code here, please enlighten me. I am desperate…
import os
import csv
import pandas as pd
from bs4 import BeautifulSoup
import tarfile
# Unpack the gzip-compressed tar archive into the 'extracted_html' directory.
file_path = 'kungalv_slutpriser.tar.gz'
with tarfile.open(file_path, 'r:gz') as tar:
    # This call has to be indented: it is the body of the `with` statement
    # and runs while the archive is still open.
    # NOTE(review): extractall() writes whatever member paths the archive
    # contains; only run it on an archive you trust.
    tar.extractall('extracted_html')
# Keys present in every dictionary returned by extract_house_info, in the
# order they are filled in.
_HOUSE_FIELDS = (
    'date_of_sale', 'address', 'location', 'boarea', 'biarea', 'totalarea',
    'rooms', 'plot_area', 'closing_price',
)


def _label_value(soup, label):
    """Return the stripped text of the element found after *label*, or None.

    *label* is looked up with ``soup.find(string=label)``; the value is taken
    from whatever ``find_next()`` returns for that match.
    """
    label_node = soup.find(string=label)
    if label_node is None:
        return None
    value_node = label_node.find_next()
    if value_node is None:
        return None
    return value_node.text.strip()


def _leading_int(text):
    """Return the first whitespace-separated token of *text* as an int, or None."""
    if not text:
        return None
    tokens = text.split()
    # isdecimal() only accepts characters that int() can actually convert.
    if not tokens or not tokens[0].isdecimal():
        return None
    return int(tokens[0])


def extract_house_info(html_content):
    """Extract the details of one sold house from a page's HTML.

    Each value is located by searching for a label text ('Såld', 'Adress',
    'Ort', 'Boarea', 'Biarea', 'Rum', 'Tomt', 'Pris') and reading the text of
    the element that follows it.

    NOTE(review): the labels and the "value follows label" layout are
    assumptions about the page structure. If the CSV contains wrong values,
    these lookups are the place to adjust.

    Args:
        html_content: The HTML document as a string.

    Returns:
        A dict that always contains every key in ``_HOUSE_FIELDS``:
        'boarea' and 'biarea' are ints (leading number of the text) or None;
        'totalarea' is their sum, just 'boarea' when 'biarea' is unknown, and
        None when 'boarea' is unknown; all other values are the stripped text
        from the page or None when the label is not found. 'closing_price' is
        kept as raw text, so it has to be converted to a number before it can
        be plotted numerically.
    """
    # Start with every key set to None so a missing label or a parsing error
    # never produces a row with missing columns.
    house_info = dict.fromkeys(_HOUSE_FIELDS)
    try:
        soup = BeautifulSoup(html_content, 'html.parser')

        house_info['date_of_sale'] = _label_value(soup, 'Såld')
        house_info['address'] = _label_value(soup, 'Adress')
        house_info['location'] = _label_value(soup, 'Ort')

        boarea = _leading_int(_label_value(soup, 'Boarea'))
        biarea = _leading_int(_label_value(soup, 'Biarea'))
        house_info['boarea'] = boarea
        house_info['biarea'] = biarea
        if boarea is not None and biarea is not None:
            house_info['totalarea'] = boarea + biarea
        else:
            # Without a usable 'Biarea' the total is just 'boarea'; without
            # 'boarea' the total is unknown and stays None instead of raising.
            house_info['totalarea'] = boarea

        house_info['rooms'] = _label_value(soup, 'Rum')
        house_info['plot_area'] = _label_value(soup, 'Tomt')
        house_info['closing_price'] = _label_value(soup, 'Pris')
    except Exception as e:
        # Best effort: report the problem and return what was collected.
        print(f"Error extracting data: {e}")
    return house_info
# List to hold all extracted house data
all_houses = []

# Process each HTML file
# NOTE(review): `html_files` and `html_folder` are not defined in the code
# shown here; they must be assigned before this loop (presumably the HTML file
# names from the extracted archive and the directory that holds them).
for file_name in html_files:
    file_path = os.path.join(html_folder, file_name)
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
        house_info = extract_house_info(html_content)
        all_houses.append(house_info)
    except Exception as e:
        # Best effort: report the file that failed and continue with the rest.
        print(f"Error processing file {file_name}: {e}")

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(all_houses)

# Save DataFrame to CSV file
df.to_csv('house_prices.csv', index=False)
print("Data has been written to 'house_prices.csv'.")
New contributor
MarreBoi is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.