1st problem:
I tried using pandas merge to match the two CSV files below on album name, so that the new CSV file would contain the rank, album name, artist name, sales, release date, etc. However, only a few rows showed up. I then tried fuzzywuzzy to match them instead, but it produces wrong pairs — for example, my EXO-K "MAMA" album gets matched with EXO's "OBSESSION" album. The code is attached at the end. The release dates after matching are also wrong.
2nd problem:
The code below webscrapes album sales, but when it reaches year 2022 it gets stuck, because the website has no EXO album sales for that year and the page never finishes loading (my laptop starts lagging). Can anyone help me fix this?
==========CODE FOR WEBSCRAPING EXO’s ALBUM SALES===============
import csv

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def scrape_album_sales(url, file_name, mode='a'):
    """Scrape one year of Circle Chart album sales for EXO units into a CSV.

    Args:
        url: Circle Chart yearly album-chart URL to scrape.
        file_name: Path of the output CSV file.
        mode: 'w' to start a fresh file (writes the header row),
              'a' to append to an existing file.
    """
    # Initialize a headless Chrome WebDriver.
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=chrome_options)
    # Cap the page-load time so a year with no chart data (e.g. 2022) cannot
    # hang forever — this is what froze the original run.
    driver.set_page_load_timeout(30)
    try:
        driver.get(url)
        # Wait at most 10 s for the chart table; if it never appears a
        # TimeoutException is raised and handled below instead of spinning.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'ChartTablePC')))
        # Locate the table and its rows.
        table = driver.find_element(By.CLASS_NAME, 'ChartTablePC')
        rows = table.find_elements(By.TAG_NAME, 'tr')
        # List of valid artist names (EXO and its sub-units).
        valid_artists = ["EXO", "EXO-M", "EXO-K", "EXO-CBX", "세훈&찬열(EXO-SC)"]
        with open(file_name, mode, newline='', encoding='utf-8') as outfile:
            writer = csv.writer(outfile)
            if mode == 'w':
                writer.writerow(['Album Rank on the year release', 'Album Name', 'Artist Name', 'Sales'])
            for row in rows[1:]:  # Skip the header row
                cells = row.find_elements(By.TAG_NAME, 'td')
                if len(cells) >= 5:  # Ensure there are enough columns
                    rank = cells[0].text.strip()
                    album_artist = cells[2].text.strip()
                    # BUG FIX: the album/artist cell is separated by a newline
                    # character, not the letter 'n'; maxsplit=1 keeps album
                    # names that contain extra line breaks intact.
                    album_name, artist_name = album_artist.split('\n', 1)
                    sales = cells[3].get_attribute("textContent").strip()
                    # Only keep rows whose artist is an EXO unit.
                    if artist_name.strip() in valid_artists:
                        data_out = [rank, album_name, artist_name, sales]
                        writer.writerow(data_out)
                        print(data_out)
    except TimeoutException:
        # The chart table never appeared — this year has no data; skip it.
        print(f"No album chart found at {url}; skipping.")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Always release the browser, even on error.
        driver.quit()
# Example usage: scrape every yearly chart from 2012 through 2023 into one CSV.
file_name = "EXOalbumSales.csv"
_BASE_URL = ('https://circlechart.kr/page_chart/album.circle'
             '?nationGbn=T&targetTime={year}&hitYear={year}&termGbn=year&yearTime=3')
for _year in range(2012, 2024):
    # The first year creates the file with a header; later years append.
    scrape_album_sales(_BASE_URL.format(year=_year),
                       file_name,
                       mode='w' if _year == 2012 else 'a')
=======CODE FOR WEBSCRAPING EXO ALBUM NAME AND ITS RELEASE DATE===============
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv
from datetime import datetime
# Scrape EXO album names and release dates from Genius into exo_albums.csv.
driver = webdriver.Chrome()  # Make sure you have the appropriate web driver installed
url = "https://genius.com/artists/Exo/albums"
driver.get(url)

# Scroll one viewport at a time so lazily-loaded albums get rendered,
# stopping once the next scroll target passes the document's total height.
scroll_pause_time = 2  # Pause between each scroll
screen_height = driver.execute_script("return window.screen.height;")
step = 1
while True:
    driver.execute_script(f"window.scrollTo(0, {screen_height * step});")
    step += 1
    time.sleep(scroll_pause_time)
    total_height = driver.execute_script("return document.body.scrollHeight;")
    if screen_height * step > total_height:
        break

# Parse the fully-loaded page with BeautifulSoup.
soup = BeautifulSoup(driver.page_source, "html.parser")
album_list = soup.find('div', class_='ListSectiondesktop__Container-sc-53xokv-0 jAmPRI')

with open('exo_albums.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Album Name', 'Release Date'])
    print("Header added in csv")
    if album_list:
        # Each <li> is one album card with a title and a release-date line.
        for album in album_list.find_all('li', class_='ListItem__Container-sc-122yj9e-0 eRBVjI'):
            album_name = album.find('h3', class_='ListItem__Title-sc-122yj9e-4 nknYf').text.strip()
            raw_date = album.find('div', class_='ListItem__Info-sc-122yj9e-5 iGHTNt').text.strip()
            try:
                # Convert e.g. "April 9, 2012" to "09-04-2012".
                release_date = datetime.strptime(raw_date, "%B %d, %Y").strftime("%d-%m-%Y")
            except ValueError:
                # Keep the raw text when the date is missing or oddly formatted.
                release_date = raw_date
            writer.writerow([album_name, release_date])
        print("Data has been written to exo_albums.csv")
    else:
        print("Album list not found on the page.")

# Close the WebDriver session.
driver.quit()
==FUZZY WUZZY PROBLEM===
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
def clean_album_name(album_name):
    """Normalize an album title for fuzzy matching.

    Strips backticks and apostrophes, turns hyphens into spaces, collapses
    every run of whitespace to a single space, and lowercases the result,
    so that e.g. "EXO-K  `MAMA`" and "exo k mama" compare equal.
    """
    cleaned = album_name.replace("`", "").replace("'", "").replace("-", " ")
    # BUG FIX: the original only .strip()-ed the ends, leaving internal
    # double spaces created by the hyphen replacement, which skewed the
    # fuzzy scores; split()/join collapses all interior whitespace too.
    return " ".join(cleaned.split()).lower()
# Load both scraped CSVs and give each a normalized title column so they
# can be fuzzy-matched on album name.
df1 = pd.read_csv('exo_albums.csv')      # album name + release date (Genius)
df2 = pd.read_csv('EXOalbumSales.csv')   # rank + album + artist + sales (Circle Chart)
for frame in (df1, df2):
    frame['Cleaned Album Name'] = frame['Album Name'].apply(clean_album_name)
def find_closest_match(album_name, df, min_score=80):
    """Return the closest fuzzy match to album_name from df, or None.

    Args:
        album_name: Cleaned album title to look up.
        df: DataFrame with a 'Cleaned Album Name' column of candidates.
        min_score: Minimum token_sort_ratio (0-100) a candidate must reach.

    BUG FIX: the original accepted the best match no matter how weak, which
    is why 'MAMA' got paired with 'OBSESSION'. With score_cutoff, extractOne
    returns None when the best candidate scores below min_score, so unrelated
    titles stay unmatched instead of being forced together.
    """
    choices = df['Cleaned Album Name'].tolist()
    match = process.extractOne(album_name, choices,
                               scorer=fuzz.token_sort_ratio,
                               score_cutoff=min_score)
    return match[0] if match else None
# Attach to every sales row the closest-matching Genius album title, then
# left-join the release-date table on that match and save the result.
df2['Matched Album Name'] = df2['Cleaned Album Name'].apply(
    lambda name: find_closest_match(name, df1))
merged_df = df2.merge(df1, how='left',
                      left_on='Matched Album Name',
                      right_on='Cleaned Album Name')
merged_df.to_csv('combined_exo_albums.csv', index=False)
print("Combined data has been written to combined_exo_albums.csv")
The expected output is the two CSV files matched on ALBUM NAME, so that each row appended to the new CSV file has the album's rank, album name, artist name, sales, and release date.