Thiết kế website giá rẻ

Question

1st problem :
I tried using merge to matching two csv files below based on album name, so the new csv file will contains the rank, album name, artist name, sales, release date, etc. But only few of the showing up. Then, i tried again using fuzzy wuzzy to match them, but it result in like for example, I have EXO-K MAMA album, it match will EXO OBSESSION album. I attach the code at the end. The release date after matching is also wrong.

2nd problem:
these codes below is used to webscraping album sales, but when i load until year 2022, it will stop because the website does not have exo’s album sales in this year and it does not stop loading (my laptop start lagging). Anyone can help me to fix the problem?
==========CODE FOR WEBSCRAPING EXO’s ALBUM SALES===============

import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

def scrape_album_sales(url, file_name, mode='a'):
    # Initialize WebDriver
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(url)

    try:
        # Wait until the table is loaded
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'ChartTablePC')))

        # Locate the table
        table = driver.find_element(By.CLASS_NAME, 'ChartTablePC')
        rows = table.find_elements(By.TAG_NAME, 'tr')

        # List of valid artist names
        valid_artists = ["EXO", "EXO-M", "EXO-K", "EXO-CBX", "세훈&찬열(EXO-SC)"]

        # Open CSV file for writing (append mode)
        with open(file_name, mode, newline='', encoding='utf-8') as outfile:
            writer = csv.writer(outfile)
            if mode == 'w':
                writer.writerow(['Album Rank on the year release', 'Album Name', 'Artist Name', 'Sales'])

            # Iterate over rows
            for row in rows[1:]:  # Skip the header row
                cells = row.find_elements(By.TAG_NAME, 'td')
                if len(cells) >= 5:  # Ensure there are enough columns
                    rank = cells[0].text.strip()
                    album_artist = cells[2].text.strip()
                    album_name, artist_name = album_artist.split('n')
                    sales = cells[3].get_attribute("textContent").strip()

                    # Check if the artist name is in the valid list
                    if artist_name.strip() in valid_artists:
                        data_out = [rank, album_name, artist_name, sales]
                        writer.writerow(data_out)
                        print(data_out)
            
    except Exception as e:
        print(f"An error occurred: {e}")
        
    finally:
        # Close the WebDriver
        driver.quit()

# Example usage
file_name = "EXOalbumSales.csv"
scrape_album_sales('https://circlechart.kr/page_chart/album.circle?nationGbn=T&targetTime=2012&hitYear=2012&termGbn=year&yearTime=3', file_name, mode='w')
scrape_album_sales('https://circlechart.kr/page_chart/album.circle?nationGbn=T&targetTime=2013&hitYear=2013&termGbn=year&yearTime=3', file_name, mode='a')
scrape_album_sales('https://circlechart.kr/page_chart/album.circle?nationGbn=T&targetTime=2014&hitYear=2014&termGbn=year&yearTime=3', file_name, mode='a')
scrape_album_sales('https://circlechart.kr/page_chart/album.circle?nationGbn=T&targetTime=2015&hitYear=2015&termGbn=year&yearTime=3', file_name, mode='a')
scrape_album_sales('https://circlechart.kr/page_chart/album.circle?nationGbn=T&targetTime=2016&hitYear=2016&termGbn=year&yearTime=3', file_name, mode='a')
scrape_album_sales('https://circlechart.kr/page_chart/album.circle?nationGbn=T&targetTime=2017&hitYear=2017&termGbn=year&yearTime=3', file_name, mode='a')
scrape_album_sales('https://circlechart.kr/page_chart/album.circle?nationGbn=T&targetTime=2018&hitYear=2018&termGbn=year&yearTime=3', file_name, mode='a')
scrape_album_sales('https://circlechart.kr/page_chart/album.circle?nationGbn=T&targetTime=2019&hitYear=2019&termGbn=year&yearTime=3', file_name, mode='a')
scrape_album_sales('https://circlechart.kr/page_chart/album.circle?nationGbn=T&targetTime=2020&hitYear=2020&termGbn=year&yearTime=3', file_name, mode='a')
scrape_album_sales('https://circlechart.kr/page_chart/album.circle?nationGbn=T&targetTime=2021&hitYear=2021&termGbn=year&yearTime=3', file_name, mode='a')
scrape_album_sales('https://circlechart.kr/page_chart/album.circle?nationGbn=T&targetTime=2022&hitYear=2022&termGbn=year&yearTime=3', file_name, mode='a')
scrape_album_sales('https://circlechart.kr/page_chart/album.circle?nationGbn=T&targetTime=2023&hitYear=2023&termGbn=year&yearTime=3', file_name, mode='a')

=======CODE FOR WEBSCRAPING EXO ALBUM NAME AND ITS RELEASE DATE===============

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv
from datetime import datetime

# Initialize WebDriver
driver = webdriver.Chrome()  # Make sure you have the appropriate web driver installed

# Open the URL of the webpage
url = "https://genius.com/artists/Exo/albums"
driver.get(url)

# Automatically scroll the page
scroll_pause_time = 2  # Pause between each scroll
screen_height = driver.execute_script("return window.screen.height;")  # Browser window height
i = 1
while True:
    # Scroll down
    driver.execute_script(f"window.scrollTo(0, {screen_height * i});")
    i += 1
    time.sleep(scroll_pause_time)

    # Check if reaching the end of the page
    scroll_height = driver.execute_script("return document.body.scrollHeight;")
    if screen_height * i > scroll_height:
        break

# Fetch the data using BeautifulSoup after all data is loaded
soup = BeautifulSoup(driver.page_source, "html.parser")
album_list = soup.find('div', class_='ListSectiondesktop__Container-sc-53xokv-0 jAmPRI')

# Open a CSV file to write the data
with open('exo_albums.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header row
    writer.writerow(['Album Name', 'Release Date'])

    print("Header added in csv")

    if album_list:
        # Find all li elements within the ul
        albums = album_list.find_all('li', class_='ListItem__Container-sc-122yj9e-0 eRBVjI')

        for album in albums:
            # Extract the album name
            album_name = album.find('h3', class_='ListItem__Title-sc-122yj9e-4 nknYf').text.strip()
            # Extract the release date
            release_date_text = album.find('div', class_='ListItem__Info-sc-122yj9e-5 iGHTNt').text.strip()

            # Convert the release date to dd-mm-yyyy format
            try:
                release_date = datetime.strptime(release_date_text, "%B %d, %Y").strftime("%d-%m-%Y")
            except ValueError:
                # Handle cases where the date format is different or missing
                release_date = release_date_text

            # Write the album data to the CSV file
            writer.writerow([album_name, release_date])

        print("Data has been written to exo_albums.csv")
    else:
        print("Album list not found on the page.")

# Close the WebDriver session
driver.quit()

==FUZZY WUZZY PROBLEM===

import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Function to clean and normalize album names
def clean_album_name(album_name):
    # Remove special characters, extra spaces, and convert to lowercase
    return album_name.replace("`", "").replace("'", "").replace("-", " ").strip().lower()

# Read the first CSV file into a DataFrame
df1 = pd.read_csv('exo_albums.csv')

# Read the second CSV file into a DataFrame
df2 = pd.read_csv('EXOalbumSales.csv')

# Clean and normalize album names in both DataFrames
df1['Cleaned Album Name'] = df1['Album Name'].apply(clean_album_name)
df2['Cleaned Album Name'] = df2['Album Name'].apply(clean_album_name)

# Function to find closest match using fuzzy matching
def find_closest_match(album_name, df):
    choices = df['Cleaned Album Name'].tolist()
    match = process.extractOne(album_name, choices, scorer=fuzz.token_sort_ratio)
    return match[0] if match else None

# Map album names from df2 to df1 based on closest match
df2['Matched Album Name'] = df2['Cleaned Album Name'].apply(lambda x: find_closest_match(x, df1))

# Merge the DataFrames on the matched album names
merged_df = pd.merge(df2, df1, how='left', left_on='Matched Album Name', right_on='Cleaned Album Name')

# Save the merged DataFrame to a new CSV file
merged_df.to_csv('combined_exo_albums.csv', index=False)

print("Combined data has been written to combined_exo_albums.csv")

the expected output should be two csv files are matches based on the ALBUM NAME, so the new data that append to new CSV file will have rank of the album, album name, artist name, sales, release date

Thiết kế website giá rẻ

Danh mục

using fuzzy wuzzy try to match 2 csv files but result in a lot of unmatches value