Thiết kế website giá rẻ

Question

I want to scrape the first Wikipedia table from https://en.wikipedia.org/wiki/List_of_Eurovision_Song_Contest_host_cities. The difficulty lies in the fact that the table has merged cells (some of the entries have rowspans larger than 1).

So, for example the first entry in the Contests column is 9 and applies to the first 9 rows of the table (it has a rowspan of 9), so then when scraping the data and adding it to a pandas dataframe, I want the first 9 rows in the Contests column to have the entry ‘9’.

I have tried the following:

<code>import requests

from bs4 import BeautifulSoup

import pandas as pd

url = 'https://en.wikipedia.org/wiki/List_of_Eurovision_Song_Contest_host_cities'

response = requests.get(url)

soup = BeautifulSoup(response.content, "html.parser")

# Create an empty DataFrame with desired column headers

df = pd.DataFrame(columns=['Contests', 'Country', 'City', 'Venue', 'Year', 'Ref'])

for index, row in enumerate(soup.find_all('tr')):

if index == 0: # Skip the first header row

continue

cells = row.find_all(['td', 'th'])

country_value = None

if cells[0].has_attr('rowspan'):

contests_value = cells[0].get_text(strip=True)

contests_rowspan = int(cells[0]['rowspan'])

contests_values = [contests_value] * contests_rowspan # Replicate the value the required number of time

df = df.append(pd.DataFrame({'Contests': contests_values}), ignore_index=True)

if cells[1].has_attr('rowspan'):

country_value = cells[1].get_text(strip=True)

country_rowspan = int(cells[1]['rowspan'])

country_values = [country_value] * country_rowspan

df = df.append(pd.DataFrame({'Country': country_values}), ignore_index=True)

if cells[2].has_attr('rowspan'):

print(cells[2])

city_value = cells[2].get_text(strip=True)

city_rowspan = int(cells[2]['rowspan'])

city_values = [city_value] * city_rowspan

df = df.append(pd.DataFrame({'City': city_values}), ignore_index=True)

venue_value = cells[3].get_text(strip=True)

year_value = cells[4].get_text(strip=True)

ref_value = cells[5].get_text(strip=True)

for _ in range(max(contests_rowspan, country_rowspan, city_rowspan)):

df = df.append({'Venue': venue_value, 'Year': year_value, 'Ref': ref_value}, ignore_index=True)

df.head()

</code>

<code>import requests from bs4 import BeautifulSoup import pandas as pd url = 'https://en.wikipedia.org/wiki/List_of_Eurovision_Song_Contest_host_cities' response = requests.get(url) soup = BeautifulSoup(response.content, "html.parser") # Create an empty DataFrame with desired column headers df = pd.DataFrame(columns=['Contests', 'Country', 'City', 'Venue', 'Year', 'Ref']) for index, row in enumerate(soup.find_all('tr')): if index == 0: # Skip the first header row continue cells = row.find_all(['td', 'th']) country_value = None if cells[0].has_attr('rowspan'): contests_value = cells[0].get_text(strip=True) contests_rowspan = int(cells[0]['rowspan']) contests_values = [contests_value] * contests_rowspan # Replicate the value the required number of time df = df.append(pd.DataFrame({'Contests': contests_values}), ignore_index=True) if cells[1].has_attr('rowspan'): country_value = cells[1].get_text(strip=True) country_rowspan = int(cells[1]['rowspan']) country_values = [country_value] * country_rowspan df = df.append(pd.DataFrame({'Country': country_values}), ignore_index=True) if cells[2].has_attr('rowspan'): print(cells[2]) city_value = cells[2].get_text(strip=True) city_rowspan = int(cells[2]['rowspan']) city_values = [city_value] * city_rowspan df = df.append(pd.DataFrame({'City': city_values}), ignore_index=True) venue_value = cells[3].get_text(strip=True) year_value = cells[4].get_text(strip=True) ref_value = cells[5].get_text(strip=True) for _ in range(max(contests_rowspan, country_rowspan, city_rowspan)): df = df.append({'Venue': venue_value, 'Year': year_value, 'Ref': ref_value}, ignore_index=True) df.head() </code>

import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://en.wikipedia.org/wiki/List_of_Eurovision_Song_Contest_host_cities'
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

# Create an empty DataFrame with desired column headers
df = pd.DataFrame(columns=['Contests', 'Country', 'City', 'Venue', 'Year', 'Ref'])

for index, row in enumerate(soup.find_all('tr')):
    if index == 0:  # Skip the first header row
        continue

    cells = row.find_all(['td', 'th'])
    
    country_value = None
    if cells[0].has_attr('rowspan'):
        contests_value = cells[0].get_text(strip=True)
        contests_rowspan = int(cells[0]['rowspan'])
        contests_values = [contests_value] * contests_rowspan # Replicate the value the required number of time
        df = df.append(pd.DataFrame({'Contests': contests_values}), ignore_index=True)

    if cells[1].has_attr('rowspan'):
        country_value = cells[1].get_text(strip=True)
        country_rowspan = int(cells[1]['rowspan'])
        country_values = [country_value] * country_rowspan
        df = df.append(pd.DataFrame({'Country': country_values}), ignore_index=True)

    if cells[2].has_attr('rowspan'):
        print(cells[2])
        city_value = cells[2].get_text(strip=True)
        city_rowspan = int(cells[2]['rowspan'])
        city_values = [city_value] * city_rowspan
        df = df.append(pd.DataFrame({'City': city_values}), ignore_index=True)
    
    venue_value = cells[3].get_text(strip=True)
    year_value = cells[4].get_text(strip=True)
    ref_value = cells[5].get_text(strip=True)
    
    for _ in range(max(contests_rowspan, country_rowspan, city_rowspan)):
            df = df.append({'Venue': venue_value, 'Year': year_value, 'Ref': ref_value}, ignore_index=True)

df.head()

The problem with this is that in the first row the cells[0] corresponds to the Contests, cells[1] to Country and cells[2] to the City. However, since these 3 entries all have a rowspan larger than 1, they are not included in the second row HTML code, and so now in the second row cells[0] corresponds to Venue, cells[1] to Year and cells[2] to Ref. Note that my rowspans for Contests, Country and City are not always the same.

I am not sure how to fix this.

Thiết kế website giá rẻ

Danh mục

Webscraping a table with rowspans larger than 1