I scraped data from a website and I am having trouble cleaning it.
This is the code I scraped the data with. Is this best practice?
import requests
from bs4 import BeautifulSoup
import json

all_countries_links = []
countries = []
all_data = []
data_dict = {}
data_value = []

page1 = requests.get("https://data.un.org/")

def main(page):
    source = page.content
    soup = BeautifulSoup(source, 'lxml')
    all_page = soup.find("div", {"class": "CountryList"}).find_all('a', href=True)
    for link in all_page:
        all_countries_links.append(link['href'])
        countries.append(link.text.strip())

def scrape_country(all_countries_links, countries):
    for country in all_countries_links[:2]:
        page2 = requests.get(f"https://data.un.org/{country}")
        source = page2.content
        soup = BeautifulSoup(source, 'lxml')
        all_page = soup.find('ul', {'class': 'pure-menu-list'})
        tables = all_page.contents
        for table in tables:
            line = table.text.strip()
            all_data.append(line)

main(page1)
scrape_country(all_countries_links, countries)

file_path = "data.json"
with open(file_path, 'w') as f:
    json.dump(all_data, f, indent=4)
print(f"Data saved to {file_path}")
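For the best-practice part: I was also wondering whether it would be cleaner to have the functions return their results instead of appending to global lists, roughly like the sketch below. It is not tested against the live site, and it uses find_all('li') instead of .contents, so the output may differ slightly (for example, the empty whitespace entries should disappear).

import json
import requests
from bs4 import BeautifulSoup

def get_country_links(session):
    # Return (name, href) pairs instead of filling module-level lists.
    page = session.get("https://data.un.org/")
    soup = BeautifulSoup(page.content, 'lxml')
    anchors = soup.find("div", {"class": "CountryList"}).find_all('a', href=True)
    return [(a.text.strip(), a['href']) for a in anchors]

def scrape_country(session, href):
    # Return the text of each menu entry on one country page.
    page = session.get(f"https://data.un.org/{href}")
    soup = BeautifulSoup(page.content, 'lxml')
    menu = soup.find('ul', {'class': 'pure-menu-list'})
    return [li.get_text(strip=True) for li in menu.find_all('li', recursive=False)]

if __name__ == "__main__":
    with requests.Session() as session:
        links = get_country_links(session)
        all_data = [line for _, href in links[:2] for line in scrape_country(session, href)]
    with open("data.json", 'w') as f:
        json.dump(all_data, f, indent=4)

Either way, the strings I get back look essentially the same as below, so the cleaning question still applies.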
This is a small example of the data after collecting it:
[
    "",
    "General Information\n\nRegion\u00a0\n\u00a0\nSouthern Asia\nPopulation\u00a0(000, 2021)\n\u00a0\n39 835a\nPop. density\u00a0(per km2, 2021)\n\u00a0\n61a\nCapital city\u00a0\n\u00a0\nKabul\nCapital city pop.\u00a0(000, 2021)\n\u00a0\n4 114.0b\nUN membership date\u00a0\n\u00a0\n19-Nov-46\nSurface area\u00a0(km2)\n\u00a0\n652 864b\nSex ratio\u00a0(m per 100 f)\n\u00a0\n105.3a\nNational currency\u00a0\n\u00a0\nAfghani (AFN)\nExchange rate\u00a0(per US$)\n\u00a0\n77.1c",
]
I tried to separate the data with this code:

cleaned_data = []
# for line in cleaned_data:
#     print(line.split('\n'))
# new_data = [line for line in all_data.split()]
for line in all_data[:1]:
    for line2 in line.split():
        if line2 not in ["General", "Information", "Economic", " indicators", "Social", " indicators"]:
            cleaned_data.append(line2)
But I was hoping to find a better way.
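For example, since each block looks like a section title followed by label/value pairs separated by newlines (\n) and non-breaking spaces (\u00a0), I was thinking of parsing each block into a dictionary instead of filtering individual words. Here is a rough sketch of what I mean (not fully tested, and I am assuming the label/value pairing holds for every section):

import json

# Load the strings saved by the scraper above (data.json).
with open("data.json") as f:
    all_data = json.load(f)

parsed = []
for block in all_data:
    if not block:
        continue
    # Replace the non-breaking spaces, split on newlines, and drop the
    # filler entries that end up empty.
    parts = [p.replace('\u00a0', ' ').strip() for p in block.split('\n')]
    parts = [p for p in parts if p]
    section = parts[0]                      # e.g. "General Information"
    # After the title, every label seems to be followed by its value.
    fields = dict(zip(parts[1::2], parts[2::2]))
    parsed.append({"section": section, "fields": fields})

print(json.dumps(parsed, indent=4, ensure_ascii=False))

On the sample above this gives pairs like "Region": "Southern Asia" and "Population (000, 2021)": "39 835a", but I am not sure it is robust for the other sections.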