Develop a web scraper capable of extracting paper titles, author lists,
publication dates, and abstracts from PubMed for the keyword “Breast Cancer”
within the time window 06/01/2023 – 12/31/2023.
▪ Save the retrieved data in CSV format for later use
It is not working properly, and I am not sure how to fix it or what to do. Any help is greatly appreciated! 🙂
Install the dependencies first (run this in a shell, not inside Python): pip install requests beautifulsoup4
import requests
from bs4 import BeautifulSoup
import csv
# --- Search configuration -------------------------------------------------
# Search keyword and the date window (YYYY/MM/DD strings) that main() hands
# to fetch_article_links().
QUERY = "Breast Cancer"
START_DATE = "2023/06/01"
END_DATE = "2023/12/31"

# --- HTTP configuration ---------------------------------------------------
# Address queried for search results, and the headers attached to every
# request the scraper sends.
SEARCH_URL = "https://pubmed.ncbi.nlm.nih.gov/"
HEADERS = {"User-Agent": "Mozilla/5.0"}
def fetch_article_links(query, start_date, end_date, max_pages=1):
    """Collect article links from PubMed search result pages.

    The publication-date window is written into the search term itself using
    PubMed's ``[dp]`` (Date - Publication) range syntax. ``mindate`` and
    ``maxdate`` are parameters of the E-utilities ``esearch`` API; the web
    search page is not documented to honor them, so relying on them can
    return results from outside the requested window.

    Args:
        query: Free-text search term, e.g. ``"Breast Cancer"``.
        start_date: First publication date to include, as ``YYYY/MM/DD``.
        end_date: Last publication date to include, as ``YYYY/MM/DD``.
        max_pages: Maximum number of result pages to request. The default of
            1 keeps the original single-request behavior; raise it to walk
            further into the result list.

    Returns:
        A list with the ``href`` value of every ``a.docsum-title`` anchor
        found, in page order.

    Raises:
        requests.HTTPError: If a search request returns an error status.
        requests.RequestException: On timeouts or connection problems.
    """
    params = {
        "term": f"({query}) AND ({start_date}:{end_date}[dp])",
    }
    article_links = []
    for page in range(1, max_pages + 1):
        params["page"] = page
        response = requests.get(
            SEARCH_URL, params=params, headers=HEADERS, timeout=30
        )
        # Fail loudly instead of parsing an error page into an empty result.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # href=True skips anchors without a link, so a['href'] cannot fail.
        page_links = [
            a['href']
            for a in soup.find_all('a', class_='docsum-title', href=True)
        ]
        if not page_links:
            # No results on this page: nothing further to fetch.
            break
        article_links.extend(page_links)
    return article_links
def fetch_article_details(article_link):
    """Download one article page and extract its bibliographic fields.

    Elements that are missing from the page (for example, a record with no
    abstract) produce an empty string instead of raising ``AttributeError``,
    so one absent field no longer discards the whole article.

    Args:
        article_link: Absolute URL of the article page.

    Returns:
        A dict with the keys ``"Title"``, ``"Authors"`` (comma-separated),
        ``"Publication Date"`` and ``"Abstract"``; every value is a string.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        requests.RequestException: On timeouts or connection problems.
    """
    response = requests.get(article_link, headers=HEADERS, timeout=30)
    # An error page would otherwise be parsed as an article with empty fields.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    def clean_text(node):
        # Collapse runs of whitespace/newlines so each field is a single
        # tidy line in the CSV; a missing node becomes an empty string.
        if node is None:
            return ""
        return " ".join(node.get_text().split())

    title = clean_text(soup.find('h1', class_='heading-title'))
    author_names = (
        clean_text(a) for a in soup.find_all('a', class_='full-name')
    )
    # NOTE(review): the author list appears to be rendered more than once on
    # some pages; dict.fromkeys drops repeats while keeping first-seen order.
    authors = [name for name in dict.fromkeys(author_names) if name]
    pub_date = clean_text(soup.find('span', class_='cit'))
    abstract = clean_text(soup.find('div', class_='abstract-content'))

    return {
        "Title": title,
        "Authors": ", ".join(authors),
        "Publication Date": pub_date,
        "Abstract": abstract
    }
def save_to_csv(data, filename='pubmed_breast_cancer.csv'):
    """Write article records to a UTF-8 CSV file with a header row.

    Args:
        data: Iterable of dicts keyed by ``"Title"``, ``"Authors"``,
            ``"Publication Date"`` and ``"Abstract"``. An empty iterable
            produces a file containing only the header.
        filename: Destination path; an existing file is overwritten.

    Raises:
        ValueError: If a record contains a key outside the four columns
            (the ``csv.DictWriter`` default for unexpected keys).
        OSError: If the file cannot be opened for writing.
    """
    fieldnames = ["Title", "Authors", "Publication Date", "Abstract"]
    # newline='' lets the csv module control line endings, which keeps
    # multi-line field values intact and avoids blank rows on Windows.
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
def main():
    """Run the scrape: search, fetch each article, and write the CSV.

    Articles whose page cannot be fetched or parsed are reported on stdout
    and skipped; the remaining records are still saved.
    """
    import time  # local import: only this entry point needs it

    article_links = fetch_article_links(QUERY, START_DATE, END_DATE)
    if not article_links:
        # Make an empty result visible instead of silently writing a
        # header-only file.
        print("No article links found; the CSV will contain only the header row.")

    base_url = "https://pubmed.ncbi.nlm.nih.gov"
    data = []
    for link in article_links:
        full_link = f"{base_url}{link}"
        try:
            article_details = fetch_article_details(full_link)
        except Exception as e:
            # Top-level boundary: report the failure and move on so one bad
            # page does not abort the whole run.
            print(f"Error fetching details for {full_link}: {e}")
        else:
            data.append(article_details)
        # Pause between requests to keep the request rate low and reduce
        # the chance of being throttled by the server.
        time.sleep(0.4)

    save_to_csv(data)
    print(f"Saved {len(data)} of {len(article_links)} articles.")
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
New contributor
LuluC is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.