I want to scrape this website, https://www.thesoldiersproject.org/which-exo-members-are-in-the-military/, to retrieve each member's name, enlistment date, and discharge date. But when I run the code below, it only saves the header row to the CSV file.
This is my code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Scrape the article, collect one [name, enlistment date, discharge date] row
# per matching line, and write the rows to exo_military_dates.csv.

# Define the URL
url = "https://www.thesoldiersproject.org/which-exo-members-are-in-the-military/"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the relevant section containing the members' information
    content = soup.find('div', class_='entry-content')

    # Initialize a list to store the data
    data = []

    # soup.find() returns None when nothing matches; treat that as "no
    # paragraphs" instead of failing on None.find_all.
    paragraphs = content.find_all('p') if content is not None else []

    # Loop through the paragraphs and extract dates
    for paragraph in paragraphs:
        # Split on real newlines ('\n'); splitting on the letter 'n' would
        # chop every word containing an "n".
        for line in paragraph.get_text().split('\n'):
            # Only lines that carry both labels can yield a full row.
            if 'Enlistment date:' not in line or 'Discharge date:' not in line:
                continue

            # Expected shape: "<name> - Enlistment date: <d1>, Discharge date: <d2>"
            parts = line.split(' - ')
            if len(parts) != 2:
                continue

            name = parts[0].strip()
            details = parts[1]

            # Take everything between the two labels so a date that itself
            # contains a comma ("May 7, 2019") is not cut at that comma; then
            # drop the comma that separates the two fields.
            enlistment_date = (
                details.partition('Enlistment date:')[2]
                .partition('Discharge date:')[0]
                .strip()
                .rstrip(',')
                .strip()
            )
            discharge_date = details.partition('Discharge date:')[2].strip()

            data.append([name, enlistment_date, discharge_date])

    # Convert the data into a pandas DataFrame
    df = pd.DataFrame(data, columns=['Name', 'Enlistment Date', 'Discharge Date'])

    # Save the DataFrame to a CSV file (only the header row is written when
    # no line matched)
    df.to_csv('exo_military_dates.csv', index=False)
else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)
Expected output: the data stored in the CSV file should contain each member's name, enlistment date, and discharge date.
Zoeyyyy is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
4
I’d probably use a regular expression to parse the enlistment/discharge dates:
import re
import requests
from bs4 import BeautifulSoup
url = "https://www.thesoldiersproject.org/which-exo-members-are-in-the-military/"

# Without a timeout, requests.get() can wait on an unresponsive server
# indefinitely; 30 seconds is an arbitrary but generous limit.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of parsing an error page.
response.raise_for_status()

soup = BeautifulSoup(response.content, "html.parser")
def get_text(title):
    """Return the text of the run of <p> siblings that follows *title*.

    Starts at ``title.find_next_sibling("p")`` and keeps walking forward with
    ``find_next_sibling()`` for as long as the node reached is named ``"p"``.
    The collected ``.text`` values are joined with single spaces.

    Args:
        title: A node exposing ``find_next_sibling`` (here, an <h3> tag).

    Returns:
        The joined paragraph text, or ``""`` when no <p> sibling is found.
    """
    out = []
    p = title.find_next_sibling("p")
    # Stop at the end of the siblings or at the first non-<p> node.
    while p is not None and p.name == "p":
        out.append(p.text)
        p = p.find_next_sibling()
    return " ".join(out)
# Print a fixed-width table: one row per member heading found in the page.
print(f"{'NAME':<20} {'START':<20} {'END':<20}")
print("-" * 62)

# Matches headings whose string starts with one or more digits.
numbered_heading = re.compile(r"^\d+")

# Group 1: the first "<word> <day>, <4-digit year>" after an enlistment keyword.
# Group 2 (optional): a later "<word> <day>, <year>" or "<word> <4-digit year>"
# that follows a discharge keyword.
service_dates = re.compile(
    r"(?:enlisted|enlistment|started|began).*?(\S+ \d+, \d{4})"
    r"(?:.*(?:discharge|back|finish).*?(\S+ \d+, \d+|\S+ \d{4}))?"
)

for h3 in soup.find_all(name="h3", string=numbered_heading):
    # Drop the first whitespace-separated token (the leading number).
    name = h3.text.split(maxsplit=1)[-1]
    m = service_dates.search(get_text(h3))
    if m is None:
        # No enlistment date in this section: print placeholders rather than
        # raising TypeError on m[1].
        start, end = "-", "-"
    else:
        # Group 2 is None when no discharge date was found.
        start, end = m[1], m[2] or "-"
    print(f"{name:<20} {start:<20} {end:<20}")
Prints:
NAME START END
--------------------------------------------------------------
Xiumin May 7, 2019 December 6, 2020
D.O July 1, 2019 January 25, 2020
Suho May 14, 2020 February 14, 2022
Chen October 26, 2020 April 25, 2022
Chanyeol March 29, 2021 September 2022
Baekhyun May 6, 2021 February 5, 2023
Kai May 11, 2023 May 11, 2025
Sehun December 21, 2023 -
2