I first scraped the basic data using bs4:
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
def scrape_funding_data(url):
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to fetch data from {url}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    rounds_data = []
    round_containers = soup.find_all('div', class_='group relative flex flex-col gap-y-4 rounded-2xl p-4 shadow-md hover:bg-gray-100 active:bg-gray-200 dark:bg-zinc-800 dark:shadow-black dark:hover:bg-zinc-700 dark:active:bg-zinc-600')

    # go through each round card on the page
    for round_container in round_containers:
        data = {}
        data['source_link'] = url

        # round type
        round_type_div = round_container.find('div', class_='mb-1 flex flex-wrap items-center space-y-1')
        if round_type_div:
            round_type_h2 = round_type_div.find('h2', class_='mr-2 text-left text-lg font-bold')
            data['round_type'] = round_type_h2.get_text(strip=True) if round_type_h2 else None
        else:
            data['round_type'] = None

        # extra text, joined into one string
        extra_info_div = round_container.find('div', class_='mr-18 flex flex-row flex-wrap gap-x-4 gap-y-1')
        if extra_info_div:
            spans = extra_info_div.find_all('span', class_='text-gray-900 dark:text-white')
            data['all_info'] = " | ".join(span.get_text(strip=True) for span in spans)
        else:
            data['all_info'] = None

        # ROI USD
        roi_usd_div = round_container.find('div', class_='ml-auto mr-4 hidden flex-col items-end sm:flex')
        if roi_usd_div:
            roi_usd_span = roi_usd_div.find('span', class_='text-base font-semibold')
            data['roi_usd'] = roi_usd_span.get_text(strip=True) if roi_usd_span else None
        else:
            data['roi_usd'] = None

        rounds_data.append(data)

    return rounds_data
all_data_list = []

# scrape every page (links_list is the list of fundraising URLs I collected earlier)
for i, url in enumerate(links_list):
    # print(f"Scraping data from link {i + 1}: {url}")
    rounds_data = scrape_funding_data(url)
    if rounds_data:
        all_data_list.extend(rounds_data)

df3 = pd.DataFrame(all_data_list)
print(df3)
And now I'm trying to use Selenium to scrape the data that only appears after you push a button on each round, but I'm quite inexperienced with Selenium and am having trouble locating the element to scrape the data from. I'm running Selenium in Google Colab and locating by XPath, but it can't seem to find the element. This is what I've tried:
import google_colab_selenium as gs
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Instantiate options
options = Options()

# Add extra options
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

URL = "https://dropstab.com/coins/centrifuge/fundraising"

driver = gs.Chrome(options=options)
driver.maximize_window()
driver.get(URL)

button = driver.find_element(by=By.XPATH, value='/html/body/div/div[1]/div/div[2]/main/div/article/div/div/section/div/div[1]/section[1]/div/div[1]/button')
button.click()
And I've tried a couple of other XPaths, full XPaths, etc., and it just cannot locate the element. I'm trying to scrape all the additional data that appears after you click on the fundraising rounds (https://dropstab.com/coins/centrifuge/fundraising).
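From what I've read, I probably need an explicit wait (since the page is rendered by JavaScript) plus a relative locator instead of the full XPath, so I also tried a variant along these lines. The 'section button' CSS selector is just my guess at the markup based on the container classes from the bs4 code above, which may well be where I'm going wrong:

import google_colab_selenium as gs
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = gs.Chrome(options=options)

driver.get("https://dropstab.com/coins/centrifuge/fundraising")

# wait up to 20s for the round buttons to be rendered by the page's JS
# NOTE: 'section button' is an assumed selector, not one I've confirmed in the DOM
wait = WebDriverWait(driver, 20)
buttons = wait.until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'section button'))
)

# click each round's button to expand the extra details
for button in buttons:
    # JS click, since a normal .click() can raise "not interactable" in headless mode
    driver.execute_script("arguments[0].click();", button)

# hand the expanded page back to bs4 for parsing
soup = BeautifulSoup(driver.page_source, 'html.parser')

But that wait just times out, so either the selector is wrong or the buttons live somewhere else in the DOM.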