I want to extract elements from a website that loads more elements as you scroll. To do this, I wrote two pieces of code. I assumed that using Selenium would let me find more elements than a plain request, but I ran into the following problem.
I can find 24 elements (in the result list) when using this code:
<code>import requests
from bs4 import BeautifulSoup

respond = requests.get('https://website.com')
soup = BeautifulSoup(respond.text, 'html.parser')
result = soup.find_all('div', attrs={'class': 'post-list__widget-col-c1444'})
print(len(result))
</code>
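To confirm that the static HTML really contains that many items, a quick sanity check (a sketch; it counts raw occurrences of the class name in the response text, so it can over-count if the string appears elsewhere) is:

<code>import requests

# Sanity check (sketch): count raw occurrences of the class name in the
# server-rendered HTML, independent of any parser.
respond = requests.get('https://website.com')
print(respond.text.count('post-list__widget-col-c1444'))
</code>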
But with Selenium, I can find only 6 elements (in the tags list):
<code>from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import time

options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
driver.get('https://website.com')

# Initial wait for the page to load
time.sleep(5)

# Scroll until no new content is loaded
last_height = driver.execute_script("return document.body.scrollHeight")
scroll_pause_time = 5  # Adjust the pause time as needed

while True:
    # Scroll down to the bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait for new content to load
    time.sleep(scroll_pause_time)
    # Compare the new scroll height with the last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# After loading all items, find all elements
tags = driver.find_elements(By.CLASS_NAME, "post-list__widget-col-c1444")
print(len(tags))
driver.quit()
</code>
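To narrow down whether the problem is the locator or the DOM itself, a minimal diagnostic (a sketch; it assumes the `driver` from the script above and must run before `driver.quit()`) is to feed the HTML that Selenium actually renders into the same BeautifulSoup call:

<code>from bs4 import BeautifulSoup

# Diagnostic sketch: parse the DOM Selenium actually rendered with the same
# BeautifulSoup selector used in the requests version. Assumes the `driver`
# from the script above, run before driver.quit().
soup = BeautifulSoup(driver.page_source, 'html.parser')
result = soup.find_all('div', attrs={'class': 'post-list__widget-col-c1444'})
print(len(result))  # same count as find_elements => DOM issue, not the locator
</code>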
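In case jumping straight to the bottom skips the site's lazy-load trigger, scrolling in smaller steps might behave differently; a sketch of that approach (assuming the same `driver` setup as above):

<code>import time

# Sketch: scroll in viewport-sized steps instead of jumping to the bottom,
# in case the lazy loader only fires on intermediate scroll positions.
# Assumes the same `driver` setup as above.
viewport = driver.execute_script("return window.innerHeight")
position = 0
while position < driver.execute_script("return document.body.scrollHeight"):
    position += viewport
    driver.execute_script("window.scrollTo(0, arguments[0]);", position)
    time.sleep(1)  # give newly loaded items a moment to render
</code>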
What is wrong with my code?

I have tested this on different websites, but I still don't understand what the problem is. I expected Selenium to find all the elements that exist on the page. Thank you in advance for any guidance you can provide.