I am trying to load the videos page of a YouTube channel and parse it to extract recent video information.
I want to avoid using the API since it has a daily usage quota. The problem I'm having is that Selenium does not seem to load the full HTML of the page when printing `driver.page_source`.
When there are 10 comments, only 3 are parsed, and the process stops.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import pandas as pd
# Configure Chrome so the browser window stays open after the script ends
# ("detach"), then start a driver whose binary is resolved automatically
# by webdriver_manager, and load the target page.
options = Options()
options.add_experimental_option("detach", True)
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)
driver.get('https://www.youtube.com/example')
# Repeatedly scroll to the bottom of the page so YouTube's lazy loader
# fetches more content; stop once the document height no longer grows
# (i.e. nothing new was appended after the last scroll).
# NOTE(review): the pasted code had lost the loop body's indentation,
# which is a syntax error in Python — restored here.
scroll_pause_time = 2  # seconds to wait for lazily-loaded content to render
last_height = driver.execute_script("return document.documentElement.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
    time.sleep(scroll_pause_time)  # give the lazy loader time to append items
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height:
        # Height unchanged after a full pause: no more content is coming.
        break
    last_height = new_height
# Return to the top before interacting with per-comment controls.
driver.execute_script("window.scrollTo(0, 0);")
time.sleep(scroll_pause_time)
# Expand every hidden "more replies" section, then snapshot the fully
# expanded DOM for BeautifulSoup parsing.
#
# Fix 1: the original XPath drilled into yt-touch-feedback-shape's inner
# <div>s, which are presentational and not reliably clickable; target the
# actual <button> element instead.
# Fix 2: a native .click() raises ElementClickInterceptedException when
# the button is off-screen or covered by an overlay — the likely reason
# only the first few comments were expanded. Scroll each button into view
# first, and fall back to a JavaScript click, which bypasses interception.
buttons = driver.find_elements(By.XPATH, '//*[@id="more-replies"]//button')
for button in buttons:
    try:
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", button)
        time.sleep(1)  # let sticky headers/overlays settle before clicking
        button.click()
    except Exception:
        try:
            # JS click works even when another element would intercept it.
            driver.execute_script("arguments[0].click();", button)
        except Exception as e:
            print(f"Failed to click a button: {e}")
# Give the last expansions a moment to render before reading the DOM.
# Note the attribute is page_source (with underscore), not "pagesource".
time.sleep(scroll_pause_time)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
driver.quit()
New contributor
doob is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.