I’m working on a web scraper to collect Facebook post comments for analytics purposes.
On Facebook, after logging in, we can scroll the post page to get all the comments, which are loaded dynamically as the page scrolls. Unfortunately, I can’t get the page to scroll in headless
mode, though it works in non-headless mode.
I have referred to the following posts – Post 1 Post 2
Here’s my code
import datetime
import re
import time

import yake
from decouple import config
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# Browser setup: a headless Chrome instance with a fixed window size and a
# desktop Chrome user-agent string.
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"

options = Options()
# Flags are applied in the order listed.
for chrome_flag in (
    '--disable-gpu-sandbox',
    '--disable-gpu',
    '--disable-software-rasterizer',
    '--disable-dev-shm-usage',
    '--no-sandbox',
    "--window-size=1280,700",
    "--headless=new",
    f"--user-agent={user_agent}",
):
    options.add_argument(chrome_flag)

driver = webdriver.Chrome(options=options)
# --- Log in to Facebook, then open the target post -------------------------
driver.get("https://www.facebook.com/")

# Wait (up to 10 s) for the login form to be present before using its fields.
email_input = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "email"))
)
password_input = driver.find_element(By.ID, "pass")

# Credentials are read through config() from FB_EMAIL_INPUT / FB_PASSWORD_INPUT.
email_input.send_keys(config("FB_EMAIL_INPUT"))
password_input.send_keys(config("FB_PASSWORD_INPUT"))
password_input.send_keys(Keys.RETURN)  # submit the form with Enter
time.sleep(1)

try:
    # The presence of the "Your profile" element is used as the login-success
    # signal.
    profile = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//div[@aria-label='Your profile']"))
    )
except (TimeoutException, NoSuchElementException):
    # WebDriverWait.until() raises TimeoutException when the condition is not
    # met in time; catching only NoSuchElementException let a failed login
    # crash the script instead of reporting it.
    print("Login failed")
else:
    print("Login successful")

POST_URL = "https://www.facebook.com/thebetterindia/posts/pfbid025Yo2f5Qsd8NDL4AoFoHuvjeAURiRVc7rQ4uZBbULMuUWCfZ9NURRfeVha7aPpnn3l"
driver.get(POST_URL)
def infinite_scroll(driver, timeout=10, max_scrolls=None):
    """Scroll the window to the bottom until the page height stops growing.

    After each scroll the function sleeps for ``timeout`` seconds and then
    re-reads ``document.body.scrollHeight``; it stops as soon as two
    consecutive measurements are equal.

    Args:
        driver: Object exposing ``execute_script(script)``, such as a
            Selenium WebDriver.
        timeout: Seconds to sleep after each scroll before re-measuring.
        max_scrolls: Optional upper bound on the number of scroll attempts.
            ``None`` (the default) keeps the original unbounded behaviour;
            a bound prevents an endless loop on pages that keep growing.

    Returns:
        None.
    """
    # NOTE(review): this scrolls the window only. If the comments live in a
    # nested scrollable container (e.g. a dialog), the body height may never
    # change and the loop will exit after the first pass -- confirm against
    # the rendered page in headless mode.
    height_script = "return document.body.scrollHeight"
    scroll_script = "window.scrollTo(0, document.body.scrollHeight);"

    last_height = driver.execute_script(height_script)
    scrolls_done = 0
    while max_scrolls is None or scrolls_done < max_scrolls:
        driver.execute_script(scroll_script)
        scrolls_done += 1
        # Give dynamically loaded content time to arrive before measuring.
        time.sleep(timeout)
        new_height = driver.execute_script(height_script)
        if new_height == last_height:
            break
        last_height = new_height
# Scroll the opened post page with a 2-second pause between scrolls; any
# error raised while scrolling is reported on stdout rather than propagated.
try:
    infinite_scroll(driver, timeout=2)
except Exception as scroll_error:
    print(f"An exception occurred: {scroll_error}")