I can’t get the full HTML of this page, which I want to convert to PDF. I tried Selenium, Puppeteer and Splash, and none of them worked.
In every case I only get a short HTML document that has nothing to do with the content of the pages displayed in the browser.
Selenium
from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
from selenium.webdriver.edge.service import Service
# from selenium.webdriver.chrome.options import Options
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import time
# Load an SEC inline-XBRL viewer page in Microsoft Edge and print the HTML of
# the rendered document.
#
# The Options and Service classes imported above are the Edge ones and the
# driver binary is msedgedriver, so the session must be created with
# webdriver.Edge (not webdriver.Chrome).
edge_options = Options()
# Pass this switch exactly once and with its value: repeating it later as a
# bare "--disable-blink-features" can replace the value set here.
edge_options.add_argument("--disable-blink-features=AutomationControlled")
edge_options.add_argument("--start-maximized")
# Present a regular desktop Edge user-agent.
# NOTE(review): sec.gov publishes rules for automated access that ask clients
# to identify themselves in the User-Agent header -- confirm this value
# complies before running this at any volume.
edge_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0")
edge_options.add_argument("--disable-extensions")
edge_options.add_argument("--disable-plugins-discovery")
edge_options.add_argument("--profile-directory=Default")
edge_options.add_argument("--incognito")
edge_options.add_argument("--disable-dev-shm-usage")
edge_options.add_argument("--no-sandbox")
edge_options.add_argument("--remote-debugging-port=9222")

# Start the driver.
service = Service('msedgedriver.exe')
driver = webdriver.Edge(service=service, options=edge_options)
try:
    # Hide navigator.webdriver. A plain execute_script() here would only patch
    # the initial blank document and be discarded by the navigation below, so
    # the script is registered through the DevTools protocol to run at the
    # start of every new document instead.
    driver.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
    )
    # Extra HTTP headers (optional): Selenium has no general API for setting
    # them; a browser extension or a proxy can be used if they are needed.

    # Navigate to the URL.
    # NOTE(review): the "ix?doc=" viewer appears to render the filing
    # client-side; the filing itself presumably lives at the path given in the
    # doc= parameter -- consider requesting that document directly.
    driver.get('https://www.sec.gov/ix?doc=/Archives/edgar/data/0000726728/000072672824000047/o-20231231.htm')
    # Give the page's JavaScript some time to finish loading the content.
    time.sleep(5)
    # Grab the HTML of the page as currently rendered.
    html = driver.page_source
    print(html)
finally:
    # Always shut the browser and driver process down, even if a step failed.
    driver.quit()
I don’t know how to get the HTML of a page that is rendered by JavaScript. I need to extract several pages and convert them into a PDF. I also tried to locate a tag in the HTML, but that didn’t work: the site detects that my browser is being controlled by software, so it doesn’t serve the HTML in which I could find that tag, selector or XPath.
Marius Robert is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.