Python version: 3.11.8
Hi everyone:
Goal: Retrieve the cookies and HTML content from the following website, which is protected by a CAPTCHA. Here is an example URL:
https://www.toasttab.com/local/order/rossi-s-bar-grill-2110-whitehorse-mercerville-rd/r-19ab91e0-8c7c-472c-ba17-dda9e7b95132.
My attempt: I implemented the following code with Selenium and SeleniumBase to achieve this goal:
def open_website(driver,
                 url: str):
    """Load ``url`` in ``driver`` after a short, randomised pause.

    Args:
        driver: Browser driver object exposing a ``get(url=...)`` method.
        url: Address of the page to open.
    """
    # Sleep for a uniformly random 1-3 seconds before requesting the page.
    sleep(random.uniform(1, 3))
    driver.get(url=url)
def click_captcha(driver,
                  wait: int):
    """Enter the page's iframe, click the ``span.mark`` element, and leave it.

    The clicked element is presumably the CAPTCHA checkbox (verify against
    the target page).

    Args:
        driver: Browser driver exposing ``switch_to_frame``, ``uc_click`` and
            ``switch_to_default_content``.
        wait: Accepted for interface compatibility; not used by this function.
    """
    frame_selector = 'iframe'
    checkbox_selector = "span.mark"

    driver.switch_to_frame(frame_selector)
    driver.uc_click(checkbox_selector, reconnect_time=3)
    # Return to the top-level document so later lookups are not scoped to
    # the iframe.
    driver.switch_to_default_content()
@retry(reraise=True, wait=wait_random(min=1,max=5), stop=stop_after_attempt(2))
def get_with_selenium(url: str,
                      platform: str,
                      headless: bool,
                      wait: int,
                      user_agents: dict,
                      screenshots_folder_path: str) -> dict:
    """Open ``url`` in a fresh browser and return its cookies and HTML.

    A new driver is created with a user agent picked at random from
    ``user_agents['agents']``. After the page is opened, the function looks
    for a challenge element (``id="challenge-running"``) or a Cloudflare
    iframe; when one is found the page is refreshed. Cookies and page source
    are then read from the driver.

    The ``retry`` decorator is configured with ``stop_after_attempt(2)``,
    ``wait_random(min=1, max=5)`` and ``reraise=True``.

    Args:
        url: Page to open.
        platform: Not used by this function; kept for interface compatibility.
        headless: Passed to the driver constructor as ``headless``.
        wait: Value passed to ``driver.implicitly_wait``.
        user_agents: Mapping with an ``'agents'`` entry holding the candidate
            user-agent values.
        screenshots_folder_path: Folder where an error screenshot is written
            when the attempt fails.

    Returns:
        A dict with keys ``"cookies"`` and ``"html_content"``.

    Raises:
        Exception: If cookies or page source are empty, or if any browser
            operation fails. The error is logged and re-raised.
    """
    user_agent = random.choice(user_agents['agents'])
    # Bind the name before the try block so the except/finally clauses can
    # safely test it even when the driver constructor itself raises.
    driver = None
    try:
        driver = sbDriver(uc=True,
                          headless=headless,
                          agent=user_agent,
                          no_sandbox=True,
                          disable_gpu=True,
                          swiftshader=True,
                          devtools=True
                          )
        captcha_found = False
        open_website(driver, url)
        try:  # Look for the Cloudflare challenge (toasttab).
            WebDriverWait(driver=driver, timeout=180).until(
                EC.presence_of_element_located((By.ID, 'challenge-running')))
            captcha_found = True
        except Exception:
            # Best-effort detection: a failed first lookup is not fatal, but
            # KeyboardInterrupt/SystemExit must not be swallowed here.
            print("Error finding CAPTCHA in the first attempt")
            if (driver.is_element_visible('iframe[title*="Cloudflare"]') or
                    driver.is_element_visible('h2[id="challenge-running"]')):
                print("CAPTCHA was found in the second attempt")
                captcha_found = True
        if captcha_found:
            print("CAPTCHA was found")
            driver.refresh()
        else:
            print("CAPTCHA was not found")
        # NOTE(review): implicitly_wait only sets the element-lookup timeout;
        # it does not pause execution or wait for the page to finish loading.
        # An explicit wait on a page-specific element is probably needed here.
        driver.implicitly_wait(wait)
        cookies = driver.get_cookies()
        page_source = driver.page_source
        if not cookies or not page_source:
            raise Exception("Cookies or/and page content was not retrieved")
    except Exception as e:
        logging.error(e)
        if driver is not None:
            formatted_timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")
            screenshot_file_path = os.path.join(screenshots_folder_path,
                                                f"{formatted_timestamp}.png")
            logging.error("Error screenshot: " + screenshot_file_path)
            try:
                driver.save_screenshot(screenshot_file_path)
            except Exception:
                # A failed screenshot must not replace the original error.
                logging.exception("Could not save error screenshot")
        raise
    finally:
        if driver is not None:
            driver.quit()
            print("Driver closed")
    return {"cookies": cookies, "html_content": page_source}
Findings: This code is not able to bypass the CAPTCHA. When the cookies or the HTML content are empty, I raise an exception and take a screenshot. In the screenshot the page appears blank, with nothing to see. I also set a long wait for the website, but I am still not able to retrieve the cookies and HTML content.
Sometimes I do get some cookies and HTML content, but the information I need is not present. It seems the website has not loaded all of its elements.
Question: How can I achieve this goal without being detected by the website?
1