I’ve been trying to extract captcha images from a .php page but have had no luck so far, is there a simple(ish) way to do this? I’ve been trying with python with selenium so far and would like to keep using this if possible.
Any input welcome and appreciated.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException
import urllib.request
# to run Chrome in headless mode
options = Options()
options.add_argument("--headless")
# initialize a Chrome WerbDriver instance
# with the specified options
driver = webdriver.Chrome(
service=ChromeService(),
options=options
)
# to avoid issues with responsive content
driver.maximize_window()
# the URL of the target page
url = "https://www.xxxxxxxxxxxxxxxx.com/crimes-verify.php?bullets=1"
# visit the target page in the controlled browser
driver.get(url)
# select the node images on the page
image_html_nodes = driver.find_elements(By.CSS_SELECTOR, "[data-test="crimes-verify1"]")
# where to store the scraped image url
image_urls = []
# extract the URLs from each image
for image_html_node in image_html_nodes:
try:
# use the URL in the "src" as the default behavior
image_url = image_html_node.get_attribute("src")
# extract the URL of the largest image from "srcset",
# if this attribute exists
srcset = image_html_node.get_attribute("srcset")
if srcset is not None:
# get the last element from the "srcset" value
srcset_last_element = srcset.split(", ")[-1]
# get the first element of the value,
# which is the image URL
image_url = srcset_last_element.split(" ")[0]
# add the image URL to the list
image_urls.append(image_url)
except StaleElementReferenceException as e:
continue
# to keep track of the images saved to disk
image_name_counter = 1
# download each image and add it
# to the "/images" local folder
for image_url in image_urls:
print(f"downloading image no. {image_name_counter} ...")
file_name = f"C:/Users/xxxxx/Pictures/Screenshots{image_name_counter}.gif"
# download the image
urllib.request.urlretrieve(image_url, file_name)
print(f"images downloaded successfully to "{file_name}"n")
driver.get_screenshot_as_file(f"C:/Users/xxxxx/Documents/PythonProgramming/IG{image_name_counter}.gif")
# increment the image counter
image_name_counter += 1
# close the browser and free up its resources
driver.quit()
This is what I’ve tried so far and it doesn’t save or take a screenshot.
New contributor
WillieD is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.