Long story short:
I am trying to write a function that will go through a table column, and as soon as it hits an exact match to a number I give it, it will start visiting the URL on each row after the exact match hit and scrape data off the internal website, then return to the table and continue to the next row – visit the url, grab the content and return to table until it reaches the end of the table.
At the moment the function finds the exact match, visits the inner web page, and scrapes the data, but after returning to the table page it loses track of the next element in the table (raising NoSuchElementException).
My intention is to have the function keep iterating through the origin table, visit every URL on the next row after finding the exact match row and scraping data from each inner page.
Hope that makes sense.
You can see below that I have tried using .click()
on the URL element and self.driver.back()
to return to the table web page.
I have also tried to use self.driver.execute_script(f"window.open('{url}', '_blank');")
to open the inner page in a new tab, self.driver.switch_to.window(self.driver.window_handles[-1])
to switch to the new tab, then self.driver.close()
to close the tab and also tried self.driver.switch_to.window(self.driver.window_handles[0])
to return to the table tab… Nothing worked so far.
Here is my logic (note that I have included all methods I have mentioned above, so they won’t work together).
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import chromedriver_autoinstaller
from selenium.common.exceptions import (
TimeoutException,
NoSuchElementException,
)
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import undetected_chromedriver as uc
class InitiateRGM:
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# ----------------------SETTINGS----------------
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
def __init__(self):
self.driver = self.setup_driver()
self.doc_selection = None
self.bak_no_text = "123456" # Search exact match number
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
def setup_driver(self):
options = uc.ChromeOptions()
# options.add_argument("--headless=new")
options.add_argument("--start-maximized")
chromedriver_autoinstaller.install()
return uc.Chrome(options=options)
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
def open_rgm(self):
self.driver.get(
"https://website-to-scrape.com/"
)
print(input("Press Enter to start... "))
time.sleep(3)
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
def scrape_table(self):
# Find requests tab element
open_requests_tab = WebDriverWait(self.driver, 30).until(
EC.presence_of_element_located(
(
By.XPATH,
'//*[@id="btn-requests"]/span[1]',
)
)
)
# Scroll the element into view
self.driver.execute_script(
"arguments[0].scrollIntoView(true);", open_requests_tab
)
# Click the requests tab to expand it and view table
open_requests_tab.click()
time.sleep(4)
# base Xpath of table
base_bak_xpath = '//*[@id="table-requests"]/table/tbody/tr'
# Get all table rows
table_rows = self.driver.find_elements(
By.XPATH, f"{base_bak_xpath}/td[2]/a"
)
# Iterate through each row
found_bak_no_text = False
for i, _ in enumerate(table_rows): # Use _ to ignore the row element
# Re-find the bakasha element within the loop
bak_full_xpath = f"{base_bak_xpath}[{i + 1}]/td[2]/a"
bak_element = WebDriverWait(self.driver, 30).until(
EC.presence_of_element_located((By.XPATH, bak_full_xpath))
)
# Get bak number from the row
bak_no_text = bak_element.get_attribute("innerHTML")
placeholder = 2
# Check if we've found the row with the same bak number
if bak_no_text == self.bak_no_text:
found_bak_no_text = True
continue # Skip to the next row
# If we've found the matching bak number, extracting data from each bak page
if found_bak_no_text:
# Click the bak number to view bak info page
bak_element.click()
time.sleep(2)
# Open bak info page in a new tab
time.sleep(2)
url = f"https://website-to-scrape/#request/{bak_no_text}"
self.driver.execute_script(f"window.open('{url}', '_blank');")
time.sleep(3)
# Switch to the new tab
self.driver.switch_to.window(self.driver.window_handles[-1])
# Get Bakasha Details
try:
element = (
WebDriverWait(self.driver, 30)
.until(
EC.presence_of_element_located(
(
By.XPATH,
'//*[@id="info-main"]/table/tbody/tr[5]/td[2]',
)
)
)
.get_attribute("innerHTML")
)
except (TimeoutException, NoSuchElementException) as e:
print(f"Error finding the element: {e}")
print("Continuing without the element information.")
# Close the new tab
self.driver.close()
time.sleep(2)
# Switch back to the original tab
self.driver.switch_to.window(self.driver.window_handles[0])
# Return to previous page with table
self.driver.back()
time.sleep(2)
# Re-find the table rows after returning
table_rows = self.driver.find_elements(
By.XPATH, f"{base_bak_xpath}/td[2]/a"
Daniel M is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
2