Long story short:
I am trying to write a function that will go through a table column, and as soon as it hits an exact match to a number I give it, it will start visiting the URL on each row after the exact match hit and scrape data off the internal website, then return to the table and continue to the next row – visit the url, grab the content and return to table until it reaches the end of the table.
At the moment the function finds the exact match, visits the inner web page, and scrapes the data, but after returning to the table page it loses track of the next element in the table (raising NoSuchElementException).
My intention is to have the function keep iterating through the origin table, visit every URL on the next row after finding the exact match row and scraping data from each inner page.
Hope that makes sense.
You can see below that I have tried using .click()
on the URL element and self.driver.back()
to return to the table web page.
I have also tried to use self.driver.execute_script(f"window.open('{url}', '_blank');")
to open the inner page in a new tab, self.driver.switch_to.window(self.driver.window_handles[-1])
to switch to the new tab, then self.driver.close()
to close the tab and also tried self.driver.switch_to.window(self.driver.window_handles[0])
to return to the table tab… Nothing worked so far.
Here is my logic (note that I have included all methods I have mentioned above, so they won’t work together).
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import chromedriver_autoinstaller
from selenium.common.exceptions import (
TimeoutException,
NoSuchElementException,
)
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import undetected_chromedriver as uc
class InitiateRGM:
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# ----------------------SETTINGS----------------
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
def __init__(self):
self.driver = self.setup_driver()
self.doc_selection = None
self.bak_no_text = "123456" # Search exact match number
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
def setup_driver(self):
options = uc.ChromeOptions()
# options.add_argument("--headless=new")
options.add_argument("--start-maximized")
chromedriver_autoinstaller.install()
return uc.Chrome(options=options)
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
def open_rgm(self):
self.driver.get(
"https://website-to-scrape.com/"
)
print(input("Press Enter to start... "))
time.sleep(3)
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
def scrape_table(self):
# Find requests tab element
open_requests_tab = WebDriverWait(self.driver, 30).until(
EC.presence_of_element_located(
(
By.XPATH,
'//*[@id="btn-requests"]/span[1]',
)
)
)
# Scroll the element into view
self.driver.execute_script(
"arguments[0].scrollIntoView(true);", open_requests_tab
)
# Click the requests tab to expand it and view table
open_requests_tab.click()
time.sleep(4)
# base Xpath of table
base_bak_xpath = '//*[@id="table-requests"]/table/tbody/tr'
# Get all table rows
table_rows = self.driver.find_elements(
By.XPATH, f"{base_bak_xpath}/td[2]/a"
)
# Iterate through each row
found_bak_no_text = False
for i, _ in enumerate(table_rows): # Use _ to ignore the row element
# Re-find the bakasha element within the loop
bak_full_xpath = f"{base_bak_xpath}[{i + 1}]/td[2]/a"
bak_element = WebDriverWait(self.driver, 30).until(
EC.presence_of_element_located((By.XPATH, bak_full_xpath))
)
# Get bak number from the row
bak_no_text = bak_element.get_attribute("innerHTML")
placeholder = 2
# Check if we've found the row with the same bak number
if bak_no_text == self.bak_no_text:
found_bak_no_text = True
continue # Skip to the next row
# If we've found the matching bak number, extracting data from each bak page
if found_bak_no_text:
# Click the bak number to view bak info page
bak_element.click()
time.sleep(2)
# Open bak info page in a new tab
time.sleep(2)
url = f"https://website-to-scrape/#request/{bak_no_text}"
self.driver.execute_script(f"window.open('{url}', '_blank');")
time.sleep(3)
# Switch to the new tab
self.driver.switch_to.window(self.driver.window_handles[-1])
# Get Bakasha Details
try:
element = (
WebDriverWait(self.driver, 30)
.until(
EC.presence_of_element_located(
(
By.XPATH,
'//*[@id="info-main"]/table/tbody/tr[5]/td[2]',
)
)
)
.get_attribute("innerHTML")
)
except (TimeoutException, NoSuchElementException) as e:
print(f"Error finding the element: {e}")
print("Continuing without the element information.")
# Close the new tab
self.driver.close()
time.sleep(2)
# Switch back to the original tab
self.driver.switch_to.window(self.driver.window_handles[0])
# Return to previous page with table
self.driver.back()
time.sleep(2)
# Re-find the table rows after returning
table_rows = self.driver.find_elements(
By.XPATH, f"{base_bak_xpath}/td[2]/a"
Daniel M is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
2