I'm building this web scraper. It works with some search terms and not others.
Its purpose is to open Google Maps, collect business data, and copy it to the clipboard so that I can paste it wherever I want.
When I search for something like 'petshop' somewhere, it works. But when I search for other terms like 'beauty salons' or 'restaurants' it doesn't search at all, just like in the screenshots.
# manipulate python runtime environment
import sys
# regular expression support
import re
# mathematical calculations in python
import math
# time-related functions
import time
# manipulate dates and times
from datetime import datetime
# data analysis and manipulation
import pandas as pd
# scrape information from web pages
from bs4 import BeautifulSoup
# browser execution and automation
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import tkinter as tk
from tkinter import messagebox
def scrape_website(query, num):
    # time string will be used when saving the file
    timestr = time.strftime("%Y%m%d")
    try:
        # query string to be "googled"
        query_string = str(query)
        # number of businesses from which information
        # is required to be collected
        number_of_businesses = int(num)
        # Validate that the 'query_string' parameter is not empty
        if not query_string:
            raise ValueError("Query must be a valid search term.")
    except ValueError:
        messagebox.showerror("Error", "Query must be a valid search term.")
        raise ValueError("Query must be a valid search term.")
    try:
        # Validate that 'number_of_businesses' is within the predefined range
        if number_of_businesses <= 0 or number_of_businesses > 100:
            raise ValueError("Number of businesses needs to have an assigned number between 1 and 100")
    except ValueError:
        messagebox.showerror("Error", "Number of businesses needs to have an assigned number between 1 and 100")
        raise ValueError("Number of businesses needs to have an assigned number between 1 and 100")
    # Locate Chrome Driver to work with selenium
    service = Service(executable_path=r'C:\Users\joao victor sousa na\Downloads\Google Maps Web Scraper\chromedriver.exe')
    driver = webdriver.Chrome(service=service)
    # Open Google search page from Brazil with ?hl=en
    driver.get("https://www.google.com.br/?hl=en")
    # Define explicit wait
    wait = WebDriverWait(driver, 20)  # timeout in seconds (assumed value)
    # Wait until the query search box "q" is located
    searchbox = wait.until(EC.presence_of_element_located((By.NAME, "q")))
    # 'Google' for the defined 'query_string' parameter
    searchbox.send_keys(query_string)
    searchbox.send_keys(Keys.RETURN)
    # Wait until the "more businesses" button is located and click it
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "g-more-link"))).click()
    # Define the number of pages that need to be scraped
    page_iterations = math.ceil(number_of_businesses / 20)
    print(f"Number of pages to scrape: {page_iterations} pages.\n")
    # Set auxiliary objects
    business_ranking = 0
    data_de_negocios = pd.DataFrame()
    # Set xpath string used to get info for each business
    business_xpath_string = (
        r"//a[contains(@class, 'rllt__link a-no-hover-decoration')]"
        r"//div[@role='heading']"
    )
    # Start page iteration
    for page in range(page_iterations):
        # Visit the next web page if another iteration is needed
        if page > 0:
            try:
                next_page_element = (r'//*[@id="rl_ist0"]/div/div[2]/div/table/tbody/tr/td[' + str(page + 2) + r']/a')
                element = wait.until(EC.element_to_be_clickable((By.XPATH, next_page_element)))
                # element = driver.find_element(By.XPATH, next_page_element)
                driver.execute_script("arguments[0].click();", element)
                time.sleep(7)
            except:
                print("No additional pages to scrape")
                break
        # Let the user know the current page that is being scraped
        print("Iteration over page {:n} started".format(page + 1))
        # Wait for the loading overlay to disappear
        loading_overlay = wait.until(
            EC.invisibility_of_element_located(
                (By.CSS_SELECTOR, 'div.rlfl__loading-overlay')
            )
        )
        # Parse html of the web page
        soup = BeautifulSoup(driver.page_source, "html.parser")
        # Find and get the business elements listed in the current web page
        businesses = wait.until(
            EC.element_to_be_clickable((By.XPATH, business_xpath_string))
        )
        businesses = businesses.find_elements(By.XPATH, business_xpath_string)
        print(f" - {len(businesses):n} businesses displayed in the page")
        # Get the latitude and longitude (if they exist) of each business:
        # Find html 'divs' that could contain latitude and longitude attributes
        location_d = soup.find_all('div', {"data-record-click-time": "false"})
        # Find the child element(s) of each parent element found above
        location_c = [parent.find('div', {'class': 'rllt__mi', "data-lat": True}) for parent in location_d]
        # Extract the latitude and longitude attribute value from each child element found
        lats = [child['data-lat'] if child and 'data-lat' in child.attrs else None for child in location_c]
        lons = [child['data-lng'] if child and 'data-lng' in child.attrs else None for child in location_c]
        # Define empty dataframe
        nth_business_data_concat = pd.DataFrame()
        # Start loop that will help to get data from each business
        for business in businesses:
            # Click on the business to open the business 'box'
            driver.execute_script("arguments[0].click();", business)
            try:
                loading_overlay = wait.until(
                    EC.invisibility_of_element_located((By.CSS_SELECTOR, 'div.rlfl__loading-overlay')))
                # Wait until the business 'box' is open and visible to the user
                wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "xpdopen")))
                # Check if the business 'box' is clickable
                xpdopen = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "xpdopen")))
            except:
                xpdopen = None
            # If not clickable, click again and wait for the business 'box' to open and become clickable
            while not xpdopen:
                driver.execute_script("arguments[0].click();", business)
                wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, 'div.rlfl__loading-overlay')))
                wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "xpdopen")))
                xpdopen = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "xpdopen")))
            # Parse html of the web page with the business window/box open
            soup = BeautifulSoup(driver.page_source, "html.parser")
            # Page ranking
            page_ranking = page + 1
            # Business ranking
            business_ranking += 1
            # Business name
            try:
                business_name = soup.find('h2', {'data-attrid': 'title'}).text
            except:
                business_name = str()
            # Business logo
            try:
                business_logo = soup.find('div', {'class': 'kp-header'}).find('img')['src']
            except:
                business_logo = str()
            # Business picture
            try:
                business_pic = soup.find('div', {'data-attrid': re.compile(r"kc:/location/location:(hotel )?media")})
                business_pic = business_pic.find('div', {'role': 'img'})["style"]
            except:
                business_pic = str()
            # Business open hours
            try:
                table = soup.find('div', {'class': 'a-h'}).find('table')
                # Create an empty list for the rows
                business_hours = []
                # Iterate through each row and extract the values
                for row in table.find_all('tr'):
                    columns = row.find_all('td')
                    business_hours.append('{'
                                          + columns[0].find(text=True, recursive=False).strip()
                                          + ': '
                                          + columns[1].find(text=True, recursive=False).strip()
                                          + '}'
                                          )
                business_hours = '[' + ','.join(business_hours) + ']'
            except:
                business_hours = str()
            # Business phone
            try:
                business_phone = soup.find('div',
                                           {'data-attrid': r'kc:/collection/knowledge_panels/has_phone:phone'}
                                           ).text
            except:
                business_phone = str()
            # Business address
            try:
                business_address = soup.find('div', {'data-attrid': r'kc:/location/location:address'}).text
            except:
                business_address = str()
            # Business website
            try:
                business_url = soup.find('div', {'data-attrid': r'kc:/local:unified_actions'}).find('a')["href"]
            except:
                business_url = str()
            # Business additional details
            try:
                business_add_details = soup.find('div', {'data-attrid': r'kc:/local:lu attribute list'})
                try:
                    rating = business_add_details.find('span', text=re.compile(r'^[0-9]\.?[0-9]?$'))
                    rating = rating.text if rating else str()
                except AttributeError:
                    rating = str()
                try:
                    reviews = business_add_details.find('span', text=re.compile(r'^([0-9]{1,3}\.?[0-9]{0,3}\s?K?)$'))
                    reviews = reviews.text if reviews else str()
                except AttributeError:
                    reviews = str()
                try:
                    business_type = business_add_details.find_all("span")[-1]
                    business_type = business_type.text if business_type else str()
                except AttributeError:
                    business_type = str()
            except:
                rating = str()
                reviews = str()
                business_type = str()
            # Business google ads
            try:
                business_ads = soup.find('div', {'data-attrid': r'kc:/local:promotions'}).find('a')
                business_ads = business_ads.find('div').find_all('div')
                is_ad = business_ads[0].find_all('span', {"style": False})[0].text
                ad_landing_page = business_ads[0].find_all('span', {"style": False})[1].text
                ad_title = business_ads[1].text
                ad_description = business_ads[2].text
            except:
                is_ad = str()
                ad_landing_page = str()
                ad_title = str()
                ad_description = str()
            # Business google maps url
            try:
                business_maps_url = driver.current_url
            except:
                business_maps_url = str()
            # Add scraped variables as fields in an auxiliary dataframe
            nth_business_data = pd.DataFrame(data={"page_ranking": page_ranking,
                                                   "business_ranking": business_ranking,
                                                   "business_name": business_name,
                                                   "business_logo": business_logo,
                                                   "business_picture": business_pic,
                                                   "business_hours": business_hours,
                                                   "business_phone": business_phone,
                                                   "business_address": business_address,
                                                   "business_website": business_url,
                                                   "rating": rating,
                                                   "reviews": reviews,
                                                   "business_type": business_type,
                                                   "is_ad": is_ad,
                                                   "ad_landing_page": ad_landing_page,
                                                   "ad_title": ad_title,
                                                   "ad_description": ad_description,
                                                   "business_maps_url": business_maps_url
                                                   },
                                             index=[business_ranking]
                                             )
            # Concat / bind rows from the last scraped business with the rest of them
            nth_business_data_concat = pd.concat([nth_business_data_concat, nth_business_data])
            # Close the floating window with the business details
            close_element = wait.until(EC.presence_of_element_located((By.XPATH, r'//*[@id="rhs"]/div/div[2]/div/span')))
            driver.execute_script("arguments[0].click();", close_element)
            loading_overlay = wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, 'div.rlfl__loading-overlay')))
            floating_element = wait.until(EC.invisibility_of_element_located((By.CLASS_NAME, 'xpdopen')))
            while not floating_element:
                close_element = wait.until(EC.presence_of_element_located((By.XPATH, r'//*[@id="rhs"]/div/div[2]/div/span')))
                driver.execute_script("arguments[0].click();", close_element)
                loading_overlay = wait.until(
                    EC.invisibility_of_element_located((By.CSS_SELECTOR, 'div.rlfl__loading-overlay')))
                floating_element = wait.until(EC.invisibility_of_element_located((By.CLASS_NAME, 'xpdopen')))
        # Include latitude and longitude columns in the concatenated dataframe
        nth_business_data_concat['latitude'] = lats
        nth_business_data_concat['longitude'] = lons
        # Let the user know that the page iteration finished
        print(" - data of {:n} businesses has been collected.\n".format(len(nth_business_data_concat.index)))
        # Concat businesses from the current page with the businesses from the other pages
        data_de_negocios = pd.concat([data_de_negocios, nth_business_data_concat])
    # Close the google chrome driver
    driver.close()
    # extract url string from the business_picture field
    data_de_negocios['business_picture'] = \
        data_de_negocios['business_picture'].replace(to_replace={r'.*url\(': '', r'\)$': ''}, regex=True)
    # extract phone number string from the business_phone field
    data_de_negocios['business_phone'] = \
        data_de_negocios['business_phone'].replace(to_replace={r'^.*: +': ''}, regex=True)
    # extract address string from the business_address field
    data_de_negocios['business_address'] = \
        data_de_negocios['business_address'].replace(to_replace={r'^.*: ': ''}, regex=True)
    # extract url string from the business_website field
    data_de_negocios['business_website'] = \
        data_de_negocios['business_website'].replace(to_replace={r'^#$': '', r'^//': ''}, regex=True)
    # replace the summary letter (K) with zeroes in the reviews field
    data_de_negocios['reviews'] = \
        data_de_negocios['reviews'].replace(to_replace={r'\(|\)|\s': ''}, regex=True)
    data_de_negocios['reviews'] = \
        data_de_negocios['reviews'].replace(to_replace={'k|K': '00'}, regex=True)
    data_de_negocios['reviews'] = \
        data_de_negocios['reviews'].replace(to_replace={r'\.': ''}, regex=True)
    data_de_negocios['reviews'] = \
        data_de_negocios['reviews'].replace(to_replace={',': ''}, regex=True)  # to process as an int number
    # remove special characters from the is_ad field
    data_de_negocios['is_ad'] = \
        data_de_negocios['is_ad'].replace(to_replace={'[^a-zA-Z]': ''}, regex=True)
    # concatenate google sheets formula to display url as image
    data_de_negocios['business_logo'] = \
        data_de_negocios['business_logo'].apply(lambda x: x if x == "" else '=IMAGE("' + x + '")')
    # concatenate google sheets formula to display url as image
    data_de_negocios['business_picture'] = \
        data_de_negocios['business_picture'].apply(lambda x: x if x == "" else '=IMAGE("' + x + '")')
    # concatenate latitude and longitude fields into a single column
    data_de_negocios['location'] = \
        data_de_negocios['latitude'].str.cat(data_de_negocios['longitude'].astype(str), sep=',')
    # rename and keep desired variables
    selector_d = {
        "page_ranking": "Page No.",
        "business_ranking": "Business Ranking",
        "business_name": "Business Name",
        # "business_logo": "Business Logo",
        "business_picture": "Business Picture",
        "business_type": "Description",
        "rating": "Rating",
        "reviews": "No. Reviews",
        "business_website": "Website",
        "business_phone": "Phone Number",
        "business_address": "Address",
        "location": "Location",
        "business_hours": "Working Hours",
        "is_ad": "Is Ad?",
        "ad_landing_page": "Ad Landing Page",
        "ad_title": "Ad Title",
        "ad_description": "Ad Description",
        "business_maps_url": "Google Maps URL",
    }
    dataset_curated = data_de_negocios.rename(columns=selector_d)[[*selector_d.values()]]
    # copy the dataset to the clipboard
    dataset_curated.to_clipboard(index=False)
    tk.messagebox.showinfo(title='Information',
                           message='Dataset has been captured. The information was copied to your clipboard.\n'
                                   'To use it, press "CTRL+V" in a spreadsheet.')
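This is how I call the function, for example (the argument values here are just examples, not fixed in my script):

# example call: search term plus how many businesses to collect (1-100)
scrape_website("petshop", 20)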
I tried giving the code more time to load the information, and I tried changing the locator for the search button, but with no success. An example of the kind of change I tried is shown below.
I just need it to work regardless of the term I ask it to search on Maps.
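For instance, the "more time" change was roughly this, inside scrape_website right after opening the page (the numbers are illustrative, not my exact values):

# longer explicit wait, plus an extra pause right after sending the search
wait = WebDriverWait(driver, 30)
searchbox = wait.until(EC.presence_of_element_located((By.NAME, "q")))
searchbox.send_keys(query_string)
searchbox.send_keys(Keys.RETURN)
time.sleep(5)
# then wait for the "more businesses" button as before
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "g-more-link"))).click()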