I'm building this web scraper. It works with some search terms and not others.
Its purpose is to open Google Maps, collect business data, and copy it to the clipboard so that I can paste it wherever I want.
When I search for something like 'petshop' somewhere, it works. But when I search for other terms like 'beauty salons' or 'restaurants' it doesn't search at all, just like in the screenshots.
# manipulate python runtime environment
import sys
# regular expression support
import re
# mathematical calculations in python
import math
# time-related functions
import time
# manipulate dates and times
from datetime import datetime
# data analysis and manipulation
import pandas as pd
# scrape information from web pages
from bs4 import BeautifulSoup
# browser execution and automation
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import tkinter as tk
from tkinter import messagebox
def scrape_website(query, num):
    # time string will be used when saving the file
    timestr = time.strftime("%Y%m%d")
    try:
        # query string to be "googled"
        query_string = str(query)
        # number of businesses from which information
        # is required to be collected
        number_of_businesses = int(num)
        # Validate that the 'query_string' parameter is not empty
        if not query_string:
            raise ValueError("Query must be a valid search term.")
    except ValueError:
        messagebox.showerror("Error", "Query must be a valid search term.")
        raise ValueError("Query must be a valid search term.")
    try:
        # Validate that 'number_of_businesses' is within the predefined range
        if number_of_businesses <= 0 or number_of_businesses > 100:
            raise ValueError("Number of businesses needs to have an assigned number between 1 and 100")
    except ValueError:
        messagebox.showerror("Error", "Number of businesses needs to have an assigned number between 1 and 100")
        raise ValueError("Number of businesses needs to have an assigned number between 1 and 100")
    # Locate Chrome Driver to work with selenium
    service = Service(executable_path=r'C:\Users\joao victor sousa na\Downloads\Google Maps Web Scraper\chromedriver.exe')
    driver = webdriver.Chrome(service=service)
    # Open Google search page from Brazil with ?hl=en
    driver.get("https://www.google.com.br/?hl=en")
    # Define explicit wait
    wait = WebDriverWait(driver, 20)  # timeout in seconds (assumed value)
    # Wait until the query search box "q" is located
    searchbox = wait.until(EC.presence_of_element_located((By.NAME, "q")))
    # 'Google' for the defined 'query_string' parameter
    searchbox.send_keys(query_string)
    searchbox.send_keys(Keys.RETURN)
    # Wait until the "more businesses" button is located and click it
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "g-more-link"))).click()
    # Define the number of pages that need to be scraped
    page_iterations = math.ceil(number_of_businesses / 20)
    print(f"Number of pages to scrape: {page_iterations} pages.\n")
    # Set auxiliary objects
    business_ranking = 0
    data_de_negocios = pd.DataFrame()
    # Set xpath string used to get info for each business
    business_xpath_string = (
        r"//a[contains(@class, 'rllt__link a-no-hover-decoration')]"
        r"//div[@role='heading']"
    )
    # Start page iteration
    for page in range(page_iterations):
        # Visit the next web page if another iteration is needed
        if page > 0:
            try:
                next_page_element = (r'//*[@id="rl_ist0"]/div/div[2]/div/table/tbody/tr/td[' + str(page + 2) + r']/a')
                element = wait.until(EC.element_to_be_clickable((By.XPATH, next_page_element)))
                # element = driver.find_element(By.XPATH, next_page_element)
                driver.execute_script("arguments[0].click();", element)
                time.sleep(7)
            except:
                print("No additional pages to scrape")
                break
        # Let the user know the current page that is being scraped
        print("Iteration over page {:n} started".format(page + 1))
        # Wait for the loading overlay to disappear
        loading_overlay = wait.until(
            EC.invisibility_of_element_located(
                (By.CSS_SELECTOR, 'div.rlfl__loading-overlay')
            )
        )
        # Parse html of the web page
        soup = BeautifulSoup(driver.page_source, "html.parser")
        # Find and get the business elements listed in the current web page
        businesses = wait.until(
            EC.element_to_be_clickable((By.XPATH, business_xpath_string))
        )
        businesses = businesses.find_elements(By.XPATH, business_xpath_string)
        print(f" - {len(businesses):n} businesses displayed in the page")
        # Get the latitude and longitude (if they exist) of each business:
        # Find html 'divs' that could contain latitude and longitude attributes
        location_d = soup.find_all('div', {"data-record-click-time": "false"})
        # Find the child element(s) of each parent element found above
        location_c = [parent.find('div', {'class': 'rllt__mi', "data-lat": True}) for parent in location_d]
        # Extract the latitude and longitude attribute value from each child element found
        lats = [child['data-lat'] if child and 'data-lat' in child.attrs else None for child in location_c]
        lons = [child['data-lng'] if child and 'data-lng' in child.attrs else None for child in location_c]
        # Define empty dataframe
        nth_business_data_concat = pd.DataFrame()
        # Start loop that will help to get data from each business
        for business in businesses:
            # Click on the business to open the business 'box'
            driver.execute_script("arguments[0].click();", business)
            try:
                loading_overlay = wait.until(
                    EC.invisibility_of_element_located((By.CSS_SELECTOR, 'div.rlfl__loading-overlay')))
                # Wait until the business 'box' is open and visible to the user
                wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "xpdopen")))
                # Check if the business 'box' is clickable
                xpdopen = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "xpdopen")))
            except:
                xpdopen = None
            # If not clickable, click again and wait for the business 'box' to open and become clickable
            while not xpdopen:
                driver.execute_script("arguments[0].click();", business)
                wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, 'div.rlfl__loading-overlay')))
                wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "xpdopen")))
                xpdopen = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "xpdopen")))
            # Parse html of the web page with the business window/box open
            soup = BeautifulSoup(driver.page_source, "html.parser")
            # Page ranking
            page_ranking = page + 1
            # Business ranking
            business_ranking += 1
            # Business name
            try:
                business_name = soup.find('h2', {'data-attrid': 'title'}).text
            except:
                business_name = str()
            # Business logo
            try:
                business_logo = soup.find('div', {'class': 'kp-header'}).find('img')['src']
            except:
                business_logo = str()
            # Business picture
            try:
                business_pic = soup.find('div', {'data-attrid': re.compile(r"kc:/location/location:(hotel )?media")})
                business_pic = business_pic.find('div', {'role': 'img'})["style"]
            except:
                business_pic = str()
            # Business open hours
            try:
                table = soup.find('div', {'class': 'a-h'}).find('table')
                # Create an empty list for the rows
                business_hours = []
                # Iterate through each row and extract the values
                for row in table.find_all('tr'):
                    columns = row.find_all('td')
                    business_hours.append('{'
                                          + columns[0].find(text=True, recursive=False).strip()
                                          + ': '
                                          + columns[1].find(text=True, recursive=False).strip()
                                          + '}'
                                          )
                business_hours = '[' + ','.join(business_hours) + ']'
            except:
                business_hours = str()
            # Business phone
            try:
                business_phone = soup.find('div',
                                           {'data-attrid': r'kc:/collection/knowledge_panels/has_phone:phone'}
                                           ).text
            except:
                business_phone = str()
            # Business address
            try:
                business_address = soup.find('div', {'data-attrid': r'kc:/location/location:address'}).text
            except:
                business_address = str()
            # Business website
            try:
                business_url = soup.find('div', {'data-attrid': r'kc:/local:unified_actions'}).find('a')["href"]
            except:
                business_url = str()
            # Business additional details
            try:
                business_add_details = soup.find('div', {'data-attrid': r'kc:/local:lu attribute list'})
                try:
                    rating = business_add_details.find('span', text=re.compile(r'^[0-9]\.?[0-9]?$'))
                    rating = rating.text if rating else str()
                except AttributeError:
                    rating = str()
                try:
                    reviews = business_add_details.find('span', text=re.compile(r'^([0-9]{1,3}\.?[0-9]{0,3}\s?K?)$'))
                    reviews = reviews.text if reviews else str()
                except AttributeError:
                    reviews = str()
                try:
                    business_type = business_add_details.find_all("span")[-1]
                    business_type = business_type.text if business_type else str()
                except AttributeError:
                    business_type = str()
            except:
                rating = str()
                reviews = str()
                business_type = str()
            # Business google ads
            try:
                business_ads = soup.find('div', {'data-attrid': r'kc:/local:promotions'}).find('a')
                business_ads = business_ads.find('div').find_all('div')
                is_ad = business_ads[0].find_all('span', {"style": False})[0].text
                ad_landing_page = business_ads[0].find_all('span', {"style": False})[1].text
                ad_title = business_ads[1].text
                ad_description = business_ads[2].text
            except:
                is_ad = str()
                ad_landing_page = str()
                ad_title = str()
                ad_description = str()
            # Business google maps url
            try:
                business_maps_url = driver.current_url
            except:
                business_maps_url = str()
            # Add scraped variables as fields in an auxiliary dataframe
            nth_business_data = pd.DataFrame(data={"page_ranking": page_ranking,
                                                   "business_ranking": business_ranking,
                                                   "business_name": business_name,
                                                   "business_logo": business_logo,
                                                   "business_picture": business_pic,
                                                   "business_hours": business_hours,
                                                   "business_phone": business_phone,
                                                   "business_address": business_address,
                                                   "business_website": business_url,
                                                   "rating": rating,
                                                   "reviews": reviews,
                                                   "business_type": business_type,
                                                   "is_ad": is_ad,
                                                   "ad_landing_page": ad_landing_page,
                                                   "ad_title": ad_title,
                                                   "ad_description": ad_description,
                                                   "business_maps_url": business_maps_url
                                                   },
                                             index=[business_ranking]
                                             )
            # Concat / bind rows from the last scraped business with the rest of them
            nth_business_data_concat = pd.concat([nth_business_data_concat, nth_business_data])
            # Close the floating window with the business details
            close_element = wait.until(EC.presence_of_element_located((By.XPATH, r'//*[@id="rhs"]/div/div[2]/div/span')))
            driver.execute_script("arguments[0].click();", close_element)
            loading_overlay = wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, 'div.rlfl__loading-overlay')))
            floating_element = wait.until(EC.invisibility_of_element_located((By.CLASS_NAME, 'xpdopen')))
            while not floating_element:
                close_element = wait.until(EC.presence_of_element_located((By.XPATH, r'//*[@id="rhs"]/div/div[2]/div/span')))
                driver.execute_script("arguments[0].click();", close_element)
                loading_overlay = wait.until(
                    EC.invisibility_of_element_located((By.CSS_SELECTOR, 'div.rlfl__loading-overlay')))
                floating_element = wait.until(EC.invisibility_of_element_located((By.CLASS_NAME, 'xpdopen')))
        # Include latitude and longitude columns in the concatenated dataframe
        nth_business_data_concat['latitude'] = lats
        nth_business_data_concat['longitude'] = lons
        # Let the user know that the page iteration finished
        print(" - data of {:n} businesses has been collected.\n".format(len(nth_business_data_concat.index)))
        # Concat businesses from the current page with the businesses from the other pages
        data_de_negocios = pd.concat([data_de_negocios, nth_business_data_concat])
    # Close the google chrome driver
    driver.close()
    # extract url string from the business_picture field
    data_de_negocios['business_picture'] = \
        data_de_negocios['business_picture'].replace(to_replace={r'.*url\(': '', r'\)$': ''}, regex=True)
    # extract phone number string from the business_phone field
    data_de_negocios['business_phone'] = \
        data_de_negocios['business_phone'].replace(to_replace={r'^.*: +': ''}, regex=True)
    # extract address string from the business_address field
    data_de_negocios['business_address'] = \
        data_de_negocios['business_address'].replace(to_replace={r'^.*: ': ''}, regex=True)
    # extract url string from the business_website field
    data_de_negocios['business_website'] = \
        data_de_negocios['business_website'].replace(to_replace={r'^#$': '', r'^//': ''}, regex=True)
    # replace the summary letter (K) with zeroes in the reviews field
    data_de_negocios['reviews'] = \
        data_de_negocios['reviews'].replace(to_replace={r'\(|\)|\s': ''}, regex=True)
    data_de_negocios['reviews'] = \
        data_de_negocios['reviews'].replace(to_replace={'k|K': '00'}, regex=True)
    data_de_negocios['reviews'] = \
        data_de_negocios['reviews'].replace(to_replace={r'\.': ''}, regex=True)
    data_de_negocios['reviews'] = \
        data_de_negocios['reviews'].replace(to_replace={',': ''}, regex=True)  # to process as an int number
    # remove special characters from the is_ad field
    data_de_negocios['is_ad'] = \
        data_de_negocios['is_ad'].replace(to_replace={'[^a-zA-Z]': ''}, regex=True)
    # concatenate google sheets formula to display url as image
    data_de_negocios['business_logo'] = \
        data_de_negocios['business_logo'].apply(lambda x: x if x == "" else '=IMAGE("' + x + '")')
    # concatenate google sheets formula to display url as image
    data_de_negocios['business_picture'] = \
        data_de_negocios['business_picture'].apply(lambda x: x if x == "" else '=IMAGE("' + x + '")')
    # concatenate latitude and longitude fields into a single column
    data_de_negocios['location'] = \
        data_de_negocios['latitude'].str.cat(data_de_negocios['longitude'].astype(str), sep=',')
    # rename and keep desired variables
    selector_d = {
        "page_ranking": "Page No.",
        "business_ranking": "Business Ranking",
        "business_name": "Business Name",
        # "business_logo": "Business Logo",
        "business_picture": "Business Picture",
        "business_type": "Description",
        "rating": "Rating",
        "reviews": "No. Reviews",
        "business_website": "Website",
        "business_phone": "Phone Number",
        "business_address": "Address",
        "location": "Location",
        "business_hours": "Working Hours",
        "is_ad": "Is Ad?",
        "ad_landing_page": "Ad Landing Page",
        "ad_title": "Ad Title",
        "ad_description": "Ad Description",
        "business_maps_url": "Google Maps URL",
    }
    dataset_curated = data_de_negocios.rename(columns=selector_d)[[*selector_d.values()]]
    # copy the dataset to the clipboard
    dataset_curated.to_clipboard(index=False)
    tk.messagebox.showinfo(title='Information',
                           message='Dataset has been captured. The information was copied to your clipboard.\n'
                                   'To use it, press "CTRL+V" in a spreadsheet.')
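This is how I call the function, for example (the argument values here are just examples, not fixed in my script):

# example call: search term plus how many businesses to collect (1-100)
scrape_website("petshop", 20)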
I tried giving the code more time to load the information, and I tried changing the locator for the search button, but with no success. An example of the kind of change I tried is shown below.
I just need it to work regardless of the term I ask it to search on Maps.
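For instance, the "more time" change was roughly this, inside scrape_website right after opening the page (the numbers are illustrative, not my exact values):

# longer explicit wait, plus an extra pause right after sending the search
wait = WebDriverWait(driver, 30)
searchbox = wait.until(EC.presence_of_element_located((By.NAME, "q")))
searchbox.send_keys(query_string)
searchbox.send_keys(Keys.RETURN)
time.sleep(5)
# then wait for the "more businesses" button as before
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "g-more-link"))).click()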