I made a Google Maps scraper using Selenium and Streamlit, but I only get 5 to 7 results, and the results are not complete: the phone number or address is often missing.
import time
import streamlit as st
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from dataclasses import dataclass, asdict, field
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from my_webdriver import Business
import os
def main():
    # Older, standalone version of the scraper: it types each search term into the
    # Google Maps search box and scrolls with PAGE_DOWN to load more listings.
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)
    driver.get("https://www.google.com/maps")
    time.sleep(5)

    search_list = ["search_term1", "search_term2", "search_term3"]  # Replace with your actual search terms

    for search_for_index, search_for in enumerate(search_list):
        print(f"-----\n{search_for_index} - {search_for}".strip())

        search_field = driver.find_element(By.NAME, "q")
        search_field.send_keys(search_for)
        search_field.submit()
        time.sleep(5)

        # scrolling
        actions = ActionChains(driver)
        actions.move_to_element(
            driver.find_element(By.XPATH, '//a[contains(@href, "https://www.google.com/maps/place")]')
        )
        actions.perform()

        previously_counted = 0
        total = 100  # Replace with your desired number of listings
        while True:
            actions = ActionChains(driver)
            actions.send_keys(Keys.PAGE_DOWN)
            actions.perform()
            time.sleep(5)

            listings = driver.find_elements(By.XPATH, '//a[contains(@href, "https://www.google.com/maps/place")]')
            if len(listings) >= total:
                listings = listings[:total]
                print(f"Total Scraped: {len(listings)}")
                break
            elif len(listings) == previously_counted:
                print(f"Arrived at all available\nTotal Scraped: {len(listings)}")
                break
            else:
                previously_counted = len(listings)
                print(f"Currently Scraped: {len(listings)}")

        business_list = BusinessList()

        # scraping
        for listing in listings:
            try:
                listing.click()
                time.sleep(5)
                business = Business()
                # Add your scraping logic here...
                business_list.business_list.append(business)
            except Exception as e:
                print(f'Error occurred: {e}')

        # output
        business_list.save_to_excel(f"google_maps_data_{search_for}".replace(' ', '_'))
        business_list.save_to_csv(f"google_maps_data_{search_for}".replace(' ', '_'))

    driver.quit()


def main(search_term, num_results):
    # Newer, Streamlit-driven version; note that this redefines main() above.
    scraper = GoogleMapScraper()
    scraper.config_driver()
    scraper.load_companies(search_term, num_results)  # Pass num_results to load_companies
    scraper.driver.quit()
    st.dataframe(scraper.data_df)
@dataclass
class BusinessList:
    """holds list of Business objects,
    and saves to both excel and csv
    """
    business_list: list[Business] = field(default_factory=list)
    save_at = 'output'

    def dataframe(self):
        """transform business_list to pandas dataframe

        Returns: pandas dataframe
        """
        return pd.json_normalize(
            (asdict(business) for business in self.business_list), sep="_"
        )

    def save_to_excel(self, filename):
        """saves pandas dataframe to excel (xlsx) file

        Args:
            filename (str): filename
        """
        if not os.path.exists(self.save_at):
            os.makedirs(self.save_at)
        self.dataframe().to_excel(f"{self.save_at}/{filename}.xlsx", index=False)

    def save_to_csv(self, filename):
        """saves pandas dataframe to csv file

        Args:
            filename (str): filename
        """
        if not os.path.exists(self.save_at):
            os.makedirs(self.save_at)
        self.dataframe().to_csv(f"{self.save_at}/{filename}.csv", index=False)


def extract_coordinates_from_url(url: str) -> tuple[float, float]:
    """helper function to extract coordinates from url"""
    coordinates = url.split('/@')[-1].split(',')[0:2]
    # return latitude, longitude
    return float(coordinates[0]), float(coordinates[1])
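# Illustrative example (the URL below is made up, not a real listing):
# extract_coordinates_from_url("https://www.google.com/maps/place/Foo/@40.7128,-74.0060,17z")
# returns (40.7128, -74.0060), i.e. (latitude, longitude).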
class GoogleMapScraper:
    def __init__(self):
        self.output_file_name = "google_map_business_data.csv"
        self.headless = False
        self.driver = None
        self.unique_check = []
        self.data_df = pd.DataFrame(columns=[
            'company_name', 'rating', 'reviews_count', 'address', 'category', 'phone', 'website'
        ])  # Initialize an empty DataFrame with the specified columns

    def config_driver(self):
        options = webdriver.ChromeOptions()
        if self.headless:
            options.add_argument("--headless")
        # Use the service argument instead of executable_path
        service = Service(ChromeDriverManager().install())
        self.driver = webdriver.Chrome(service=service, options=options)

    def save_data(self, data):
        self.data_df.loc[len(self.data_df)] = data
        self.data_df.to_csv(self.output_file_name, index=False)
    def parse_contact(self, business):
        try:
            contact = business.find_elements(By.CLASS_NAME, "W4Efsd")[3].text.split("·")[-1].strip()
        except IndexError:
            contact = ""
        if "+1" not in contact:
            try:
                contact = business.find_elements(By.CLASS_NAME, "W4Efsd")[4].text.split("·")[-1].strip()
            except IndexError:
                contact = ""
        return contact
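    # Note: the "+1" check above assumes US-formatted phone numbers; listings in other
    # countries usually fail that check, fall through to the fallback index, and can
    # end up with an empty contact field.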
    def parse_rating_and_review_count(self, business):
        try:
            reviews_block = business.find_element(By.CLASS_NAME, 'AJB7ye').text.split("(")
            rating = reviews_block[0].strip()
            reviews_count = reviews_block[1].split(")")[0].strip()
        except (IndexError, NoSuchElementException):
            rating = ""
            reviews_count = ""
        return rating, reviews_count
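    # Illustrative example (assuming the reviews block reads something like "4.5(123)"):
    # splitting on "(" and ")" would give rating = "4.5" and reviews_count = "123".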
    def parse_address_and_category(self, business):
        try:
            address_block = business.find_elements(By.CLASS_NAME, "W4Efsd")[2].text.split("·")
            if len(address_block) >= 2:
                address = address_block[1].strip()
                category = address_block[0].strip()
            elif len(address_block) == 1:
                address = ""
                category = address_block[0]
            else:
                address = ""
                category = ""
        except (IndexError, NoSuchElementException):
            address = ""
            category = ""
        return address, category
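    # Illustrative example (assuming the third "W4Efsd" block reads "Coffee shop · 123 Main St"):
    # splitting on "·" would give category = "Coffee shop" and address = "123 Main St".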
    def get_business_info(self, num_results):
        time.sleep(2)  # Wait for the page to load
        scroll_pause_time = 5
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            businesses = self.driver.find_elements(By.CLASS_NAME, 'THOPZb')
            for business in businesses:
                name = business.find_element(By.CLASS_NAME, 'fontHeadlineSmall').text
                rating, reviews_count = self.parse_rating_and_review_count(business)
                address, category = self.parse_address_and_category(business)
                contact = self.parse_contact(business)
                try:
                    website = business.find_element(By.CLASS_NAME, "lcr4fd").get_attribute("href")
                except NoSuchElementException:
                    website = ""
                unique_id = "".join([name, rating, reviews_count, address, category, contact, website])
                if unique_id not in self.unique_check:
                    data = [name, rating, reviews_count, address, category, contact, website]
                    self.save_data(data)
                    self.unique_check.append(unique_id)
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause_time)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

    def load_companies(self, search_term, num_results):
        # Construct the Google Maps search URL based on user input
        url = f"https://www.google.com/maps/search/{search_term}"
        self.driver.get(url)
        # Wait for the page to fully load
        WebDriverWait(self.driver, 50).until(
            lambda driver: driver.execute_script('return document.readyState') == 'complete'
        )
        print("Getting business information...")
        self.get_business_info(num_results)
# Example usage with Streamlit
if __name__ == "__main__":
    st.title("Google Maps Business Scraper")
    search_term = st.text_input("Enter search term (e.g., 'coffee shops in New York')")
    num_results = st.number_input('Enter the number of results you want', min_value=1, value=10, step=1)
    if st.button("Scrape"):
        scraper = GoogleMapScraper()
        scraper.config_driver()
        scraper.load_companies(search_term, num_results)
        scraper.driver.quit()
        st.dataframe(scraper.data_df)
I want the full data, with every field filled in correctly for the user; right now the phone number is missing, and sometimes the address or website as well.
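For reference, here is a minimal sketch of the kind of scrolling I suspect is needed, since the result cards seem to load lazily inside the side panel rather than in the page body that get_business_info scrolls. The 'div[role="feed"]' selector is an assumption from inspecting the page, not something I have verified across layouts:

def scroll_results_panel(driver, max_rounds=20, pause=2):
    # Scroll the results side panel (not document.body) so more listing cards load.
    # 'div[role="feed"]' is an assumed selector for the scrollable panel; verify it
    # in DevTools, since Google Maps markup changes frequently.
    panel = driver.find_element(By.CSS_SELECTOR, 'div[role="feed"]')
    last_count = 0
    for _ in range(max_rounds):
        driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight;", panel)
        time.sleep(pause)
        cards = driver.find_elements(By.CLASS_NAME, 'THOPZb')
        if len(cards) == last_count:
            break  # nothing new loaded, assume the end of the list was reached
        last_count = len(cards)
    return last_count

The idea would be to call something like this before collecting the listing elements in get_business_info, so that more than the first handful of cards are present in the DOM.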