I made a Google Maps scraper using Selenium and Streamlit, but I only get 5 to 7 results, and the results are not complete: the phone number or address is often missing.
import time
import streamlit as st
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from dataclasses import dataclass, asdict, field
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from my_webdriver import Business
import os
def main():
    # Older, standalone version of the scraper: it types each search term into the
    # Google Maps search box and scrolls with PAGE_DOWN to load more listings.
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)
    driver.get("https://www.google.com/maps")
    time.sleep(5)

    search_list = ["search_term1", "search_term2", "search_term3"]  # Replace with your actual search terms

    for search_for_index, search_for in enumerate(search_list):
        print(f"-----\n{search_for_index} - {search_for}".strip())

        search_field = driver.find_element(By.NAME, "q")
        search_field.send_keys(search_for)
        search_field.submit()
        time.sleep(5)

        # scrolling
        actions = ActionChains(driver)
        actions.move_to_element(
            driver.find_element(By.XPATH, '//a[contains(@href, "https://www.google.com/maps/place")]')
        )
        actions.perform()

        previously_counted = 0
        total = 100  # Replace with your desired number of listings
        while True:
            actions = ActionChains(driver)
            actions.send_keys(Keys.PAGE_DOWN)
            actions.perform()
            time.sleep(5)

            listings = driver.find_elements(By.XPATH, '//a[contains(@href, "https://www.google.com/maps/place")]')
            if len(listings) >= total:
                listings = listings[:total]
                print(f"Total Scraped: {len(listings)}")
                break
            elif len(listings) == previously_counted:
                print(f"Arrived at all available\nTotal Scraped: {len(listings)}")
                break
            else:
                previously_counted = len(listings)
                print(f"Currently Scraped: {len(listings)}")

        business_list = BusinessList()

        # scraping
        for listing in listings:
            try:
                listing.click()
                time.sleep(5)
                business = Business()
                # Add your scraping logic here...
                business_list.business_list.append(business)
            except Exception as e:
                print(f'Error occurred: {e}')

        # output
        business_list.save_to_excel(f"google_maps_data_{search_for}".replace(' ', '_'))
        business_list.save_to_csv(f"google_maps_data_{search_for}".replace(' ', '_'))

    driver.quit()


def main(search_term, num_results):
    # Newer, Streamlit-driven version; note that this redefines main() above.
    scraper = GoogleMapScraper()
    scraper.config_driver()
    scraper.load_companies(search_term, num_results)  # Pass num_results to load_companies
    scraper.driver.quit()
    st.dataframe(scraper.data_df)
@dataclass
class BusinessList:
    """holds list of Business objects,
    and saves to both excel and csv
    """
    business_list: list[Business] = field(default_factory=list)
    save_at = 'output'

    def dataframe(self):
        """transform business_list to pandas dataframe

        Returns: pandas dataframe
        """
        return pd.json_normalize(
            (asdict(business) for business in self.business_list), sep="_"
        )

    def save_to_excel(self, filename):
        """saves pandas dataframe to excel (xlsx) file

        Args:
            filename (str): filename
        """
        if not os.path.exists(self.save_at):
            os.makedirs(self.save_at)
        self.dataframe().to_excel(f"{self.save_at}/{filename}.xlsx", index=False)

    def save_to_csv(self, filename):
        """saves pandas dataframe to csv file

        Args:
            filename (str): filename
        """
        if not os.path.exists(self.save_at):
            os.makedirs(self.save_at)
        self.dataframe().to_csv(f"{self.save_at}/{filename}.csv", index=False)


def extract_coordinates_from_url(url: str) -> tuple[float, float]:
    """helper function to extract coordinates from url"""
    coordinates = url.split('/@')[-1].split(',')[0:2]
    # return latitude, longitude
    return float(coordinates[0]), float(coordinates[1])
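# Illustrative example (the URL below is made up, not a real listing):
# extract_coordinates_from_url("https://www.google.com/maps/place/Foo/@40.7128,-74.0060,17z")
# returns (40.7128, -74.0060), i.e. (latitude, longitude).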
class GoogleMapScraper:
    def __init__(self):
        self.output_file_name = "google_map_business_data.csv"
        self.headless = False
        self.driver = None
        self.unique_check = []
        self.data_df = pd.DataFrame(columns=[
            'company_name', 'rating', 'reviews_count', 'address', 'category', 'phone', 'website'
        ])  # Initialize an empty DataFrame with the specified columns

    def config_driver(self):
        options = webdriver.ChromeOptions()
        if self.headless:
            options.add_argument("--headless")
        # Use the service argument instead of executable_path
        service = Service(ChromeDriverManager().install())
        self.driver = webdriver.Chrome(service=service, options=options)

    def save_data(self, data):
        self.data_df.loc[len(self.data_df)] = data
        self.data_df.to_csv(self.output_file_name, index=False)
    def parse_contact(self, business):
        try:
            contact = business.find_elements(By.CLASS_NAME, "W4Efsd")[3].text.split("·")[-1].strip()
        except IndexError:
            contact = ""
        if "+1" not in contact:
            try:
                contact = business.find_elements(By.CLASS_NAME, "W4Efsd")[4].text.split("·")[-1].strip()
            except IndexError:
                contact = ""
        return contact
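    # Note: the "+1" check above assumes US-formatted phone numbers; listings in other
    # countries usually fail that check, fall through to the fallback index, and can
    # end up with an empty contact field.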
    def parse_rating_and_review_count(self, business):
        try:
            reviews_block = business.find_element(By.CLASS_NAME, 'AJB7ye').text.split("(")
            rating = reviews_block[0].strip()
            reviews_count = reviews_block[1].split(")")[0].strip()
        except (IndexError, NoSuchElementException):
            rating = ""
            reviews_count = ""
        return rating, reviews_count
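    # Illustrative example (assuming the reviews block reads something like "4.5(123)"):
    # splitting on "(" and ")" would give rating = "4.5" and reviews_count = "123".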
    def parse_address_and_category(self, business):
        try:
            address_block = business.find_elements(By.CLASS_NAME, "W4Efsd")[2].text.split("·")
            if len(address_block) >= 2:
                address = address_block[1].strip()
                category = address_block[0].strip()
            elif len(address_block) == 1:
                address = ""
                category = address_block[0]
            else:
                address = ""
                category = ""
        except (IndexError, NoSuchElementException):
            address = ""
            category = ""
        return address, category
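    # Illustrative example (assuming the third "W4Efsd" block reads "Coffee shop · 123 Main St"):
    # splitting on "·" would give category = "Coffee shop" and address = "123 Main St".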
    def get_business_info(self, num_results):
        time.sleep(2)  # Wait for the page to load
        scroll_pause_time = 5
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            businesses = self.driver.find_elements(By.CLASS_NAME, 'THOPZb')
            for business in businesses:
                name = business.find_element(By.CLASS_NAME, 'fontHeadlineSmall').text
                rating, reviews_count = self.parse_rating_and_review_count(business)
                address, category = self.parse_address_and_category(business)
                contact = self.parse_contact(business)
                try:
                    website = business.find_element(By.CLASS_NAME, "lcr4fd").get_attribute("href")
                except NoSuchElementException:
                    website = ""
                unique_id = "".join([name, rating, reviews_count, address, category, contact, website])
                if unique_id not in self.unique_check:
                    data = [name, rating, reviews_count, address, category, contact, website]
                    self.save_data(data)
                    self.unique_check.append(unique_id)
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause_time)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

    def load_companies(self, search_term, num_results):
        # Construct the Google Maps search URL based on user input
        url = f"https://www.google.com/maps/search/{search_term}"
        self.driver.get(url)
        # Wait for the page to fully load
        WebDriverWait(self.driver, 50).until(
            lambda driver: driver.execute_script('return document.readyState') == 'complete'
        )
        print("Getting business information...")
        self.get_business_info(num_results)
# Example usage with Streamlit
if __name__ == "__main__":
    st.title("Google Maps Business Scraper")
    search_term = st.text_input("Enter search term (e.g., 'coffee shops in New York')")
    num_results = st.number_input('Enter the number of results you want', min_value=1, value=10, step=1)
    if st.button("Scrape"):
        scraper = GoogleMapScraper()
        scraper.config_driver()
        scraper.load_companies(search_term, num_results)
        scraper.driver.quit()
        st.dataframe(scraper.data_df)
I want the full data, with every field filled in correctly for the user; right now the phone number is missing, and sometimes the address or website as well.
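For reference, here is a minimal sketch of the kind of scrolling I suspect is needed, since the result cards seem to load lazily inside the side panel rather than in the page body that get_business_info scrolls. The 'div[role="feed"]' selector is an assumption from inspecting the page, not something I have verified across layouts:

def scroll_results_panel(driver, max_rounds=20, pause=2):
    # Scroll the results side panel (not document.body) so more listing cards load.
    # 'div[role="feed"]' is an assumed selector for the scrollable panel; verify it
    # in DevTools, since Google Maps markup changes frequently.
    panel = driver.find_element(By.CSS_SELECTOR, 'div[role="feed"]')
    last_count = 0
    for _ in range(max_rounds):
        driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight;", panel)
        time.sleep(pause)
        cards = driver.find_elements(By.CLASS_NAME, 'THOPZb')
        if len(cards) == last_count:
            break  # nothing new loaded, assume the end of the list was reached
        last_count = len(cards)
    return last_count

The idea would be to call something like this before collecting the listing elements in get_business_info, so that more than the first handful of cards are present in the DOM.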