I have a Python script that scrapes data from CoinGecko. It works perfectly on my laptop but throws an error when I run it on my PC. The script is as follows:
<code>import urllib.request
import urllib.error
from bs4 import BeautifulSoup
import pandas as pd
import gzip
import brotli
import io
import time

# Function to get the page content with custom headers
def get_page_content(url):
    req = urllib.request.Request(url, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    })
    response = urllib.request.urlopen(req)
    # Handle different content encodings
    if response.info().get('Content-Encoding') == 'gzip':
        buf = io.BytesIO(response.read())
        data = gzip.GzipFile(fileobj=buf).read()
    elif response.info().get('Content-Encoding') == 'br':
        data = brotli.decompress(response.read())
    else:
        data = response.read()
    return data

# Function to extract table data from a given page URL
def extract_table_data(page_url):
    try:
        webpage = get_page_content(page_url)
        soup = BeautifulSoup(webpage, 'html.parser')
        div_element = soup.find('div', class_='tw-mb-6 lg:tw-mb-12')
        if div_element:
            html_table = div_element.find('table')
            if html_table:
                df = pd.read_html(str(html_table))[0]
                df = df.loc[:, df.columns[1:-1]]  # Adjust the columns as per your requirement
                return df
            else:
                print(f"No table found in the specified div for URL: {page_url}")
        else:
            print(f"Specified div element not found for URL: {page_url}")
    except Exception as e:
        print(f"An error occurred for URL {page_url}: {str(e)}")
    return None

# Base URL
base_url = 'https://www.coingecko.com/en/coins/1/markets/spot?page='

# DataFrame to collect all data
all_data = pd.DataFrame()

# Start page
page = 1
max_retries = 3
retry_delay = 5
max_consecutive_errors = 5
consecutive_errors = 0

while True:
    url = base_url + str(page)
    print(f"Processing {url}")
    retries = 0
    while retries < max_retries:
        try:
            df = extract_table_data(url)
            if df is not None:
                all_data = pd.concat([all_data, df], ignore_index=True)
                consecutive_errors = 0  # Reset consecutive errors counter
                break  # Successfully retrieved data, break out of the retry loop
            else:
                print(f"No data found on page {page}, stopping.")
                consecutive_errors += 1
                break
        except urllib.error.HTTPError as e:
            if e.code == 404:
                print(f"HTTP Error 404 on page {page}. Stopping.")
                consecutive_errors += 1
                break
            else:
                print(f"HTTP Error on page {page}: {e.code}. Retrying...")
                retries += 1
                time.sleep(retry_delay)
        except Exception as e:
            print(f"An error occurred on page {page}: {str(e)}. Retrying...")
            retries += 1
            time.sleep(retry_delay)
    if consecutive_errors >= max_consecutive_errors:
        print(f"Stopping due to {max_consecutive_errors} consecutive errors.")
        break
    page += 1

# Save the complete DataFrame to CSV in the specified path
save_path = r'C:\Users\hamid\Downloads\Crypto_Data_Table.csv'
all_data.to_csv(save_path, index=False)
print(f"All data saved to '{save_path}'")
</code>
When I run it on my PC, I receive the following error:
<code>Processing https://www.coingecko.com/en/coins/1/markets/spot?page=1
An error occurred for URL https://www.coingecko.com/en/coins/1/markets/spot?page=1: nonnumeric port: 'port'
No data found on page 1, stopping.
Processing https://www.coingecko.com/en/coins/1/markets/spot?page=2
An error occurred for URL https://www.coingecko.com/en/coins/1/markets/spot?page=2: nonnumeric port: 'port'
No data found on page 2, stopping.
...
Stopping due to 5 consecutive errors.
All data saved to 'C:\Users\hamid\Downloads\Crypto_Data_Table.csv'
</code>
I’m not using a proxy, and the same script works fine on my laptop. What could be causing this issue on my PC, and how can I resolve it? Thanks in advance.
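In case it is relevant: I understand that urllib can pick up proxy settings from environment variables or from the Windows system settings even when the script itself does not configure a proxy. Below is a small diagnostic sketch (the variable names are my own) that prints whatever proxy configuration urllib sees; I can run it on both machines and post the output if that would help.
<code>import os
import urllib.request

# Diagnostic sketch: show the proxy configuration urllib detects on this machine
# (environment variables and, on Windows, the registry-based system settings).
print("Proxies detected by urllib:", urllib.request.getproxies())

# Also list any proxy-related environment variables that happen to be set.
for name in ('HTTP_PROXY', 'HTTPS_PROXY', 'http_proxy', 'https_proxy', 'ALL_PROXY', 'NO_PROXY'):
    value = os.environ.get(name)
    if value is not None:
        print(f"{name} = {value!r}")
</code>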