I have a problem with the following script, I want to save data from my MongoDB which has about 15K Documents but the .csv file onnly contain 858 lines and then stops automatically and does not continue.
Also only the mac_addresses_results_intermediate.csv
is created and not the final result .csv file
PyMongo should not have pagination by default but something is limiting it.
I programm in VSC but I dont think that this the problem.
Here is the full code (without critical information):
import datetime
import pandas as pd
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from mac_vendor_lookup import MacLookup, VendorNotFoundError
# MongoDB URI with placeholders for credentials
uri = "mongodb+srv://<username>:<password>@your_cluster_address/?retryWrites=true&w=majority&appName=YourAppName"
# Replace placeholders with actual MongoDB credentials
uri = uri.replace("<username>", "your_username")
uri = uri.replace("<password>", "your_password")
try:
# Create a client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))
# Ping the server to check the connection
client.admin.command('ping')
print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
print(e)
# Connect to the appropriate database and collection
db = client['genieacs']
collection = db['devices']
# Load CSV file containing MAC ages data
csv_url = "https://raw.githubusercontent.com/hdm/mac-ages/main/data/mac-ages.csv"
mac_ages_df = pd.read_csv(csv_url, header=None, names=["mac_prefix", "registration_date", "source"])
# Function to recursively find MAC addresses in LAN devices
def find_mac_addresses(data):
mac_addresses = []
if isinstance(data, dict):
for key, value in data.items():
if key == 'MACAddress' and isinstance(value, dict) and '_value' in value:
mac_addresses.append((value['_value'], data.get('HostName', {}).get('_value', '<unknown>')))
else:
mac_addresses.extend(find_mac_addresses(value))
elif isinstance(data, list):
for item in data:
mac_addresses.extend(find_mac_addresses(item))
return mac_addresses
# Function to retrieve registration date based on MAC prefix
def get_registration_date(mac_shortened, mac_ages_df):
entry = mac_ages_df[mac_ages_df['mac_prefix'].str.startswith(mac_shortened)]
if not entry.empty:
return entry.iloc[0]['registration_date']
return "N/A"
# Vendor lookup using MacLookup library
mac_lookup = MacLookup()
# Update vendor database
try:
mac_lookup.update_vendors()
print("Vendor database updated successfully.")
except Exception as e:
print(f"Failed to update vendor database: {e}")
# Prepare list and intermediate results DataFrame
results = []
intermediate_results = []
# Iterate through all entries in the 'devices' collection
for entry in collection.find():
entry_id = entry.get('_id', '')
# Traverse path structure to find MAC addresses
lan_devices = entry.get('InternetGatewayDevice', {}).get('LANDevice', {}).get('1', {}).get('Hosts', {}).get('Host', {})
mac_addresses = find_mac_addresses(lan_devices)
for mac, hostname in mac_addresses:
mac_shortened = mac.replace(':', '')[:6].lower()
registration_date = get_registration_date(mac_shortened, mac_ages_df)
try:
vendor_name = mac_lookup.lookup(mac)
if vendor_name is None:
vendor_name = "N/A"
except VendorNotFoundError as e:
# Handle VendorNotFoundError
vendor_name = "N/A"
except Exception as e:
# Handle other exceptions during vendor lookup
vendor_name = "N/A"
# Collect results
results.append({
"DeviceID": entry_id,
"MAC": mac,
"RegistrationDate": registration_date,
"Hostname": hostname,
"VendorName": vendor_name
})
# Collect intermediate results for CSV storage
intermediate_results.append({
"DeviceID": entry_id,
"MAC": mac,
"RegistrationDate": registration_date,
"Hostname": hostname,
"VendorName": vendor_name
})
# Save intermediate results to CSV file
intermediate_results_df = pd.DataFrame(intermediate_results)
output_csv_file = 'mac_addresses_results_intermediate.csv'
intermediate_results_df.to_csv(output_csv_file, index=False)
# Convert final results to DataFrame
results_df = pd.DataFrame(results)
# Save final results to CSV file with current date in filename
output_csv_file = 'mac_addresses_results_' + str(datetime.date.today()) + '.csv'
results_df.to_csv(output_csv_file, index=False)
print(f"Final results written to file '{output_csv_file}'.")