I’m working on a Python script to parse and format address data using Pandas and the usaddress library. I have a DataFrame with two columns: address_main and address_cnr. Each row contains an address in the address_main column and a corner address in the address_cnr column.
Here’s a sample input DataFrame:
import pandas as pd
# Two parallel columns: row i of address_cnr is the corner address that
# belongs to row i of address_main.
data = {
    'address_main': [
        '585 - 595 salisbury highway',
        '91 queen street',
        'arndale shopping centre',
        'bay street',
        'capalaba park shopping centre',
    ],
    'address_cnr': [
        'cnr. greenfields drive',
        'cnr albert & elizabeth streets',
        'cnr hanson & torrens rd',
        'cnr park street',
        'cnr redland bay & mt cotton road',
    ],
}
df = pd.DataFrame(data)
I want to parse these addresses using the usaddress library and format the parsed components into separate columns in the DataFrame. Here’s the desired output format:
address_main,address_cnr,street_number,street_name1,street_type1,street_name2,street_type2,street_name3,street_type3
585 - 595 salisbury highway,cnr. greenfields drive,585 - 595,salisbury,highway,greenfields,drive,,
91 queen street,cnr albert & elizabeth streets,91,queen,street,albert,street,elizabeth,street
arndale shopping centre,cnr hanson & torrens rd,,hanson,rd,torrens,rd,,
bay street,cnr park street,,bay,street,park,street,,
capalaba park shopping centre,cnr redland bay & mt cotton road,,redland,bay,mt cotton,road,,
I’ve written a script to achieve this, but I’m having trouble getting the desired output format. Can someone please help me modify my script or suggest a better approach to achieve the desired output?
Here’s the script I have so far:
import pandas as pd
import usaddress
# Sample data: the two lists are paired by position, one entry per row.
main_addresses = [
    '585 - 595 salisbury highway',
    '91 queen street',
    'arndale shopping centre',
    'bay street',
    'capalaba park shopping centre',
]
corner_addresses = [
    'cnr. greenfields drive',
    'cnr albert & elizabeth streets',
    'cnr hanson & torrens rd',
    'cnr park street',
    'cnr redland bay & mt cotton road',
]
data = {'address_main': main_addresses, 'address_cnr': corner_addresses}

# Build the working DataFrame from the sample columns.
df = pd.DataFrame(data)
def parse_address(address):
    """Tag *address* with usaddress and return the tagged components.

    ``usaddress.tag`` yields a ``(components, address_type)`` pair; only the
    components mapping is returned. When tagging raises
    ``usaddress.RepeatedLabelError`` the failure is swallowed silently and an
    empty dict is returned, so callers see "no components" rather than an
    exception.
    """
    try:
        tagged, _ = usaddress.tag(address)
    except usaddress.RepeatedLabelError:
        return {}
    return tagged
def extract_address_components(parsed_address, prefix='street'):
    """Map tagged address labels onto ``<prefix>_*`` output fields.

    Args:
        parsed_address: Mapping of label -> text, as returned by
            ``parse_address``.
        prefix: Leading part of every key in the result.

    Returns:
        A dict with the keys ``<prefix>_number``, ``<prefix>_name1``,
        ``<prefix>_type1``, ``<prefix>_name2``, ``<prefix>_type2``,
        ``<prefix>_name3`` and ``<prefix>_type3`` (in that order). A label
        missing from *parsed_address* yields ``''``.
    """
    # (output suffix, source label) pairs, in output order.
    # NOTE(review): confirm usaddress ever emits the "Third..." labels; if it
    # does not, name3/type3 are always ''.
    suffix_to_label = (
        ('number', 'AddressNumber'),
        ('name1', 'StreetName'),
        ('type1', 'StreetNamePostType'),
        ('name2', 'SecondStreetName'),
        ('type2', 'SecondStreetNamePostType'),
        ('name3', 'ThirdStreetName'),
        ('type3', 'ThirdStreetNamePostType'),
    )
    return {
        f'{prefix}_{suffix}': parsed_address.get(label, '')
        for suffix, label in suffix_to_label
    }
# Add the output columns up front, each filled with the empty string, so rows
# that yield no parsed value keep '' rather than NaN.
for column in (
    'street_number',
    'street_name1',
    'street_type1',
    'street_name2',
    'street_type2',
    'street_name3',
    'street_type3',
):
    df[column] = ''
# Fill the street_* columns one row at a time: the main address supplies the
# number and the first street, the corner address supplies streets two and
# three.
for index, row in df.iterrows():
    # Parse the main address. Guarded like address_cnr below: a missing value
    # is not a string, so calling .replace on it would raise AttributeError.
    if pd.notnull(row['address_main']):
        # Collapse ' - ' to '-' before tagging (presumably so a number range
        # such as '585 - 595' reaches usaddress as one token).
        parsed_main = parse_address(row['address_main'].replace(' - ', '-'))
        main_components = extract_address_components(parsed_main, 'street')
        # Assign main address components to DataFrame
        df.at[index, 'street_number'] = main_components['street_number']
        df.at[index, 'street_name1'] = main_components['street_name1']
        df.at[index, 'street_type1'] = main_components['street_type1']

    # Parse the corner address for additional street names and types
    if pd.notnull(row['address_cnr']):
        parsed_cnr = parse_address(row['address_cnr'])
        cnr_components = extract_address_components(parsed_cnr, 'street')
        # The corner's first/second street become this row's second/third.
        df.at[index, 'street_name2'] = cnr_components['street_name1']
        df.at[index, 'street_type2'] = cnr_components['street_type1']
        df.at[index, 'street_name3'] = cnr_components['street_name2']
        df.at[index, 'street_type3'] = cnr_components['street_type2']

# Output the DataFrame. As a bare expression this only displays in an
# interactive session (REPL/notebook); wrap it in print() when running the
# file as a script.
df.head()