Every time I run my code, the lengths of the lists come out different from the last run, and jobTitles and links are always longer than companyName and jobLocation. All four lists should be the same length. I'm not sure what is happening that makes them different lengths, and the gap isn't even the same size from one run to the next.
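To see how a single result card could add more than one entry to a list, I put together this standalone check with a fake card. The HTML below is made up to illustrate, not copied from a real Indeed page, so the exact structure is an assumption:

from bs4 import BeautifulSoup

# A made-up job card, only to check how find_all counts matches;
# the markup here is invented and may not match Indeed's real pages.
card_html = """
<div class="css-dekpa e37uo190">
  <a href="/rc/clk?jk=123"><span>Junior Developer</span><span>new</span></a>
</div>
"""
card = BeautifulSoup(card_html, 'html.parser').find('div')
print(len(card.find_all('span')))  # 2 spans from one card
print(len(card.find_all('a')))     # 1 link from the same card

With markup like that, one card would append two titles but only one link, so the counts could drift apart. Here is my full script: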
import hrequests
from bs4 import BeautifulSoup
import pandas as pd
import csv
####################
## INDEED SCRAPER ##
####################
# Creates the CSV in write mode
with open('Indeed_Jobs.csv', 'w', newline='') as file:
    writer = csv.writer(file)

# Creating lists for the scraped info
jobTitles = []
companyName = []
jobLocation = []
links = []

# Creates the list of search URLs
URLs = ["https://www.indeed.com/jobs?q=IT+Entry+Level&l=United+States&from=searchOnHP&vjk=55e3c7e5e7a919c9",
        "https://www.indeed.com/jobs?q=development+entry+level&l=United+States&from=searchOnDesktopSerp&vjk=996b270bd119f225",
        "https://www.indeed.com/jobs?q=user+experience+entry+level&l=United+States&from=searchOnDesktopSerp&vjk=4fc6a443fd7c11df",
        "https://www.indeed.com/jobs?q=ux%2Fui+experience+entry+level&l=United+States&from=searchOnDesktopSerp&vjk=254dcd8c33926527",
        "https://www.indeed.com/jobs?q=data+entry+level&l=United+States&from=searchOnDesktopSerp&vjk=bd4fd45f0feb91c2"]

for url in URLs:
    # Connecting to Indeed and reading the HTML
    target_url = url
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Connection": "keep-alive",
            "Accept-Language": "en-US,en;q=0.9,lt;q=0.8,et;q=0.7,de;q=0.6",
            }
    resp = hrequests.get(target_url, headers=head)
    soup = BeautifulSoup(resp.text, 'html.parser')

    # Finds all job title cards in the results
    for div in soup.find_all('div', {'class': 'css-dekpa e37uo190'}):
        # Pulls the job title
        for span in div.find_all('span'):
            jobTitles.append(span.text.strip())
        # Pulls the link to the job
        for a in div.find_all('a'):
            links.append("https://www.indeed.com" + a.get('href'))

    # Pulls the company name
    for div in soup.find_all('div', {'class': 'company_location css-17fky0v e37uo190'}):
        for span in div.find_all('span', {'data-testid': 'company-name'}):
            companyName.append(span.text.strip())
        # Pulls the job location
        for divLocation in div.find_all('div', {'data-testid': 'text-location'}):
            jobLocation.append(divLocation.text.strip())
    # Changing the heading on the CSV depending on which URL was searched
    if url == "https://www.indeed.com/jobs?q=development+entry+level&l=United+States&from=searchOnDesktopSerp&vjk=996b270bd119f225":
        additional_row = ["Development - Entry Level"]
    elif url == "https://www.indeed.com/jobs?q=IT+Entry+Level&l=United+States&from=searchOnHP&vjk=55e3c7e5e7a919c9":
        additional_row = ["IT - Entry Level"]
    elif url == "https://www.indeed.com/jobs?q=user+experience+entry+level&l=United+States&from=searchOnDesktopSerp&vjk=4fc6a443fd7c11df":
        additional_row = ["User Experience - Entry Level"]
    elif url == "https://www.indeed.com/jobs?q=ux%2Fui+experience+entry+level&l=United+States&from=searchOnDesktopSerp&vjk=254dcd8c33926527":
        additional_row = ["UX/UI - Entry Level"]
    elif url == "https://www.indeed.com/jobs?q=data+entry+level&l=United+States&from=searchOnDesktopSerp&vjk=bd4fd45f0feb91c2":
        additional_row = ["Data - Entry Level"]
    else:
        additional_row = ["Title Not Found"]

    # Blank row to make the formatting nice
    blank_row = ['']

    print(len(jobTitles))
    print(len(companyName))
    print(len(jobLocation))
    print(len(links))
    #
    # # Open the CSV file in append mode
    # with open('Indeed_Jobs.csv', 'a', newline='') as file:
    #     writer = csv.writer(file)
    #     # Write the heading and blank rows for formatting
    #     writer.writerow(blank_row)
    #     writer.writerow(additional_row)
    #     writer.writerow(blank_row)
    #
    #     # Write the jobs to the CSV
    #     df = pd.DataFrame({'Job Title': jobTitles, 'Company Name': companyName, 'Location': jobLocation, 'Link': links})
    #     df.to_csv(file, mode='a', header=True, index=False)
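For reference, the commented-out part at the bottom is where the unequal lengths actually break things: pandas refuses to build a DataFrame from columns of different lengths. A tiny standalone reproduction, nothing Indeed-specific:

import pandas as pd

# pd.DataFrame raises ValueError when the column lists differ in length,
# which is why all four lists above have to line up.
try:
    pd.DataFrame({'Job Title': ['a', 'b', 'c'], 'Company Name': ['x', 'y']})
except ValueError as e:
    print(e)  # All arrays must be of the same length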