I’m trying to scrape some linkedin user’s profile. So it goes to the “people” page and goes one by one to get the information and save on database. If the profile’s not public it will simply move on… if the profile’s public it will click on the profile’s card and extract the information on the profile’s page. But when I print inside my “for” it only prints the first profile found, it doesn’t iterate at all. Also, I’m clueless on how to do the automatic scroll. Here is the code:
def run_query():
url = input_field.get()
if(url == '' or not url.startswith("https://www.linkedin.com/company/")):
CTkMessagebox(title="Info", icon="cancel", message="Empty or invalid URL!")
return
driver.get(url)
driver.maximize_window()
driver.implicitly_wait(30)
print(url)
driver.save_screenshot('LINKEDINPAGE.png')
company_title = driver.find_element(By.XPATH, "//div[@class='relative']//p[@class='org-top-card-summary__tagline']").text
print(company_title)
industry = driver.find_element(By.CLASS_NAME, 'org-top-card-summary-info-list__info-item').text
print(industry)
people_url = url + '/people/'
driver.get(people_url)
driver.implicitly_wait(10)
previous_page_height = driver.execute_script("return document.body.scrollHeight")
employees = driver.find_elements(By.XPATH, "//ul[@class='display-flex list-style-none flex-wrap']//li[@class='grid grid__col--lg-8 block org-people-profile-card__profile-card-spacing']")
for employee in employees:
driver.save_screenshot("employees.png")
linkedin_user = employee.find_element(By.XPATH, "//div[@class='org-people-profile-card__profile-info']//div[@class='ember-view lt-line-clamp lt-line-clamp--single-line org-people-profile-card__profile-title t-black']").text
print(linkedin_user)
linkedin_user_position = driver.find_element(By.XPATH, "//div[@class='org-people-profile-card__profile-info']//div[@class='ember-view lt-line-clamp lt-line-clamp--multi-line']").text
print(linkedin_user_position)
if(linkedin_user != "LinkedIn Member"):
employee_card = driver.find_element(By.XPATH, "//div[@class='org-people-profile-card__profile-info']")
employee_link = employee_card.find_element(By.TAG_NAME, 'a').click()
print('entered the profile')
driver.save_screenshot('LINKEDINPROFILE.png')
driver.implicitly_wait(10)
employee_name = driver.find_element(By.CLASS_NAME, "text-heading-xlarge inline t-24 v-align-middle break-words").text
employee_position = driver.find_element(By.CLASS_NAME, "text-body-medium break-words").text
experiences = driver.find_elements(By.XPATH, "//section[@class='artdeco-card pv-profile-card break-words mt2']//ul[@class='bYpMhDbGeHrPlJHJoKkUOYvSmZyJIyEvfrLc']//li[@class='artdeco-list__item MHGhNRxrFEYePDVElXiPzsGcfCtVjWRE UHceesMDBsAFbVbsOghsJALmURLeFDeg']")
first_experience = experiences[0]
first_position_start_date = first_experience.find_element(By.XPATH, "//div[@class='display-flex flex-column full-width align-self-center']//span[@class='t-14 t-normal t-black--light']//span[@class='pvs-entity__caption-wrapper']").text
cur.execute('INSERT INTO tbl_LinkedIn_Company_Employee (Name, Position, StartDate) VALUES (%s, %s, %s)', (employee_name, employee_position, first_position_start_date.split('-')[0]))
print(employee_name)
print(employee_position)
con.commit()
def pagination():
while True:
save_employee()
new_page_height = driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
if new_page_height == previous_page_height:
break
previous_page_height = new_page_height
confirm_button = ctkinter.CTkButton(master=topview, command=run_query, text="OK", fg_color=("#DB3E39", "#821D1A")).place(relx=0.5, relyx=0.5, anchor='c')
Also, I wonder in an infinite scroll page I can just simply click on “show more” or keep scrolling until the end, and, AFTER that it goes to the first element of the loop until the end.
5