I managed to extract the data, and this code works about half-right. I've created a Python script that uses Selenium and Spark to scrape the following from LinkedIn's jobs search section:

- Job title (titre)
- Company name (entreprise)
- Skills (compétences)
- Qualifications
- Location (localisation)
- Field (domaine)
- Description (which I need help getting!)

All columns exist and are extracted, but the description is not always extracted correctly, even when it does exist on LinkedIn.

```python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType, StructType, StructField
from rich.console import Console
from rich.table import Table
import pandas as pd
import time
```

Initialize Spark session:

```python
spark = (
    SparkSession.builder
    .appName("Job Scraper")
    .getOrCreate()
)
```

Define schema explicitly:

```python
schema = StructType([
    StructField("Titre", StringType(), True),
    StructField("Entreprise", StringType(), True),
    StructField("Localisation", StringType(), True),
    StructField("Description", StringType(), True),
])
```

Instantiate global variables:

```python
df = spark.createDataFrame([], schema)
console = Console()
table = Table(show_header=True, header_style="bold")
```

Dictionary of fields and associated keywords:

```python
fields_keywords = {
    "Technologies de l’Information": ["logiciel", "développeur", "IT", "technologies de l’information", "software", "developer", "information technology"],
    "Finance": ["finance", "financier", "banque", "investissement", "financial", "banking", "investment"],
    "Santé": ["santé", "médical", "clinique", "hôpital", "health", "medical", "clinic", "hospital"],
    "Éducation": ["éducation", "école", "université", "collège", "education", "school", "university", "college"],
    "Marketing": ["marketing", "publicité", "SEO", "contenu", "advertising", "content"],
    "Ressources Humaines": ["ressources humaines", "RH", "recrutement", "gestion des talents", "human resources", "HR", "recruitment", "talent management"],
    "Vente": ["vente", "commercial", "prospection", "négociation", "sales", "negotiation"],
    "Ingénierie": ["ingénieur", "ingénierie", "conception", "développement produit", "engineer", "engineering", "design", "product development"],
    "Recherche et Développement": ["R&D", "recherche", "innovation", "développement", "research", "development"],
    "Médias": ["télévision", "TV", "divertissement", "cinéma", "magazines"],
}
```

Function to determine the field based on the job description:
```python
def determine_field(description):
    for field, keywords in fields_keywords.items():
        if any(keyword in description.lower() for keyword in keywords):
            return field
    return "Autre"

determine_field_udf = udf(determine_field, StringType())
```
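A quick sanity check of the matcher (plain Python, no Spark; the sample strings are made up):

```python
print(determine_field("Nous recherchons un développeur logiciel passionné"))
# "Technologies de l’Information" — "développeur" and "logiciel" both match
print(determine_field("Aucun mot-clé connu ici"))
# "Autre" — no keyword list matches
```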
Function to extract skills from the job description:

```python
def extract_skills(description):
    skills_keywords = [
        "python", "java", "c#", "javascript", "c++", "php", "ruby", "swift", "kotlin", "typescript",
        "django", "flask", "spring boot", "react", "angular", "vue.js", "node.js", "ruby on rails", "asp.net", "laravel",
        "html5", "css3", "sass", "less", "bootstrap", "jquery", "restful apis", "graphql",
        "sql", "mysql", "postgresql", "mongodb", "oracle db", "sqlite", "nosql",
        "linux", "windows server", "unix",
        "apache", "nginx", "iis",
        "docker", "kubernetes", "jenkins", "ansible", "terraform", "git", "github", "gitlab", "bitbucket",
        "aws", "azure", "google cloud platform", "cloudformation", "azure devops", "devops",
        "cybersécurité", "cybersecurity", "gestion des identités et des accès", "identity and access management",
        "cryptographie", "cryptography", "tests de pénétration", "penetration testing",
        "analyse des vulnérabilités", "vulnerability analysis", "firewall", "sécurité réseau", "network security",
        "wireshark", "nmap", "metasploit", "burp suite", "owasp zap",
        "statistiques", "statistics", "excel", "r",
        "scikit-learn", "tensorflow", "keras", "pytorch", "algorithmes d’apprentissage automatique", "machine learning algorithms",
        "deep learning", "hadoop", "spark", "kafka",
        "agile", "scrum", "kanban", "waterfall",
        "jira", "trello", "asana", "monday.com",
        "assistance technique", "diagnostic", "résolution de problèmes", "gestion des incidents et des changements",
        "technical support", "diagnosis", "problem solving", "incident and change management",
        "wireframing", "prototypage", "prototyping", "figma", "adobe xd", "sketch",
        "communication", "travail d’équipe", "gestion du temps", "teamwork", "time management",
    ]
    skills = [skill for skill in skills_keywords if skill in description.lower()]
    return ", ".join(skills)

extract_skills_udf = udf(extract_skills, StringType())
```
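Two caveats about this substring matching, related to the output though separate from the description problem: a single-letter entry like "r" matches any description containing the letter r, and uppercase keywords elsewhere (e.g. "IT" or "R&D" in fields_keywords, "M2" or "PhD" in qualifications_keywords) can never match, because only the description is lowercased. A whole-word, case-insensitive variant would look something like this (a sketch; the lookarounds instead of `\b` are so entries like "c++" and "c#" still anchor correctly):

```python
import re

def matches_keyword(keyword, description):
    # Case-insensitive whole-word match; re.escape protects "c++", "c#", "r&d", etc.
    pattern = r"(?<!\w)" + re.escape(keyword.lower()) + r"(?!\w)"
    return re.search(pattern, description.lower()) is not None

print(matches_keyword("r", "Experience with R and Python"))  # True: standalone "r"
print(matches_keyword("r", "senior developer"))              # False: "r" only inside words
print(matches_keyword("IT", "IT support role"))              # True: case-insensitive
```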
Function to extract qualifications from the job description:

```python
def extract_qualifications(description):
    qualifications_keywords = {
        "Expérience": ["expérience", "années d’expérience", "professionnelle", "pratique", "experience", "years of experience", "professional", "practice"],
        "Diplôme": ["diplôme", "certificat", "qualification", "certification", "diploma", "certificate"],
        "Licence": ["licence", "bachelor", "baccalauréat", "license", "bachelor’s degree", "undergraduate degree"],
        "Master": ["master", "mastère", "M2", "M1", "MSc", "MA", "master’s degree"],
        "Doctorat": ["doctorat", "PhD", "DSc", "doctorate"],
        "Certification": ["certification", "certifié", "attestation", "certified"],
        "Compétences": ["compétences", "aptitudes", "skills", "abilities", "competencies"],
    }
    qualifications = []
    for qualification, keywords in qualifications_keywords.items():
        if any(keyword in description.lower() for keyword in keywords):
            qualifications.append(qualification)
    return ", ".join(qualifications) if qualifications else "Aucune"

extract_qualifications_udf = udf(extract_qualifications, StringType())
```
Get user input:

```python
console.print("Titre du poste recherché :", style="bold green", end=" ")
inputJobTitle = input()
console.print("Nombre de pages à scrapper (all pour toutes les pages) :", style="bold green", end=" ")
inputNumPages = input()
inputJobLocation = "France"
```

Function that opens each job link and scrapes the description:

```python
def scrapeJobDescription(url):
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")
        jobDescription = soup.find(
            "div", class_="show-more-less-html__markup"
        ).text.strip()
        return jobDescription
    except AttributeError:
        # find() returned None: the description div was not in the page source
        return ""
    finally:
        driver.quit()  # avoid leaking one Chrome process per job link
```
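My current guess for the flaky description is timing: `page_source` is read immediately after `driver.get`, so on slow loads the description div isn't rendered yet and `find()` returns None. A minimal sketch of an explicit-wait variant (same `show-more-less-html__markup` class as above; the 10-second timeout is an arbitrary choice of mine):

```python
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def scrapeJobDescriptionWithWait(url, timeout=10):
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        # Block until the description div exists in the DOM, or raise TimeoutException
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "div.show-more-less-html__markup")
            )
        )
        return element.text.strip()
    except Exception:
        # Covers TimeoutException and anything else; returns empty like the original
        return ""
    finally:
        driver.quit()
```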
Chrome options configuration:

```python
options = Options()
options.binary_location = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--remote-debugging-port=9222')
options.add_argument('--headless')
options.add_argument('--disable-gpu')
```

Main scraping loop over the search-result pages:

```python
def scrapeLinkedin():
    global df
    global inputNumPages
    global inputJobLocation
    global inputJobTitle

    driver = webdriver.Chrome(options=options)
    counter = 0      # LinkedIn paginates results 25 at a time via the start= parameter
    pageCounter = 1
    while True:
        try:
            driver.get(
                f"https://www.linkedin.com/jobs/search/?&keywords={inputJobTitle}&location={inputJobLocation}&refresh=true&start={counter}"
            )
            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")
            ulElement = soup.find("ul", class_="jobs-search__results-list")
            if ulElement is None:
                break
            liElements = ulElement.find_all("li")

            jobs = []
            for item in liElements:
                jobTitle = item.find(
                    "h3", class_="base-search-card__title"
                ).text.strip()
                jobLocation = item.find(
                    "span", class_="job-search-card__location"
                ).text.strip()
                jobCompany = item.find(
                    "a", class_="hidden-nested-link"
                ).text.strip()
                jobLink = item.find(
                    "a", class_="base-card__full-link"
                )["href"].strip()
                jobDescription = scrapeJobDescription(jobLink)
                jobs.append((jobTitle, jobCompany, jobLocation, jobDescription))

            job_df = spark.createDataFrame(
                jobs, ["Titre", "Entreprise", "Localisation", "Description"]
            )
            # Union the page's rows into the global DataFrame
            if df.isEmpty():
                df = job_df
            else:
                df = df.union(job_df)

            if inputNumPages.lower() == 'all':
                counter += 25
                pageCounter += 1
            else:
                if pageCounter >= int(inputNumPages):
                    break
                counter += 25
                pageCounter += 1
        except Exception as e:
            console.print(f"Une erreur est survenue : {e}", style="bold red")
            break
    driver.quit()
```
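One portability note on the union step: `DataFrame.isEmpty()` only exists as of PySpark 3.3. On an older version, the equivalent check would be something like this (a sketch):

```python
# Pre-3.3 PySpark has no DataFrame.isEmpty(); fall back to the RDD check
if df.rdd.isEmpty():
    df = job_df
else:
    df = df.union(job_df)
```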
Entry point:

```python
def main():
    global df
    start_time = time.time()
    scrapeLinkedin()

    # Derive the remaining columns from the description
    df = df.withColumn("Domaine", determine_field_udf(col("Description")))
    df = df.withColumn("Compétences", extract_skills_udf(col("Description")))
    df = df.withColumn("Qualifications", extract_qualifications_udf(col("Description")))

    # Reorder columns
    df = df.select(
        col("Titre"),
        col("Entreprise"),
        col("Compétences"),
        col("Qualifications"),
        col("Localisation"),
        col("Domaine"),
        col("Description"),
    )

    # Show DataFrame
    df.show(truncate=False)

    # Print execution time
    end_time = time.time()
    execution_time = end_time - start_time
    console.print(f"Temps d'exécution : {execution_time:.2f} secondes", style="bold green")

if __name__ == "__main__":
    main()
```