I managed to extract the data, and this code works about half-right. I've created a Python script that uses Selenium and Spark to scrape the following from LinkedIn's jobs search section:

- Job title (titre)
- Company name (entreprise)
- Skills (compétences)
- Qualifications
- Location (localisation)
- Field (domaine)
- Description (which I need help getting!)

All columns exist and are extracted, but the description is not always extracted correctly, even when it does exist on LinkedIn.

```python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType, StructType, StructField
from rich.console import Console
from rich.table import Table
import pandas as pd
import time
```

Initialize Spark session:

```python
spark = (
    SparkSession.builder
    .appName("Job Scraper")
    .getOrCreate()
)
```

Define schema explicitly:

```python
schema = StructType([
    StructField("Titre", StringType(), True),
    StructField("Entreprise", StringType(), True),
    StructField("Localisation", StringType(), True),
    StructField("Description", StringType(), True),
])
```

Instantiate global variables:

```python
df = spark.createDataFrame([], schema)
console = Console()
table = Table(show_header=True, header_style="bold")
```

Dictionary of fields and associated keywords:

```python
fields_keywords = {
    "Technologies de l’Information": ["logiciel", "développeur", "IT", "technologies de l’information", "software", "developer", "information technology"],
    "Finance": ["finance", "financier", "banque", "investissement", "financial", "banking", "investment"],
    "Santé": ["santé", "médical", "clinique", "hôpital", "health", "medical", "clinic", "hospital"],
    "Éducation": ["éducation", "école", "université", "collège", "education", "school", "university", "college"],
    "Marketing": ["marketing", "publicité", "SEO", "contenu", "advertising", "content"],
    "Ressources Humaines": ["ressources humaines", "RH", "recrutement", "gestion des talents", "human resources", "HR", "recruitment", "talent management"],
    "Vente": ["vente", "commercial", "prospection", "négociation", "sales", "negotiation"],
    "Ingénierie": ["ingénieur", "ingénierie", "conception", "développement produit", "engineer", "engineering", "design", "product development"],
    "Recherche et Développement": ["R&D", "recherche", "innovation", "développement", "research", "development"],
    "Médias": ["télévision", "TV", "divertissement", "cinéma", "magazines"],
}
```

Function to determine the field based on the job description:
```python
def determine_field(description):
    for field, keywords in fields_keywords.items():
        if any(keyword in description.lower() for keyword in keywords):
            return field
    return "Autre"

determine_field_udf = udf(determine_field, StringType())
```
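A quick sanity check of the matcher (plain Python, no Spark; the sample strings are made up):

```python
print(determine_field("Nous recherchons un développeur logiciel passionné"))
# "Technologies de l’Information" — "développeur" and "logiciel" both match
print(determine_field("Aucun mot-clé connu ici"))
# "Autre" — no keyword list matches
```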
Function to extract skills from the job description:

```python
def extract_skills(description):
    skills_keywords = [
        "python", "java", "c#", "javascript", "c++", "php", "ruby", "swift", "kotlin", "typescript",
        "django", "flask", "spring boot", "react", "angular", "vue.js", "node.js", "ruby on rails", "asp.net", "laravel",
        "html5", "css3", "sass", "less", "bootstrap", "jquery", "restful apis", "graphql",
        "sql", "mysql", "postgresql", "mongodb", "oracle db", "sqlite", "nosql",
        "linux", "windows server", "unix",
        "apache", "nginx", "iis",
        "docker", "kubernetes", "jenkins", "ansible", "terraform", "git", "github", "gitlab", "bitbucket",
        "aws", "azure", "google cloud platform", "cloudformation", "azure devops", "devops",
        "cybersécurité", "cybersecurity", "gestion des identités et des accès", "identity and access management",
        "cryptographie", "cryptography", "tests de pénétration", "penetration testing",
        "analyse des vulnérabilités", "vulnerability analysis", "firewall", "sécurité réseau", "network security",
        "wireshark", "nmap", "metasploit", "burp suite", "owasp zap",
        "statistiques", "statistics", "excel", "r",
        "scikit-learn", "tensorflow", "keras", "pytorch", "algorithmes d’apprentissage automatique", "machine learning algorithms",
        "deep learning", "hadoop", "spark", "kafka",
        "agile", "scrum", "kanban", "waterfall",
        "jira", "trello", "asana", "monday.com",
        "assistance technique", "diagnostic", "résolution de problèmes", "gestion des incidents et des changements",
        "technical support", "diagnosis", "problem solving", "incident and change management",
        "wireframing", "prototypage", "prototyping", "figma", "adobe xd", "sketch",
        "communication", "travail d’équipe", "gestion du temps", "teamwork", "time management",
    ]
    skills = [skill for skill in skills_keywords if skill in description.lower()]
    return ", ".join(skills)

extract_skills_udf = udf(extract_skills, StringType())
```
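Two caveats about this substring matching, related to the output though separate from the description problem: a single-letter entry like "r" matches any description containing the letter r, and uppercase keywords elsewhere (e.g. "IT" or "R&D" in fields_keywords, "M2" or "PhD" in qualifications_keywords) can never match, because only the description is lowercased. A whole-word, case-insensitive variant would look something like this (a sketch; the lookarounds instead of `\b` are so entries like "c++" and "c#" still anchor correctly):

```python
import re

def matches_keyword(keyword, description):
    # Case-insensitive whole-word match; re.escape protects "c++", "c#", "r&d", etc.
    pattern = r"(?<!\w)" + re.escape(keyword.lower()) + r"(?!\w)"
    return re.search(pattern, description.lower()) is not None

print(matches_keyword("r", "Experience with R and Python"))  # True: standalone "r"
print(matches_keyword("r", "senior developer"))              # False: "r" only inside words
print(matches_keyword("IT", "IT support role"))              # True: case-insensitive
```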
Function to extract qualifications from the job description:

```python
def extract_qualifications(description):
    qualifications_keywords = {
        "Expérience": ["expérience", "années d’expérience", "professionnelle", "pratique", "experience", "years of experience", "professional", "practice"],
        "Diplôme": ["diplôme", "certificat", "qualification", "certification", "diploma", "certificate"],
        "Licence": ["licence", "bachelor", "baccalauréat", "license", "bachelor’s degree", "undergraduate degree"],
        "Master": ["master", "mastère", "M2", "M1", "MSc", "MA", "master’s degree"],
        "Doctorat": ["doctorat", "PhD", "DSc", "doctorate"],
        "Certification": ["certification", "certifié", "attestation", "certified"],
        "Compétences": ["compétences", "aptitudes", "skills", "abilities", "competencies"],
    }
    qualifications = []
    for qualification, keywords in qualifications_keywords.items():
        if any(keyword in description.lower() for keyword in keywords):
            qualifications.append(qualification)
    return ", ".join(qualifications) if qualifications else "Aucune"

extract_qualifications_udf = udf(extract_qualifications, StringType())
```
Get user input:

```python
console.print("Titre du poste recherché :", style="bold green", end=" ")
inputJobTitle = input()
console.print("Nombre de pages à scrapper (all pour toutes les pages) :", style="bold green", end=" ")
inputNumPages = input()
inputJobLocation = "France"
```

Function that opens each job link and scrapes the description:

```python
def scrapeJobDescription(url):
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")
        jobDescription = soup.find(
            "div", class_="show-more-less-html__markup"
        ).text.strip()
        return jobDescription
    except AttributeError:
        # find() returned None: the description div was not in the page source
        return ""
    finally:
        driver.quit()  # avoid leaking one Chrome process per job link
```
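My current guess for the flaky description is timing: `page_source` is read immediately after `driver.get`, so on slow loads the description div isn't rendered yet and `find()` returns None. A minimal sketch of an explicit-wait variant (same `show-more-less-html__markup` class as above; the 10-second timeout is an arbitrary choice of mine):

```python
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def scrapeJobDescriptionWithWait(url, timeout=10):
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        # Block until the description div exists in the DOM, or raise TimeoutException
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "div.show-more-less-html__markup")
            )
        )
        return element.text.strip()
    except Exception:
        # Covers TimeoutException and anything else; returns empty like the original
        return ""
    finally:
        driver.quit()
```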
Chrome options configuration:

```python
options = Options()
options.binary_location = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--remote-debugging-port=9222')
options.add_argument('--headless')
options.add_argument('--disable-gpu')
```

Main scraping loop over the search-result pages:

```python
def scrapeLinkedin():
    global df
    global inputNumPages
    global inputJobLocation
    global inputJobTitle

    driver = webdriver.Chrome(options=options)
    counter = 0      # LinkedIn paginates results 25 at a time via the start= parameter
    pageCounter = 1
    while True:
        try:
            driver.get(
                f"https://www.linkedin.com/jobs/search/?&keywords={inputJobTitle}&location={inputJobLocation}&refresh=true&start={counter}"
            )
            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")
            ulElement = soup.find("ul", class_="jobs-search__results-list")
            if ulElement is None:
                break
            liElements = ulElement.find_all("li")

            jobs = []
            for item in liElements:
                jobTitle = item.find(
                    "h3", class_="base-search-card__title"
                ).text.strip()
                jobLocation = item.find(
                    "span", class_="job-search-card__location"
                ).text.strip()
                jobCompany = item.find(
                    "a", class_="hidden-nested-link"
                ).text.strip()
                jobLink = item.find(
                    "a", class_="base-card__full-link"
                )["href"].strip()
                jobDescription = scrapeJobDescription(jobLink)
                jobs.append((jobTitle, jobCompany, jobLocation, jobDescription))

            job_df = spark.createDataFrame(
                jobs, ["Titre", "Entreprise", "Localisation", "Description"]
            )
            # Union the page's rows into the global DataFrame
            if df.isEmpty():
                df = job_df
            else:
                df = df.union(job_df)

            if inputNumPages.lower() == 'all':
                counter += 25
                pageCounter += 1
            else:
                if pageCounter >= int(inputNumPages):
                    break
                counter += 25
                pageCounter += 1
        except Exception as e:
            console.print(f"Une erreur est survenue : {e}", style="bold red")
            break
    driver.quit()
```
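One portability note on the union step: `DataFrame.isEmpty()` only exists as of PySpark 3.3. On an older version, the equivalent check would be something like this (a sketch):

```python
# Pre-3.3 PySpark has no DataFrame.isEmpty(); fall back to the RDD check
if df.rdd.isEmpty():
    df = job_df
else:
    df = df.union(job_df)
```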
Entry point:

```python
def main():
    global df
    start_time = time.time()
    scrapeLinkedin()

    # Derive the remaining columns from the description
    df = df.withColumn("Domaine", determine_field_udf(col("Description")))
    df = df.withColumn("Compétences", extract_skills_udf(col("Description")))
    df = df.withColumn("Qualifications", extract_qualifications_udf(col("Description")))

    # Reorder columns
    df = df.select(
        col("Titre"),
        col("Entreprise"),
        col("Compétences"),
        col("Qualifications"),
        col("Localisation"),
        col("Domaine"),
        col("Description"),
    )

    # Show DataFrame
    df.show(truncate=False)

    # Print execution time
    end_time = time.time()
    execution_time = end_time - start_time
    console.print(f"Temps d'exécution : {execution_time:.2f} secondes", style="bold green")

if __name__ == "__main__":
    main()
```