import spacy
from PyPDF2 import PdfReader
import docx
import re
# Load the spaCy model for NLP
# Loaded once at module import; extract_skills() reuses this pipeline.
nlp = spacy.load('en_core_web_sm')
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Location of the PDF; passed straight to ``PdfReader``.

    Returns:
        The text extracted from each page, in page order, joined with
        newlines so that words at page boundaries do not run together.
    """
    reader = PdfReader(pdf_path)
    # `or ""` guards against a page that yields None or nothing at all
    # (for example an image-only page), so the join cannot raise TypeError.
    return "\n".join(page.extract_text() or "" for page in reader.pages)
# Function to extract text from a DOCX file
def extract_text_from_docx(docx_path):
    """Return the text of every paragraph of a DOCX file as one string.

    Args:
        docx_path: Location of the document; passed straight to
            ``docx.Document``.

    Returns:
        The paragraph texts, in document order, joined with newlines.
        Without a separator the last word of one paragraph would fuse
        with the first word of the next.
    """
    document = docx.Document(docx_path)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)
# Function to preprocess text
def preprocess_text(text):
    """Normalize text: lowercase it, then strip digits and punctuation.

    Args:
        text: The raw string to clean.

    Returns:
        The lowercased string with every run of digits removed and every
        character that is neither a word character nor whitespace removed.
        Whitespace itself is left untouched.
    """
    # Lowercase the text
    text = text.lower()
    # Remove numbers (\d+ matches runs of digits; a bare 'd+' would
    # delete the letter "d" instead)
    text = re.sub(r'\d+', '', text)
    # Remove punctuation: anything that is not a word character (\w)
    # or whitespace (\s)
    text = re.sub(r'[^\w\s]', '', text)
    return text
# Function to extract skills using spaCy's NER
def extract_skills(text):
    """Return the text of every named entity labelled ``"SKILL"``.

    Args:
        text: The string to run through the module-level ``nlp`` pipeline.

    Returns:
        A list of entity strings, in document order, whose label is
        exactly ``"SKILL"``. The list is empty when no such entity is found.
    """
    # NOTE(review): the stock en_core_web_sm model does not appear to define
    # a "SKILL" entity label, so this probably returns [] unless the pipeline
    # is extended (e.g. with an EntityRuler holding skill patterns) - confirm
    # against the model's label scheme.
    return [ent.text for ent in nlp(text).ents if ent.label_ == "SKILL"]
# Function to filter resumes based on criteria
def filter_resumes(resume_texts, required_skills):
    """Keep only the resumes that mention every required skill.

    Args:
        resume_texts: Iterable of resume strings.
        required_skills: Iterable of skill strings that must all be present.
            Matching is exact (case-sensitive) against the strings returned
            by ``extract_skills``.

    Returns:
        A list of the resume strings, in input order, whose extracted skills
        include every entry of ``required_skills``. If ``required_skills`` is
        empty, every resume is kept.
    """
    # Materialize once: a one-shot iterable (e.g. a generator) would be
    # exhausted after the first resume and let all later resumes pass.
    required = set(required_skills)
    return [
        resume_text
        for resume_text in resume_texts
        if required.issubset(extract_skills(resume_text))
    ]
# Example usage
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    # Define the required skills
    required_skills = ['python', 'machine learning', 'nlp']
    # Extract text from resumes; both are normalized the same way so that
    # the lowercase skill names above can match.
    resume_texts = [
        preprocess_text(extract_text_from_pdf('resume1.pdf')),
        preprocess_text(extract_text_from_docx('resume2.docx')),
    ]
    # Filter resumes
    filtered_resumes = filter_resumes(resume_texts, required_skills)
    # Print the filtered resumes
    for resume in filtered_resumes:
        print("Filtered Resume:\n", resume)
# Code not running with nlp
# New contributor
# user25615874 is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
# Check out our Code of Conduct.