-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathresume_parser.py
67 lines (55 loc) · 2.33 KB
/
resume_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import spacy
import pdfplumber
import docx
# Load the spaCy "en_core_web_sm" pipeline once, at import time.
# parse_resume() reuses this shared instance for Named Entity Recognition (NER)
# and for the part-of-speech tags it reads when collecting skills.
nlp = spacy.load("en_core_web_sm")
def extract_text_from_pdf(file_path):
    """
    Extract text from a PDF file.

    Args:
        file_path: Path of the PDF to open with pdfplumber.

    Returns:
        The text of every page, in page order, joined with newlines. A page
        with no extractable text contributes an empty string.
    """
    with pdfplumber.open(file_path) as pdf:
        # extract_text() can return None (some pdfplumber versions) or "" for
        # a page without a text layer, e.g. a scanned image. `or ""` keeps
        # such a page from raising TypeError when the pages are combined.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Separate pages with a newline so the last line of one page is not fused
    # with the first line of the next page.
    return "\n".join(page_texts)
def extract_text_from_docx(file_path):
    """
    Read a DOCX file and return its paragraph text.

    Each paragraph exposed by the document's ``paragraphs`` collection is
    taken in order, and the paragraph texts are joined with newlines.
    """
    document = docx.Document(file_path)
    paragraph_texts = []
    for paragraph in document.paragraphs:
        paragraph_texts.append(paragraph.text)
    return "\n".join(paragraph_texts)
def parse_resume(file_path):
    """
    Parse a resume to extract details like name, education, skills, and experience.

    Args:
        file_path: Path to the resume as a string. Its extension, matched
            case-insensitively, selects the extractor: ".pdf" or ".docx".

    Returns:
        A dict with the keys:
            "name": text of the first PERSON entity; if none is found, the
                first non-blank line of the resume text, or "Unknown" when
                the resume contains no non-blank line.
            "education": texts of all ORG entities, in document order.
            "experience": texts of all WORK_OF_ART entities, in document order.
            "skills": text of every token whose part-of-speech tag is NOUN.

    Raises:
        ValueError: If the file extension is neither ".pdf" nor ".docx".
    """
    # Compare against a lower-cased copy so "Resume.PDF" or "cv.Docx" are
    # accepted; the original, unmodified path is what gets opened.
    lowered_path = file_path.lower()
    if lowered_path.endswith(".pdf"):
        text = extract_text_from_pdf(file_path)
    elif lowered_path.endswith(".docx"):
        text = extract_text_from_docx(file_path)
    else:
        raise ValueError("File format not supported. Use PDF or DOCX.")

    # Process the text using spaCy
    doc = nlp(text)

    # Initialize a dictionary to store extracted entities
    entities = {"name": None, "education": [], "skills": [], "experience": []}

    # One pass over the entities; each list still receives its matches in
    # document order.
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            # Only the first PERSON entity is used as the name.
            if not entities["name"]:
                entities["name"] = ent.text
        elif ent.label_ == "ORG":
            # NOTE(review): ORG matches schools and employers alike, yet
            # everything is filed under "education" - confirm this is intended.
            entities["education"].append(ent.text)
        elif ent.label_ == "WORK_OF_ART":
            # NOTE(review): heuristic stand-in for job titles; might need
            # adjustment based on data.
            entities["experience"].append(ent.text)

    # Fallback: no PERSON entity was found, so use the first non-blank line.
    # Blank leading lines are skipped so the name is never an empty string;
    # "Unknown" is used only when the text has no non-blank line at all.
    if not entities["name"]:
        entities["name"] = next(
            (line.strip() for line in text.splitlines() if line.strip()),
            "Unknown",
        )

    # Extract skills (using noun tokens in the document)
    entities["skills"] = [token.text for token in doc if token.pos_ == "NOUN"]
    return entities