Skip to content

Commit

Permalink
debug
Browse files: browse the repository at this point in the history
  • Loading branch information
India Kerle committed May 10, 2024
1 parent 4839866 commit 0c87545
Show file tree
Hide file tree
Showing 7 changed files with 121 additions and 99 deletions.
83 changes: 40 additions & 43 deletions ojd_daps_skills/extract_skills/extract_skills.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Union
from typing import List, Optional, Union

from pydantic import BaseModel
from spacy.tokens import Doc
Expand All @@ -7,10 +7,7 @@
from ojd_daps_skills import setup_spacy_extensions
from ojd_daps_skills.extract_skills.extract_skills_utils import ExtractConfig
from ojd_daps_skills.extract_skills.multiskill_rules import (
_split_duplicate_object,
_split_duplicate_verb,
_split_skill_mentions,
)
_split_duplicate_object, _split_duplicate_verb, _split_skill_mentions)
from ojd_daps_skills.map_skills.skill_mapper import SkillsMapper
from ojd_daps_skills.map_skills.skill_mapper_utils import MapConfig
from ojd_daps_skills.utils.text_cleaning import clean_text, short_hash
Expand All @@ -28,28 +25,20 @@ class SkillsExtractor(BaseModel):
taxonomy_name (str): pre-defined skills taxonomy name to load data for.
"""

taxonomy_name: str = "toy"
ner_model_name: str = "nestauk/en_skillner"
ms_model_name: str = "nestauk/multiskill-classifier"

def __init__(
self,
taxonomy_name: str = "toy",
ner_model_name: str = "nestauk/en_skillner",
ms_model_name: str = "nestauk/multiskill-classifier",
):
super().__init__(
taxonomy_name=taxonomy_name,
ner_model_name=ner_model_name,
ms_model_name=ms_model_name,
)
# Initialize additional properties if needed
self._extract_config = ExtractConfig.create(
ner_model_name=self.ner_model_name,
ms_model_name=self.ms_model_name,
taxonomy_name: str = "toy"
map_config: Optional[MapConfig] = None
extract_config: Optional[ExtractConfig] = None
skill_mapper: Optional[SkillsMapper] = None

def __init__(self, **data):
super().__init__(**data)
self.map_config = MapConfig.create(taxonomy_name=self.taxonomy_name)
self.extract_config = ExtractConfig.create(
ner_model_name=self.ner_model_name, ms_model_name=self.ms_model_name
)
self._map_config = MapConfig.create(taxonomy_name=self.taxonomy_name)
self._skill_mapper = SkillsMapper(config=self._map_config)
self.skill_mapper = SkillsMapper(config=self.map_config)

def extract_skills(self, job_ads: Union[str, List[str]]) -> List[Doc]:
"""Return a list of spaCy Doc objects with entities
Expand Down Expand Up @@ -94,23 +83,28 @@ def get_skills(self, job_ad: str) -> Doc:
rules = [_split_duplicate_object, _split_duplicate_verb, _split_skill_mentions]

job_ad_clean = clean_text(job_ad)
doc = self._extract_config.nlp(job_ad_clean)

all_skill_ents = []
for ent in doc.ents:
if ent.label_ == "SKILL":
ms_pred = self._extract_config.ms_model.predict([ent.text])[0]
if ms_pred == 1:
for rule in rules:
split_ent = rule(ent)
if split_ent:
all_skill_ents.append(split_ent)
# else, if no split, append the original entity
all_skill_ents.append(ent)
else:
all_skill_ents.append(ent)

doc._.skill_spans = all_skill_ents
doc = self.extract_config.nlp(job_ad_clean)

# check that there are ents in the first place

if doc.ents:
all_skill_ents = []
for ent in doc.ents:
if ent.label_ == "SKILL":
ms_pred = self.extract_config.ms_model.predict([ent.text])[0]
if ms_pred == 1:
for rule in rules:
split_ent = rule(ent)
if split_ent:
all_skill_ents.append(split_ent)
# else, if no split, append the original entity
all_skill_ents.append(ent)
else:
all_skill_ents.append(ent)

doc._.skill_spans = all_skill_ents
else:
doc._.skill_spans = []

return doc

Expand Down Expand Up @@ -138,15 +132,18 @@ def map_skills(self, job_ads: Union[Doc, list[Doc]]) -> List[Doc]:
exits=1,
)

all_mapped_skills = self._skill_mapper.match_skills(job_ads)
all_mapped_skills = self.skill_mapper.match_skills(job_ads)

for job_ad in job_ads:
mapped_skills_list = []
for skill_span in job_ad._.skill_spans:
if not isinstance(skill_span, str):
skill_span = skill_span.text
skill_hash = short_hash(skill_span)
mapped_skills_list.append(all_mapped_skills.get(skill_hash))
if all_mapped_skills:
mapped_skills_list.append(all_mapped_skills.get(skill_hash))
else:
mapped_skills_list = []

job_ad._.mapped_skills = mapped_skills_list

Expand Down
44 changes: 26 additions & 18 deletions ojd_daps_skills/extract_skills/extract_skills_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from skops.hub_utils import download
from spacy.language import Language
from spacy.tokens import Doc
from wasabi import msg

Expand Down Expand Up @@ -61,41 +62,42 @@ class ExtractConfig(BaseModel):
You can use your own NER model if you have a custom NER model to extract skills.
ms_model_name (str): The name of the Multi-Skill model to use. The current configuration
supports "nestauk/multiskill-classifier".
nlp (spacy.Language): spaCy NLP model.
ms_model (Pipeline): Multi-Skill model pipeline.
nlp (Optional[Language]): The NLP model to use for Named Entity Recognition. This
is set during creation.
ms_model (Optional[Pipeline]): The SVM model to use for Multi-Skill classification.
This is set during creation.
"""

ner_model_name: str = "nestauk/en_skillner"
ms_model_name: str = "nestauk/multiskill-classifier"
nlp: spacy.Language
ms_model: Pipeline
nlp: Optional[Language] = None # Optional, since it's set during creation
ms_model: Optional[Pipeline] = None # Optional for the same reason

class Config:
arbitrary_types_allowed = True

@classmethod
def create(
cls, ner_model_name: str, ms_model_name: str
cls,
ner_model_name: Optional[str] = ner_model_name,
ms_model_name: Optional[str] = ms_model_name,
) -> "ExtractConfig":
"""
Creates an instance of ExtractConfig by loading configurations.
Parameters:
ner_model_name (str): The name of the NER model to use.
ms_model_name (str): The name of the Multi-Skill model to use.
ner_model_name (Optional[str]): The name of the NER model to use. Defaults
to "nestauk/en_skillner".
ms_model_name (Optional[str]): The name of the Multi-Skill model to use.
Defaults to "nestauk/multiskill-classifier".
Returns:
ExtractConfig: An initialized instance of this configuration class.
Raises:
msg.fail: If the data or Multi-Skill models are not loaded
locally, this error is raised.
msg.fail: If the models are not loaded locally, this error is raised.
OSError: If the NER model is not loaded, this error is raised.
"""
# set Doc extension here
# Use default values if none provided
Doc.set_extension("skill_spans", default=[], force=True)

if "/" in ner_model_name:
namespace, ner_name = ner_model_name.split("/")
else:
Expand All @@ -107,11 +109,17 @@ def create(
nlp = spacy.load(ner_name)

except OSError:
msg.fail(f"{ner_model_name} NER model not loaded. Downloading model...")
os.system(
f"pip install https://huggingface.co/{namespace}/{ner_name}/resolve/main/{ner_name}-any-py3-none-any.whl"
)
nlp = spacy.load(ner_name)
if ner_model_name == "nestauk/en_skillner":
msg.info(f"{ner_model_name} NER model not loaded. Downloading model...")
os.system(
f"pip install https://huggingface.co/{namespace}/{ner_name}/resolve/main/{ner_name}-any-py3-none-any.whl"
)
nlp = spacy.load(ner_name)
else:
msg.fail(
f"{ner_model_name} NER model not loaded: {ner_model_name} Please install accordingly.",
exit=1,
)

# Load multi-skill model
ms_model_path = PUBLIC_MODEL_FOLDER_PATH / "ms_model"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
from wasabi import msg

from ojd_daps_skills import bucket_name
from ojd_daps_skills.utils.data_getters import get_s3_resource, load_s3_data, save_to_s3
from ojd_daps_skills.utils.data_getters import (get_s3_resource, load_s3_data,
save_to_s3)


def find_lev_0(code):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
"""

from ojd_daps_skills import bucket_name
from ojd_daps_skills.utils.data_getters import get_s3_resource, load_s3_data, save_to_s3
from ojd_daps_skills.utils.data_getters import (get_s3_resource, load_s3_data,
save_to_s3)
from ojd_daps_skills.utils.text_cleaning import short_hash

if __name__ == "__main__":
Expand Down
44 changes: 26 additions & 18 deletions ojd_daps_skills/map_skills/skill_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@

from ojd_daps_skills import setup_spacy_extensions
from ojd_daps_skills.map_skills.skill_mapper_utils import (
MapConfig,
get_most_common_code,
get_top_comparisons,
)
MapConfig, get_most_common_code, get_top_comparisons)
from ojd_daps_skills.utils.text_cleaning import clean_text, short_hash

setup_spacy_extensions()
Expand All @@ -30,11 +27,15 @@ class SkillsMapper(BaseModel):
similarity of the skill embeddings.
Attributes:
taxonomy_name (str): The name of the taxonomy.
taxonomy_name (str): The name of the taxonomy. Default is "toy".
config (MapConfig): A configuration manager for mapping skills.
It is initialized with the taxonomy name.
all_skills_unique_dict (Dict[int, str]): A dictionary with unique skill
hashes as keys and the corresponding skill text as values. It is
created during the get_embeddings method.
"""

taxonomy_name: str = "toy"
config: MapConfig = MapConfig.create(taxonomy_name)
config: MapConfig
all_skills_unique_dict: Dict[int, str] = {}

def get_top_taxonomy_skills(
Expand Down Expand Up @@ -125,20 +126,17 @@ def get_embeddings(
"""
all_skills = list(chain.from_iterable([doc._.skill_spans for doc in job_ads]))
all_skills_unique = list(set(all_skills))


if not isinstance(self.config.hard_coded_taxonomy, dict):
self.config.hard_coded_taxonomy = {}

self.all_skills_unique_dict = {}
for skill in all_skills_unique:
skill_clean = clean_text(skill)
skill_hash = short_hash(skill_clean)
self.all_skills_unique_dict[skill_hash] = skill_clean

if self.config.previous_skill_matches:
self.all_skills_unique_dict = {
skill_hash: skill
for skill_hash, skill in self.all_skills_unique_dict.items()
if skill_hash not in self.config.previous_skill_matches.keys()
}

if not self.config.hard_coded_taxonomy.get(skill_hash):
self.all_skills_unique_dict[skill_hash] = skill_clean

skill_embeddings = self.config.bert_model.transform(
list(self.all_skills_unique_dict.values())
)
Expand Down Expand Up @@ -171,11 +169,17 @@ def map_skills(self, job_ads: List[Doc]) -> List[Dict[str, Any]]:

skill_embeddings, taxonomy_embeddings_dict = self.get_embeddings(job_ads)


(
top_skill_indxs,
top_skill_scores,
tax_skills_ix,
) = self.get_top_taxonomy_skills(skill_embeddings, taxonomy_embeddings_dict)

print("top_skill_indxs", top_skill_indxs)
print("top_skill_scores", top_skill_scores)
print("tax_skills_ix", tax_skills_ix)


if self.config.taxonomy_config.get("skill_hier_info_col"):
top_hier_skills, hier_types = self.get_top_hierarchy_skills(
Expand All @@ -185,6 +189,7 @@ def map_skills(self, job_ads: List[Doc]) -> List[Dict[str, Any]]:
# Output the top matches (using the different metrics) for each OJO skill
# Need to match indexes back correctly (hence all the ix variables)
skill_mapper_list = []

for i, (match_i, match_text) in enumerate(self.all_skills_unique_dict.items()):
# Top highest matches (any threshold)
match_results = {
Expand Down Expand Up @@ -242,7 +247,7 @@ def map_skills(self, job_ads: List[Doc]) -> List[Dict[str, Any]]:
)

skill_mapper_list.append(match_results)

return skill_mapper_list

def match_skills(self, job_ads: List[Doc]) -> Dict[int, dict]:
Expand Down Expand Up @@ -353,4 +358,7 @@ def match_skills(self, job_ads: List[Doc]) -> Dict[int, dict]:

final_match_dict = {match["ojo_skill_id"]: match for match in final_match}

if self.config.hard_coded_taxonomy:
final_match_dict = {**final_match_dict, **self.config.hard_coded_taxonomy}

return final_match_dict
33 changes: 19 additions & 14 deletions ojd_daps_skills/map_skills/skill_mapper_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,19 @@ def get_top_comparisons(ojo_embs: np.array, taxonomy_embs: np.array) -> Tuple[li
Tuple[list]: List of top 10 most similar taxonomy skills
for each extracted skill and their corresponding scores.
"""
if ojo_embs.size > 0:
emb_sims = cosine_similarity(ojo_embs, taxonomy_embs)

emb_sims = cosine_similarity(ojo_embs, taxonomy_embs)
top_sim_indxs = [list(np.argsort(sim)[::-1][:10]) for sim in emb_sims]
top_sim_scores = [
[float(s) for s in np.sort(sim)[::-1][:10]] for sim in emb_sims
]

top_sim_indxs = [list(np.argsort(sim)[::-1][:10]) for sim in emb_sims]
top_sim_scores = [[float(s) for s in np.sort(sim)[::-1][:10]] for sim in emb_sims]
return top_sim_indxs, top_sim_scores

return top_sim_indxs, top_sim_scores
else:

return None, None


def get_most_common_code(
Expand Down Expand Up @@ -155,20 +161,20 @@ class MapConfig(BaseModel):
skills to taxonomy skills.
"""

taxonomy_name: str
taxonomy_config: Dict[str, Any]
bert_model: BertVectorizer
taxonomy_data: pd.DataFrame
taxonomy_embeddings: Optional[Dict[int, np.array]]
hier_mapper: Dict[str, str]
hard_coded_taxonomy: Union[Dict[int, Any], None]
previous_skill_matches: Union[Dict[int, Any], None]
taxonomy_name: Optional[str] = None
taxonomy_config: Optional[Dict[str, Any]] = None
bert_model: Optional[BertVectorizer] = None
taxonomy_data: Optional[pd.DataFrame] = None
taxonomy_embeddings: Optional[Dict[int, np.array]] = None
hier_mapper: Optional[Dict[str, str]] = None
hard_coded_taxonomy: Optional[Dict[int, Any]] = None
previous_skill_matches: Optional[Dict[int, Any]] = None

class Config:
arbitrary_types_allowed = True

@classmethod
def create(cls, taxonomy_name: str) -> "MapConfig":
def create(cls, taxonomy_name: Optional[str] = "toy") -> "MapConfig":
"""
Creates an instance of MapConfig by loading configurations.
Expand All @@ -183,7 +189,6 @@ def create(cls, taxonomy_name: str) -> "MapConfig":
msg.fail: If the configuration file or data is not loaded locally, this error
is raised.
"""

config_path = PROJECT_DIR / "ojd_daps_skills/configs"
config_file = config_path / f"extract_skills_{taxonomy_name}.yaml"

Expand Down
Loading

0 comments on commit 0c87545

Please sign in to comment.