diff --git a/README.md b/README.md index d8e6f61d..291157df 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,77 @@ +To install as a package: + +``` +pipx install poetry +poetry shell +poetry install +``` + +To extract skills from a job advert: + +``` +from ojd_daps_skills.extract_skills.extract_skills import SkillsExtractor + +sm = SkillsExtractor(taxonomy_name="toy") + +✘ nestauk/en_skillner NER model not loaded. Downloading model... +Collecting en-skillner==any + Downloading https://huggingface.co/nestauk/en_skillner/resolve/main/en_skillner-any-py3-none-any.whl (587.7 MB) + ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 587.7/587.7 MB 5.1 MB/s eta 0:00:00 +Installing collected packages: en-skillner +Successfully installed en-skillner-3.7.1 +✘ Multi-skill classifier not loaded. Downloading model... +Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 26843.55it/s] +✘ Neccessary data files are not downloaded. Downloading ~0.5GB of +neccessary data files to +/Users/india.kerlenesta/Projects/nesta/ojd_daps/ojd_daps_extension/ojd_daps_skills/ojd_daps_skills_data. +ℹ Data folder downloaded from +/Users/india.kerlenesta/Projects/nesta/ojd_daps/ojd_daps_extension/ojd_daps_skills/ojd_daps_skills_data + +job_ad = "You should be skilled in Python, Java and R." +job_ad_with_skills = sm(job_ad) + +ℹ Getting embeddings for 3 texts ... +ℹ Took 0.018199920654296875 seconds +``` + +To access the extracted and mapped skills: + +``` +job_ad_with_skills_doc = job_ad_with_skills[0] + +#print raw ents (i.e. multiskills are not split, also include 'BENEFIT' and 'EXPERIENCE' spans) +job_ad_with_skills_doc.ents +>> (Python, Java, R.) + +#print SKILL spans (where SKILL spans are predicted as multiskills, split them) + +job_ad_with_skills_doc._.skill_spans +>> [Python, Java, R.] 
+ +#print mapped skills to the "toy" taxonomy +job_ad_with_skills_doc._.mapped_skills +>> [{'ojo_skill': 'Python', + 'ojo_skill_id': 2232581233191055, + 'match_skill': 'working with computers', + 'match_score': 0.75, + 'match_type': 'most_common_level_1', + 'match_id': 'S5'}, + {'ojo_skill': 'Java', + 'ojo_skill_id': 2833100423969322, + 'match_skill': 'working with computers', + 'match_score': 0.6666666666666666, + 'match_type': 'most_common_level_1', + 'match_id': 'S5'}, + {'ojo_skill': 'R.', + 'ojo_skill_id': 8622187230313821, + 'match_skill': 'working with computers', + 'match_score': 0.6666666666666666, + 'match_type': 'most_common_level_1', + 'match_id': 'S5'}] +``` + +To run tests: + ``` pytest tests/ ``` diff --git a/ojd_daps_skills/extract_skills/extract_skills.py b/ojd_daps_skills/extract_skills/extract_skills.py index ba62fb21..61f09683 100644 --- a/ojd_daps_skills/extract_skills/extract_skills.py +++ b/ojd_daps_skills/extract_skills/extract_skills.py @@ -9,6 +9,7 @@ from ojd_daps_skills.extract_skills.multiskill_rules import ( _split_duplicate_object, _split_duplicate_verb, _split_skill_mentions) from ojd_daps_skills.map_skills.skill_mapper import SkillsMapper +from ojd_daps_skills.map_skills.skill_mapper_utils import MapConfig from ojd_daps_skills.utils.text_cleaning import clean_text, short_hash setup_spacy_extensions() @@ -30,15 +31,22 @@ class SkillsExtractor(BaseModel): def __init__( self, + taxonomy_name: str = "toy", + ner_model_name: str = "nestauk/en_skillner", + ms_model_name: str = "nestauk/multiskill-classifier", ): - super().__init__() - self._extract_config: ExtractConfig = ExtractConfig.create( + super().__init__( + taxonomy_name=taxonomy_name, + ner_model_name=ner_model_name, + ms_model_name=ms_model_name, + ) + # Initialize additional properties if needed + self._extract_config = ExtractConfig.create( ner_model_name=self.ner_model_name, ms_model_name=self.ms_model_name, ) - self._skill_mapper: SkillsMapper = SkillsMapper( - 
taxonomy_name=self.taxonomy_name - ) + self._map_config = MapConfig.create(taxonomy_name=self.taxonomy_name) + self._skill_mapper = SkillsMapper(config=self._map_config) def extract_skills(self, job_ads: Union[str, List[str]]) -> List[Doc]: """Return a list of spaCy Doc objects with entities diff --git a/ojd_daps_skills/map_skills/format_taxonomy/lightcast_formatting.py b/ojd_daps_skills/map_skills/format_taxonomy/lightcast_formatting.py index 3d8e8b58..46235819 100644 --- a/ojd_daps_skills/map_skills/format_taxonomy/lightcast_formatting.py +++ b/ojd_daps_skills/map_skills/format_taxonomy/lightcast_formatting.py @@ -5,7 +5,7 @@ |---|---|---|---| id: A unique id for the skill/hierarchy -description: The skill/hierarchy level description text +description: The skill/hierarchy level description texts type: What column name the skill/hier description is from (category, subcategory) hierarchy_levels: If a skill then which hierarchy levels is it in @@ -155,10 +155,10 @@ def remove_bad_hierarchy_levels(hierarchy_levels): lightcast_formatted = pd.concat( [all_skills, category_skills, subcategory_skills] ).reset_index(drop=True) - lightcast_formatted["hierarchy_levels"] = ( - lightcast_formatted.hierarchy_levels.apply(map_subcategory_ids).apply( - remove_bad_hierarchy_levels - ) + lightcast_formatted[ + "hierarchy_levels" + ] = lightcast_formatted.hierarchy_levels.apply(map_subcategory_ids).apply( + remove_bad_hierarchy_levels ) lightcast_formatted = lightcast_formatted.query("description.notna()").query( 'description != "NULL"' diff --git a/ojd_daps_skills/map_skills/skill_mapper.py b/ojd_daps_skills/map_skills/skill_mapper.py index 3f56dd1b..488dc55e 100644 --- a/ojd_daps_skills/map_skills/skill_mapper.py +++ b/ojd_daps_skills/map_skills/skill_mapper.py @@ -219,9 +219,9 @@ def map_skills(self, job_ads: List[Doc]) -> List[Dict[str, Any]]: high_hier_codes += [hier_level] * round(sim_score * 10) high_tax_skills_results = {} for hier_level in 
range(self.config.taxonomy_config["num_hier_levels"]): - high_tax_skills_results["most_common_level_" + str(hier_level)] = ( - get_most_common_code(high_hier_codes, hier_level) - ) + high_tax_skills_results[ + "most_common_level_" + str(hier_level) + ] = get_most_common_code(high_hier_codes, hier_level) if high_tax_skills_results: match_results["high_tax_skills"] = high_tax_skills_results diff --git a/ojd_daps_skills/map_skills/skill_mapper_utils.py b/ojd_daps_skills/map_skills/skill_mapper_utils.py index 825a82c3..817d454e 100644 --- a/ojd_daps_skills/map_skills/skill_mapper_utils.py +++ b/ojd_daps_skills/map_skills/skill_mapper_utils.py @@ -161,8 +161,8 @@ class MapConfig(BaseModel): taxonomy_data: pd.DataFrame taxonomy_embeddings: Optional[Dict[int, np.array]] hier_mapper: Dict[str, str] - hard_coded_taxonomy: Optional[Dict[int, dict]] - previous_skill_matches: Optional[Dict[int, str]] + hard_coded_taxonomy: Union[Dict[int, Any], None] + previous_skill_matches: Union[Dict[int, Any], None] class Config: arbitrary_types_allowed = True