debug

nestauk · May 10, 2024 · 0c87545 · 0c87545
1 parent 4839866
commit 0c87545
Show file tree

Hide file tree

Showing 7 changed files with 121 additions and 99 deletions.
diff --git a/ojd_daps_skills/extract_skills/extract_skills.py b/ojd_daps_skills/extract_skills/extract_skills.py
@@ -1,4 +1,4 @@
-from typing import List, Union
+from typing import List, Optional, Union
 
 from pydantic import BaseModel
 from spacy.tokens import Doc
@@ -7,10 +7,7 @@
 from ojd_daps_skills import setup_spacy_extensions
 from ojd_daps_skills.extract_skills.extract_skills_utils import ExtractConfig
 from ojd_daps_skills.extract_skills.multiskill_rules import (
-    _split_duplicate_object,
-    _split_duplicate_verb,
-    _split_skill_mentions,
-)
+    _split_duplicate_object, _split_duplicate_verb, _split_skill_mentions)
 from ojd_daps_skills.map_skills.skill_mapper import SkillsMapper
 from ojd_daps_skills.map_skills.skill_mapper_utils import MapConfig
 from ojd_daps_skills.utils.text_cleaning import clean_text, short_hash
@@ -28,28 +25,20 @@ class SkillsExtractor(BaseModel):
         taxonomy_name (str): pre-defined skills taxonomy name to load data for.
     """
 
-    taxonomy_name: str = "toy"
     ner_model_name: str = "nestauk/en_skillner"
     ms_model_name: str = "nestauk/multiskill-classifier"
-
-    def __init__(
-        self,
-        taxonomy_name: str = "toy",
-        ner_model_name: str = "nestauk/en_skillner",
-        ms_model_name: str = "nestauk/multiskill-classifier",
-    ):
-        super().__init__(
-            taxonomy_name=taxonomy_name,
-            ner_model_name=ner_model_name,
-            ms_model_name=ms_model_name,
-        )
-        # Initialize additional properties if needed
-        self._extract_config = ExtractConfig.create(
-            ner_model_name=self.ner_model_name,
-            ms_model_name=self.ms_model_name,
+    taxonomy_name: str = "toy"
+    map_config: Optional[MapConfig] = None
+    extract_config: Optional[ExtractConfig] = None
+    skill_mapper: Optional[SkillsMapper] = None
+
+    def __init__(self, **data):
+        super().__init__(**data)
+        self.map_config = MapConfig.create(taxonomy_name=self.taxonomy_name)
+        self.extract_config = ExtractConfig.create(
+            ner_model_name=self.ner_model_name, ms_model_name=self.ms_model_name
         )
-        self._map_config = MapConfig.create(taxonomy_name=self.taxonomy_name)
-        self._skill_mapper = SkillsMapper(config=self._map_config)
+        self.skill_mapper = SkillsMapper(config=self.map_config)
 
     def extract_skills(self, job_ads: Union[str, List[str]]) -> List[Doc]:
         """Return a list of spaCy Doc objects with entities
@@ -94,23 +83,28 @@ def get_skills(self, job_ad: str) -> Doc:
         rules = [_split_duplicate_object, _split_duplicate_verb, _split_skill_mentions]
 
         job_ad_clean = clean_text(job_ad)
-        doc = self._extract_config.nlp(job_ad_clean)
-
-        all_skill_ents = []
-        for ent in doc.ents:
-            if ent.label_ == "SKILL":
-                ms_pred = self._extract_config.ms_model.predict([ent.text])[0]
-                if ms_pred == 1:
-                    for rule in rules:
-                        split_ent = rule(ent)
-                        if split_ent:
-                            all_skill_ents.append(split_ent)
-                    # else, if no split, append the original entity
-                    all_skill_ents.append(ent)
-                else:
-                    all_skill_ents.append(ent)
-
-        doc._.skill_spans = all_skill_ents
+        doc = self.extract_config.nlp(job_ad_clean)
+
+        # check that there are ents in the first place
+
+        if doc.ents:
+            all_skill_ents = []
+            for ent in doc.ents:
+                if ent.label_ == "SKILL":
+                    ms_pred = self.extract_config.ms_model.predict([ent.text])[0]
+                    if ms_pred == 1:
+                        for rule in rules:
+                            split_ent = rule(ent)
+                            if split_ent:
+                                all_skill_ents.append(split_ent)
+                        # the original entity is always kept, whether or not a rule split it
+                        all_skill_ents.append(ent)
+                    else:
+                        all_skill_ents.append(ent)
+
+            doc._.skill_spans = all_skill_ents
+        else:
+            doc._.skill_spans = []
 
         return doc
 
@@ -138,15 +132,18 @@ def map_skills(self, job_ads: Union[Doc, list[Doc]]) -> List[Doc]:
                 exits=1,
             )
 
-        all_mapped_skills = self._skill_mapper.match_skills(job_ads)
+        all_mapped_skills = self.skill_mapper.match_skills(job_ads)
 
         for job_ad in job_ads:
             mapped_skills_list = []
             for skill_span in job_ad._.skill_spans:
                 if not isinstance(skill_span, str):
                     skill_span = skill_span.text
                 skill_hash = short_hash(skill_span)
-                mapped_skills_list.append(all_mapped_skills.get(skill_hash))
+                if all_mapped_skills:
+                    mapped_skills_list.append(all_mapped_skills.get(skill_hash))
+                else:
+                    mapped_skills_list = []
 
             job_ad._.mapped_skills = mapped_skills_list
 

diff --git a/ojd_daps_skills/extract_skills/extract_skills_utils.py b/ojd_daps_skills/extract_skills/extract_skills_utils.py
@@ -12,6 +12,7 @@
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.pipeline import Pipeline
 from skops.hub_utils import download
+from spacy.language import Language
 from spacy.tokens import Doc
 from wasabi import msg
 
@@ -61,41 +62,42 @@ class ExtractConfig(BaseModel):
             You can use your own NER model if you have a custom NER model to extract skills.
         ms_model_name (str): The name of the Multi-Skill model to use. Current configurations
             supports "nestauk/multiskill-classifier".
-        nlp (spacy.Language): spaCy NLP model.
-        ms_model (Pipeline): Multi-Skill model pipeline.
+        nlp (Optional[Language]): The NLP model to use for Named Entity Recognition. This
+            is set during creation.
+        ms_model (Optional[Pipeline]): The SVM model to use for Multi-Skill classification.
+            This is set during creation.
     """
 
     ner_model_name: str = "nestauk/en_skillner"
     ms_model_name: str = "nestauk/multiskill-classifier"
-    nlp: spacy.Language
-    ms_model: Pipeline
+    nlp: Optional[Language] = None  # Optional, since it's set during creation
+    ms_model: Optional[Pipeline] = None  # Optional for the same reason
 
     class Config:
         arbitrary_types_allowed = True
 
     @classmethod
     def create(
-        cls, ner_model_name: str, ms_model_name: str
+        cls,
+        ner_model_name: Optional[str] = ner_model_name,
+        ms_model_name: Optional[str] = ms_model_name,
     ) -> "ExtractConfig":
         """
         Creates an instance of ExtractConfig by loading configurations.
 
         Parameters:
-            ner_model_name (str): The name of the NER model to use.
-            ms_model_name (str): The name of the Multi-Skill model to use.
+            ner_model_name (Optional[str]): The name of the NER model to use. Defaults
+                to "nestauk/en_skillner".
+            ms_model_name (Optional[str]): The name of the Multi-Skill model to use.
+                Defaults to "nestauk/multiskill-classifier".
 
         Returns:
             ExtractConfig: An initialized instance of this configuration class.
 
         Raises:
-            msg.fail: If the data or Multi-Skill models are not loaded
-            locally, this error is raised.
+            msg.fail: If the models are not loaded locally, this error is raised.
             OSError: If the NER model is not loaded, this error is raised.
         """
-        # set Doc extension here
-        # Use default values if none provided
-        Doc.set_extension("skill_spans", default=[], force=True)
-
         if "/" in ner_model_name:
             namespace, ner_name = ner_model_name.split("/")
         else:
@@ -107,11 +109,17 @@ def create(
             nlp = spacy.load(ner_name)
 
         except OSError:
-            msg.fail(f"{ner_model_name} NER model not loaded. Downloading model...")
-            os.system(
-                f"pip install https://huggingface.co/{namespace}/{ner_name}/resolve/main/{ner_name}-any-py3-none-any.whl"
-            )
-            nlp = spacy.load(ner_name)
+            if ner_model_name == "nestauk/en_skillner":
+                msg.info(f"{ner_model_name} NER model not loaded. Downloading model...")
+                os.system(
+                    f"pip install https://huggingface.co/{namespace}/{ner_name}/resolve/main/{ner_name}-any-py3-none-any.whl"
+                )
+                nlp = spacy.load(ner_name)
+            else:
+                msg.fail(
+                    f"{ner_model_name} NER model not loaded: {ner_model_name} Please install accordingly.",
+                    exit=1,
+                )
 
         # Load multi-skill model
         ms_model_path = PUBLIC_MODEL_FOLDER_PATH / "ms_model"

diff --git a/ojd_daps_skills/map_skills/format_taxonomy/esco_formatting.py b/ojd_daps_skills/map_skills/format_taxonomy/esco_formatting.py
@@ -18,7 +18,8 @@
 from wasabi import msg
 
 from ojd_daps_skills import bucket_name
-from ojd_daps_skills.utils.data_getters import get_s3_resource, load_s3_data, save_to_s3
+from ojd_daps_skills.utils.data_getters import (get_s3_resource, load_s3_data,
+                                                save_to_s3)
 
 
 def find_lev_0(code):

diff --git a/ojd_daps_skills/map_skills/format_taxonomy/hard_coded_mapper_formatting.py b/ojd_daps_skills/map_skills/format_taxonomy/hard_coded_mapper_formatting.py
@@ -3,7 +3,8 @@
 """
 
 from ojd_daps_skills import bucket_name
-from ojd_daps_skills.utils.data_getters import get_s3_resource, load_s3_data, save_to_s3
+from ojd_daps_skills.utils.data_getters import (get_s3_resource, load_s3_data,
+                                                save_to_s3)
 from ojd_daps_skills.utils.text_cleaning import short_hash
 
 if __name__ == "__main__":

diff --git a/ojd_daps_skills/map_skills/skill_mapper.py b/ojd_daps_skills/map_skills/skill_mapper.py
@@ -11,10 +11,7 @@
 
 from ojd_daps_skills import setup_spacy_extensions
 from ojd_daps_skills.map_skills.skill_mapper_utils import (
-    MapConfig,
-    get_most_common_code,
-    get_top_comparisons,
-)
+    MapConfig, get_most_common_code, get_top_comparisons)
 from ojd_daps_skills.utils.text_cleaning import clean_text, short_hash
 
 setup_spacy_extensions()
@@ -30,11 +27,15 @@ class SkillsMapper(BaseModel):
     similarity of the skill embeddings.
 
     Attributes:
-        taxonomy_name (str): The name of the taxonomy.
+        taxonomy_name (str): The name of the taxonomy. Default is "toy".
+        config (MapConfig): A configuration manager for mapping skills.
+            It is initialized with the taxonomy name.
+        all_skills_unique_dict (Dict[int, str]): A dictionary with unique skill
+            hashes as keys and the corresponding skill text as values. It is
+            created during the get_embeddings method.
     """
 
-    taxonomy_name: str = "toy"
-    config: MapConfig = MapConfig.create(taxonomy_name)
+    config: MapConfig
     all_skills_unique_dict: Dict[int, str] = {}
 
     def get_top_taxonomy_skills(
@@ -125,20 +126,17 @@ def get_embeddings(
         """
         all_skills = list(chain.from_iterable([doc._.skill_spans for doc in job_ads]))
         all_skills_unique = list(set(all_skills))
-
+
+        if not isinstance(self.config.hard_coded_taxonomy, dict):
+            self.config.hard_coded_taxonomy = {}
+
         self.all_skills_unique_dict = {}
         for skill in all_skills_unique:
             skill_clean = clean_text(skill)
             skill_hash = short_hash(skill_clean)
-            self.all_skills_unique_dict[skill_hash] = skill_clean
-
-        if self.config.previous_skill_matches:
-            self.all_skills_unique_dict = {
-                skill_hash: skill
-                for skill_hash, skill in self.all_skills_unique_dict.items()
-                if skill_hash not in self.config.previous_skill_matches.keys()
-            }
-
+            if not self.config.hard_coded_taxonomy.get(skill_hash):
+                self.all_skills_unique_dict[skill_hash] = skill_clean
+
         skill_embeddings = self.config.bert_model.transform(
             list(self.all_skills_unique_dict.values())
         )
@@ -171,11 +169,17 @@ def map_skills(self, job_ads: List[Doc]) -> List[Dict[str, Any]]:
 
         skill_embeddings, taxonomy_embeddings_dict = self.get_embeddings(job_ads)
 
+
         (
             top_skill_indxs,
             top_skill_scores,
             tax_skills_ix,
         ) = self.get_top_taxonomy_skills(skill_embeddings, taxonomy_embeddings_dict)
+
+        print("top_skill_indxs", top_skill_indxs)
+        print("top_skill_scores", top_skill_scores)
+        print("tax_skills_ix", tax_skills_ix)
+
 
         if self.config.taxonomy_config.get("skill_hier_info_col"):
             top_hier_skills, hier_types = self.get_top_hierarchy_skills(
@@ -185,6 +189,7 @@ def map_skills(self, job_ads: List[Doc]) -> List[Dict[str, Any]]:
         # Output the top matches (using the different metrics) for each OJO skill
         # Need to match indexes back correctly (hence all the ix variables)
         skill_mapper_list = []
+
         for i, (match_i, match_text) in enumerate(self.all_skills_unique_dict.items()):
             # Top highest matches (any threshold)
             match_results = {
@@ -242,7 +247,7 @@ def map_skills(self, job_ads: List[Doc]) -> List[Dict[str, Any]]:
                 )
 
             skill_mapper_list.append(match_results)
-
+        
         return skill_mapper_list
 
     def match_skills(self, job_ads: List[Doc]) -> Dict[int, dict]:
@@ -353,4 +358,7 @@ def match_skills(self, job_ads: List[Doc]) -> Dict[int, dict]:
 
         final_match_dict = {match["ojo_skill_id"]: match for match in final_match}
 
+        if self.config.hard_coded_taxonomy:
+            final_match_dict = {**final_match_dict, **self.config.hard_coded_taxonomy}
+
         return final_match_dict
diff --git a/ojd_daps_skills/map_skills/skill_mapper_utils.py b/ojd_daps_skills/map_skills/skill_mapper_utils.py
@@ -31,13 +31,19 @@ def get_top_comparisons(ojo_embs: np.array, taxonomy_embs: np.array) -> Tuple[li
         Tuple[list]: List of top 10 most similar taxonomy skills
         for each extracted skill and their corresponding scores.
     """
+    if ojo_embs.size > 0:
+        emb_sims = cosine_similarity(ojo_embs, taxonomy_embs)
 
-    emb_sims = cosine_similarity(ojo_embs, taxonomy_embs)
+        top_sim_indxs = [list(np.argsort(sim)[::-1][:10]) for sim in emb_sims]
+        top_sim_scores = [
+            [float(s) for s in np.sort(sim)[::-1][:10]] for sim in emb_sims
+        ]
 
-    top_sim_indxs = [list(np.argsort(sim)[::-1][:10]) for sim in emb_sims]
-    top_sim_scores = [[float(s) for s in np.sort(sim)[::-1][:10]] for sim in emb_sims]
+        return top_sim_indxs, top_sim_scores
 
-    return top_sim_indxs, top_sim_scores
+    else:
+
+        return None, None
 
 
 def get_most_common_code(
@@ -155,20 +161,20 @@ class MapConfig(BaseModel):
             skills to taxonomy skills.
     """
 
-    taxonomy_name: str
-    taxonomy_config: Dict[str, Any]
-    bert_model: BertVectorizer
-    taxonomy_data: pd.DataFrame
-    taxonomy_embeddings: Optional[Dict[int, np.array]]
-    hier_mapper: Dict[str, str]
-    hard_coded_taxonomy: Union[Dict[int, Any], None]
-    previous_skill_matches: Union[Dict[int, Any], None]
+    taxonomy_name: Optional[str] = None
+    taxonomy_config: Optional[Dict[str, Any]] = None
+    bert_model: Optional[BertVectorizer] = None
+    taxonomy_data: Optional[pd.DataFrame] = None
+    taxonomy_embeddings: Optional[Dict[int, np.array]] = None
+    hier_mapper: Optional[Dict[str, str]] = None
+    hard_coded_taxonomy: Optional[Dict[int, Any]] = None
+    previous_skill_matches: Optional[Dict[int, Any]] = None
 
     class Config:
         arbitrary_types_allowed = True
 
     @classmethod
-    def create(cls, taxonomy_name: str) -> "MapConfig":
+    def create(cls, taxonomy_name: Optional[str] = "toy") -> "MapConfig":
         """
         Creates an instance of MapConfig by loading configurations.
 
@@ -183,7 +189,6 @@ def create(cls, taxonomy_name: str) -> "MapConfig":
             msg.fail: If the configuration file or data is not loaded locally, this error
                 is raised.
         """
-
         config_path = PROJECT_DIR / "ojd_daps_skills/configs"
         config_file = config_path / f"extract_skills_{taxonomy_name}.yaml"