diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index b738db01..d04b4771 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -29,5 +29,5 @@ jobs: uses: mhausenblas/mkdocs-deploy-gh-pages@master env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - CONFIG_FILE: mkdocs.yml - REQUIREMENTS: requirements.txt \ No newline at end of file + CONFIG_FILE: docs/mkdocs.yml + REQUIREMENTS: docs/requirements.txt diff --git a/.github/workflows/pytest.yaml b/.github/workflows/pytest.yaml new file mode 100644 index 00000000..b8e2a70d --- /dev/null +++ b/.github/workflows/pytest.yaml @@ -0,0 +1,57 @@ +name: Unit Tests + +on: [push] + +jobs: + test: + runs-on: ${{ matrix.os }} + + strategy: + matrix: + os: ["ubuntu-latest", "macos-latest"] + python-version: ["3.9", "3.10"] + steps: + #---------------------------------------------- + # check-out repo and set-up python + #---------------------------------------------- + - name: Check out repository + uses: actions/checkout@v4 + - name: Set up python ${{ matrix.python-version }} + id: setup-python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + #---------------------------------------------- + # ----- install & configure poetry ----- + #---------------------------------------------- + - name: Install Poetry + uses: snok/install-poetry@v1 + with: + virtualenvs-create: true + virtualenvs-in-project: true + #---------------------------------------------- + # load cached venv if cache exists + #---------------------------------------------- + - name: Load cached venv + id: cached-poetry-dependencies + uses: actions/cache@v3 + with: + path: .venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} + #---------------------------------------------- + # install dependencies if cache does not exist + #---------------------------------------------- + - name: Install dependencies + if: 
steps.cached-poetry-dependencies.outputs.cache-hit != 'true' + run: poetry install --no-interaction --no-root + #---------------------------------------------- + # install your root project, if required + #---------------------------------------------- + - name: Install additional dependencies + run: | + poetry install --no-interaction + #---------------------------------------------- + # add matrix specifics and run test suite + #---------------------------------------------- + - name: Run tests + run: poetry run pytest tests/ --verbose diff --git a/README.md b/README.md index b78992a7..8a3e59fc 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ This page contains information on how to install and use Nesta's skills extracti We currently support three different taxonomies to map onto: the [European Commission’s European Skills, Competences, and Occupations (ESCO)](https://esco.ec.europa.eu/en/about-esco/what-esco), [Lightcast’s Open Skills](https://skills.lightcast.io/) and a “toy” taxonomy developed internally for the purpose of testing. -If you'd like to learn more about the models used in the library, please refer to the [model card page](https://nestauk.github.io/ojd_daps_skills/build/html/model_card.html). +If you'd like to learn more about the models used in the library, please refer to the [model card page](https://nestauk.github.io/ojd_daps_skills/source/model_card.md). You may also want to read more about the wider project by reading: @@ -113,4 +113,4 @@ If contributing, changes will need to be pushed to a new branch in order for our

Project template is based on Nesta's data science project template (Read the docs here). - \ No newline at end of file + diff --git a/docs/images/label_eg1.jpg b/docs/images/label_eg1.jpg new file mode 100644 index 00000000..aaf44d52 Binary files /dev/null and b/docs/images/label_eg1.jpg differ diff --git a/docs/images/label_eg4.jpg b/docs/images/label_eg4.jpg new file mode 100644 index 00000000..82de254e Binary files /dev/null and b/docs/images/label_eg4.jpg differ diff --git a/docs/images/label_eg5.jpg b/docs/images/label_eg5.jpg new file mode 100644 index 00000000..23c97e1f Binary files /dev/null and b/docs/images/label_eg5.jpg differ diff --git a/docs/images/label_studio.png b/docs/images/label_studio.png new file mode 100644 index 00000000..8c82b3c4 Binary files /dev/null and b/docs/images/label_studio.png differ diff --git a/docs/images/match_flow.png b/docs/images/match_flow.png new file mode 100644 index 00000000..59174032 Binary files /dev/null and b/docs/images/match_flow.png differ diff --git a/docs/images/overview.png b/docs/images/overview.png new file mode 100644 index 00000000..80176db8 Binary files /dev/null and b/docs/images/overview.png differ diff --git a/docs/images/overview_example.png b/docs/images/overview_example.png new file mode 100644 index 00000000..34c7dd7a Binary files /dev/null and b/docs/images/overview_example.png differ diff --git a/docs/images/predict_flow.png b/docs/images/predict_flow.png new file mode 100644 index 00000000..fc05f3d6 Binary files /dev/null and b/docs/images/predict_flow.png differ diff --git a/docs/index.md b/docs/index.md index 26de0815..8afba6a4 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,7 +2,6 @@ - [Installation](#installation) - [Using Nesta’s Skills Extractor library](#tldr-using-nestas-skills-extractor-library) -- Development ## Welcome to Nesta’s Skills Extractor Library @@ -14,7 +13,7 @@ This page contains information on how to install and use Nesta’s skills extrac We currently support three 
different taxonomies to map onto: the European Commission’s European Skills, Competences, and Occupations (ESCO), Lightcast’s Open Skills and a “toy” taxonomy developed internally for the purpose of testing. -If you’d like to learn more about the models used in the library, please refer to the model card page. +If you’d like to learn more about the models used in the library, please refer to [the model card page](source/model_card.md). For more information on how we labelled the training data for the models see [the labelling page](source/labelling.md). A more in depth discussion of the pipeline and evaluation of it can be found in [the pipeline summary and metrics page](source/pipeline_summary.md). You may also want to read more about the wider project by reading: @@ -33,7 +32,7 @@ You will also need to install spaCy’s English language model: Note that this package was developed on MacOS and tested on Ubuntu. Changes have been made to be compatible on a Windows system but are not tested and cannot be guaranteed. -When the package is first used it will automatically download a folder of neccessary data and models. (~1GB) +When the package is first used it will automatically download a folder of necessary data and models (~1GB). ## TL;DR: Using Nesta’s Skills Extractor library @@ -115,4 +114,3 @@ If you would like to demo the library using a front end, we have also built a st The technical and working style guidelines can be found [here](https://github.com/nestauk/ds-cookiecutter/blob/master/GUIDELINES.md). If contributing, changes will need to be pushed to a new branch in order for our code checks to be triggered. - diff --git a/mkdocs.yml b/docs/mkdocs.yml similarity index 79% rename from mkdocs.yml rename to docs/mkdocs.yml index c3404934..4dfc46aa 100644 --- a/mkdocs.yml +++ b/docs/mkdocs.yml @@ -7,11 +7,11 @@ extra: homepage: https://nestauk.github.io/ojd_daps_skills docs_dir: . 
extra_css: - - docs/style.css + - styles.css theme: name: material - logo: docs/images/favicon.ico - favicon: docs/images/favicon.ico + logo: images/favicon.png + favicon: images/favicon.png features: - navigation.instant - navigation.tracking @@ -35,6 +35,9 @@ theme: icon: material/weather-sunny name: Switch to light mode nav: - - Home: docs/index.md + - Home: index.md + - Model cards: source/model_card.md + - Pipeline summary and metrics: source/pipeline_summary.md + - Entity labelling: source/labelling.md plugins: - - same-dir \ No newline at end of file + - same-dir diff --git a/docs/source/labelling.md b/docs/source/labelling.md new file mode 100644 index 00000000..335a84c2 --- /dev/null +++ b/docs/source/labelling.md @@ -0,0 +1,33 @@ +# Entity Labelling + +[June 2024 update: The training of the models used in the skills extraction algorithm is now done using code from the [ojd_daps_language_models](https://github.com/nestauk/ojd_daps_language_models/blob/dev/skillner/README.md) Github repo. Read more up to date information about the process and metrics there.] + +To extract skills from job adverts we took an approach of training a named entity recognition (NER) model to predict which parts of job adverts were skills ("skill entities"), which were experiences ("experience entities") and which were job benefits ("benefit entities"). + +To train the NER model we needed labelled data. First we created a random sample of job adverts and got them into a form needed for labelling using [Label Studio](https://labelstud.io/) and also [Prodigy](https://prodi.gy/). + +There are 4 entity labels in our training data: + +1. `SKILL` +2. `MULTISKILL` +3. `EXPERIENCE` +4. 
`BENEFIT` + +The user interface for the labelling task in label-studio looks like: + +![](../images/label_studio.png) + +We tried our best to label from the start to end of each individual skill, starting at the verb (if given): +![](../images/label_eg1.jpg) + +Sometimes it wasn't easy to label individual skills, for example an earlier part of the sentence might be needed to define the later part. An example of this is "Working in a team and on an individual basis" - we could label "Working in a team" as a single skill, but "on an individual basis" makes no sense without the "Working" word. In these situations we labelled the whole span as multi skills: +![](../images/label_eg4.jpg) + +Sometimes there were no entities to label: +![](../images/label_eg5.jpg) + +`EXPERIENCE` labels will often be followed by the word "experience" e.g. "insurance experience", and we included some qualifications as experience, e.g. "Electrical qualifications". + +### Training dataset + +For the current NER model (20230808), 8971 entities in 500 job adverts from our dataset of job adverts were labelled; 443 are multiskill, 7313 are skill, 852 were experience entities, and 363 were benefit entities. 20% of the labelled entities were held out as a test set to evaluate the models. diff --git a/docs/source/model_card.md b/docs/source/model_card.md new file mode 100644 index 00000000..cd091b1b --- /dev/null +++ b/docs/source/model_card.md @@ -0,0 +1,88 @@ +# Model Cards + +[June 2024 update: The training of the models used in the skills extraction algorithm is now done using code from the [ojd_daps_language_models](https://github.com/nestauk/ojd_daps_language_models/blob/dev/skillner/README.md) Github repo. Read more up to date information about the process and metrics there.] + +This page contains information for different parts of the skills extraction and mapping pipeline. We detail the two main parts of the pipeline; the extract skills pipeline and the skills to taxonomy mapping pipeline. 
+ +Developed by data scientists in Nesta’s Data Analytics Practice, (last updated on 29-09-2023). + +- [Model Card: Extract Skills](#extract_skills_card) +- [Model Card: Skills to Taxonomy Mapping](#mapping_card) + +![](../images/overview_example.png) +_An example of extracting skills and mapping them to the ESCO taxonomy._ + +## Model Card: Named Entity Recognition Model + +![](../images/predict_flow.png) +_The extracting skills pipeline._ + +### Summary + +- Train a Named Entity Recognition (NER) spaCy component to extract skills, multiskills, experience and benefits entities from job adverts. +- Predict whether or not a skill is multi-skill or not using scikit-learn's SVM model. Features are length of entity; if 'and' in entity; if ',' in entity. +- Split multiskills, where possible, based on semantic rules. + +### Training + +- For the NER model, 500 job adverts were labelled for skills, multiskills, experience and benefits. +- As of 8th August 2023, **8971** entities in 500 job adverts from OJO were labelled; +- **443** are multiskill, **7313** are skill, **852** were experience entities, and **363** were benefit entities. 20% of the labelled entities were held out as a test set to evaluate the models. + +The NER model we trained used [spaCy's](https://spacy.io/) NER neural network architecture. Their NER architecture _"features a sophisticated word embedding strategy using subword features and 'Bloom' embeddings, a deep convolutional neural network with residual connections, and a novel transition-based approach to named entity parsing"_ - more about this [here](https://spacy.io/universe/project/video-spacys-ner-model). + +You can read more about the creation of the labelling data [here](./labelling.md). + +### NER Metrics + +- A metric in the python library nervaluate ([read more here](https://pypi.org/project/nervaluate/)) was used to calculate F1, precision and recall for the NER and SVM classifier on the held-out test set. 
As of 8th August 2023, the results are as follows: + +| Entity | F1 | Precision | Recall | +| ---------- | ----- | --------- | ------ | +| Skill | 0.612 | 0.712 | 0.537 | +| Experience | 0.524 | 0.647 | 0.441 | +| Benefit | 0.531 | 0.708 | 0.425 | +| All | 0.590 | 0.680 | 0.521 | + +- These metrics use partial entity matching. + +### Multiskill Metrics + +- The same training data and held out test set used for the NER model was used to evaluate the SVM model. On a held out test set, the SVM model achieved 94% accuracy. +- When evaluating the multiskill splitter algorithm rules, 253 multiskill spans were labelled as ‘good’, ‘ok’ or ‘bad’ splits. Of the 253 multiskill spans, 80 were split. Of the splits, 66% were ‘good’, 9% were ‘ok’ and 25% were ‘bad’. + +### Caveats and Recommendations + +- As we take a rules based approach to splitting multiskills, many multiskills do not get split. If a multiskill is unable to be split, we still match to a taxonomy of choice. Future work should add more rules to split multiskills. +- We deduplicate the extracted skills in the output. This means that if a job advert mentions ‘excel skills’ twice and these entities are extracted, the output will just contain "excel skills" once. However, if the string is slightly different, e.g. "excel skills" and "Excel skill", both occurrences will be outputted. +- Future work could look to train embeddings with job-specific texts, disambiguate acronyms and improve NER model performance. + +## Model Card: Skills to Taxonomy Mapping + +![](../images/match_flow.png) +_The methodology for matching skills to the ESCO taxonomy - threshold numbers can be changed in the config file._ + +### Summary + +- Match to a taxonomy based on different similarity thresholds. +- First try to match at the most granular level of a taxonomy based on cosine similarity between embedded, extracted skill and taxonomy skills. 
Extracted and taxonomy skills are embedded using huggingface’s [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) model. +- If there is no close granular skill above 0.7 cosine similarity (this threshold can be changed in the configuration file), we then assign the skill to different levels of the taxonomy in one of two approaches (maximum share and maximum similarity - see diagram above for details). +- If matching to ESCO, 43 commonly occurring skills from a sample of 100,000 job adverts are hard coded. + +### Model Factors + +The main factors in this matching approach are: 1) the different thresholds at different levels of a taxonomy and 2) the different matching approaches. + +### Caveats and Recommendations + +This step does less well when: + +- The extracted skill is a metaphor: i.e. 'understand the bigger picture' gets matched to 'take pictures' +- The extracted skill is an acronym: i.e. 'drafting ORSAs' gets matched to 'fine arts' +- The extracted skill is not a skill (poor NER model performance): i.e. 'assist with the' gets matched to providing general assistance to people + +We recommend that: + +- Skill entities might match to the same taxonomy skill; the output does not deduplicate matched skills. If deduplicating is important, you will need to deduplicate at the taxonomy level. +- The current predefined configurations ensure that every extracted skill will be matched to a taxonomy. However, if a skill is matched to the highest skill group, we label it as ‘unmatched’. Under this definition, for ESCO we identify approximately 2% of skills as ‘unmatched’. +- The configuration file contains the relevant thresholds for matching per taxonomy. These thresholds will need to be manually tuned based on different taxonomies. 
diff --git a/docs/source/pipeline_summary.md b/docs/source/pipeline_summary.md new file mode 100644 index 00000000..b45c8b8d --- /dev/null +++ b/docs/source/pipeline_summary.md @@ -0,0 +1,97 @@ +# Pipeline summary and metrics + +[June 2024 update: The training of the models used in the skills extraction algorithm is now done using code from the [ojd_daps_language_models](https://github.com/nestauk/ojd_daps_language_models/blob/dev/skillner/README.md) Github repo. Read more up to date information about the process and metrics there.] + +![](../images/overview.png) + +High level, the overall pipeline includes: + +- Named Entity Recognition (NER) model to extract skill, multi skill or experience entities in job adverts; +- Support Vector Machine (SVM) model to predict whether the skill entity is a skill or multiskill; if multiskill, apply rules to split multiskills into individual skill entities; +- Embed all entities (skill and multi skill entities) and taxonomy skills using huggingface’s [sentence-transformers/all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) pre-trained model; +- Map extracted skills (skill and multi skill) onto taxonomy skills using cosine similarity of embeddings. + +For further information or feedback please contact Liz Gallagher, India Kerle or Cath Sleeman. + +## Intended Use + +- Extract skills from online job adverts and match extracted skills to a user’s skill taxonomy of choice, such as the European Commission’s European Skills, Competences, and Occupations (ESCO) or Lightcast’s Open Skills. +- Intended users include researchers in labour statistics or related government bodies. + +## Out of Scope Uses + +- Out of scope is extracting and matching skills from job adverts in non-English languages; extracting and matching skills from texts other than job adverts; drawing conclusions on new, unidentified skills. 
+- Skills extracted should not be used to determine skill demand without expert steer and input nor should they be used for any discriminatory hiring practices. + +## Metrics + +There is no exact way to evaluate how well our pipeline works; however we have several proxies to better understand how our approach compares. The analysis in this section was performed using the results of the `20220825` model. We believe the newer `20230808` model will improve these results, but the analysis hasn't been repeated. + +### Comparison 1 - Top skill groups per occupation comparison to ESCO essential skill groups per occupation + +The ESCO dataset also includes information on the essential skills per occupation. We compare ESCO’s essential skill groups per occupation with the top ESCO-mapped skill groups per occupation. We identify top skills per occupation by: + +- Identifying occupations for which we have at least 100 job adverts; +- Identify skills extracted at ONLY the skill level; +- Identify a top skill threshold by calculating the 75th percentile % of skills counts for a given occupation +- Identify the % of top ESCO-mapped skill groups in ESCO’s essential skill groups per occupation + +At a high level, we find that: + +- 58 occupations with 100 or more job adverts were found in both ESCO and a sample of deduplicated 100,000 job adverts +- The average # of adverts per occupation is 345.54 +- We extract essential ESCO skills, transversal skills and additional skills +- On average, 94.5 percent of essential ESCO skill groups were also in the top skill groups extracted per occupation +- The median percent of essential ESCO skills per occupation that were extracted from our algorithm is 97.84. 
+ +### Comparison 2 - Degree of overlap between Lightcast’s extracted skills and our Lightcast skills + +We compare extracted Lightcast skills from Lightcast’s Open Skills algorithm and our current approach from 99 job adverts, with a minimum cosine similarity threshold between an extracted skill and taxonomy skill set to 0 to guarantee we only match at the skill level. + +We found: + +- We extract an average of 10.22 skills per job advert while Lightcast’s Open Skills algorithm extracts an average of 6.42 skills per job advert +- There is no overlap for 40% of job adverts between the two approaches +- Of the job adverts where there is overlap, on average, 39.3% of extracted Lightcast skills are present in our current approach. The median percentage is 33.3%. +- Qualitatively, there are a number of limitations to the degree of overlap approach for comparison: +- The two skill lists may contain very similar skills i.e. Financial Accounting vs. Finance but will be considered different as a result +- For exact comparison, we set the cosine similarity threshold to 0 to guarantee extracted skill-level skills but would otherwise not do so. This allows for inappropriate skill matches i.e. ‘Eye Examination’ for a supply chain role +- Lightcast’s algorithm may not be a single source of truth and it also extracts inappropriate skill matches i.e. ‘Flooring’ for a care assistant role + +### Evaluation 1 - Manual judgement of false positive rate + +We looked at the ESCO-mapped skills extracted from a random sample of 64 job adverts, and manually judged how many skills shouldn’t have been extracted from the job advert i.e. the false positives. We also performed this analysis when looking at the skills extracted from 22 job adverts using Lightcast’s Skills Extractor API. + +- Our results showed on average 27% of the skills extracted from a job advert are false positives. +- For Lightcast, on average 12% of the skills extracted from a job advert are false positives. 
+ +### Evaluation 2 - Manual judgement of skills extraction and mapping quality + +We manually tagged a random sample of skills extracted from job adverts, with whether we thought they were inappropriate, OK or excellent skill entities, and whether we thought they had inappropriate, OK or excellent matches to ESCO skills. + +- We felt that out of 183 skill entities 73% were excellent entities, 19% were OK and 8% were inappropriate. +- 172 out of 183 skill entities were matched to ESCO skills. +- Of the 172 matched skill entities we felt 53% were excellently matched, 30% were OK and 17% were inappropriate. + +| Skill entity quality | ESCO match quality | count | +| -------------------- | ------------------ | ----- | +| Inappropriate | Inappropriate | 9 | +| Inappropriate | OK | 1 | +| OK | Inappropriate | 9 | +| OK | OK | 16 | +| OK | Excellent | 7 | +| Excellent | Inappropriate | 11 | +| Excellent | OK | 35 | +| Excellent | Excellent | 83 | + +- 87% of the matches were to either an individual skill or the lowest level of the skills taxonomy (level 3). +- The match quality is at its best when the skill entity is matched to an individual ESCO skill. 
+ +| Taxonomy level mapped to | Number in sample | Average match quality score (0-inappropriate, 1-OK, 2-excellent) | +| ------------------------ | ---------------- | ---------------------------------------------------------------- | +| Skill | 99 | 1.71 | +| Skill hierarchy level 3 | 51 | 0.90 | +| Attitudes hierarchy | 8 | 1.63 | +| Skill hierarchy level 2 | 6 | 0.33 | +| Knowledge hierarchy | 6 | 0.17 | +| Transversal hierarchy | 1 | 1.00 | diff --git a/ojd_daps_skills/extract_skills/extract_skills_utils.py b/ojd_daps_skills/extract_skills/extract_skills_utils.py index e332f7b3..51648d1d 100644 --- a/ojd_daps_skills/extract_skills/extract_skills_utils.py +++ b/ojd_daps_skills/extract_skills/extract_skills_utils.py @@ -111,8 +111,9 @@ def create( if ner_model_name == "nestauk/en_skillner": msg.info(f"{ner_model_name} NER model not loaded. Downloading model...") os.system( - f"pip install https://huggingface.co/{namespace}/{ner_name}/resolve/main/{ner_name}-any-py3-none-any.whl" + f'pip install "{ner_name} @ https://huggingface.co/{namespace}/{ner_name}/resolve/main/{ner_name}-any-py3-none-any.whl"' ) + msg.info("Model downloaded") nlp = spacy.load(ner_name) else: msg.fail(