From e2e6aca3700f58b4289369bace1f9b961aa0da29 Mon Sep 17 00:00:00 2001
From: Willian Antunes <willian.lima.antunes@gmail.com>
Date: Mon, 12 Apr 2021 12:39:09 -0300
Subject: [PATCH] fix: changed api contract, tokenization logic and faq entries

---
 .../apps/core/api/v2/api_views.py             |   4 +-
 .../apps/core/api/v2/serializers.py           |   4 +
 .../int/apps/core/api/v2/test_api_views.py    |  79 ++--
 .../int/apps/core/api/v2/test_serializers.py  |  11 +
 .../FrequentlyAskedQuestions/index.js         |  47 +--
 frontend/src/domains/TranscriptionDetails.js  |  16 +-
 .../src/redux/slices/transcription-slice.js   |   4 +-
 frontend/src/utils/tokenization.js            |  21 +-
 .../domains/TranscriptionDetails.spec.test.js | 145 ++++----
 frontend/tests/support/domain-utils.js        |  89 ++---
 .../unit/src/utils/tokenization.spec.test.js  | 341 +++++++++++++++++-
 11 files changed, 533 insertions(+), 228 deletions(-)

diff --git a/backend/rave_of_phonetics/apps/core/api/v2/api_views.py b/backend/rave_of_phonetics/apps/core/api/v2/api_views.py
index 2dc231c..52eabf5 100644
--- a/backend/rave_of_phonetics/apps/core/api/v2/api_views.py
+++ b/backend/rave_of_phonetics/apps/core/api/v2/api_views.py
@@ -30,9 +30,9 @@ def transcribe(request: Request) -> Response:
     transcriptions = check_and_retrieve_transcriptions(words, language)
     logger.debug(f"Transcriptions: {transcriptions}")
 
-    result = []
+    result = {}
     for transcription in transcriptions:
         transcription_as_dict = asdict(transcription)
-        result.append(transcription_as_dict)
+        result[transcription.word] = transcription_as_dict["entries"]
 
     return Response(result)
diff --git a/backend/rave_of_phonetics/apps/core/api/v2/serializers.py b/backend/rave_of_phonetics/apps/core/api/v2/serializers.py
index 75c423e..12bf188 100644
--- a/backend/rave_of_phonetics/apps/core/api/v2/serializers.py
+++ b/backend/rave_of_phonetics/apps/core/api/v2/serializers.py
@@ -25,4 +25,8 @@ def validate(self, data):
         if language == self.supported_languages[1]:
             data["language"] = "en-gb-x-rp"
 
+        # Drop duplicate words, keeping the first occurrence of each in order
+        words = data["words"]
+        data["words"] = list(dict.fromkeys(words))
+
         return data
diff --git a/backend/tests/int/apps/core/api/v2/test_api_views.py b/backend/tests/int/apps/core/api/v2/test_api_views.py
index cf0dda5..efe9341 100644
--- a/backend/tests/int/apps/core/api/v2/test_api_views.py
+++ b/backend/tests/int/apps/core/api/v2/test_api_views.py
@@ -53,11 +53,7 @@ def test_should_receive_empty_entries_as_the_words_does_not_exist_in_database(cl
 
     assert ResearchedWord.objects.count() == 3
     assert response.status_code == 200
-    assert result == [
-        {"word": "rave", "entries": None},
-        {"word": "of", "entries": None},
-        {"word": "phonetics", "entries": None},
-    ]
+    assert result == {"of": None, "phonetics": None, "rave": None}
 
 
 @pytest.mark.django_db
@@ -79,44 +75,35 @@ def test_should_receive_transcriptions(client, mock_recaptcha_verify):
 
     assert ResearchedWord.objects.count() == 3
     assert response.status_code == 200
-    assert result == [
-        {
-            "entries": [
-                {
-                    "classification": "Undefined",
-                    "phonemic": "ɹ eɪ v",
-                    "phonemic_syllables": "ɹ eɪ v",
-                    "phonetic": None,
-                    "phonetic_syllables": None,
-                    "version": "Version 1",
-                }
-            ],
-            "word": "rave",
-        },
-        {
-            "entries": [
-                {
-                    "classification": "Undefined",
-                    "phonemic": "ə v",
-                    "phonemic_syllables": "ə v",
-                    "phonetic": None,
-                    "phonetic_syllables": None,
-                    "version": "Version 1",
-                },
-            ],
-            "word": "of",
-        },
-        {
-            "entries": [
-                {
-                    "classification": "Undefined",
-                    "phonemic": "f ə ˈn ɛ t ɪ k s",
-                    "phonemic_syllables": "f ə • ˈn ɛ • t ɪ k s",
-                    "phonetic": None,
-                    "phonetic_syllables": None,
-                    "version": "Version 1",
-                }
-            ],
-            "word": "phonetics",
-        },
-    ]
+    assert result == {
+        "of": [
+            {
+                "classification": "Undefined",
+                "phonemic": "ə v",
+                "phonemic_syllables": "ə v",
+                "phonetic": None,
+                "phonetic_syllables": None,
+                "version": "Version 1",
+            }
+        ],
+        "phonetics": [
+            {
+                "classification": "Undefined",
+                "phonemic": "f ə ˈn ɛ t ɪ k s",
+                "phonemic_syllables": "f ə • ˈn ɛ • t ɪ k s",
+                "phonetic": None,
+                "phonetic_syllables": None,
+                "version": "Version 1",
+            }
+        ],
+        "rave": [
+            {
+                "classification": "Undefined",
+                "phonemic": "ɹ eɪ v",
+                "phonemic_syllables": "ɹ eɪ v",
+                "phonetic": None,
+                "phonetic_syllables": None,
+                "version": "Version 1",
+            }
+        ],
+    }
diff --git a/backend/tests/int/apps/core/api/v2/test_serializers.py b/backend/tests/int/apps/core/api/v2/test_serializers.py
index 443e55c..4606236 100644
--- a/backend/tests/int/apps/core/api/v2/test_serializers.py
+++ b/backend/tests/int/apps/core/api/v2/test_serializers.py
@@ -37,3 +37,14 @@ def test_should_inform_that_is_valid_and_change_language_to_correct_one(self):
         words, language = serializer.validated_data["words"], serializer.validated_data["language"]
         assert words == fake_data["words"]
         assert language == "en-gb-x-rp"
+
+    def test_should_inform_that_is_valid_and_words_must_not_be_repeatable(self):
+        fake_data = {"words": ["you", "if", "you", "won't", "won't"], "language": "en-gb"}
+        serializer = TranscriberSerializer(data=fake_data)
+
+        assert serializer.is_valid()
+
+        words, language = serializer.validated_data["words"], serializer.validated_data["language"]
+        assert len(words) == 3
+        assert words == ["you", "if", "won't"]
+        assert language == "en-gb-x-rp"
diff --git a/frontend/src/components/FrequentlyAskedQuestions/index.js b/frontend/src/components/FrequentlyAskedQuestions/index.js
index 75c2939..94f8368 100644
--- a/frontend/src/components/FrequentlyAskedQuestions/index.js
+++ b/frontend/src/components/FrequentlyAskedQuestions/index.js
@@ -5,10 +5,10 @@ import { slugify } from "../../utils/general"
 const entries = [
   {
     question: "How do I use Rave of Phonetics?",
-    text: `The main function of Rave of Phonetics is to provide you a phonemic transcription of a word or text in 
-    order to help you pronounce it. You can also see its syllables, stress marks and the phonetic version as well, if 
-    they are available. Simply type a word in the space provided and read the transcription as well as listen to 
-    the audio to improve your listening skills.`,
+    text: `The main function of Rave of Phonetics is to provide you a phonemic transcription of a word or text in order 
+    to help you pronounce it. You can also see its syllables, stress marks and the phonetic version as well, if they are 
+    available. Simply type a word in the space provided and read the transcription as well as listen to the audio 
+    to improve your listening skills.`,
   },
   {
     question: "Can I improve my accent with this page?",
@@ -27,15 +27,15 @@ const entries = [
   },
   {
     question: "How do I share my transcriptions?",
-    text: `Sharing is caring. At the bottom of the <strong>IPA Transcription Tool</strong> panel you have a bottom named 
-    <strong>copy link</strong>. Just set the tool as you'd like, let's say, you choose the word THING, using AMERICAN ENGLISH,
-    with SHOW STRESS and SHOW SYLLABLES activated, after that, you can simply click on <strong>copy link</strong> and then 
-    it will be available in your transfer area! Just press CTRL+V on your social media and you'll see it!`,
+    text: `Sharing is caring. At the bottom of the <strong>IPA Transcription Tool</strong> panel, there is an option named 
+    <strong>copy link</strong>. Just type in the desired word you would like to transcribe, apply your options of stress, 
+    syllables, etc. and after that you can simply click on <strong>copy link</strong>. Then 
+    it will be available on your clipboard! Just press CTRL+V on your social media and you'll see it!`,
   },
   {
     question: "Is there a blog for this page?",
     text: `Of course, there is. If you click <a href="/blog">here</a>, you will find a blog section that has interesting 
-    topics related to phonetic and languages. Please share with all your friends 
+    topics related to Phonetics and Languages. Please share with all your friends 
     <span role="img" aria-label="slightly smiling face">😊</span>`,
   },
   {
@@ -53,8 +53,8 @@ const entries = [
   },
   {
     question: "What does ‘show stress’ mean?",
-    text: `Glad you asked, no need to stress. This option is used to see where the syllables of the words are and which 
-    one is pronounce, or stressed, with standard pronunciation.`,
+    text: `Glad you asked, no need to stress. This option is used to see which syllable of the word has primary and 
+    secondary stress. This option shows standard pronunciation.`,
   },
   {
     question: "Why do I need to loop the speech?",
@@ -64,9 +64,9 @@ const entries = [
   },
   {
     question: "How do I leave a comment?",
-    text: `Ah, yes. Please let us know what you think. If you want to leave a comment you can go to the bottom of the page. 
-    They are available in our home, changelog, FAQ and blog pages. Also you can get in touch with us through our social 
-    medias (see the bottom bar).`,
+    text: `Ah, yes. Please let us know what you think. If you want to leave a comment you can go to the bottom of the 
+    page and find our comment section. They are available in our home, changelog, FAQ and blog sections. Also, you can 
+    get in touch with us through our social medias (see the bottom bar).`,
   },
   {
     question: "How can I ask questions?",
@@ -82,18 +82,19 @@ const entries = [
     options but for now you will see mainly phonemic transcriptions.`,
   },
   {
-    question: "Is there an option for allophone variations?",
-    text: `I knew we would have some experts ask this question. For the moment, we mainly provide phonemic transcriptions, 
-    as phonetic, syllables and allophones are being filled by the community through suggestions. If you'd like to check 
-    all sort of variations, you should check if the transcription is underlined, if so, just click on it to see its 
-    variations. The details can be seen if you click on the word, which will be underlined as well.`,
+    question: "Is there an option for phonetic variations of the word?",
+    text: `I knew we would have some experts ask this question. For the moment, we mainly provide phonemic transcriptions. 
+    Phonetic transcription, syllables and allophone variations are still being developed as well as receiving 
+    contributions and suggestions by our great community of learners and experts in the area. If you'd like to check 
+    alternate variations of the word, you should check if the transcription is underlined, if so, just click on it to 
+    see its variations. The details can be seen if you click on the word, which will be underlined as well.`,
   },
   {
     question: "I would like to add or fix a transcription. Is it possible? How do I do that?",
-    text: `Sure thing! First you try to transcribe the desired word or phrase, after you receive the transcription, you 
-    click on the underlined word. You should see the option <strong>apply suggestion</strong>. If you click on it, a 
-    window will be opened describing what you can do. If you'd like to provide only the phonemic, just fill the field 
-    related to it, give us some reasons and click on <strong>send suggestion</strong>. The same applies to phonetic. 
+    text: `Sure thing! First you try to transcribe the desired word or phrase and then click on the underlined word. 
+    You should see the option <strong>apply suggestion</strong>. If you click on it, a window will open describing what 
+    you can do. If you'd like to provide only the phonemic, just fill the field related to that and give us some reasons 
+    why you made the suggestion before you click on <strong>send suggestion</strong>. The same applies to phonetic. 
     Syllables will be handled by us, so you don't have to worry.`,
   },
 ]
diff --git a/frontend/src/domains/TranscriptionDetails.js b/frontend/src/domains/TranscriptionDetails.js
index 48ecc79..96f0215 100644
--- a/frontend/src/domains/TranscriptionDetails.js
+++ b/frontend/src/domains/TranscriptionDetails.js
@@ -1,4 +1,4 @@
-import { extractRawWordsFromText } from "../utils/tokenization"
+import { extractRawWordsAndTheirTokensFromText } from "../utils/tokenization"
 
 export class TranscriptionDetails {
   constructor(
@@ -149,16 +149,20 @@ export class TranscriptionDetails {
     // REGEX to deal with stress marks and punctuations
     const regexToExtractStressMarks = /[ˈˌ]+/g
     // Words that may have punctuations
-    const wordsFromText = extractRawWordsFromText(this._text)
+    const rawWordsAndTheirTokens = extractRawWordsAndTheirTokensFromText(this._text)
     // What will be returned
     const changedTranscription = []
     // Filling changedTranscription array with data
-    for (const [index, word] of wordsFromText.entries()) {
-      const wordDetails = this._transcriptionSetup[index]
+    for (const tokenDetails of rawWordsAndTheirTokens) {
+      // Extracting the raw word and its token
+      const word = tokenDetails.raw
+      const token = tokenDetails.token
+      // Creating a new entry to insert into changedTranscription array
+      const entries = this._transcriptionSetup[token]
       const changedWord = { word }
       const changedEntries = []
-      if (wordDetails.entries) {
-        wordDetails.entries.forEach(transcription => {
+      if (entries) {
+        entries.forEach(transcription => {
           const changedTranscription = {}
           Object.assign(changedTranscription, transcription)
           if (!this._showStress) {
diff --git a/frontend/src/redux/slices/transcription-slice.js b/frontend/src/redux/slices/transcription-slice.js
index 0414341..2047f98 100644
--- a/frontend/src/redux/slices/transcription-slice.js
+++ b/frontend/src/redux/slices/transcription-slice.js
@@ -1,7 +1,7 @@
 import { createSlice } from "@reduxjs/toolkit"
 import { transcribe } from "../../services/rop-api"
 import { findById } from "../../domains/transcription-details-dao"
-import { extractWordsFromText } from "../../utils/tokenization"
+import { extractTokensFromText } from "../../utils/tokenization"
 
 const initialState = {
   text: "",
@@ -102,7 +102,7 @@ export const transcriptionFromText = (text, chosenLanguage, token, hookWhenError
   dispatch(analysingText())
 
   try {
-    const words = extractWordsFromText(text)
+    const words = extractTokensFromText(text)
     const result = await transcribe(words, chosenLanguage, token)
     dispatch(textWasTranscribed(result))
     dispatch(transcriptionToBeSaved())
diff --git a/frontend/src/utils/tokenization.js b/frontend/src/utils/tokenization.js
index 2f824dd..522dfff 100644
--- a/frontend/src/utils/tokenization.js
+++ b/frontend/src/utils/tokenization.js
@@ -1,9 +1,18 @@
-export function extractWordsFromText(text) {
-  const regexToExtractWordsAndEmojis = /([\w’'\-\u00a9\u00ae\u2000-\u3300\ud83c\ud000-\udfff\ud83d\ud000-\udfff\ud83e\ud000-\udfff])+/g
-  return text.match(regexToExtractWordsAndEmojis).map(value => value.toLowerCase())
-}
+const regexNegationToExtractWordsAndEmojis = /([^\w’'\-\u00a9\u00ae\u2000-\u3300\ud83c\ud000-\udfff\ud83d\ud000-\udfff\ud83e\ud000-\udfff])+/g
 
-export function extractRawWordsFromText(text) {
+export function extractRawWordsAndTheirTokensFromText(text) {
   const splitText = text.split(" ")
-  return splitText.filter(entry => entry).map(dirtyWord => dirtyWord.trim())
+
+  return splitText
+    .filter(entry => entry)
+    .map(dirtyWord => dirtyWord.trim())
+    .map(cleanedWord => {
+      const token = cleanedWord.toLowerCase().replace(regexNegationToExtractWordsAndEmojis, "")
+      return { raw: cleanedWord, token: token ? token : null }
+    })
+}
+
+export function extractTokensFromText(text) {
+  const tokens = extractRawWordsAndTheirTokensFromText(text)
+  return tokens.map(({ raw, token }) => (token ? token : raw.toLowerCase()))
 }
diff --git a/frontend/tests/int/src/domains/TranscriptionDetails.spec.test.js b/frontend/tests/int/src/domains/TranscriptionDetails.spec.test.js
index adbcdcf..6a18c57 100644
--- a/frontend/tests/int/src/domains/TranscriptionDetails.spec.test.js
+++ b/frontend/tests/int/src/domains/TranscriptionDetails.spec.test.js
@@ -109,87 +109,70 @@ describe("Transcription domain", () => {
       // Arrange
       const text = `Don't ever, if "you; please, ad-hoc 1989!`
       const showPunctuations = true
-      const transcriptionSetup = [
-        {
-          word: "don't",
-          entries: [
-            {
-              classification: "Undefined",
-              version: "Version 1",
-              phonemic: "d oʊ n t",
-              phonemic_syllables: "d oʊ n t",
-              phonetic: null,
-              phonetic_syllables: null,
-            },
-          ],
-        },
-        {
-          word: "ever",
-          entries: [
-            {
-              classification: "Undefined",
-              version: "Version 1",
-              phonemic: "ˈɛ v ər",
-              phonemic_syllables: "ˈɛ • v ər",
-              phonetic: null,
-              phonetic_syllables: null,
-            },
-          ],
-        },
-        {
-          word: "if",
-          entries: [
-            {
-              classification: "Undefined",
-              version: "Version 1",
-              phonemic: "ɪ f",
-              phonemic_syllables: "ɪ f",
-              phonetic: null,
-              phonetic_syllables: null,
-            },
-          ],
-        },
-        {
-          word: "you",
-          entries: [
-            {
-              classification: "Undefined",
-              version: "Version 1",
-              phonemic: "j u",
-              phonemic_syllables: "j u",
-              phonetic: null,
-              phonetic_syllables: null,
-            },
-          ],
-        },
-        {
-          word: "please",
-          entries: [
-            {
-              classification: "Undefined",
-              version: "Version 1",
-              phonemic: "p l i z",
-              phonemic_syllables: "p l i z",
-              phonetic: null,
-              phonetic_syllables: null,
-            },
-          ],
-        },
-        {
-          word: "ad-hoc",
-          entries: [
-            {
-              classification: "Undefined",
-              version: "Version 1",
-              phonemic: "ˈæ ˈd h ɑ k",
-              phonemic_syllables: "ˈæ ˈd • h ɑ k",
-              phonetic: null,
-              phonetic_syllables: null,
-            },
-          ],
-        },
-        { word: "1989", entries: null },
-      ]
+      const transcriptionSetup = {
+        "don't": [
+          {
+            classification: "Undefined",
+            version: "Version 1",
+            phonemic: "d oʊ n t",
+            phonemic_syllables: "d oʊ n t",
+            phonetic: null,
+            phonetic_syllables: null,
+          },
+        ],
+        ever: [
+          {
+            classification: "Undefined",
+            version: "Version 1",
+            phonemic: "ˈɛ v ər",
+            phonemic_syllables: "ˈɛ • v ər",
+            phonetic: null,
+            phonetic_syllables: null,
+          },
+        ],
+        if: [
+          {
+            classification: "Undefined",
+            version: "Version 1",
+            phonemic: "ɪ f",
+            phonemic_syllables: "ɪ f",
+            phonetic: null,
+            phonetic_syllables: null,
+          },
+        ],
+        you: [
+          {
+            classification: "Undefined",
+            version: "Version 1",
+            phonemic: "j u",
+            phonemic_syllables: "j u",
+            phonetic: null,
+            phonetic_syllables: null,
+          },
+        ],
+        please: [
+          {
+            classification: "Undefined",
+            version: "Version 1",
+            phonemic: "p l i z",
+            phonemic_syllables: "p l i z",
+            phonetic: null,
+            phonetic_syllables: null,
+          },
+        ],
+        "ad-hoc": [
+          {
+            classification: "Undefined",
+            version: "Version 1",
+            phonemic: "ˈæ ˈd h ɑ k",
+            phonemic_syllables: "ˈæ ˈd • h ɑ k",
+            phonetic: null,
+            phonetic_syllables: null,
+          },
+        ],
+        1989: null,
+      }
+
       const transcriptionDetails = createTranscriptionDetails({ text, transcriptionSetup, showPunctuations })
       // Act
       const transcription = transcriptionDetails.refreshedTranscriptionSetup
diff --git a/frontend/tests/support/domain-utils.js b/frontend/tests/support/domain-utils.js
index 66f9577..db4ff4d 100644
--- a/frontend/tests/support/domain-utils.js
+++ b/frontend/tests/support/domain-utils.js
@@ -8,55 +8,46 @@ export function createTranscriptionDetails({
   showSyllables = false,
   showPunctuations = false,
   showPhonetic = false,
-  transcriptionSetup = [
-    {
-      word: "rave",
-      entries: [
-        {
-          classification: "Undefined",
-          version: "Version 1",
-          phonemic: "ɹ eɪ v",
-          phonemic_syllables: "ɹ eɪ v",
-          phonetic: null,
-          phonetic_syllables: null,
-        },
-      ],
-    },
-    {
-      word: "live",
-      entries: [
-        {
-          classification: "Undefined",
-          version: "Version 1",
-          phonemic: "l aɪ v",
-          phonemic_syllables: "l aɪ v",
-          phonetic: null,
-          phonetic_syllables: null,
-        },
-        {
-          classification: "Undefined",
-          version: "Version 2",
-          phonemic: "l ɪ v",
-          phonemic_syllables: "l ɪ v",
-          phonetic: null,
-          phonetic_syllables: null,
-        },
-      ],
-    },
-    {
-      word: "phonetics",
-      entries: [
-        {
-          classification: "Undefined",
-          version: "Version 1",
-          phonemic: "f ə ˈn ɛ t ɪ k s",
-          phonemic_syllables: "f ə • ˈn ɛ • t ɪ k s",
-          phonetic: null,
-          phonetic_syllables: null,
-        },
-      ],
-    },
-  ],
+  transcriptionSetup = {
+    rave: [
+      {
+        classification: "Undefined",
+        version: "Version 1",
+        phonemic: "ɹ eɪ v",
+        phonemic_syllables: "ɹ eɪ v",
+        phonetic: null,
+        phonetic_syllables: null,
+      },
+    ],
+    live: [
+      {
+        classification: "Undefined",
+        version: "Version 1",
+        phonemic: "l aɪ v",
+        phonemic_syllables: "l aɪ v",
+        phonetic: null,
+        phonetic_syllables: null,
+      },
+      {
+        classification: "Undefined",
+        version: "Version 2",
+        phonemic: "l ɪ v",
+        phonemic_syllables: "l ɪ v",
+        phonetic: null,
+        phonetic_syllables: null,
+      },
+    ],
+    phonetics: [
+      {
+        classification: "Undefined",
+        version: "Version 1",
+        phonemic: "f ə ˈn ɛ t ɪ k s",
+        phonemic_syllables: "f ə • ˈn ɛ • t ɪ k s",
+        phonetic: null,
+        phonetic_syllables: null,
+      },
+    ],
+  },
 } = {}) {
   return new TranscriptionDetails(
     id,
diff --git a/frontend/tests/unit/src/utils/tokenization.spec.test.js b/frontend/tests/unit/src/utils/tokenization.spec.test.js
index 27d5b16..264bd95 100644
--- a/frontend/tests/unit/src/utils/tokenization.spec.test.js
+++ b/frontend/tests/unit/src/utils/tokenization.spec.test.js
@@ -1,4 +1,4 @@
-import { extractRawWordsFromText, extractWordsFromText } from "../../../../src/utils/tokenization"
+import { extractRawWordsAndTheirTokensFromText, extractTokensFromText } from "../../../../src/utils/tokenization"
 
 describe("Tokenization", () => {
   describe("Words extraction", () => {
@@ -6,7 +6,7 @@ describe("Tokenization", () => {
       // Arrange
       const sampleText = "Rave, live Phonetics!"
       // Act
-      const result = extractWordsFromText(sampleText)
+      const result = extractTokensFromText(sampleText)
       // Assert
       expect(result).toHaveLength(3)
       expect(result).toMatchObject(["rave", "live", "phonetics"])
@@ -16,7 +16,7 @@ describe("Tokenization", () => {
       // Arrange
       const sampleText = `Don't ever, if "you; please, ad-hoc 1989!`
       // Act
-      const result = extractWordsFromText(sampleText)
+      const result = extractTokensFromText(sampleText)
       // Assert
       expect(result).toHaveLength(7)
       expect(result).toMatchObject(["don't", "ever", "if", "you", "please", "ad-hoc", "1989"])
@@ -27,42 +27,357 @@ describe("Tokenization", () => {
       const sampleTextOne = `Until her father’s health`
       const sampleTextTwo = `Until her father's health`
       // Act
-      const resultOne = extractWordsFromText(sampleTextOne)
-      const resultTwo = extractWordsFromText(sampleTextTwo)
+      const resultOne = extractTokensFromText(sampleTextOne)
+      const resultTwo = extractTokensFromText(sampleTextTwo)
       // Assert
       expect(resultOne).toStrictEqual(["until", "her", "father’s", "health"])
       expect(resultTwo).toMatchObject(["until", "her", "father's", "health"])
     })
 
-    test(`When with text "Frequently asked questions 🤔", then 4 words is extracts (even the emoji)`, () => {
+    test(`When with text "Frequently asked questions 🤔", then 7 words is extracts (even the emoji)`, () => {
       // Arrange
       const sampleText = `😎 Frequently 🤔 asked 🥵 questions 🤬`
       // Act
-      const result = extractWordsFromText(sampleText)
+      const result = extractTokensFromText(sampleText)
       // Assert
       expect(result).toStrictEqual(["😎", "frequently", "🤔", "asked", "🥵", "questions", "🤬"])
     })
+
+    test(`When with text "We, are checking! here @", then 5 words is extracts (even the emoji)`, () => {
+      // Arrange
+      const sampleText = `We, are checking! here @`
+      // Act
+      const result = extractTokensFromText(sampleText)
+      // Assert
+      expect(result).toStrictEqual(["we", "are", "checking", "here", "@"])
+    })
   })
 
   describe("Raw words extraction", () => {
-    test("When with SCENARIO 1, then 3 words is extracted", () => {
+    test("When with SCENARIO 1, then 3 objects is extracted", () => {
       // Arrange
       const sampleText = " Rave, live Phonetics!"
       // Act
-      const result = extractRawWordsFromText(sampleText)
+      const result = extractRawWordsAndTheirTokensFromText(sampleText)
       // Assert
       expect(result).toHaveLength(3)
-      expect(result).toMatchObject(["Rave,", "live", "Phonetics!"])
+      expect(result).toMatchObject([
+        {
+          raw: "Rave,",
+          token: "rave",
+        },
+        {
+          raw: "live",
+          token: "live",
+        },
+        {
+          raw: "Phonetics!",
+          token: "phonetics",
+        },
+      ])
     })
 
-    test("When with SCENARIO 2, then 5 words is extracted", () => {
+    test("When with SCENARIO 2, then 5 objects is extracted", () => {
       // Arrange
       const sampleText = ` Rave,     OF\n live \r\nPhonetics! @antunes\r\n\r\n`
       // Act
-      const result = extractRawWordsFromText(sampleText)
+      const result = extractRawWordsAndTheirTokensFromText(sampleText)
+      // Assert
+      expect(result).toHaveLength(5)
+      expect(result).toMatchObject([
+        {
+          raw: "Rave,",
+          token: "rave",
+        },
+        {
+          raw: "OF",
+          token: "of",
+        },
+        {
+          raw: "live",
+          token: "live",
+        },
+        {
+          raw: "Phonetics!",
+          token: "phonetics",
+        },
+        {
+          raw: "@antunes",
+          token: "antunes",
+        },
+      ])
+    })
+
+    test("When with SCENARIO 3, then 7 objects is extracted", () => {
+      // Arrange
+      const sampleText = `😎 Frequently 🤔 asked 🥵 questions 🤬`
+      // Act
+      const result = extractRawWordsAndTheirTokensFromText(sampleText)
+      // Assert
+      expect(result).toHaveLength(7)
+      expect(result).toMatchObject([
+        {
+          raw: "😎",
+          token: "😎",
+        },
+        {
+          raw: "Frequently",
+          token: "frequently",
+        },
+        {
+          raw: "🤔",
+          token: "🤔",
+        },
+        {
+          raw: "asked",
+          token: "asked",
+        },
+        {
+          raw: "🥵",
+          token: "🥵",
+        },
+        {
+          raw: "questions",
+          token: "questions",
+        },
+        {
+          raw: "🤬",
+          token: "🤬",
+        },
+      ])
+    })
+
+    test("When with SCENARIO 4, then 7 objects is extracted", () => {
+      // Arrange
+      const sampleText = `Don't ever, if "you; please, ad-hoc 1989!`
+      // Act
+      const result = extractRawWordsAndTheirTokensFromText(sampleText)
+      // Assert
+      expect(result).toHaveLength(7)
+      expect(result).toMatchObject([
+        {
+          raw: "Don't",
+          token: "don't",
+        },
+        {
+          raw: "ever,",
+          token: "ever",
+        },
+        {
+          raw: "if",
+          token: "if",
+        },
+        {
+          raw: `"you;`,
+          token: "you",
+        },
+        {
+          raw: "please,",
+          token: "please",
+        },
+        {
+          raw: "ad-hoc",
+          token: "ad-hoc",
+        },
+        {
+          raw: "1989!",
+          token: "1989",
+        },
+      ])
+    })
+
+    test("When with SCENARIO 5, then 5 objects is extracted", () => {
+      // Arrange
+      const sampleText = `We, are checking! here @`
+      // Act
+      const result = extractRawWordsAndTheirTokensFromText(sampleText)
       // Assert
       expect(result).toHaveLength(5)
-      expect(result).toMatchObject(["Rave,", "OF", "live", "Phonetics!", "@antunes"])
+      expect(result).toMatchObject([
+        {
+          raw: "We,",
+          token: "we",
+        },
+        {
+          raw: "are",
+          token: "are",
+        },
+        {
+          raw: "checking!",
+          token: "checking",
+        },
+        {
+          raw: "here",
+          token: "here",
+        },
+        {
+          raw: "@",
+          token: null,
+        },
+      ])
+    })
+
+    test("When with SCENARIO 6, then X objects is extracted", () => {
+      // Arrange
+      const sampleText = `The 2015 Boat Races took place on 11 April. The Boat Race is an annual side-by-side rowing race between crews from the universities of Oxford and Cambridge along a 4.2-mile (6.8 km) tidal stretch of the River Thames`
+      // Act
+      const result = extractRawWordsAndTheirTokensFromText(sampleText)
+      // Assert
+      expect(result).toHaveLength(38)
+      expect(result).toMatchObject([
+        {
+          raw: "The",
+          token: "the",
+        },
+        {
+          raw: "2015",
+          token: "2015",
+        },
+        {
+          raw: "Boat",
+          token: "boat",
+        },
+        {
+          raw: "Races",
+          token: "races",
+        },
+        {
+          raw: "took",
+          token: "took",
+        },
+        {
+          raw: "place",
+          token: "place",
+        },
+        {
+          raw: "on",
+          token: "on",
+        },
+        {
+          raw: "11",
+          token: "11",
+        },
+        {
+          raw: "April.",
+          token: "april",
+        },
+        {
+          raw: "The",
+          token: "the",
+        },
+        {
+          raw: "Boat",
+          token: "boat",
+        },
+        {
+          raw: "Race",
+          token: "race",
+        },
+        {
+          raw: "is",
+          token: "is",
+        },
+        {
+          raw: "an",
+          token: "an",
+        },
+        {
+          raw: "annual",
+          token: "annual",
+        },
+        {
+          raw: "side-by-side",
+          token: "side-by-side",
+        },
+        {
+          raw: "rowing",
+          token: "rowing",
+        },
+        {
+          raw: "race",
+          token: "race",
+        },
+        {
+          raw: "between",
+          token: "between",
+        },
+        {
+          raw: "crews",
+          token: "crews",
+        },
+        {
+          raw: "from",
+          token: "from",
+        },
+        {
+          raw: "the",
+          token: "the",
+        },
+        {
+          raw: "universities",
+          token: "universities",
+        },
+        {
+          raw: "of",
+          token: "of",
+        },
+        {
+          raw: "Oxford",
+          token: "oxford",
+        },
+        {
+          raw: "and",
+          token: "and",
+        },
+        {
+          raw: "Cambridge",
+          token: "cambridge",
+        },
+        {
+          raw: "along",
+          token: "along",
+        },
+        {
+          raw: "a",
+          token: "a",
+        },
+        {
+          raw: "4.2-mile",
+          token: "42-mile",
+        },
+        {
+          raw: "(6.8",
+          token: "68",
+        },
+        {
+          raw: "km)",
+          token: "km",
+        },
+        {
+          raw: "tidal",
+          token: "tidal",
+        },
+        {
+          raw: "stretch",
+          token: "stretch",
+        },
+        {
+          raw: "of",
+          token: "of",
+        },
+        {
+          raw: "the",
+          token: "the",
+        },
+        {
+          raw: "River",
+          token: "river",
+        },
+        {
+          raw: "Thames",
+          token: "thames",
+        },
+      ])
     })
   })
 })