From e2e6aca3700f58b4289369bace1f9b961aa0da29 Mon Sep 17 00:00:00 2001 From: Willian Antunes Date: Mon, 12 Apr 2021 12:39:09 -0300 Subject: [PATCH] fix: changed api contract, tokenization logic and faq entries --- .../apps/core/api/v2/api_views.py | 4 +- .../apps/core/api/v2/serializers.py | 4 + .../int/apps/core/api/v2/test_api_views.py | 79 ++-- .../int/apps/core/api/v2/test_serializers.py | 11 + .../FrequentlyAskedQuestions/index.js | 47 +-- frontend/src/domains/TranscriptionDetails.js | 16 +- .../src/redux/slices/transcription-slice.js | 4 +- frontend/src/utils/tokenization.js | 21 +- .../domains/TranscriptionDetails.spec.test.js | 145 ++++---- frontend/tests/support/domain-utils.js | 89 ++--- .../unit/src/utils/tokenization.spec.test.js | 341 +++++++++++++++++- 11 files changed, 533 insertions(+), 228 deletions(-) diff --git a/backend/rave_of_phonetics/apps/core/api/v2/api_views.py b/backend/rave_of_phonetics/apps/core/api/v2/api_views.py index 2dc231c..52eabf5 100644 --- a/backend/rave_of_phonetics/apps/core/api/v2/api_views.py +++ b/backend/rave_of_phonetics/apps/core/api/v2/api_views.py @@ -30,9 +30,9 @@ def transcribe(request: Request) -> Response: transcriptions = check_and_retrieve_transcriptions(words, language) logger.debug(f"Transcriptions: {transcriptions}") - result = [] + result = {} for transcription in transcriptions: transcription_as_dict = asdict(transcription) - result.append(transcription_as_dict) + result[transcription.word] = transcription_as_dict["entries"] return Response(result) diff --git a/backend/rave_of_phonetics/apps/core/api/v2/serializers.py b/backend/rave_of_phonetics/apps/core/api/v2/serializers.py index 75c423e..12bf188 100644 --- a/backend/rave_of_phonetics/apps/core/api/v2/serializers.py +++ b/backend/rave_of_phonetics/apps/core/api/v2/serializers.py @@ -25,4 +25,8 @@ def validate(self, data): if language == self.supported_languages[1]: data["language"] = "en-gb-x-rp" + # Without repeated items + words = data["words"] + data["words"] = list(dict.fromkeys(words)) + return data diff --git a/backend/tests/int/apps/core/api/v2/test_api_views.py b/backend/tests/int/apps/core/api/v2/test_api_views.py index cf0dda5..efe9341 100644 --- a/backend/tests/int/apps/core/api/v2/test_api_views.py +++ b/backend/tests/int/apps/core/api/v2/test_api_views.py @@ -53,11 +53,7 @@ def test_should_receive_empty_entries_as_the_words_does_not_exist_in_database(cl assert ResearchedWord.objects.count() == 3 assert response.status_code == 200 - assert result == [ - {"word": "rave", "entries": None}, - {"word": "of", "entries": None}, - {"word": "phonetics", "entries": None}, - ] + assert result == {"of": None, "phonetics": None, "rave": None} @pytest.mark.django_db @@ -79,44 +75,35 @@ def test_should_receive_transcriptions(client, mock_recaptcha_verify): assert ResearchedWord.objects.count() == 3 assert response.status_code == 200 - assert result == [ - { - "entries": [ - { - "classification": "Undefined", - "phonemic": "ɹ eɪ v", - "phonemic_syllables": "ɹ eɪ v", - "phonetic": None, - "phonetic_syllables": None, - "version": "Version 1", - } - ], - "word": "rave", - }, - { - "entries": [ - { - "classification": "Undefined", - "phonemic": "ə v", - "phonemic_syllables": "ə v", - "phonetic": None, - "phonetic_syllables": None, - "version": "Version 1", - }, - ], - "word": "of", - }, - { - "entries": [ - { - "classification": "Undefined", - "phonemic": "f ə ˈn ɛ t ɪ k s", - "phonemic_syllables": "f ə • ˈn ɛ • t ɪ k s", - "phonetic": None, - "phonetic_syllables": None, - "version": "Version 1", - } - ], - "word": "phonetics", - }, - ] + assert result == { + "of": [ + { + "classification": "Undefined", + "phonemic": "ə v", + "phonemic_syllables": "ə v", + "phonetic": None, + "phonetic_syllables": None, + "version": "Version 1", + } + ], + "phonetics": [ + { + "classification": "Undefined", + "phonemic": "f ə ˈn ɛ t ɪ k s", + "phonemic_syllables": "f ə • ˈn ɛ • t ɪ k s", + "phonetic": None, + "phonetic_syllables": None, + "version": "Version 1", + } + ], + "rave": [ + { + "classification": "Undefined", + "phonemic": "ɹ eɪ v", + "phonemic_syllables": "ɹ eɪ v", + "phonetic": None, + "phonetic_syllables": None, + "version": "Version 1", + } + ], + } diff --git a/backend/tests/int/apps/core/api/v2/test_serializers.py b/backend/tests/int/apps/core/api/v2/test_serializers.py index 443e55c..4606236 100644 --- a/backend/tests/int/apps/core/api/v2/test_serializers.py +++ b/backend/tests/int/apps/core/api/v2/test_serializers.py @@ -37,3 +37,14 @@ def test_should_inform_that_is_valid_and_change_language_to_correct_one(self): words, language = serializer.validated_data["words"], serializer.validated_data["language"] assert words == fake_data["words"] assert language == "en-gb-x-rp" + + def test_should_inform_that_is_valid_and_words_must_not_be_repeatable(self): + fake_data = {"words": ["you", "if", "you", "won't", "won't"], "language": "en-gb"} + serializer = TranscriberSerializer(data=fake_data) + + assert serializer.is_valid() + + words, language = serializer.validated_data["words"], serializer.validated_data["language"] + assert len(words) == 3 + assert words == ["you", "if", "won't"] + assert language == "en-gb-x-rp" diff --git a/frontend/src/components/FrequentlyAskedQuestions/index.js b/frontend/src/components/FrequentlyAskedQuestions/index.js index 75c2939..94f8368 100644 --- a/frontend/src/components/FrequentlyAskedQuestions/index.js +++ b/frontend/src/components/FrequentlyAskedQuestions/index.js @@ -5,10 +5,10 @@ import { slugify } from "../../utils/general" const entries = [ { question: "How do I use Rave of Phonetics?", - text: `The main function of Rave of Phonetics is to provide you a phonemic transcription of a word or text in - order to help you pronounce it. You can also see its syllables, stress marks and the phonetic version as well, if - they are available. Simply type a word in the space provided and read the transcription as well as listen to - the audio to improve your listening skills.`, + text: `The main function of Rave of Phonetics is to provide you a phonemic transcription of a word or text in order + to help you pronounce it. You can also see its syllables, stress marks and the phonetic version as well, if they are + available. Simply type a word in the space provided and read the transcription as well as listen to the audio + to improve your listening skills.`, }, { question: "Can I improve my accent with this page?", @@ -27,15 +27,15 @@ const entries = [ }, { question: "How do I share my transcriptions?", - text: `Sharing is caring. At the bottom of the IPA Transcription Tool panel you have a bottom named - copy link. Just set the tool as you'd like, let's say, you choose the word THING, using AMERICAN ENGLISH, - with SHOW STRESS and SHOW SYLLABLES activated, after that, you can simply click on copy link and then - it will be available in your transfer area! Just press CTRL+V on your social media and you'll see it!`, + text: `Sharing is caring. At the bottom of the IPA Transcription Tool panel, there is an option named + copy link. Just type in the desired word you would like to transcribe, apply your options of stress, + syllables, etc. and after that you can simply click on copy link. Then + it will be available on your clipboard! Just press CTRL+V on your social media and you'll see it!`, }, { question: "Is there a blog for this page?", text: `Of course, there is. If you click here, you will find a blog section that has interesting - topics related to phonetic and languages. Please share with all your friends + topics related to Phonetics and Languages. Please share with all your friends 😊`, }, { @@ -53,8 +53,8 @@ const entries = [ }, { question: "What does ‘show stress’ mean?", - text: `Glad you asked, no need to stress. This option is used to see where the syllables of the words are and which - one is pronounce, or stressed, with standard pronunciation.`, + text: `Glad you asked, no need to stress. This option is used to see which syllable of the word has primary and + secondary stress. This option shows standard pronunciation.`, }, { question: "Why do I need to loop the speech?", @@ -64,9 +64,9 @@ const entries = [ }, { question: "How do I leave a comment?", - text: `Ah, yes. Please let us know what you think. If you want to leave a comment you can go to the bottom of the page. - They are available in our home, changelog, FAQ and blog pages. Also you can get in touch with us through our social - medias (see the bottom bar).`, + text: `Ah, yes. Please let us know what you think. If you want to leave a comment you can go to the bottom of the + page and find our comment section. They are available in our home, changelog, FAQ and blog sections. Also, you can + get in touch with us through our social medias (see the bottom bar).`, }, { question: "How can I ask questions?", @@ -82,18 +82,19 @@ const entries = [ options but for now you will see mainly phonemic transcriptions.`, }, { - question: "Is there an option for allophone variations?", - text: `I knew we would have some experts ask this question. For the moment, we mainly provide phonemic transcriptions, - as phonetic, syllables and allophones are being filled by the community through suggestions. If you'd like to check - all sort of variations, you should check if the transcription is underlined, if so, just click on it to see its - variations. The details can be seen if you click on the word, which will be underlined as well.`, + question: "Is there an option for phonetic variations of the word?", + text: `I knew we would have some experts ask this question. For the moment, we mainly provide phonemic transcriptions. + Phonetic transcription, syllables and allophone variations are still being developed as well as receiving + contributions and suggestions by our great community of learners and experts in the area. If you'd like to check + alternate variations of the word, you should check if the transcription is underlined, if so, just click on it to + see its variations. The details can be seen if you click on the word, which will be underlined as well.`, }, { question: "I would like to add or fix a transcription. Is it possible? How do I do that?", - text: `Sure thing! First you try to transcribe the desired word or phrase, after you receive the transcription, you - click on the underlined word. You should see the option apply suggestion. If you click on it, a - window will be opened describing what you can do. If you'd like to provide only the phonemic, just fill the field - related to it, give us some reasons and click on send suggestion. The same applies to phonetic. + text: `Sure thing! First you try to transcribe the desired word or phrase and then click on the underlined word. + You should see the option apply suggestion. If you click on it, a window will open describing what + you can do. If you'd like to provide only the phonemic, just fill the field related to that and give us some reasons + why you made the suggestion before you click on send suggestion. The same applies to phonetic. Syllables will be handled by us, so you don't have to worry.`, }, ] diff --git a/frontend/src/domains/TranscriptionDetails.js b/frontend/src/domains/TranscriptionDetails.js index 48ecc79..96f0215 100644 --- a/frontend/src/domains/TranscriptionDetails.js +++ b/frontend/src/domains/TranscriptionDetails.js @@ -1,4 +1,4 @@ -import { extractRawWordsFromText } from "../utils/tokenization" +import { extractRawWordsAndTheirTokensFromText } from "../utils/tokenization" export class TranscriptionDetails { constructor( @@ -149,16 +149,20 @@ export class TranscriptionDetails { // REGEX to deal with stress marks and punctuations const regexToExtractStressMarks = /[ˈˌ]+/g // Words that may have punctuations - const wordsFromText = extractRawWordsFromText(this._text) + const rawWordsAndTheirTokens = extractRawWordsAndTheirTokensFromText(this._text) // What will be returned const changedTranscription = [] // Filling changedTranscription array with data - for (const [index, word] of wordsFromText.entries()) { - const wordDetails = this._transcriptionSetup[index] + for (const tokenDetails of rawWordsAndTheirTokens) { + // Extracting objetcs + const word = tokenDetails.raw + const token = tokenDetails.token + // Creating a new entry to insert into changedTranscription array + const entries = this._transcriptionSetup[token] const changedWord = { word } const changedEntries = [] - if (wordDetails.entries) { - wordDetails.entries.forEach(transcription => { + if (entries) { + entries.forEach(transcription => { const changedTranscription = {} Object.assign(changedTranscription, transcription) if (!this._showStress) { diff --git a/frontend/src/redux/slices/transcription-slice.js b/frontend/src/redux/slices/transcription-slice.js index 0414341..2047f98 100644 --- a/frontend/src/redux/slices/transcription-slice.js +++ b/frontend/src/redux/slices/transcription-slice.js @@ -1,7 +1,7 @@ import { createSlice } from "@reduxjs/toolkit" import { transcribe } from "../../services/rop-api" import { findById } from "../../domains/transcription-details-dao" -import { extractWordsFromText } from "../../utils/tokenization" +import { extractTokensFromText } from "../../utils/tokenization" const initialState = { text: "", @@ -102,7 +102,7 @@ export const transcriptionFromText = (text, chosenLanguage, token, hookWhenError dispatch(analysingText()) try { - const words = extractWordsFromText(text) + const words = extractTokensFromText(text) const result = await transcribe(words, chosenLanguage, token) dispatch(textWasTranscribed(result)) dispatch(transcriptionToBeSaved()) diff --git a/frontend/src/utils/tokenization.js b/frontend/src/utils/tokenization.js index 2f824dd..522dfff 100644 --- a/frontend/src/utils/tokenization.js +++ b/frontend/src/utils/tokenization.js @@ -1,9 +1,18 @@ -export function extractWordsFromText(text) { - const regexToExtractWordsAndEmojis = /([\w’'\-\u00a9\u00ae\u2000-\u3300\ud83c\ud000-\udfff\ud83d\ud000-\udfff\ud83e\ud000-\udfff])+/g - return text.match(regexToExtractWordsAndEmojis).map(value => value.toLowerCase()) -} +const regexNegationToExtractWordsAndEmojis = /([^\w’'\-\u00a9\u00ae\u2000-\u3300\ud83c\ud000-\udfff\ud83d\ud000-\udfff\ud83e\ud000-\udfff])+/g -export function extractRawWordsFromText(text) { +export function extractRawWordsAndTheirTokensFromText(text) { const splitText = text.split(" ") - return splitText.filter(entry => entry).map(dirtyWord => dirtyWord.trim()) + + return splitText + .filter(entry => entry) + .map(dirtyWord => dirtyWord.trim()) + .map(cleanedWord => { + const token = cleanedWord.toLowerCase().replace(regexNegationToExtractWordsAndEmojis, "") + return { raw: cleanedWord, token: token ? token : null } + }) +} + +export function extractTokensFromText(text) { + const tokens = extractRawWordsAndTheirTokensFromText(text) + return tokens.map(({ raw, token }) => (token ? token : raw.toLowerCase())) } diff --git a/frontend/tests/int/src/domains/TranscriptionDetails.spec.test.js b/frontend/tests/int/src/domains/TranscriptionDetails.spec.test.js index adbcdcf..6a18c57 100644 --- a/frontend/tests/int/src/domains/TranscriptionDetails.spec.test.js +++ b/frontend/tests/int/src/domains/TranscriptionDetails.spec.test.js @@ -109,87 +109,70 @@ describe("Transcription domain", () => { // Arrange const text = `Don't ever, if "you; please, ad-hoc 1989!` const showPunctuations = true - const transcriptionSetup = [ - { - word: "don't", - entries: [ - { - classification: "Undefined", - version: "Version 1", - phonemic: "d oʊ n t", - phonemic_syllables: "d oʊ n t", - phonetic: null, - phonetic_syllables: null, - }, - ], - }, - { - word: "ever", - entries: [ - { - classification: "Undefined", - version: "Version 1", - phonemic: "ˈɛ v ər", - phonemic_syllables: "ˈɛ • v ər", - phonetic: null, - phonetic_syllables: null, - }, - ], - }, - { - word: "if", - entries: [ - { - classification: "Undefined", - version: "Version 1", - phonemic: "ɪ f", - phonemic_syllables: "ɪ f", - phonetic: null, - phonetic_syllables: null, - }, - ], - }, - { - word: "you", - entries: [ - { - classification: "Undefined", - version: "Version 1", - phonemic: "j u", - phonemic_syllables: "j u", - phonetic: null, - phonetic_syllables: null, - }, - ], - }, - { - word: "please", - entries: [ - { - classification: "Undefined", - version: "Version 1", - phonemic: "p l i z", - phonemic_syllables: "p l i z", - phonetic: null, - phonetic_syllables: null, - }, - ], - }, - { - word: "ad-hoc", - entries: [ - { - classification: "Undefined", - version: "Version 1", - phonemic: "ˈæ ˈd h ɑ k", - phonemic_syllables: "ˈæ ˈd • h ɑ k", - phonetic: null, - phonetic_syllables: null, - }, - ], - }, - { word: "1989", entries: null }, - ] + const transcriptionSetup = { + "don't": [ + { + classification: "Undefined", + version: "Version 1", + phonemic: "d oʊ n t", + phonemic_syllables: "d oʊ n t", + phonetic: null, + phonetic_syllables: null, + }, + ], + ever: [ + { + classification: "Undefined", + version: "Version 1", + phonemic: "ˈɛ v ər", + phonemic_syllables: "ˈɛ • v ər", + phonetic: null, + phonetic_syllables: null, + }, + ], + if: [ + { + classification: "Undefined", + version: "Version 1", + phonemic: "ɪ f", + phonemic_syllables: "ɪ f", + phonetic: null, + phonetic_syllables: null, + }, + ], + you: [ + { + classification: "Undefined", + version: "Version 1", + phonemic: "j u", + phonemic_syllables: "j u", + phonetic: null, + phonetic_syllables: null, + }, + ], + please: [ + { + classification: "Undefined", + version: "Version 1", + phonemic: "p l i z", + phonemic_syllables: "p l i z", + phonetic: null, + phonetic_syllables: null, + }, + ], + "ad-hoc": [ + { + classification: "Undefined", + version: "Version 1", + phonemic: "ˈæ ˈd h ɑ k", + phonemic_syllables: "ˈæ ˈd • h ɑ k", + phonetic: null, + phonetic_syllables: null, + }, + ], + 1989: null, + } + const transcriptionDetails = createTranscriptionDetails({ text, transcriptionSetup, showPunctuations }) // Act const transcription = transcriptionDetails.refreshedTranscriptionSetup diff --git a/frontend/tests/support/domain-utils.js b/frontend/tests/support/domain-utils.js index 66f9577..db4ff4d 100644 --- a/frontend/tests/support/domain-utils.js +++ b/frontend/tests/support/domain-utils.js @@ -8,55 +8,46 @@ export function createTranscriptionDetails({ showSyllables = false, showPunctuations = false, showPhonetic = false, - transcriptionSetup = [ - { - word: "rave", - entries: [ - { - classification: "Undefined", - version: "Version 1", - phonemic: "ɹ eɪ v", - phonemic_syllables: "ɹ eɪ v", - phonetic: null, - phonetic_syllables: null, - }, - ], - }, - { - word: "live", - entries: [ - { - classification: "Undefined", - version: "Version 1", - phonemic: "l aɪ v", - phonemic_syllables: "l aɪ v", - phonetic: null, - phonetic_syllables: null, - }, - { - classification: "Undefined", - version: "Version 2", - phonemic: "l ɪ v", - phonemic_syllables: "l ɪ v", - phonetic: null, - phonetic_syllables: null, - }, - ], - }, - { - word: "phonetics", - entries: [ - { - classification: "Undefined", - version: "Version 1", - phonemic: "f ə ˈn ɛ t ɪ k s", - phonemic_syllables: "f ə • ˈn ɛ • t ɪ k s", - phonetic: null, - phonetic_syllables: null, - }, - ], - }, - ], + transcriptionSetup = { + rave: [ + { + classification: "Undefined", + version: "Version 1", + phonemic: "ɹ eɪ v", + phonemic_syllables: "ɹ eɪ v", + phonetic: null, + phonetic_syllables: null, + }, + ], + live: [ + { + classification: "Undefined", + version: "Version 1", + phonemic: "l aɪ v", + phonemic_syllables: "l aɪ v", + phonetic: null, + phonetic_syllables: null, + }, + { + classification: "Undefined", + version: "Version 2", + phonemic: "l ɪ v", + phonemic_syllables: "l ɪ v", + phonetic: null, + phonetic_syllables: null, + }, + ], + phonetics: [ + { + classification: "Undefined", + version: "Version 1", + phonemic: "f ə ˈn ɛ t ɪ k s", + phonemic_syllables: "f ə • ˈn ɛ • t ɪ k s", + phonetic: null, + phonetic_syllables: null, + }, + ], + }, } = {}) { return new TranscriptionDetails( id, diff --git a/frontend/tests/unit/src/utils/tokenization.spec.test.js b/frontend/tests/unit/src/utils/tokenization.spec.test.js index 27d5b16..264bd95 100644 --- a/frontend/tests/unit/src/utils/tokenization.spec.test.js +++ b/frontend/tests/unit/src/utils/tokenization.spec.test.js @@ -1,4 +1,4 @@ -import { extractRawWordsFromText, extractWordsFromText } from "../../../../src/utils/tokenization" +import { extractRawWordsAndTheirTokensFromText, extractTokensFromText } from "../../../../src/utils/tokenization" describe("Tokenization", () => { describe("Words extraction", () => { @@ -6,7 +6,7 @@ describe("Tokenization", () => { // Arrange const sampleText = "Rave, live Phonetics!" // Act - const result = extractWordsFromText(sampleText) + const result = extractTokensFromText(sampleText) // Assert expect(result).toHaveLength(3) expect(result).toMatchObject(["rave", "live", "phonetics"]) @@ -16,7 +16,7 @@ describe("Tokenization", () => { // Arrange const sampleText = `Don't ever, if "you; please, ad-hoc 1989!` // Act - const result = extractWordsFromText(sampleText) + const result = extractTokensFromText(sampleText) // Assert expect(result).toHaveLength(7) expect(result).toMatchObject(["don't", "ever", "if", "you", "please", "ad-hoc", "1989"]) @@ -27,42 +27,357 @@ describe("Tokenization", () => { const sampleTextOne = `Until her father’s health` const sampleTextTwo = `Until her father's health` // Act - const resultOne = extractWordsFromText(sampleTextOne) - const resultTwo = extractWordsFromText(sampleTextTwo) + const resultOne = extractTokensFromText(sampleTextOne) + const resultTwo = extractTokensFromText(sampleTextTwo) // Assert expect(resultOne).toStrictEqual(["until", "her", "father’s", "health"]) expect(resultTwo).toMatchObject(["until", "her", "father's", "health"]) }) - test(`When with text "Frequently asked questions 🤔", then 4 words is extracts (even the emoji)`, () => { + test(`When with text "Frequently asked questions 🤔", then 7 words is extracts (even the emoji)`, () => { // Arrange const sampleText = `😎 Frequently 🤔 asked 🥵 questions 🤬` // Act - const result = extractWordsFromText(sampleText) + const result = extractTokensFromText(sampleText) // Assert expect(result).toStrictEqual(["😎", "frequently", "🤔", "asked", "🥵", "questions", "🤬"]) }) + + test(`When with text "We, are checking! here @", then 5 words is extracts (even the emoji)`, () => { + // Arrange + const sampleText = `We, are checking! here @` + // Act + const result = extractTokensFromText(sampleText) + // Assert + expect(result).toStrictEqual(["we", "are", "checking", "here", "@"]) + }) }) describe("Raw words extraction", () => { - test("When with SCENARIO 1, then 3 words is extracted", () => { + test("When with SCENARIO 1, then 3 objects is extracted", () => { // Arrange const sampleText = " Rave, live Phonetics!" // Act - const result = extractRawWordsFromText(sampleText) + const result = extractRawWordsAndTheirTokensFromText(sampleText) // Assert expect(result).toHaveLength(3) - expect(result).toMatchObject(["Rave,", "live", "Phonetics!"]) + expect(result).toMatchObject([ + { + raw: "Rave,", + token: "rave", + }, + { + raw: "live", + token: "live", + }, + { + raw: "Phonetics!", + token: "phonetics", + }, + ]) }) - test("When with SCENARIO 2, then 5 words is extracted", () => { + test("When with SCENARIO 2, then 5 objects is extracted", () => { // Arrange const sampleText = ` Rave, OF\n live \r\nPhonetics! @antunes\r\n\r\n` // Act - const result = extractRawWordsFromText(sampleText) + const result = extractRawWordsAndTheirTokensFromText(sampleText) + // Assert + expect(result).toHaveLength(5) + expect(result).toMatchObject([ + { + raw: "Rave,", + token: "rave", + }, + { + raw: "OF", + token: "of", + }, + { + raw: "live", + token: "live", + }, + { + raw: "Phonetics!", + token: "phonetics", + }, + { + raw: "@antunes", + token: "antunes", + }, + ]) + }) + + test("When with SCENARIO 3, then 7 objects is extracted", () => { + // Arrange + const sampleText = `😎 Frequently 🤔 asked 🥵 questions 🤬` + // Act + const result = extractRawWordsAndTheirTokensFromText(sampleText) + // Assert + expect(result).toHaveLength(7) + expect(result).toMatchObject([ + { + raw: "😎", + token: "😎", + }, + { + raw: "Frequently", + token: "frequently", + }, + { + raw: "🤔", + token: "🤔", + }, + { + raw: "asked", + token: "asked", + }, + { + raw: "🥵", + token: "🥵", + }, + { + raw: "questions", + token: "questions", + }, + { + raw: "🤬", + token: "🤬", + }, + ]) + }) + + test("When with SCENARIO 4, then 7 objects is extracted", () => { + // Arrange + const sampleText = `Don't ever, if "you; please, ad-hoc 1989!` + // Act + const result = extractRawWordsAndTheirTokensFromText(sampleText) + // Assert + expect(result).toHaveLength(7) + expect(result).toMatchObject([ + { + raw: "Don't", + token: "don't", + }, + { + raw: "ever,", + token: "ever", + }, + { + raw: "if", + token: "if", + }, + { + raw: `"you;`, + token: "you", + }, + { + raw: "please,", + token: "please", + }, + { + raw: "ad-hoc", + token: "ad-hoc", + }, + { + raw: "1989!", + token: "1989", + }, + ]) + }) + + test("When with SCENARIO 5, then 5 objects is extracted", () => { + // Arrange + const sampleText = `We, are checking! here @` + // Act + const result = extractRawWordsAndTheirTokensFromText(sampleText) // Assert expect(result).toHaveLength(5) - expect(result).toMatchObject(["Rave,", "OF", "live", "Phonetics!", "@antunes"]) + expect(result).toMatchObject([ + { + raw: "We,", + token: "we", + }, + { + raw: "are", + token: "are", + }, + { + raw: "checking!", + token: "checking", + }, + { + raw: "here", + token: "here", + }, + { + raw: "@", + token: null, + }, + ]) + }) + + test("When with SCENARIO 6, then X objects is extracted", () => { + // Arrange + const sampleText = `The 2015 Boat Races took place on 11 April. The Boat Race is an annual side-by-side rowing race between crews from the universities of Oxford and Cambridge along a 4.2-mile (6.8 km) tidal stretch of the River Thames` + // Act + const result = extractRawWordsAndTheirTokensFromText(sampleText) + // Assert + expect(result).toHaveLength(38) + expect(result).toMatchObject([ + { + raw: "The", + token: "the", + }, + { + raw: "2015", + token: "2015", + }, + { + raw: "Boat", + token: "boat", + }, + { + raw: "Races", + token: "races", + }, + { + raw: "took", + token: "took", + }, + { + raw: "place", + token: "place", + }, + { + raw: "on", + token: "on", + }, + { + raw: "11", + token: "11", + }, + { + raw: "April.", + token: "april", + }, + { + raw: "The", + token: "the", + }, + { + raw: "Boat", + token: "boat", + }, + { + raw: "Race", + token: "race", + }, + { + raw: "is", + token: "is", + }, + { + raw: "an", + token: "an", + }, + { + raw: "annual", + token: "annual", + }, + { + raw: "side-by-side", + token: "side-by-side", + }, + { + raw: "rowing", + token: "rowing", + }, + { + raw: "race", + token: "race", + }, + { + raw: "between", + token: "between", + }, + { + raw: "crews", + token: "crews", + }, + { + raw: "from", + token: "from", + }, + { + raw: "the", + token: "the", + }, + { + raw: "universities", + token: "universities", + }, + { + raw: "of", + token: "of", + }, + { + raw: "Oxford", + token: "oxford", + }, + { + raw: "and", + token: "and", + }, + { + raw: "Cambridge", + token: "cambridge", + }, + { + raw: "along", + token: "along", + }, + { + raw: "a", + token: "a", + }, + { + raw: "4.2-mile", + token: "42-mile", + }, + { + raw: "(6.8", + token: "68", + }, + { + raw: "km)", + token: "km", + }, + { + raw: "tidal", + token: "tidal", + }, + { + raw: "stretch", + token: "stretch", + }, + { + raw: "of", + token: "of", + }, + { + raw: "the", + token: "the", + }, + { + raw: "River", + token: "river", + }, + { + raw: "Thames", + token: "thames", + }, + ]) }) }) })