Skip to content

Commit

Permalink
Merge pull request #11 from sergiomrebelo/modularise-nlp-utils
Browse files Browse the repository at this point in the history
Modularise nlp utils #9
  • Loading branch information
sergiomrebelo authored Jan 29, 2023
2 parents 832b5e9 + 0cb6bf6 commit 320933f
Show file tree
Hide file tree
Showing 34 changed files with 9,618 additions and 1,819 deletions.
30 changes: 18 additions & 12 deletions src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@ import express from 'express';
import dotenv from 'dotenv';

import cors from 'cors';
import {setup, classification, lexicon} from "./nlp-utils/nlp_utils.mjs";
import * as NLP from "./nlp-utils/nlp_utils.mjs";
import * as CLASSIFIER from "./nlp-utils/ml-emotion-analysis/ml-emotion-analysis.mjs";
import * as LEXICON from "./nlp-utils/lexicon-emotion-analysis/lexicon-emotion-analysis.mjs";
import sentenceTokeniser from "./nlp-utils/sentence-tokeniser/sentence-tokeniser.mjs";

const APP = express();
const PORT = process.env.PORT || "8000";
Expand All @@ -15,24 +16,26 @@ APP.use(express.urlencoded({extended: true}));
APP.use(express.static('src/public'));


APP.listen(PORT, () => {
APP.listen(PORT, async () => {
await CLASSIFIER.config(process.env.LANGUAGE_TRANSLATOR_IAM_APIKEY, process.env.LANGUAGE_TRANSLATOR_URL);
await LEXICON.config(process.env.MW_API_KEY, process.env.LANGUAGE_TRANSLATOR_IAM_APIKEY, process.env.LANGUAGE_TRANSLATOR_URL);
console.info(`👂at port ${PORT}`);
setup(process.env.MW_API_KEY, process.env.LANGUAGE_TRANSLATOR_IAM_APIKEY, process.env.LANGUAGE_TRANSLATOR_URL);
});

APP.get("/lines/:delimiter/:lang/:input/", async (req, res) => {
const delimiter = req.params.delimiter;
const text = req.params.input;
const sentences = text.split(delimiter);
const lang = req.params.lang;
const results = await analysis(text, lang, sentences);
const results = await analysis(sentences, lang);
res.status(results[0]).send(JSON.stringify(results[1]));
});

APP.get("/text/:lang/:input", async (req, res) => {
const text = req.params.input;
const lang = req.params.lang;
const results = await analysis(text, lang);
const sentences = (await _sentenceTokenizer(text)).flat();
const results = await analysis(sentences, lang);
res.status(results[0]).send(JSON.stringify(results[1]));
});

Expand All @@ -48,6 +51,11 @@ const errHandler = (code, msg) => {
}
}

const _sentenceTokenizer = async (text) => {
return sentenceTokeniser(text);
}

// TODO: NLP UTILS..
const _lexiconGlobalResults = async (sentences) => {

// compute global lexicon value
Expand All @@ -69,18 +77,16 @@ const _lexiconGlobalResults = async (sentences) => {
return res.length === 0 ? [['neutral', 1]] : res;
}

const analysis = async (text, lang, sentences = []) => {
const analysis = async (sentences = [], lang) => {
const text = sentences.flat().join(' ');
// classification analysis
const classificationResults = await classification(text, lang);
const classificationResults = await CLASSIFIER.classification(text, lang);
if (!classificationResults.success) return [400, errHandler(400, `Error in the classification method`)];

// sentence tokenizer (if necessary)
if (sentences.length === 0) sentences = (await NLP.sentenceTokenizer(text)).flat();

// lexicon-based analysis
let lexiconResults = { "global": null, "sentences": [] };
for (const sentence of sentences) {
const res = await lexicon(sentence, lang, false);
const res = await LEXICON.lexicon(sentence, lang, false);
lexiconResults.sentences.push(res);
if (!res.success) return [400, errHandler(400, `Error in the lexicon-based method (msg: ${res.msg})`)];
}
Expand Down
200 changes: 200 additions & 0 deletions src/nlp-utils/lexicon-emotion-analysis/History.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
/**
 * History tracks, for every word of an input text, the full chain of
 * transformations applied to it during NLP preprocessing.
 *
 * Each entry of `this.data` is a tuple-like array:
 *   [0] raw word (string, as it appeared in the input)
 *   [1] original position of the word in the text (number)
 *   [2] deleted flag (boolean) — true once the word has been removed
 *   [3] history (array of string[]) — successive forms of the word; the
 *       last element is the word's current form
 *
 * `this.active` caches the subset of entries not yet deleted.
 */
export default class History {
    constructor(txt) {
        // one entry per whitespace-separated word; history starts with the raw word
        this.data = txt.split(' ').map((w, i) => [w, i, false, [[w]]]);
        this.active = this.data;
    }

    /**
     * Records a replacement for every data entry whose raw form matches a
     * token of `previous`.
     * @param {string[]} previous - tokens (possibly multi-word, space-joined) to match by raw form
     * @param {string[]} [current=[]] - replacement; an empty array marks the word as deleted
     */
    updateMultiple = (previous, current = []) => {
        for (const tokens of previous) {
            for (const token of tokens.split(' ')) {
                for (const entry of this.data) {
                    if (entry[0] === token) {
                        // push a shallow copy so matching entries never share
                        // one mutable array instance
                        entry[3].push([...current]);
                        if (current.length === 0) {
                            entry[2] = true; // no replacement -> word removed
                        }
                    }
                }
            }
        }
    }

    /**
     * Records a replacement for the first entry whose *current* form equals
     * `previous` (compared against the first word of its latest history item).
     * @param {string} previous - current form of the word to update
     * @param {string} [current=""] - replacement text; "" marks the word as deleted
     * @returns {boolean} true when a matching entry was found and updated
     */
    update = (previous, current = "") => {
        // remove surrounding spaces if they exist
        const word = previous.trim();
        current = current.trim();
        for (const entry of this.data) {
            if (entry[3][entry[3].length - 1][0] === word) {
                entry[3].push(current.split(" "));
                if (current === "") {
                    entry[2] = true;
                }
                return true;
            }
        }
        return false;
    }

    /**
     * Marks every entry matching a token of `previous` as deleted.
     * @param {string[]} previous - raw forms to remove
     */
    remove = (previous) => {
        this.updateMultiple(previous, []);
    }

    /**
     * @returns {Array[]} the entries not flagged as deleted
     */
    getActive = () => {
        return this.data.filter(entry => entry[2] === false);
    }

    /**
     * Flags as deleted every entry whose latest form is empty.
     */
    clean = () => {
        for (const entry of this.data) {
            const i = entry[3].length - 1;
            const current = entry[3][i];
            // NOTE(review): `current` is always an array, so the two string
            // comparisons below can never match; kept as defensive guards.
            if (current.length === 0 || current === '' || current === ' ') {
                entry[2] = true;
            }
        }
    }

    /**
     * Replaces (or removes) the words sitting at the positions given by
     * `detected`, expressed as [sentenceIdx, tokenIdx] pairs.
     * @param {Array} detected - items whose [0]/[1] elements are sentence/token indices
     * @param {Object} processed - NLP result exposing `sentences[i].tokens`
     * @param {string[]} info - replacement word for each detected position
     * @param {boolean} [remove=false] - when true, delete instead of replace
     */
    updateActivePos = (detected = [], processed, info, remove = false) => {
        this.active = this.getActive();
        // flatten [sentence, token] pairs into absolute token positions
        const pos = this.pos(detected.map((v) => [v[0], v[1]]), processed);
        let counter = 0;
        let indexCounter = 0;
        for (const entry of this.active) {
            const current = entry[3][entry[3].length - 1];
            for (let j = 0; j < current.length; j++) {
                if (counter === pos[indexCounter]) {
                    if (remove === false) {
                        // replace one word inside a copy of the entry's latest
                        // form (this.data[entry[1]] is the same object as `entry`)
                        const n = [...entry[3][this.data[entry[1]][3].length - 1]];
                        n[j] = info[indexCounter];
                        entry[3].push(n);
                    } else {
                        // remove the word: empty form + deleted flag
                        entry[3].push(['']);
                        entry[2] = true;
                    }
                    indexCounter++;
                }
                counter++;
            }
        }
    }

    /**
     * Pushes a new form for every active entry, consuming `processed` words
     * in order (one per non-empty word of the entry's latest form).
     * @param {string[]} processed - replacement words, flattened across entries
     */
    updateActive = (processed) => {
        this.clean();
        this.active = this.getActive();
        let counter = 0;
        for (let j = 0; j < this.active.length; j++) {
            const words = this.active[j][3][this.active[j][3].length - 1];
            const update = [];
            for (const word of words) {
                if (word !== '') {
                    update.push(processed[counter]);
                    counter++;
                }
            }
            this.active[j][3].push(update);
        }
    }

    /**
     * Replaces each active word by its lemma, walking the per-sentence
     * lemma/token arrays in parallel with the active entries.
     * @param {string[][]} [pLemmas=[]] - lemmas per sentence
     * @param {string[][]} [pTokens=[]] - raw tokens per sentence (parallel to pLemmas)
     */
    updateLemmas = (pLemmas = [], pTokens = []) => {
        this.active = this.getActive();
        let counter = 0;
        const _raw = [];

        // flatten into [lemma, raw token, [sentenceIdx, tokenIdx]] triples
        // (positions kept for saving-in-history purposes)
        for (let i = 0; i < pLemmas.length; i++) {
            for (let j = 0; j < pTokens[i].length; j++) {
                _raw.push([pLemmas[i][j], pTokens[i][j], [i, j]]);
            }
        }

        let saved = [];

        for (let w = 0; w < _raw.length; w++) {
            const word = _raw[w];
            const entry = this.active[counter][3][this.active[counter][3].length - 1];
            if (entry.length === 1) {
                if (word[1] === entry[0]) {
                    // single-word entry matches the raw token: store its lemma
                    this.active[counter][3].push([word[0]]);
                    counter++;
                } else if ((this.active.length - 1) > (counter + 1)) {
                    // mismatch: peek ahead to resynchronise with the next entry
                    const next = counter + 1;
                    const nextEntry = this.active[next][3][this.active[next][3].length - 1];
                    if (nextEntry !== undefined && _raw[w + 1] !== undefined) {
                        if (_raw[w + 1][1] === nextEntry[0]) {
                            this.active[counter][3].push(saved);
                            saved = [];
                            counter++;
                        }
                    }
                }
            } else {
                // multi-word entry: accumulate lemmas until the form is complete
                saved.push(word[0]);
                if (saved.length === entry.length) {
                    this.active[counter][3].push(saved);
                    saved = [];
                    counter++;
                }
            }
        }
    }

    /**
     * Converts [sentenceIdx, tokenIdx] pairs into absolute token positions
     * across all sentences of `processed`.
     * @param {Array} [arr=[0,0]] - list of [sentenceIdx, tokenIdx] pairs
     * @param {Object} processed - NLP result exposing `sentences[i].tokens`
     * @returns {number[]} absolute positions, in input order
     */
    pos = (arr = [0, 0], processed) => {
        const out = [];
        for (const a of arr) {
            let current = 0;
            if (a[0] === 0) {
                current = a[1];
            } else if (a[0] > 0) {
                // add the token counts of all preceding sentences
                for (let i = 0; i < a[0]; i++) {
                    current += processed.sentences[i].tokens.length;
                }
                current += a[1];
            }
            out.push(current);
        }
        return out;
    }

    /**
     * Keeps only the words whose position in `map` is flagged true; entries
     * left with no words are marked as deleted.
     * @param {boolean[]} map - one keep/drop flag per active word, in order
     */
    removeBasedOnMap = (map) => {
        this.active = this.getActive();
        let counter = 0;
        for (const entry of this.active) {
            const current = entry[3][entry[3].length - 1];
            // slice of `map` belonging to this entry's words
            const toRemove = [];
            for (let i = 0; i < current.length; i++) {
                toRemove.push(map[counter]);
                counter++;
            }
            const kept = [];
            for (let i = 0; i < toRemove.length; i++) {
                if (toRemove[i] === true) {
                    kept.push(current[i]);
                }
            }
            entry[3].push(kept);
            if (kept.length === 0) {
                entry[2] = true;
            }
        }
    }

    /**
     * Dumps every entry to the console (debugging helper).
     */
    log = () => {
        for (const entry of this.data) {
            // fixed: original template had an unbalanced quote and a stray "}"
            console.log(`raw="${entry[0]}"`, `deleted=${entry[2]}`, entry[3], `(length:${entry[3].length})`);
        }
    }

    /**
     * @returns {Array[]} the full entry list, including deleted entries
     */
    get = () => {
        return this.data;
    }
}

Large diffs are not rendered by default.

30 changes: 30 additions & 0 deletions src/nlp-utils/lexicon-emotion-analysis/data/emoticons.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
":)": "smile",
": )": "smile",
":-)": "smile",
"(:": "smile",
"( :": "smile",
"(-:": "smile",
":’)": "smile",
":d": "laugh",
": d": "laugh",
":-d": "laugh",
"xd": "laugh",
"x-d": "laugh",
"xd": "laugh",
"x-d": "laugh",
"<3": "love",
":*": "love",
";-)": "affection",
";)": "affection",
";-d": "affection",
";d": "affection",
"(;": "affection",
"(-;": "affection",
":-(": "sad",
": (": "sad",
":(": "sad",
"):": "sad",
")-": "sad",
":’(": "cry"
}
Loading

0 comments on commit 320933f

Please sign in to comment.