
Commit

#5 add renci nlp server use case
lstillwe committed Aug 7, 2017
1 parent 6c22492 commit dcfe9d5
Showing 73 changed files with 11,243 additions and 0 deletions.
64 changes: 64 additions & 0 deletions use-cases/renci/renci_nlp_server/Dockerfile
@@ -0,0 +1,64 @@
FROM centos:centos6.7
MAINTAINER Lisa Stillwell <[email protected]>

ENV NLP_DIR /renci_nlp_server
WORKDIR $NLP_DIR

RUN yum -y update \
&& yum clean all \
&& yum -y install epel-release \
&& yum clean all \
&& yum -y install wget

# Install postgresql and setup database
RUN rpm -Uvh http://yum.postgresql.org/9.4/redhat/rhel-6-x86_64/pgdg-centos94-9.4-3.noarch.rpm \
&& yum -y install postgresql94 postgresql94-server postgresql94-contrib postgresql94-devel \
&& ln -s /usr/pgsql-9.4/bin/pg_config /usr/bin/pg_config \
&& service postgresql-9.4 initdb \
&& sed -i 's/^host[ \t]*all[ \t]*all[ \t]*127\.0\.0\.1\/32[ \t]*ident/host all all 127\.0\.0\.1\/32 password/' /var/lib/pgsql/9.4/data/pg_hba.conf

# Install Java 1.8
#RUN wget --no-cookies --no-check-certificate --header "Cookie: gpw_e24=http%3A%2F%2Fwww.oracle.com%2F \
# && oraclelicense=accept-securebackup-cookie" http://download.oracle.com/otn-pub/java/jdk/8u111-b14/jdk-8u111-linux-x64.tar.gz \
RUN curl -L -O -H "Cookie: oraclelicense=accept-securebackup-cookie" -k "https://edelivery.oracle.com/otn-pub/java/jdk/8u111-b14/jdk-8u111-linux-x64.tar.gz" \
&& tar -xzf jdk-8u111-linux-x64.tar.gz \
&& rm jdk-8u111-linux-x64.tar.gz \
&& cd jdk1.8.0_111/ \
&& alternatives --install /usr/bin/java java $NLP_DIR/jdk1.8.0_111/bin/java 1

# Install Python 2.7 and all prereqs
RUN yum -y install gcc zlib-devel unzip sqlite-devel openssl openssl-devel \
&& wget https://www.python.org/ftp/python/2.7.12/Python-2.7.12.tgz \
&& tar -xzvf Python-2.7.12.tgz \
&& rm Python-2.7.12.tgz \
&& cd Python-2.7.12 \
&& ./configure; make; make altinstall \
&& yum -y install python-pip \
&& pip install --upgrade pip \
&& pip install virtualenv \
&& virtualenv -p /usr/local/bin/python2.7 $NLP_DIR

# Download and install all NLP code
RUN wget http://nlp.stanford.edu/software/stanford-corenlp-full-2015-12-09.zip \
&& unzip stanford-corenlp-full-2015-12-09.zip \
&& rm stanford-corenlp-full-2015-12-09.zip \
&& wget https://github.com/brendano/stanford_corenlp_pywrapper/archive/master.zip \
&& unzip master.zip; rm master.zip \
&& wget https://github.com/lstillwe/renci_nlp_server/archive/master.zip \
&& unzip master.zip \
&& mv renci_nlp_server-master/* $NLP_DIR; rm master.zip; rm -rf renci_nlp_server-master \
&& mv stanford-corenlp-full-2015-12-09 stanford-corenlp \
&& cp stanford-corenlp/stanford-corenlp-full-2015-12-09/*.jar stanford-corenlp \
&& mv stanford_corenlp_pywrapper-master stanford_corenlp_pywrapper \
&& source ./bin/activate \
&& cd stanford_corenlp_pywrapper \
&& pip install . \
&& cd .. \
&& pip install -r requirements.txt \
&& pip install requests \
&& python -m nltk.downloader -d /usr/local/share/nltk_data wordnet


ENTRYPOINT ["/bin/bash", "start.sh"]
# for testing:
# CMD ["/bin/bash"]
42 changes: 42 additions & 0 deletions use-cases/renci/renci_nlp_server/README.md
@@ -0,0 +1,42 @@
Basic required environment:
- Python 2.7 installed
- pip installed
- virtualenv installed
- postgresql 9.4 installed

Steps to Deploy Server:\
Environment Setting
- Use virtualenv to create a wrapped virtual env and activate it
- cd to the directory where requirements.txt is located.
- run: pip install -r requirements.txt in your shell to install the required libraries.
- go to https://github.com/brendano/stanford_corenlp_pywrapper and follow the instructions to install stanford_corenlp_pywrapper as well
- copy the files into the virtualenv wrapper folder; the structure of the files should look similar to the following
- virtualenv_wrapper
    - bin (virtualenv)
    - include (virtualenv)
    - lib (virtualenv)
    - app.py (server main function here)
    - config.ini (configuration for database and server settings)
    - coref_rsl (co-reference resolution code here)
    - event (event detection code here)
    - html_parser (html parser code here)
    - ner (nlp and ner improvement code here)
    - stanford-corenlp (stanford core nlp .jar files here)
    - utils (some commonly used functions or files here)
    - setup_db.sql (just used for db init)
    - test_files (not required by the server, just for testing)

Database setup
- Create a PostgreSQL database and update the database info in config.ini
- Import db_setup.sql to create the schema of the database
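
Before starting the server, the database settings can be sanity-checked with a short script. This is only a sketch (not part of the repository); it reads the same config.ini keys that app.py and the coref code use, and assumes psycopg2 is installed in the virtualenv.

```python
# Minimal sketch: verify the DATABASE section of config.ini is reachable.
# Not part of the repository; uses the same keys app.py reads.
from ConfigParser import ConfigParser  # Python 2.7, as in app.py
import psycopg2

config = ConfigParser()
config.read('config.ini')

con = psycopg2.connect(database=config.get('DATABASE', 'db_name'),
                       user=config.get('DATABASE', 'username'),
                       password=config.get('DATABASE', 'password'),
                       host=config.get('DATABASE', 'host'))
cur = con.cursor()
cur.execute('SELECT 1;')
print(cur.fetchone())  # should print (1,)
con.close()
```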

Start the server
- run: python app.py (make sure you activate the virtualenv before trying to start the server)
- server settings can be changed in config.ini, including debug mode, host, and port

Other things to notice:\
- Directory '/test_files' contains some simple tests to validate that the server works properly. \
- If you want to use the test files, you need to install the 'requests' library first. Run: pip install requests in your shell.
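
For a quick manual test, a request like the following can be sent once the server is running. This is a sketch (not one of the provided test files); the host/port assume the defaults in config.ini, and the article URL and HTML are placeholders.

```python
# Minimal sketch of a manual test against the /nlp endpoint.
# Assumes the server is running with the defaults from config.ini (port 5000);
# the HTML snippet and URL below are placeholders, not real test data.
import requests

payload = {
    'html': '<html><body><p>Example Corp. announced its IPO today.</p></body></html>',
    'url': 'http://example.com/news/example-ipo',  # hypothetical article URL
    'event': 'IPO',                                # or 'Layoff'
}

resp = requests.post('http://localhost:5000/nlp', data=payload)
if resp.status_code == 200:
    result = resp.json()
    print(result['data']['word'])  # flattened token list
    print(result['data']['ner'])   # NER tag per token ('ORGANIZATION', 'PERSON', 'TITLE', 'LOCATION', or 'O')
    print(result.get('IPO'))       # event-specific info when event == 'IPO'
else:
    print(resp.status_code)
    print(resp.text)
```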

Added capability to Dockerize:\
- Passwords in create_db.sql and config.ini should be changed so they correspond.
62 changes: 62 additions & 0 deletions use-cases/renci/renci_nlp_server/app.py
@@ -0,0 +1,62 @@
from ConfigParser import ConfigParser
from coref_rsl.coref_detect import coref_rsl
from event.ipo.ipo_detect import ipo_detect
from event.layoff.layoff_detect import layoff_detect
from flask import Flask, request, Response, json
from html_parser.html_parsers import parser, update_news_db, get_news_by_url
from ner.CoreNLP import nlp_process_article, update_ner_db, get_ner_by_id

app = Flask(__name__)


@app.route('/nlp', methods=['GET', 'POST'])
def get_html():
if request.method == 'POST':
content = request.form['html']
if content == '':
return Response('Received data is Empty.', 400)
# get db info from config.ini file
config = ConfigParser()
config.read('config.ini')
db_name, username, host, pwd = config.get('DATABASE', 'db_name'), config.get('DATABASE', 'username'), config.get('DATABASE', 'host'), config.get('DATABASE', 'password')
# Try to search if news is already in database
news_id = get_news_by_url(request.form['url'], db_name, username, pwd, host)
        # if not, use the parser to parse the html and store the result in the database
if news_id is None:
result = parser(content)
if result is None:
return Response('Received html is not compatible with our parsers.', 400)
result['url'] = request.form['url']
news_id = update_news_db(result, db_name, username, pwd, host)
result['news_id'] = news_id
ner_result = nlp_process_article(result)
update_ner_db(ner_result, db_name, username, pwd, host)
coref_rsl(news_id, db_name, username, pwd, host)
else:
ner_result = get_ner_by_id(news_id, db_name, username, pwd, host)
        # PROCESS EACH WORD IN A SENTENCE AND ITS NER
        # Flatten the nested list into one list
ners = [ner if ner in ['ORGANIZATION', 'PERSON', 'TITLE', 'LOCATION'] else 'O' \
for ner_tag in ner_result['ner_tag'] for ner in ner_tag]
word_l = [y for x in ner_result['word'] for y in x]
mydata = {"data": {'ner': ners, 'word': word_l}}

# Find event Type
if request.form['event'] == 'IPO':
ipo_detector = ipo_detect(news_id, db_name, username, pwd, host)
ipo_info = ipo_detector.detect_ipo()
mydata['IPO'] = ipo_info
elif request.form['event'] == 'Layoff':
layoff_detector = layoff_detect(news_id, db_name, username, pwd, host)
layoff_info = layoff_detector.detect_layoff()
mydata['Layoff'] = layoff_info
return Response(json.dumps(mydata), mimetype='application/json')

return 'Hello World!'


if __name__ == '__main__':
config = ConfigParser()
config.read('config.ini')
app.run(host=config.get('SEVER_SETTING', 'host'), port=config.getint('SEVER_SETTING', 'port'),
debug=config.getboolean('SEVER_SETTING', 'debug_mode'))
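
As a reading aid for the flattening step in get_html() above, the following sketch (hypothetical values, not from the repository) shows how the per-sentence word and ner_tag lists are collapsed into the flat lists returned in the response:

```python
# Hypothetical ner_result fragment: two tokenized sentences.
ner_result = {
    'word':    [['Acme', 'Corp', 'filed'], ['It', 'raised', '$5M']],
    'ner_tag': [['ORGANIZATION', 'ORGANIZATION', 'O'], ['O', 'O', 'MONEY']],
}

# Same comprehensions as in get_html(): tags outside the kept set collapse to 'O'.
ners = [ner if ner in ['ORGANIZATION', 'PERSON', 'TITLE', 'LOCATION'] else 'O'
        for ner_tag in ner_result['ner_tag'] for ner in ner_tag]
word_l = [y for x in ner_result['word'] for y in x]

# ners   -> ['ORGANIZATION', 'ORGANIZATION', 'O', 'O', 'O', 'O']
# word_l -> ['Acme', 'Corp', 'filed', 'It', 'raised', '$5M']
```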
9 changes: 9 additions & 0 deletions use-cases/renci/renci_nlp_server/config.ini
@@ -0,0 +1,9 @@
[DATABASE]
db_name = nlp
username = nlp_user
host = localhost
password = ThePassw0rd
[SEVER_SETTING]
host = 0.0.0.0
port = 5000
debug_mode = True
Empty file.
194 changes: 194 additions & 0 deletions use-cases/renci/renci_nlp_server/coref_rsl/coref_detect.py
@@ -0,0 +1,194 @@
import psycopg2
import psycopg2.extras
from entity_coref_rsl import CR
from utils.help_func import get_topic_company
import logging
sp = '~^~'


def convert_dep(dep_l):
deps = []
for dep in dep_l:
if '~^~' in dep:
(tag, head_index, dep_index) = dep.split('~^~')
deps.append((tag, int(head_index), int(dep_index)))
return deps


# Person Pipeline to Filter Stanford Coref
def person_pip(coref_sessions, article, ner):
person_prn = ['he', 'she', 'his', 'her', 'him', 'himself', 'herself', 'I', 'my', 'me', 'myself', 'we', 'our', 'us',
'ourselves', 'they', 'their', 'themselves']
for coref_id in coref_sessions:
coref_session = coref_sessions[coref_id]
person_count = 0
for corf in coref_session:
(sen_id, start_index, end_index) = [int(tmp) for tmp in corf.split('@')]
word = ' '.join(article[sen_id][start_index:end_index])
w_ner = ' '.join(ner[sen_id][start_index:end_index])
if word.lower() in person_prn or 'PERSON' in w_ner:
person_count += 1
if person_count != 0:
yield coref_id


# Organization Pipeline to Filter Stanford Coref
# Not used in this file now
def org_pip(coref_sessions, article, ner):
logging.info('In organization_pipeline')
org_prn = ['it', 'its']
# org_noun = ['company', 'firm', 'business', 'group']
for coref_id in coref_sessions:
coref_session = coref_sessions[coref_id]
org_count = 0
for corf in coref_session:
(sen_id, start_index, end_index) = [int(tmp) for tmp in corf.split('@')]
word = ' '.join(article[sen_id][start_index:end_index])
w_ner = ' '.join(ner[sen_id][start_index:end_index])
if 'ORGANIZATION' in w_ner and 'PERSON' not in w_ner:
org_count += 1
if word.lower() in org_prn:
org_count += 1
if word.lower().startswith('the'):
org_count += 1
if org_count != 0:
yield coref_id


# Validate if Coref Session is valid
def validate_coref(coref_session, article):
person_prn = ['he', 'she', 'her', 'him', 'himself', 'herself', 'I', 'me', 'myself', 'we', 'us', 'ourselves', 'they',
'themselves']
possessive_adj = ['his', 'her', 'my', 'our', 'their', 'its']
entity_prn = ['it']
    # Validate coref session using two rules: not all words are the same, not all words are pronouns
prn_count = len(coref_session)
referent_sum = set()
for corf in coref_session:
(sen_id, start_index, end_index) = [int(tmp) for tmp in corf.split('@')]
word = ' '.join(article[sen_id][start_index:end_index])
for adj in possessive_adj:
if word.lower().startswith(adj):
prn_count -= 1
if word.lower().startswith('the') or word.lower() in person_prn + entity_prn + possessive_adj:
prn_count -= 1
referent_sum.add(word.lower().strip(' \'s').strip(' \''))
if len(referent_sum) == 1:
return 0
elif prn_count == 0:
return 0.5
else:
return 1


# Get candidate referent index (sen_id, start_index, end_index);
# if strict = True, the referent cannot be a pronoun unless the coref_session only contains pronouns
def get_referent(coref_session, article, strict=False):
person_prn = ['he', 'she', 'her', 'him', 'himself', 'herself', 'I', 'me', 'myself', 'we', 'us', 'ourselves', 'they',
'themselves']
possessive_adj = ['his', 'her', 'my', 'our', 'their', 'its']
entity_prn = ['it']
ref_index = None
if strict:
# Using the first phrase which is not pronoun as default referent
for i in xrange(len(coref_session)):
(id, s_index, e_index) = [int(tmp) for tmp in coref_session[i].split('@')]
coref_word = ' '.join(article[id][s_index:e_index])
if coref_word not in person_prn + possessive_adj + entity_prn:
ref_index = (id, s_index, e_index)
break
if ref_index == None:
ref_index = [int(tmp) for tmp in coref_session[0].split('@')]
return ref_index

def load_data(cur, doc_id):
sql = '''
SELECT sentence_id,words,lemma,pos_tags,ner_tags,dependencies,parse_tree
FROM sentences WHERE document_id = %s ORDER BY sentence_offset;
'''
cur.execute(sql, (doc_id,))
sql_results = cur.fetchall()
ner_tags = [r['ner_tags'] for r in sql_results]
pos_tags = [r['pos_tags'] for r in sql_results]
lemmas = [r['lemma'] for r in sql_results]
words = [r['words'] for r in sql_results]
sentence_ids = [r['sentence_id'] for r in sql_results]
dependencies = [convert_dep(r['dependencies']) for r in sql_results]
parse_trees = [r['parse_tree'] for r in sql_results]
sql = '''SELECT news_title,news_time FROM raw_news WHERE news_id = %s;'''
cur.execute(sql, (doc_id,))
sql_result = cur.fetchone()
if sql_result is None:
news_time, news_title = None, None
else:
news_time = sql_result['news_time']
news_title = sql_result['news_title']
    # the commented code was used to get the co-reference result parsed from Stanford NLP, but it is not supported now.
# sql = '''SELECT coref_offset, coreferences FROM doc_coreference WHERE document_id = %s ORDER BY coref_offset;'''
# cur.execute(sql, (doc_id,))
# sql_results = cur.fetchall()
# corefs = {r['coref_offset']: r['coreferences'] for r in sql_results}
# nlp_info = {'sen_id':sentence_ids, 'word':words, 'lemma':lemmas, 'ner':ner_tags, 'pos':pos_tags,
# 'dependency':dependencies, 'parse_tree': parse_trees, 'cr':corefs}
nlp_info = {'sen_id': sentence_ids, 'word': words, 'lemma': lemmas, 'ner': ner_tags, 'pos': pos_tags,
'dependency': dependencies, 'parse_tree': parse_trees}
return nlp_info, news_time, news_title


def coref_rsl(doc_id, db_name, username, pwd, host):
con = None
try:
con = psycopg2.connect(database=db_name, user=username, password=pwd, host=host)
cur = con.cursor(cursor_factory=psycopg2.extras.DictCursor)
# Loading Coref_Info and NLP_Info
nlp_info, news_time, news_title = load_data(cur,doc_id)
# Get topic company from article
(topic_company, mentioned_comps, comp_pairs) = get_topic_company(nlp_info['word'], nlp_info['ner'], news_title,
all_comps=True, comp_pairs=True)
# Initialize our CR approach instance and get our CR process result
cr = CR(nlp_info, topic_company, doc_id)
cr_result = cr.entity_coref_rsl()

'''
# Code under comment aims at
# merging stanford CR result (nlp_info['coref']) with our naive CR result (cr_result)
# but this approach is still under development
merge_cr = []
for j in xrange(len(nlp_info['word'])):
merge_cr.append(['O'] * len(nlp_info['word']))
# Get coref_id for person and org pipeline individually
person_pipeline = person_pip(nlp_info['cr'], nlp_info['word'], nlp_info['ner'])
# org_pipeline = org_pip(corefs, words, ner_tags)
for coref_id in person_pipeline:
if validate_coref(nlp_info['cr'][coref_id], nlp_info['word']) > 0.5:
(p_sen_id, p_s_id, p_e_id) = get_referent(nlp_info['cr'][coref_id], nlp_info['word'], strict=True)
# check if referent have further referent in cr_result, if any, update referent
for r in search_tag(cr_result[p_sen_id][p_s_id:p_e_id], 'O', cr_result[p_sen_id][p_s_id:p_e_id]):
(p_sen_id, p_s_id, p_e_id) = [int(tmp) for tmp in r.split('@')]
# transfer index to word
person_referent = ' '.join(nlp_info['word'][p_sen_id][p_s_id:p_e_id])
for coref in nlp_info['cr'][coref_id]:
(sen_id, start_index, end_index) = [int(tmp) for tmp in coref.split('@')]
word = ' '.join(nlp_info['word'][sen_id][start_index:end_index])
# update person referent to entity map
if person_referent != ' '.join(nlp_info['word'][sen_id][start_index:end_index]):
if (end_index - start_index) <= 6:
for ii in xrange(start_index, end_index):
cr_result[sen_id][ii] = 'I-' + coref
cr_result[sen_id][start_index] = 'B-' + coref
'''
# update CR in database
for j in xrange(0, len(nlp_info['word'])):
value_sets = {"sen_id": nlp_info['sen_id'][j], "sen_coref": cr_result[j], "doc_id": doc_id, "sen_offset": j}
cur.execute('''INSERT INTO doc_coref(sentence_id, sen_coref, document_id, sentence_offset)
VALUES (%(sen_id)s,%(sen_coref)s,%(doc_id)s,%(sen_offset)s);''', value_sets)
logging.info('Document {} finished coref parser.'.format(doc_id))
con.commit()
except psycopg2.DatabaseError, e:
logging.info('Error %s' % e)
finally:
if con:
con.close()
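
As a small reading aid for convert_dep above: the sentences.dependencies column stores each dependency as a single string joined by the '~^~' separator, and the helper splits it back into (tag, head_index, dep_index) tuples. The sample values below are hypothetical; the import path assumes the script is run from the project root.

```python
# Hypothetical dependency strings as stored in sentences.dependencies.
from coref_rsl.coref_detect import convert_dep

encoded = ['nsubj~^~2~^~1', 'root~^~0~^~2', 'dobj~^~2~^~3']
print(convert_dep(encoded))
# -> [('nsubj', 2, 1), ('root', 0, 2), ('dobj', 2, 3)]
```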