Showing 73 changed files with 11,243 additions and 0 deletions.
@@ -0,0 +1,64 @@
FROM centos:centos6.7
MAINTAINER Lisa Stillwell <[email protected]>

ENV NLP_DIR /renci_nlp_server
WORKDIR $NLP_DIR

RUN yum -y update \
&& yum clean all \
&& yum -y install epel-release \
&& yum clean all \
&& yum -y install wget

# Install postgresql and setup database
RUN rpm -Uvh http://yum.postgresql.org/9.4/redhat/rhel-6-x86_64/pgdg-centos94-9.4-3.noarch.rpm \
&& yum -y install postgresql94 postgresql94-server postgresql94-contrib postgresql94-devel \
&& ln -s /usr/pgsql-9.4/bin/pg_config /usr/bin/pg_config \
&& service postgresql-9.4 initdb \
&& sed -i 's/^host[ \t]*all[ \t]*all[ \t]*127\.0\.0\.1\/32[ \t]*ident/host all all 127\.0\.0\.1\/32 password/' /var/lib/pgsql/9.4/data/pg_hba.conf

# Install Java 1.8
#RUN wget --no-cookies --no-check-certificate --header "Cookie: gpw_e24=http%3A%2F%2Fwww.oracle.com%2F \
# && oraclelicense=accept-securebackup-cookie" http://download.oracle.com/otn-pub/java/jdk/8u111-b14/jdk-8u111-linux-x64.tar.gz \
RUN curl -L -O -H "Cookie: oraclelicense=accept-securebackup-cookie" -k "https://edelivery.oracle.com/otn-pub/java/jdk/8u111-b14/jdk-8u111-linux-x64.tar.gz" \
&& tar -xzf jdk-8u111-linux-x64.tar.gz \
&& rm jdk-8u111-linux-x64.tar.gz \
&& cd jdk1.8.0_111/ \
&& alternatives --install /usr/bin/java java $NLP_DIR/jdk1.8.0_111/bin/java 1

# Install Python 2.7 and all prereqs
RUN yum -y install gcc zlib-devel unzip sqlite-devel openssl openssl-devel \
&& wget https://www.python.org/ftp/python/2.7.12/Python-2.7.12.tgz \
&& tar -xzvf Python-2.7.12.tgz \
&& rm Python-2.7.12.tgz \
&& cd Python-2.7.12 \
&& ./configure; make; make altinstall \
&& yum -y install python-pip \
&& pip install --upgrade pip \
&& pip install virtualenv \
&& virtualenv -p /usr/local/bin/python2.7 $NLP_DIR

# Download and install all NLP code
RUN wget http://nlp.stanford.edu/software/stanford-corenlp-full-2015-12-09.zip \
&& unzip stanford-corenlp-full-2015-12-09.zip \
&& rm stanford-corenlp-full-2015-12-09.zip \
&& wget https://github.com/brendano/stanford_corenlp_pywrapper/archive/master.zip \
&& unzip master.zip; rm master.zip \
&& wget https://github.com/lstillwe/renci_nlp_server/archive/master.zip \
&& unzip master.zip \
&& mv renci_nlp_server-master/* $NLP_DIR; rm master.zip; rm -rf renci_nlp_server-master \
&& mv stanford-corenlp-full-2015-12-09 stanford-corenlp \
&& cp stanford-corenlp/stanford-corenlp-full-2015-12-09/*.jar stanford-corenlp \
&& mv stanford_corenlp_pywrapper-master stanford_corenlp_pywrapper \
&& source ./bin/activate \
&& cd stanford_corenlp_pywrapper \
&& pip install . \
&& cd .. \
&& pip install -r requirements.txt \
&& pip install requests \
&& python -m nltk.downloader -d /usr/local/share/nltk_data wordnet

ENTRYPOINT ["/bin/bash", "start.sh"]
# for testing:
# CMD ["/bin/bash"]
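With this Dockerfile in place, a typical build-and-run sequence would look like the following sketch (the renci_nlp_server image tag is illustrative; port 5000 matches the server port in config.ini):

docker build -t renci_nlp_server .
docker run -d -p 5000:5000 renci_nlp_server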
@@ -0,0 +1,42 @@
Basic required environment:
- Python 2.7 installed
- pip installed
- virtualenv installed
- postgresql 9.4 installed

Steps to Deploy the Server:
Environment setting (see the shell sketch after this list)
- Use virtualenv to create a wrapped virtual env and activate it
- cd to the directory where requirements.txt is located.
- run: pip install -r requirements.txt in your shell to install the required libraries.
- go to https://github.com/brendano/stanford_corenlp_pywrapper and follow the instructions to install stanford_corenlp_pywrapper as well
- copy the files into the virtualenv wrapper folder; the file structure should look similar to the following
  - virtualenv_wrapper
    - bin (virtualenv)
    - include (virtualenv)
    - lib (virtualenv)
    - app.py (server main function here)
    - config.ini (configuration for database and server settings)
    - coref_rsl (co-reference resolution code here)
    - event (event detection code here)
    - html_parser (html parser code here)
    - ner (nlp and ner improvement code here)
    - stanford-corenlp (stanford core nlp .jar files here)
    - utils (some commonly used functions and files here)
    - setup_db.sql (just used for db init)
    - test_files (not required for the server, just for testing)
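A minimal shell sketch of the environment steps above (the nlp_env directory name is illustrative; run it from the directory that contains requirements.txt):

virtualenv nlp_env
source nlp_env/bin/activate
pip install -r requirements.txt
# install the CoreNLP python wrapper; pip install . mirrors what the Dockerfile does
git clone https://github.com/brendano/stanford_corenlp_pywrapper.git
cd stanford_corenlp_pywrapper && pip install . && cd ..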

Database setup (see the sketch below)
- Create a PostgreSQL database and update the database info in config.ini
- import db_setup.sql to create the schema of the database
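One possible sequence, assuming the nlp database, nlp_user role, and password from the sample config.ini (the schema file name follows this README and may differ in the repo):

sudo -u postgres psql -c "CREATE USER nlp_user WITH PASSWORD 'ThePassw0rd';"
sudo -u postgres psql -c "CREATE DATABASE nlp OWNER nlp_user;"
psql -h localhost -U nlp_user -d nlp -f db_setup.sql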

Start the server (see the sketch below)
- run: python app.py (make sure you activate the virtualenv before trying to start the server)
- server settings can be changed in config.ini, including debug mode, host and port
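For example, from the directory that holds app.py (assuming the virtualenv was created in that same directory, as the Dockerfile does):

source bin/activate
python app.py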

Other things to notice:
- The '/test_files' directory just contains some simple tests to validate that the server works properly.
- If you want to use the test files, you need to install the 'requests' library first. Run: pip install requests in your shell.
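A minimal test client in the spirit of the test files (the file name and URL values are illustrative; html, url, and event are the form fields app.py reads):

import requests

with open('sample_news.html') as f:  # hypothetical input file
    html = f.read()

resp = requests.post('http://localhost:5000/nlp',
                     data={'html': html,
                           'url': 'http://example.com/news/ipo-story',  # illustrative
                           'event': 'IPO'})  # or 'Layoff'
print resp.json()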

Added capability to Dockerize
- Should change passwords in create_db.sql and config.ini to correspond
@@ -0,0 +1,62 @@
from ConfigParser import ConfigParser
from coref_rsl.coref_detect import coref_rsl
from event.ipo.ipo_detect import ipo_detect
from event.layoff.layoff_detect import layoff_detect
from flask import Flask, request, Response, json
from html_parser.html_parsers import parser, update_news_db, get_news_by_url
from ner.CoreNLP import nlp_process_article, update_ner_db, get_ner_by_id

app = Flask(__name__)


@app.route('/nlp', methods=['GET', 'POST'])
def get_html():
    if request.method == 'POST':
        content = request.form['html']
        if content == '':
            return Response('Received data is Empty.', 400)
        # get db info from the config.ini file
        config = ConfigParser()
        config.read('config.ini')
        db_name, username, host, pwd = config.get('DATABASE', 'db_name'), config.get('DATABASE', 'username'), config.get('DATABASE', 'host'), config.get('DATABASE', 'password')
        # check whether the news article is already in the database
        news_id = get_news_by_url(request.form['url'], db_name, username, pwd, host)
        # if not, use the parser to parse the html and store the result in the database
        if news_id is None:
            result = parser(content)
            if result is None:
                return Response('Received html is not compatible with our parsers.', 400)
            result['url'] = request.form['url']
            news_id = update_news_db(result, db_name, username, pwd, host)
            result['news_id'] = news_id
            ner_result = nlp_process_article(result)
            update_ner_db(ner_result, db_name, username, pwd, host)
            coref_rsl(news_id, db_name, username, pwd, host)
        else:
            ner_result = get_ner_by_id(news_id, db_name, username, pwd, host)
        # process each word in a sentence and its NER tag,
        # flattening the nested lists into single lists
        ners = [ner if ner in ['ORGANIZATION', 'PERSON', 'TITLE', 'LOCATION'] else 'O'
                for ner_tag in ner_result['ner_tag'] for ner in ner_tag]
        word_l = [y for x in ner_result['word'] for y in x]
        mydata = {"data": {'ner': ners, 'word': word_l}}

        # find the event type
        if request.form['event'] == 'IPO':
            ipo_detector = ipo_detect(news_id, db_name, username, pwd, host)
            ipo_info = ipo_detector.detect_ipo()
            mydata['IPO'] = ipo_info
        elif request.form['event'] == 'Layoff':
            layoff_detector = layoff_detect(news_id, db_name, username, pwd, host)
            layoff_info = layoff_detector.detect_layoff()
            mydata['Layoff'] = layoff_info
        return Response(json.dumps(mydata), mimetype='application/json')

    return 'Hello World!'


if __name__ == '__main__':
    config = ConfigParser()
    config.read('config.ini')
    # note: 'SEVER_SETTING' matches the section name spelled that way in config.ini
    app.run(host=config.get('SEVER_SETTING', 'host'), port=config.getint('SEVER_SETTING', 'port'),
            debug=config.getboolean('SEVER_SETTING', 'debug_mode'))
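For reference, the endpoint can also be exercised with a form-encoded POST from the command line (file name and URL are illustrative); the JSON response follows the mydata structure built above:

curl http://localhost:5000/nlp \
  --data-urlencode "html@article.html" \
  --data-urlencode "url=http://example.com/news/ipo-story" \
  --data-urlencode "event=IPO"
# response shape (values illustrative):
# {"data": {"ner": ["ORGANIZATION", "O", ...], "word": ["Acme", "filed", ...]}, "IPO": ...}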
@@ -0,0 +1,9 @@
[DATABASE]
db_name = nlp
username = nlp_user
host = localhost
password = ThePassw0rd
[SEVER_SETTING]
host = 0.0.0.0
port = 5000
debug_mode = True
use-cases/renci/renci_nlp_server/coref_rsl/coref_detect.py (194 additions, 0 deletions)
@@ -0,0 +1,194 @@
import psycopg2
import psycopg2.extras
from entity_coref_rsl import CR
from utils.help_func import get_topic_company
import logging

# separator used in the dependency strings stored in the database
sp = '~^~'


def convert_dep(dep_l):
    deps = []
    for dep in dep_l:
        if '~^~' in dep:
            (tag, head_index, dep_index) = dep.split('~^~')
            deps.append((tag, int(head_index), int(dep_index)))
    return deps


# Person pipeline to filter Stanford coref
def person_pip(coref_sessions, article, ner):
    person_prn = ['he', 'she', 'his', 'her', 'him', 'himself', 'herself', 'I', 'my', 'me', 'myself', 'we', 'our', 'us',
                  'ourselves', 'they', 'their', 'themselves']
    for coref_id in coref_sessions:
        coref_session = coref_sessions[coref_id]
        person_count = 0
        for corf in coref_session:
            (sen_id, start_index, end_index) = [int(tmp) for tmp in corf.split('@')]
            word = ' '.join(article[sen_id][start_index:end_index])
            w_ner = ' '.join(ner[sen_id][start_index:end_index])
            if word.lower() in person_prn or 'PERSON' in w_ner:
                person_count += 1
        if person_count != 0:
            yield coref_id


# Organization pipeline to filter Stanford coref
# Not used in this file now
def org_pip(coref_sessions, article, ner):
    logging.info('In organization_pipeline')
    org_prn = ['it', 'its']
    # org_noun = ['company', 'firm', 'business', 'group']
    for coref_id in coref_sessions:
        coref_session = coref_sessions[coref_id]
        org_count = 0
        for corf in coref_session:
            (sen_id, start_index, end_index) = [int(tmp) for tmp in corf.split('@')]
            word = ' '.join(article[sen_id][start_index:end_index])
            w_ner = ' '.join(ner[sen_id][start_index:end_index])
            if 'ORGANIZATION' in w_ner and 'PERSON' not in w_ner:
                org_count += 1
            if word.lower() in org_prn:
                org_count += 1
            if word.lower().startswith('the'):
                org_count += 1
        if org_count != 0:
            yield coref_id


# Validate whether a coref session is valid
def validate_coref(coref_session, article):
    person_prn = ['he', 'she', 'her', 'him', 'himself', 'herself', 'I', 'me', 'myself', 'we', 'us', 'ourselves', 'they',
                  'themselves']
    possessive_adj = ['his', 'her', 'my', 'our', 'their', 'its']
    entity_prn = ['it']
    # Validate the coref session using two rules: not all words are the same, not all words are pronouns
    prn_count = len(coref_session)
    referent_sum = set()
    for corf in coref_session:
        (sen_id, start_index, end_index) = [int(tmp) for tmp in corf.split('@')]
        word = ' '.join(article[sen_id][start_index:end_index])
        for adj in possessive_adj:
            if word.lower().startswith(adj):
                prn_count -= 1
        if word.lower().startswith('the') or word.lower() in person_prn + entity_prn + possessive_adj:
            prn_count -= 1
        referent_sum.add(word.lower().strip(' \'s').strip(' \''))
    if len(referent_sum) == 1:
        return 0
    elif prn_count == 0:
        return 0.5
    else:
        return 1


# Get a candidate referent index (sen_id, start_index, end_index);
# if strict = True, the referent cannot be a pronoun unless the coref_session only contains pronouns
def get_referent(coref_session, article, strict=False):
    person_prn = ['he', 'she', 'her', 'him', 'himself', 'herself', 'I', 'me', 'myself', 'we', 'us', 'ourselves', 'they',
                  'themselves']
    possessive_adj = ['his', 'her', 'my', 'our', 'their', 'its']
    entity_prn = ['it']
    ref_index = None
    if strict:
        # Use the first phrase which is not a pronoun as the default referent
        for i in xrange(len(coref_session)):
            (id, s_index, e_index) = [int(tmp) for tmp in coref_session[i].split('@')]
            coref_word = ' '.join(article[id][s_index:e_index])
            if coref_word not in person_prn + possessive_adj + entity_prn:
                ref_index = (id, s_index, e_index)
                break
    if ref_index is None:
        ref_index = [int(tmp) for tmp in coref_session[0].split('@')]
    return ref_index


def load_data(cur, doc_id):
    sql = '''
        SELECT sentence_id,words,lemma,pos_tags,ner_tags,dependencies,parse_tree
        FROM sentences WHERE document_id = %s ORDER BY sentence_offset;
        '''
    cur.execute(sql, (doc_id,))
    sql_results = cur.fetchall()
    ner_tags = [r['ner_tags'] for r in sql_results]
    pos_tags = [r['pos_tags'] for r in sql_results]
    lemmas = [r['lemma'] for r in sql_results]
    words = [r['words'] for r in sql_results]
    sentence_ids = [r['sentence_id'] for r in sql_results]
    dependencies = [convert_dep(r['dependencies']) for r in sql_results]
    parse_trees = [r['parse_tree'] for r in sql_results]
    sql = '''SELECT news_title,news_time FROM raw_news WHERE news_id = %s;'''
    cur.execute(sql, (doc_id,))
    sql_result = cur.fetchone()
    if sql_result is None:
        news_time, news_title = None, None
    else:
        news_time = sql_result['news_time']
        news_title = sql_result['news_title']
    # the commented code used to get the co-reference result parsed from Stanford NLP, but it is not supported now
    # sql = '''SELECT coref_offset, coreferences FROM doc_coreference WHERE document_id = %s ORDER BY coref_offset;'''
    # cur.execute(sql, (doc_id,))
    # sql_results = cur.fetchall()
    # corefs = {r['coref_offset']: r['coreferences'] for r in sql_results}
    # nlp_info = {'sen_id':sentence_ids, 'word':words, 'lemma':lemmas, 'ner':ner_tags, 'pos':pos_tags,
    #             'dependency':dependencies, 'parse_tree': parse_trees, 'cr':corefs}
    nlp_info = {'sen_id': sentence_ids, 'word': words, 'lemma': lemmas, 'ner': ner_tags, 'pos': pos_tags,
                'dependency': dependencies, 'parse_tree': parse_trees}
    return nlp_info, news_time, news_title


def coref_rsl(doc_id, db_name, username, pwd, host):
    con = None
    try:
        con = psycopg2.connect(database=db_name, user=username, password=pwd, host=host)
        cur = con.cursor(cursor_factory=psycopg2.extras.DictCursor)
        # Load coref info and NLP info
        nlp_info, news_time, news_title = load_data(cur, doc_id)
        # Get the topic company from the article
        (topic_company, mentioned_comps, comp_pairs) = get_topic_company(nlp_info['word'], nlp_info['ner'], news_title,
                                                                         all_comps=True, comp_pairs=True)
        # Initialize our CR approach instance and get our CR process result
        cr = CR(nlp_info, topic_company, doc_id)
        cr_result = cr.entity_coref_rsl()

        '''
        # The code under this comment aims at merging the Stanford CR result (nlp_info['coref'])
        # with our naive CR result (cr_result), but this approach is still under development
        merge_cr = []
        for j in xrange(len(nlp_info['word'])):
            merge_cr.append(['O'] * len(nlp_info['word']))
        # Get coref_id for the person and org pipelines individually
        person_pipeline = person_pip(nlp_info['cr'], nlp_info['word'], nlp_info['ner'])
        # org_pipeline = org_pip(corefs, words, ner_tags)
        for coref_id in person_pipeline:
            if validate_coref(nlp_info['cr'][coref_id], nlp_info['word']) > 0.5:
                (p_sen_id, p_s_id, p_e_id) = get_referent(nlp_info['cr'][coref_id], nlp_info['word'], strict=True)
                # check if the referent has a further referent in cr_result; if so, update the referent
                for r in search_tag(cr_result[p_sen_id][p_s_id:p_e_id], 'O', cr_result[p_sen_id][p_s_id:p_e_id]):
                    (p_sen_id, p_s_id, p_e_id) = [int(tmp) for tmp in r.split('@')]
                # transfer index to word
                person_referent = ' '.join(nlp_info['word'][p_sen_id][p_s_id:p_e_id])
                for coref in nlp_info['cr'][coref_id]:
                    (sen_id, start_index, end_index) = [int(tmp) for tmp in coref.split('@')]
                    word = ' '.join(nlp_info['word'][sen_id][start_index:end_index])
                    # update person referent to entity map
                    if person_referent != ' '.join(nlp_info['word'][sen_id][start_index:end_index]):
                        if (end_index - start_index) <= 6:
                            for ii in xrange(start_index, end_index):
                                cr_result[sen_id][ii] = 'I-' + coref
                            cr_result[sen_id][start_index] = 'B-' + coref
        '''
        # update CR in the database
        for j in xrange(0, len(nlp_info['word'])):
            value_sets = {"sen_id": nlp_info['sen_id'][j], "sen_coref": cr_result[j], "doc_id": doc_id, "sen_offset": j}
            cur.execute('''INSERT INTO doc_coref(sentence_id, sen_coref, document_id, sentence_offset)
                           VALUES (%(sen_id)s,%(sen_coref)s,%(doc_id)s,%(sen_offset)s);''', value_sets)
        logging.info('Document {} finished coref parser.'.format(doc_id))
        con.commit()
    except psycopg2.DatabaseError, e:
        logging.info('Error %s' % e)
    finally:
        if con:
            con.close()
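For orientation, the string formats this module consumes can be sketched as follows (values are illustrative): dependencies are stored as 'tag~^~head_index~^~dep_index' strings, and coref mentions as 'sen_id@start_index@end_index' strings.

# illustrative values only
deps = convert_dep(['nsubj~^~2~^~1', 'root~^~0~^~2', 'dobj~^~2~^~3'])
# -> [('nsubj', 2, 1), ('root', 0, 2), ('dobj', 2, 3)]
mention = '4@0@2'  # sentence 4, tokens 0 and 1
sen_id, start_index, end_index = [int(tmp) for tmp in mention.split('@')]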