diff --git a/NLP/src/app.py b/NLP/src/app.py
index a5d09cc..b3d7316 100644
--- a/NLP/src/app.py
+++ b/NLP/src/app.py
@@ -3,13 +3,22 @@
 # flask REST API code
 #===========================
 from flask import Flask
-from flask_cors import CORS, cross_origin
+from flask_cors import CORS
 from flask import request
 from flask import json
 from flask import Response
-import csv
+import sentry_sdk
+from sentry_sdk.integrations.flask import FlaskIntegration
 
-from utils import keyword_extractor
+from body import *
+from getTagList import *
+from nounExtractor import *
+from preprocessing import *
+
+sentry_sdk.init(
+    dsn="https://3748dcc21dcc4661b7157e902f07998b@sentry.io/1810010",
+    integrations=[FlaskIntegration()]
+)
 
 app = Flask(__name__)
 cors = CORS(app)
@@ -23,24 +32,23 @@ def get():
 
     '''
     arrayList = {
-        title : ""r
+        title : ""
         url : ""
-        memo : ["", "", ...]
-        highlight : ["", "", ...]
     }
     '''
 
     result = {}
-    result["title"] = data["title"]
-    result["url"] = data["url"]
-    #result["keywords"], result["result"] = keyword_extractor(data["title"], "")
+    title = data["title"]
+    url = data["url"]
+
+    title_nouns, body_nouns = nouns_extractor(title, url, nlpapi_num=3)
+    TF = TF_score(title_nouns, body_nouns)
+    tags = Total_score(TF, seletAllTagsFromDB(), alpha=1, beta=1, test=False)
 
-    res = json.dumps(result, ensure_ascii=False).encode('utf8')
+    result["version"] = "1.0"
+    result["tags"] = tags
 
-    with open('./data.csv', 'a', encoding='utf-8') as csvfile:
-        fieldnames = ['title', 'url', 'keywords', 'result']
-        wr = csv.DictWriter(csvfile, fieldnames=fieldnames)
-        wr.writeheader()
-        #wr.writerow(result)
+    res = json.dumps(result, ensure_ascii=False).encode('utf8')
+    print(res)
 
     return Response(res, content_type='application/json; charset=utf-8')
diff --git a/NLP/src/body.py b/NLP/src/body.py
index a539a6c..4b142c6 100644
--- a/NLP/src/body.py
+++ b/NLP/src/body.py
@@ -15,6 +15,7 @@ def spider(url):
         return ''.join(data)
     except Exception as ex:
         print("::CRAWLING ERROR::\n",ex)
+        return ""
 
 '''
 urls = []
diff --git a/NLP/src/getTagList.py b/NLP/src/getTagList.py
index 4ec7dec..de8782e 100644
--- a/NLP/src/getTagList.py
+++ b/NLP/src/getTagList.py
@@ -8,12 +8,9 @@ def TF_score(title, body):
 
     words = {}
 
-    try:
-        for w, c in Counter(body).most_common(1000):
-            if notNoStopWords(w):
-                words[w] = c
-    except Exception as e:
-        print("::body nouns ERROR::\n", e)
+    for w, c in Counter(body).most_common(1000):
+        if notNoStopWords(w):
+            words[w] = c
 
     for w, c in Counter(title).most_common(10):
         if notNoStopWords(w):
diff --git a/NLP/src/nounExtractor.py b/NLP/src/nounExtractor.py
index e1f2857..ebf908c 100644
--- a/NLP/src/nounExtractor.py
+++ b/NLP/src/nounExtractor.py
@@ -16,7 +16,7 @@ def getEngNouns(sen):
 def nlp_apis(api_num, sen):
     sen = clean_sentence(sen)
 
-    api_dict = {1:Kkma(), 2:Twitter(), 3:Hannanum(), 4:Komoran(), 5:Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')}
+    api_dict = {1:Kkma(), 2:Twitter(), 3:Hannanum(), 4:Komoran(), 5:Mecab()}
 
     module = api_dict[api_num]
 
@@ -26,10 +26,6 @@ def nlp_apis(api_num, sen):
 def nouns_extractor(title, url, nlpapi_num=5):
     title_nouns, _ = nlp_apis(nlpapi_num, title)
 
-    try:
-        body_nouns, _ = nlp_apis(nlpapi_num, spider(url))
-
-    except Exception as e:
-        print("::body nouns ERROR::\n", e)
+    body_nouns, _ = nlp_apis(nlpapi_num, spider(url))
 
     return title_nouns, body_nouns
\ No newline at end of file
diff --git a/NLP/src/receiveMessageFromSQS.py b/NLP/src/receiveMessageFromSQS.py
index 93ee23e..2bdc825 100644
--- a/NLP/src/receiveMessageFromSQS.py
+++ b/NLP/src/receiveMessageFromSQS.py
@@ -1,30 +1,58 @@
 import boto3
+from boto3.dynamodb.conditions import Key, Attr
+from ast import literal_eval
+
+from body import *
+from getTagList import *
+from nounExtractor import *
+from preprocessing import *
 
 # Create SQS client
 sqs = boto3.client('sqs', region_name='ap-northeast-2')
 
 queue_url = 'https://sqs.ap-northeast-2.amazonaws.com/616448378550/newQueue'
 
+dynamodb = boto3.resource('dynamodb', region_name='ap-northeast-2')
+table = dynamodb.Table('UrlTag')
+
+while 1:
+    # Receive message from SQS queue
+    response = sqs.receive_message(
+        QueueUrl=queue_url,
+        AttributeNames=[
+            'SentTimestamp'
+        ],
+        MaxNumberOfMessages=1,
+        MessageAttributeNames=[
+            'All'
+        ],
+        VisibilityTimeout=0,
+        WaitTimeSeconds=0
+    )
+
+    try:
+        message = response['Messages'][0]
+        receipt_handle = message['ReceiptHandle']
+        body = literal_eval(message["Body"])
+
+        url = body["url"]
+        title = body["title"]
+
+        title_nouns, body_nouns = nouns_extractor(title, url)
+        TF = TF_score(title_nouns, body_nouns)
+        tags = Total_score(TF, seletAllTagsFromDB(), alpha=1, beta=1, test=False)
+
+        response = table.put_item(
+            Item={
+                'url': url,
+                'tags': tags
+            }
+        )
-# Receive message from SQS queue
-response = sqs.receive_message(
-    QueueUrl=queue_url,
-    AttributeNames=[
-        'SentTimestamp'
-    ],
-    MaxNumberOfMessages=1,
-    MessageAttributeNames=[
-        'All'
-    ],
-    VisibilityTimeout=0,
-    WaitTimeSeconds=0
-)
-
-message = response['Messages'][0]
-receipt_handle = message['ReceiptHandle']
-
-# Delete received message from queue
-sqs.delete_message(
-    QueueUrl=queue_url,
-    ReceiptHandle=receipt_handle
-)
-print('Received and deleted message: %s' % message)
\ No newline at end of file
+        # Delete received message from queue
+        sqs.delete_message(
+            QueueUrl=queue_url,
+            ReceiptHandle=receipt_handle
+        )
+        print('Received and deleted message: %s' % body["url"])
+    except:
+        pass
diff --git a/NLP/src/requirements.txt b/NLP/src/requirements.txt
new file mode 100644
index 0000000..eb85e3c
--- /dev/null
+++ b/NLP/src/requirements.txt
@@ -0,0 +1,27 @@
+beautifulsoup4==4.8.1
+blinker==1.4
+boto3==1.10.14
+botocore==1.13.14
+bs4==0.0.1
+certifi==2019.9.11
+Click==7.0
+docutils==0.15.2
+Flask==1.1.1
+Flask-Cors==3.0.8
+itsdangerous==1.1.0
+Jinja2==2.10.3
+jmespath==0.9.4
+JPype1==0.7.0
+konlpy==0.5.1
+lxml==4.4.1
+MarkupSafe==1.1.1
+mecab-python===0.996-ko-0.9.2
+nltk==3.4.5
+python-dateutil==2.8.0
+s3transfer==0.2.1
+sentry-sdk==0.13.2
+six==1.13.0
+soupsieve==1.9.5
+textblob==0.15.3
+urllib3==1.25.6
+Werkzeug==0.16.0
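
For context, a minimal client sketch for the reworked app.py handler. The route decorator and serving port sit outside the hunks above, so API_URL is an assumption, not something the diff confirms; the handler itself reads a JSON body carrying "title" and "url" and now responds with a version string plus the computed tags.

# client_sketch.py -- illustrative only; route and port are assumed.
import json
import requests

API_URL = "http://localhost:5000/"  # assumed endpoint for app.py's get() handler

payload = {"title": "Example page title", "url": "https://example.com"}
resp = requests.post(API_URL, json=payload)

# Expected shape per the new handler: {"version": "1.0", "tags": [...]}
print(json.dumps(resp.json(), ensure_ascii=False, indent=2))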
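
Likewise, a hedged sketch of the asynchronous path through receiveMessageFromSQS.py: enqueue a message whose body ast.literal_eval can parse, then read the worker's output back from the UrlTag DynamoDB table. The queue URL, region, and table name are taken from the diff; the rest is illustrative.

# worker_roundtrip_sketch.py -- exercises the SQS worker shown above.
import boto3

QUEUE_URL = "https://sqs.ap-northeast-2.amazonaws.com/616448378550/newQueue"

sqs = boto3.client("sqs", region_name="ap-northeast-2")
dynamodb = boto3.resource("dynamodb", region_name="ap-northeast-2")
table = dynamodb.Table("UrlTag")

# The worker parses Body with literal_eval, so a Python dict repr
# (not necessarily JSON) is an acceptable message body.
payload = {"title": "Example page title", "url": "https://example.com"}
sqs.send_message(QueueUrl=QUEUE_URL, MessageBody=str(payload))

# Once the worker's put_item has run, the tags are retrievable by url.
item = table.get_item(Key={"url": payload["url"]}).get("Item")
print(item)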