From fdf7c67fc2504f71ba4a5ab911ba96e243a5da3a Mon Sep 17 00:00:00 2001 From: Chae Eun Lee Date: Tue, 12 Nov 2019 19:01:31 +0900 Subject: [PATCH 1/5] =?UTF-8?q?sqs=20receive=20message=20=EC=B6=94?= =?UTF-8?q?=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- NLP/src/receiveMessageFromSQS.py | 60 +++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 21 deletions(-) diff --git a/NLP/src/receiveMessageFromSQS.py b/NLP/src/receiveMessageFromSQS.py index 93ee23e..52fe8c7 100644 --- a/NLP/src/receiveMessageFromSQS.py +++ b/NLP/src/receiveMessageFromSQS.py @@ -1,30 +1,48 @@ import boto3 +from boto3.dynamodb.conditions import Key, Attr +from ast import literal_eval # Create SQS client sqs = boto3.client('sqs', region_name='ap-northeast-2') queue_url = 'https://sqs.ap-northeast-2.amazonaws.com/616448378550/newQueue' +dynamodb = boto3.resource('dynamodb', region_name='ap-northeast-2') +table = dynamodb.Table('UrlTag') -# Receive message from SQS queue -response = sqs.receive_message( - QueueUrl=queue_url, - AttributeNames=[ - 'SentTimestamp' - ], - MaxNumberOfMessages=1, - MessageAttributeNames=[ - 'All' - ], - VisibilityTimeout=0, - WaitTimeSeconds=0 -) +while 1: + # Receive message from SQS queue + response = sqs.receive_message( + QueueUrl=queue_url, + AttributeNames=[ + 'SentTimestamp' + ], + MaxNumberOfMessages=1, + MessageAttributeNames=[ + 'All' + ], + VisibilityTimeout=0, + WaitTimeSeconds=0 + ) + + try: + message = response['Messages'][0] + receipt_handle = message['ReceiptHandle'] + body = literal_eval(message["Body"]) -message = response['Messages'][0] -receipt_handle = message['ReceiptHandle'] + print(body["url"]) -# Delete received message from queue -sqs.delete_message( - QueueUrl=queue_url, - ReceiptHandle=receipt_handle -) -print('Received and deleted message: %s' % message) \ No newline at end of file + response = table.put_item( + Item={ + 'url': body["url"], + 'tags': ["sqstest"] + } + ) + + # Delete received message from queue + sqs.delete_message( + QueueUrl=queue_url, + ReceiptHandle=receipt_handle + ) + print('Received and deleted message: %s' % body["url"]) + except: + pass From 0a5d43bf032798c2f3be459cdb479c0be623721d Mon Sep 17 00:00:00 2001 From: Chae Eun Lee Date: Thu, 7 Nov 2019 01:11:59 +0900 Subject: [PATCH 2/5] =?UTF-8?q?sentry=20=EC=B6=94=EA=B0=80,=20=ED=95=A8?= =?UTF-8?q?=EC=88=98=20=EB=B3=80=EA=B2=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry picked from commit 7d7b20f5d55df7a371dd7e05bdde53dd1cbd514c) --- NLP/src/app.py | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/NLP/src/app.py b/NLP/src/app.py index a5d09cc..b3d7316 100644 --- a/NLP/src/app.py +++ b/NLP/src/app.py @@ -3,13 +3,22 @@ # flask REST API code #=========================== from flask import Flask -from flask_cors import CORS, cross_origin +from flask_cors import CORS from flask import request from flask import json from flask import Response -import csv +import sentry_sdk +from sentry_sdk.integrations.flask import FlaskIntegration -from utils import keyword_extractor +from body import * +from getTagList import * +from nounExtractor import * +from preprocessing import * + +sentry_sdk.init( + dsn="https://3748dcc21dcc4661b7157e902f07998b@sentry.io/1810010", + integrations=[FlaskIntegration()] +) app = Flask(__name__) cors = CORS(app) @@ -23,24 +32,23 @@ def get(): ''' arrayList = { - title : ""r + title : "" url : "" - memo : ["", "", ...] - highlight : ["", "", ...] } ''' result = {} - result["title"] = data["title"] - result["url"] = data["url"] - #result["keywords"], result["result"] = keyword_extractor(data["title"], "") + title = data["title"] + url = data["url"] + + title_nouns, body_nouns = nouns_extractor(title, url, nlpapi_num=3) + TF = TF_score(title_nouns, body_nouns) + tags = Total_score(TF, seletAllTagsFromDB(), alpha=1, beta=1, test=False) - res = json.dumps(result, ensure_ascii=False).encode('utf8') + result["version"] = "1.0" + result["tags"] = tags - with open('./data.csv', 'a', encoding='utf-8') as csvfile: - fieldnames = ['title', 'url', 'keywords', 'result'] - wr = csv.DictWriter(csvfile, fieldnames=fieldnames) - wr.writeheader() - #wr.writerow(result) + res = json.dumps(result, ensure_ascii=False).encode('utf8') + print(res) return Response(res, content_type='application/json; charset=utf-8') From 2f953d566506f070c8c74a1afe9d3e35ab20bc27 Mon Sep 17 00:00:00 2001 From: Chae Eun Lee Date: Thu, 7 Nov 2019 01:12:27 +0900 Subject: [PATCH 3/5] =?UTF-8?q?=EC=98=88=EC=99=B8=EC=B2=98=EB=A6=AC=20?= =?UTF-8?q?=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry picked from commit bb566f2d6df29ea81551902e6c0625b0287eebe9) --- NLP/src/body.py | 1 + NLP/src/getTagList.py | 9 +++------ NLP/src/nounExtractor.py | 8 ++------ 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/NLP/src/body.py b/NLP/src/body.py index a539a6c..4b142c6 100644 --- a/NLP/src/body.py +++ b/NLP/src/body.py @@ -15,6 +15,7 @@ def spider(url): return ''.join(data) except Exception as ex: print("::CRAWLING ERROR::\n",ex) + return "" ''' urls = [] diff --git a/NLP/src/getTagList.py b/NLP/src/getTagList.py index 4ec7dec..de8782e 100644 --- a/NLP/src/getTagList.py +++ b/NLP/src/getTagList.py @@ -8,12 +8,9 @@ def TF_score(title, body): words = {} - try: - for w, c in Counter(body).most_common(1000): - if notNoStopWords(w): - words[w] = c - except Exception as e: - print("::body nouns ERROR::\n", e) + for w, c in Counter(body).most_common(1000): + if notNoStopWords(w): + words[w] = c for w, c in Counter(title).most_common(10): if notNoStopWords(w): diff --git a/NLP/src/nounExtractor.py b/NLP/src/nounExtractor.py index e1f2857..ebf908c 100644 --- a/NLP/src/nounExtractor.py +++ b/NLP/src/nounExtractor.py @@ -16,7 +16,7 @@ def getEngNouns(sen): def nlp_apis(api_num, sen): sen = clean_sentence(sen) - api_dict = {1:Kkma(), 2:Twitter(), 3:Hannanum(), 4:Komoran(), 5:Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')} + api_dict = {1:Kkma(), 2:Twitter(), 3:Hannanum(), 4:Komoran(), 5:Mecab()} module = api_dict[api_num] @@ -26,10 +26,6 @@ def nlp_apis(api_num, sen): def nouns_extractor(title, url, nlpapi_num=5): title_nouns, _ = nlp_apis(nlpapi_num, title) - try: - body_nouns, _ = nlp_apis(nlpapi_num, spider(url)) - - except Exception as e: - print("::body nouns ERROR::\n", e) + body_nouns, _ = nlp_apis(nlpapi_num, spider(url)) return title_nouns, body_nouns \ No newline at end of file From a51372cae09d71aac50be188fa77a34c87bb935f Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 9 Nov 2019 15:03:04 +0000 Subject: [PATCH 4/5] add requirements.txt (cherry picked from commit 6cb084bc0fcd8abeef298526e6ce99f883668373) --- NLP/src/requirements.txt | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 NLP/src/requirements.txt diff --git a/NLP/src/requirements.txt b/NLP/src/requirements.txt new file mode 100644 index 0000000..eb85e3c --- /dev/null +++ b/NLP/src/requirements.txt @@ -0,0 +1,27 @@ +beautifulsoup4==4.8.1 +blinker==1.4 +boto3==1.10.14 +botocore==1.13.14 +bs4==0.0.1 +certifi==2019.9.11 +Click==7.0 +docutils==0.15.2 +Flask==1.1.1 +Flask-Cors==3.0.8 +itsdangerous==1.1.0 +Jinja2==2.10.3 +jmespath==0.9.4 +JPype1==0.7.0 +konlpy==0.5.1 +lxml==4.4.1 +MarkupSafe==1.1.1 +mecab-python===0.996-ko-0.9.2 +nltk==3.4.5 +python-dateutil==2.8.0 +s3transfer==0.2.1 +sentry-sdk==0.13.2 +six==1.13.0 +soupsieve==1.9.5 +textblob==0.15.3 +urllib3==1.25.6 +Werkzeug==0.16.0 From aac05af0ac594e92b6f7b0c2a5dacbd739b7d4b1 Mon Sep 17 00:00:00 2001 From: Chae Eun Lee Date: Tue, 12 Nov 2019 19:17:26 +0900 Subject: [PATCH 5/5] =?UTF-8?q?tag=20=EC=B6=94=EC=B2=9C=20=EB=B6=80?= =?UTF-8?q?=EB=B6=84=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- NLP/src/receiveMessageFromSQS.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/NLP/src/receiveMessageFromSQS.py b/NLP/src/receiveMessageFromSQS.py index 52fe8c7..2bdc825 100644 --- a/NLP/src/receiveMessageFromSQS.py +++ b/NLP/src/receiveMessageFromSQS.py @@ -2,6 +2,11 @@ from boto3.dynamodb.conditions import Key, Attr from ast import literal_eval +from body import * +from getTagList import * +from nounExtractor import * +from preprocessing import * + # Create SQS client sqs = boto3.client('sqs', region_name='ap-northeast-2') @@ -29,12 +34,17 @@ receipt_handle = message['ReceiptHandle'] body = literal_eval(message["Body"]) - print(body["url"]) + url = body["url"] + title = body["title"] + + title_nouns, body_nouns = nouns_extractor(title, url) + TF = TF_score(title_nouns, body_nouns) + tags = Total_score(TF, seletAllTagsFromDB(), alpha=1, beta=1, test=False) response = table.put_item( Item={ - 'url': body["url"], - 'tags': ["sqstest"] + 'url': url, + 'tags': tags } )