Feature/221 nlp sqs #222

Merged · 5 commits · Nov 12, 2019
38 changes: 23 additions & 15 deletions NLP/src/app.py
@@ -3,13 +3,22 @@
 # flask REST API code
 #===========================
 from flask import Flask
-from flask_cors import CORS, cross_origin
+from flask_cors import CORS
 from flask import request
 from flask import json
 from flask import Response
-import csv
+import sentry_sdk
+from sentry_sdk.integrations.flask import FlaskIntegration

-from utils import keyword_extractor
+from body import *
+from getTagList import *
+from nounExtractor import *
+from preprocessing import *
+
+sentry_sdk.init(
+    dsn="https://[email protected]/1810010",
+    integrations=[FlaskIntegration()]
+)

 app = Flask(__name__)
 cors = CORS(app)
@@ -23,24 +32,23 @@ def get():
     '''
     arrayList =
     {
-        title : ""r
+        title : ""
         url : ""
         memo : ["", "", ...]
         highlight : ["", "", ...]
     }
     '''
     result = {}
-    result["title"] = data["title"]
-    result["url"] = data["url"]
-    #result["keywords"], result["result"] = keyword_extractor(data["title"], "")
+    title = data["title"]
+    url = data["url"]
+
+    title_nouns, body_nouns = nouns_extractor(title, url, nlpapi_num=3)
+    TF = TF_score(title_nouns, body_nouns)
+    tags = Total_score(TF, seletAllTagsFromDB(), alpha=1, beta=1, test=False)

-    res = json.dumps(result, ensure_ascii=False).encode('utf8')
+    result["version"] = "1.0"
+    result["tags"] = tags

-    with open('./data.csv', 'a', encoding='utf-8') as csvfile:
-        fieldnames = ['title', 'url', 'keywords', 'result']
-        wr = csv.DictWriter(csvfile, fieldnames=fieldnames)
-        wr.writeheader()
-        #wr.writerow(result)
+    res = json.dumps(result, ensure_ascii=False).encode('utf8')
     print(res)

     return Response(res, content_type='application/json; charset=utf-8')
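Note: for reference, a minimal standalone sketch of the tagging pipeline the new get() handler runs. The star imports mirror app.py; the sample title and url are placeholder inputs, and the helper signatures are taken from this diff as-is.

    import json

    from body import *
    from getTagList import *
    from nounExtractor import *
    from preprocessing import *

    title = "예시 기사 제목"                # placeholder input
    url = "https://example.com/article"     # placeholder input

    # Same call sequence as the new handler: extract nouns, score term
    # frequency, then rank against the tag list from the DB.
    title_nouns, body_nouns = nouns_extractor(title, url, nlpapi_num=3)
    TF = TF_score(title_nouns, body_nouns)
    tags = Total_score(TF, seletAllTagsFromDB(), alpha=1, beta=1, test=False)

    result = {"version": "1.0", "tags": tags}
    print(json.dumps(result, ensure_ascii=False))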
1 change: 1 addition & 0 deletions NLP/src/body.py
@@ -15,6 +15,7 @@ def spider(url):
         return ''.join(data)
     except Exception as ex:
         print("::CRAWLING ERROR::\n",ex)
+        return ""

 '''
 urls = []
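Note: the added return matters because spider() previously fell through to an implicit None on a crawl failure, which breaks string consumers downstream; returning "" keeps the contract. A two-line sketch (the URL is a hypothetical dead host):

    text = spider("https://unreachable.invalid")   # "" on crawl error instead of None
    print(len(text))                               # 0 — still safe to treat as a string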
9 changes: 3 additions & 6 deletions NLP/src/getTagList.py
@@ -8,12 +8,9 @@
 def TF_score(title, body):

     words = {}
-    try:
-        for w, c in Counter(body).most_common(1000):
-            if notNoStopWords(w):
-                words[w] = c
-    except Exception as e:
-        print("::body nouns ERROR::\n", e)
+    for w, c in Counter(body).most_common(1000):
+        if notNoStopWords(w):
+            words[w] = c

     for w, c in Counter(title).most_common(10):
         if notNoStopWords(w):
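Note: with the try/except gone, TF_score assumes body is already an iterable of tokens. A standalone sketch of the counting pattern, with a hypothetical stop-word check standing in for notNoStopWords():

    from collections import Counter

    STOPWORDS = {"the", "a", "of"}                # placeholder stop-word list

    def not_stopword(w):
        return w not in STOPWORDS

    body = ["aws", "sqs", "queue", "sqs", "the", "queue", "sqs"]
    words = {}
    for w, c in Counter(body).most_common(1000):  # top terms by raw count
        if not_stopword(w):
            words[w] = c
    print(words)                                  # {'sqs': 3, 'queue': 2, 'aws': 1}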
8 changes: 2 additions & 6 deletions NLP/src/nounExtractor.py
@@ -16,7 +16,7 @@ def getEngNouns(sen):

 def nlp_apis(api_num, sen):
     sen = clean_sentence(sen)
-    api_dict = {1:Kkma(), 2:Twitter(), 3:Hannanum(), 4:Komoran(), 5:Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')}
+    api_dict = {1:Kkma(), 2:Twitter(), 3:Hannanum(), 4:Komoran(), 5:Mecab()}

     module = api_dict[api_num]

@@ -26,10 +26,6 @@ def nlp_apis(api_num, sen):

 def nouns_extractor(title, url, nlpapi_num=5):
     title_nouns, _ = nlp_apis(nlpapi_num, title)
-    try:
-        body_nouns, _ = nlp_apis(nlpapi_num, spider(url))
-
-    except Exception as e:
-        print("::body nouns ERROR::\n", e)
+    body_nouns, _ = nlp_apis(nlpapi_num, spider(url))

     return title_nouns, body_nouns
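Note: a minimal sketch of the tagger lookup nlp_apis() performs; with this change Mecab() resolves mecab-ko-dic from konlpy's default location rather than the hard-coded path (the sample sentence is a placeholder):

    from konlpy.tag import Hannanum, Kkma, Komoran, Mecab, Twitter

    api_dict = {1: Kkma(), 2: Twitter(), 3: Hannanum(), 4: Komoran(), 5: Mecab()}
    print(api_dict[3].nouns("자연어 처리 예시 문장입니다"))   # Hannanum noun extraction

Building the dict instantiates all five taggers on every call; a dict of constructors, e.g. {3: Hannanum}, instantiated on demand would avoid that overhead, though that is outside this PR.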
74 changes: 51 additions & 23 deletions NLP/src/receiveMessageFromSQS.py
@@ -1,30 +1,58 @@
 import boto3
+from boto3.dynamodb.conditions import Key, Attr
+from ast import literal_eval
+
+from body import *
+from getTagList import *
+from nounExtractor import *
+from preprocessing import *

 # Create SQS client
 sqs = boto3.client('sqs', region_name='ap-northeast-2')

 queue_url = 'https://sqs.ap-northeast-2.amazonaws.com/616448378550/newQueue'
+dynamodb = boto3.resource('dynamodb', region_name='ap-northeast-2')
+table = dynamodb.Table('UrlTag')

-# Receive message from SQS queue
-response = sqs.receive_message(
-    QueueUrl=queue_url,
-    AttributeNames=[
-        'SentTimestamp'
-    ],
-    MaxNumberOfMessages=1,
-    MessageAttributeNames=[
-        'All'
-    ],
-    VisibilityTimeout=0,
-    WaitTimeSeconds=0
-)
-
-message = response['Messages'][0]
-receipt_handle = message['ReceiptHandle']
-
-# Delete received message from queue
-sqs.delete_message(
-    QueueUrl=queue_url,
-    ReceiptHandle=receipt_handle
-)
-print('Received and deleted message: %s' % message)
+while 1:
+    # Receive message from SQS queue
+    response = sqs.receive_message(
+        QueueUrl=queue_url,
+        AttributeNames=[
+            'SentTimestamp'
+        ],
+        MaxNumberOfMessages=1,
+        MessageAttributeNames=[
+            'All'
+        ],
+        VisibilityTimeout=0,
+        WaitTimeSeconds=0
+    )
+
+    try:
+        message = response['Messages'][0]
+        receipt_handle = message['ReceiptHandle']
+        body = literal_eval(message["Body"])
+
+        url = body["url"]
+        title = body["title"]
+
+        title_nouns, body_nouns = nouns_extractor(title, url)
+        TF = TF_score(title_nouns, body_nouns)
+        tags = Total_score(TF, seletAllTagsFromDB(), alpha=1, beta=1, test=False)
+
+        response = table.put_item(
+            Item={
+                'url': url,
+                'tags': tags
+            }
+        )
+
+        # Delete received message from queue
+        sqs.delete_message(
+            QueueUrl=queue_url,
+            ReceiptHandle=receipt_handle
+        )
+        print('Received and deleted message: %s' % body["url"])
+    except:
+        pass
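Note: for reference, a minimal sketch of the consume-process-delete loop this worker now implements. Two deliberate deviations, assumptions rather than part of the diff: long polling (WaitTimeSeconds=20) instead of the diff's WaitTimeSeconds=0 busy loop, and a logged exception instead of the bare except: pass.

    import boto3
    from ast import literal_eval

    sqs = boto3.client('sqs', region_name='ap-northeast-2')
    queue_url = 'https://sqs.ap-northeast-2.amazonaws.com/616448378550/newQueue'

    while True:
        response = sqs.receive_message(
            QueueUrl=queue_url,
            MaxNumberOfMessages=1,
            WaitTimeSeconds=20,   # long polling: fewer empty receives
        )
        for message in response.get('Messages', []):
            try:
                body = literal_eval(message['Body'])   # queue body is a Python-literal dict
                print('processing', body['url'])       # tagging + DynamoDB write go here
                sqs.delete_message(
                    QueueUrl=queue_url,
                    ReceiptHandle=message['ReceiptHandle'],
                )
            except Exception as e:
                print('::SQS ERROR::\n', e)            # message will become visible again

Also worth noting: with VisibilityTimeout=0 as in the diff, a received message stays visible to other consumers while it is being processed, so running more than one worker can tag the same URL twice.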
27 changes: 27 additions & 0 deletions NLP/src/requirements.txt
@@ -0,0 +1,27 @@
+beautifulsoup4==4.8.1
+blinker==1.4
+boto3==1.10.14
+botocore==1.13.14
+bs4==0.0.1
+certifi==2019.9.11
+Click==7.0
+docutils==0.15.2
+Flask==1.1.1
+Flask-Cors==3.0.8
+itsdangerous==1.1.0
+Jinja2==2.10.3
+jmespath==0.9.4
+JPype1==0.7.0
+konlpy==0.5.1
+lxml==4.4.1
+MarkupSafe==1.1.1
+mecab-python===0.996-ko-0.9.2
+nltk==3.4.5
+python-dateutil==2.8.0
+s3transfer==0.2.1
+sentry-sdk==0.13.2
+six==1.13.0
+soupsieve==1.9.5
+textblob==0.15.3
+urllib3==1.25.6
+Werkzeug==0.16.0