Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added autophrase extractor #6

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ jobs:
- name: topic_modeling_extractor
FOLDER: topic_modeling_extractor
PLATFORM: "linux/amd64,linux/arm64"
- name: autophrase_extractor
FOLDER: autophrase_extractor
PLATFORM: "linux/amd64,linux/arm64"

steps:
- uses: actions/checkout@v2
Expand Down Expand Up @@ -152,7 +155,7 @@ jobs:
uses: docker/build-push-action@v2
with:
push: true
file: ${{ matrix.FOLDER }}/extractor.dockerfile
file: ${{ matrix.FOLDER }}/Dockerfile
context: ${{ matrix.FOLDER }}
platforms: ${{ matrix.PLATFORM }}
cache-from: type=gha,scope=${{ matrix.name }}
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ You can access the latest pre-built SMM extractors on docker.io:
[socialmediamacroscope/network_analysis_extractor](https://hub.docker.com/repository/docker/socialmediamacroscope/network_analysis_extractor/general)
[socialmediamacroscope/preprocessing_extractor](https://hub.docker.com/repository/docker/socialmediamacroscope/preprocessing_extractor/general)
[socialmediamacroscope/name_entity_recognition_extractor](https://hub.docker.com/repository/docker/socialmediamacroscope/name_entity_recognition_extractor/general)
[socialmediamacroscope/autophrase_extractor](https://hub.docker.com/repository/docker/socialmediamacroscope/autophrase_extractor/general)

To include those extractors with docker-compose:
```
Expand Down
10 changes: 10 additions & 0 deletions autophrase_extractor/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Changelog
All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.1.0]

### Added
- Initial release of the autophrase extractor [#2](https://github.com/clowder-framework/smm-extractor/issues/2)
18 changes: 18 additions & 0 deletions autophrase_extractor/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
FROM socialmediamacroscope/autophrase:latest

# WORKDIR creates the directory if missing; no separate mkdir needed.
WORKDIR /scripts

COPY SmmExtractor.py ./
COPY extractor_info.json ./
COPY requirement.txt ./extractor-requirement.txt

# Install pyClowder and any other python dependencies
RUN pip3 install --no-cache-dir -r ./extractor-requirement.txt -U

# Runtime configuration read by SmmExtractor.py (CLOWDER_VERSION selects the
# v1 upload path vs. the v2 folder-per-run path).
ENV MAIN_SCRIPT="SmmExtractor.py" \
    CLOWDER_VERSION=1

# Command to be run when container is run.
# Exec (JSON-array) form so the extractor runs as PID 1 and receives SIGTERM
# from `docker stop`. --heartbeat controls the refresh rate.
CMD ["python3", "SmmExtractor.py", "--heartbeat", "300"]
Empty file added autophrase_extractor/README.md
Empty file.
154 changes: 154 additions & 0 deletions autophrase_extractor/SmmExtractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
#!/usr/bin/env python

"""Example extractor based on the clowder code."""
import posixpath

import pandas as pd
import json
import os
import csv
import types
import pickle
from datetime import datetime

import logging
from pyclowder.extractors import Extractor
import pyclowder.files

from algorithm import algorithm
import requests

def save_local_output(localSavePath, fname, output_data):
    """
    Save in-memory algorithm output to a local file, choosing the file
    format (and extension) from the Python type of the data.

    :param localSavePath: directory the output file is written into
    :param fname: base filename without extension
    :param output_data: the actual data; dict -> .json, DataFrame -> .csv,
        str -> .html, non-empty list of lists/tuples -> .csv,
        generator -> .gml/.net/.unknown, anything else -> .pickle
    :return: full path of the saved file
    """
    # json
    if isinstance(output_data, dict):
        fname += '.json'
        with open(os.path.join(localSavePath, fname), 'w') as f:
            json.dump(output_data, f)

    # dataframe to csv
    elif isinstance(output_data, pd.DataFrame):
        fname += '.csv'
        # Bug fix: write into localSavePath like every other branch.
        # Previously this wrote to the current working directory, so the
        # returned path did not match the file actually written.
        output_data.to_csv(os.path.join(localSavePath, fname), encoding='utf-8')

    # string to html
    elif isinstance(output_data, str):
        fname += '.html'
        with open(os.path.join(localSavePath, fname), 'w') as f:
            f.write(output_data)

    # non-empty list(list) to csv; the emptiness guard avoids an IndexError
    # on output_data[0] (an empty list falls through to the pickle branch)
    elif isinstance(output_data, list) and len(output_data) > 0 \
            and isinstance(output_data[0], (list, tuple)):
        fname += '.csv'
        with open(os.path.join(localSavePath, fname), 'w', newline='',
                  encoding='utf-8') as f:
            writer = csv.writer(f)
            for row in output_data:
                try:
                    writer.writerow(row)
                except UnicodeEncodeError as e:
                    # best-effort: skip un-encodable rows instead of aborting
                    print(e)

    # special case: generators stream graph exports line by line, and the
    # base name selects the graph file format
    elif isinstance(output_data, types.GeneratorType):
        if fname == 'gephi':
            fname += '.gml'
        elif fname == 'pajek':
            fname += '.net'
        else:
            fname += '.unknown'

        with open(os.path.join(localSavePath, fname), 'w', newline='',
                  encoding='utf-8') as f:
            for line in output_data:
                f.write(line + '\n')

    # else pickle the object
    else:
        fname += '.pickle'
        with open(os.path.join(localSavePath, fname), 'wb') as f:
            pickle.dump(output_data, f)

    return os.path.join(localSavePath, fname)


# TODO wrap this into method on pyclowder
def create_output_folder(dataset_id, host, secret_key):
    """Create a timestamp-named folder inside a Clowder v2 dataset.

    :param dataset_id: id of the dataset the folder is created in
    :param host: base URL of the Clowder instance
    :param secret_key: Clowder API key sent via the X-API-KEY header
    :return: id of the new folder, or None if the request did not return 200
    """
    endpoint = posixpath.join(host, f'api/v2/datasets/{dataset_id}/folders')
    request_headers = {
        "Content-Type": "application/json",
        "X-API-KEY": secret_key,
    }
    # Folder name is the creation timestamp, so each run gets its own folder.
    timestamp_name = datetime.now().strftime("%Y%m%d%H%M%S")
    resp = requests.post(endpoint, json={"name": timestamp_name},
                         headers=request_headers)
    if resp.status_code != 200:
        print(f"Error creating folder: {resp.status_code} {resp.text}")
        return None
    return resp.json().get("id")


class SmmExtractor(Extractor):
    """Clowder extractor that runs the SMM `algorithm` on an uploaded CSV file
    and uploads the resulting output files (plus metadata) back to the dataset.
    """
    def __init__(self):
        Extractor.__init__(self)

        # parse command line and load default logging configuration
        self.setup()

        # setup logging for the extractor
        logging.getLogger('pyclowder').setLevel(logging.DEBUG)
        logging.getLogger('__main__').setLevel(logging.DEBUG)

    def process_message(self, connector, host, secret_key, resource, parameters):
        # this extractor runs on dataset
        # uncomment to see the resource
        # NOTE(review): `logger` is created but never used in this method.
        logger = logging.getLogger(__name__)
        # First locally-staged file is the CSV to analyze; its parent is the
        # dataset the outputs are uploaded back into.
        inputfile = resource["local_paths"][0]
        dataset_id = resource['parent'].get('id')

        # assumes the input file is a well-formed CSV — pd.read_csv raises otherwise
        df = pd.read_csv(inputfile)
        connector.message_process(resource, "Loading contents of file...")

        # execute the algorithm
        # Parse user parameters to determine which column to analyze
        userParams = parameters.get('parameters')

        output = algorithm(df, userParams)
        connector.message_process(resource, "Running the algorithm...")

        # Create folder to save output. On Clowder v2 a timestamp-named folder
        # is created per run; on v1 (the default) files go directly into the
        # dataset (folder_id stays None).
        clowder_version = int(os.getenv('CLOWDER_VERSION', '1'))
        if clowder_version == 2:
            connector.message_process(resource, "Creating output folder...")
            folder_id = create_output_folder(dataset_id, host, secret_key)
            # NOTE(review): if folder creation fails, folder_id is left unset
            # here and the upload below would raise NameError — confirm intended.
            if folder_id is not None:
                connector.message_process(resource, f"folder id: {folder_id} created ...")
        else:
            folder_id = None
        # Persist each algorithm output (except the internal 'uid' entry) to a
        # local file, then upload it with its metadata.
        for fname, output_data in output.items():
            if fname != 'uid':
                # save_local_output is called with "" so files land in the
                # current working directory; they are not cleaned up afterwards.
                local_output_path = save_local_output("", fname, output_data)
                connector.message_process(resource, "Saving " + local_output_path + "...")
                uploaded_file_id = pyclowder.files.upload_to_dataset(connector, host, secret_key, dataset_id,
                                                                     local_output_path,
                                                                     folder_id=folder_id)
                connector.message_process(resource, local_output_path + " saved...")

                # Attach the user parameters as file-level metadata.
                connector.message_process(resource, "Writing metadata...")
                metadata = self.get_metadata(userParams, 'file', uploaded_file_id, host)
                pyclowder.files.upload_metadata(connector, host, secret_key, uploaded_file_id, metadata)
                connector.message_process(resource, "Metadata written...")


# Start the extractor: registers with the message bus and blocks waiting
# for dataset messages to process.
if __name__ == "__main__":
    extractor = SmmExtractor()
    extractor.start()
65 changes: 65 additions & 0 deletions autophrase_extractor/extractor_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
{
"@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld",
"name": "smm.automated.phrase.mining",
"version": "0.1.0",
"description": "As one of the fundamental tasks in text analysis, phrase mining aims at extracting quality phrases from a text corpus. Phrase mining is important in various tasks such as information extraction/retrieval, taxonomy construction, and topic modeling.",
"author": "Wang, Chen <[email protected]>",
"contributors": [],
"contexts": [
{}
],
"repository": [
{
"repType": "git",
"repUrl": "https://github.com/ncsa/standalone-smm-analytics.git"
},
{
"repType": "git",
"repUrl": "https://github.com/clowder-framework/smm-extractor.git"
}
],
"process": {
"file": [
"manual"
]
},
"external_services": [],
"dependencies": [],
"bibtex": [],
"parameters": {
"schema": {
"column": {
"type": "string",
"title": "Text Column Header",
"default": "text"
},
"minSup": {
"type": "number",
"title": "Minimum Support",
"default": 3
},
"algorithm": {
"type": "string",
"title": "Automated Phrase Mining Algorithm",
"enum": [
"AutoPhrase (Automated Phrase Mining from Massive Text Corpora)"
],
"default": "AutoPhrase (Automated Phrase Mining from Massive Text Corpora)"
}
},
"form": [
{
"key": "column",
"type": "text"
},
{
"key": "minSup",
"type": "text"
},
{
"key": "algorithm",
"type": "select"
}
]
}
}
1 change: 1 addition & 0 deletions autophrase_extractor/requirement.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pyclowder>=3.0.7
2 changes: 1 addition & 1 deletion name_entity_recognition_extractor/requirement.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
pyclowder==3.0.7
pyclowder>=3.0.7
2 changes: 1 addition & 1 deletion network_analysis_extractor/requirement.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
pyclowder==3.0.7
pyclowder>=3.0.7
2 changes: 1 addition & 1 deletion preprocessing_extractor/requirement.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
pyclowder==3.0.7
pyclowder>=3.0.7
2 changes: 1 addition & 1 deletion sentiment_analysis_extractor/requirement.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
pyclowder==3.0.7
pyclowder>=3.0.7
2 changes: 1 addition & 1 deletion topic_modeling_extractor/requirement.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
pyclowder==3.0.7
pyclowder>=3.0.7