From 608e4f9c0971d9344a7010c3b9f56a343b4d2fea Mon Sep 17 00:00:00 2001
From: YONG WOOK KIM
Date: Fri, 13 Oct 2023 13:57:35 -0500
Subject: [PATCH 1/7] added autophrase extractor

---
 .github/workflows/docker.yml             |   2 +-
 autophrase_extractor/CHANGELOG.md        |  10 ++
 autophrase_extractor/Dockerfile          |  18 ++
 autophrase_extractor/README.md           |   0
 autophrase_extractor/SmmExtractor.py     | 154 ++++++++++++++++++
 autophrase_extractor/extractor_info.json |  56 +++++++
 autophrase_extractor/requirement.txt     |   1 +
 .../{extractor.dockerfile => Dockerfile} |   0
 .../{extractor.dockerfile => Dockerfile} |   0
 .../{extractor.dockerfile => Dockerfile} |   0
 .../{extractor.dockerfile => Dockerfile} |   0
 .../{extractor.dockerfile => Dockerfile} |   0
 12 files changed, 240 insertions(+), 1 deletion(-)
 create mode 100644 autophrase_extractor/CHANGELOG.md
 create mode 100644 autophrase_extractor/Dockerfile
 create mode 100644 autophrase_extractor/README.md
 create mode 100644 autophrase_extractor/SmmExtractor.py
 create mode 100644 autophrase_extractor/extractor_info.json
 create mode 100644 autophrase_extractor/requirement.txt
 rename name_entity_recognition_extractor/{extractor.dockerfile => Dockerfile} (100%)
 rename network_analysis_extractor/{extractor.dockerfile => Dockerfile} (100%)
 rename preprocessing_extractor/{extractor.dockerfile => Dockerfile} (100%)
 rename sentiment_analysis_extractor/{extractor.dockerfile => Dockerfile} (100%)
 rename topic_modeling_extractor/{extractor.dockerfile => Dockerfile} (100%)

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 5438cd9..c2b1ee5 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -152,7 +152,7 @@ jobs:
         uses: docker/build-push-action@v2
         with:
           push: true
-          file: ${{ matrix.FOLDER }}/extractor.dockerfile
+          file: ${{ matrix.FOLDER }}/dockerfile
           context: ${{ matrix.FOLDER }}
           platforms: ${{ matrix.PLATFORM }}
           cache-from: type=gha,scope=${{ matrix.name }}
diff --git a/autophrase_extractor/CHANGELOG.md b/autophrase_extractor/CHANGELOG.md
new file mode 100644
index 0000000..73d55bf
--- /dev/null
+++ b/autophrase_extractor/CHANGELOG.md
@@ -0,0 +1,10 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+
+### Added
+- Initial release of the autophrase extractor
diff --git a/autophrase_extractor/Dockerfile b/autophrase_extractor/Dockerfile
new file mode 100644
index 0000000..85f6494
--- /dev/null
+++ b/autophrase_extractor/Dockerfile
@@ -0,0 +1,18 @@
+FROM socialmediamacroscope/autophrase:latest
+
+RUN mkdir -p /scripts
+WORKDIR /scripts
+
+COPY SmmExtractor.py ./
+COPY extractor_info.json ./
+COPY requirement.txt ./extractor-requirement.txt
+
+# Install pyClowder and any other python dependencies
+RUN pip3 install --no-cache-dir -r ./extractor-requirement.txt -U
+
+# Command to be run when container is run
+# Can add heartbeat to change the refresh rate
+CMD python3 SmmExtractor.py --heartbeat 300
+
+ENV MAIN_SCRIPT="SmmExtractor.py" \
+    CLOWDER_VERSION=1
diff --git a/autophrase_extractor/README.md b/autophrase_extractor/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/autophrase_extractor/SmmExtractor.py b/autophrase_extractor/SmmExtractor.py
new file mode 100644
index 0000000..0af20da
--- /dev/null
+++ b/autophrase_extractor/SmmExtractor.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python
+
+"""Example extractor based on the clowder code."""
+import posixpath
+
+import pandas as pd
+import json
+import os
+import csv
+import types
+import pickle
+from datetime import datetime
+
+import logging
+from pyclowder.extractors import Extractor
+import pyclowder.files
+
+from algorithm import algorithm
+import requests
+
+def save_local_output(localSavePath, fname, output_data):
+    """
+    save output in memory first to local file
+    :param localSavePath: local save path
+    :param fname: filename
+    :param output_data: the actual data
+    :return: local saved file path
+    """
+    # json
+    if isinstance(output_data, dict):
+        fname += '.json'
+        with open(os.path.join(localSavePath, fname), 'w') as f:
+            json.dump(output_data, f)
+
+    # dataframe to csv
+    elif isinstance(output_data, pd.DataFrame):
+        fname += '.csv'
+        output_data.to_csv(os.path.join(localSavePath, fname), encoding='utf-8')
+
+    # string to html
+    elif isinstance(output_data, str):
+        fname += '.html'
+        with open(os.path.join(localSavePath, fname), 'w') as f:
+            f.write(output_data)
+
+    # list(list) to csv
+    elif isinstance(output_data, list) \
+            and (isinstance(output_data[0], list) or isinstance(output_data[0],
+                                                                tuple)):
+        fname += '.csv'
+        with open(os.path.join(localSavePath, fname), 'w', newline='',
+                  encoding='utf-8') as f:
+            writer = csv.writer(f)
+            for row in output_data:
+                try:
+                    writer.writerow(row)
+                except UnicodeEncodeError as e:
+                    print(e)
+
+    # special case
+    elif isinstance(output_data, types.GeneratorType):
+        if fname == 'gephi':
+            fname += '.gml'
+        elif fname == 'pajek':
+            fname += '.net'
+        else:
+            fname += '.unknown'
+
+        with open(os.path.join(localSavePath, fname), 'w', newline='',
+                  encoding='utf-8') as f:
+            for line in output_data:
+                f.write(line + '\n')
+
+    # else pickle the object
+    else:
+        fname += '.pickle'
+        with open(os.path.join(localSavePath, fname), 'wb') as f:
+            pickle.dump(output_data, f)
+
+    return os.path.join(localSavePath, fname)
+
+
+# TODO wrap this into method on pyclowder
+def create_output_folder(dataset_id, host, secret_key):
+    url = posixpath.join(host, f'api/v2/datasets/{dataset_id}/folders')
+    headers = {"Content-Type": "application/json",
+               "X-API-KEY": secret_key}
+    current_timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
+    folder_data = {"name": current_timestamp}
+    response = requests.post(url, json=folder_data, headers=headers)
+    if response.status_code == 200:
+        return response.json().get("id")
+    else:
+        print(f"Error creating folder: {response.status_code} {response.text}")
+        return None
+
+
+class SmmExtractor(Extractor):
+    """Run the SMM algorithm on an input file and upload the results to Clowder."""
+    def __init__(self):
+        Extractor.__init__(self)
+
+        # parse command line and load default logging configuration
+        self.setup()
+
+        # setup logging for the extractor
+        logging.getLogger('pyclowder').setLevel(logging.DEBUG)
+        logging.getLogger('__main__').setLevel(logging.DEBUG)
+
+    def process_message(self, connector, host, secret_key, resource, parameters):
+        # this extractor runs on dataset
+        # uncomment to see the resource
+        logger = logging.getLogger(__name__)
+        inputfile = resource["local_paths"][0]
+        dataset_id = resource['parent'].get('id')
+
+        df = pd.read_csv(inputfile)
+        connector.message_process(resource, "Loading contents of file...")
+
+        # execute the algorithm
+        # Parse user parameters to determine which column to analyze
+        userParams = parameters.get('parameters')
+
+        output = algorithm(df, userParams)
+        connector.message_process(resource, "Running the algorithm...")
+
+        # Create folder to save output
+        clowder_version = int(os.getenv('CLOWDER_VERSION', '1'))
+        if clowder_version == 2:
+            connector.message_process(resource, "Creating output folder...")
+            folder_id = create_output_folder(dataset_id, host, secret_key)
+            if folder_id is not None:
+                connector.message_process(resource, f"folder id: {folder_id} created ...")
+        else:
+            folder_id = None
+        for fname, output_data in output.items():
+            if fname != 'uid':
+                local_output_path = save_local_output("", fname, output_data)
+                connector.message_process(resource, "Saving " + local_output_path + "...")
+                uploaded_file_id = pyclowder.files.upload_to_dataset(connector, host, secret_key, dataset_id,
+                                                                     local_output_path,
+                                                                     folder_id=folder_id)
+                connector.message_process(resource, local_output_path + " saved...")
+
+                connector.message_process(resource, "Writing metadata...")
+                metadata = self.get_metadata(userParams, 'file', uploaded_file_id, host)
+                pyclowder.files.upload_metadata(connector, host, secret_key, uploaded_file_id, metadata)
+                connector.message_process(resource, "Metadata written...")
+
+
+if __name__ == "__main__":
+    extractor = SmmExtractor()
+    extractor.start()
diff --git a/autophrase_extractor/extractor_info.json b/autophrase_extractor/extractor_info.json
new file mode 100644
index 0000000..e2eeef5
--- /dev/null
+++ b/autophrase_extractor/extractor_info.json
@@ -0,0 +1,56 @@
+{
+  "@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld",
+  "name": "smm.sentiment.analysis",
+  "version": "0.1.2",
+  "description": "Sentiment analysis (sometimes known as opinion mining or emotion AI) refers to the use of natural language processing, text analysis, computational linguistics, and biometrics to systematically identify, extract, quantify, and study affective states and subjective information.",
+  "author": "Wang, Chen ",
+  "contributors": [],
+  "contexts": [{}],
+  "repository": [
+    {
+      "repType": "git",
+      "repUrl": "https://github.com/ncsa/standalone-smm-analytics.git"
+    },
+    {
+      "repType": "git",
+      "repUrl": "https://github.com/clowder-framework/smm-extractor.git"
+    }
+  ],
+  "process": {
+    "file": [
+      "manual"
+    ]
+  },
+  "external_services": [],
+  "dependencies": [],
+  "bibtex": [],
+  "parameters": {
+    "schema": {
+      "column": {
+        "type": "string",
+        "title": "Text Column Header",
+        "default": "text"
+      },
+      "algorithm": {
+        "type": "string",
"title": "Sentiment Analysis Algorithms", + "enum": [ + "vader", + "sentiWordNet", + "debias" + ], + "default": "vader" + } + }, + "form": [ + { + "key": "column", + "type": "text" + }, + { + "key": "algorithm", + "type": "select" + } + ] + } +} diff --git a/autophrase_extractor/requirement.txt b/autophrase_extractor/requirement.txt new file mode 100644 index 0000000..7c54c75 --- /dev/null +++ b/autophrase_extractor/requirement.txt @@ -0,0 +1 @@ +pyclowder==3.0.7 diff --git a/name_entity_recognition_extractor/extractor.dockerfile b/name_entity_recognition_extractor/Dockerfile similarity index 100% rename from name_entity_recognition_extractor/extractor.dockerfile rename to name_entity_recognition_extractor/Dockerfile diff --git a/network_analysis_extractor/extractor.dockerfile b/network_analysis_extractor/Dockerfile similarity index 100% rename from network_analysis_extractor/extractor.dockerfile rename to network_analysis_extractor/Dockerfile diff --git a/preprocessing_extractor/extractor.dockerfile b/preprocessing_extractor/Dockerfile similarity index 100% rename from preprocessing_extractor/extractor.dockerfile rename to preprocessing_extractor/Dockerfile diff --git a/sentiment_analysis_extractor/extractor.dockerfile b/sentiment_analysis_extractor/Dockerfile similarity index 100% rename from sentiment_analysis_extractor/extractor.dockerfile rename to sentiment_analysis_extractor/Dockerfile diff --git a/topic_modeling_extractor/extractor.dockerfile b/topic_modeling_extractor/Dockerfile similarity index 100% rename from topic_modeling_extractor/extractor.dockerfile rename to topic_modeling_extractor/Dockerfile From ca0bf3dc88b8b62851a712c1f6e490f2da798b55 Mon Sep 17 00:00:00 2001 From: Chen Wang Date: Tue, 17 Oct 2023 13:00:10 -0500 Subject: [PATCH 2/7] capitalize the dockerfile name in the workflow --- .github/workflows/docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index c2b1ee5..eefd577 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -152,7 +152,7 @@ jobs: uses: docker/build-push-action@v2 with: push: true - file: ${{ matrix.FOLDER }}/dockerfile + file: ${{ matrix.FOLDER }}/Dockerfile context: ${{ matrix.FOLDER }} platforms: ${{ matrix.PLATFORM }} cache-from: type=gha,scope=${{ matrix.name }} From b78e0b86063db1a4e7a707071d2265053198779a Mon Sep 17 00:00:00 2001 From: Chen Wang Date: Tue, 17 Oct 2023 15:49:32 -0500 Subject: [PATCH 3/7] add entry to changelog --- autophrase_extractor/CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autophrase_extractor/CHANGELOG.md b/autophrase_extractor/CHANGELOG.md index 73d55bf..fe58dc1 100644 --- a/autophrase_extractor/CHANGELOG.md +++ b/autophrase_extractor/CHANGELOG.md @@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
-## [Unreleased]
+## [0.1.0]
 
 ### Added
-- Initial release of the autophrase extractor
+- Initial release of the autophrase extractor [#2](https://github.com/clowder-framework/smm-extractor/issues/2)

From bb6440d9acb168144a0b825324118b1fcdb57342 Mon Sep 17 00:00:00 2001
From: Chen Wang
Date: Tue, 17 Oct 2023 16:05:53 -0500
Subject: [PATCH 4/7] update extractor information json

---
 autophrase_extractor/extractor_info.json     | 29 ++++++++++++-------
 autophrase_extractor/requirement.txt         |  2 +-
 .../requirement.txt                          |  2 +-
 network_analysis_extractor/requirement.txt   |  2 +-
 preprocessing_extractor/requirement.txt      |  2 +-
 sentiment_analysis_extractor/requirement.txt |  2 +-
 topic_modeling_extractor/requirement.txt     |  2 +-
 7 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/autophrase_extractor/extractor_info.json b/autophrase_extractor/extractor_info.json
index e2eeef5..440f23a 100644
--- a/autophrase_extractor/extractor_info.json
+++ b/autophrase_extractor/extractor_info.json
@@ -1,11 +1,13 @@
 {
   "@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld",
-  "name": "smm.sentiment.analysis",
-  "version": "0.1.2",
-  "description": "Sentiment analysis (sometimes known as opinion mining or emotion AI) refers to the use of natural language processing, text analysis, computational linguistics, and biometrics to systematically identify, extract, quantify, and study affective states and subjective information.",
+  "name": "smm.autophrase",
+  "version": "0.1.0",
+  "description": "As one of the fundamental tasks in text analysis, phrase mining aims at extracting quality phrases from a text corpus. Phrase mining is important in various tasks such as information extraction/retrieval, taxonomy construction, and topic modeling.",
   "author": "Wang, Chen ",
   "contributors": [],
-  "contexts": [{}],
+  "contexts": [
+    {}
+  ],
   "repository": [
     {
       "repType": "git",
@@ -31,15 +33,18 @@
         "title": "Text Column Header",
         "default": "text"
       },
+      "minSup": {
+        "type": "number",
+        "title": "Minimum Support",
+        "default": 3
+      },
       "algorithm": {
         "type": "string",
-        "title": "Sentiment Analysis Algorithms",
-        "enum": [
-          "vader",
-          "sentiWordNet",
-          "debias"
+        "title": "Automated Phrase Mining Algorithm",
+        "enum": [
+          "AutoPhrase (Automated Phrase Mining from Massive Text Corpora)"
         ],
-        "default": "vader"
+        "default": "AutoPhrase (Automated Phrase Mining from Massive Text Corpora)"
       }
     },
     "form": [
@@ -47,6 +52,10 @@
         "key": "column",
         "type": "text"
       },
+      {
+        "key": "minSup",
+        "type": "text"
+      },
       {
         "key": "algorithm",
         "type": "select"
diff --git a/autophrase_extractor/requirement.txt b/autophrase_extractor/requirement.txt
index 7c54c75..813bc6f 100644
--- a/autophrase_extractor/requirement.txt
+++ b/autophrase_extractor/requirement.txt
@@ -1 +1 @@
-pyclowder==3.0.7
+pyclowder>=3.0.7
diff --git a/name_entity_recognition_extractor/requirement.txt b/name_entity_recognition_extractor/requirement.txt
index 7c54c75..813bc6f 100644
--- a/name_entity_recognition_extractor/requirement.txt
+++ b/name_entity_recognition_extractor/requirement.txt
@@ -1 +1 @@
-pyclowder==3.0.7
+pyclowder>=3.0.7
diff --git a/network_analysis_extractor/requirement.txt b/network_analysis_extractor/requirement.txt
index 7c54c75..813bc6f 100644
--- a/network_analysis_extractor/requirement.txt
+++ b/network_analysis_extractor/requirement.txt
@@ -1 +1 @@
-pyclowder==3.0.7
+pyclowder>=3.0.7
diff --git a/preprocessing_extractor/requirement.txt b/preprocessing_extractor/requirement.txt
index 7c54c75..813bc6f 100644
--- a/preprocessing_extractor/requirement.txt
+++ b/preprocessing_extractor/requirement.txt
@@ -1 +1 @@
-pyclowder==3.0.7
+pyclowder>=3.0.7
diff --git a/sentiment_analysis_extractor/requirement.txt b/sentiment_analysis_extractor/requirement.txt
index 7c54c75..813bc6f 100644
--- a/sentiment_analysis_extractor/requirement.txt
+++ b/sentiment_analysis_extractor/requirement.txt
@@ -1 +1 @@
-pyclowder==3.0.7
+pyclowder>=3.0.7
diff --git a/topic_modeling_extractor/requirement.txt b/topic_modeling_extractor/requirement.txt
index 7c54c75..813bc6f 100644
--- a/topic_modeling_extractor/requirement.txt
+++ b/topic_modeling_extractor/requirement.txt
@@ -1 +1 @@
-pyclowder==3.0.7
+pyclowder>=3.0.7

From e9304d4c1483cf53008e3b57ac8ce78ba128800e Mon Sep 17 00:00:00 2001
From: Chen Wang
Date: Tue, 17 Oct 2023 16:09:07 -0500
Subject: [PATCH 5/7] update the extractor info

---
 autophrase_extractor/extractor_info.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autophrase_extractor/extractor_info.json b/autophrase_extractor/extractor_info.json
index 440f23a..b3228b0 100644
--- a/autophrase_extractor/extractor_info.json
+++ b/autophrase_extractor/extractor_info.json
@@ -1,6 +1,6 @@
 {
   "@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld",
-  "name": "smm.autophrase",
+  "name": "smm.automated.phrase.mining",
   "version": "0.1.0",
   "description": "As one of the fundamental tasks in text analysis, phrase mining aims at extracting quality phrases from a text corpus. Phrase mining is important in various tasks such as information extraction/retrieval, taxonomy construction, and topic modeling.",
   "author": "Wang, Chen ",

From 26e0a9a02325a2f93c66ec18f1ab31dfc12d2950 Mon Sep 17 00:00:00 2001
From: Chen Wang
Date: Tue, 17 Oct 2023 16:12:03 -0500
Subject: [PATCH 6/7] add correct matrix to github action

---
 .github/workflows/docker.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index eefd577..d63da79 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -48,6 +48,9 @@ jobs:
           - name: topic_modeling_extractor
             FOLDER: topic_modeling_extractor
            PLATFORM: "linux/amd64,linux/arm64"
+          - name: autophrase_extractor
+            FOLDER: autophrase_extractor
+            PLATFORM: "linux/amd64,linux/arm64"
 
     steps:
       - uses: actions/checkout@v2

From 86eb18328dad13e5aac91fc18de1ee1be9004266 Mon Sep 17 00:00:00 2001
From: Chen Wang
Date: Tue, 17 Oct 2023 16:26:50 -0500
Subject: [PATCH 7/7] update the README

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index f277124..1cc1264 100644
--- a/README.md
+++ b/README.md
@@ -30,6 +30,7 @@ You can access the latest pre-built SMM extractors on docker.io:
 [socialmediamacroscope/network_analysis_extractor](https://hub.docker.com/repository/docker/socialmediamacroscope/network_analysis_extractor/general)
 [socialmediamacroscope/preprocessing_extractor](https://hub.docker.com/repository/docker/socialmediamacroscope/preprocessing_extractor/general)
 [socialmediamacroscope/name_entity_recognition_extractor](https://hub.docker.com/repository/docker/socialmediamacroscope/name_entity_recognition_extractor/general)
+[socialmediamacroscope/autophrase_extractor](https://hub.docker.com/repository/docker/socialmediamacroscope/autophrase_extractor/general)
 
 To include those extractors with docker-compose:
 ```