Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added autophrase extractor #6

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ jobs:
- name: topic_modeling_extractor
FOLDER: topic_modeling_extractor
PLATFORM: "linux/amd64,linux/arm64"
- name: autophrase_extractor
FOLDER: autophrase_extractor
PLATFORM: "linux/amd64,linux/arm64"

steps:
- uses: actions/checkout@v2
Expand Down Expand Up @@ -152,7 +155,7 @@ jobs:
uses: docker/build-push-action@v2
with:
push: true
file: ${{ matrix.FOLDER }}/extractor.dockerfile
file: ${{ matrix.FOLDER }}/Dockerfile
context: ${{ matrix.FOLDER }}
platforms: ${{ matrix.PLATFORM }}
cache-from: type=gha,scope=${{ matrix.name }}
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ You can access the latest pre-built SMM extractors on docker.io:
[socialmediamacroscope/network_analysis_extractor](https://hub.docker.com/repository/docker/socialmediamacroscope/network_analysis_extractor/general)
[socialmediamacroscope/preprocessing_extractor](https://hub.docker.com/repository/docker/socialmediamacroscope/preprocessing_extractor/general)
[socialmediamacroscope/name_entity_recognition_extractor](https://hub.docker.com/repository/docker/socialmediamacroscope/name_entity_recognition_extractor/general)
[socialmediamacroscope/autophrase_extractor](https://hub.docker.com/repository/docker/socialmediamacroscope/autophrase_extractor/general)

To include those extractors with docker-compose:
```
Expand Down
10 changes: 10 additions & 0 deletions autophrase_extractor/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Changelog
All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.1.0]

### Added
- Initial release of the autophrase extractor [#2](https://github.com/clowder-framework/smm-extractor/issues/2)
18 changes: 18 additions & 0 deletions autophrase_extractor/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
FROM socialmediamacroscope/autophrase:latest

# WORKDIR creates the directory if missing; no separate mkdir needed.
WORKDIR /scripts

COPY SmmExtractor.py ./
COPY extractor_info.json ./
COPY requirement.txt ./extractor-requirement.txt

# Install pyClowder and any other python dependencies
RUN pip3 install --no-cache-dir -r ./extractor-requirement.txt -U

# Runtime configuration read by SmmExtractor.py (CLOWDER_VERSION selects the
# v1 upload path vs. the v2 folder-per-run path).
ENV MAIN_SCRIPT="SmmExtractor.py" \
    CLOWDER_VERSION=1

# Command to be run when container is run.
# Exec (JSON-array) form so the extractor runs as PID 1 and receives SIGTERM
# from `docker stop`. --heartbeat controls the refresh rate.
CMD ["python3", "SmmExtractor.py", "--heartbeat", "300"]
Empty file added autophrase_extractor/README.md
Empty file.
154 changes: 154 additions & 0 deletions autophrase_extractor/SmmExtractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
#!/usr/bin/env python

"""Example extractor based on the clowder code."""
import posixpath

import pandas as pd
import json
import os
import csv
import types
import pickle
from datetime import datetime

import logging
from pyclowder.extractors import Extractor
import pyclowder.files

from algorithm import algorithm
import requests

def save_local_output(localSavePath, fname, output_data):
    """
    Save in-memory algorithm output to a local file, choosing the file
    format (and extension) from the Python type of the data.

    :param localSavePath: directory the output file is written into
    :param fname: base filename without extension
    :param output_data: the actual data; dict -> .json, DataFrame -> .csv,
        str -> .html, non-empty list of lists/tuples -> .csv,
        generator -> .gml/.net/.unknown, anything else -> .pickle
    :return: full path of the saved file
    """
    # json
    if isinstance(output_data, dict):
        fname += '.json'
        with open(os.path.join(localSavePath, fname), 'w') as f:
            json.dump(output_data, f)

    # dataframe to csv
    elif isinstance(output_data, pd.DataFrame):
        fname += '.csv'
        # Bug fix: write into localSavePath like every other branch.
        # Previously this wrote to the current working directory, so the
        # returned path did not match the file actually written.
        output_data.to_csv(os.path.join(localSavePath, fname), encoding='utf-8')

    # string to html
    elif isinstance(output_data, str):
        fname += '.html'
        with open(os.path.join(localSavePath, fname), 'w') as f:
            f.write(output_data)

    # non-empty list(list) to csv; the emptiness guard avoids an IndexError
    # on output_data[0] (an empty list falls through to the pickle branch)
    elif isinstance(output_data, list) and len(output_data) > 0 \
            and isinstance(output_data[0], (list, tuple)):
        fname += '.csv'
        with open(os.path.join(localSavePath, fname), 'w', newline='',
                  encoding='utf-8') as f:
            writer = csv.writer(f)
            for row in output_data:
                try:
                    writer.writerow(row)
                except UnicodeEncodeError as e:
                    # best-effort: skip un-encodable rows instead of aborting
                    print(e)

    # special case: generators stream graph exports line by line, and the
    # base name selects the graph file format
    elif isinstance(output_data, types.GeneratorType):
        if fname == 'gephi':
            fname += '.gml'
        elif fname == 'pajek':
            fname += '.net'
        else:
            fname += '.unknown'

        with open(os.path.join(localSavePath, fname), 'w', newline='',
                  encoding='utf-8') as f:
            for line in output_data:
                f.write(line + '\n')

    # else pickle the object
    else:
        fname += '.pickle'
        with open(os.path.join(localSavePath, fname), 'wb') as f:
            pickle.dump(output_data, f)

    return os.path.join(localSavePath, fname)


# TODO wrap this into method on pyclowder
def create_output_folder(dataset_id, host, secret_key):
    """Create a timestamp-named folder inside a Clowder v2 dataset.

    :param dataset_id: id of the dataset the folder is created in
    :param host: base URL of the Clowder instance
    :param secret_key: Clowder API key sent via the X-API-KEY header
    :return: id of the new folder, or None if the request did not return 200
    """
    endpoint = posixpath.join(host, f'api/v2/datasets/{dataset_id}/folders')
    request_headers = {
        "Content-Type": "application/json",
        "X-API-KEY": secret_key,
    }
    # Folder name is the creation timestamp, so each run gets its own folder.
    timestamp_name = datetime.now().strftime("%Y%m%d%H%M%S")
    resp = requests.post(endpoint, json={"name": timestamp_name},
                         headers=request_headers)
    if resp.status_code != 200:
        print(f"Error creating folder: {resp.status_code} {resp.text}")
        return None
    return resp.json().get("id")


class SmmExtractor(Extractor):
    """Clowder extractor that runs the SMM `algorithm` on an uploaded CSV file
    and uploads the resulting output files (plus metadata) back to the dataset.
    """
    def __init__(self):
        Extractor.__init__(self)

        # parse command line and load default logging configuration
        self.setup()

        # setup logging for the extractor
        logging.getLogger('pyclowder').setLevel(logging.DEBUG)
        logging.getLogger('__main__').setLevel(logging.DEBUG)

    def process_message(self, connector, host, secret_key, resource, parameters):
        # this extractor runs on dataset
        # uncomment to see the resource
        # NOTE(review): `logger` is created but never used in this method.
        logger = logging.getLogger(__name__)
        # First locally-staged file is the CSV to analyze; its parent is the
        # dataset the outputs are uploaded back into.
        inputfile = resource["local_paths"][0]
        dataset_id = resource['parent'].get('id')

        # assumes the input file is a well-formed CSV — pd.read_csv raises otherwise
        df = pd.read_csv(inputfile)
        connector.message_process(resource, "Loading contents of file...")

        # execute the algorithm
        # Parse user parameters to determine which column to analyze
        userParams = parameters.get('parameters')

        output = algorithm(df, userParams)
        connector.message_process(resource, "Running the algorithm...")

        # Create folder to save output. On Clowder v2 a timestamp-named folder
        # is created per run; on v1 (the default) files go directly into the
        # dataset (folder_id stays None).
        clowder_version = int(os.getenv('CLOWDER_VERSION', '1'))
        if clowder_version == 2:
            connector.message_process(resource, "Creating output folder...")
            folder_id = create_output_folder(dataset_id, host, secret_key)
            # NOTE(review): if folder creation fails, folder_id is left unset
            # here and the upload below would raise NameError — confirm intended.
            if folder_id is not None:
                connector.message_process(resource, f"folder id: {folder_id} created ...")
        else:
            folder_id = None
        # Persist each algorithm output (except the internal 'uid' entry) to a
        # local file, then upload it with its metadata.
        for fname, output_data in output.items():
            if fname != 'uid':
                # save_local_output is called with "" so files land in the
                # current working directory; they are not cleaned up afterwards.
                local_output_path = save_local_output("", fname, output_data)
                connector.message_process(resource, "Saving " + local_output_path + "...")
                uploaded_file_id = pyclowder.files.upload_to_dataset(connector, host, secret_key, dataset_id,
                                                                     local_output_path,
                                                                     folder_id=folder_id)
                connector.message_process(resource, local_output_path + " saved...")

                # Attach the user parameters as file-level metadata.
                connector.message_process(resource, "Writing metadata...")
                metadata = self.get_metadata(userParams, 'file', uploaded_file_id, host)
                pyclowder.files.upload_metadata(connector, host, secret_key, uploaded_file_id, metadata)
                connector.message_process(resource, "Metadata written...")


# Start the extractor: registers with the message bus and blocks waiting
# for dataset messages to process.
if __name__ == "__main__":
    extractor = SmmExtractor()
    extractor.start()
65 changes: 65 additions & 0 deletions autophrase_extractor/extractor_info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
{
"@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld",
"name": "smm.automated.phrase.mining",
"version": "0.1.0",
"description": "As one of the fundamental tasks in text analysis, phrase mining aims at extracting quality phrases from a text corpus. Phrase mining is important in various tasks such as information extraction/retrieval, taxonomy construction, and topic modeling.",
"author": "Wang, Chen <[email protected]>",
"contributors": [],
"contexts": [
{}
],
"repository": [
{
"repType": "git",
"repUrl": "https://github.com/ncsa/standalone-smm-analytics.git"
},
{
"repType": "git",
"repUrl": "https://github.com/clowder-framework/smm-extractor.git"
}
],
"process": {
"file": [
"manual"
]
},
"external_services": [],
"dependencies": [],
"bibtex": [],
"parameters": {
"schema": {
"column": {
"type": "string",
"title": "Text Column Header",
"default": "text"
},
"minSup": {
"type": "number",
"title": "Minimum Support",
"default": 3
},
"algorithm": {
"type": "string",
"title": "Automated Phrase Mining Algorithm",
"enum": [
"AutoPhrase (Automated Phrase Mining from Massive Text Corpora)"
],
"default": "AutoPhrase (Automated Phrase Mining from Massive Text Corpora)"
}
},
"form": [
{
"key": "column",
"type": "text"
},
{
"key": "minSup",
"type": "text"
},
{
"key": "algorithm",
"type": "select"
}
]
}
}
1 change: 1 addition & 0 deletions autophrase_extractor/requirement.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pyclowder>=3.0.7
2 changes: 1 addition & 1 deletion name_entity_recognition_extractor/requirement.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
pyclowder==3.0.7
pyclowder>=3.0.7
2 changes: 1 addition & 1 deletion network_analysis_extractor/requirement.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
pyclowder==3.0.7
pyclowder>=3.0.7
2 changes: 1 addition & 1 deletion preprocessing_extractor/requirement.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
pyclowder==3.0.7
pyclowder>=3.0.7
2 changes: 1 addition & 1 deletion sentiment_analysis_extractor/requirement.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
pyclowder==3.0.7
pyclowder>=3.0.7
2 changes: 1 addition & 1 deletion topic_modeling_extractor/requirement.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
pyclowder==3.0.7
pyclowder>=3.0.7