clowder-framework · ddey2 · Oct 10, 2023
diff --git a/sample-extractors/word-cloud-extractor/Dockerfile b/sample-extractors/word-cloud-extractor/Dockerfile
@@ -0,0 +1,8 @@
+FROM python:3.8
+# Install dependencies before copying the sources so this layer is reused when only code changes.
+WORKDIR /extractor
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the extractor last; the exec-form CMD below runs python directly instead of under /bin/sh -c.
+COPY word-cloud-extractor.py extractor_info.json ./
+CMD ["python", "word-cloud-extractor.py"]
diff --git a/sample-extractors/word-cloud-extractor/README.md b/sample-extractors/word-cloud-extractor/README.md
@@ -0,0 +1,81 @@
+A sample extractor that uses pyclowder to build a Vega word-cloud visualization from a two-column (term, count) CSV file.
+
+# Docker
+
+This extractor is ready to be run as a Docker container; the only dependency is a running Clowder instance. Simply build and run.
+
+1. Start Clowder V2. For help starting Clowder V2, see our [getting started guide](https://github.com/clowder-framework/clowder2/blob/main/README.md).
+
+2. First build the extractor Docker container:
+
+```
+# from this directory, run:
+
+docker build -t test-file-extractor .
+```
+
+3. Finally run the extractor:
+
+```
+docker run -t -i --rm --net clowder_clowder -e "RABBITMQ_URI=amqp://guest:guest@rabbitmq:5672/%2f" --name "test-file-extractor" test-file-extractor
+```
+
+Then open the Clowder web app and run the word-cloud extractor on a two-column .csv file (term, count)! Done.
+
+### Python and Docker details
+
+You may use any version of Python 3. Simply edit the first line of the `Dockerfile`; by default it uses `FROM python:3.8`.
+
+Docker flags:
+
+- `--net` links the extractor to the Clowder Docker network (run `docker network ls` to identify your own.)
+- `-e RABBITMQ_URI=` sets the environment variable that controls which RabbitMQ server the extractor binds to. Setting `RABBITMQ_EXCHANGE` to choose the exchange may also help.
+  - You can also use `--link` to link the extractor to a RabbitMQ container.
+- `--name` assigns the container a name visible in Docker Desktop.
+
+## Troubleshooting
+
+**If you run into _any_ trouble**, please reach out on our Clowder Slack in the [#pyclowder channel](https://clowder-software.slack.com/archives/CNC2UVBCP).
+
+Alternate methods of running extractors are below.
+
+# Commandline Execution
+
+To execute the extractor from the command line, you will need to have the required packages installed. It is highly recommended to use a Python virtual environment for this. You will need to create a virtual environment first, then activate it, and finally install all required packages.
+
+```
+  Step 1 - Start clowder docker-compose 
+  Step 2 - Starting heartbeat listener 
+          virtualenv clowder2-python (try pipenv)
+          source clowder2-python/bin/activate
+  Step 3 - Run heartbeat_listener_sync.py to register the new extractor (this step will likely not be needed in the future)
+            cd ~/Git/clowder2/backend
+	       pip install email_validator
+        copy heartbeat_listener_sync.py to /backend from /backend/app/rabbitmq
+	    python heartbeat_listener_sync.py
+
+  Step 4 - Installing pyclowder branch & running extractor
+	    source ~/clowder2-python/bin/activate
+	    pip uninstall pyclowder
+
+	    # the pyclowder Git repo should have Todd's branch activated (50-clowder20-submit-file-to-extractor)
+	    pip install -e ~/Git/pyclowder
+
+	    cd ~/Git/pyclowder/sample-extractors/word-cloud-extractor
+	    export CLOWDER_VERSION=2
+	    export CLOWDER_URL=http://localhost:8000/
+
+	    python word-cloud-extractor.py
+
+
+  Step 5 - Post a particular file ID (a CSV file) to the new extractor
+    POST http://localhost:3002/api/v2/files/639b31754241665a4fc3e513/extract?extractorName=ncsa.word-cloud-extractor
+
+    Or,
+    Go to Clowder UI and submit a file for extraction
+```
+
+# Run the extractor from Pycharm
+  You can run heartbeat_listener_sync.py and word-cloud-extractor.py from PyCharm.
+  Create a pipenv (PyCharm generally prompts you to create one when you first open the file). To run word-cloud-extractor.py,
+  add 'CLOWDER_VERSION=2' to the environment variables in the run configuration.
diff --git a/sample-extractors/word-cloud-extractor/extractor_info.json b/sample-extractors/word-cloud-extractor/extractor_info.json
@@ -0,0 +1,27 @@
+{
+  "@context": "https://vega.github.io/vega/examples/word-cloud/",
+  "name": "ncsa.word-cloud-extractor",
+  "version": "2.0",
+  "description": "Word cloud extractor using Vega. Visualizes a word cloud from a CSV file with two columns (term, count).",
+  "author": "Dipannita Dey <[email protected]>",
+  "contributors": [],
+  "contexts": [
+    {
+      "url" : "https://vega.github.io/vega/examples/word-cloud/"
+    }
+  ],
+  "repository": [
+    {
+      "repType": "git",
+      "repUrl": "https://opensource.ncsa.illinois.edu/stash/scm/cats/pyclowder.git"
+    }
+  ],
+  "process": {
+    "file": [
+      "csv"
+    ]
+  },
+  "external_services": [],
+  "dependencies": [],
+  "bibtex": []
+}
diff --git a/sample-extractors/word-cloud-extractor/requirements.txt b/sample-extractors/word-cloud-extractor/requirements.txt
@@ -0,0 +1 @@
+pyclowder==3.0.2
diff --git a/sample-extractors/word-cloud-extractor/word-cloud-extractor.py b/sample-extractors/word-cloud-extractor/word-cloud-extractor.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python
+
+"""Example extractor based on the clowder code."""
+import csv
+import logging
+import json
+
+from pyclowder.extractors import Extractor
+import pyclowder.files
+
+
+class WordCloudExtractor(Extractor):
+    """Test the functionalities of an extractor."""
+    def __init__(self):
+        Extractor.__init__(self)
+
+        # add any additional arguments to parser
+        # self.parser.add_argument('--max', '-m', type=int, nargs='?', default=-1,
+        #                          help='maximum number (default=-1)')
+
+        # parse command line and load default logging configuration
+        self.setup()
+
+        # setup logging for the exctractor
+        logging.getLogger('pyclowder').setLevel(logging.DEBUG)
+        logging.getLogger('__main__').setLevel(logging.DEBUG)
+
+    def process_message(self, connector, host, secret_key, resource, parameters):
+        # Process the file and upload the results
+
+        logger = logging.getLogger(__name__)
+        file_id = resource['id']
+        file_path = resource['local_paths'][0]
+
+        # Initialize a list to store the JSON objects
+        json_objects = []
+
+        with open(file_path, 'r') as file:
+            # Create a CSV reader object
+            csv_reader = csv.reader(file)
+
+            # Skip the header row if it exists
+            next(csv_reader, None)
+
+            # Iterate through the rows in the CSV file
+            for row_index, row in enumerate(csv_reader):
+                if row_index >= 1000:
+                    break
+
+                # Create a dictionary with 'terms' and 'count' as keys
+                data = {
+                    'text': row[0],  # Assuming the term is in the first column
+                    'count': int(row[1])  # Assuming the count is in the second column
+                }
+
+                # Append the dictionary to the list
+                json_objects.append(data)
+
+        # Convert the list of dictionaries to a JSON object
+        json_data = json.dumps(json_objects, indent=2)
+
+        # Print the JSON object
+        #print(json_data)
+
+        spec = {
+          "$schema": "https://vega.github.io/schema/vega/v5.json",
+          "description": "A word cloud visualization depicting Vega research paper abstracts.",
+          "width": 350,
+          "height": 400,
+          "padding": 0,
+          "data": [
+            {
+              "name": "table",
+              "values": json_data,
+              "transform": [
+                {
+                  "type": "formula",
+                  "as": "angle",
+                  "expr": "[-45, 0, 45][~~(random() * 3)]"
+                },
+                {
+                  "type": "formula",
+                  "as": "text2",
+                  "expr": "[datum.text]"
+                },
+                {
+                  "type": "formula",
+                  "as": "weight",
+                  "expr": "if(datum.text=='VEGA', 600, 300)"
+                },
+                {
+                  "type": "wordcloud",
+                  "size": [350, 400],
+                  "text": {"field": "text2"},
+                  "rotate": {"field": "angle"},
+                  "font": "Helvetica Neue, Arial",
+                  "fontSize": {"field": "count"},
+                  "fontWeight": {"field": "weight"},
+                  "fontSizeRange": [12, 56],
+                  "padding": 2
+                }
+              ]
+            }
+          ],
+          "scales": [
+            {
+              "name": "color",
+              "type": "ordinal",
+              "domain": {"data": "table", "field": "text"},
+              "range": ["#d5a928", "#652c90", "#939597"]
+            }
+          ],
+          "marks": [
+            {
+              "type": "text",
+              "from": {"data": "table"},
+              "encode": {
+                "enter": {
+                  "text": {"field": "text2"},
+                  "align": {"value": "center"},
+                  "baseline": {"value": "alphabetic"},
+                  "fill": {"scale": "color", "field": "text"}
+                },
+                "update": {
+                  "x": {"field": "x"},
+                  "y": {"field": "y"},
+                  "angle": {"field": "angle"},
+                  "fontSize": {"field": "fontSize"},
+                  "fillOpacity": {"value": 1}
+                },
+                "hover": {"fillOpacity": {"value": 0.5}}
+              }
+            },
+
+          ]
+        }
+
+        # Define the path to the text file where you want to save the JSON
+        output_file_path = 'spec.json'
+
+        # Write the formatted JSON data to the text file
+        with open(output_file_path, 'w') as file:
+            file.write(json.dumps(spec, indent=2))
+
+        pyclowder.files.upload_preview(connector, host, secret_key, file_id, output_file_path, "application/json",
+                                       "spec.json",
+                                       visualization_name="word-cloud-extractor",
+                                       visualization_component_id="word-cloud")
+
+
+
+
+
+if __name__ == "__main__":
+    extractor = WordCloudExtractor()
+    extractor.start()