diff --git a/name_entity_recognition_extractor/CHANGELOG.md b/name_entity_recognition_extractor/CHANGELOG.md index 332c0e2..83fb175 100644 --- a/name_entity_recognition_extractor/CHANGELOG.md +++ b/name_entity_recognition_extractor/CHANGELOG.md @@ -4,13 +4,20 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [0.1.0] - 03-15-2023 -### Added -- Initial release of the name entity recognition extractor +## [0.1.2] - 10-11-2023 + +### Added +- Organize output data to folder [#4](https://github.com/clowder-framework/smm-extractor/issues/4) ## [0.1.1] - 10-03-2023 ### Changed - Support Clowder V2 [#1](https://github.com/clowder-framework/smm-extractor/issues/1) + + +## [0.1.0] - 03-15-2023 + +### Added +- Initial release of the name entity recognition extractor diff --git a/name_entity_recognition_extractor/SmmExtractor.py b/name_entity_recognition_extractor/SmmExtractor.py index d79adf2..0af20da 100644 --- a/name_entity_recognition_extractor/SmmExtractor.py +++ b/name_entity_recognition_extractor/SmmExtractor.py @@ -1,19 +1,22 @@ #!/usr/bin/env python """Example extractor based on the clowder code.""" +import posixpath + import pandas as pd import json import os import csv import types import pickle +from datetime import datetime import logging from pyclowder.extractors import Extractor import pyclowder.files from algorithm import algorithm - +import requests def save_local_output(localSavePath, fname, output_data): """ @@ -78,6 +81,21 @@ def save_local_output(localSavePath, fname, output_data): return os.path.join(localSavePath, fname) +# TODO wrap this into method on pyclowder +def create_output_folder(dataset_id, host, secret_key): + url = posixpath.join(host, f'api/v2/datasets/{dataset_id}/folders') + headers = {"Content-Type": "application/json", + "X-API-KEY": secret_key} + current_timestamp = datetime.now().strftime("%Y%m%d%H%M%S") + folder_data = {"name": current_timestamp} + response = requests.post(url, json=folder_data, headers=headers) + if response.status_code == 200: + return response.json().get("id") + else: + print(f"Error creating folder: {response.status_code} {response.text}") + return None + + class SmmExtractor(Extractor): """Count the number of characters, words and lines in a text file.""" def __init__(self): @@ -107,13 +125,22 @@ def process_message(self, connector, host, secret_key, resource, parameters): output = algorithm(df, userParams) connector.message_process(resource, "Running the algorithm...") - # upload object to s3 bucket and return the url + # Create folder to save output + clowder_version = int(os.getenv('CLOWDER_VERSION', '1')) + if clowder_version == 2: + connector.message_process(resource, "Creating output folder...") + folder_id = create_output_folder(dataset_id, host, secret_key) + if folder_id is not None: + connector.message_process(resource, f"folder id: {folder_id} created ...") + else: + folder_id = None for fname, output_data in output.items(): if fname != 'uid': local_output_path = save_local_output("", fname, output_data) connector.message_process(resource, "Saving " + local_output_path + "...") uploaded_file_id = pyclowder.files.upload_to_dataset(connector, host, secret_key, dataset_id, - local_output_path) + local_output_path, + folder_id=folder_id) connector.message_process(resource, local_output_path + " saved...") connector.message_process(resource, "Writing metadata...") diff --git a/name_entity_recognition_extractor/extractor_info.json b/name_entity_recognition_extractor/extractor_info.json index b700c56..ddb3e0b 100644 --- a/name_entity_recognition_extractor/extractor_info.json +++ b/name_entity_recognition_extractor/extractor_info.json @@ -1,7 +1,7 @@ { "@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld", "name": "smm.name.entity.recognition", - "version": "0.1.1", + "version": "0.1.2", "description": "Named-entity recognition (NER) (also known as entity identification, entity chunking and entity extraction) is a subtask of information extraction that seeks to locate and classify named entity mentions in unstructured text into pre-defined categories such as the person names, organizations, locations, medical codes, time expressions, quantities, monetary values, percentages, etc.", "author": "Wang, Chen ", "contributors": [], diff --git a/name_entity_recognition_extractor/requirement.txt b/name_entity_recognition_extractor/requirement.txt index 74e79f5..7c54c75 100644 --- a/name_entity_recognition_extractor/requirement.txt +++ b/name_entity_recognition_extractor/requirement.txt @@ -1,2 +1 @@ -pyclowder==3.0.4 - +pyclowder==3.0.7 diff --git a/network_analysis_extractor/CHANGELOG.md b/network_analysis_extractor/CHANGELOG.md index 2136e0f..399cccc 100644 --- a/network_analysis_extractor/CHANGELOG.md +++ b/network_analysis_extractor/CHANGELOG.md @@ -4,13 +4,20 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [0.1.0] - 03-15-2023 -### Added -- Initial release of the network analysis extractor +## [0.1.2] - 10-11-2023 + +### Added +- Organize output data to folder [#4](https://github.com/clowder-framework/smm-extractor/issues/4) ## [0.1.1] - 10-03-2023 ### Changed - Support Clowder V2 [#1](https://github.com/clowder-framework/smm-extractor/issues/1) + + +## [0.1.0] - 03-15-2023 + +### Added +- Initial release of the network analysis extractor diff --git a/network_analysis_extractor/SmmExtractor.py b/network_analysis_extractor/SmmExtractor.py index d79adf2..0af20da 100644 --- a/network_analysis_extractor/SmmExtractor.py +++ b/network_analysis_extractor/SmmExtractor.py @@ -1,19 +1,22 @@ #!/usr/bin/env python """Example extractor based on the clowder code.""" +import posixpath + import pandas as pd import json import os import csv import types import pickle +from datetime import datetime import logging from pyclowder.extractors import Extractor import pyclowder.files from algorithm import algorithm - +import requests def save_local_output(localSavePath, fname, output_data): """ @@ -78,6 +81,21 @@ def save_local_output(localSavePath, fname, output_data): return os.path.join(localSavePath, fname) +# TODO wrap this into method on pyclowder +def create_output_folder(dataset_id, host, secret_key): + url = posixpath.join(host, f'api/v2/datasets/{dataset_id}/folders') + headers = {"Content-Type": "application/json", + "X-API-KEY": secret_key} + current_timestamp = datetime.now().strftime("%Y%m%d%H%M%S") + folder_data = {"name": current_timestamp} + response = requests.post(url, json=folder_data, headers=headers) + if response.status_code == 200: + return response.json().get("id") + else: + print(f"Error creating folder: {response.status_code} {response.text}") + return None + + class SmmExtractor(Extractor): """Count the number of characters, words and lines in a text file.""" def __init__(self): @@ -107,13 +125,22 @@ def process_message(self, connector, host, secret_key, resource, parameters): output = algorithm(df, userParams) connector.message_process(resource, "Running the algorithm...") - # upload object to s3 bucket and return the url + # Create folder to save output + clowder_version = int(os.getenv('CLOWDER_VERSION', '1')) + if clowder_version == 2: + connector.message_process(resource, "Creating output folder...") + folder_id = create_output_folder(dataset_id, host, secret_key) + if folder_id is not None: + connector.message_process(resource, f"folder id: {folder_id} created ...") + else: + folder_id = None for fname, output_data in output.items(): if fname != 'uid': local_output_path = save_local_output("", fname, output_data) connector.message_process(resource, "Saving " + local_output_path + "...") uploaded_file_id = pyclowder.files.upload_to_dataset(connector, host, secret_key, dataset_id, - local_output_path) + local_output_path, + folder_id=folder_id) connector.message_process(resource, local_output_path + " saved...") connector.message_process(resource, "Writing metadata...") diff --git a/network_analysis_extractor/extractor_info.json b/network_analysis_extractor/extractor_info.json index ff015dc..953354a 100644 --- a/network_analysis_extractor/extractor_info.json +++ b/network_analysis_extractor/extractor_info.json @@ -1,7 +1,7 @@ { "@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld", "name": "smm.network.analysis", - "version": "0.1.1", + "version": "0.1.2", "description": "Social network analysis is the process of investigating social structures through the use of networks and graph theory .It characterizes networked structures in terms of nodes (individual actors, people, or things within the network) and the ties, edges, or links (relationships or interactions) that connect them.", "author": "Wang, Chen ", "contributors": [], diff --git a/network_analysis_extractor/requirement.txt b/network_analysis_extractor/requirement.txt index d5f92cd..7c54c75 100644 --- a/network_analysis_extractor/requirement.txt +++ b/network_analysis_extractor/requirement.txt @@ -1 +1 @@ -pyclowder==3.0.4 +pyclowder==3.0.7 diff --git a/preprocessing_extractor/CHANGELOG.md b/preprocessing_extractor/CHANGELOG.md index bb99d11..2b32d99 100644 --- a/preprocessing_extractor/CHANGELOG.md +++ b/preprocessing_extractor/CHANGELOG.md @@ -4,13 +4,22 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [0.1.0] - 03-15-2023 -### Added -- Initial release of the preprocessing extractor +## [0.1.2] - 10-11-2023 + +### Added +- Organize output data to folder [#4](https://github.com/clowder-framework/smm-extractor/issues/4) ## [0.1.1] - 10-03-2023 ### Changed - Support Clowder V2 [#1](https://github.com/clowder-framework/smm-extractor/issues/1) + + +## [0.1.0] - 03-15-2023 + +### Added +- Initial release of the preprocessing extractor + + diff --git a/preprocessing_extractor/SmmExtractor.py b/preprocessing_extractor/SmmExtractor.py index d79adf2..0af20da 100644 --- a/preprocessing_extractor/SmmExtractor.py +++ b/preprocessing_extractor/SmmExtractor.py @@ -1,19 +1,22 @@ #!/usr/bin/env python """Example extractor based on the clowder code.""" +import posixpath + import pandas as pd import json import os import csv import types import pickle +from datetime import datetime import logging from pyclowder.extractors import Extractor import pyclowder.files from algorithm import algorithm - +import requests def save_local_output(localSavePath, fname, output_data): """ @@ -78,6 +81,21 @@ def save_local_output(localSavePath, fname, output_data): return os.path.join(localSavePath, fname) +# TODO wrap this into method on pyclowder +def create_output_folder(dataset_id, host, secret_key): + url = posixpath.join(host, f'api/v2/datasets/{dataset_id}/folders') + headers = {"Content-Type": "application/json", + "X-API-KEY": secret_key} + current_timestamp = datetime.now().strftime("%Y%m%d%H%M%S") + folder_data = {"name": current_timestamp} + response = requests.post(url, json=folder_data, headers=headers) + if response.status_code == 200: + return response.json().get("id") + else: + print(f"Error creating folder: {response.status_code} {response.text}") + return None + + class SmmExtractor(Extractor): """Count the number of characters, words and lines in a text file.""" def __init__(self): @@ -107,13 +125,22 @@ def process_message(self, connector, host, secret_key, resource, parameters): output = algorithm(df, userParams) connector.message_process(resource, "Running the algorithm...") - # upload object to s3 bucket and return the url + # Create folder to save output + clowder_version = int(os.getenv('CLOWDER_VERSION', '1')) + if clowder_version == 2: + connector.message_process(resource, "Creating output folder...") + folder_id = create_output_folder(dataset_id, host, secret_key) + if folder_id is not None: + connector.message_process(resource, f"folder id: {folder_id} created ...") + else: + folder_id = None for fname, output_data in output.items(): if fname != 'uid': local_output_path = save_local_output("", fname, output_data) connector.message_process(resource, "Saving " + local_output_path + "...") uploaded_file_id = pyclowder.files.upload_to_dataset(connector, host, secret_key, dataset_id, - local_output_path) + local_output_path, + folder_id=folder_id) connector.message_process(resource, local_output_path + " saved...") connector.message_process(resource, "Writing metadata...") diff --git a/preprocessing_extractor/extractor_info.json b/preprocessing_extractor/extractor_info.json index fd3e33b..d64e8cc 100644 --- a/preprocessing_extractor/extractor_info.json +++ b/preprocessing_extractor/extractor_info.json @@ -1,7 +1,7 @@ { "@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld", "name": "smm.preprocessing.analysis", - "version": "0.1.1", + "version": "0.1.2", "description": "Tokenization is the process of dividing written text into meaningful units, such as words, sentences , or topics. Lemmatization and Stemming reduces word forms to common base words. Part-of-speech Tagging is the process of marking up a word in a text (corpus) as corresponding to a particular part of speech, based on both its definition and its context.", "author": "Wang, Chen ", "contributors": [], diff --git a/preprocessing_extractor/requirement.txt b/preprocessing_extractor/requirement.txt index d5f92cd..7c54c75 100644 --- a/preprocessing_extractor/requirement.txt +++ b/preprocessing_extractor/requirement.txt @@ -1 +1 @@ -pyclowder==3.0.4 +pyclowder==3.0.7 diff --git a/requirement.txt b/requirement.txt deleted file mode 100644 index d5f92cd..0000000 --- a/requirement.txt +++ /dev/null @@ -1 +0,0 @@ -pyclowder==3.0.4 diff --git a/sentiment_analysis_extractor/CHANGELOG.md b/sentiment_analysis_extractor/CHANGELOG.md index 6e5dad4..ce6a7ee 100644 --- a/sentiment_analysis_extractor/CHANGELOG.md +++ b/sentiment_analysis_extractor/CHANGELOG.md @@ -4,13 +4,20 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [0.1.0] - 03-15-2023 -### Added -- Initial release of the sentiment analysis extractor +## [0.1.2] - 10-11-2023 + +### Added +- Organize output data to folder [#4](https://github.com/clowder-framework/smm-extractor/issues/4) ## [0.1.1] - 10-03-2023 ### Changed - Support Clowder V2 [#1](https://github.com/clowder-framework/smm-extractor/issues/1) + + +## [0.1.0] - 03-15-2023 + +### Added +- Initial release of the sentiment analysis extractor diff --git a/sentiment_analysis_extractor/SmmExtractor.py b/sentiment_analysis_extractor/SmmExtractor.py index d79adf2..0af20da 100644 --- a/sentiment_analysis_extractor/SmmExtractor.py +++ b/sentiment_analysis_extractor/SmmExtractor.py @@ -1,19 +1,22 @@ #!/usr/bin/env python """Example extractor based on the clowder code.""" +import posixpath + import pandas as pd import json import os import csv import types import pickle +from datetime import datetime import logging from pyclowder.extractors import Extractor import pyclowder.files from algorithm import algorithm - +import requests def save_local_output(localSavePath, fname, output_data): """ @@ -78,6 +81,21 @@ def save_local_output(localSavePath, fname, output_data): return os.path.join(localSavePath, fname) +# TODO wrap this into method on pyclowder +def create_output_folder(dataset_id, host, secret_key): + url = posixpath.join(host, f'api/v2/datasets/{dataset_id}/folders') + headers = {"Content-Type": "application/json", + "X-API-KEY": secret_key} + current_timestamp = datetime.now().strftime("%Y%m%d%H%M%S") + folder_data = {"name": current_timestamp} + response = requests.post(url, json=folder_data, headers=headers) + if response.status_code == 200: + return response.json().get("id") + else: + print(f"Error creating folder: {response.status_code} {response.text}") + return None + + class SmmExtractor(Extractor): """Count the number of characters, words and lines in a text file.""" def __init__(self): @@ -107,13 +125,22 @@ def process_message(self, connector, host, secret_key, resource, parameters): output = algorithm(df, userParams) connector.message_process(resource, "Running the algorithm...") - # upload object to s3 bucket and return the url + # Create folder to save output + clowder_version = int(os.getenv('CLOWDER_VERSION', '1')) + if clowder_version == 2: + connector.message_process(resource, "Creating output folder...") + folder_id = create_output_folder(dataset_id, host, secret_key) + if folder_id is not None: + connector.message_process(resource, f"folder id: {folder_id} created ...") + else: + folder_id = None for fname, output_data in output.items(): if fname != 'uid': local_output_path = save_local_output("", fname, output_data) connector.message_process(resource, "Saving " + local_output_path + "...") uploaded_file_id = pyclowder.files.upload_to_dataset(connector, host, secret_key, dataset_id, - local_output_path) + local_output_path, + folder_id=folder_id) connector.message_process(resource, local_output_path + " saved...") connector.message_process(resource, "Writing metadata...") diff --git a/sentiment_analysis_extractor/extractor_info.json b/sentiment_analysis_extractor/extractor_info.json index 174a1fd..e2eeef5 100644 --- a/sentiment_analysis_extractor/extractor_info.json +++ b/sentiment_analysis_extractor/extractor_info.json @@ -1,7 +1,7 @@ { "@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld", "name": "smm.sentiment.analysis", - "version": "0.1.1", + "version": "0.1.2", "description": "Sentiment analysis (sometimes known as opinion mining or emotion AI) refers to the use of natural language processing, text analysis, computational linguistics, and biometrics to systematically identify, extract, quantify, and study affective states and subjective information.", "author": "Wang, Chen ", "contributors": [], diff --git a/sentiment_analysis_extractor/requirement.txt b/sentiment_analysis_extractor/requirement.txt index d5f92cd..7c54c75 100644 --- a/sentiment_analysis_extractor/requirement.txt +++ b/sentiment_analysis_extractor/requirement.txt @@ -1 +1 @@ -pyclowder==3.0.4 +pyclowder==3.0.7 diff --git a/topic_modeling_extractor/CHANGELOG.md b/topic_modeling_extractor/CHANGELOG.md index 0bccccd..dc9ab71 100644 --- a/topic_modeling_extractor/CHANGELOG.md +++ b/topic_modeling_extractor/CHANGELOG.md @@ -4,13 +4,21 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [0.1.0] - 03-15-2023 -### Added -- Initial release of the topic modeling extractor +## [0.1.2] - 10-11-2023 + +### Added +- Organize output data to folder [#4](https://github.com/clowder-framework/smm-extractor/issues/4) ## [0.1.1] - 10-03-2023 ### Changed - Support Clowder V2 [#1](https://github.com/clowder-framework/smm-extractor/issues/1) + + +## [0.1.0] - 03-15-2023 + +### Added +- Initial release of the topic modeling extractor + diff --git a/topic_modeling_extractor/SmmExtractor.py b/topic_modeling_extractor/SmmExtractor.py index d79adf2..0af20da 100644 --- a/topic_modeling_extractor/SmmExtractor.py +++ b/topic_modeling_extractor/SmmExtractor.py @@ -1,19 +1,22 @@ #!/usr/bin/env python """Example extractor based on the clowder code.""" +import posixpath + import pandas as pd import json import os import csv import types import pickle +from datetime import datetime import logging from pyclowder.extractors import Extractor import pyclowder.files from algorithm import algorithm - +import requests def save_local_output(localSavePath, fname, output_data): """ @@ -78,6 +81,21 @@ def save_local_output(localSavePath, fname, output_data): return os.path.join(localSavePath, fname) +# TODO wrap this into method on pyclowder +def create_output_folder(dataset_id, host, secret_key): + url = posixpath.join(host, f'api/v2/datasets/{dataset_id}/folders') + headers = {"Content-Type": "application/json", + "X-API-KEY": secret_key} + current_timestamp = datetime.now().strftime("%Y%m%d%H%M%S") + folder_data = {"name": current_timestamp} + response = requests.post(url, json=folder_data, headers=headers) + if response.status_code == 200: + return response.json().get("id") + else: + print(f"Error creating folder: {response.status_code} {response.text}") + return None + + class SmmExtractor(Extractor): """Count the number of characters, words and lines in a text file.""" def __init__(self): @@ -107,13 +125,22 @@ def process_message(self, connector, host, secret_key, resource, parameters): output = algorithm(df, userParams) connector.message_process(resource, "Running the algorithm...") - # upload object to s3 bucket and return the url + # Create folder to save output + clowder_version = int(os.getenv('CLOWDER_VERSION', '1')) + if clowder_version == 2: + connector.message_process(resource, "Creating output folder...") + folder_id = create_output_folder(dataset_id, host, secret_key) + if folder_id is not None: + connector.message_process(resource, f"folder id: {folder_id} created ...") + else: + folder_id = None for fname, output_data in output.items(): if fname != 'uid': local_output_path = save_local_output("", fname, output_data) connector.message_process(resource, "Saving " + local_output_path + "...") uploaded_file_id = pyclowder.files.upload_to_dataset(connector, host, secret_key, dataset_id, - local_output_path) + local_output_path, + folder_id=folder_id) connector.message_process(resource, local_output_path + " saved...") connector.message_process(resource, "Writing metadata...") diff --git a/topic_modeling_extractor/extractor_info.json b/topic_modeling_extractor/extractor_info.json index 40532d0..8bb206c 100644 --- a/topic_modeling_extractor/extractor_info.json +++ b/topic_modeling_extractor/extractor_info.json @@ -1,7 +1,7 @@ { "@context": "http://clowder.ncsa.illinois.edu/contexts/extractors.jsonld", "name": "smm.topic.modeling", - "version": "0.1.1", + "version": "0.1.2", "description": "One of the primary applications of natural language processing is to automatically extract what topics people are discussing from large volumes of text. Topic modeling is a type of statistical modeling for discovering the abstract topics that occur in a collection of documents. Latent Dirichlet Allocation (LDA) is an example of topic model and is used to classify text in a document to a particular topic. It builds a topic per document model and words per topic model, modeled as Dirichlet distributions.", "author": "Wang, Chen ", "contributors": [], diff --git a/topic_modeling_extractor/requirement.txt b/topic_modeling_extractor/requirement.txt index d5f92cd..7c54c75 100644 --- a/topic_modeling_extractor/requirement.txt +++ b/topic_modeling_extractor/requirement.txt @@ -1 +1 @@ -pyclowder==3.0.4 +pyclowder==3.0.7