fix markdowns

jonathanwvd · Aug 16, 2024 · c0cd21e · c0cd21e
1 parent 4f81d61
commit c0cd21e
Show file tree

Hide file tree

Showing 28 changed files with 731 additions and 94 deletions.
diff --git a/README.md b/README.md
diff --git a/generate_documentation.py b/generate_documentation.py
@@ -1,5 +1,61 @@
 import json
 import os
+import pandas as pd
+
+
+def update_readme_with_data(json_folder, readme_file, md_folder_path):
+    """Regenerate the dataset table between the README's TABLE markers.
+
+    Every ``*.json`` file in ``json_folder`` (except ``template.json`` and
+    ``datasets.json``) becomes one row linking to ``md_folder_path/<slug>.md``.
+    The text between ``<!-- TABLE_START -->`` and ``<!-- TABLE_END -->`` in
+    ``readme_file`` is replaced; both markers are kept.
+
+    Raises:
+        ValueError: if no dataset file is found, or the markers are missing
+            or out of order (the README is left untouched in both cases).
+    """
+    skipped = ('template.json', 'datasets.json')
+    datasets = []
+    # Sorted so the table order is stable; os.listdir order is arbitrary.
+    for filename in sorted(os.listdir(json_folder)):
+        if not filename.endswith('.json') or filename.lower() in skipped:
+            continue
+        with open(os.path.join(json_folder, filename), 'r', encoding='utf-8') as jsonfile:
+            data = json.load(jsonfile)
+        # Slug derivation is kept as-is so links keep pointing at the existing page names.
+        slug = filename.replace('.json', '').replace(' ', '_').replace('.', '_').lower()
+        # Prefer the real name: title-casing the slug mangles e.g. "PM2.5" into "Pm2 5".
+        label = data.get('Name') or slug.replace('_', ' ').title()
+        datasets.append({
+            'Dataset Name': f"[{label}]({md_folder_path}/{slug}.md)",
+            'Labeled': data.get('Labeled', ''),
+            'Time Series': data.get('Time Series', ''),
+            'Simulation': data.get('Simulation', ''),
+            'Additional Tags': '; '.join(data.get('Additional Tags') or []),
+        })
+    if not datasets:
+        raise ValueError(f"No dataset JSON files found in {json_folder}.")
+
+    # Null JSON values are replaced by empty strings before rendering.
+    markdown_table = pd.DataFrame(datasets).fillna('').to_markdown(index=False)
+
+    with open(readme_file, 'r', encoding='utf-8') as file:
+        content = file.read()
+    table_start = content.find("<!-- TABLE_START -->")
+    table_end = content.find("<!-- TABLE_END -->")
+    # Also rejects an END marker placed before START, which would duplicate README text.
+    if table_start == -1 or table_end < table_start:
+        raise ValueError("Markers <!-- TABLE_START --> / <!-- TABLE_END --> are missing or out of order in the README file.")
+
+    # content[table_end:] begins with the END marker, so the marker is preserved.
+    updated_content = content[:table_start] + "<!-- TABLE_START -->\n" + markdown_table + "\n" + content[table_end:]
+    with open(readme_file, 'w', encoding='utf-8') as file:
+        file.write(updated_content)
+
+    print("Updated README with the new Markdown table.")
+
+
 
 def generate_json_data(json_folder):
     datasets = []
@@ -62,28 +118,48 @@ def inject_json_to_html(json_data, html_file):
 def json_to_markdown(json_path, md_path):
     with open(json_path, 'r', encoding='utf-8') as file:
         data = json.load(file)
-
+
+    # Start building the Markdown content
     markdown_content = f"# {data['Name']}\n\n"
-    if 'table' in data:
-        markdown_content += "| Parameter | Value |\n"
-        markdown_content += "| --- | --- |\n"
-        for item in data['table']:
-            markdown_content += f"| {item['Parameter']} | {item['Value']} |\n"
-        markdown_content += "\n"
-
-    for section in data['Sections']:
-        markdown_content += f"## {section['Title']}\n{section['Content']}\n\n"
-
+    markdown_content += f"**Summary:** {data.get('Summary', '')}\n\n"
+
+    # Add the table with the relevant information
+    markdown_content += "| Parameter | Value |\n"
+    markdown_content += "| --- | --- |\n"
+    for key in ['Name', 'Labeled', 'Time Series', 'Simulation', 'Missing Values', 'Dataset Characteristics', 'Feature Type', 'Associated Tasks', 'Number of Instances', 'Number of Features', 'Date Donated', 'Source']:
+        if key in data:
+            markdown_content += f"| **{key}** | {data[key]} |\n"
+
+    markdown_content += "\n"
+
+    # Add sections
+    if 'Sections' in data:
+        for section in data['Sections']:
+            markdown_content += f"## {section['Title']}\n\n"
+            markdown_content += f"{section['Content']}\n\n"
+
+    # Add tags section
+    if 'Additional Tags' in data and data['Additional Tags']:
+        markdown_content += "## Tags\n\n"
+        markdown_content += ", ".join(data['Additional Tags']) + "\n\n"
+
+    # Add references
     if 'References' in data:
-        markdown_content += "## References\n"
+        markdown_content += "## References\n\n"
         for ref in data['References']:
             markdown_content += f"- [{ref['Text']}]({ref['Link']})\n"
         markdown_content += "\n"
-
+
+    # Add a link to go back to the main README or index page
+    markdown_content += "[⬅️ Back to Index](../README.md)\n"
+
+    # Write the Markdown content to the file
     with open(md_path, 'w', encoding='utf-8') as file:
         file.write(markdown_content)
+
     print(f"Markdown file created for {os.path.basename(md_path)}")
 
+
 def json_to_html(json_path, html_path):
     with open(json_path, 'r', encoding='utf-8') as file:
         data = json.load(file)
@@ -161,6 +237,8 @@ def json_to_html(json_path, html_path):
 md_folder_path = 'markdown'
 html_folder_path = 'html/pages'
 index_html_file = 'index.html'
+readme_file_path = 'README.md'
+
 os.makedirs(md_folder_path, exist_ok=True)
 os.makedirs(html_folder_path, exist_ok=True)
 
@@ -182,3 +260,6 @@ def json_to_html(json_path, html_path):
             json_to_html(json_path, html_path)
         except Exception as e:
             print(f"Error processing {filename}: {e}")
+
+# Update the README with dataset information directly from JSON files
+update_readme_with_data(json_folder_path, readme_file_path, md_folder_path)
diff --git a/index.html b/index.html
@@ -161,6 +161,21 @@ <h1 class="mb-4 text-center">Awesome Industrial Datasets</h1>
             });
         });
     </script>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 <script type="application/json" id="dataset-json">
 [
     {

diff --git a/markdown/3w.md b/markdown/3w.md
@@ -1,13 +1,38 @@
 # 3W
 
+**Summary:** Promotes development of ML algorithms for early detection and classification of undesirable events in offshore oil wells.
+
+| Parameter | Value |
+| --- | --- |
+| **Name** | 3W |
+| **Labeled** | Yes |
+| **Time Series** | Yes |
+| **Simulation** | Both |
+| **Missing Values** | NIA |
+| **Dataset Characteristics** | Multivariate, Time-Series |
+| **Feature Type** | Real |
+| **Associated Tasks** | Regression |
+| **Number of Instances** | N/A |
+| **Number of Features** | N/A |
+| **Date Donated** | 2022-04-06 |
+| **Source** | GitHub |
+
 ## Dataset Information
+
 This is the first repository published by Petrobras on GitHub. It supports the 3W Project, which aims to promote experimentation and development of Machine Learning-based approaches and algorithms for specific problems related to detection and classification of undesirable events that occur in offshore oil wells.
 
 The 3W Project is based on the 3W Dataset, a database described in [this paper](https://doi.org/10.1016/j.petrol.2019.106223), and on the 3W Toolkit, a software package that promotes experimentation with the 3W Dataset for specific problems. The name **3W** was chosen because this dataset is composed of instances from ***3*** different sources and which contain undesirable events that occur in oil ***W***ells.
 
 ## Motivation
+
 Timely detection of undesirable events in oil wells can help prevent production losses, reduce maintenance costs, environmental accidents, and human casualties. Losses related to this type of events can reach 5% of production in certain scenarios, especially in areas such as Flow Assurance and Artificial Lifting Methods. In terms of maintenance, the cost of a maritime probe, required to perform various types of operations, can exceed US $500,000 per day.
 
+## Tags
+
+Oil and Gas, Real events, Fault detection, Multivariate data, Sensor data, Time-series analysis, Oil wells, Machine learning benchmark
+
 ## References
+
 - [GitHub](https://github.com/petrobras/3W/tree/main)
 
+[⬅️ Back to Index](../README.md)
diff --git a/markdown/ai4i_2020_predictive_maintenance_dataset.md b/markdown/ai4i_2020_predictive_maintenance_dataset.md
@@ -1,8 +1,32 @@
 # AI4I 2020 Predictive Maintenance Dataset
 
+**Summary:** The AI4I 2020 Predictive Maintenance Dataset is a synthetic dataset that reflects real predictive maintenance data encountered in industry.
+
+| Parameter | Value |
+| --- | --- |
+| **Name** | AI4I 2020 Predictive Maintenance Dataset |
+| **Labeled** | Yes |
+| **Time Series** | Yes |
+| **Simulation** | Yes |
+| **Missing Values** | No |
+| **Dataset Characteristics** | Multivariate, Time-Series |
+| **Feature Type** | Real |
+| **Associated Tasks** | Classification, Regression, Causal-Discovery |
+| **Number of Instances** | 10000 |
+| **Number of Features** | 6 |
+| **Date Donated** | 2020-08-29 |
+| **Source** | UCI Machine Learning Repository |
+
 ## Dataset Information
+
 Since real predictive maintenance datasets are generally difficult to obtain and in particular difficult to publish, we present and provide a synthetic dataset that reflects real predictive maintenance encountered in industry to the best of our knowledge.
 
+## Tags
+
+Predictive maintenance, Synthetic data, Industry 4.0, Machine failure, Time-series data
+
 ## References
+
 - [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/AI4I+2020+Predictive+Maintenance+Dataset)
 
+[⬅️ Back to Index](../README.md)
diff --git a/markdown/air_quality.md b/markdown/air_quality.md
@@ -1,8 +1,32 @@
 # Air Quality
 
+**Summary:** Contains the responses of a gas multisensor device deployed on the field in an Italian city. Hourly responses averages are recorded along with gas concentrations references from a certified analyzer.
+
+| Parameter | Value |
+| --- | --- |
+| **Name** | Air Quality |
+| **Labeled** | Yes |
+| **Time Series** | Yes |
+| **Simulation** | No |
+| **Missing Values** | Yes |
+| **Dataset Characteristics** | Multivariate, Time-Series |
+| **Feature Type** | Real |
+| **Associated Tasks** | Regression |
+| **Number of Instances** | 9358 |
+| **Number of Features** | 15 |
+| **Date Donated** | 2016-03-22 |
+| **Source** | UCI Machine Learning Repository |
+
 ## Dataset Information
+
 The dataset contains 9358 instances of hourly averaged responses from an array of 5 metal oxide chemical sensors embedded in an Air Quality Chemical Multisensor Device. The device was located on the field in a significantly polluted area, at road level, within an Italian city. Data were recorded from March 2004 to February 2005 (one year), representing the longest freely available recordings of on field deployed air quality chemical sensor devices responses. Ground Truth hourly averaged concentrations for CO, Non Metanic Hydrocarbons, Benzene, Total Nitrogen Oxides (NOx) and Nitrogen Dioxide (NO2) were provided by a co-located reference certified analyzer. Evidences of cross-sensitivities as well as both concept and sensor drifts are present as described in De Vito et al., Sens. And Act. B, Vol. 129,2,2008 (citation required) eventually affecting sensors concentration estimation capabilities. Missing values are tagged with the value -200. This dataset can be used exclusively for research purposes. Commercial purposes are fully excluded.
 
+## Tags
+
+Air quality monitoring, Sensor data, Pollution levels, Time-series analysis, Environmental data
+
 ## References
+
 - [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Air+Quality)
 
+[⬅️ Back to Index](../README.md)
diff --git a/markdown/appliances_energy_prediction.md b/markdown/appliances_energy_prediction.md
@@ -1,9 +1,33 @@
 # Appliances Energy Prediction
 
+**Summary:** Experimental data used to create regression models of appliances energy use in a low energy building.
+
+| Parameter | Value |
+| --- | --- |
+| **Name** | Appliances Energy Prediction |
+| **Labeled** | No |
+| **Time Series** | Yes |
+| **Simulation** | No |
+| **Missing Values** | No |
+| **Dataset Characteristics** | Multivariate, Time-Series |
+| **Feature Type** | Real |
+| **Associated Tasks** | Regression |
+| **Number of Instances** | 19735 |
+| **Number of Features** | 28 |
+| **Date Donated** | 2017-02-14 |
+| **Source** | UCI Machine Learning Repository |
+
 ## Dataset Information
+
 The data set is at 10 min for about 4.5 months. The house temperature and humidity conditions were monitored with a ZigBee wireless sensor network. Each wireless node transmitted the temperature and humidity conditions around 3.3 min. Then, the wireless data was averaged for 10 minutes periods. The energy data was logged every 10 minutes with m-bus energy meters. Weather from the nearest airport weather station (Chievres Airport, Belgium) was downloaded from a public data set from Reliable Prognosis (rp5.ru), and merged together with the experimental data sets using the date and time column. Two random variables have been included in the data set for testing the regression models and to filter out non predictive attributes (parameters).
 
+## Tags
+
+Indoor environment monitoring, ZigBee wireless network, Temperature data, Humidity data, Weather integration, Energy consumption, M-bus energy meters, Airport weather station
+
 ## References
+
 - [UCI Machine Learning Repository](https://archive.ics.uci.edu/dataset/374/appliances+energy+prediction)
 - [GitHub Repository](https://github.com/LuisM78/Appliances-energy-prediction-data)
 
+[⬅️ Back to Index](../README.md)
diff --git a/markdown/beijing_pm2_5_data.md b/markdown/beijing_pm2_5_data.md
@@ -1,8 +1,32 @@
 # Beijing PM2.5 Data
 
+**Summary:** This hourly data set contains the PM2.5 data of US Embassy in Beijing. Meanwhile, meteorological data from Beijing Capital International Airport are also included.
+
+| Parameter | Value |
+| --- | --- |
+| **Name** | Beijing PM2.5 Data |
+| **Labeled** | Yes |
+| **Time Series** | Yes |
+| **Simulation** | No |
+| **Missing Values** | Yes |
+| **Dataset Characteristics** | Multivariate, Time-Series |
+| **Feature Type** | Integer, Real |
+| **Associated Tasks** | Regression |
+| **Number of Instances** | 43824 |
+| **Number of Features** | 11 |
+| **Date Donated** | 2017-01-18 |
+| **Source** | UCI Machine Learning Repository |
+
 ## Dataset Information
+
 The data's time period is between Jan 1st, 2010 to Dec 31st, 2014. Missing data are denoted as 'NA'.
 
+## Tags
+
+Air quality, PM2.5 concentration, Meteorological data, Environmental monitoring, Time-series data
+
 ## References
+
 - [UCI Machine Learning Repository](https://archive.ics.uci.edu/dataset/381/beijing+pm2+5+data)
 
+[⬅️ Back to Index](../README.md)
diff --git a/markdown/c-mapss_aircraft_engine_simulator_data.md b/markdown/c-mapss_aircraft_engine_simulator_data.md
@@ -1,23 +1,52 @@
 # C-MAPSS Aircraft Engine Simulator Data
 
+**Summary:** This dataset was generated with the C-MAPSS simulator, a tool for the simulation of realistic large commercial turbofan engine data. The data consists of a series of flights with a reasonable linear transition period to allow the engine to change from one flight condition to the next. The fault was injected at a given time in one of the flights and persists throughout the remaining flights, effectively increasing the age of the engine. The intent is to identify which flight and when in the flight the fault occurred.
+
+| Parameter | Value |
+| --- | --- |
+| **Name** | C-MAPSS Aircraft Engine Simulator Data |
+| **Labeled** | Yes |
+| **Time Series** | Yes |
+| **Simulation** | Yes |
+| **Missing Values** | NIA |
+| **Dataset Characteristics** | Time-Series, Multivariate |
+| **Feature Type** | Real, Integer |
+| **Associated Tasks** | Regression, Classification |
+| **Number of Instances** | N/A |
+| **Number of Features** | N/A |
+| **Date Donated** | NIA |
+| **Source** | NASA |
+
 ## Special Note
+
 C-MAPSS and C-MAPSS40K ARE CURRENTLY UNAVAILABLE FOR DOWNLOAD. Glenn Research Center management is reviewing the availability requirements for these software packages. We are working with Center management to get the review completed and issues resolved in a timely manner. We will post updates on this website when the issues are resolved. We apologize for any inconvenience. Please contact Jonathan Litt, [email protected], if you have any questions in the meantime.
 
 ## Subject Area
+
 Engine Health
 
 ## Description
+
 This data set was generated with the C-MAPSS simulator. C-MAPSS stands for 'Commercial Modular Aero-Propulsion System Simulation' and it is a tool for the simulation of realistic large commercial turbofan engine data. Each flight is a combination of a series of flight conditions with a reasonable linear transition period to allow the engine to change from one flight condition to the next. The flight conditions are arranged to cover a typical ascent from sea level to 35K ft and descent back down to sea level. The fault was injected at a given time in one of the flights and persists throughout the remaining flights, effectively increasing the age of the engine. The intent is to identify which flight and when in the flight the fault occurred.
 
 ## How Data Was Acquired
+
 The data provided is from a high fidelity system level engine simulation designed to simulate nominal and fault engine degradation over a series of flights. The simulated data was created with a Matlab Simulink tool called C-MAPSS.
 
 ## Sample Rates and Parameter Description
+
 The flights are full flight recordings sampled at 1 Hz and consist of 30 engine and flight condition parameters. Each flight contains 7 unique flight conditions for an approximately 90 min flight including ascent to cruise at 35K ft and descent back to sea level. The parameters for each flight are the flight conditions, health indicators, measurement temperatures and pressure measurements.
 
 ## Faults/Anomalies
+
 Faults arose from the inlet engine fan, the low pressure compressor, the high pressure compressor, the high pressure turbine, and the low pressure turbine.
 
+## Tags
+
+Aircraft engine, Simulator data, Engine performance, Sensor data, Prognostics
+
 ## References
+
 - [NASA's Open Data Portal](https://data.nasa.gov/dataset/C-MAPSS-Aircraft-Engine-Simulator-Data/xaut-bemq/about_data)
 
+[⬅️ Back to Index](../README.md)
-Original file line number
+Diff line change
@@ Expand Up @@
                 });
             });
         </script>
     <script type="application/json" id="dataset-json">
     [
         {
@@ Expand Down @@