Skip to content

Commit

Permalink
fix markdowns
Browse files Browse the repository at this point in the history
  • Loading branch information
jonathanwvd committed Aug 16, 2024
1 parent 4f81d61 commit c0cd21e
Show file tree
Hide file tree
Showing 28 changed files with 731 additions and 94 deletions.
97 changes: 16 additions & 81 deletions README.md

Large diffs are not rendered by default.

107 changes: 94 additions & 13 deletions generate_documentation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,61 @@
import json
import os
import pandas as pd


def update_readme_with_data(json_folder, readme_file, md_folder_path):
    """Rebuild the dataset table stored between the README table markers.

    Every ``*.json`` file in ``json_folder`` (except ``template.json`` and
    ``datasets.json``, compared case-insensitively) contributes one table
    row. The text between ``<!-- TABLE_START -->`` and ``<!-- TABLE_END -->``
    in ``readme_file`` is replaced by the freshly rendered Markdown table;
    everything outside the markers is preserved.

    Args:
        json_folder: Directory containing one JSON description per dataset.
        readme_file: Path of the README file to update in place.
        md_folder_path: Folder prefix used when building the link to each
            dataset's Markdown page (``<md_folder_path>/<slug>.md``).

    Raises:
        ValueError: If either marker is missing, or if no
            ``<!-- TABLE_END -->`` follows ``<!-- TABLE_START -->``.
    """
    start_marker = "<!-- TABLE_START -->"
    end_marker = "<!-- TABLE_END -->"
    skipped_files = {'template.json', 'datasets.json'}
    columns = ['Dataset Name', 'Labeled', 'Time Series', 'Simulation', 'Additional Tags']

    # Validate the README before doing any work, so a malformed file fails
    # fast and is never rewritten.
    with open(readme_file, 'r', encoding='utf-8') as file:
        content = file.read()

    table_start = content.find(start_marker)
    if table_start == -1 or content.find(end_marker) == -1:
        raise ValueError("Markers <!-- TABLE_START --> or <!-- TABLE_END --> not found in the README file.")

    # Only an end marker located after the start marker is usable; splicing
    # with an earlier one would duplicate and corrupt the README content.
    table_end = content.find(end_marker, table_start + len(start_marker))
    if table_end == -1:
        raise ValueError("Marker <!-- TABLE_END --> must appear after <!-- TABLE_START --> in the README file.")

    # Collect one row per dataset. os.listdir() returns entries in arbitrary
    # order, so sort to keep the generated table stable between runs.
    rows = []
    for filename in sorted(os.listdir(json_folder)):
        if not filename.endswith('.json') or filename.lower() in skipped_files:
            continue
        with open(os.path.join(json_folder, filename), 'r', encoding='utf-8') as jsonfile:
            data = json.load(jsonfile)

        # Slug derived from the file name; it doubles as the link target and,
        # title-cased, as the visible link text.
        slug = filename.replace('.json', '').replace(' ', '_').replace('.', '_').lower()
        rows.append({
            'Dataset Name': f"[{slug.replace('_', ' ').title()}]({md_folder_path}/{slug}.md)",
            'Labeled': data.get('Labeled', ''),
            'Time Series': data.get('Time Series', ''),
            'Simulation': data.get('Simulation', ''),
            # "or []" also covers an explicit null in the JSON file.
            'Additional Tags': '; '.join(data.get('Additional Tags') or []),
        })

    # Passing the columns explicitly keeps the frame well-formed even when no
    # dataset files were found.
    df = pd.DataFrame(rows, columns=columns).fillna('')
    markdown_table = df.to_markdown(index=False)

    # Replace the content between the markers, preserving the TABLE_END marker
    # and everything after it.
    updated_content = "".join([
        content[:table_start],
        start_marker + "\n",
        markdown_table,
        "\n",
        content[table_end:],
    ])

    with open(readme_file, 'w', encoding='utf-8') as file:
        file.write(updated_content)

    print("Updated README with the new Markdown table.")



def generate_json_data(json_folder):
datasets = []
Expand Down Expand Up @@ -62,28 +118,48 @@ def inject_json_to_html(json_data, html_file):
def json_to_markdown(json_path, md_path):
    """Render one dataset JSON description as a Markdown page.

    The page is laid out as: title, summary line, a parameter table, the
    free-form sections, an optional "Tags" section, an optional
    "References" section, and a link back to the index README.

    Args:
        json_path: Path of the dataset JSON file to read.
        md_path: Path of the Markdown file to write (overwritten if present).

    Raises:
        KeyError: If the JSON has no ``Name`` entry, or a section/reference
            entry lacks its ``Title``/``Content`` or ``Text``/``Link`` key.
    """
    # Keys shown in the parameter table, in display order; absent keys are
    # simply skipped.
    table_keys = (
        'Name', 'Labeled', 'Time Series', 'Simulation', 'Missing Values',
        'Dataset Characteristics', 'Feature Type', 'Associated Tasks',
        'Number of Instances', 'Number of Features', 'Date Donated', 'Source',
    )

    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Collect fragments and join once at the end instead of repeated "+=".
    parts = [
        f"# {data['Name']}\n\n",
        f"**Summary:** {data.get('Summary', '')}\n\n",
        "| Parameter | Value |\n",
        "| --- | --- |\n",
    ]
    parts.extend(f"| **{key}** | {data[key]} |\n" for key in table_keys if key in data)
    parts.append("\n")

    # Each optional block is rendered exactly once; "or []" also tolerates an
    # explicit null in the JSON file.
    for section in data.get('Sections') or []:
        parts.append(f"## {section['Title']}\n\n")
        parts.append(f"{section['Content']}\n\n")

    tags = data.get('Additional Tags') or []
    if tags:
        parts.append("## Tags\n\n")
        parts.append(", ".join(tags) + "\n\n")

    # As with tags, skip the heading entirely when there is nothing to list.
    references = data.get('References') or []
    if references:
        parts.append("## References\n\n")
        parts.extend(f"- [{ref['Text']}]({ref['Link']})\n" for ref in references)
        parts.append("\n")

    # Link back to the main README / index page.
    parts.append("[⬅️ Back to Index](../README.md)\n")

    with open(md_path, 'w', encoding='utf-8') as file:
        file.write("".join(parts))

    print(f"Markdown file created for {os.path.basename(md_path)}")


def json_to_html(json_path, html_path):
with open(json_path, 'r', encoding='utf-8') as file:
data = json.load(file)
Expand Down Expand Up @@ -161,6 +237,8 @@ def json_to_html(json_path, html_path):
md_folder_path = 'markdown'
html_folder_path = 'html/pages'
index_html_file = 'index.html'
readme_file_path = 'README.md'

os.makedirs(md_folder_path, exist_ok=True)
os.makedirs(html_folder_path, exist_ok=True)

Expand All @@ -182,3 +260,6 @@ def json_to_html(json_path, html_path):
json_to_html(json_path, html_path)
except Exception as e:
print(f"Error processing {filename}: {e}")

# Update the README with dataset information directly from JSON files
update_readme_with_data(json_folder_path, readme_file_path, md_folder_path)
15 changes: 15 additions & 0 deletions index.html
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,21 @@ <h1 class="mb-4 text-center">Awesome Industrial Datasets</h1>
});
});
</script>















<script type="application/json" id="dataset-json">
[
{
Expand Down
25 changes: 25 additions & 0 deletions markdown/3w.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,38 @@
# 3W

**Summary:** Promotes development of ML algorithms for early detection and classification of undesirable events in offshore oil wells.

| Parameter | Value |
| --- | --- |
| **Name** | 3W |
| **Labeled** | Yes |
| **Time Series** | Yes |
| **Simulation** | Both |
| **Missing Values** | NIA |
| **Dataset Characteristics** | Multivariate, Time-Series |
| **Feature Type** | Real |
| **Associated Tasks** | Regression |
| **Number of Instances** | N/A |
| **Number of Features** | N/A |
| **Date Donated** | 2022-04-06 |
| **Source** | GitHub |

## Dataset Information

This is the first repository published by Petrobras on GitHub. It supports the 3W Project, which aims to promote experimentation and development of Machine Learning-based approaches and algorithms for specific problems related to detection and classification of undesirable events that occur in offshore oil wells.

The 3W Project is based on the 3W Dataset, a database described in [this paper](https://doi.org/10.1016/j.petrol.2019.106223), and on the 3W Toolkit, a software package that promotes experimentation with the 3W Dataset for specific problems. The name **3W** was chosen because this dataset is composed of instances from ***3*** different sources and which contain undesirable events that occur in oil ***W***ells.

## Motivation

Timely detection of undesirable events in oil wells can help prevent production losses, reduce maintenance costs, environmental accidents, and human casualties. Losses related to this type of event can reach 5% of production in certain scenarios, especially in areas such as Flow Assurance and Artificial Lifting Methods. In terms of maintenance, the cost of a maritime probe, required to perform various types of operations, can exceed US $500,000 per day.

## Tags

Oil and Gas, Real events, Fault detection, Multivariate data, Sensor data, Time-series analysis, Oil wells, Machine learning benchmark

## References

- [GitHub](https://github.com/petrobras/3W/tree/main)

[⬅️ Back to Index](../README.md)
24 changes: 24 additions & 0 deletions markdown/ai4i_2020_predictive_maintenance_dataset.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,32 @@
# AI4I 2020 Predictive Maintenance Dataset

**Summary:** The AI4I 2020 Predictive Maintenance Dataset is a synthetic dataset that reflects real predictive maintenance data encountered in industry.

| Parameter | Value |
| --- | --- |
| **Name** | AI4I 2020 Predictive Maintenance Dataset |
| **Labeled** | Yes |
| **Time Series** | Yes |
| **Simulation** | Yes |
| **Missing Values** | No |
| **Dataset Characteristics** | Multivariate, Time-Series |
| **Feature Type** | Real |
| **Associated Tasks** | Classification, Regression, Causal-Discovery |
| **Number of Instances** | 10000 |
| **Number of Features** | 6 |
| **Date Donated** | 2020-08-29 |
| **Source** | UCI Machine Learning Repository |

## Dataset Information

Since real predictive maintenance datasets are generally difficult to obtain and in particular difficult to publish, we present and provide a synthetic dataset that reflects real predictive maintenance encountered in industry to the best of our knowledge.

## Tags

Predictive maintenance, Synthetic data, Industry 4.0, Machine failure, Time-series data

## References

- [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/AI4I+2020+Predictive+Maintenance+Dataset)

[⬅️ Back to Index](../README.md)
24 changes: 24 additions & 0 deletions markdown/air_quality.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,32 @@
# Air Quality

**Summary:** Contains the responses of a gas multisensor device deployed on the field in an Italian city. Hourly responses averages are recorded along with gas concentrations references from a certified analyzer.

| Parameter | Value |
| --- | --- |
| **Name** | Air Quality |
| **Labeled** | Yes |
| **Time Series** | Yes |
| **Simulation** | No |
| **Missing Values** | Yes |
| **Dataset Characteristics** | Multivariate, Time-Series |
| **Feature Type** | Real |
| **Associated Tasks** | Regression |
| **Number of Instances** | 9358 |
| **Number of Features** | 15 |
| **Date Donated** | 2016-03-22 |
| **Source** | UCI Machine Learning Repository |

## Dataset Information

The dataset contains 9358 instances of hourly averaged responses from an array of 5 metal oxide chemical sensors embedded in an Air Quality Chemical Multisensor Device. The device was located on the field in a significantly polluted area, at road level, within an Italian city. Data were recorded from March 2004 to February 2005 (one year), representing the longest freely available recordings of on-field deployed air quality chemical sensor device responses. Ground Truth hourly averaged concentrations for CO, Non Metanic Hydrocarbons, Benzene, Total Nitrogen Oxides (NOx) and Nitrogen Dioxide (NO2) were provided by a co-located reference certified analyzer. Evidence of cross-sensitivities as well as both concept and sensor drifts is present, as described in De Vito et al., Sens. And Act. B, Vol. 129,2,2008 (citation required), eventually affecting the sensors' concentration estimation capabilities. Missing values are tagged with the value -200. This dataset can be used exclusively for research purposes. Commercial purposes are fully excluded.

## Tags

Air quality monitoring, Sensor data, Pollution levels, Time-series analysis, Environmental data

## References

- [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Air+Quality)

[⬅️ Back to Index](../README.md)
24 changes: 24 additions & 0 deletions markdown/appliances_energy_prediction.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,33 @@
# Appliances Energy Prediction

**Summary:** Experimental data used to create regression models of appliances energy use in a low energy building.

| Parameter | Value |
| --- | --- |
| **Name** | Appliances Energy Prediction |
| **Labeled** | No |
| **Time Series** | Yes |
| **Simulation** | No |
| **Missing Values** | No |
| **Dataset Characteristics** | Multivariate, Time-Series |
| **Feature Type** | Real |
| **Associated Tasks** | Regression |
| **Number of Instances** | 19735 |
| **Number of Features** | 28 |
| **Date Donated** | 2017-02-14 |
| **Source** | UCI Machine Learning Repository |

## Dataset Information

The data set is sampled at 10-minute intervals over about 4.5 months. The house temperature and humidity conditions were monitored with a ZigBee wireless sensor network. Each wireless node transmitted the temperature and humidity conditions about every 3.3 min. Then, the wireless data was averaged over 10-minute periods. The energy data was logged every 10 minutes with m-bus energy meters. Weather from the nearest airport weather station (Chievres Airport, Belgium) was downloaded from a public data set from Reliable Prognosis (rp5.ru), and merged together with the experimental data sets using the date and time column. Two random variables have been included in the data set for testing the regression models and to filter out non-predictive attributes (parameters).

## Tags

Indoor environment monitoring, ZigBee wireless network, Temperature data, Humidity data, Weather integration, Energy consumption, M-bus energy meters, Airport weather station

## References

- [UCI Machine Learning Repository](https://archive.ics.uci.edu/dataset/374/appliances+energy+prediction)
- [GitHub Repository](https://github.com/LuisM78/Appliances-energy-prediction-data)

[⬅️ Back to Index](../README.md)
24 changes: 24 additions & 0 deletions markdown/beijing_pm2_5_data.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,32 @@
# Beijing PM2.5 Data

**Summary:** This hourly data set contains the PM2.5 data of US Embassy in Beijing. Meanwhile, meteorological data from Beijing Capital International Airport are also included.

| Parameter | Value |
| --- | --- |
| **Name** | Beijing PM2.5 Data |
| **Labeled** | Yes |
| **Time Series** | Yes |
| **Simulation** | No |
| **Missing Values** | Yes |
| **Dataset Characteristics** | Multivariate, Time-Series |
| **Feature Type** | Integer, Real |
| **Associated Tasks** | Regression |
| **Number of Instances** | 43824 |
| **Number of Features** | 11 |
| **Date Donated** | 2017-01-18 |
| **Source** | UCI Machine Learning Repository |

## Dataset Information

The data's time period is between Jan 1st, 2010 to Dec 31st, 2014. Missing data are denoted as 'NA'.

## Tags

Air quality, PM2.5 concentration, Meteorological data, Environmental monitoring, Time-series data

## References

- [UCI Machine Learning Repository](https://archive.ics.uci.edu/dataset/381/beijing+pm2+5+data)

[⬅️ Back to Index](../README.md)
29 changes: 29 additions & 0 deletions markdown/c-mapss_aircraft_engine_simulator_data.md
Original file line number Diff line number Diff line change
@@ -1,23 +1,52 @@
# C-MAPSS Aircraft Engine Simulator Data

**Summary:** This dataset was generated with the C-MAPSS simulator, a tool for the simulation of realistic large commercial turbofan engine data. The data consists of a series of flights with a reasonable linear transition period to allow the engine to change from one flight condition to the next. The fault was injected at a given time in one of the flights and persists throughout the remaining flights, effectively increasing the age of the engine. The intent is to identify which flight and when in the flight the fault occurred.

| Parameter | Value |
| --- | --- |
| **Name** | C-MAPSS Aircraft Engine Simulator Data |
| **Labeled** | Yes |
| **Time Series** | Yes |
| **Simulation** | Yes |
| **Missing Values** | NIA |
| **Dataset Characteristics** | Time-Series, Multivariate |
| **Feature Type** | Real, Integer |
| **Associated Tasks** | Regression, Classification |
| **Number of Instances** | N/A |
| **Number of Features** | N/A |
| **Date Donated** | NIA |
| **Source** | NASA |

## Special Note

C-MAPSS and C-MAPSS40K ARE CURRENTLY UNAVAILABLE FOR DOWNLOAD. Glenn Research Center management is reviewing the availability requirements for these software packages. We are working with Center management to get the review completed and issues resolved in a timely manner. We will post updates on this website when the issues are resolved. We apologize for any inconvenience. Please contact Jonathan Litt, [email protected], if you have any questions in the meantime.

## Subject Area

Engine Health

## Description

This data set was generated with the C-MAPSS simulator. C-MAPSS stands for 'Commercial Modular Aero-Propulsion System Simulation' and it is a tool for the simulation of realistic large commercial turbofan engine data. Each flight is a combination of a series of flight conditions with a reasonable linear transition period to allow the engine to change from one flight condition to the next. The flight conditions are arranged to cover a typical ascent from sea level to 35K ft and descent back down to sea level. The fault was injected at a given time in one of the flights and persists throughout the remaining flights, effectively increasing the age of the engine. The intent is to identify which flight and when in the flight the fault occurred.

## How Data Was Acquired

The data provided is from a high fidelity system level engine simulation designed to simulate nominal and fault engine degradation over a series of flights. The simulated data was created with a Matlab Simulink tool called C-MAPSS.

## Sample Rates and Parameter Description

The flights are full flight recordings sampled at 1 Hz and consist of 30 engine and flight condition parameters. Each flight contains 7 unique flight conditions for an approximately 90 min flight including ascent to cruise at 35K ft and descent back to sea level. The parameters for each flight are the flight conditions, health indicators, measurement temperatures and pressure measurements.

## Faults/Anomalies

Faults arose from the inlet engine fan, the low pressure compressor, the high pressure compressor, the high pressure turbine, and the low pressure turbine.

## Tags

Aircraft engine, Simulator data, Engine performance, Sensor data, Prognostics

## References

- [NASA's Open Data Portal](https://data.nasa.gov/dataset/C-MAPSS-Aircraft-Engine-Simulator-Data/xaut-bemq/about_data)

[⬅️ Back to Index](../README.md)
Loading

0 comments on commit c0cd21e

Please sign in to comment.