Update README.md for version 1.2.0 #129

Open · wants to merge 6 commits into main
README.md (13 additions & 0 deletions)
@@ -131,6 +131,19 @@ It is important to note that there are arbitrary choices in this toolkit, but th

The 3W Toolkit is implemented in sub-modules as described [here](3W_TOOLKIT_STRUCTURE.md).

### Loading the 3W Dataset 2.0

The `load_3w_dataset()` function loads the 3W Dataset 2.0, which is composed of multiple Parquet files organized in folders.

**Usage:**

```python
import toolkit as tk

# Load the real data from the 3W Dataset 2.0
df = tk.load_3w_dataset(data_type='real', base_path='path/to/dataset')
```
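
If no Parquet files are found, `load_3w_dataset()` returns `None`, so it is worth guarding the result before use. The check below is a minimal sketch; the inspected attributes are just illustrative:

```python
# Guard against an empty load before working with the DataFrame
if df is not None:
    print(df.shape)             # number of rows and columns loaded
    print(df.columns.tolist())  # available variables
else:
    print("No data found; check base_path.")
```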

## Incorporated Problems

Specific problems will be incorporated into this project gradually. At this point, we can work on:
@@ -49,7 +49,19 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'numpy'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[1], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msys\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[0;32m 5\u001b[0m sys\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mappend(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtoolkit\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mtk\u001b[39;00m\n",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'numpy'"
]
}
],
"source": [
"import sys\n",
"import os\n",
@@ -58,6 +70,8 @@
"sys.path.append(os.path.join('..','..','..'))\n",
"import toolkit as tk\n",
"\n",
"from toolkit.base import load_3w_dataset\n",
"\n",
"%matplotlib inline\n",
"%config InlineBackend.figure_format = 'svg'"
]
@@ -78,7 +92,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -101,7 +115,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -114,9 +128,43 @@
],
"source": [
"event_labels = list(experiment.event_labels.values())\n",
"event_labels_idx = {v: i for i, v in enumerate(event_labels)}\n",
"fold: tk.EventFold\n",
"folds: tk.EventFolds = experiment.folds()"
"event_labels_idx = {v: i for i, v in enumerate(event_labels)}\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Upload 3W Dataset 2.0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = load_3w_dataset(data_type='real', base_path='path/to/dataset') # Replaced by correct path"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create the folds manually"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"folds = tk.EventFolds(\n",
" experiment=experiment,\n",
" df=df, # Pass the loaded DataFrame to the EventFolds class\n",
" # ... (other parameters, if necessary) ...\n",
")\n"
]
},
{
@@ -135,7 +183,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -185,7 +233,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -1501,7 +1549,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.12.0"
},
"toc": {
"base_numbering": 1,
toolkit/README.md (17 additions & 3 deletions)
@@ -13,8 +13,9 @@

* [Introduction](#introduction)
* [Release Notes](#release-notes)
  * [1.0.0](#100)
  * [1.1.0](#110)
  * [1.2.0](#120)

# Introduction

@@ -36,4 +37,17 @@ Release: July 25, 2024.

Highlights:

1. Makes resources (functions and constants) compatible with 3W Dataset version 2.0.0, which is based on Parquet files.

## 1.2.0

Release: October 19, 2024.

Highlights:

1. **Adapts `load_dataset()` to 3W Dataset 2.0:** The `load_dataset()` function in `base.py` was adapted to correctly handle the folder structure and different data types of the 3W Dataset 2.0. It was renamed to `load_3w_dataset()`.
2. **Updates `dev.py` for 3W Dataset 2.0:** The `dev.py` sub-module was updated to ensure compatibility with the new `load_3w_dataset()` function and the 3W Dataset 2.0 structure. The `extrai_arrays()` function was removed, and the `EventFolds` and `Experiment` classes were adjusted.
3. **Updates `misc.py` for 3W Dataset 2.0:** The `misc.py` sub-module was updated to ensure compatibility with the new `load_3w_dataset()` function and the 3W Dataset 2.0 structure. Redundant functions were removed, and existing functions were adapted to receive the DataFrame as a parameter.
4. **Updates `__init__.py` for 3W Dataset 2.0:** The `__init__.py` file was updated to import and expose the new `load_3w_dataset()` function.

These updates ensure that the 3W Toolkit is fully compatible with the 3W Dataset 2.0, providing a more efficient and streamlined workflow for loading and analyzing the data.
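
For reference, a minimal sketch of the resulting workflow is shown below. The `Experiment` constructor arguments are illustrative assumptions based on the toolkit's example notebooks, not part of this release note:

```python
import toolkit as tk

# Load the real instances of the 3W Dataset 2.0 (adjust the path)
df = tk.load_3w_dataset(data_type='real', base_path='path/to/dataset')

# Configure an experiment (arguments here are assumed for illustration)
experiment = tk.Experiment(event_name='spurious_closure_of_dhsv')

# As of 1.2.0, EventFolds receives the loaded DataFrame directly
folds = tk.EventFolds(experiment=experiment, df=df)
```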
toolkit/__init__.py (29 additions & 20 deletions)
@@ -1,53 +1,59 @@
"""This is the 3W Toolkit, a software package written in Python 3 that
"""This is the 3W Toolkit, a software package written in Python 3 that 
is one of the 3W Project's major components.

This toolkit contains resources that make the following easier:

- 3W Dataset overview generation;
- Experimentation and comparative analysis of Machine Learning-based
approaches and algorithms for specific problems related to undesirable
events that occur in offshore oil wells during their respective
production phases;
- Standardization of key points of the Machine Learning-based algorithm
development pipeline.

All these resources are implemented in the following sub-modules:

- **base**: groups the objects used by the other sub-modules;
- **dev**: has all the resources related to development of Machine
Learning models;
- **misc**: brings together diverse resources that do not fit in the
other sub-modules;
- **rolling_window**: creates a view of array which for every point
gives the n-dimensional neighbourhood of size window. New dimensions are
added at the end of array or after the corresponding original dimension.

Specific problems will be incorporated into this toolkit gradually. At
this time, models can be developed for the following problems:

- Binary Classifier of Spurious Closure of DHSV.

Examples of how to use this toolkit will be incremented throughout its
development. Please, check the 3W Project's README.md file for more details.

It is important to note that there are arbitrary choices in this
toolkit, but they have been carefully made to allow adequate comparative
analysis without compromising the ability to experiment with different
approaches and algorithms.

This toolkit's documentation is generated in English and in Google format
with [autoDocstring - Python Docstring Generator
](https://github.com/NilsJPWerner/autoDocstring), which follows [PEP 257
](https://peps.python.org/pep-0257/), and [pdoc3
](https://pdoc3.github.io/pdoc/).

Its source code is implemented according to the style guide established
by [PEP 8](https://peps.python.org/pep-0008/). This is guaranteed with
the use of the [Black formatter](https://github.com/psf/black).
"""

__status__ = "Development"
__version__ = "1.1.0"
__version__ = "1.2.0" # Update version number after changes
__license__ = "Apache License 2.0"
__copyright__ = "Copyright 2024, Petróleo Brasileiro S.A."
__authors__ = [
@@ -73,7 +79,8 @@
    EventType,
    LABELS_DESCRIPTIONS,
    NORMAL_LABEL,
    PARQUET_EXTENSION,
    PARQUET_ENGINE,
    PARQUET_COMPRESSION,
    PATH_3W_PROJECT,
@@ -83,6 +90,7 @@
    PATH_TOOLKIT,
    TRANSIENT_OFFSET,
    VARS,
    load_3w_dataset,  # Used with the 3W Dataset 2.0
    load_config_in_dataset_ini,
)

@@ -104,7 +112,8 @@
    get_all_labels_and_files,
    label_and_file_generator,
    load_instance,
    load_instances,
    resample,
    plot_instance,
)
toolkit/base.py (57 additions & 3 deletions)
@@ -1,7 +1,7 @@
"""This 3W toolkits' sub-module groups objects used by the other
sub-modules.
"""This 3W toolkits' sub-module groups objects used by the other
sub-modules.

Any resource that is not used by another sub-module must be maintained
in the miscellaneous sub-module.
"""

@@ -56,6 +56,59 @@ def load_config_in_dataset_ini():
    return dict(dataset_ini)


def load_3w_dataset(data_type='real', base_path=PATH_DATASET):
    """Load the 3W Dataset 2.0.

    Parameters
    ----------
    data_type : str, optional
        Type of data to be loaded ('real', 'simulated' or 'imputed').
        The default is 'real'.
    base_path : str, optional
        Path to the root folder of the dataset. The default is PATH_DATASET.

    Returns
    -------
    pandas.DataFrame
        DataFrame with the 3W Dataset 2.0 data, or None if no data is found.
    """
    # Validate data_type up front; raising inside the loop would be
    # swallowed by the per-file except block below
    states = {'real': 0, 'simulated': 1, 'imputed': 2}
    if data_type not in states:
        raise ValueError("Invalid data type. Choose between 'real', 'simulated' or 'imputed'.")

    dataframes = []
    for i in range(10):  # Loop through folders 0 to 9
        folder_path = os.path.join(base_path, str(i))
        if not os.path.exists(folder_path):
            print(f"Folder {folder_path} not found.")
            continue
        parquet_files = [f for f in os.listdir(folder_path) if f.endswith('.parquet')]
        for file in parquet_files:
            file_path = os.path.join(folder_path, file)
            try:
                df = pd.read_parquet(file_path)
                # Keep only the rows of the specified data type
                dataframes.append(df[df['state'] == states[data_type]])
            except Exception as e:
                print(f"Error reading file {file_path}: {e}")

    # Concatenate all DataFrames into a single DataFrame
    if dataframes:
        return pd.concat(dataframes, ignore_index=True)
    print("No data found.")
    return None
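
# Directory layout assumed by the loop above (a sketch; names are
# illustrative placeholders, presumably one subfolder per event type code):
#
#     <base_path>/
#         0/ ... 9/
#             *.parquet   # files read with pd.read_parquet()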


# Loads all configurations present in the 3W Dataset's main
# configuration file and provides specific configurations in different
# granularity and formats
@@ -123,3 +176,4 @@ def __init__(self, event_name):
        self.TRANSIENT = event_section.getboolean("TRANSIENT")
        self.window = event_section.getint("WINDOW")
        self.step = event_section.getint("STEP")
