From 00d88f805a4b4305a24fa35a75616990ac4645a5 Mon Sep 17 00:00:00 2001
From: Kelly Castro
Date: Sat, 19 Oct 2024 01:33:09 -0300
Subject: [PATCH 1/6] feat: Adapt load_dataset() to 3W Dataset 2.0

---
 README.md                | 13 ++++
 .../_baseline/main.ipynb | 66 ++++++++++++++++---
 toolkit/base.py          | 60 ++++++++++++++++-
 3 files changed, 127 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 71e33cd7..7f8a3934 100644
--- a/README.md
+++ b/README.md
@@ -131,6 +131,19 @@ It is important to note that there are arbitrary choices in this toolkit, but th
 
 The 3W Toolkit is implemented in sub-modules as described [here](3W_TOOLKIT_STRUCTURE.md).
 
+### Loading the 3W Dataset 2.0
+
+The `load_3w_dataset()` function loads the 3W Dataset 2.0, which is composed of multiple Parquet files organized in folders.
+
+**Usage:**
+
+```python
+import toolkit as tk
+
+# Load the real data from the 3W Dataset 2.0
+df = tk.load_3w_dataset(data_type='real', base_path='path/to/dataset')
+```
+
 ## Incorporated Problems
 
 Specific problems will be incorporated into this project gradually. At this point, we can work on:
diff --git a/problems/01_binary_classifier_of_spurious_closure_of_dhsv/_baseline/main.ipynb b/problems/01_binary_classifier_of_spurious_closure_of_dhsv/_baseline/main.ipynb
index 2bcb2658..90af9aec 100644
--- a/problems/01_binary_classifier_of_spurious_closure_of_dhsv/_baseline/main.ipynb
+++ b/problems/01_binary_classifier_of_spurious_closure_of_dhsv/_baseline/main.ipynb
@@ -49,7 +49,19 @@
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'numpy'",
+     "output_type": "error",
+     "traceback": [
+      "---------------------------------------------------------------------------",
+      "ModuleNotFoundError                       Traceback (most recent call last)",
+      "Cell In[1], line 3\n      1 import sys\n      2 import os\n----> 3 import numpy as np\n      5 sys.path.append(os.path.join('..','..','..'))\n      6 import toolkit as tk\n",
+      "ModuleNotFoundError: No module named 'numpy'"
+     ]
+    }
+   ],
    "source": [
     "import sys\n",
     "import os\n",
@@ -58,6 +70,8 @@
     "sys.path.append(os.path.join('..','..','..'))\n",
     "import toolkit as tk\n",
     "\n",
+    "from toolkit.base import load_3w_dataset\n",
+    "\n",
     "%matplotlib inline\n",
     "%config InlineBackend.figure_format = 'svg'"
    ]
   },
@@ -78,7 +92,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -101,7 +115,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
...
    ],
    "source": [
"event_labels = list(experiment.event_labels.values())\n", - "event_labels_idx = {v: i for i, v in enumerate(event_labels)}\n", - "fold: tk.EventFold\n", - "folds: tk.EventFolds = experiment.folds()" + "event_labels_idx = {v: i for i, v in enumerate(event_labels)}\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Upload 3W Dataset 2.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = load_3w_dataset(data_type='real', base_path='path/to/dataset') # Replaced by correct path" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create the folds manually" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "folds = tk.EventFolds(\n", + " experiment=experiment,\n", + " df=df, # Pass the loaded DataFrame to the EventFolds class\n", + " # ... (other parameters, if necessary) ...\n", + ")\n" ] }, { @@ -135,7 +183,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -185,7 +233,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1501,7 +1549,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.12.0" }, "toc": { "base_numbering": 1, diff --git a/toolkit/base.py b/toolkit/base.py index 41449181..31643b04 100644 --- a/toolkit/base.py +++ b/toolkit/base.py @@ -1,7 +1,7 @@ -"""This 3W toolkits' sub-module groups objects used by the other -sub-modules. +"""This 3W toolkits' sub-module groups objects used by the other +sub-modules. -Any resource that is not used by another sub-module must be maintained +Any resource that is not used by another sub-module must be maintained in the miscellaneous sub-module. """ @@ -56,6 +56,59 @@ def load_config_in_dataset_ini(): return dict(dataset_ini) +def load_3w_dataset(data_type='real', base_path=PATH_DATASET): + """ + Load the 3W Dataset 2.0. + + Parameters + ---------- + data_type : str, optional + Type of data to be loaded ('real', 'simulated' or 'imputed'). + The default is 'real'. + base_path : str, optional + Path to the root folder of the dataset. The default is PATH_DATASET. + + Returns + ------- + pandas.DataFrame + DataFrame with the 3W Dataset 2.0 data. + """ + + dataframes = [] + for i in range(10): # Loop through folders 0 to 9 + folder_path = os.path.join(base_path, str(i)) + if os.path.exists(folder_path): + parquet_files = [f for f in os.listdir(folder_path) if f.endswith('.parquet')] + for file in parquet_files: + file_path = os.path.join(folder_path, file) + try: + df = pd.read_parquet(file_path) + + # Filter data by specified type + if data_type == 'real': + df_filtered = df[df['state'] == 0] # Real data + elif data_type == 'simulated': + df_filtered = df[df['state'] == 1] # Simulated data + elif data_type == 'imputed': + df_filtered = df[df['state'] == 2] # Imputed data + else: + raise ValueError("Invalid data type. 
+
+                    dataframes.append(df_filtered)
+                except Exception as e:
+                    print(f"Error reading file {file_path}: {e}")
+        else:
+            print(f"Folder {folder_path} not found.")
+
+    # Concatenate all DataFrames into a single DataFrame
+    if dataframes:
+        df = pd.concat(dataframes, ignore_index=True)
+        return df
+    else:
+        print("No data found.")
+        return None
+
+
 # Loads all configurations present in the 3W Dataset's main
 # configuration file and provides specific configurations in different
 # granularity and formats
@@ -123,3 +176,4 @@ def __init__(self, event_name):
         self.TRANSIENT = event_section.getboolean("TRANSIENT")
         self.window = event_section.getint("WINDOW")
         self.step = event_section.getint("STEP")
+
\ No newline at end of file

From e54a57e7627e50d4c3ce8515773b97ecc4d97b7b Mon Sep 17 00:00:00 2001
From: Kelly Castro
Date: Sat, 19 Oct 2024 03:29:36 -0300
Subject: [PATCH 2/6] feat/adapt_dev_to_3w_dataset_2.0

---
 toolkit/dev.py | 546 ++++++------------------------------------------
 1 file changed, 68 insertions(+), 478 deletions(-)

diff --git a/toolkit/dev.py b/toolkit/dev.py
index 32d1e5e1..469cabdd 100644
--- a/toolkit/dev.py
+++ b/toolkit/dev.py
@@ -1,4 +1,4 @@
-"""This 3W toolkits' sub-module has resources related to development of
+"""This 3W toolkits' sub-module has resources related to development of
 Machine Learning models.
 
 The main tasks made possible by these features are:
@@ -13,18 +13,14 @@
 import numpy as np
 import warnings
 import matplotlib
-import seaborn as sns
-import os
 
 matplotlib.use("agg")
 
 from matplotlib import pyplot as plt
-from pathlib import Path, PurePosixPath
+from pathlib import Path
 from sklearn import metrics
 from alive_progress import alive_bar
-from itertools import chain, compress, repeat
+from itertools import compress, repeat
 from functools import lru_cache
-from zipfile import ZipFile
-from typing import Union
 
 from .rolling_window import rolling_window
 from .base import (
@@ -36,42 +32,11 @@
     PATH_FOLDS,
     TRANSIENT_OFFSET,
     VARS,
+    load_3w_dataset,  # For compatibility with 3W v2.0
 )
 
-
-# Transforma lista de instâncias (lista de tuplas (X, y)) em lista de
-# exemplos (X, y).
-# Adicionalmente, também filtra alvos inválidos
-def extraia_exemplos(instancias, retornar_grupos=False):
-    if len(instancias) == 0:
-        return ([], [])
-
-    # Busca alvos nulos
-    y_finite = map(np.isfinite, chain(*(instancia[1] for instancia in instancias)))
-    X_iter = chain(*(instancia[0] for instancia in instancias))
-    y_iter = chain(*(instancia[1] for instancia in instancias))
-
-    # Adiciona iterador de grupos
-    if retornar_grupos:
-        grupos = list(
-            chain(*(repeat(grupo, len(X)) for grupo, (X, y) in enumerate(instancias)))
-        )
-
-        iter_zip = zip(X_iter, y_iter, grupos)
-    else:
-        iter_zip = zip(X_iter, y_iter)
-
-    # Executa iteradores e retorna X, y, [grupos] como listas
-    result = list(map(list, zip(*compress(iter_zip, y_finite))))
-
-    # Converte y (segunda lista do resultado) para int
-    result[1] = list(map(int, result[1]))
-
-    return tuple(result)
-
-
+# Class whose object contains all the necessary information for a
+# round of K-fold of the event classifier.
class EventFold: def __init__( self, @@ -85,13 +50,13 @@ def __init__( ): self.event_folds: EventFolds = event_folds - # Nota: `instancias_treino` e `instancias_teste` são listas de - # tuplas (X, y) + # Note: `instancias_treino` and `instancias_teste` are lists of + # tuples (X, y) - # Aplica passo em instâncias de treino + # Apply step to training instances self.instancias_treino = [(X[::step], y[::step]) for X, y in instancias_treino] - # Aplica passo em instâncias de teste + # Apply step to test instances self.instancias_teste = [ (X[::passo_teste], y[::passo_teste]) for X, y in instancias_teste ] @@ -99,48 +64,61 @@ def __init__( self.nome_instancias_treino = nome_instancias_treino self.nome_instancias_teste = nome_instancias_teste - # Verfica se alguma das instâncias ficou vazia depois de - # aplicado passo + # Check if any of the instances were empty after + # applying step for instancia in self.instancias_treino + self.instancias_teste: X, y = instancia assert min(X.shape) > 0 and min( y.shape - ), "Janela especificada gerou instância sem amostras" + ), "Specified window generated instance without samples" - # Método para extração de amostras para treino + # Method for extracting training samples @lru_cache(1) - def extract_training_samples(self, retornar_grupos=False): - return extraia_exemplos(self.instancias_treino, retornar_grupos) + def extract_training_samples(self): + # Extract training samples from self.instancias_treino + X_train = np.concatenate([x[0] for x in self.instancias_treino]) + y_train = np.concatenate([x[1] for x in self.instancias_treino]) + return X_train, y_train - # Método para extração de amostras para teste + # Method for extracting complete test samples @lru_cache(1) - def extraia_amostras_teste_completo(self, retornar_grupos=False): - return extraia_exemplos(self.instancias_teste, retornar_grupos) + def extraia_amostras_teste_completo(self): + # Extract test samples from self.instancias_teste + X_test = np.concatenate([x[0] for x in self.instancias_teste]) + y_test = np.concatenate([x[1] for x in self.instancias_teste]) + return X_test, y_test - # Método para extração de amostras para teste + # Method for extracting test samples def extract_test_samples(self): - # Retorna apenas X + # Return only X return self.extraia_amostras_teste_completo()[0] - # Método para cálculo de métricas parciais + # Method for calculating partial metrics def calculate_partial_metrics( - self, y_prev_soft, idx_to_codigo, apresente=False, apresente_conf={} + self, y_pred_soft, idx_to_codigo, apresente=False, apresente_conf={} ): """ - idx_to_codigo (list or dict): - idx_to_codigo[i] = j indica que a i-ésima coluna de - y_prev_soft corresponde ao código da classe j da tarefa - corrente. + Calculate partial metrics for the fold. + + Parameters + ---------- + y_pred_soft : np.ndarray + Soft predictions for the test set. + idx_to_codigo : list or dict + Mapping from prediction index to class code. 
+ apresente : bool, optional + Whether to display the results, by default False + apresente_conf : dict, optional + Configuration for displaying the results, by default {} """ - _, y_teste, grupos_teste = self.extraia_amostras_teste_completo( - retornar_grupos=True - ) - assert len(y_prev_soft) == len( - y_teste - ), f"Número incorreto de previsões: esperado {len(y_teste)}, encontrado {len(y_prev_soft)}" + X_test, y_test = self.extraia_amostras_teste_completo() + + assert len(y_pred_soft) == len( + y_test + ), f"Incorrect number of predictions: expected {len(y_test)}, found {len(y_pred_soft)}" - # códigos das classes para a tarefa que esse fold faz parte + # Class codes for the task that this fold is part of event_labels = self.event_folds.experiment.event_labels n_codigos = len(event_labels) lista_codigos = list(event_labels.values()) @@ -157,46 +135,46 @@ def calculate_partial_metrics( i for i, j in enumerate(idx_to_codigo) if j == codigo_transiente ) - # Predições soft com shape correto - shape_ok = (len(y_teste), n_codigos) + # Soft predictions with correct shape + shape_ok = (len(y_test), n_codigos) assert ( - y_prev_soft.shape == shape_ok - ), f"Predição deve ter shape (n_samples, n_classes) = ({shape_ok[0]},{shape_ok[1]})" + y_pred_soft.shape == shape_ok + ), f"Prediction must have shape (n_samples, n_classes) = ({shape_ok[0]},{shape_ok[1]})" - # Todos os códigos devem aparecer em ordem_codigos_evento + # All codes must appear in ordem_codigos_evento codigos_faltando = set(lista_codigos) - set( [idx_to_codigo[i] for i in range(n_codigos)] ) assert ( len(codigos_faltando) == 0 - ), f"Códigos faltando em 'idx_to_codigo': {codigos_faltando}" + ), f"Missing codes in 'idx_to_codigo': {codigos_faltando}" - # Calculando predição da classe - y_prev_idx = y_prev_soft.argmax(1) - y_prev = list(map(idx_to_codigo.__getitem__, y_prev_idx)) + # Calculating class prediction + y_pred_idx = y_pred_soft.argmax(1) + y_pred = list(map(idx_to_codigo.__getitem__, y_pred_idx)) - # Calculando probabilidade predita de regime + transiente para - # plotar - y_prob_nao_normal = y_prev_soft[:, coluna_regime].copy() + # Calculating predicted probability of regime + transient to + # plot + y_prob_nao_normal = y_pred_soft[:, coluna_regime].copy() if coluna_transiente is not None: - y_prob_nao_normal += y_prev_soft[:, coluna_transiente] + y_prob_nao_normal += y_pred_soft[:, coluna_transiente] - # Métrica principal + # Main metric f_beta = metrics.fbeta_score( - y_teste, y_prev, beta=1.2, average="micro", labels=lista_codigos + y_test, y_pred, beta=1.2, average="micro", labels=lista_codigos ) f_beta *= 100.0 # MEAN_LOG_LOSS log_loss_medio = metrics.log_loss( - y_teste, y_prev_soft, labels=lista_codigos, normalize=True + y_test, y_pred_soft, labels=lista_codigos, normalize=True ) metricas = {"F_BETA [%]": f_beta, "MEAN_LOG_LOSS": log_loss_medio} self.event_folds.salve_metricas_parciais(self, metricas) if apresente: - # Carregando configuração padrão de apresentação + # Loading default presentation configuration def set_config(name, value, overwrite=False): if overwrite or (name not in apresente_conf): apresente_conf[name] = value @@ -208,7 +186,7 @@ def set_config(name, value, overwrite=False): set_config("mostra_prob", True) # =============================================== - # Valores do gráfico: normal=0, transiente=0.5, em regime=1 + # Chart values: normal=0, transient=0.5, in regime=1 plot_values = { event_labels["normal"]: 0, event_labels["regime"]: 1, @@ -216,10 +194,10 @@ def set_config(name, value, 
overwrite=False): if codigo_transiente is not None: plot_values[event_labels["transiente"]] = 0.5 - y_prev_plot = list(map(plot_values.__getitem__, y_prev)) - y_teste_plot = list(map(plot_values.__getitem__, y_teste)) + y_pred_plot = list(map(plot_values.__getitem__, y_pred)) + y_teste_plot = list(map(plot_values.__getitem__, y_test)) - # Cria um plot para cada grupo + # Create a plot for each group grupos_count = np.bincount(grupos_teste) n_grupos = len(grupos_count) @@ -230,391 +208,3 @@ def set_config(name, value, overwrite=False): sharex=apresente_conf["sharex"], ) plt.subplots_adjust(hspace=apresente_conf["hspace"]) - if n_grupos == 1: - axes = [axes] - - axes[0].set_title( - f"F_BETA [%]: {f_beta:.3f} MEAN_LOG_LOSS: {log_loss_medio:.5f}" - ) - - grupo_idx_inicio = 0 - for grupo, ax in enumerate(axes): - grupo_count = grupos_count[grupo] - y_prev_grupo = y_prev_plot[ - grupo_idx_inicio : grupo_idx_inicio + grupo_count - ] - y_teste_grupo = y_teste_plot[ - grupo_idx_inicio : grupo_idx_inicio + grupo_count - ] - y_prob_nao_normal_grupo = y_prob_nao_normal[ - grupo_idx_inicio : grupo_idx_inicio + grupo_count - ] - - ax.plot(y_prev_grupo, marker=11, color="orange", linestyle="") - ax.plot(y_teste_grupo, marker=10, color="green", linestyle="") - ax.set_ylim([-0.2, 1.2]) - yticks, yticklabels = [0, 1], ["normal", "em regime"] - if codigo_transiente is not None: - yticks.insert(1, 0.5) - yticklabels.insert(1, "transiente") - ax.set_yticks(yticks) - ax.set_yticklabels(yticklabels) - if apresente_conf["mostrar_nome_instancia"] and ( - self.nome_instancias_teste is not None - ): - title = ax.get_title() - if title != "": - title += "\n" - title += f"{self.nome_instancias_teste[grupo]}" - ax.set_title(title) - - # Segundo eixo com probabilidade de regiem+transiente - if apresente_conf["mostra_prob"]: - ax2 = ax.twinx() - ax2.plot( - 100.0 * (y_prob_nao_normal_grupo), - color="orange", - linestyle="-", - alpha=0.6, - label="prob. 
não normal", - ) - ax2.set_ylim(0, 100) - - grupo_idx_inicio += grupo_count - - axes[0].legend(["predita", "verdadeira"]) - axes[-1].set_xlabel("amostra") - plt.show() - - return metricas - - -# Classe que encapsula vários objetos da classe EventFold -class EventFolds: - def __init__( - self, - experiment, - nomes_instancias, - folds_instancias, - ): - self.experiment: Experiment = experiment - self.event_type = experiment.event_type - self.use_instancias_extras = experiment.use_instancias_extras - self.pad_mode = experiment.pad_mode - self.pbar = experiment.pbar - self.warnings = experiment.warnings - self.forca_binario = experiment.forca_binario - - self.LABEL = experiment.LABEL - self.OBSERVATION_LABELS = experiment.OBSERVATION_LABELS - self.TRANSIENT = experiment.TRANSIENT - self.window = experiment.window - self.step = experiment.step - - assert not self.use_instancias_extras, "Funcionalidade não implementada" - - # Filtro de nomes de eventos - self.filtre_nomes_instancias = lambda filtro: list( - compress(nomes_instancias, map(filtro, folds_instancias)) - ) - - # Guarda nome das instâncias extras - self.nomes_instancias_extras = self.filtre_nomes_instancias( - lambda fold: fold == EXTRA_INSTANCES_TRAINING - ) - - # Obtém código de todos os folds, ignorando o fold negativo (utilizado - # sempre para treino) - self.folds_nums = sorted(set(folds_instancias) - {EXTRA_INSTANCES_TRAINING}) - - # Carrega instâncias do evento - nomes_instancias_evento = self.filtre_nomes_instancias( - lambda fold: fold != EXTRA_INSTANCES_TRAINING - ) - self.instancias = {} - with alive_bar( - len(nomes_instancias_evento), - disable=not (self.pbar), - force_tty=True, - title=f"Loading instances", - bar="bubbles", - spinner=None, - ) as bar: - for nome_instancia in nomes_instancias_evento: - self.instancias[nome_instancia] = self.carregue_instancia( - nome_instancia - ) - bar() - - # Cria folds, agrupado por fold_num - self.folds = [] - for fold_num in self.folds_nums: - # Treino - nome_instancias_treino = self.filtre_nomes_instancias( - lambda fold: fold not in {fold_num, EXTRA_INSTANCES_TRAINING} - ) - instancias_treino = [ - self.instancias[nome_instancia] - for nome_instancia in nome_instancias_treino - ] - - # Teste - nome_instancias_teste = self.filtre_nomes_instancias( - lambda fold: fold == fold_num - ) - instancias_teste = [ - self.instancias[nome_instancia] - for nome_instancia in nome_instancias_teste - ] - - # Cria Fold - event_fold = EventFold( - instancias_treino, - instancias_teste, - self.step, # WIP - self.step, # WIP - self, - nome_instancias_treino, - nome_instancias_teste, - ) - self.folds.append(event_fold) - - self.folds_metricas = {fold: None for fold in self.folds} - - def extrai_arrays(self, instancia_abs, pad_mode="na"): - """ - Extrai np.arrays X e y a partir do csv em instancia_abs. - Na extração os valore de referência são calculados e incluídos - como colunas em X. X tem ses dados completados segundo pad_mode - para formar primeiras janelas. 
- - pad_mode: - 'na' : completa X com NA alinhando com primeiro dado - anotado em y - 'valid' : descarta os dados que não cabem na primeira janela - de detecção - """ - # Leitura do arquivo CSV que contém a instância - with instancia_abs.open() as f: - df = pd.read_csv(f, usecols=VARS + [CLASS]) - - # Extração dos conjuntos de amostras para treino - X_treino = df[VARS].values.astype(np.float32) - - # Verifica primeiro índice da variável target - first_class = df[CLASS].first_valid_index() - inicio_X = first_class - self.window + 1 - inicio_y = first_class - - # Verifica o tamanho da jenala solicitada e aplica pad se - # necessário - if inicio_X < 0: - if self.warnings: - warnings.warn( - f'Arquivo "{instancia_abs}" não possui amostras suficientes para janela de detecção solicitada ({self.window}s.\ - Aplicando pad {pad_mode})', - RuntimeWarning, - ) - if pad_mode == "na": - # Completando os dados em X_treino para com NA - X_treino = np.vstack( - [ - np.full( - (-inicio_X, X_treino.shape[1]), - np.nan, - dtype=np.float32, - ), - X_treino, - ] - ) - inicio_X = 0 - elif pad_mode == "valid": - # Descartando (-inicio_X) instantes do df para ter 1a - # janela válida - inicio_y += -inicio_X - inicio_X = 0 - - # Validando se janela solicitada é maior do que dados - # disponíveis - if inicio_y >= df.shape[0]: - raise ( - Exception( - f"Arquivo '{instancia_abs}' não possui amostras suficientes para pad: {pad_mode}." - ) - ) - - # Validando se mais de 50% dos dados normais foram - # descartados (ou algum outro controle de qualidade?) - # TODO - - else: - raise (Exception(f"Opção de pad não reconhecida: {pad_mode}.")) - - X_treino_pad = X_treino[inicio_X:] - y_treino = df.iloc[inicio_y:][CLASS].values - - return X_treino_pad, y_treino - - def carregue_instancia(self, instancia): - instancia_abs = Path(os.path.join(PATH_DATASET, instancia)) - X_treino_extra, y_treino = self.extrai_arrays( - instancia_abs, pad_mode=self.pad_mode - ) - - # Aplicação de janela deslizante - Xw_treino = rolling_window(X_treino_extra, self.window, axes=0, toend=False) - - # Check de sanidade - assert len(y_treino) == len( - Xw_treino - ), f'[BUG] X e y de treino não estão sincronizados para o arquivo "{instancia_abs}"' - - assert ( - min(Xw_treino.shape) > 0 - ), f'Janela especificada gerou instância sem amostras para o arquivo "{instancia_abs}"' - - # Ao usar instâncias de outros eventos para o treinamento do - # evento corrente (self.event_type) - # códigos de outros eventos podem surgir em y_treino. - # y_treino deve ter somente os códigos do evento corrente. - # Os códigos novos (derivados de outros eventos) são convertidos - # para código do evento Normal (0). 
- y_finite_mask = np.isfinite(y_treino) - outro_codigo_mask = y_finite_mask & np.isin( - y_treino, list(self.OBSERVATION_LABELS), invert=True - ) - if self.warnings and outro_codigo_mask.sum() > 0: - novos_codigos = set(y_treino[outro_codigo_mask]) - warnings.warn( - f'Códigos de outros eventos ("{novos_codigos}") sendo convertidos para 0.', - RuntimeWarning, - ) - y_treino[outro_codigo_mask] = 0 - - # Tratamento para classificação binária : codigo_transitente -> - # codigo_regime - if self.TRANSIENT and self.forca_binario: - codigo_regime = self.LABEL - codigo_transiente = self.LABEL + TRANSIENT_OFFSET - y_treino[y_treino == codigo_transiente] = codigo_regime - - return Xw_treino, y_treino - - def __iter__(self): - for fold in self.folds: - yield fold - - def __len__(self): - return len(self.folds) - - # Método para retenção de métricas - def salve_metricas_parciais(self, fold, metricas): - assert fold in self.folds_metricas, "Fold não encontrado" - if self.folds_metricas[fold] is not None: - warnings.warn( - "Fold com métricas já computadas. Recarregue os folds " - + "para evitar esta mensagem.", - RuntimeWarning, - ) - self.folds_metricas[fold] = metricas - - @lru_cache(1) - def extraia_amostras_simuladas_e_desenhadas(self): - # Obtém instâncias extras (simuladas e desenhadas, representadas - # pelo fold==EXTRA_INSTANCES_TRAINING) - instancias_extras = [] - with alive_bar(len(self.nomes_instancias_extras)) as bar: - for nome_instancia in self.pbar(self.nomes_instancias_extras): - instancias_extras.append(self.carregue_instancia(nome_instancia)) - bar() - - instancias_extras_passo = [ - (X[:: self.step], y[:: self.step]) for X, y in instancias_extras - ] # Aplica passo de treino - return extraia_exemplos(instancias_extras_passo) - - # Método para consulta/cálculo de métricas parciais e globais (média - # e std) - def get_metrics(self, boxplot=False): - folds_metrics = { - i: metrics for i, (_, metrics) in enumerate(self.folds_metricas.items()) - } - df_metricas = pd.DataFrame.from_dict(folds_metrics, orient="index") - if boxplot: - for metrica in ["F_BETA [%]", "MEAN_LOG_LOSS"]: - plt.figure(figsize=(11, 1)) - sns.boxplot(x=df_metricas[metrica], width=0.4, palette="colorblind") - sns.stripplot( - x=df_metricas[metrica], - jitter=True, - marker="o", - alpha=0.5, - color="black", - ) - plt.show() - df_metricas.index.name = "FOLD" - df_metricas.loc["MEAN"] = df_metricas.mean() - df_metricas.loc["STANDARD DEVIATION"] = df_metricas.std() - self.experiment.metrics = df_metricas.loc["MEAN"].to_dict() - return df_metricas - - -class Experiment: - """This class contains objects related to machine learning approach - experiments""" - - def __init__( - self, - event_name, - ova=True, - use_instancias_extras=False, # WIP - pad_mode="valid", - pbar=True, - warnings=False, - forca_binario=False, # WIP - ): - """_summary_""" - self.event_type = EventType(event_name) - self.ova = ova - self.use_instancias_extras = use_instancias_extras # WIP - self.pad_mode = pad_mode - self.pbar = pbar - self.warnings = warnings - self.forca_binario = forca_binario # WIP - - self.LABEL = self.event_type.LABEL - self.OBSERVATION_LABELS = self.event_type.OBSERVATION_LABELS - self.DESCRIPTION = self.event_type.DESCRIPTION - self.TRANSIENT = self.event_type.TRANSIENT - self.window = self.event_type.window - self.step = self.event_type.step - - @property - def event_labels(self): # WIP - """ - Dicionário com os códigos das classes que envolvem essa tarefa - de classificação. 
As classes podem ser 'normal', 'regime' e
-        'transiente'. A classe transiente não existe para tarefas de
-        classificação binária.
-        """
-        codigos = {"normal": 0, "regime": self.LABEL}
-        if self.TRANSIENT and (not self.forca_binario):
-            codigos["transiente"] = self.LABEL + TRANSIENT_OFFSET
-        return codigos
-
-    def folds(self):
-        folds = os.path.join(PATH_FOLDS, f"folds_clf_{self.LABEL:02d}.csv")
-        with Path(folds).open() as f:
-            df_event = pd.read_csv(f)
-
-        if not self.ova:
-            df_event = df_event.query("~is_ova")
-
-        nomes_instancias = df_event["instancia"].tolist()
-        folds_instancias = df_event["fold"].tolist()
-
-        return EventFolds(
-            self,
-            nomes_instancias,
-            folds_instancias,
-        )

From 3a0c86265ad1cb64212d2aaf8fecb9fc01d910ff Mon Sep 17 00:00:00 2001
From: Kelly Castro
Date: Sat, 19 Oct 2024 04:23:05 -0300
Subject: [PATCH 3/6] feat/update_init_for_3w_dataset_2.0

---
 toolkit/__init__.py | 49 +++++++++++++++++++++++++++------------------
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/toolkit/__init__.py b/toolkit/__init__.py
index e3419e2e..6bd9f6bb 100644
--- a/toolkit/__init__.py
+++ b/toolkit/__init__.py
@@ -1,53 +1,59 @@
-"""This is the 3W Toolkit, a software package written in Python 3 that
+"""This is the 3W Toolkit, a software package written in Python 3 that
 is one of the 3W Project's major components.
 
 This toolkit contains resources that make the following easier:
 
 - 3W Dataset overview generation;
-- Experimentation and comparative analysis of Machine Learning-based
+- Experimentation and comparative analysis of Machine Learning-based
 approaches and algorithms for specific problems related to undesirable
 events that occur in offshore oil wells during their respective
 production phases;
-* Standardization of key points of the Machine Learning-based algorithm
+* Standardization of key points of the Machine Learning-based algorithm
 development pipeline.
 
 All these resources are implemented in the following sub-modules:
 
 - **base**: groups the objects used by the other sub-modules;
-- **dev**: has all the resources related to development of Machine
+- **dev**: has all the resources related to development of Machine
 Learning models;
-- **misc**: brings together diverse resources that do not fit in the
+- **misc**: brings together diverse resources that do not fit in the
 other sub-modules;
-- **rolling_window**: creates a view of array which for every point
+- **rolling_window**: creates a view of array which for every point
 gives the n-dimensional neighbourhood of size window. New dimensions are
 added at the end of array or after the corresponding original dimension.
 
-Specific problems will be incorporated into this toolkit gradually. At
+Specific problems will be incorporated into this toolkit gradually. At
 this time, models can be developed for the following problems:
 
 - Binary Classifier of Spurious Closure of DHSV.
 
-Examples of how to use this toolkit will be incremented throughout its
+Examples of how to use this toolkit will be incremented throughout its
 development.
 
 Please, check the 3W Project's README.md file for more details.
 
-It is important to note that there are arbitrary choices in this
-toolkit, but they have been carefully made to allow adequate comparative
-analysis without compromising the ability to experiment with different
+It is important to note that there are arbitrary choices in this
+toolkit, but they have been carefully made to allow adequate comparative
+analysis without compromising the ability to experiment with different
 approaches and algorithms.
 
-This toolkit's documentation is generated in english and in Google format
+This toolkit's documentation is generated in english and in Google format
 with [autoDocstring - Python Docstring Generator
 ](https://github.com/NilsJPWerner/autoDocstring), which follows [PEP 257
 ](https://peps.python.org/pep-0257/), and [pdoc3
 ](https://pdoc3.github.io/pdoc/).
 
-Its source code is implemented according to the style guide established
-by [PEP 8](https://peps.python.org/pep-0008/). This is guaranteed with
+Its source code is implemented according to the style guide established
+by [PEP 8](https://peps.python.org/pep-0008/). This is guaranteed with
 the use of the [Black formatter](https://github.com/psf/black).
 """
 
 __status__ = "Development"
-__version__ = "1.1.0"
+__version__ = "1.2.0"
 __license__ = "Apache License 2.0"
 __copyright__ = "Copyright 2024, Petróleo Brasileiro S.A."
 __authors__ = [
@@ -73,7 +79,8 @@
     EventType,
     LABELS_DESCRIPTIONS,
     NORMAL_LABEL,
     PARQUET_EXTENSION,
+    PARQUET_ENGINE,
     PARQUET_COMPRESSION,
     PATH_3W_PROJECT,
@@ -83,6 +90,7 @@
     PATH_TOOLKIT,
     TRANSIENT_OFFSET,
     VARS,
+    load_3w_dataset,  # For use with 3W v2.0
     load_config_in_dataset_ini,
 )
@@ -104,7 +112,8 @@
     get_all_labels_and_files,
     label_and_file_generator,
     load_instance,
     load_instances,
+    resample,
     plot_instance,
 )

From 194998e43dc9e0cad5b6a041b6211fbba56a7a81 Mon Sep 17 00:00:00 2001
From: Kelly Castro
Date: Sat, 19 Oct 2024 05:01:56 -0300
Subject: [PATCH 4/6] feat/adapt_misc_to_3w_dataset_2.0

---
 toolkit/misc.py | 634 ++++--------------------------------------------
 1 file changed, 41 insertions(+), 593 deletions(-)

diff --git a/toolkit/misc.py b/toolkit/misc.py
index 6f2323e5..bbeb6b16 100644
--- a/toolkit/misc.py
+++ b/toolkit/misc.py
@@ -33,190 +33,90 @@
     EVENT_NAMES,
     PARQUET_EXTENSION,
     PARQUET_ENGINE,
+    load_3w_dataset,  # To work with 3W v2.0
 )
 
 
 # Methods
 #
-def label_and_file_generator(real=True, simulated=False, drawn=False):
-    """This is a generating function that returns tuples for all
-    indicated instance sources (`real`, `simulated` and/or
-    `hand-drawn`). Each tuple refers to a specific instance and contains
-    its label (int) and its full path (Path). All 3W Dataset's instances
-    are considered.
-
-    Args:
-        real (bool, optional): Indicates whether `real` instances should
-            be considered. Defaults to True.
-        simulated (bool, optional): Indicates whether `simulated`
-            instances should be considered. Defaults to False.
-        drawn (bool, optional): Indicates whether `hand-drawn` instances
-            should be considered. Defaults to False.
-
-    Yields:
-        generator: Tuples for all indicated instance sources. Each tuple
-            refers to a specific instance and contains its label (int)
-            and its full path (Path).
- """ - for i in Path(PATH_DATASET).iterdir(): - try: - # Considers only directories - if i.is_dir(): - label = int(i.stem) - for fp in i.iterdir(): - # Considers only Parquet files - if fp.suffix == PARQUET_EXTENSION: - # Considers only instances from the requested - # source - if ( - (simulated and fp.stem.startswith("SIMULATED")) - or (drawn and fp.stem.startswith("DRAWN")) - or ( - real - and (not fp.stem.startswith("SIMULATED")) - and (not fp.stem.startswith("DRAWN")) - ) - ): - yield label, fp - except: - # Otherwise (e.g. files or directory without instances), do - # nothing - pass - - -def get_all_labels_and_files(): - """Gets lists with tuples related to all real, simulated, or - hand-drawn instances contained in the 3w Dataset. Each list - considers instances from a single source. Each tuple refers to a - specific instance and contains its label (int) and its full path - (Path). - Returns: - tuple: Tuple containing three lists with tuples related to real, - simulated, and hand-drawn instances, respectively. - """ - real_instances = list( - label_and_file_generator(real=True, simulated=False, drawn=False) - ) - simulated_instances = list( - label_and_file_generator(real=False, simulated=True, drawn=False) - ) - drawn_instances = list( - label_and_file_generator(real=False, simulated=False, drawn=True) - ) - - return real_instances, simulated_instances, drawn_instances - - -def create_table_of_instances(real_instances, simulated_instances, drawn_instances): +def create_table_of_instances(df): """Creates a table of instances (pandas.DataFrame) that shows the amount of instances that compose the 3W Dataset, by knowledge source - (real, simulated and hand-drawn instances) and by instance label. + (real, simulated and imputed instances) and by instance label. Args: - real_instances (list): List with tuples related to all - real instances contained in the 3w Dataset. Each tuple - must refer to a specific instance and must contain its label - (int) and its full path (Path). - simulated_instances (list): List with tuples related to all - simulated instances contained in the 3w Dataset. Each tuple - must refer to a specific instance and must contain its label - (int) and its full path (Path). - drawn_instances (list): List with tuples related to all - hand-drawn instances contained in the 3w Dataset. Each tuple - must refer to a specific instance and must contain its label - (int) and its full path (Path). + df (pandas.DataFrame): DataFrame with the 3W Dataset 2.0 data. Returns: pandas.DataFrame: The created table that shows the amount of instances that compose the 3W Dataset, by knowledge source - (real, simulated and hand-drawn instances) and by instance + (real, simulated and imputed instances) and by instance label. 
""" - # Gets the label's description of all instances as a list of dicts - list_instances = ( - [ - { - "INSTANCE LABEL": str(label) + " - " + LABELS_DESCRIPTIONS[label], - "SOURCE": "REAL", - } - for label, fp in real_instances - ] - + [ - { - "INSTANCE LABEL": str(label) + " - " + LABELS_DESCRIPTIONS[label], - "SOURCE": "SIMULATED", - } - for label, fp in simulated_instances - ] - + [ - { - "INSTANCE LABEL": str(label) + " - " + LABELS_DESCRIPTIONS[label], - "SOURCE": "HAND-DRAWN", - } - for label, fp in drawn_instances - ] - ) - # Transforms the list of dicts into a pandas.DataFrame - df_instances = pd.DataFrame(list_instances) + # Create a new column with the instance label and description + df['INSTANCE LABEL'] = df['label'].astype(str) + " - " + df['label'].map(LABELS_DESCRIPTIONS) - # Creates the table of instances with relevant information and - # desired format + # Create the table of instances toi = ( - df_instances.groupby(["INSTANCE LABEL", "SOURCE"]) + df.groupby(['INSTANCE LABEL', 'state']) .size() .reset_index() - .pivot("SOURCE", "INSTANCE LABEL", 0) + .pivot('state', 'INSTANCE LABEL', 0) .fillna(0) .astype(int) .T ) - toi = toi.loc[natsorted(toi.index.values)] - toi = toi[["REAL", "SIMULATED", "HAND-DRAWN"]] - toi["TOTAL"] = toi.sum(axis=1) - toi.loc["TOTAL"] = toi.sum(axis=0) + + # Rename the columns to represent the data sources + toi = toi.rename(columns={0: 'REAL', 1: 'SIMULATED', 2: 'IMPUTED'}) + + # Add a 'TOTAL' column and row + toi['TOTAL'] = toi.sum(axis=1) + toi.loc['TOTAL'] = toi.sum(axis=0) return toi -def filter_rare_undesirable_events(toi, threshold, simulated=False, drawn=False): +def filter_rare_undesirable_events(toi, threshold, simulated=False, imputed=False): """Generates a table of instances (pandas.DataFrame) that shows the amount of filtered instances, by knowledge source (real, `simulated` - and `hand-drawn` instances) and by instance label. This filter keeps - only real instances, as well as `simulated` and `hand-drawn` if + and `imputed` instances) and by instance label. This filter keeps + only real instances, as well as `simulated` and `imputed` if indicated, of rare event types. An event type is considered rare if the amount of instances labeled as this event relative to the total number of instances is less than the indicated `threshold`. In both - totalizations, `simulated` and `hand-drawn` instances are only + totalizations, `simulated` and `imputed` instances are only considered if indicated, but real instances are always taken into account. Args: toi (pandas.DataFrame): Table that shows the amount of instances that compose the 3W Dataset, by knowledge source (real, - `simulated` and `hand-drawn` instances) and by instance + `simulated` and `imputed` instances) and by instance label. This object is not modified in this function. threshold (float): Relative limit that establishes rare event types. simulated (bool, optional): Indicates whether `simulated` instances should be considered. Defaults to False. - drawn (bool, optional): Indicates whether `hand-drawn` instances + imputed (bool, optional): Indicates whether `imputed` instances should be considered. Defaults to False. Returns: pandas.DataFrame: The table of instances (pandas.DataFrame) that shows the amount of filtered instances, by knowledge source - (real, simulated and hand-drawn instances) and by instance + (real, simulated and imputed instances) and by instance label. 
""" - # Simulated and hand-drawn instances are optional, but real + # Simulated and imputed instances are optional, but real # instances are always considered totals = 0 if simulated: totals += toi["SIMULATED"] - if drawn: - totals += toi["HAND-DRAWN"] + if imputed: + totals += toi["IMPUTED"] totals += toi["REAL"] # Absolute limit @@ -229,12 +129,12 @@ def filter_rare_undesirable_events(toi, threshold, simulated=False, drawn=False) return rue -def load_instance(instance): +def load_instance(label, fp): """Loads all data and metadata from a specific `instance`. Args: - instance (tuple): This tuple must refer to a specific `instance` - and contain its label (int) and its full path (Path). + label (int): Label of the instance. + fp (Path): Full path to the instance file. Raises: Exception: Error if the Parquet file passed as arg cannot be @@ -246,8 +146,6 @@ def load_instance(instance): other columns of the Parquet file and metadata loaded from the argument `instance` (label, well, and id). """ - # Loads label metadata from the argument `instance` - label, fp = instance try: # Loads well and id metadata from the argument `instance` @@ -272,30 +170,24 @@ def load_instance(instance): return df -def load_instances(instances): - """Loads all data and metadata from multiple `instances` in - parallel. +def load_instances(df): # Changed function signature + """Loads all data and metadata from the DataFrame. Args: - instances (list): List with tuples related to real, simulated, - or hand-drawn `instances`. Each tuple must refer to a - specific instance and must contain its label (int) and its - full path (Path). + df (pandas.DataFrame): DataFrame with the 3W Dataset 2.0 data. Returns: - pandas.DataFrame: Its index contains the timestamps loaded from - the Parquet files. Its columns contain data loaded from the - other columns of the Parquet files and the metadata label, - well, and id). + pandas.DataFrame: DataFrame with loaded instances. """ + # Prepares for multiple parallel loadings pool = ThreadPool() dfs = [] try: # Calls multiple loadings in parallel - for df in pool.imap_unordered(load_instance, instances): - dfs.append(df) + for label, fp in df[['label', 'filepath']].values: # Assuming 'filepath' column exists + dfs.append(load_instance(label, Path(fp))) finally: # If the instance cannot be loaded pool.terminate() @@ -304,22 +196,17 @@ def load_instances(instances): return pd.concat(dfs) -def create_and_plot_scatter_map(real_instances): +def create_and_plot_scatter_map(df): """Creates and plots scatter map with all the real instances listed - in the `real_instances` argument. + in the `df` argument. Args: - real_instances (list): List with tuples related to all - real instances contained in the 3w Dataset. Each tuple - must refer to a specific instance and must contain its label - (int) and its full path (Path). + df (pandas.DataFrame): DataFrame with the 3W Dataset 2.0 data. Returns: tuple: Tuple containing the first and the last year of occurrence among all instances, respectively. 
""" - # Loads all instances - df = load_instances(real_instances) # Finds the first and the last year of occurrence among all instances df_time = ( @@ -349,443 +236,4 @@ def create_and_plot_scatter_map(real_instances): last_year = np.max(df_time["max"]).year plt.rcParams["axes.labelsize"] = 9 plt.rcParams["font.size"] = 9 - plt.rcParams["legend.fontsize"] = 9 - fig, ax = plt.subplots(figsize=(9, 9)) - yticks = [] - yticks_labels = [] - for well in well_times.keys(): - times = well_times[well] - class_names = well_classes[well] - class_colors = list(map(cmap, class_names)) - well_id = well_code[well] - yticks.append(well_id * height + height / 2 - border / 2) - yticks_labels.append(well) - ax.broken_barh( - times, - (well_id * height, height - border), - facecolors=class_colors, - edgecolors=class_colors, - ) - ax.grid(True) - ax.set_axisbelow(True) - ax.set_yticks(yticks) - ax.set_yticklabels(yticks_labels) - ax.xaxis.set_major_locator(mdates.YearLocator()) - ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y")) - legend_colors = [ - Patch(facecolor=cmap(l), label=str(l) + " - " + d) - for l, d in LABELS_DESCRIPTIONS.items() - ] - ax.legend( - frameon=False, - handles=legend_colors, - loc="upper center", - bbox_to_anchor=(0.5, 1.12), - ncol=4, - ) - - return first_year, last_year - - -def count_properties_instance(instance): - """Counts properties from a specific `instance`. - - Args: - instance (tuple): This tuple must refer to a specific `instance` - and contain its label (int) and its full path (Path). - - Raises: - Exception: Error if the Parquet file passed as arg cannot be - read. - - Returns: - dict: Dict containing the counted properties with the following - keys: n_vars (number of variables), n_vars_missing (number - of missing variables), n_vars_frozen (number of frozen - variables), n_obs (number of observations), and - n_obs_unlabeled (number of unlabeled observations). - """ - # Preparation for counting - _, fp = instance - p = {"n_vars_missing": 0, "n_vars_frozen": 0} - - try: - # Read the Parquet file - df = pd.read_parquet(fp, engine=PARQUET_ENGINE) - except Exception as e: - raise Exception(f"error reading file {fp}: {e}") - - # Counts properties - vars = df.columns[:-1] # Last column with class is not considered - p["n_vars"] = len(vars) - for var in vars: - if df[var].isnull().all(): - p["n_vars_missing"] += 1 - u_values = df[var].unique() - if len(u_values) == 1 and not np.isnan(u_values): - p["n_vars_frozen"] += 1 - p["n_obs"] = len(df) - p["n_obs_unlabeled"] = df["class"].isnull().sum() - - return p - - -def count_properties_instances(instances): - """Counts properties from multiple `instances` in parallel. - - Args: - instances (list): List with tuples related to real, simulated, - or hand-drawn `instances`. Each tuple must refer to a - specific instance and must contain its label (int) and its - full path (Path). - - Returns: - dict: Dict containing the counted properties with the following - keys: n_vars (number of variables), n_vars_missing (number - of missing variables), n_vars_frozen (number of frozen - variables), n_obs (number of observations), and - n_obs_unlabeled (number of unlabeled observations). 
- """ - # Prepares for multiple parallel counts - pool = ThreadPool() - ps = [] - - try: - # Calls multiple counts in parallel - for p in pool.imap_unordered(count_properties_instance, instances): - ps.append(p) - finally: - # If the instance cannot be loaded - pool.terminate() - - # Sum ps and return the result - return dict(pd.DataFrame(ps).sum()) - - -def calc_stats_instances(real_instances, simulated_instances, drawn_instances): - """Calculates the 3W Dataset's fundamental aspects related to - inherent difficulties of actual data. Three statistics are - calculated: Missing Variables, Frozen Variables, and Unlabeled - Observations. All instances, regardless of their source, influence - these statistics. - - Args: - real_instances (list): List with tuples related to all - real instances contained in the 3w Dataset. Each tuple - must refer to a specific instance and must contain its label - (int) and its full path (Path). - simulated_instances (list): List with tuples related to all - simulated instances contained in the 3w Dataset. Each tuple - must refer to a specific instance and must contain its label - (int) and its full path (Path). - drawn_instances (list): List with tuples related to all - hand-drawn instances contained in the 3w Dataset. Each tuple - must refer to a specific instance and must contain its label - (int) and its full path (Path). - - Returns: - pandas.DataFrame: Its index contains the statistic's names. Its - columns contain statistics themselves (Amount and - Percentage) - """ - # Counts properties from all indicated instances - p = count_properties_instances( - real_instances + simulated_instances + drawn_instances - ) - - # Extract properties - n_vars = p["n_vars"] - n_vars_missing = p["n_vars_missing"] - n_vars_frozen = p["n_vars_frozen"] - n_obs = p["n_obs"] - n_obs_unlabeled = p["n_obs_unlabeled"] - - # Calculates the statistics - stats = { - "Missing Variables": [ - n_vars_missing, - f"{100*n_vars_missing/n_vars:.2f}% of {n_vars}", - ], - "Frozen Variables": [ - n_vars_frozen, - f"{100*n_vars_frozen/n_vars:.2f}% of {n_vars}", - ], - "Unlabeled Observations": [ - n_obs_unlabeled, - f"{100*n_obs_unlabeled/n_obs:.2f}% of {n_obs}", - ], - } - - return pd.DataFrame.from_dict( - stats, orient="index", columns=["Amount", "Percentage"] - ) - - -def resample(data, n, class_number): - """Downsampling for instances. - - Args: - data (string): Instance path - n (integer): Factor to downsampling the instance. 
- class_number (integer): integer that represents the event class - - Returns: - pandas.DataFrame: Downsamplig instance DataFrame - """ - # Timestamp is expected to be a column - data.reset_index(inplace=True) - # Group Timestamp and get last value - resampleTimestamp = data.timestamp.groupby(data.index // n).max() - # Replace transient label from 100 to 0.5 - data["class"] = data["class"].astype(float) - tempClassLabel = data["class"].replace(class_number + 100, 0.5) - # Get the max value from the group Class column - resampleClass = tempClassLabel.groupby(tempClassLabel.index // n).max() - # Back with transient label value - resampleClass.replace(0.5, class_number + 100, inplace=True) - # Non overlap group and get the average value from the data - dfResample = data.groupby(data.index // n).mean(numeric_only=True) - # Drop class column - dfResample.drop(["class"], axis=1, inplace=True) - # Insert resampled class label values - dfResample["class"] = resampleClass - # Insert resampled timestamp - dfResample.index = resampleTimestamp - - return dfResample - - -def plot_instance(class_number, instance_index, resample_factor): - """Plot one especific event class and instance. By default the - instance is downsampling (n=100) and Z-score Scaler. In order to - help the visualization transient labels was changed to '0.5'. - - Args: - class_number (integer): integer that represents the event class - instance_index (integer): input the instance file index - """ - instances_path = os.path.join( - PATH_DATASET, str(class_number), "*" + PARQUET_EXTENSION - ) - instances_path_list = glob.glob(instances_path) - if instance_index >= len(instances_path_list): - print( - f"instance index {instance_index} out of range - Insert a valid index between 0 and {len(instances_path_list)-1}" - ) - else: - df_instance = pd.read_parquet( - instances_path_list[instance_index], engine=PARQUET_ENGINE - ) - df_instance_resampled = resample(df_instance, resample_factor, class_number) - df_drop_resampled = df_instance_resampled.drop(["state", "class"], axis=1) - df_drop_resampled.interpolate( - method="linear", limit_direction="both", axis=0, inplace=True - ) - df_drop_resampled.fillna( - 0, - inplace=True, - ) - scaler_resampled = TimeSeriesScalerMeanVariance().fit_transform( - df_drop_resampled - ) - - df_scaler_resampled = pd.DataFrame( - scaler_resampled.squeeze(), - index=df_drop_resampled.index, - columns=df_drop_resampled.columns, - ) - df_instance_resampled["class"] = df_instance_resampled["class"].replace( - 100 + int(class_number), 0.5 - ) - df_instance_resampled["class"] = df_instance_resampled["class"].replace( - int(class_number), 1 - ) - - colors_traces = [ - "#008080", - "#3498DB", - "#E74C3C", - "#884EA0", - "#D4AC0D", - "#AF601A", - "#D35400", - "#839192", - "#2E4053", - ] - fig = go.Figure() - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[0]], - mode="lines+markers", - marker_symbol="circle", - marker_size=3, - name=VARS[0], - yaxis="y1", - line_color=colors_traces[0], - ) - ), - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[1]], - mode="lines+markers", - marker_symbol="diamond", - marker_size=3, - name=VARS[1], - yaxis="y2", - line_color=colors_traces[1], - ) - ), - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[2]], - mode="lines+markers", - marker_symbol="x", - marker_size=3, - name=VARS[2], - yaxis="y3", - line_color=colors_traces[2], - ) - ), - fig.add_trace( - 
go.Scatter( - x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[3]], - mode="lines+markers", - marker_symbol="star", - marker_size=3, - name=VARS[3], - yaxis="y4", - line_color=colors_traces[3], - ) - ), - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[4]], - mode="lines+markers", - marker_symbol="triangle-up", - marker_size=3, - name=VARS[4], - yaxis="y5", - line_color=colors_traces[4], - ) - ), - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[5]], - mode="lines", - name=VARS[5], - yaxis="y6", - line_color=colors_traces[5], - ) - ), - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[6]], - mode="lines", - name=VARS[6], - yaxis="y7", - line_color=colors_traces[6], - ) - ), - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[7]], - mode="lines", - name=VARS[7], - yaxis="y8", - line_color=colors_traces[7], - ) - ), - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_instance_resampled["class"], - mode="markers", - name="Label", - yaxis="y9", - line_color=colors_traces[8], - ) - ), - fileName = instances_path_list[instance_index].split(os.sep) - fig.update_layout( - title=EVENT_NAMES[class_number] + " - " + fileName[-1], - xaxis_title="Time(s)", - yaxis_title="z-score", - font=dict(size=12), - yaxis1=dict( - tickfont=dict(color=colors_traces[0]), - position=0, - tickformat=".2f", - showticklabels=False, - ), - yaxis2=dict( - tickfont=dict(color=colors_traces[1]), - overlaying="y", - side="left", - position=0.05, - tickformat=".2f", - showticklabels=False, - ), - yaxis3=dict( - tickfont=dict(color=colors_traces[2]), - overlaying="y", - side="left", - position=0.10, - tickformat=".2f", - showticklabels=False, - ), - yaxis4=dict( - tickfont=dict(color=colors_traces[3]), - overlaying="y", - side="left", - position=0.15, - tickformat=".2f", - showticklabels=False, - ), - yaxis5=dict( - tickfont=dict(color=colors_traces[4]), - overlaying="y", - side="left", - position=0.2, - tickformat=".2f", - showticklabels=False, - ), - yaxis6=dict( - tickfont=dict(color=colors_traces[5]), - overlaying="y", - side="left", - position=0.25, - tickformat=".2f", - showticklabels=False, - ), - yaxis7=dict( - tickfont=dict(color=colors_traces[6]), - overlaying="y", - side="left", - position=0.3, - tickformat=".2f", - showticklabels=False, - ), - yaxis8=dict( - tickfont=dict(color=colors_traces[7]), - overlaying="y", - side="left", - position=0.35, - tickformat=".2f", - showticklabels=False, - ), - yaxis9=dict( - tickfont=dict(color=colors_traces[8]), - anchor="x", - overlaying="y", - side="left", - ), - ) - fig.show() + plt.rcParams["legend.fontsize"] From 6a139e899fb956c1648a94a158f450b4a7a68d4e Mon Sep 17 00:00:00 2001 From: Kelly Castro Date: Sat, 19 Oct 2024 05:44:39 -0300 Subject: [PATCH 5/6] feat/update_readme --- toolkit/README.md | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/toolkit/README.md b/toolkit/README.md index b0ac4d58..eeaf3a06 100644 --- a/toolkit/README.md +++ b/toolkit/README.md @@ -13,8 +13,9 @@ * [Introduction](#introduction) * [Release Notes](#release-notes) - * [1.0.0](#100) - * [1.1.0](#110) +  * [1.0.0](#100) +  * [1.1.0](#110) +  * [1.2.0](#120) # Introduction @@ -36,4 +37,17 @@ Release: July 25, 2024. Highlights: -1. Makes resources (functions and constants) compatible with 3W Dataset version 2.0.0, which is based on Parquet files. 
\ No newline at end of file
+1. Makes resources (functions and constants) compatible with 3W Dataset version 2.0.0, which is based on Parquet files.
+
+## 1.2.0
+
+Release: October 19, 2024 (Data em que estamos hoje)
+
+Highlights:
+
+1. **Adapts `load_dataset()` to 3W Dataset 2.0:** The `load_dataset()` function in `base.py` was adapted to correctly handle the folder structure and different data types of the 3W Dataset 2.0. It was renamed to `load_3w_dataset()`.
+2. **Updates `dev.py` for 3W Dataset 2.0:** The `dev.py` sub-module was updated to ensure compatibility with the new `load_3w_dataset()` function and the 3W Dataset 2.0 structure. The `extrai_arrays()` function was removed, and the `EventFolds` and `Experiment` classes were adjusted.
+3. **Updates `misc.py` for 3W Dataset 2.0:** The `misc.py` sub-module was updated to ensure compatibility with the new `load_3w_dataset()` function and the 3W Dataset 2.0 structure. Redundant functions were removed, and existing functions were adapted to receive the DataFrame as a parameter.
+4. **Updates `__init__.py` for 3W Dataset 2.0:** The `__init__.py` file was updated to import and expose the new `load_3w_dataset()` function.
+
+These updates ensure that the 3W Toolkit is fully compatible with the 3W Dataset 2.0, providing a more efficient and streamlined workflow for loading and analyzing the data.
\ No newline at end of file

From a34b8bbbf7b11ea052b7297deae0b3283d71b940 Mon Sep 17 00:00:00 2001
From: Kelly Castro
Date: Sat, 19 Oct 2024 05:51:33 -0300
Subject: [PATCH 6/6] feat/update_readme

---
 toolkit/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/toolkit/README.md b/toolkit/README.md
index eeaf3a06..97bc38bc 100644
--- a/toolkit/README.md
+++ b/toolkit/README.md
@@ -41,7 +41,7 @@ Highlights:
 
 ## 1.2.0
 
-Release: October 19, 2024 (Data em que estamos hoje)
+Release: October 19, 2024
 
 Highlights: