diff --git a/README.md b/README.md index 71e33cd7..7f8a3934 100644 --- a/README.md +++ b/README.md @@ -131,6 +131,19 @@ It is important to note that there are arbitrary choices in this toolkit, but th The 3W Toolkit is implemented in sub-modules as discribed [here](3W_TOOLKIT_STRUCTURE.md). +### Loading the 3W Dataset 2.0 + +The `load_3w_dataset()` function loads the 3W Dataset 2.0, which is composed of multiple Parquet files organized in folders. + +**Usage:** + +```python +import toolkit as tk + +# Load the real data from the 3W Dataset 2.0 +df = tk.load_3w_dataset(data_type='real', base_path='path/to/dataset') +``` + ## Incorporated Problems Specific problems will be incorporated into this project gradually. At this point, we can work on: diff --git a/problems/01_binary_classifier_of_spurious_closure_of_dhsv/_baseline/main.ipynb b/problems/01_binary_classifier_of_spurious_closure_of_dhsv/_baseline/main.ipynb index 2bcb2658..90af9aec 100644 --- a/problems/01_binary_classifier_of_spurious_closure_of_dhsv/_baseline/main.ipynb +++ b/problems/01_binary_classifier_of_spurious_closure_of_dhsv/_baseline/main.ipynb @@ -49,7 +49,19 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'numpy'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[1], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msys\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mos\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[0;32m 5\u001b[0m 
sys\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mappend(os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m,\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m..\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[0;32m 6\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtoolkit\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mtk\u001b[39;00m\n", + "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'numpy'" + ] + } + ], "source": [ "import sys\n", "import os\n", @@ -58,6 +70,8 @@ "sys.path.append(os.path.join('..','..','..'))\n", "import toolkit as tk\n", "\n", + "from toolkit.base import load_3w_dataset\n", + "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format = 'svg'" ] @@ -78,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -101,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -114,9 +128,43 @@ ], "source": [ "event_labels = list(experiment.event_labels.values())\n", - "event_labels_idx = {v: i for i, v in enumerate(event_labels)}\n", - "fold: tk.EventFold\n", - "folds: tk.EventFolds = experiment.folds()" + "event_labels_idx = {v: i for i, v in enumerate(event_labels)}\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Upload 3W Dataset 2.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = load_3w_dataset(data_type='real', base_path='path/to/dataset') # Replaced by correct path" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create the folds manually" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], 
+ "source": [ + "folds = tk.EventFolds(\n", + " experiment=experiment,\n", + " df=df, # Pass the loaded DataFrame to the EventFolds class\n", + " # ... (other parameters, if necessary) ...\n", + ")\n" ] }, { @@ -135,7 +183,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -185,7 +233,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -1501,7 +1549,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.12.0" }, "toc": { "base_numbering": 1, diff --git a/toolkit/README.md b/toolkit/README.md index b0ac4d58..97bc38bc 100644 --- a/toolkit/README.md +++ b/toolkit/README.md @@ -13,8 +13,9 @@ * [Introduction](#introduction) * [Release Notes](#release-notes) - * [1.0.0](#100) - * [1.1.0](#110) +  * [1.0.0](#100) +  * [1.1.0](#110) +  * [1.2.0](#120) # Introduction @@ -36,4 +37,17 @@ Release: July 25, 2024. Highlights: -1. Makes resources (functions and constants) compatible with 3W Dataset version 2.0.0, which is based on Parquet files. \ No newline at end of file +1. Makes resources (functions and constants) compatible with 3W Dataset version 2.0.0, which is based on Parquet files. + +## 1.2.0 + +Release: October 19, 2024 (latest version). + +Highlights: + +1. **Adapts `load_dataset()` to 3W Dataset 2.0:** The `load_dataset()` function in `base.py` was adapted to correctly handle the folder structure and different data types of the 3W Dataset 2.0. It was renamed to `load_3w_dataset()`. +2. **Updates `dev.py` for 3W Dataset 2.0:** The `dev.py` sub-module was updated to ensure compatibility with the new `load_3w_dataset()` function and the 3W Dataset 2.0 structure. The `extrai_arrays()` function was removed, and the `EventFolds` and `Experiment` classes were adjusted. +3. 
**Updates `misc.py` for 3W Dataset 2.0:** The `misc.py` sub-module was updated to ensure compatibility with the new `load_3w_dataset()` function and the 3W Dataset 2.0 structure. Redundant functions were removed, and existing functions were adapted to receive the DataFrame as a parameter. +4. **Updates `__init__.py` for 3W Dataset 2.0:** The `__init__.py` file was updated to import and expose the new `load_3w_dataset()` function. + +These updates ensure that the 3W Toolkit is fully compatible with the 3W Dataset 2.0, providing a more efficient and streamlined workflow for loading and analyzing the data. \ No newline at end of file diff --git a/toolkit/__init__.py b/toolkit/__init__.py index e3419e2e..6bd9f6bb 100644 --- a/toolkit/__init__.py +++ b/toolkit/__init__.py @@ -1,53 +1,59 @@ -"""This is the 3W Toolkit, a software package written in Python 3 that +"""This is the 3W Toolkit, a software package written in Python 3 that  is one of the 3W Project's major components. This toolkit contains resources that make the following easier: - 3W Dataset overview generation; -- Experimentation and comparative analysis of Machine Learning-based -approaches and algorithms for specific problems related to undesirable -events that occur in offshore oil wells during their respective +- Experimentation and comparative   + analysis of Machine Learning-based   +  +approaches and algorithms for specific problems related to undesirable  +events that occur in offshore oil wells during their respective  production phases; -* Standardization of key points of the Machine Learning-based algorithm +* Standardization of key points of the Machine Learning-based algorithm  development pipeline. 
All these resources are implemented in the following sub-modules: - **base**: groups the objects used by the other sub-modules; -- **dev**: has all the resources related to development of Machine +- **dev**: has all the resources related to development of Machine   +  Learning models; -- **misc**: brings together diverse resources that do not fit in the +- **misc**: brings together diverse resources that do not fit in the  other sub-modules; -- **rolling_window**: creates a view of array which for every point -gives the n-dimensional neighbourhood of size window. New dimensions are +- **rolling_window**: creates a view of array which for every point  +gives the n-dimensional neighbourhood of size window. New dimensions are  added at the end of array or after the corresponding original dimension. -Specific problems will be incorporated into this toolkit gradually. At +Specific problems will be incorporated into this toolkit gradually. At   +  this time, models can be developed for the following problems: - Binary Classifier of Spurious Closure of DHSV. -Examples of how to use this toolkit will be incremented throughout its +Examples of how to use this toolkit will be incremented throughout   + its  development. Please, check the 3W Project's README.md file for more details. -It is important to note that there are arbitrary choices in this -toolkit, but they have been carefully made to allow adequate comparative -analysis without compromising the ability to experiment with different +It is important to note that there are arbitrary choices in this  +toolkit, but they have been carefully made to allow adequate comparative  +analysis without compromising the ability to experiment with different  approaches and algorithms. 
-This toolkit's documentation is generated in english and in Google format +This toolkit's documentation is generated in english and in Google format  with [autoDocstring - Python Docstring Generator ](https://github.com/NilsJPWerner/autoDocstring), which follows [PEP 257 ](https://peps.python.org/pep-0257/), and [pdoc3 ](https://pdoc3.github.io/pdoc/). -Its source code is implemented according to the style guide established -by [PEP 8](https://peps.python.org/pep-0008/). This is guaranteed with +Its source code is implemented according to the style guide established   +  +by [PEP 8](https://peps.python.org/pep-0008/). This is guaranteed with  the use of the [Black formatter](https://github.com/psf/black). """ __status__ = "Development" -__version__ = "1.1.0" +__version__ = "1.2.0" # Update version number after changes __license__ = "Apache License 2.0" __copyright__ = "Copyright 2024, Petróleo Brasileiro S.A." __authors__ = [ @@ -73,7 +79,8 @@ EventType, LABELS_DESCRIPTIONS, NORMAL_LABEL, - PARQUET_EXTENSION, + PARQUET_EXTENSION,   + PARQUET_ENGINE, PARQUET_COMPRESSION, PATH_3W_PROJECT, @@ -83,6 +90,7 @@ PATH_TOOLKIT, TRANSIENT_OFFSET, VARS, + load_3w_dataset, # To use by 3W v2.0 load_config_in_dataset_ini, ) @@ -104,7 +112,8 @@ get_all_labels_and_files, label_and_file_generator, load_instance, - load_instances, + load_instances,   + resample, plot_instance, ) diff --git a/toolkit/base.py b/toolkit/base.py index 41449181..31643b04 100644 --- a/toolkit/base.py +++ b/toolkit/base.py @@ -1,7 +1,7 @@ -"""This 3W toolkits' sub-module groups objects used by the other -sub-modules. +"""This 3W toolkits' sub-module groups objects used by the other +sub-modules. -Any resource that is not used by another sub-module must be maintained +Any resource that is not used by another sub-module must be maintained in the miscellaneous sub-module. 
def load_3w_dataset(data_type="real", base_path=None):
    """Loads the rows of one data type from the 3W Dataset 2.0 into a
    single DataFrame.

    The dataset root is expected to contain one sub-folder per label,
    named `0` to `9`, each holding Parquet files. Every Parquet file
    found is read, its rows are filtered by the `state` column
    according to `data_type`, and the filtered rows of all files are
    concatenated. Missing folders and files that cannot be read are
    reported on the standard output and skipped.

    Args:
        data_type (str, optional): Type of data to be loaded. Must be
            `'real'`, `'simulated'` or `'imputed'`. Defaults to
            `'real'`.
        base_path (str, optional): Path to the root folder of the
            dataset. Defaults to None, in which case `PATH_DATASET` is
            used.

    Raises:
        ValueError: If `data_type` is not one of the supported values.

    Returns:
        pandas.DataFrame: DataFrame with the rows of the requested data
            type, re-indexed from 0, or None if no Parquet file could
            be loaded.
    """
    # Value of the `state` column selected for each supported data type.
    # NOTE(review): confirm against the 3W Dataset 2.0 documentation
    # that `state` really encodes the data origin with these codes.
    state_by_data_type = {"real": 0, "simulated": 1, "imputed": 2}

    # The request is validated once, before any file is touched, so an
    # invalid `data_type` is reported to the caller instead of being
    # mistaken for a file reading error
    if not isinstance(data_type, str) or data_type not in state_by_data_type:
        raise ValueError(
            "Invalid data type. Choose between 'real', 'simulated' or 'imputed'."
        )
    state = state_by_data_type[data_type]

    # The default root is resolved at call time rather than bound in
    # the signature
    if base_path is None:
        base_path = PATH_DATASET

    dataframes = []
    for label in range(10):  # Label folders are named 0 to 9
        folder_path = os.path.join(base_path, str(label))
        if not os.path.isdir(folder_path):
            print(f"Folder {folder_path} not found.")
            continue

        # Sorted so that the row order of the result does not depend on
        # the arbitrary order in which the file system lists the files
        for file in sorted(os.listdir(folder_path)):
            if not file.endswith(".parquet"):
                continue
            file_path = os.path.join(folder_path, file)
            try:
                df = pd.read_parquet(file_path)
                dataframes.append(df[df["state"] == state])
            except Exception as e:
                # Best effort: a file that cannot be read or that lacks
                # the `state` column is reported and skipped
                print(f"Error reading file {file_path}: {e}")

    if not dataframes:
        print("No data found.")
        return None

    # Concatenates all filtered DataFrames into a single DataFrame
    return pd.concat(dataframes, ignore_index=True)
-# Adicionalmente, também filtra alvos inválidos -def extraia_exemplos(instancias, retornar_grupos=False): - if len(instancias) == 0: - return ([], []) - - # Busca alvos nulos - y_finite = map(np.isfinite, chain(*(instancia[1] for instancia in instancias))) - X_iter = chain(*(instancia[0] for instancia in instancias)) - y_iter = chain(*(instancia[1] for instancia in instancias)) - - # Adiciona iterador de grupos - if retornar_grupos: - grupos = list( - chain(*(repeat(grupo, len(X)) for grupo, (X, y) in enumerate(instancias))) - ) - - iter_zip = zip(X_iter, y_iter, grupos) - else: - iter_zip = zip(X_iter, y_iter) - - # Executa iteradores e retorna X, y, [grupos] como listas - result = list(map(list, zip(*compress(iter_zip, y_finite)))) - - # Converte y (segunda lista do resultado) para int - result[1] = list(map(int, result[1])) - - return tuple(result) - - -# Classe cujo objeto contém todas as informações necessárias para uma -# rodada do K-fold do classificador de evento. +# Class whose object contains all the necessary information for a +# round of K-fold of the event classifier. 
class EventFold: def __init__( self, @@ -85,13 +50,13 @@ def __init__( ): self.event_folds: EventFolds = event_folds - # Nota: `instancias_treino` e `instancias_teste` são listas de - # tuplas (X, y) + # Note: `instancias_treino` and `instancias_teste` are lists of + # tuples (X, y) - # Aplica passo em instâncias de treino + # Apply step to training instances self.instancias_treino = [(X[::step], y[::step]) for X, y in instancias_treino] - # Aplica passo em instâncias de teste + # Apply step to test instances self.instancias_teste = [ (X[::passo_teste], y[::passo_teste]) for X, y in instancias_teste ] @@ -99,48 +64,61 @@ def __init__( self.nome_instancias_treino = nome_instancias_treino self.nome_instancias_teste = nome_instancias_teste - # Verfica se alguma das instâncias ficou vazia depois de - # aplicado passo + # Check if any of the instances were empty after + # applying step for instancia in self.instancias_treino + self.instancias_teste: X, y = instancia assert min(X.shape) > 0 and min( y.shape - ), "Janela especificada gerou instância sem amostras" + ), "Specified window generated instance without samples" - # Método para extração de amostras para treino + # Method for extracting training samples @lru_cache(1) - def extract_training_samples(self, retornar_grupos=False): - return extraia_exemplos(self.instancias_treino, retornar_grupos) + def extract_training_samples(self): + # Extract training samples from self.instancias_treino + X_train = np.concatenate([x[0] for x in self.instancias_treino]) + y_train = np.concatenate([x[1] for x in self.instancias_treino]) + return X_train, y_train - # Método para extração de amostras para teste + # Method for extracting complete test samples @lru_cache(1) - def extraia_amostras_teste_completo(self, retornar_grupos=False): - return extraia_exemplos(self.instancias_teste, retornar_grupos) + def extraia_amostras_teste_completo(self): + # Extract test samples from self.instancias_teste + X_test = np.concatenate([x[0] for x 
in self.instancias_teste]) + y_test = np.concatenate([x[1] for x in self.instancias_teste]) + return X_test, y_test - # Método para extração de amostras para teste + # Method for extracting test samples def extract_test_samples(self): - # Retorna apenas X + # Return only X return self.extraia_amostras_teste_completo()[0] - # Método para cálculo de métricas parciais + # Method for calculating partial metrics def calculate_partial_metrics( - self, y_prev_soft, idx_to_codigo, apresente=False, apresente_conf={} + self, y_pred_soft, idx_to_codigo, apresente=False, apresente_conf={} ): """ - idx_to_codigo (list or dict): - idx_to_codigo[i] = j indica que a i-ésima coluna de - y_prev_soft corresponde ao código da classe j da tarefa - corrente. + Calculate partial metrics for the fold. + + Parameters + ---------- + y_pred_soft : np.ndarray + Soft predictions for the test set. + idx_to_codigo : list or dict + Mapping from prediction index to class code. + apresente : bool, optional + Whether to display the results, by default False + apresente_conf : dict, optional + Configuration for displaying the results, by default {} """ - _, y_teste, grupos_teste = self.extraia_amostras_teste_completo( - retornar_grupos=True - ) - assert len(y_prev_soft) == len( - y_teste - ), f"Número incorreto de previsões: esperado {len(y_teste)}, encontrado {len(y_prev_soft)}" + X_test, y_test = self.extraia_amostras_teste_completo() + + assert len(y_pred_soft) == len( + y_test + ), f"Incorrect number of predictions: expected {len(y_test)}, found {len(y_pred_soft)}" - # códigos das classes para a tarefa que esse fold faz parte + # Class codes for the task that this fold is part of event_labels = self.event_folds.experiment.event_labels n_codigos = len(event_labels) lista_codigos = list(event_labels.values()) @@ -157,46 +135,46 @@ def calculate_partial_metrics( i for i, j in enumerate(idx_to_codigo) if j == codigo_transiente ) - # Predições soft com shape correto - shape_ok = (len(y_teste), 
n_codigos) + # Soft predictions with correct shape + shape_ok = (len(y_test), n_codigos) assert ( - y_prev_soft.shape == shape_ok - ), f"Predição deve ter shape (n_samples, n_classes) = ({shape_ok[0]},{shape_ok[1]})" + y_pred_soft.shape == shape_ok + ), f"Prediction must have shape (n_samples, n_classes) = ({shape_ok[0]},{shape_ok[1]})" - # Todos os códigos devem aparecer em ordem_codigos_evento + # All codes must appear in ordem_codigos_evento codigos_faltando = set(lista_codigos) - set( [idx_to_codigo[i] for i in range(n_codigos)] ) assert ( len(codigos_faltando) == 0 - ), f"Códigos faltando em 'idx_to_codigo': {codigos_faltando}" + ), f"Missing codes in 'idx_to_codigo': {codigos_faltando}" - # Calculando predição da classe - y_prev_idx = y_prev_soft.argmax(1) - y_prev = list(map(idx_to_codigo.__getitem__, y_prev_idx)) + # Calculating class prediction + y_pred_idx = y_pred_soft.argmax(1) + y_pred = list(map(idx_to_codigo.__getitem__, y_pred_idx)) - # Calculando probabilidade predita de regime + transiente para - # plotar - y_prob_nao_normal = y_prev_soft[:, coluna_regime].copy() + # Calculating predicted probability of regime + transient to + # plot + y_prob_nao_normal = y_pred_soft[:, coluna_regime].copy() if coluna_transiente is not None: - y_prob_nao_normal += y_prev_soft[:, coluna_transiente] + y_prob_nao_normal += y_pred_soft[:, coluna_transiente] - # Métrica principal + # Main metric f_beta = metrics.fbeta_score( - y_teste, y_prev, beta=1.2, average="micro", labels=lista_codigos + y_test, y_pred, beta=1.2, average="micro", labels=lista_codigos ) f_beta *= 100.0 # MEAN_LOG_LOSS log_loss_medio = metrics.log_loss( - y_teste, y_prev_soft, labels=lista_codigos, normalize=True + y_test, y_pred_soft, labels=lista_codigos, normalize=True ) metricas = {"F_BETA [%]": f_beta, "MEAN_LOG_LOSS": log_loss_medio} self.event_folds.salve_metricas_parciais(self, metricas) if apresente: - # Carregando configuração padrão de apresentação + # Loading default presentation 
configuration def set_config(name, value, overwrite=False): if overwrite or (name not in apresente_conf): apresente_conf[name] = value @@ -208,7 +186,7 @@ def set_config(name, value, overwrite=False): set_config("mostra_prob", True) # =============================================== - # Valores do gráfico: normal=0, transiente=0.5, em regime=1 + # Chart values: normal=0, transient=0.5, in regime=1 plot_values = { event_labels["normal"]: 0, event_labels["regime"]: 1, @@ -216,10 +194,10 @@ def set_config(name, value, overwrite=False): if codigo_transiente is not None: plot_values[event_labels["transiente"]] = 0.5 - y_prev_plot = list(map(plot_values.__getitem__, y_prev)) - y_teste_plot = list(map(plot_values.__getitem__, y_teste)) + y_pred_plot = list(map(plot_values.__getitem__, y_pred)) + y_teste_plot = list(map(plot_values.__getitem__, y_test)) - # Cria um plot para cada grupo + # Create a plot for each group grupos_count = np.bincount(grupos_teste) n_grupos = len(grupos_count) @@ -230,391 +208,3 @@ def set_config(name, value, overwrite=False): sharex=apresente_conf["sharex"], ) plt.subplots_adjust(hspace=apresente_conf["hspace"]) - if n_grupos == 1: - axes = [axes] - - axes[0].set_title( - f"F_BETA [%]: {f_beta:.3f} MEAN_LOG_LOSS: {log_loss_medio:.5f}" - ) - - grupo_idx_inicio = 0 - for grupo, ax in enumerate(axes): - grupo_count = grupos_count[grupo] - y_prev_grupo = y_prev_plot[ - grupo_idx_inicio : grupo_idx_inicio + grupo_count - ] - y_teste_grupo = y_teste_plot[ - grupo_idx_inicio : grupo_idx_inicio + grupo_count - ] - y_prob_nao_normal_grupo = y_prob_nao_normal[ - grupo_idx_inicio : grupo_idx_inicio + grupo_count - ] - - ax.plot(y_prev_grupo, marker=11, color="orange", linestyle="") - ax.plot(y_teste_grupo, marker=10, color="green", linestyle="") - ax.set_ylim([-0.2, 1.2]) - yticks, yticklabels = [0, 1], ["normal", "em regime"] - if codigo_transiente is not None: - yticks.insert(1, 0.5) - yticklabels.insert(1, "transiente") - ax.set_yticks(yticks) - 
ax.set_yticklabels(yticklabels) - if apresente_conf["mostrar_nome_instancia"] and ( - self.nome_instancias_teste is not None - ): - title = ax.get_title() - if title != "": - title += "\n" - title += f"{self.nome_instancias_teste[grupo]}" - ax.set_title(title) - - # Segundo eixo com probabilidade de regiem+transiente - if apresente_conf["mostra_prob"]: - ax2 = ax.twinx() - ax2.plot( - 100.0 * (y_prob_nao_normal_grupo), - color="orange", - linestyle="-", - alpha=0.6, - label="prob. não normal", - ) - ax2.set_ylim(0, 100) - - grupo_idx_inicio += grupo_count - - axes[0].legend(["predita", "verdadeira"]) - axes[-1].set_xlabel("amostra") - plt.show() - - return metricas - - -# Classe que encapsula vários objetos da classe EventFold -class EventFolds: - def __init__( - self, - experiment, - nomes_instancias, - folds_instancias, - ): - self.experiment: Experiment = experiment - self.event_type = experiment.event_type - self.use_instancias_extras = experiment.use_instancias_extras - self.pad_mode = experiment.pad_mode - self.pbar = experiment.pbar - self.warnings = experiment.warnings - self.forca_binario = experiment.forca_binario - - self.LABEL = experiment.LABEL - self.OBSERVATION_LABELS = experiment.OBSERVATION_LABELS - self.TRANSIENT = experiment.TRANSIENT - self.window = experiment.window - self.step = experiment.step - - assert not self.use_instancias_extras, "Funcionalidade não implementada" - - # Filtro de nomes de eventos - self.filtre_nomes_instancias = lambda filtro: list( - compress(nomes_instancias, map(filtro, folds_instancias)) - ) - - # Guarda nome das instâncias extras - self.nomes_instancias_extras = self.filtre_nomes_instancias( - lambda fold: fold == EXTRA_INSTANCES_TRAINING - ) - - # Obtém código de todos os folds, ignorando o fold negativo (utilizado - # sempre para treino) - self.folds_nums = sorted(set(folds_instancias) - {EXTRA_INSTANCES_TRAINING}) - - # Carrega instâncias do evento - nomes_instancias_evento = self.filtre_nomes_instancias( - 
lambda fold: fold != EXTRA_INSTANCES_TRAINING - ) - self.instancias = {} - with alive_bar( - len(nomes_instancias_evento), - disable=not (self.pbar), - force_tty=True, - title=f"Loading instances", - bar="bubbles", - spinner=None, - ) as bar: - for nome_instancia in nomes_instancias_evento: - self.instancias[nome_instancia] = self.carregue_instancia( - nome_instancia - ) - bar() - - # Cria folds, agrupado por fold_num - self.folds = [] - for fold_num in self.folds_nums: - # Treino - nome_instancias_treino = self.filtre_nomes_instancias( - lambda fold: fold not in {fold_num, EXTRA_INSTANCES_TRAINING} - ) - instancias_treino = [ - self.instancias[nome_instancia] - for nome_instancia in nome_instancias_treino - ] - - # Teste - nome_instancias_teste = self.filtre_nomes_instancias( - lambda fold: fold == fold_num - ) - instancias_teste = [ - self.instancias[nome_instancia] - for nome_instancia in nome_instancias_teste - ] - - # Cria Fold - event_fold = EventFold( - instancias_treino, - instancias_teste, - self.step, # WIP - self.step, # WIP - self, - nome_instancias_treino, - nome_instancias_teste, - ) - self.folds.append(event_fold) - - self.folds_metricas = {fold: None for fold in self.folds} - - def extrai_arrays(self, instancia_abs, pad_mode="na"): - """ - Extrai np.arrays X e y a partir do csv em instancia_abs. - Na extração os valore de referência são calculados e incluídos - como colunas em X. X tem ses dados completados segundo pad_mode - para formar primeiras janelas. 
- - pad_mode: - 'na' : completa X com NA alinhando com primeiro dado - anotado em y - 'valid' : descarta os dados que não cabem na primeira janela - de detecção - """ - # Leitura do arquivo CSV que contém a instância - with instancia_abs.open() as f: - df = pd.read_csv(f, usecols=VARS + [CLASS]) - - # Extração dos conjuntos de amostras para treino - X_treino = df[VARS].values.astype(np.float32) - - # Verifica primeiro índice da variável target - first_class = df[CLASS].first_valid_index() - inicio_X = first_class - self.window + 1 - inicio_y = first_class - - # Verifica o tamanho da jenala solicitada e aplica pad se - # necessário - if inicio_X < 0: - if self.warnings: - warnings.warn( - f'Arquivo "{instancia_abs}" não possui amostras suficientes para janela de detecção solicitada ({self.window}s.\ - Aplicando pad {pad_mode})', - RuntimeWarning, - ) - if pad_mode == "na": - # Completando os dados em X_treino para com NA - X_treino = np.vstack( - [ - np.full( - (-inicio_X, X_treino.shape[1]), - np.nan, - dtype=np.float32, - ), - X_treino, - ] - ) - inicio_X = 0 - elif pad_mode == "valid": - # Descartando (-inicio_X) instantes do df para ter 1a - # janela válida - inicio_y += -inicio_X - inicio_X = 0 - - # Validando se janela solicitada é maior do que dados - # disponíveis - if inicio_y >= df.shape[0]: - raise ( - Exception( - f"Arquivo '{instancia_abs}' não possui amostras suficientes para pad: {pad_mode}." - ) - ) - - # Validando se mais de 50% dos dados normais foram - # descartados (ou algum outro controle de qualidade?) 
- # TODO - - else: - raise (Exception(f"Opção de pad não reconhecida: {pad_mode}.")) - - X_treino_pad = X_treino[inicio_X:] - y_treino = df.iloc[inicio_y:][CLASS].values - - return X_treino_pad, y_treino - - def carregue_instancia(self, instancia): - instancia_abs = Path(os.path.join(PATH_DATASET, instancia)) - X_treino_extra, y_treino = self.extrai_arrays( - instancia_abs, pad_mode=self.pad_mode - ) - - # Aplicação de janela deslizante - Xw_treino = rolling_window(X_treino_extra, self.window, axes=0, toend=False) - - # Check de sanidade - assert len(y_treino) == len( - Xw_treino - ), f'[BUG] X e y de treino não estão sincronizados para o arquivo "{instancia_abs}"' - - assert ( - min(Xw_treino.shape) > 0 - ), f'Janela especificada gerou instância sem amostras para o arquivo "{instancia_abs}"' - - # Ao usar instâncias de outros eventos para o treinamento do - # evento corrente (self.event_type) - # códigos de outros eventos podem surgir em y_treino. - # y_treino deve ter somente os códigos do evento corrente. - # Os códigos novos (derivados de outros eventos) são convertidos - # para código do evento Normal (0). 
- y_finite_mask = np.isfinite(y_treino) - outro_codigo_mask = y_finite_mask & np.isin( - y_treino, list(self.OBSERVATION_LABELS), invert=True - ) - if self.warnings and outro_codigo_mask.sum() > 0: - novos_codigos = set(y_treino[outro_codigo_mask]) - warnings.warn( - f'Códigos de outros eventos ("{novos_codigos}") sendo convertidos para 0.', - RuntimeWarning, - ) - y_treino[outro_codigo_mask] = 0 - - # Tratamento para classificação binária : codigo_transitente -> - # codigo_regime - if self.TRANSIENT and self.forca_binario: - codigo_regime = self.LABEL - codigo_transiente = self.LABEL + TRANSIENT_OFFSET - y_treino[y_treino == codigo_transiente] = codigo_regime - - return Xw_treino, y_treino - - def __iter__(self): - for fold in self.folds: - yield fold - - def __len__(self): - return len(self.folds) - - # Método para retenção de métricas - def salve_metricas_parciais(self, fold, metricas): - assert fold in self.folds_metricas, "Fold não encontrado" - if self.folds_metricas[fold] is not None: - warnings.warn( - "Fold com métricas já computadas. 
Recarregue os folds " - + "para evitar esta mensagem.", - RuntimeWarning, - ) - self.folds_metricas[fold] = metricas - - @lru_cache(1) - def extraia_amostras_simuladas_e_desenhadas(self): - # Obtém instâncias extras (simuladas e desenhadas, representadas - # pelo fold==EXTRA_INSTANCES_TRAINING) - instancias_extras = [] - with alive_bar(len(self.nomes_instancias_extras)) as bar: - for nome_instancia in self.pbar(self.nomes_instancias_extras): - instancias_extras.append(self.carregue_instancia(nome_instancia)) - bar() - - instancias_extras_passo = [ - (X[:: self.step], y[:: self.step]) for X, y in instancias_extras - ] # Aplica passo de treino - return extraia_exemplos(instancias_extras_passo) - - # Método para consulta/cálculo de métricas parciais e globais (média - # e std) - def get_metrics(self, boxplot=False): - folds_metrics = { - i: metrics for i, (_, metrics) in enumerate(self.folds_metricas.items()) - } - df_metricas = pd.DataFrame.from_dict(folds_metrics, orient="index") - if boxplot: - for metrica in ["F_BETA [%]", "MEAN_LOG_LOSS"]: - plt.figure(figsize=(11, 1)) - sns.boxplot(x=df_metricas[metrica], width=0.4, palette="colorblind") - sns.stripplot( - x=df_metricas[metrica], - jitter=True, - marker="o", - alpha=0.5, - color="black", - ) - plt.show() - df_metricas.index.name = "FOLD" - df_metricas.loc["MEAN"] = df_metricas.mean() - df_metricas.loc["STANDARD DEVIATION"] = df_metricas.std() - self.experiment.metrics = df_metricas.loc["MEAN"].to_dict() - return df_metricas - - -class Experiment: - """This class contains objects related to machine learning approach - experiments""" - - def __init__( - self, - event_name, - ova=True, - use_instancias_extras=False, # WIP - pad_mode="valid", - pbar=True, - warnings=False, - forca_binario=False, # WIP - ): - """_summary_""" - self.event_type = EventType(event_name) - self.ova = ova - self.use_instancias_extras = use_instancias_extras # WIP - self.pad_mode = pad_mode - self.pbar = pbar - self.warnings = warnings - 
self.forca_binario = forca_binario # WIP - - self.LABEL = self.event_type.LABEL - self.OBSERVATION_LABELS = self.event_type.OBSERVATION_LABELS - self.DESCRIPTION = self.event_type.DESCRIPTION - self.TRANSIENT = self.event_type.TRANSIENT - self.window = self.event_type.window - self.step = self.event_type.step - - @property - def event_labels(self): # WIP - """ - Dicionário com os códigos das classes que envolvem essa tarefa - de classificação. As classes podem ser 'normal', 'regime' e - 'transiente'. A classe transiente não existe para tarefas de - classificação binária. - """ - codigos = {"normal": 0, "regime": self.LABEL} - if self.TRANSIENT and (not self.forca_binario): - codigos["transiente"] = self.LABEL + TRANSIENT_OFFSET - return codigos - - def folds(self): - folds = os.path.join(PATH_FOLDS, f"folds_clf_{self.LABEL:02d}.csv") - with Path(folds).open() as f: - df_event = pd.read_csv(f) - - if not self.ova: - df_event = df_event.query("~is_ova") - - nomes_instancias = df_event["instancia"].tolist() - folds_instancias = df_event["fold"].tolist() - - return EventFolds( - self, - nomes_instancias, - folds_instancias, - ) diff --git a/toolkit/misc.py b/toolkit/misc.py index 6f2323e5..bbeb6b16 100644 --- a/toolkit/misc.py +++ b/toolkit/misc.py @@ -33,190 +33,90 @@ EVENT_NAMES, PARQUET_EXTENSION, PARQUET_ENGINE, + load_3w_dataset, # To work with 3W v2.0 ) # Methods # -def label_and_file_generator(real=True, simulated=False, drawn=False): - """This is a generating function that returns tuples for all - indicated instance sources (`real`, `simulated` and/or - `hand-drawn`). Each tuple refers to a specific instance and contains - its label (int) and its full path (Path). All 3W Dataset's instances - are considered. - Args: - real (bool, optional): Indicates whether `real` instances should - be considered. Defaults to True. - simulated (bool, optional): Indicates whether `simulated` - instances should be considered. Defaults to False. 
- drawn (bool, optional): Indicates whether `hand-drawn` instances - should be considered. Defaults to False. - - Yields: - generator: Tuples for all indicated instance sources. Each tuple - refers to a specific instance and contains its label (int) - and its full path (Path). - """ - for i in Path(PATH_DATASET).iterdir(): - try: - # Considers only directories - if i.is_dir(): - label = int(i.stem) - for fp in i.iterdir(): - # Considers only Parquet files - if fp.suffix == PARQUET_EXTENSION: - # Considers only instances from the requested - # source - if ( - (simulated and fp.stem.startswith("SIMULATED")) - or (drawn and fp.stem.startswith("DRAWN")) - or ( - real - and (not fp.stem.startswith("SIMULATED")) - and (not fp.stem.startswith("DRAWN")) - ) - ): - yield label, fp - except: - # Otherwise (e.g. files or directory without instances), do - # nothing - pass - - -def get_all_labels_and_files(): - """Gets lists with tuples related to all real, simulated, or - hand-drawn instances contained in the 3w Dataset. Each list - considers instances from a single source. Each tuple refers to a - specific instance and contains its label (int) and its full path - (Path). - Returns: - tuple: Tuple containing three lists with tuples related to real, - simulated, and hand-drawn instances, respectively. 
- """ - real_instances = list( - label_and_file_generator(real=True, simulated=False, drawn=False) - ) - simulated_instances = list( - label_and_file_generator(real=False, simulated=True, drawn=False) - ) - drawn_instances = list( - label_and_file_generator(real=False, simulated=False, drawn=True) - ) - - return real_instances, simulated_instances, drawn_instances - - -def create_table_of_instances(real_instances, simulated_instances, drawn_instances): +def create_table_of_instances(df): """Creates a table of instances (pandas.DataFrame) that shows the amount of instances that compose the 3W Dataset, by knowledge source - (real, simulated and hand-drawn instances) and by instance label. + (real, simulated and imputed instances) and by instance label. Args: - real_instances (list): List with tuples related to all - real instances contained in the 3w Dataset. Each tuple - must refer to a specific instance and must contain its label - (int) and its full path (Path). - simulated_instances (list): List with tuples related to all - simulated instances contained in the 3w Dataset. Each tuple - must refer to a specific instance and must contain its label - (int) and its full path (Path). - drawn_instances (list): List with tuples related to all - hand-drawn instances contained in the 3w Dataset. Each tuple - must refer to a specific instance and must contain its label - (int) and its full path (Path). + df (pandas.DataFrame): DataFrame with the 3W Dataset 2.0 data. Returns: pandas.DataFrame: The created table that shows the amount of instances that compose the 3W Dataset, by knowledge source - (real, simulated and hand-drawn instances) and by instance + (real, simulated and imputed instances) and by instance label. 
""" - # Gets the label's description of all instances as a list of dicts - list_instances = ( - [ - { - "INSTANCE LABEL": str(label) + " - " + LABELS_DESCRIPTIONS[label], - "SOURCE": "REAL", - } - for label, fp in real_instances - ] - + [ - { - "INSTANCE LABEL": str(label) + " - " + LABELS_DESCRIPTIONS[label], - "SOURCE": "SIMULATED", - } - for label, fp in simulated_instances - ] - + [ - { - "INSTANCE LABEL": str(label) + " - " + LABELS_DESCRIPTIONS[label], - "SOURCE": "HAND-DRAWN", - } - for label, fp in drawn_instances - ] - ) - # Transforms the list of dicts into a pandas.DataFrame - df_instances = pd.DataFrame(list_instances) + # Create a new column with the instance label and description + df['INSTANCE LABEL'] = df['label'].astype(str) + " - " + df['label'].map(LABELS_DESCRIPTIONS) - # Creates the table of instances with relevant information and - # desired format + # Create the table of instances toi = ( - df_instances.groupby(["INSTANCE LABEL", "SOURCE"]) + df.groupby(['INSTANCE LABEL', 'state']) .size() .reset_index() - .pivot("SOURCE", "INSTANCE LABEL", 0) + .pivot('state', 'INSTANCE LABEL', 0) .fillna(0) .astype(int) .T ) - toi = toi.loc[natsorted(toi.index.values)] - toi = toi[["REAL", "SIMULATED", "HAND-DRAWN"]] - toi["TOTAL"] = toi.sum(axis=1) - toi.loc["TOTAL"] = toi.sum(axis=0) + + # Rename the columns to represent the data sources + toi = toi.rename(columns={0: 'REAL', 1: 'SIMULATED', 2: 'IMPUTED'}) + + # Add a 'TOTAL' column and row + toi['TOTAL'] = toi.sum(axis=1) + toi.loc['TOTAL'] = toi.sum(axis=0) return toi -def filter_rare_undesirable_events(toi, threshold, simulated=False, drawn=False): +def filter_rare_undesirable_events(toi, threshold, simulated=False, imputed=False): """Generates a table of instances (pandas.DataFrame) that shows the amount of filtered instances, by knowledge source (real, `simulated` - and `hand-drawn` instances) and by instance label. 
This filter keeps - only real instances, as well as `simulated` and `hand-drawn` if + and `imputed` instances) and by instance label. This filter keeps + only real instances, as well as `simulated` and `imputed` if indicated, of rare event types. An event type is considered rare if the amount of instances labeled as this event relative to the total number of instances is less than the indicated `threshold`. In both - totalizations, `simulated` and `hand-drawn` instances are only + totalizations, `simulated` and `imputed` instances are only considered if indicated, but real instances are always taken into account. Args: toi (pandas.DataFrame): Table that shows the amount of instances that compose the 3W Dataset, by knowledge source (real, - `simulated` and `hand-drawn` instances) and by instance + `simulated` and `imputed` instances) and by instance label. This object is not modified in this function. threshold (float): Relative limit that establishes rare event types. simulated (bool, optional): Indicates whether `simulated` instances should be considered. Defaults to False. - drawn (bool, optional): Indicates whether `hand-drawn` instances + imputed (bool, optional): Indicates whether `imputed` instances should be considered. Defaults to False. Returns: pandas.DataFrame: The table of instances (pandas.DataFrame) that shows the amount of filtered instances, by knowledge source - (real, simulated and hand-drawn instances) and by instance + (real, simulated and imputed instances) and by instance label. 
""" - # Simulated and hand-drawn instances are optional, but real + # Simulated and imputed instances are optional, but real # instances are always considered totals = 0 if simulated: totals += toi["SIMULATED"] - if drawn: - totals += toi["HAND-DRAWN"] + if imputed: + totals += toi["IMPUTED"] totals += toi["REAL"] # Absolute limit @@ -229,12 +129,12 @@ def filter_rare_undesirable_events(toi, threshold, simulated=False, drawn=False) return rue -def load_instance(instance): +def load_instance(label, fp): """Loads all data and metadata from a specific `instance`. Args: - instance (tuple): This tuple must refer to a specific `instance` - and contain its label (int) and its full path (Path). + label (int): Label of the instance. + fp (Path): Full path to the instance file. Raises: Exception: Error if the Parquet file passed as arg cannot be @@ -246,8 +146,6 @@ def load_instance(instance): other columns of the Parquet file and metadata loaded from the argument `instance` (label, well, and id). """ - # Loads label metadata from the argument `instance` - label, fp = instance try: # Loads well and id metadata from the argument `instance` @@ -272,30 +170,24 @@ def load_instance(instance): return df -def load_instances(instances): - """Loads all data and metadata from multiple `instances` in - parallel. +def load_instances(df): # Changed function signature + """Loads all data and metadata from the DataFrame. Args: - instances (list): List with tuples related to real, simulated, - or hand-drawn `instances`. Each tuple must refer to a - specific instance and must contain its label (int) and its - full path (Path). + df (pandas.DataFrame): DataFrame with the 3W Dataset 2.0 data. Returns: - pandas.DataFrame: Its index contains the timestamps loaded from - the Parquet files. Its columns contain data loaded from the - other columns of the Parquet files and the metadata label, - well, and id). + pandas.DataFrame: DataFrame with loaded instances. 
""" + # Prepares for multiple parallel loadings pool = ThreadPool() dfs = [] try: # Calls multiple loadings in parallel - for df in pool.imap_unordered(load_instance, instances): - dfs.append(df) + for label, fp in df[['label', 'filepath']].values: # Assuming 'filepath' column exists + dfs.append(load_instance(label, Path(fp))) finally: # If the instance cannot be loaded pool.terminate() @@ -304,22 +196,17 @@ def load_instances(instances): return pd.concat(dfs) -def create_and_plot_scatter_map(real_instances): +def create_and_plot_scatter_map(df): """Creates and plots scatter map with all the real instances listed - in the `real_instances` argument. + in the `df` argument. Args: - real_instances (list): List with tuples related to all - real instances contained in the 3w Dataset. Each tuple - must refer to a specific instance and must contain its label - (int) and its full path (Path). + df (pandas.DataFrame): DataFrame with the 3W Dataset 2.0 data. Returns: tuple: Tuple containing the first and the last year of occurrence among all instances, respectively. 
""" - # Loads all instances - df = load_instances(real_instances) # Finds the first and the last year of occurrence among all instances df_time = ( @@ -349,443 +236,4 @@ def create_and_plot_scatter_map(real_instances): last_year = np.max(df_time["max"]).year plt.rcParams["axes.labelsize"] = 9 plt.rcParams["font.size"] = 9 - plt.rcParams["legend.fontsize"] = 9 - fig, ax = plt.subplots(figsize=(9, 9)) - yticks = [] - yticks_labels = [] - for well in well_times.keys(): - times = well_times[well] - class_names = well_classes[well] - class_colors = list(map(cmap, class_names)) - well_id = well_code[well] - yticks.append(well_id * height + height / 2 - border / 2) - yticks_labels.append(well) - ax.broken_barh( - times, - (well_id * height, height - border), - facecolors=class_colors, - edgecolors=class_colors, - ) - ax.grid(True) - ax.set_axisbelow(True) - ax.set_yticks(yticks) - ax.set_yticklabels(yticks_labels) - ax.xaxis.set_major_locator(mdates.YearLocator()) - ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y")) - legend_colors = [ - Patch(facecolor=cmap(l), label=str(l) + " - " + d) - for l, d in LABELS_DESCRIPTIONS.items() - ] - ax.legend( - frameon=False, - handles=legend_colors, - loc="upper center", - bbox_to_anchor=(0.5, 1.12), - ncol=4, - ) - - return first_year, last_year - - -def count_properties_instance(instance): - """Counts properties from a specific `instance`. - - Args: - instance (tuple): This tuple must refer to a specific `instance` - and contain its label (int) and its full path (Path). - - Raises: - Exception: Error if the Parquet file passed as arg cannot be - read. - - Returns: - dict: Dict containing the counted properties with the following - keys: n_vars (number of variables), n_vars_missing (number - of missing variables), n_vars_frozen (number of frozen - variables), n_obs (number of observations), and - n_obs_unlabeled (number of unlabeled observations). 
- """ - # Preparation for counting - _, fp = instance - p = {"n_vars_missing": 0, "n_vars_frozen": 0} - - try: - # Read the Parquet file - df = pd.read_parquet(fp, engine=PARQUET_ENGINE) - except Exception as e: - raise Exception(f"error reading file {fp}: {e}") - - # Counts properties - vars = df.columns[:-1] # Last column with class is not considered - p["n_vars"] = len(vars) - for var in vars: - if df[var].isnull().all(): - p["n_vars_missing"] += 1 - u_values = df[var].unique() - if len(u_values) == 1 and not np.isnan(u_values): - p["n_vars_frozen"] += 1 - p["n_obs"] = len(df) - p["n_obs_unlabeled"] = df["class"].isnull().sum() - - return p - - -def count_properties_instances(instances): - """Counts properties from multiple `instances` in parallel. - - Args: - instances (list): List with tuples related to real, simulated, - or hand-drawn `instances`. Each tuple must refer to a - specific instance and must contain its label (int) and its - full path (Path). - - Returns: - dict: Dict containing the counted properties with the following - keys: n_vars (number of variables), n_vars_missing (number - of missing variables), n_vars_frozen (number of frozen - variables), n_obs (number of observations), and - n_obs_unlabeled (number of unlabeled observations). - """ - # Prepares for multiple parallel counts - pool = ThreadPool() - ps = [] - - try: - # Calls multiple counts in parallel - for p in pool.imap_unordered(count_properties_instance, instances): - ps.append(p) - finally: - # If the instance cannot be loaded - pool.terminate() - - # Sum ps and return the result - return dict(pd.DataFrame(ps).sum()) - - -def calc_stats_instances(real_instances, simulated_instances, drawn_instances): - """Calculates the 3W Dataset's fundamental aspects related to - inherent difficulties of actual data. Three statistics are - calculated: Missing Variables, Frozen Variables, and Unlabeled - Observations. All instances, regardless of their source, influence - these statistics. 
- - Args: - real_instances (list): List with tuples related to all - real instances contained in the 3w Dataset. Each tuple - must refer to a specific instance and must contain its label - (int) and its full path (Path). - simulated_instances (list): List with tuples related to all - simulated instances contained in the 3w Dataset. Each tuple - must refer to a specific instance and must contain its label - (int) and its full path (Path). - drawn_instances (list): List with tuples related to all - hand-drawn instances contained in the 3w Dataset. Each tuple - must refer to a specific instance and must contain its label - (int) and its full path (Path). - - Returns: - pandas.DataFrame: Its index contains the statistic's names. Its - columns contain statistics themselves (Amount and - Percentage) - """ - # Counts properties from all indicated instances - p = count_properties_instances( - real_instances + simulated_instances + drawn_instances - ) - - # Extract properties - n_vars = p["n_vars"] - n_vars_missing = p["n_vars_missing"] - n_vars_frozen = p["n_vars_frozen"] - n_obs = p["n_obs"] - n_obs_unlabeled = p["n_obs_unlabeled"] - - # Calculates the statistics - stats = { - "Missing Variables": [ - n_vars_missing, - f"{100*n_vars_missing/n_vars:.2f}% of {n_vars}", - ], - "Frozen Variables": [ - n_vars_frozen, - f"{100*n_vars_frozen/n_vars:.2f}% of {n_vars}", - ], - "Unlabeled Observations": [ - n_obs_unlabeled, - f"{100*n_obs_unlabeled/n_obs:.2f}% of {n_obs}", - ], - } - - return pd.DataFrame.from_dict( - stats, orient="index", columns=["Amount", "Percentage"] - ) - - -def resample(data, n, class_number): - """Downsampling for instances. - - Args: - data (string): Instance path - n (integer): Factor to downsampling the instance. 
- class_number (integer): integer that represents the event class - - Returns: - pandas.DataFrame: Downsamplig instance DataFrame - """ - # Timestamp is expected to be a column - data.reset_index(inplace=True) - # Group Timestamp and get last value - resampleTimestamp = data.timestamp.groupby(data.index // n).max() - # Replace transient label from 100 to 0.5 - data["class"] = data["class"].astype(float) - tempClassLabel = data["class"].replace(class_number + 100, 0.5) - # Get the max value from the group Class column - resampleClass = tempClassLabel.groupby(tempClassLabel.index // n).max() - # Back with transient label value - resampleClass.replace(0.5, class_number + 100, inplace=True) - # Non overlap group and get the average value from the data - dfResample = data.groupby(data.index // n).mean(numeric_only=True) - # Drop class column - dfResample.drop(["class"], axis=1, inplace=True) - # Insert resampled class label values - dfResample["class"] = resampleClass - # Insert resampled timestamp - dfResample.index = resampleTimestamp - - return dfResample - - -def plot_instance(class_number, instance_index, resample_factor): - """Plot one especific event class and instance. By default the - instance is downsampling (n=100) and Z-score Scaler. In order to - help the visualization transient labels was changed to '0.5'. 
- - Args: - class_number (integer): integer that represents the event class - instance_index (integer): input the instance file index - """ - instances_path = os.path.join( - PATH_DATASET, str(class_number), "*" + PARQUET_EXTENSION - ) - instances_path_list = glob.glob(instances_path) - if instance_index >= len(instances_path_list): - print( - f"instance index {instance_index} out of range - Insert a valid index between 0 and {len(instances_path_list)-1}" - ) - else: - df_instance = pd.read_parquet( - instances_path_list[instance_index], engine=PARQUET_ENGINE - ) - df_instance_resampled = resample(df_instance, resample_factor, class_number) - df_drop_resampled = df_instance_resampled.drop(["state", "class"], axis=1) - df_drop_resampled.interpolate( - method="linear", limit_direction="both", axis=0, inplace=True - ) - df_drop_resampled.fillna( - 0, - inplace=True, - ) - scaler_resampled = TimeSeriesScalerMeanVariance().fit_transform( - df_drop_resampled - ) - - df_scaler_resampled = pd.DataFrame( - scaler_resampled.squeeze(), - index=df_drop_resampled.index, - columns=df_drop_resampled.columns, - ) - df_instance_resampled["class"] = df_instance_resampled["class"].replace( - 100 + int(class_number), 0.5 - ) - df_instance_resampled["class"] = df_instance_resampled["class"].replace( - int(class_number), 1 - ) - - colors_traces = [ - "#008080", - "#3498DB", - "#E74C3C", - "#884EA0", - "#D4AC0D", - "#AF601A", - "#D35400", - "#839192", - "#2E4053", - ] - fig = go.Figure() - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[0]], - mode="lines+markers", - marker_symbol="circle", - marker_size=3, - name=VARS[0], - yaxis="y1", - line_color=colors_traces[0], - ) - ), - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[1]], - mode="lines+markers", - marker_symbol="diamond", - marker_size=3, - name=VARS[1], - yaxis="y2", - line_color=colors_traces[1], - ) - ), - fig.add_trace( - go.Scatter( - 
x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[2]], - mode="lines+markers", - marker_symbol="x", - marker_size=3, - name=VARS[2], - yaxis="y3", - line_color=colors_traces[2], - ) - ), - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[3]], - mode="lines+markers", - marker_symbol="star", - marker_size=3, - name=VARS[3], - yaxis="y4", - line_color=colors_traces[3], - ) - ), - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[4]], - mode="lines+markers", - marker_symbol="triangle-up", - marker_size=3, - name=VARS[4], - yaxis="y5", - line_color=colors_traces[4], - ) - ), - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[5]], - mode="lines", - name=VARS[5], - yaxis="y6", - line_color=colors_traces[5], - ) - ), - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[6]], - mode="lines", - name=VARS[6], - yaxis="y7", - line_color=colors_traces[6], - ) - ), - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_scaler_resampled[VARS[7]], - mode="lines", - name=VARS[7], - yaxis="y8", - line_color=colors_traces[7], - ) - ), - fig.add_trace( - go.Scatter( - x=df_instance_resampled.index, - y=df_instance_resampled["class"], - mode="markers", - name="Label", - yaxis="y9", - line_color=colors_traces[8], - ) - ), - fileName = instances_path_list[instance_index].split(os.sep) - fig.update_layout( - title=EVENT_NAMES[class_number] + " - " + fileName[-1], - xaxis_title="Time(s)", - yaxis_title="z-score", - font=dict(size=12), - yaxis1=dict( - tickfont=dict(color=colors_traces[0]), - position=0, - tickformat=".2f", - showticklabels=False, - ), - yaxis2=dict( - tickfont=dict(color=colors_traces[1]), - overlaying="y", - side="left", - position=0.05, - tickformat=".2f", - showticklabels=False, - ), - yaxis3=dict( - tickfont=dict(color=colors_traces[2]), - overlaying="y", - side="left", - 
position=0.10, - tickformat=".2f", - showticklabels=False, - ), - yaxis4=dict( - tickfont=dict(color=colors_traces[3]), - overlaying="y", - side="left", - position=0.15, - tickformat=".2f", - showticklabels=False, - ), - yaxis5=dict( - tickfont=dict(color=colors_traces[4]), - overlaying="y", - side="left", - position=0.2, - tickformat=".2f", - showticklabels=False, - ), - yaxis6=dict( - tickfont=dict(color=colors_traces[5]), - overlaying="y", - side="left", - position=0.25, - tickformat=".2f", - showticklabels=False, - ), - yaxis7=dict( - tickfont=dict(color=colors_traces[6]), - overlaying="y", - side="left", - position=0.3, - tickformat=".2f", - showticklabels=False, - ), - yaxis8=dict( - tickfont=dict(color=colors_traces[7]), - overlaying="y", - side="left", - position=0.35, - tickformat=".2f", - showticklabels=False, - ), - yaxis9=dict( - tickfont=dict(color=colors_traces[8]), - anchor="x", - overlaying="y", - side="left", - ), - ) - fig.show() + plt.rcParams["legend.fontsize"]