Skip to content

Commit

Permalink
Refactoring sklearn notebooks - making them use the Secret Store (#43)
Browse files Browse the repository at this point in the history
* Initial commit of the configuration notebook.

* Configuration notebook, addressed review issues.

* Refactoring of the sandbox_config.ipynb, sandbox_config_ui.ipynb

* Further refactoring of the sandbox_config_ui.ipynb

* Refactoring of the sklearn notebooks

* Update doc/tutorials/sklearn/sklearn_predict_abalone.ipynb

Co-authored-by: Christoph Kuhnke <[email protected]>

* Minor text refactoring

* Cleared cell outputs

---------

Co-authored-by: Christoph Kuhnke <[email protected]>
  • Loading branch information
ahsimb and ckunki authored Nov 14, 2023
1 parent f83662f commit 3bd4ba4
Show file tree
Hide file tree
Showing 7 changed files with 305 additions and 975 deletions.
150 changes: 36 additions & 114 deletions doc/tutorials/data/data_abalone.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -5,123 +5,49 @@
"id": "30001487-61d5-46be-a662-83f406d8cc2a",
"metadata": {},
"source": [
"# Abalone\n",
"\n",
"Here we will load data of physical measurements of abalones (sea snails). <a href=\"https://archive.ics.uci.edu/dataset/1/abalone\" target=\"_blank\" rel=\"noopener\">Follow this link</a> to get details about this dataset.\n",
"\n",
"To execute queries and upload data to the Exasol database, we will be using the <a href=\"https://github.com/exasol/pyexasol\" target=\"_blank\" rel=\"noopener\">`pyexasol`</a> module.\n",
"\n",
"Prior to using this notebook one needs to [create the database schema](../setup_db.ipynb)."
"## Prerequisites\n",
"\n",
"Prior to using this notebook, the following steps need to be completed:\n",
"1. [Configure the sandbox](../sandbox_config.ipynb).\n",
"\n",
"## Setup\n",
"\n",
"### Access configuration"
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "7d1fd009-42ca-4b16-929a-d00d284e2e1f",
"metadata": {},
"outputs": [],
"source": [
"# TODO: Move this to a separate configuration notebook. Here we just need to load this configuration from a store.\n",
"from dataclasses import dataclass\n",
"\n",
"@dataclass\n",
"class SandboxConfig:\n",
" EXTERNAL_HOST_NAME = \"192.168.124.93\"\n",
" HOST_PORT = \"8888\"\n",
"\n",
" @property\n",
" def EXTERNAL_HOST(self):\n",
" return f\"\"\"{self.EXTERNAL_HOST_NAME}:{self.HOST_PORT}\"\"\"\n",
"\n",
" USER = \"sys\"\n",
" PASSWORD = \"exasol\"\n",
" BUCKETFS_PORT = \"6666\"\n",
" BUCKETFS_USER = \"w\"\n",
" BUCKETFS_PASSWORD = \"write\"\n",
" BUCKETFS_USE_HTTPS = False\n",
" BUCKETFS_SERVICE = \"bfsdefault\"\n",
" BUCKETFS_BUCKET = \"default\"\n",
"\n",
" @property\n",
" def EXTERNAL_BUCKETFS_HOST(self):\n",
" return f\"\"\"{self.EXTERNAL_HOST_NAME}:{self.BUCKETFS_PORT}\"\"\"\n",
"\n",
" @property\n",
" def BUCKETFS_URL_PREFIX(self):\n",
" return \"https://\" if self.BUCKETFS_USE_HTTPS else \"http://\"\n",
"\n",
" @property\n",
" def BUCKETFS_PATH(self):\n",
" # Filesystem-Path to the read-only mounted BucketFS inside the running UDF Container\n",
" return f\"/buckets/{self.BUCKETFS_SERVICE}/{self.BUCKETFS_BUCKET}\"\n",
"\n",
" SCRIPT_LANGUAGE_NAME = \"PYTHON3_60\"\n",
" UDF_FLAVOR = \"python3-ds-EXASOL-6.0.0\"\n",
" UDF_RELEASE= \"20190116\"\n",
" UDF_CLIENT = \"exaudfclient\" # or for newer versions of the flavor exaudfclient_py3\n",
" SCHEMA = \"IDA\"\n",
"\n",
" @property\n",
" def SCRIPT_LANGUAGES(self):\n",
" return f\"\"\"{self.SCRIPT_LANGUAGE_NAME}=localzmq+protobuf:///{self.BUCKETFS_SERVICE}/\n",
" {self.BUCKETFS_BUCKET}/{self.UDF_FLAVOR}?lang=python#buckets/{self.BUCKETFS_SERVICE}/\n",
" {self.BUCKETFS_BUCKET}/{self.UDF_FLAVOR}/exaudf/{self.UDF_CLIENT}\"\"\";\n",
"\n",
" @property\n",
" def connection_params(self):\n",
" return {\"dns\": self.EXTERNAL_HOST, \"user\": self.USER, \"password\": self.PASSWORD, \"compression\": True}\n",
"\n",
" @property\n",
" def params(self):\n",
" return {\n",
" \"script_languages\": self.SCRIPT_LANGUAGES,\n",
" \"script_language_name\": self.SCRIPT_LANGUAGE_NAME,\n",
" \"schema\": self.SCHEMA,\n",
" \"BUCKETFS_PORT\": self.BUCKETFS_PORT,\n",
" \"BUCKETFS_USER\": self.BUCKETFS_USER,\n",
" \"BUCKETFS_PASSWORD\": self.BUCKETFS_PASSWORD,\n",
" \"BUCKETFS_USE_HTTPS\": self.BUCKETFS_USE_HTTPS,\n",
" \"BUCKETFS_BUCKET\": self.BUCKETFS_BUCKET,\n",
" \"BUCKETFS_PATH\": self.BUCKETFS_PATH\n",
" }\n",
"\n",
"conf = SandboxConfig()"
"%run ../access_store_ui.ipynb\n",
"display(get_access_store_ui('../'))"
]
},
{
"cell_type": "markdown",
"id": "2b97740f-c9b6-40f8-a9c7-3ddcd08e0898",
"metadata": {},
"source": [
"First, we will load the data into a pandas DataFrame.\n",
"We will name the columns as per their descriptions (see Variable Table in the dataset description)."
"## Download data\n",
"\n",
"First, we will load the data into a Pandas DataFrame. Each data column represents one of the features and is named accordingly; see the Variable Table section in the dataset description. We will name the DataFrame columns as per the variable descriptions."
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "863b0429-1fe3-46ae-9569-a08c11c78464",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading the data took: 777.70ms\n",
" Sex Length Diameter Height Whole_weight Shucked_weight Viscera_weight \\\n",
"0 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 \n",
"1 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 \n",
"2 M 0.440 0.365 0.125 0.5160 0.2155 0.1140 \n",
"3 I 0.330 0.255 0.080 0.2050 0.0895 0.0395 \n",
"4 I 0.425 0.300 0.095 0.3515 0.1410 0.0775 \n",
"\n",
" Shell_weight Rings \n",
"0 0.070 7 \n",
"1 0.210 9 \n",
"2 0.155 10 \n",
"3 0.055 7 \n",
"4 0.120 8 \n"
]
}
],
"outputs": [],
"source": [
"from urllib.request import urlopen\n",
"import tempfile\n",
Expand Down Expand Up @@ -167,50 +93,46 @@
"id": "916ba87d-57f4-44f6-a0f8-8c5556afb719",
"metadata": {},
"source": [
"## Upload data into DB\n",
"\n",
"Let's split the data randomly into train and test sets. We will then create two tables - ABALONE_TRAIN and ABALONE_TEST - and load the datasets into these tables."
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"id": "547907a7-b437-4ff4-8ab8-08b55e0dcc88",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Imported 3340 rows into TRAIN.\n",
"Imported 836 rows into TEST.\n",
"Importing the data took: 434.25ms\n"
]
}
],
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"import pyexasol\n",
"\n",
"# Split the data into train and test sets\n",
"df_train, df_test = train_test_split(df, test_size=0.2)\n",
"\n",
"train_table = 'ABALONE_TRAIN'\n",
"test_table = 'ABALONE_TEST'\n",
"column_desc = [' '.join(c) for c in column_def]\n",
"dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
"\n",
"stopwatch = Stopwatch()\n",
"\n",
"# Create Exasol connection\n",
"with pyexasol.connect(dsn=conf.EXTERNAL_HOST, user=conf.USER, password=conf.PASSWORD, compression=True) as conn:\n",
"# Create an Exasol connection\n",
"dsn = f'{sb_config.EXTERNAL_HOST_NAME}:{sb_config.HOST_PORT}'\n",
"with pyexasol.connect(dsn=dsn, user=sb_config.USER, password=sb_config.PASSWORD, compression=True) as conn:\n",
"\n",
" # Create tables\n",
" sql = f'CREATE OR REPLACE TABLE {{schema!i}}.ABALONE_TRAIN({\", \".join(column_desc)})'\n",
" conn.execute(query=sql, query_params=conf.params)\n",
" sql = 'CREATE OR REPLACE TABLE {schema!i}.ABALONE_TEST LIKE {schema!i}.ABALONE_TRAIN'\n",
" conn.execute(query=sql, query_params=conf.params)\n",
" sql = f'CREATE OR REPLACE TABLE \"{sb_config.SCHEMA}\".\"{train_table}\"({\", \".join(column_desc)})'\n",
" conn.execute(query=sql)\n",
" sql = f'CREATE OR REPLACE TABLE \"{sb_config.SCHEMA}\".\"{test_table}\" LIKE \"{sb_config.SCHEMA}\".\"{train_table}\"'\n",
" conn.execute(query=sql)\n",
"\n",
" # Import data into Exasol\n",
" conn.import_from_pandas(df_train, (conf.SCHEMA, \"ABALONE_TRAIN\"))\n",
" print(f\"Imported {conn.last_statement().rowcount()} rows into TRAIN.\")\n",
" conn.import_from_pandas(df_test, (conf.SCHEMA, \"ABALONE_TEST\"))\n",
" print(f\"Imported {conn.last_statement().rowcount()} rows into TEST.\")\n",
" conn.import_from_pandas(df_train, (sb_config.SCHEMA, train_table))\n",
" print(f\"Imported {conn.last_statement().rowcount()} rows into {train_table}.\")\n",
" conn.import_from_pandas(df_test, (sb_config.SCHEMA, test_table))\n",
" print(f\"Imported {conn.last_statement().rowcount()} rows into {test_table}.\")\n",
"\n",
"print(f\"Importing the data took: {stopwatch}\")"
]
Expand Down
Loading

0 comments on commit 3bd4ba4

Please sign in to comment.