prepare Table so it can parse data from three different sources

CorrelAid · Jan 30, 2024 · 73b316f · 73b316f
1 parent 0e2f0a2
commit 73b316f
Show file tree

Hide file tree

Showing 2 changed files with 71 additions and 9 deletions.
diff --git a/nb/presentation.py b/nb/presentation.py
@@ -36,30 +36,66 @@
 # %% [markdown]
 # ### Find
 
+# %%
+
 # %% [markdown]
 # ### Table
 
+# %% [markdown]
+# `pystatis.Table` offers a simple interface to get any table via its "name" ([EVAS](https://www.destatis.de/DE/Service/Bibliothek/Abloesung-Fachserien/uebersicht-fs.html) number).
+#
+# 1. Create a new Table instance by passing `name=<EVAS>`
+# 2. Download the actual data with `.get_data(prettify=<True|False>)`
+# 3. Access data via either `.raw_data` or `.data`, metadata via `.metadata`
+
+# %%
+# GENESIS - https://www-genesis.destatis.de/genesis//online?operation=table&code=31231-0001&bypass=true&levelindex=1&levelid=1706599948340#abreadcrumb
+t = pystatis.Table(name="31231-0001")  #
+
+# %% [markdown]
+# By default, `prettify` is set to `True` and will return a more readable format. Here we show the original format first.
+
 # %%
-t = pystatis.Table(name="32111-01-01-4")
+t.get_data(prettify=False)
 
 # %%
-t.get_data()
+t.raw_data.splitlines()
 
 # %%
 t.data
 
+# %% [markdown]
+# As you can see, the original format has a lot of redundant information and columns with metadata like the codes for the different variables. Let's rerun `get_data` with `prettify=True`.
+
 # %%
-t.raw_data.splitlines()
+t.get_data()
+
+# %%
+t.data
+
+# %% [markdown]
+# You can also access the metadata as returned by the Catalogue endpoint.
 
 # %%
 pprint(t.metadata)
 
+# %% [markdown]
+# You can use any EVAS number from the supported databases like GENESIS, Regionalstatistik or Zensus. The library identifies the database for you, so you don't have to specify it yourself.
+
 # %%
-t = pystatis.Table(name="12111-01-01-5-B")
+# GENESIS
+t = pystatis.Table(name="71321-0001")
+t.get_data()
+t.data
 
 # %%
-# runs for roughly 2 minutes
-t.get_data()  # GENESIS starts a backghround job and we wait 3000 seconds -> no action required
+# Regionalstatistik
+t = pystatis.Table(name="71327-01-05-4")
+t.get_data()
+t.data
 
 # %%
-t.data  # 122058 x 18 columns
+# Zensus
+t = pystatis.Table(name="2000S-1006")
+t.get_data()
+t.data
diff --git a/src/pystatis/table.py b/src/pystatis/table.py
@@ -3,6 +3,7 @@
 
 import pandas as pd
 
+import pystatis.db as db
 from pystatis.http_helper import load_data
 
 
@@ -45,7 +46,9 @@ def get_data(self, area: str = "all", prettify: bool = True, **kwargs):
         self.data = pd.read_csv(data_str, sep=";")
 
         if prettify:
-            self.data = self.prettify_table(self.data)
+            self.data = self.prettify_table(
+                self.data, db.identify_db(self.name)[0]
+            )
 
         metadata = load_data(
             endpoint="metadata", method="table", params=params, as_json=True
@@ -55,16 +58,31 @@ def get_data(self, area: str = "all", prettify: bool = True, **kwargs):
         self.metadata = metadata
 
     @staticmethod
-    def prettify_table(data: pd.DataFrame) -> pd.DataFrame:
+    def prettify_table(data: pd.DataFrame, db: str) -> pd.DataFrame:
         """Reformat the data into a more readable table
 
         Args:
             data (pd.DataFrame): A pandas dataframe created from raw_data
+            db (str): The name of the database the table belongs to, one of
+                "genesis", "zensus" or "regio". Any other value leaves data as is.
 
         Returns:
             pd.DataFrame: Formatted dataframe that omits all unnecessary Code columns
             and includes informative columns names
         """
+        # Dispatch on the database name. Note that `db` is the parameter here, not
+        # the module-level `pystatis.db` import, which it shadows inside this method.
+        match db:
+            case "genesis":
+                pretty_data = Table.parse_genesis_table(data)
+            case "zensus":
+                pretty_data = Table.parse_zensus_table(data)
+            case "regio":
+                pretty_data = Table.parse_regio_table(data)
+            case _:
+                # Unknown database: hand the data back unchanged.
+                pretty_data = data
+
+        return pretty_data
+
+    @staticmethod
+    def parse_genesis_table(data: pd.DataFrame) -> pd.DataFrame:
         # Extracts time column with name from first element of Zeit_Label column
         time = pd.DataFrame({data["Zeit_Label"].iloc[0]: data["Zeit"]})
 
@@ -82,3 +100,11 @@ def prettify_table(data: pd.DataFrame) -> pd.DataFrame:
 
         pretty_data = pd.concat([time, attributes, values], axis=1)
         return pretty_data
+
+    @staticmethod
+    def parse_zensus_table(data: pd.DateFrame) -> pd.DataFrame:
+        pass
+
+    @staticmethod
+    def parse_regio_table(data: pd.DateFrame) -> pd.DataFrame:
+        pass