Skip to content

Commit

Permalink
prepare Table so it can parse data from three different sources
Browse files Browse the repository at this point in the history
  • Loading branch information
pmayd committed Jan 30, 2024
1 parent 0e2f0a2 commit 73b316f
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 9 deletions.
50 changes: 43 additions & 7 deletions nb/presentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,30 +36,66 @@
# %% [markdown]
# ### Find

# %%

# %% [markdown]
# ### Table

# %% [markdown]
# `pystatis.Table` offers a simple interface to get any table via its "name" ([EVAS](https://www.destatis.de/DE/Service/Bibliothek/Abloesung-Fachserien/uebersicht-fs.html) number).
#
# 1. Create a new Table instance by passing `name=<EVAS>`
# 2. Download the actual data with `.get_data(prettify=<True|False>)`
# 3. Access data via either `.raw_data` or `.data`, metadata via `.metadata`

# %%
# GENESIS - https://www-genesis.destatis.de/genesis//online?operation=table&code=31231-0001&bypass=true&levelindex=1&levelid=1706599948340#abreadcrumb
t = pystatis.Table(name="31231-0001") #

# %% [markdown]
# By default, `prettify` is set to `True` and will return a more readable format. Here we show the original format first.

# %%
t = pystatis.Table(name="32111-01-01-4")
t.get_data(prettify=False)

# %%
t.get_data()
t.raw_data.splitlines()

# %%
t.data

# %% [markdown]
# As you can see, the original format has a lot of redundant information and columns with metadata like the codes for the different variables. Let's rerun `get_data` with `prettify=True`.

# %%
t.raw_data.splitlines()
t.get_data()

# %%
t.data

# %% [markdown]
# You can also access the metadata as returned by the Catalogue endpoint.

# %%
pprint(t.metadata)

# %% [markdown]
# You can use any EVAS number from the supported databases like GENESIS, Regionalstatistik or Zensus. The library identifies the database for you so you don't have to care about this.

# %%
t = pystatis.Table(name="12111-01-01-5-B")
# GENESIS
t = pystatis.Table(name="71321-0001")
t.get_data()
t.data

# %%
# runs for roughly 2 minutes
t.get_data() # GENESIS starts a background job and we wait 3000 seconds -> no action required
# Regionalstatistik
t = pystatis.Table(name="71327-01-05-4")
t.get_data()
t.data

# %%
t.data # 122058 x 18 columns
# Zensus
t = pystatis.Table(name="2000S-1006")
t.get_data()
t.data
30 changes: 28 additions & 2 deletions src/pystatis/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import pandas as pd

import pystatis.db as db
from pystatis.http_helper import load_data


Expand Down Expand Up @@ -45,7 +46,9 @@ def get_data(self, area: str = "all", prettify: bool = True, **kwargs):
self.data = pd.read_csv(data_str, sep=";")

if prettify:
self.data = self.prettify_table(self.data)
self.data = self.prettify_table(
self.data, db.identify_db(self.name)[0]
)

metadata = load_data(
endpoint="metadata", method="table", params=params, as_json=True
Expand All @@ -55,16 +58,31 @@ def get_data(self, area: str = "all", prettify: bool = True, **kwargs):
self.metadata = metadata

@staticmethod
def prettify_table(data: pd.DataFrame) -> pd.DataFrame:
def prettify_table(data: pd.DataFrame, db: str) -> pd.DataFrame:
    """Hand the raw table to the parser of the database it came from.

    Args:
        data (pd.DataFrame): A pandas dataframe created from raw_data
        db (str): The name of the database ("genesis", "zensus" or "regio").

    Returns:
        pd.DataFrame: The result of the matching ``Table.parse_*_table``
            parser, or ``data`` itself when ``db`` is none of the known names.
    """
    if db == "genesis":
        return Table.parse_genesis_table(data)
    if db == "zensus":
        return Table.parse_zensus_table(data)
    if db == "regio":
        return Table.parse_regio_table(data)

    # Unknown database name: return the frame untouched.
    return data

@staticmethod
def parse_genesis_table(data: pd.DataFrame) -> pd.DataFrame:
# Extracts time column with name from first element of Zeit_Label column
time = pd.DataFrame({data["Zeit_Label"].iloc[0]: data["Zeit"]})

Expand All @@ -82,3 +100,11 @@ def prettify_table(data: pd.DataFrame) -> pd.DataFrame:

pretty_data = pd.concat([time, attributes, values], axis=1)
return pretty_data

@staticmethod
def parse_zensus_table(data: pd.DataFrame) -> pd.DataFrame:
    """Reformat a raw Zensus table into a more readable table.

    No Zensus-specific formatting is implemented yet, so the input is handed
    back unchanged. Returning the frame (instead of None) honours the declared
    return type and keeps callers that store the result from losing their data.

    Args:
        data (pd.DataFrame): A pandas dataframe created from raw_data

    Returns:
        pd.DataFrame: The unmodified input dataframe.
    """
    # TODO: implement the Zensus-specific formatting.
    return data

@staticmethod
def parse_regio_table(data: pd.DataFrame) -> pd.DataFrame:
    """Reformat a raw Regionalstatistik table into a more readable table.

    No Regionalstatistik-specific formatting is implemented yet, so the input
    is handed back unchanged. Returning the frame (instead of None) honours the
    declared return type and keeps callers that store the result from losing
    their data.

    Args:
        data (pd.DataFrame): A pandas dataframe created from raw_data

    Returns:
        pd.DataFrame: The unmodified input dataframe.
    """
    # TODO: implement the Regionalstatistik-specific formatting.
    return data

0 comments on commit 73b316f

Please sign in to comment.