Add activity and plotting modules (#4)
g4brielvs authored Sep 17, 2024
1 parent 5f39a1c commit e260d47
Showing 8 changed files with 875 additions and 106 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -96,6 +96,6 @@ _build/
.env

# Project-specific

data/
.jupyter/
.ruff_cache/
16 changes: 16 additions & 0 deletions docs/_config.yml
@@ -42,3 +42,19 @@ sphinx:
  config:
    html_show_copyright: false
    html_last_updated_fmt: "%b %d, %Y"
    html_js_files:
      - https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js
    mathjax_path: https://cdnjs.cloudflare.com/ajax/libs/mathjax/3.2.0/es5/tex-mml-chtml.js
    mathjax2_config: {
      'tex2jax': {
        'inlineMath': [['$', '$'], ['\\(', '\\)']],
        'processEscapes': True,
        'ignoreClass': 'document',
        'processClass': 'math|output_area',
      }
    }
    mathjax_options: {
      "async": "async",
    }
    suppress_warnings: ["mystnb.unknown_mime_type"]

8 changes: 8 additions & 0 deletions docs/bibliography.bib
@@ -43,3 +43,11 @@ @online{worldbankeca
url = {https://www.worldbank.org/en/region/eca/brief/climate-change-in-europe-and-central-asia},
urldate = {2023-01-05},
}
@article{Ubaldi2024,
  doi = {10.21105/joss.05201},
  url = {https://doi.org/10.21105/joss.05201},
  year = {2024},
  publisher = {The Open Journal},
  volume = {9},
  number = {95},
  pages = {5201},
  author = {Enrico Ubaldi and Takahiro Yabe and Nicholas Jones and Maham Faisal Khan and Alessandra Feliciotti and Riccardo Di Clemente and Satish V. Ukkusuri and Emanuele Strano},
  title = {Mobilkit: A Python Toolkit for Urban Resilience and Disaster Risk Management Analytics},
  journal = {Journal of Open Source Software},
}
158 changes: 76 additions & 82 deletions notebooks/tessellate.ipynb

Large diffs are not rendered by default.

15 changes: 9 additions & 6 deletions pyproject.toml
@@ -4,14 +4,13 @@ build-backend = "hatchling.build"

[project]
name = "mobilyze"
description = "A Python package harnessing mobility data to quantify movement intensity"
description = "A Python package designed to analyze mobility data and measure and visualize movement intensity."
readme = { file = "README.md", content-type = "text/markdown" }
license = { file = "LICENSE" }
keywords = ["mobility"]
authors = [{ name = "Development Data Group", email = "[email protected]" }]
classifiers = [
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3 :: Only",
"License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)",
"Operating System :: OS Independent",
@@ -20,20 +19,24 @@ classifiers = [
]
dynamic = ["version"]

requires-python = ">=3.10"
requires-python = ">=3.8,<3.10"
dependencies = [
"bokeh",
"datashader",
"dask",
"datashader",
"h3>=3,<4",
"holoviews",
"humanize",
"mobilkit>=0.2.8",
"scikit-mobility>=1.3.1",
"numpy<2",
"palettable>=3.3.3",
"plotly",
"scikit-mobility>=1.3.1",
]
[project.optional-dependencies]
docs = [
"docutils==0.17.1", # https://jupyterbook.org/en/stable/content/citations.html?highlight=docutils#citations-and-bibliographies
"jupyter-book >=1,<2",
"jupyter-book>=1,<2",
]

[project.urls]
218 changes: 218 additions & 0 deletions src/mobilyze/activity.py
@@ -0,0 +1,218 @@
import dask.dataframe as dd
import dask_geopandas
import h3
import mobilkit
import pandas as pd
from dask.dataframe import DataFrame
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm


def compute_ping_interval(ddf: DataFrame):
    """Compute per-device intervals between consecutive pings, in minutes."""

    def _ping_interval(df: pd.DataFrame):
        df.sort_values(by=["uid", "datetime"], inplace=True)
        df["ping_interval"] = df.groupby("uid")["datetime"].diff()
        df["ping_interval_minutes"] = df["ping_interval"].dt.total_seconds() / 60
        # Return the augmented partition; without this, `apply` would yield None.
        return df

    return (
        ddf.groupby(["uid"])
        .apply(_ping_interval)
        .map_partitions(lambda d: d.reset_index(drop=True))
    ).compute()

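# A minimal usage sketch (hypothetical inputs, not part of this module): given
# a Dask DataFrame of pings with `uid` and `datetime` columns, e.g.
#
#   import dask.dataframe as dd
#   pings = dd.read_parquet("data/pings.parquet")  # assumed layout
#   intervals = compute_ping_interval(pings)
#   intervals["ping_interval_minutes"].describe()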

def compute_activity(
    ddf: DataFrame,
    start="2022-01-01",
    end="2023-12-31",
) -> pd.DataFrame:
    """
    Compute activity levels based on GPS mobility data, returning both z-scores
    and percentage change relative to a baseline period. Activity is measured
    as the number of unique devices (`uid`) detected in each spatial area
    (`hex_id`) daily. The z-score standardizes activity data, showing how much
    the activity deviates from the baseline mean, expressed in standard
    deviations. Percentage change indicates the relative increase or decrease
    in activity compared to the baseline.

    Parameters
    ----------
    ddf : dask.dataframe.DataFrame
        A Dask DataFrame containing mobility data. The DataFrame must include at
        least the following columns:
        - `datetime`: Timestamps of device detections.
        - `hex_id`: Spatial index for the detection location.
        - `uid`: Unique identifier for each device.
    start : str, optional
        The start date of the baseline period in "YYYY-MM-DD" format. The
        default is "2022-01-01".
    end : str, optional
        The end date of the baseline period in "YYYY-MM-DD" format. The default
        is "2023-12-31".

    Returns
    -------
    pandas.DataFrame
        A pandas DataFrame containing the calculated activity metrics with the
        following columns:
        - `hex_id`: The spatial area identifier.
        - `date`: Date of the activity.
        - `nunique`: Number of unique devices detected per `hex_id` and day.
        - `weekday`: Day of the week (0=Monday, 6=Sunday).
        - `nunique.mean`: Mean baseline device count for each `hex_id` and weekday.
        - `nunique.std`: Standard deviation of baseline device counts.
        - `n_baseline`: Baseline mean device count.
        - `n_difference`: Difference between the current and baseline device counts.
        - `percent_change`: Percentage change in device counts relative to the baseline.
        - `z_score`: Standardized activity level relative to the baseline.
    """
    # Convert datetime to date and group by hex_id and date, counting unique devices
    activity = (
        ddf.assign(date=lambda x: dd.to_datetime(x["datetime"].dt.date))
        .groupby(["hex_id", "date"])["uid"]
        .nunique()
        .to_frame("nunique")
        .reset_index()
        .compute()
    )

    # Add weekday column for future standardization
    activity["weekday"] = activity["date"].dt.weekday

    # Define baseline period and calculate mean and standard deviation
    # for each hex_id and weekday
    baseline = activity[activity["date"].between(start, end)]
    mean = baseline.groupby(["hex_id", "weekday"]).agg({"nunique": ["mean", "std"]})
    mean.columns = mean.columns.map(".".join)

    # Initialize scalers for each `hex_id`
    scalers = {}
    for hex_id in tqdm(baseline["hex_id"].unique()):
        scaler = StandardScaler()
        scaler.fit(baseline[baseline["hex_id"] == hex_id][["nunique"]])
        scalers[hex_id] = scaler

    # Merge activity data with baseline mean and standard deviation
    activity = pd.merge(activity, mean, on=["hex_id", "weekday"], how="left")

    # Calculate z-scores using the pre-trained scalers
    for hex_id, scaler in tqdm(scalers.items()):
        predicate = activity["hex_id"] == hex_id
        score = scaler.transform(activity[predicate][["nunique"]])
        activity.loc[predicate, "z_score"] = score

    # Compute baseline, difference, and percent change
    activity["n_baseline"] = activity["nunique.mean"]
    activity["n_difference"] = activity["nunique"] - activity["n_baseline"]
    activity["percent_change"] = 100 * (
        activity["nunique"] / activity["n_baseline"] - 1
    )

    return (
        activity[
            [
                "date",
                "hex_id",
                "weekday",
                "nunique",
                "nunique.mean",
                "nunique.std",
                "n_baseline",
                "n_difference",
                "percent_change",
                "z_score",
            ]
        ]
        .sort_values(["date", "hex_id"], ascending=True)
        .set_index(["date", "hex_id"])
    )

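# A minimal usage sketch (hypothetical `pings` frame carrying `uid`,
# `datetime`, and H3 `hex_id` columns):
#
#   metrics = compute_activity(pings, start="2022-01-01", end="2022-12-31")
#
# Worked example: if a cell's Tuesday baseline is mean=200 devices with
# std=50, a Tuesday with 300 devices gives n_difference = 100 and
# percent_change = 100 * (300/200 - 1) = +50%; the z-score comes from the
# per-hex StandardScaler, e.g. (300 - 200) / 50 = 2.0 when the cell's overall
# baseline shares that mean and std.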

def categorize(row):
    """Map a POI row to a coarse category based on OSM-style tags.

    Rows matching neither rule fall through and return None.
    """
    if pd.notna(row["healthcare"]) or row["amenity"] in (
        "doctors",
        "dentist",
        "clinic",
        "hospital",
        "pharmacy",
    ):
        return "health facilities"
    elif row["amenity"] in ("kindergarten", "school", "college", "university") or row[
        "building"
    ] in ("kindergarten", "school", "college", "university"):
        return "education"

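# A minimal usage sketch (assumes a pandas DataFrame of OSM-style POIs with
# `healthcare`, `amenity`, and `building` columns):
#
#   pois["category"] = pois.apply(categorize, axis=1)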

def compute_stops(
    ddf: DataFrame,
    stay_locations_kwds={"minutes_for_a_stop": 5.0, "spatial_radius_km": 0.25},
    resolution: int = 7,
) -> DataFrame:
    """
    Calculate stop locations from a trajectory DataFrame using spatial and
    temporal parameters. The function takes a DataFrame containing GPS
    trajectory data and calculates stop locations where an entity remains
    within a spatial radius for a given duration. The resulting DataFrame
    contains additional information about the stops, such as entry and exit
    times, as well as H3 hexagonal grid IDs representing the stop locations.

    Parameters
    ----------
    ddf : DataFrame
        Input Dask DataFrame containing GPS trajectory data. It must have the
        columns 'hex_id', 'latitude', and 'longitude'.
    stay_locations_kwds : dict, optional
        A dictionary of parameters controlling the stop detection logic. The
        default is `{"minutes_for_a_stop": 5.0, "spatial_radius_km": 0.25}`, where:
        - "minutes_for_a_stop": Minimum duration (in minutes) to consider a
          location as a stop.
        - "spatial_radius_km": Spatial radius (in kilometers) to define the
          area around a point to consider it as a stop.
    resolution : int, optional
        Resolution level for H3 hexagonal grid representation of stops. Higher
        values increase the resolution, meaning smaller hexagons. Default is 7.

    Returns
    -------
    DataFrame
        A Dask DataFrame with the detected stops, containing columns:
        - "datetime": The timestamp of entering the stop.
        - "leaving_datetime": The timestamp of leaving the stop.
        - "date": The date of the stop.
        - "hex_id": The H3 hexagonal grid ID at the specified resolution level,
          representing the stop location.
        - "geometry": Geopandas geometry of the stop points.
    """
    # Assign columns before using mobilkit
    ddf["tile_ID"] = ddf["hex_id"]
    ddf["lat"] = ddf["latitude"]
    ddf["lng"] = ddf["longitude"]

    # Detect stops based on spatial and temporal thresholds
    STOPS = mobilkit.spatial.findStops(
        ddf,
        stay_locations_kwds=stay_locations_kwds,
    )

    # Calculate H3 hex ID for each stop based on latitude and longitude
    STOPS["hex_id"] = STOPS.apply(
        lambda row: h3.geo_to_h3(row["lat"], row["lng"], resolution=resolution),
        meta=("hex_id", "string"),
        axis=1,
    )

    # Convert datetime columns to proper format
    STOPS["datetime"] = dd.to_datetime(STOPS["datetime"])
    STOPS["leaving_datetime"] = dd.to_datetime(STOPS["leaving_datetime"])
    STOPS["date"] = STOPS["datetime"].dt.date.astype("string")
    STOPS["hex_id"] = STOPS["hex_id"].astype("string")

    # Convert to a GeoDataFrame
    STOPS = dask_geopandas.from_dask_dataframe(
        STOPS,
        geometry=dask_geopandas.points_from_xy(STOPS, "lng", "lat"),
    ).set_crs("EPSG:4326")

    return STOPS
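# A minimal usage sketch (hypothetical trajectory frame with `uid`,
# `datetime`, `latitude`, `longitude`, and `hex_id` columns; threshold values
# are illustrative only):
#
#   stops = compute_stops(
#       pings,
#       stay_locations_kwds={"minutes_for_a_stop": 10.0, "spatial_radius_km": 0.1},
#       resolution=8,
#   )
#   stops_df = stops.compute()  # materialize the lazy Dask GeoDataFrame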