Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor and add incremental workflow functionality. #78

Merged
merged 26 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
1b42fec
Refactor and add functionality.
haileyplusplus Apr 1, 2024
b353d68
Files modified in refactor
haileyplusplus Apr 1, 2024
9bdce2f
Merge branch 'chihacknight:main' into refactor2
haileyplusplus Apr 3, 2024
c6ee240
Merge branch 'chihacknight:main' into refactor2
haileyplusplus Apr 24, 2024
191e415
Simplify the code a bit by removing unnecessary features.
haileyplusplus Apr 25, 2024
d5a9b35
Simplify cache manager by removing unnecessary functionality.
haileyplusplus Apr 25, 2024
28daf27
Skip schedule change days with CTA GTFS schedules, consistent with tr…
haileyplusplus Apr 25, 2024
de28ca0
Revert inadvertent change.
haileyplusplus Apr 25, 2024
90501e5
More progress bar fixes.
haileyplusplus Apr 25, 2024
9851cb3
Skip unnecessary scraping of transitfeeds schedule index.
haileyplusplus Apr 25, 2024
45f58fe
Improve progress bar display.
haileyplusplus Apr 25, 2024
a196073
Refactor schedule parsing and indexing code for simplicity.
haileyplusplus Apr 25, 2024
e91c8e0
Improve documentation and variable names.
haileyplusplus Apr 25, 2024
1449eb8
Simplify combiner class.
haileyplusplus Apr 25, 2024
c3a5d04
More comment and naming cleanup.
haileyplusplus Apr 25, 2024
97179bc
Merge branch 'main' into refactor2
haileyplusplus May 1, 2024
e5dd5c7
Output frontend geojson file as part of data updates.
haileyplusplus Jun 4, 2024
5f70e73
Merge branch 'generate-data-summary' into refactor2
haileyplusplus Jun 4, 2024
8e9c326
Fix bug in loading TransitFeeds schedules.
haileyplusplus Jun 8, 2024
75c216a
More updates to address review comments.
haileyplusplus Jun 11, 2024
a5fa614
Update main README to explain how to run update_data script.
haileyplusplus Jun 11, 2024
71fb7a3
Mention possible need to set PROJECT_NAME in README.
haileyplusplus Jun 11, 2024
a22e3a8
Merge branch 'chihacknight:main' into refactor2
haileyplusplus Jun 25, 2024
53cf8a6
change date format to address front end issue
lauriemerrell Sep 20, 2024
505cf9f
update date format and data.json for compatibility with front end
lauriemerrell Sep 20, 2024
2c94553
filter to only weekday for ridership calculation
lauriemerrell Oct 21, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions data_analysis/cache_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from pathlib import Path

import logging
import datetime

import pandas as pd
import requests
from io import BytesIO

# Scratch directory for cached artifacts: <grandparent of this file>/data_output/scratch.
DATA_DIR = Path(__file__).parent.parent.joinpath("data_output", "scratch")


class CacheManager:
    """Cache downloads and computed DataFrames on disk, and objects in memory.

    On-disk entries live at ``data_dir / subdir / filename``. In-memory
    objects are kept in ``self.objects`` keyed by name.

    Args:
        ignore_cached_calculation: When true,
            ``retrieve_calculated_dataframe`` always calls the supplied
            function and neither reads nor writes the on-disk cache.
        verbose: When true, ``log`` forwards its messages to ``logging.info``.
    """

    def __init__(self, ignore_cached_calculation=False, verbose=False):
        self.data_dir: Path = DATA_DIR
        self.objects = {}
        self.ignore_cached_calculation = ignore_cached_calculation
        self.verbose = verbose

    def log(self, *args):
        """Log ``args`` at INFO level as one space-separated message.

        Does nothing unless the manager was created with ``verbose=True``.
        """
        if self.verbose:
            # Join the pieces so the record reads as text rather than as the
            # repr of the ``args`` tuple.
            logging.info(' '.join(str(arg) for arg in args))

    def _cache_path(self, subdir, filename) -> Path:
        """Return ``data_dir / subdir / filename``, creating the directory."""
        cache_dir = self.data_dir / subdir
        # parents=True covers a missing data_dir; exist_ok=True avoids the
        # check-then-create race.
        cache_dir.mkdir(parents=True, exist_ok=True)
        return cache_dir / filename

    def retrieve_object(self, name, func):
        """Return the in-memory object stored under ``name``.

        On a miss, ``func()`` is called and its result stored. A stored
        ``None`` counts as a miss, so ``func`` runs again on the next call.
        """
        obj = self.objects.get(name)
        if obj is None:
            obj = func()
            self.objects[name] = obj
        return obj

    def retrieve(self, subdir, filename: str, url: str) -> BytesIO:
        """Return the content of ``url``, using the on-disk cache if present.

        Args:
            subdir: Cache subdirectory under ``data_dir``.
            filename: Name of the cache file inside ``subdir``.
            url: Location fetched with ``requests.get`` on a cache miss.

        Returns:
            A ``BytesIO`` positioned at the start of the content.
        """
        filepath = self._cache_path(subdir, filename)
        if filepath.exists():
            self.log(f'Retrieved cached {url} from {filename}')
            # read_bytes opens and closes the file; no handle is left open.
            return BytesIO(filepath.read_bytes())
        # NOTE(review): the HTTP status is not checked, so an error response
        # body would be cached like any other content -- confirm intended.
        content = requests.get(url).content
        filepath.write_bytes(content)
        self.log(f'Stored cached {url} in {filename}')
        return BytesIO(content)

    @staticmethod
    def fix_dt_column(df, c):
        """Convert column ``c`` from epoch milliseconds to UTC datetimes.

        Values whose type is not exactly ``int`` become ``pd.NaT``. The column
        is replaced on ``df`` itself, and the same ``df`` is returned.
        """
        def fixer(x):
            if type(x) is not int:
                return pd.NaT
            # Build the aware UTC datetime directly instead of going through
            # a naive local-time value first.
            return datetime.datetime.fromtimestamp(
                x / 1000, tz=datetime.timezone.utc)
        df[c] = df[c].apply(fixer)
        return df

    def retrieve_calculated_dataframe(self, subdir, filename, func, dt_fields: list[str]) -> pd.DataFrame:
        """Return a computed DataFrame, cached on disk as CSV or JSON.

        Args:
            subdir: Cache subdirectory under ``data_dir``.
            filename: Cache file name; a ``.csv`` suffix selects CSV,
                anything else is stored as JSON.
            func: Zero-argument callable that computes the DataFrame.
            dt_fields: Columns passed through ``fix_dt_column`` after a
                non-empty frame is loaded from the cache.

        Returns:
            The cached frame if the file exists (a fresh empty DataFrame when
            the cached frame is empty), otherwise the result of ``func()``,
            which is also written to the cache. When
            ``ignore_cached_calculation`` is set, ``func()`` is returned
            without touching the cache file.
        """
        filepath = self._cache_path(subdir, filename)
        csv = filename.endswith('.csv')
        if self.ignore_cached_calculation:
            self.log(f'Ignoring whether {subdir}/{filename} is in cache')
            return func()
        if filepath.exists():
            self.log(f'Retrieved {subdir}/{filename} from cache')
            if csv:
                logging.debug(f'Reading csv from {filepath}')
                df = pd.read_csv(filepath, low_memory=False)
            else:
                df = pd.read_json(filepath)
            assert type(df) is pd.DataFrame
            if df.empty:
                return pd.DataFrame()
            for c in dt_fields:
                df = self.fix_dt_column(df, c)
            return df
        self.log(f'Writing {subdir}/{filename} to cache')
        df = func()
        if csv:
            # NOTE(review): to_csv also writes the index, so reading this file
            # back yields an extra unnamed column -- confirm callers expect it.
            df.to_csv(filepath)
        else:
            df.to_json(filepath)
        return df
51 changes: 51 additions & 0 deletions data_analysis/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""
Utility functions common to both schedule and realtime analysis.
"""
from dataclasses import dataclass, field
from typing import List, Tuple

import pandas as pd


@dataclass
class AggInfo:
    """Settings describing how route and schedule data are aggregated.

    Args:
        freq (str, optional): An offset alias described in the Pandas
            time series docs. Defaults to 'D'.
            https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
        aggvar (str, optional): Variable to aggregate.
            Defaults to 'trip_count'.
        byvars (List[str], optional): Variables to group by.
            Defaults to ['date', 'route_id']. Each instance gets its own
            list, so changing one instance's list does not affect others.
    """
    freq: str = 'D'
    aggvar: str = 'trip_count'
    byvars: List[str] = field(default_factory=lambda: ['date', 'route_id'])


def sum_by_frequency(
        df: pd.DataFrame,
        agg_info: "AggInfo") -> pd.DataFrame:
    """Calculate total trips per route per frequency.

    The columns named in ``agg_info.byvars`` become the index, and the
    ``agg_info.aggvar`` column is then summed per ``route_id`` within each
    ``agg_info.freq`` bucket of ``date``. The two grouping levels are fixed,
    so ``byvars`` must include both 'date' and 'route_id'; any other entry
    in ``byvars`` is summed over rather than grouped by.

    Args:
        df (pd.DataFrame): A DataFrame of route or scheduled route data.
            It is not modified.
        agg_info (AggInfo): An AggInfo object describing how data
            is to be aggregated.

    Returns:
        pd.DataFrame: A DataFrame with the total number of trips per route
            by a specified frequency, with 'date' and 'route_id' as
            ordinary columns.
    """
    # set_index returns a new frame, so no defensive copy of df is needed.
    indexed = df.set_index(agg_info.byvars)
    groupers = [
        pd.Grouper(level='date', freq=agg_info.freq),
        pd.Grouper(level='route_id'),
    ]
    return indexed.groupby(groupers)[agg_info.aggvar].sum().reset_index()
Loading