From ae0cb03e9aa8b01de6de908c8763c4eab848848c Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 29 Aug 2023 20:04:20 -0500 Subject: [PATCH 01/62] add date range option for downloads. --- data_analysis/static_gtfs_analysis.py | 7 +- scrape_data/cta_data_downloads.py | 168 +++++++++++++++++++++----- 2 files changed, 141 insertions(+), 34 deletions(-) diff --git a/data_analysis/static_gtfs_analysis.py b/data_analysis/static_gtfs_analysis.py index 65bba88..32102c4 100644 --- a/data_analysis/static_gtfs_analysis.py +++ b/data_analysis/static_gtfs_analysis.py @@ -359,16 +359,15 @@ def download_zip(version_id: str) -> zipfile.ZipFile: zipfile.ZipFile: A zipfile for the CTA version id. """ logger.info('Downloading CTA data') - CTA_GTFS = zipfile.ZipFile( - BytesIO( + zipfile_bytes_io = BytesIO( requests.get( f"https://transitfeeds.com/p/chicago-transit-authority" f"/165/{version_id}/download" ).content ) - ) + CTA_GTFS = zipfile.ZipFile(zipfile_bytes_io) logging.info('Download complete') - return CTA_GTFS + return CTA_GTFS, zipfile_bytes_io def download_extract_format(version_id: str = None) -> GTFSFeed: diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 75f10d3..26d5cba 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -3,9 +3,9 @@ import data_analysis.static_gtfs_analysis as sga import data_analysis.compare_scheduled_and_rt as csrt import pendulum -from io import StringIO +from io import StringIO, BytesIO import pandas as pd - +from typing import List ACCESS_KEY = sys.argv[1] SECRET_KEY = sys.argv[2] @@ -60,56 +60,164 @@ def save_csv_to_bucket(df: pd.DataFrame, filename: str) -> None: .put(Body=csv_buffer.getvalue()) -def save_sched_daily_summary() -> None: - data = sga.GTFSFeed.extract_data(CTA_GTFS) - data = sga.format_dates_hours(data) - trip_summary = sga.make_trip_summary(data) - - route_daily_summary = ( - sga.summarize_date_rt(trip_summary) - ) - route_daily_summary['date'] = route_daily_summary['date'].astype(str) - route_daily_summary_today = route_daily_summary.loc[route_daily_summary['date'] == today] - - print(f'Saving cta_route_daily_summary_{today}.csv to public bucket') - filename = f'schedule_summaries/daily_job/cta_route_daily_summary_{today}.csv' - save_csv_to_bucket( - route_daily_summary_today, - filename=filename +def save_sched_daily_summary(date_range: List[str, str] = None) -> None: + if date_range is None: + date_range = [today] + print(f"No date range given. Using {today} only") + + start_date = pendulum.parse(min(date_range)) + end_date = pendulum.parse(max(date_range)) + period = pendulum.period(start_date, end_date) + full_date_range = [dt.to_date_string() for dt in period.range('days')] + zip_filename_list = [f'cta_schedule_zipfiles_raw/google_transit_{date}.zip' + for date in full_date_range] + + # Check for files in bucket. 
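+    # keys() (defined at the bottom of this module) pages through the bucket
+    # listing and returns only the requested filenames that already exist in s3.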
+ found_list = keys( + csrt.BUCKET_PUBLIC, + zip_filename_list ) - print(f'Confirm that {filename} exists in bucket') - keys(csrt.BUCKET_PUBLIC, [filename]) + + def extract_date(fname: str) -> str: + return fname.split('_')[-1].split('.')[0] + + def create_route_summary(CTA_GTFS: sga.GTFSFeed) -> pd.DataFrame: + data = sga.GTFSFeed.extract_data(CTA_GTFS) + data = sga.format_dates_hours(data) + trip_summary = sga.make_trip_summary(data) + route_daily_summary = ( + sga.summarize_date_rt(trip_summary) + ) + + route_daily_summary['date'] = route_daily_summary['date'].astype(str) + route_daily_summary_today = route_daily_summary.loc[route_daily_summary['date'].isin(date_range)] + return route_daily_summary_today + + print('Using zipfiles found in public bucket') + s3zip_list = [] + for fname in found_list: + zip_bytes = BytesIO() + zip_bytes.seek(0) + client.download_fileobj(fname, zip_bytes) + zipfilesched = sga.zipfile.Zipfile(zip_bytes) + fdate = extract_date(fname) + s3zip_list.append( + { + 'zip_filename': fname, + 'zip': zipfilesched, + 'csv_filename': f'schedule_summaries/daily_job/' + f'cta/cta_route_daily_summary_{fdate}.csv' + } + ) + + s3_route_daily_summary_dict = { + 'zip_filenames': [gtfs['zip_filename'] for gtfs in s3zip_list], + 'summaries': [create_route_summary(gtfs['zip']) for gtfs in s3zip_list], + 'csv_filenames': [gtfs['csv_filename'] for gtfs in s3zip_list] + } + + transitfeeds_list = list(set(zip_filename_list).difference(set(found_list))) + print(', '.join(transitfeeds_list) + ' were not found in s3. Using transitfeeds.com') + transitfeeds_dates = [] + for fname in transitfeeds_list: + # Extract date from string after splitting on '_' and then '.' + fdate = extract_date(fname) + transitfeeds_dates.append(fdate) + + + transitfeeds_dates = sorted(transitfeeds_dates) + schedule_list = csrt.create_schedule_list(month=5, year=2022) + schedule_list_filtered = [ + s for s in schedule_list + if s['feed_start_date'] >= min(transitfeeds_dates) + and s['feed_start_date'] <= max(transitfeeds_dates) + ] + -def save_realtime_daily_summary() -> None: - if pendulum.now("America/Chicago").hour >= 11: - end_date = pendulum.yesterday("America/Chicago") - else: - end_date = pendulum.now("America/Chicago").subtract(days=2) + trip_summaries_transitfeeds_dict = {'zip_filenames': [], 'zips': [], 'csv_filenames': [], + 'summaries': []} - end_date = end_date.to_date_string() + for sched in schedule_list_filtered: + CTA_GTFS, zipfile_bytes_io = sga.download_zip(sched['schedule_version']) + trip_summaries_transitfeeds_dict['zip_filenames'].append( + f"transitfeeds_schedule_zipfiles_raw/{sched['schedule_version']}.zip" + ) + trip_summaries_transitfeeds_dict['zips'].append((CTA_GTFS, zipfile_bytes_io)) + trip_summaries_transitfeeds_dict['summaries'].append(create_route_summary(CTA_GTFS)) + trip_summaries_transitfeeds_dict['csv_filenames'].append( + f'schedule_summaries/daily_job/transitfeeds/' + f'transitfeeds_route_daily_summary_v{sched["schedule_version"]}.csv' + ) + + print(f'Saving cta schedule summary files in {date_range} to public bucket') + for filename, summary in zip( + s3_route_daily_summary_dict['csv_filenames'], + s3_route_daily_summary_dict['summaries'] + ): + save_csv_to_bucket(summary, filename=filename) + + print(f'Saving transitfeeds schedule summary files and zip files ' + f'in {date_range} to public bucket') + for csv_filename, summary, zip_filename, zipfile in zip( + trip_summaries_transitfeeds_dict['csv_filenames'], + trip_summaries_transitfeeds_dict['summaries'], + 
trip_summaries_transitfeeds_dict['zip_filenames'], + trip_summaries_transitfeeds_dict['zips'] + ): + save_csv_to_bucket(summary, filename=csv_filename) + # Save the zip file + client.upload_fileobj( + zipfile[1], + csrt.BUCKET_PUBLIC, + zip_filename + ) + + for fname in ['csv_filenames', 'zip_filenames']: + print('Confirm that ' + ', '.join(s3_route_daily_summary_dict[fname]) + + ' exist in bucket') + _ = keys(csrt.BUCKET_PUBLIC, s3_route_daily_summary_dict[fname]) + + print('Confirm that ' + ', '.join(trip_summaries_transitfeeds_dict[fname]) + + ' exists in bucket') + _ = keys(csrt.BUCKET_PUBLIC, trip_summaries_transitfeeds_dict[fname]) + +def save_realtime_daily_summary(date: str = None) -> None: + if date is None: + if pendulum.now("America/Chicago").hour >= 11: + date = pendulum.yesterday("America/Chicago") + else: + date = pendulum.now("America/Chicago").subtract(days=2) + + date = date.to_date_string() + print(f'Date not given. Taking the latest available date {date}.') + else: + date = pendulum.parse(date).strftime('%Y-%m-%d') daily_data = pd.read_csv( - (csrt.BASE_PATH / f"bus_full_day_data_v2/{end_date}.csv") + (csrt.BASE_PATH / f"bus_full_day_data_v2/{date}.csv") .as_uri(), low_memory=False ) daily_data = csrt.make_daily_summary(daily_data) - filename = f'realtime_summaries/daily_job/bus_full_day_data_v2/{end_date}.csv' + filename = f'realtime_summaries/daily_job/bus_full_day_data_v2/{date}.csv' save_csv_to_bucket(daily_data, filename=filename) print(f'Confirm that {filename} exists in bucket') - keys(csrt.BUCKET_PUBLIC, [filename]) + _ = keys(csrt.BUCKET_PUBLIC, [filename]) # https://stackoverflow.com/questions/30249069/listing-contents-of-a-bucket-with-boto3 def keys(bucket_name: str, filenames: list, prefix: str='/', delimiter: str='/', - start_after: str='') -> None: + start_after: str='') -> list: s3_paginator = client.get_paginator('list_objects_v2') prefix = prefix.lstrip(delimiter) start_after = (start_after or prefix) if prefix.endswith(delimiter) else start_after + found_list = [] for page in s3_paginator.paginate(Bucket=bucket_name, Prefix=prefix, StartAfter=start_after): for content in page.get('Contents', ()): if content['Key'] in filenames: print(f"{content['Key']} exists") + found_list.append(content['Key']) + return found_list \ No newline at end of file From 68a55bd43b3743f7b6ce8807f8286e5909baa080 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 29 Aug 2023 20:07:03 -0500 Subject: [PATCH 02/62] Add branch for GitHub actions --- .github/workflows/cta_data_downloads.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index c36457d..a353a2f 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -4,6 +4,7 @@ on: push: branches: - 'automate-schedule-downloads' + - 'date-range-downloads' schedule: # Run every day at 12:30pm CST which is 5:30pm UTC From 640b30a8619e39beb7c5cee322a3bb8446bd83cc Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 29 Aug 2023 20:20:06 -0500 Subject: [PATCH 03/62] Fix typing error with List[str, str] --- scrape_data/cta_data_downloads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 26d5cba..6b0abe8 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -60,7 +60,7 @@ def save_csv_to_bucket(df: pd.DataFrame, filename: str) -> None: .put(Body=csv_buffer.getvalue()) -def 
save_sched_daily_summary(date_range: List[str, str] = None) -> None: +def save_sched_daily_summary(date_range: List[str] = None) -> None: if date_range is None: date_range = [today] print(f"No date range given. Using {today} only") From 26169adfc2cc25a5084dc2fadc8c1aa5ef2e2897 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 3 Sep 2023 19:08:22 -0500 Subject: [PATCH 04/62] Add bucket name argument to download_fileobj --- scrape_data/cta_data_downloads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 6b0abe8..8bc5b87 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -99,7 +99,7 @@ def create_route_summary(CTA_GTFS: sga.GTFSFeed) -> pd.DataFrame: for fname in found_list: zip_bytes = BytesIO() zip_bytes.seek(0) - client.download_fileobj(fname, zip_bytes) + client.download_fileobj(Bucket=sga.BUCKET, Key=fname, Fileobj=zip_bytes) zipfilesched = sga.zipfile.Zipfile(zip_bytes) fdate = extract_date(fname) s3zip_list.append( From 9c2a93d3a10b2d18362f406abd0e7c1ba99fa993 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 3 Sep 2023 19:21:06 -0500 Subject: [PATCH 05/62] Change Zipfile to ZipFile --- scrape_data/cta_data_downloads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 8bc5b87..599514c 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -100,7 +100,7 @@ def create_route_summary(CTA_GTFS: sga.GTFSFeed) -> pd.DataFrame: zip_bytes = BytesIO() zip_bytes.seek(0) client.download_fileobj(Bucket=sga.BUCKET, Key=fname, Fileobj=zip_bytes) - zipfilesched = sga.zipfile.Zipfile(zip_bytes) + zipfilesched = sga.zipfile.ZipFile(zip_bytes) fdate = extract_date(fname) s3zip_list.append( { From 48a1d537fc2a4a20bb2128c77f01c12031a7a1e9 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 3 Sep 2023 19:41:59 -0500 Subject: [PATCH 06/62] Add case for nothing to check in transitfeeds --- scrape_data/cta_data_downloads.py | 114 +++++++++++++++--------------- 1 file changed, 58 insertions(+), 56 deletions(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 599514c..55a570a 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -77,6 +77,11 @@ def save_sched_daily_summary(date_range: List[str] = None) -> None: csrt.BUCKET_PUBLIC, zip_filename_list ) + def confirm_saved_files(file_dict: dict) -> None: + for fname in ['csv_filenames', 'zip_filenames']: + print('Confirm that ' + ', '.join(file_dict[fname]) + + ' exist in bucket') + _ = keys(csrt.BUCKET_PUBLIC, file_dict[fname]) def extract_date(fname: str) -> str: return fname.split('_')[-1].split('.')[0] @@ -115,40 +120,7 @@ def create_route_summary(CTA_GTFS: sga.GTFSFeed) -> pd.DataFrame: 'zip_filenames': [gtfs['zip_filename'] for gtfs in s3zip_list], 'summaries': [create_route_summary(gtfs['zip']) for gtfs in s3zip_list], 'csv_filenames': [gtfs['csv_filename'] for gtfs in s3zip_list] - } - - transitfeeds_list = list(set(zip_filename_list).difference(set(found_list))) - print(', '.join(transitfeeds_list) + ' were not found in s3. Using transitfeeds.com') - transitfeeds_dates = [] - for fname in transitfeeds_list: - # Extract date from string after splitting on '_' and then '.' 
- fdate = extract_date(fname) - transitfeeds_dates.append(fdate) - - - transitfeeds_dates = sorted(transitfeeds_dates) - schedule_list = csrt.create_schedule_list(month=5, year=2022) - schedule_list_filtered = [ - s for s in schedule_list - if s['feed_start_date'] >= min(transitfeeds_dates) - and s['feed_start_date'] <= max(transitfeeds_dates) - ] - - - trip_summaries_transitfeeds_dict = {'zip_filenames': [], 'zips': [], 'csv_filenames': [], - 'summaries': []} - - for sched in schedule_list_filtered: - CTA_GTFS, zipfile_bytes_io = sga.download_zip(sched['schedule_version']) - trip_summaries_transitfeeds_dict['zip_filenames'].append( - f"transitfeeds_schedule_zipfiles_raw/{sched['schedule_version']}.zip" - ) - trip_summaries_transitfeeds_dict['zips'].append((CTA_GTFS, zipfile_bytes_io)) - trip_summaries_transitfeeds_dict['summaries'].append(create_route_summary(CTA_GTFS)) - trip_summaries_transitfeeds_dict['csv_filenames'].append( - f'schedule_summaries/daily_job/transitfeeds/' - f'transitfeeds_route_daily_summary_v{sched["schedule_version"]}.csv' - ) + } print(f'Saving cta schedule summary files in {date_range} to public bucket') for filename, summary in zip( @@ -156,31 +128,61 @@ def create_route_summary(CTA_GTFS: sga.GTFSFeed) -> pd.DataFrame: s3_route_daily_summary_dict['summaries'] ): save_csv_to_bucket(summary, filename=filename) + + confirm_saved_files(s3_route_daily_summary_dict) - print(f'Saving transitfeeds schedule summary files and zip files ' - f'in {date_range} to public bucket') - for csv_filename, summary, zip_filename, zipfile in zip( - trip_summaries_transitfeeds_dict['csv_filenames'], - trip_summaries_transitfeeds_dict['summaries'], - trip_summaries_transitfeeds_dict['zip_filenames'], - trip_summaries_transitfeeds_dict['zips'] - ): - save_csv_to_bucket(summary, filename=csv_filename) - # Save the zip file - client.upload_fileobj( - zipfile[1], - csrt.BUCKET_PUBLIC, - zip_filename - ) + transitfeeds_list = list(set(zip_filename_list).difference(set(found_list))) + if transitfeeds_list: + print(', '.join(transitfeeds_list) + ' were not found in s3. Using transitfeeds.com') + transitfeeds_dates = [] + for fname in transitfeeds_list: + # Extract date from string after splitting on '_' and then '.' 
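+            # e.g. 'cta_schedule_zipfiles_raw/google_transit_2023-05-20.zip' -> '2023-05-20'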
+ fdate = extract_date(fname) + transitfeeds_dates.append(fdate) + + + transitfeeds_dates = sorted(transitfeeds_dates) + schedule_list = csrt.create_schedule_list(month=5, year=2022) + schedule_list_filtered = [ + s for s in schedule_list + if s['feed_start_date'] >= min(transitfeeds_dates) + and s['feed_start_date'] <= max(transitfeeds_dates) + ] + - for fname in ['csv_filenames', 'zip_filenames']: - print('Confirm that ' + ', '.join(s3_route_daily_summary_dict[fname]) - + ' exist in bucket') - _ = keys(csrt.BUCKET_PUBLIC, s3_route_daily_summary_dict[fname]) + trip_summaries_transitfeeds_dict = {'zip_filenames': [], 'zips': [], 'csv_filenames': [], + 'summaries': []} - print('Confirm that ' + ', '.join(trip_summaries_transitfeeds_dict[fname]) - + ' exists in bucket') - _ = keys(csrt.BUCKET_PUBLIC, trip_summaries_transitfeeds_dict[fname]) + for sched in schedule_list_filtered: + CTA_GTFS, zipfile_bytes_io = sga.download_zip(sched['schedule_version']) + trip_summaries_transitfeeds_dict['zip_filenames'].append( + f"transitfeeds_schedule_zipfiles_raw/{sched['schedule_version']}.zip" + ) + trip_summaries_transitfeeds_dict['zips'].append((CTA_GTFS, zipfile_bytes_io)) + trip_summaries_transitfeeds_dict['summaries'].append(create_route_summary(CTA_GTFS)) + trip_summaries_transitfeeds_dict['csv_filenames'].append( + f'schedule_summaries/daily_job/transitfeeds/' + f'transitfeeds_route_daily_summary_v{sched["schedule_version"]}.csv' + ) + print( + f'Saving transitfeeds schedule summary files and zip files ' + f'in {date_range} to public bucket' + ) + for csv_filename, summary, zip_filename, zipfile in zip( + trip_summaries_transitfeeds_dict['csv_filenames'], + trip_summaries_transitfeeds_dict['summaries'], + trip_summaries_transitfeeds_dict['zip_filenames'], + trip_summaries_transitfeeds_dict['zips'] + ): + save_csv_to_bucket(summary, filename=csv_filename) + # Save the zip file + client.upload_fileobj( + zipfile[1], + csrt.BUCKET_PUBLIC, + zip_filename + ) + confirm_saved_files(trip_summaries_transitfeeds_dict) + def save_realtime_daily_summary(date: str = None) -> None: if date is None: From ec95194cb3dda8801a8299fe10df2346af406641 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 3 Sep 2023 20:23:09 -0500 Subject: [PATCH 07/62] Test with date range --- .github/workflows/cta_data_downloads.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index a353a2f..9febec8 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -46,10 +46,13 @@ jobs: python-version: ${{ env.PYTHON_VERSION }} - name: 'Save schedule summaries' + # Test with no date and with date range run: | pip install -r requirements.txt python -c 'from scrape_data.cta_data_downloads import save_sched_daily_summary; \ save_sched_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + python -c 'from scrape_data.cta_data_downloads import save_sched_daily_summary; \ + save_sched_daily_summary(["2023-05-02", "2023-08-02"])' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY save-realtime-daily-summary: From 98cac8e813c212b1f17361cd5352d555bf051c18 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 4 Sep 2023 19:08:44 -0500 Subject: [PATCH 08/62] Add 2022 data from transitfeeds to s3 --- .github/workflows/cta_data_downloads.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 9febec8..1253710 100644 
--- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -52,7 +52,8 @@ jobs: python -c 'from scrape_data.cta_data_downloads import save_sched_daily_summary; \ save_sched_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY python -c 'from scrape_data.cta_data_downloads import save_sched_daily_summary; \ - save_sched_daily_summary(["2023-05-02", "2023-08-02"])' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + save_sched_daily_summary(["2022-05-20", "2023-05-20"])' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + save-realtime-daily-summary: From 3ea96303200ad0e73238b86f42600472f69c2ca8 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 4 Sep 2023 20:18:21 -0500 Subject: [PATCH 09/62] Shorten date range for testing --- .github/workflows/cta_data_downloads.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 1253710..57cb15d 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -52,7 +52,7 @@ jobs: python -c 'from scrape_data.cta_data_downloads import save_sched_daily_summary; \ save_sched_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY python -c 'from scrape_data.cta_data_downloads import save_sched_daily_summary; \ - save_sched_daily_summary(["2022-05-20", "2023-05-20"])' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + save_sched_daily_summary(["2023-05-20", "2023-08-20"])' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY From 0e6ceb98fa08f9bf0677caf0ee19f0b611d4648b Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 17 Sep 2023 20:56:24 -0500 Subject: [PATCH 10/62] Create data.json --- .github/workflows/cta_data_downloads.yml | 22 ++- data_analysis/compare_scheduled_and_rt.py | 46 ++++-- data_analysis/plots.py | 112 ++++++++++---- data_analysis/static_gtfs_analysis.py | 2 +- scrape_data/cta_data_downloads.py | 171 ++++++++++++++++------ 5 files changed, 265 insertions(+), 88 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 57cb15d..240abac 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -5,6 +5,7 @@ on: branches: - 'automate-schedule-downloads' - 'date-range-downloads' + - 'compare-realtime-schedule' schedule: # Run every day at 12:30pm CST which is 5:30pm UTC @@ -73,4 +74,23 @@ jobs: python -c 'from scrape_data.cta_data_downloads import save_realtime_daily_summary; \ save_realtime_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY - \ No newline at end of file + + + save-frontend-map-json: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: 'Save data.json for frontend' + + run: | + pip install -r requirements.txt + + python -c 'from scrape_data.cta_data_downloads import compare_realtime_sched; \ + compare_realtime_sched()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + \ No newline at end of file diff --git a/data_analysis/compare_scheduled_and_rt.py b/data_analysis/compare_scheduled_and_rt.py index 7ff0652..89afaf1 100644 --- a/data_analysis/compare_scheduled_and_rt.py +++ b/data_analysis/compare_scheduled_and_rt.py @@ -314,20 +314,12 @@ def build_summary( ) return summary + +def create_GTFS_data_list(schedule_feeds: List[dict] = None) -> dict: + if schedule_feeds is None: + schedule_feeds = create_schedule_list(month=5, year=2022) -def main(freq: str = 
'D') -> Tuple[List[dict],pd.DataFrame, pd.DataFrame]: - """Calculate the summary by route and day across multiple schedule versions - - Args: - freq (str): Frequency of aggregation. Defaults to Daily. - Returns: - pd.DataFrame: A DataFrame of every day in the specified data with - scheduled and observed count of trips. - pd.DataFrame: A DataFrame summary across - versioned schedule comparisons. - """ - schedule_feeds = create_schedule_list(month=5, year=2022) - + GTFS_data_list = [] schedule_data_list = [] pbar = tqdm(schedule_feeds) for feed in pbar: @@ -340,15 +332,17 @@ def main(freq: str = 'D') -> Tuple[List[dict],pd.DataFrame, pd.DataFrame]: f"\nDownloading zip file for schedule version " f"{schedule_version}" ) - CTA_GTFS = static_gtfs_analysis.download_zip(schedule_version) + CTA_GTFS, _ = static_gtfs_analysis.download_zip(schedule_version) logger.info("\nExtracting data") data = static_gtfs_analysis.GTFSFeed.extract_data( CTA_GTFS, version_id=schedule_version ) data = static_gtfs_analysis.format_dates_hours(data) + GTFS_data_list.append({'fname': schedule_version, 'data': data}) logger.info("\nSummarizing trip data") + trip_summary = static_gtfs_analysis.make_trip_summary(data, pendulum.from_format(feed['feed_start_date'], 'YYYY-MM-DD'), pendulum.from_format(feed['feed_end_date'], 'YYYY-MM-DD')) @@ -360,8 +354,30 @@ def main(freq: str = 'D') -> Tuple[List[dict],pd.DataFrame, pd.DataFrame]: schedule_data_list.append( {"schedule_version": schedule_version, - "data": route_daily_summary} + "data": route_daily_summary} ) + + + return { + 'GTFS_data_list': GTFS_data_list, + 'schedule_data_list': schedule_data_list + } + +def main(freq: str = 'D', schedule_feeds: List[dict] = None) -> Tuple[List[dict],pd.DataFrame, pd.DataFrame]: + """Calculate the summary by route and day across multiple schedule versions + + Args: + freq (str): Frequency of aggregation. Defaults to Daily. + schedule_feeds (List[dict]): List of dictionaries with the keys + 'schedule_version', 'feed_start_date', and 'feed_end_date'. + Returns: + pd.DataFrame: A DataFrame of every day in the specified data with + scheduled and observed count of trips. + pd.DataFrame: A DataFrame summary across + versioned schedule comparisons. 
+ """ + schedule_data_list = create_GTFS_data_list(schedule_feeds)['schedule_data_list'] + agg_info = AggInfo(freq=freq) combined_long, combined_grouped = combine_real_time_rt_comparison( schedule_feeds, diff --git a/data_analysis/plots.py b/data_analysis/plots.py index c1704c3..d0b993f 100644 --- a/data_analysis/plots.py +++ b/data_analysis/plots.py @@ -1,5 +1,7 @@ import os +import json from datetime import datetime +import pendulum from pathlib import Path from typing import List, Union @@ -565,19 +567,7 @@ def plot_and_save( summary_kwargs=summary_kwargs, save_name=save_name, ) - - path_name = create_save_path(save_name, DATA_PATH) - # Take only the columns related to summary_kwargs['column'] - # and those used in the map - first_cols = summary_gdf_geo.columns[:2].tolist() - last_cols = summary_gdf_geo.columns[-10:].to_list() - kwargs_cols = summary_gdf_geo.columns[ - summary_gdf_geo.columns.str.startswith(summary_kwargs["column"]) - ].tolist() - cols = first_cols + kwargs_cols + last_cols - summary_gdf_geo[cols].to_file(f"{path_name}.json", driver="GeoJSON") - summary_gdf_geo[cols].to_html(f"{path_name}_table.html", index=False) - + save_json(summary_gdf_geo, summary_kwargs, save_name) def calculate_trips_per_rider( merged_df: pd.DataFrame, num_riders: int = 1000 @@ -963,20 +953,18 @@ def run_mvp() -> None: make_ward_maps(summary_df_wk, start_date, end_date) - -def main(day_type: str = None) -> None: - """Generate maps of all routes, top 10 best routes, - top 10 worst routes, and ridership - - Args: - day_type (str, optional): day_type to filter by. Defaults to None. - """ +def create_summary_gdf_geo( + combined_long_df: pd.DataFrame, + summary_df: pd.DataFrame, + day_type: str = None) -> gpd.GeoDataFrame: + logger.info("Creating GeoDataFrame") gdf = static_gtfs_analysis.main() logger.info("Getting latest real-time and schedule comparison data") - - combined_long_df, summary_df = compare_scheduled_and_rt.main(freq="D") + combined_long_df = combined_long_df.loc[combined_long_df['route_id'] != '74'] + summary_df = summary_df.loc[summary_df['route_id'] != '74'] + gdf = gdf.loc[gdf.route_id != "74"] if day_type is not None: summary_df = filter_day_type(summary_df, day_type=day_type) @@ -1002,13 +990,10 @@ def main(day_type: str = None) -> None: summary_gdf = summary_df_mean.merge(gdf, how="right", on="route_id") - summary_gdf_geo = gpd.GeoDataFrame(summary_gdf) - combined_long_df.loc[:, "date"] = pd.to_datetime(combined_long_df.loc[:, "date"]) start_date = combined_long_df["date"].min().strftime("%Y-%m-%d") - end_date = combined_long_df["date"].max().strftime("%Y-%m-%d") - + ridership_by_rte_date = fetch_ridership_data() ridership_end_date = ridership_by_rte_date["date"].max().strftime("%Y-%m-%d") @@ -1034,8 +1019,79 @@ def main(day_type: str = None) -> None: summary_gdf = summary_df_mean.merge(gdf, how="right", on="route_id") - summary_gdf_geo = gpd.GeoDataFrame(summary_gdf) + return gpd.GeoDataFrame(summary_gdf) + + +def save_json(summary_gdf_geo: gpd.GeoDataFrame, summary_kwargs: dict, save_name: str) -> None: + """Save JSON file from GeoDataFrame + + Args: + summary_gdf_geo (gpd.GeoDataFrame): output of create_summary_gdf_geo function + summary_kwargs (dict): A dictionary with the kwargs passed to geopandas explore. 
Requires + a 'column' key at minimum + save_name (str): name of the json output file + """ + path_name = create_save_path(save_name, DATA_PATH) + # Take only the columns related to summary_kwargs['column'] + # and those used in the map + first_cols = summary_gdf_geo.columns[:2].tolist() + last_cols = summary_gdf_geo.columns[-10:].to_list() + kwargs_cols = summary_gdf_geo.columns[ + summary_gdf_geo.columns.str.startswith(summary_kwargs["column"]) + ].tolist() + cols = first_cols + kwargs_cols + last_cols + summary_gdf_geo[cols].to_file(f"{path_name}.json", driver="GeoJSON") + summary_gdf_geo[cols].to_html(f"{path_name}_table.html", index=False) + + +def create_frontend_json(json_file: str, start_date: str, end_date: str, save_path: str = None, save: bool = True) -> None: + """Create the data.json file that is used for the map at ghostbuses.com + + Args: + json_file (str): name of the json input file + start_date (str): start date of the data in YYYY-MM-DD format + end_date (str): end date of the data in YYYY-MM-DD format + save_path (str, optional): The path to save the output file. Defaults to None. + If save is True, this argument is required. + save (bool, optional): Whether to save the JSON output. Defaults to True. + + Raises: + ValueError: If save is True. The save_path argument cannot be None. + """ + with open(DATA_PATH / json_file) as json_data: + data = json.load(json_data) + start_dt = pendulum.parse(start_date).format('MMMM D, YYYY') + end_dt = pendulum.parse(end_date).format('MMMM D, YYYY') + data['dates'] = {'start': start_dt, 'end': end_dt} + if save: + if save_path is None: + raise ValueError('You must specify a location to save the json file') + with open(save_path, 'w') as output_json: + json.dump(data, output_json) + else: + return json.dumps(data, indent=4) + +def main(day_type: str = None) -> None: + """Generate maps of all routes, top 10 best routes, + top 10 worst routes, and ridership + + Args: + day_type (str, optional): day_type to filter by. Defaults to None. 
+ """ + logger.info("Getting latest real-time and schedule comparison data") + + combined_long_df, summary_df = compare_scheduled_and_rt.main(freq="D") + today = datetime.now().strftime('%Y-%m-%d') + combined_long_df.to_csv(DATA_PATH / f'combined_long_df_{today}.csv', index=False) + summary_df.to_csv(DATA_PATH / f'summary_df_{today}.csv', index=False) + + combined_long_df.loc[:, "date"] = pd.to_datetime(combined_long_df.loc[:, "date"]) + + start_date = combined_long_df["date"].min().strftime("%Y-%m-%d") + end_date = combined_long_df["date"].max().strftime("%Y-%m-%d") + summary_gdf_geo = create_summary_gdf_geo(combined_long_df, summary_df, day_type) + make_all_maps( summary_gdf_geo=summary_gdf_geo, start_date=start_date, end_date=end_date ) diff --git a/data_analysis/static_gtfs_analysis.py b/data_analysis/static_gtfs_analysis.py index 32102c4..fb8060a 100644 --- a/data_analysis/static_gtfs_analysis.py +++ b/data_analysis/static_gtfs_analysis.py @@ -385,7 +385,7 @@ def download_extract_format(version_id: str = None) -> GTFSFeed: if version_id is None: CTA_GTFS, _ = download_cta_zip() else: - CTA_GTFS = download_zip(version_id) + CTA_GTFS, _ = download_zip(version_id) data = GTFSFeed.extract_data(CTA_GTFS, version_id=version_id) data = format_dates_hours(data) return data diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 55a570a..1e38d2d 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -2,10 +2,11 @@ import sys import data_analysis.static_gtfs_analysis as sga import data_analysis.compare_scheduled_and_rt as csrt +import data_analysis.plots as plots import pendulum from io import StringIO, BytesIO import pandas as pd -from typing import List +import typing ACCESS_KEY = sys.argv[1] SECRET_KEY = sys.argv[2] @@ -60,45 +61,144 @@ def save_csv_to_bucket(df: pd.DataFrame, filename: str) -> None: .put(Body=csv_buffer.getvalue()) -def save_sched_daily_summary(date_range: List[str] = None) -> None: - if date_range is None: - date_range = [today] - print(f"No date range given. Using {today} only") - +def find_s3_zipfiles(date_range: typing.List[str]) -> typing.Tuple[list]: start_date = pendulum.parse(min(date_range)) end_date = pendulum.parse(max(date_range)) period = pendulum.period(start_date, end_date) full_date_range = [dt.to_date_string() for dt in period.range('days')] zip_filename_list = [f'cta_schedule_zipfiles_raw/google_transit_{date}.zip' for date in full_date_range] - - # Check for files in bucket. found_list = keys( csrt.BUCKET_PUBLIC, zip_filename_list ) - def confirm_saved_files(file_dict: dict) -> None: - for fname in ['csv_filenames', 'zip_filenames']: - print('Confirm that ' + ', '.join(file_dict[fname]) - + ' exist in bucket') - _ = keys(csrt.BUCKET_PUBLIC, file_dict[fname]) + return zip_filename_list, found_list + + +def find_transitfeeds_zipfiles( + full_list: typing.List[str], + found_list: typing.List[str]) -> typing.List[str]: - def extract_date(fname: str) -> str: - return fname.split('_')[-1].split('.')[0] + transitfeeds_list = list(set(full_list).difference(set(found_list))) + if transitfeeds_list: + print(', '.join(transitfeeds_list) + ' were not found in s3. Using transitfeeds.com') + transitfeeds_dates = [] + for fname in transitfeeds_list: + # Extract date from string after splitting on '_' and then '.' 
+ fdate = extract_date(fname) + transitfeeds_dates.append(fdate) + + + transitfeeds_dates = sorted(transitfeeds_dates) + schedule_list = csrt.create_schedule_list(month=5, year=2022) + schedule_list_filtered = [ + s for s in schedule_list + if s['feed_start_date'] >= min(transitfeeds_dates) + and s['feed_start_date'] <= max(transitfeeds_dates) + ] + return schedule_list_filtered + else: + print("All records found in s3 from transitchicago.com") + return full_list - def create_route_summary(CTA_GTFS: sga.GTFSFeed) -> pd.DataFrame: - data = sga.GTFSFeed.extract_data(CTA_GTFS) + +def compare_realtime_sched( + date_range: typing.List[str] = ['2022-05-20', today]) -> None: + + zip_filename_list, found_list = find_s3_zipfiles(date_range=date_range) + schedule_list_filtered = find_transitfeeds_zipfiles(zip_filename_list, found_list) + # Extract data from s3 zipfiles + s3_data_list = [] + for fname in found_list: + zip_bytes = BytesIO() + zip_bytes.seek(0) + client.download_fileobj(Bucket=sga.BUCKET, Key=fname, Fileobj=zip_bytes) + zipfilesched = sga.zipfile.ZipFile(zip_bytes) + data = sga.GTFSFeed.extract_data(zipfilesched) data = sga.format_dates_hours(data) - trip_summary = sga.make_trip_summary(data) + s3_data_list.append({'fname': fname, 'data': data}) + + transit_feeds_GTFS_data_list = csrt.create_GTFS_data_list(schedule_list_filtered)['GTFS_data_list'] + joined_list = [*s3_data_list, *transit_feeds_GTFS_data_list] + + + # Convert from list of dictionaries to dictionary with list values + joined_dict = pd.DataFrame(joined_list).to_dict(orient='list') + schedule_data_list = [{'schedule_version': fname, 'data': create_route_summary(data)} + for fname, data in joined_dict.items()] - route_daily_summary = ( - sga.summarize_date_rt(trip_summary) - ) + agg_info = csrt.AggInfo() + print('Creating combined_long_df and summary_df') + combined_long_df, summary_df = csrt.combine_real_time_rt_comparison( + schedule_feeds=schedule_list_filtered, + schedule_data_list=schedule_data_list, + agg_info=agg_info + ) - route_daily_summary['date'] = route_daily_summary['date'].astype(str) - route_daily_summary_today = route_daily_summary.loc[route_daily_summary['date'].isin(date_range)] - return route_daily_summary_today + day_type = 'wk' + start_date = combined_long_df["date"].min().strftime("%Y-%m-%d") + end_date = combined_long_df["date"].max().strftime("%Y-%m-%d") + + summary_gdf_geo = plots.create_summary_gdf_geo(combined_long_df, summary_df, day_type=day_type) + summary_kwargs = {'column': 'ratio'} + save_name = f"all_routes_{start_date}_to_{end_date}_{day_type}" + + plots.save_json( + summary_gdf_geo=summary_gdf_geo, + summary_kwargs=summary_kwargs, + save_name=save_name + ) + s3_data_json_path = 'frontend_data_files/data.json' + print(f'Saving data.json to {s3_data_json_path}') + + data_json = plots.create_frontend_json( + json_file=f'{save_name}.json', + start_date=start_date, + end_date=end_date, + save=False + ) + # Save data.json to s3 for now. This will eventually live in the frontend repo. 
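+    # create_frontend_json(save=False) returns the serialized JSON string,
+    # which put() accepts directly as the object Body.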
+ s3.Object( + csrt.BUCKET_PUBLIC, + f'{s3_data_json_path}')\ + .put(Body=data_json) + + _ = keys(csrt.BUCKET_PUBLIC, ['data.json']) + + +def confirm_saved_files(file_dict: dict) -> None: + for fname in ['csv_filenames', 'zip_filenames']: + print('Confirm that ' + ', '.join(file_dict[fname]) + + ' exist in bucket') + _ = keys(csrt.BUCKET_PUBLIC, file_dict[fname]) + + +def extract_date(fname: str) -> str: + return fname.split('_')[-1].split('.')[0] + + +def create_route_summary(CTA_GTFS: sga.GTFSFeed) -> pd.DataFrame: + data = sga.GTFSFeed.extract_data(CTA_GTFS) + data = sga.format_dates_hours(data) + trip_summary = sga.make_trip_summary(data) + + route_daily_summary = ( + sga.summarize_date_rt(trip_summary) + ) + + route_daily_summary['date'] = route_daily_summary['date'].astype(str) + route_daily_summary_today = route_daily_summary.loc[route_daily_summary['date'].isin(date_range)] + return route_daily_summary_today + + +def save_sched_daily_summary(date_range: typing.List[str] = None) -> None: + if date_range is None: + date_range = [today] + print(f"No date range given. Using {today} only") + + zip_filename_list, found_list = find_s3_zipfiles(date_range=date_range) + print('Using zipfiles found in public bucket') s3zip_list = [] for fname in found_list: @@ -130,26 +230,10 @@ def create_route_summary(CTA_GTFS: sga.GTFSFeed) -> pd.DataFrame: save_csv_to_bucket(summary, filename=filename) confirm_saved_files(s3_route_daily_summary_dict) - - transitfeeds_list = list(set(zip_filename_list).difference(set(found_list))) - if transitfeeds_list: - print(', '.join(transitfeeds_list) + ' were not found in s3. Using transitfeeds.com') - transitfeeds_dates = [] - for fname in transitfeeds_list: - # Extract date from string after splitting on '_' and then '.' - fdate = extract_date(fname) - transitfeeds_dates.append(fdate) - - - transitfeeds_dates = sorted(transitfeeds_dates) - schedule_list = csrt.create_schedule_list(month=5, year=2022) - schedule_list_filtered = [ - s for s in schedule_list - if s['feed_start_date'] >= min(transitfeeds_dates) - and s['feed_start_date'] <= max(transitfeeds_dates) - ] + schedule_list_filtered = find_transitfeeds_zipfiles(zip_filename_list, found_list) - + # Only download transitfeeds for files not found in s3. 
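+    # If every date in the range was already mirrored to s3, found_list matches
+    # zip_filename_list exactly and the transitfeeds download below is skipped.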
+ if set(found_list) != set(zip_filename_list): trip_summaries_transitfeeds_dict = {'zip_filenames': [], 'zips': [], 'csv_filenames': [], 'summaries': []} @@ -209,6 +293,7 @@ def save_realtime_daily_summary(date: str = None) -> None: print(f'Confirm that {filename} exists in bucket') _ = keys(csrt.BUCKET_PUBLIC, [filename]) + # https://stackoverflow.com/questions/30249069/listing-contents-of-a-bucket-with-boto3 def keys(bucket_name: str, filenames: list, prefix: str='/', delimiter: str='/', From 4eb490fae32edb9d265a55c1ba62057b321ff7d0 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 17 Sep 2023 21:06:54 -0500 Subject: [PATCH 11/62] add dependency --- .github/workflows/cta_data_downloads.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 240abac..70fcfa1 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -78,7 +78,7 @@ jobs: save-frontend-map-json: runs-on: ubuntu-latest - + needs: [save-realtime-daily-summary, save-schedule-daily-summary] steps: - uses: actions/checkout@v3 From 984de272632baab86461985ea624d49ab053b3ad Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 18 Sep 2023 19:13:53 -0500 Subject: [PATCH 12/62] Add date_range argument --- scrape_data/cta_data_downloads.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 1e38d2d..3923876 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -124,7 +124,7 @@ def compare_realtime_sched( # Convert from list of dictionaries to dictionary with list values joined_dict = pd.DataFrame(joined_list).to_dict(orient='list') - schedule_data_list = [{'schedule_version': fname, 'data': create_route_summary(data)} + schedule_data_list = [{'schedule_version': fname, 'data': create_route_summary(data, date_range)} for fname, data in joined_dict.items()] agg_info = csrt.AggInfo() @@ -178,7 +178,7 @@ def extract_date(fname: str) -> str: return fname.split('_')[-1].split('.')[0] -def create_route_summary(CTA_GTFS: sga.GTFSFeed) -> pd.DataFrame: +def create_route_summary(CTA_GTFS: sga.GTFSFeed, date_range: typing.List[str]) -> pd.DataFrame: data = sga.GTFSFeed.extract_data(CTA_GTFS) data = sga.format_dates_hours(data) trip_summary = sga.make_trip_summary(data) @@ -218,7 +218,7 @@ def save_sched_daily_summary(date_range: typing.List[str] = None) -> None: s3_route_daily_summary_dict = { 'zip_filenames': [gtfs['zip_filename'] for gtfs in s3zip_list], - 'summaries': [create_route_summary(gtfs['zip']) for gtfs in s3zip_list], + 'summaries': [create_route_summary(gtfs['zip'], date_range) for gtfs in s3zip_list], 'csv_filenames': [gtfs['csv_filename'] for gtfs in s3zip_list] } @@ -243,7 +243,7 @@ def save_sched_daily_summary(date_range: typing.List[str] = None) -> None: f"transitfeeds_schedule_zipfiles_raw/{sched['schedule_version']}.zip" ) trip_summaries_transitfeeds_dict['zips'].append((CTA_GTFS, zipfile_bytes_io)) - trip_summaries_transitfeeds_dict['summaries'].append(create_route_summary(CTA_GTFS)) + trip_summaries_transitfeeds_dict['summaries'].append(create_route_summary(CTA_GTFS, date_range)) trip_summaries_transitfeeds_dict['csv_filenames'].append( f'schedule_summaries/daily_job/transitfeeds/' f'transitfeeds_route_daily_summary_v{sched["schedule_version"]}.csv' From c671d616fdfdc135d262072f643b012df90dd9f9 Mon Sep 17 00:00:00 2001 From: dcjohnson24 
Date: Tue, 19 Sep 2023 14:06:16 -0500
Subject: [PATCH 13/62] Remove date formatting. Handled by React

---
 data_analysis/plots.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/data_analysis/plots.py b/data_analysis/plots.py
index d0b993f..695b58b 100644
--- a/data_analysis/plots.py
+++ b/data_analysis/plots.py
@@ -1060,9 +1060,7 @@ def create_frontend_json(json_file: str, start_date: str, end_date: str, save_pa
     """
     with open(DATA_PATH / json_file) as json_data:
         data = json.load(json_data)
-    start_dt = pendulum.parse(start_date).format('MMMM D, YYYY')
-    end_dt = pendulum.parse(end_date).format('MMMM D, YYYY')
-    data['dates'] = {'start': start_dt, 'end': end_dt}
+    data['dates'] = {'start': start_date, 'end': end_date}
     if save:
         if save_path is None:
             raise ValueError('You must specify a location to save the json file')

From fd7a87dcfb9f9c0abeae94f2f7a52fdd8370d4e8 Mon Sep 17 00:00:00 2001
From: dcjohnson24
Date: Tue, 19 Sep 2023 17:57:35 -0500
Subject: [PATCH 14/62] Add cta_download argument to create_GTFS_data_list

---
 data_analysis/compare_scheduled_and_rt.py | 13 ++++++++-----
 data_analysis/static_gtfs_analysis.py     |  2 ++
 scrape_data/cta_data_downloads.py         |  2 +-
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/data_analysis/compare_scheduled_and_rt.py b/data_analysis/compare_scheduled_and_rt.py
index 89afaf1..f7c3696 100644
--- a/data_analysis/compare_scheduled_and_rt.py
+++ b/data_analysis/compare_scheduled_and_rt.py
@@ -315,7 +315,7 @@ def build_summary(
     return summary
 
 
-def create_GTFS_data_list(schedule_feeds: List[dict] = None) -> dict:
+def create_GTFS_data_list(schedule_feeds: List[dict] = None, cta_download: bool = True) -> dict:
     if schedule_feeds is None:
         schedule_feeds = create_schedule_list(month=5, year=2022)
 
@@ -336,7 +336,8 @@ def create_GTFS_data_list(schedule_feeds: List[dict] = None) -> dict:
         logger.info("\nExtracting data")
         data = static_gtfs_analysis.GTFSFeed.extract_data(
             CTA_GTFS,
-            version_id=schedule_version
+            version_id=schedule_version,
+            cta_download=cta_download
         )
         data = static_gtfs_analysis.format_dates_hours(data)
         GTFS_data_list.append({'fname': schedule_version, 'data': data})
@@ -363,20 +364,22 @@ def create_GTFS_data_list(schedule_feeds: List[dict] = None) -> dict:
         'schedule_data_list': schedule_data_list
     }
 
-def main(freq: str = 'D', schedule_feeds: List[dict] = None) -> Tuple[List[dict],pd.DataFrame, pd.DataFrame]:
+def main(freq: str = 'D', schedule_feeds: List[dict] = None,
+         cta_download: bool = True) -> Tuple[List[dict],pd.DataFrame, pd.DataFrame]:
     """Calculate the summary by route and day across multiple schedule versions
 
     Args:
         freq (str): Frequency of aggregation. Defaults to Daily.
         schedule_feeds (List[dict]): List of dictionaries with the keys
             'schedule_version', 'feed_start_date', and 'feed_end_date'.
+        cta_download (bool): whether data is coming from the CTA directly (transitchicago.com)
     Returns:
         pd.DataFrame: A DataFrame of every day in the specified data with
-        scheduled and observed count of trips.
+            scheduled and observed count of trips.
         pd.DataFrame: A DataFrame summary across
            versioned schedule comparisons. 
""" - schedule_data_list = create_GTFS_data_list(schedule_feeds)['schedule_data_list'] + schedule_data_list = create_GTFS_data_list(schedule_feeds, cta_download=cta_download)['schedule_data_list'] agg_info = AggInfo(freq=freq) combined_long, combined_grouped = combine_real_time_rt_comparison( diff --git a/data_analysis/static_gtfs_analysis.py b/data_analysis/static_gtfs_analysis.py index fb8060a..52f220f 100644 --- a/data_analysis/static_gtfs_analysis.py +++ b/data_analysis/static_gtfs_analysis.py @@ -62,6 +62,8 @@ def extract_data(cls, gtfs_zipfile: zipfile.ZipFile, 165/20220718/download or https://www.transitchicago.com/downloads/sch_data/ version_id (str, optional): The schedule version in use. Defaults to None. + cta_download (bool): whether data is coming from the CTA directy (transitchicago.com) + Returns: GTFSFeed: A GTFSFeed object containing multiple DataFrames diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 3923876..1e83f48 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -118,7 +118,7 @@ def compare_realtime_sched( data = sga.format_dates_hours(data) s3_data_list.append({'fname': fname, 'data': data}) - transit_feeds_GTFS_data_list = csrt.create_GTFS_data_list(schedule_list_filtered)['GTFS_data_list'] + transit_feeds_GTFS_data_list = csrt.create_GTFS_data_list(schedule_list_filtered, cta_download=False)['GTFS_data_list'] joined_list = [*s3_data_list, *transit_feeds_GTFS_data_list] From 14c8756b96c4ab521291ba298f4e9ea5c14135af Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 19 Sep 2023 18:45:07 -0500 Subject: [PATCH 15/62] Turn of fail-fast --- .github/workflows/cta_data_downloads.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 70fcfa1..d4c04bb 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -79,6 +79,8 @@ jobs: save-frontend-map-json: runs-on: ubuntu-latest needs: [save-realtime-daily-summary, save-schedule-daily-summary] + strategy: + fail-fast: false steps: - uses: actions/checkout@v3 From e6b4d5325fb344f48b37a37addec4121a8ec372b Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 24 Sep 2023 19:38:44 -0500 Subject: [PATCH 16/62] save transitfeeds zipfiles to s3 --- .github/workflows/cta_data_downloads.yml | 91 ++++++++++++++---------- scrape_data/cta_data_downloads.py | 20 ++++++ 2 files changed, 74 insertions(+), 37 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index d4c04bb..707f61a 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -34,9 +34,8 @@ jobs: python -c 'from scrape_data.cta_data_downloads import save_cta_zip; \ save_cta_zip()' \ $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY - - - save-schedule-daily-summary: + + download-transitfeeds-data: runs-on: ubuntu-latest steps: @@ -45,54 +44,72 @@ jobs: - uses: actions/setup-python@v4 with: python-version: ${{ env.PYTHON_VERSION }} - - - name: 'Save schedule summaries' - # Test with no date and with date range + + - name: Download and save transitfeeds.com schedule data + run: | pip install -r requirements.txt - python -c 'from scrape_data.cta_data_downloads import save_sched_daily_summary; \ - save_sched_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY - python -c 'from scrape_data.cta_data_downloads import save_sched_daily_summary; \ - 
save_sched_daily_summary(["2023-05-20", "2023-08-20"])' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + python -c 'from scrape_data.cta_data_downloads import save_transitfeeds_zip(); \ + save_transitfeeds_zip()' \ + $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + + # save-schedule-daily-summary: + # runs-on: ubuntu-latest + + # steps: + # - uses: actions/checkout@v3 + + # - uses: actions/setup-python@v4 + # with: + # python-version: ${{ env.PYTHON_VERSION }} + + # - name: 'Save schedule summaries' + # # Test with no date and with date range + # run: | + # pip install -r requirements.txt + # python -c 'from scrape_data.cta_data_downloads import save_sched_daily_summary; \ + # save_sched_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + # python -c 'from scrape_data.cta_data_downloads import save_sched_daily_summary; \ + # save_sched_daily_summary(["2023-05-20", "2023-08-20"])' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY - save-realtime-daily-summary: - runs-on: ubuntu-latest + # save-realtime-daily-summary: + # runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 + # steps: + # - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: - python-version: ${{ env.PYTHON_VERSION }} + # - uses: actions/setup-python@v4 + # with: + # python-version: ${{ env.PYTHON_VERSION }} - - name: 'Save realtime summaries' + # - name: 'Save realtime summaries' - run: | - pip install -r requirements.txt + # run: | + # pip install -r requirements.txt - python -c 'from scrape_data.cta_data_downloads import save_realtime_daily_summary; \ - save_realtime_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + # python -c 'from scrape_data.cta_data_downloads import save_realtime_daily_summary; \ + # save_realtime_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY - save-frontend-map-json: - runs-on: ubuntu-latest - needs: [save-realtime-daily-summary, save-schedule-daily-summary] - strategy: - fail-fast: false - steps: - - uses: actions/checkout@v3 + # save-frontend-map-json: + # runs-on: ubuntu-latest + # needs: [save-realtime-daily-summary, save-schedule-daily-summary] + # strategy: + # fail-fast: false + # steps: + # - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: - python-version: ${{ env.PYTHON_VERSION }} + # - uses: actions/setup-python@v4 + # with: + # python-version: ${{ env.PYTHON_VERSION }} - - name: 'Save data.json for frontend' + # - name: 'Save data.json for frontend' - run: | - pip install -r requirements.txt + # run: | + # pip install -r requirements.txt - python -c 'from scrape_data.cta_data_downloads import compare_realtime_sched; \ - compare_realtime_sched()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + # python -c 'from scrape_data.cta_data_downloads import compare_realtime_sched; \ + # compare_realtime_sched()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY \ No newline at end of file diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 1e83f48..e86ee43 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -43,6 +43,26 @@ def save_cta_zip() -> None: keys('chn-ghost-buses-public', [filename]) +def save_transitfeeds_zip(date_range: typing.List[str] = ['2022-05-20', today]) -> None: + schedule_list = csrt.create_schedule_list(month=5, year=2022) + schedule_list_filtered = [ + s for s in schedule_list + if s['feed_start_date'] >= min(date_range) + and s['feed_start_date'] <= max(date_range) + ] + for schedule_dict in schedule_list_filtered: + val = 
schedule_dict['schedule_version'] + _, zipfile_bytes_io = sga.download_zip(version_id=val) + zip_filename = f"transitfeeds_schedule_zipfiles_raw/{val}.zip" + client.upload_fileobj( + zipfile_bytes_io, + csrt.BUCKET_PUBLIC, + zip_filename + ) + + + + def save_csv_to_bucket(df: pd.DataFrame, filename: str) -> None: """Save pandas DataFrame to csv in s3 From 2ce5458d81b3c1ba7719bd371a3b0cfc120997d6 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 24 Sep 2023 20:24:41 -0500 Subject: [PATCH 17/62] Fix syntax error --- .github/workflows/cta_data_downloads.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 707f61a..25978bf 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -49,7 +49,7 @@ jobs: run: | pip install -r requirements.txt - python -c 'from scrape_data.cta_data_downloads import save_transitfeeds_zip(); \ + python -c 'from scrape_data.cta_data_downloads import save_transitfeeds_zip; \ save_transitfeeds_zip()' \ $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY From 43e3945f0d75c14361c18b5fc124f1fad442e180 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 24 Sep 2023 20:35:22 -0500 Subject: [PATCH 18/62] confirm files exist --- scrape_data/cta_data_downloads.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index e86ee43..1ef704b 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -50,16 +50,19 @@ def save_transitfeeds_zip(date_range: typing.List[str] = ['2022-05-20', today]) if s['feed_start_date'] >= min(date_range) and s['feed_start_date'] <= max(date_range) ] + filename_list = [] for schedule_dict in schedule_list_filtered: val = schedule_dict['schedule_version'] _, zipfile_bytes_io = sga.download_zip(version_id=val) zip_filename = f"transitfeeds_schedule_zipfiles_raw/{val}.zip" + filename_list.append(zip_filename) client.upload_fileobj( zipfile_bytes_io, csrt.BUCKET_PUBLIC, zip_filename ) - + print(f'Confirm that files exist in s3') + keys('chn-ghost-buses-public', [filename_list]) From 0a9a7061e392abf5287c584163df50e74b5284d4 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 24 Sep 2023 20:56:43 -0500 Subject: [PATCH 19/62] call seek on BytesIO --- scrape_data/cta_data_downloads.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 1ef704b..f5570b6 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -56,6 +56,7 @@ def save_transitfeeds_zip(date_range: typing.List[str] = ['2022-05-20', today]) _, zipfile_bytes_io = sga.download_zip(version_id=val) zip_filename = f"transitfeeds_schedule_zipfiles_raw/{val}.zip" filename_list.append(zip_filename) + zipfile_bytes_io.seek(0) client.upload_fileobj( zipfile_bytes_io, csrt.BUCKET_PUBLIC, From 68f425b9fb009cbcb27f7611ce2f933462ada056 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 25 Sep 2023 14:25:40 -0500 Subject: [PATCH 20/62] Convert list of lists to list --- scrape_data/cta_data_downloads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index f5570b6..245d267 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -63,7 +63,7 @@ def save_transitfeeds_zip(date_range: typing.List[str] = ['2022-05-20', today]) 
zip_filename ) print(f'Confirm that files exist in s3') - keys('chn-ghost-buses-public', [filename_list]) + keys('chn-ghost-buses-public', filename_list) From 0f67748d194c4a2f382364aa8b481da85247268b Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 26 Sep 2023 18:44:35 -0500 Subject: [PATCH 21/62] download from s3 instead of transitfeeds --- scrape_data/cta_data_downloads.py | 39 +++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 245d267..3e195a4 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -126,28 +126,43 @@ def find_transitfeeds_zipfiles( return full_list +def download_s3_file(fname: str) -> sga.GTFSFeed: + zip_bytes = BytesIO() + zip_bytes.seek(0) + client.download_fileobj(Bucket=sga.BUCKET, Key=fname, Fileobj=zip_bytes) + zipfilesched = sga.zipfile.ZipFile(zip_bytes) + data = sga.GTFSFeed.extract_data(zipfilesched) + data = sga.format_dates_hours(data) + return data + + def compare_realtime_sched( date_range: typing.List[str] = ['2022-05-20', today]) -> None: - zip_filename_list, found_list = find_s3_zipfiles(date_range=date_range) - schedule_list_filtered = find_transitfeeds_zipfiles(zip_filename_list, found_list) + _, found_list = find_s3_zipfiles(date_range=date_range) + schedule_list = csrt.create_schedule_list(month=5, year=2022) + schedule_list_filtered = [ + s for s in schedule_list + if s['feed_start_date'] >= min(date_range) + and s['feed_start_date'] <= max(date_range) + ] + # schedule_list_filtered = find_transitfeeds_zipfiles(zip_filename_list, found_list) # Extract data from s3 zipfiles s3_data_list = [] for fname in found_list: - zip_bytes = BytesIO() - zip_bytes.seek(0) - client.download_fileobj(Bucket=sga.BUCKET, Key=fname, Fileobj=zip_bytes) - zipfilesched = sga.zipfile.ZipFile(zip_bytes) - data = sga.GTFSFeed.extract_data(zipfilesched) - data = sga.format_dates_hours(data) + data = download_s3_file(fname) + s3_data_list.append({'fname': fname, 'data': data}) - transit_feeds_GTFS_data_list = csrt.create_GTFS_data_list(schedule_list_filtered, cta_download=False)['GTFS_data_list'] - joined_list = [*s3_data_list, *transit_feeds_GTFS_data_list] - + # TODO Download the zipfiles from s3 instead of transitfeeds. 
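+    # These versions were mirrored to s3 by save_transitfeeds_zip() under the
+    # transitfeeds_schedule_zipfiles_raw/ prefix, so fetch them like the CTA zips.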
+ for tfname in schedule_list_filtered: + + full_name = f"transitfeeds_schedule_zipfiles_raw/{tfname}.zip" + tfdata = download_s3_file(full_name) + s3_data_list.append({'fname': tfname, 'data': tfdata}) # Convert from list of dictionaries to dictionary with list values - joined_dict = pd.DataFrame(joined_list).to_dict(orient='list') + joined_dict = pd.DataFrame(s3_data_list).to_dict(orient='list') schedule_data_list = [{'schedule_version': fname, 'data': create_route_summary(data, date_range)} for fname, data in joined_dict.items()] From ce6251aea3255ee22a65fea05d3278a6f7c86409 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 26 Sep 2023 20:03:52 -0500 Subject: [PATCH 22/62] test saving data.json --- .github/workflows/cta_data_downloads.yml | 104 +++++++++++------------ 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 25978bf..1d4cf87 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -35,25 +35,7 @@ jobs: save_cta_zip()' \ $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY - download-transitfeeds-data: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - - uses: actions/setup-python@v4 - with: - python-version: ${{ env.PYTHON_VERSION }} - - - name: Download and save transitfeeds.com schedule data - - run: | - pip install -r requirements.txt - python -c 'from scrape_data.cta_data_downloads import save_transitfeeds_zip; \ - save_transitfeeds_zip()' \ - $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY - - # save-schedule-daily-summary: + # download-transitfeeds-data: # runs-on: ubuntu-latest # steps: @@ -62,54 +44,72 @@ jobs: # - uses: actions/setup-python@v4 # with: # python-version: ${{ env.PYTHON_VERSION }} - - # - name: 'Save schedule summaries' - # # Test with no date and with date range + + # - name: Download and save transitfeeds.com schedule data + # run: | # pip install -r requirements.txt + # python -c 'from scrape_data.cta_data_downloads import save_transitfeeds_zip; \ + # save_transitfeeds_zip()' \ + # $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + + save-schedule-daily-summary: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: 'Save schedule summaries' + # Test with no date and with date range + run: | + pip install -r requirements.txt + python -c 'from scrape_data.cta_data_downloads import save_sched_daily_summary; \ + save_sched_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY # python -c 'from scrape_data.cta_data_downloads import save_sched_daily_summary; \ - # save_sched_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY - # python -c 'from scrape_data.cta_data_downloads import save_sched_daily_summary; \ - # save_sched_daily_summary(["2023-05-20", "2023-08-20"])' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + # save_sched_daily_summary(["2023-05-20", "2023-08-20"])' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY - # save-realtime-daily-summary: - # runs-on: ubuntu-latest + save-realtime-daily-summary: + runs-on: ubuntu-latest - # steps: - # - uses: actions/checkout@v3 + steps: + - uses: actions/checkout@v3 - # - uses: actions/setup-python@v4 - # with: - # python-version: ${{ env.PYTHON_VERSION }} + - uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} - # - name: 'Save realtime summaries' + - name: 'Save realtime summaries' - # run: | - # pip install -r 
requirements.txt + run: | + pip install -r requirements.txt - # python -c 'from scrape_data.cta_data_downloads import save_realtime_daily_summary; \ - # save_realtime_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + python -c 'from scrape_data.cta_data_downloads import save_realtime_daily_summary; \ + save_realtime_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY - # save-frontend-map-json: - # runs-on: ubuntu-latest - # needs: [save-realtime-daily-summary, save-schedule-daily-summary] - # strategy: - # fail-fast: false - # steps: - # - uses: actions/checkout@v3 + save-frontend-map-json: + runs-on: ubuntu-latest + needs: [save-realtime-daily-summary, save-schedule-daily-summary] + strategy: + fail-fast: false + steps: + - uses: actions/checkout@v3 - # - uses: actions/setup-python@v4 - # with: - # python-version: ${{ env.PYTHON_VERSION }} + - uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} - # - name: 'Save data.json for frontend' + - name: 'Save data.json for frontend' - # run: | - # pip install -r requirements.txt + run: | + pip install -r requirements.txt - # python -c 'from scrape_data.cta_data_downloads import compare_realtime_sched; \ - # compare_realtime_sched()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + python -c 'from scrape_data.cta_data_downloads import compare_realtime_sched; \ + compare_realtime_sched()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY \ No newline at end of file From e8e5c7cff189a567a81a2a1401d30dacce9bc1cc Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 26 Sep 2023 20:26:36 -0500 Subject: [PATCH 23/62] Search for transitfeeds files after cta files --- scrape_data/cta_data_downloads.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 3e195a4..c43d132 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -139,19 +139,12 @@ def download_s3_file(fname: str) -> sga.GTFSFeed: def compare_realtime_sched( date_range: typing.List[str] = ['2022-05-20', today]) -> None: - _, found_list = find_s3_zipfiles(date_range=date_range) - schedule_list = csrt.create_schedule_list(month=5, year=2022) - schedule_list_filtered = [ - s for s in schedule_list - if s['feed_start_date'] >= min(date_range) - and s['feed_start_date'] <= max(date_range) - ] - # schedule_list_filtered = find_transitfeeds_zipfiles(zip_filename_list, found_list) + zip_filename_list, found_list = find_s3_zipfiles(date_range=date_range) + schedule_list_filtered = find_transitfeeds_zipfiles(zip_filename_list, found_list) # Extract data from s3 zipfiles s3_data_list = [] for fname in found_list: data = download_s3_file(fname) - s3_data_list.append({'fname': fname, 'data': data}) # TODO Download the zipfiles from s3 instead of transitfeeds. 
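
Patches 18 through 23 above all revolve around the same in-memory round trip: read a GTFS zip into a BytesIO buffer, rewind it, push it to S3 with upload_fileobj, and later stream it back with download_fileobj. A minimal sketch of that pattern follows, assuming an already-configured boto3 client; the bucket name, object key, and feed URL are placeholders rather than the project's real values.

    import zipfile
    from io import BytesIO

    import boto3
    import requests

    client = boto3.client('s3')  # assumes AWS credentials are already configured

    # Fetch a GTFS zip into memory and open it, without touching disk.
    feed_bytes = BytesIO(requests.get('https://example.com/gtfs.zip').content)
    feed = zipfile.ZipFile(feed_bytes)

    # Reading the archive leaves the buffer's position past the start, and
    # upload_fileobj reads from the current position onward -- rewind first,
    # or S3 stores a truncated object. That is the one-line fix in
    # "call seek on BytesIO" above.
    feed_bytes.seek(0)
    client.upload_fileobj(feed_bytes, 'example-bucket', 'raw/google_transit.zip')

    # Round trip: stream the object back into a fresh buffer and reopen it.
    # ZipFile seeks within the buffer itself, so no rewind is needed here.
    zip_buffer = BytesIO()
    client.download_fileobj('example-bucket', 'raw/google_transit.zip', zip_buffer)
    print(zipfile.ZipFile(zip_buffer).namelist())

Keeping the archive in memory avoids temp files on the CI runner, which is why the later patches keep this shape even as the surrounding bookkeeping changes.
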
From 1d8c4815f084a6e7dc4d971e40a318f88c90c13b Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Fri, 29 Sep 2023 19:19:03 -0500 Subject: [PATCH 24/62] Change filter date range for transitfeeds zipfiles --- scrape_data/cta_data_downloads.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index c43d132..30d0c3a 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -115,10 +115,12 @@ def find_transitfeeds_zipfiles( transitfeeds_dates = sorted(transitfeeds_dates) schedule_list = csrt.create_schedule_list(month=5, year=2022) + # Start of saving transitchicago.com zipfiles to s3 was 2023-07-28. Don't need to check + # after this date. schedule_list_filtered = [ s for s in schedule_list if s['feed_start_date'] >= min(transitfeeds_dates) - and s['feed_start_date'] <= max(transitfeeds_dates) + and s['feed_start_date'] <= '2023-07-28' ] return schedule_list_filtered else: @@ -127,6 +129,7 @@ def find_transitfeeds_zipfiles( def download_s3_file(fname: str) -> sga.GTFSFeed: + print(f'Downloading {fname} from s3') zip_bytes = BytesIO() zip_bytes.seek(0) client.download_fileobj(Bucket=sga.BUCKET, Key=fname, Fileobj=zip_bytes) From 625a5108b0a12ebe6393583f8e1ff93bfd23a094 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Fri, 29 Sep 2023 20:31:23 -0500 Subject: [PATCH 25/62] Correct the filename on transitfeeds zip --- scrape_data/cta_data_downloads.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 30d0c3a..dc383a8 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -52,9 +52,9 @@ def save_transitfeeds_zip(date_range: typing.List[str] = ['2022-05-20', today]) ] filename_list = [] for schedule_dict in schedule_list_filtered: - val = schedule_dict['schedule_version'] - _, zipfile_bytes_io = sga.download_zip(version_id=val) - zip_filename = f"transitfeeds_schedule_zipfiles_raw/{val}.zip" + version = schedule_dict['schedule_version'] + _, zipfile_bytes_io = sga.download_zip(version_id=version) + zip_filename = f"transitfeeds_schedule_zipfiles_raw/{version}.zip" filename_list.append(zip_filename) zipfile_bytes_io.seek(0) client.upload_fileobj( @@ -151,11 +151,11 @@ def compare_realtime_sched( s3_data_list.append({'fname': fname, 'data': data}) # TODO Download the zipfiles from s3 instead of transitfeeds. 
- for tfname in schedule_list_filtered: - - full_name = f"transitfeeds_schedule_zipfiles_raw/{tfname}.zip" + for tfdict in schedule_list_filtered: + version = tfdict['schedule_version'] + full_name = f"transitfeeds_schedule_zipfiles_raw/{version}.zip" tfdata = download_s3_file(full_name) - s3_data_list.append({'fname': tfname, 'data': tfdata}) + s3_data_list.append({'fname': version, 'data': tfdata}) # Convert from list of dictionaries to dictionary with list values joined_dict = pd.DataFrame(s3_data_list).to_dict(orient='list') From b550387a72be5a0c3598121eee2b0dd513804dc3 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sat, 30 Sep 2023 19:27:51 -0500 Subject: [PATCH 26/62] Add manual workflow to backfill transitfeeds data --- .../workflows/transitfeeds-backfill-s3.yml | 38 +++++++++++++++++++ scrape_data/cta_data_downloads.py | 6 +-- 2 files changed, 41 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/transitfeeds-backfill-s3.yml diff --git a/.github/workflows/transitfeeds-backfill-s3.yml b/.github/workflows/transitfeeds-backfill-s3.yml new file mode 100644 index 0000000..6961099 --- /dev/null +++ b/.github/workflows/transitfeeds-backfill-s3.yml @@ -0,0 +1,38 @@ +name: Download transitfeeds.com zipfiles and save to s3 + +on: + workflow_dispatch: + inputs: + start_date: + description: 'Start date in YYYY-MM-DD format' + required: false + type: string + end_date: + description: 'End date in YYYY-MM-DD format e.g. 2023-05-20' + required: false + type: string + +env: + PYTHON_VERSION: 3.10.6 + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + +jobs: + save-transitfeeds-data: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Download and save transitfeeds.com schedule data + + run: | + pip install -r requirements.txt + python -c 'from scrape_data.cta_data_downloads import save_transitfeeds_zip; \ + save_transitfeeds_zip(start_date=${{ inputs.start_date }}, end_date=${{ inputs.end_date}})' \ + $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index dc383a8..09916f9 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -43,12 +43,12 @@ def save_cta_zip() -> None: keys('chn-ghost-buses-public', [filename]) -def save_transitfeeds_zip(date_range: typing.List[str] = ['2022-05-20', today]) -> None: +def save_transitfeeds_zip(start_date: str = '2022-05-20', end_date: str = today) -> None: schedule_list = csrt.create_schedule_list(month=5, year=2022) schedule_list_filtered = [ s for s in schedule_list - if s['feed_start_date'] >= min(date_range) - and s['feed_start_date'] <= max(date_range) + if s['feed_start_date'] >= start_date + and s['feed_start_date'] <= end_date ] filename_list = [] for schedule_dict in schedule_list_filtered: From c4049b2ef14cd9174f0f72634c4e44d8ee129a6c Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 1 Oct 2023 14:53:50 -0500 Subject: [PATCH 27/62] Add data artifacts --- .github/workflows/cta_data_downloads.yml | 69 +++++++++++++++++------- scrape_data/cta_data_downloads.py | 41 ++++++++++---- 2 files changed, 79 insertions(+), 31 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 1d4cf87..d6bab22 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ 
-35,24 +35,49 @@ jobs: save_cta_zip()' \ $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY - # download-transitfeeds-data: - # runs-on: ubuntu-latest + upload_cta_list: + runs-on: ubuntu-latest - # steps: - # - uses: actions/checkout@v3 + steps: + - uses: actions/checkout@v3 - # - uses: actions/setup-python@v4 - # with: - # python-version: ${{ env.PYTHON_VERSION }} - - # - name: Download and save transitfeeds.com schedule data + - uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: 'Upload cta list of zipfiles as an artifact' + run: | + pip install -r requirements.txt + python -c 'from scrape_data.cta_data_downloads import download_cta_files_s3; \ + download_cta_files_s3()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY - # run: | - # pip install -r requirements.txt - # python -c 'from scrape_data.cta_data_downloads import save_transitfeeds_zip; \ - # save_transitfeeds_zip()' \ - # $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY - + - uses: actions/upload-artifact@v3 + with: + name: s3_data_list + path: s3_data_list + + upload_transitfeeds_list: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: 'Upload transitfeeds list of zipfiles as an artifact' + run: | + pip install -r requirements.txt + python -c 'from scrape_data.cta_data_downloads import download_transitfeeds_files_s3; \ + download_transitfeeds_files_s3()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY + + - uses: actions/upload-artifact@v3 + with: + name: transitfeeds_data_list + path: transitfeeds_data_list + + save-schedule-daily-summary: runs-on: ubuntu-latest @@ -68,10 +93,7 @@ jobs: run: | pip install -r requirements.txt python -c 'from scrape_data.cta_data_downloads import save_sched_daily_summary; \ - save_sched_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY - # python -c 'from scrape_data.cta_data_downloads import save_sched_daily_summary; \ - # save_sched_daily_summary(["2023-05-20", "2023-08-20"])' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY - + save_sched_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY save-realtime-daily-summary: @@ -105,8 +127,15 @@ jobs: with: python-version: ${{ env.PYTHON_VERSION }} - - name: 'Save data.json for frontend' + - uses: actions/download-artifact@v3 + with: + name: transitfeeds_data_list + + - uses: actions/download-artifact@v3 + with: + name: s3_data_list + - name: 'Save data.json for frontend' run: | pip install -r requirements.txt diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 09916f9..7acfa62 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -7,6 +7,7 @@ from io import StringIO, BytesIO import pandas as pd import typing +import pickle ACCESS_KEY = sys.argv[1] SECRET_KEY = sys.argv[2] @@ -138,27 +139,45 @@ def download_s3_file(fname: str) -> sga.GTFSFeed: data = sga.format_dates_hours(data) return data - -def compare_realtime_sched( - date_range: typing.List[str] = ['2022-05-20', today]) -> None: - - zip_filename_list, found_list = find_s3_zipfiles(date_range=date_range) - schedule_list_filtered = find_transitfeeds_zipfiles(zip_filename_list, found_list) - # Extract data from s3 zipfiles +def download_cta_files_s3(date_range: typing.List[str] = ['2022-05-20', today]) -> None: + _, found_list = find_s3_zipfiles(date_range=date_range) s3_data_list = [] for fname in found_list: data = download_s3_file(fname) s3_data_list.append({'fname': fname, 'data': 
data}) - # TODO Download the zipfiles from s3 instead of transitfeeds. + with open('s3_data_list', 'wb') as fp: + pickle.dump(s3_data_list, fp) + + +def download_transitfeeds_files_s3(date_range: typing.List[str] = ['2022-05-20', today]) -> None: + zip_filename_list, found_list = find_s3_zipfiles(date_range=date_range) + schedule_list_filtered = find_transitfeeds_zipfiles(zip_filename_list, found_list) + transitfeeds_data_list = [] for tfdict in schedule_list_filtered: version = tfdict['schedule_version'] full_name = f"transitfeeds_schedule_zipfiles_raw/{version}.zip" tfdata = download_s3_file(full_name) - s3_data_list.append({'fname': version, 'data': tfdata}) - + transitfeeds_data_list.append({'fname': version, 'data': tfdata}) + with open('transitfeeds_data_list', 'wb') as fp: + pickle.dump(transitfeeds_data_list, fp) + + +def compare_realtime_sched( + date_range: typing.List[str] = ['2022-05-20', today]) -> None: + zip_filename_list, found_list = find_s3_zipfiles(date_range=date_range) + schedule_list_filtered = find_transitfeeds_zipfiles(zip_filename_list, found_list) + + with open('s3_data_list', 'rb') as fp: + s3_data_list = pickle.load(fp) + + with open('transitfeeds_data_list', 'rb') as fp: + transitfeeds_data_list = pickle.load(fp) + + joined_list = [*s3_data_list, *transitfeeds_data_list] + # Convert from list of dictionaries to dictionary with list values - joined_dict = pd.DataFrame(s3_data_list).to_dict(orient='list') + joined_dict = pd.DataFrame(joined_list).to_dict(orient='list') schedule_data_list = [{'schedule_version': fname, 'data': create_route_summary(data, date_range)} for fname, data in joined_dict.items()] From d9498c3904574ac8d24f2a18864f22e33a892577 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 1 Oct 2023 16:10:06 -0500 Subject: [PATCH 28/62] add logger messages for s3 downloads --- scrape_data/cta_data_downloads.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 7acfa62..e4d832e 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -8,7 +8,9 @@ import pandas as pd import typing import pickle +import logging +LOGGER = logging.getLogger(__name__) ACCESS_KEY = sys.argv[1] SECRET_KEY = sys.argv[2] @@ -35,7 +37,7 @@ def save_cta_zip() -> None: f'on {today} to public bucket') filename = f'cta_schedule_zipfiles_raw/google_transit_{today}.zip' zipfile_bytes_io.seek(0) - client.upload_fileobj( + client.uploaFd_fileobj( zipfile_bytes_io, csrt.BUCKET_PUBLIC, filename @@ -143,7 +145,9 @@ def download_cta_files_s3(date_range: typing.List[str] = ['2022-05-20', today]) _, found_list = find_s3_zipfiles(date_range=date_range) s3_data_list = [] for fname in found_list: + LOGGER.info(f"Downloading from S3: {fname}") data = download_s3_file(fname) + LOGGER.info(f"Successfully downloaded from S3: {fname}") s3_data_list.append({'fname': fname, 'data': data}) with open('s3_data_list', 'wb') as fp: @@ -155,9 +159,11 @@ def download_transitfeeds_files_s3(date_range: typing.List[str] = ['2022-05-20', schedule_list_filtered = find_transitfeeds_zipfiles(zip_filename_list, found_list) transitfeeds_data_list = [] for tfdict in schedule_list_filtered: + LOGGER.info(f"Downloading from S3: {tfdict}") version = tfdict['schedule_version'] full_name = f"transitfeeds_schedule_zipfiles_raw/{version}.zip" tfdata = download_s3_file(full_name) + LOGGER.info(f"Successfully downloaded from S3: {tfdict}") transitfeeds_data_list.append({'fname': version, 'data': 
tfdata}) with open('transitfeeds_data_list', 'wb') as fp: pickle.dump(transitfeeds_data_list, fp) From 0023be63fbcde4a22d7574bf88c9138785e66eb7 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 1 Oct 2023 16:32:36 -0500 Subject: [PATCH 29/62] add more logging --- scrape_data/cta_data_downloads.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index e4d832e..e755819 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -37,7 +37,7 @@ def save_cta_zip() -> None: f'on {today} to public bucket') filename = f'cta_schedule_zipfiles_raw/google_transit_{today}.zip' zipfile_bytes_io.seek(0) - client.uploaFd_fileobj( + client.upload_fileobj( zipfile_bytes_io, csrt.BUCKET_PUBLIC, filename @@ -137,7 +137,9 @@ def download_s3_file(fname: str) -> sga.GTFSFeed: zip_bytes.seek(0) client.download_fileobj(Bucket=sga.BUCKET, Key=fname, Fileobj=zip_bytes) zipfilesched = sga.zipfile.ZipFile(zip_bytes) + LOGGER.info('Extracting data') data = sga.GTFSFeed.extract_data(zipfilesched) + LOGGER.info('Extraction successful') data = sga.format_dates_hours(data) return data @@ -150,9 +152,10 @@ def download_cta_files_s3(date_range: typing.List[str] = ['2022-05-20', today]) LOGGER.info(f"Successfully downloaded from S3: {fname}") s3_data_list.append({'fname': fname, 'data': data}) + LOGGER.info(f'Pickling s3_data_list') with open('s3_data_list', 'wb') as fp: pickle.dump(s3_data_list, fp) - + LOGGER.info(f'Pickling done') def download_transitfeeds_files_s3(date_range: typing.List[str] = ['2022-05-20', today]) -> None: zip_filename_list, found_list = find_s3_zipfiles(date_range=date_range) @@ -165,9 +168,10 @@ def download_transitfeeds_files_s3(date_range: typing.List[str] = ['2022-05-20', tfdata = download_s3_file(full_name) LOGGER.info(f"Successfully downloaded from S3: {tfdict}") transitfeeds_data_list.append({'fname': version, 'data': tfdata}) + LOGGER.info('Pickling file transitfeeds_data_list') with open('transitfeeds_data_list', 'wb') as fp: pickle.dump(transitfeeds_data_list, fp) - + LOGGER.info('Pickling successful') def compare_realtime_sched( date_range: typing.List[str] = ['2022-05-20', today]) -> None: From fc20b8350a898427da8cf02749d297feb32e164b Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 2 Oct 2023 14:23:13 -0500 Subject: [PATCH 30/62] Change to ubuntu 20.04. 
Return to commit b550387a --- .github/workflows/cta_data_downloads.yml | 65 +++--------------------- scrape_data/cta_data_downloads.py | 50 ++++-------------- 2 files changed, 18 insertions(+), 97 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index d6bab22..62f7860 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -16,9 +16,11 @@ env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} +# Changing ubuntu version to 20.04 might resolve some timeout issues +# See https://github.com/actions/runner-images/issues/6680 jobs: download-cta-schedule-data: - runs-on: ubuntu-latest + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v3 @@ -34,52 +36,9 @@ jobs: python -c 'from scrape_data.cta_data_downloads import save_cta_zip; \ save_cta_zip()' \ $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY - - upload_cta_list: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - - uses: actions/setup-python@v4 - with: - python-version: ${{ env.PYTHON_VERSION }} - - - name: 'Upload cta list of zipfiles as an artifact' - run: | - pip install -r requirements.txt - python -c 'from scrape_data.cta_data_downloads import download_cta_files_s3; \ - download_cta_files_s3()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY - - - uses: actions/upload-artifact@v3 - with: - name: s3_data_list - path: s3_data_list - - upload_transitfeeds_list: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - - uses: actions/setup-python@v4 - with: - python-version: ${{ env.PYTHON_VERSION }} - - - name: 'Upload transitfeeds list of zipfiles as an artifact' - run: | - pip install -r requirements.txt - python -c 'from scrape_data.cta_data_downloads import download_transitfeeds_files_s3; \ - download_transitfeeds_files_s3()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY - - - uses: actions/upload-artifact@v3 - with: - name: transitfeeds_data_list - path: transitfeeds_data_list - - + save-schedule-daily-summary: - runs-on: ubuntu-latest + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v3 @@ -95,9 +54,8 @@ jobs: python -c 'from scrape_data.cta_data_downloads import save_sched_daily_summary; \ save_sched_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY - save-realtime-daily-summary: - runs-on: ubuntu-latest + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v3 @@ -116,7 +74,7 @@ jobs: save-frontend-map-json: - runs-on: ubuntu-latest + runs-on: ubuntu-20.04 needs: [save-realtime-daily-summary, save-schedule-daily-summary] strategy: fail-fast: false @@ -127,15 +85,8 @@ jobs: with: python-version: ${{ env.PYTHON_VERSION }} - - uses: actions/download-artifact@v3 - with: - name: transitfeeds_data_list - - - uses: actions/download-artifact@v3 - with: - name: s3_data_list - - name: 'Save data.json for frontend' + run: | pip install -r requirements.txt diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index e755819..7ffc148 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -7,10 +7,7 @@ from io import StringIO, BytesIO import pandas as pd import typing -import pickle -import logging -LOGGER = logging.getLogger(__name__) ACCESS_KEY = sys.argv[1] SECRET_KEY = sys.argv[2] @@ -137,57 +134,30 @@ def download_s3_file(fname: str) -> sga.GTFSFeed: zip_bytes.seek(0) client.download_fileobj(Bucket=sga.BUCKET, Key=fname, Fileobj=zip_bytes) zipfilesched = 
sga.zipfile.ZipFile(zip_bytes) - LOGGER.info('Extracting data') data = sga.GTFSFeed.extract_data(zipfilesched) - LOGGER.info('Extraction successful') data = sga.format_dates_hours(data) return data -def download_cta_files_s3(date_range: typing.List[str] = ['2022-05-20', today]) -> None: - _, found_list = find_s3_zipfiles(date_range=date_range) + +def compare_realtime_sched( + date_range: typing.List[str] = ['2022-05-20', today]) -> None: + + zip_filename_list, found_list = find_s3_zipfiles(date_range=date_range) + schedule_list_filtered = find_transitfeeds_zipfiles(zip_filename_list, found_list) + # Extract data from s3 zipfiles s3_data_list = [] for fname in found_list: - LOGGER.info(f"Downloading from S3: {fname}") data = download_s3_file(fname) - LOGGER.info(f"Successfully downloaded from S3: {fname}") s3_data_list.append({'fname': fname, 'data': data}) - LOGGER.info(f'Pickling s3_data_list') - with open('s3_data_list', 'wb') as fp: - pickle.dump(s3_data_list, fp) - LOGGER.info(f'Pickling done') - -def download_transitfeeds_files_s3(date_range: typing.List[str] = ['2022-05-20', today]) -> None: - zip_filename_list, found_list = find_s3_zipfiles(date_range=date_range) - schedule_list_filtered = find_transitfeeds_zipfiles(zip_filename_list, found_list) - transitfeeds_data_list = [] for tfdict in schedule_list_filtered: - LOGGER.info(f"Downloading from S3: {tfdict}") version = tfdict['schedule_version'] full_name = f"transitfeeds_schedule_zipfiles_raw/{version}.zip" tfdata = download_s3_file(full_name) - LOGGER.info(f"Successfully downloaded from S3: {tfdict}") - transitfeeds_data_list.append({'fname': version, 'data': tfdata}) - LOGGER.info('Pickling file transitfeeds_data_list') - with open('transitfeeds_data_list', 'wb') as fp: - pickle.dump(transitfeeds_data_list, fp) - LOGGER.info('Pickling successful') - -def compare_realtime_sched( - date_range: typing.List[str] = ['2022-05-20', today]) -> None: - zip_filename_list, found_list = find_s3_zipfiles(date_range=date_range) - schedule_list_filtered = find_transitfeeds_zipfiles(zip_filename_list, found_list) - - with open('s3_data_list', 'rb') as fp: - s3_data_list = pickle.load(fp) - - with open('transitfeeds_data_list', 'rb') as fp: - transitfeeds_data_list = pickle.load(fp) - - joined_list = [*s3_data_list, *transitfeeds_data_list] - + s3_data_list.append({'fname': version, 'data': tfdata}) + # Convert from list of dictionaries to dictionary with list values - joined_dict = pd.DataFrame(joined_list).to_dict(orient='list') + joined_dict = pd.DataFrame(s3_data_list).to_dict(orient='list') schedule_data_list = [{'schedule_version': fname, 'data': create_route_summary(data, date_range)} for fname, data in joined_dict.items()] From 799361f3c5ad29c8218665fa7ef64c5d543090e8 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 2 Oct 2023 16:47:54 -0500 Subject: [PATCH 31/62] Testing macos-latest --- .github/workflows/cta_data_downloads.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 62f7860..bce34e3 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -20,7 +20,7 @@ env: # See https://github.com/actions/runner-images/issues/6680 jobs: download-cta-schedule-data: - runs-on: ubuntu-20.04 + runs-on: macos-latest steps: - uses: actions/checkout@v3 @@ -38,7 +38,7 @@ jobs: $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY save-schedule-daily-summary: - runs-on: ubuntu-20.04 + runs-on: 
macos-latest steps: - uses: actions/checkout@v3 @@ -55,7 +55,7 @@ jobs: save_sched_daily_summary()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY save-realtime-daily-summary: - runs-on: ubuntu-20.04 + runs-on: macos-latest steps: - uses: actions/checkout@v3 @@ -74,7 +74,7 @@ jobs: save-frontend-map-json: - runs-on: ubuntu-20.04 + runs-on: macos-latest needs: [save-realtime-daily-summary, save-schedule-daily-summary] strategy: fail-fast: false From 1f156d4114fd864d7177b002a55017d271048227 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 2 Oct 2023 17:53:32 -0500 Subject: [PATCH 32/62] Start a shell for failed runs --- .github/workflows/cta_data_downloads.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index bce34e3..1d1fbaa 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -92,4 +92,7 @@ jobs: python -c 'from scrape_data.cta_data_downloads import compare_realtime_sched; \ compare_realtime_sched()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY - \ No newline at end of file + + - name: tmate session on error + if: failure() + uses: mxschmitt/action-tmate@v3 \ No newline at end of file From 2d9e9c9e75c7985349ae0ea95d81549eb48abb37 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 2 Oct 2023 19:21:07 -0500 Subject: [PATCH 33/62] return zipfile from s3 without extraction --- scrape_data/cta_data_downloads.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 7ffc148..afa1b7f 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -134,10 +134,8 @@ def download_s3_file(fname: str) -> sga.GTFSFeed: zip_bytes.seek(0) client.download_fileobj(Bucket=sga.BUCKET, Key=fname, Fileobj=zip_bytes) zipfilesched = sga.zipfile.ZipFile(zip_bytes) - data = sga.GTFSFeed.extract_data(zipfilesched) - data = sga.format_dates_hours(data) - return data - + return zipfilesched + def compare_realtime_sched( date_range: typing.List[str] = ['2022-05-20', today]) -> None: From a73d3dd15a9b704b428751c05d111ba40512b0a7 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 3 Oct 2023 13:54:30 -0500 Subject: [PATCH 34/62] Add python debugger --- scrape_data/cta_data_downloads.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index afa1b7f..d5e20f9 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -156,8 +156,11 @@ def compare_realtime_sched( # Convert from list of dictionaries to dictionary with list values joined_dict = pd.DataFrame(s3_data_list).to_dict(orient='list') - schedule_data_list = [{'schedule_version': fname, 'data': create_route_summary(data, date_range)} - for fname, data in joined_dict.items()] + try: + schedule_data_list = [{'schedule_version': fname, 'data': create_route_summary(data, date_range)} + for fname, data in joined_dict.items()] + except AttributeError: + import pdb; pdb.set_trace() agg_info = csrt.AggInfo() print('Creating combined_long_df and summary_df') @@ -211,6 +214,7 @@ def extract_date(fname: str) -> str: def create_route_summary(CTA_GTFS: sga.GTFSFeed, date_range: typing.List[str]) -> pd.DataFrame: + print(f'data is {data}') data = sga.GTFSFeed.extract_data(CTA_GTFS) data = sga.format_dates_hours(data) trip_summary = sga.make_trip_summary(data) From 
5cafc6769e3e3a59d7ef4a0ba2914ea3a2baaf8e Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 3 Oct 2023 14:06:02 -0500 Subject: [PATCH 35/62] Fix syntax error --- scrape_data/cta_data_downloads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index d5e20f9..5d0046b 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -214,7 +214,7 @@ def extract_date(fname: str) -> str: def create_route_summary(CTA_GTFS: sga.GTFSFeed, date_range: typing.List[str]) -> pd.DataFrame: - print(f'data is {data}') + print(f'Input is {CTA_GTFS}') data = sga.GTFSFeed.extract_data(CTA_GTFS) data = sga.format_dates_hours(data) trip_summary = sga.make_trip_summary(data) From 445795ba601cba923081c858c9e3ed9b26da33e1 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 3 Oct 2023 14:38:00 -0500 Subject: [PATCH 36/62] Add more prints --- scrape_data/cta_data_downloads.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 5d0046b..0787584 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -155,13 +155,16 @@ def compare_realtime_sched( s3_data_list.append({'fname': version, 'data': tfdata}) # Convert from list of dictionaries to dictionary with list values + print(f's3_data_list is {s3_data_list}') joined_dict = pd.DataFrame(s3_data_list).to_dict(orient='list') - try: - schedule_data_list = [{'schedule_version': fname, 'data': create_route_summary(data, date_range)} - for fname, data in joined_dict.items()] - except AttributeError: - import pdb; pdb.set_trace() - + print(f'Joined dict is {joined_dict}') + + import sys + sys.exit() + + schedule_data_list = [{'schedule_version': fname, 'data': create_route_summary(data, date_range)} + for fname, data in joined_dict.items()] + agg_info = csrt.AggInfo() print('Creating combined_long_df and summary_df') combined_long_df, summary_df = csrt.combine_real_time_rt_comparison( From e629cb060cd25ec885c78e9eeef5262694182f1a Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 3 Oct 2023 15:18:41 -0500 Subject: [PATCH 37/62] Change loop --- scrape_data/cta_data_downloads.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 0787584..a79d445 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -159,11 +159,11 @@ def compare_realtime_sched( joined_dict = pd.DataFrame(s3_data_list).to_dict(orient='list') print(f'Joined dict is {joined_dict}') - import sys - sys.exit() - - schedule_data_list = [{'schedule_version': fname, 'data': create_route_summary(data, date_range)} - for fname, data in joined_dict.items()] + schedule_data_list = [] + for elt in s3_data_list: + for zipname, zipdata in elt.items(): + schedule_data_list.append({'schedule_version': zipname, 'data': create_route_summary(zipdata, date_range)}) + agg_info = csrt.AggInfo() print('Creating combined_long_df and summary_df') From 8c5d79d604b0c0cb73a1d3c35d888f68216a79a1 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 3 Oct 2023 16:20:53 -0500 Subject: [PATCH 38/62] Fix dictionary syntax --- scrape_data/cta_data_downloads.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index a79d445..b5795e6 100644 --- 
a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -161,8 +161,7 @@ def compare_realtime_sched( schedule_data_list = [] for elt in s3_data_list: - for zipname, zipdata in elt.items(): - schedule_data_list.append({'schedule_version': zipname, 'data': create_route_summary(zipdata, date_range)}) + schedule_data_list.append({'schedule_version': elt['fname'], 'data': create_route_summary(elt['data'], date_range)}) agg_info = csrt.AggInfo() From de389909424919114d5898aed342ce7497a19e16 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 3 Oct 2023 16:21:51 -0500 Subject: [PATCH 39/62] remove tmate --- .github/workflows/cta_data_downloads.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index 1d1fbaa..b72cbef 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -92,7 +92,4 @@ jobs: python -c 'from scrape_data.cta_data_downloads import compare_realtime_sched; \ compare_realtime_sched()' $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY - - - name: tmate session on error - if: failure() - uses: mxschmitt/action-tmate@v3 \ No newline at end of file + \ No newline at end of file From acb7d4dafc1078753ff05fdf1008ca36c56589c8 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sat, 7 Oct 2023 17:50:36 -0500 Subject: [PATCH 40/62] Add cta_download argument to GTFSFeed.extract_data --- data_analysis/static_gtfs_analysis.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data_analysis/static_gtfs_analysis.py b/data_analysis/static_gtfs_analysis.py index 52f220f..af1d954 100644 --- a/data_analysis/static_gtfs_analysis.py +++ b/data_analysis/static_gtfs_analysis.py @@ -386,9 +386,11 @@ def download_extract_format(version_id: str = None) -> GTFSFeed: """ if version_id is None: CTA_GTFS, _ = download_cta_zip() + cta_download = True else: CTA_GTFS, _ = download_zip(version_id) - data = GTFSFeed.extract_data(CTA_GTFS, version_id=version_id) + cta_download = False + data = GTFSFeed.extract_data(CTA_GTFS, version_id=version_id, cta_download=cta_download) data = format_dates_hours(data) return data From d563c04897b8f77ff59afd769af9a544d1b27a2c Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sat, 7 Oct 2023 19:29:36 -0500 Subject: [PATCH 41/62] More print statements for start and end date --- data_analysis/plots.py | 3 +++ scrape_data/cta_data_downloads.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/data_analysis/plots.py b/data_analysis/plots.py index 695b58b..6d8a228 100644 --- a/data_analysis/plots.py +++ b/data_analysis/plots.py @@ -1031,6 +1031,8 @@ def save_json(summary_gdf_geo: gpd.GeoDataFrame, summary_kwargs: dict, save_name a 'column' key at minimum save_name (str): name of the json output file """ + # Make directory for GitHub actions + Path(DATA_PATH).mkdir(parents=True, exist_ok=True) path_name = create_save_path(save_name, DATA_PATH) # Take only the columns related to summary_kwargs['column'] # and those used in the map @@ -1040,6 +1042,7 @@ def save_json(summary_gdf_geo: gpd.GeoDataFrame, summary_kwargs: dict, save_name summary_gdf_geo.columns.str.startswith(summary_kwargs["column"]) ].tolist() cols = first_cols + kwargs_cols + last_cols + print(f'Saving {path_name}') summary_gdf_geo[cols].to_file(f"{path_name}.json", driver="GeoJSON") summary_gdf_geo[cols].to_html(f"{path_name}_table.html", index=False) diff --git a/scrape_data/cta_data_downloads.py 
b/scrape_data/cta_data_downloads.py index b5795e6..41697aa 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -175,7 +175,8 @@ def compare_realtime_sched( day_type = 'wk' start_date = combined_long_df["date"].min().strftime("%Y-%m-%d") end_date = combined_long_df["date"].max().strftime("%Y-%m-%d") - + print(f'Start date is {start_date}') + print(f'End date is {end_date}') summary_gdf_geo = plots.create_summary_gdf_geo(combined_long_df, summary_df, day_type=day_type) summary_kwargs = {'column': 'ratio'} save_name = f"all_routes_{start_date}_to_{end_date}_{day_type}" From fe6f522e75f2ca1d6909c09672e4a670ee8bec04 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 9 Oct 2023 19:16:02 -0500 Subject: [PATCH 42/62] change save path of output JSON --- data_analysis/plots.py | 8 ++++---- scrape_data/cta_data_downloads.py | 5 +++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/data_analysis/plots.py b/data_analysis/plots.py index 6d8a228..b35f315 100644 --- a/data_analysis/plots.py +++ b/data_analysis/plots.py @@ -1032,8 +1032,8 @@ def save_json(summary_gdf_geo: gpd.GeoDataFrame, summary_kwargs: dict, save_name save_name (str): name of the json output file """ # Make directory for GitHub actions - Path(DATA_PATH).mkdir(parents=True, exist_ok=True) - path_name = create_save_path(save_name, DATA_PATH) + + path_name = create_save_path(save_name, DATA_PATH.parent) # Take only the columns related to summary_kwargs['column'] # and those used in the map first_cols = summary_gdf_geo.columns[:2].tolist() @@ -1059,9 +1059,9 @@ def create_frontend_json(json_file: str, start_date: str, end_date: str, save_pa save (bool, optional): Whether to save the JSON output. Defaults to True. Raises: - ValueError: If save is True. The save_path argument cannot be None. + ValueError: If save is True, the save_path argument cannot be None. 
""" - with open(DATA_PATH / json_file) as json_data: + with open(DATA_PATH.parent / json_file) as json_data: data = json.load(json_data) data['dates'] = {'start': start_date, 'end': end_date} if save: diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 41697aa..687f7ed 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -142,6 +142,7 @@ def compare_realtime_sched( zip_filename_list, found_list = find_s3_zipfiles(date_range=date_range) schedule_list_filtered = find_transitfeeds_zipfiles(zip_filename_list, found_list) + print(f'--> schedule_list_filtered: {schedule_list_filtered}') # Extract data from s3 zipfiles s3_data_list = [] for fname in found_list: @@ -175,8 +176,8 @@ def compare_realtime_sched( day_type = 'wk' start_date = combined_long_df["date"].min().strftime("%Y-%m-%d") end_date = combined_long_df["date"].max().strftime("%Y-%m-%d") - print(f'Start date is {start_date}') - print(f'End date is {end_date}') + print(f'---> Start date is {start_date}') + print(f'---> End date is {end_date}') summary_gdf_geo = plots.create_summary_gdf_geo(combined_long_df, summary_df, day_type=day_type) summary_kwargs = {'column': 'ratio'} save_name = f"all_routes_{start_date}_to_{end_date}_{day_type}" From b3dc77fb24512e11fc776d5d7a5b8fdd44f601c5 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Thu, 12 Oct 2023 20:26:00 -0500 Subject: [PATCH 43/62] Make sure the save paths are the same --- data_analysis/plots.py | 13 ++++++------- scrape_data/cta_data_downloads.py | 11 +++++++---- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/data_analysis/plots.py b/data_analysis/plots.py index b35f315..a94620a 100644 --- a/data_analysis/plots.py +++ b/data_analysis/plots.py @@ -1033,7 +1033,7 @@ def save_json(summary_gdf_geo: gpd.GeoDataFrame, summary_kwargs: dict, save_name """ # Make directory for GitHub actions - path_name = create_save_path(save_name, DATA_PATH.parent) + path_name = create_save_path(save_name, DATA_PATH) # Take only the columns related to summary_kwargs['column'] # and those used in the map first_cols = summary_gdf_geo.columns[:2].tolist() @@ -1047,31 +1047,30 @@ def save_json(summary_gdf_geo: gpd.GeoDataFrame, summary_kwargs: dict, save_name summary_gdf_geo[cols].to_html(f"{path_name}_table.html", index=False) -def create_frontend_json(json_file: str, start_date: str, end_date: str, save_path: str = None, save: bool = True) -> None: +def create_frontend_json(json_file: str, start_date: str, end_date: str, save_path: str, save: bool = True) -> None: """Create the data.json file that is used for the map at ghostbuses.com Args: json_file (str): name of the json input file start_date (str): start date of the data in YYYY-MM-DD format end_date (str): end date of the data in YYYY-MM-DD format - save_path (str, optional): The path to save the output file. Defaults to None. - If save is True, this argument is required. + save_path (str): The path to save the output file. save (bool, optional): Whether to save the JSON output. Defaults to True. Raises: ValueError: If save is True, the save_path argument cannot be None. 
""" - with open(DATA_PATH.parent / json_file) as json_data: + json_path = create_save_path(json_file, DATA_PATH) + with open(json_path) as json_data: data = json.load(json_data) data['dates'] = {'start': start_date, 'end': end_date} if save: - if save_path is None: - raise ValueError('You must specify a location to save the json file') with open(save_path, 'w') as output_json: json.dump(data, output_json) else: return json.dumps(data, indent=4) + def main(day_type: str = None) -> None: """Generate maps of all routes, top 10 best routes, top 10 worst routes, and ridership diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 687f7ed..1b73cbb 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -139,7 +139,7 @@ def download_s3_file(fname: str) -> sga.GTFSFeed: def compare_realtime_sched( date_range: typing.List[str] = ['2022-05-20', today]) -> None: - + print(f'--> Date range is {date_range}') zip_filename_list, found_list = find_s3_zipfiles(date_range=date_range) schedule_list_filtered = find_transitfeeds_zipfiles(zip_filename_list, found_list) print(f'--> schedule_list_filtered: {schedule_list_filtered}') @@ -167,17 +167,20 @@ def compare_realtime_sched( agg_info = csrt.AggInfo() print('Creating combined_long_df and summary_df') - combined_long_df, summary_df = csrt.combine_real_time_rt_comparison( + combined_long_df, combined_grouped = csrt.combine_real_time_rt_comparison( schedule_feeds=schedule_list_filtered, schedule_data_list=schedule_data_list, - agg_info=agg_info - ) + agg_info=agg_info, + ) + summary_df = csrt.build_summary(combined_grouped, save=False) day_type = 'wk' start_date = combined_long_df["date"].min().strftime("%Y-%m-%d") end_date = combined_long_df["date"].max().strftime("%Y-%m-%d") print(f'---> Start date is {start_date}') print(f'---> End date is {end_date}') + if start_date == end_date: + raise ValueError('Start date and end date should be different.') summary_gdf_geo = plots.create_summary_gdf_geo(combined_long_df, summary_df, day_type=day_type) summary_kwargs = {'column': 'ratio'} save_name = f"all_routes_{start_date}_to_{end_date}_{day_type}" From bddb22c70aab89348c0d9daf3b6195e3a0c88985 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sat, 14 Oct 2023 19:00:34 -0500 Subject: [PATCH 44/62] Use main function from compare_scheduled_and_rt.py --- .github/workflows/cta_data_downloads.yml | 2 +- scrape_data/cta_data_downloads.py | 14 ++++---------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml index b72cbef..1dd6f49 100644 --- a/.github/workflows/cta_data_downloads.yml +++ b/.github/workflows/cta_data_downloads.yml @@ -16,7 +16,7 @@ env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} -# Changing ubuntu version to 20.04 might resolve some timeout issues +# Changing ubuntu to macos might resolve some timeout issues # See https://github.com/actions/runner-images/issues/6680 jobs: download-cta-schedule-data: diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 1b73cbb..ee6ca3b 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -158,23 +158,17 @@ def compare_realtime_sched( # Convert from list of dictionaries to dictionary with list values print(f's3_data_list is {s3_data_list}') joined_dict = pd.DataFrame(s3_data_list).to_dict(orient='list') - print(f'Joined dict is 
{joined_dict}') schedule_data_list = [] for elt in s3_data_list: schedule_data_list.append({'schedule_version': elt['fname'], 'data': create_route_summary(elt['data'], date_range)}) - - agg_info = csrt.AggInfo() + print(f'--> schedule_data_list is {schedule_data_list}') print('Creating combined_long_df and summary_df') - combined_long_df, combined_grouped = csrt.combine_real_time_rt_comparison( - schedule_feeds=schedule_list_filtered, - schedule_data_list=schedule_data_list, - agg_info=agg_info, - ) - summary_df = csrt.build_summary(combined_grouped, save=False) - + combined_long_df, summary_df = csrt.main(schedule_feeds=schedule_list_filtered) + day_type = 'wk' + combined_long_df = plots.filter_day_type(combined_long_df, day_type=day_type) start_date = combined_long_df["date"].min().strftime("%Y-%m-%d") end_date = combined_long_df["date"].max().strftime("%Y-%m-%d") print(f'---> Start date is {start_date}') From 9f646cee4601f03c49186fd47428c4c8ee4b5ee9 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 15 Oct 2023 17:32:31 -0500 Subject: [PATCH 45/62] Remove cta_download arg --- data_analysis/compare_scheduled_and_rt.py | 15 ++++++++++----- scrape_data/cta_data_downloads.py | 6 +++++- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/data_analysis/compare_scheduled_and_rt.py b/data_analysis/compare_scheduled_and_rt.py index f7c3696..d7db708 100644 --- a/data_analysis/compare_scheduled_and_rt.py +++ b/data_analysis/compare_scheduled_and_rt.py @@ -315,7 +315,7 @@ def build_summary( return summary -def create_GTFS_data_list(schedule_feeds: List[dict] = None, cta_download: bool = True) -> dict: +def create_GTFS_data_list(schedule_feeds: List[dict] = None) -> dict: if schedule_feeds is None: schedule_feeds = create_schedule_list(month=5, year=2022) @@ -324,6 +324,12 @@ def create_GTFS_data_list(schedule_feeds: List[dict] = None, cta_download: bool pbar = tqdm(schedule_feeds) for feed in pbar: schedule_version = feed["schedule_version"] + # Files with .zip suffix come from the CTA directly. + # Otherwise, they come from transitfeeds.com + if schedule_version.endswith('.zip'): + cta_download = True + else: + cta_download = False pbar.set_description( f"Generating daily schedule data for " f"schedule version {schedule_version}" @@ -364,22 +370,21 @@ def create_GTFS_data_list(schedule_feeds: List[dict] = None, cta_download: bool 'schedule_data_list': schedule_data_list } -def main(freq: str = 'D', schedule_feeds: List[dict] = None, - cta_download: bool = True) -> Tuple[List[dict],pd.DataFrame, pd.DataFrame]: +def main(freq: str = 'D', schedule_feeds: List[dict] = None + ) -> Tuple[List[dict],pd.DataFrame, pd.DataFrame]: """Calculate the summary by route and day across multiple schedule versions Args: freq (str): Frequency of aggregation. Defaults to Daily. schedule_feeds (List[dict]): List of dictionaries with the keys 'schedule_version', 'feed_start_date', and 'feed_end_date'. - cta_download (bool): whether data is coming from the CTA directy (transitchicago.com) Returns: pd.DataFrame: A DataFrame of every day in the specified data with scheduled and observed count of trips. pd.DataFrame: A DataFrame summary across versioned schedule comparisons. 
""" - schedule_data_list = create_GTFS_data_list(schedule_feeds, cta_download=cta_download)['schedule_data_list'] + schedule_data_list = create_GTFS_data_list(schedule_feeds)['schedule_data_list'] agg_info = AggInfo(freq=freq) combined_long, combined_grouped = combine_real_time_rt_comparison( diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index ee6ca3b..b1cb47e 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -162,8 +162,12 @@ def compare_realtime_sched( schedule_data_list = [] for elt in s3_data_list: schedule_data_list.append({'schedule_version': elt['fname'], 'data': create_route_summary(elt['data'], date_range)}) + + for sched in schedule_data_list: + start_date = sched['data']['date'].min() + end_date = sched['date']['date'].max() + print(f"Date range is {start_date} to {end_date} for schedule_version {sched['schedule_version']}") - print(f'--> schedule_data_list is {schedule_data_list}') print('Creating combined_long_df and summary_df') combined_long_df, summary_df = csrt.main(schedule_feeds=schedule_list_filtered) From ef111074654330a39128f666ca91416aa85ed165 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 15 Oct 2023 18:31:07 -0500 Subject: [PATCH 46/62] fix key error --- scrape_data/cta_data_downloads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index b1cb47e..4a22792 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -165,7 +165,7 @@ def compare_realtime_sched( for sched in schedule_data_list: start_date = sched['data']['date'].min() - end_date = sched['date']['date'].max() + end_date = sched['data']['date'].max() print(f"Date range is {start_date} to {end_date} for schedule_version {sched['schedule_version']}") print('Creating combined_long_df and summary_df') From dc4d3d9f54c6ed4468d636ba5040127bb8f03080 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 16 Oct 2023 19:20:40 -0500 Subject: [PATCH 47/62] Add save_path argument --- scrape_data/cta_data_downloads.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 4a22792..1ead135 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -196,6 +196,7 @@ def compare_realtime_sched( json_file=f'{save_name}.json', start_date=start_date, end_date=end_date, + save_path=s3_data_json_path, save=False ) # Save data.json to s3 for now. This will eventually live in the frontend repo. From c84fc8cbe0f11a1d3f1a852517306a0f95b77954 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 17 Oct 2023 11:22:38 -0500 Subject: [PATCH 48/62] change path name --- scrape_data/cta_data_downloads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 1ead135..b516f52 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -193,7 +193,7 @@ def compare_realtime_sched( print(f'Saving data.json to {s3_data_json_path}') data_json = plots.create_frontend_json( - json_file=f'{save_name}.json', + json_file=f'{save_name}', start_date=start_date, end_date=end_date, save_path=s3_data_json_path, From cc02a8f21dd886a861535f3b5d2f98fdb256f906 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 17 Oct 2023 17:45:58 -0500 Subject: [PATCH 49/62] Add .json extension. 
Create scratch folder --- data_analysis/plots.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data_analysis/plots.py b/data_analysis/plots.py index a94620a..e9b4157 100644 --- a/data_analysis/plots.py +++ b/data_analysis/plots.py @@ -1032,7 +1032,7 @@ def save_json(summary_gdf_geo: gpd.GeoDataFrame, summary_kwargs: dict, save_name save_name (str): name of the json output file """ # Make directory for GitHub actions - + Path(DATA_PATH).mkdir(parents=True, exist_ok=True) path_name = create_save_path(save_name, DATA_PATH) # Take only the columns related to summary_kwargs['column'] # and those used in the map @@ -1061,7 +1061,7 @@ def create_frontend_json(json_file: str, start_date: str, end_date: str, save_pa ValueError: If save is True, the save_path argument cannot be None. """ json_path = create_save_path(json_file, DATA_PATH) - with open(json_path) as json_data: + with open(f'{json_path}.json') as json_data: data = json.load(json_data) data['dates'] = {'start': start_date, 'end': end_date} if save: From 3f89030d804484d9d34b4ca5069d4d87bf9e141f Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Thu, 19 Oct 2023 20:15:42 -0500 Subject: [PATCH 50/62] Add lineplot json data --- scrape_data/cta_data_downloads.py | 77 ++++++++++++------------------- 1 file changed, 30 insertions(+), 47 deletions(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index b516f52..5bbfcac 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -7,6 +7,7 @@ from io import StringIO, BytesIO import pandas as pd import typing +import update_data ACCESS_KEY = sys.argv[1] SECRET_KEY = sys.argv[2] @@ -135,60 +136,26 @@ def download_s3_file(fname: str) -> sga.GTFSFeed: client.download_fileobj(Bucket=sga.BUCKET, Key=fname, Fileobj=zip_bytes) zipfilesched = sga.zipfile.ZipFile(zip_bytes) return zipfilesched - -def compare_realtime_sched( - date_range: typing.List[str] = ['2022-05-20', today]) -> None: - print(f'--> Date range is {date_range}') - zip_filename_list, found_list = find_s3_zipfiles(date_range=date_range) - schedule_list_filtered = find_transitfeeds_zipfiles(zip_filename_list, found_list) - print(f'--> schedule_list_filtered: {schedule_list_filtered}') - # Extract data from s3 zipfiles - s3_data_list = [] - for fname in found_list: - data = download_s3_file(fname) - s3_data_list.append({'fname': fname, 'data': data}) - - for tfdict in schedule_list_filtered: - version = tfdict['schedule_version'] - full_name = f"transitfeeds_schedule_zipfiles_raw/{version}.zip" - tfdata = download_s3_file(full_name) - s3_data_list.append({'fname': version, 'data': tfdata}) - - # Convert from list of dictionaries to dictionary with list values - print(f's3_data_list is {s3_data_list}') - joined_dict = pd.DataFrame(s3_data_list).to_dict(orient='list') +def compare_realtime_sched() -> None: - schedule_data_list = [] - for elt in s3_data_list: - schedule_data_list.append({'schedule_version': elt['fname'], 'data': create_route_summary(elt['data'], date_range)}) - - for sched in schedule_data_list: - start_date = sched['data']['date'].min() - end_date = sched['data']['date'].max() - print(f"Date range is {start_date} to {end_date} for schedule_version {sched['schedule_version']}") - print('Creating combined_long_df and summary_df') - combined_long_df, summary_df = csrt.main(schedule_feeds=schedule_list_filtered) + combined_long_df, summary_df = csrt.main() - day_type = 'wk' - combined_long_df = plots.filter_day_type(combined_long_df, 
day_type=day_type) start_date = combined_long_df["date"].min().strftime("%Y-%m-%d") end_date = combined_long_df["date"].max().strftime("%Y-%m-%d") - print(f'---> Start date is {start_date}') - print(f'---> End date is {end_date}') - if start_date == end_date: - raise ValueError('Start date and end date should be different.') - summary_gdf_geo = plots.create_summary_gdf_geo(combined_long_df, summary_df, day_type=day_type) - summary_kwargs = {'column': 'ratio'} - save_name = f"all_routes_{start_date}_to_{end_date}_{day_type}" - - plots.save_json( - summary_gdf_geo=summary_gdf_geo, - summary_kwargs=summary_kwargs, - save_name=save_name + + data_update = update_data.DataUpdate( + combined_long_df=combined_long_df, + summary_df=summary_df, + start_date=start_date, + end_date=end_date ) + update_data.update_interactive_map_data(data_update) + day_type = 'wk' + save_name = f"all_routes_{start_date}_to_{end_date}_{day_type}" + s3_data_json_path = 'frontend_data_files/data.json' print(f'Saving data.json to {s3_data_json_path}') @@ -205,7 +172,23 @@ def compare_realtime_sched( f'{s3_data_json_path}')\ .put(Body=data_json) - _ = keys(csrt.BUCKET_PUBLIC, ['data.json']) + _ = keys(csrt.BUCKET_PUBLIC, [s3_data_json_path]) + + # Create and save json for lineplots + lineplots_path_name = 'schedule_vs_realtime_all_day_types_routes' + s3_schedule_vs_realtime_path = f'frontend_data_files/{lineplots_path_name}.json' + + update_data.update_lineplot_data(data_update) + with open(plots.DATA_PATH / f"{lineplots_path_name}_{start_date}_to_{end_date}.json") as json_data: + lineplot_json = plots.json.load(json_data) + + print(f'Saving {s3_schedule_vs_realtime_path}') + s3.Object( + csrt.BUCKET_PUBLIC, + f'{s3_schedule_vs_realtime_path}')\ + .put(Body=lineplot_json) + + _ = keys(csrt.BUCKET_PUBLIC, [s3_schedule_vs_realtime_path]) def confirm_saved_files(file_dict: dict) -> None: From 87eddee0aaa2049e15fdb97dcc96aa2767ff26e4 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sat, 21 Oct 2023 17:31:09 -0500 Subject: [PATCH 51/62] create schedule_feeds if None --- data_analysis/compare_scheduled_and_rt.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/data_analysis/compare_scheduled_and_rt.py b/data_analysis/compare_scheduled_and_rt.py index d7db708..a03b8b2 100644 --- a/data_analysis/compare_scheduled_and_rt.py +++ b/data_analysis/compare_scheduled_and_rt.py @@ -315,10 +315,21 @@ def build_summary( return summary -def create_GTFS_data_list(schedule_feeds: List[dict] = None) -> dict: - if schedule_feeds is None: - schedule_feeds = create_schedule_list(month=5, year=2022) +def create_GTFS_data_list(schedule_feeds: List[dict]) -> dict: + """ Create list of GTFS data for each schedule version + + Args: + schedule_feeds (List[dict]): List of dictionaries with the keys + 'schedule_version', 'feed_start_date', and 'feed_end_date'. + Returns: + dict: A dictionary with keys 'GTFS_data_list' and 'schedule_data_list'. + 'GTFS_data_list' is a list of dictionaries with the keys 'schedule_version' + and 'data', which is the extracted data from the GTFS zip file. + 'schedule_data_list' is a list of dictionaries with the same keys as 'GTFS_data_list', + except that 'data' here is the route_daily_summary. + """ + GTFS_data_list = [] schedule_data_list = [] pbar = tqdm(schedule_feeds) @@ -384,6 +395,9 @@ def main(freq: str = 'D', schedule_feeds: List[dict] = None pd.DataFrame: A DataFrame summary across versioned schedule comparisons. 
""" + if schedule_feeds is None: + schedule_feeds = create_schedule_list(month=5, year=2022) + schedule_data_list = create_GTFS_data_list(schedule_feeds)['schedule_data_list'] agg_info = AggInfo(freq=freq) From 81d06695163670f2957006dacd948892a669bd08 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sat, 21 Oct 2023 18:56:35 -0500 Subject: [PATCH 52/62] move Path.mkdir to cta_data_downloads.py --- data_analysis/plots.py | 1 - scrape_data/cta_data_downloads.py | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/data_analysis/plots.py b/data_analysis/plots.py index e9b4157..cc7ef3e 100644 --- a/data_analysis/plots.py +++ b/data_analysis/plots.py @@ -1032,7 +1032,6 @@ def save_json(summary_gdf_geo: gpd.GeoDataFrame, summary_kwargs: dict, save_name save_name (str): name of the json output file """ # Make directory for GitHub actions - Path(DATA_PATH).mkdir(parents=True, exist_ok=True) path_name = create_save_path(save_name, DATA_PATH) # Take only the columns related to summary_kwargs['column'] # and those used in the map diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 5bbfcac..d37ba93 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -151,6 +151,9 @@ def compare_realtime_sched() -> None: start_date=start_date, end_date=end_date ) + # Create directory on the GitHub Action runner. + plots.Path(plots.DATA_PATH).mkdir(parents=True, exist_ok=True) + update_data.update_interactive_map_data(data_update) day_type = 'wk' From 4db28f061c65387cc9ea1a7d801f41673e650600 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sat, 21 Oct 2023 20:44:09 -0500 Subject: [PATCH 53/62] Check the save path in update_data.py --- scrape_data/cta_data_downloads.py | 2 +- update_data.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index d37ba93..7eab416 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -160,7 +160,6 @@ def compare_realtime_sched() -> None: save_name = f"all_routes_{start_date}_to_{end_date}_{day_type}" s3_data_json_path = 'frontend_data_files/data.json' - print(f'Saving data.json to {s3_data_json_path}') data_json = plots.create_frontend_json( json_file=f'{save_name}', @@ -170,6 +169,7 @@ def compare_realtime_sched() -> None: save=False ) # Save data.json to s3 for now. This will eventually live in the frontend repo. + print(f'Saving {s3_data_json_path}') s3.Object( csrt.BUCKET_PUBLIC, f'{s3_data_json_path}')\ diff --git a/update_data.py b/update_data.py index 553620f..3961750 100644 --- a/update_data.py +++ b/update_data.py @@ -170,9 +170,9 @@ def update_interactive_map_data(data_update: DataUpdate) -> None: # JSON files for frontend interactive map by day type for day_type in plots.DAY_NAMES.keys(): summary_df_mean_day = plots.filter_day_type(summary_df_mean, day_type=day_type) - save_path = ( - plots.DATA_PATH / f"all_routes_{start_date}_to_{end_date}_{day_type}" - ) + save_name = f"all_routes_{start_date}_to_{end_date}_{day_type}" + save_path = plots.create_save_path(save_name, dir_name=plots.DATA_PATH) + print(f'--> Saving {save_path}') summary_df_mean_day.to_json( f"{save_path}.json", date_format="iso", orient="records" ) From f89f9f8fee319e8de6d4b92d3d36d7e1d48a2051 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 22 Oct 2023 13:22:14 -0500 Subject: [PATCH 54/62] create GeoJSON files. 
Fix save paths --- scrape_data/cta_data_downloads.py | 3 +- update_data.py | 85 +++++++++---------------------- 2 files changed, 26 insertions(+), 62 deletions(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 7eab416..ff090ae 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -182,7 +182,8 @@ def compare_realtime_sched() -> None: s3_schedule_vs_realtime_path = f'frontend_data_files/{lineplots_path_name}.json' update_data.update_lineplot_data(data_update) - with open(plots.DATA_PATH / f"{lineplots_path_name}_{start_date}_to_{end_date}.json") as json_data: + lineplot_json_file = plots.create_save_path(lineplots_path_name, plots.DATA_PATH) + with open(f"{lineplot_json_file}.json") as json_data: lineplot_json = plots.json.load(json_data) print(f'Saving {s3_schedule_vs_realtime_path}') diff --git a/update_data.py b/update_data.py index 3961750..f3f5bc4 100644 --- a/update_data.py +++ b/update_data.py @@ -121,63 +121,16 @@ def update_interactive_map_data(data_update: DataUpdate) -> None: summary_df = data_update.summary_df.copy() start_date = data_update.start_date end_date = data_update.end_date + summary_gdf_geo = plots.create_summary_gdf_geo(combined_long_df, summary_df) - # Remove 74 Fullerton bus from data - combined_long_df = combined_long_df.loc[combined_long_df["route_id"] != "74"] - summary_df = summary_df.loc[summary_df["route_id"] != "74"] - - route_daily_mean = ( - combined_long_df.groupby(["route_id"])["trip_count_rt"] - .mean() - .round(1) - .reset_index() - ) - - route_daily_mean.rename( - columns={"trip_count_rt": "avg_trip_count_rt"}, inplace=True - ) - - summary_df_mean = summary_df.merge(route_daily_mean, on="route_id") - - combined_long_df.loc[:, "date"] = pd.to_datetime(combined_long_df["date"]) - - # Add ridership data to summary_df_mean - ridership_by_rte_date = plots.fetch_ridership_data() - - ridership_end_date = ridership_by_rte_date["date"].max().strftime("%Y-%m-%d") - - merged_df = plots.merge_ridership_combined( - combined_long_df=combined_long_df, - ridership_df=ridership_by_rte_date, - start_date=start_date, - ridership_end_date=ridership_end_date, - ) - - daily_means_riders = plots.calculate_trips_per_rider(merged_df) - - # This is the average trip count corresponding to the ridership data, - # which is usually a few months out of date. So we can drop it here and use - # the up-to-date avg_trip_count_rt in summary_df_mean. 
- - daily_means_riders.drop(columns="avg_trip_count_rt", inplace=True) - - summary_df_mean = summary_df_mean.merge(daily_means_riders, on="route_id") - - # Skip route_id and day_type in the percentile and ranking calculations - for col in summary_df_mean.columns[2:]: - summary_df_mean = plots.calculate_percentile_and_rank(summary_df_mean, col=col) # JSON files for frontend interactive map by day type + summary_kwargs = {'column': 'ratio'} for day_type in plots.DAY_NAMES.keys(): - summary_df_mean_day = plots.filter_day_type(summary_df_mean, day_type=day_type) + summary_df_mean_day = plots.filter_day_type(summary_gdf_geo, day_type=day_type) save_name = f"all_routes_{start_date}_to_{end_date}_{day_type}" - save_path = plots.create_save_path(save_name, dir_name=plots.DATA_PATH) - print(f'--> Saving {save_path}') - summary_df_mean_day.to_json( - f"{save_path}.json", date_format="iso", orient="records" - ) - summary_df_mean_day.to_html(f"{save_path}_table.html", index=False) - + plots.save_json(summary_df_mean_day, summary_kwargs=summary_kwargs, save_name=save_name) + def update_lineplot_data(data_update: DataUpdate) -> None: """Refresh data for lineplots of bus performance over time @@ -205,36 +158,46 @@ def update_lineplot_data(data_update: DataUpdate) -> None: # JSON files for lineplots json_cols = ["date", "trip_count_rt", "trip_count_sched", "ratio", "route_id"] - + all_day_types_path = plots.create_save_path( + f"schedule_vs_realtime_all_day_types_routes_{start_date}_to_{end_date}", + plots.DATA_PATH + ) combined_long_df[json_cols].to_json( - plots.DATA_PATH / f"schedule_vs_realtime_all_day_types_routes_" - f"{start_date}_to_{end_date}.json", + all_day_types_path, date_format="iso", orient="records", ) combined_long_df_wk = plots.filter_day_type(combined_long_df, "wk") + wk_path = plots.create_save_path("schedule_vs_realtime_wk_routes" + f"_{start_date}_to_{end_date}", plots.DATA_PATH) combined_long_df_wk[json_cols].to_json( - plots.DATA_PATH / f"schedule_vs_realtime_wk_routes" - f"_{start_date}_to_{end_date}.json", + wk_path, date_format="iso", orient="records", ) json_cols.pop() combined_long_groupby_date = plots.groupby_long_df(combined_long_df, "date") + all_day_types_overall_path = plots.create_save_path( + f"schedule_vs_realtime_all_day_types_overall_" + f"{start_date}_to_{end_date}", + plots.DATA_PATH + ) combined_long_groupby_date[json_cols].to_json( - plots.DATA_PATH / f"schedule_vs_realtime_all_day_types_overall_" - f"{start_date}_to_{end_date}.json", + all_day_types_overall_path, date_format="iso", orient="records", ) combined_long_groupby_date_wk = plots.groupby_long_df(combined_long_df_wk, "date") - combined_long_groupby_date_wk[json_cols].to_json( + wk_overall_path = plots.create_save_path( + f"schedule_vs_realtime_wk_overall_{start_date}_to_{end_date}", plots.DATA_PATH - / f"schedule_vs_realtime_wk_overall_{start_date}_to_{end_date}.json", + ) + combined_long_groupby_date_wk[json_cols].to_json( + wk_overall_path, date_format="iso", orient="records", ) From 2e4e28e5cb7981c56551b292eb82ed4b4e34830b Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 22 Oct 2023 18:29:20 -0500 Subject: [PATCH 55/62] create 'ratio' column --- scrape_data/cta_data_downloads.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index ff090ae..a37fffa 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -141,7 +141,10 @@ def compare_realtime_sched() -> None: 
print('Creating combined_long_df and summary_df') combined_long_df, summary_df = csrt.main() - + combined_long_df.loc[:, "ratio"] = ( + combined_long_df.loc[:, "trip_count_rt"] + / combined_long_df.loc[:, "trip_count_sched"] + ) start_date = combined_long_df["date"].min().strftime("%Y-%m-%d") end_date = combined_long_df["date"].max().strftime("%Y-%m-%d") From 31139ba7527eef717faccddf75cf7175f661be01 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 22 Oct 2023 20:27:28 -0500 Subject: [PATCH 56/62] fix path name --- scrape_data/cta_data_downloads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index a37fffa..4fcb448 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -181,7 +181,7 @@ def compare_realtime_sched() -> None: _ = keys(csrt.BUCKET_PUBLIC, [s3_data_json_path]) # Create and save json for lineplots - lineplots_path_name = 'schedule_vs_realtime_all_day_types_routes' + lineplots_path_name = f'schedule_vs_realtime_all_day_types_routes_{start_date}_to_{end_date}' s3_schedule_vs_realtime_path = f'frontend_data_files/{lineplots_path_name}.json' update_data.update_lineplot_data(data_update) From 675bb008bfeef9bd36da1760845fb927b682bbbb Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Sun, 22 Oct 2023 21:11:34 -0500 Subject: [PATCH 57/62] Fix path name --- scrape_data/cta_data_downloads.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 4fcb448..d07c0de 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -181,8 +181,9 @@ def compare_realtime_sched() -> None: _ = keys(csrt.BUCKET_PUBLIC, [s3_data_json_path]) # Create and save json for lineplots - lineplots_path_name = f'schedule_vs_realtime_all_day_types_routes_{start_date}_to_{end_date}' - s3_schedule_vs_realtime_path = f'frontend_data_files/{lineplots_path_name}.json' + lineplots_download_name = 'schedule_vs_realtime_all_day_types_routes' + lineplots_path_name = f'{lineplots_download_name}_{start_date}_to_{end_date}' + s3_schedule_vs_realtime_path = f'frontend_data_files/{lineplots_download_name}.json' update_data.update_lineplot_data(data_update) lineplot_json_file = plots.create_save_path(lineplots_path_name, plots.DATA_PATH) From 0134f35d353bb48a7f5efb949e19f209f7a9444a Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 23 Oct 2023 17:26:17 -0500 Subject: [PATCH 58/62] Add .json to file path --- scrape_data/cta_data_downloads.py | 2 +- update_data.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index d07c0de..9a0998f 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -165,7 +165,7 @@ def compare_realtime_sched() -> None: s3_data_json_path = 'frontend_data_files/data.json' data_json = plots.create_frontend_json( - json_file=f'{save_name}', + json_file=save_name, start_date=start_date, end_date=end_date, save_path=s3_data_json_path, diff --git a/update_data.py b/update_data.py index f3f5bc4..97ca88e 100644 --- a/update_data.py +++ b/update_data.py @@ -163,7 +163,7 @@ def update_lineplot_data(data_update: DataUpdate) -> None: plots.DATA_PATH ) combined_long_df[json_cols].to_json( - all_day_types_path, + f"{all_day_types_path}.json", date_format="iso", orient="records", ) @@ -172,7 +172,7 @@ def update_lineplot_data(data_update: 
DataUpdate) -> None: wk_path = plots.create_save_path("schedule_vs_realtime_wk_routes" f"_{start_date}_to_{end_date}", plots.DATA_PATH) combined_long_df_wk[json_cols].to_json( - wk_path, + f"{wk_path}.json", date_format="iso", orient="records", ) @@ -185,7 +185,7 @@ def update_lineplot_data(data_update: DataUpdate) -> None: plots.DATA_PATH ) combined_long_groupby_date[json_cols].to_json( - all_day_types_overall_path, + f"{all_day_types_overall_path}.json", date_format="iso", orient="records", ) @@ -197,7 +197,7 @@ def update_lineplot_data(data_update: DataUpdate) -> None: plots.DATA_PATH ) combined_long_groupby_date_wk[json_cols].to_json( - wk_overall_path, + f"{wk_overall_path}.json", date_format="iso", orient="records", ) From bc85e5f5e4d55c3d6bb190023e97968deb041af5 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 23 Oct 2023 19:16:10 -0500 Subject: [PATCH 59/62] remove datetime in JSON --- update_data.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/update_data.py b/update_data.py index 97ca88e..7a07381 100644 --- a/update_data.py +++ b/update_data.py @@ -156,6 +156,8 @@ def update_lineplot_data(data_update: DataUpdate) -> None: start_date = data_update.start_date end_date = data_update.end_date + # Remove the datetime. + combined_long_df.loc[:, 'date'] = combined_long_df.loc[:, 'date'].dt.strftime('%Y-%m-%d') # JSON files for lineplots json_cols = ["date", "trip_count_rt", "trip_count_sched", "ratio", "route_id"] all_day_types_path = plots.create_save_path( From 80fac255d898f37b6adbfcec069795607b26aadb Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Mon, 23 Oct 2023 21:17:16 -0500 Subject: [PATCH 60/62] convert lineplot_json to bytes --- scrape_data/cta_data_downloads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrape_data/cta_data_downloads.py b/scrape_data/cta_data_downloads.py index 9a0998f..41a1161 100644 --- a/scrape_data/cta_data_downloads.py +++ b/scrape_data/cta_data_downloads.py @@ -194,7 +194,7 @@ def compare_realtime_sched() -> None: s3.Object( csrt.BUCKET_PUBLIC, f'{s3_schedule_vs_realtime_path}')\ - .put(Body=lineplot_json) + .put(Body=bytes(plots.json.dumps(lineplot_json, default=str).encode())) _ = keys(csrt.BUCKET_PUBLIC, [s3_schedule_vs_realtime_path]) From 23f96538114d0e15921dfd94e5d2f5ebb313a4b3 Mon Sep 17 00:00:00 2001 From: dcjohnson24 Date: Tue, 9 Jan 2024 12:59:07 -0600 Subject: [PATCH 61/62] Add workflow_dispatch. 
---
 .github/workflows/cta_data_downloads.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/cta_data_downloads.yml b/.github/workflows/cta_data_downloads.yml
index 1dd6f49..b306145 100644
--- a/.github/workflows/cta_data_downloads.yml
+++ b/.github/workflows/cta_data_downloads.yml
@@ -1,6 +1,7 @@
 name: Automate CTA schedule and realtime downloads
 
 on:
+  workflow_dispatch:
   push:
     branches:
       - 'automate-schedule-downloads'

From d5fa57b754b5e2c32d0998260f6e2a3c35cb8891 Mon Sep 17 00:00:00 2001
From: dcjohnson24
Date: Wed, 10 Jan 2024 12:04:57 -0600
Subject: [PATCH 62/62] Pass day_type to summary_gdf_geo to create correct rankings

---
 update_data.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/update_data.py b/update_data.py
index 7a07381..d1c5718 100644
--- a/update_data.py
+++ b/update_data.py
@@ -121,15 +121,14 @@ def update_interactive_map_data(data_update: DataUpdate) -> None:
     summary_df = data_update.summary_df.copy()
     start_date = data_update.start_date
     end_date = data_update.end_date
-    summary_gdf_geo = plots.create_summary_gdf_geo(combined_long_df, summary_df)
 
     # JSON files for frontend interactive map by day type
     summary_kwargs = {'column': 'ratio'}
     for day_type in plots.DAY_NAMES.keys():
-        summary_df_mean_day = plots.filter_day_type(summary_gdf_geo, day_type=day_type)
+        summary_gdf_geo_day = plots.create_summary_gdf_geo(combined_long_df, summary_df, day_type=day_type)
         save_name = f"all_routes_{start_date}_to_{end_date}_{day_type}"
-        plots.save_json(summary_df_mean_day, summary_kwargs=summary_kwargs, save_name=save_name)
+        plots.save_json(summary_gdf_geo_day, summary_kwargs=summary_kwargs, save_name=save_name)
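
For readers tracing the refactor in PATCH 50: the slimmed-down compare_realtime_sched() hands its DataFrames to update_data through a single DataUpdate object. The sketch below is a hypothetical reconstruction of that container, inferred only from the keyword arguments at the call site; the authoritative definition lives in update_data.py and may carry additional fields.

    from dataclasses import dataclass
    import pandas as pd

    @dataclass
    class DataUpdate:
        # Fields inferred from the call in scrape_data/cta_data_downloads.py;
        # anything beyond these four is speculation.
        combined_long_df: pd.DataFrame  # per-route, per-date realtime vs. scheduled trip counts
        summary_df: pd.DataFrame        # aggregated summary across the date range
        start_date: str                 # 'YYYY-MM-DD', min of combined_long_df['date']
        end_date: str                   # 'YYYY-MM-DD', max of combined_long_df['date']

Bundling the four values keeps the update_interactive_map_data and update_lineplot_data signatures stable while later patches rework what happens inside them.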
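
PATCHES 49, 53, 57, and 58 all chase the same class of bug: one side of a reader/writer pair appending '.json' while the other does not. Below is a minimal sketch of the convention they converge on, assuming create_save_path simply joins a directory and a bare file stem; the real helper and the real DATA_PATH constant live in data_analysis/plots.py and may do more.

    from pathlib import Path

    DATA_PATH = Path('scratch')  # stand-in; the real value is defined in plots.py

    def create_save_path(save_name: str, dir_name: Path = DATA_PATH) -> str:
        # Assumed behavior: return the path *without* an extension, so every
        # caller appends '.json' (or '_table.html') explicitly and writers
        # stay in sync with readers.
        return str(dir_name / save_name)

    save_path = create_save_path('all_routes_2022-05-20_to_2023-10-23_wk')
    # writer: df.to_json(f'{save_path}.json', ...)
    # reader: open(f'{save_path}.json')

Making every call site spell out the extension is what lets PATCH 49's reader (open(f'{json_path}.json')) and PATCH 58's writers agree on the same file names.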
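
PATCHES 55 and 59 are small but easy to misread in diff form; together they make combined_long_df frontend-ready. A self-contained rerun of both fixes on stand-in data:

    import pandas as pd

    combined_long_df = pd.DataFrame({
        'date': pd.to_datetime(['2023-10-01', '2023-10-02']),
        'trip_count_rt': [90, 80],
        'trip_count_sched': [100, 100],
    })

    # PATCH 55: the realtime/scheduled ratio the interactive map colors by.
    combined_long_df.loc[:, 'ratio'] = (
        combined_long_df.loc[:, 'trip_count_rt']
        / combined_long_df.loc[:, 'trip_count_sched']
    )

    # PATCH 59: drop the midnight time component so to_json emits plain
    # 'YYYY-MM-DD' strings rather than full ISO datetimes.
    combined_long_df.loc[:, 'date'] = combined_long_df.loc[:, 'date'].dt.strftime('%Y-%m-%d')

    print(combined_long_df.to_json(orient='records'))
    # [{"date":"2023-10-01","trip_count_rt":90,"trip_count_sched":100,"ratio":0.9}, ...]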
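
PATCH 60's bytes(...encode()) wrapper is the series' S3 upload pattern in miniature. Here is a sketch of that put flow with an illustrative bucket name standing in for csrt.BUCKET_PUBLIC; note that str.encode() already returns bytes, so the extra bytes() call in the patch is harmless but redundant.

    import json
    import boto3

    s3 = boto3.resource('s3')
    BUCKET = 'example-public-bucket'  # stand-in for csrt.BUCKET_PUBLIC

    def put_json(key: str, payload: dict) -> None:
        # default=str stringifies values json cannot serialize natively
        # (e.g. pandas Timestamps); .encode() yields the bytes Body expects.
        s3.Object(BUCKET, key).put(Body=json.dumps(payload, default=str).encode())

    put_json('frontend_data_files/data.json',
             {'dates': {'start': '2022-05-20', 'end': '2023-10-23'}})

After each put, the scripts call the repo's keys() helper on the same key, which is a cheap way to fail the GitHub Actions job immediately if an upload silently produced nothing.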