-
Notifications
You must be signed in to change notification settings - Fork 22
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Refactor fix update/code clean up #4042
Changes from all commits
fff6c3f
cc7f77e
b82fe30
0183031
88f3ffe
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -554,7 +554,6 @@ def aggregate_low_cost_sensors_data(data: pd.DataFrame) -> pd.DataFrame: | |||||
numeric_columns = data.select_dtypes(include=["number"]).columns | ||||||
numeric_columns = numeric_columns.difference(["device_number"]) | ||||||
data_for_aggregation = data[["timestamp", "device_id"] + list(numeric_columns)] | ||||||
|
||||||
aggregated = ( | ||||||
data_for_aggregation.groupby("device_id") | ||||||
.apply(lambda group: group.resample("1H", on="timestamp").mean()) | ||||||
|
@@ -744,20 +743,19 @@ def process_data_for_api(data: pd.DataFrame, frequency: Frequency) -> list: | |||||
devices = airqo_api.get_devices() | ||||||
|
||||||
device_lookup = { | ||||||
device["device_number"]: device | ||||||
for device in devices | ||||||
if device.get("device_number") | ||||||
device["device_id"]: device for device in devices if device.get("device_id") | ||||||
} | ||||||
|
||||||
for _, row in data.iterrows(): | ||||||
try: | ||||||
device_number = row["device_number"] | ||||||
device_id = row["device_id"] | ||||||
|
||||||
# Get device details from the lookup dictionary | ||||||
device_details = device_lookup.get(device_number) | ||||||
device_details = device_lookup.get(device_id) | ||||||
if not device_details: | ||||||
logger.exception( | ||||||
f"Device number {device_number} not found in device list." | ||||||
f"Device number {device_id} not found in device list." | ||||||
) | ||||||
continue | ||||||
|
||||||
|
@@ -766,7 +764,7 @@ def process_data_for_api(data: pd.DataFrame, frequency: Frequency) -> list: | |||||
"device_id": device_details["_id"], | ||||||
"site_id": row["site_id"], | ||||||
"device_number": device_number, | ||||||
"tenant": str(Tenant.AIRQO), | ||||||
"network": device_details["network"], | ||||||
"location": { | ||||||
"latitude": {"value": row["latitude"]}, | ||||||
"longitude": {"value": row["longitude"]}, | ||||||
|
@@ -832,7 +830,7 @@ def merge_aggregated_weather_data( | |||||
airqo_api = AirQoApi() | ||||||
sites: List[Dict[str, Any]] = [] | ||||||
|
||||||
for site in airqo_api.get_sites(tenant=Tenant.AIRQO): | ||||||
for site in airqo_api.get_sites(network="airqo"): | ||||||
sites.extend( | ||||||
[ | ||||||
{ | ||||||
|
@@ -894,7 +892,8 @@ def merge_aggregated_weather_data( | |||||
numeric_columns = measurements.select_dtypes(include=["number"]).columns | ||||||
numeric_columns = numeric_columns.difference(["device_number"]) | ||||||
numeric_counts = measurements[numeric_columns].notna().sum(axis=1) | ||||||
measurements = measurements[numeric_counts >= 1] | ||||||
# Rows with more than one numeric value | ||||||
measurements = measurements[numeric_counts > 1] | ||||||
return measurements | ||||||
|
||||||
@staticmethod | ||||||
|
@@ -1012,12 +1011,10 @@ def calibrate_data(data: pd.DataFrame) -> pd.DataFrame: | |||||
|
||||||
data["timestamp"] = pd.to_datetime(data["timestamp"]) | ||||||
sites = AirQoApi().get_sites() | ||||||
sites_df = pd.DataFrame(sites, columns=["_id", "city"]).rename( | ||||||
columns={"_id": "site_id"} | ||||||
) | ||||||
sites_df = pd.DataFrame(sites, columns=["site_id", "city"]) | ||||||
|
||||||
data = pd.merge(data, sites_df, on="site_id", how="left") | ||||||
data.dropna(subset=["device_id", "timestamp"], inplace=True) | ||||||
|
||||||
columns_to_fill = [ | ||||||
"s1_pm2_5", | ||||||
"s1_pm10", | ||||||
|
@@ -1027,9 +1024,9 @@ def calibrate_data(data: pd.DataFrame) -> pd.DataFrame: | |||||
"humidity", | ||||||
] | ||||||
|
||||||
data[columns_to_fill] = data[columns_to_fill].fillna(0) | ||||||
# TODO: Need to opt for a different approach eg forward fill, can't do here as df only has data of last 1 hour. Perhaps use raw data only? | ||||||
# May have to rewrite entire pipeline flow | ||||||
data[columns_to_fill] = data[columns_to_fill].fillna(0) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Re-evaluate filling NaN values with zero in sensor data. Filling missing sensor readings with zero may introduce inaccuracies since a zero value could be misinterpreted as a valid measurement rather than missing data. Consider using a method that preserves the distinction between missing and zero values, such as forward-filling or imputing with the mean or median where appropriate. - data[columns_to_fill] = data[columns_to_fill].fillna(0)
+ data[columns_to_fill] = data[columns_to_fill].interpolate(method='linear', limit_direction='forward', axis=0) 📝 Committable suggestion
Suggested change
|
||||||
|
||||||
# additional input columns for calibration | ||||||
data["avg_pm2_5"] = data[["s1_pm2_5", "s2_pm2_5"]].mean(axis=1).round(2) | ||||||
|
@@ -1052,9 +1049,12 @@ def calibrate_data(data: pd.DataFrame) -> pd.DataFrame: | |||||
"pm2_5_pm10_mod", | ||||||
] | ||||||
data[input_variables] = data[input_variables].replace([np.inf, -np.inf], 0) | ||||||
data.dropna(subset=input_variables, inplace=True) | ||||||
|
||||||
grouped_df = data.groupby("city", dropna=False) | ||||||
# Explicitly filter data to calibrate. | ||||||
to_calibrate = data["network"] == "airqo" | ||||||
data_to_calibrate = data.loc[to_calibrate] | ||||||
data_to_calibrate.dropna(subset=input_variables, inplace=True) | ||||||
grouped_df = data_to_calibrate.groupby("city", dropna=False) | ||||||
Comment on lines
+1053
to
+1057
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💡 Codebase verification Missing city values are handled with a default value The codebase shows that missing city values are handled by filling them with a default value "Missing City" before any groupby operations, as seen in Consider adding:
🔗 Analysis chainHandle potential missing 'city' values when grouping data. Grouping by 'city' when calibrating data assumes all entries have a 'city' value. If 'city' is missing for some records, this could lead to unexpected behavior. Run the following script to identify records with missing 'city' values: Let me analyze the codebase to understand how the city column is handled before and after the grouping operation. This will help us determine if there are any existing safeguards or potential issues with missing city values. This will help us understand how the city column is handled and if there are any existing safeguards for missing values before the groupby operation. 🏁 Scripts executedThe following scripts were executed for the analysis: Script: #!/bin/bash
# Description: Find records without 'city' after merging.
# Test: Check for missing 'city' values in data_to_calibrate. Expect: No records with missing 'city'.
python -c '
import pandas as pd
data = pd.read_csv("data.csv") # Replace with the actual data source
to_calibrate = data["network"] == "airqo"
data_to_calibrate = data.loc[to_calibrate]
missing_city = data_to_calibrate[data_to_calibrate["city"].isnull()]
if not missing_city.empty:
print("Records with missing city:")
print(missing_city)
'
Length of output: 447 Script: #!/bin/bash
# Let's analyze the codebase to understand the data processing flow and city handling
# First, find where the data is loaded and how city is handled
rg -l "city" --type py
# Then look for specific patterns related to city column handling
rg "city" --type py -A 3 -B 3
# Also check for any data validation or preprocessing of city values
ast-grep --pattern 'def $func($$$) {
$$$
city$$$
$$$
}'
Length of output: 52521 Script: #!/bin/bash
# Let's check how the city column is populated before the groupby operation
rg -A 5 -B 5 "data\[\"city\"\]|data\['city'\]|data\.city" --type py
# Also check for any data validation or preprocessing specifically for city values
rg "dropna.*city" --type py
rg "fillna.*city" --type py
# Look for any error handling around city values
rg "try.*city.*except" --type py -A 5
Length of output: 2973 |
||||||
|
||||||
rf_model = GCSUtils.get_trained_model_from_gcs( | ||||||
project_name=project_id, | ||||||
|
@@ -1071,6 +1071,8 @@ def calibrate_data(data: pd.DataFrame) -> pd.DataFrame: | |||||
), | ||||||
) | ||||||
for city, group in grouped_df: | ||||||
# What was the intention of this? | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @Mnoble-19 have you ever noticed this? |
||||||
# If the below condition fails, the rf_model and lasso_model default to the previously ones used and the ones set as "default" outside the forloop. | ||||||
if str(city).lower() in [c.value.lower() for c in CityModel]: | ||||||
try: | ||||||
rf_model = GCSUtils.get_trained_model_from_gcs( | ||||||
|
@@ -1087,6 +1089,7 @@ def calibrate_data(data: pd.DataFrame) -> pd.DataFrame: | |||||
) | ||||||
except Exception as ex: | ||||||
logger.exception(f"Error getting model: {ex}") | ||||||
continue | ||||||
group["pm2_5_calibrated_value"] = rf_model.predict(group[input_variables]) | ||||||
group["pm10_calibrated_value"] = lasso_model.predict(group[input_variables]) | ||||||
|
||||||
|
@@ -1100,15 +1103,20 @@ def calibrate_data(data: pd.DataFrame) -> pd.DataFrame: | |||||
data["pm2_5_raw_value"] = data[["s1_pm2_5", "s2_pm2_5"]].mean(axis=1) | ||||||
data["pm10_raw_value"] = data[["s1_pm10", "s2_pm10"]].mean(axis=1) | ||||||
if "pm2_5_calibrated_value" in data.columns: | ||||||
data["pm2_5"] = data["pm2_5_calibrated_value"] | ||||||
data.loc[to_calibrate, "pm2_5"] = data.loc[ | ||||||
to_calibrate, "pm2_5_calibrated_value" | ||||||
] | ||||||
else: | ||||||
data["pm2_5_calibrated_value"] = None | ||||||
data["pm2_5"] = None | ||||||
data.loc[to_calibrate, "pm2_5_calibrated_value"] = None | ||||||
data.loc[to_calibrate, "pm2_5"] = None | ||||||
if "pm10_calibrated_value" in data.columns: | ||||||
data["pm10"] = data["pm10_calibrated_value"] | ||||||
data.loc[to_calibrate, "pm10"] = data.loc[ | ||||||
to_calibrate, "pm10_calibrated_value" | ||||||
] | ||||||
else: | ||||||
data["pm10_calibrated_value"] = None | ||||||
data["pm10"] = None | ||||||
data.loc[to_calibrate, "pm10_calibrated_value"] = None | ||||||
data.loc[to_calibrate, "pm10"] = None | ||||||
|
||||||
data["pm2_5"] = data["pm2_5"].fillna(data["pm2_5_raw_value"]) | ||||||
data["pm10"] = data["pm10"].fillna(data["pm10_raw_value"]) | ||||||
|
||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🛠️ Refactor suggestion
Use appropriate logging level for missing device details.
The use of
logger.exception
is intended for logging exceptions along with stack traces. Since not finding a device in the lookup is not an exception but a possible data inconsistency, consider usinglogger.warning
instead.Apply this change to adjust the logging level:
Additionally, consider correcting the log message to reference "Device ID" instead of "Device number" for clarity.
📝 Committable suggestion