Skip to content

Commit

Permalink
Merge pull request #3846 from airqo-platform/reporting-with-llm
Browse files Browse the repository at this point in the history
auto reporting
  • Loading branch information
Baalmart authored Nov 27, 2024
2 parents 13de854 + e6544e0 commit 4135652
Show file tree
Hide file tree
Showing 5 changed files with 419 additions and 3 deletions.
6 changes: 4 additions & 2 deletions src/spatial/configure.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@ class Config:
BIGQUERY_SATELLITE_MODEL_PREDICTIONS = os.getenv(
"BIGQUERY_SATELLITE_MODEL_PREDICTIONS"
)


ANALTICS_URL = os.getenv("ANALTICS_URL")
HUGGING_FACE_TOKEN = os.getenv("HUGGING_FACE_TOKEN")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
class ProductionConfig(Config):
DEBUG = False
TESTING = False
Expand Down
13 changes: 13 additions & 0 deletions src/spatial/controllers/controllers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from views.satellite_predictions import SatellitePredictionView
from views.site_category_view import SiteCategorizationView
from views.site_selection_views import SiteSelectionView
from views.report_view import ReportView


controller_bp = Blueprint("controller", __name__)
Expand Down Expand Up @@ -66,3 +67,15 @@ def site_selection():
@controller_bp.route("/satellite_prediction", methods=["POST"])
def get_satellite_prediction():
    """Handle POST /satellite_prediction.

    Delegates to ``SatellitePredictionView.make_predictions`` and returns
    whatever that view returns.
    """
    prediction_response = SatellitePredictionView.make_predictions()
    return prediction_response

@controller_bp.route("/air_quality_report", methods=["POST"])
def fetch_air_quality():
    """Handle POST /air_quality_report.

    Delegates to ``ReportView.generate_air_quality_report_with_gemini`` and
    returns whatever that view returns.
    """
    report_response = ReportView.generate_air_quality_report_with_gemini()
    return report_response

@controller_bp.route("/air_quality_report_without_llm", methods=["POST"])
def fetch_air_quality_without_llm():
    """Handle POST /air_quality_report_without_llm.

    Delegates to ``ReportView.generate_air_quality_report_without_llm`` and
    returns whatever that view returns.
    """
    report_response = ReportView.generate_air_quality_report_without_llm()
    return report_response

@controller_bp.route("/air_quality_report_with_customised_prompt", methods=["POST"])
def fetch_air_quality_with_customised_prompt():
    """Handle POST /air_quality_report_with_customised_prompt.

    Delegates to
    ``ReportView.generate_air_quality_report_with_customised_prompt_gemini``
    and returns whatever that view returns.
    """
    report_response = (
        ReportView.generate_air_quality_report_with_customised_prompt_gemini()
    )
    return report_response
281 changes: 281 additions & 0 deletions src/spatial/models/report_datafetcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,281 @@
import requests
import openai
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import login
from configure import Config
import google.generativeai as genai
import logging
from functools import lru_cache


# Configure API keys
# NOTE: everything below runs at import time, exactly once per process.
GOOGLE_API_KEY = Config.GOOGLE_API_KEY
genai.configure(api_key=GOOGLE_API_KEY)
hf_token = Config.HUGGING_FACE_TOKEN

# Only authenticate against the Hugging Face Hub when a token is configured;
# otherwise tell the operator which environment variable is missing.
if not hf_token:
    print("Hugging Face token is missing. Set the 'HUGGING_FACE_TOKEN' environment variable.")
else:
    login(hf_token)

class DataFetcher:
    """Fetch grid-level air quality data from the analytics endpoint."""

    # Upper bound (seconds) on a single analytics request. Without a timeout
    # ``requests`` waits indefinitely, which would pin a worker on a hung
    # upstream.
    REQUEST_TIMEOUT_SECONDS = 60

    @staticmethod
    def fetch_air_quality_data_a(grid_id, start_time, end_time):
        """Return the analytics payload for a grid and time window.

        Successful responses are memoised (up to 128 distinct queries).
        Failures are never cached, so a transient error does not poison
        later identical requests.

        Args:
            grid_id: Identifier of the grid to query.
            start_time: Start of the reporting window, sent as-is.
            end_time: End of the reporting window, sent as-is.

        Returns:
            The decoded JSON response, or ``None`` when the API token is not
            configured, the request fails, or the body is not valid JSON.
        """
        token = Config.AIRQO_API_TOKEN
        if token is None:
            logging.error("Error: AIRQO_API_TOKEN environment variable is not set.")
            return None

        try:
            return DataFetcher._fetch_cached(grid_id, start_time, end_time)
        except requests.exceptions.HTTPError as http_err:
            logging.error("HTTP error occurred: %s", DataFetcher._redact(http_err, token))
        except requests.exceptions.RequestException as req_err:
            logging.error("Request error occurred: %s", DataFetcher._redact(req_err, token))
        except ValueError as json_err:
            logging.error("JSON decoding error: %s", DataFetcher._redact(json_err, token))

        return None

    @staticmethod
    @lru_cache(maxsize=128)  # Cache up to 128 most recent successful queries
    def _fetch_cached(grid_id, start_time, end_time):
        """POST the query and return the decoded JSON; raise on any failure.

        ``lru_cache`` does not store raised exceptions, so only successful
        payloads end up in the cache.
        """
        url = f"{Config.ANALTICS_URL}?token={Config.AIRQO_API_TOKEN}"
        payload = {"grid_id": grid_id, "start_time": start_time, "end_time": end_time}
        response = requests.post(
            url, json=payload, timeout=DataFetcher.REQUEST_TIMEOUT_SECONDS
        )
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _redact(error, secret):
        """Render ``error`` as text with ``secret`` masked.

        The token travels in the query string, and request errors embed the
        URL in their message, so it must be scrubbed before logging.
        """
        message = str(error)
        return message.replace(str(secret), "***") if secret else message

class AirQualityReport:
    """Turn an analytics payload into audience-specific air quality reports.

    The constructor copies the fields it needs out of ``data['airquality']``.
    The ``generate_*`` methods then build either an LLM prompt (Gemini or
    OpenAI) or a fixed text template from those fields.
    """

    def __init__(self, data):
        """Parse ``data`` and set up the LLM clients.

        Args:
            data: Response dict from the analytics endpoint. Missing or
                ``None`` sections are tolerated and leave the corresponding
                attributes empty instead of raising.
        """
        self._load_payload(data)

        # Successful customised reports keyed by prompt. Kept on the instance
        # (rather than ``lru_cache`` on the method) so the cache dies with the
        # object and failed calls are never remembered.
        self._custom_report_cache = {}

        # Initialize models once in the constructor
        self.gemini_model = genai.GenerativeModel('gemini-pro')
        openai.api_key = Config.OPENAI_API_KEY

    def _load_payload(self, data):
        """Copy the report fields out of ``data`` onto the instance."""
        self.data = data
        airquality = (data or {}).get('airquality') or {}
        sites = airquality.get('sites') or {}
        period = airquality.get('period') or {}

        self.grid_name = sites.get('grid name', [None])
        # 'annual_pm' may be absent or an empty list; fall back to an empty
        # record so the ``.get`` lookups below cannot fail.
        annual_records = airquality.get('annual_pm') or [None]
        self.annual_data = annual_records[0] or {}
        self.daily_mean_data = airquality.get('daily_mean_pm') or []
        self.diurnal = airquality.get('diurnal') or []
        self.monthly_data = airquality.get('site_monthly_mean_pm') or []
        self.monthly_name_data = airquality.get('pm_by_month_name') or []
        self.site_annual_mean_pm = airquality.get('site_annual_mean_pm') or []
        self.site_mean_pm = airquality.get('site_mean_pm') or []

        # The first monthly record supplies the "main" site and coordinates.
        main_site_info = (self.monthly_data[0] if self.monthly_data else None) or {}
        self.main_site = main_site_info.get('site_name')
        self.site_names = [
            item.get('site_name') for item in self.site_annual_mean_pm
            if isinstance(item, dict)
        ]
        self.site_latitude = main_site_info.get('site_latitude')
        self.site_longitude = main_site_info.get('site_longitude')
        self.num_sites = sites.get('number_of_sites')
        # Keep only the leading 10 characters of each timestamp.
        self.starttime = (period.get('startTime') or '')[:10]
        self.endtime = (period.get('endTime') or '')[:10]

        self.annual_pm2_5_calibrated_value = self.annual_data.get("pm2_5_calibrated_value")
        self.annual_pm10_calibrated_value = self.annual_data.get("pm10_calibrated_value")

        # Finding the minimum and maximum values
        numeric_days = self._numeric_entries(self.daily_mean_data)
        if numeric_days:
            self.daily_min_pm2_5 = min(numeric_days, key=lambda x: x['pm2_5_calibrated_value'])
            self.daily_max_pm2_5 = max(numeric_days, key=lambda x: x['pm2_5_calibrated_value'])
        else:
            self.daily_min_pm2_5 = None
            self.daily_max_pm2_5 = None

    @staticmethod
    def _numeric_entries(entries):
        """Return the dict entries whose calibrated PM2.5 value is a number."""
        return [
            entry for entry in (entries or [])
            if isinstance(entry, dict)
            and isinstance(entry.get('pm2_5_calibrated_value'), (int, float))
        ]

    def _diurnal_extremes(self):
        """Return ``(peak, least)`` diurnal entries, or ``(None, None)`` if none are usable."""
        usable = self._numeric_entries(self.diurnal)
        if not usable:
            return None, None
        peak = max(usable, key=lambda x: x['pm2_5_calibrated_value'])
        least = min(usable, key=lambda x: x['pm2_5_calibrated_value'])
        return peak, least

    def _format_diurnal_peak(self):
        """Describe the hour with the highest diurnal PM2.5 value for use in a prompt."""
        peak, _ = self._diurnal_extremes()
        if peak is None:
            return "an undetermined hour (no diurnal data available)"
        return f"{peak.get('hour')}:00 hr ({peak['pm2_5_calibrated_value']} µg/m³)"

    def _prepare_base_info(self):
        """Return the sentences shared by every prompt: grid, period, sites, annual mean."""
        return (
            f"The air quality report is for {self.grid_name} for the period of {self.starttime} to {self.endtime}. "
            f"These air quality monitoring sites are {self.site_names} and measure PM2.5 and PM10, "
            f"at coordinates {self.site_latitude}°N, {self.site_longitude}°E. "
            # Interpolate the PM2.5 value itself; the full annual record is a
            # dict and does not read as a concentration.
            f"The annual PM2.5 concentration averages {self.annual_pm2_5_calibrated_value} µg/m³."
        )

    def _generate_prompt(self, audience):
        """Build the LLM prompt for ``audience``.

        Args:
            audience: One of ``"researcher"``, ``"policymaker"`` or
                ``"general public"``.

        Returns:
            The prompt text.

        Raises:
            ValueError: If ``audience`` is not one of the supported values.
        """
        base_info = self._prepare_base_info()
        if audience == "researcher":
            # The daily range is only known when at least one daily record
            # carried a numeric calibrated value.
            if self.daily_min_pm2_5 and self.daily_max_pm2_5:
                daily_line = (
                    f"Daily mean measurements show values ranging from {self.daily_min_pm2_5.get('pm2_5_calibrated_value')} "
                    f"to {self.daily_max_pm2_5.get('pm2_5_calibrated_value')} µg/m³.\n"
                )
            else:
                daily_line = "Daily mean measurements are not available for this period.\n"
            return (
                f"Generate a comprehensive air quality assessment report for {self.grid_name} for the period of {self.starttime} to {self.endtime}. Begin with a detailed introduction (100-130 words) covering the city's geographical location, climate characteristics, population density, and major pollution sources.\n"
                f"{base_info}\n"
                f"{daily_line}"
                f"Diurnal patterns indicate peak pollution levels at {self._format_diurnal_peak()}.\n"
                f"Monthly trends reveal fluctuations correlated with seasonal changes.\n"
                f"Provide a thorough analysis of spatial and temporal air quality variations, identify pollution hotspots and clean zones, examine seasonal patterns, and assess compliance with WHO guidelines. "
                f"Conclude with actionable recommendations for air quality improvement and public health protection. Data source: AirQo monitoring network."
            )

        elif audience == "policymaker":
            return (
                f"Create an executive summary of air quality conditions in {self.grid_name} for the period of {self.starttime} to {self.endtime}. for policy decision-making. Begin with key findings and their policy implications (50-75 words). "
                f"{base_info} include the period under review."
                f"Highlight critical trends: {self.monthly_data}. Diurnal patterns indicate: {self.diurnal}. "
                f"Focus on: 1) Areas exceeding air quality standards, 2) Population exposure risk assessment, "
                f"3) Economic implications of poor air quality. Present clear, actionable policy recommendations with expected outcomes and implementation timeframes. "
                f"Include cost-benefit considerations and potential regulatory measures. Data source: AirQo monitoring network."
            )
        elif audience == "general public":
            return (
                f"{base_info} include the period under review."
                f"Create a clear, easy-to-understand report about air quality in {self.grid_name} for the period of {self.starttime} to {self.endtime}. Start with a simple explanation of why air quality matters for public health. "
                f"We have {self.num_sites} air quality monitors in your area. The average PM2.5 level this year is {self.annual_pm2_5_calibrated_value} µg/m³. "
                f"Diurnal patterns indicate: {self.diurnal}. Monthly trends reveal: {self.monthly_data}. "
                f"Explain what these numbers mean for daily activities. Include: 1) When air quality is best and worst during the day, "
                f"2) Which areas have better or worse air quality, 3) Simple steps people can take to protect their health, "
                f"4) How to access daily air quality updates. Use plain language and avoid technical terms. "
                f"Add practical tips for reducing exposure to air pollution. Data source: AirQo monitoring network."
            )
        else:
            raise ValueError("Invalid audience type. Please specify 'researcher', 'policymaker', or 'general public'.")

    def generate_report_with_gemini(self, audience):
        """Generate a report for ``audience`` with Gemini.

        Returns:
            The report dict from ``_prepare_report_json``, or ``None`` if the
            model call fails.

        Raises:
            ValueError: If ``audience`` is not supported.
        """
        prompt = self._generate_prompt(audience)
        try:
            response = self.gemini_model.generate_content(prompt)
            gemini_output = response.text
            return self._prepare_report_json(gemini_output)
        except Exception as e:
            logging.error("Error generating Gemini report: %s", e)
            return None

    # Generate report with customised prompt
    def generate_report_with_customised_prompt_gemini(self, custom_prompt):
        """
        Generate an air quality report using a customised user-provided prompt.

        Successful results are cached per instance and per prompt, so asking
        the same question twice on one report object does not call the model
        again. Failures are not cached.

        Returns:
            The dict from ``_prepare_customised_report_json``, or ``None`` if
            the model call fails.
        """
        # ``setdefault`` keeps this safe on instances built without __init__.
        cache = self.__dict__.setdefault('_custom_report_cache', {})
        if custom_prompt in cache:
            return cache[custom_prompt]

        base_info = self._prepare_base_info()
        # NOTE(review): daily_mean_data is interpolated twice below (once
        # unlabeled, once after " daily "); confirm whether both are intended.
        full_prompt = (
            f"{base_info} include the period under review."
            f"diurnal patterns indicate: {self.diurnal}. "
            f"number of sites or devices or airqo binos: {self.num_sites}. "
            f"{self.daily_mean_data}"
            f"site mean{self.site_mean_pm}"
            f" daily {self.daily_mean_data}"
            f"{custom_prompt}"
        )
        try:
            response = self.gemini_model.generate_content(full_prompt)
            gemini_output = response.text
            report = self._prepare_customised_report_json(gemini_output)
        except Exception as e:
            logging.error("Error generating customised Gemini report: %s", e)
            return None

        cache[custom_prompt] = report
        return report

    def generate_report_with_openai(self, audience):
        """Generate a report for ``audience`` with OpenAI's chat completion API.

        Returns:
            The report dict from ``_prepare_report_json``, or ``None`` if the
            API call fails.

        Raises:
            ValueError: If ``audience`` is not supported.
        """
        prompt = self._generate_prompt(audience)
        messages = [{"role": "user", "content": prompt}]
        try:
            if hasattr(openai, "OpenAI"):
                # openai>=1.0 removed ``openai.ChatCompletion``; requests go
                # through a client object and replies are attribute-accessed.
                client = openai.OpenAI(api_key=Config.OPENAI_API_KEY)
                response = client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=messages
                )
                openai_output = response.choices[0].message.content
            else:
                # Legacy (<1.0) SDK.
                response = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=messages
                )
                openai_output = response.choices[0].message['content']
            return self._prepare_report_json(openai_output)
        except Exception as e:
            logging.error("Error generating OpenAI report: %s", e)
            return None

    # Use non-LLM template text as report content
    def generate_report_template_without_LLM(self, audience):
        """Return the report dict with the raw prompt for ``audience`` as its content."""
        prompt = self._generate_prompt(audience)
        report_content = prompt
        return self._prepare_report_json(report_content)

    def generate_report_without_llm(self):
        """Build a fixed-template report from the parsed data, without any LLM.

        Sections that depend on data missing from the payload (diurnal
        extremes, the daily range) are replaced or omitted instead of raising.

        Returns:
            The report dict from ``_prepare_report_json``.
        """
        # Determine peak time and least PM2.5 values
        peak_data, least_data = self._diurnal_extremes()

        # Site names may contain None when a record lacks 'site_name'.
        site_names = [str(name) for name in (self.site_names or []) if name is not None]

        introduction = (
            f"The air quality report for {self.grid_name} covers the period from {self.starttime} to {self.endtime}. "
            f"The {self.num_sites} monitored sites include: {', '.join(site_names)}. "
            f"Measurements are taken for PM2.5 and PM10 concentrations. "
            f"The annual average PM2.5 concentration is {self.annual_pm2_5_calibrated_value} µg/m³."
        )

        if peak_data is not None:
            peak_time = peak_data.get('hour')
            peak_pm2_5 = peak_data['pm2_5_calibrated_value']
            least_pm2_5 = least_data['pm2_5_calibrated_value']
            least_pm2_5_time = least_data.get('hour')
            diurnal_description = (
                f"Diurnal patterns observed include the following: {self.diurnal}. "
                f"These patterns provide insight into air quality fluctuations throughout the day. "
                f"The peak in PM2.5 {peak_pm2_5} levels occurs around {peak_time}:00 hr, indicating a period of higher pollution, often associated with increased activity or traffic. "
                f"Conversely, the period with the least PM2.5 {least_pm2_5} µg/m³ levels is around {least_pm2_5_time} :00 hr , "
                f"which usually represents a period of lower activity or better atmospheric dispersion. "
            )
        else:
            diurnal_description = "No usable diurnal measurements were available for this period. "
        diurnal_description += (
            "Understanding the patterns of pollution and their impacts on public health is crucial for effective environmental management and policy-making. "
            "Throughout this report, we will explore key trends in PM2.5 and PM10 concentrations, the diurnal variations, and the impact of these levels on air quality across the region."
        )

        daily_mean_description = (
            f"Daily mean PM2.5 measurements during the period were recorded as follows: {self.daily_mean_data}. "
            f"This data reveals variations in air quality on a day-to-day basis."
        )

        site_pm25_description = (
            f"The concentration of PM2.5 across different sites shows variability: "
            f"{', '.join([f'{site} with PM2.5 levels' for site in site_names])}. "
            f"These variations indicate site-specific air quality differences for the known grids."
        )

        # The daily range sentence is only written when both extremes exist.
        if self.daily_min_pm2_5 and self.daily_max_pm2_5:
            daily_range = (
                f" Specifically, for the period from {self.starttime} to {self.endtime},"
                f" the PM2.5 raw values ranged from {self.daily_min_pm2_5.get('pm2_5_raw_value')} µg/m³ on {self.daily_min_pm2_5.get('date')}"
                f" to {self.daily_max_pm2_5.get('pm2_5_raw_value')} µg/m³ on {self.daily_max_pm2_5.get('date')}."
            )
        else:
            daily_range = ""

        conclusion = (
            f"Overall, the air quality report highlights the importance of monitoring and understanding the patterns of PM2.5 and PM10 concentrations in the {self.grid_name}. "
            f"The analysis of the data reveals that air quality varies significantly over time, with periods of both moderate and unhealthy conditions. "
            f"It’s observed that these fluctuations may be influenced by various factors, including seasonal changes. For instance, the washout effect during the rainy"
            f" season could potentially contribute to these variations."
            f"{daily_range} "
            f"This pattern underscores the importance of continuous monitoring and the implementation of "
            f"effective interventions to maintain air quality within safe limits. Ensuring good air quality is crucial for "
            f"the well-being of both residents and visitors. Therefore, it’s imperative to adopt long-term "
            f"strategies and measures that can effectively mitigate the impact of factors leading to poor air quality. "
            f"In conclusion, continuous monitoring, timely intervention, and effective policies are key to maintaining good air quality and safeguarding public health. "
        )

        report_content = (
            f"{introduction}\n\n"
            f"{diurnal_description}\n\n"
            f"{daily_mean_description}\n\n"
            f"{site_pm25_description}\n\n"
            f"{conclusion}"
        )

        return self._prepare_report_json(report_content)

    def _prepare_report_json(self, report_content):
        """Wrap ``report_content`` with the data series that back the report."""
        return {
            "grid_name": self.grid_name,
            "main_site": self.main_site,
            "annual_data": self.annual_data,
            "daily_mean_data": self.daily_mean_data,
            "diurnal": self.diurnal,
            "monthly_data": self.monthly_data,
            "report": report_content
        }

    def _prepare_customised_report_json(self, report_content):
        """Wrap ``report_content`` with the grid name and reporting period only."""
        return {
            "grid_name": self.grid_name,
            "start_end_time": self.starttime + " to " + self.endtime,
            "report": report_content
        }
10 changes: 9 additions & 1 deletion src/spatial/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,12 @@ scikit-learn~=1.5.2
gcsfs~=2024.9.0.post1
joblib~=1.4.2
lightgbm~=4.1.0
numpy~=1.25.2
numpy~=1.25.2
numpy
torch
transformers
datasets
sentencepiece
huggingface_hub
google-generativeai
openai
Loading

0 comments on commit 4135652

Please sign in to comment.