Weather Data Plots #168
base: master
.gitignore:

@@ -8,3 +8,5 @@ localdata/
 *.iml
 datascratch/
 .gcpprj
+
+scripts/__pycache__
app.yaml:

@@ -2,4 +2,5 @@ runtime: python37
 entrypoint: gunicorn -b :$PORT main:app

 env_variables:
-  FOURSQUARE_DATA_VERSION: "20200503-v0"
+  FOURSQUARE_DATA_VERSION: "20200504-v0"
+  BUCKET_NAME: "vd-weather-data"

Review comment: Let's add a …
New Airflow DAG (new file):

@@ -0,0 +1,94 @@
from datetime import timedelta, datetime
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago
import json
from google.cloud.storage import Client
import requests
from deepmerge import always_merger
import os

BASE_URL = "http://api.weatherapi.com/v1/history.json?key={}&q={}+united+states&dt={}"
# Get a weatherapi.com api key
API_KEY = os.environ.get("API_WEATHER_KEY", "a70a4e2736644cdcb9d85348202404")

Review comment: Keys must never be committed to version control. This must be removed and the history squashed before merging. We can register the API key as an airflow secret.
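One way to do what the reviewer suggests is to read the key from an Airflow Variable (which can be backed by a secrets backend) instead of hardcoding it. A minimal sketch, not part of this PR; the variable name weather_api_key is an assumption:

# Sketch only: pull the key from an Airflow Variable instead of the source tree.
# "weather_api_key" is an assumed name, e.g. created via the Airflow UI or CLI.
from airflow.models import Variable

API_KEY = Variable.get("weather_api_key")  # fails loudly if the variable is not set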
BUCKET_NAME = os.environ.get("BUCKET_NAME", "default")

Review comment: Should fail if env not set, to force correct configuration, instead of silently choosing an invalid bucket name.
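A minimal sketch of the suggested fail-fast behaviour, again not part of the PR:

# Sketch only: refuse to start if BUCKET_NAME is not configured,
# rather than falling back to a bucket that does not exist.
BUCKET_NAME = os.environ.get("BUCKET_NAME")
if not BUCKET_NAME:
    raise RuntimeError("BUCKET_NAME environment variable must be set")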
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

gcs = Client()
bucket = gcs.bucket(BUCKET_NAME)
state_file = bucket.get_blob("states_counties.json")
STATES = json.loads(state_file.download_as_string())


def slugify_state(state):
    return "-".join(state.split())


def get_weather_data(query):
    weather = {}
    weather["forecast"] = {}
    date = datetime.today()
    full_url = BASE_URL.format(API_KEY, query, date.strftime('%Y-%m-%d'))
    response = requests.get(full_url)
    data = response.json()
    try:
        forecast = data["forecast"]["forecastday"][0]
        location = data["location"]
        forecast["day"].pop("condition")
        weather = {**weather, **location}

        weather["forecast"][forecast["date_epoch"]] = forecast["day"]
    except:
        return weather

    return weather


def weather_func_builder(state):
    selected_state = state
    def get_weather():
        data = {"updated": datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
        counties = STATES[selected_state]
        blob = bucket.get_blob("{}.json".format(selected_state))
        if blob is None:
            stated_cached_data = {}
        else:
            stated_cached_data = json.loads(blob.download_as_string())
        for county in counties:
            api_data = get_weather_data(county)
            cached_data = stated_cached_data.get(county, {})
            data[county] = always_merger.merge(cached_data, api_data)

        state_blob = bucket.blob("{}.json".format(selected_state))
        state_blob.upload_from_string(json.dumps(data))

        return True
    return get_weather

Review comment (on the state_blob line): Proposed: Let's use …


def create_dag(dag_id, state):
    dag = DAG(
        dag_id=dag_id,
        description="Weather DAG",
        default_args=default_args,
        schedule_interval='@daily'
    )

    get_data_api = PythonOperator(
        task_id="get-data-{}".format(slugify_state(state)),
        python_callable=weather_func_builder(state),
        dag=dag
    )

    return dag


for state in STATES.keys():
    dag_id = "{}-weather".format(slugify_state(state))
    globals()[dag_id] = create_dag(dag_id, state)
requirements.txt (Airflow ETL):

@@ -1,2 +1,4 @@
 apache-airflow==1.10.10
 google-cloud-storage==1.27.0
+requests
+deepmerge

Review comment: Pin version in requirements.txt so we know we're all running the same thing.
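For illustration only, pinned entries might look like the lines below; these exact version numbers are assumptions, not taken from this PR:

requests==2.23.0
deepmerge==0.1.0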
main.py (Flask app):

@@ -9,11 +9,13 @@
 import yaml
 from flask import Flask, redirect, render_template, request
 from google.cloud import storage
+from scripts.weather import get_state_weather_locally, get_state_weather_cloud


 app = Flask(__name__, static_url_path="", static_folder="static")
 app_state = {
     "maps_api_key": "",
+    "weather_path_data": "vd-weather-data",
     "foursquare_data_url": "",
     "foursquare_data_version": ""
 }

@@ -143,6 +145,13 @@ def data(path):
         snapshot_id=app_state['foursquare_data_version'])


+@app.route("/weather/<state>")
+def weather(state):
+    if app_state["weather_path_data"] != "":
+        return get_state_weather_cloud(state, app_state["weather_path_data"])
+    else:
+        return get_state_weather_locally(state)
+

 def page_not_found(e):
     return render_template('404.html'), 404

Review comment (on the new /weather/<state> route): Maybe we can just get this from …

@@ -183,11 +192,21 @@ def _init_data_env():
     app_state["foursquare_data_url"] =\
         f"//data.visitdata.org/processed/vendor/foursquare/asof/{foursquare_data_version}"

+def _init_weather_data_env():
+    # Gcloud bucket name
+    bucket_name = os.getenv("BUCKET_NAME", "vd-weather-data")
+
+    if bucket_name == "":
+        error("Weather data will be stored locally")
+
+    app_state["weather_path_data"] = bucket_name
+

 def _init():
     app.config["SEND_FILE_MAX_AGE_DEFAULT"] = 60
     app.register_error_handler(404, page_not_found)
     _init_maps_api_key()
+    _init_weather_data_env()
     _init_data_env()
     print(app_state)
requirements.txt (web app):

@@ -1,4 +1,6 @@
 Flask==1.1.1
-gunicorn==19.10.0
+gunicorn==20.0.
 google-cloud-storage==1.27.0
 pyyaml==5.3.1
+requests
+deepmerge

Review comment: Pin version in requirements.txt so we know we're all running the same thing.
scripts/config.py (new file):

@@ -0,0 +1,7 @@
BASE_URL = "http://api.weatherapi.com/v1/history.json?key={}&q={}+united+states&dt={}"
API_KEY = "a70a4e2736644cdcb9d85348202404"

Review comment: API keys must never be committed to version control. This must be removed and squashed before merging.

DATA_PATH = "localdata/"

NO_STATE_ERROR_RESPONSE = {
    "error": "There is no data for that state"
}
scripts/weather.py (new file):

@@ -0,0 +1,62 @@
import requests, json
from datetime import datetime, timedelta
from deepmerge import always_merger
from scripts.config import *
from google.cloud import storage


def get_weather_data(query, limit=30):
    weather = {}
    weather["forecast"] = {}
    for day in range(limit, -1, -1):
        date = datetime.today() - timedelta(days=day)
        full_url = BASE_URL.format(API_KEY, query, date.strftime('%Y-%m-%d'))
        response = requests.get(full_url)
        data = response.json()
        try:
            forecast = data["forecast"]["forecastday"][0]
            location = data["location"]
            forecast["day"].pop("condition")
            weather = {**weather, **location}

            weather["forecast"][forecast["date_epoch"]] = forecast["day"]
        except:
            continue

    return weather

Review comment (on get_weather_data): At runtime we shouldn't reach out to the weather API (and especially not make several calls - we will probably have our API key revoked if our traffic spikes). The ETL job already downloads this offline and puts it in a bucket, so we should access from there (and then probably cache in memory similar to …)
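A rough sketch of the bucket-backed read with an in-memory cache that this comment describes, reusing the storage and json imports above. The function and cache names are assumptions, not part of the PR:

# Sketch only: serve weather from the GCS bucket written by the ETL job,
# caching each state's JSON in memory after the first request.
_WEATHER_CACHE = {}

def get_state_weather_cached(state, bucket_name):
    if state not in _WEATHER_CACHE:
        bucket = storage.Client().bucket(bucket_name)
        blob = bucket.get_blob("{}.json".format(state))
        _WEATHER_CACHE[state] = json.loads(blob.download_as_string()) if blob else {}
    return _WEATHER_CACHE[state]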
def __load_state_file(state):
    try:
        with open("{}{}.json".format(DATA_PATH, state)) as f:
            data = json.load(f)
        return data
    except FileNotFoundError:
        return {}

Review comment: dunders are typically reserved for the Python Core API team. A single underscore is enough here.


def __update_state_file(state, data):
    with open("{}{}.json".format(DATA_PATH, state), "w+") as f:
        json.dump(data, f)

Review comment: Change dunder to single underscore.
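The rename the reviewer asks for would look like this (sketch only):

# A single leading underscore marks these as module-private helpers.
def _load_state_file(state): ...
def _update_state_file(state, data): ...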
def get_state_weather_locally(state):
    STATES = {}

    with open("states_counties.json") as f:
        STATES = json.load(f)

    if state not in STATES:
        return json.dumps(NO_STATE_ERROR_RESPONSE)
    cached_data = __load_state_file(state)
    for county in STATES[state]:
        weather_data = get_weather_data(county)
        county_data = cached_data.get(county, {})
        cached_data[county] = always_merger.merge(county_data, weather_data)
    __update_state_file(state, cached_data)
    return json.dumps(cached_data)

Review comment (on get_state_weather_locally): This was good for debug mode, but once the ETL works, we can probably remove the local access in favor of always reading from the bucket, similar to the visit data.


def get_state_weather_cloud(state, bucket_name):
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    file = bucket.get_blob("{}.json".format(state))
    return json.loads(file.download_as_string()) if file is not None else {}
Review comment: We might want to consolidate all of this under https://visitdata.org/data/ and present it as a single bundle that all goes together.

Review comment: @markroth8 The only issue is that it would increase the size of the initial data load for each page before it renders. If we assume that, in most cases, people will be looking at the visits and not the weather, it would be better to load the weather only upon request. In general, if we plan to add more data sources that might only be displayed based on the user's choices in the UI, an architecture that keeps those data sources separate might be better.