From d86260f5fe36681b8d5572504f31ad653312b24c Mon Sep 17 00:00:00 2001 From: n03an Date: Sat, 17 Aug 2024 21:49:08 +0800 Subject: [PATCH] fix: singapore rainfall data cleanup --- .../1.0_data_cleaning_cch.ipynb | 584 ++++++++++++++++-- 1 file changed, 533 insertions(+), 51 deletions(-) diff --git a/Assorted_Projects/singapore_rainfall/1.0_data_cleaning_cch.ipynb b/Assorted_Projects/singapore_rainfall/1.0_data_cleaning_cch.ipynb index 6bc2d45..d27ca0f 100644 --- a/Assorted_Projects/singapore_rainfall/1.0_data_cleaning_cch.ipynb +++ b/Assorted_Projects/singapore_rainfall/1.0_data_cleaning_cch.ipynb @@ -21,19 +21,7 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'pandas'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[1], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mglob\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'pandas'" - ] - } - ], + "outputs": [], "source": [ "import glob\n", "import pandas as pd" @@ -48,19 +36,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Combining the separate CSV files into one\n", "raw = pd.concat(\n", - " [pd.read_csv(f) for f in glob.glob(\"../raw/*.csv\")], ignore_index=True\n", + " [pd.read_csv(f) for f in glob.glob(\"raw/*.csv\")], ignore_index=True\n", ")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -76,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -86,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -101,9 +89,39 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 13483 entries, 0 to 13482\n", + "Data columns (total 16 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Station 13483 non-null object \n", + " 1 Year 13483 non-null int64 \n", + " 2 Month 13483 non-null int64 \n", + " 3 Day 13483 non-null int64 \n", + " 4 Daily Rainfall Total (mm) 13483 non-null float64 \n", + " 5 Highest 30 Min Rainfall (mm) 13483 non-null object \n", + " 6 Highest 60 Min Rainfall (mm) 13483 non-null object \n", + " 7 Highest 120 Min Rainfall (mm) 13483 non-null object \n", + " 8 Mean Temperature (°C) 13483 non-null float64 \n", + " 9 Maximum Temperature (°C) 13483 non-null float64 \n", + " 10 Minimum Temperature (°C) 13483 non-null float64 \n", + " 11 Mean Wind Speed (km/h) 13473 non-null float64 \n", + " 12 Max Wind Speed (km/h) 13472 non-null float64 \n", + " 13 Date 13483 non-null datetime64[ns]\n", + " 14 Month_Name 13483 non-null object \n", + " 15 Quarter 13483 non-null int32 \n", + "dtypes: datetime64[ns](1), float64(6), int32(1), int64(3), object(5)\n", + "memory usage: 1.6+ MB\n" + ] + } + ], "source": [ "raw.info()" ] @@ -117,7 +135,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -131,7 +149,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -148,7 +166,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -171,7 +189,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -180,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -189,9 +207,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 13483 entries, 12323 to 4385\n", + "Data columns (total 12 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Date 13483 non-null datetime64[ns]\n", + " 1 Year 13483 non-null int64 \n", + " 2 Month 13483 non-null int64 \n", + " 3 Month_Name 13483 non-null object \n", + " 4 Quarter 13483 non-null int32 \n", + " 5 Day 13483 non-null int64 \n", + " 6 Daily Rainfall Total (mm) 13483 non-null float64 \n", + " 7 Mean Temperature (°C) 13483 non-null float64 \n", + " 8 Maximum Temperature (°C) 13483 non-null float64 \n", + " 9 Minimum Temperature (°C) 13483 non-null float64 \n", + " 10 Mean Wind Speed (km/h) 13483 non-null float64 \n", + " 11 Max Wind Speed (km/h) 13483 non-null float64 \n", + "dtypes: datetime64[ns](1), float64(6), int32(1), int64(3), object(1)\n", + "memory usage: 1.3+ MB\n" + ] + } + ], "source": [ "weather.info()\n", "# no null values" @@ -199,18 +243,231 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Date', 'Year', 'Month', 'Month_Name', 'Quarter', 'Day',\n", + " 'Daily Rainfall Total (mm)', 'Mean Temperature (°C)',\n", + " 'Maximum Temperature (°C)', 'Minimum Temperature (°C)',\n", + " 'Mean Wind Speed (km/h)', 'Max Wind Speed (km/h)'],\n", + " dtype='object')" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "weather.columns" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DateYearMonthQuarterDayDaily Rainfall Total (mm)Mean Temperature (°C)Maximum Temperature (°C)Minimum Temperature (°C)Mean Wind Speed (km/h)Max Wind Speed (km/h)
count1348313483.00000013483.00000013483.00000013483.00000013483.00000013483.00000013483.00000013483.00000013483.00000013483.000000
mean2001-06-16 00:00:002000.9579476.5104212.50515515.7287705.81227527.66900531.52276924.9047477.44569134.048553
min1983-01-01 00:00:001983.0000001.0000001.0000001.0000000.00000022.80000023.60000020.2000000.2000004.700000
25%1992-03-24 12:00:001992.0000004.0000002.0000008.0000000.00000026.90000030.80000024.0000004.80000028.800000
50%2001-06-16 00:00:002001.0000007.0000003.00000016.0000000.00000027.70000031.80000024.9000006.80000033.100000
75%2010-09-07 12:00:002010.00000010.0000004.00000023.0000004.40000028.60000032.50000025.8000009.70000038.200000
max2019-11-30 00:00:002019.00000012.0000004.00000031.000000216.20000030.90000036.00000029.10000022.20000090.700000
stdNaN10.6545573.4427451.1161058.79997114.4180901.1753341.5711841.2681233.4748448.031279
\n", + "
" + ], + "text/plain": [ + " Date Year Month Quarter \\\n", + "count 13483 13483.000000 13483.000000 13483.000000 \n", + "mean 2001-06-16 00:00:00 2000.957947 6.510421 2.505155 \n", + "min 1983-01-01 00:00:00 1983.000000 1.000000 1.000000 \n", + "25% 1992-03-24 12:00:00 1992.000000 4.000000 2.000000 \n", + "50% 2001-06-16 00:00:00 2001.000000 7.000000 3.000000 \n", + "75% 2010-09-07 12:00:00 2010.000000 10.000000 4.000000 \n", + "max 2019-11-30 00:00:00 2019.000000 12.000000 4.000000 \n", + "std NaN 10.654557 3.442745 1.116105 \n", + "\n", + " Day Daily Rainfall Total (mm) Mean Temperature (°C) \\\n", + "count 13483.000000 13483.000000 13483.000000 \n", + "mean 15.728770 5.812275 27.669005 \n", + "min 1.000000 0.000000 22.800000 \n", + "25% 8.000000 0.000000 26.900000 \n", + "50% 16.000000 0.000000 27.700000 \n", + "75% 23.000000 4.400000 28.600000 \n", + "max 31.000000 216.200000 30.900000 \n", + "std 8.799971 14.418090 1.175334 \n", + "\n", + " Maximum Temperature (°C) Minimum Temperature (°C) \\\n", + "count 13483.000000 13483.000000 \n", + "mean 31.522769 24.904747 \n", + "min 23.600000 20.200000 \n", + "25% 30.800000 24.000000 \n", + "50% 31.800000 24.900000 \n", + "75% 32.500000 25.800000 \n", + "max 36.000000 29.100000 \n", + "std 1.571184 1.268123 \n", + "\n", + " Mean Wind Speed (km/h) Max Wind Speed (km/h) \n", + "count 13483.000000 13483.000000 \n", + "mean 7.445691 34.048553 \n", + "min 0.200000 4.700000 \n", + "25% 4.800000 28.800000 \n", + "50% 6.800000 33.100000 \n", + "75% 9.700000 38.200000 \n", + "max 22.200000 90.700000 \n", + "std 3.474844 8.031279 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "weather.describe()\n", "# The Daily Rainfall cols have some obvious outliers. But let's deal with that later, as and when required" @@ -218,9 +475,159 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DateYearMonthMonth_NameQuarterDayDaily Rainfall Total (mm)Mean Temperature (°C)Maximum Temperature (°C)Minimum Temperature (°C)Mean Wind Speed (km/h)Max Wind Speed (km/h)
123232019-11-30201911November4309.427.632.125.06.828.1
123222019-11-29201911November42973.426.532.024.06.131.3
123212019-11-28201911November4289.027.831.425.37.627.4
123202019-11-27201911November4271.827.531.525.47.231.3
123192019-11-26201911November4260.028.533.025.710.434.9
\n", + "
" + ], + "text/plain": [ + " Date Year Month Month_Name Quarter Day \\\n", + "12323 2019-11-30 2019 11 November 4 30 \n", + "12322 2019-11-29 2019 11 November 4 29 \n", + "12321 2019-11-28 2019 11 November 4 28 \n", + "12320 2019-11-27 2019 11 November 4 27 \n", + "12319 2019-11-26 2019 11 November 4 26 \n", + "\n", + " Daily Rainfall Total (mm) Mean Temperature (°C) \\\n", + "12323 9.4 27.6 \n", + "12322 73.4 26.5 \n", + "12321 9.0 27.8 \n", + "12320 1.8 27.5 \n", + "12319 0.0 28.5 \n", + "\n", + " Maximum Temperature (°C) Minimum Temperature (°C) \\\n", + "12323 32.1 25.0 \n", + "12322 32.0 24.0 \n", + "12321 31.4 25.3 \n", + "12320 31.5 25.4 \n", + "12319 33.0 25.7 \n", + "\n", + " Mean Wind Speed (km/h) Max Wind Speed (km/h) \n", + "12323 6.8 28.1 \n", + "12322 6.1 31.3 \n", + "12321 7.6 27.4 \n", + "12320 7.2 31.3 \n", + "12319 10.4 34.9 " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "weather.head()" ] @@ -251,16 +658,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ - "monthly_rain = pd.read_csv('../raw/monthly_data/monthly_rain.csv')" + "monthly_rain = pd.read_csv('raw/monthly_data/monthly_rain.csv')" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -271,7 +678,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -282,7 +689,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -303,9 +710,84 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Total_Monthly_Rainfall (mm)YearMonth
44531.620192
44672.220193
447174.820194
44869.020195
449173.820196
\n", + "
" + ], + "text/plain": [ + " Total_Monthly_Rainfall (mm) Year Month\n", + "445 31.6 2019 2\n", + "446 72.2 2019 3\n", + "447 174.8 2019 4\n", + "448 69.0 2019 5\n", + "449 173.8 2019 6" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "monthly_rain.tail()" ] @@ -319,16 +801,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ - "mean_temp = pd.read_csv('../raw/monthly_data/monthly_temp_mean.csv')" + "mean_temp = pd.read_csv('raw/monthly_data/monthly_temp_mean.csv')" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -339,7 +821,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -354,7 +836,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -381,16 +863,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ - "max_temp = pd.read_csv('../raw/monthly_data/monthly_temp_max.csv')" + "max_temp = pd.read_csv('raw/monthly_data/monthly_temp_max.csv')" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -401,7 +883,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -416,7 +898,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -451,7 +933,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6" + "version": "3.11.9" } }, "nbformat": 4,