-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
154 lines (120 loc) · 6.65 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import os
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import geopandas as gpd
import geoplot
def cleaning_stations(data):
'''
Input:
- data: row Pandas DataFrame with information about the stations
Output:
- a geopandas dataframe with all relevant stations and geometry(x,y coordinate as points)
'''
mask = data.recaudoestacion.isin(['', '0', '00000', '01234', '06112', '22000', '12345'])
exclude_stations = tuple(data.recaudoestacion[mask])
# #Grupping by recaudoestacion because is a better id for the estation
# df = data[~mask].groupby('recaudoestacion')\
# [['nombreestacion','idlinea','latitud','longitud']].first().reset_index()
# #There are 10 duplicated stations (ID is different, but station should be the same)
# duplicated = {'04100':'04004', '57503':'07503', '50001':'04100','08100':'40000','40004':'40003',
# '50002':'09100', '50003':'09001', '50004':'04003', '50006':'04100', '50007':'07006',}
# df.recaudoestacion = df.recaudoestacion.replace(duplicated)
df = data[~mask].copy()
missing_xy = {'(50001) Portal Eldorado[Intermedium]' : (4.681520, -74.121143),
'(40000) Cable Portal Tunal': (4.568481,-74.139379),
'(40001) Juan Pablo II': (4.555476,-74.147523),
'(40002) Manitas': (4.550445,-74.150598),
'(40003) Mirador del Paraiso': (4.550019,-74.159009),
'(40004) Bicicletero Mirador del Paraíso': (4.550019,-74.159009),
'(08100) Portal Tunal Cable': (4.568481,-74.139379),
'Cable Portal Tunal(40000)' : (4.568481,-74.139379),
'Juan Pablo II(40001)' : (4.555476,-74.147523),
'Manitas(40002)' : (4.550445,-74.150598),
'Mirador del Paraiso(40003)': (4.550019,-74.159009),
'(14005) Las Aguas': (4.60255, -74.068687),
'Ampliacion San Mateo(57503)': (4.589146,-74.199496),
'Corral Molinos(50003)': (4.556805,-74.121705),
'Corral Avenida Ciudad de Cali(50004)':(4.702865,-74.100733),
'Corral Calle 40 Sur(50002)':(4.575937,-74.119233),
'EL CAMPIN(07106)':(4.645663,-74.078697),
'Corral General Santander(50007)':(4.593200,-74.128343),
'Corral Carrera 77(50006)': (4.698383,-74.094176),
'Centro Comercial Santa Fe(02001)':(4.763741, -74.044402),
'Las Aguas(14005)':(4.60255, -74.068687),
'(50007) Corral General Santander': (4.593200,-74.128343),
'(07106) EL CAMPIN':(4.645663,-74.078697),
'(02001) Centro Comercial Santa Fe': (4.763741, -74.044402),
'(50003) Corral Molinos':(4.556805,-74.121705),
'(50004) Corral Avenida Ciudad de Cali':(4.702865,-74.100733),
'(50002) Corral Calle 40 Sur':(4.575937,-74.119233),
'(57503) Ampliacion San Mateo':(4.589146,-74.199496),
'(50006) Corral Carrera 77':(4.698383,-74.094176)}
for key, value in missing_xy.items():
df.loc[df[df.nombreestacion == key].index[0], ['latitud', 'longitud']] = value
gdf = gpd.GeoDataFrame(df,
geometry = gpd.points_from_xy(df.longitud, df.latitud))
return gdf
def assign_strata(stations, blocks):
'''
Input:
- stations: row Pandas DataFrame with information about the stations
- blocks: shp file with strata information by block
Output:
- Stations geoPandas dataframe with an assigned strata
'''
stations = cleaning_stations(stations)
#Creating buffer ~500m
stations['catchment_area'] = stations.geometry.buffer(0.005) # 0.001° = 111 m
stations_buffer = gpd.GeoDataFrame(stations, geometry = 'catchment_area')
# Spatial Join stations buffer and blocks
estacion_estrata = gpd.sjoin(stations_buffer, blocks, how = 'left', op = 'intersects' )
# Average strata by BRT station
strata = estacion_estrata.groupby('idestacion').agg({'recaudoestacion':'first',
'nombreestacion':'first',
'ESTRATO':'mean',
'latitud':'first',
'longitud':'first',
'geometry':'first'})
# BRT stations outside the Bogota Metro Region are only located in Soacha, with a strata of 1.
strata.ESTRATO.fillna(1, inplace = True) # Replace nan values with strata 1
strata.ESTRATO = strata.ESTRATO.round(decimals = 0).astype(int)
return gpd.GeoDataFrame(strata, geometry = 'geometry')
def cleaning_transactions(transactions, stations):
'''
Creates a timestamp colum with transaction datetime information
Orders the table by user_id and transaction timestamp
Merge with a station unique Id (col = recaudoestacion)
Input:
-data: row transaccion data
-stations: clean stations with ids and unique "recaudoestacion"
Output
-Clean and ordered transactions dataframe
'''
data = transactions.copy()
#Creating dataetime column with dataetime format
data.horatransaccion = data.horatransaccion.astype(str)
data.fechatransaccion = data.fechatransaccion.astype(str)
data['sec'] = data.horatransaccion.apply(lambda x: x[-2:])
data['minute'] = data.horatransaccion.apply(lambda x: x[-4:-2])
data['hour'] = data.horatransaccion.apply(lambda x: x[:-4])
data['hour'].replace('', '0', inplace = True)
data['minute'].replace('', '0', inplace = True)
data['year'] = data.fechatransaccion.apply(lambda x: x[:4])
data['month'] = data.fechatransaccion.apply(lambda x: x[4:6])
data['day'] = data.fechatransaccion.apply(lambda x: x[6:8])
#Converting to a stand alone dataaframe for conversion
df = pd.DataFrame({'year': data['year'],
'month': data['month'],
'day': data['day'],
'hour': data['hour'],
'minutes':data['minute'] ,
'seconds': data['sec']})
data['datetime'] = pd.to_datetime(df)
df = data.merge(stations, how = 'left', on='idestacion') #merging station unique ID
df = df[~df.recaudoestacion.isnull()] # Remove observations with invalid station ID
df.sort_values(['idnumerotarjeta', 'datetime'], inplace = True )
cols_order = ['idnumerotarjeta', 'datetime', 'recaudoestacion', 'idtipotarjeta',
'idtipotarifa','saldopreviotransaccion', 'valor', 'saldodespues_transaccion']
return df[cols_order]