generated from streamlit/Interactive-Data-Explorer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvert_csv_to_parquet.py
68 lines (61 loc) · 2.45 KB
/
convert_csv_to_parquet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import pandas as pd
# Per-league, per-group Fanflux intensity CSV exports to merge into one table.
csv_files = [
    "data/Fanflux_Intensity_MLB_AAPI.csv",
    "data/Fanflux_Intensity_MLB_American_Indian.csv",
    "data/Fanflux_Intensity_MLB_Asian.csv",
    "data/Fanflux_Intensity_MLB_Black.csv",
    "data/Fanflux_Intensity_MLB_Hispanic.csv",
    "data/Fanflux_Intensity_MLB_White.csv",
    "data/Fanflux_Intensity_NBA_AAPI.csv",
    # NOTE(review): this is the only "American Indian" path spelled with a
    # space instead of an underscore (compare the MLB and NHL entries).
    # Confirm it matches the file name on disk before changing it.
    "data/Fanflux_Intensity_NBA_American Indian.csv",
    "data/Fanflux_Intensity_NBA_Asian.csv",
    "data/Fanflux_Intensity_NBA_Black.csv",
    "data/Fanflux_Intensity_NBA_Hispanic.csv",
    "data/Fanflux_Intensity_NBA_White.csv",
    "data/Fanflux_Intensity_NFL_Black.csv",
    "data/Fanflux_Intensity_NFL_Hispanic.csv",
    "data/Fanflux_Intensity_NFL_White.csv",
    "data/Fanflux_Intensity_NHL_AAPI.csv",
    "data/Fanflux_Intensity_NHL_American_Indian.csv",
    "data/Fanflux_Intensity_NHL_Asian.csv",
    "data/Fanflux_Intensity_NHL_Black.csv",
    "data/Fanflux_Intensity_NHL_Hispanic.csv",
    "data/Fanflux_Intensity_NHL_White.csv",
    # Add paths for MLS files as needed
]

# Read every CSV and stack the rows into one DataFrame. ignore_index=True
# discards each file's own row index and renumbers the combined rows 0..n-1.
df_list = [pd.read_csv(file) for file in csv_files]
df_all = pd.concat(df_list, ignore_index=True)

# Clean the data: remove the 'mapping' column if any input file supplied it.
if 'mapping' in df_all.columns:
    df_all.drop(columns=['mapping'], inplace=True)
# Convert columns to appropriate data types
def convert_columns_to_numeric(df, columns):
    """Coerce the named columns of *df* to a numeric dtype, in place.

    Each name in *columns* that is present in ``df.columns`` is run through
    ``pd.to_numeric(..., errors='coerce')``, so values that cannot be parsed
    as numbers become NaN instead of raising. Names that are not present in
    the frame are skipped silently.

    Args:
        df: The pandas DataFrame to modify. It is mutated in place.
        columns: Iterable of column names to convert. A single string is
            treated as one column name rather than being iterated character
            by character.

    Returns:
        The same ``df`` object that was passed in, so the call can be used
        as ``df = convert_columns_to_numeric(df, cols)``.
    """
    # A bare string is itself iterable; wrap it so it names one column.
    if isinstance(columns, str):
        columns = [columns]
    for col in columns:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df
# Income-bracket columns whose values must be numeric in the output table.
income_columns = [
    "Struggling (Less than $10,000)",
    "Getting By ($10,000 to $14,999)",
    "Getting By ($15,000 to $19,999)",
    "Starting Out ($20,000 to $24,999)",
    "Starting Out ($25,000 to $29,999)",
    "Starting Out ($30,000 to $34,999)",
    "Middle Class ($35,000 to $39,999)",
    "Middle Class ($40,000 to $44,999)",
    "Middle Class ($45,000 to $49,999)",
    "Comfortable ($50,000 to $59,999)",
    "Comfortable ($60,000 to $74,999)",
    "Doing Well ($75,000 to $99,999)",
    "Prosperous ($100,000 to $124,999)",
    "Prosperous ($125,000 to $149,999)",
    "Wealthy ($150,000 to $199,999)",
    "Affluent ($200,000 or more)",
]

# Run the numeric coercion over every income bracket present in the frame.
df_all = convert_columns_to_numeric(df_all, columns=income_columns)

# Write the cleaned, combined table out as a single Parquet file.
df_all.to_parquet("data/Fanflux_Intensity_All_Leagues_Cleaned.parquet")