bigmart.py
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('Train.csv')
# check for categorical attributes
cat_col = []
for x in df.dtypes.index:
    if df.dtypes[x] == 'object':
        cat_col.append(x)
cat_col.remove('Item_Identifier')
cat_col.remove('Outlet_Identifier')
# impute missing Item_Weight with the mean weight of that item (overall mean as fallback)
item_weight_mean = df.pivot_table(values='Item_Weight', index='Item_Identifier')
miss_bool = df['Item_Weight'].isnull()
for i, item in enumerate(df['Item_Identifier']):
    if miss_bool[i]:
        if item in item_weight_mean.index:
            df.loc[i, 'Item_Weight'] = item_weight_mean.loc[item, 'Item_Weight']
        else:
            df.loc[i, 'Item_Weight'] = df['Item_Weight'].mean()
# impute missing Outlet_Size with the modal size of each Outlet_Type
outlet_size_mode = df.pivot_table(values='Outlet_Size', columns='Outlet_Type', aggfunc=(lambda x: x.mode()[0]))
miss_bool = df['Outlet_Size'].isnull()
df.loc[miss_bool, 'Outlet_Size'] = df.loc[miss_bool, 'Outlet_Type'].apply(lambda x: outlet_size_mode.loc['Outlet_Size', x])
# replace zeros with mean
df['Item_Visibility'] = df['Item_Visibility'].replace(0, df['Item_Visibility'].mean())
# combine item fat content
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({'LF':'Low Fat', 'reg':'Regular', 'low fat':'Low Fat'})
df['Item_Fat_Content'].value_counts()
#Creation of New Attributes
df['New_Item_Type'] = df['Item_Identifier'].apply(lambda x: x[:2])
df['New_Item_Type'] = df['New_Item_Type'].map({'FD':'Food', 'NC':'Non-Consumable', 'DR':'Drinks'})
df.loc[df['New_Item_Type']=='Non-Consumable', 'Item_Fat_Content'] = 'Non-Edible'
# derive outlet age from the establishment year (data collected in 2013)
df['Outlet_Years'] = 2013 - df['Outlet_Establishment_Year']
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Outlet'] = le.fit_transform(df['Outlet_Identifier'])
cat_col = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'New_Item_Type']
for col in cat_col:
    df[col] = le.fit_transform(df[col])
#Input Split
X = df.drop(columns=['Outlet_Establishment_Year', 'Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales'])
Y = df['Item_Outlet_Sales']
#Model Training
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
def train(model, X, Y):
    # fit the model on the full training set
    model.fit(X, Y)
    # predict on the training set
    pred = model.predict(X)
    # perform 5-fold cross-validation (scores are negative MSE)
    cv_score = cross_val_score(model, X, Y, scoring='neg_mean_squared_error', cv=5)
    cv_score = np.abs(np.mean(cv_score))
    # report training error and cross-validated error
    print('Train RMSE:', np.sqrt(mean_squared_error(Y, pred)))
    print('CV MSE:', cv_score)
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
train(model, X, Y)
# rank features by importance
coef = pd.Series(model.feature_importances_, X.columns).sort_values(ascending=False)
# save the trained model to disk
with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)
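
# A minimal usage sketch (an assumption, not part of the original script):
# reload the pickled model and predict on a few rows of the already
# preprocessed features X, just to show the save/load round trip. A real
# test set would first need the same cleaning and encoding steps as above.
with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
print('Reloaded model sample predictions:', loaded_model.predict(X.head()))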