# code_project.py (91 lines, 74 loc, 3.35 KB)
# Forked from sourabhvarshney111/ML-Submission-Repo.
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Print the full path of every file found under the Kaggle input directory.
# os.walk yields (directory, subdirectories, files) for each directory it
# visits; the subdirectory list is not needed here.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import pandas as pd
def _engineer_features(raw):
    """Return a new frame with unused columns dropped and family size added.

    ``Cabin``, ``Ticket``, ``PassengerId`` and ``Name`` are removed.  ``SP``
    is ``SibSp + Parch + 1``; ``SibSp`` and ``Parch`` are dropped once they
    have been folded into it.  *raw* itself is not modified.
    """
    frame = raw.drop(['Cabin', 'Ticket', 'PassengerId', 'Name'], axis=1)
    frame['SP'] = frame['SibSp'] + frame['Parch'] + 1
    return frame.drop(['SibSp', 'Parch'], axis=1)


def _encode_categoricals(frame):
    """Return *frame* with ``Embarked`` and ``Sex`` replaced by dummy columns.

    Both columns are one-hot encoded with ``drop_first=True`` and appended
    after the remaining columns (Embarked dummies first, then Sex).  The
    ``male`` indicator is always renamed to ``Sex1`` so that the training and
    test frames expose identical feature names.
    """
    embarked_dummies = pd.get_dummies(frame['Embarked'], drop_first=True)
    sex_dummies = pd.get_dummies(frame['Sex'], drop_first=True)
    sex_dummies = sex_dummies.rename(columns={'male': 'Sex1'})
    remaining = frame.drop(['Embarked', 'Sex'], axis=1)
    return pd.concat([remaining, embarked_dummies, sex_dummies], axis=1)


train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')
test.info()

train1 = _engineer_features(train)
test1 = _engineer_features(test)

# Fill missing values by assigning the filled column back to the frame.
# Calling fillna(inplace=True) on a column selection acts on an intermediate
# object, which pandas does not guarantee to write back (and does not under
# Copy-on-Write).
train1['Embarked'] = train1['Embarked'].fillna('S')
train1['Age'] = train1['Age'].fillna(train1['Age'].mean())
train2 = _encode_categoricals(train1)

# Missing test fares are filled with the mean fare of test passengers who are
# male, have SP == 1, are in Pclass 3 and embarked at 'S'.
# NOTE(review): presumably this profile matches the row(s) with a missing
# fare -- confirm against the data.
fare_reference = test1[
    (test1['Sex'] == 'male')
    & (test1['SP'] == 1)
    & (test1['Pclass'] == 3)
    & (test1['Embarked'] == 'S')
]
test1['Fare'] = test1['Fare'].fillna(fare_reference['Fare'].mean())
test1['Age'] = test1['Age'].fillna(test1['Age'].mean())
test1.info()

# The test frame previously named the male indicator 'sex' while the training
# frame used 'Sex1'; a model fitted on one set of column names and asked to
# predict on another is rejected by recent scikit-learn releases.
test2 = _encode_categoricals(test1)
# Features and target for the labelled training data.
X = train2.drop(['Survived'], axis=1)
Y = train2.Survived.copy()

# random_state only has an effect when shuffle=True; newer scikit-learn
# releases raise a ValueError if a seed is given while shuffling is off.
kfolds = StratifiedKFold(n_splits=4, shuffle=True, random_state=2)
log_reg = LogisticRegression()

# Estimate out-of-sample accuracy with stratified 4-fold cross-validation.
fold_scores = []
for train_index, test_index in kfolds.split(X, Y):
    # split() yields positional row indices, so select with iloc rather than
    # the label-based loc.
    Xtrain, Ytrain = X.iloc[train_index], Y.iloc[train_index]
    Xtest, Ytest = X.iloc[test_index], Y.iloc[test_index]
    log_reg.fit(Xtrain, Ytrain)
    Ypredict = log_reg.predict(Xtest)
    fold_scores.append(accuracy_score(Ytest, Ypredict))

# Mean accuracy over however many folds were actually run.
accuracy = sum(fold_scores) / len(fold_scores)
print(accuracy)

# Refit on every labelled row before predicting; otherwise the submission
# would come from whichever model the last fold happened to leave behind,
# trained on only part of the data.
log_reg.fit(X, Y)
output = log_reg.predict(test2)

Output_Final = pd.DataFrame({
    'PassengerId': test['PassengerId'].to_numpy(),
    'Survived': output,
})
# index=False keeps the file to the two named columns instead of also
# writing the row index as an unnamed first column.
Output_Final.to_csv('submission.csv', index=False)