-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTraining.py
128 lines (91 loc) · 3.49 KB
/
Training.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# %%
import json
import pandas as pd
import numpy as np
# %%
'''Data washing'''
# 加载训练集 JSON 数据
with open('train_data.json', 'r', encoding='utf-8') as f:
train_data = json.load(f)
# 加载测试集 JSON 数据
with open('test_data.json', 'r', encoding='utf-8') as f:
test_data = json.load(f)
# 将 JSON 数据转换为 DataFrame
train_df = pd.DataFrame(train_data)
test_df = pd.DataFrame(test_data)
# 将字符串类型的数值转换为数值类型
train_df['Age'] = train_df['Age'].replace('', np.nan)
train_df['Fare'] = train_df['Fare'].replace('', np.nan)
test_df['Age'] = test_df['Age'].replace('', np.nan)
test_df['Fare'] = test_df['Fare'].replace('', np.nan)
train_df['Age'] = train_df['Age'].astype(float)
train_df['Fare'] = train_df['Fare'].astype(float)
train_df['Parch'] = train_df['Parch'].astype(int)
train_df['SibSp'] = train_df['SibSp'].astype(int)
train_df['Pclass'] = train_df['Pclass'].astype(int)
test_df['Age'] = test_df['Age'].astype(float)
test_df['Fare'] = test_df['Fare'].astype(float)
test_df['Parch'] = test_df['Parch'].astype(int)
test_df['SibSp'] = test_df['SibSp'].astype(int)
test_df['Pclass'] = test_df['Pclass'].astype(int)
# 处理缺失值
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])
train_df['Cabin'] = train_df['Cabin'].fillna('Unknown')
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].median())
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())
test_df['Cabin'] = test_df['Cabin'].fillna('Unknown')
# check
# 检查训练集的缺失值
print("The loss of train set:")
print(train_df.isnull().sum())
# 检查测试集的缺失值
print("\nThe loss of test set:")
print(test_df.isnull().sum())
# %%
# 从姓名中提取称呼(如 Mr, Miss, Mrs)
train_df['Title'] = train_df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
test_df['Title'] = test_df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
# 将性别转换为数值
train_df['Sex'] = train_df['Sex'].map({'male': 0, 'female': 1})
test_df['Sex'] = test_df['Sex'].map({'male': 0, 'female': 1})
# 创建家庭大小特征
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch'] + 1
# 将登船港口转换为数值
train_df['Embarked'] = train_df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
test_df['Embarked'] = test_df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
# %%
# 选择特征
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'FamilySize']
X_train = train_df[features]
y_train = train_df['Survived']
X_test = test_df[features]
# %%
print(X_train.dtypes)
# %%
from sklearn.ensemble import RandomForestClassifier
# 初始化模型
model = RandomForestClassifier(n_estimators=100, random_state=42)
# 训练模型
model.fit(X_train, y_train)
# %%
from sklearn.metrics import accuracy_score, confusion_matrix
# 在训练集上进行预测
y_train_pred = model.predict(X_train)
# 计算准确率
accuracy = accuracy_score(y_train, y_train_pred)
print(f"训练集准确率: {accuracy:.2f}")
# 绘制混淆矩阵
conf_matrix = confusion_matrix(y_train, y_train_pred)
print("混淆矩阵:")
print(conf_matrix)
# %%
# 对测试集进行预测
test_predictions = model.predict(X_test)
# 将预测结果保存到 CSV 文件
output = pd.DataFrame({
'PassengerId': test_df['PassengerId'],
'Survived': test_predictions
})
output.to_csv('submission.csv', index=False)