-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathC4-1.py
143 lines (123 loc) · 3.83 KB
/
C4-1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Load the training and test splits. The files have no header row, so
# pandas names the columns by integer position.
train_data, test_data = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ('KDDTrain+.txt', 'KDDTest+.txt')
)
# Binary class mapping: 'normal' traffic is class 0 and every attack name
# listed below is collapsed into the single positive class 1.
_ATTACK_NAMES = (
    'back',
    'land',
    'pod',
    'neptune',
    'smurf',
    'teardrop',
    'apache2',
    'udpstorm',
    'processtable',
    'mailbomb',
    'buffer_overflow',
    'loadmodule',
    'perl',
    'rootkit',
    'sqlattack',
    'xterm',
    'ps',
    'httptunnel',
    'named',
    'sendmail',
    'snmpgetattack',
    'snmpguess',
    'worm',
    'xlock',
    'xsnoop',
    'imap',
    'ftp_write',
    'guess_passwd',
    'multihop',
    'phf',
    'spy',
    'warezclient',
    'warezmaster',
    'portsweep',
    'ipsweep',
    'nmap',
    'satan',
    'mscan',
    'saint',
)
labels = {'normal': 0, **dict.fromkeys(_ATTACK_NAMES, 1)}
def _encode_label(name):
    """Return the 0/1 class registered for *name*; unknown names raise KeyError."""
    return labels[name]


# Replace the textual class names in column 41 with their 0/1 codes.
train_data[41] = train_data[41].apply(_encode_label)
test_data[41] = test_data[41].apply(_encode_label)

# One-hot encode the non-numeric feature columns 1, 2 and 3.
categorical_columns = [1, 2, 3]
train_data = pd.get_dummies(train_data, columns=categorical_columns)
test_data = pd.get_dummies(test_data, columns=categorical_columns)

# Give both splits the same set of columns; a dummy column that exists in
# only one split is created in the other and filled with 0.
train_features, test_features = train_data.align(
    test_data,
    join='outer',
    axis=1,
    fill_value=0,
)

# Split off the target column.
# NOTE(review): every column other than 41 stays in the feature matrix. The
# NSL-KDD files usually carry a difficulty score in column 42 — confirm that
# keeping it as a feature is intended.
train_labels = train_features.pop(41)
test_labels = test_features.pop(41)

# Column names are a mix of ints and strings after get_dummies; cast them
# all to str before the frames are handed to scikit-learn.
for frame in (train_features, test_features):
    frame.columns = frame.columns.astype(str)

# Standardize: statistics are learned on the training split only and then
# reused for the test split.
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)
test_features = scaler.transform(test_features)
# Hyper-parameter grid. Every entry holds a single candidate, so the search
# below evaluates exactly one configuration; its main output is the 5-fold
# cross-validated accuracy of that configuration.
param_grid = {
    'criterion': ['entropy'],
    'max_depth': [6],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
}
# Entropy-based decision tree. scikit-learn's tree is CART-based, so this
# approximates C4.5 rather than implementing it. The fixed random_state makes
# the choice between equally good splits reproducible from run to run.
dt = DecisionTreeClassifier(random_state=0)
# Grid search scored by accuracy with 5-fold cross-validation.
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
# Fit on the standardized training split.
grid_search.fit(train_features, train_labels)
# Report the selected parameters and their cross-validation score.
print('Best Parameters:', grid_search.best_params_)
print('Best CV Score:', grid_search.best_score_)
# Predict the test split with the model selected by the grid search.
test_pred = grid_search.predict(test_features)

# Score the predictions against the true test labels; each metric is called
# with scikit-learn's default arguments.
accuracy, precision, recall, f1 = (
    metric(test_labels, test_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Print one "<name>: <value>" line per metric.
for caption, score in (
    ('Accuracy:', accuracy),
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1 Score:', f1),
):
    print(caption, score)
# Bar chart of the four test-set metrics.
# The tick names get their own variable so the `labels` class-mapping dict
# defined earlier in the script is not overwritten.
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
values = [accuracy, precision, recall, f1]
plt.bar(metric_names, values)
plt.xlabel('Metric')
plt.ylabel('Value')
# Write each score as a percentage just above its bar.
for i, v in enumerate(values):
    plt.text(i - 0.1, v + 0.01, f'{v:.2%}', fontsize=10)
# The title is set once; an earlier 'Performance Metrics' title was always
# overwritten by this one before the figure was shown.
# NOTE(review): the title says "Ensemble" although the script trains a single
# DecisionTreeClassifier — confirm the wording is intended.
plt.title('Ensemble Model Performance')
plt.show()
# ROC curve on the test split.
# roc_curve sweeps a threshold over continuous scores; feeding it hard 0/1
# predictions yields a single operating point and a distorted AUC. Use the
# predicted probability of the positive class (label 1) instead.
positive_index = list(grid_search.classes_).index(1)
test_scores = grid_search.predict_proba(test_features)[:, positive_index]
fpr, tpr, thresholds = roc_curve(test_labels, test_scores)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, lw=1, label=f'ROC (AUC = {roc_auc:0.2f})')
# Diagonal reference line for a classifier that guesses at random.
plt.plot([0, 1], [0, 1], '--', color='gray', label='Random Guess')
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()