-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdecisiontree.py
84 lines (55 loc) · 2.26 KB
/
decisiontree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import log_loss, confusion_matrix
from quantfeatures import dataToNumpy
# Default maximum depth of the fitted decision tree.
MAX_DEPTH = 6
# Default minimum number of samples required at a leaf. scikit-learn accepts
# either an int >= 1 or a fraction strictly between 0 and 1 for this
# parameter; the previous value of 0.0 is in neither range and is rejected by
# parameter validation when the tree is fitted. 1 is scikit-learn's own
# default and imposes no additional constraint on the tree.
MIN_SAMPLES_LEAF = 1
def getModelFromNumpy(X, y, maxDepth=MAX_DEPTH, minSamplesLeaf=MIN_SAMPLES_LEAF):
    """Fit a decision-tree classifier on the given features and labels.

    Args:
        X: Feature matrix, passed unchanged to ``DecisionTreeClassifier.fit``.
        y: Target labels, passed unchanged to ``DecisionTreeClassifier.fit``.
        maxDepth: Forwarded to the classifier as ``max_depth``.
        minSamplesLeaf: Forwarded to the classifier as ``min_samples_leaf``.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    treeParams = {'max_depth': maxDepth, 'min_samples_leaf': minSamplesLeaf}
    model = DecisionTreeClassifier(**treeParams)
    model.fit(X, y)
    return model
def getModelFromDataframe(df, max_depth=MAX_DEPTH, min_samples_leaf=MIN_SAMPLES_LEAF):
    """Convert a dataframe to arrays and fit a decision tree on them.

    Args:
        df: Input handed to ``dataToNumpy``, which must return a
            ``(features, labels)`` pair.
        max_depth: Forwarded to ``getModelFromNumpy`` as ``maxDepth``.
        min_samples_leaf: Forwarded to ``getModelFromNumpy`` as
            ``minSamplesLeaf``.

    Returns:
        Whatever ``getModelFromNumpy`` returns for the converted data.
    """
    features, labels = dataToNumpy(df)
    return getModelFromNumpy(
        features,
        labels,
        maxDepth=max_depth,
        minSamplesLeaf=min_samples_leaf,
    )
def getModelFromCSV(csv):
    """Read a CSV file and return a decision tree fitted on its contents.

    Args:
        csv: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        The model produced by ``getModelFromDataframe`` using the module's
        default hyperparameters.
    """
    df = pd.read_csv(csv)
    # The result must be returned; previously it was dropped, so callers
    # always received None.
    return getModelFromDataframe(df)
def trainAndTestFromDataframes(trainDf, testDf, max_depth=MAX_DEPTH, min_samples_leaf=MIN_SAMPLES_LEAF):
    """Fit a tree on ``trainDf`` and return its predictions for ``testDf``.

    Method to be used in the backtester.

    Args:
        trainDf: Training data, handed to ``getModelFromDataframe``.
        testDf: Data to predict on, converted with ``dataToNumpy``.
        max_depth: Forwarded to ``getModelFromDataframe``.
        min_samples_leaf: Forwarded to ``getModelFromDataframe``.

    Returns:
        The output of the fitted model's ``predict`` on the test features.
    """
    fitted = getModelFromDataframe(
        trainDf,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
    )
    # dataToNumpy also returns the test labels; they are not needed here.
    testFeatures, _testLabels = dataToNumpy(testDf, 'testing df')
    return fitted.predict(testFeatures)
if __name__ == '__main__':
    # Fit on the bundled dataset and evaluate on that same data: every figure
    # printed below is an in-sample diagnostic, no held-out split is made.
    X_scaled, y = dataToNumpy('data/bigOne.csv')
    clf = getModelFromNumpy(X_scaled, y)
    length = X_scaled.shape[0]

    # MANUAL TUNING DONE HERE
    predictions = clf.predict(X_scaled)
    predictions_proba = clf.predict_proba(X_scaled)

    print("confusion matrix:")
    print(confusion_matrix(y, predictions))
    # Spot check of a single hard-coded row; indexing row 140 requires the
    # dataset to contain at least 141 rows.
    print(f'prediction: {clf.predict_proba([X_scaled[140]])}')

    # How many samples were assigned to each of the classes 0, 1 and 2.
    zeroCounter = np.count_nonzero(predictions == 0)
    oneCounter = np.count_nonzero(predictions == 1)
    twoCounter = np.count_nonzero(predictions == 2)
    print(f'zeroCounter: {zeroCounter}')
    print(f'oneCounter: {oneCounter}')
    print(f'twoCounter: {twoCounter}')

    labels = np.asarray(y)

    # Summed class probabilities over the samples that are truly class 2 and
    # were predicted as class 2 (original author's note: "80 of these").
    # Despite the name this is a sum, not a mean, and it is not printed; it is
    # retained for inspection when the script is run interactively.
    trueTwoPredictedTwo = (labels == 2) & (predictions == 2)
    averageTrueTwoPredictionTwoProba = predictions_proba[trueTwoPredictedTwo].sum(axis=0)

    # Fraction of samples whose predicted label differs from the true label.
    error = np.count_nonzero(labels != predictions)
    print(f'Misclass rate: {error/length}')