# data_split.py
#%%
import open3d as o3d
import pandas as pd
from sklearn.model_selection import train_test_split
from o3d_tools.visualize import PointCloudProject
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
# Load point cloud data project-wise
project1 = PointCloudProject(project='Project1')
project2 = PointCloudProject(project='Project2')
project3 = PointCloudProject(project='Project3')
project4 = PointCloudProject(project='Project4')
#%% Plot distributions of bounding box dimensions for each class
from sklearn.preprocessing import StandardScaler
projects = [project1, project2, project3, project4]
dfs_list = []
# Iterate through each project
for project in projects:
    # Get the object dataframes for this project (class name -> dataframe)
    objects_df = project.objects_df
    project_name = project.project
    # Tag each class-specific dataframe with its source project
    for class_name, df in objects_df.items():
        # Add the project name as a column
        df['Project'] = project_name
        dfs_list.append(df)
combined_dfs = pd.concat(dfs_list, ignore_index=True)
type_dict = {'HVAC_duct': 'Duct', 'Pipe': 'Pipe', 'Structural_ColumnBeam': 'Column Beam', 'Structural_IBeam': 'IBeam'}
combined_dfs[' Label'] = combined_dfs[' Label'].map(type_dict)
# Compute height, width, and depth for each object in the combined dataset
combined_dfs['Height'] = combined_dfs[' BB.Max.Y '] - combined_dfs[' BB.Min.Y ']
combined_dfs['Width'] = combined_dfs[' BB.Max.X '] - combined_dfs[' BB.Min.X ']
combined_dfs['Depth'] = combined_dfs[' BB.Max.Z'] - combined_dfs[' BB.Min.Z ']
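# Sanity check (a sketch, not in the original script): .map() silently yields NaN for
# any label missing from type_dict, and a malformed bounding box would give a negative
# dimension, so catch both early.
assert combined_dfs[' Label'].notna().all(), 'Found a label not covered by type_dict'
assert (combined_dfs[['Height', 'Width', 'Depth']] >= 0).all().all(), 'Negative bounding box dimension'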
import seaborn as sns
import matplotlib.pyplot as plt
# Set the figure size and create subplots
fig, axes = plt.subplots(1, 3, figsize=(18, 6), dpi=200)
# List of features to plot
features = ['Height', 'Width', 'Depth']
# Define a color palette for the labels
palette = sns.color_palette('Set2')
# Iterate over the features and create a boxplot for each one
for i, feature in enumerate(features):
    sns.boxplot(data=combined_dfs, x=' Label', y=feature, ax=axes[i], palette=palette)
    # sns.kdeplot(data=combined_dfs, x=feature, hue=' Label', ax=axes[i], fill=True, common_norm=False, palette=palette)
    axes[i].set_title(f'Distribution of {feature}', fontsize=16)
    axes[i].set_xlabel('')
    axes[i].set_ylabel('')
    axes[i].tick_params(axis='both', which='major', labelsize=16)
    axes[i].grid(True)
# Add a global title to the figure
plt.suptitle('Distribution of Bounding Box Dimensions by Object', fontsize=16)
# Display the plot
plt.tight_layout()
plt.savefig('bounding_box_dimensions.png')
plt.show()
# # Standardize the height, width, and depth
# scaler = StandardScaler()
# combined_dfs[['Height', 'Width', 'Depth']] = scaler.fit_transform(combined_dfs[['Height', 'Width', 'Depth']])
# # Set the figure size and create subplots
# fig, axes = plt.subplots(1, 3, figsize=(18, 6))
# # List of features to plot
# features = ['Height', 'Width', 'Depth']
# # Define a color palette for the labels
# palette = sns.color_palette('Set2')
# # Iterate over the features and create a boxplot for each one
# for i, feature in enumerate(features):
#     # sns.kdeplot(data=combined_dfs, x=feature, hue=' Label', ax=axes[i], fill=True, common_norm=False, palette=palette)
#     sns.boxplot(data=combined_dfs, x=' Label', y=feature, ax=axes[i], palette=palette)
#     axes[i].set_title(f'Standardized {feature} by Label')
#     axes[i].set_xlabel('Label')
#     axes[i].set_ylabel(f'Standardized {feature}')
# # Add a global title to the figure
# plt.suptitle('Standardized Boxplots of Bounding Box Dimensions (Height, Width, Depth) by Label', fontsize=16)
# # Display the plot
# plt.tight_layout()
# plt.show()
#%% Random Forest Classifier
def stratified_train_test_split(projects, test_size=0.2, random_state=None):
    """
    Splits point cloud data from multiple projects into stratified train and test sets.

    Parameters:
    - projects: List of PointCloudProject objects.
    - test_size: Proportion of data for the test set (default is 0.2).
    - random_state: Random seed for reproducibility.

    Returns:
    - train_df_combined: Combined train dataframe.
    - test_df_combined: Combined test dataframe.
    """
    train_dfs_list = []
    test_dfs_list = []
    # Iterate through each project
    for project in projects:
        # Get the object dataframes for this project (class name -> dataframe)
        objects_df = project.objects_df
        project_name = project.project
        # Split each class-specific dataframe
        for class_name, df in objects_df.items():
            # Add the project name as a column
            df['Project'] = project_name
            # Perform the stratified split
            train_df, test_df = train_test_split(
                df,
                test_size=test_size,
                random_state=random_state,
                stratify=df[' Label']
            )
            train_dfs_list.append(train_df)
            test_dfs_list.append(test_df)
    # Combine the train and test data from all projects
    train_df_combined = pd.concat(train_dfs_list, ignore_index=True)
    test_df_combined = pd.concat(test_dfs_list, ignore_index=True)
    return train_df_combined, test_df_combined
#%% Example usage with four projects
from sklearn.preprocessing import StandardScaler, LabelEncoder
projects = [project1, project2, project3, project4]
train_set, test_set = stratified_train_test_split(projects, test_size=0.2, random_state=42)
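# Sanity check (a sketch, not in the original script): the stratified split should keep
# the per-class shares roughly equal between the train and test sets.
split_shares = pd.DataFrame({
    'train': train_set[' Label'].value_counts(normalize=True),
    'test': test_set[' Label'].value_counts(normalize=True),
})
print(split_shares.round(3))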
# Encode the string labels as integer classes
label_encoder = LabelEncoder()
# Select features (bounding box coordinates) and the target (label) for training and testing
X_train = train_set[[' BB.Min.X ', ' BB.Min.Y ', ' BB.Min.Z ', ' BB.Max.X ', ' BB.Max.Y ', ' BB.Max.Z']]
y_train = label_encoder.fit_transform(train_set[' Label'])
X_test = test_set[[' BB.Min.X ', ' BB.Min.Y ', ' BB.Min.Z ', ' BB.Max.X ', ' BB.Max.Y ', ' BB.Max.Z']]
y_test = label_encoder.transform(test_set[' Label'])
# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Initialize the Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model on the training data
rf_model.fit(X_train, y_train)
# Make predictions on the test data
y_pred = rf_model.predict(X_test)
# Assess the performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
# Detailed classification report (precision, recall, F1-score)
print('Classification Report:')
print(classification_report(y_test, y_pred))
# Confusion matrix
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))
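#%% Inspect feature importances (a sketch, not in the original script; the column
# order matches the feature list used to build X_train above)
feature_names = [' BB.Min.X ', ' BB.Min.Y ', ' BB.Min.Z ', ' BB.Max.X ', ' BB.Max.Y ', ' BB.Max.Z']
importances = pd.Series(rf_model.feature_importances_, index=feature_names)
print('Feature importances:')
print(importances.sort_values(ascending=False).round(3))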
# %% Nested Cross-Validation
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
import numpy as np
# Hyperparameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
# Outer cross-validation loop (for model performance evaluation)
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Inner cross-validation loop for hyperparameter tuning
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
# Initialize Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)
# Set up GridSearchCV for inner cross-validation loop
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=inner_cv, scoring='accuracy')
# Nested cross-validation loop for overall performance estimation
nested_cv_scores = cross_val_score(grid_search, X_train, y_train, cv=outer_cv, scoring='accuracy')
# Print the accuracy for each outer fold
for i, score in enumerate(nested_cv_scores, 1):
    print(f'Outer Fold {i}: Accuracy = {score:.2f}')
# Print mean and standard deviation of the nested cross-validation scores
print(f'Nested CV Accuracy: {nested_cv_scores.mean():.2f} ± {nested_cv_scores.std():.2f}')
# Now, train the final model using the best hyperparameters from GridSearchCV
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
# Predict on the test set
y_pred = best_model.predict(X_test)
# Assess the performance on the test set
accuracy = accuracy_score(y_test, y_pred)
print(f'Test Set Accuracy: {accuracy:.2f}')
#%% Print the best hyperparameters
print("Best Hyperparameters found by GridSearchCV:")
print(grid_search.best_params_)
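# A sketch (not in the original script): the best parameters can be reused directly
# instead of hard-coding them when retraining later, e.g.
# final_model = RandomForestClassifier(**grid_search.best_params_, random_state=42)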
# Detailed classification report (precision, recall, F1-score)
print('Classification Report:')
print(classification_report(y_test, y_pred))
# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)
# Plot the confusion matrix
# Normalize the confusion matrix to show percentages (rows = true labels)
conf_matrix_normalized = conf_matrix.astype('float') / conf_matrix.sum(axis=1)[:, np.newaxis] * 100
# Create a custom annotation format with the percentage sign
labels = np.array([["{:.1f}%".format(value) for value in row] for row in conf_matrix_normalized])
type_dict = {'HVAC_duct': 'Duct', 'Pipe': 'Pipe', 'Structural_ColumnBeam': 'Column Beam', 'Structural_IBeam': 'IBeam'}
# Order tick labels by the encoder's class order so they match the matrix rows/columns
tick_labels = [type_dict[c] for c in label_encoder.classes_]
plt.figure(figsize=(6, 6), dpi=200)
sns.heatmap(conf_matrix_normalized, annot=labels, fmt="", cmap="Blues", cbar=False,
            xticklabels=tick_labels,
            yticklabels=tick_labels,
            annot_kws={"size": 14})
plt.title('Confusion Matrix', fontsize=14)
plt.xlabel('Predicted Labels', fontsize=14)
plt.ylabel('True Labels', fontsize=14)
plt.show()
# %% Retrain the model on the entire dataset and run inference
# Combine the training and test sets
X = np.vstack((X_train, X_test))
y = np.hstack((y_train, y_test))
# Train the final model on the entire dataset, using the best hyperparameters
# found by the grid search above
final_model = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=2, random_state=42)
final_model.fit(X, y)
# Load held-out features and labels for inference. These CSVs are assumed to have been
# exported elsewhere, with features standardized by the scaler above and labels encoded
# by the same label_encoder; rows folded into X above must not be reused here, or the
# evaluation will leak training data.
X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv').squeeze()
# Make predictions with the retrained final model (not the earlier rf_model)
y_pred = final_model.predict(X_test)
# Assess the performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
# Detailed classification report (precision, recall, F1-score)
print('Classification Report:')
print(classification_report(y_test, y_pred))
# Confusion matrix
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))
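# %% Persist the final model (a sketch, not in the original script; the filename is
# illustrative). joblib serializes scikit-learn estimators efficiently.
import joblib
joblib.dump(final_model, 'rf_bounding_box_classifier.joblib')
# Reload later with: final_model = joblib.load('rf_bounding_box_classifier.joblib')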