birthOutcome.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import randint as sp_randint
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.feature_selection import SelectFromModel


def load_dataset(filepath):
    """Load a dataset from a CSV file and report its shape and missing values."""
    dataset = pd.read_csv(filepath)
    print(f"Dataset shape: {dataset.shape}")
    print(dataset.isnull().sum())
    return dataset


def impute_missing_values(dataset, mean_impute_cols, locf_impute_cols):
    """Impute missing values using column means (rounded) or last observation carried forward (LOCF)."""
    for col in mean_impute_cols:
        dataset[col] = dataset[col].fillna(dataset[col].mean()).round(0)
    for col in locf_impute_cols:
        # fillna(method='ffill') is deprecated in pandas >= 2.0; ffill() is the replacement.
        dataset[col] = dataset[col].ffill()
    print("After Imputation:")
    print(dataset.isnull().sum())
    return dataset
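

# Illustrative sketch (not part of the original script): a tiny run of the two
# imputation strategies above. The toy column names are hypothetical.
def _demo_imputation():
    toy = pd.DataFrame({
        "age": [25.0, np.nan, 31.0],      # mean imputation: NaN becomes round((25 + 31) / 2) = 28.0
        "visits": [1.0, np.nan, np.nan],  # LOCF: both NaNs carry the 1.0 forward
    })
    return impute_missing_values(toy, mean_impute_cols=["age"], locf_impute_cols=["visits"])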


def preprocess_data(dataset, target_col, test_size=0.3, random_state=7):
    """Split the data into train and test sets and standardize the features."""
    X = dataset.drop(columns=[target_col])
    y = dataset[target_col]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # Fit the scaler on the training split only, to avoid leaking test-set statistics.
    scaler = StandardScaler()
    X_train_std = scaler.fit_transform(X_train)
    X_test_std = scaler.transform(X_test)
    return X_train_std, X_test_std, y_train, y_test, X.columns


def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, cv=5):
    """Train a model and evaluate it with held-out accuracy and cross-validation."""
    model.fit(X_train, y_train)
    acc = model.score(X_test, y_test)
    scores = cross_val_score(model, X_train, y_train, cv=cv)
    print(f"Accuracy: {acc}")
    print(f"Cross-validation scores: {scores}")
    print(f"Average CV Score: {np.mean(scores)}")
    return model
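

# Optional sketch showing how the RandomizedSearchCV and sp_randint imports
# above could be used to tune the Random Forest. The parameter ranges are
# illustrative guesses, not tuned values from the original analysis.
def tune_random_forest(X_train, y_train, n_iter=20, cv=5, random_state=1):
    """Randomized search over a small RandomForest hyperparameter space."""
    param_dist = {
        "n_estimators": sp_randint(20, 200),
        "max_depth": sp_randint(2, 12),
        "min_samples_leaf": sp_randint(1, 10),
    }
    search = RandomizedSearchCV(
        RandomForestClassifier(random_state=random_state),
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        random_state=random_state,
    )
    search.fit(X_train, y_train)
    print(f"Best CV score: {search.best_score_:.3f} with {search.best_params_}")
    return search.best_estimator_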


def plot_roc_curves(models, X_test, y_test, model_names):
    """Plot ROC curves for the given fitted models (assumes a binary target)."""
    plt.figure(figsize=(10, 5))
    lw = 2
    plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
    for model, name in zip(models, model_names):
        # Use predicted probabilities when available; otherwise fall back to the
        # decision function (e.g. for an SVC fitted without probability=True).
        if hasattr(model, "predict_proba"):
            y_score = model.predict_proba(X_test)[:, 1]
        else:
            y_score = model.decision_function(X_test)
        fpr, tpr, _ = roc_curve(y_test, y_score)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=lw, label=f"{name} ROC (area = {roc_auc:.2f})")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curves")
    plt.legend(loc="lower right")
    plt.show()


def feature_importance_plot(model, feature_names, title="Feature Importance"):
    """Plot feature importances for a fitted tree-based model."""
    # Requires a model exposing feature_importances_ (e.g. RandomForestClassifier).
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]
    plt.figure(figsize=(10, 5))
    plt.title(title)
    plt.bar(range(len(importances)), importances[indices], align="center")
    plt.xticks(range(len(importances)), feature_names[indices], rotation=90)
    plt.tight_layout()
    plt.show()
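

# Optional sketch showing how the SelectFromModel import above could prune
# low-importance features using the fitted forest. The "median" threshold is
# an illustrative default, not a choice made in the original analysis.
def select_important_features(fitted_model, X_train, X_test, threshold="median"):
    """Keep only the features whose importance clears the threshold."""
    selector = SelectFromModel(fitted_model, threshold=threshold, prefit=True)
    return selector.transform(X_train), selector.transform(X_test)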


# Main Script
if __name__ == "__main__":
    # Configurable file paths and column names
    input_filepath = "data/input_data.csv"
    output_filepath = "data/output_data.csv"
    mean_impute_cols = ["Column1", "Column2", "Column3"]  # Replace with actual column names
    locf_impute_cols = ["Column4", "Column5", "Column6"]  # Replace with actual column names
    target_col = "Target"  # Replace with actual target column name

    # Load and impute the data
    dataset = load_dataset(input_filepath)
    dataset = impute_missing_values(dataset, mean_impute_cols, locf_impute_cols)

    # Persist the imputed dataset, then reload it for the modeling steps
    dataset.to_csv(output_filepath, index=False)
    dataset = pd.read_csv(output_filepath)

    # Split the data and standardize the features
    X_train_std, X_test_std, y_train, y_test, feature_names = preprocess_data(dataset, target_col)

    # Initialize models. liblinear fits one-vs-rest by default, so the
    # deprecated multi_class argument is omitted.
    models = [
        LogisticRegression(solver="liblinear", random_state=1),
        SVC(gamma="auto", probability=True, random_state=1),
        RandomForestClassifier(n_estimators=60, random_state=1),
    ]
    model_names = ["Logistic Regression", "SVM", "Random Forest"]

    # Train and evaluate each model
    trained_models = [
        train_and_evaluate_model(model, X_train_std, X_test_std, y_train, y_test)
        for model in models
    ]

    # Plot ROC curves
    plot_roc_curves(trained_models, X_test_std, y_test, model_names)

    # Feature importance for the Random Forest
    feature_importance_plot(trained_models[2], feature_names, title="Feature Importance in Data")