-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpipeline.R
200 lines (193 loc) · 7.88 KB
/
pipeline.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
run_pipeline <- function(data_dir = "data", seq_len = 10, train_ratio = 0.7,
batch_size = 50, epochs = 10, run_hrs = 8,
db_file = "shiny/exoplanet_db.sqlite") {
# unlink("exoplanet_db.sqlite")
mydb <- dbConnect(RSQLite::SQLite(), db_file)
shp <- c(seq_len - 1, 1)
reduceLr <- callback_reduce_lr_on_plateau(
monitor = "val_loss", factor = sqrt(0.1),
patience = 2)
# Early stopping is not working for some reason
# earlyStopping <- EarlyStopping(monitor = "val_acc", min_delta = -0.001,
# patience = 2)
files <- list.files(path = paste0(data_dir, "/"), pattern = "*.tbl",
full.names = T)
if(!("kepler_star" %in% dbListTables(mydb))) {
system("sh counts.sh")
text <- system("sh counts_df.sh", intern = T)
text <- t(sapply(strsplit(text, "\t"), function(elem) c(elem[1], elem[2])))
text <- data.frame(text, stringsAsFactors = F)
text$X1 <- as.integer(text$X1)
text$X2 <- as.numeric(text$X2)
colnames(text) = c("id", "num_planets")
dbWriteTable(mydb, "kepler_star", text)
}
out_files <- sapply(strsplit(files, "/"), function(strs) strs[length(strs)])
out_files <- sapply(strsplit(out_files, "\\."), function(strs) strs[1])
out_files1 <- sapply(strsplit(out_files, "_"), function(strs) strs[1])
out_files1 <- as.integer(gsub(x = out_files1, pattern = "kplr", replacement = ""))
dir.create("plots", showWarnings = F)
setwd("plots")
dir.create("learning_curve", showWarnings = F)
dir.create("test_pred_plot", showWarnings = F)
dir.create("train_pred_plot", showWarnings = F)
setwd("../")
dir.create("trained_models", showWarnings = F)
# Randomizing the input files
rand_idx <- sample(x = 1:length(files), size = length(files), replace = F)
files <- files[rand_idx]
out_files <- out_files[rand_idx]
out_files1 <- out_files1[rand_idx]
tm <- Sys.time()
i <- 1
diff_time <- 0
metrics_df <- data.frame()
train_idx_df <- data.frame()
test_idx_df <- data.frame()
while(i <= length(files) & diff_time < 3600 * run_hrs) {
tryCatch({
print(i)
file <- files[i]
wave <- get_wave(file)
train_test_split <- split_train_test(wave, train_ratio, seq_len)
x_train <- train_test_split$x_train
y_train <- train_test_split$y_train
x_test <- train_test_split$x_test
y_test <- train_test_split$y_test
mdl_file <- paste0("trained_models/", out_files[i], ".hdf5")
if(!file.exists(mdl_file)) {
model <- keras_model_sequential() %>%
# layer_cudnn_lstm(input_shape = shp, units = 64, return_sequences = T) %>%
layer_lstm(input_shape = shp, units = 64, return_sequences = T) %>%
layer_dropout(0.2) %>%
# layer_cudnn_lstm(256, return_sequences = T) %>%
layer_lstm(256, return_sequences = T) %>%
layer_dropout(0.2) %>%
# layer_cudnn_lstm(100, return_sequences = T) %>%
layer_lstm(100, return_sequences = T) %>%
layer_dropout(0.2) %>%
layer_flatten() %>%
layer_dense(units = 1, activation = "linear")
model %>% compile(loss = "mse", optimizer = optimizer_adam(lr = 0.1),
metrics = c("mape", "mae"))
his <- model %>% fit(
x_train, y_train,
batch_size = batch_size,
epochs = epochs,
callbacks = list(reduceLr),
validation_split = 0.3,
verbose = 0
)
save_model_hdf5(model, paste0("trained_models/", out_files[i], ".hdf5"))
png(paste0("plots/learning_curve/", out_files[i], "_learning.png"),
width = 1366, height = 768)
print(plot(his))
dev.off()
metrics_df <- rbind(metrics_df, data.frame(
train_loss = his$metrics$loss[epochs],
train_mape = his$metrics$mean_absolute_percentage_error[epochs],
train_mae = his$metrics$mean_absolute_error[epochs],
val_loss = his$metrics$val_loss[epochs],
val_mape = his$metrics$val_mean_absolute_percentage_error[epochs],
val_mae = his$metrics$val_mean_absolute_error[epochs]))
} else {
model <- load_model_hdf5(mdl_file)
}
lwr_upr <- get_period_range(wave)
lwr <- lwr_upr[1]
upr <- lwr_upr[2]
if(lwr == upr) {
thr <- as.numeric("Inf")
} else {
y_train_pred <- predict(model, x_train)
sqr_error <- (y_train_pred - y_train)^2
avg_error <- mean(sqr_error)
std_error <- sd(sqr_error)
thr <- (avg_error + 2 * std_error)
}
y_train_pred <- predict(model, x_train)
temp_train_idx_df <- save_plot(
y_pred = y_train_pred, y = y_train,
out_file = paste0("plots/train_pred_plot/", out_files[i],
"_train_plot.png"),
thr = thr, lwr = lwr, upr = upr)
if(nrow(temp_train_idx_df) > 0) {
temp_train_idx_df$id <- out_files1[i]
temp_train_idx_df <- temp_train_idx_df[, c("id", "start", "end")]
} else {
temp_train_idx_df <- data.frame(id = out_files1, start = 0, end = 0)
}
train_idx_df <- rbind(train_idx_df, temp_train_idx_df)
y_test_pred <- predict(model, x_test)
temp_test_idx_df <- save_plot(
y_pred = y_test_pred, y = y_test,
out_file = paste0("plots/test_pred_plot/", out_files[i],
"_test_plot.png"),
thr = thr, lwr = lwr, upr = upr)
if(nrow(temp_test_idx_df) > 0) {
temp_test_idx_df$id <- out_files1[i]
temp_test_idx_df <- temp_test_idx_df[, c("id", "start", "end")]
} else {
temp_test_idx_df <- data.frame(id = out_files1, start = 0, end = 0)
}
test_idx_df <- rbind(test_idx_df, temp_test_idx_df)
tm1 <- Sys.time()
diff_time <- as.double.difftime(tm1 - tm, units = "secs")
i <- i + 1
print(c(nrow(train_idx_df), nrow(test_idx_df)))
}, error = function(e) {
i <- i + 1
err <- as.character(e)
write(x = err, file = "error_log.txt", append = T)
})
}
metrics_df$file <- files[1:(i - 1)]
if(!("test_idx" %in% dbListTables(mydb))) {
dbWriteTable(mydb, "train_idx", train_idx_df)
dbWriteTable(mydb, "test_idx", test_idx_df)
} else {
insert_into_db(db_file, "train_idx", train_idx_df)
insert_into_db(db_file, "test_idx", test_idx_df)
}
png("Presentation/example.png")
wave <- get_wave(file)
plot(wave)
mn <- mean(wave, na.rm = T)
stdev <- sd(wave, na.rm = T)
lns1 <- mn + c(-1, 1) * stdev
lns2 <- mn + c(-2, 2) * stdev
lns3 <- mn + c(-3, 3) * stdev
abline(h = lns1, col = "red")
abline(h = lns2, col = "green")
abline(h = lns3, col = "blue")
legend("bottomleft", legend = c("1 * std", "2 * std", "3 * std"),
fill = c("red", "green", "blue"))
dev.off()
met_df <- melt(data = metrics_df, id = "file")
png("Report/loss.png", width = 554, height = 412)
loss_df <- met_df[grep(met_df$variable, pattern = ".*_loss$"), ]
with(loss_df,
plot(x = as.integer(factor(file)), y = value, col = variable, pch = 20,
xlab = "File", xaxt = 'n'))
legend("topright", legend = unique(loss_df$variable),
fill = unique(loss_df$variable))
dev.off()
png("Report/mae.png", width = 554, height = 412)
mae_df <- met_df[grep(met_df$variable, pattern = ".*_mae$"), ]
with(mae_df,
plot(x = as.integer(factor(file)), y = value, col = variable, pch = 20,
xlab = "File", xaxt = 'n'))
legend("topright", legend = unique(mae_df$variable),
fill = unique(mae_df$variable))
dev.off()
png("Report/mape.png", width = 554, height = 412)
mape_df <- met_df[grep(met_df$variable, pattern = ".*_mape$"), ]
with(mape_df,
plot(x = as.integer(factor(file)), y = value, col = variable, pch = 20,
xlab = "File", xaxt = 'n'))
legend("topright", legend = unique(mape_df$variable),
fill = unique(mape_df$variable))
dev.off()
dbDisconnect(mydb)
return(metrics_df)
}