-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpadMis.syn.r
125 lines (112 loc) · 5.91 KB
/
padMis.syn.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
padMis.syn <- function(data, method, predictor.matrix, visit.sequence,
nvar, rules, rvalues, default.method, cont.na,
smoothing, event, denom) {
# Function called by syn to make dummy/factor variable for missing values
# in continuous variables. Data is augmented by columns for dummy/factor
# variables when they are used in synthesis.
# Check presence of missing values not covered by missing rules
# (missing values for non-numeric variables are not counted)
No.NA <- vector("list", nvar)
yes.rules <- sapply(rules, function(x) any(x != ""))
com.rules <- lapply(rules, paste, collapse = " | ")
for (j in 1:nvar) {
if (yes.rules[j]) {
No.NA[j] <- with(data,sum(data[!eval(parse(text = com.rules[[j]])), j] %in% cont.na[[j]]))
} else {
No.NA[j] <- sum(data[,j] %in% cont.na[[j]])
}
}
No.NA <- sapply(No.NA,function(x) x > 0)
inpred <- apply(predictor.matrix != 0, 1, any) | apply(predictor.matrix != 0, 2, any)
factorNA <- rep(FALSE, nvar)
for (j in 1:nvar) {
# if (No.NA[j] & is.numeric(data[,j]) & inpred[j]==TRUE){ #GR1.7-1
if (No.NA[j] & is.numeric(data[,j]) & inpred[j] == TRUE &
!is.passive(method[j]) & !method[j] %in% c("nested", "ipf", "catall")) {
# augment the data with a column for the original continuous variable with
# missing values replaced by zeros and a column for a new factor for
# missing values
nonmiscode <- 10^(nchar(round(max(data[,j], na.rm = TRUE))) + 1) - 1
y.0 <- ifelse(data[,j] %in% c(cont.na[[j]], rvalues[[j]]), 0, data[,j])
y.NA <- ifelse(data[,j] %in% c(cont.na[[j]], rvalues[[j]]), data[,j], nonmiscode) #BN13/11 0 changed with nonmiscode
y.NA <- addNA(y.NA, ifany = TRUE)
levels(y.NA)[is.na(levels(y.NA))] <- "NAtemp" # to allow random forest
data <- cbind(data,y.0,y.NA)
name.0 <- paste(attr(data,"names")[j], 0, sep = ".")
name.NA <- paste(attr(data,"names")[j], NA, sep = ".")
names(data)[(ncol(data) - 1):ncol(data)] <- c(name.0, name.NA)
factorNA[(ncol(data) - 1):ncol(data)] <- c(FALSE,TRUE)
# predictor.matrix is given two extra rows and columns for the new variables
# rows and columns are copied from an original variable j in predictor.matrix
predictor.matrix <- rbind(predictor.matrix, matrix(rep(predictor.matrix[j,],
times = 2), byrow = TRUE, nrow = 2))
predictor.matrix <- cbind(predictor.matrix, matrix(rep(predictor.matrix[,j],
times = 2), ncol = 2))
# the original variable is removed from predictors (=insert zeros)
predictor.matrix[,j] <- 0
# the original variable is synthesized passively so its predictors can be removed as well
predictor.matrix[j,] <- 0
# add methods for new variables
method[ncol(data) - 1] <- method[j]
if (method[j] %in% c("ctree", "ctree.proper", "cart", "cart.proper",
"rf", "ranger", "bag",
"sample", "")) {
method[ncol(data)] <- method[j]
} else {
method[ncol(data)] <- ifelse(nlevels(data[,ncol(data)]) == 2,
default.method[2], default.method[3])
}
# pass smoothing to new variable and remove from original one
smoothing[ncol(data) - 1] <- smoothing[j]
smoothing[ncol(data)] <- ""
smoothing[j] <- ""
cont.na[ncol(data) - 1] <- cont.na[j]
cont.na[ncol(data)] <- NA
cont.na[j] <- NA
# pass denom and event for new variable and remove from original one
denom[ncol(data) - 1] <- denom[j] #!!!!!!!!!!!!!! check if correct
denom[ncol(data)] <- 0
denom[j] <- 0
event[ncol(data) - 1] <- event[j]
event[ncol(data)] <- 0
event[j] <- 0
# insert the column numbers for the new variables into the visit sequence
# before the jth column
if (any(visit.sequence == j)) {
newcols <- c(ncol(data), ncol(data) - 1)
idx <- (1:length(visit.sequence))[visit.sequence == j] - 1
visit.sequence <- append(visit.sequence,newcols,idx)
# modify method for the original variable
method[j] <- paste0("~(ifelse(",name.NA,"!=", nonmiscode," | is.na(",name.0,
"),as.numeric(levels(",name.NA,"))[",name.NA,"],",name.0,"))")
}
# update missing rules and values for the new variables
if (any(rules[[j]] != "")) rules[[ncol(data) - 1]] <-
c(rules[[j]][rules[[j]] != ""],paste(name.NA, "!=", nonmiscode, sep = "")) #BN13/11
else rules[[ncol(data) - 1]] <- paste(name.NA, "!=", nonmiscode, sep = "") #BN13/11
rules[[ncol(data)]] <- rules[[j]]
rules[[j]] <- ""
#!BN1513 rule "year_death.NA!=0" should have only one correspnding
#! rvalue equal to 0; before a vector c(NA,0) was assigned instead of 0
if (length(rules[[j]]) == 1) rvalues[[ncol(data) - 1]] <- 0
else rvalues[[ncol(data) - 1]] <- c(rvalues[[j]], 0)
rvalues[[ncol(data)]] <- rvalues[[j]]
}
}
varnames <- dimnames(data)[[2]]
dimnames(predictor.matrix) <- list(varnames,varnames)
names(method) <- varnames
names(visit.sequence) <- varnames[visit.sequence]
return(list(data = as.data.frame(data),
nvar = ncol(data),
predictor.matrix = predictor.matrix,
method = method,
visit.sequence = visit.sequence,
rules = rules,
rvalues = rvalues,
factorNA = factorNA,
smoothing = smoothing,
event = event,
denom = denom,
cont.na = cont.na))
}