Skip to content

Commit

Permalink
add option to downsample training data
Browse files Browse the repository at this point in the history
  • Loading branch information
borauyar committed Apr 3, 2024
1 parent ea802f9 commit 75f5b6e
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 11 deletions.
28 changes: 17 additions & 11 deletions flexynesis/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def main():
parser.add_argument("--hpo_iter", help="Number of iterations for hyperparameter optimisation", type=int, default = 5)
parser.add_argument("--correlation_threshold", help="Correlation threshold to drop highly redundant features (default: 0.8; set to 1 for no redundancy filtering)", type=float, default = 0.8)
parser.add_argument("--restrict_to_features", help="Restrict the analyis to the list of features provided by the user (default: None)", type = str, default = None)
parser.add_argument("--subsample", help="Downsample training set to randomly drawn N samples for training. Disabled when set to 0", type=int, default = 0)
parser.add_argument("--features_min", help="Minimum number of features to retain after feature selection", type=int, default = 500)
parser.add_argument("--features_top_percentile", help="Top percentile features to retain after feature selection", type=float, default = 20)
parser.add_argument("--data_types", help="(Required) Which omic data matrices to work on, comma-separated: e.g. 'gex,cnv'", type=str, required = True)
Expand Down Expand Up @@ -188,6 +189,10 @@ class AvailableModels(NamedTuple):
string_organism=args.string_organism,
string_node_name=args.string_node_name)
train_dataset, test_dataset = data_importer.import_data(force = True)

if args.subsample > 0:
print("[INFO] Randomly drawing",args.subsample,"samples for training")
train_dataset = flexynesis.downsample(train_dataset, N = args.subsample)

# print feature logs to file (we use these tables to track which features are dropped/selected and why)
feature_logs = data_importer.feature_logs
Expand Down Expand Up @@ -245,18 +250,19 @@ class AvailableModels(NamedTuple):
embeddings_test.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'embeddings_test.csv'])), header=True)

# also filter embeddings to remove batch-associated dims and only keep target-variable associated dims
print("[INFO] Printing filtered embeddings")
embeddings_train_filtered = flexynesis.remove_batch_associated_variables(data = embeddings_train,
batch_dict={x: train_dataset.ann[x] for x in model.batch_variables} if model.batch_variables is not None else None,
target_dict={x: train_dataset.ann[x] for x in model.target_variables},
variable_types=train_dataset.variable_types)
# filter test embeddings to keep the same dims as the filtered training embeddings
embeddings_test_filtered = embeddings_test[embeddings_train_filtered.columns]
if args.batch_variables is not None:
print("[INFO] Printing filtered embeddings")
embeddings_train_filtered = flexynesis.remove_batch_associated_variables(data = embeddings_train,
batch_dict={x: train_dataset.ann[x] for x in model.batch_variables} if model.batch_variables is not None else None,
target_dict={x: train_dataset.ann[x] for x in model.target_variables},
variable_types=train_dataset.variable_types)
# filter test embeddings to keep the same dims as the filtered training embeddings
embeddings_test_filtered = embeddings_test[embeddings_train_filtered.columns]

# save
embeddings_train_filtered.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'embeddings_train.filtered.csv'])), header=True)
embeddings_test_filtered.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'embeddings_test.filtered.csv'])), header=True)

# save
embeddings_train_filtered.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'embeddings_train.filtered.csv'])), header=True)
embeddings_test_filtered.to_csv(os.path.join(args.outdir, '.'.join([args.prefix, 'embeddings_test.filtered.csv'])), header=True)

# for architectures with decoders; print decoded output layers
if args.model_class == 'CrossModalPred':
print("[INFO] Printing decoded output layers")
Expand Down
11 changes: 11 additions & 0 deletions flexynesis/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -954,3 +954,14 @@ def split_by_median(tensor_dict):
# If tensor is not numerical, leave it as it is
new_dict[key] = tensor
return new_dict


# downsample a given MultiOmicDataset
def downsample(dataset, N = None):
idx = np.random.choice(range(0,len(dataset)), N, replace = False)
sub = dataset[idx]

sub_dataset = MultiomicDataset(dat = sub[0], ann = sub[1], variable_types = dataset.variable_types,
features = dataset.features, samples = [dataset.samples[i] for i in idx],
label_mappings=dataset.label_mappings, feature_ann=dataset.feature_ann)
return sub_dataset

0 comments on commit 75f5b6e

Please sign in to comment.