diff --git a/reproducibility/simulations/benchmark.smk b/reproducibility/simulations/benchmark.smk index 352ec3f..66f2971 100644 --- a/reproducibility/simulations/benchmark.smk +++ b/reproducibility/simulations/benchmark.smk @@ -64,6 +64,8 @@ CONET_OUTPUT = os.path.join(config["output"], "conet") CONET_GINKGO_OUTPUT = os.path.join(config["output"], "conet_ginkgo") CONET_TREE_OUTPUT = os.path.join(config["output"], "conet_tree_distances") CONET_GINKGO_TREE_OUTPUT = os.path.join(config["output"], "conet_ginkgo_tree_distances") +CONET_CBS_OUTPUT = os.path.join(config["output"], "conet_cbs") +CONET_CBS_TREE_OUTPUT = os.path.join(config["output"], "conet_cbs_tree_distances") try: other_cnv_methods = config["other_methods"] @@ -557,6 +559,79 @@ rule conet_ginkgo_inference: shell: "python {params.script} --bin_path {params.bin_path} --counts {input.d_mat} --cnvs {input.init_cnvs} --intermediate_dir {params.intermediate_dir} --seed {wildcards.rep_id} --out_cnvs {output.conet_inferred_cnvs} --out_tree {output.conet_inferred_tree} --out_attachments {output.conet_inferred_attachments}" +rule conet_cbs_inference: + params: + script = config["conet"]["script"], + n_nodes = n_nodes, + scratch = config["conet"]["scratch"], + mem = config["conet"]["mem"], + time = config["conet"]["time"], + script_inp = str(n_nodes)+"nodes_{rep_id}", + intermediate_dir = CONET_CBS_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '/', + bin_path = config["conet"]["bin_path"], + em_iters = config["conet"]["em_iters"], + pt_iters = config["conet"]["pt_iters"], + input: + d_mat = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_d_mat.txt', + output: + conet_inferred_cnvs = CONET_CBS_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/'+ '{rep_id}' + '_conet_cbs_inferred.txt', + conet_inferred_tree = CONET_CBS_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/'+ '{rep_id}' + '_conet_cbs_tree.txt', + conet_inferred_attachments = CONET_CBS_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_conet_cbsattachments.txt' + threads: 10 + # shell: + # "python {params.script} --bin_path {params.bin_path} --counts {input.d_mat} --cnvs {input.init_cnvs} --intermediate_dir {params.intermediate_dir} --seed {wildcards.rep_id} --em_iters {params.em_iters} --pt_iters {params.pt_iters} --out_cnvs {output.conet_inferred_cnvs} --out_tree {output.conet_inferred_tree} --out_attachments {output.conet_inferred_attachments}" + run: + real_ind = list(map(lambda x: x.start, list(tree.nodes))) + real_ind.extend(list(map(lambda x: x.end, list(tree.nodes)))) + real_ind = list(set(real_ind)) + real_ind.sort() + if wildcards.known_breakpoints: + # Extract real breakpoints indices + candidates = real_ind + save_counts_in_CONET_format(dirpath + "counts_synthetic", corr_reads, candidates) + else: + save_counts_in_CONET_format(dirpath + "counts_synthetic", corr_reads, []) + subprocess.run(["Rscript", "CBS_MergeLevels.R", "--mincells=5", + f"--output={dirpath}cbs_out", f"--dataset={dirpath}counts_synthetic"]) + X = np.loadtxt(dirpath + "cbs_out", delimiter=",") + candidates = [] + for i in range(X.shape[0]): + if X[i, 4] == 1.0: + candidates.append(i) + save_counts_in_CONET_format(dirpath + "counts_synthetic", corr_reads, candidates) + print(f"Found {len(candidates)} breakpoint candidates...") + + # Convert model data to CONET format + data_converter = dc.DataConverter(dirpath + "counts_synthetic", + delimiter=';', + default_bin_length=1, + event_length_normalizer=corr_reads.shape[1], # number of loci + add_chromosome_ends=False, + 
neutral_cn=2.0) + data_converter.create_CoNET_input_files(dirpath, add_chr_ends_to_indices=False) + + # Perform CONET inference + conet = c.CONET(str(Path(conf.conet_binary_dir) / Path("CONET"))) + params = cp.CONETParameters(tree_structure_prior_k1=0.01, + data_dir=dirpath, counts_penalty_s1=100000, counts_penalty_s2=100000, + param_inf_iters=500000, seed=random.randint(0, 1000), mixture_size=2, pt_inf_iters=1000000, + use_event_lengths_in_attachment=False, + event_length_penalty_k0=1) + conet.infer_tree(params) + print(f"CONET inference finished on model {i}") + result = ir.InferenceResult(conf.conet_binary_dir, corr_reads.T) + + inferred_cn = result.get_inferred_copy_numbers(2, conf.bins, conf.cells) + inferred_brkp = result.bp_matrix.astype(int) + inferred_nodes = set(result.tree.nodes) + inferred_edges = set(result.tree.edges) + + real_cn = data[0] + real_brkp = data[4][:, real_ind].astype(int) + real_nodes = set((n.start, n.end) for n in tree.nodes) + real_edges = set(((e[0].start, e[0].end), (e[1].start, e[1].end)) for e in tree.edges) + + rule run_medalt: params: script = config["medalt"]["script"], @@ -610,3 +685,160 @@ rule compute_medalt_ginkgo_tree_distances: medalt_tree_distance = f'{MEDALT_TREE_OUTPUT}/{str(n_nodes)}nodes_' + '{regions}regions_{reads}reads/{rep_id}_medalt_ginkgo_tree_distance.txt' shell: "python {params.script} {input.medalt_tree} {input.cnvs} {output.medalt_tree_distance}" + +rule compute_deltas: + input: + simulations = expand(f'{SIM_OUTPUT}_{sim_prefix}/{str(n_nodes)}nodes_' + '{regions}regions_{reads}reads/{rep_id}_d_mat.txt', + regions=[x for x in n_regions], reads=[x for x in n_reads], rep_id=[x for x in range(0,all_n_tps)]), + best_full_tree = expand(f'{TREES_OUTPUT}_best_full_tree/{str(n_nodes)}nodes_' + '{regions}regions_{reads}reads/{rep_id}_full_tree.txt', + regions=[x for x in n_regions], reads=[x for x in n_reads], rep_id=[x for x in range(0,all_n_tps)], tree_rep_id=[x for x in range(0,tree_rep)]), + + best_cluster_tree = expand(f'{TREES_OUTPUT}_best_cluster_tree/{str(n_nodes)}nodes_' + '{regions}regions_{reads}reads/{rep_id}_cluster_tree.txt', + regions=[x for x in n_regions], reads=[x for x in n_reads], rep_id=[x for x in range(0,all_n_tps)]), + + best_full_tree_sum = expand(f'{TREES_OUTPUT}_best_full_tree_sum/{str(n_nodes)}nodes_' + '{regions}regions_{reads}reads/{rep_id}_full_tree.txt', + regions=[x for x in n_regions], reads=[x for x in n_reads], rep_id=[x for x in range(0,all_n_tps)], tree_rep_id=[x for x in range(0,tree_rep)]), + + best_cluster_tree_sum = expand(f'{TREES_OUTPUT}_best_cluster_tree_sum/{str(n_nodes)}nodes_' + '{regions}regions_{reads}reads/{rep_id}_cluster_tree.txt', + regions=[x for x in n_regions], reads=[x for x in n_reads], rep_id=[x for x in range(0,all_n_tps)]), + + other_cnvs = expand(os.path.join(config["output"], '{method}') +'/'+ str(n_nodes) + 'nodes_' + '{regions}'+'regions_'+ '{reads}'+'reads'+ '/' + '{rep_id}' + '_{method}_inferred.txt', + regions=[x for x in n_regions], reads=[x for x in n_reads], rep_id=[x for x in range(0,all_n_tps)], method=other_cnv_methods), + output: + out_fname = os.path.join(output_path, 'deltas_sims.csv') + run: + rows = [] + rows.append('index,rep_id,method,delta') + i = 0 + for true_cnvs in input.simulations: + # True + rep_id = true_cnvs.split('/')[-1].split('_')[0] + gt = np.loadtxt(true_cnvs, delimiter=',') + if gt.shape[0] == n_bins: # should be cells by bins + gt = np.transpose(gt) + + # Diploid + i+=1 + method = 'diploid' + inf = np.ones(gt.shape) * 2 + error = 
np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + # i += 1 + # method = 'ginkgo' + # inf_cnvs = f'{GINKGO_OUTPUT}/{n_nodes}nodes/{rep_id}_ginkgo_inferred.txt' + # inf = np.loadtxt(inf_cnvs, delimiter=' ') + # error = np.sqrt(np.mean((gt-inf)**2)) + # rows.append(f'{i},{rep_id},{method},{error}') + # + # i += 1 + # method = 'conet_ginkgo' + # inf_cnvs = f'{CONET_GINKGO_OUTPUT}/{n_nodes}nodes/{rep_id}_conet_ginkgo_inferred.txt' + # inf = np.loadtxt(inf_cnvs, delimiter=',') + # error = np.sqrt(np.mean((gt-inf)**2)) + # rows.append(f'{i},{rep_id},{method},{error}') + + # i += 1 + # method = 'conet_knownbp' + # inf_cnvs = f'{CONET_KNOWNBP_OUTPUT}/{n_nodes}nodes/{rep_id}_conet_knownbp_inferred.txt' + # inf = np.loadtxt(inf_cnvs, delimiter=';').T + # error = np.sqrt(np.mean((gt-inf)**2)) + # rows.append(f'{i},{rep_id},{method},{error}') + + for other_cnvs_file in other_cnvs: + i += 1 + method = other_cnvs_file.split('') + inf = np.loadtxt(other_cnvs_file, delimiter=',') + if inf.shape != gt.shape: + inf = inf.T + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + for fairness in ['unfair', 'fair']: + i += 1 + method = f'cluster_tree_{fairness}' + inf_cnvs = f'{TREES_OUTPUT}_best_cluster_tree/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs.csv' + inf = np.loadtxt(inf_cnvs, delimiter=',') + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + i += 1 + method = f'cluster_tree_sum_{fairness}' + inf_cnvs = f'{TREES_OUTPUT}_best_cluster_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs.csv' + inf = np.loadtxt(inf_cnvs, delimiter=',') + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + i += 1 + method = f'full_tree_{fairness}' + inf_cnvs = f'{TREES_OUTPUT}_best_full_tree/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs.csv' + inf = np.loadtxt(inf_cnvs, delimiter=',') + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + i += 1 + method = f'full_tree_sum_{fairness}' + inf_cnvs = f'{TREES_OUTPUT}_best_full_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs.csv' + inf = np.loadtxt(inf_cnvs, delimiter=',') + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + # i += 1 + # method = f'cluster_tree_{fairness}_knownbps' + # inf_cnvs = f'{TREES_OUTPUT}_best_cluster_tree/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs_knownbps.csv' + # inf = np.loadtxt(inf_cnvs, delimiter=',') + # error = np.sqrt(np.mean((gt-inf)**2)) + # rows.append(f'{i},{rep_id},{method},{error}') + # + # i += 1 + # method = f'cluster_tree_sum_{fairness}_knownbps' + # inf_cnvs = f'{TREES_OUTPUT}_best_cluster_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs_knownbps.csv' + # inf = np.loadtxt(inf_cnvs, delimiter=',') + # error = np.sqrt(np.mean((gt-inf)**2)) + # rows.append(f'{i},{rep_id},{method},{error}') + # + # i += 1 + # method = f'full_tree_{fairness}_knownbps' + # inf_cnvs = f'{TREES_OUTPUT}_best_full_tree/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs_knownbps.csv' + # inf = np.loadtxt(inf_cnvs, delimiter=',') + # error = np.sqrt(np.mean((gt-inf)**2)) + # rows.append(f'{i},{rep_id},{method},{error}') + # + # i += 1 + # method = f'full_tree_sum_{fairness}_knownbps' + # inf_cnvs = f'{TREES_OUTPUT}_best_full_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs_knownbps.csv' + # inf = np.loadtxt(inf_cnvs, delimiter=',') + # error = np.sqrt(np.mean((gt-inf)**2)) + # 
rows.append(f'{i},{rep_id},{method},{error}') + + i += 1 + method = f'cluster_tree_{fairness}_adapted' + inf_cnvs = f'{TREES_OUTPUT}_best_cluster_tree/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs_adapted.csv' + inf = np.loadtxt(inf_cnvs, delimiter=',') + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + i += 1 + method = f'cluster_tree_sum_{fairness}_adapted' + inf_cnvs = f'{TREES_OUTPUT}_best_cluster_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs_adapted.csv' + inf = np.loadtxt(inf_cnvs, delimiter=',') + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + i += 1 + method = f'full_tree_{fairness}_adapted' + inf_cnvs = f'{TREES_OUTPUT}_best_full_tree/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs_adapted.csv' + inf = np.loadtxt(inf_cnvs, delimiter=',') + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + i += 1 + method = f'full_tree_sum_{fairness}_adapted' + inf_cnvs = f'{TREES_OUTPUT}_best_full_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs_adapted.csv' + inf = np.loadtxt(inf_cnvs, delimiter=',') + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + + with open(output.out_fname, 'w') as f: + f.write('\n'.join(rows)) diff --git a/reproducibility/simulations/conet_comparisons/conet_tree_distances.py b/reproducibility/simulations/conet_comparisons/conet_tree_distances.py new file mode 100644 index 0000000..76a1a15 --- /dev/null +++ b/reproducibility/simulations/conet_comparisons/conet_tree_distances.py @@ -0,0 +1,489 @@ +import glob +from tqdm import tqdm +import numpy as np +import pandas as pd +from pathlib import Path +import os + +from scicone import Tree + +def set_events(tree): + tree['event'] = np.zeros((tree['cnv'].shape[0],)) + # Fill events + def descend(root): + for child in root['children']: + child['event'] = child['cnv'] - root['cnv'] + descend(child) + return + descend(tree) + return tree + +def get_n_undoing(tree): + n_undoing = 0. 
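+    # An "undoing" event is a change in a child that moves the copy number back
+    # toward the neutral state at loci where the parent is already aberrant:
+    # the parent is > 2 and the child decreases it, or the parent is < 2 and
+    # the child increases it. For example, a parent CNV of [3, 1, 2] with a
+    # child CNV of [2, 2, 2] contributes two undoing events. descend() below
+    # accumulates the per-node counts, which are then summed over the tree.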
+ def descend(root): + for child in root['children']: + par = root['cnv'] + chi = child['cnv'] + dif = chi-par + is_event = np.logical_or(np.logical_and(par > 2, dif < 0), np.logical_and(par < 2, dif > 0)) + evs = np.where(is_event)[0] + child['n_undoing'] += len(evs) + print(child['label'], dif[is_event][:10], par[is_event][:10]) + descend(child) + return + descend(tree) + node_dict = tree_to_dict(tree) + for node in node_dict: + n_undoing += node_dict[node]['n_undoing'] + return n_undoing + + +def create_cell_node_ids(self): + nodes = list(self.node_dict.keys()) + n_cells = self.outputs['inferred_cnvs'].shape[0] + self.outputs['cell_node_ids'] = np.zeros((n_cells, 2)) + self.outputs['cell_node_ids'][:,0] = np.arange(n_cells) + for node in nodes: + idx = np.where(np.all(self.outputs['inferred_cnvs'] == self.node_dict[node]['cnv'], axis=1))[0] + print(node, idx) + self.outputs['cell_node_ids'][idx,1] = node + print(np.unique(self.outputs['cell_node_ids'][:,1], return_counts=True)) + +def path_to_node(tree, node_id): + path = [] + path.append(node_id) + parent_id = tree[node_id]['parent_id'] + while parent_id != '-1' and parent_id != 'NULL': + path.append(parent_id) + parent_id = tree[parent_id]['parent_id'] + return path[::-1][:] + +def path_between_nodes(tree, nodeA, nodeB): + pathA = np.array(path_to_node(tree, nodeA)) + pathB = np.array(path_to_node(tree, nodeB)) + path = [] + # Get MRCA + i = -1 + for node in pathA: + if node in pathB: + i += 1 + else: + break + mrca = pathA[i] + pathA = np.array(pathA[::-1]) + # Get path from A to MRCA + path = path + list(pathA[:np.where(pathA == mrca)[0][0]]) + # Get path from MRCA to B + path = path + list(pathB[np.where(pathB == mrca)[0][0]:]) + return path + +def get_distance(tree, id1, id2, n_nodes=False, paths=None): + if paths is None: + path = path_between_nodes(tree, id1, id2) + else: + try: + path = paths[id1][id2] + except KeyError: + path = paths[id2][id1] + + dist = 0 + if n_nodes: + dist = len(path) + else: + for i, node in enumerate(path): + if i <= len(path) - 2: + if tree[node]['cnv'][0] != -1 and tree[path[i+1]]['cnv'][0] != -1: + dist += np.sum(np.abs(tree[node]['cnv'] - tree[path[i+1]]['cnv'])) + + return dist + +def get_pairwise_cell_distances(tree, n_cells, attachments, paths=None, n_nodes=False): + mat = np.zeros((n_cells, n_cells)) + + for i in range(1, n_cells): + id1 = attachments['node'].loc[i] + for j in range(i): + id2 = attachments['node'].loc[j] + mat[i][j] = get_distance(tree, str(id1), str(id2), paths=paths, n_nodes=n_nodes) + + return mat + + +def tree_to_dict(tree): + tree_dict = dict() + + def descend(node, par_id): + label = node['label'] + tree_dict[label] = dict() + tree_dict[label]["cnv"] = node["cnv"] + tree_dict[label]["parent_id"] = par_id + tree_dict[label]["n_undoing"] = node["n_undoing"] + for child in node['children']: + descend(child, node['label']) + + descend(tree, '-1') + + return tree_dict + +def dict_to_tree(tree_dict, root_node): + # Make children + for i in tree_dict: + tree_dict[i]["children"] = [] + for i in tree_dict: + for j in tree_dict: + if tree_dict[j]["parent_id"] == i: + tree_dict[i]["children"].append(j) + + # Make tree + root = { + "cnv": tree_dict[root_node]["cnv"], + "children": [], + "label": root_node, + "parent_id": '-1', + "n_undoing": 0, + } + + # Recursively construct tree + def descend(super_tree, label): + for i, child in enumerate(tree_dict[label]["children"]): + super_tree["children"].append( + { + "cnv": tree_dict[child]["cnv"], + "children": [], + "label": child, + 
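+                    # Children start with n_undoing = 0; get_n_undoing()
+                    # increments these counters when it walks the tree.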
"parent_id": label, + "n_undoing": 0, + } + ) + + descend(super_tree["children"][-1], child) + + descend(root, root_node) + return root + +def get_conet_tree_distances(file_conet_tree, file_attachments, file_cnvs): + tree = pd.read_csv(file_conet_tree, sep="-", header=None, names=['from', 'to']) + attachments = pd.read_csv(file_attachments, sep=";", header=None, names=['cellname', 'idx', 'lstart', 'lend']) + cnvs = np.loadtxt(file_cnvs, delimiter=',').T + + tree['from'] = tree['from'].apply(lambda x: x.replace('(','').replace(')','') ) + tree['to'] = tree['to'].apply(lambda x: x.replace('(','').replace(')','') ) + + attachments['node'] = attachments[['lstart', 'lend']].apply(lambda x: ','.join(x[x.notnull()]), axis = 1) + + # Build the tree dict from files + print("Building tree...") + tree_dict = dict() + for i, row in tree.iterrows(): + parent_id = row['from'] + child_id = row['to'] + # Get the first cell attached to this node + cell_idx = np.where(attachments['node'].values == child_id)[0] + if len(cell_idx) > 0: + cell_idx = cell_idx[0] + node_cnv = cnvs[cell_idx].ravel() + else: + node_cnv = -1 * np.ones((cnvs.shape[1],)) + tree_dict[str(child_id)] = dict(parent_id=str(parent_id), cnv=node_cnv) + + # Add root + tree_dict['0,0'] = dict(parent_id='NULL', cnv=2*np.ones(cnvs.shape[1],)) + + # Fill nodes + def fill(node_dict, root_node): + # Create recursive dict + tree = dict_to_tree(node_dict, root_node) + + # Fill empty with mean of children + def descend(root): + cnvs = [] + for child in root['children']: + cr = descend(child) + cnvs.extend([cr]) + if root['cnv'][0] == -1: + print(root['label']) + cnvs = np.vstack(cnvs) + root['cnv'] = np.round(np.mean(cnvs, axis=0)) + print(f"Filling this node: {root['label']}") + print(root['cnv'][:10]) + print(cnvs[:,:10]) + return root['cnv'] + descend(tree) + + # Update node_dict + tree_dict = tree_to_dict(tree) + return tree_dict + + tree_dict = fill(tree_dict, '0,0') + + # Compute all paths between nodes + nodes = list(tree_dict.keys()) + reps = [] + for i, row in attachments.iterrows(): + if row['lstart'] == row['lend']: + reps.append(row['node']) + + reps = np.unique(reps) + if len(reps): + attachments = attachments.replace(reps,['0,0']) + + print("Computing distances...") + # Compute cell-cell distances + mat = get_pairwise_cell_distances(tree_dict, n_cells, attachments, n_nodes=False) + return mat, tree_dict + +def get_true_conet_tree_distances(file_conet_tree, file_attachments, file_cnvs): + tree = pd.read_csv(file_conet_tree, sep=" ", header=None, names=['from', 'to', 'none']) + tree = tree.drop(columns=['none']) + attachments = pd.read_csv(file_attachments, sep=",", header=None, names=['lstart', 'lend']) + cnvs = np.loadtxt(file_cnvs, delimiter=',') + + tree['from'] = tree['from'].apply(lambda x: x.replace('(','').replace(')','') ) + tree['to'] = tree['to'].apply(lambda x: x.replace('(','').replace(')','') ) + + attachments['lstart'] = attachments['lstart'].astype(int).astype(str) + attachments['lend'] = attachments['lend'].astype(int).astype(str) + attachments['node'] = attachments[['lstart', 'lend']].apply(lambda x: ','.join(x[x.notnull()]), axis = 1) + + # Build the tree dict from files + print("Building tree...") + tree_dict = dict() + for i, row in tree.iterrows(): + parent_id = row['from'] + child_id = row['to'] + # Get the first cell attached to this node + cell_idx = np.where(attachments['node'].values == child_id)[0] + if len(cell_idx) > 0: + cell_idx = cell_idx[0] + node_cnv = cnvs[cell_idx].ravel() + else: + node_cnv = -1 * 
np.ones((cnvs.shape[1],)) + tree_dict[str(child_id)] = dict(parent_id=str(parent_id), cnv=node_cnv) + + # Add root + tree_dict['0,0'] = dict(parent_id='NULL', cnv=2*np.ones(cnvs.shape[1],)) + + # Fill nodes + def fill(node_dict, root_node): + # Create recursive dict + tree = dict_to_tree(node_dict, root_node) + + # Fill empty with mean of children + def descend(root): + cnvs = [] + for child in root['children']: + cr = descend(child) + cnvs.extend([cr]) + if root['cnv'][0] == -1: + print(root['label']) + cnvs = np.vstack(cnvs) + root['cnv'] = np.round(np.mean(cnvs, axis=0)) + print(f"Filling this node: {root['label']}") + print(root['cnv'][:10]) + print(cnvs[:,:10]) + return root['cnv'] + descend(tree) + + # Update node_dict + tree_dict = tree_to_dict(tree) + return tree_dict + + tree_dict = fill(tree_dict, '0,0') + print(tree_dict) + + # Compute all paths between nodes + nodes = list(tree_dict.keys()) + reps = [] + for i, row in attachments.iterrows(): + if row['lstart'] == row['lend']: + reps.append(row['node']) + + reps = np.unique(reps) + if len(reps): + attachments = attachments.replace(reps,['0,0']) + + print("Computing true distances...") + # Compute cell-cell distances + mat = get_pairwise_cell_distances(tree_dict, n_cells, attachments, n_nodes=False) + return mat, tree_dict + +errs = 0 +compute_errors = True +simsdir = "/cluster/work/bewi/members/pedrof/sc-dna/sims2020_20_nodes" +scicone_path = "/cluster/work/bewi/members/pedrof/tupro_code/SCICoNE/build3" +output_path = "/cluster/work/bewi/members/pedrof/" + +all_folders = glob.glob(simsdir + "/results2023_boxplot3/*") +all_folders = np.sort([f for f in all_folders if "nu_on" not in f and "inference_full" not in f]) + +#methods = ['cluster_tree', 'full_tree', 'cluster_tree_sum', 'full_tree_sum', 'conet'] +methods = [ +'cluster_tree_unfair', 'full_tree_unfair', 'cluster_tree_sum_unfair', 'full_tree_sum_unfair', +'cluster_tree_fair', 'full_tree_fair', 'cluster_tree_sum_fair', 'full_tree_sum_fair', +'cluster_tree_unfair_adapted', 'full_tree_unfair_adapted', 'cluster_tree_sum_unfair_adapted', 'full_tree_sum_unfair_adapted', +'cluster_tree_fair_adapted', 'full_tree_fair_adapted', 'cluster_tree_sum_fair_adapted', 'full_tree_sum_fair_adapted', 'conet'] +#methods = ['conet'] + +#methods = ['cluster_tree_fair', 'full_tree_fair', 'conet'] + +n_cells = 200 +n_reps = 40 +n_nodes = 20 +n_bins = 1500 +sim_folder = [folder for folder in all_folders if "simulation" in folder][0] + +files_missing = [] +n_files_per_method = n_reps +n_files_missing = {key: 0 for key in methods} + +row_list = [] +for rep_id in tqdm(range(n_reps)): + print(f'{rep_id}:starting...') + true_tree_path = sim_folder + f"/{n_nodes}nodes/" + str(rep_id) + "_tree.txt" + true_cnvs_path = sim_folder + f"/{n_nodes}nodes/" + str(rep_id) + "_ground_truth.txt" + true_attachments_path = sim_folder + f"/{n_nodes}nodes/" + str(rep_id) + "_attachments.txt" + if compute_errors: + true_cnvs = np.loadtxt(true_cnvs_path, delimiter=',') + n_regions = np.count_nonzero(np.sum((np.diff(true_cnvs,axis=1) != 0) != 0, axis=0)) + + # Load in true CONET tree + real_distances, true_tree = get_true_conet_tree_distances(true_tree_path, true_attachments_path, true_cnvs_path) + real_distances *= n_regions/n_bins + print("Real distances") + print(real_distances[:10,:10]) + print(f'Norm: {np.linalg.norm(real_distances)}') + nodes_tree = len(true_tree.keys()) + print(f"Number of nodes in real tree: {nodes_tree}") + tree = dict_to_tree(true_tree, '0,0') + n_undoing = get_n_undoing(tree) + print("Undoing") + 
print(f'Real tree total number of undoing events: {n_undoing}') + + for method in methods: + method_ = str(method) + if 'fair' in method_: + adapted = '' + if 'adapted' in method_: + adapted = '_adapted' + fair = method_.split('_')[-2] + method_ = method_.rsplit('_',2)[0] + else: + fair = method_.split('_')[-1] + method_ = method_.rsplit('_',1)[0] + print(adapted, fair, method_) + #if 'ginkgo' in method: + # method_ = 'medalt_tree_distances' + method_folder = [folder for folder in all_folders if method_ in folder][0] + if 'medalt' not in method and 'conet' not in method: + if "sum" in method_: + method_ = method_.replace('_sum', '') + inferred_tree_path = method_folder + f"/{fair}/{n_nodes}nodes/" + str(rep_id) + '_' + method_ + adapted + ".txt" + inferred_cnvs_path = method_folder + f"/{fair}/{n_nodes}nodes/" + str(rep_id) + '_' + method_ + f"_cnvs{adapted}.csv" + print(f"Looking for {inferred_cnvs_path}") + my_file = Path(inferred_cnvs_path) + if not my_file.is_file(): + print(f'{inferred_cnvs_path} not found!') + n_files_missing[method] = n_files_missing[method] + 1 + files_missing.append(inferred_cnvs_path) + continue + elif 'medalt' in method: + distances_path = method_folder + f"/{n_nodes}nodes/" + str(rep_id) + '_' + method[:-1] + ".txt" + my_file = Path(distances_path) + if not my_file.is_file(): + print(f'{distances_path} not found!') + n_files_missing[method] = n_files_missing[method] + 1 + files_missing.append(inferred_cnvs_path) + continue + elif 'conet' in method: + file_cnvs = method_folder + f"/{n_nodes}nodes/{rep_id}_conet_cbs_inferred.txt" + my_file = Path(file_cnvs) + if not my_file.is_file(): + print(f'{file_cnvs} not found!') + n_files_missing[method] = n_files_missing[method] + 1 + files_missing.append(file_cnvs) + continue + + if compute_errors: + if 'tree' in method: + # Load in tree + inf_regions_path = f"/cluster/work/bewi/members/pedrof/sc-dna/sims2020_20_nodes/results2023_boxplot3/bp_detection__trees_{fair}/" + f"{n_nodes}nodes/" + str(rep_id) + "_segmented_region_sizes.txt" + inf_regions = np.loadtxt(inf_regions_path, delimiter=',').ravel() + + tree_inf = Tree(scicone_path, output_path, n_bins=n_bins) + tree_inf.outputs['inferred_cnvs'] = np.loadtxt(inferred_cnvs_path, delimiter=',') + tree_inf.outputs['region_sizes'] = inf_regions + tree_inf.outputs['region_neutral_states'] = np.ones((tree_inf.outputs['region_sizes'].shape[0],)) * 2 + print('Inf: reading tree...') + tree_inf.read_from_file(inferred_tree_path) + print('Inf: creating cell_node_ids...') + create_cell_node_ids(tree_inf) + print('Inf: getting pairwise distances...') + tree_inf.count_nodes_bins() + inf_distances = tree_inf.get_pairwise_cell_distances(distance='events') + inf_distances *= n_regions/n_bins + nodes_tree = len(tree_inf.node_dict.keys()) + print(f"Number of nodes in SCICoNE tree: {nodes_tree}") + tree = dict_to_tree(tree_inf.node_dict, '0') + n_undoing = get_n_undoing(tree) + print("Undoing") + print(f'SCICoNE total number of undoing events: {n_undoing}') + #print(tree_inf.tree_str) + #print(f"n_regions: {len(tree_inf.outputs['region_sizes'])}") + #for node in tree_inf.node_dict: + # print(tree_inf.node_dict[node]['region_event_dict']) + # print(f"n_bins: {tree_inf.node_dict[node]['n_bins']}") + + #node1 = list(tree_inf.node_dict.keys())[1] + #node2 = list(tree_inf.node_dict.keys())[-1] + #print(f"{node1}->{node2}: {tree_inf.path_between_nodes(node1, node2)}") + elif 'conet' in method: + file_conet_tree = method_folder + f"/{n_nodes}nodes/{rep_id}_conet_cbs_tree.txt" + file_attachments = 
method_folder + f"/{n_nodes}nodes/{rep_id}_conet_cbsattachments.txt" + file_cnvs = method_folder + f"/{n_nodes}nodes/{rep_id}_conet_cbs_inferred.txt" + try: + inf_distances, conet_tree = get_conet_tree_distances(file_conet_tree, file_attachments, file_cnvs) + inf_distances *= n_regions/n_bins + nodes_tree = len(conet_tree.keys()) + print(f"Number of nodes in CONET tree: {nodes_tree}") + tree = dict_to_tree(conet_tree, '0,0') + n_undoing = get_n_undoing(tree) + print("Undoing") + print(f'CONET total number of undoing events: {n_undoing}') + except Exception as e: + print(e) + print("CONET FAILED!") + errs += 1 + print(errs) + continue + else: + distances_path = method_folder + f"/{n_nodes}nodes/" + str(rep_id) + '_' + method[:-1] + ".txt" + inf_distances = np.loadtxt(distances_path, delimiter=',') + inf_distances *= n_regions/n_bins + + + print(f"\n**{method}**") + print("Inferred distances") + print(inf_distances) + print(f'Norm: {np.linalg.norm(inf_distances)}') + delta = np.linalg.norm(inf_distances - real_distances) / np.sqrt(n_cells*(n_cells-1)/2) + print(f'Tree delta: {delta}') + cnvs_delta = np.sqrt(np.mean((true_cnvs-tree_inf.outputs['inferred_cnvs'])**2)) + print(f'CNVs delta: {cnvs_delta}') + tup = (rep_id, n_nodes, method, delta) + row_list.append(tup) + print(f'{rep_id}:{method} done!') + +if len(files_missing) >= 0: + print('No files missing.') + if compute_errors: + df = pd.DataFrame(data=row_list, columns=['rep_id', 'n_nodes', 'method', 'delta']) + results_file = simsdir + f'/results2023_boxplot3/{n_nodes}_nodes_tau.csv' + df.to_csv(results_file) + print(df) + print(f'Results file is available at {results_file}.') +else: + print(f'FAILED: There are {len(files_missing)} files missing in total.') + print(f'Number of missing CNV output files per method (out of {n_files_per_method}):') + for method in n_files_missing: + print(f'{method}: {n_files_missing[method]}') + diff --git a/reproducibility/simulations/conet_comparisons/default/Snakefile b/reproducibility/simulations/conet_comparisons/default/Snakefile new file mode 100644 index 0000000..e4794c6 --- /dev/null +++ b/reproducibility/simulations/conet_comparisons/default/Snakefile @@ -0,0 +1,1134 @@ +""" +Here we show that the SCICoNE results in the CONET paper are due to SCICoNE +being passed normalised instead of raw counts. + +We assess two cases: +1. SCICoNE with w=20 and input = normalized data to recover their results +2. 
SCICoNE with w=4 and input = normalized data * 50 to obtain proper results +""" + +import os +import shutil +import pandas as pd +import numpy as np +from tqdm import tqdm as tqdm +import random +from math import sqrt +import matplotlib.pyplot as plt +import seaborn as sns +import subprocess + +import tempfile +import conet +import conet.data_converter.data_converter as dc +import conet.conet as c +import conet.conet_parameters as cp +import conet.inference_result as ir +from conet.per_bin_model.cn_sampler import * +from conet.per_bin_model.node_label import * +from conet.per_bin_model.tree_generator import * +from conet.per_bin_model.counts_generator import * +import networkx as nx + +from scicone import SCICoNE, Tree + +all_n_tps = config["simulate"]["n_reps"] +n_cells = config["simulate"]["n_cells"] +n_bins = config["simulate"]["n_bins"] +n_nodes = config["simulate"]["n_nodes"] +window_size = config["bp_detection"]["window_size"] + +sim_prefix=config["simulate"]["prefix"] +binaries_path = config['binaries_path'] +output_path = config["output"] + +SIM_OUTPUT= os.path.join(config["output"], "simulation") +BP_OUTPUT = os.path.join(config["output"], "bp_detection") +TREES_OUTPUT = os.path.join(config["output"], "inference") +GINKGO_OUTPUT = os.path.join(config["output"], "ginkgo") +CONET_GINKGO_OUTPUT = os.path.join(config["output"], "conet_ginkgo") +CONET_CBS_OUTPUT = os.path.join(config["output"], "conet_cbs") +CONET_KNOWNBP_OUTPUT = os.path.join(config["output"], "conet_knownbp") + +output_temp = config["inference"]["output_temp"] +try: + os.mkdir(output_temp) +except: + pass + +# the default case for cluster_fraction variable +try: + cf = config["inference"]["full_trees"]["cluster_fraction"] +except KeyError: + cf = 1.0 + +rule all: + input: + deltas = os.path.join(output_path, 'deltas_conetsims.csv') + run: + print("rule all") + +rule cluster_and_conet: + input: + best_cluster_tree = expand(f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs.csv', + rep_id=[x for x in range(0,all_n_tps)], fairness=["fair", "unfair"]), + + best_cluster_tree_adapted = expand(f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs_adapted.csv', + rep_id=[x for x in range(0,all_n_tps)], fairness=["fair", "unfair"]), + + conet_cbs = expand(CONET_CBS_OUTPUT+'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_conet_cbs_inferred.txt', rep_id=[x for x in range(0,all_n_tps)]), + run: + print("rule cluster_and_conet") + +rule run_sim_conet: + params: + n_nodes = n_nodes, + n_bins = n_bins, + n_cells = n_cells, + output: + d_mat = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_d_mat.txt', + ground_truth = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' + '/'+ '{rep_id}' + '_ground_truth.txt', + tree = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' + '/'+ '{rep_id}' + '_tree.txt', + attachments = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' + '/'+ '{rep_id}' + '_attachments.txt', + run: + import random + from math import sqrt + + loci = params.n_bins + tree_size = params.n_nodes + cells = params.n_cells + random.seed(int(wildcards.rep_id)) + np.random.seed(int(wildcards.rep_id)) + + cn_s = \ + CNSampler({0: 0.020240121, 1: 0.203724532, 3: 0.050340118, 4: 0.038828672, 2: 0.686866557}, + {0: 0.449, 1: 0.116, 2: 1.41 * 0.187, 3: 0.114, 4: 0.279, 5: 0.0957, 6: 0.4833, 7: 0.2760, + 8: 6.15780, 9: 4.72105270}) + cn_s.NOISE_PROB = 0.01 + + t_gen = 
TreeGenerator(cn_s) + d_gen = CountsGenerator(cn_s) + + tree, trunk = t_gen.generate_random_tree(loci, tree_size) + counts, attachment, corrected_counts, diff_matrix, brkp_matrix = d_gen.generate_data(loci, tree, cells, trunk) + + # # generate event tree and cell data, this might take a while + # cn_s = CNSampler.create_default_sampler() + # t_gen = EventTreeGenerator(cn_sampler=cn_s, tree_size=tree_size, no_loci=loci) + # tree: EventTree = t_gen.generate_random_tree() + # d_gen = CountsGenerator(cn_s, tree) + # counts, attachment, corrected_counts, brkp_matrix = d_gen.generate_data(loci, cells) + conet_cc = np.array(corrected_counts) + counts = np.array(counts) + + assert counts.shape[0] == n_cells and counts.shape[1] == n_bins + assert conet_cc.shape[0] == n_cells and conet_cc.shape[1] == n_bins + + np.savetxt(output.d_mat, conet_cc, delimiter=',') + np.savetxt(output.ground_truth, counts, delimiter=',') + nx.write_edgelist(tree, output.tree) + np.savetxt(output.attachments, attachment, delimiter=',') + +rule detect_breakpoints: + params: + binary = config["bp_detection"]["bin"], + window_size = config["bp_detection"]["window_size"], + verbosity = config["bp_detection"]["verbosity"], + threshold = config["bp_detection"]["threshold"], + bp_limit = config["bp_detection"]["bp_limit"], + postfix = sim_prefix + input: + d_matrix_file = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_d_mat.txt' + output: + segmented_regions = BP_OUTPUT + '_' + sim_prefix +'_trees' + '_{fairness}' + '/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_segmented_regions.txt', + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees' + '_{fairness}' + '/' + str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_segmented_region_sizes.txt', + run: + data = np.loadtxt(input.d_matrix_file, delimiter=',') + n_cells = data.shape[0] + n_bins = data.shape[1] + + # Optional filter for low coverages + def filter_lr(lr_matrix, H=None): + freq = np.fft.fftfreq(lr_matrix.shape[-1], 1) # 1 Hz sampling rate + + filtered_lr = np.empty(lr_matrix.shape) + for c in range(lr_matrix.shape[0]): + X = np.fft.fft(lr_matrix[c]) + Y = X * H + y = np.fft.ifft(Y) + filtered_lr[c] = y + + return filtered_lr + + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=f"{n_nodes}nodes_{wildcards.fairness}_{wildcards.rep_id}_{params.postfix}") + + window_size = params.window_size + if wildcards.fairness == 'fair': + # Assuming data was simulated with coverage of 100reads per bin + data = data * 50 + window_size = 1#4 + + freq = np.fft.fftfreq(n_bins, 1) + df = 0.015 + gpl = np.exp(- ((freq-1/(2*window_size))/(2*df))**2) # pos. 
frequencies + gmn = np.exp(- ((freq+1/(2*window_size))/(2*df))**2) + g = gpl + gmn + + bps = dict() + if np.sum(data[0])/n_bins <= 10 and window_size > 1: + bps = sci.detect_breakpoints(data, window_size=window_size, threshold=0.1, bp_limit=params.bp_limit, compute_sp=False, evaluate_peaks=False) + filtered_lr = filter_lr(bps['lr_vec'].T, H=g) + bps = sci.detect_breakpoints(data, window_size=window_size, bp_limit=params.bp_limit, threshold=params.threshold, lr=filtered_lr) + else: + bps = sci.detect_breakpoints(data, window_size=window_size, threshold=params.threshold, bp_limit=params.bp_limit) + + np.savetxt(output.segmented_regions, bps['segmented_regions'], delimiter=',') + np.savetxt(output.segmented_region_sizes, bps['segmented_region_sizes'], delimiter=',') + +rule segment_regions: + input: + d_matrix_file = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' +\ + '/' + '{rep_id}' + '_d_mat.txt', + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + '_segmented_region_sizes.txt' + output: + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + "_segmented_counts.csv", + segmented_counts_shape = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + "_segmented_counts_shape.txt" + run: + filtered_counts = np.loadtxt(input.d_matrix_file, delimiter=',') + if wildcards.fairness == 'fair': + # Assuming data was simulated with coverage of 100reads per bin + filtered_counts = filtered_counts * 50 + + n_cells = filtered_counts.shape[0] + region_sizes = np.loadtxt(input.segmented_region_sizes) + n_regions = len(region_sizes) + sum_region_sizes = np.sum(region_sizes) + condensed_mat = np.zeros((n_cells, n_regions)) + + print("segmenting the bins...") + for i in tqdm(range(n_cells)): + region_id = 0 + region_count = 0 + # import ipdb; ipdb.set_trace() # debugging starts here + for j in range(filtered_counts.shape[1]): + to_add = filtered_counts[i][j] + condensed_mat[i][region_id] += to_add + region_count += 1 + if region_count == region_sizes[region_id]: + region_id += 1 + region_count = 0 + + if not np.allclose(condensed_mat.sum(axis=1), filtered_counts.sum(axis=1)): + raise AssertionError( + "not all values of the sums before & after " + "segmentation are close") + + print("saving the segmented regions...") + np.savetxt(output.segmented_counts, condensed_mat, delimiter=",") + np.savetxt(output.segmented_counts_shape, condensed_mat.shape) + +rule learn_cluster_trees: + params: + cluster_tree_n_iters = config["inference"]["cluster_trees"]["n_iters"], + cluster_tree_n_tries = config["inference"]["cluster_trees"]["n_tries"], + cluster_tree_n_reps = config["inference"]["cluster_trees"]["n_reps"], + posfix = "ctree" + f"{str(n_nodes)}" + "nodes_{fairness}_{rep_id}", + input: + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + "_segmented_counts.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + '_segmented_region_sizes.txt' + output: + cluster_tree = f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree.txt', + cluster_tree_inferred_cnvs = f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs.csv' + threads: 10 + run: + segmented_counts = 
np.loadtxt(input.segmented_counts, delimiter=',') + segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + cov = np.mean(np.sum(segmented_counts, axis=1) / np.sum(segmented_region_sizes)) + alpha = 1./segmented_counts.shape[1] + gamma = 1./cov + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + # Run cluster trees + sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, n_reps=params.cluster_tree_n_reps, cluster=True, full=False, cluster_tree_n_iters=params.cluster_tree_n_iters, max_tries=params.cluster_tree_n_tries, robustness_thr=0.5, alpha=alpha, max_scoring=True, gamma=gamma) + + # Store best cluster tree + with open(output.cluster_tree, "w") as file: + for line in sci.best_cluster_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.cluster_tree_inferred_cnvs, sci.best_cluster_tree.outputs['inferred_cnvs'], delimiter=',') + +rule learn_cluster_trees_sum: + params: + cluster_tree_n_iters = config["inference"]["cluster_trees"]["n_iters"], + cluster_tree_n_tries = config["inference"]["cluster_trees"]["n_tries"], + cluster_tree_n_reps = config["inference"]["cluster_trees"]["n_reps"], + posfix = "ctree_sum" + f"{str(n_nodes)}" + "nodes_{fairness}_{rep_id}" + input: + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + "_segmented_counts.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + '_segmented_region_sizes.txt' + output: + cluster_tree_sum = f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree.txt', + cluster_tree_inferred_cnvs_sum = f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs.csv' + threads: 10 + run: + segmented_counts = np.loadtxt(input.segmented_counts, delimiter=',') + segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + alpha = 1./segmented_counts.shape[1] + + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + # Run cluster trees + sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, cluster=True, full=False, cluster_tree_n_iters=params.cluster_tree_n_iters, + max_tries=params.cluster_tree_n_tries, robustness_thr=0.5, alpha=alpha, max_scoring=False, n_reps=params.cluster_tree_n_reps) + + # Store best cluster tree + with open(output.cluster_tree_sum, "w") as file: + for line in sci.best_cluster_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.cluster_tree_inferred_cnvs_sum, sci.best_cluster_tree.outputs['inferred_cnvs'], delimiter=',') + +rule learn_full_trees: + params: + full_tree_n_iters = config["inference"]["full_trees"]["n_iters"], + move_probs = config["inference"]["full_trees"]["move_probs"], + cf = cf, + posfix = "ftree" + f"{str(n_nodes)}" + "nodes_{fairness}_{rep_id}" + input: + cluster_tree = f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree.txt', + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + "_segmented_counts.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + '_segmented_region_sizes.txt' + output: + full_tree = f'{TREES_OUTPUT}_best_full_tree' + 
'/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree.txt', + full_tree_inferred_cnvs = f'{TREES_OUTPUT}_best_full_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs.csv' + threads: 10 + run: + segmented_counts = np.loadtxt(input.segmented_counts, delimiter=',') + segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + alpha = 1./segmented_counts.shape[1] + + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + tree = Tree(binaries_path + '/inference', "", postfix=params.posfix) + tree.outputs['region_sizes'] = segmented_region_sizes + tree.outputs['region_neutral_states'] = np.ones((segmented_region_sizes.shape[0],)) * 2 + + tree.read_from_file(input.cluster_tree) + sci.best_cluster_tree = tree + + # Run full trees starting from cluster tree + sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, n_reps=10, cluster=False, full=True, full_tree_n_iters=params.full_tree_n_iters, + max_tries=3, robustness_thr=0.5, alpha=alpha, move_probs=params.move_probs, max_scoring=True) + + # Store best full tree + with open(output.full_tree, "w") as file: + for line in sci.best_full_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.full_tree_inferred_cnvs, sci.best_full_tree.outputs['inferred_cnvs'], delimiter=',') + +rule learn_full_trees_sum: + params: + full_tree_n_iters = config["inference"]["full_trees"]["n_iters"], + move_probs = config["inference"]["full_trees"]["move_probs"], + cf = cf, + posfix = "ftreesum" + f"{str(n_nodes)}" + "nodes_{fairness}_{rep_id}" + input: + cluster_tree = f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree.txt', + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + "_segmented_counts.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + '_segmented_region_sizes.txt' + output: + full_tree = f'{TREES_OUTPUT}_best_full_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree.txt', + full_tree_inferred_cnvs = f'{TREES_OUTPUT}_best_full_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs.csv' + threads: 10 + run: + segmented_counts = np.loadtxt(input.segmented_counts, delimiter=',') + segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + alpha = 1./segmented_counts.shape[1] + + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + tree = Tree(binaries_path + '/inference', "", postfix=params.posfix) + tree.outputs['region_sizes'] = segmented_region_sizes + tree.outputs['region_neutral_states'] = np.ones((segmented_region_sizes.shape[0],)) * 2 + + tree.read_from_file(input.cluster_tree) + sci.best_cluster_tree = tree + + # Run full trees starting from cluster tree + sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, n_reps=10, cluster=False, full=True, full_tree_n_iters=params.full_tree_n_iters, + max_tries=3, robustness_thr=0.5, alpha=alpha, move_probs=params.move_probs, max_scoring=False) + + # Store best full tree + with open(output.full_tree, "w") as file: + for line in sci.best_full_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.full_tree_inferred_cnvs, sci.best_full_tree.outputs['inferred_cnvs'], delimiter=',') + + +rule learn_cluster_trees_adapted: + 
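+    # Same as learn_cluster_trees above, except that sci.learn_tree() is also
+    # passed eta=0.4 (the "adapted" setting) and the tree/CNV outputs are
+    # written to the *_adapted files.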
params: + cluster_tree_n_iters = config["inference"]["cluster_trees"]["n_iters"], + cluster_tree_n_tries = config["inference"]["cluster_trees"]["n_tries"], + cluster_tree_n_reps = config["inference"]["cluster_trees"]["n_reps"], + posfix = "ctree" + f"{str(n_nodes)}" + "nodes_{fairness}_{rep_id}_adapted", + input: + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + "_segmented_counts.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + '_segmented_region_sizes.txt' + output: + cluster_tree = f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_adapted.txt', + cluster_tree_inferred_cnvs = f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs_adapted.csv' + threads: 10 + run: + segmented_counts = np.loadtxt(input.segmented_counts, delimiter=',') + segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + cov = np.mean(np.sum(segmented_counts, axis=1) / np.sum(segmented_region_sizes)) + alpha = 1./segmented_counts.shape[1] + gamma = 1./cov + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + # Run cluster trees + sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, n_reps=params.cluster_tree_n_reps, cluster=True, full=False, cluster_tree_n_iters=params.cluster_tree_n_iters, max_tries=params.cluster_tree_n_tries, robustness_thr=0.5, alpha=alpha, max_scoring=True, gamma=gamma, + eta=0.4) + + # Store best cluster tree + with open(output.cluster_tree, "w") as file: + for line in sci.best_cluster_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.cluster_tree_inferred_cnvs, sci.best_cluster_tree.outputs['inferred_cnvs'], delimiter=',') + +rule learn_cluster_trees_sum_adapted: + params: + cluster_tree_n_iters = config["inference"]["cluster_trees"]["n_iters"], + cluster_tree_n_tries = config["inference"]["cluster_trees"]["n_tries"], + cluster_tree_n_reps = config["inference"]["cluster_trees"]["n_reps"], + posfix = "ctree_sum" + f"{str(n_nodes)}" + "nodes_{fairness}_{rep_id}_adapted" + input: + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + "_segmented_counts.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + '_segmented_region_sizes.txt' + output: + cluster_tree_sum = f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_adapted.txt', + cluster_tree_inferred_cnvs_sum = f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs_adapted.csv' + threads: 10 + run: + segmented_counts = np.loadtxt(input.segmented_counts, delimiter=',') + segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + alpha = 1./segmented_counts.shape[1] + + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + # Run cluster trees + sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, cluster=True, full=False, cluster_tree_n_iters=params.cluster_tree_n_iters, + max_tries=params.cluster_tree_n_tries, robustness_thr=0.5, alpha=alpha, max_scoring=False, n_reps=params.cluster_tree_n_reps, + eta=0.4) + + 
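+        # As in learn_cluster_trees_sum, trees are scored with max_scoring=False;
+        # the only addition in this "adapted" rule is eta=0.4 in the call above.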
# Store best cluster tree + with open(output.cluster_tree_sum, "w") as file: + for line in sci.best_cluster_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.cluster_tree_inferred_cnvs_sum, sci.best_cluster_tree.outputs['inferred_cnvs'], delimiter=',') + +rule learn_full_trees_adapted: + params: + full_tree_n_iters = config["inference"]["full_trees"]["n_iters"], + move_probs = config["inference"]["full_trees"]["move_probs"], + cf = cf, + posfix = "ftree" + f"{str(n_nodes)}" + "nodes_{fairness}_{rep_id}_adapted" + input: + cluster_tree = f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_adapted.txt', + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + "_segmented_counts.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + '_segmented_region_sizes.txt' + output: + full_tree = f'{TREES_OUTPUT}_best_full_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_adapted.txt', + full_tree_inferred_cnvs = f'{TREES_OUTPUT}_best_full_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs_adapted.csv' + threads: 10 + run: + segmented_counts = np.loadtxt(input.segmented_counts, delimiter=',') + segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + alpha = 1./segmented_counts.shape[1] + + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + tree = Tree(binaries_path + '/inference', "", postfix=params.posfix) + tree.outputs['region_sizes'] = segmented_region_sizes + tree.outputs['region_neutral_states'] = np.ones((segmented_region_sizes.shape[0],)) * 2 + + tree.read_from_file(input.cluster_tree) + sci.best_cluster_tree = tree + + # Run full trees starting from cluster tree + sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, n_reps=10, cluster=False, full=True, full_tree_n_iters=params.full_tree_n_iters, + max_tries=3, robustness_thr=0.5, alpha=alpha, move_probs=params.move_probs, max_scoring=True, + eta=0.4) + + # Store best full tree + with open(output.full_tree, "w") as file: + for line in sci.best_full_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.full_tree_inferred_cnvs, sci.best_full_tree.outputs['inferred_cnvs'], delimiter=',') + +rule learn_full_trees_sum_adapted: + params: + full_tree_n_iters = config["inference"]["full_trees"]["n_iters"], + move_probs = config["inference"]["full_trees"]["move_probs"], + cf = cf, + posfix = "ftreesum" + f"{str(n_nodes)}" + "nodes_{fairness}_{rep_id}_adapted" + input: + cluster_tree = f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_adapted.txt', + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + "_segmented_counts.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + '_segmented_region_sizes.txt' + output: + full_tree = f'{TREES_OUTPUT}_best_full_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_adapted.txt', + full_tree_inferred_cnvs = f'{TREES_OUTPUT}_best_full_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs_adapted.csv' + threads: 10 + run: + segmented_counts = np.loadtxt(input.segmented_counts, delimiter=',') + 
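+        # This mirrors learn_full_trees_sum: the full-tree search is seeded from
+        # the adapted cluster tree, and, like the other *_adapted rules, the
+        # learn_tree() call below also passes eta=0.4.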
segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + alpha = 1./segmented_counts.shape[1] + + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + tree = Tree(binaries_path + '/inference', "", postfix=params.posfix) + tree.outputs['region_sizes'] = segmented_region_sizes + tree.outputs['region_neutral_states'] = np.ones((segmented_region_sizes.shape[0],)) * 2 + + tree.read_from_file(input.cluster_tree) + sci.best_cluster_tree = tree + + # Run full trees starting from cluster tree + sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, n_reps=10, cluster=False, full=True, full_tree_n_iters=params.full_tree_n_iters, + max_tries=3, robustness_thr=0.5, alpha=alpha, move_probs=params.move_probs, max_scoring=False, + eta=0.4) + + # Store best full tree + with open(output.full_tree, "w") as file: + for line in sci.best_full_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.full_tree_inferred_cnvs, sci.best_full_tree.outputs['inferred_cnvs'], delimiter=',') + + + + +rule segment_regions_knownbps: + input: + d_matrix_file = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' +\ + '/' + '{rep_id}' + '_d_mat.txt', + ground_truth = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' +\ + '/'+ '{rep_id}' + '_ground_truth.txt', + output: + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + "_segmented_counts_knownbps.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + "_segmented_region_sizes_knownbps.txt" + run: + filtered_counts = np.loadtxt(input.d_matrix_file, delimiter=',') + if wildcards.fairness == 'fair': + # Assuming data was simulated with coverage of 100reads per bin + filtered_counts = filtered_counts * 50 + + n_cells = filtered_counts.shape[0] + + cnvs = np.loadtxt(input.ground_truth, delimiter=',') + # True breakpoints: + bps = np.where(np.any(np.diff(cnvs, axis=1) != 0, axis=0))[0] #+ 1 + bps = np.unique(np.sort(np.concatenate([bps, bps+1]))) + print(f"Got {len(bps)} true breakpoints") + bps = list(bps) + bps.append(cnvs.shape[1]) + bps.insert(0, 0) + bps = np.unique(bps) + + region_sizes = np.diff(bps) + + n_regions = len(region_sizes) + sum_region_sizes = np.sum(region_sizes) + condensed_mat = np.zeros((n_cells, n_regions)) + + print("segmenting the bins...") + for i in tqdm(range(n_cells)): + region_id = 0 + region_count = 0 + for j in range(filtered_counts.shape[1]): + to_add = filtered_counts[i][j] + condensed_mat[i][region_id] += to_add + region_count += 1 + if region_count == region_sizes[region_id]: + region_id += 1 + region_count = 0 + + if not np.allclose(condensed_mat.sum(axis=1), filtered_counts.sum(axis=1)): + raise AssertionError( + "not all values of the sums before & after " + "segmentation are close") + + print("saving the segmented regions...") + np.savetxt(output.segmented_counts, condensed_mat, delimiter=",") + np.savetxt(output.segmented_region_sizes, region_sizes, delimiter=',') + +rule learn_cluster_trees_knownbps: + params: + cluster_tree_n_iters = config["inference"]["cluster_trees"]["n_iters"], + cluster_tree_n_tries = config["inference"]["cluster_trees"]["n_tries"], + cluster_tree_n_reps = config["inference"]["cluster_trees"]["n_reps"], + posfix = "ctree" + f"{str(n_nodes)}" + "nodes_{fairness}_knownbps_{rep_id}", + input: + segmented_counts = BP_OUTPUT + '_' + sim_prefix 
+'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + "_segmented_counts_knownbps.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + '_segmented_region_sizes_knownbps.txt' + output: + cluster_tree = f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_knownbps.txt', + cluster_tree_inferred_cnvs = f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs_knownbps.csv' + threads: 10 + run: + segmented_counts = np.loadtxt(input.segmented_counts, delimiter=',') + segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + cov = np.mean(np.sum(segmented_counts, axis=1) / np.sum(segmented_region_sizes)) + alpha = 1./segmented_counts.shape[1] + gamma = 1./cov + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + # Run cluster trees + sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, n_reps=params.cluster_tree_n_reps, cluster=True, full=False, cluster_tree_n_iters=params.cluster_tree_n_iters, max_tries=params.cluster_tree_n_tries, robustness_thr=0.5, alpha=alpha, max_scoring=True, gamma=gamma) + + # Store best cluster tree + with open(output.cluster_tree, "w") as file: + for line in sci.best_cluster_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.cluster_tree_inferred_cnvs, sci.best_cluster_tree.outputs['inferred_cnvs'], delimiter=',') + +rule learn_cluster_trees_sum_knownbps: + params: + cluster_tree_n_iters = config["inference"]["cluster_trees"]["n_iters"], + cluster_tree_n_tries = config["inference"]["cluster_trees"]["n_tries"], + cluster_tree_n_reps = config["inference"]["cluster_trees"]["n_reps"], + posfix = "ctree_sum" + f"{str(n_nodes)}" + "nodes_{fairness}_knownbps_{rep_id}" + input: + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + "_segmented_counts_knownbps.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + '_segmented_region_sizes_knownbps.txt' + output: + cluster_tree_sum = f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_knownbps.txt', + cluster_tree_inferred_cnvs_sum = f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs_knownbps.csv' + threads: 10 + run: + segmented_counts = np.loadtxt(input.segmented_counts, delimiter=',') + segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + alpha = 1./segmented_counts.shape[1] + + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + # Run cluster trees + sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, cluster=True, full=False, cluster_tree_n_iters=params.cluster_tree_n_iters, + max_tries=params.cluster_tree_n_tries, robustness_thr=0.5, alpha=alpha, max_scoring=False, n_reps=params.cluster_tree_n_reps) + + # Store best cluster tree + with open(output.cluster_tree_sum, "w") as file: + for line in sci.best_cluster_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.cluster_tree_inferred_cnvs_sum, sci.best_cluster_tree.outputs['inferred_cnvs'], delimiter=',') + +rule learn_full_trees_knownbps: + params: + full_tree_n_iters = 
config["inference"]["full_trees"]["n_iters"], + move_probs = config["inference"]["full_trees"]["move_probs"], + cf = cf, + posfix = "ftree" + f"{str(n_nodes)}" + "nodes_{fairness}_knownbps_{rep_id}" + input: + cluster_tree = f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_knownbps.txt', + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + "_segmented_counts_knownbps.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + '_segmented_region_sizes_knownbps.txt' + output: + full_tree = f'{TREES_OUTPUT}_best_full_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_knownbps.txt', + full_tree_inferred_cnvs = f'{TREES_OUTPUT}_best_full_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs_knownbps.csv' + threads: 10 + run: + segmented_counts = np.loadtxt(input.segmented_counts, delimiter=',') + segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + alpha = 1./segmented_counts.shape[1] + + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + tree = Tree(binaries_path + '/inference', "", postfix=params.posfix) + tree.outputs['region_sizes'] = segmented_region_sizes + tree.outputs['region_neutral_states'] = np.ones((segmented_region_sizes.shape[0],)) * 2 + + tree.read_from_file(input.cluster_tree) + sci.best_cluster_tree = tree + + # Run full trees starting from cluster tree + sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, n_reps=10, cluster=False, full=True, full_tree_n_iters=params.full_tree_n_iters, + max_tries=3, robustness_thr=0.5, alpha=alpha, move_probs=params.move_probs, max_scoring=True) + + # Store best full tree + with open(output.full_tree, "w") as file: + for line in sci.best_full_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.full_tree_inferred_cnvs, sci.best_full_tree.outputs['inferred_cnvs'], delimiter=',') + +rule learn_full_trees_sum_knownbps: + params: + full_tree_n_iters = config["inference"]["full_trees"]["n_iters"], + move_probs = config["inference"]["full_trees"]["move_probs"], + cf = cf, + posfix = "ftreesum" + f"{str(n_nodes)}" + "nodes_{fairness}_knownbps_{rep_id}" + input: + cluster_tree = f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_knownbps.txt', + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + "_segmented_counts_knownbps.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + '_segmented_region_sizes_knownbps.txt' + output: + full_tree = f'{TREES_OUTPUT}_best_full_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_knownbps.txt', + full_tree_inferred_cnvs = f'{TREES_OUTPUT}_best_full_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs_knownbps.csv' + threads: 10 + run: + segmented_counts = np.loadtxt(input.segmented_counts, delimiter=',') + segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + alpha = 1./segmented_counts.shape[1] + + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + tree = Tree(binaries_path + '/inference', "", postfix=params.posfix) + 
tree.outputs['region_sizes'] = segmented_region_sizes + tree.outputs['region_neutral_states'] = np.ones((segmented_region_sizes.shape[0],)) * 2 + + tree.read_from_file(input.cluster_tree) + sci.best_cluster_tree = tree + + # Run full trees starting from cluster tree + sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, n_reps=10, cluster=False, full=True, full_tree_n_iters=params.full_tree_n_iters, + max_tries=3, robustness_thr=0.5, alpha=alpha, move_probs=params.move_probs, max_scoring=False) + + # Store best full tree + with open(output.full_tree, "w") as file: + for line in sci.best_full_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.full_tree_inferred_cnvs, sci.best_full_tree.outputs['inferred_cnvs'], delimiter=',') + +rule ginkgo_inference: + params: + script = config["ginkgo"]["script"], + n_nodes = n_nodes, + scratch = config["ginkgo"]["scratch"], + mem = config["ginkgo"]["mem"], + time = config["ginkgo"]["time"], + script_inp = str(n_nodes)+"nodes_{rep_id}" + input: + d_mat = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_d_mat.txt' + output: + ginkgo_inferred_cnvs = GINKGO_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_ginkgo_inferred.txt' + threads: 10 + shell: + " Rscript {params.script} {input.d_mat} {output.ginkgo_inferred_cnvs}" + +rule conet_ginkgo_inference: + params: + script = config["conet"]["script"], + n_nodes = n_nodes, + scratch = config["conet"]["scratch"], + mem = config["conet"]["mem"], + time = config["conet"]["time"], + script_inp = str(n_nodes)+"nodes_{rep_id}", + intermediate_dir = CONET_GINKGO_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '/', + bin_path = config["conet"]["bin_path"], + em_iters = config["conet"]["em_iters"], + pt_iters = config["conet"]["pt_iters"], + cbs_script = config["conet"]["cbs_script"], + input: + d_mat = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_d_mat.txt', + init_cnvs = GINKGO_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/'+ '{rep_id}' + '_ginkgo_inferred.txt', + output: + conet_inferred_cnvs = CONET_GINKGO_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/'+ '{rep_id}' + '_conet_ginkgo_inferred.txt', + conet_inferred_tree = CONET_GINKGO_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/'+ '{rep_id}' + '_conet_ginkgo_tree.txt', + conet_inferred_attachments = CONET_GINKGO_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_conet_ginkgo_attachments.txt' + threads: 10 + # shell: + # "python {params.script} --bin_path {params.bin_path} --counts {input.d_mat} --cnvs {input.init_cnvs} --intermediate_dir {params.intermediate_dir} --seed {wildcards.rep_id} --em_iters {params.em_iters} --pt_iters {params.pt_iters} --out_cnvs {output.conet_inferred_cnvs} --out_tree {output.conet_inferred_tree} --out_attachments {output.conet_inferred_attachments}" + run: + real_ind = list(map(lambda x: x.start, list(tree.nodes))) + real_ind.extend(list(map(lambda x: x.end, list(tree.nodes)))) + real_ind = list(set(real_ind)) + real_ind.sort() + if wildcards.known_breakpoints: + # Extract real breakpoints indices + candidates = real_ind + save_counts_in_CONET_format(dirpath + "counts_synthetic", corr_reads, candidates) + else: + save_counts_in_CONET_format(dirpath + "counts_synthetic", corr_reads, []) + subprocess.run(["Rscript", f"{params.cbs_script}", "--mincells=5", + f"--output={dirpath}cbs_out", f"--dataset={dirpath}counts_synthetic"]) + X = np.loadtxt(dirpath + "cbs_out", delimiter=",") + candidates = [] + for i in range(X.shape[0]):
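+ # Column index 4 of the CBS output is treated as the breakpoint-candidate flag
+ # (1.0 marks a bin proposed as a candidate breakpoint by the CBS script).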
+ if X[i, 4] == 1.0: + candidates.append(i) + save_counts_in_CONET_format(dirpath + "counts_synthetic", corr_reads, candidates) + print(f"Found {len(candidates)} breakpoint candidates...") + + # Convert model data to CONET format + data_converter = dc.DataConverter(dirpath + "counts_synthetic", + delimiter=';', + default_bin_length=1, + event_length_normalizer=corr_reads.shape[1], # number of loci + add_chromosome_ends=False, + neutral_cn=2.0) + data_converter.create_CoNET_input_files(dirpath, add_chr_ends_to_indices=False) + + # Perform CONET inference + conet = c.CONET(str(Path(conf.conet_binary_dir) / Path("CONET"))) + params = cp.CONETParameters(tree_structure_prior_k1=0.01, + data_dir=dirpath, counts_penalty_s1=100000, counts_penalty_s2=100000, + param_inf_iters=500000, seed=random.randint(0, 1000), mixture_size=2, pt_inf_iters=1000000, + use_event_lengths_in_attachment=False, + event_length_penalty_k0=1) + conet.infer_tree(params) + print(f"CONET inference finished on model {i}") + result = ir.InferenceResult(conf.conet_binary_dir, corr_reads.T) + + inferred_cn = result.get_inferred_copy_numbers(2, conf.bins, conf.cells) + inferred_brkp = result.bp_matrix.astype(int) + inferred_nodes = set(result.tree.nodes) + inferred_edges = set(result.tree.edges) + + real_cn = data[0] + real_brkp = data[4][:, real_ind].astype(int) + real_nodes = set((n.start, n.end) for n in tree.nodes) + real_edges = set(((e[0].start, e[0].end), (e[1].start, e[1].end)) for e in tree.edges) + +rule conet_cbs_inference: + params: + script = config["conet"]["script"], + n_nodes = n_nodes, + scratch = config["conet"]["scratch"], + mem = config["conet"]["mem"], + time = config["conet"]["time"], + script_inp = str(n_nodes)+"nodes_{rep_id}", + intermediate_dir = CONET_CBS_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '/', + bin_path = config["conet"]["bin_path"], + em_iters = config["conet"]["em_iters"], + pt_iters = config["conet"]["pt_iters"], + cbs_script = config["conet"]["cbs_script"], + input: + d_mat = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_d_mat.txt', + output: + conet_inferred_cnvs = CONET_CBS_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/'+ '{rep_id}' + '_conet_cbs_inferred.txt', + conet_inferred_tree = CONET_CBS_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/'+ '{rep_id}' + '_conet_cbs_tree.txt', + conet_inferred_attachments = CONET_CBS_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_conet_cbsattachments.txt' + threads: 10 + # shell: + run: + ## Save file in CONET format + def save_counts_in_CONET_format(path, counts, indices): + no_cells = counts.shape[0] + no_loci = counts.shape[1] + counts = np.transpose(counts) + + # Add columns required by CONET + add = np.zeros([no_loci, 5], dtype=np.float64) + add.fill(1) + add[:, 1] = range(0, no_loci) + add[:, 2] = range(1, no_loci + 1) + add[:, 4] = 0 + add[indices, 4] = 1 + full_counts = np.hstack([add, counts]) + + # Add first row with cell names + names = np.zeros([1, 5 + no_cells], dtype=np.float64) + names[0, 5:] = range(0, no_cells) + full_counts = np.vstack([names, full_counts]) + + np.savetxt(path, full_counts, delimiter=";") + + corr_reads = np.loadtxt(input.d_mat, delimiter=',') + + dirpath = tempfile.mkdtemp() + if not dirpath.endswith("/"): + dirpath += "/" + + save_counts_in_CONET_format(dirpath + "counts_synthetic", corr_reads, []) + subprocess.run(["Rscript", f"{params.cbs_script}", "--mincells=5", + f"--output={dirpath}cbs_out", f"--dataset={dirpath}counts_synthetic"]) + X = np.loadtxt(dirpath + 
"cbs_out", delimiter=",") + candidates = [] + for i in range(X.shape[0]): + if X[i, 4] == 1.0: + candidates.append(i) + + save_counts_in_CONET_format(dirpath + "counts_synthetic", corr_reads, candidates) + print(f"Found {len(candidates)} breakpoint candidates...") + + # Convert model data to CONET format + data_converter = dc.DataConverter(dirpath + "counts_synthetic", + delimiter=';', + default_bin_length=1, + event_length_normalizer=corr_reads.shape[1], # number of loci + add_chromosome_ends=False, + neutral_cn=2.0) + data_converter.create_CoNET_input_files(dirpath, add_chr_ends_to_indices=False) + + # Perform CONET inference + conet = c.CONET(params.bin_path) + conetparams = cp.CONETParameters(tree_structure_prior_k1=0.01, + data_dir=dirpath, counts_penalty_s1=100000, counts_penalty_s2=100000, + param_inf_iters=500000, seed=random.randint(0, 1000), mixture_size=2, pt_inf_iters=1000000, + use_event_lengths_in_attachment=False, + event_length_penalty_k0=1, + output_dir=dirpath) + conet.infer_tree(conetparams) + result = ir.InferenceResult(dirpath, corr_reads.T) + + inferred_cn = result.get_inferred_copy_numbers(2, n_bins, n_cells) + np.savetxt(output.conet_inferred_cnvs, inferred_cn, delimiter=",") + + # Move results to proper output + shutil.copy(os.path.join(dirpath, "inferred_attachment"), output.conet_inferred_attachments) + shutil.copy(os.path.join(dirpath, "inferred_tree"), output.conet_inferred_tree) + + +rule conet_knownbp_inference: + params: + script = config["conet"]["script"], + n_nodes = n_nodes, + scratch = config["conet"]["scratch"], + mem = config["conet"]["mem"], + time = config["conet"]["time"], + script_inp = str(n_nodes)+"nodes_{rep_id}", + intermediate_dir = CONET_KNOWNBP_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '/', + bin_path = config["conet"]["bin_path"], + em_iters = config["conet"]["em_iters"], + pt_iters = config["conet"]["pt_iters"], + input: + d_mat = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_d_mat.txt', + init_cnvs = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_ground_truth.txt', + output: + conet_inferred_cnvs = CONET_KNOWNBP_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/'+ '{rep_id}' + '_conet_knownbp_inferred.txt', + conet_inferred_tree = CONET_KNOWNBP_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/'+ '{rep_id}' + '_conet_knownbp_tree.txt', + conet_inferred_attachments = CONET_KNOWNBP_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_conet_knownbp_attachments.txt' + threads: 10 + shell: + "python {params.script} --delim , --bin_path {params.bin_path} --counts {input.d_mat} --cnvs {input.init_cnvs} --intermediate_dir {params.intermediate_dir} --seed {wildcards.rep_id} --em_iters {params.em_iters} --pt_iters {params.pt_iters} --out_cnvs {output.conet_inferred_cnvs} --out_tree {output.conet_inferred_tree} --out_attachments {output.conet_inferred_attachments}" + + +rule compute_deltas: + input: + simulations = expand(f'{SIM_OUTPUT}_{sim_prefix}/{str(n_nodes)}nodes' + '/{rep_id}_ground_truth.txt', + rep_id=[x for x in range(all_n_tps)]), + + best_full_tree = expand(f'{TREES_OUTPUT}_best_full_tree' + '/{fairness}/' + f'{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs.csv', + rep_id=[x for x in range(all_n_tps)], fairness=["fair", "unfair"]), + + best_cluster_tree = expand(f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs.csv', + rep_id=[x for x in range(all_n_tps)], fairness=["fair", "unfair"]), + + best_full_tree_sum 
= expand(f'{TREES_OUTPUT}_best_full_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs.csv', + rep_id=[x for x in range(all_n_tps)], fairness=["fair", "unfair"]), + + best_cluster_tree_sum = expand(f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs.csv', + rep_id=[x for x in range(all_n_tps)], fairness=["fair", "unfair"]), + + + best_full_tree_adapted = expand(f'{TREES_OUTPUT}_best_full_tree' + '/{fairness}/' + f'{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs_adapted.csv', + rep_id=[x for x in range(all_n_tps)], fairness=["fair", "unfair"]), + + best_cluster_tree_adapted = expand(f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs_adapted.csv', + rep_id=[x for x in range(all_n_tps)], fairness=["fair", "unfair"]), + + best_full_tree_sum_adapted = expand(f'{TREES_OUTPUT}_best_full_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs_adapted.csv', + rep_id=[x for x in range(all_n_tps)], fairness=["fair", "unfair"]), + + best_cluster_tree_sum_adapted = expand(f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs_adapted.csv', + rep_id=[x for x in range(all_n_tps)], fairness=["fair", "unfair"]), + + conet_cbs = expand(CONET_CBS_OUTPUT+'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_conet_cbs_inferred.txt', + rep_id=[x for x in range(all_n_tps)]), + + # best_full_tree_knownbps = expand(f'{TREES_OUTPUT}_best_full_tree' + '/{fairness}/' + f'{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs_knownbps.csv', + # rep_id=[x for x in range(0,all_n_tps)], fairness=["fair", "unfair"]), + # + # best_cluster_tree_knownbps = expand(f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs_knownbps.csv', + # rep_id=[x for x in range(0,all_n_tps)], fairness=["fair", "unfair"]), + # + # best_full_tree_sum_knownbps = expand(f'{TREES_OUTPUT}_best_full_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs_knownbps.csv', + # rep_id=[x for x in range(0,all_n_tps)], fairness=["fair", "unfair"]), + # + # best_cluster_tree_sum_knownbps = expand(f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs_knownbps.csv', + # rep_id=[x for x in range(0,all_n_tps)], fairness=["fair", "unfair"]), + + # ginkgo = expand(GINKGO_OUTPUT+'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_ginkgo_inferred.txt', + # rep_id=[x for x in range(0,all_n_tps)]), + # + # conet = expand(CONET_GINKGO_OUTPUT+'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_conet_ginkgo_inferred.txt', + # rep_id=[x for x in range(0,all_n_tps)]), + + + # conet_knwonbp = expand(CONET_KNOWNBP_OUTPUT+'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_conet_knownbp_inferred.txt', + # rep_id=[x for x in range(0,all_n_tps)]), + output: + out_fname = os.path.join(output_path, 'deltas_conetsims.csv') + run: + rows = [] + rows.append('index,rep_id,method,delta') + i = 0 + for true_cnvs in input.simulations: + rep_id = true_cnvs.split('/')[-1].split('_')[0] + gt = np.loadtxt(true_cnvs, delimiter=',') + if gt.shape[0] == n_bins: # should be cells by bins + gt = np.transpose(gt) + + i+=1 + method = 'diploid' + inf = np.ones(gt.shape) * 2 + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + # i += 1 + # method = 'ginkgo' + # inf_cnvs = 
f'{GINKGO_OUTPUT}/{n_nodes}nodes/{rep_id}_ginkgo_inferred.txt' + # inf = np.loadtxt(inf_cnvs, delimiter=' ') + # error = np.sqrt(np.mean((gt-inf)**2)) + # rows.append(f'{i},{rep_id},{method},{error}') + # + # i += 1 + # method = 'conet_ginkgo' + # inf_cnvs = f'{CONET_GINKGO_OUTPUT}/{n_nodes}nodes/{rep_id}_conet_ginkgo_inferred.txt' + # inf = np.loadtxt(inf_cnvs, delimiter=',') + # error = np.sqrt(np.mean((gt-inf)**2)) + # rows.append(f'{i},{rep_id},{method},{error}') + + # i += 1 + # method = 'conet_knownbp' + # inf_cnvs = f'{CONET_KNOWNBP_OUTPUT}/{n_nodes}nodes/{rep_id}_conet_knownbp_inferred.txt' + # inf = np.loadtxt(inf_cnvs, delimiter=';').T + # error = np.sqrt(np.mean((gt-inf)**2)) + # rows.append(f'{i},{rep_id},{method},{error}') + + i += 1 + method = 'conet_cbs' + inf_cnvs = f'{CONET_CBS_OUTPUT}/{n_nodes}nodes/{rep_id}_conet_cbs_inferred.txt' + inf = np.loadtxt(inf_cnvs, delimiter=',').T + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + for fairness in ['unfair', 'fair']: + i += 1 + method = f'cluster_tree_{fairness}' + inf_cnvs = f'{TREES_OUTPUT}_best_cluster_tree/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs.csv' + inf = np.loadtxt(inf_cnvs, delimiter=',') + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + i += 1 + method = f'cluster_tree_sum_{fairness}' + inf_cnvs = f'{TREES_OUTPUT}_best_cluster_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs.csv' + inf = np.loadtxt(inf_cnvs, delimiter=',') + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + i += 1 + method = f'full_tree_{fairness}' + inf_cnvs = f'{TREES_OUTPUT}_best_full_tree/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs.csv' + inf = np.loadtxt(inf_cnvs, delimiter=',') + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + i += 1 + method = f'full_tree_sum_{fairness}' + inf_cnvs = f'{TREES_OUTPUT}_best_full_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs.csv' + inf = np.loadtxt(inf_cnvs, delimiter=',') + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + # i += 1 + # method = f'cluster_tree_{fairness}_knownbps' + # inf_cnvs = f'{TREES_OUTPUT}_best_cluster_tree/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs_knownbps.csv' + # inf = np.loadtxt(inf_cnvs, delimiter=',') + # error = np.sqrt(np.mean((gt-inf)**2)) + # rows.append(f'{i},{rep_id},{method},{error}') + # + # i += 1 + # method = f'cluster_tree_sum_{fairness}_knownbps' + # inf_cnvs = f'{TREES_OUTPUT}_best_cluster_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs_knownbps.csv' + # inf = np.loadtxt(inf_cnvs, delimiter=',') + # error = np.sqrt(np.mean((gt-inf)**2)) + # rows.append(f'{i},{rep_id},{method},{error}') + # + # i += 1 + # method = f'full_tree_{fairness}_knownbps' + # inf_cnvs = f'{TREES_OUTPUT}_best_full_tree/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs_knownbps.csv' + # inf = np.loadtxt(inf_cnvs, delimiter=',') + # error = np.sqrt(np.mean((gt-inf)**2)) + # rows.append(f'{i},{rep_id},{method},{error}') + # + # i += 1 + # method = f'full_tree_sum_{fairness}_knownbps' + # inf_cnvs = f'{TREES_OUTPUT}_best_full_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs_knownbps.csv' + # inf = np.loadtxt(inf_cnvs, delimiter=',') + # error = np.sqrt(np.mean((gt-inf)**2)) + # rows.append(f'{i},{rep_id},{method},{error}') + + i += 1 + method = f'cluster_tree_{fairness}_adapted' + inf_cnvs = 
f'{TREES_OUTPUT}_best_cluster_tree/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs_adapted.csv' + inf = np.loadtxt(inf_cnvs, delimiter=',') + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + i += 1 + method = f'cluster_tree_sum_{fairness}_adapted' + inf_cnvs = f'{TREES_OUTPUT}_best_cluster_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs_adapted.csv' + inf = np.loadtxt(inf_cnvs, delimiter=',') + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + i += 1 + method = f'full_tree_{fairness}_adapted' + inf_cnvs = f'{TREES_OUTPUT}_best_full_tree/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs_adapted.csv' + inf = np.loadtxt(inf_cnvs, delimiter=',') + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + i += 1 + method = f'full_tree_sum_{fairness}_adapted' + inf_cnvs = f'{TREES_OUTPUT}_best_full_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs_adapted.csv' + inf = np.loadtxt(inf_cnvs, delimiter=',') + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + + with open(output.out_fname, 'w') as f: + f.write('\n'.join(rows)) + +rule plot_deltas: + input: + deltas_csv = os.path.join(output_path, 'deltas_conetsims.csv') + output: + deltas_png = os.path.join(output_path, 'deltas_conetsims.png') + run: + df = pd.read_csv(input.deltas_csv, index_col=0) + plt.figure(figsize=(10,6)) + bp = sns.boxplot(data=df, x='method', y='delta', hue='method', dodge=False) + plt.setp(bp.get_xticklabels(), rotation=45) + plt.ylim([0,0.5]) + plt.savefig(output.deltas_png) + diff --git a/reproducibility/simulations/conet_comparisons/default/config.json b/reproducibility/simulations/conet_comparisons/default/config.json new file mode 100644 index 0000000..3bb3588 --- /dev/null +++ b/reproducibility/simulations/conet_comparisons/default/config.json @@ -0,0 +1,84 @@ +{ + "binaries_path":"/cluster/work/bewi/members/pedrof/tupro_code/SCICoNE/build3", + "output":"/cluster/work/bewi/members/pedrof/sc-dna/sims2020_20_nodes/results2023_bps/", + "simulate": + { + "n_cells":200, + "n_nodes":20, + "n_bins":1500, + "n_reps":40, + "prefix":"", + "bin":"/cluster/work/bewi/members/pedrof/tupro_code/SCICoNE/build3/simulation" + }, + "bp_detection": + { + "window_size":20, + "threshold":3, + "bp_limit":300, + "verbosity":1, + "bin":"/cluster/work/bewi/members/pedrof/sc-dna/build3/breakpoint_detection" + }, + "inference": + { + "cluster_trees": + { + "n_reps":10, + "n_iters":40000, + "n_nodes":0, + "move_probs":[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.01, 0.1, 0.01, 1.0, 0.01], + "n_tries":3 + }, + "full_trees": + { + "n_reps":10, + "n_iters":200000, + "cluster_fraction":1.0, + "n_nodes": 0, + "move_probs":[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.01, 0.1, 0.01, 1.0, 0.01], + }, + "ploidy":2, + "verbosity":1, + "copy_number_limit":4, + "seed":42, + "bin":"/cluster/work/bewi/members/pedrof/tupro_code/SCICoNE/build3/inference", + "output_temp":"/cluster/work/bewi/members/pedrof/sc-dna/sims2020_20_nodes/results2023_bps_temp/" + }, + "hmm_copy": + { + "threads":1, + "scratch":20000, + "mem":"3200", + "time": "600", + "script":"/cluster/work/bewi/members/pedrof/cna_other_methods/HMMCopyInference.R" + }, + "ginkgo": + { + "threads":1, + "scratch":20000, + "mem":"3200", + "time": "600", + "script":"/cluster/work/bewi/members/pedrof/cna_other_methods/ginkgo.R" + }, + "conet": + { + "threads":1, + "scratch":20000, + "mem":"3200", + "time":"600", + 
"bin_path":"/cluster/work/bewi/members/pedrof/sc-dna/sims2020_20_nodes/snake_analysis_files_2023/CONET-reproducibility/src/CONET", + "script":"/cluster/work/bewi/members/pedrof/cna_other_methods/run_conet.py", + "cbs_script":"/cluster/work/bewi/members/pedrof/cna_other_methods/CBS_MergeLevels.R", + "em_iters": 500000, + "pt_iters": 1000000 + }, + "conet_tree_distances": + { + "threads":1, + "scratch":20000, + "mem":"3200", + "time":"600", + "bin_path":"/cluster/work/bewi/members/pedrof/CONET/src/CONET", + "script":"/cluster/work/bewi/members/pedrof/cna_other_methods/run_conet_tree_distances.py" + } +} + diff --git a/reproducibility/simulations/conet_comparisons/low_sd/Snakefile b/reproducibility/simulations/conet_comparisons/low_sd/Snakefile new file mode 100644 index 0000000..6884b98 --- /dev/null +++ b/reproducibility/simulations/conet_comparisons/low_sd/Snakefile @@ -0,0 +1,1134 @@ +""" +Here we show that the SCICoNE results in the CONET paper are due to SCICoNE +being passed normalised instead of raw counts. + +We assess two cases: +1. SCICoNE with w=20 and input = normalized data to recover their results +2. SCICoNE with w=4 and input = normalized data * 50 to obtain proper results +""" + +import os +import shutil +import pandas as pd +import numpy as np +from tqdm import tqdm as tqdm +import random +from math import sqrt +import matplotlib.pyplot as plt +import seaborn as sns +import subprocess + +import tempfile +import conet +import conet.data_converter.data_converter as dc +import conet.conet as c +import conet.conet_parameters as cp +import conet.inference_result as ir +from conet.per_bin_model.cn_sampler import * +from conet.per_bin_model.node_label import * +from conet.per_bin_model.tree_generator import * +from conet.per_bin_model.counts_generator import * +import networkx as nx + +from scicone import SCICoNE, Tree + +all_n_tps = config["simulate"]["n_reps"] +n_cells = config["simulate"]["n_cells"] +n_bins = config["simulate"]["n_bins"] +n_nodes = config["simulate"]["n_nodes"] +window_size = config["bp_detection"]["window_size"] + +sim_prefix=config["simulate"]["prefix"] +binaries_path = config['binaries_path'] +output_path = config["output"] + +SIM_OUTPUT= os.path.join(config["output"], "simulation") +BP_OUTPUT = os.path.join(config["output"], "bp_detection") +TREES_OUTPUT = os.path.join(config["output"], "inference") +GINKGO_OUTPUT = os.path.join(config["output"], "ginkgo") +CONET_GINKGO_OUTPUT = os.path.join(config["output"], "conet_ginkgo") +CONET_CBS_OUTPUT = os.path.join(config["output"], "conet_cbs") +CONET_KNOWNBP_OUTPUT = os.path.join(config["output"], "conet_knownbp") + +output_temp = config["inference"]["output_temp"] +try: + os.mkdir(output_temp) +except: + pass + +# the default case for cluster_fraction variable +try: + cf = config["inference"]["full_trees"]["cluster_fraction"] +except KeyError: + cf = 1.0 + +rule all: + input: + deltas = os.path.join(output_path, 'deltas_conetsims.csv') + run: + print("rule all") + +rule cluster_and_conet: + input: + best_cluster_tree = expand(f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs.csv', + rep_id=[x for x in range(0,all_n_tps)], fairness=["fair", "unfair"]), + + best_cluster_tree_adapted = expand(f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs_adapted.csv', + rep_id=[x for x in range(0,all_n_tps)], fairness=["fair", "unfair"]), + + conet_cbs = expand(CONET_CBS_OUTPUT+'/'+ str(n_nodes) + 'nodes' + '/' 
+ '{rep_id}' + '_conet_cbs_inferred.txt', rep_id=[x for x in range(0,all_n_tps)]), + run: + print("rule cluster_and_conet") + +rule run_sim_conet: + params: + n_nodes = n_nodes, + n_bins = n_bins, + n_cells = n_cells, + output: + d_mat = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_d_mat.txt', + ground_truth = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' + '/'+ '{rep_id}' + '_ground_truth.txt', + tree = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' + '/'+ '{rep_id}' + '_tree.txt', + attachments = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' + '/'+ '{rep_id}' + '_attachments.txt', + run: + import random + from math import sqrt + + loci = params.n_bins + tree_size = params.n_nodes + cells = params.n_cells + random.seed(int(wildcards.rep_id)) + np.random.seed(int(wildcards.rep_id)) + + cn_s = \ + CNSampler({0: 0.020240121, 1: 0.203724532, 3: 0.050340118, 4: 0.038828672, 2: 0.686866557}, + {0: 0.0449, 1: 0.116, 2: 1.41 * 0.187, 3: 0.114, 4: 0.279, 5: 0.0957, 6: 0.4833, 7: 0.2760, + 8: 6.15780, 9: 4.72105270}) + cn_s.NOISE_PROB = 0.01 + + t_gen = TreeGenerator(cn_s) + d_gen = CountsGenerator(cn_s) + + tree, trunk = t_gen.generate_random_tree(loci, tree_size) + counts, attachment, corrected_counts, diff_matrix, brkp_matrix = d_gen.generate_data(loci, tree, cells, trunk) + + # # generate event tree and cell data, this might take a while + # cn_s = CNSampler.create_default_sampler() + # t_gen = EventTreeGenerator(cn_sampler=cn_s, tree_size=tree_size, no_loci=loci) + # tree: EventTree = t_gen.generate_random_tree() + # d_gen = CountsGenerator(cn_s, tree) + # counts, attachment, corrected_counts, brkp_matrix = d_gen.generate_data(loci, cells) + conet_cc = np.array(corrected_counts) + counts = np.array(counts) + + assert counts.shape[0] == n_cells and counts.shape[1] == n_bins + assert conet_cc.shape[0] == n_cells and conet_cc.shape[1] == n_bins + + np.savetxt(output.d_mat, conet_cc, delimiter=',') + np.savetxt(output.ground_truth, counts, delimiter=',') + nx.write_edgelist(tree, output.tree) + np.savetxt(output.attachments, attachment, delimiter=',') + +rule detect_breakpoints: + params: + binary = config["bp_detection"]["bin"], + window_size = config["bp_detection"]["window_size"], + verbosity = config["bp_detection"]["verbosity"], + threshold = config["bp_detection"]["threshold"], + bp_limit = config["bp_detection"]["bp_limit"], + postfix = sim_prefix + input: + d_matrix_file = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_d_mat.txt' + output: + segmented_regions = BP_OUTPUT + '_' + sim_prefix +'_trees' + '_{fairness}' + '/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_segmented_regions.txt', + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees' + '_{fairness}' + '/' + str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_segmented_region_sizes.txt', + run: + data = np.loadtxt(input.d_matrix_file, delimiter=',') + n_cells = data.shape[0] + n_bins = data.shape[1] + + # Optional filter for low coverages + def filter_lr(lr_matrix, H=None): + freq = np.fft.fftfreq(lr_matrix.shape[-1], 1) # 1 Hz sampling rate + + filtered_lr = np.empty(lr_matrix.shape) + for c in range(lr_matrix.shape[0]): + X = np.fft.fft(lr_matrix[c]) + Y = X * H + y = np.fft.ifft(Y) + filtered_lr[c] = y + + return filtered_lr + + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=f"{n_nodes}nodes_{wildcards.fairness}_{wildcards.rep_id}_{params.postfix}") + + window_size = params.window_size + if 
wildcards.fairness == 'fair': + # Assuming data was simulated with coverage of 100reads per bin + data = data * 50 + window_size = 1#4 + + freq = np.fft.fftfreq(n_bins, 1) + df = 0.015 + gpl = np.exp(- ((freq-1/(2*window_size))/(2*df))**2) # pos. frequencies + gmn = np.exp(- ((freq+1/(2*window_size))/(2*df))**2) + g = gpl + gmn + + bps = dict() + if np.sum(data[0])/n_bins <= 10 and window_size > 1: + bps = sci.detect_breakpoints(data, window_size=window_size, threshold=0.1, bp_limit=params.bp_limit, compute_sp=False, evaluate_peaks=False) + filtered_lr = filter_lr(bps['lr_vec'].T, H=g) + bps = sci.detect_breakpoints(data, window_size=window_size, bp_limit=params.bp_limit, threshold=params.threshold, lr=filtered_lr) + else: + bps = sci.detect_breakpoints(data, window_size=window_size, threshold=params.threshold, bp_limit=params.bp_limit) + + np.savetxt(output.segmented_regions, bps['segmented_regions'], delimiter=',') + np.savetxt(output.segmented_region_sizes, bps['segmented_region_sizes'], delimiter=',') + +rule segment_regions: + input: + d_matrix_file = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' +\ + '/' + '{rep_id}' + '_d_mat.txt', + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + '_segmented_region_sizes.txt' + output: + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + "_segmented_counts.csv", + segmented_counts_shape = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + "_segmented_counts_shape.txt" + run: + filtered_counts = np.loadtxt(input.d_matrix_file, delimiter=',') + if wildcards.fairness == 'fair': + # Assuming data was simulated with coverage of 100reads per bin + filtered_counts = filtered_counts * 50 + + n_cells = filtered_counts.shape[0] + region_sizes = np.loadtxt(input.segmented_region_sizes) + n_regions = len(region_sizes) + sum_region_sizes = np.sum(region_sizes) + condensed_mat = np.zeros((n_cells, n_regions)) + + print("segmenting the bins...") + for i in tqdm(range(n_cells)): + region_id = 0 + region_count = 0 + # import ipdb; ipdb.set_trace() # debugging starts here + for j in range(filtered_counts.shape[1]): + to_add = filtered_counts[i][j] + condensed_mat[i][region_id] += to_add + region_count += 1 + if region_count == region_sizes[region_id]: + region_id += 1 + region_count = 0 + + if not np.allclose(condensed_mat.sum(axis=1), filtered_counts.sum(axis=1)): + raise AssertionError( + "not all values of the sums before & after " + "segmentation are close") + + print("saving the segmented regions...") + np.savetxt(output.segmented_counts, condensed_mat, delimiter=",") + np.savetxt(output.segmented_counts_shape, condensed_mat.shape) + +rule learn_cluster_trees: + params: + cluster_tree_n_iters = config["inference"]["cluster_trees"]["n_iters"], + cluster_tree_n_tries = config["inference"]["cluster_trees"]["n_tries"], + cluster_tree_n_reps = config["inference"]["cluster_trees"]["n_reps"], + posfix = "ctree" + f"{str(n_nodes)}" + "nodes_{fairness}_{rep_id}", + input: + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + "_segmented_counts.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + '_segmented_region_sizes.txt' + output: + cluster_tree = f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + 
f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree.txt', + cluster_tree_inferred_cnvs = f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs.csv' + threads: 10 + run: + segmented_counts = np.loadtxt(input.segmented_counts, delimiter=',') + segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + cov = np.mean(np.sum(segmented_counts, axis=1) / np.sum(segmented_region_sizes)) + alpha = 1./segmented_counts.shape[1] + gamma = 1./cov + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + # Run cluster trees + sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, n_reps=params.cluster_tree_n_reps, cluster=True, full=False, cluster_tree_n_iters=params.cluster_tree_n_iters, max_tries=params.cluster_tree_n_tries, robustness_thr=0.5, alpha=alpha, max_scoring=True, gamma=gamma) + + # Store best cluster tree + with open(output.cluster_tree, "w") as file: + for line in sci.best_cluster_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.cluster_tree_inferred_cnvs, sci.best_cluster_tree.outputs['inferred_cnvs'], delimiter=',') + +rule learn_cluster_trees_sum: + params: + cluster_tree_n_iters = config["inference"]["cluster_trees"]["n_iters"], + cluster_tree_n_tries = config["inference"]["cluster_trees"]["n_tries"], + cluster_tree_n_reps = config["inference"]["cluster_trees"]["n_reps"], + posfix = "ctree_sum" + f"{str(n_nodes)}" + "nodes_{fairness}_{rep_id}" + input: + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + "_segmented_counts.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + '_segmented_region_sizes.txt' + output: + cluster_tree_sum = f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree.txt', + cluster_tree_inferred_cnvs_sum = f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs.csv' + threads: 10 + run: + segmented_counts = np.loadtxt(input.segmented_counts, delimiter=',') + segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + alpha = 1./segmented_counts.shape[1] + + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + # Run cluster trees + sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, cluster=True, full=False, cluster_tree_n_iters=params.cluster_tree_n_iters, + max_tries=params.cluster_tree_n_tries, robustness_thr=0.5, alpha=alpha, max_scoring=False, n_reps=params.cluster_tree_n_reps) + + # Store best cluster tree + with open(output.cluster_tree_sum, "w") as file: + for line in sci.best_cluster_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.cluster_tree_inferred_cnvs_sum, sci.best_cluster_tree.outputs['inferred_cnvs'], delimiter=',') + +rule learn_full_trees: + params: + full_tree_n_iters = config["inference"]["full_trees"]["n_iters"], + move_probs = config["inference"]["full_trees"]["move_probs"], + cf = cf, + posfix = "ftree" + f"{str(n_nodes)}" + "nodes_{fairness}_{rep_id}" + input: + cluster_tree = f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree.txt', + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/' + 
'{rep_id}' + "_segmented_counts.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + '_segmented_region_sizes.txt' + output: + full_tree = f'{TREES_OUTPUT}_best_full_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree.txt', + full_tree_inferred_cnvs = f'{TREES_OUTPUT}_best_full_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs.csv' + threads: 10 + run: + segmented_counts = np.loadtxt(input.segmented_counts, delimiter=',') + segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + alpha = 1./segmented_counts.shape[1] + + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + tree = Tree(binaries_path + '/inference', "", postfix=params.posfix) + tree.outputs['region_sizes'] = segmented_region_sizes + tree.outputs['region_neutral_states'] = np.ones((segmented_region_sizes.shape[0],)) * 2 + + tree.read_from_file(input.cluster_tree) + sci.best_cluster_tree = tree + + # Run full trees starting from cluster tree + sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, n_reps=10, cluster=False, full=True, full_tree_n_iters=params.full_tree_n_iters, + max_tries=3, robustness_thr=0.5, alpha=alpha, move_probs=params.move_probs, max_scoring=True) + + # Store best full tree + with open(output.full_tree, "w") as file: + for line in sci.best_full_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.full_tree_inferred_cnvs, sci.best_full_tree.outputs['inferred_cnvs'], delimiter=',') + +rule learn_full_trees_sum: + params: + full_tree_n_iters = config["inference"]["full_trees"]["n_iters"], + move_probs = config["inference"]["full_trees"]["move_probs"], + cf = cf, + posfix = "ftreesum" + f"{str(n_nodes)}" + "nodes_{fairness}_{rep_id}" + input: + cluster_tree = f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree.txt', + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + "_segmented_counts.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + '_segmented_region_sizes.txt' + output: + full_tree = f'{TREES_OUTPUT}_best_full_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree.txt', + full_tree_inferred_cnvs = f'{TREES_OUTPUT}_best_full_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs.csv' + threads: 10 + run: + segmented_counts = np.loadtxt(input.segmented_counts, delimiter=',') + segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + alpha = 1./segmented_counts.shape[1] + + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + tree = Tree(binaries_path + '/inference', "", postfix=params.posfix) + tree.outputs['region_sizes'] = segmented_region_sizes + tree.outputs['region_neutral_states'] = np.ones((segmented_region_sizes.shape[0],)) * 2 + + tree.read_from_file(input.cluster_tree) + sci.best_cluster_tree = tree + + # Run full trees starting from cluster tree + sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, n_reps=10, cluster=False, full=True, full_tree_n_iters=params.full_tree_n_iters, + max_tries=3, robustness_thr=0.5, alpha=alpha, move_probs=params.move_probs, max_scoring=False) + + # Store best full tree + with 
open(output.full_tree, "w") as file: + for line in sci.best_full_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.full_tree_inferred_cnvs, sci.best_full_tree.outputs['inferred_cnvs'], delimiter=',') + + +rule learn_cluster_trees_adapted: + params: + cluster_tree_n_iters = config["inference"]["cluster_trees"]["n_iters"], + cluster_tree_n_tries = config["inference"]["cluster_trees"]["n_tries"], + cluster_tree_n_reps = config["inference"]["cluster_trees"]["n_reps"], + posfix = "ctree" + f"{str(n_nodes)}" + "nodes_{fairness}_{rep_id}_adapted", + input: + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + "_segmented_counts.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + '_segmented_region_sizes.txt' + output: + cluster_tree = f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_adapted.txt', + cluster_tree_inferred_cnvs = f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs_adapted.csv' + threads: 10 + run: + segmented_counts = np.loadtxt(input.segmented_counts, delimiter=',') + segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + cov = np.mean(np.sum(segmented_counts, axis=1) / np.sum(segmented_region_sizes)) + alpha = 1./segmented_counts.shape[1] + gamma = 1./cov + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + # Run cluster trees + sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, n_reps=params.cluster_tree_n_reps, cluster=True, full=False, cluster_tree_n_iters=params.cluster_tree_n_iters, max_tries=params.cluster_tree_n_tries, robustness_thr=0.5, alpha=alpha, max_scoring=True, gamma=gamma, c_penalise=0., + eta=0.4) + + # Store best cluster tree + with open(output.cluster_tree, "w") as file: + for line in sci.best_cluster_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.cluster_tree_inferred_cnvs, sci.best_cluster_tree.outputs['inferred_cnvs'], delimiter=',') + +rule learn_cluster_trees_sum_adapted: + params: + cluster_tree_n_iters = config["inference"]["cluster_trees"]["n_iters"], + cluster_tree_n_tries = config["inference"]["cluster_trees"]["n_tries"], + cluster_tree_n_reps = config["inference"]["cluster_trees"]["n_reps"], + posfix = "ctree_sum" + f"{str(n_nodes)}" + "nodes_{fairness}_{rep_id}_adapted" + input: + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + "_segmented_counts.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + '_segmented_region_sizes.txt' + output: + cluster_tree_sum = f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_adapted.txt', + cluster_tree_inferred_cnvs_sum = f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs_adapted.csv' + threads: 10 + run: + segmented_counts = np.loadtxt(input.segmented_counts, delimiter=',') + segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + alpha = 1./segmented_counts.shape[1] + + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + # Run cluster trees + 
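+ # '_sum' variant of the adapted cluster-tree rule above: same segmented inputs, but the
+ # search uses max_scoring=False together with the adapted settings eta=0.4 and c_penalise=0.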
sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, cluster=True, full=False, cluster_tree_n_iters=params.cluster_tree_n_iters, + max_tries=params.cluster_tree_n_tries, robustness_thr=0.5, alpha=alpha, max_scoring=False, n_reps=params.cluster_tree_n_reps, + eta=0.4, c_penalise=0.) + + # Store best cluster tree + with open(output.cluster_tree_sum, "w") as file: + for line in sci.best_cluster_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.cluster_tree_inferred_cnvs_sum, sci.best_cluster_tree.outputs['inferred_cnvs'], delimiter=',') + +rule learn_full_trees_adapted: + params: + full_tree_n_iters = config["inference"]["full_trees"]["n_iters"], + move_probs = config["inference"]["full_trees"]["move_probs"], + cf = cf, + posfix = "ftree" + f"{str(n_nodes)}" + "nodes_{fairness}_{rep_id}_adapted" + input: + cluster_tree = f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_adapted.txt', + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + "_segmented_counts.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + '_segmented_region_sizes.txt' + output: + full_tree = f'{TREES_OUTPUT}_best_full_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_adapted.txt', + full_tree_inferred_cnvs = f'{TREES_OUTPUT}_best_full_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs_adapted.csv' + threads: 10 + run: + segmented_counts = np.loadtxt(input.segmented_counts, delimiter=',') + segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + alpha = 1./segmented_counts.shape[1] + + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + tree = Tree(binaries_path + '/inference', "", postfix=params.posfix) + tree.outputs['region_sizes'] = segmented_region_sizes + tree.outputs['region_neutral_states'] = np.ones((segmented_region_sizes.shape[0],)) * 2 + + tree.read_from_file(input.cluster_tree) + sci.best_cluster_tree = tree + + # Run full trees starting from cluster tree + sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, n_reps=10, cluster=False, full=True, full_tree_n_iters=params.full_tree_n_iters, + max_tries=3, robustness_thr=0.5, alpha=alpha, move_probs=params.move_probs, max_scoring=True, + eta=0.4, c_penalise=0.) 
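+ # learn_tree refines the cluster tree loaded above into a full (cell-level) tree; the best
+ # tree over the n_reps=10 runs is kept on sci.best_full_tree and its per-cell copy-number
+ # matrix is written out below.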
+ + # Store best full tree + with open(output.full_tree, "w") as file: + for line in sci.best_full_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.full_tree_inferred_cnvs, sci.best_full_tree.outputs['inferred_cnvs'], delimiter=',') + +rule learn_full_trees_sum_adapted: + params: + full_tree_n_iters = config["inference"]["full_trees"]["n_iters"], + move_probs = config["inference"]["full_trees"]["move_probs"], + cf = cf, + posfix = "ftreesum" + f"{str(n_nodes)}" + "nodes_{fairness}_{rep_id}_adapted" + input: + cluster_tree = f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_adapted.txt', + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + "_segmented_counts.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + '_segmented_region_sizes.txt' + output: + full_tree = f'{TREES_OUTPUT}_best_full_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_adapted.txt', + full_tree_inferred_cnvs = f'{TREES_OUTPUT}_best_full_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs_adapted.csv' + threads: 10 + run: + segmented_counts = np.loadtxt(input.segmented_counts, delimiter=',') + segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + alpha = 1./segmented_counts.shape[1] + + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + tree = Tree(binaries_path + '/inference', "", postfix=params.posfix) + tree.outputs['region_sizes'] = segmented_region_sizes + tree.outputs['region_neutral_states'] = np.ones((segmented_region_sizes.shape[0],)) * 2 + + tree.read_from_file(input.cluster_tree) + sci.best_cluster_tree = tree + + # Run full trees starting from cluster tree + sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, n_reps=10, cluster=False, full=True, full_tree_n_iters=params.full_tree_n_iters, + max_tries=3, robustness_thr=0.5, alpha=alpha, move_probs=params.move_probs, max_scoring=False, + eta=0.4, c_penalise=0.) 
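+ # Same refinement as the adapted full-tree rule above, but scored with max_scoring=False
+ # (the '_sum' criterion); results are written to the *_sum tree directories.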
+ + # Store best full tree + with open(output.full_tree, "w") as file: + for line in sci.best_full_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.full_tree_inferred_cnvs, sci.best_full_tree.outputs['inferred_cnvs'], delimiter=',') + + + + +rule segment_regions_knownbps: + input: + d_matrix_file = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' +\ + '/' + '{rep_id}' + '_d_mat.txt', + ground_truth = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' +\ + '/'+ '{rep_id}' + '_ground_truth.txt', + output: + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + "_segmented_counts_knownbps.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + "_segmented_region_sizes_knownbps.txt" + run: + filtered_counts = np.loadtxt(input.d_matrix_file, delimiter=',') + if wildcards.fairness == 'fair': + # Assuming data was simulated with coverage of 100reads per bin + filtered_counts = filtered_counts * 50 + + n_cells = filtered_counts.shape[0] + + cnvs = np.loadtxt(input.ground_truth, delimiter=',') + # True breakpoints: + bps = np.where(np.any(np.diff(cnvs, axis=1) != 0, axis=0))[0] #+ 1 + bps = np.unique(np.sort(np.concatenate([bps, bps+1]))) + print(f"Got {len(bps)} true breakpoints") + bps = list(bps) + bps.append(cnvs.shape[1]) + bps.insert(0, 0) + bps = np.unique(bps) + + region_sizes = np.diff(bps) + + n_regions = len(region_sizes) + sum_region_sizes = np.sum(region_sizes) + condensed_mat = np.zeros((n_cells, n_regions)) + + print("segmenting the bins...") + for i in tqdm(range(n_cells)): + region_id = 0 + region_count = 0 + for j in range(filtered_counts.shape[1]): + to_add = filtered_counts[i][j] + condensed_mat[i][region_id] += to_add + region_count += 1 + if region_count == region_sizes[region_id]: + region_id += 1 + region_count = 0 + + if not np.allclose(condensed_mat.sum(axis=1), filtered_counts.sum(axis=1)): + raise AssertionError( + "not all values of the sums before & after " + "segmentation are close") + + print("saving the segmented regions...") + np.savetxt(output.segmented_counts, condensed_mat, delimiter=",") + np.savetxt(output.segmented_region_sizes, region_sizes, delimiter=',') + +rule learn_cluster_trees_knownbps: + params: + cluster_tree_n_iters = config["inference"]["cluster_trees"]["n_iters"], + cluster_tree_n_tries = config["inference"]["cluster_trees"]["n_tries"], + cluster_tree_n_reps = config["inference"]["cluster_trees"]["n_reps"], + posfix = "ctree" + f"{str(n_nodes)}" + "nodes_{fairness}_knownbps_{rep_id}", + input: + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + "_segmented_counts_knownbps.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + '_segmented_region_sizes_knownbps.txt' + output: + cluster_tree = f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_knownbps.txt', + cluster_tree_inferred_cnvs = f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs_knownbps.csv' + threads: 10 + run: + segmented_counts = np.loadtxt(input.segmented_counts, delimiter=',') + segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + cov = np.mean(np.sum(segmented_counts, axis=1) / 
np.sum(segmented_region_sizes)) + alpha = 1./segmented_counts.shape[1] + gamma = 1./cov + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + # Run cluster trees + sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, n_reps=params.cluster_tree_n_reps, cluster=True, full=False, cluster_tree_n_iters=params.cluster_tree_n_iters, max_tries=params.cluster_tree_n_tries, robustness_thr=0.5, alpha=alpha, max_scoring=True, gamma=gamma) + + # Store best cluster tree + with open(output.cluster_tree, "w") as file: + for line in sci.best_cluster_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.cluster_tree_inferred_cnvs, sci.best_cluster_tree.outputs['inferred_cnvs'], delimiter=',') + +rule learn_cluster_trees_sum_knownbps: + params: + cluster_tree_n_iters = config["inference"]["cluster_trees"]["n_iters"], + cluster_tree_n_tries = config["inference"]["cluster_trees"]["n_tries"], + cluster_tree_n_reps = config["inference"]["cluster_trees"]["n_reps"], + posfix = "ctree_sum" + f"{str(n_nodes)}" + "nodes_{fairness}_knownbps_{rep_id}" + input: + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + "_segmented_counts_knownbps.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}' + '/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + '_segmented_region_sizes_knownbps.txt' + output: + cluster_tree_sum = f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_knownbps.txt', + cluster_tree_inferred_cnvs_sum = f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs_knownbps.csv' + threads: 10 + run: + segmented_counts = np.loadtxt(input.segmented_counts, delimiter=',') + segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + alpha = 1./segmented_counts.shape[1] + + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + # Run cluster trees + sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, cluster=True, full=False, cluster_tree_n_iters=params.cluster_tree_n_iters, + max_tries=params.cluster_tree_n_tries, robustness_thr=0.5, alpha=alpha, max_scoring=False, n_reps=params.cluster_tree_n_reps) + + # Store best cluster tree + with open(output.cluster_tree_sum, "w") as file: + for line in sci.best_cluster_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.cluster_tree_inferred_cnvs_sum, sci.best_cluster_tree.outputs['inferred_cnvs'], delimiter=',') + +rule learn_full_trees_knownbps: + params: + full_tree_n_iters = config["inference"]["full_trees"]["n_iters"], + move_probs = config["inference"]["full_trees"]["move_probs"], + cf = cf, + posfix = "ftree" + f"{str(n_nodes)}" + "nodes_{fairness}_knownbps_{rep_id}" + input: + cluster_tree = f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_knownbps.txt', + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + "_segmented_counts_knownbps.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/'+ '{rep_id}' + '_segmented_region_sizes_knownbps.txt' + output: + full_tree = f'{TREES_OUTPUT}_best_full_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_knownbps.txt', + 
full_tree_inferred_cnvs = f'{TREES_OUTPUT}_best_full_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs_knownbps.csv' + threads: 10 + run: + segmented_counts = np.loadtxt(input.segmented_counts, delimiter=',') + segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + alpha = 1./segmented_counts.shape[1] + + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + tree = Tree(binaries_path + '/inference', "", postfix=params.posfix) + tree.outputs['region_sizes'] = segmented_region_sizes + tree.outputs['region_neutral_states'] = np.ones((segmented_region_sizes.shape[0],)) * 2 + + tree.read_from_file(input.cluster_tree) + sci.best_cluster_tree = tree + + # Run full trees starting from cluster tree + sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, n_reps=10, cluster=False, full=True, full_tree_n_iters=params.full_tree_n_iters, + max_tries=3, robustness_thr=0.5, alpha=alpha, move_probs=params.move_probs, max_scoring=True) + + # Store best full tree + with open(output.full_tree, "w") as file: + for line in sci.best_full_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.full_tree_inferred_cnvs, sci.best_full_tree.outputs['inferred_cnvs'], delimiter=',') + +rule learn_full_trees_sum_knownbps: + params: + full_tree_n_iters = config["inference"]["full_trees"]["n_iters"], + move_probs = config["inference"]["full_trees"]["move_probs"], + cf = cf, + posfix = "ftreesum" + f"{str(n_nodes)}" + "nodes_{fairness}_knownbps_{rep_id}" + input: + cluster_tree = f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_knownbps.txt', + segmented_counts = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + "_segmented_counts_knownbps.csv", + segmented_region_sizes = BP_OUTPUT + '_' + sim_prefix +'_trees_{fairness}/'+ str(n_nodes)\ + + 'nodes' + '/' + '{rep_id}' + '_segmented_region_sizes_knownbps.txt' + output: + full_tree = f'{TREES_OUTPUT}_best_full_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_knownbps.txt', + full_tree_inferred_cnvs = f'{TREES_OUTPUT}_best_full_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs_knownbps.csv' + threads: 10 + run: + segmented_counts = np.loadtxt(input.segmented_counts, delimiter=',') + segmented_region_sizes = np.loadtxt(input.segmented_region_sizes, delimiter=',') + alpha = 1./segmented_counts.shape[1] + + sci = SCICoNE(binaries_path, output_temp, persistence=False, postfix=params.posfix + f"{sim_prefix}") + + tree = Tree(binaries_path + '/inference', "", postfix=params.posfix) + tree.outputs['region_sizes'] = segmented_region_sizes + tree.outputs['region_neutral_states'] = np.ones((segmented_region_sizes.shape[0],)) * 2 + + tree.read_from_file(input.cluster_tree) + sci.best_cluster_tree = tree + + # Run full trees starting from cluster tree + sci.learn_tree(segmented_counts, segmented_region_sizes, verbosity=2, n_reps=10, cluster=False, full=True, full_tree_n_iters=params.full_tree_n_iters, + max_tries=3, robustness_thr=0.5, alpha=alpha, move_probs=params.move_probs, max_scoring=False) + + # Store best full tree + with open(output.full_tree, "w") as file: + for line in sci.best_full_tree.tree_str.splitlines(): + file.write(f"{line}\n") + np.savetxt(output.full_tree_inferred_cnvs, sci.best_full_tree.outputs['inferred_cnvs'], delimiter=',') + +rule ginkgo_inference: + params: + 
script = config["ginkgo"]["script"], + n_nodes = n_nodes, + scratch = config["ginkgo"]["scratch"], + mem = config["ginkgo"]["mem"], + time = config["ginkgo"]["time"], + script_inp = str(n_nodes)+"nodes_{rep_id}" + input: + d_mat = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_d_mat.txt' + output: + ginkgo_inferred_cnvs = GINKGO_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_ginkgo_inferred.txt' + threads: 10 + shell: + " Rscript {params.script} {input.d_mat} {output.ginkgo_inferred_cnvs}" + +rule conet_ginkgo_inference: + params: + script = config["conet"]["script"], + n_nodes = n_nodes, + scratch = config["conet"]["scratch"], + mem = config["conet"]["mem"], + time = config["conet"]["time"], + script_inp = str(n_nodes)+"nodes_{rep_id}", + intermediate_dir = CONET_GINKGO_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '/', + bin_path = config["conet"]["bin_path"], + em_iters = config["conet"]["em_iters"], + pt_iters = config["conet"]["pt_iters"], + cbs_script = config["conet"]["cbs_script"], + input: + d_mat = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_d_mat.txt', + init_cnvs = GINKGO_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/'+ '{rep_id}' + '_ginkgo_inferred.txt', + output: + conet_inferred_cnvs = CONET_GINKGO_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/'+ '{rep_id}' + '_conet_ginkgo_inferred.txt', + conet_inferred_tree = CONET_GINKGO_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/'+ '{rep_id}' + '_conet_ginkgo_tree.txt', + conet_inferred_attachments = CONET_GINKGO_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_conet_ginkgo_attachments.txt' + threads: 10 + # shell: + # "python {params.script} --bin_path {params.bin_path} --counts {input.d_mat} --cnvs {input.init_cnvs} --intermediate_dir {params.intermediate_dir} --seed {wildcards.rep_id} --em_iters {params.em_iters} --pt_iters {params.pt_iters} --out_cnvs {output.conet_inferred_cnvs} --out_tree {output.conet_inferred_tree} --out_attachments {output.conet_inferred_attachments}" + run: + real_ind = list(map(lambda x: x.start, list(tree.nodes))) + real_ind.extend(list(map(lambda x: x.end, list(tree.nodes)))) + real_ind = list(set(real_ind)) + real_ind.sort() + if wildcards.known_breakpoints: + # Extract real breakpoints indices + candidates = real_ind + save_counts_in_CONET_format(dirpath + "counts_synthetic", corr_reads, candidates) + else: + save_counts_in_CONET_format(dirpath + "counts_synthetic", corr_reads, []) + subprocess.run(["Rscript", f"{paramscbs_script}", "--mincells=5", + f"--output={dirpath}cbs_out", f"--dataset={dirpath}counts_synthetic"]) + X = np.loadtxt(dirpath + "cbs_out", delimiter=",") + candidates = [] + for i in range(X.shape[0]): + if X[i, 4] == 1.0: + candidates.append(i) + save_counts_in_CONET_format(dirpath + "counts_synthetic", corr_reads, candidates) + print(f"Found {len(candidates)} breakpoint candidates...") + + # Convert model data to CONET format + data_converter = dc.DataConverter(dirpath + "counts_synthetic", + delimiter=';', + default_bin_length=1, + event_length_normalizer=corr_reads.shape[1], # number of loci + add_chromosome_ends=False, + neutral_cn=2.0) + data_converter.create_CoNET_input_files(dirpath, add_chr_ends_to_indices=False) + + # Perform CONET inference + conet = c.CONET(str(Path(conf.conet_binary_dir) / Path("CONET"))) + params = cp.CONETParameters(tree_structure_prior_k1=0.01, + data_dir=dirpath, counts_penalty_s1=100000, counts_penalty_s2=100000, + param_inf_iters=500000, seed=random.randint(0, 
1000), mixture_size=2, pt_inf_iters=1000000, + use_event_lengths_in_attachment=False, + event_length_penalty_k0=1) + conet.infer_tree(params) + print(f"CONET inference finished on model {i}") + result = ir.InferenceResult(conf.conet_binary_dir, corr_reads.T) + + inferred_cn = result.get_inferred_copy_numbers(2, conf.bins, conf.cells) + inferred_brkp = result.bp_matrix.astype(int) + inferred_nodes = set(result.tree.nodes) + inferred_edges = set(result.tree.edges) + + real_cn = data[0] + real_brkp = data[4][:, real_ind].astype(int) + real_nodes = set((n.start, n.end) for n in tree.nodes) + real_edges = set(((e[0].start, e[0].end), (e[1].start, e[1].end)) for e in tree.edges) + +rule conet_cbs_inference: + params: + script = config["conet"]["script"], + n_nodes = n_nodes, + scratch = config["conet"]["scratch"], + mem = config["conet"]["mem"], + time = config["conet"]["time"], + script_inp = str(n_nodes)+"nodes_{rep_id}", + intermediate_dir = CONET_CBS_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '/', + bin_path = config["conet"]["bin_path"], + em_iters = config["conet"]["em_iters"], + pt_iters = config["conet"]["pt_iters"], + cbs_script = config["conet"]["cbs_script"], + input: + d_mat = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_d_mat.txt', + output: + conet_inferred_cnvs = CONET_CBS_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/'+ '{rep_id}' + '_conet_cbs_inferred.txt', + conet_inferred_tree = CONET_CBS_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/'+ '{rep_id}' + '_conet_cbs_tree.txt', + conet_inferred_attachments = CONET_CBS_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_conet_cbsattachments.txt' + threads: 10 + # shell: + run: + ## Save file in CONET format + def save_counts_in_CONET_format(path, counts, indices): + no_cells = counts.shape[0] + no_loci = counts.shape[1] + counts = np.transpose(counts) + + # Add columns required by CONET + add = np.zeros([no_loci, 5], dtype=np.float64) + add.fill(1) + add[:, 1] = range(0, no_loci) + add[:, 2] = range(1, no_loci + 1) + add[:, 4] = 0 + add[indices, 4] = 1 + full_counts = np.hstack([add, counts]) + + # Add first row with cell names + names = np.zeros([1, 5 + no_cells], dtype=np.float64) + names[0, 5:] = range(0, no_cells) + full_counts = np.vstack([names, full_counts]) + + np.savetxt(path, full_counts, delimiter=";") + + corr_reads = np.loadtxt(input.d_mat, delimiter=',') + + dirpath = tempfile.mkdtemp() + if not dirpath.endswith("/"): + dirpath += "/" + + save_counts_in_CONET_format(dirpath + "counts_synthetic", corr_reads, []) + subprocess.run(["Rscript", f"{params.cbs_script}", "--mincells=5", + f"--output={dirpath}cbs_out", f"--dataset={dirpath}counts_synthetic"]) + X = np.loadtxt(dirpath + "cbs_out", delimiter=",") + candidates = [] + for i in range(X.shape[0]): + if X[i, 4] == 1.0: + candidates.append(i) + + save_counts_in_CONET_format(dirpath + "counts_synthetic", corr_reads, candidates) + print(f"Found {len(candidates)} breakpoint candidates...") + + # Convert model data to CONET format + data_converter = dc.DataConverter(dirpath + "counts_synthetic", + delimiter=';', + default_bin_length=1, + event_length_normalizer=corr_reads.shape[1], # number of loci + add_chromosome_ends=False, + neutral_cn=2.0) + data_converter.create_CoNET_input_files(dirpath, add_chr_ends_to_indices=False) + + # Perform CONET inference + conet = c.CONET(params.bin_path) + conetparams = cp.CONETParameters(tree_structure_prior_k1=0.01, + data_dir=dirpath, counts_penalty_s1=100000, counts_penalty_s2=100000, + 
param_inf_iters=500000, seed=random.randint(0, 1000), mixture_size=2, pt_inf_iters=1000000, + use_event_lengths_in_attachment=False, + event_length_penalty_k0=1, + output_dir=dirpath) + conet.infer_tree(conetparams) + result = ir.InferenceResult(dirpath, corr_reads.T) + + inferred_cn = result.get_inferred_copy_numbers(2, n_bins, n_cells) + np.savetxt(output.conet_inferred_cnvs, inferred_cn, delimiter=",") + + # Move results to proper output + shutil.copy(os.path.join(dirpath, "inferred_attachment"), output.conet_inferred_attachments) + shutil.copy(os.path.join(dirpath, "inferred_tree"), output.conet_inferred_tree) + + +rule conet_knownbp_inference: + params: + script = config["conet"]["script"], + n_nodes = n_nodes, + scratch = config["conet"]["scratch"], + mem = config["conet"]["mem"], + time = config["conet"]["time"], + script_inp = str(n_nodes)+"nodes_{rep_id}", + intermediate_dir = CONET_KNOWNBP_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '/', + bin_path = config["conet"]["bin_path"], + em_iters = config["conet"]["em_iters"], + pt_iters = config["conet"]["pt_iters"], + input: + d_mat = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_d_mat.txt', + init_cnvs = SIM_OUTPUT+ '_' + sim_prefix +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_ground_truth.txt', + output: + conet_inferred_cnvs = CONET_KNOWNBP_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/'+ '{rep_id}' + '_conet_knownbp_inferred.txt', + conet_inferred_tree = CONET_KNOWNBP_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/'+ '{rep_id}' + '_conet_knownbp_tree.txt', + conet_inferred_attachments = CONET_KNOWNBP_OUTPUT +'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_conet_knownbp_attachments.txt' + threads: 10 + shell: + "python {params.script} --delim , --bin_path {params.bin_path} --counts {input.d_mat} --cnvs {input.init_cnvs} --intermediate_dir {params.intermediate_dir} --seed {wildcards.rep_id} --em_iters {params.em_iters} --pt_iters {params.pt_iters} --out_cnvs {output.conet_inferred_cnvs} --out_tree {output.conet_inferred_tree} --out_attachments {output.conet_inferred_attachments}" + + +rule compute_deltas: + input: + simulations = expand(f'{SIM_OUTPUT}_{sim_prefix}/{str(n_nodes)}nodes' + '/{rep_id}_ground_truth.txt', + rep_id=[x for x in range(all_n_tps)]), + + best_full_tree = expand(f'{TREES_OUTPUT}_best_full_tree' + '/{fairness}/' + f'{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs.csv', + rep_id=[x for x in range(all_n_tps)], fairness=["fair", "unfair"]), + + best_cluster_tree = expand(f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs.csv', + rep_id=[x for x in range(all_n_tps)], fairness=["fair", "unfair"]), + + best_full_tree_sum = expand(f'{TREES_OUTPUT}_best_full_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs.csv', + rep_id=[x for x in range(all_n_tps)], fairness=["fair", "unfair"]), + + best_cluster_tree_sum = expand(f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs.csv', + rep_id=[x for x in range(all_n_tps)], fairness=["fair", "unfair"]), + + + best_full_tree_adapted = expand(f'{TREES_OUTPUT}_best_full_tree' + '/{fairness}/' + f'{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs_adapted.csv', + rep_id=[x for x in range(all_n_tps)], fairness=["fair", "unfair"]), + + best_cluster_tree_adapted = expand(f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs_adapted.csv', + 
rep_id=[x for x in range(all_n_tps)], fairness=["fair", "unfair"]), + + best_full_tree_sum_adapted = expand(f'{TREES_OUTPUT}_best_full_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs_adapted.csv', + rep_id=[x for x in range(all_n_tps)], fairness=["fair", "unfair"]), + + best_cluster_tree_sum_adapted = expand(f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs_adapted.csv', + rep_id=[x for x in range(all_n_tps)], fairness=["fair", "unfair"]), + + conet_cbs = expand(CONET_CBS_OUTPUT+'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_conet_cbs_inferred.txt', + rep_id=[x for x in range(all_n_tps)]), + + # best_full_tree_knownbps = expand(f'{TREES_OUTPUT}_best_full_tree' + '/{fairness}/' + f'{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs_knownbps.csv', + # rep_id=[x for x in range(0,all_n_tps)], fairness=["fair", "unfair"]), + # + # best_cluster_tree_knownbps = expand(f'{TREES_OUTPUT}_best_cluster_tree' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs_knownbps.csv', + # rep_id=[x for x in range(0,all_n_tps)], fairness=["fair", "unfair"]), + # + # best_full_tree_sum_knownbps = expand(f'{TREES_OUTPUT}_best_full_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_full_tree_cnvs_knownbps.csv', + # rep_id=[x for x in range(0,all_n_tps)], fairness=["fair", "unfair"]), + # + # best_cluster_tree_sum_knownbps = expand(f'{TREES_OUTPUT}_best_cluster_tree_sum' + '/{fairness}' + f'/{str(n_nodes)}nodes' + '/{rep_id}_cluster_tree_cnvs_knownbps.csv', + # rep_id=[x for x in range(0,all_n_tps)], fairness=["fair", "unfair"]), + + # ginkgo = expand(GINKGO_OUTPUT+'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_ginkgo_inferred.txt', + # rep_id=[x for x in range(0,all_n_tps)]), + # + # conet = expand(CONET_GINKGO_OUTPUT+'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_conet_ginkgo_inferred.txt', + # rep_id=[x for x in range(0,all_n_tps)]), + + + # conet_knwonbp = expand(CONET_KNOWNBP_OUTPUT+'/'+ str(n_nodes) + 'nodes' + '/' + '{rep_id}' + '_conet_knownbp_inferred.txt', + # rep_id=[x for x in range(0,all_n_tps)]), + output: + out_fname = os.path.join(output_path, 'deltas_conetsims.csv') + run: + rows = [] + rows.append('index,rep_id,method,delta') + i = 0 + for true_cnvs in input.simulations: + rep_id = true_cnvs.split('/')[-1].split('_')[0] + gt = np.loadtxt(true_cnvs, delimiter=',') + if gt.shape[0] == n_bins: # should be cells by bins + gt = np.transpose(gt) + + i+=1 + method = 'diploid' + inf = np.ones(gt.shape) * 2 + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + # i += 1 + # method = 'ginkgo' + # inf_cnvs = f'{GINKGO_OUTPUT}/{n_nodes}nodes/{rep_id}_ginkgo_inferred.txt' + # inf = np.loadtxt(inf_cnvs, delimiter=' ') + # error = np.sqrt(np.mean((gt-inf)**2)) + # rows.append(f'{i},{rep_id},{method},{error}') + # + # i += 1 + # method = 'conet_ginkgo' + # inf_cnvs = f'{CONET_GINKGO_OUTPUT}/{n_nodes}nodes/{rep_id}_conet_ginkgo_inferred.txt' + # inf = np.loadtxt(inf_cnvs, delimiter=',') + # error = np.sqrt(np.mean((gt-inf)**2)) + # rows.append(f'{i},{rep_id},{method},{error}') + + # i += 1 + # method = 'conet_knownbp' + # inf_cnvs = f'{CONET_KNOWNBP_OUTPUT}/{n_nodes}nodes/{rep_id}_conet_knownbp_inferred.txt' + # inf = np.loadtxt(inf_cnvs, delimiter=';').T + # error = np.sqrt(np.mean((gt-inf)**2)) + # rows.append(f'{i},{rep_id},{method},{error}') + + i += 1 + method = 'conet_cbs' + inf_cnvs = 
f'{CONET_CBS_OUTPUT}/{n_nodes}nodes/{rep_id}_conet_cbs_inferred.txt' + inf = np.loadtxt(inf_cnvs, delimiter=',').T + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + for fairness in ['unfair', 'fair']: + i += 1 + method = f'cluster_tree_{fairness}' + inf_cnvs = f'{TREES_OUTPUT}_best_cluster_tree/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs.csv' + inf = np.loadtxt(inf_cnvs, delimiter=',') + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + i += 1 + method = f'cluster_tree_sum_{fairness}' + inf_cnvs = f'{TREES_OUTPUT}_best_cluster_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs.csv' + inf = np.loadtxt(inf_cnvs, delimiter=',') + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + i += 1 + method = f'full_tree_{fairness}' + inf_cnvs = f'{TREES_OUTPUT}_best_full_tree/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs.csv' + inf = np.loadtxt(inf_cnvs, delimiter=',') + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + i += 1 + method = f'full_tree_sum_{fairness}' + inf_cnvs = f'{TREES_OUTPUT}_best_full_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs.csv' + inf = np.loadtxt(inf_cnvs, delimiter=',') + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + # i += 1 + # method = f'cluster_tree_{fairness}_knownbps' + # inf_cnvs = f'{TREES_OUTPUT}_best_cluster_tree/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs_knownbps.csv' + # inf = np.loadtxt(inf_cnvs, delimiter=',') + # error = np.sqrt(np.mean((gt-inf)**2)) + # rows.append(f'{i},{rep_id},{method},{error}') + # + # i += 1 + # method = f'cluster_tree_sum_{fairness}_knownbps' + # inf_cnvs = f'{TREES_OUTPUT}_best_cluster_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs_knownbps.csv' + # inf = np.loadtxt(inf_cnvs, delimiter=',') + # error = np.sqrt(np.mean((gt-inf)**2)) + # rows.append(f'{i},{rep_id},{method},{error}') + # + # i += 1 + # method = f'full_tree_{fairness}_knownbps' + # inf_cnvs = f'{TREES_OUTPUT}_best_full_tree/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs_knownbps.csv' + # inf = np.loadtxt(inf_cnvs, delimiter=',') + # error = np.sqrt(np.mean((gt-inf)**2)) + # rows.append(f'{i},{rep_id},{method},{error}') + # + # i += 1 + # method = f'full_tree_sum_{fairness}_knownbps' + # inf_cnvs = f'{TREES_OUTPUT}_best_full_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs_knownbps.csv' + # inf = np.loadtxt(inf_cnvs, delimiter=',') + # error = np.sqrt(np.mean((gt-inf)**2)) + # rows.append(f'{i},{rep_id},{method},{error}') + + i += 1 + method = f'cluster_tree_{fairness}_adapted' + inf_cnvs = f'{TREES_OUTPUT}_best_cluster_tree/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs_adapted.csv' + inf = np.loadtxt(inf_cnvs, delimiter=',') + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + i += 1 + method = f'cluster_tree_sum_{fairness}_adapted' + inf_cnvs = f'{TREES_OUTPUT}_best_cluster_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_cluster_tree_cnvs_adapted.csv' + inf = np.loadtxt(inf_cnvs, delimiter=',') + error = np.sqrt(np.mean((gt-inf)**2)) + rows.append(f'{i},{rep_id},{method},{error}') + + i += 1 + method = f'full_tree_{fairness}_adapted' + inf_cnvs = f'{TREES_OUTPUT}_best_full_tree/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs_adapted.csv' + inf = np.loadtxt(inf_cnvs, delimiter=',') + error = np.sqrt(np.mean((gt-inf)**2)) + 
rows.append(f'{i},{rep_id},{method},{error}')
+
+ i += 1
+ method = f'full_tree_sum_{fairness}_adapted'
+ inf_cnvs = f'{TREES_OUTPUT}_best_full_tree_sum/{fairness}/{n_nodes}nodes/{rep_id}_full_tree_cnvs_adapted.csv'
+ inf = np.loadtxt(inf_cnvs, delimiter=',')
+ error = np.sqrt(np.mean((gt-inf)**2))
+ rows.append(f'{i},{rep_id},{method},{error}')
+
+
+ with open(output.out_fname, 'w') as f:
+ f.write('\n'.join(rows))
+
+rule plot_deltas:
+ input:
+ deltas_csv = os.path.join(output_path, 'deltas_conetsims.csv')
+ output:
+ deltas_png = os.path.join(output_path, 'deltas_conetsims.png')
+ run:
+ df = pd.read_csv(input.deltas_csv, index_col=0)
+ plt.figure(figsize=(10,6))
+ bp = sns.boxplot(data=df, x='method', y='delta', hue='method', dodge=False)
+ plt.setp(bp.get_xticklabels(), rotation=45)
+ plt.ylim([0,0.5])
+ plt.savefig(output.deltas_png)
+
diff --git a/reproducibility/simulations/conet_comparisons/low_sd/config.json b/reproducibility/simulations/conet_comparisons/low_sd/config.json
new file mode 100644
index 0000000..5704d4d
--- /dev/null
+++ b/reproducibility/simulations/conet_comparisons/low_sd/config.json
@@ -0,0 +1,84 @@
+{
+ "binaries_path":"/cluster/work/bewi/members/pedrof/tupro_code/SCICoNE/build3",
+ "output":"/cluster/work/bewi/members/pedrof/sc-dna/sims2020_20_nodes/results2023_boxplot3/",
+ "simulate":
+ {
+ "n_cells":200,
+ "n_nodes":20,
+ "n_bins":1500,
+ "n_reps":40,
+ "prefix":"",
+ "bin":"/cluster/work/bewi/members/pedrof/tupro_code/SCICoNE/build3/simulation"
+ },
+ "bp_detection":
+ {
+ "window_size":20,
+ "threshold":3,
+ "bp_limit":300,
+ "verbosity":1,
+ "bin":"/cluster/work/bewi/members/pedrof/sc-dna/build3/breakpoint_detection"
+ },
+ "inference":
+ {
+ "cluster_trees":
+ {
+ "n_reps":10,
+ "n_iters":40000,
+ "n_nodes":0,
+ "move_probs":[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.01, 0.1, 0.01, 1.0, 0.01],
+ "n_tries":3
+ },
+ "full_trees":
+ {
+ "n_reps":10,
+ "n_iters":200000,
+ "cluster_fraction":1.0,
+ "n_nodes": 0,
+ "move_probs":[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.01, 0.1, 0.01, 1.0, 0.01]
+ },
+ "ploidy":2,
+ "verbosity":1,
+ "copy_number_limit":4,
+ "seed":42,
+ "bin":"/cluster/work/bewi/members/pedrof/tupro_code/SCICoNE/build3/inference",
+ "output_temp":"/cluster/work/bewi/members/pedrof/sc-dna/sims2020_20_nodes/results2023_boxplot3_temp/"
+ },
+ "hmm_copy":
+ {
+ "threads":1,
+ "scratch":20000,
+ "mem":"3200",
+ "time": "600",
+ "script":"/cluster/work/bewi/members/pedrof/cna_other_methods/HMMCopyInference.R"
+ },
+ "ginkgo":
+ {
+ "threads":1,
+ "scratch":20000,
+ "mem":"3200",
+ "time": "600",
+ "script":"/cluster/work/bewi/members/pedrof/cna_other_methods/ginkgo.R"
+ },
+ "conet":
+ {
+ "threads":1,
+ "scratch":20000,
+ "mem":"3200",
+ "time":"600",
+ "bin_path":"/cluster/work/bewi/members/pedrof/sc-dna/sims2020_20_nodes/snake_analysis_files_2023/CONET-reproducibility/src/CONET",
+ "script":"/cluster/work/bewi/members/pedrof/cna_other_methods/run_conet.py",
+ "cbs_script":"/cluster/work/bewi/members/pedrof/cna_other_methods/CBS_MergeLevels.R",
+ "em_iters": 500000,
+ "pt_iters": 1000000
+ },
+ "conet_tree_distances":
+ {
+ "threads":1,
+ "scratch":20000,
+ "mem":"3200",
+ "time":"600",
+ "bin_path":"/cluster/work/bewi/members/pedrof/CONET/src/CONET",
+ "script":"/cluster/work/bewi/members/pedrof/cna_other_methods/run_conet_tree_distances.py"
+ }
+}
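
A note on the known-breakpoints segmentation used above: segment_regions_knownbps collapses the cells-by-bins count matrix into regions delimited by the true breakpoints by summing consecutive bins, then checks that per-cell totals are preserved. The standalone sketch below (not part of the workflow) shows the same bin-to-region accumulation; the name segment_counts, the toy inputs, and the np.add.reduceat shortcut are illustrative only.

import numpy as np

def segment_counts(counts, region_sizes):
    """Sum consecutive bins of a cells-by-bins matrix into regions."""
    region_sizes = np.asarray(region_sizes, dtype=int)
    assert region_sizes.sum() == counts.shape[1]
    starts = np.concatenate([[0], np.cumsum(region_sizes)[:-1]])  # first bin of each region
    segmented = np.add.reduceat(counts, starts, axis=1)
    # same sanity check as the rule: per-cell totals are unchanged
    assert np.allclose(segmented.sum(axis=1), counts.sum(axis=1))
    return segmented

counts = np.arange(12).reshape(2, 6)        # 2 cells x 6 bins
print(segment_counts(counts, [2, 1, 3]))    # [[ 1  2 12] [13  8 30]]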
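
A note on the CONET input built in conet_cbs_inference: save_counts_in_CONET_format writes a semicolon-delimited matrix with five metadata columns per locus (a constant column, bin start, bin end, another constant column, and a candidate-breakpoint flag) followed by one column per cell, plus a leading header row carrying the cell indices. The toy sketch below reproduces that layout for 2 cells and 3 loci with locus 1 flagged as a candidate; the interpretation of the two constant columns is not specified in the workflow, and fmt="%g" is used here only for readability (the workflow uses np.savetxt defaults).

import numpy as np

counts = np.array([[10., 12., 9.],    # cell 0, one value per locus
                   [11., 25., 24.]])  # cell 1
no_cells, no_loci = counts.shape
candidates = [1]                      # loci flagged as breakpoint candidates

meta = np.ones((no_loci, 5))
meta[:, 1] = np.arange(no_loci)         # bin start
meta[:, 2] = np.arange(1, no_loci + 1)  # bin end
meta[:, 4] = 0
meta[candidates, 4] = 1               # candidate-breakpoint flag
body = np.hstack([meta, counts.T])    # loci as rows, cells as trailing columns

header = np.zeros((1, 5 + no_cells))
header[0, 5:] = np.arange(no_cells)   # cell indices
full = np.vstack([header, body])

np.savetxt("counts_synthetic_example", full, delimiter=";", fmt="%g")
# File contents (header row, then one row per locus):
# 0;0;0;0;0;0;1
# 1;0;1;1;0;10;11
# 1;1;2;1;1;12;25
# 1;2;3;1;0;9;24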
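
A note on the error metric in compute_deltas: every method is scored with the same statistic, the root-mean-square difference between ground-truth and inferred copy numbers over all cells and bins (the 'diploid' baseline simply predicts copy number 2 everywhere, and ground-truth matrices stored as bins-by-cells are transposed first). A small self-contained sketch with toy matrices; the helper name cnv_delta is illustrative only.

import numpy as np

def cnv_delta(gt, inf, n_bins):
    """RMSE between ground-truth and inferred copy numbers (cells x bins)."""
    if gt.shape[0] == n_bins:         # ground truth stored as bins x cells
        gt = gt.T
    return float(np.sqrt(np.mean((gt - inf) ** 2)))

gt = np.array([[2, 2, 3, 3],
               [2, 1, 1, 2]])         # 2 cells x 4 bins
diploid = np.full_like(gt, 2)         # the 'diploid' baseline
print(cnv_delta(gt, diploid, n_bins=4))   # sqrt(4/8) = 0.707...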