From da8d7c0ac632e57222e704c4a32ada1e14fefb57 Mon Sep 17 00:00:00 2001 From: svarona Date: Fri, 10 Jan 2025 15:17:56 +0100 Subject: [PATCH 01/31] Fixed path to files not containing sample name in basename --- relecov_tools/conf/bioinfo_config.json | 4 ++++ relecov_tools/read_bioinfo_metadata.py | 12 ++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/relecov_tools/conf/bioinfo_config.json b/relecov_tools/conf/bioinfo_config.json index 8e5747b6..3867336c 100644 --- a/relecov_tools/conf/bioinfo_config.json +++ b/relecov_tools/conf/bioinfo_config.json @@ -6,6 +6,7 @@ "header_row_idx": 1, "required": true, "function": null, + "multiple_samples": true, "split_by_batch": true, "content": { "analysis_date": "analysis_date", @@ -44,6 +45,7 @@ "sample_col_idx": 1, "header_row_idx": 1, "required": true, + "multiple_samples": true, "split_by_batch": true, "function": "parse_long_table", "content": { @@ -84,6 +86,7 @@ "fn": "summary_variants_metrics_mqc.csv", "sample_col_idx": 1, "header_row_idx": 1, + "multiple_samples": true, "required": true, "function": null, "content": { @@ -100,6 +103,7 @@ "workflow_summary": { "fn": "multiqc_report.html", "required": true, + "multiple_samples": true, "function": null, "content": { "software_version": { diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index 98aa0b9f..138b61d0 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -601,6 +601,7 @@ def add_bioinfo_files_path(self, files_found_dict, j_data): """ method_name = f"{self.add_bioinfo_files_path.__name__}" sample_name_error = 0 + multiple_sample_files = self.get_multiple_sample_files() for row in j_data: row["bioinfo_metadata_file"] = self.out_filename if not row.get("sequencing_sample_id"): @@ -614,10 +615,13 @@ def add_bioinfo_files_path(self, files_found_dict, j_data): for key, values in files_found_dict.items(): file_path = "Not Provided [GENEPIO:0001668]" if values: # Check if value is not empty - for file in values: - if sample_name in file: - file_path = file - break # Exit loop if match found + if key in multiple_sample_files: + file_path = values[0] + else: + for file in values: + if sample_name in file: + file_path = file + break # Exit loop if match found path_key = f"{self.software_name}_filepath_{key}" row[path_key] = file_path if self.software_config[key].get("extract"): From 8eede463a0b22b8966d6ab040dbb9437f9a8b6f6 Mon Sep 17 00:00:00 2001 From: svarona Date: Mon, 13 Jan 2025 13:08:34 +0100 Subject: [PATCH 02/31] Added batchID/date to filenames instead of date --- relecov_tools/assets/pipeline_utils/viralrecon.py | 13 ++++++------- relecov_tools/read_bioinfo_metadata.py | 11 ++++++----- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/relecov_tools/assets/pipeline_utils/viralrecon.py b/relecov_tools/assets/pipeline_utils/viralrecon.py index eb0ff96d..4371b133 100644 --- a/relecov_tools/assets/pipeline_utils/viralrecon.py +++ b/relecov_tools/assets/pipeline_utils/viralrecon.py @@ -153,10 +153,9 @@ def convert_to_json(self, samp_dict): j_list.append(j_dict) return j_list - def save_to_file(self, j_list): + def save_to_file(self, j_list, batch_date): """Transform the parsed data into a json file""" - date_now = datetime.now().strftime("%Y%m%d%H%M%S") - file_name = "long_table_" + date_now + ".json" + file_name = "long_table_" + batch_date + ".json" file_path = os.path.join(self.output_directory, file_name) try: @@ -180,7 +179,7 @@ def parsing_csv(self): # START util functions -def handle_pangolin_data(files_list, output_folder=None): +def handle_pangolin_data(files_list, batch_date, output_folder=None): """File handler to parse pangolin data (csv) into JSON structured format. Args: @@ -320,7 +319,7 @@ def get_pango_data_version(files_list): return pango_data_processed -def parse_long_table(files_list, output_folder=None): +def parse_long_table(files_list, batch_id, output_folder=None): """File handler to retrieve data from long table files and convert it into a JSON structured format. This function utilizes the LongTableParse class to parse the long table data. Since this utility handles and maps data using a custom way, it returns None to be avoid being transferred to method read_bioinfo_metadata.BioinfoMetadata.mapping_over_table(). @@ -349,7 +348,7 @@ def parse_long_table(files_list, output_folder=None): # Parsing long table data and saving it long_table_data = long_table.parsing_csv() # Saving long table data into a file - long_table.save_to_file(long_table_data) + long_table.save_to_file(long_table_data, batch_id) stderr.print("[green]\tProcess completed") elif len(files_list) > 1: method_log_report.update_log_report( @@ -361,7 +360,7 @@ def parse_long_table(files_list, output_folder=None): return None -def handle_consensus_fasta(files_list, output_folder=None): +def handle_consensus_fasta(files_list, batch_id, output_folder=None): """File handler to parse consensus data (fasta) into JSON structured format. Args: diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index 138b61d0..21a0bc34 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -232,7 +232,7 @@ def validate_software_mandatory_files(self, files_dict): self.log_report.print_log_report(method_name, ["valid", "warning"]) return - def add_bioinfo_results_metadata(self, files_dict, j_data, output_folder=None): + def add_bioinfo_results_metadata(self, files_dict, j_data, batch_id, output_folder=None): """Adds metadata from bioinformatics results to j_data. It first calls file_handlers and then maps the handled data into j_data. @@ -241,6 +241,7 @@ def add_bioinfo_results_metadata(self, files_dict, j_data, output_folder=None): files_dict (dict{str:str}): A dictionary containing file paths found based on the definitions provided in the bioinformatic JSON file within the software scope (self.software_config). j_data (list(dict{str:str}): A list of dictionaries containing metadata lab (list item per sample). output_folder (str): Path to save output files generated during handling_files() process. + batch_id(str): ID of the batch which corresponds with the data download date. Returns: j_data_mapped: A list of dictionaries with bioinformatics metadata mapped into j_data. @@ -263,7 +264,7 @@ def add_bioinfo_results_metadata(self, files_dict, j_data, output_folder=None): ) continue # Handling files - data_to_map = self.handling_files(files_dict[key], output_folder) + data_to_map = self.handling_files(files_dict[key], output_folder, batch_id) # Mapping data to j_data mapping_fields = self.software_config[key].get("content") if not mapping_fields: @@ -324,7 +325,7 @@ def handling_tables(self, file_list, conf_tab_name): sys.exit(self.log_report.print_log_report(method_name, ["error"])) return data - def handling_files(self, file_list, output_folder): + def handling_files(self, file_list, output_folder, batch_id): """Handles different file formats to extract data regardless of their structure. The goal is to extract the data contained in files specified in ${file_list}, using either 'standard' handlers defined in this class or pipeline-specific file handlers. @@ -368,7 +369,7 @@ def handling_files(self, file_list, output_folder): import_statement = f"import {utils_name}" exec(import_statement) # Get method name and execute it. - data = eval(utils_name + "." + func_name + "(file_list, output_folder)") + data = eval(utils_name + "." + func_name + "(file_list, batch_id, output_folder)") except Exception as e: self.log_report.update_log_report( self.add_bioinfo_results_metadata.__name__, @@ -801,7 +802,7 @@ def create_bioinfo_file(self): batch_data = batch_dict["j_data"] stderr.print("[blue]Adding bioinfo metadata to read lab metadata...") batch_data = self.add_bioinfo_results_metadata( - files_found_dict, batch_data, batch_dir + files_found_dict, batch_data, batch_date, batch_dir ) stderr.print("[blue]Adding software versions to read lab metadata...") batch_data = self.get_multiqc_software_versions( From ea5a94740f3f1f3c95ea3ffbe7f040d9603138a4 Mon Sep 17 00:00:00 2001 From: svarona Date: Mon, 13 Jan 2025 13:10:17 +0100 Subject: [PATCH 03/31] add current date to splitted files --- relecov_tools/read_bioinfo_metadata.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index 21a0bc34..2f142557 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -6,6 +6,7 @@ import re import shutil from bs4 import BeautifulSoup +from datetime import datetime import pandas as pd import relecov_tools.utils @@ -740,7 +741,7 @@ def split_tables_by_batch(self, files_found_dict, batch_data, output_dir): output_dir (str): Output location for the generated tabular file """ - def extract_batch_rows_to_file(file): + def extract_batch_rows_to_file(file, sufix): """Create a new table file only with rows matching samples in batch_data""" extdict = {".csv": ",", ".tsv": "\t", ".tab": "\t"} file_extension = os.path.splitext(file)[1] @@ -750,14 +751,18 @@ def extract_batch_rows_to_file(file): sample_col = file_df.columns[sample_colpos] file_df[sample_col] = file_df[sample_col].astype(str) file_df = file_df[file_df[sample_col].isin(batch_samples)] + + base, ext = os.path.splitext(os.path.basename(file)) + new_filename = f"{base}_{sufix}{ext}" output_path = os.path.join( - output_dir, "analysis_results", os.path.basename(file) + output_dir, "analysis_results", new_filename ) file_df.to_csv(output_path, index=False, sep=extdict.get(file_extension)) return method_name = self.split_tables_by_batch.__name__ namekey = "sequencing_sample_id" + sufix = datetime.now().strftime("%Y%m%d%H%M%S") batch_samples = [row.get(namekey) for row in batch_data] for key, files in files_found_dict.items(): if not self.software_config[key].get("split_by_batch"): @@ -766,7 +771,7 @@ def extract_batch_rows_to_file(file): sample_colpos = self.get_sample_idx_colpos(key) for file in files: try: - extract_batch_rows_to_file(file) + extract_batch_rows_to_file(file,sufix) except Exception as e: if self.software_config[key].get("required"): log_type = "error" From d7a1516ee522d51ab58d98659e2c382d9af0dd9d Mon Sep 17 00:00:00 2001 From: svarona Date: Mon, 13 Jan 2025 13:10:52 +0100 Subject: [PATCH 04/31] add year to outdir path --- relecov_tools/read_bioinfo_metadata.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index 2f142557..9c6c5434 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -831,6 +831,11 @@ def create_bioinfo_file(self): ) log.info("Created output json file: %s" % batch_filepath) stderr.print(f"[green]Created batch json file: {batch_filepath}") + + year = str(datetime.now().year) + out_path = os.path.join(self.output_folder, year) + os.makedirs(out_path, exist_ok=True) + stderr.print("[blue]Writting output json file") os.makedirs(self.output_folder, exist_ok=True) file_path = os.path.join(self.output_folder, self.out_filename) From 48a6185441474b2a3b236942b3444c52e21dc89e Mon Sep 17 00:00:00 2001 From: svarona Date: Mon, 13 Jan 2025 13:11:53 +0100 Subject: [PATCH 05/31] test whether samples correspond to same batch date or not --- relecov_tools/read_bioinfo_metadata.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index 9c6c5434..edf39946 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -800,6 +800,19 @@ def create_bioinfo_file(self): self.validate_software_mandatory_files(files_found_dict) # Split files found based on each batch of samples data_by_batch = self.split_data_by_batch(self.j_data) + batch_dates = [] + #Get batch date for all the samples + for batch_dir, batch_dict in data_by_batch.items(): + if batch_dir.split("/")[-1] not in batch_dates: + batch_dates.append(batch_dir.split("/")[-1]) + + if len(batch_dates) == 1: + batch_dates = str(batch_dates[0]) + else: + stderr.print(f"[orange]More than one batch date in the same json data. Using current date as batch date.") + log.info("]More than one batch date in the same json data. Using current date as batch date.") + batch_dates = datetime.now().strftime("%Y%m%d%H%M%S") + # Add bioinfo metadata to j_data for batch_dir, batch_dict in data_by_batch.items(): self.log_report.logsum.feed_key(batch_dir) From ffbf5986ad8ab19fe60d5c36da7e976c2ddb8279 Mon Sep 17 00:00:00 2001 From: svarona Date: Mon, 13 Jan 2025 13:14:34 +0100 Subject: [PATCH 06/31] Added function to save splitted files to outdir --- relecov_tools/read_bioinfo_metadata.py | 47 ++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index edf39946..8830f6a4 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -785,6 +785,49 @@ def extract_batch_rows_to_file(file, sufix): self.log_report.print_log_report(method_name, ["valid", "warning", "error"]) return + def save_splitted_files(self, files_dict, batch_date, output_folder=None): + """ + Process and save files that where split by batch and that have a function to be processed + + Args: + files_dict (dict): A dictionary containing file paths identified for each configuration item. + batch_date (str): Date or ID of the batch to be used in the output file name. + output_folder (str): Path to save output files generated during processing. + + Returns: + None + """ + method_name = f"{self.save_splitted_files.__name__}" + for key, config in self.software_config.items(): + func_name = config.get("function") + # Skip configurations that do not match the conditions + if config.get("split_by_batch") is True and func_name: + try: + file_path = files_dict[key] + stderr.print(f"[blue]Processing splitted file: {file_path}") + except KeyError: + self.log_report.update_log_report( + method_name, + "warning", + f"No file path found for '{file_path}'", + ) + continue + try: + # Dynamically import the function from the specified module + utils_name = f"relecov_tools.assets.pipeline_utils.{self.software_name}" + import_statement = f"import {utils_name}" + exec(import_statement) + # Get method name and execute it. + data = eval(utils_name + "." + func_name + "(file_path, batch_date, output_folder)") + except Exception as e: + self.log_report.update_log_report( + self.save_splitted_files.__name__, + "error", + f"Error occurred while parsing '{func_name}': {e}.", + ) + sys.exit(self.log_report.print_log_report(method_name, ["error"])) + return + def create_bioinfo_file(self): """Create the bioinfodata json with collecting information from lab metadata json, mapping_stats, and more information from the files @@ -849,6 +892,10 @@ def create_bioinfo_file(self): out_path = os.path.join(self.output_folder, year) os.makedirs(out_path, exist_ok=True) + stderr.print("[blue]Saving previously splitted files to output directory") + + self.save_splitted_files(files_found_dict, batch_dates, out_path) + batch_filename = tag + batch_dates + ".json" stderr.print("[blue]Writting output json file") os.makedirs(self.output_folder, exist_ok=True) file_path = os.path.join(self.output_folder, self.out_filename) From 2b0447eb93a70f1113d9f4a7152ab0226f421041 Mon Sep 17 00:00:00 2001 From: svarona Date: Mon, 13 Jan 2025 13:15:26 +0100 Subject: [PATCH 07/31] test if bioinfo_lab_metadata exist, if exists merge --- relecov_tools/read_bioinfo_metadata.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index 8830f6a4..b2629243 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -892,14 +892,19 @@ def create_bioinfo_file(self): out_path = os.path.join(self.output_folder, year) os.makedirs(out_path, exist_ok=True) + tag = "bioinfo_lab_metadata_" stderr.print("[blue]Saving previously splitted files to output directory") self.save_splitted_files(files_found_dict, batch_dates, out_path) batch_filename = tag + batch_dates + ".json" stderr.print("[blue]Writting output json file") - os.makedirs(self.output_folder, exist_ok=True) - file_path = os.path.join(self.output_folder, self.out_filename) - relecov_tools.utils.write_json_fo_file(self.j_data, file_path) + file_path = os.path.join(out_path, batch_filename) + if os.path.exists(file_path): + stderr.print(f"[blue]Bioinfo metadata {file_path} file already exists. Merging new data if possible.") + log.info("Bioinfo metadata %s file already exists. Merging new data if possible." % file_path) + batch_data = self.merge_metadata(file_path, self.j_data) + else: + relecov_tools.utils.write_json_fo_file(self.j_data, file_path) stderr.print(f"[green]Sucessful creation of bioinfo analyis file: {file_path}") self.log_report.logsum.create_error_summary( called_module="read-bioinfo-metadata", logs=self.log_report.logsum.logs From 1d30be21054ae3ab23f1ecc0f40df2ac29bee4b0 Mon Sep 17 00:00:00 2001 From: svarona Date: Mon, 13 Jan 2025 13:18:44 +0100 Subject: [PATCH 08/31] Created code to merge metadata if already exist --- relecov_tools/read_bioinfo_metadata.py | 39 ++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index b2629243..5f5f60f6 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -785,6 +785,34 @@ def extract_batch_rows_to_file(file, sufix): self.log_report.print_log_report(method_name, ["valid", "warning", "error"]) return + def merge_metadata(self, batch_filepath, batch_data): + """ + Merge metadata json if sample does not exist in the metadata file + + Args: + batch_filepath (str): Path to save the json file with the metadata. + batch_data (dict): A dictionary containing metadata of the samples. + Returns: + None + """ + merged_metadata = relecov_tools.utils.read_json_file(batch_filepath) + prev_metadata_dict = {item["sequencing_sample_id"]: item for item in merged_metadata} + for item in batch_data: + sample_id = item["sequencing_sample_id"] + if sample_id in prev_metadata_dict: + # When sample already in metadata, checking whether dictionary is the same + if prev_metadata_dict[sample_id] != item: + stderr.print(f"[red] Sample {sample_id} has different data in {batch_filepath} and new metadata. Can't merge.") + log.error( + "Sample %s has different data in %s and new metadata. Can't merge." % (sample_id, batch_filepath) + ) + sys.exit(1) + else: + merged_metadata.append(item) + + relecov_tools.utils.write_json_fo_file(merged_metadata, batch_filepath) + return merged_metadata + def save_splitted_files(self, files_dict, batch_date, output_folder=None): """ Process and save files that where split by batch and that have a function to be processed @@ -858,6 +886,8 @@ def create_bioinfo_file(self): # Add bioinfo metadata to j_data for batch_dir, batch_dict in data_by_batch.items(): + lab_code = batch_dir.split("/")[-2] + batch_date = batch_dir.split("/")[-1] self.log_report.logsum.feed_key(batch_dir) stderr.print(f"[blue]Processing data from {batch_dir}") batch_data = batch_dict["j_data"] @@ -875,12 +905,15 @@ def create_bioinfo_file(self): stderr.print("[blue]Adding files path to read lab metadata") batch_data = self.add_bioinfo_files_path(files_found_dict, batch_data) self.split_tables_by_batch(files_found_dict, batch_data, batch_dir) - lab_code = batch_dir.split("/")[-2] - batch_date = batch_dir.split("/")[-1] tag = "bioinfo_lab_metadata_" batch_filename = tag + lab_code + "_" + batch_date + ".json" batch_filepath = os.path.join(batch_dir, batch_filename) - relecov_tools.utils.write_json_fo_file(batch_data, batch_filepath) + if os.path.exists(batch_filepath): + stderr.print(f"[blue]Bioinfo metadata {batch_filepath} file already exists. Merging new data if possible.") + log.info("Bioinfo metadata %s file already exists. Merging new data if possible." % batch_filepath) + batch_data = self.merge_metadata(batch_filepath, batch_data) + else: + relecov_tools.utils.write_json_fo_file(batch_data, batch_filepath) for sample in batch_data: self.log_report.logsum.feed_key( key=batch_dir, sample=sample.get("sequencing_sample_id") From e9b04da95df4e05891512db238f923ee872b6ae3 Mon Sep 17 00:00:00 2001 From: svarona Date: Mon, 13 Jan 2025 13:19:54 +0100 Subject: [PATCH 09/31] created function to search files containing muiltiple samples --- relecov_tools/read_bioinfo_metadata.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index 5f5f60f6..8b2287be 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -856,6 +856,14 @@ def save_splitted_files(self, files_dict, batch_date, output_folder=None): sys.exit(self.log_report.print_log_report(method_name, ["error"])) return + def get_multiple_sample_files(self): + method_name = f"{self.add_bioinfo_files_path.__name__}:{self.get_multiple_sample_files.__name__}" + multiple_sample_files = [] + for key in self.software_config.keys(): + if self.software_config[key].get("multiple_samples"): + multiple_sample_files.append(key) + return multiple_sample_files + def create_bioinfo_file(self): """Create the bioinfodata json with collecting information from lab metadata json, mapping_stats, and more information from the files From f61e5389bfa074cfa6093383d9e5a68797793601 Mon Sep 17 00:00:00 2001 From: svarona Date: Mon, 13 Jan 2025 13:20:13 +0100 Subject: [PATCH 10/31] test whether file exist and try to merge --- .../assets/pipeline_utils/viralrecon.py | 35 +++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/relecov_tools/assets/pipeline_utils/viralrecon.py b/relecov_tools/assets/pipeline_utils/viralrecon.py index 4371b133..6c0eedd0 100644 --- a/relecov_tools/assets/pipeline_utils/viralrecon.py +++ b/relecov_tools/assets/pipeline_utils/viralrecon.py @@ -157,13 +157,36 @@ def save_to_file(self, j_list, batch_date): """Transform the parsed data into a json file""" file_name = "long_table_" + batch_date + ".json" file_path = os.path.join(self.output_directory, file_name) + if os.path.exists(file_path): + stderr.print(f"[blue]Long table {file_path} file already exists. Merging new data if possible.") + log.info("Long table %s file already exists. Merging new data if possible." % file_path) + original_table = relecov_tools.utils.read_json_file(file_path) + samples_indict = {item["sample_name"]: item for item in original_table} + for item in j_list: + sample_name = item["sample_name"] + if sample_name in samples_indict: + if samples_indict[sample_name] != item: + stderr.print(f"[red]Same sample has different data in both long tables.") + log.error( + "Sample %s has different data in %s and new long table. Can't merge." % (sample_name, file_path) + ) + return None + else: + original_table.append(item) + try: + with open(file_path, "w") as fh: + fh.write(json.dumps(original_table, indent=4)) + stderr.print("[green]\tParsed data successfully saved to file:", file_path) + except Exception as e: + stderr.print("[red]\tError saving parsed data to file:", str(e)) - try: - with open(file_path, "w") as fh: - fh.write(json.dumps(j_list, indent=4)) - stderr.print("[green]\tParsed data successfully saved to file:", file_path) - except Exception as e: - stderr.print("[red]\tError saving parsed data to file:", str(e)) + else: + try: + with open(file_path, "w") as fh: + fh.write(json.dumps(j_list, indent=4)) + stderr.print("[green]\tParsed data successfully saved to file:", file_path) + except Exception as e: + stderr.print("[red]\tError saving parsed data to file:", str(e)) def parsing_csv(self): """ From f226a8cf33a241e39b2948bfe16494e3bf23504f Mon Sep 17 00:00:00 2001 From: svarona Date: Mon, 13 Jan 2025 13:21:52 +0100 Subject: [PATCH 11/31] fixed black --- .../assets/pipeline_utils/viralrecon.py | 32 +++++--- relecov_tools/read_bioinfo_metadata.py | 73 +++++++++++++------ 2 files changed, 74 insertions(+), 31 deletions(-) diff --git a/relecov_tools/assets/pipeline_utils/viralrecon.py b/relecov_tools/assets/pipeline_utils/viralrecon.py index 6c0eedd0..c71009c6 100644 --- a/relecov_tools/assets/pipeline_utils/viralrecon.py +++ b/relecov_tools/assets/pipeline_utils/viralrecon.py @@ -158,25 +158,35 @@ def save_to_file(self, j_list, batch_date): file_name = "long_table_" + batch_date + ".json" file_path = os.path.join(self.output_directory, file_name) if os.path.exists(file_path): - stderr.print(f"[blue]Long table {file_path} file already exists. Merging new data if possible.") - log.info("Long table %s file already exists. Merging new data if possible." % file_path) + stderr.print( + f"[blue]Long table {file_path} file already exists. Merging new data if possible." + ) + log.info( + "Long table %s file already exists. Merging new data if possible." + % file_path + ) original_table = relecov_tools.utils.read_json_file(file_path) samples_indict = {item["sample_name"]: item for item in original_table} for item in j_list: sample_name = item["sample_name"] if sample_name in samples_indict: if samples_indict[sample_name] != item: - stderr.print(f"[red]Same sample has different data in both long tables.") + stderr.print( + f"[red]Same sample has different data in both long tables." + ) log.error( - "Sample %s has different data in %s and new long table. Can't merge." % (sample_name, file_path) - ) + "Sample %s has different data in %s and new long table. Can't merge." + % (sample_name, file_path) + ) return None else: original_table.append(item) try: with open(file_path, "w") as fh: fh.write(json.dumps(original_table, indent=4)) - stderr.print("[green]\tParsed data successfully saved to file:", file_path) + stderr.print( + "[green]\tParsed data successfully saved to file:", file_path + ) except Exception as e: stderr.print("[red]\tError saving parsed data to file:", str(e)) @@ -184,7 +194,9 @@ def save_to_file(self, j_list, batch_date): try: with open(file_path, "w") as fh: fh.write(json.dumps(j_list, indent=4)) - stderr.print("[green]\tParsed data successfully saved to file:", file_path) + stderr.print( + "[green]\tParsed data successfully saved to file:", file_path + ) except Exception as e: stderr.print("[red]\tError saving parsed data to file:", str(e)) @@ -311,9 +323,9 @@ def get_pango_data_version(files_list): ) # Add custom content in pangolin pango_data_key = next(iter(pango_data)) - pango_data[pango_data_key]["lineage_analysis_date"] = ( - relecov_tools.utils.get_file_date(pango_file) - ) + pango_data[pango_data_key][ + "lineage_analysis_date" + ] = relecov_tools.utils.get_file_date(pango_file) pango_data[pango_data_key]["pangolin_database_version"] = pango_data_v # Rename key in f_data pango_data_updated = { diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index 8b2287be..4d3a284e 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -233,7 +233,9 @@ def validate_software_mandatory_files(self, files_dict): self.log_report.print_log_report(method_name, ["valid", "warning"]) return - def add_bioinfo_results_metadata(self, files_dict, j_data, batch_id, output_folder=None): + def add_bioinfo_results_metadata( + self, files_dict, j_data, batch_id, output_folder=None + ): """Adds metadata from bioinformatics results to j_data. It first calls file_handlers and then maps the handled data into j_data. @@ -370,7 +372,12 @@ def handling_files(self, file_list, output_folder, batch_id): import_statement = f"import {utils_name}" exec(import_statement) # Get method name and execute it. - data = eval(utils_name + "." + func_name + "(file_list, batch_id, output_folder)") + data = eval( + utils_name + + "." + + func_name + + "(file_list, batch_id, output_folder)" + ) except Exception as e: self.log_report.update_log_report( self.add_bioinfo_results_metadata.__name__, @@ -751,12 +758,10 @@ def extract_batch_rows_to_file(file, sufix): sample_col = file_df.columns[sample_colpos] file_df[sample_col] = file_df[sample_col].astype(str) file_df = file_df[file_df[sample_col].isin(batch_samples)] - + base, ext = os.path.splitext(os.path.basename(file)) new_filename = f"{base}_{sufix}{ext}" - output_path = os.path.join( - output_dir, "analysis_results", new_filename - ) + output_path = os.path.join(output_dir, "analysis_results", new_filename) file_df.to_csv(output_path, index=False, sep=extdict.get(file_extension)) return @@ -771,7 +776,7 @@ def extract_batch_rows_to_file(file, sufix): sample_colpos = self.get_sample_idx_colpos(key) for file in files: try: - extract_batch_rows_to_file(file,sufix) + extract_batch_rows_to_file(file, sufix) except Exception as e: if self.software_config[key].get("required"): log_type = "error" @@ -794,17 +799,22 @@ def merge_metadata(self, batch_filepath, batch_data): batch_data (dict): A dictionary containing metadata of the samples. Returns: None - """ + """ merged_metadata = relecov_tools.utils.read_json_file(batch_filepath) - prev_metadata_dict = {item["sequencing_sample_id"]: item for item in merged_metadata} + prev_metadata_dict = { + item["sequencing_sample_id"]: item for item in merged_metadata + } for item in batch_data: sample_id = item["sequencing_sample_id"] if sample_id in prev_metadata_dict: # When sample already in metadata, checking whether dictionary is the same if prev_metadata_dict[sample_id] != item: - stderr.print(f"[red] Sample {sample_id} has different data in {batch_filepath} and new metadata. Can't merge.") + stderr.print( + f"[red] Sample {sample_id} has different data in {batch_filepath} and new metadata. Can't merge." + ) log.error( - "Sample %s has different data in %s and new metadata. Can't merge." % (sample_id, batch_filepath) + "Sample %s has different data in %s and new metadata. Can't merge." + % (sample_id, batch_filepath) ) sys.exit(1) else: @@ -842,11 +852,18 @@ def save_splitted_files(self, files_dict, batch_date, output_folder=None): continue try: # Dynamically import the function from the specified module - utils_name = f"relecov_tools.assets.pipeline_utils.{self.software_name}" + utils_name = ( + f"relecov_tools.assets.pipeline_utils.{self.software_name}" + ) import_statement = f"import {utils_name}" exec(import_statement) # Get method name and execute it. - data = eval(utils_name + "." + func_name + "(file_path, batch_date, output_folder)") + data = eval( + utils_name + + "." + + func_name + + "(file_path, batch_date, output_folder)" + ) except Exception as e: self.log_report.update_log_report( self.save_splitted_files.__name__, @@ -855,7 +872,7 @@ def save_splitted_files(self, files_dict, batch_date, output_folder=None): ) sys.exit(self.log_report.print_log_report(method_name, ["error"])) return - + def get_multiple_sample_files(self): method_name = f"{self.add_bioinfo_files_path.__name__}:{self.get_multiple_sample_files.__name__}" multiple_sample_files = [] @@ -880,7 +897,7 @@ def create_bioinfo_file(self): # Split files found based on each batch of samples data_by_batch = self.split_data_by_batch(self.j_data) batch_dates = [] - #Get batch date for all the samples + # Get batch date for all the samples for batch_dir, batch_dict in data_by_batch.items(): if batch_dir.split("/")[-1] not in batch_dates: batch_dates.append(batch_dir.split("/")[-1]) @@ -888,8 +905,12 @@ def create_bioinfo_file(self): if len(batch_dates) == 1: batch_dates = str(batch_dates[0]) else: - stderr.print(f"[orange]More than one batch date in the same json data. Using current date as batch date.") - log.info("]More than one batch date in the same json data. Using current date as batch date.") + stderr.print( + f"[orange]More than one batch date in the same json data. Using current date as batch date." + ) + log.info( + "]More than one batch date in the same json data. Using current date as batch date." + ) batch_dates = datetime.now().strftime("%Y%m%d%H%M%S") # Add bioinfo metadata to j_data @@ -917,8 +938,13 @@ def create_bioinfo_file(self): batch_filename = tag + lab_code + "_" + batch_date + ".json" batch_filepath = os.path.join(batch_dir, batch_filename) if os.path.exists(batch_filepath): - stderr.print(f"[blue]Bioinfo metadata {batch_filepath} file already exists. Merging new data if possible.") - log.info("Bioinfo metadata %s file already exists. Merging new data if possible." % batch_filepath) + stderr.print( + f"[blue]Bioinfo metadata {batch_filepath} file already exists. Merging new data if possible." + ) + log.info( + "Bioinfo metadata %s file already exists. Merging new data if possible." + % batch_filepath + ) batch_data = self.merge_metadata(batch_filepath, batch_data) else: relecov_tools.utils.write_json_fo_file(batch_data, batch_filepath) @@ -941,8 +967,13 @@ def create_bioinfo_file(self): stderr.print("[blue]Writting output json file") file_path = os.path.join(out_path, batch_filename) if os.path.exists(file_path): - stderr.print(f"[blue]Bioinfo metadata {file_path} file already exists. Merging new data if possible.") - log.info("Bioinfo metadata %s file already exists. Merging new data if possible." % file_path) + stderr.print( + f"[blue]Bioinfo metadata {file_path} file already exists. Merging new data if possible." + ) + log.info( + "Bioinfo metadata %s file already exists. Merging new data if possible." + % file_path + ) batch_data = self.merge_metadata(file_path, self.j_data) else: relecov_tools.utils.write_json_fo_file(self.j_data, file_path) From b37e8a4063cf454395c092183e90d8b7198ad745 Mon Sep 17 00:00:00 2001 From: svarona Date: Mon, 13 Jan 2025 13:39:37 +0100 Subject: [PATCH 12/31] Fixed flake8 --- relecov_tools/assets/pipeline_utils/viralrecon.py | 3 +-- relecov_tools/read_bioinfo_metadata.py | 7 +++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/relecov_tools/assets/pipeline_utils/viralrecon.py b/relecov_tools/assets/pipeline_utils/viralrecon.py index c71009c6..5dbf2d21 100644 --- a/relecov_tools/assets/pipeline_utils/viralrecon.py +++ b/relecov_tools/assets/pipeline_utils/viralrecon.py @@ -8,7 +8,6 @@ import os.path from pathlib import Path -from datetime import datetime import relecov_tools.utils from relecov_tools.config_json import ConfigJson @@ -172,7 +171,7 @@ def save_to_file(self, j_list, batch_date): if sample_name in samples_indict: if samples_indict[sample_name] != item: stderr.print( - f"[red]Same sample has different data in both long tables." + f"[red]Same sample {sample_name} has different data in both long tables." ) log.error( "Sample %s has different data in %s and new long table. Can't merge." diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index 4d3a284e..3c8cad2b 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -871,10 +871,9 @@ def save_splitted_files(self, files_dict, batch_date, output_folder=None): f"Error occurred while parsing '{func_name}': {e}.", ) sys.exit(self.log_report.print_log_report(method_name, ["error"])) - return + return data def get_multiple_sample_files(self): - method_name = f"{self.add_bioinfo_files_path.__name__}:{self.get_multiple_sample_files.__name__}" multiple_sample_files = [] for key in self.software_config.keys(): if self.software_config[key].get("multiple_samples"): @@ -906,10 +905,10 @@ def create_bioinfo_file(self): batch_dates = str(batch_dates[0]) else: stderr.print( - f"[orange]More than one batch date in the same json data. Using current date as batch date." + "[orange]More than one batch date in the same json data. Using current date as batch date." ) log.info( - "]More than one batch date in the same json data. Using current date as batch date." + "More than one batch date in the same json data. Using current date as batch date." ) batch_dates = datetime.now().strftime("%Y%m%d%H%M%S") From 97d944e89f01cfa034cdf29e906e035aff89332c Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 14 Jan 2025 10:04:53 +0100 Subject: [PATCH 13/31] fixed black --- relecov_tools/assets/pipeline_utils/viralrecon.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/relecov_tools/assets/pipeline_utils/viralrecon.py b/relecov_tools/assets/pipeline_utils/viralrecon.py index 5dbf2d21..b48e7c6b 100644 --- a/relecov_tools/assets/pipeline_utils/viralrecon.py +++ b/relecov_tools/assets/pipeline_utils/viralrecon.py @@ -322,9 +322,9 @@ def get_pango_data_version(files_list): ) # Add custom content in pangolin pango_data_key = next(iter(pango_data)) - pango_data[pango_data_key][ - "lineage_analysis_date" - ] = relecov_tools.utils.get_file_date(pango_file) + pango_data[pango_data_key]["lineage_analysis_date"] = ( + relecov_tools.utils.get_file_date(pango_file) + ) pango_data[pango_data_key]["pangolin_database_version"] = pango_data_v # Rename key in f_data pango_data_updated = { From e10a094c113c1f8ec0ed76a6e07fe20b0bbd3085 Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 14 Jan 2025 10:06:00 +0100 Subject: [PATCH 14/31] added creation of log in batch folder --- relecov_tools/read_bioinfo_metadata.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index 3c8cad2b..095f99d5 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -980,4 +980,10 @@ def create_bioinfo_file(self): self.log_report.logsum.create_error_summary( called_module="read-bioinfo-metadata", logs=self.log_report.logsum.logs ) + for batch_dir, batch_dict in data_by_batch.items(): + self.log_report.logsum.create_error_summary( + called_module="read-bioinfo-metadata", + filepath=batch_dir, + logs=self.log_report.logsum.logs + ) return True From 6864e95585b2a0c29522e0a8d68e46f7d6327d5e Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 14 Jan 2025 15:28:56 +0100 Subject: [PATCH 15/31] fixed regex to find long table in analysis_results --- relecov_tools/assets/pipeline_utils/viralrecon.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/relecov_tools/assets/pipeline_utils/viralrecon.py b/relecov_tools/assets/pipeline_utils/viralrecon.py index b48e7c6b..46546147 100644 --- a/relecov_tools/assets/pipeline_utils/viralrecon.py +++ b/relecov_tools/assets/pipeline_utils/viralrecon.py @@ -134,7 +134,7 @@ def convert_to_json(self, samp_dict): j_list = [] # Grab date from filename result_regex = re.search( - "variants_long_table(?:_\d{8})?\.csv", os.path.basename(self.file_path) + "variants_long_table(?:_\d{14})?\.csv", os.path.basename(self.file_path) ) if result_regex is None: stderr.print( From f58e0736bf52030d6c52fe0cfde16ea51b91b5e2 Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 14 Jan 2025 15:29:43 +0100 Subject: [PATCH 16/31] removed log that was not working --- relecov_tools/read_bioinfo_metadata.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index 095f99d5..3c8cad2b 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -980,10 +980,4 @@ def create_bioinfo_file(self): self.log_report.logsum.create_error_summary( called_module="read-bioinfo-metadata", logs=self.log_report.logsum.logs ) - for batch_dir, batch_dict in data_by_batch.items(): - self.log_report.logsum.create_error_summary( - called_module="read-bioinfo-metadata", - filepath=batch_dir, - logs=self.log_report.logsum.logs - ) return True From 718e66b7e36d147db95acb2e478d9bde58cfce75 Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 14 Jan 2025 15:30:29 +0100 Subject: [PATCH 17/31] split files by batch first --- relecov_tools/read_bioinfo_metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index 3c8cad2b..297c7ecf 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -920,6 +920,7 @@ def create_bioinfo_file(self): stderr.print(f"[blue]Processing data from {batch_dir}") batch_data = batch_dict["j_data"] stderr.print("[blue]Adding bioinfo metadata to read lab metadata...") + self.split_tables_by_batch(files_found_dict, sufix, batch_data, batch_dir) batch_data = self.add_bioinfo_results_metadata( files_found_dict, batch_data, batch_date, batch_dir ) @@ -932,7 +933,6 @@ def create_bioinfo_file(self): # Adding files path stderr.print("[blue]Adding files path to read lab metadata") batch_data = self.add_bioinfo_files_path(files_found_dict, batch_data) - self.split_tables_by_batch(files_found_dict, batch_data, batch_dir) tag = "bioinfo_lab_metadata_" batch_filename = tag + lab_code + "_" + batch_date + ".json" batch_filepath = os.path.join(batch_dir, batch_filename) From 72c76b3ca4c960a1c489a4e63cdc17ea8a554ae3 Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 14 Jan 2025 15:31:57 +0100 Subject: [PATCH 18/31] created unique sufix for all files in batch --- relecov_tools/read_bioinfo_metadata.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index 297c7ecf..9a659f8d 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -234,7 +234,7 @@ def validate_software_mandatory_files(self, files_dict): return def add_bioinfo_results_metadata( - self, files_dict, j_data, batch_id, output_folder=None + self, files_dict, j_data, sufix, batch_id, output_folder=None ): """Adds metadata from bioinformatics results to j_data. It first calls file_handlers and then maps the handled @@ -243,6 +243,7 @@ def add_bioinfo_results_metadata( Args: files_dict (dict{str:str}): A dictionary containing file paths found based on the definitions provided in the bioinformatic JSON file within the software scope (self.software_config). j_data (list(dict{str:str}): A list of dictionaries containing metadata lab (list item per sample). + sufix (str): Sufix added to splitted tables file name. output_folder (str): Path to save output files generated during handling_files() process. batch_id(str): ID of the batch which corresponds with the data download date. @@ -267,7 +268,7 @@ def add_bioinfo_results_metadata( ) continue # Handling files - data_to_map = self.handling_files(files_dict[key], output_folder, batch_id) + data_to_map = self.handling_files(files_dict[key], sufix, output_folder, batch_id) # Mapping data to j_data mapping_fields = self.software_config[key].get("content") if not mapping_fields: @@ -328,7 +329,7 @@ def handling_tables(self, file_list, conf_tab_name): sys.exit(self.log_report.print_log_report(method_name, ["error"])) return data - def handling_files(self, file_list, output_folder, batch_id): + def handling_files(self, file_list, sufix, output_folder, batch_id): """Handles different file formats to extract data regardless of their structure. The goal is to extract the data contained in files specified in ${file_list}, using either 'standard' handlers defined in this class or pipeline-specific file handlers. @@ -739,11 +740,12 @@ def split_data_by_batch(self, j_data): ] return data_by_batch - def split_tables_by_batch(self, files_found_dict, batch_data, output_dir): + def split_tables_by_batch(self, files_found_dict, sufix, batch_data, output_dir): """Filter table content to output a new table containing only the samples present in given metadata Args: files_found_dict (dict): A dictionary containing file paths identified for each configuration item. + sufix (str): Sufix to be added to the new table file name. batch_data (list(dict)): Metadata corresponding to a single folder with samples (folder) output_dir (str): Output location for the generated tabular file """ @@ -767,7 +769,6 @@ def extract_batch_rows_to_file(file, sufix): method_name = self.split_tables_by_batch.__name__ namekey = "sequencing_sample_id" - sufix = datetime.now().strftime("%Y%m%d%H%M%S") batch_samples = [row.get(namekey) for row in batch_data] for key, files in files_found_dict.items(): if not self.software_config[key].get("split_by_batch"): @@ -896,6 +897,7 @@ def create_bioinfo_file(self): # Split files found based on each batch of samples data_by_batch = self.split_data_by_batch(self.j_data) batch_dates = [] + sufix = datetime.now().strftime("%Y%m%d%H%M%S") # Get batch date for all the samples for batch_dir, batch_dict in data_by_batch.items(): if batch_dir.split("/")[-1] not in batch_dates: @@ -922,7 +924,7 @@ def create_bioinfo_file(self): stderr.print("[blue]Adding bioinfo metadata to read lab metadata...") self.split_tables_by_batch(files_found_dict, sufix, batch_data, batch_dir) batch_data = self.add_bioinfo_results_metadata( - files_found_dict, batch_data, batch_date, batch_dir + files_found_dict, batch_data, sufix, batch_date, batch_dir ) stderr.print("[blue]Adding software versions to read lab metadata...") batch_data = self.get_multiqc_software_versions( From 68f6b983487c75d4a99f7a0cfb435c93dfa2b75e Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 14 Jan 2025 15:32:31 +0100 Subject: [PATCH 19/31] renamed save_splitted_files to save_merged_files --- relecov_tools/read_bioinfo_metadata.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index 9a659f8d..d4061548 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -824,9 +824,9 @@ def merge_metadata(self, batch_filepath, batch_data): relecov_tools.utils.write_json_fo_file(merged_metadata, batch_filepath) return merged_metadata - def save_splitted_files(self, files_dict, batch_date, output_folder=None): + def save_merged_files(self, files_dict, batch_date, output_folder=None): """ - Process and save files that where split by batch and that have a function to be processed + Process and save files that where split by cod and that have a function to be processed Args: files_dict (dict): A dictionary containing file paths identified for each configuration item. @@ -836,7 +836,7 @@ def save_splitted_files(self, files_dict, batch_date, output_folder=None): Returns: None """ - method_name = f"{self.save_splitted_files.__name__}" + method_name = f"{self.save_merged_files.__name__}" for key, config in self.software_config.items(): func_name = config.get("function") # Skip configurations that do not match the conditions @@ -867,7 +867,7 @@ def save_splitted_files(self, files_dict, batch_date, output_folder=None): ) except Exception as e: self.log_report.update_log_report( - self.save_splitted_files.__name__, + self.save_merged_files.__name__, "error", f"Error occurred while parsing '{func_name}': {e}.", ) @@ -963,7 +963,7 @@ def create_bioinfo_file(self): tag = "bioinfo_lab_metadata_" stderr.print("[blue]Saving previously splitted files to output directory") - self.save_splitted_files(files_found_dict, batch_dates, out_path) + self.save_merged_files(files_found_dict, batch_dates, out_path) batch_filename = tag + batch_dates + ".json" stderr.print("[blue]Writting output json file") file_path = os.path.join(out_path, batch_filename) From 0d82573148860e159673265e9492e4472e01c389 Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 14 Jan 2025 15:33:00 +0100 Subject: [PATCH 20/31] added code to save splitted long table to batch dir --- relecov_tools/read_bioinfo_metadata.py | 73 +++++++++++++++++++------- 1 file changed, 53 insertions(+), 20 deletions(-) diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index d4061548..9a9720f2 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -361,31 +361,64 @@ def handling_files(self, file_list, sufix, output_folder, batch_id): data: A dictionary containing bioinfo metadata handled for each sample. """ method_name = f"{self.add_bioinfo_results_metadata.__name__}:{self.handling_files.__name__}" + splitted_path = os.path.join(output_folder, "analysis_results") file_name = self.software_config[self.current_config_key].get("fn") # Parsing files - func_name = self.software_config[self.current_config_key]["function"] + current_config = self.software_config[self.current_config_key] + func_name = current_config.get("function") if func_name is None: data = self.handling_tables(file_list=file_list, conf_tab_name=file_name) else: - try: - # Dynamically import the function from the specified module - utils_name = f"relecov_tools.assets.pipeline_utils.{self.software_name}" - import_statement = f"import {utils_name}" - exec(import_statement) - # Get method name and execute it. - data = eval( - utils_name - + "." - + func_name - + "(file_list, batch_id, output_folder)" - ) - except Exception as e: - self.log_report.update_log_report( - self.add_bioinfo_results_metadata.__name__, - "error", - f"Error occurred while parsing '{func_name}': {e}.", - ) - sys.exit(self.log_report.print_log_report(method_name, ["error"])) + if current_config.get("split_by_batch") is True: + file_extension = current_config.get("fn").rsplit(".", 1)[1] + base_filename = current_config.get("fn").rsplit(".", 1)[0] + pattern = re.compile(f"{base_filename}_{sufix}.{re.escape(file_extension)}") + matching_files = [ + f for f in os.listdir(splitted_path) if pattern.match(f) + ] + full_paths = [os.path.join(splitted_path, f) for f in matching_files] + try: + # Dynamically import the function from the specified module + utils_name = ( + f"relecov_tools.assets.pipeline_utils.{self.software_name}" + ) + import_statement = f"import {utils_name}" + exec(import_statement) + # Get method name and execute it. + print("lanzar_variants_long_table") + data = eval( + utils_name + + "." + + func_name + + "(full_paths, batch_id, output_folder)" + ) + except Exception as e: + self.log_report.update_log_report( + self.save_splitted_files.__name__, + "error", + f"Error occurred while parsing '{func_name}': {e}.", + ) + sys.exit(self.log_report.print_log_report(method_name, ["error"])) + else: + try: + # Dynamically import the function from the specified module + utils_name = f"relecov_tools.assets.pipeline_utils.{self.software_name}" + import_statement = f"import {utils_name}" + exec(import_statement) + # Get method name and execute it. + data = eval( + utils_name + + "." + + func_name + + "(file_list, batch_id, output_folder)" + ) + except Exception as e: + self.log_report.update_log_report( + self.add_bioinfo_results_metadata.__name__, + "error", + f"Error occurred while parsing '{func_name}': {e}.", + ) + sys.exit(self.log_report.print_log_report(method_name, ["error"])) return data def mapping_over_table(self, j_data, map_data, mapping_fields, table_name): From 09b58dad41d84a69966dbf625cfb9192046b47a8 Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 14 Jan 2025 15:33:49 +0100 Subject: [PATCH 21/31] fixed black --- relecov_tools/read_bioinfo_metadata.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index 9a9720f2..69b045cd 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -268,7 +268,9 @@ def add_bioinfo_results_metadata( ) continue # Handling files - data_to_map = self.handling_files(files_dict[key], sufix, output_folder, batch_id) + data_to_map = self.handling_files( + files_dict[key], sufix, output_folder, batch_id + ) # Mapping data to j_data mapping_fields = self.software_config[key].get("content") if not mapping_fields: @@ -372,7 +374,9 @@ def handling_files(self, file_list, sufix, output_folder, batch_id): if current_config.get("split_by_batch") is True: file_extension = current_config.get("fn").rsplit(".", 1)[1] base_filename = current_config.get("fn").rsplit(".", 1)[0] - pattern = re.compile(f"{base_filename}_{sufix}.{re.escape(file_extension)}") + pattern = re.compile( + f"{base_filename}_{sufix}.{re.escape(file_extension)}" + ) matching_files = [ f for f in os.listdir(splitted_path) if pattern.match(f) ] @@ -402,7 +406,9 @@ def handling_files(self, file_list, sufix, output_folder, batch_id): else: try: # Dynamically import the function from the specified module - utils_name = f"relecov_tools.assets.pipeline_utils.{self.software_name}" + utils_name = ( + f"relecov_tools.assets.pipeline_utils.{self.software_name}" + ) import_statement = f"import {utils_name}" exec(import_statement) # Get method name and execute it. From f08c12a88a53a1985cb13e0e60f01eb7651d5b99 Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 14 Jan 2025 15:44:18 +0100 Subject: [PATCH 22/31] added log errors --- relecov_tools/assets/pipeline_utils/viralrecon.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/relecov_tools/assets/pipeline_utils/viralrecon.py b/relecov_tools/assets/pipeline_utils/viralrecon.py index 46546147..7ec18895 100644 --- a/relecov_tools/assets/pipeline_utils/viralrecon.py +++ b/relecov_tools/assets/pipeline_utils/viralrecon.py @@ -188,7 +188,7 @@ def save_to_file(self, j_list, batch_date): ) except Exception as e: stderr.print("[red]\tError saving parsed data to file:", str(e)) - + log.error("Error saving parsed data to file: %s", e) else: try: with open(file_path, "w") as fh: @@ -198,6 +198,7 @@ def save_to_file(self, j_list, batch_date): ) except Exception as e: stderr.print("[red]\tError saving parsed data to file:", str(e)) + log.error("Error saving parsed data to file: %s", e) def parsing_csv(self): """ From d09ca2617e85a4fc19d5d3925ca2e19038feee92 Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 14 Jan 2025 15:45:07 +0100 Subject: [PATCH 23/31] replaced batch_id with batch_date --- relecov_tools/assets/pipeline_utils/viralrecon.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/relecov_tools/assets/pipeline_utils/viralrecon.py b/relecov_tools/assets/pipeline_utils/viralrecon.py index 7ec18895..9613ffa8 100644 --- a/relecov_tools/assets/pipeline_utils/viralrecon.py +++ b/relecov_tools/assets/pipeline_utils/viralrecon.py @@ -354,7 +354,7 @@ def get_pango_data_version(files_list): return pango_data_processed -def parse_long_table(files_list, batch_id, output_folder=None): +def parse_long_table(files_list, batch_date, output_folder=None): """File handler to retrieve data from long table files and convert it into a JSON structured format. This function utilizes the LongTableParse class to parse the long table data. Since this utility handles and maps data using a custom way, it returns None to be avoid being transferred to method read_bioinfo_metadata.BioinfoMetadata.mapping_over_table(). @@ -383,7 +383,7 @@ def parse_long_table(files_list, batch_id, output_folder=None): # Parsing long table data and saving it long_table_data = long_table.parsing_csv() # Saving long table data into a file - long_table.save_to_file(long_table_data, batch_id) + long_table.save_to_file(long_table_data, batch_date) stderr.print("[green]\tProcess completed") elif len(files_list) > 1: method_log_report.update_log_report( @@ -395,7 +395,7 @@ def parse_long_table(files_list, batch_id, output_folder=None): return None -def handle_consensus_fasta(files_list, batch_id, output_folder=None): +def handle_consensus_fasta(files_list, batch_date, output_folder=None): """File handler to parse consensus data (fasta) into JSON structured format. Args: From 55bd6117351daff59d071614b571b2f89f4824cc Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 14 Jan 2025 15:46:59 +0100 Subject: [PATCH 24/31] replaced batch_id with batch_date --- relecov_tools/read_bioinfo_metadata.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index 69b045cd..a02a9e18 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -234,7 +234,7 @@ def validate_software_mandatory_files(self, files_dict): return def add_bioinfo_results_metadata( - self, files_dict, j_data, sufix, batch_id, output_folder=None + self, files_dict, j_data, sufix, batch_date, output_folder=None ): """Adds metadata from bioinformatics results to j_data. It first calls file_handlers and then maps the handled @@ -245,7 +245,7 @@ def add_bioinfo_results_metadata( j_data (list(dict{str:str}): A list of dictionaries containing metadata lab (list item per sample). sufix (str): Sufix added to splitted tables file name. output_folder (str): Path to save output files generated during handling_files() process. - batch_id(str): ID of the batch which corresponds with the data download date. + batch_date(str): Number of the batch which corresponds with the data download date. Returns: j_data_mapped: A list of dictionaries with bioinformatics metadata mapped into j_data. @@ -269,7 +269,7 @@ def add_bioinfo_results_metadata( continue # Handling files data_to_map = self.handling_files( - files_dict[key], sufix, output_folder, batch_id + files_dict[key], sufix, output_folder, batch_date ) # Mapping data to j_data mapping_fields = self.software_config[key].get("content") @@ -331,7 +331,7 @@ def handling_tables(self, file_list, conf_tab_name): sys.exit(self.log_report.print_log_report(method_name, ["error"])) return data - def handling_files(self, file_list, sufix, output_folder, batch_id): + def handling_files(self, file_list, sufix, output_folder, batch_date): """Handles different file formats to extract data regardless of their structure. The goal is to extract the data contained in files specified in ${file_list}, using either 'standard' handlers defined in this class or pipeline-specific file handlers. @@ -358,6 +358,7 @@ def handling_files(self, file_list, sufix, output_folder, batch_id): Args: file_list (list): A list of file path/s to be processed. output_folder (str): Path to save output files from imported method if necessary + batch_date(str): Number of the batch which corresponds with the data download date. Returns: data: A dictionary containing bioinfo metadata handled for each sample. @@ -394,7 +395,7 @@ def handling_files(self, file_list, sufix, output_folder, batch_id): utils_name + "." + func_name - + "(full_paths, batch_id, output_folder)" + + "(full_paths, batch_date, output_folder)" ) except Exception as e: self.log_report.update_log_report( @@ -416,7 +417,7 @@ def handling_files(self, file_list, sufix, output_folder, batch_id): utils_name + "." + func_name - + "(file_list, batch_id, output_folder)" + + "(file_list, batch_date, output_folder)" ) except Exception as e: self.log_report.update_log_report( From a480315d6ad57a8c0758ab6da634b776f051ee5e Mon Sep 17 00:00:00 2001 From: svarona Date: Tue, 14 Jan 2025 15:50:06 +0100 Subject: [PATCH 25/31] Updated changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 753732a2..262eb0f0 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ Code contributions to the release: #### Fixes +- Fixed read-bioinfo-metadata module [#367](https://github.com/BU-ISCIII/relecov-tools/pull/367) + #### Changed #### Removed From d6cd9beadde79c4f1b80cb53fbc91bfef53daf36 Mon Sep 17 00:00:00 2001 From: svarona Date: Wed, 15 Jan 2025 14:41:08 +0100 Subject: [PATCH 26/31] Added creation of analysis_results earlier for module to work --- relecov_tools/read_bioinfo_metadata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index a02a9e18..429ae414 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -803,6 +803,7 @@ def extract_batch_rows_to_file(file, sufix): base, ext = os.path.splitext(os.path.basename(file)) new_filename = f"{base}_{sufix}{ext}" + os.makedirs(os.path.join(output_dir, "analysis_results"), exist_ok=True) output_path = os.path.join(output_dir, "analysis_results", new_filename) file_df.to_csv(output_path, index=False, sep=extdict.get(file_extension)) return From 2af25d7abb54f82fa1feb4fb0b149fe6ca50439f Mon Sep 17 00:00:00 2001 From: svarona Date: Wed, 15 Jan 2025 16:33:13 +0100 Subject: [PATCH 27/31] ignored flake error --- relecov_tools/read_bioinfo_metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index 429ae414..4ac782b7 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -381,7 +381,7 @@ def handling_files(self, file_list, sufix, output_folder, batch_date): matching_files = [ f for f in os.listdir(splitted_path) if pattern.match(f) ] - full_paths = [os.path.join(splitted_path, f) for f in matching_files] + full_paths = [os.path.join(splitted_path, f) for f in matching_files] # noqua: F841 try: # Dynamically import the function from the specified module utils_name = ( From a68a3ebebdf382b749acbacafa4500946fd512bf Mon Sep 17 00:00:00 2001 From: svarona Date: Wed, 15 Jan 2025 16:34:35 +0100 Subject: [PATCH 28/31] flixed black and flake --- relecov_tools/read_bioinfo_metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index 4ac782b7..b070c656 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -381,7 +381,7 @@ def handling_files(self, file_list, sufix, output_folder, batch_date): matching_files = [ f for f in os.listdir(splitted_path) if pattern.match(f) ] - full_paths = [os.path.join(splitted_path, f) for f in matching_files] # noqua: F841 + full_paths = [os.path.join(splitted_path, f) for f in matching_files] # noqua: F841 try: # Dynamically import the function from the specified module utils_name = ( From 97be5731d83121081792cfce51e8bfd2089b689b Mon Sep 17 00:00:00 2001 From: svarona Date: Wed, 15 Jan 2025 16:36:33 +0100 Subject: [PATCH 29/31] fixed black and flake --- relecov_tools/read_bioinfo_metadata.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index b070c656..a7fdc39f 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -381,7 +381,9 @@ def handling_files(self, file_list, sufix, output_folder, batch_date): matching_files = [ f for f in os.listdir(splitted_path) if pattern.match(f) ] - full_paths = [os.path.join(splitted_path, f) for f in matching_files] # noqua: F841 + full_paths = [ + os.path.join(splitted_path, f) for f in matching_files + ] # noqa: F841 try: # Dynamically import the function from the specified module utils_name = ( From 6c2556e1770da57ea5cbe5117fdec5bcb237c8ee Mon Sep 17 00:00:00 2001 From: svarona Date: Wed, 15 Jan 2025 16:39:33 +0100 Subject: [PATCH 30/31] finally fixed flake8 --- relecov_tools/read_bioinfo_metadata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index a7fdc39f..db5d98f8 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -381,9 +381,9 @@ def handling_files(self, file_list, sufix, output_folder, batch_date): matching_files = [ f for f in os.listdir(splitted_path) if pattern.match(f) ] - full_paths = [ + full_paths = [ # noqa: F841 os.path.join(splitted_path, f) for f in matching_files - ] # noqa: F841 + ] try: # Dynamically import the function from the specified module utils_name = ( From e90d33db27821b285e67c8bda7df8995db7382f3 Mon Sep 17 00:00:00 2001 From: svarona Date: Wed, 15 Jan 2025 16:50:53 +0100 Subject: [PATCH 31/31] removed print --- relecov_tools/read_bioinfo_metadata.py | 1 - 1 file changed, 1 deletion(-) diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py index db5d98f8..0e0d8020 100755 --- a/relecov_tools/read_bioinfo_metadata.py +++ b/relecov_tools/read_bioinfo_metadata.py @@ -392,7 +392,6 @@ def handling_files(self, file_list, sufix, output_folder, batch_date): import_statement = f"import {utils_name}" exec(import_statement) # Get method name and execute it. - print("lanzar_variants_long_table") data = eval( utils_name + "."