From 0ce5b1f04c3ee27195f00142c168a83e7f2024f1 Mon Sep 17 00:00:00 2001 From: Sergei Date: Wed, 10 Jan 2024 17:28:23 -0500 Subject: [PATCH 1/9] BLAS hooks for Apple --- CMakeLists.txt | 19 +++- .../SelectionAnalyses/FUBAR.bf | 4 +- src/core/matrix.cpp | 51 ++++++++- src/core/matrix_mult.cpp | 9 +- src/core/tree_evaluator.cpp | 101 ++++++++++++++++-- 5 files changed, 169 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 53b09a36c..44eca12e9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.12) project(HyPhy) -cmake_policy(VERSION 3.0.0) +#cmake_policy(VERSION 3.0.0) enable_testing() #enable_language(Fortran) @@ -316,6 +316,11 @@ endif() MESSAGE ("Set compiler flags to ${DEFAULT_COMPILE_FLAGS}") MESSAGE ("Set linker flags to ${DEFAULT_LINK_FLAGS}") +set(BLA_VENDOR Apple) +find_package(BLAS) + + + #------------------------------------------------------------------------------- # OpenMP support #------------------------------------------------------------------------------- @@ -422,6 +427,10 @@ include_directories( contrib/SQLite-3.8.2 # SQLite ) +if (${BLAS_FOUND}) + add_definitions (-D_SLKP_USE_APPLE_BLAS) + set(DEFAULT_LIBRARIES "${DEFAULT_LIBRARIES}") +endif() #------------------------------------------------------------------------------- # shared hyphy hbl stdlib target @@ -442,6 +451,10 @@ else(${OPENMP_FOUND}) target_link_libraries(hyphy PRIVATE ${DEFAULT_LIBRARIES}) endif(${OPENMP_FOUND}) +if(${BLAS_FOUND}) + target_link_libraries(hyphy PRIVATE ${BLAS_LIBRARIES}) +endif () + add_custom_target(MP DEPENDS hyphy) install( @@ -477,6 +490,10 @@ if(${MPI_FOUND}) target_link_libraries(HYPHYMPI ${DEFAULT_LIBRARIES} ${MPI_LIBRARIES}) endif(${OPENMP_FOUND}) + if(${BLAS_FOUND}) + target_link_libraries(hyphy PRIVATE ${BLAS_LIBRARIES}) + endif () + install( TARGETS HYPHYMPI RUNTIME DESTINATION bin diff --git a/res/TemplateBatchFiles/SelectionAnalyses/FUBAR.bf b/res/TemplateBatchFiles/SelectionAnalyses/FUBAR.bf index df9267d3b..d2fb56622 100644 --- a/res/TemplateBatchFiles/SelectionAnalyses/FUBAR.bf +++ b/res/TemplateBatchFiles/SelectionAnalyses/FUBAR.bf @@ -650,12 +650,13 @@ return fubar.json; //---------------------------------------------------------------------------- lfunction fubar.ComputeENFP_CI (p_i,sig_level) { + N = Abs (p_i); - PDF_old = {{1-p_i[0],p_i[0]}}; PDF = PDF_old; for (i = 1; i < N; i+=1) { + PDF = {1,i+2}; PDF[0] = PDF_old[0] * (1-p_i[i]); for (X = 1; X < i+1; X+=1) { @@ -678,6 +679,7 @@ lfunction fubar.ComputeENFP_CI (p_i,sig_level) { sum += PDF[_idx]; } + return {{lb__, _idx__}} } diff --git a/src/core/matrix.cpp b/src/core/matrix.cpp index 64f253b7d..1a9cd2edd 100644 --- a/src/core/matrix.cpp +++ b/src/core/matrix.cpp @@ -94,6 +94,27 @@ _Trie _HY_MatrixRandomValidPDFs; +#ifdef _SLKP_USE_APPLE_BLAS + enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102 }; + enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, + AtlasConj=114}; + extern "C" void cblas_dgemv(const enum CBLAS_ORDER __Order, + const enum CBLAS_TRANSPOSE __TransA, + const int __M, const int __N, + const double __alpha, const double *__A, + const int __lda, const double *__X, const int __incX, + const double __beta, double *__Y, const int __incY); + + extern "C" void cblas_daxpy( + const int N, + const double alpha, + const double * X, + const int incX, + double* Y, + const int incY); + +#endif + //----------------------------------------------------------------------------------------------------------------- //----------------------------------------------------------------------------------------------------------------- //----------------------------------------------------------------------------------------------------------------- @@ -4088,7 +4109,33 @@ void _Matrix::Multiply (_Matrix& storage, _Matrix const& secondArg) const if (compressedIndex) { - +#ifdef _SLKP_USE_APPLE_BLAS_NOT_USED + hyFloat * _hprestrict_ res = storage.theData; + long currentXIndex = 0L; + for (long i = 0; i < 61; i++) { + long up = compressedIndex[i]; + + + if (currentXIndex < up) { + + for (long cxi = currentXIndex; cxi < up; cxi++) { + long currentXColumn = compressedIndex[cxi + 61]; + hyFloat * secArg = secondArg.theData + currentXColumn*61; + hyFloat value = theData[cxi]; + cblas_daxpy(61, + value, + secArg, + 1, + res, + 1); + } + + + } + res += 61; + currentXIndex = up; + } +#else #ifdef _SLKP_USE_ARM_NEON @@ -4251,7 +4298,7 @@ void _Matrix::Multiply (_Matrix& storage, _Matrix const& secondArg) const } #endif - +#endif } else { diff --git a/src/core/matrix_mult.cpp b/src/core/matrix_mult.cpp index ca6064f93..83c2179f4 100644 --- a/src/core/matrix_mult.cpp +++ b/src/core/matrix_mult.cpp @@ -7,7 +7,7 @@ #include -/*#ifdef __USE_APPLE_BLAS__ +#ifdef _SLKP_USE_APPLE_BLAS enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102 }; enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, AtlasConj=114}; @@ -17,7 +17,7 @@ extern "C" void cblas_dgemm(const enum CBLAS_ORDER __Order, const int __K, const double __alpha, const double *__A, const int __lda, const double *__B, const int __ldb, const double __beta, double *__C, const int __ldc); -#endif*/ +#endif //----------------------------------------------------------------------------------------------------------------- //----------------------------------------------------------------------------------------------------------------- @@ -3617,12 +3617,13 @@ void _hy_matrix_multiply_3x4x3 (double *C, double* A, double *B, int stride, boo void _hy_matrix_multiply_NxN_blocked4 (double * C, double *A, double *B, int D) { -/*#ifdef __USE_APPLE_BLAS__ +#ifdef _SLKP_USE_APPLE_BLAS if (D>=16) { + //printf ("HERE\n"); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, D, D, D, 1., A, D, B, D, 0.0, C, D); return; } -#endif*/ +#endif auto offset = [D](int i, int j) -> int { return (i<<2)*D + (j << 2); }; diff --git a/src/core/tree_evaluator.cpp b/src/core/tree_evaluator.cpp index 701b9e6e7..4f78f10ed 100644 --- a/src/core/tree_evaluator.cpp +++ b/src/core/tree_evaluator.cpp @@ -125,6 +125,17 @@ inline double _sse_sum_2 (__m128d const & x) { } #endif +#ifdef _SLKP_USE_APPLE_BLAS_2 +enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102 }; +enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, + AtlasConj=114}; +extern "C" void cblas_dgemv(const enum CBLAS_ORDER __Order, + const enum CBLAS_TRANSPOSE __TransA, + const int __M, const int __N, + const double __alpha, const double *__A, + const int __lda, const double *__X, const int __incX, + const double __beta, double *__Y, const int __incY); +#endif /* template inline void __ll_handle_matrix_transpose (hyFloat const * __restrict transitionMatrix, hyFloat * __restrict tMatrixT) { @@ -3234,7 +3245,22 @@ hyFloat _TheTree::ComputeTreeBlockByBranch ( _SimpleList printf ("\t %d %g %g\n", k, parentConditionals[k], childVector[k]); }*/ - _hy_mvp_blocked_4x4<20> (mvs, tMatrix, childVector); + #ifdef _SLKP_USE_APPLE_BLAS_2 + cblas_dgemv(CblasRowMajor, + CblasNoTrans, + 20, + 20, + 1., + tMatrix, + 20, + childVector, + 1, + 0., + mvs, + 1); + #else + _hy_mvp_blocked_4x4<20> (mvs, tMatrix, childVector); + #endif sum += _hy_vvmult_sum<20> (parentConditionals, mvs); __ll_loop_handle_scaling<20L, true> (sum, parentConditionals, scalingAdjustments, didScale, parentCode, siteCount, siteID, localScalerChange, theFilter->theFrequencies.get (siteOrdering.list_data[siteID])); @@ -3294,8 +3320,23 @@ hyFloat _TheTree::ComputeTreeBlockByBranch ( _SimpleList } }*/ - _hy_mvp_blocked_4x4<61> (mvs, tMatrix, childVector); - sum += _hy_vvmult_sum<61> (parentConditionals, mvs); + #ifdef _SLKP_USE_APPLE_BLAS_2 + cblas_dgemv(CblasRowMajor, + CblasNoTrans, + 61, + 61, + 1., + tMatrix, + 61, + childVector, + 1, + 0., + mvs, + 1); + #else + _hy_mvp_blocked_4x4<61> (mvs, tMatrix, childVector); + #endif + sum += _hy_vvmult_sum<61> (parentConditionals, mvs); //#pragma omp critical @@ -3329,7 +3370,22 @@ hyFloat _TheTree::ComputeTreeBlockByBranch ( _SimpleList - _hy_mvp_blocked_4x4<62> (mvs, tMatrix, childVector); + #ifdef _SLKP_USE_APPLE_BLAS_2 + cblas_dgemv(CblasRowMajor, + CblasNoTrans, + 62, + 62, + 1., + tMatrix, + 62, + childVector, + 1, + 0., + mvs, + 1); + #else + _hy_mvp_blocked_4x4<62> (mvs, tMatrix, childVector); + #endif sum += _hy_vvmult_sum<62> (parentConditionals, mvs); __ll_loop_handle_scaling<62L, true> (sum, parentConditionals, scalingAdjustments, didScale, parentCode, siteCount, siteID, localScalerChange, theFilter->theFrequencies.get (siteOrdering.list_data[siteID])); @@ -3345,7 +3401,23 @@ hyFloat _TheTree::ComputeTreeBlockByBranch ( _SimpleList } - _hy_mvp_blocked_4x4<63> (mvs, tMatrix, childVector); + #ifdef _SLKP_USE_APPLE_BLAS_2 + cblas_dgemv(CblasRowMajor, + CblasNoTrans, + 63, + 63, + 1., + tMatrix, + 63, + childVector, + 1, + 0., + mvs, + 1); + #else + _hy_mvp_blocked_4x4<63> (mvs, tMatrix, childVector); + #endif + sum += _hy_vvmult_sum<63> (parentConditionals, mvs); __ll_loop_handle_scaling<63L, true> (sum, parentConditionals, scalingAdjustments, didScale, parentCode, siteCount, siteID, localScalerChange, theFilter->theFrequencies.get (siteOrdering.list_data[siteID])); @@ -3361,8 +3433,23 @@ hyFloat _TheTree::ComputeTreeBlockByBranch ( _SimpleList lNodeFlags, isLeaf, nodeCode, setBranch, flatTree.lLength, siteID, siteFrom, siteCount, siteOrdering, parentConditionals, tMatrix, lNodeResolutions, childVector, tcc, currentTCCBit, currentTCCIndex, lastUpdatedSite, setBranchTo, alphabetDimension)) { continue; } - - _hy_matrix_vector_product_blocked_4x4 (mvs, tMatrix, childVector, alphabetDimension); + #ifdef _SLKP_USE_APPLE_BLAS_2 + cblas_dgemv(CblasRowMajor, + CblasNoTrans, + alphabetDimension, + alphabetDimension, + 1., + tMatrix, + alphabetDimension, + childVector, + 1, + 0., + mvs, + 1); + #else + _hy_matrix_vector_product_blocked_4x4 (mvs, tMatrix, childVector, alphabetDimension); + #endif + //_hy_mvp_blocked_4x4<16> (mvs, tMatrix, childVector); sum += _hy_vvmult_sum_generic (parentConditionals, mvs, alphabetDimension); //sum += _hy_vvmult_sum<16> (parentConditionals, mvs); From a5531a9dd22d1e455b363bb6c63b9cd5c45a37b2 Mon Sep 17 00:00:00 2001 From: Sergei Date: Fri, 19 Jan 2024 18:10:07 -0500 Subject: [PATCH 2/9] error-filter fix; adding BLAS for OS X --- CMakeLists.txt | 2 +- res/TemplateBatchFiles/SelectionAnalyses/error-filter.bf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 44eca12e9..3812aad9a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -491,7 +491,7 @@ if(${MPI_FOUND}) endif(${OPENMP_FOUND}) if(${BLAS_FOUND}) - target_link_libraries(hyphy PRIVATE ${BLAS_LIBRARIES}) + target_link_libraries(HYPHYMPI ${BLAS_LIBRARIES}) endif () install( diff --git a/res/TemplateBatchFiles/SelectionAnalyses/error-filter.bf b/res/TemplateBatchFiles/SelectionAnalyses/error-filter.bf index 6225f6323..e87e17b1e 100644 --- a/res/TemplateBatchFiles/SelectionAnalyses/error-filter.bf +++ b/res/TemplateBatchFiles/SelectionAnalyses/error-filter.bf @@ -207,7 +207,7 @@ for (p = 0; p < efilter.input[terms.json.partition_count]; p+=1) { efilter.masked_sites [ntm] + (site + efilter.site_offset); } } - if (efilter.verbose_logging) {} + if (efilter.verbose_logging) { console.log ("Masking everything " + site + " " + node + " " + Abs (efilter.leaf_descendants) / Abs (efilter.sequences)); } break; From c0ffe37a30441017918244fd2a3f5735d3b1dfee Mon Sep 17 00:00:00 2001 From: Sergei Date: Thu, 22 Feb 2024 07:47:46 -0500 Subject: [PATCH 3/9] RELAX multifile (and general multi-file support in load_file; some bug fixes for 2.5.60 --- .../SelectionAnalyses/RELAX.bf | 275 +++-- .../modules/io_functions.ibf | 14 + .../modules/shared-load-file.bf | 219 ++-- res/TemplateBatchFiles/libv3/all-terms.bf | 1 + .../libv3/tasks/alignments.bf | 17 +- .../libv3/tasks/estimators.bf | 2 +- res/TemplateBatchFiles/libv3/tasks/mpi.bf | 10 +- src/core/alignment.cpp | 1069 +++++++++++------ src/core/associative_list.cpp | 16 + src/core/batchlan.cpp | 23 +- src/core/batchlanruntime.cpp | 38 +- src/core/global_things.cpp | 2 +- src/core/include/alignment.h | 27 +- src/core/include/associative_list.h | 1 + 14 files changed, 1111 insertions(+), 603 deletions(-) diff --git a/res/TemplateBatchFiles/SelectionAnalyses/RELAX.bf b/res/TemplateBatchFiles/SelectionAnalyses/RELAX.bf index fcb572573..219128111 100644 --- a/res/TemplateBatchFiles/SelectionAnalyses/RELAX.bf +++ b/res/TemplateBatchFiles/SelectionAnalyses/RELAX.bf @@ -44,8 +44,9 @@ relax.analysis_description = { Version 3.1.1 adds some bug fixes for better convergence. Version 4.0 adds support for synonymous rate variation. Version 4.1 adds further support for multiple hit models. - Version 4.1.1 adds reporting for convergence diagnostics", - terms.io.version : "3.1.1", + Version 4.1.1 adds reporting for convergence diagnostics. + Version 4.5 adds support for multiple datasets for joint testing.", + terms.io.version : "4.5", terms.io.reference : "RELAX: Detecting Relaxed Selection in a Phylogenetic Framework (2015). Mol Biol Evol 32 (3): 820-832", terms.io.authors : "Sergei L Kosakovsky Pond, Ben Murrell, Steven Weaver and Temple iGEM / UCSD viral evolution g", terms.io.contact : "spond@temple.edu", @@ -121,16 +122,33 @@ relax.display_orders = {terms.original_name: -1, /*------------------------------------------------------------------------------*/ +KeywordArgument ("multiple-files", "Use multiple files as input", "No"); -KeywordArgument ("code", "Which genetic code should be used", "Universal"); +relax.multiple_files = io.SelectAnOption ({ + {"No", "Load data and trees from a single file"} + {"Yes", "Prompt for a file list and load files (and trees) from it"} + }, "Use multiple files as input") == "Yes"; + + + +if (relax.multiple_files) { + KeywordArgument ("filelist", "A line list of file paths for the alignments to include in this analysis"); + KeywordArgument ("code", "Which genetic code should be used", "Universal"); /** keyword, description (for inline documentation and help messages), default value */ -KeywordArgument ("alignment", "An in-frame codon alignment in one of the formats supported by HyPhy"); +} else { + KeywordArgument ("code", "Which genetic code should be used", "Universal"); /** - keyword, description (for inline documentation and help messages), no default value, - meaning that it will be required + keyword, description (for inline documentation and help messages), default value */ + KeywordArgument ("alignment", "An in-frame codon alignment in one of the formats supported by HyPhy"); + /** + keyword, description (for inline documentation and help messages), no default value, + meaning that it will be required + */ + +} KeywordArgument ("tree", "A phylogenetic tree (optionally annotated with {})", null, "Please select a tree file for the data:"); /** the use of null as the default argument means that the default expectation is for the @@ -140,22 +158,27 @@ KeywordArgument ("tree", "A phylogenetic tree (optionally annotated with {} This allows handling some branching logic conditionals */ + KeywordArgument ("mode", "Run mode", "Classic mode", "Group test mode"); KeywordArgument ("test", "Branches to use as the test set", null, "Choose the set of branches to use as the _test_ set"); KeywordArgument ("reference", "Branches to use as the reference set", null, "Choose the set of branches to use as the _reference_ set"); - io.DisplayAnalysisBanner ( relax.analysis_description ); selection.io.startTimer (relax.json [terms.json.timers], "Overall", 0); namespace relax { LoadFunctionLibrary ("modules/shared-load-file.bf"); - load_file ({utility.getGlobalValue("terms.prefix"): "relax", utility.getGlobalValue("terms.settings") : {utility.getGlobalValue("terms.settings.branch_selector") : "relax.select_branches"}}); + load_file ({ + utility.getGlobalValue("terms.multiple_files") : multiple_files, + utility.getGlobalValue("terms.prefix"): "relax", + utility.getGlobalValue("terms.settings") : {utility.getGlobalValue("terms.settings.branch_selector") : "relax.select_branches"} + }); LoadFunctionLibrary ("modules/grid_compute.ibf"); } KeywordArgument ("output", "Write the resulting JSON to this file (default is to save to the same path as the alignment file + 'RELAX.json')", relax.codon_data_info [terms.json.json]); + relax.codon_data_info [terms.json.json] = io.PromptUserForFilePath ("Save the resulting JSON file to"); KeywordArgument ("grid-size", "The number of points in the initial distributional guess for likelihood fitting", 250); @@ -291,16 +314,40 @@ namespace relax { } +relax.run_full_mg94 = TRUE; + +if (Type (relax.save_intermediate_fits) == "AssociativeList") { + if (None != relax.save_intermediate_fits[^"terms.data.value"]) { + if (utility.Has (relax.save_intermediate_fits[^"terms.data.value"], "Full-MG94", "AssociativeList")) { + relax.final_partitioned_mg_results = (relax.save_intermediate_fits[^"terms.data.value"])["Full-MG94"]; + relax.run_full_mg94 = FALSE; + } + } +} + io.ReportProgressMessageMD ("RELAX", "codon-refit", "Improving branch lengths, nucleotide substitution biases, and global dN/dS ratios under a full codon model"); +if (relax.run_full_mg94) { + relax.final_partitioned_mg_results = estimators.FitMGREV (relax.filter_names, relax.trees, relax.codon_data_info [terms.code], { + terms.run_options.model_type: terms.local, + terms.run_options.partitioned_omega: relax.selected_branches, + }, relax.partitioned_mg_results); + + if (Type (relax.save_intermediate_fits) == "AssociativeList") { + (relax.save_intermediate_fits[^"terms.data.value"])["Full-MG94"] = relax.final_partitioned_mg_results; + Export (lfe, ^relax.final_partitioned_mg_results[^"terms.likelihood_function"]); + (relax.save_intermediate_fits[^"terms.data.value"])["Full-MG94-LF"] = lfe; + io.SpoolJSON (relax.save_intermediate_fits[^"terms.data.value"],relax.save_intermediate_fits[^"terms.data.file"]); + } +} -relax.final_partitioned_mg_results = estimators.FitMGREV (relax.filter_names, relax.trees, relax.codon_data_info [terms.code], { - terms.run_options.model_type: terms.local, - terms.run_options.partitioned_omega: relax.selected_branches, -}, relax.partitioned_mg_results); io.ReportProgressMessageMD("RELAX", "codon-refit", "* " + selection.io.report_fit (relax.final_partitioned_mg_results, 0, relax.codon_data_info[terms.data.sample_size])); +io.ReportProgressMessageMD ("RELAX", "codon-refit", "* " + selection.io.report_fit_secondary_stats (relax.final_partitioned_mg_results)); + +selection.io.report_fit_secondary_stats (relax.final_partitioned_mg_results); + relax.global_dnds = selection.io.extract_global_MLE_re (relax.final_partitioned_mg_results, "^" + terms.parameters.omega_ratio); relax.report_dnds = {}; @@ -319,7 +366,7 @@ selection.io.json_store_lf_withEFV (relax.json, utility.ArrayToDict (utility.Map (relax.global_dnds, "_value_", "{'key': _value_[terms.description], 'value' : Eval({{_value_ [terms.fit.MLE],1}})}")), (relax.final_partitioned_mg_results[terms.efv_estimate])["VALUEINDEXORDER"][0], relax.display_orders[relax.MG94_name]); -//single partition only for relax, but can't hurt . + utility.ForEachPair (relax.filter_specification, "_key_", "_value_", 'selection.io.json_store_branch_attribute(relax.json,relax.MG94_name, terms.branch_length, relax.display_orders[relax.MG94_name], _key_, @@ -465,7 +512,12 @@ if (relax.model_set == "All") { // run all the models io.ReportProgressMessageMD ("RELAX", "gd", "Fitting the general descriptive (separate k per branch) model"); selection.io.startTimer (relax.json [terms.json.timers], "General descriptive model fitting", 2); + //utility.SetEnvVariable ("VERBOSITY_LEVEL",10); + relax.ge_model_map = {}; + for (relax.index, relax.junk ; in; relax.filter_names) { + relax.ge_model_map [relax.index] = {"DEFAULT" : "relax.ge"}; + } if (Type (relax.ge_guess) != "Matrix") { @@ -485,40 +537,45 @@ if (relax.model_set == "All") { // run all the models relax.nm.precision = -0.00025*relax.final_partitioned_mg_results[terms.fit.log_likelihood]; - parameters.DeclareGlobalWithRanges ("relax.bl.scaler", 1, 0, 1000); - for (i, v; in; relax.initial_grid) { - v["relax.bl.scaler"] = {terms.id : "relax.bl.scaler", terms.fit.MLE : Random (2,4)}; - } + relax.nm.bl_scalers = {}; - - relax.grid_search.results = estimators.FitLF (relax.filter_names, relax.trees,{ "0" : {"DEFAULT" : "relax.ge"}}, - relax.final_partitioned_mg_results, - relax.model_object_map, - { - "retain-lf-object": TRUE, - terms.run_options.apply_user_constraints: "relax.init.k", - terms.run_options.proportional_branch_length_scaler : - {"0" : "relax.bl.scaler"}, - - terms.run_options.optimization_settings : - { - "OPTIMIZATION_METHOD" : "nedler-mead", - "MAXIMUM_OPTIMIZATION_ITERATIONS" : 500, - "OPTIMIZATION_PRECISION" : relax.nm.precision - } , - - terms.search_grid : relax.initial_grid, - terms.search_restarts : relax.N.initial_guesses, - terms.run_options.optimization_log : relax.optimization_log_file (".GE-1-log.json") - } + for (relax.index, relax.junk ; in; relax.filter_names) { + relax.scaler.id = "relax.bl.scaler." + relax.index; + relax.nm.bl_scalers[relax.index] = relax.scaler.id ; + parameters.DeclareGlobalWithRanges (relax.scaler.id, 1, 0, 1000); + for (i, v; in; relax.initial_grid) { + v[relax.scaler.id] = {terms.id : relax.scaler.id, terms.fit.MLE : Random (2,4)}; + } + } + + + relax.grid_search.results = estimators.FitLF (relax.filter_names, relax.trees, relax.ge_model_map, + relax.final_partitioned_mg_results, + relax.model_object_map, + { + "retain-lf-object": TRUE, + terms.run_options.apply_user_constraints: "relax.init.k", + terms.run_options.proportional_branch_length_scaler : + relax.nm.bl_scalers, + + terms.run_options.optimization_settings : + { + "OPTIMIZATION_METHOD" : "nedler-mead", + "MAXIMUM_OPTIMIZATION_ITERATIONS" : 500, + "OPTIMIZATION_PRECISION" : relax.nm.precision + } , + + terms.search_grid : relax.initial_grid, + terms.search_restarts : relax.N.initial_guesses, + terms.run_options.optimization_log : relax.optimization_log_file (".GE-1-log.json") + } ); - relax.general_descriptive.fit = estimators.FitLF (relax.filter_names, relax.trees, - { "0" : {"DEFAULT" : "relax.ge"}}, + relax.ge_model_map, relax.grid_search.results, relax.model_object_map, { @@ -533,7 +590,7 @@ if (relax.model_set == "All") { // run all the models parameters.SetStickBreakingDistribution (relax.distribution, relax.ge_guess); relax.general_descriptive.fit = estimators.FitLF (relax.filter_names, relax.trees, - { "0" : {"DEFAULT" : "relax.ge"}}, + relax.ge_model_map, relax.final_partitioned_mg_results, relax.model_object_map, { @@ -543,10 +600,9 @@ if (relax.model_set == "All") { // run all the models }); } - + estimators.TraverseLocalParameters (relax.general_descriptive.fit [terms.likelihood_function], relax.model_object_map, "relax.set.k2"); - relax.general_descriptive.fit = estimators.FitExistingLF (relax.general_descriptive.fit [terms.likelihood_function], relax.model_object_map); @@ -726,7 +782,11 @@ for (relax.k = 0; relax.k < relax.numbers_of_tested_groups; relax.k += 1) { } } -relax.model_map = utility.Map (relax.model_to_group_name, "_groupid_", 'utility.Filter (relax.selected_branches[0], "_branchgroup_", "_branchgroup_ == \'" + _groupid_ + "\'")'); +relax.model_map = {}; +for (relax.index, relax.junk ; in; relax.filter_names) { + relax.model_map [relax.index]= utility.Map (relax.model_to_group_name, "_groupid_", 'utility.Filter (relax.selected_branches[relax.index], "_branchgroup_", "_branchgroup_ == \'" + _groupid_ + "\'")'); +} + // constrain the proportions to be the same @@ -801,7 +861,12 @@ if (relax.has_unclassified) { parameters.SetRange (model.generic.GetGlobalParameter (relax.unclassified.bsrel_model , terms.AddCategory (terms.parameters.omega_ratio,relax.rate_classes)), terms.range_gte1); relax.model_object_map ["relax.unclassified"] = relax.unclassified.bsrel_model; - relax.model_map ["relax.unclassified"] = utility.Filter (relax.selected_branches[0], '_value_', '_value_ == relax.unclassified_branches_name'); + + + for (relax.index, relax.junk ; in; relax.filter_names) { + (relax.model_map[relax.index]) ["relax.unclassified"] = utility.Filter (relax.selected_branches[relax.index], '_value_', '_value_ == relax.unclassified_branches_name'); + } + models.BindGlobalParameters ({"0" : relax.model_object_map[relax.reference_model_namespace], "1" : relax.unclassified.bsrel_model}, terms.nucleotideRate("[ACGT]","[ACGT]")); } @@ -919,20 +984,30 @@ function relax._report_srv (relax_model_fit, is_null) { //------------------------------------ function relax.FitMainTestPair (prompt) { - _varname_ = model.generic.GetGlobalParameter (relax.model_object_map[relax.model_namespaces[1]] , terms.AddCategory (terms.parameters.omega_ratio,1)); + //_varname_ = model.generic.GetGlobalParameter (relax.model_object_map[relax.model_namespaces[1]] , terms.AddCategory (terms.parameters.omega_ratio,1)); - if (relax.do_lhc) { + + if (relax.do_lhc) { relax.nm.precision = -0.00025*relax.final_partitioned_mg_results[terms.fit.log_likelihood]; - parameters.DeclareGlobalWithRanges ("relax.bl.scaler", 1, 0, 1000); + //parameters.DeclareGlobalWithRanges ("relax.bl.scaler", 1, 0, 1000); + relax.main.bl_scalers = {}; + + //utility.SetEnvVariable ("VERBOSITY_LEVEL",10); + + for (relax.index, relax.junk ; in; relax.filter_names) { + relax.scaler.id = "relax.bl.scaler." + relax.index; + relax.main.bl_scalers[relax.index] = relax.scaler.id ; + parameters.DeclareGlobalWithRanges (relax.scaler.id, 1, 0, 1000); + } - relax.general_descriptive.fit = estimators.FitLF (relax.filter_names, relax.trees,{ "0" : relax.model_map}, + relax.general_descriptive.fit = estimators.FitLF (relax.filter_names, relax.trees, relax.model_map, relax.general_descriptive.fit, relax.model_object_map, { "retain-lf-object": TRUE, terms.run_options.proportional_branch_length_scaler : - {"0" : "relax.bl.scaler"}, + relax.main.bl_scalers, terms.run_options.optimization_settings : { @@ -953,7 +1028,7 @@ function relax.FitMainTestPair (prompt) { relax.alternative_model.fit = estimators.FitLF (relax.filter_names, relax.trees, - { "0" : relax.model_map}, + relax.model_map, relax.general_descriptive.fit, relax.model_object_map, { @@ -1015,7 +1090,7 @@ function relax.FitMainTestPair (prompt) { //assert (__SIGTRAP__); - relax.alternative_model.fit.take2 = estimators.FitLF (relax.filter_names, relax.trees, { "0" : relax.model_map}, + relax.alternative_model.fit.take2 = estimators.FitLF (relax.filter_names, relax.trees, relax.model_map, relax.alternative_model.fit , relax.model_object_map, { @@ -1302,7 +1377,7 @@ lfunction relax.set.k (tree_name, node_name, model_description, ignore) { lfunction relax.set.k2 (tree_name, node_name, model_description, ignore) { if (utility.Has (model_description [utility.getGlobalValue ("terms.local")], utility.getGlobalValue ("terms.relax.k"), "String")) { - k = (model_description [utility.getGlobalValue ("terms.local")])[utility.getGlobalValue ("terms.relax.k")]; + k = (model_description [utility.getGlobalValue ("terms.local")])[utility.getGlobalValue ("terms.relax.k")]; parameters.RemoveConstraint (tree_name + "." + node_name + "." + k); } return tree_name + "." + node_name + "." + k; @@ -1460,8 +1535,6 @@ lfunction relax.BS_REL._GenerateRate.MH (fromChar, toChar, namespace, model_type lfunction relax.BS_REL._GenerateRate (fromChar, toChar, namespace, model_type, _tt, alpha, alpha_term, beta, beta_term, omega, omega_term) { - - p = {}; diff = models.codon.diff(fromChar, toChar); @@ -1498,8 +1571,6 @@ lfunction relax.BS_REL._GenerateRate (fromChar, toChar, namespace, model_type, _ } } } - - return p; } @@ -1566,18 +1637,44 @@ lfunction relax.select_branches(partition_info) { kGroupMode = ^"relax.kGroupMode"; - io.CheckAssertion("utility.Array1D (`&partition_info`) == 1", "RELAX only works on a single partition dataset"); + tree_count = utility.Array1D (partition_info); + //io.CheckAssertion(" == 1", "RELAX only works on a single partition dataset"); available_models = {}; - return_set = {}; + return_set = {}; tree_configuration = {}; - tree_for_analysis = (partition_info[0])[utility.getGlobalValue("terms.data.tree")]; - utility.ForEach (tree_for_analysis[utility.getGlobalValue("terms.trees.model_map")], "_value_", "`&available_models`[_value_] += 1"); - list_models = utility.sortStrings(utility.Keys(available_models)); // get keys + for (i,p; in; partition_info) { + tree_for_analysis = p[utility.getGlobalValue("terms.data.tree")]; + tree_configuration [i] = {}; + if (+i == 0) { + for (_value_; in; tree_for_analysis[utility.getGlobalValue("terms.trees.model_map")]) { + available_models [_value_] = 1; + } + } else { + partition_models = {}; + for (_value_; in; tree_for_analysis[utility.getGlobalValue("terms.trees.model_map")]) { + partition_models [_value_] = 1; + } + for (k, v; in; partition_models) { + available_models[k] += 1; + } + } + } + + list_models = {}; + for (i,p; in; available_models) { + if (p == tree_count) { + list_models[i] = 1; + } + } + + + + list_models = utility.sortStrings(utility.Keys(list_models)); // get keys option_count = Abs (available_models); + can_run_group_mode = (option_count == Abs (available_models)); - io.CheckAssertion ("`&option_count` >= 2", "RELAX requires at least one designated set of branches in the tree."); nontrivial_groups = option_count; @@ -1587,25 +1684,26 @@ lfunction relax.select_branches(partition_info) { } run_mode = None; - - - if (nontrivial_groups >= 2) { // could run as a group set + if (nontrivial_groups >= 2 && can_run_group_mode) { // could run as a group set run_mode = io.SelectAnOption ({ {kGroupMode, "Run the test for equality of selective regimes among " + nontrivial_groups + " groups of branches"} {"Classic mode", "Select one test and one reference group of branches, with the rest of the branches treated as unclassified"} }, "Group test mode"); + if (run_mode == kGroupMode) { utility.SetEnvVariable ("relax.numbers_of_tested_groups", nontrivial_groups); - for (_key_, _value_; in; tree_for_analysis[utility.getGlobalValue("terms.trees.model_map")]) { - if ('' == _value_ ) { - tree_configuration[_key_] = utility.getGlobalValue('relax.unclassified_branches_name'); - } else { - tree_configuration[_key_] = _value_; - } - } - + for (i,p; in; partition_info) { + tree_for_analysis = p[utility.getGlobalValue("terms.data.tree")]; + for (_key_, _value_; in; tree_for_analysis[utility.getGlobalValue("terms.trees.model_map")]) { + if ('' == _value_ ) { + (tree_configuration[i])[_key_] = utility.getGlobalValue('relax.unclassified_branches_name'); + } else { + (tree_configuration[i])[_key_] = _value_; + } + } + } utility.SetEnvVariable ("relax.analysis_run_mode", kGroupMode); } } @@ -1645,21 +1743,24 @@ lfunction relax.select_branches(partition_info) { tag_reference = ""; } - utility.ForEachPair (tree_for_analysis[utility.getGlobalValue("terms.trees.model_map")], "_key_", "_value_", " - if (`&tag_test` == _value_ ) { - `&tree_configuration`[_key_] = utility.getGlobalValue('relax.test_branches_name'); - } else { - if (`&tag_reference` == _value_ ) { - `&tree_configuration`[_key_] = utility.getGlobalValue('relax.reference_branches_name'); - } else { - `&tree_configuration`[_key_] = utility.getGlobalValue('relax.unclassified_branches_name'); - } - } - "); + + for (i,p; in; partition_info) { + tree_for_analysis = p[utility.getGlobalValue("terms.data.tree")]; + for (_key_,_value_; in; tree_for_analysis[utility.getGlobalValue("terms.trees.model_map")]) { + if (tag_test == _value_ ) { + (tree_configuration[i])[_key_] = utility.getGlobalValue('relax.test_branches_name'); + } else { + if (tag_reference == _value_ ) { + (tree_configuration[i])[_key_] = utility.getGlobalValue('relax.reference_branches_name'); + } else { + (tree_configuration[i])[_key_] = utility.getGlobalValue('relax.unclassified_branches_name'); + } + } + } + } } - return_set + tree_configuration; - return return_set; + return tree_configuration; } //------------------------------------------------------------------------------ diff --git a/res/TemplateBatchFiles/SelectionAnalyses/modules/io_functions.ibf b/res/TemplateBatchFiles/SelectionAnalyses/modules/io_functions.ibf index 8c7b5d99e..fb07b808d 100644 --- a/res/TemplateBatchFiles/SelectionAnalyses/modules/io_functions.ibf +++ b/res/TemplateBatchFiles/SelectionAnalyses/modules/io_functions.ibf @@ -6,6 +6,20 @@ LoadFunctionLibrary("libv3/convenience/regexp.bf"); LoadFunctionLibrary("libv3/all-terms.bf"); +//------------------------------------------------------------------------------ +lfunction selection.io.SelectAllBranchSets(partition_info) { + + branch_set = {}; + + tree_for_analysis = (partition_info[0])[utility.getGlobalValue("terms.data.tree")]; + + return { + "0" : tree_for_analysis[utility.getGlobalValue("terms.trees.model_map")] + }; + + +} + //------------------------------------------------------------------------------ lfunction selection.io.defineBranchSets(partition_info) { diff --git a/res/TemplateBatchFiles/SelectionAnalyses/modules/shared-load-file.bf b/res/TemplateBatchFiles/SelectionAnalyses/modules/shared-load-file.bf index 43eb665b0..626fbd277 100644 --- a/res/TemplateBatchFiles/SelectionAnalyses/modules/shared-load-file.bf +++ b/res/TemplateBatchFiles/SelectionAnalyses/modules/shared-load-file.bf @@ -115,20 +115,90 @@ function load_nuc_file (prefix) { } +function annotate_codon_info (codon_info, filter_name) { + sample_size=codon_info[utility.getGlobalValue("terms.data.sites")]*codon_info[utility.getGlobalValue("terms.data.sequences")]; + + + codon_info[utility.getGlobalValue("terms.data.sample_size")] = sample_size; + upper_prefix = prefix && 1; //uppercase the prefix for json name + codon_info[utility.getGlobalValue("terms.json.json")] = codon_info[utility.getGlobalValue("terms.data.file")] + "."+upper_prefix+".json"; + + name_mapping = codon_info[utility.getGlobalValue("terms.data.name_mapping")]; + + /** + will contain "mapped" -> "original" associations with sequence names; or null if no mapping was necessary + */ + + if (None == name_mapping) { /** create a 1-1 mapping if nothing was done */ + name_mapping = {}; + for (_value_; in; alignments.GetSequenceNames (filter_name)) { + name_mapping[_value_] = _value_; + } + } + + codon_info[^"terms.data.name_mapping"] = name_mapping; + + // check for duplicates + duplicate_sequences = codon_info[^"terms.data.sequences"] - alignments.HasDuplicateSequences (codon_info[^"terms.data.datafilter"],-1); + if (duplicate_sequences > 0) { + fprintf(stdout, "\n-------\n", io.FormatLongStringToWidth( + ">[WARNING] '`codon_info[utility.getGlobalValue("terms.data.file")]`' contains " + duplicate_sequences + " duplicate " + io.SingularOrPlural (duplicate_sequences, 'sequence', 'sequences') + ". + Identical sequences do not contribute any information to the analysis and only slow down computation. + Please consider removing duplicate or 'nearly' duplicate sequences, + e.g. using https://github.com/veg/hyphy-analyses/tree/master/remove-duplicates + prior to running selection analyses", 72), + "\n-------\n"); + } +} function load_file (prefix) { settings = None; + multiple_files = FALSE; + if (Type (prefix) == "AssociativeList") { - settings = prefix[utility.getGlobalValue("terms.settings")]; - prefix = prefix [utility.getGlobalValue("terms.prefix")]; + multiple_files = prefix [utility.getGlobalValue("terms.multiple_files")]; + settings = prefix[utility.getGlobalValue("terms.settings")]; + prefix = prefix [utility.getGlobalValue("terms.prefix")]; } - - - codon_data_info = alignments.PromptForGeneticCodeAndAlignment(prefix+".codon_data", prefix+".codon_filter"); + if (multiple_files) { + SetDialogPrompt ("Supply a list of files to include in the analysis (one per line)"); + codon_data_info = {}; + fscanf (PROMPT_FOR_FILE, "Lines", file_list_path); + codon_data_info[utility.getGlobalValue("terms.data.file")] = ^"LAST_FILE_PATH"; + file_list = io.validate_a_list_of_files (file_list_path); + file_count = utility.Array1D (file_list); + io.CheckAssertion("`&file_count` > 1", "At least one valid filepath is required"); + partitions_and_trees = {}; + codon_data_info [utility.getGlobalValue("terms.data.sequences")] = {1, file_count}; + codon_data_info [utility.getGlobalValue("terms.data.sites")] = {1, file_count}; + codon_data_info [utility.getGlobalValue("terms.data.sample_size")] = 0; + datasets = {}; + + for (i, filepath; in; file_list) { + datasets[i] = prefix+".codon_data_" + i; + if (+i == 0) { + codon_data_info [filepath] = + alignments.LoadCodonDataFile(datasets[i], prefix+".codon_filter_" + i , alignments.ReadCodonDataSetFromPath(prefix+".codon_data_" + i, filepath)); + (codon_data_info)[^"terms.code"] = (codon_data_info[filepath])[^"terms.code"]; + (codon_data_info)[^"terms.stop_codons"] = (codon_data_info[filepath])[^"terms.stop_codons"]; + } else { + codon_data_info [filepath] = + alignments.LoadCodonDataFile(datasets[i], prefix+".codon_filter_" + i , alignments.ReadCodonDataSetFromPathGivenCode(prefix+".codon_data_" + i, filepath, (codon_data_info)[^"terms.code"] , (codon_data_info)[^"terms.stop_codons"] )); + } + annotate_codon_info ( codon_data_info [filepath], prefix+".codon_filter_" + i); + partitions_and_trees + (trees.LoadAnnotatedTreeTopology.match_partitions ({{"file_" + i ,""}}, (codon_data_info [filepath])[^"terms.data.name_mapping"]))[0]; + (codon_data_info [utility.getGlobalValue("terms.data.sequences")])[+i] = (codon_data_info[filepath]) [utility.getGlobalValue("terms.data.sequences")]; + (codon_data_info [utility.getGlobalValue("terms.data.sites")])[+i] = (codon_data_info[filepath]) [utility.getGlobalValue("terms.data.sites")]; + codon_data_info [utility.getGlobalValue("terms.data.sample_size")] += (codon_data_info[filepath]) [utility.getGlobalValue("terms.data.sample_size")]; + } + + } else { + datasets = prefix+".codon_data"; + codon_data_info = alignments.PromptForGeneticCodeAndAlignment(datasets, prefix+".codon_filter"); /** example output { "sequences": 13, @@ -160,98 +230,71 @@ function load_file (prefix) { } */ - - sample_size=codon_data_info[utility.getGlobalValue("terms.data.sites")]*codon_data_info[utility.getGlobalValue("terms.data.sequences")]; - - - codon_data_info[utility.getGlobalValue("terms.data.sample_size")] = sample_size; - upper_prefix = prefix && 1; //uppercase the prefix for json name - codon_data_info[utility.getGlobalValue("terms.json.json")] = codon_data_info[utility.getGlobalValue("terms.data.file")] + "."+upper_prefix+".json"; - - name_mapping = codon_data_info[utility.getGlobalValue("terms.data.name_mapping")]; - - /** - will contain "mapped" -> "original" associations with sequence names; or null if no mapping was necessary - */ - - if (None == name_mapping) { /** create a 1-1 mapping if nothing was done */ - name_mapping = {}; - utility.ForEach (alignments.GetSequenceNames (prefix+".codon_data"), "_value_", "`&name_mapping`[_value_] = _value_"); - } - - // check for duplicates - duplicate_sequences = codon_data_info[^"terms.data.sequences"] - alignments.HasDuplicateSequences (codon_data_info[^"terms.data.datafilter"],-1); - if (duplicate_sequences > 0) { - fprintf(stdout, "\n-------\n", io.FormatLongStringToWidth( - ">[WARNING] This dataset contains " + duplicate_sequences + " duplicate " + io.SingularOrPlural (duplicate_sequences, 'sequence', 'sequences') + ". - Identical sequences do not contribute any information to the analysis and only slow down computation. - Please consider removing duplicate or 'nearly' duplicate sequences, - e.g. using https://github.com/veg/hyphy-analyses/tree/master/remove-duplicates - prior to running selection analyses", 72), - "\n-------\n"); - } - - utility.SetEnvVariable(utility.getGlobalValue ("terms.trees.data_for_neighbor_joining"), - codon_data_info[utility.getGlobalValue("terms.data.datafilter")]); - - - partitions_and_trees = trees.LoadAnnotatedTreeTopology.match_partitions (codon_data_info[utility.getGlobalValue("terms.data.partitions")], name_mapping); - - - utility.SetEnvVariable(utility.getGlobalValue ("terms.trees.data_for_neighbor_joining"), None); - - /** this will return a dictionary of partition strings and trees; one set per partition, as in - { - "0": { - "name:" ... , - "filter-string": "0-587", - "tree": { - "string": ... - "string_with_lengths": ... - "branch_lengths": { - "AF_231119": 0.00519475, - ... - }, - "annotated_string": ... , - "model_map": { - "AF_231119": "", - ... + annotate_codon_info (codon_data_info, prefix+".codon_filter"); + utility.SetEnvVariable(utility.getGlobalValue ("terms.trees.data_for_neighbor_joining"), + codon_data_info[utility.getGlobalValue("terms.data.datafilter")]); + partitions_and_trees = trees.LoadAnnotatedTreeTopology.match_partitions (codon_data_info[utility.getGlobalValue("terms.data.partitions")], name_mapping); + /** this will return a dictionary of partition strings and trees; one set per partition, as in + { + "0": { + "name:" ... , + "filter-string": "0-587", + "tree": { + "string": ... + "string_with_lengths": ... + "branch_lengths": { + "AF_231119": 0.00519475, + ... + }, + "annotated_string": ... , + "model_map": { + "AF_231119": "", + ... + }, + "partitioned": { + "AF_231119": "leaf", + }, + "model_list": { + { + } + } }, - "partitioned": { - "AF_231119": "leaf", - }, - "model_list": { - { - } - } - }, - ... - */ + ... + */ + + for (_key_, _value_; in; partitions_and_trees) { + (partitions_and_trees[_key_])[utility.getGlobalValue("terms.data.filter_string")] = + selection.io.adjust_partition_string (_value_[utility.getGlobalValue("terms.data.filter_string")], 3*codon_data_info[utility.getGlobalValue("terms.data.sites")]); + } + } + + utility.SetEnvVariable(utility.getGlobalValue ("terms.trees.data_for_neighbor_joining"), None); partition_count = Abs (partitions_and_trees); - - // TODO: DE-HARDCODE "filter-string" - utility.ForEachPair (partitions_and_trees, - "_key_", - "_value_", - '(`&partitions_and_trees`[_key_])[utility.getGlobalValue("terms.data.filter_string")] = selection.io.adjust_partition_string (_value_[utility.getGlobalValue("terms.data.filter_string")], 3*`&codon_data_info`[utility.getGlobalValue("terms.data.sites")])'); - /** - ensure that all partitions fall on codon boundaries if they are contiguous - */ - - io.ReportProgressMessage ("", ">Loaded a multiple sequence alignment with **" + codon_data_info[utility.getGlobalValue("terms.data.sequences")] + "** sequences, **" + codon_data_info[utility.getGlobalValue("terms.data.sites")] + "** codons, and **" + partition_count + "** partitions from \`" + codon_data_info[utility.getGlobalValue("terms.data.file")] + "\`"); + if (multiple_files) { + io.ReportProgressMessage ("", ">Loaded **`partition_count`** alignments from from \`" + codon_data_info[utility.getGlobalValue("terms.data.file")] + "\`"); + for (filepath, fileinfo; in; codon_data_info) { + if (Type (fileinfo) == "AssociativeList") { + if (utility.Has (fileinfo, utility.getGlobalValue("terms.data.sequences"), "Number")) { + io.ReportProgressMessage ("", "**\``filepath`\`** : " + fileinfo[utility.getGlobalValue("terms.data.sequences")] + "** sequences, **" + fileinfo[utility.getGlobalValue("terms.data.sites")] + "** codons"); + } + } + } + + } else { + io.ReportProgressMessage ("", ">Loaded a multiple sequence alignment with **" + codon_data_info[utility.getGlobalValue("terms.data.sequences")] + "** sequences, **" + codon_data_info[utility.getGlobalValue("terms.data.sites")] + "** codons, and **" + partition_count + "** partitions from \`" + codon_data_info[utility.getGlobalValue("terms.data.file")] + "\`"); + } if (utility.Has (settings, utility.getGlobalValue("terms.settings.branch_selector"), "String")) { selected_branches = Call (settings[utility.getGlobalValue("terms.settings.branch_selector")], partitions_and_trees); } else { selected_branches = selection.io.defineBranchSets(partitions_and_trees); } - // Place in own attribute called `tested` - selection.io.json_store_key_value_pair (json, None, utility.getGlobalValue("terms.json.tested"), selected_branches); + selection.io.json_store_key_value_pair (json, None, utility.getGlobalValue("terms.json.tested"), selected_branches); /** this will return a dictionary of selected branches; one set per partition, like in { @@ -281,9 +324,8 @@ function load_file (prefix) { // The trees should go into input as well and they should be w/ their branch lengths but ONLY if they have any. - - - filter_specification = alignments.DefineFiltersForPartitions (partitions_and_trees, "`prefix`.codon_data" , "`prefix`.filter.", codon_data_info); + + filter_specification = alignments.DefineFiltersForPartitions (partitions_and_trees, datasets , "`prefix`.filter.", codon_data_info); /** defines codon filters for each partition, and returns the (codon) sites mapped to each filter { { @@ -459,9 +501,6 @@ function doGTR (prefix) { if (^(prefix + ".selected_branches") / index) { deleted_test = utility.Array1D ((^(prefix + ".selected_branches"))[index]); - //fprintf ("/Users/sergei/Desktop/slac.txt", CLEAR_FILE, "tree = " + tree + ";\n\n"); - //fprintf ("/Users/sergei/Desktop/slac.txt", "lengths = " + (gtr_results[^"terms.branch_length"])[index] + ";\n\n"); - //fprintf ("/Users/sergei/Desktop/slac.txt", "tested = " + (^(prefix + ".selected_branches"))[index] + ";\n\n"); trees[index] = trees.KillZeroBranches (tree, (gtr_results[^"terms.branch_length"])[index], (^(prefix + ".selected_branches"))[index], deleted); } else { trees[index] = trees.KillZeroBranches (tree, (gtr_results[^"terms.branch_length"])[index], null, deleted); diff --git a/res/TemplateBatchFiles/libv3/all-terms.bf b/res/TemplateBatchFiles/libv3/all-terms.bf index 5d8459762..1284b4ba9 100644 --- a/res/TemplateBatchFiles/libv3/all-terms.bf +++ b/res/TemplateBatchFiles/libv3/all-terms.bf @@ -45,6 +45,7 @@ namespace terms{ substitutions = "substitutions"; search_grid = "search_grid"; search_restarts = "search_restarts"; + multiple_files = "multiple files"; parameters = "parameters"; local = "local"; diff --git a/res/TemplateBatchFiles/libv3/tasks/alignments.bf b/res/TemplateBatchFiles/libv3/tasks/alignments.bf index 93840aed2..0d4c0d3a9 100644 --- a/res/TemplateBatchFiles/libv3/tasks/alignments.bf +++ b/res/TemplateBatchFiles/libv3/tasks/alignments.bf @@ -550,12 +550,16 @@ lfunction alignments.DefineFiltersForPartitions(partitions, source_data, prefix, if (utility.CheckKey(data_info, utility.getGlobalValue("terms.code"), "Matrix")) { - for (i = 0; i < part_count; i += 1) { + if (Type (source_data) == "String") { + src = source_data; + } else { + src = source_data[i]; + } this_filter = {}; - DataSetFilter test = CreateFilter( ^ source_data, 1, (partitions[i])[utility.getGlobalValue("terms.data.filter_string")]); + DataSetFilter test = CreateFilter( ^ src, 1, (partitions[i])[utility.getGlobalValue("terms.data.filter_string")]); this_filter[utility.getGlobalValue("terms.data.name")] = prefix + (partitions[i])[utility.getGlobalValue("terms.data.name")]; - DataSetFilter ^ (this_filter[utility.getGlobalValue("terms.data.name")]) = CreateFilter( ^ source_data, 3, (partitions[i])[utility.getGlobalValue("terms.data.filter_string")], , data_info[utility.getGlobalValue("terms.stop_codons")]); + DataSetFilter ^ (this_filter[utility.getGlobalValue("terms.data.name")]) = CreateFilter( ^ src, 3, (partitions[i])[utility.getGlobalValue("terms.data.filter_string")], , data_info[utility.getGlobalValue("terms.stop_codons")]); diff = test.sites - 3 * ^ (this_filter[utility.getGlobalValue("terms.data.name")] + ".sites"); io.CheckAssertion("`&diff` == 0", "Partition " + this_filter[utility.getGlobalValue("terms.data.name")] + " is either has stop codons or is not in frame"); @@ -566,10 +570,15 @@ lfunction alignments.DefineFiltersForPartitions(partitions, source_data, prefix, } else { for (i = 0; i < part_count; i += 1) { + if (Type (source_data) == "String") { + src = source_data; + } else { + src = source_data[i]; + } this_filter = {}; this_filter[utility.getGlobalValue("terms.data.name")] = prefix + (partitions[i])[utility.getGlobalValue("terms.data.name")]; - DataSetFilter ^ (this_filter[utility.getGlobalValue("terms.data.name")]) = CreateFilter( ^ source_data, 1, (partitions[i])[utility.getGlobalValue("terms.data.filter_string")]); + DataSetFilter ^ (this_filter[utility.getGlobalValue("terms.data.name")]) = CreateFilter( ^ src, 1, (partitions[i])[utility.getGlobalValue("terms.data.filter_string")]); this_filter[utility.getGlobalValue("terms.data.coverage")] = ^ (this_filter[utility.getGlobalValue("terms.data.name")] + ".site_map"); filters + this_filter; diff --git a/res/TemplateBatchFiles/libv3/tasks/estimators.bf b/res/TemplateBatchFiles/libv3/tasks/estimators.bf index ce85b9dfe..f0ff55a11 100644 --- a/res/TemplateBatchFiles/libv3/tasks/estimators.bf +++ b/res/TemplateBatchFiles/libv3/tasks/estimators.bf @@ -769,7 +769,6 @@ lfunction estimators.FitLF(data_filter, tree, model_map, initial_values, model_o can_do_restarts = null; - if (utility.Has (run_options, utility.getGlobalValue("terms.search_grid"),"AssociativeList")) { grid_results = mpi.ComputeOnGrid (&likelihoodFunction, run_options [utility.getGlobalValue("terms.search_grid")], "mpi.ComputeOnGrid.SimpleEvaluator", "mpi.ComputeOnGrid.ResultHandler"); if (utility.Has (run_options, utility.getGlobalValue("terms.search_restarts"),"Number")) { @@ -798,6 +797,7 @@ lfunction estimators.FitLF(data_filter, tree, model_map, initial_values, model_o utility.ToggleEnvVariable("PRODUCE_OPTIMIZATION_LOG", 1); } + if (Type (can_do_restarts) == "AssociativeList") { io.ReportProgressBar("", "Working on crude initial optimizations"); bestlog = -1e100; diff --git a/res/TemplateBatchFiles/libv3/tasks/mpi.bf b/res/TemplateBatchFiles/libv3/tasks/mpi.bf index 48825b2aa..c1f5eb384 100644 --- a/res/TemplateBatchFiles/libv3/tasks/mpi.bf +++ b/res/TemplateBatchFiles/libv3/tasks/mpi.bf @@ -307,26 +307,25 @@ namespace mpi { scores = {}; + queue = mpi.CreateQueue ({^"terms.mpi.LikelihoodFunctions": {{lf_id}}, ^"terms.mpi.Headers" : utility.GetListOfLoadedModules ("libv3/")}); - io.ReportProgressBar("", "Computing LF on a grid"); for (i = 1; i < Abs (jobs); i += 1) { - io.ReportProgressBar("", "Computing LF on a grid " + i + "/" + Abs (jobs)); + //io.ReportProgressBar("", "Computing LF on a grid " + i + "/" + Abs (jobs)); mpi.QueueJob (queue, handler, {"0" : lf_id, "1" : jobs [i], "2" : &scores}, callback); } - io.ClearProgressBar(); Call (callback, -1, Call (handler, lf_id, jobs[0], &scores), {"0" : lf_id, "1" : jobs [0], "2" : &scores}); - mpi.QueueComplete (queue); + io.ClearProgressBar(); + return scores; - } //------------------------------------------------------------------------------ @@ -348,6 +347,7 @@ namespace mpi { parameters.SetValues (tasks[task_ids[i]]); LFCompute (^lf_id, ll); results [task_ids[i]] = ll; + io.ReportProgressBar("", "Grid result " + i + " = " + ll + " (best = " + Max (results, 0)[^"terms.data.value"] + ")"); } LFCompute (^lf_id, LF_DONE_COMPUTE); diff --git a/src/core/alignment.cpp b/src/core/alignment.cpp index 17c2358b1..4a1707385 100644 --- a/src/core/alignment.cpp +++ b/src/core/alignment.cpp @@ -40,12 +40,43 @@ #include #include -#include "global_things.h" + +#include +#include + #include "alignment.h" +#define MAX_OP(X,Y) ((X) >= (Y) ? (X) : (Y)) //____________________________________________________________________________________ +/** + The following numerical codes define "moves" available to the algorithm working in codon space for pairwise alignment. + The first (reference( sequence (represented by ROWS in the dynamic programming matrix) is the "blessed" reference sequence, i.e. it's in frame (no stop codons, divisble by 3) + The second (query) sequence (represented by COLUMNS in the dynamic programming matrix) is the one being mapped to the reference, and it admits out-of-frame indels + + The mnemonic for each macro is as follows + HY_XXX_YYY + Len (X) = Len (Y) = the number of characters output in the final alignment + Each '1' in the pattern corresponds to an actual alignment character being consumed from the input, and '0' - to a gap (indel) + + For example, i.f the pattern is + + HY_1101_1111, and the current context of the strings being aligned is + + REF: ... AGTACA ... + QRY: ...AAGTACA... + + then the output will receive + A-GT + AaGT + and the counter will advance 3 characters in the reference and 4 characters in the query + +*/ + + +#define HY_ALIGNMENT_TYPES_COUNT 24 + // match or skip whole codons #define HY_111_111 0 #define HY_111_000 1 @@ -64,6 +95,9 @@ #define HY_111_101 7 #define HY_111_011 8 +/** + all operations consuming 3 chars in the reference and 2 in the query start with index 6 and there are 3 of them +*/ #define HY_3X2_START 6 #define HY_3X2_COUNT 3 @@ -73,10 +107,14 @@ #define HY_1011_1111 11 #define HY_0111_1111 12 +/** + all operations consuming 3 chars in the reference and 4 in the query start with index 9 and there are 4 of them +*/ #define HY_3X4_START 9 #define HY_3X4_COUNT 4 // match 3 in the ref to 5 in the query + #define HY_11100_11111 13 #define HY_11010_11111 14 #define HY_11001_11111 15 @@ -88,16 +126,49 @@ #define HY_01011_11111 21 #define HY_00111_11111 22 +/** + all operations consuming 3 chars in the reference and 5 in the query start with index 13 and there are 10 of them +*/ +#define HY_3X5_START 13 +#define HY_3X5_COUNT 10 + // the local align move (i.e. take a direct shortcut to any internal position in the alignment matrix) + #define HY_LOCAL_ALIGN_SHORTCUT 23 -#define HY_3X5_START 13 -#define HY_3X5_COUNT 10 +//____________________________________________________________________________________ -#define HY_ALIGNMENT_TYPES_COUNT 24 -//____________________________________________________________________________________ +/** + * @name CodonAlignStringsStep + * Perform a single step in the codon alignment algorithm; returns the best operation and updates DP matrices + * + * @param score_matrix an NxN score matrix (N = 65, to accommodate all 64 codons plus an explicit "unresolved" character) + * @param reference the reference string remapped with characters remapped to indices in score_matrix + * @param query the query string remapped with characters remapped to indices in score_matrix + * @param r index in reference (counted out in codons) + * @param q index in query (counted out in nucleotides) + * @param score_cols the number of columns in score_matrix + * @param char_count the number of valid characters + * @param miscall_cost the cost of introducing an out-of-frame indel + * @param open_deletion the cost of opening an indel in the reference + * @param extend_deletion the cost of extending an indel in the reference + * @param open_inserion the cost of opening an indel in the query + * @param extend_inserion the cost of extending an indel in the query + * @param cost_matrix the main DP matrix + * @param cost_stride the number of columns in the DP matrix + * @param insertion_matrix the DP matrix for affine insertions + * @param deletion_matrix the DP matrix for affine deletions + * @param codon3x5 the 3-ref 5-qry scoring matrix + * @param codon3x4 the 3-ref 4-qry scoring matrix + * @param codon3x2 the 3-ref 2-qry scoring matrix + * @param codon3x1 the 3-ref 1-qry scoring matrix + * @param do_local if TRUE, perform a local alignment (no prefix/suffix indel cost) + * + * @return the best scoring alignment operation + + */ long CodonAlignStringsStep( double * const score_matrix , long * const reference @@ -133,19 +204,25 @@ long CodonAlignStringsStep( double * const score_matrix * rpos is the position of r in the reference * do_local is true if we wish to perform a local alignment */ - const long curr = ( r - 0 ) * score_cols + q, // where we currently are - prev = ( r - 1 ) * score_cols + q, // up a codon in the reference + const long curr = ( r ) * score_cols + q, // where we currently are in the DP matrix + prev = curr - score_cols, // upstream a codon in the reference for the DP matrix + // offsets are strides in the corresponding scoring matrices offset3x5 = HY_3X5_COUNT * char_count * char_count * char_count, // both 3x5 and 3x4 are offset3x4 = HY_3X4_COUNT * char_count * char_count * char_count, // full codons offset3x2 = HY_3X2_COUNT * char_count * char_count, offset3x1 = HY_3X1_COUNT * char_count, rpos = r * 3; // the real position in R + // mutable vars long r_codon = -1, q_codon = -1, best_choice = 0, - i, choice, partial_codons[ 10 ]; // we need to multiply by 3 to get the NUC position - // 3x5 codon specifications (negative indices) + //i, + choice, + partial_codons[ 10 ]; + // we need to multiply by 3 to get the NUC position + + // 3x5 codon specifications (inverted indices) static long const codon_spec_3x5[ 10 ][ 3 ] = { { 5, 4, 3 }, // 11100 { 5, 4, 2 }, // 11010 @@ -158,7 +235,7 @@ long CodonAlignStringsStep( double * const score_matrix { 4, 2, 1 }, // 01011 { 3, 2, 1 } // 00111 }; - // 3x4 codon specifications (negative indices) + // 3x4 codon specifications (inverted indices) static long const long codon_spec_3x4[ 4 ][ 3 ] = { { 4, 3, 2 }, // 1110 { 4, 3, 1 }, // 1101 @@ -174,7 +251,7 @@ long CodonAlignStringsStep( double * const score_matrix // store the scores of our choices in choices, // pre-initialize to -infinity - for ( i = 0; i < HY_ALIGNMENT_TYPES_COUNT; i++ ) { + for (long i = 0; i < HY_ALIGNMENT_TYPES_COUNT; i++ ) { choices[ i ] = -INFINITY; } @@ -183,7 +260,7 @@ long CodonAlignStringsStep( double * const score_matrix if ( r >= 1 ) { // if we're doing affine gaps (deletions) if ( deletion_matrix ) { - choices[ HY_111_000 ] = MAX( + choices[ HY_111_000 ] = MAX_OP( score_matrix[ prev ] - open_deletion, deletion_matrix[ prev ] - ( r > 1 ? extend_deletion : open_deletion ) ); @@ -197,7 +274,9 @@ long CodonAlignStringsStep( double * const score_matrix + reference[ rpos - 1 ] ; if ( r_codon < 0 ) { + // anything other than a fully resolved codon gets mapped to the "generic unresolved" character r_codon = cost_stride - 1; + /* 20240219: SLKP optimization note this needs to be adjusted; let's not print stuff to stderr */ fprintf (stderr, "*** NIL CODON *** %ld %ld %ld %ld\n", reference[ rpos - 3 ], reference[ rpos - 2], reference[ rpos - 1], r_codon); } } @@ -206,7 +285,7 @@ long CodonAlignStringsStep( double * const score_matrix if ( q >= 3 ) { // if we're doing affine gaps (insertions) if ( insertion_matrix ) { - choices[ HY_000_111 ] = MAX( + choices[ HY_000_111 ] = MAX_OP( score_matrix[ curr - 3 ] - open_insertion, insertion_matrix[ curr - 3 ] - ( q > 3 ? extend_insertion : open_insertion ) ); @@ -220,19 +299,18 @@ long CodonAlignStringsStep( double * const score_matrix + query[ q - 1 ] ; if ( q_codon < 0 ) { - q_codon = cost_stride - 1; + // anything other than a fully resolved codon gets mapped to the "generic unresolved" character + q_codon = cost_stride - 1; } } // if q_codon and r_codon both exist, set the score equal to match - if ( q_codon >= 0 ) { - if ( r_codon >= 0 ) { - const double move_cost = cost_matrix[ r_codon * cost_stride + q_codon ]; - choices[ HY_111_111 ] = score_matrix[ prev - 3 ] + move_cost; - if (do_local && choices [HY_LOCAL_ALIGN_SHORTCUT] < move_cost) { - local_shortcut_came_from_this_move = HY_111_111; - choices [HY_LOCAL_ALIGN_SHORTCUT] = move_cost; - } + if ( q_codon >= 0 && r_codon >= 0 ) { + const double move_cost = cost_matrix[ r_codon * cost_stride + q_codon ]; + choices[ HY_111_111 ] = score_matrix[ prev - 3 ] + move_cost; + if (do_local && choices [HY_LOCAL_ALIGN_SHORTCUT] < move_cost) { + local_shortcut_came_from_this_move = HY_111_111; + choices [HY_LOCAL_ALIGN_SHORTCUT] = move_cost; } } @@ -247,14 +325,15 @@ long CodonAlignStringsStep( double * const score_matrix // fill in the partial codons table // use a 10x1 array to allow for load hoisting, // we don't want to be bouncing cachelines in this critical inner loop - for ( i = 0; i < HY_3X5_COUNT; ++i ) { + for (long i = 0; i < HY_3X5_COUNT; ++i ) { partial_codons[ i ] = ( query[ q - codon_spec_3x5[ i ][ 0 ] ] * char_count + query[ q - codon_spec_3x5[ i ][ 1 ] ] ) * char_count + query[ q - codon_spec_3x5[ i ][ 2 ] ] ; } // go over each choice, fill it in - for ( i = 0; i < HY_3X5_COUNT; ++i ) { + for (long i = 0; i < HY_3X5_COUNT; ++i ) { if ( partial_codons[ i ] >= 0 ) { + // this partial codon is resolved choice = HY_3X5_START + i; // if we have a double ragged edge, don't penalize if ( ( q == 5 && choice == HY_00111_11111 ) @@ -293,13 +372,13 @@ long CodonAlignStringsStep( double * const score_matrix // 3x4 partial codons if ( q >= 4 ) { // fill in partial codons table - for ( i = 0; i < HY_3X4_COUNT; ++i ) { + for (long i = 0; i < HY_3X4_COUNT; ++i ) { partial_codons[ i ] = ( query[ q - codon_spec_3x4[ i ][ 0 ] ] * char_count + query[ q - codon_spec_3x4[ i ][ 1 ] ] ) * char_count + query[ q - codon_spec_3x4[ i ][ 2 ] ] ; } // fill in choices - for ( i = 0; i < HY_3X4_COUNT; ++i ) { + for (long i = 0; i < HY_3X4_COUNT; ++i ) { if ( partial_codons[ i ] >= 0 ) { choice = HY_3X4_START + i; // if we have a ragged edge, @@ -329,7 +408,7 @@ long CodonAlignStringsStep( double * const score_matrix + query[ q - 1 ] ; // fill in choices if ( partial_codons[ 0 ] >= 0 ) { - for ( i = 0; i < HY_3X2_COUNT; ++i ) { + for (long i = 0; i < HY_3X2_COUNT; ++i ) { choice = HY_3X2_START + i; // if we have a ragged edge at the beginning or end, // respectively, don't penalize it @@ -358,7 +437,7 @@ long CodonAlignStringsStep( double * const score_matrix partial_codons[ 0 ] = query[ q - 1 ]; // fill in choices if ( partial_codons[ 0 ] >= 0 ) { - for ( i = 0; i < HY_3X1_COUNT; ++i ) { + for (long i = 0; i < HY_3X1_COUNT; ++i ) { choice = HY_3X1_START + i; // if we have a double ragged edge, // don't enforce a miscall penalty @@ -389,14 +468,14 @@ long CodonAlignStringsStep( double * const score_matrix // find the best possible choice if (do_local) { - for ( i = 0; i < HY_ALIGNMENT_TYPES_COUNT ; ++i ) { + for (long i = 0; i < HY_ALIGNMENT_TYPES_COUNT ; ++i ) { if ( choices[ i ] > max_score ) { best_choice = i; max_score = choices[ i ]; } } } else { - for ( i = 0; i < HY_ALIGNMENT_TYPES_COUNT - 1 ; ++i ) { + for (long i = 0; i < HY_ALIGNMENT_TYPES_COUNT - 1 ; ++i ) { if ( choices[ i ] > max_score ) { best_choice = i; max_score = choices[ i ]; @@ -418,6 +497,21 @@ long CodonAlignStringsStep( double * const score_matrix //____________________________________________________________________________________ +/** + * @name BacktrackAlign + * Perform a single backtracking step in the dynamic programming matrix for single character alignment (nucleotides or amino-acids) + * @param edit_ops an array of chars (only 3 states used) to record which of the three possible steps is taken + * 0 : match (ref: X, qry : Y) + * -1 : deletion (ref: X, qry : -) + * 1 : insterion (ref: -, qry: Y) + * @param edit ptr: current index into the edit_ops array (gets ++ with each step) + * @param r current character index in the reference + * @param q current character index in the query + * @param deletion cost of the deletion (-1) op + * @param insertion cost of the insertion (-1) op + * @param match cost of the match (0) op + */ + inline void BacktrackAlign( signed char * const edit_ops , long & edit_ptr , long & r @@ -425,8 +519,7 @@ inline void BacktrackAlign( signed char * const edit_ops , double deletion , double insertion , double match - ) -{ + ) { if ( match >= deletion && match >= insertion ) { --r; --q; @@ -442,6 +535,21 @@ inline void BacktrackAlign( signed char * const edit_ops //____________________________________________________________________________________ +/** + * @name BacktrackAlignCodon + * Perform a single backtracking step in the dynamic programming matrix for codon alignment + * @param edit_ops an array of chars (only 3 states used) to record which of the three possible steps is taken + * 0 : match (ref: X, qry : Y) + * -1 : deletion (ref: X, qry : -) + * -2 : deletion with frameshift (ref Xx, qry Y-) + * 1 : insterion (ref: -, qry: Y) + * 2 : deletion with frameshift (ref X-, qry Yy) + * @param edit ptr: current index into the edit_ops array (gets ++ with each step) + * @param r current character index in the reference + * @param q current character index in the query + * @param code operation code; the operation yielding the maximum score for this step +*/ + inline void BacktrackAlignCodon( signed char * const edit_ops , long & edit_ptr , long & r @@ -449,133 +557,146 @@ inline void BacktrackAlignCodon( signed char * const edit_ops , const long code ) { - long r_str[ 5 ] = { 0, 0, 0, 0, 0 }, - q_str[ 5 ] = { 0, 0, 0, 0, 0 }, + /** SLKP 20240419: + Optimization notes (if warranted) + 1. this switch statement needs to be profiled to see if the ordering of the cases + 2. there's also an opportinity to use bitmasks + + */ + switch ( code ) { + // match + case HY_111_111: + r -= 3; q -= 3; + edit_ops [edit_ptr] = 1; + edit_ops [edit_ptr+1] = 1; + edit_ops [edit_ptr+2] = 1; + edit_ptr += 3; + return; + + // deletion + case HY_111_000: + r -= 3; + edit_ops [edit_ptr] = -1; + edit_ops [edit_ptr+1] = -1; + edit_ops [edit_ptr+2] = -1; + edit_ptr += 3; + return; + + // insertion + case HY_000_111: + q -= 3; + edit_ops [edit_ptr] = 1; + edit_ops [edit_ptr+1] = 1; + edit_ops [edit_ptr+2] = 1; + edit_ptr += 3; + return; + } + + unsigned char r_str[ 5 ] = { 1, 1, 1, 1, 1 }, + // which characters are we taking from the reference (up to 5); 1 take, 0 leave + q_str[ 5 ] = { 1, 1, 1, 1, 1 }, + // which characters are we taking from the query (up to 5) idx = 2; + // how many characters are we taking (max over both strings); index so the actual number is +1 + // can be 2,3, or 4 - bool frameshift = true; - switch ( code ) { - // match - case HY_111_111: - r_str[0] = r_str[1] = r_str[2] = 1; - q_str[0] = q_str[1] = q_str[2] = 1; - frameshift = false; - break; - - // deletion - case HY_111_000: - r_str[0] = r_str[1] = r_str[2] = 1; - frameshift = false; - break; - - // insertion - case HY_000_111: - q_str[0] = q_str[1] = q_str[2] = 1; - frameshift = false; - break; - - // 3x2 - case HY_111_110: - r_str[0] = r_str[1] = r_str[2] = 1; - q_str[0] = q_str[1] = 1; - break; - case HY_111_101: - r_str[0] = r_str[1] = r_str[2] = 1; - q_str[0] = q_str[2] = 1; - break; - case HY_111_011: - r_str[0] = r_str[1] = r_str[2] = 1; - q_str[1] = q_str[2] = 1; - break; - - // 3x1 - case HY_111_100: - r_str[0] = r_str[1] = r_str[2] = 1; - q_str[0] = 1; - break; - case HY_111_010: - r_str[0] = r_str[1] = r_str[2] = 1; - q_str[1] = 1; - break; - case HY_111_001: - r_str[0] = r_str[1] = r_str[2] = 1; - q_str[2] = 1; - break; - - // 3x4 - case HY_1110_1111: - q_str[0] = q_str[1] = q_str[2] = q_str[3] = 1; - r_str[0] = r_str[1] = r_str[2] = 1; - idx = 3; - break; - case HY_1101_1111: - q_str[0] = q_str[1] = q_str[2] = q_str[3] = 1; - r_str[0] = r_str[1] = r_str[3] = 1; - idx = 3; - break; - case HY_1011_1111: - q_str[0] = q_str[1] = q_str[2] = q_str[3] = 1; - r_str[0] = r_str[2] = r_str[3] = 1; - idx = 3; - break; - case HY_0111_1111: - q_str[0] = q_str[1] = q_str[2] = q_str[3] = 1; - r_str[1] = r_str[2] = r_str[3] = 1; - idx = 3; - break; - - // 3x5 - case HY_11100_11111: - q_str[0] = q_str[1] = q_str[2] = q_str[3] = q_str[4] = 1; - r_str[0] = r_str[1] = r_str[2] = 1; - idx = 4; - break; - case HY_11010_11111: - q_str[0] = q_str[1] = q_str[2] = q_str[3] = q_str[4] = 1; - r_str[0] = r_str[1] = r_str[3] = 1; - idx = 4; - break; - case HY_11001_11111: - q_str[0] = q_str[1] = q_str[2] = q_str[3] = q_str[4] = 1; - r_str[0] = r_str[1] = r_str[4] = 1; - idx = 4; - break; - case HY_10110_11111: - q_str[0] = q_str[1] = q_str[2] = q_str[3] = q_str[4] = 1; - r_str[0] = r_str[2] = r_str[3] = 1; - idx = 4; - break; - case HY_10101_11111: - q_str[0] = q_str[1] = q_str[2] = q_str[3] = q_str[4] = 1; - r_str[0] = r_str[2] = r_str[4] = 1; - idx = 4; - break; - case HY_10011_11111: - q_str[0] = q_str[1] = q_str[2] = q_str[3] = q_str[4] = 1; - r_str[0] = r_str[3] = r_str[4] = 1; - idx = 4; - break; - case HY_01110_11111: - q_str[0] = q_str[1] = q_str[2] = q_str[3] = q_str[4] = 1; - r_str[1] = r_str[2] = r_str[3] = 1; - idx = 4; - break; - case HY_01101_11111: - q_str[0] = q_str[1] = q_str[2] = q_str[3] = q_str[4] = 1; - r_str[1] = r_str[2] = r_str[4] = 1; - idx = 4; - break; - case HY_01011_11111: - q_str[0] = q_str[1] = q_str[2] = q_str[3] = q_str[4] = 1; - r_str[1] = r_str[3] = r_str[4] = 1; - idx = 4; - break; - case HY_00111_11111: - q_str[0] = q_str[1] = q_str[2] = q_str[3] = q_str[4] = 1; - r_str[2] = r_str[3] = r_str[4] = 1; - idx = 4; - break; + + switch (code) { + // 3x2 + case HY_111_110: + q_str[2] = 0; + break; + case HY_111_101: + q_str[1] = 0; + break; + case HY_111_011: + q_str[0] = 0; + break; + + // 3x1 + case HY_111_100: + q_str[1] = 0; + q_str[2] = 0; + break; + case HY_111_010: + q_str[0] = 0; + q_str[2] = 0; + break; + case HY_111_001: + q_str[0] = 0; + q_str[1] = 0; + break; + + // 3x4 + case HY_1110_1111: + r_str[3] = 0; + idx = 3; + break; + case HY_1101_1111: + r_str[2] = 0; + idx = 3; + break; + case HY_1011_1111: + r_str[1] = 0; + idx = 3; + break; + case HY_0111_1111: + r_str[0] = 0; + idx = 3; + break; + + // 3x5 + case HY_11100_11111: + r_str[3] = 0; + r_str[4] = 0; + idx = 4; + break; + case HY_11010_11111: + r_str[2] = 0; + r_str[4] = 0; + idx = 4; + break; + case HY_11001_11111: + r_str[2] = 0; + r_str[3] = 0; + idx = 4; + break; + case HY_10110_11111: + r_str[1] = 0; + r_str[4] = 0; + idx = 4; + break; + case HY_10101_11111: + r_str[1] = 0; + r_str[3] = 0; + idx = 4; + break; + case HY_10011_11111: + r_str[1] = 0; + r_str[2] = 0; + idx = 4; + break; + case HY_01110_11111: + r_str[0] = 0; + r_str[4] = 0; + idx = 4; + break; + case HY_01101_11111: + r_str[0] = 0; + r_str[3] = 0; + idx = 4; + break; + case HY_01011_11111: + r_str[0] = 0; + r_str[2] = 0; + idx = 4; + break; + case HY_00111_11111: + r_str[0] = 0; + r_str[1] = 0; + idx = 4; + break; } for ( ; idx >= 0 ; --idx ) { @@ -586,15 +707,33 @@ inline void BacktrackAlignCodon( signed char * const edit_ops edit_ops[ edit_ptr++ ] = 0; } else { --r; - edit_ops[ edit_ptr++ ] = -( frameshift ? 2 : 1 ); + edit_ops[ edit_ptr++ ] = -2; } } else { --q; - edit_ops[ edit_ptr++ ] = ( frameshift ? 2 : 1 ); + edit_ops[ edit_ptr++ ] = 2; } } } +//____________________________________________________________________________________ + +/** + * @name MatchScore + * Looks up the score for matching two characters at the current positions in REF and QRY sequences, assuming they are valid + * Increments the score parameter by that value (which of course can also be negative) + * If either character is invalid, the implied score is 0 + * + * @param r_str the reference string + * @param q_str the query string + * @param r index in the reference (1-based) + * @param q index in the query (1-based) + * @param char_map a map from valid character to the encoding used by the score matrix + * @param cost_matrix the pairwise character cost matrix + * @param cost_stride the column dimension of the cost matrix + * @param score the cumulative alignment score (will be modified) + +*/ //____________________________________________________________________________________ @@ -618,6 +757,39 @@ inline void MatchScore( char const * r_str //____________________________________________________________________________________ + +/** + * @name AlignStrings + * Performs a pairwise string alignment; the nature of the alignment algoritm invoked will depend on option flags + * returns the score of the alignment + * r_res and q_res parameters will the receive aligned strings they will be allocated and populated by this function + * + * @param r_str the reference string + * @param q_str the query string + * @param r_res will receive the resulting REFERENCE alignment + * @param q_res will receive the resulting QUERY alignment + * @param char_map a char->long map which takes a character from the input string and maps into in [-1,N-1] where N is the dimension of the scoring matrix; an "indvalid" character is mapped to -1 + * @param cost_matrix an NxN (row-major) matrix of pairwise character scores + * @param cost_stride N (column dimension of cost_matrix) + * @param gap the character to use for an indel (e.g. '-') + * @param open_insertion the cost of opening a gap in the REFERENCE + * @param extend_insertion the cost of extending a gap in the REFERENCE + * @param open_deletion the cost of opening a gap in the QUERY + * @param extend_deletion the cost of extending a gap in the QUERY + * @param miscall_cost the cost of introducing any out-of-frame indels + * @param do_local if TRUE, local alignment will be performed (no prefix or suffix gap cost) + * @param do_affine if TRUE, use the affine gap penalty + * @param do_codon if TRUE, do a codon-aware alignment + * @param char_count number of alphabet (N-1, where N == cost_stride) + * @param codon3x5 (do_codon == TRUE), the cost of aligning codon X with any of the 10 options where 3 refs align to 5 qry + * @param codon3x4 (do_codon == TRUE), the cost of aligning codon X with any of the 4 options where 3 refs align to 4 qry + * @param codon3x2 (do_codon == TRUE), the cost of aligning codon X with any of the 3 options where 3 refs align to 2 qry + * @param codon3x1 (do_codon == TRUE), the cost of aligning codon X with any of the 3 options where 3 refs align to 1 qry + * @param do_true_local if TRUE, perform a true local (best scoring substings) alignment + * @return the alignment score + +*/ + double AlignStrings( char const * r_str , char const * q_str , char * & r_res @@ -645,22 +817,29 @@ double AlignStrings( char const * r_str const unsigned long r_len = strlen( r_str ), q_len = strlen( q_str ), ref_stride = ( do_codon ? 3 : 1 ), + // when doing codon alignments, the algorithm will advance in the reference by full codons score_rows = r_len / ref_stride + 1, + // R = number of rows in the dynamic programming matrix score_cols = q_len + 1; + // C = number of columns in the dynamic programming matrix + // the extra row/column is to accommodate prefix gaps + - long i, j, k, m; + //long i, j, k, m; - double score = 0.; if ( do_codon && ( r_len % 3 != 0 ) ) { - hy_global::HandleApplicationError ( "Reference sequence length not divisible by 3 in AlignStrings (codon mode)" ); return -INFINITY; } - // handle some corner cases, + double score = 0.; + + // handle some edge cases, // return early if possible if ( score_rows <= 1 ) { + // the reference string is EMPTY if ( score_cols > 1 ) { + // the query string is NON-EMPTY r_res = new char[ q_len + 1 ]; q_res = new char[ q_len + 1 ]; // no ref, just query, which remains untouched @@ -670,7 +849,7 @@ double AlignStrings( char const * r_str // null terminate r_res[ q_len ] = '\0'; q_res[ q_len ] = '\0'; - // compute score + // compute the score for this "all-gaps" alignment if ( ! do_local ) { if ( do_affine ) score = -open_insertion - ( q_len - 1 ) * extend_insertion; @@ -679,7 +858,9 @@ double AlignStrings( char const * r_str } } } else { - if ( score_rows <= 1 ) { + // the reference string is NON-EMPTY + if ( score_cols <= 1 ) { + // the query string is EMPTY r_res = new char[ r_len + 1 ]; q_res = new char[ r_len + 1 ]; // no query, just ref, which remains untouched @@ -699,37 +880,56 @@ double AlignStrings( char const * r_str } else { long edit_ptr = 0; // don't forget the optional termination character + signed char * const edit_ops = new signed char[ r_len + q_len ]; + // allocate enough storage to store the best scoring path through the DP matrix double * const score_matrix = new double[ score_rows * score_cols ], * const insertion_matrix = do_affine ? new double[ score_rows * score_cols ] : NULL, * const deletion_matrix = do_affine ? new double[ score_rows * score_cols ] : NULL; + + // allocate the DP matrix and, if using affine gaps, also allocate insertion and deletion DP matrices // encode each string using the character map (char_map) - long * const r_enc = new long[ r_len ], - * const q_enc = new long[ q_len ]; - - // zero manually, memset not guaranteed to work - for ( i = 0; i < score_rows * score_cols; ++i ) - score_matrix[ i ] = 0.; + long * r_enc = NULL, + * q_enc = NULL; + if ( do_codon ) { - for ( i = 0; i < r_len; ++i ) + // encode both strings in [0, charCount], + // i.e. something that can be directly looked up in the scoring matrix + r_enc = new long[ r_len ]; + q_enc = new long[ q_len ]; + + for (long i = 0; i < r_len; ++i ) { r_enc[ i ] = char_map[ (unsigned char) r_str[ i ] ]; - for ( i = 0; i < q_len; ++i ) + } + for (long i = 0; i < q_len; ++i ) { q_enc[ i ] = char_map[ (unsigned char) q_str[ i ] ]; + } } + + memset (score_matrix, 0, sizeof (double) * score_rows * score_cols); + /* + // SLKP 20240219 memset to 0 should work + + for ( i = 0; i < score_rows * score_cols; ++i ) + score_matrix[ i ] = 0.; + */ if ( do_affine ) { - // zero manually, memset not guaranteed to work + /* for ( i = 0; i < score_rows * score_cols; ++i ) { insertion_matrix[ i ] = 0.; deletion_matrix[ i ] = 0.; - } + }*/ + memset (insertion_matrix, 0, sizeof (double) * score_rows * score_cols); + memset (deletion_matrix , 0, sizeof (double) * score_rows * score_cols); } // pre-initialize the values in the various matrices if ( ! do_local ) { + // full global alignment, i.e. indels at the beginning and end ARE penalized double cost; // initialize gap costs in first column and first row // they are 0 for local alignments, so ignore @@ -739,7 +939,7 @@ double AlignStrings( char const * r_str insertion_matrix[ 0 ] = cost; - for ( i = 1; i < score_cols; ++i, cost -= extend_insertion ) { + for (long i = 1; i < score_cols; ++i, cost -= extend_insertion ) { score_matrix[ i ] = cost; insertion_matrix[ i ] = cost; deletion_matrix[ i ] = cost; @@ -750,68 +950,74 @@ double AlignStrings( char const * r_str deletion_matrix[ 0 ] = cost; - for ( i = score_cols; i < score_rows * score_cols; i += score_cols, cost -= extend_deletion ) { + /** 20240219 : SLKP optimization note; may be faster to do 3 loops because of memory locality */ + for (long i = score_cols; i < score_rows * score_cols; i += score_cols, cost -= extend_deletion ) { score_matrix[ i ] = cost; insertion_matrix[ i ] = cost; deletion_matrix[ i ] = cost; } } else { - // handle the do_local, regular (non codon-alignment) case + // no affine gaps if ( ! do_codon ) { cost = -open_insertion; - for ( i = 1; i < score_cols; ++i, cost -= open_insertion ) + for (long i = 1; i < score_cols; ++i, cost -= open_insertion ) score_matrix[ i ] = cost; cost = -open_deletion; - for ( i = score_cols; i < score_rows * score_cols; i += score_cols, cost -= open_deletion ) + for (long i = score_cols; i < score_rows * score_cols; i += score_cols, cost -= open_deletion ) score_matrix[ i ] = cost; // handle the do_local, do_codon case } else { cost = -open_insertion; - for ( i = 1; i < score_cols; ++i, cost -= open_insertion ) + /** 20240219 : SLKP optimization note; surely the next two loops don't need to do integer remainer at each iteration */ + for (long i = 1; i < score_cols; ++i, cost -= open_insertion ) score_matrix[ i ] = cost - ( i % 3 != 1 ? miscall_cost : 0 ); cost = -open_deletion; - for ( i = score_cols, j = 0; i < score_rows * score_cols; i += score_cols, cost -= open_insertion, ++j ) + for (long i = score_cols, j = 0; i < score_rows * score_cols; i += score_cols, cost -= open_insertion, ++j ) score_matrix[ i ] = cost - ( j % 3 != 0 ? miscall_cost : 0 ); } } - // if we're doing a local alignment, - // the costs of opening a deletion or an insertion - // remain the same no matter how far down the ref or query - // we've traveled, respectively - } else { + } else { + // here we're doing a local alignment, + // the costs of opening a deletion or an insertion + // remain the same no matter how far down the ref or query + // we've traveled, respectively + if ( do_affine ) { if ( do_codon ) { + /** 20240219 : SLKP optimization note; surely the next two loops don't need to do integer remainer at each iteration */ + // XXX: should we be including the frameshift penalty here? I think not // fill in the first row of the affine deletion matrix // with the deletion cost plus the miscall penalty - for ( i = 1; i < score_cols; ++i ) + for (long i = 1; i < score_cols; ++i ) deletion_matrix[ i ] = -open_deletion - ( i % 3 != 1 ? miscall_cost : 0 ); // fill in the first column of the affine insertion matrix // with the insertion cost plus the miscall penalty - for ( i = score_cols, j = 0; i < score_rows * score_cols; i += score_cols, ++j ) + for (long i = score_cols, j = 0; i < score_rows * score_cols; i += score_cols, ++j ) insertion_matrix[ i ] = -open_insertion - ( j % 3 != 0 ? miscall_cost : 0 ); } else { // fill in the first row of the affine deletion matrix // with the deletion cost - for ( i = 1; i < score_cols; ++i ) + for (long i = 1; i < score_cols; ++i ) deletion_matrix[ i ] = -open_deletion; // fill in the first column of the affine insertion matrix // with the insertion cost - for ( i = score_cols; i < score_rows * score_cols; i += score_cols ) + for (long i = score_cols; i < score_rows * score_cols; i += score_cols ) insertion_matrix[ i ] = -open_insertion; } } } if ( do_codon ) { - for ( i = 1; i < score_rows; ++i ) - for ( j = 1; j < score_cols; ++j ) + /** populate the dynamic programming matrix here */ + for (long i = 1; i < score_rows; ++i ) + for (long j = 1; j < score_cols; ++j ) CodonAlignStringsStep( score_matrix , r_enc , q_enc @@ -836,11 +1042,13 @@ double AlignStrings( char const * r_str ); // not doing codon alignment } else { - for ( i = 1; i < score_rows; ++i ) { + /** populate the dynamic programming matrix here */ + for (long i = 1; i < score_rows; ++i ) { const long r_char = char_map[ (unsigned char) r_str[ i - 1 ] ]; - for ( j = 1; j < score_cols; ++j ) { - const long curr = ( i - 0 ) * score_cols + j, - prev = ( i - 1 ) * score_cols + j; + for (long j = 1; j < score_cols; ++j ) { + + const long curr = ( i ) * score_cols + j, + prev = ( i - 1 ) * score_cols + j; // ref but not query is deletion // query but not ref is insertion @@ -858,25 +1066,27 @@ double AlignStrings( char const * r_str // if we're doing affine gaps, // look up potential moves in the affine gap matrices + // 20240219: make sure this works as intended + if ( do_affine ) { - deletion = MAX( deletion, + deletion = MAX_OP( deletion, deletion_matrix[ prev ] - ( i > 1 ? extend_deletion : open_deletion ) ), - insertion = MAX( insertion, + insertion = MAX_OP( insertion, insertion_matrix[ curr - 1 ] - ( j > 1 ? extend_insertion : open_insertion ) ), // store the values back in the gap matrices deletion_matrix[ curr ] = deletion; insertion_matrix[ curr ] = insertion; } - score_matrix[ curr ] = MAX( match, MAX( deletion, insertion ) ); + score_matrix[ curr ] = MAX_OP( match, MAX_OP( deletion, insertion ) ); } } } // set these indices to point at the ends // of the ref and query, respectively - i = r_len; - j = q_len; + long index_R = r_len; + long index_Q = q_len; bool took_local_shortcut = false; @@ -888,14 +1098,16 @@ double AlignStrings( char const * r_str // find the best score in the matrix // except for the first row/first column // and start backtracking from there - for (m = 1; m < score_rows; m ++) { - for (k = 1; k < score_cols; k ++) { - if ( score_matrix[ m*score_cols + k ] > score ) { - score = score_matrix[ m*score_cols + k ]; - i = ref_stride * m; - j = k; + const double * score_row = score_matrix + score_cols; + for (long m = 1; m < score_rows; m ++) { + for (long k = 1; k < score_cols; k ++) { + if ( score_row[ k ] > score ) { + score = score_row[ k ]; + index_R = ref_stride * m; + index_Q = k; } } + score_row += score_cols; } } else @@ -905,37 +1117,40 @@ double AlignStrings( char const * r_str if ( do_local ) { // grab the best score from the last column of the score matrix, // skipping the very last entry ( we already checked it ) - for ( k = score_cols - 1; k < score_rows * score_cols - 1; k += score_cols ) + + for (long k = score_cols - 1; k < score_rows * score_cols - 1; k += score_cols ) if ( score_matrix[ k ] > score ) { score = score_matrix[ k ]; // if do_codon, k / score_cols indexes into the codon space // of the reference, which is resolved by multiplication // by ref_stride ( which is 3 ), otherwise this // directly indexes into the reference - i = ref_stride * ( k / score_cols ); + index_R = ref_stride * ( k / score_cols ); } // grab the best score from the last row of the score matrix, // skipping the very last entry ( we already checked it ) - for ( k = ( score_rows - 1 ) * score_cols; k < score_rows * score_cols - 1; ++k ) + for (long k = ( score_rows - 1 ) * score_cols; k < score_rows * score_cols - 1; ++k ) if ( score_matrix[ k ] > score ) { score = score_matrix[ k ]; // if we've found a better score here, // don't forget to reset the ref index - i = r_len; + index_R = r_len; // remove the initial value! - j = k - ( score_rows - 1 ) * score_cols; + index_Q = k - ( score_rows - 1 ) * score_cols; } // fill in the edit_ops with the difference // between r_len and i - for ( k = i; k < r_len; ++k ) + for (long k = index_R; k < r_len; ++k ) { edit_ops[ edit_ptr++ ] = -1; + } // fill in the edit_ops with the difference // between q_len and j - for ( k = j; k < q_len; ++k ) + for (long k = index_R; k < q_len; ++k ) { edit_ops[ edit_ptr++ ] = 1; + } } // backtrack now @@ -956,14 +1171,14 @@ double AlignStrings( char const * r_str if ( do_codon ) { // if either index hits 0, we're done // or if both indices fall below 3, we're done - while ( i && j && ( i >= 3 || j >= 3 ) && !took_local_shortcut ) { + while ( index_R && index_Q && ( index_R >= 3 || index_Q >= 3 ) && !took_local_shortcut ) { // perform a step long code = CodonAlignStringsStep( score_matrix , r_enc , q_enc // divide by 3 to index into codon space - , ( i / 3 ) - , j + , ( index_R / 3 ) + , index_Q , score_cols , char_count , miscall_cost @@ -989,11 +1204,11 @@ double AlignStrings( char const * r_str took_local_shortcut = true; } - BacktrackAlignCodon( edit_ops, edit_ptr, i, j, code ); + BacktrackAlignCodon( edit_ops, edit_ptr, index_R, index_Q, code ); // if anything drops below 0, something bad happened - if ( i < 0 || j < 0 ) { + if ( index_R < 0 || index_Q < 0 ) { delete [] edit_ops; return -INFINITY; } @@ -1001,15 +1216,15 @@ double AlignStrings( char const * r_str // handle the affine cases if ( do_affine ) { // divide by 3 to index into codon space - k = ( i / 3 ) * score_cols + j; + long k = ( index_R / 3 ) * score_cols + index_Q; // reference matched but not query, a deletion if ( code == HY_111_000 ) { // while deletion is preferential to match - while ( i >= 3 + while ( index_R >= 3 && score_matrix[ k ] - open_deletion <= deletion_matrix[ k ] - extend_deletion ) { // take a codon out of the reference - i -= 3; + index_R -= 3; edit_ops[ edit_ptr++ ] = -1; edit_ops[ edit_ptr++ ] = -1; edit_ops[ edit_ptr++ ] = -1; @@ -1020,11 +1235,11 @@ double AlignStrings( char const * r_str // query matched but not reference, insertion } else if ( code == HY_000_111 ) { // while insertion is preferential to match - while ( j >= 3 + while ( index_Q >= 3 && score_matrix[ k ] - open_insertion <= insertion_matrix[ k ] - extend_insertion ) { // take a codon out of the query - j -= 3; + index_Q -= 3; edit_ops[ edit_ptr++ ] = 1; edit_ops[ edit_ptr++ ] = 1; edit_ops[ edit_ptr++ ] = 1; @@ -1037,9 +1252,9 @@ double AlignStrings( char const * r_str } } else { if ( do_affine ) { - while ( i && j ) { - long curr = ( i - 0 ) * score_cols + j, - prev = ( i - 1 ) * score_cols + j, + while ( index_R && index_Q ) { + long curr = ( index_R ) * score_cols + index_Q, + prev = ( index_R - 1 ) * score_cols + index_Q, best_choice = 0; // check the current affine scores and the match score @@ -1049,29 +1264,32 @@ double AlignStrings( char const * r_str score_matrix[ prev - 1 ] }, max_score = scores[ best_choice ]; - MatchScore( r_str, q_str, i, j, char_map, cost_matrix, cost_stride, scores[2] ); + MatchScore( r_str, q_str, index_R, index_Q, char_map, cost_matrix, cost_stride, scores[2] ); // look at choice other than 0 - for ( k = 1; k < 3; ++k ) - if ( scores[ k ] > max_score ) { - max_score = scores[ k ]; - best_choice = k; - } - + if (scores[1] > max_score) { + max_score = scores[1]; + best_choice = 1; + } + if (scores[2] > max_score) { + max_score = scores[2]; + best_choice = 2; + } + switch ( best_choice ) { case 0: // we have at least 1 deletion - --i; + --index_R; edit_ops[ edit_ptr++ ] = -1; // deletion is travel in the reference but not query, // look at scores back in the reference, // and while they are better for the deletion case, // move backwards in the reference - while ( i + while ( index_R && score_matrix[ curr - score_cols ] - open_deletion <= deletion_matrix[ curr - score_cols ] - extend_deletion ) { - --i; + --index_R; edit_ops[ edit_ptr++ ] = -1; curr -= score_cols; } @@ -1079,17 +1297,17 @@ double AlignStrings( char const * r_str case 1: // we have at least 1 insertion - --j; + --index_Q; edit_ops[ edit_ptr++ ] = 1; // insertion is travel in the query but not the reference, // look at scores back in the query, // and while they are better than for the insertion case, // move backwards in the query - while ( j + while ( index_Q && score_matrix[ curr - 1 ] - open_insertion <= insertion_matrix[ curr - 1 ] - extend_insertion ) { - --j; + --index_Q; edit_ops[ edit_ptr++ ] = 1; --curr; } @@ -1097,24 +1315,24 @@ double AlignStrings( char const * r_str case 2: // it's a match! move back in both - --i; - --j; + --index_R; + --index_Q; edit_ops[ edit_ptr++ ] = 0; break; } } // no affine gaps, no codons } else { - while ( i && j ) { - const long curr = ( i - 0 ) * score_cols + j, - prev = ( i - 1 ) * score_cols + j; + while ( index_R && index_Q ) { + const long curr = ( index_R ) * score_cols + index_Q, + prev = ( index_R - 1 ) * score_cols + index_Q; double deletion = score_matrix[ prev ] - open_deletion, insertion = score_matrix[ curr - 1 ] - open_insertion, match = score_matrix[ prev - 1 ]; - MatchScore( r_str, q_str, i, j, char_map, cost_matrix, cost_stride, match ); - BacktrackAlign( edit_ops, edit_ptr, i, j, deletion, insertion, match ); + MatchScore( r_str, q_str, index_R, index_Q, char_map, cost_matrix, cost_stride, match ); + BacktrackAlign( edit_ops, edit_ptr, index_R, index_Q, deletion, insertion, match ); } } } @@ -1124,46 +1342,48 @@ double AlignStrings( char const * r_str // for anything that remains, // don't forget it!!! // reference - while ( --i >= 0 ) + while ( --index_R >= 0 ) edit_ops[ edit_ptr++ ] = -1; // then query - while ( --j >= 0 ) + while ( --index_Q >= 0 ) edit_ops[ edit_ptr++ ] = 1; } if ( edit_ptr > 0 ) { // reset indices to 0 if (!took_local_shortcut){ - i = j = 0; + index_Q = index_R = 0; } // rebuild the strings from the edit_ops // with room for the null terminator r_res = new char[ edit_ptr + 1 ]; q_res = new char[ edit_ptr + 1 ]; - for ( --edit_ptr, k = 0; edit_ptr >= 0; --edit_ptr, ++k ) { + --edit_ptr; + long k = 0; + for (; edit_ptr >= 0; --edit_ptr, ++k ) { switch ( edit_ops[ edit_ptr ] ) { // match! include characters from both strings case 0: - r_res[ k ] = r_str[ i++ ]; - q_res[ k ] = q_str[ j++ ]; + r_res[ k ] = r_str[ index_R++ ]; + q_res[ k ] = q_str[ index_Q++ ]; break; // insertion! case 1: r_res[ k ] = gap; - q_res[ k ] = q_str[ j++ ]; + q_res[ k ] = q_str[ index_Q++ ]; break; case 2: r_res[ k ] = gap; - q_res[ k ] = tolower( q_str[ j++ ] ); + q_res[ k ] = tolower( q_str[ index_Q++ ] ); break; case -1: - r_res[ k ] = r_str[ i++ ]; + r_res[ k ] = r_str[ index_R++ ]; q_res[ k ] = gap; break; case -2: - r_res[ k ] = tolower( r_str[ i++ ] ); + r_res[ k ] = tolower( r_str[ index_R++ ] ); q_res[ k ] = gap; break; } @@ -1197,8 +1417,10 @@ double AlignStrings( char const * r_str delete [] deletion_matrix; } - delete [] r_enc; - delete [] q_enc; + if (do_codon) { + delete [] r_enc; + delete [] q_enc; + } } } @@ -1213,117 +1435,118 @@ double AlignStrings( char const * r_str //____________________________________________________________________________________ -hyFloat CostOnly (_String const * s1, // first string - _String const * s2, // second string - long from1, // start here in string1 - long from2, // start here in string2 - long to1, // up to here in string1 // not inclusive - long to2, // up to here in string2 // not inclusive - bool rev1, // reverse string1 - bool rev2, // reverse string2 - long* cmap, // char -> position in scoring matrix mapper - _Matrix * ccost, // NxN matrix of edit distances on characters - hyFloat gopen, // the cost of opening a gap in sequence 1 - hyFloat gextend, // the cost of extending a gap in sequence 1 (ignored unless doAffine == true) - hyFloat gopen2, // the cost of opening a gap in sequence 2 - hyFloat gextend2, // the cost of opening a gap in sequence 2 (ignored unless doAffine == true) - bool doLocal, // ignore prefix and suffix gaps - bool doAffine, // use affine gap penalties - _Matrix & scoreMatrix, // where to write the last row of the scoring matrix - _Matrix * gapScore1, // where to write the last row of open gap in 1st sequence matrix (ignored unless doAffine == true) - _Matrix * gapScore2, // same but for open gap in 2nd sequence matrix - char secondGap, - char * howAchieved - ) +double CostOnly ( const char * s1, // first string + const char * s2, // second string + const long s1L, // length of S1 + const long s2L, // length of S2 + long from1, // start here in string1 + long from2, // start here in string2 + long to1, // up to here in string1 // not inclusive + long to2, // up to here in string2 // not inclusive + bool rev1, // reverse string1 + bool rev2, // reverse string2 + long* cmap, // char -> position in scoring matrix mapper + double const * ccost, // NxN matrix of edit distances on characters + const long mapL, // dimension of ccost + double gopen, // the cost of opening a gap in sequence 1 + double gextend, // the cost of extending a gap in sequence 1 (ignored unless doAffine == true) + double gopen2, // the cost of opening a gap in sequence 2 + double gextend2, // the cost of opening a gap in sequence 2 (ignored unless doAffine == true) + bool doLocal, // ignore prefix and suffix gaps + bool doAffine, // use affine gap penalties + double * scoreMatrix, // where to write the last row of the scoring matrix + double * gapScore1, // where to write the last row of open gap in 1st sequence matrix (ignored unless doAffine == true) + double * gapScore2, // same but for open gap in 2nd sequence matrix + char secondGap, + char * howAchieved + ) { - hyFloat score = 0.; + double score = 0.; long s1Length = to1-from1, - s2Length = to2-from2; + s2Length = to2-from2; bool doLocal1S = false, - doLocal1E = false, - doLocal2S = false, - doLocal2E = false; + doLocal1E = false, + doLocal2S = false, + doLocal2E = false; if (doLocal) { if (rev1) { - doLocal1S = (to1==s1->length()); + doLocal1S = (to1==s1L); doLocal1E = from1 == 0L; //doLocal1 = (to1==s1->sLength)*_ALIGNMENT_LOCAL_START + (from1 == 0)*_ALIGNMENT_LOCAL_END; } else { - doLocal1E = (to1==s1->length()); + doLocal1E = (to1==s1L); doLocal1S = from1 == 0L; //doLocal1 = (from1==0)*_ALIGNMENT_LOCAL_START + (to1==s1->sLength)*_ALIGNMENT_LOCAL_END; } if (rev2) { doLocal2E = from2 == 0L; - doLocal2S = (to2==s2->length()); + doLocal2S = (to2==s2L); //doLocal2 = (to2==s2->sLength)*_ALIGNMENT_LOCAL_START + (from2 == 0)*_ALIGNMENT_LOCAL_END; } else { doLocal2S = from2 == 0L; - doLocal2E = (to2==s2->length()); + doLocal2E = (to2==s2L); //doLocal2 = (from2==0)*_ALIGNMENT_LOCAL_START + (to2==s2->sLength)*_ALIGNMENT_LOCAL_END; } } - if (s1Length) + if (s1Length) { // first string not empty - { - if (s2Length) + if (s2Length) { // second string not empty - { - hyFloat aux2; - long colCount = s2Length+1; + double aux2; + long colCount = s2Length+1; - scoreMatrix.theData[0] = 0.; + scoreMatrix[0] = 0.; if (doAffine) { - gapScore1->theData[0] = gapScore2->theData[0] = 0.; + gapScore1[0] = gapScore2[0] = 0.; } if (doLocal1S == 0) { - hyFloat cost = -gopen; + double cost = -gopen; if (doAffine) { for (long k=1; k < colCount; k++, cost-=gextend) { - scoreMatrix.theData[k] = cost; - gapScore1->theData [k] = cost; - gapScore2->theData [k] = cost; + scoreMatrix [k] = cost; + gapScore1 [k] = cost; + gapScore2 [k] = cost; } } else for (long m=1; m < colCount; m++, cost-=gopen) { - scoreMatrix.theData[m] = cost; + scoreMatrix [m] = cost; } } else { for (long k=1; k < colCount; k++) { - scoreMatrix.theData[k] = 0.; + scoreMatrix [k] = 0.; } if (doAffine) { for (long k=1; k < colCount; k++) { - gapScore1->theData[k] = 0; - gapScore2->theData[k] = -(secondGap==1?gextend2:gopen2); + gapScore1[k] = 0; + gapScore2[k] = -(secondGap==1?gextend2:gopen2); // prefix gaps in the second sequence } - gapScore1->theData[0] = -gopen; + gapScore1[0] = -gopen; } } - - long mapL = ccost->GetVDim(); // how many valid characters + // long mapL = ccost->GetVDim(); + // how many valid characters if (doAffine) { aux2 = 0.; if (doLocal2S == 0) { - gapScore1->theData[0] = gapScore2->theData[0] = -(secondGap==1?gextend2:gopen2); + gapScore1[0] = gapScore2[0] = -(secondGap==1?gextend2:gopen2); } from2 --; from1 --; for (long r=1; r<=s1Length; r++) { // iterate by rows - long c1 = cmap[s1->get_uchar (rev1?(to1-r):(from1+r))]; + long c1 = cmap[s1[rev1?(to1-r):(from1+r)]]; if (doLocal2S) { aux2 = 0.; @@ -1331,11 +1554,11 @@ hyFloat CostOnly (_String const * s1, // first string if (r>1) { aux2 = -((r-2)*gextend2 + (secondGap==1?gextend2:gopen2)); } - scoreMatrix.theData[0] = -((secondGap==1?gextend2:gopen2) + (r-1)*gextend2); + scoreMatrix[0] = -((secondGap==1?gextend2:gopen2) + (r-1)*gextend2); } for (long c=1; c<=s2Length; c++) { // iterate by columns - hyFloat gscore1 , // gap in 2nd + double gscore1 , // gap in 2nd gscore2 , // gap in 1st gscore3 = aux2, // no gap t; @@ -1345,13 +1568,13 @@ hyFloat CostOnly (_String const * s1, // first string if (doLocal1E && r == s1Length) { //gscore2 = MAX(scoreMatrix.theData[c-1],gapScore1->theData[c-1]); - gscore2 = scoreMatrix.theData[c-1]; - if (gapScore1->theData[c-1] > gscore2) { - gscore2 = gapScore1->theData[c-1]; + gscore2 = scoreMatrix[c-1]; + if (gapScore1[c-1] > gscore2) { + gscore2 = gapScore1[c-1]; } } else { - gscore2 = scoreMatrix.theData[c-1]-gopen; - t = gapScore1->theData[c-1]-((c>1)?gextend:gopen); + gscore2 = scoreMatrix[c-1]-gopen; + t = gapScore1[c-1]-((c>1)?gextend:gopen); if (t > gscore2) { gscore2 = t; } @@ -1359,14 +1582,14 @@ hyFloat CostOnly (_String const * s1, // first string if (doLocal2E && c == s2Length) { //gscore1 = MAX(scoreMatrix.theData[c],gapScore2->theData[c]); - gscore1 = scoreMatrix.theData[c]; - if (gscore1 < gapScore2->theData[c]) { - gscore1 = gapScore2->theData[c]; + gscore1 = scoreMatrix[c]; + if (gscore1 < gapScore2[c]) { + gscore1 = gapScore2[c]; } } else { //gscore1 = MAX(scoreMatrix.theData[c]-gopen2,gapScore2->theData[c]-((r>1)?gextend2:gopen2)); - gscore1 = scoreMatrix.theData[c]-gopen2; - t = gapScore2->theData[c]-((r>1)?gextend2:gopen2); + gscore1 = scoreMatrix [c]-gopen2; + t = gapScore2[c]-((r>1)?gextend2:gopen2); if (t > gscore1) { gscore1 = t; } @@ -1375,14 +1598,14 @@ hyFloat CostOnly (_String const * s1, // first string // if this is the second row, then we start a gap in the second sequence -| if (c1>=0) { - long c2 = cmap[s2->get_uchar (rev2?(to2-c):(from2+c))]; + long c2 = cmap[s2[rev2?(to2-c):(from2+c)]]; if (c2>=0) { - gscore3 += ccost->theData[c1*mapL+c2]; + gscore3 += ccost[c1*mapL+c2]; } } - aux2 = scoreMatrix.theData[c]; + aux2 = scoreMatrix[c]; char option = 0; t = gscore2; @@ -1397,7 +1620,7 @@ hyFloat CostOnly (_String const * s1, // first string option = 2; } } - scoreMatrix.theData[c] = t; + scoreMatrix[c] = t; if (howAchieved) { howAchieved[c] = option; } @@ -1405,36 +1628,35 @@ hyFloat CostOnly (_String const * s1, // first string //if (rev2 && secondGap==2 && c == s2Length) // gscore1 = MAX(scoreMatrix.theData[c]-gextend2,gapScore2->theData[c]-((r>1)?gextend2:gopen2)); - gapScore2->theData [c] = gscore1; - gapScore1->theData [c] = gscore2; + gapScore2 [c] = gscore1; + gapScore1 [c] = gscore2; } if (doLocal2S && r < s1Length) { - gapScore1->theData[0]-=gextend2; - gapScore2->theData[0]-=gextend2; + gapScore1[0] -= gextend2; + gapScore2[0] -= gextend2; } } - } else + } else { // populate the cost matrix row by row - { aux2 = 0.; for (long r=1; r<=s1Length; r++) { if (doLocal2S) { aux2 = 0.; } else { - scoreMatrix.theData[0] = -(gopen2 * r); + scoreMatrix[0] = -(gopen2 * r); if (r>1) { aux2 = -((r-1)*gopen2); } } //printf ("%d: %g\t", r, scoreMatrix.theData[0]); - long c1 = cmap[s1->get_uchar (rev1?(to1-r):(from1+r-1))]; + long c1 = cmap[s1[rev1?(to1-r):(from1+r-1)]]; for (long c=1; c<=s2Length; c++) { - hyFloat score1 = scoreMatrix.theData[c], // gap in 2nd - score2 = scoreMatrix.theData[c-1], // gap in 1st + double score1 = scoreMatrix[c], // gap in 2nd + score2 = scoreMatrix[c-1], // gap in 1st score3 = aux2; if (c < s2Length || doLocal2E == 0) { @@ -1445,22 +1667,22 @@ hyFloat CostOnly (_String const * s1, // first string } if (c1>=0) { - long c2 = cmap[s2->get_uchar (rev2?(to2-c):(from2+c-1))]; + long c2 = cmap[s2[rev2?(to2-c):(from2+c-1)]]; if (c2>=0) { - score3 += ccost->theData[c1*mapL+c2]; + score3 += ccost [c1*mapL+c2]; } } - aux2 = scoreMatrix.theData[c]; + aux2 = scoreMatrix[c]; char option = 0; - scoreMatrix.theData[c] = score1; + scoreMatrix[c] = score1; if (score2 > score1) { - scoreMatrix.theData[c] = score2; + scoreMatrix[c] = score2; option = 1; } - if (score3 > scoreMatrix.theData[c]) { - scoreMatrix.theData[c] = score3; + if (score3 > scoreMatrix[c]) { + scoreMatrix[c] = score3; option = 2; } if (howAchieved) { @@ -1471,7 +1693,7 @@ hyFloat CostOnly (_String const * s1, // first string } } - score = scoreMatrix.theData[s2Length]; + score = scoreMatrix[s2Length]; } else { // 2nd string empty if ((doLocal2S || doLocal2E) == false) { if (doAffine) { @@ -1486,29 +1708,30 @@ hyFloat CostOnly (_String const * s1, // first string if ((doLocal1S || doLocal1E) == false) { score = -gopen; - scoreMatrix.theData[0] = 0.0; + scoreMatrix[0] = 0.0; if (doAffine) { - gapScore1->theData[0] = gapScore2->theData[0] = 0.0; + gapScore1[0] = gapScore2[0] = 0.0; for (long k = 1; k <= s2Length; k++, score-=gextend) { - scoreMatrix.theData[k] = gapScore1->theData[k] = gapScore2->theData[k] = score; + scoreMatrix[k] = gapScore1[k] = gapScore2[k] = score; } score += gextend; } else { for (long k = 1; k <= s2Length; k++, score-=gopen) { - scoreMatrix.theData[k] = score; + scoreMatrix[k] = score; } score += gopen; } } else { for (long k = 0; k <= s2Length; k++) { - scoreMatrix.theData[k] = 0.; + scoreMatrix[k] = 0.; } - if (doAffine) + if (doAffine) { for (long k = 0; k <= s2Length; k++) { - gapScore1->theData[k] = 0.; - gapScore2->theData[k] = 0.; + gapScore1[k] = 0.; + gapScore2[k] = 0.; } + } } } @@ -1518,25 +1741,29 @@ hyFloat CostOnly (_String const * s1, // first string //____________________________________________________________________________________ -hyFloat LinearSpaceAlign (_String const *s1, // first string - _String const *s2, // second string - long * cmap, // char -> position in scoring matrix mapper - _Matrix* ccost, // NxN matrix of edit distances on characters - hyFloat gopen, // the cost of opening a gap in sequence 1 - hyFloat gextend, // the cost of extending a gap in sequence 1 (ignored unless doAffine == true) - hyFloat gopen2, // the cost of opening a gap in sequence 2 - hyFloat gextend2, // the cost of opening a gap in sequence 2 (ignored unless doAffine == true) - bool doLocal, // ignore prefix and suffix gaps - bool doAffine, // use affine gap penalties - _SimpleList& ops, // edit operations for the optimal alignment - hyFloat scoreCheck, // check the score of the alignment - long from1, - long to1, - long from2, - long to2, - _Matrix **buffer, // matrix storage, - char parentGapLink, - char *ha +double LinearSpaceAlign ( const char *s1, // first string + const char *s2, // second string + const long s1L, // length of s1 + const long s2L, // length of s2 + + long * cmap, // char -> position in scoring matrix mapper + double const* ccost, // NxN matrix of edit distances on characters + const long costD, // dimension N in ccost + double gopen, // the cost of opening a gap in sequence 1 + double gextend, // the cost of extending a gap in sequence 1 (ignored unless doAffine == true) + double gopen2, // the cost of opening a gap in sequence 2 + double gextend2, // the cost of opening a gap in sequence 2 (ignored unless doAffine == true) + bool doLocal, // ignore prefix and suffix gaps + bool doAffine, // use affine gap penalties + long* ops, // edit operations for the optimal alignment + double scoreCheck, // check the score of the alignment + long from1, + long to1, + long from2, + long to2, + double **buffer, // matrix storage, + char parentGapLink, + char *ha ) { if (to2 == from2 || to1 == from1) { @@ -1546,24 +1773,77 @@ hyFloat LinearSpaceAlign (_String const *s1, // first stri long midpoint = (from1 + to1)/2, span = to2-from2, span1 = to1-from1; + + /* + hyFloat CostOnly ( const char * s1, // first string + const char * s2, // second string + const long s1L, // length of S1 + const long s2L, // length of S2 + long from1, // start here in string1 + long from2, // start here in string2 + long to1, // up to here in string1 // not inclusive + long to2, // up to here in string2 // not inclusive + bool rev1, // reverse string1 + bool rev2, // reverse string2 + long* cmap, // char -> position in scoring matrix mapper + double const * ccost, // NxN matrix of edit distances on characters + const long mapL, // dimension of ccost + double gopen, // the cost of opening a gap in sequence 1 + double gextend, // the cost of extending a gap in sequence 1 (ignored unless doAffine == true) + double gopen2, // the cost of opening a gap in sequence 2 + double gextend2, // the cost of opening a gap in sequence 2 (ignored unless doAffine == true) + bool doLocal, // ignore prefix and suffix gaps + bool doAffine, // use affine gap penalties + double * scoreMatrix, // where to write the last row of the scoring matrix + double * gapScore1, // where to write the last row of open gap in 1st sequence matrix (ignored unless doAffine == true) + double * gapScore2, // same but for open gap in 2nd sequence matrix + char secondGap, + char * howAchieved + ) + + */ if (span1 > 1) { - CostOnly (s1,s2,from1,from2,midpoint,to2,false,false,cmap,ccost,gopen,gextend,gopen2,gextend2,doLocal,doAffine,*(buffer[0]), buffer[1], buffer[2], parentGapLink>=2, ha); - CostOnly (s1,s2,midpoint,from2,to1,to2,true,true, cmap,ccost,gopen,gextend,gopen2,gextend2,doLocal,doAffine,*(buffer[3]), buffer[4], buffer[5], 2*(parentGapLink%2), ha+s2->length()+1UL); + CostOnly (s1, + s2, + s1L, + s2L, + from1, + from2, + midpoint, + to2, + false, + false, + cmap, + ccost, + costD, + gopen, + gextend, + gopen2, + gextend2, + doLocal, + doAffine, + buffer[0], + buffer[1], + buffer[2], + parentGapLink>=2, + ha); + CostOnly (s1,s2,s1L,s2L, midpoint,from2,to1,to2,true,true, cmap,ccost,costD, gopen,gextend,gopen2,gextend2,doLocal,doAffine,buffer[3], buffer[4], buffer[5], 2*(parentGapLink%2), ha+s2L+1UL); } else { - CostOnly (s1,s2,from1,from2,to1,to2,false,false,cmap,ccost,gopen,gextend,gopen2,gextend2,doLocal,doAffine,*(buffer[0]), buffer[1], buffer[2], (parentGapLink>=2), ha); + CostOnly (s1,s2,s1L,s2L,from1,from2,to1,to2,false,false,cmap,ccost,costD, gopen,gextend,gopen2,gextend2,doLocal,doAffine,buffer[0], buffer[1], buffer[2], (parentGapLink>=2), ha); } - hyFloat maxScore = -1e100; + double maxScore = -1e100; long maxIndex = 0; bool gapLink = false; char alignmentKind = 0; - hyFloat gapOffsetScore = gopen2-gextend2; + double gapOffsetScore = gopen2-gextend2; + if (!doAffine) { if (span1 > 1) { for (long k = 0; k <= span; k++) { - hyFloat currentScore = buffer[0]->theData[k] + buffer[3]->theData[span-k]; + double currentScore = buffer[0][k] + buffer[3][span-k]; if (currentScore > maxScore) { maxScore = currentScore; maxIndex = k; @@ -1571,9 +1851,9 @@ hyFloat LinearSpaceAlign (_String const *s1, // first stri } } else { // handle the case of a single row span correctly for (long k = 0; k <= span; k++) { - hyFloat currentScore = buffer[0]->theData[k]; + double currentScore = buffer[0][k]; - if (!doLocal || to1 != s1->length()) { + if (! doLocal || to1 != s1L ) { currentScore -= gopen*(span-k); } @@ -1590,11 +1870,11 @@ hyFloat LinearSpaceAlign (_String const *s1, // first stri // or gap-to-gap link for (long k = 0; k <= span; k++) { - hyFloat currentScoreNoGap = buffer[0]->theData[k] + buffer[3]->theData[span-k], - currentScoreWithGap2 = buffer[2]->theData[k] + buffer[5]->theData[span-k] + gapOffsetScore; + double currentScoreNoGap = buffer[0][k] + buffer[3][span-k], + currentScoreWithGap2 = buffer[2][k] + buffer[5][span-k] + gapOffsetScore; - if (doAffine && (((from1 == 0 || from2==0) && k == 0) || ((to1 == s1->length() || to2 == s2->length()) && k == span))) { + if (doAffine && (((from1 == 0 || from2==0) && k == 0) || ((to1 == s1L || to2 == s2L) && k == span))) { currentScoreWithGap2 -= gapOffsetScore; } @@ -1617,14 +1897,14 @@ hyFloat LinearSpaceAlign (_String const *s1, // first stri } else { // handle the case of a single row span correctly if (parentGapLink == 1) { maxIndex = span; - maxScore = buffer[2]->theData[span]; + maxScore = buffer[2][span]; alignmentKind = 1; } else { for (long k = 0; k <= span; k++) { - hyFloat currentScoreNoGap = buffer[0]->theData[k], - currentScoreWithGap2 = buffer[2]->theData[k]; + double currentScoreNoGap = buffer[0][k], + currentScoreWithGap2 = buffer[2][k]; - if (!doLocal || to1 != s1->length()) // indel in sequence 1 + if (!doLocal || to1 != s1L) // indel in sequence 1 if (span-k) { currentScoreNoGap -= gopen; currentScoreWithGap2 -= gopen; @@ -1643,6 +1923,7 @@ hyFloat LinearSpaceAlign (_String const *s1, // first stri maxIndex = k; alignmentKind = ha[k]; } + if (currentScoreWithGap2 > maxScore) { maxScore = currentScoreWithGap2; maxIndex = k; @@ -1655,14 +1936,14 @@ hyFloat LinearSpaceAlign (_String const *s1, // first stri if (span1 == 1) { if (alignmentKind == 2) { - ops.list_data[from1+1] = from2+maxIndex-1; + ops[from1+1] = from2+maxIndex-1; } else if (alignmentKind == 0 && maxIndex == 0/*&& to2 == s2->sLength && to1 == s1->sLength*/) { - ops.list_data[from1+1] = -3; + ops[from1+1] = -3; } } else { - hyFloat check1 = buffer[0]->theData[maxIndex], - check2 = buffer[3]->theData[span-maxIndex]; + double check1 = buffer[0][maxIndex], + check2 = buffer[3][span-maxIndex]; if (span1>1) { if (maxIndex > 0) { @@ -1670,10 +1951,10 @@ hyFloat LinearSpaceAlign (_String const *s1, // first stri if (parentGapLink >= 2) { gapCode += 2; } - LinearSpaceAlign (s1,s2,cmap,ccost,gopen,gextend,gopen2,gextend2,doLocal,doAffine,ops,check1, from1, midpoint, from2, from2 + maxIndex, buffer, gapCode, ha); + LinearSpaceAlign (s1,s2,s1L,s2L, cmap,ccost,costD, gopen,gextend,gopen2,gextend2,doLocal,doAffine,ops,check1, from1, midpoint, from2, from2 + maxIndex, buffer, gapCode, ha); } else if (from2 == 0) for (long k = from1; k < midpoint; k++) { - ops.list_data[k+1] = -3; + ops[k+1] = -3; } if (maxIndex < span) { @@ -1681,7 +1962,7 @@ hyFloat LinearSpaceAlign (_String const *s1, // first stri if (parentGapLink % 2 == 1) { gapCode ++; } - LinearSpaceAlign (s1,s2,cmap,ccost,gopen,gextend,gopen2,gextend2,doLocal,doAffine,ops,check2, midpoint, to1, from2 + maxIndex, to2, buffer, gapCode, ha); + LinearSpaceAlign (s1,s2,s1L, s2L, cmap,ccost,costD, gopen,gextend,gopen2,gextend2,doLocal,doAffine,ops,check2, midpoint, to1, from2 + maxIndex, to2, buffer, gapCode, ha); } } } diff --git a/src/core/associative_list.cpp b/src/core/associative_list.cpp index 05ed93a04..6fd5ad757 100644 --- a/src/core/associative_list.cpp +++ b/src/core/associative_list.cpp @@ -597,6 +597,22 @@ _List* _AssociativeList::GetKeys (void) const { } +//__________________________________________________________________________________ +_String* _AssociativeList::GetSmallestNumericalKey (void) const { + + _SimpleList keys; + + for (AVLListXLIteratorKeyValue key_value : AVLListXLIterator (&avl)) { + keys << ((_String*)avl.Retrieve(key_value.get_index()))->to_long(); + } + if (keys.empty()) { + return new _String; + } + keys.Sort(); + return new _String (keys.GetElement(0)); + +} + //_____________________________________________________________________________________________ void _AssociativeList::FillInList (_List& fill_me) { diff --git a/src/core/batchlan.cpp b/src/core/batchlan.cpp index 3c471bf60..66077f008 100644 --- a/src/core/batchlan.cpp +++ b/src/core/batchlan.cpp @@ -1154,7 +1154,9 @@ _String* _ExecutionList::FetchFromStdinRedirect (_String const * dialog_tag, if (user_argument) { // user argument provided user_argument -> AddAReference(); kwarg_used = (_String*)current_tag->GetItem(0); - kwargs->DeleteByKey(*kwarg_used); + if (handle_multi_choice || user_argument->ObjectClass() != ASSOCIATIVE_LIST) { + kwargs->DeleteByKey(*kwarg_used); + } ref_manager < user_argument; } else { // see if there are defaults if (current_tag->countitems() > 2 && ! ignore_kw_defaults) { @@ -1184,6 +1186,25 @@ _String* _ExecutionList::FetchFromStdinRedirect (_String const * dialog_tag, user_argument->AddAReference(); throw (user_argument); } else { + if (user_argument->ObjectClass() == ASSOCIATIVE_LIST) { + _AssociativeList* list_arg = (_AssociativeList*)user_argument; + if (list_arg->countitems() >= 1) { + _String* smallest_key = list_arg->GetSmallestNumericalKey (); + _FString* value = (_FString*)list_arg->GetByKey (*smallest_key); + _String *ret_value = new _String (value->get_str()); + echo_argument (kwarg_used, value->get_str()); + list_arg->DeleteByKey(smallest_key); // this will also delete smallest_key + if (list_arg->countitems() == 0) { + kwargs->DeleteByKey(*kwarg_used); + } else { + currentKwarg --; + } + return ret_value; + } else { + throw _String ("Ran out of multi-choice keyword options"); + } + + } throw _String ("Multi-choice keyword argument not supported in this context"); } } diff --git a/src/core/batchlanruntime.cpp b/src/core/batchlanruntime.cpp index 864c7869c..92e29be79 100644 --- a/src/core/batchlanruntime.cpp +++ b/src/core/batchlanruntime.cpp @@ -1071,11 +1071,15 @@ bool _ElementaryCommand::HandleAlignSequences(_ExecutionList& current_progr if (do_linear) { unsigned long size_allocation = sequence2->length()+1UL; - _Matrix *buffers[6] = {nil}; - + _Matrix *buffers[6] = {nil}; + double *data_buffers[6] = {nil}; + ArrayForEach(buffers, 6, [=] (_Matrix* m, unsigned long) -> _Matrix* { return new _Matrix (size_allocation,1,false,true); }); + ArrayForEach(data_buffers, 6, [=] (double* m, unsigned long i) -> double* { + return buffers[i]->theData; + }); char *alignment_route = new char[2*(size_allocation)] {0}; @@ -1083,10 +1087,28 @@ bool _ElementaryCommand::HandleAlignSequences(_ExecutionList& current_progr ops.list_data[reference_sequence->length()+1] = sequence2->length(); ops.list_data[0] = -1; - score = LinearSpaceAlign(reference_sequence,sequence2,character_map_to_integers,score_matrix, - gap_open,gap_extend,gap_open2,gap_extend2, - do_local,do_affine,ops,score,0, - reference_sequence->length(),0,sequence2->length(),buffers,0,alignment_route); + score = LinearSpaceAlign (reference_sequence->get_str(), + sequence2->get_str(), + reference_sequence->length(), + sequence2->length(), + character_map_to_integers, + score_matrix->theData, + score_matrix->GetVDim(), + gap_open, + gap_extend, + gap_open2, + gap_extend2, + do_local, + do_affine, + ops.list_data, + score, + 0, + reference_sequence->length(), + 0, + sequence2->length(), + data_buffers, + 0, + alignment_route); delete[] alignment_route; @@ -1168,6 +1190,10 @@ bool _ElementaryCommand::HandleAlignSequences(_ExecutionList& current_progr * str2r = nil; + if (do_codon && reference_sequence->length() % 3 != 0) { + hy_global::HandleApplicationError ( "Reference sequence length not divisible by 3 in AlignStrings (codon mode)" ); + } + score = AlignStrings (reference_sequence->get_str() ? reference_sequence->get_str() : "", sequence2->get_str() ? sequence2->get_str() : "", str1r, diff --git a/src/core/global_things.cpp b/src/core/global_things.cpp index 4f2365158..58d6d5911 100644 --- a/src/core/global_things.cpp +++ b/src/core/global_things.cpp @@ -122,7 +122,7 @@ namespace hy_global { kErrorStringMatrixExportError ("Export matrix called with a non-polynomial matrix argument"), kErrorStringNullOperand ("Attempting to operate on an undefined value; this is probably the result of an earlier 'soft' error condition"), kErrorNumerical ("To treat numerical errors as warnings, please specify \"ENV=TOLERATE_NUMERICAL_ERRORS=1;\" as the command line argument. This often resolves the issue, which is indicative of numerical instability."), - kHyPhyVersion = _String ("2.5.59"), + kHyPhyVersion = _String ("2.5.60"), kNoneToken = "None", kNullToken = "null", diff --git a/src/core/include/alignment.h b/src/core/include/alignment.h index 29c1117d3..71feec015 100644 --- a/src/core/include/alignment.h +++ b/src/core/include/alignment.h @@ -40,10 +40,6 @@ #ifndef __ALIGNMENT_HEADER_FILE__ #define __ALIGNMENT_HEADER_FILE__ -#include "baseobj.h" -#include "likefunc.h" -#include "matrix.h" -#include "simplelist.h" double AlignStrings( char const * r_str , char const * q_str @@ -69,23 +65,26 @@ double AlignStrings( char const * r_str , const bool do_true_local = false ); -hyFloat LinearSpaceAlign( _String const * s1 // first string - , _String const * s2 // second string +double LinearSpaceAlign( const char * s1 // first string + , const char * s2 // second string + , const long s1L + , const long s2L , long* cmap // char -> position in scoring matrix mapper - , _Matrix * ccost // NxN matrix of edit distances on characters - , hyFloat gopen // the cost of opening a gap in sequence 1 - , hyFloat gextend // the cost of extending a gap in sequence 1 (ignored unless doAffine == true) - , hyFloat gopen2 // the cost of opening a gap in sequence 2 - , hyFloat gextend2 // the cost of opening a gap in sequence 2 (ignored unless doAffine == true) + , const double * ccost // NxN matrix of edit distances on characters + , const long costD + , double gopen // the cost of opening a gap in sequence 1 + , double gextend // the cost of extending a gap in sequence 1 (ignored unless doAffine == true) + , double gopen2 // the cost of opening a gap in sequence 2 + , double gextend2 // the cost of opening a gap in sequence 2 (ignored unless doAffine == true) , bool doLocal // ignore prefix and suffix gaps , bool doAffine // use affine gap penalties - , _SimpleList & ops // edit operations for the optimal alignment - , hyFloat scoreCheck // check the score of the alignment + , long * ops // edit operations for the optimal alignment + , double scoreCheck // check the score of the alignment , long from1 , long to1 , long from2 , long to2 - , _Matrix ** buffer // matrix storage, + , double ** buffer // matrix storage, , char parentGapLink , char * ha ); diff --git a/src/core/include/associative_list.h b/src/core/include/associative_list.h index 2240a2f91..83de3bc91 100644 --- a/src/core/include/associative_list.h +++ b/src/core/include/associative_list.h @@ -133,6 +133,7 @@ class _AssociativeList: public _MathObject { return ASSOCIATIVE_LIST; } _List* GetKeys (void) const; + _String* GetSmallestNumericalKey (void) const; void FillInList (_List&); unsigned long Length (void) const { return avl.countitems(); From 82e2945d313404d7925cbad35cd2f0988bce9e47 Mon Sep 17 00:00:00 2001 From: Sergei Date: Mon, 26 Feb 2024 22:20:25 -0500 Subject: [PATCH 4/9] Fixing an important bug for sequence-by-sequence simulation, impacting, in particular FEL and MEME resampling --- .../SelectionAnalyses/BUSTED.bf | 27 ++++++++++++------- .../SelectionAnalyses/FEL.bf | 8 +++--- src/core/batchlan.cpp | 5 ++++ src/core/matrix.cpp | 9 ++++--- 4 files changed, 32 insertions(+), 17 deletions(-) diff --git a/res/TemplateBatchFiles/SelectionAnalyses/BUSTED.bf b/res/TemplateBatchFiles/SelectionAnalyses/BUSTED.bf index 8b571766c..e711e11c1 100644 --- a/res/TemplateBatchFiles/SelectionAnalyses/BUSTED.bf +++ b/res/TemplateBatchFiles/SelectionAnalyses/BUSTED.bf @@ -1,4 +1,4 @@ -RequireVersion ("2.5.38"); +RequireVersion ("2.5.51"); LoadFunctionLibrary("libv3/all-terms.bf"); // must be loaded before CF3x4 @@ -22,17 +22,22 @@ utility.SetEnvVariable ("NORMALIZE_SEQUENCE_NAMES", TRUE); busted.analysis_description = { - terms.io.info : -"BUSTED (branch-site unrestricted statistical test of episodic diversification) uses a random effects branch-site model fitted -jointly to all or a subset of tree branches in order to test for alignment-wide evidence of episodic diversifying selection. -Assuming there is evidence of positive selection (i.e. there is an omega > 1), BUSTED will also perform a quick evidence-ratio -style analysis to explore which individual sites may have been subject to selection. v2.0 adds support for synonymous rate variation, -and relaxes the test statistic to 0.5 (chi^2_0 + chi^2_2). Version 2.1 adds a grid search for the initial starting point. -Version 2.2 changes the grid search to LHC, and adds an initial search phase to use adaptive Nedler-Mead. Version 3.0 implements the option -for branch-site variation in synonymous substitution rates. Version 3.1 adds HMM auto-correlation option for SRV, and binds SRV distributions for multiple branch sets. + + terms.io.info : "BUSTED (branch-site unrestricted statistical test of episodic diversification) uses a random effects branch-site model fitted jointly to all or a subset of tree branches in order to test for alignment-wide evidence of episodic diversifying selection. Assuming there is evidence of positive selection (i.e. there is an omega > 1), BUSTED will also perform a quick evidence-ratio style analysis to explore which individual sites may have been subject to selection. v2.0 adds support for synonymous rate variation, and relaxes the test statistic to 0.5 (chi^2_0 + chi^2_2). + +Version 2.1 adds a grid search for the initial starting point. + +Version 2.2 changes the grid search to LHC, and adds an initial search phase to use adaptive Nedler-Mead. + +Version 3.0 implements the option for branch-site variation in synonymous substitution rates. + +Version 3.1 adds HMM auto-correlation option for SRV, and binds SRV distributions for multiple branch sets. + Version 4.0 adds support for multiple hits (MH), ancestral state reconstruction saved to JSON, and profiling of branch-site level support for selection / multiple hits. + Version 4.2 adds calculation of MH-attributable fractions of substitutions. -Version 4.5 adds an "error absorption" component [experimental] + +Version 4.5 adds an 'error absorption' component [experimental] ", terms.io.version : "4.5", terms.io.reference : "*Gene-wide identification of episodic selection*, Mol Biol Evol. 32(5):1365-71, *Synonymous Site-to-Site Substitution Rate Variation Dramatically Inflates False Positive Rates of Selection Analyses: Ignore at Your Own Peril*, Mol Biol Evol. 37(8):2430-2439", @@ -42,6 +47,8 @@ Version 4.5 adds an "error absorption" component [experimental] terms.io.requirements : "in-frame codon alignment and a phylogenetic tree (optionally annotated with {})" }; + +console.log (busted.analysis_description); io.DisplayAnalysisBanner (busted.analysis_description); busted.FG = "Test"; diff --git a/res/TemplateBatchFiles/SelectionAnalyses/FEL.bf b/res/TemplateBatchFiles/SelectionAnalyses/FEL.bf index 6b8ac00cf..e3ed69629 100644 --- a/res/TemplateBatchFiles/SelectionAnalyses/FEL.bf +++ b/res/TemplateBatchFiles/SelectionAnalyses/FEL.bf @@ -736,7 +736,6 @@ lfunction fel.handle_a_site (lf, filter_data, partition_index, pattern_info, mod } ); - if (^"fel.ci") { if (!sim_mode) { @@ -761,6 +760,8 @@ lfunction fel.handle_a_site (lf, filter_data, partition_index, pattern_info, mod ci = None; } + + if (sim_mode) { lrt = 2*results[1][0]; } else { @@ -777,9 +778,10 @@ lfunction fel.handle_a_site (lf, filter_data, partition_index, pattern_info, mod ^"fel.alpha_scaler" = (^"fel.alpha_scaler" + 3*^"fel.beta_scaler_test")/4; parameters.SetConstraint ("fel.beta_scaler_test","fel.alpha_scaler", ""); - + Optimize (results, ^lf); + if (sim_mode) { return lrt - 2*results[1][0]; } else { @@ -792,11 +794,11 @@ lfunction fel.handle_a_site (lf, filter_data, partition_index, pattern_info, mod tree_name = (lfInfo["Trees"])[0]; sum = (BranchLength (^tree_name, -1)*^"fel.selected_branches_index")[0]; if (^"fel.resample") { + N = ^"fel.resample"; sims = {}; GetDataInfo (fi, ^((lfInfo["Datafilters"])[0]), "PARAMETERS"); //Export (lfe, ^lf); - //console.log (lfe); for (i = 0; i < N; i+=1) { DataSet null_sim = SimulateDataSet (^lf); DataSetFilter null_filter = CreateFilter (null_sim,3,,,fi["EXCLUSIONS"]); diff --git a/src/core/batchlan.cpp b/src/core/batchlan.cpp index 66077f008..2a114fb91 100644 --- a/src/core/batchlan.cpp +++ b/src/core/batchlan.cpp @@ -2263,6 +2263,7 @@ BaseRef _ElementaryCommand::toStr (unsigned long) { auto assignment = [&] (long i, const _String& call) -> _String const { return _StringBuffer (_HY_ValidHBLExpressions.RetrieveKeyByPayload(i)) + << " " << parameter_to_string (0) << " = " << call @@ -2359,6 +2360,10 @@ BaseRef _ElementaryCommand::toStr (unsigned long) { } break; + case 12: { //simulate data + (*string_form) << assignment (HY_HBL_COMMAND_SIMULATE_DATA_SET, kEmptyString); + } + break; case 13: { // a function (*string_form) << "function " diff --git a/src/core/matrix.cpp b/src/core/matrix.cpp index 1a9cd2edd..5fa1a3730 100644 --- a/src/core/matrix.cpp +++ b/src/core/matrix.cpp @@ -1530,6 +1530,7 @@ HBLObjectRef _Matrix::MultByFreqs (long freqID, bool reuse_value_object) { if (theIndex) { _Matrix* vm = (_Matrix*) value; hyFloat * __restrict dp = vm ->theData; + const long *__restrict ip = vm->theIndex; if (vm->compressedIndex) { //vm->_validateCompressedStorage(); @@ -1573,8 +1574,8 @@ HBLObjectRef _Matrix::MultByFreqs (long freqID, bool reuse_value_object) { InitializeArray(tempDiags, hDim, 0.0); if (freq_matrix) { - for (long i=0; ilDim; i++) { + long p = ip[i]; if (p != -1) { long h = p / vDim; p = p - h*vDim; @@ -1585,8 +1586,8 @@ HBLObjectRef _Matrix::MultByFreqs (long freqID, bool reuse_value_object) { } } else { - for (long i=0; ilDim; i++) { + long p = ip[i]; if (p != -1) { long h = p / vDim; p = p - h*vDim; From ed17e64ace4b1efd834bbf71a46fa04068a247f5 Mon Sep 17 00:00:00 2001 From: Sergei Date: Thu, 29 Feb 2024 21:11:50 -0500 Subject: [PATCH 5/9] 2.5.60 RC --- .../SelectionAnalyses/BUSTED.bf | 103 ++++++++++++++---- .../SelectionAnalyses/RELAX.bf | 78 ++++++++++++- res/TemplateBatchFiles/libv3/all-terms.bf | 3 +- .../libv3/models/codon/MG_REV.bf | 4 +- .../libv3/models/model_functions.bf | 45 +++++++- .../libv3/models/parameters.bf | 24 ++++ .../libv3/tasks/estimators.bf | 33 ++++-- res/TemplateBatchFiles/libv3/tasks/mpi.bf | 66 +++++++++++ src/core/batchlan.cpp | 36 +++--- src/core/batchlan2.cpp | 6 + src/core/batchlanruntime.cpp | 74 ++++++++++++- src/core/include/batchlan.h | 1 + src/core/include/defines.h | 1 + 13 files changed, 416 insertions(+), 58 deletions(-) diff --git a/res/TemplateBatchFiles/SelectionAnalyses/BUSTED.bf b/res/TemplateBatchFiles/SelectionAnalyses/BUSTED.bf index e711e11c1..c54a0e858 100644 --- a/res/TemplateBatchFiles/SelectionAnalyses/BUSTED.bf +++ b/res/TemplateBatchFiles/SelectionAnalyses/BUSTED.bf @@ -555,8 +555,8 @@ for (busted.partition_index = 0; busted.partition_index < busted.partition_count busted.global_scaler_list [busted.partition_index] = "busted.bl.scaler_" + busted.partition_index; parameters.DeclareGlobalWithRanges (busted.global_scaler_list [busted.partition_index], 3, 0, 1000); busted.initial_ranges [busted.global_scaler_list [busted.partition_index]] = { - terms.lower_bound : 2.5, - terms.upper_bound : 3.5 + terms.lower_bound : 2.0, + terms.upper_bound : 4.0 }; } @@ -569,7 +569,7 @@ busted.initial_grid = estimators.LHC (busted.initial_ranges,busted.initi //estimators.CreateInitialGrid (busted.initial_grid, busted.initial_grid.N, busted.initial_grid_presets); busted.initial_grid = utility.Map (busted.initial_grid, "_v_", - 'busted._renormalize (_v_, "busted.distribution", busted.initial.test_mean, busted.error_sink)' + 'busted._renormalize_with_weights (_v_, "busted.distribution", busted.initial.test_mean, busted.error_sink)' ); if (busted.has_background) { //GDD rate category @@ -611,24 +611,29 @@ debug.checkpoint = utility.GetEnvVariable ("DEBUG_CHECKPOINT"); if (Type (debug.checkpoint) != "String") { // constrain nucleotide rate parameters - busted.tmp_fixed = models.FixParameterSetRegExp (terms.nucleotideRatePrefix,busted.test.bsrel_model); + // copy global nucleotide parameter estimates to busted.test.bsrel_model) + + + busted.tmp_fixed = models.FixParameterSetRegExpFromReference (terms.nucleotideRatePrefix,busted.test.bsrel_model, busted.final_partitioned_mg_results[terms.global]); + + busted.grid_search.results = estimators.FitLF (busted.filter_names, busted.trees, busted.model_map, busted.final_partitioned_mg_results, busted.model_object_map, + { + terms.run_options.retain_lf_object: TRUE, + terms.run_options.proportional_branch_length_scaler : + busted.global_scaler_list, + + terms.run_options.optimization_settings : + { + "OPTIMIZATION_METHOD" : "nedler-mead", + "MAXIMUM_OPTIMIZATION_ITERATIONS" : 500, + "OPTIMIZATION_PRECISION" : busted.nm.precision + } , + + terms.search_grid : busted.initial_grid, + terms.search_restarts : busted.N.initial_guesses + } + ); - - busted.grid_search.results = estimators.FitLF (busted.filter_names, busted.trees, busted.model_map, busted.final_partitioned_mg_results, busted.model_object_map, { - "retain-lf-object": TRUE, - terms.run_options.proportional_branch_length_scaler : - busted.global_scaler_list, - - terms.run_options.optimization_settings : - { - "OPTIMIZATION_METHOD" : "nedler-mead", - "MAXIMUM_OPTIMIZATION_ITERATIONS" : 500, - "OPTIMIZATION_PRECISION" : busted.nm.precision - } , - - terms.search_grid : busted.initial_grid, - terms.search_restarts : busted.N.initial_guesses - }); parameters.RemoveConstraint (busted.tmp_fixed ); @@ -1254,6 +1259,64 @@ lfunction busted._renormalize (v, distro, mean, skip_first) { //------------------------------------------------------------------------------ +lfunction busted._renormalize_with_weights (v, distro, mean, skip_first) { + + parameters.SetValues (v); + m = parameters.GetStickBreakingDistribution (^distro); + d = Rows (m); + + if (skip_first) { + m0 = m[0][0]*m[0][1]; + } else { + m0 = 0; + } + + over_one = m[d-1][0] * m[d-1][1]; + + if (over_one >= mean*0.95) { + //console.log ("OVERAGE"); + new_weight = mean * Random (0.9, 0.95) / m[d-1][0]; + diff = (m[d-1][1] - new_weight)/(d-1); + for (k = (skip_first != 0); k < d-1; k += 1) { + m[k][1] += diff; + } + m[d-1][1] = new_weight; + } + + over_one = m[d-1][0] * m[d-1][1]; + under_one = (+(m[-1][0] $ m[-1][1])) / (1-m[d-1][1]); // current mean + + for (i = (skip_first != 0); i < d-1; i+=1) { + m[i][0] = m[i][0] * mean / under_one; + } + + under_one = +(m[-1][0] $ m[-1][1]); + + for (i = (skip_first != 0); i < d; i+=1) { + m[i][0] = m[i][0] * mean / under_one; + } + + m = m%0; + wts = parameters.SetStickBreakingDistributionWeigths (m[-1][1]); + + //console.log (v); + + for (i = (skip_first != 0); i < d; i+=1) { + (v[((^distro)["rates"])[i]])[^"terms.fit.MLE"] = m[i][0]; + } + for (i = (skip_first != 0); i < d-1; i+=1) { + (v[((^distro)["weights"])[i]])[^"terms.fit.MLE"] = wts[i]; + } + + //console.log (v); + + //assert (0); + return v; + +} + +//------------------------------------------------------------------------------ + lfunction busted.get_multi_hit (model_fit) { params = selection.io.extract_global_MLE_re (model_fit, '^' + ^'terms.parameters.multiple_hit_rate'); for (k,v; in; selection.io.extract_global_MLE_re (model_fit, '^' + ^'terms.parameters.triple_hit_rate')) { diff --git a/res/TemplateBatchFiles/SelectionAnalyses/RELAX.bf b/res/TemplateBatchFiles/SelectionAnalyses/RELAX.bf index 219128111..4919ed064 100644 --- a/res/TemplateBatchFiles/SelectionAnalyses/RELAX.bf +++ b/res/TemplateBatchFiles/SelectionAnalyses/RELAX.bf @@ -519,6 +519,8 @@ if (relax.model_set == "All") { // run all the models relax.ge_model_map [relax.index] = {"DEFAULT" : "relax.ge"}; } + + if (Type (relax.ge_guess) != "Matrix") { @@ -677,6 +679,7 @@ if (relax.model_set == "All") { // run all the models } } else { + // MINIMAL models branch relax.general_descriptive.fit = relax.final_partitioned_mg_results; } @@ -833,10 +836,16 @@ if (relax.do_srv) { if (relax.model_set != "All") { relax.do_lhc = TRUE; relax.distribution = models.codon.BS_REL.ExtractMixtureDistribution(relax.model_object_map[relax.reference_model_namespace]); - relax.initial.test_mean = ((selection.io.extract_global_MLE_re (relax.final_partitioned_mg_results, "^" + terms.parameters.omega_ratio + ".+`relax.test_branches_name`.+"))["0"])[terms.fit.MLE]; + relax.initial.test_mean = ((selection.io.extract_global_MLE_re (relax.final_partitioned_mg_results, "^" + terms.parameters.omega_ratio + ".+`relax.reference_branches_name`.+"))["0"])[terms.fit.MLE]; relax.init_grid_setup (relax.distribution); + relax.initial_ranges[relax.relaxation_parameter] = {terms.lower_bound : 0.10, + terms.upper_bound : 2.0}; + + relax.initial_grid = estimators.LHC (relax.initial_ranges,relax.initial_grid.N); - //parameters.SetStickBreakingDistribution (relax.distribution, relax.ge_guess); + relax.initial_grid = utility.Map (relax.initial_grid, "_v_", + 'relax._renormalize_with_weights (_v_, "relax.distribution", relax.initial.test_mean)' + ); } if (relax.has_unclassified) { @@ -1001,6 +1010,8 @@ function relax.FitMainTestPair (prompt) { parameters.DeclareGlobalWithRanges (relax.scaler.id, 1, 0, 1000); } + + relax.general_descriptive.fit = estimators.FitLF (relax.filter_names, relax.trees, relax.model_map, relax.general_descriptive.fit, relax.model_object_map, @@ -1082,6 +1093,7 @@ function relax.FitMainTestPair (prompt) { io.ReportProgressMessageMD("RELAX", "alt-2", "* Potential convergence issues due to flat likelihood surfaces; checking to see whether K > 1 or K < 1 is robustly inferred"); + if (relax.fitted.K > 1) { parameters.SetRange (model.generic.GetGlobalParameter (relax.model_object_map ["relax.test"] , terms.relax.k), terms.range01); } else { @@ -1089,6 +1101,8 @@ function relax.FitMainTestPair (prompt) { } //assert (__SIGTRAP__); + + VERBOSITY_LEVEL = 10; relax.alternative_model.fit.take2 = estimators.FitLF (relax.filter_names, relax.trees, relax.model_map, relax.alternative_model.fit , @@ -1788,7 +1802,7 @@ function relax.init_grid_setup (omega_distro) { } else { relax.initial_ranges [_name_] = { terms.lower_bound : 1, - terms.upper_bound : 10 + terms.upper_bound : 5 }; } ' @@ -1846,6 +1860,64 @@ lfunction relax._renormalize (v, distro, mean) { } +//------------------------------------------------------------------------------ + +lfunction relax._renormalize_with_weights (v, distro, mean) { + + parameters.SetValues (v); + m = parameters.GetStickBreakingDistribution (^distro); + //console.log (v); + //console.log (m); + //console.log (mean); + d = Rows (m); + + over_one = m[d-1][0] * m[d-1][1]; + + if (over_one >= mean*0.95) { + //console.log ("OVERAGE"); + new_weight = mean * Random (0.9, 0.95) / m[d-1][0]; + diff = (m[d-1][1] - new_weight)/(d-1); + for (k = 0; k < d-1; k += 1) { + m[k][1] += diff; + } + m[d-1][1] = new_weight; + } + + over_one = m[d-1][0] * m[d-1][1]; + under_one = (+(m[-1][0] $ m[-1][1])) / (1-m[d-1][1]); // current mean + + for (i = 0; i < d-1; i+=1) { + m[i][0] = m[i][0] * mean / under_one; + } + + under_one = +(m[-1][0] $ m[-1][1]); + + for (i = 0; i < d; i+=1) { + m[i][0] = m[i][0] * mean / under_one; + } + + m = m%0; + wts = parameters.SetStickBreakingDistributionWeigths (m[-1][1]); + + + + //console.log (v); + + for (i = 0; i < d; i+=1) { + (v[((^distro)["rates"])[i]])[^"terms.fit.MLE"] = m[i][0]; + } + for (i = 0; i < d-1; i+=1) { + (v[((^distro)["weights"])[i]])[^"terms.fit.MLE"] = wts[i]; + } + + //console.log (v); + + //assert (0); + return v; + +} + + //------------------------------------------------------------------------------ function relax.optimization_log_file (extension) { if (relax.OPTIMIZATION_LOGS) { diff --git a/res/TemplateBatchFiles/libv3/all-terms.bf b/res/TemplateBatchFiles/libv3/all-terms.bf index 1284b4ba9..5077b4fa9 100644 --- a/res/TemplateBatchFiles/libv3/all-terms.bf +++ b/res/TemplateBatchFiles/libv3/all-terms.bf @@ -396,6 +396,7 @@ namespace terms{ empirical = "empirical"; branch_length_constrain = "branch length constrain";// TODO + branch_length_override = "branch length override";// TODO get_branch_length = "get-branch-length"; set_branch_length = "set-branch-length"; constrain_branch_length = "constrain-branch-length"; @@ -509,7 +510,7 @@ namespace terms{ optimization_settings = "optimization-settings"; keep_filters = "use-filters"; optimization_log = "optimization_log"; - + maintain_branch_lengths_for_grid = "maintain-branch-lengths-for-grid"; } diff --git a/res/TemplateBatchFiles/libv3/models/codon/MG_REV.bf b/res/TemplateBatchFiles/libv3/models/codon/MG_REV.bf index 1a627e2c1..783bfb5f0 100644 --- a/res/TemplateBatchFiles/libv3/models/codon/MG_REV.bf +++ b/res/TemplateBatchFiles/libv3/models/codon/MG_REV.bf @@ -244,8 +244,10 @@ function models.codon.MG_REV.set_branch_length(model, value, parameter) { } if (utility.Array1D (models.codon.MG_REV.set_branch_length.params)) { bl_string = Simplify (model[terms.model.branch_length_string],models.codon.MG_REV.set_branch_length.params.subs); + + ConvertBranchLength (models.codon.MG_REV.set_branch_length.lp, bl_string , ^(models.codon.MG_REV.set_branch_length.params[0]), 3*value); - ExecuteCommands("FindRoot (models.codon.MG_REV.set_branch_length.lp,(" + bl_string + ")-(" + 3*value + ")," + models.codon.MG_REV.set_branch_length.params[0] + ",0,10000)"); + //ExecuteCommands("FindRoot (models.codon.MG_REV.set_branch_length.lp,(" + bl_string + ")-(" + 3*value + ")," + models.codon.MG_REV.set_branch_length.params[0] + ",0,10000)"); Eval (models.codon.MG_REV.set_branch_length.params[0] + "=" + models.codon.MG_REV.set_branch_length.lp); } diff --git a/res/TemplateBatchFiles/libv3/models/model_functions.bf b/res/TemplateBatchFiles/libv3/models/model_functions.bf index 92c89f3bf..c0c9133ca 100644 --- a/res/TemplateBatchFiles/libv3/models/model_functions.bf +++ b/res/TemplateBatchFiles/libv3/models/model_functions.bf @@ -408,7 +408,6 @@ function models.generic.SetBranchLength (model, value, parameter) { if (Abs((model[terms.parameters])[terms.local]) > 1) { models.generic.SetBranchLength.bl = Call (model [terms.model.time], model[terms.model.type]); - models.generic.SetBranchLength.bl.p = parameter + "." + models.generic.SetBranchLength.bl; if (parameters.IsIndependent (models.generic.SetBranchLength.bl.p) == FALSE) { @@ -447,33 +446,36 @@ function models.generic.SetBranchLength (model, value, parameter) { } - if (parameters.IsIndependent (models.generic.SetBranchLength.bl.p)) { + if (parameters.IsIndependent (models.generic.SetBranchLength.bl.p) || model[terms.model.branch_length_override]) { if (Type (value) == "AssociativeList") { if (Abs (models.generic.SetBranchLength.expression)) { - ExecuteCommands ("FindRoot (models.generic.SetBranchLength.t,(" + models.generic.SetBranchLength.expression + ")-(" + value[terms.branch_length] + ")," + models.generic.SetBranchLength.bl + ",0,10000)"); + ConvertBranchLength (models.generic.SetBranchLength.t, models.generic.SetBranchLength.expression, ^models.generic.SetBranchLength.bl, value[terms.branch_length]); Eval (models.generic.SetBranchLength.bl.p + ":=(" + value[terms.model.branch_length_scaler] + ")*" + models.generic.SetBranchLength.t); messages.log ("models.generic.SetBranchLength: " + models.generic.SetBranchLength.bl.p + ":=(" + value[terms.model.branch_length_scaler] + ")*" + models.generic.SetBranchLength.t); } else { Eval (models.generic.SetBranchLength.bl.p + ":=(" + value[terms.model.branch_length_scaler] + ")*" + value[terms.branch_length]); messages.log ("models.generic.SetBranchLength: " + models.generic.SetBranchLength.bl.p + ":=(" + value[terms.model.branch_length_scaler] + ")*" + models.generic.SetBranchLength.t); + //console.log ("models.generic.SetBranchLength: " + models.generic.SetBranchLength.bl.p + ":=(" + value[terms.model.branch_length_scaler] + ")*" + models.generic.SetBranchLength.t); } return 1; } else { - - ExecuteCommands ("FindRoot (models.generic.SetBranchLength.t,(" + models.generic.SetBranchLength.expression + ")-(" + value + ")," + models.generic.SetBranchLength.bl + ",0,10000)"); + ConvertBranchLength (models.generic.SetBranchLength.t, models.generic.SetBranchLength.expression, ^models.generic.SetBranchLength.bl, value); + + //ExecuteCommands ("FindRoot (models.generic.SetBranchLength.t,(" + models.generic.SetBranchLength.expression + ")-(" + value + ")," + models.generic.SetBranchLength.bl + ",0,10000)"); Eval (models.generic.SetBranchLength.bl.p + "=" + models.generic.SetBranchLength.t); messages.log ("models.generic.SetBranchLength: " + models.generic.SetBranchLength.bl.p + "=" + models.generic.SetBranchLength.t); } } else { messages.log (models.generic.SetBranchLength.bl.p + " was already constrained in models.generic.SetBranchLength"); + } } else { messages.log ("models.generic.SetBranchLength: missing branch-length-string"); - } + } } else { messages.log ("models.generic.SetBranchLength: no local model parameters"); } @@ -715,3 +717,34 @@ lfunction models.FixParameterSetRegExp (re, model_spec) { return constraints; } +/** + * Constrain the set of global model parameters defined by the regexp to own values + * @name parameters.ConstrainParameterSet + * @param {String} re - match this regular expression + * @param {Dict} model_spec - model definition + * @param {Dict} ref - a dictionary of values to use if available + * @returns {Dict} the list of constrained parameters + */ + +lfunction models.FixParameterSetRegExpFromReference (re, model_spec, ref) { + constraints = {}; + for (tag, id; in; (model_spec[^"terms.parameters"])[^"terms.global"]) { + if (None != regexp.Find (tag, re)) { + if (parameters.IsIndependent (id)) { + mle = ref[tag]; + if (Type (mle) == "AssociativeList") { + if (mle / ^"terms.fit.MLE") { + parameters.SetConstraint (id, mle[^"terms.fit.MLE"], ""); + constraints + id; + continue; + } + } + parameters.SetConstraint (id, Eval (id), ""); + constraints + id; + } + } + } + + return constraints; +} + diff --git a/res/TemplateBatchFiles/libv3/models/parameters.bf b/res/TemplateBatchFiles/libv3/models/parameters.bf index 16c6e373e..1192dbf1b 100644 --- a/res/TemplateBatchFiles/libv3/models/parameters.bf +++ b/res/TemplateBatchFiles/libv3/models/parameters.bf @@ -727,6 +727,30 @@ lfunction parameters.GetStickBreakingDistributionWeigths (weights) { return w; } +/** + * @name parameters.StStickBreakingDistributionWeigths + * @param {Matrix} 1xN matrix of weights + * @returns {Matrix} 1x(N-1) matrix of weights + */ + +lfunction parameters.SetStickBreakingDistributionWeigths (weights) { + rate_count = utility.Array1D (weights); + + + w = {1, rate_count-1}; + + w[0] = weights[0]; + left_over = (1-w[0]); + + for (i = 1; i < rate_count - 1; i += 1) { + w [i] = weights[i] / left_over; + left_over = left_over * (1-w[i]); + + } + //w[i] = left_over; + return w; +} + /** * @name parameters.helper.stick_breaking diff --git a/res/TemplateBatchFiles/libv3/tasks/estimators.bf b/res/TemplateBatchFiles/libv3/tasks/estimators.bf index f0ff55a11..992ed68a7 100644 --- a/res/TemplateBatchFiles/libv3/tasks/estimators.bf +++ b/res/TemplateBatchFiles/libv3/tasks/estimators.bf @@ -176,7 +176,6 @@ function estimators.SetGlobals2(key2, value) { } - estimators.ApplyExistingEstimates.set_globals[key3] = { terms.id: key3, terms.fit.MLE: value @@ -212,10 +211,11 @@ function estimators.SetGlobals2(key2, value) { * @returns nothing */ function estimators.SetGlobals(key, value) { - /*_did_set = {}; - for (i,v; in; ((value[terms.parameters])[terms.global])) { - _did_set [v] = 0; - }*/ + //_did_set = {}; + //console.log (((value[terms.parameters])[terms.global])); + //for (i,v; in; ((value[terms.parameters])[terms.global])) { + // _did_set [v] = 0; + //} ((value[terms.parameters])[terms.global])["estimators.SetGlobals2"][""]; @@ -496,9 +496,12 @@ function estimators.ApplyExistingEstimatesToTree (_tree_name, model_descriptions estimators.ApplyExistingEstimatesToTree.constraint_count = 0; - ExecuteCommands("GetInformation (estimators.ApplyExistingEstimatesToTree.map, `_tree_name`);"); estimators.ApplyExistingEstimatesToTree.branch_names = Rows(estimators.ApplyExistingEstimatesToTree.map); + + for (i,v; in; model_descriptions) { + parameters.SetCategoryVariables (v); + } for (estimators.ApplyExistingEstimatesToTree.b = 0; estimators.ApplyExistingEstimatesToTree.b < Abs(estimators.ApplyExistingEstimatesToTree.map); estimators.ApplyExistingEstimatesToTree.b += 1) { _branch_name = estimators.ApplyExistingEstimatesToTree.branch_names[estimators.ApplyExistingEstimatesToTree.b]; @@ -565,6 +568,8 @@ function estimators.ApplyExistingEstimates(likelihood_function_id, model_descrip estimators.ApplyExistingEstimates.df_correction = 0; // copy global variables first + + estimators.ApplyExistingEstimates.results[terms.global] = {}; model_descriptions["estimators.SetCategory"][""]; // the above line traverses all model descriptions and sets @@ -591,6 +596,8 @@ function estimators.ApplyExistingEstimates(likelihood_function_id, model_descrip _application_type = None; + //console.log (branch_length_conditions); + if (Type (branch_length_conditions) == "AssociativeList") { if (Abs(branch_length_conditions) > estimators.ApplyExistingEstimates.i) { _application_type = branch_length_conditions[estimators.ApplyExistingEstimates.i]; @@ -762,19 +769,24 @@ lfunction estimators.FitLF(data_filter, tree, model_map, initial_values, model_o if (Type(initial_values) == "AssociativeList") { utility.ToggleEnvVariable("USE_LAST_RESULTS", 1); - //console.log (initial_values); df = estimators.ApplyExistingEstimates("`&likelihoodFunction`", model_objects, initial_values, run_options[utility.getGlobalValue("terms.run_options.proportional_branch_length_scaler")]); - } - + } + can_do_restarts = null; if (utility.Has (run_options, utility.getGlobalValue("terms.search_grid"),"AssociativeList")) { - grid_results = mpi.ComputeOnGrid (&likelihoodFunction, run_options [utility.getGlobalValue("terms.search_grid")], "mpi.ComputeOnGrid.SimpleEvaluator", "mpi.ComputeOnGrid.ResultHandler"); + grid_results = mpi.ComputeOnGridSetValues (&likelihoodFunction, run_options [utility.getGlobalValue("terms.search_grid")], { + 0: model_objects, + 1: initial_values, + 2: run_options[utility.getGlobalValue("terms.run_options.proportional_branch_length_scaler")] + }, "mpi.ComputeOnGrid.SimpleEvaluatorWithValues", "mpi.ComputeOnGrid.ResultHandler"); + if (utility.Has (run_options, utility.getGlobalValue("terms.search_restarts"),"Number")) { restarts = run_options[utility.getGlobalValue("terms.search_restarts")]; if (restarts > 1) { grid_results = utility.DictToSortedArray (grid_results); + //console.log (grid_results); can_do_restarts = {}; for (i = 1; i <= restarts; i += 1) { can_do_restarts + (run_options [utility.getGlobalValue("terms.search_grid")])[grid_results[Rows(grid_results)-i][1]]; @@ -787,6 +799,7 @@ lfunction estimators.FitLF(data_filter, tree, model_map, initial_values, model_o } //console.log (best_value); //console.log ((run_options [utility.getGlobalValue("terms.search_grid")])[best_value["key"]]); + //console.log (can_do_restarts); //assert (0); } diff --git a/res/TemplateBatchFiles/libv3/tasks/mpi.bf b/res/TemplateBatchFiles/libv3/tasks/mpi.bf index c1f5eb384..a94a61d01 100644 --- a/res/TemplateBatchFiles/libv3/tasks/mpi.bf +++ b/res/TemplateBatchFiles/libv3/tasks/mpi.bf @@ -298,6 +298,71 @@ namespace mpi { } } + //------------------------------------------------------------------------------ + + lfunction ComputeOnGridSetValues (lf_id, grid, values, handler, callback) { + + jobs = mpi.PartitionIntoBlocks(grid); + + scores = {}; + + + queue = mpi.CreateQueue ({^"terms.mpi.LikelihoodFunctions": {{lf_id}}, + ^"terms.mpi.Headers" : utility.GetListOfLoadedModules ("libv3/")}); + + io.ReportProgressBar("", "Computing LF on a grid"); + for (i = 1; i < Abs (jobs); i += 1) { + //io.ReportProgressBar("", "Computing LF on a grid " + i + "/" + Abs (jobs)); + mpi.QueueJob (queue, handler, {"0" : lf_id, + "1" : jobs [i], + "2" : values, + "3" : &scores}, callback); + } + + + Call (callback, -1, Call (handler, lf_id, jobs[0], values, &scores), {"0" : lf_id, "1" : jobs [0], "3" : values, "2" : &scores}); + mpi.QueueComplete (queue); + + io.ClearProgressBar(); + + return scores; + } + + + //------------------------------------------------------------------------------ + + lfunction ComputeOnGrid.SimpleEvaluatorWithValues (lf_id, tasks, values, scores) { + + + //#profile START; + + LFCompute (^lf_id, LF_START_COMPUTE); + + results = {}; + task_ids = utility.Keys (tasks); + task_count = Abs (tasks); + for (i, v; in; values[0]) { + v[^"terms.model.branch_length_override"] = TRUE; + } + for (i = 0; i < task_count; i+=1) { + + parameters.SetValues (tasks[task_ids[i]]); + estimators.ApplyExistingEstimates (lf_id, values[0], values[1], values[2]); + //fprintf (stdout, ^lf_id); + LFCompute (^lf_id, ll); + results [task_ids[i]] = ll; + io.ReportProgressBar("", "Grid result " + i + " = " + ll + " (best = " + Max (results, 0)[^"terms.data.value"] + ")"); + + } + LFCompute (^lf_id, LF_DONE_COMPUTE); + for (i, v; in; values[0]) { + v - ^"terms.model.branch_length_override"; + } + + //#profile _hyphy_profile_dump; + //utility.FinishAndPrintProfile (_hyphy_profile_dump); + return results; + } //------------------------------------------------------------------------------ @@ -338,6 +403,7 @@ namespace mpi { //------------------------------------------------------------------------------ lfunction ComputeOnGrid.SimpleEvaluator (lf_id, tasks, scores) { + LFCompute (^lf_id, LF_START_COMPUTE); results = {}; diff --git a/src/core/batchlan.cpp b/src/core/batchlan.cpp index 2a114fb91..f34141369 100644 --- a/src/core/batchlan.cpp +++ b/src/core/batchlan.cpp @@ -2001,6 +2001,7 @@ bool _ExecutionList::BuildList (_StringBuffer& s, _SimpleList* bc, bool case HY_HBL_COMMAND_CONSTRUCT_CATEGORY_MATRIX: case HY_HBL_COMMAND_KEYWORD_ARGUMENT: case HY_HBL_COMMAND_DO_SQL: + case HY_HBL_COMMAND_CONVERT_BRANCH_LENGTH: { _ElementaryCommand::ExtractValidateAddHBLCommand (currentLine, prefixTreeCode, pieces, commandExtraInfo, *this); break; @@ -2168,26 +2169,25 @@ _ElementaryCommand::_ElementaryCommand (_String& s) { //____________________________________________________________________________________ _ElementaryCommand::~_ElementaryCommand (void) { - if (CanFreeMe()) { - if (code==4) { // IF - if (simpleParameters.lLength>2) { - _Formula* f = (_Formula*)simpleParameters(2); + + auto clear_formulas = [this] (long start_at) -> void { + for (long i = start_at; i 2) { delete (AVLListXLIterator*)simpleParameters.get(2); @@ -2341,6 +2341,7 @@ BaseRef _ElementaryCommand::toStr (unsigned long) { case HY_HBL_COMMAND_SELECT_TEMPLATE_MODEL: case HY_HBL_COMMAND_KEYWORD_ARGUMENT: case HY_HBL_COMMAND_GET_INFORMATION: + case HY_HBL_COMMAND_CONVERT_BRANCH_LENGTH: case HY_HBL_COMMAND_SIMULATE_DATA_SET: { (*string_form) << procedure (code); } @@ -3713,6 +3714,9 @@ bool _ElementaryCommand::Execute (_ExecutionList& chain) { return HandleFindRootOrIntegrate(chain, code == HY_HBL_COMMAND_INTEGRATE); break; + case HY_HBL_COMMAND_CONVERT_BRANCH_LENGTH: + return HandleConvertBranchLength(chain); + break; case HY_HBL_COMMAND_MPI_SEND: return HandleMPISend (chain); diff --git a/src/core/batchlan2.cpp b/src/core/batchlan2.cpp index 321d6d027..b76311970 100644 --- a/src/core/batchlan2.cpp +++ b/src/core/batchlan2.cpp @@ -175,6 +175,8 @@ void _HBL_Init_Const_Arrays (void) _HY_ValidHBLExpressions.Insert ("BGM ", HY_HBL_COMMAND_BGM); _HY_ValidHBLExpressions.Insert ("SimulateDataSet", HY_HBL_COMMAND_SIMULATE_DATA_SET); _HY_ValidHBLExpressions.Insert ("KeywordArgument", HY_HBL_COMMAND_KEYWORD_ARGUMENT); + _HY_ValidHBLExpressions.Insert ("ConvertBranchLength(", HY_HBL_COMMAND_CONVERT_BRANCH_LENGTH); + /* const long cut, const long conditions, const char sep, const bool doTrim, const bool isAssignment, const bool needsVerb, length options */ @@ -462,6 +464,10 @@ const long cut, const long conditions, const char sep, const bool doTrim, const // matrix global arrays + _HY_HBLCommandHelper.Insert ((BaseRef)HY_HBL_COMMAND_CONVERT_BRANCH_LENGTH, + (long)_hyInitCommandExtras (_HY_ValidHBLExpressions.Insert ("ConvertBranchLength(", HY_HBL_COMMAND_CONVERT_BRANCH_LENGTH,false), + 4, + "ConvertBranchLength(, , , )",',')); _HY_MatrixRandomValidPDFs.Insert ("Dirichlet", _HY_MATRIX_RANDOM_DIRICHLET, "Gaussian", _HY_MATRIX_RANDOM_GAUSSIAN, diff --git a/src/core/batchlanruntime.cpp b/src/core/batchlanruntime.cpp index 92e29be79..50e5591ea 100644 --- a/src/core/batchlanruntime.cpp +++ b/src/core/batchlanruntime.cpp @@ -314,6 +314,79 @@ bool _ElementaryCommand::HandleDifferentiate(_ExecutionList& current_progra //____________________________________________________________________________________ +bool _ElementaryCommand::HandleConvertBranchLength(_ExecutionList& current_program){ + + _Variable * receptacle = nil; + current_program.advance (); + + auto cleanup_cache = [this] (long start_at) -> void { + for (long k = simpleParameters.countitems() - 1; k >= start_at; k --) { + _Formula* f = (_Formula*)simpleParameters.get (k); + if (f) { + delete f; + } + simpleParameters.Delete(k); + } + }; + + try { + receptacle = _ValidateStorageVariable (current_program); + _List dynamic_manager; + _FString* expression = (_FString*)_ProcessAnArgumentByType(*GetIthParameter(1), STRING, current_program, &dynamic_manager); + _Formula * lhs_expression = nil, + * lhs_derivative = nil; + + if (parameters.countitems () > 4) { + _String * cached_fla = GetIthParameter(4); + if (!cached_fla->Equal(expression->get_str()) || simpleParameters.empty()) { + cleanup_cache (0); + parameters.Delete (4); + } else { + lhs_expression = (_Formula*)simpleParameters.get (0); + if (simpleParameters.countitems() > 1) { + lhs_derivative = (_Formula*)simpleParameters.get (1); + } + } + } + + if (parameters.countitems () <= 4) { + parameters < expression->get_str_ref(); + } + + if (!lhs_expression) { + lhs_expression = new _Formula; + _CheckExpressionForCorrectness (*lhs_expression, expression->get_str(), current_program); + simpleParameters << (long)lhs_expression; + } + + + _Variable * target_variable = _CheckForExistingVariableByType (*GetIthParameter(2),current_program,NUMBER); + + if (!lhs_expression->DependsOnVariable(target_variable->get_index())) { + throw (expression->get_str().Enquote() & " does not depend on the variable " & target_variable->GetName()->Enquote()); + } + + hyFloat target_value = _ProcessNumericArgumentWithExceptions(*GetIthParameter(3),current_program.nameSpacePrefix); + + if (!lhs_derivative && simpleParameters.countitems() < 2) { + lhs_derivative = lhs_expression->Differentiate (*target_variable->GetName(),false); + simpleParameters << (long)lhs_derivative; + } + + if (lhs_derivative) { + receptacle->SetValue (new _Constant (lhs_expression->Newton (*lhs_derivative,target_variable, target_value, 0.0, 10000.)),false,true, NULL); + } else { + receptacle->SetValue (new _Constant (lhs_expression->Brent (target_variable, 0.0, 10000.,1.e-7, nil, target_value)), false,true, NULL); + } + + } catch (const _String& error) { + return _DefaultExceptionHandler (receptacle, error, current_program); + } + return true; +} + +//____________________________________________________________________________________ + bool _ElementaryCommand::HandleFindRootOrIntegrate (_ExecutionList& currentProgram, bool do_integrate){ _Variable * receptacle = nil; @@ -331,7 +404,6 @@ bool _ElementaryCommand::HandleFindRootOrIntegrate (_ExecutionList& current if (!parsed_expression.DependsOnVariable(target_variable->get_index()) && !do_integrate) { throw (expression.Enquote() & " does not depend on the variable " & target_variable->GetName()->Enquote()); } - hyFloat lb = _ProcessNumericArgumentWithExceptions (*GetIthParameter(3),currentProgram.nameSpacePrefix), ub = _ProcessNumericArgumentWithExceptions (*GetIthParameter(4),currentProgram.nameSpacePrefix); diff --git a/src/core/include/batchlan.h b/src/core/include/batchlan.h index 4ed0b20e3..e14d456f0 100644 --- a/src/core/include/batchlan.h +++ b/src/core/include/batchlan.h @@ -282,6 +282,7 @@ class _ElementaryCommand: public _String // string contains the literal for th bool HandleExport (_ExecutionList&); bool HandleDifferentiate (_ExecutionList&); bool HandleFindRootOrIntegrate (_ExecutionList&, bool do_integrate = false); + bool HandleConvertBranchLength (_ExecutionList&); bool HandleMPISend (_ExecutionList&); bool HandleMPIReceive (_ExecutionList&); bool HandleExecuteCommandsCases (_ExecutionList&, bool do_load_from_file = false, bool do_load_library = false); diff --git a/src/core/include/defines.h b/src/core/include/defines.h index 767f81024..18fbe3225 100644 --- a/src/core/include/defines.h +++ b/src/core/include/defines.h @@ -319,6 +319,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define HY_HBL_COMMAND_KEYWORD_ARGUMENT 567L #define HY_HBL_COMMAND_INIT_ITERATOR 568L #define HY_HBL_COMMAND_ADVANCE_ITERATOR 569L +#define HY_HBL_COMMAND_CONVERT_BRANCH_LENGTH 570L //! HyPhy standard directory locations From ad21fc61a59f989902e96dd21b5944b85f82bc6f Mon Sep 17 00:00:00 2001 From: Sergei Date: Fri, 1 Mar 2024 09:49:42 -0500 Subject: [PATCH 6/9] RELAX --- .../SelectionAnalyses/RELAX.bf | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/res/TemplateBatchFiles/SelectionAnalyses/RELAX.bf b/res/TemplateBatchFiles/SelectionAnalyses/RELAX.bf index 4919ed064..10015b390 100644 --- a/res/TemplateBatchFiles/SelectionAnalyses/RELAX.bf +++ b/res/TemplateBatchFiles/SelectionAnalyses/RELAX.bf @@ -1068,12 +1068,15 @@ function relax.FitMainTestPair (prompt) { relax.inferred_distribution_ref = parameters.GetStickBreakingDistribution (models.codon.BS_REL.ExtractMixtureDistribution (relax.model_object_map ["relax.reference"])) % 0; selection.io.report_dnds (relax.inferred_distribution_ref); + relax.take1_snapshot = estimators.TakeLFStateSnapshot(relax.alternative_model.fit[terms.likelihood_function]); + relax.lf.raw = relax.ComputeOnGrid ( relax.alternative_model.fit[terms.likelihood_function], relax.grid.MatrixToDict ({200,1}["_MATRIX_ELEMENT_ROW_*0.025"]), "relax.pass1.evaluator", "relax.pass1.result_handler" ); + // FIND the difference between K < 1 and K > 1 @@ -1102,7 +1105,6 @@ function relax.FitMainTestPair (prompt) { //assert (__SIGTRAP__); - VERBOSITY_LEVEL = 10; relax.alternative_model.fit.take2 = estimators.FitLF (relax.filter_names, relax.trees, relax.model_map, relax.alternative_model.fit , @@ -1112,6 +1114,17 @@ function relax.FitMainTestPair (prompt) { terms.run_options.optimization_log : relax.optimization_log_file("MainALT-redo-log.json")} ); + io.ReportProgressMessageMD("RELAX", "alt2", "* Attempt to fit an alternative direction of K"); + io.ReportProgressMessageMD("RELAX", "alt2", "* " + selection.io.report_fit (relax.alternative_model.fit.take2, 9, relax.codon_data_info[terms.data.sample_size])); + io.ReportProgressMessageMD("RELAX", "alt2", "* Relaxation/intensification parameter (K) = " + Format(estimators.GetGlobalMLE (relax.alternative_model.fit.take2,terms.relax.k),8,2)); + io.ReportProgressMessageMD("RELAX", "alt2", "* The following rate distribution was inferred for **test** branches"); + relax.inferred_distribution2 = parameters.GetStickBreakingDistribution (models.codon.BS_REL.ExtractMixtureDistribution (relax.model_object_map ["relax.test"])) % 0; + selection.io.report_dnds (relax.inferred_distribution2); + + + io.ReportProgressMessageMD("RELAX", "alt2", "* The following rate distribution was inferred for **reference** branches"); + relax.inferred_distribution_ref2 = parameters.GetStickBreakingDistribution (models.codon.BS_REL.ExtractMixtureDistribution (relax.model_object_map ["relax.reference"])) % 0; + selection.io.report_dnds (relax.inferred_distribution_ref2); @@ -1140,6 +1153,8 @@ function relax.FitMainTestPair (prompt) { relax.alternative_model.fit = relax.alternative_model.fit.take2; io.SpoolLFToPath(relax.alternative_model.fit.take2[terms.likelihood_function], relax.save_fit_path); + } else { + estimators.RestoreLFStateFromSnapshot(relax.alternative_model.fit[terms.likelihood_function], relax.take1_snapshot); } DeleteObject (relax.alternative_model.fit.take2[terms.likelihood_function]); From 87bf24284ac09899160e2bb63fd027f61f2af097 Mon Sep 17 00:00:00 2001 From: Steven Weaver Date: Fri, 1 Mar 2024 08:53:45 -0800 Subject: [PATCH 7/9] Update alignment.cpp --- src/core/alignment.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/core/alignment.cpp b/src/core/alignment.cpp index 4a1707385..bda824642 100644 --- a/src/core/alignment.cpp +++ b/src/core/alignment.cpp @@ -825,8 +825,6 @@ double AlignStrings( char const * r_str // the extra row/column is to accommodate prefix gaps - //long i, j, k, m; - if ( do_codon && ( r_len % 3 != 0 ) ) { return -INFINITY; From c7264dc41f80eabec03ae4bcef3174e407048927 Mon Sep 17 00:00:00 2001 From: Sergei Date: Mon, 4 Mar 2024 17:45:45 -0500 Subject: [PATCH 8/9] Updating some libv3 --- .../libv3/models/model_functions.bf | 44 ++++++++++++++----- .../libv3/models/parameters.bf | 13 ++++++ 2 files changed, 46 insertions(+), 11 deletions(-) diff --git a/res/TemplateBatchFiles/libv3/models/model_functions.bf b/res/TemplateBatchFiles/libv3/models/model_functions.bf index c0c9133ca..2af115b59 100644 --- a/res/TemplateBatchFiles/libv3/models/model_functions.bf +++ b/res/TemplateBatchFiles/libv3/models/model_functions.bf @@ -570,29 +570,51 @@ lfunction model.MatchAlphabets (a1, a2) { */ lfunction models.BindGlobalParameters (models, filter) { + c = models.BindGlobalParametersDeferred (models, filter); + if (None != c) { + parameters.BatchApplyConstraints (c); + } + return c; +} + +/** + * @name models.BindGlobalParametersDeferred + * Only populate the list of constraints, do no apply them + * compute the algebraic expression for the branch length + * @param {Dict} models - list of model objects + * @param {RegEx} filter - only apply to parameters matching this expression + * @return {Dict} the set of constraints applied + */ + +lfunction models.BindGlobalParametersDeferred (models, filter) { if (Type (models) == "AssociativeList" && utility.Array1D (models) > 1) { reference_set = (((models[0])[utility.getGlobalValue("terms.parameters")])[utility.getGlobalValue("terms.global")]); - candidate_set = utility.UniqueValues(utility.Filter (utility.Keys (reference_set), "_key_", - "regexp.Find (_key_,`&filter`)" - )); + candidate_set = {}; - + for (_key_, _ignore_; in; reference_set) { + if (regexp.Find (_key_, filter)) { + candidate_set[_key_] = 1; + } + } + + candidate_set = utility.Keys (candidate_set); constraints_set = {}; for (k = 1; k < Abs (models); k+=1) { parameter_set = (((models[k])[utility.getGlobalValue("terms.parameters")])[utility.getGlobalValue("terms.global")]); if (Type (parameter_set) == "AssociativeList") { - utility.ForEach (candidate_set, "_p_", - "if (`¶meter_set` / _p_) { - if (parameters.IsIndependent (`¶meter_set`[_p_])) { - parameters.SetConstraint (`¶meter_set`[_p_], `&reference_set`[_p_], ''); - `&constraints_set` [`¶meter_set`[_p_]] = `&reference_set`[_p_]; + + for (_p_; in; candidate_set) { + if (parameter_set / _p_) { + if (parameters.IsIndependent (parameter_set[_p_])) { + //parameters.SetConstraint (parameter_set[_p_], reference_set[_p_], ''); + constraints_set [parameter_set[_p_]] = reference_set[_p_]; } - }" - ); + } + } } } diff --git a/res/TemplateBatchFiles/libv3/models/parameters.bf b/res/TemplateBatchFiles/libv3/models/parameters.bf index 1192dbf1b..a5c4a6bec 100644 --- a/res/TemplateBatchFiles/libv3/models/parameters.bf +++ b/res/TemplateBatchFiles/libv3/models/parameters.bf @@ -937,3 +937,16 @@ lfunction parameters.ValidateIDs (ids) { return result; } +/** + * Apply a set of constraints of the form LHS := RHS (stored as key/values) + * @param {Dict} constraints - id=>constraint +*/ + +lfunction parameters.BatchApplyConstraints (constraints) { + SetParameter (DEFER_CONSTRAINT_APPLICATION, 1, 0); + for (p, v; in; constraints) { + parameters.SetConstraint (p,v, ""); + } + SetParameter (DEFER_CONSTRAINT_APPLICATION, 0, 0); +} + From 55e743af865e6fbdc7e90f604935409d235b5cae Mon Sep 17 00:00:00 2001 From: Sergei Date: Mon, 4 Mar 2024 17:47:07 -0500 Subject: [PATCH 9/9] this and that --- src/core/include/global_things.h | 2 ++ src/core/include/likefunc.h | 18 ++++++++++++++++++ src/core/likefunc.cpp | 10 ++++------ src/mains/unix.cpp | 25 +++++++++++++++++++++++-- 4 files changed, 47 insertions(+), 8 deletions(-) diff --git a/src/core/include/global_things.h b/src/core/include/global_things.h index 2792b7bb3..f713d9007 100644 --- a/src/core/include/global_things.h +++ b/src/core/include/global_things.h @@ -274,6 +274,8 @@ namespace hy_global { Global variables (grouped by type, then alphabetically) */ + _String* ConstructAnErrorMessage (_String const&); + extern _AVLList _hy_application_globals; extern _List _hy_standard_library_extensions, diff --git a/src/core/include/likefunc.h b/src/core/include/likefunc.h index 40dece66e..bb5d7f124 100644 --- a/src/core/include/likefunc.h +++ b/src/core/include/likefunc.h @@ -287,6 +287,18 @@ class _LikelihoodFunction: public BaseObj { #endif #endif + + void ComputeDependencyLists (_List& receptacle, long max_dep = 0x7fffffff) const; + /** + This function computes a mapping from the index of each independent variable to the list of all dependent variables that it affects. + index => {n1,n2,...n3} + special cases: + index => null (no dependencies) + index => {} (all variables, or >max_dep) + + @param max_dep the maximum number of dependent variables affected by a paritcular independent to generate an explicit list + */ + bool ProcessPartitionList (_SimpleList&, _Matrix*) const; // given a matrix argument (argument 2; can be nil to include all) @@ -536,6 +548,12 @@ class _LikelihoodFunction: public BaseObj { void SetupParameterMapping (void); void CleanupParameterMapping (void); + void ConvertDependenciesToSimpleFormulas (); + /** + Traverse the list of dependent variables and generate simple formulas if they can be created. + */ + + _SimpleList theTrees, theDataFilters, theProbabilities, diff --git a/src/core/likefunc.cpp b/src/core/likefunc.cpp index 19f674a99..b94c9b95e 100644 --- a/src/core/likefunc.cpp +++ b/src/core/likefunc.cpp @@ -1932,18 +1932,15 @@ bool _LikelihoodFunction::SendOffToMPI (long index) { //_______________________________________________________________________________________ -bool _LikelihoodFunction::PreCompute (void) -{ +bool _LikelihoodFunction::PreCompute (void) { useGlobalUpdateFlag = true; // mod 20060125 to only update large globals once - unsigned long i = 0UL; + unsigned long i; _SimpleList * arrayToCheck = nonConstantDep?nonConstantDep:&indexDep; - - - for (; i < arrayToCheck->lLength; i++) { + for (i = 0UL; i < arrayToCheck->lLength; i++) { _Variable* cornholio = LocateVar(arrayToCheck->list_data[i]); hyFloat tp = cornholio->Compute()->Value(); if (!cornholio->IsValueInBounds(tp)){ @@ -4252,6 +4249,7 @@ _Matrix* _LikelihoodFunction::Optimize (_AssociativeList const * options) CheckDependentBounds(); + currentPrecision = get_optimization_setting (kStartingPrecision, 0.1); precision = get_optimization_setting (kOptimizationPrecision, 0.001); maxItersPerVar = get_optimization_setting (kMaximumIterationsPerVariable, 5000.); diff --git a/src/mains/unix.cpp b/src/mains/unix.cpp index 19c867ffe..4df918f7f 100644 --- a/src/mains/unix.cpp +++ b/src/mains/unix.cpp @@ -185,7 +185,8 @@ void mpiOptimizerLoop (int, int); #ifdef __HYPHY_HANDLE_TERM_SIGNAL__ volatile sig_atomic_t hyphy_sigterm_in_progress = 0; - + volatile sig_atomic_t hyphy_sigstop_in_process = 0; + void hyphy_sigterm_handler (int sig) { if (hyphy_sigterm_in_progress) raise (sig); @@ -202,6 +203,17 @@ void mpiOptimizerLoop (int, int); signal (sig, SIG_DFL); raise (sig); } + + void hyphy_sigstop_handler (int sig) { + if (hyphy_sigstop_in_process) + raise (sig); + + hyphy_sigstop_in_process = 1; + StringToConsole (new _String (ConstructAnErrorMessage ("HyPhy suspended."))); + + signal (sig, SIG_DFL); + raise (sig); + } #endif @@ -660,8 +672,17 @@ int main (int argc, char* argv[]) { #ifdef __HYPHY_HANDLE_TERM_SIGNAL__ if (signal (SIGTERM, hyphy_sigterm_handler) == SIG_IGN) signal (SIGTERM, SIG_IGN); - if (signal (SIGINT, hyphy_sigterm_handler) == SIG_IGN) + + if (signal (SIGINT, hyphy_sigterm_handler) == SIG_IGN) signal (SIGINT, SIG_IGN); + + if (signal (SIGTSTP, hyphy_sigstop_handler) == SIG_IGN) + signal (SIGTSTP, SIG_IGN); + + //if (signal (SIGCONT, hyphy_sigresume_handler) == SIG_IGN) + // signal (SIGCONT, SIG_IGN); + + #endif //printf ("%e\n", mapParameterToInverval (1.322753E-23, 0x2, false)); //exit (0);