
Commit

Merge pull request #1549 from veg/develop
Bug fixes
spond authored Dec 3, 2022
2 parents 3fdd655 + 3f0750b commit 6776da2
Showing 23 changed files with 296 additions and 208 deletions.
54 changes: 21 additions & 33 deletions res/TemplateBatchFiles/CodonToProtein.bf
@@ -1,78 +1,66 @@
function setUpCodonToAA (dataSetID)
{
function setUpCodonToAA (dataSetID) {
codonToAAMap = {};
codeToAA = "FLIMVSPTAYXHQNKDECWRG";

nucChars = "ACGT";

for (p1=0; p1<64; p1=p1+1)
{
codon = nucChars[p1$16]+nucChars[p1%16$4]+nucChars[p1%4];
ccode = _Genetic_Code[p1];
codonToAAMap[codon] = codeToAA[ccode];
for (p1=0; p1<64; p1 += 1) {
codonToAAMap[nucChars[p1$16]+nucChars[p1%16$4]+nucChars[p1%4]] = codeToAA[_Genetic_Code[p1]];
}

codonToAAMap["---"] = "-";
codonToAAMap ["---"] = "-";

ExecuteCommands ("DataSetFilter _converterfilteredData = CreateFilter ("+dataSetID+",1);");
DataSetFilter _converterfilteredData = CreateFilter (^dataSetID,1);
GetInformation (theSequences,_converterfilteredData);
ExecuteCommands ("DataSetFilter _converterfilteredDataC = CreateFilter ("+dataSetID+",3);");
DataSetFilter _converterfilteredDataC = CreateFilter (^dataSetID,3);
GetDataInfo (siteToPatternMap,_converterfilteredDataC);
return 0;
}
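For orientation, the tightened loop above decomposes each codon index 0-63 into three base-4 digits, using HBL's `$` (integer division) and `%` (modulo) to pick nucleotides out of `nucChars`, and `_Genetic_Code` to pick the amino-acid letter out of `codeToAA`. A minimal Python sketch of the same arithmetic (the `genetic_code` argument is a stand-in for HyPhy's built-in `_Genetic_Code` table, not a real API):

```python
# Illustrative sketch only: mirrors the index arithmetic in setUpCodonToAA.
NUC_CHARS = "ACGT"
CODE_TO_AA = "FLIMVSPTAYXHQNKDECWRG"   # same ordering as the script ('X' presumably marks stop codons)

def codon_from_index(p1: int) -> str:
    """Decompose a codon index 0..63 into its three nucleotides (base-4 digits)."""
    return NUC_CHARS[p1 // 16] + NUC_CHARS[(p1 % 16) // 4] + NUC_CHARS[p1 % 4]

def build_codon_to_aa(genetic_code):
    """genetic_code[i] gives the index into CODE_TO_AA for codon i (placeholder for _Genetic_Code)."""
    mapping = {codon_from_index(i): CODE_TO_AA[genetic_code[i]] for i in range(64)}
    mapping["---"] = "-"   # a fully gapped codon translates to a gap character
    return mapping

# Example: index 14 -> 14 // 16 = 0 ('A'), (14 % 16) // 4 = 3 ('T'), 14 % 4 = 2 ('G'), i.e. "ATG"
assert codon_from_index(14) == "ATG"
```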

/*--------------------------------------------------------------------------------------------*/

function doTheMapping (dummy)
{
function doTheMapping () {
outSequences = "";
outSequences * (bigDataSet.sites* bigDataSet.species);

freqCount = {};

for (seqCounter = 0; seqCounter < bigDataSet.species; seqCounter = seqCounter+1)
{

for (seqCounter = 0; seqCounter < bigDataSet.species; seqCounter += 1) {
aSeq = theSequences[seqCounter];
seqLen = Abs(aSeq)-2;
GetString (seqName, _converterfilteredData, seqCounter);
translString = "";
translString * (seqLen/3+1);
for (seqPos = 0; seqPos < seqLen; seqPos = seqPos+3)
for (seqPos = 0; seqPos < seqLen; seqPos +=3)
{
codon = aSeq[seqPos][seqPos+2];

gap_count = codon$"[N-]";
if (gap_count[0] >= 0)
{
if (gap_count[0] >= 0) {
/* handle cases where codon contains one or two gap characters - this was yielding 'F' in original script */
prot = "?";
prot = codonToAAMap[codon];
if (Abs (prot) == 0) {
prot = "?";
}
}
else
{
prot = codonToAAMap[codon];
if (Abs(prot) == 0)
{
if (Abs(prot) == 0) {
/*
see if we can map this presumed ambiguity to a single
amino-acid
*/
GetDataInfo (mappedToCodon, _converterfilteredDataC, seqCounter, siteToPatternMap[seqPos$3]);
resolutionMapping = {21,1};
for (resID = 0; resID < 64; resID = resID + 1)
{
if (mappedToCodon[resID])
{
for (resID = 0; resID < 64; resID += 1) {
if (mappedToCodon[resID]) {
resolutionMapping[_Genetic_Code[resID]] = 1;
}
}

if ((+resolutionMapping) == 1)
{

if ((+resolutionMapping) == 1) {
prot = codeToAA[((Transpose(resolutionMapping))["_MATRIX_ELEMENT_COLUMN_"])[0]];
}
else
{
else {
prot = "?";
}
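The branch above resolves an ambiguous codon only when every codon compatible with the observed characters encodes the same amino acid; otherwise the site is written out as '?'. A hedged Python sketch of that decision rule (the `codon_indicator` vector stands in for what the script obtains from `GetDataInfo` on the codon filter):

```python
# Sketch of the ambiguity-resolution rule used in doTheMapping (illustrative names, not HyPhy API).
CODE_TO_AA = "FLIMVSPTAYXHQNKDECWRG"

def resolve_ambiguous_codon(codon_indicator, genetic_code) -> str:
    """codon_indicator[i] is truthy if codon i (0..63) is compatible with the observed
    ambiguous state; genetic_code[i] maps codon i to an amino-acid index
    (a stand-in for HyPhy's _Genetic_Code)."""
    compatible_aas = {genetic_code[i] for i in range(64) if codon_indicator[i]}
    if len(compatible_aas) == 1:
        # every compatible codon encodes the same residue: resolve to it
        return CODE_TO_AA[compatible_aas.pop()]
    return "?"  # genuinely ambiguous (or nothing compatible): leave unresolved
```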

@@ -132,7 +120,7 @@ if (_runAsFunctionLibrary != 1)
fprintf (stdout, "Read ", bigDataSet.species, " sequences with ", bigDataSet.sites, " sites.");
fprintf (stdout, "\nRead:\n", bigDataSet);

outSequences = doTheMapping (0);
outSequences = doTheMapping ();

sht = IS_TREE_PRESENT_IN_DATA;
sdt = DATAFILE_TREE;
30 changes: 25 additions & 5 deletions res/TemplateBatchFiles/MSS-selector-2.bf
@@ -62,6 +62,14 @@ io.ReportProgressMessageMD("mss", "data" , "* Loaded a list with **" + mss_selec
KeywordArgument ("code", "Which genetic code should be used", "Universal");
mss.genetic_code = alignments.LoadGeneticCode (None);


KeywordArgument ("ic", "Which IC should be used for scoring", "AIC-c");
mss.ic_score = io.SelectAnOption ({"AIC-c" : "Small Sample AIC score",
"BIC" : "Bayesian Information Criterion (more conservative)"}, "Which IC should be used for scoring");

mss.is_bic = mss.ic_score == "BIC";


mss.file_records = {};
mss.file_info = {};
mss.tree_info = {};
@@ -132,7 +140,7 @@ lfunction mss.store_initial_fit (node, result, arguments) {

^"mss.parameters" += (result["init"])[^"terms.parameters"];
^"mss.baselineLL" += (result["init"])[^"terms.fit.log_likelihood"];
^"mss.sample_size" += (result["init"])[^"terms.data.sample_size"];
//^"mss.sample_size" += (result["init"])[^"terms.data.sample_size"];
mss_selector.print_row [0] = result["path"];
info = (^"mss.file_records")[result["path"]];
mss_selector.print_row [1] = info [^"terms.data.sequences"];
@@ -195,6 +203,8 @@ for (mss.counter, mss_selector.path; in; mss_selector.file_list) {
mss.namespace = "mss_" + mss.counter;
mss.handle_initial_fit (mss_selector.path, mss.namespace, mss.genetic_code[terms.id], FALSE);
mss.file_records [mss_selector.path] = ^"`mss.namespace`.codon_data_info";
mss.sample_size += (^"`mss.namespace`.codon_data_info")[terms.data.sample_size];


mpi.QueueJob (mss_selector.queue, "mss.handle_initial_fit", {"0" : mss_selector.path,
"1" : mss.namespace,
@@ -219,7 +229,7 @@ for (mss.counter, mss_selector.path; in; mss_selector.file_list) {
//io.ClearProgressBar();
mpi.QueueComplete (mss_selector.queue);

mss.baseline_AIC = selection.io.getIC(mss.baselineLL, mss.parameters, mss.sample_size);
mss.baseline_AIC = mss.getIC (mss.baselineLL, mss.parameters, mss.sample_size, mss.is_bic);

io.ReportProgressMessageMD("mss", "fit0done" , "Baseline model fit");
io.ReportProgressMessageMD("mss", "fit0done", "**LogL = " + mss.baselineLL + "**" + ", **AIC-c = " + mss.baseline_AIC + "** \n");
@@ -469,7 +479,7 @@ lfunction mss.GA.fit_model (model, lfid, xp, ss) {
utility.SetEnvVariable ("AUTO_PARALLELIZE_OPTIMIZE", 1.);
Optimize (res, ^lfid);
return_expr = {1,res[1][1] + 3};
return_expr [0] = selection.io.getIC(res[1][0], xp + res[1][1] ,ss);
return_expr [0] = mss.getIC (res[1][0], xp + res[1][1] ,ss, ^"mss.is_bic");
return_expr [1] = res[1][0];

for (i = 0; i < res[1][1] + 1; i+=1) {
@@ -523,8 +533,8 @@ function mss.GA.evaluateModels (models) {

for (modelID, cAIC; in; models) {
if (cAIC == math.Infinity) {
models[modelID] = (mss.GA.fit_model ( Eval (modelID), mss.lf_id, mss.parameters, mss.sample_size))[0];
console.log (models[modelID]);
models[modelID] = (mss.GA.fit_model ( Eval (modelID), mss.lf_id, mss.parameters, mss.sample_size))[0];
console.log ("\t" + Join("", Eval (modelID)) + " IC = " + Format (models[modelID], 15, 4) + ", delta = " + Format ((-(models[modelID]) + mss.bestOverall_cAIC_soFar), 8, 2));
}
}
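The loop above fits only those candidate models whose cached score is still `math.Infinity` (used as a "not evaluated yet" sentinel) and now logs each model's IC along with its delta from the best score seen so far. A compact Python sketch of the same bookkeeping, with illustrative names:

```python
# Sketch: lazy evaluation of candidate models keyed by an "infinite score" sentinel.
import math

def evaluate_models(models, fit_model, best_so_far):
    """models: dict model_id -> IC score (math.inf means 'not fitted yet')."""
    for model_id, score in models.items():
        if score == math.inf:
            models[model_id] = fit_model(model_id)          # expensive fit happens only once per model
            delta = best_so_far - models[model_id]           # positive delta = better than the best so far
            print(f"\t{model_id} IC = {models[model_id]:15.4f}, delta = {delta:8.2f}")
    return models
```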

@@ -677,6 +687,16 @@ lfunction mss.GA.generateNewGenerationOfModelsByMutatingModelSet(parentModels, n
return nextGenOfModels;
}

//------------------------------------------------------------------------------------------------------------------------

lfunction mss.getIC(logl, params, samples, is_bic) {
if (is_bic) {
return -2 * logl + Log (samples) * params;
}
return -2 * logl + 2 * samples / (samples - params - 1) * params;
}
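Written out, the new mss.getIC helper returns either the small-sample AIC or the BIC, with $\ell$ the log-likelihood, $k$ the number of estimated parameters, and $n$ the sample size accumulated over the input alignments:

$$
\mathrm{AIC}_c = -2\ell + \frac{2kn}{n - k - 1},
\qquad
\mathrm{BIC} = -2\ell + k \ln n
$$

For large $n$ the AIC-c penalty approaches the familiar $2k$, while the BIC penalty grows as $\ln n$, which is why the option text describes BIC as the more conservative choice.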





24 changes: 19 additions & 5 deletions res/TemplateBatchFiles/MSS-selector.bf
@@ -62,6 +62,13 @@ io.ReportProgressMessageMD("mss", "data" , "* Loaded a list with **" + mss_selec
KeywordArgument ("code", "Which genetic code should be used", "Universal");
mss.genetic_code = alignments.LoadGeneticCode (None);

KeywordArgument ("ic", "Which IC should be used for scoring", "AIC-c");
mss.ic_score = io.SelectAnOption ({"AIC-c" : "Small Sample AIC score",
"BIC" : "Bayesian Information Criterion (more conservative)"}, "Which IC should be used for scoring");

mss.is_bic = mss.ic_score == "BIC";


mss.file_records = {};
mss.file_info = {};
mss.tree_info = {};
@@ -132,7 +139,7 @@ lfunction mss.store_initial_fit (node, result, arguments) {

^"mss.parameters" += (result["init"])[^"terms.parameters"];
^"mss.baselineLL" += (result["init"])[^"terms.fit.log_likelihood"];
^"mss.sample_size" += (result["init"])[^"terms.data.sample_size"];
//^"mss.sample_size" += (result["init"])[^"terms.data.sample_size"];
mss_selector.print_row [0] = result["path"];
info = (^"mss.file_records")[result["path"]];
mss_selector.print_row [1] = info [^"terms.data.sequences"];
@@ -195,6 +202,7 @@ for (mss.counter, mss_selector.path; in; mss_selector.file_list) {
mss.namespace = "mss_" + mss.counter;
mss.handle_initial_fit (mss_selector.path, mss.namespace, mss.genetic_code[terms.id], FALSE);
mss.file_records [mss_selector.path] = ^"`mss.namespace`.codon_data_info";
mss.sample_size += (^"`mss.namespace`.codon_data_info")[terms.data.sample_size];

mpi.QueueJob (mss_selector.queue, "mss.handle_initial_fit", {"0" : mss_selector.path,
"1" : mss.namespace,
@@ -219,7 +227,7 @@ for (mss.counter, mss_selector.path; in; mss_selector.file_list) {
//io.ClearProgressBar();
mpi.QueueComplete (mss_selector.queue);

mss.baseline_AIC = selection.io.getIC(mss.baselineLL, mss.parameters, mss.sample_size);
mss.baseline_AIC = mss.getIC (mss.baselineLL, mss.parameters, mss.sample_size, mss.is_bic);

io.ReportProgressMessageMD("mss", "fit0done" , "Baseline model fit");
io.ReportProgressMessageMD("mss", "fit0done", "**LogL = " + mss.baselineLL + "**" + ", **AIC-c = " + mss.baseline_AIC + "** \n");
@@ -463,7 +471,7 @@ lfunction mss.GA.fit_model (model, lfid, xp, ss) {
utility.SetEnvVariable ("AUTO_PARALLELIZE_OPTIMIZE", 1.);
Optimize (res, ^lfid);
return_expr = {1,res[1][1] + 3};
return_expr [0] = selection.io.getIC(res[1][0], xp + res[1][1] ,ss);
return_expr [0] = mss.getIC (res[1][0], xp + res[1][1] ,ss, ^"mss.is_bic");
return_expr [1] = res[1][0];

for (i = 0; i < res[1][1] + 1; i+=1) {
@@ -518,8 +526,7 @@ function mss.GA.evaluateModels (models) {
for (modelID, cAIC; in; models) {
if (cAIC == math.Infinity) {
models[modelID] = (mss.GA.fit_model ( Eval (modelID), mss.lf_id, mss.parameters, mss.sample_size))[0];
console.log (models[modelID]);
}
console.log ("\t" + Join("", Eval (modelID)) + " IC = " + Format (models[modelID], 15, 4) + ", delta = " + Format ((-(models[modelID]) + mss.bestOverall_cAIC_soFar), 8, 2)); }
}

}
@@ -672,6 +679,13 @@ lfunction mss.GA.generateNewGenerationOfModelsByMutatingModelSet(parentModels, n
}


//------------------------------------------------------------------------------------------------------------------------

lfunction mss.getIC(logl, params, samples, is_bic) {
if (is_bic) {
return -2 * logl + Log (samples) * params;
}
return -2 * logl + 2 * samples / (samples - params - 1) * params;
}


21 changes: 12 additions & 9 deletions res/TemplateBatchFiles/SelectionAnalyses/BUSTED.bf
@@ -32,7 +32,7 @@ Version 2.2 changes the grid search to LHC, and adds an initial search phase to
for branch-site variation in synonymous substitution rates. Version 3.1 adds HMM auto-correlation option for SRV, and binds SRV distributions for multiple branch sets.
Version 4.0 adds support for multiple hits, ancestral state reconstruction saved to JSON, and profiling of branch-site level support for selection / multiple hits.
",
terms.io.version : "4.0",
terms.io.version : "4.1",
terms.io.reference : "*Gene-wide identification of episodic selection*, Mol Biol Evol. 32(5):1365-71, *Synonymous Site-to-Site Substitution Rate Variation Dramatically Inflates False Positive Rates of Selection Analyses: Ignore at Your Own Peril*, Mol Biol Evol. 37(8):2430-2439",
terms.io.authors : "Sergei L Kosakovsky Pond",
terms.io.contact : "[email protected]",
@@ -174,18 +174,21 @@ busted.codon_data_info [terms.json.json] = io.PromptUserForFilePath ("Save the r

io.ReportProgressMessageMD('BUSTED', 'selector', 'Branches to test for selection in the BUSTED analysis');

utility.ForEachPair (busted.selected_branches, "_partition_", "_selection_",
"_selection_ = utility.Filter (_selection_, '_value_', '_value_ == terms.tree_attributes.test');
io.ReportProgressMessageMD('BUSTED', 'selector', '* Selected ' + Abs(_selection_) + ' branches to test in the BUSTED analysis: \\\`' + Join (', ',utility.Keys(_selection_)) + '\\\`')");
for (_partition_, _selection_; in; busted.selected_branches) {
_selection_ = utility.Filter (_selection_, '_value_', '_value_ == terms.tree_attributes.test');
io.ReportProgressMessageMD('BUSTED', 'selector', '* Selected ' + Abs(_selection_) + ' branches to test in the BUSTED analysis: \`' + Join (', ',utility.Keys(_selection_)) + '\`');
}


// check to see if there are any background branches
busted.has_background = FALSE;

utility.ForEachPair (busted.selected_branches, "_partition_", "_selection_",
"_selection_ = utility.Filter (_selection_, '_value_', '_value_ != terms.tree_attributes.test');
if (utility.Array1D (_selection_)) { busted.has_background = TRUE;} ");
busted.json[busted.json.background] = busted.has_background;
for (_partition_, _selection_; in; busted.selected_branches) {
_selection_ = utility.Filter (_selection_, '_value_', '_value_ != terms.tree_attributes.test');
if (utility.Array1D (_selection_)) { busted.has_background = TRUE; break;}
}

busted.json[busted.json.background] = busted.has_background;
selection.io.startTimer (busted.json [terms.json.timers], "Preliminary model fitting", 1);

namespace_tag = "busted";
@@ -1151,7 +1154,7 @@ lfunction busted._renormalize (v, distro, mean) {
d = Rows (m);
m = +(m[-1][0] $ m[-1][1]); // current mean
for (i = 0; i < d; i+=1) {
(v[((^"busted.distribution")["rates"])[i]])[^"terms.fit.MLE"] = (v[((^"busted.distribution")["rates"])[i]])[^"terms.fit.MLE"] / m * mean;
(v[((^distro)["rates"])[i]])[^"terms.fit.MLE"] = (v[((^distro)["rates"])[i]])[^"terms.fit.MLE"] / m * mean;
}
return v;
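For context, busted._renormalize rescales a distribution's rate MLEs so that the weighted mean matches the requested value; the one-line fix above reads the rates from the `distro` argument rather than the hard-coded `busted.distribution` name. A rough Python sketch of the rescaling, with plain lists standing in for HyPhy's rate and weight matrices:

```python
# Illustrative sketch of the renormalization step (not the HyPhy API).
def renormalize_rates(rates, weights, target_mean):
    """Scale each rate so that sum(rate_i * weight_i) equals target_mean."""
    current_mean = sum(r * w for r, w in zip(rates, weights))
    return [r / current_mean * target_mean for r in rates]

# Example: rates with weighted mean 2.0 rescaled to a mean of 1.0
print(renormalize_rates([1.0, 3.0], [0.5, 0.5], 1.0))  # [0.5, 1.5]
```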

24 changes: 18 additions & 6 deletions res/TemplateBatchFiles/SelectionAnalyses/PRIME.bf
@@ -640,10 +640,13 @@ lfunction prime.handle_a_site (lf_fel, lf_prop, filter_data, partition_index, pa
);
fel = estimators.ExtractMLEs (lf_fel, model_mapping);
fel[utility.getGlobalValue("terms.fit.log_likelihood")] = results[1][0];
//console.log ("\n" + results[1][0]);

console.log ("\nFEL = " + results[1][0]);
console.log ("alpha = " + ^"prime.site_alpha");
console.log ("beta = " + ^"prime.site_beta");

// Export (lfe, ^lf_prop);
// fprintf ("/Users/sergei/Desktop/PRIME/site." + (pattern_info["sites"])[0] + ".bf",CLEAR_FILE,lfe);
//Export (lfe, ^lf_prop);
//fprintf ("/tmp/PRIME-site." + (pattern_info["sites"])[0] + ".bf",CLEAR_FILE,lfe);

/*if (^"prime.site_beta" > 0) {
// only matters for sites with non-syn changes
@@ -726,8 +729,10 @@ lfunction prime.handle_a_site (lf_fel, lf_prop, filter_data, partition_index, pa
"OPTIMIZATION_PRECISION": 1e-4
});

Export (lfe, ^lf_prop);
fprintf ("/tmp/PRIME-site." + (pattern_info["sites"])[0] + ".bf",CLEAR_FILE,lfe);

//console.log ("\n" + ^"LF_INITIAL_GRID_MAXIMUM_VALUE" + "\n"+ ^"LF_INITIAL_GRID_MAXIMUM" + " : " + results[1][0] + "\n");
console.log ("\n" + ^"LF_INITIAL_GRID_MAXIMUM_VALUE" + "\nGrid best"+ ^"LF_INITIAL_GRID_MAXIMUM" + " / optimized " + results[1][0] + "\n");
//Optimize (results, ^lf_prop,);
//console.log ("\n" + results[1][0] + "\n");
//fprintf (stdout, ^lf_prop);
@@ -753,7 +758,7 @@ lfunction prime.handle_a_site (lf_fel, lf_prop, filter_data, partition_index, pa
"MAXIMUM_OPTIMIZATION_ITERATIONS" : 1000,
"OPTIMIZATION_PRECISION": 1e-4
});
//console.log (k + " => " + (- results[1][0] + altL));
console.log (k + " => " + (- results[1][0] + altL));
if (results[1][0] - altL > 1e-2) {
done = FALSE;
break;
@@ -779,6 +784,11 @@ lfunction prime.handle_a_site (lf_fel, lf_prop, filter_data, partition_index, pa

}
}

console.log ("\nPRIME = " + altL);
console.log ("alpha = " + ^"prime.site_alpha");
console.log ("beta = " + ^"prime.site_beta");

character_map = None;
if (^"prime.impute_states") {
DataSet anc = ReconstructAncestors ( ^lf_prop, {{0}}, MARGINAL, DOLEAVES);
@@ -804,7 +814,9 @@ lfunction prime.handle_a_site (lf_fel, lf_prop, filter_data, partition_index, pa
branch_mapping = ancestral.ComputeCompressedSubstitutionsBySite (ancestral_info,0);
DeleteObject (ancestral_info);

//console.log (branch_substitution_information);
console.log (branch_substitution_information);
console.log ("END " +(pattern_info["sites"]) + " pattern");
console.log ("----------------------------------------------");

alternative [utility.getGlobalValue("terms.fit.log_likelihood")] = altL;


1 comment on commit 6776da2

@kjlevitz
Contributor


⚠️ Performance Alert ⚠️

Possible performance regression was detected for benchmark 'Benchmark.js Benchmark'.
Benchmark result of this commit is worse than the previous benchmark result, exceeding the alert threshold of 2.

| Benchmark suite | Current: 6776da2 | Previous: 3f0750b | Ratio |
| --- | --- | --- | --- |
| BUSTED-SRV.wbf | 37.54035588257377 secs/op (±53.332121%) | 16.627315353662997 secs/op (±13.839997%) | 2.26 |
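The Ratio column is the current mean time per operation divided by the previous one; a quick check with the numbers from the table above:

```python
# Sanity check of the reported regression ratio for BUSTED-SRV.wbf (values copied from the table).
current_secs_per_op = 37.54035588257377
previous_secs_per_op = 16.627315353662997

ratio = current_secs_per_op / previous_secs_per_op
print(round(ratio, 2))   # 2.26
print(ratio > 2)         # True: exceeds the alert threshold of 2
```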

This comment was automatically generated by workflow using github-action-benchmark.

CC: @klevitz
