Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Subindexes in one file and remove frequent kmer filtering #282

Merged
merged 20 commits into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
02b810e
fix: Remove debugging messages and sequence id manager state dumping
ekg Oct 14, 2024
68c38f3
fix: Remove debugging messages from SequenceIdManager and computeMap
ekg Oct 14, 2024
396cc58
feat: Implement indexing and reading of multiple sub-indexes in a sin…
ekg Oct 14, 2024
9df76fd
fix: replace getSequenceNames() with getTargetSequenceNames()
ekg Oct 14, 2024
7a5133d
feat: add debugging statements to show index and expected sequences
ekg Oct 14, 2024
f7370b6
fix: Write only target subset sequences to sub-index header
ekg Oct 14, 2024
faff9ae
feat: fix writeIndex function call in computeMap.hpp
ekg Oct 15, 2024
9af8e11
refactor: Remove debugging output for sequence names in readSubIndexH…
ekg Oct 15, 2024
1f65013
fix: remove frequent kmer tracking and checking
ekg Oct 15, 2024
a781927
fix: Remove unused variables and functions from winSketch.hpp
ekg Oct 15, 2024
9352788
cleanup comments about frequent kmer filter
ekg Oct 15, 2024
06d0808
remove reference to kmer frequency filter
ekg Oct 15, 2024
9073ae2
feat: Add index loading for target sequence subsets
ekg Oct 15, 2024
8bfbac6
feat: Update Sketch constructor to load index directly from input stream
ekg Oct 15, 2024
2e35221
fix: remove duplicate declaration of indexStream
ekg Oct 15, 2024
3c43669
feat: Remove unused loadIndex method from winSketch.hpp
ekg Oct 15, 2024
7e8f0fa
fix: Replace `loadIndex` with `readIndex` in `winSketch.hpp`
ekg Oct 15, 2024
47991be
cleanup multi-index-making
ekg Oct 15, 2024
ffaf2e2
feat: Replace no-hg-filter flag with hg-filter and update logic
ekg Oct 17, 2024
9ed5185
fix: Revert to using --no-hg-filter while removing numerical flags
ekg Oct 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 4 additions & 11 deletions src/interface/parse_args.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ void parse_args(int argc,
args::ValueFlag<uint32_t> num_mappings_for_segments(mapping_opts, "N", "number of mappings to retain for each query/reference pair [default: 1]", {'n', "num-mappings-for-segment"});
args::ValueFlag<uint32_t> num_mappings_for_short_seq(mapping_opts, "N", "number of mappings to retain for each query/reference pair where the query sequence is shorter than segment length [default: 1]", {'S', "num-mappings-for-short-seq"});
args::ValueFlag<int> kmer_size(mapping_opts, "N", "kmer size [default: 15]", {'k', "kmer"});
args::ValueFlag<float> kmer_pct_threshold(mapping_opts, "%", "ignore the top % most-frequent kmers [default: 0.001]", {'H', "kmer-threshold"});
args::Flag lower_triangular(mapping_opts, "", "only map shorter sequences against longer", {'L', "lower-triangular"});
args::Flag skip_self(mapping_opts, "", "skip self mappings when the query and target name is the same (for all-vs-all mode)", {'X', "skip-self"});
args::Flag one_to_one(mapping_opts, "", "Perform one-to-one filtering", {'4', "one-to-one"});
Expand All @@ -94,9 +93,9 @@ void parse_args(int argc,
//ToFix: args::Flag keep_ties(mapping_opts, "", "keep all mappings with equal score even if it results in more than n mappings", {'D', "keep-ties"});
args::ValueFlag<int64_t> sketch_size(mapping_opts, "N", "sketch size for sketching.", {'w', "sketch-size"});
args::ValueFlag<double> kmer_complexity(mapping_opts, "F", "Drop segments w/ predicted kmer complexity below this cutoff. Kmer complexity defined as #kmers / (s - k + 1)", {'J', "kmer-complexity"});
args::Flag no_hg_filter(mapping_opts, "", "Don't use the hypergeometric filtering and instead use the MashMap2 first pass filtering.", {'1', "no-hg-filter"});
args::ValueFlag<double> hg_filter_ani_diff(mapping_opts, "%", "Filter out mappings unlikely to be this ANI less than the best mapping [default: 0.0]", {'2', "hg-filter-ani-diff"});
args::ValueFlag<double> hg_filter_conf(mapping_opts, "%", "Confidence value for the hypergeometric filtering [default: 99.9%]", {'3', "hg-filter-conf"});
args::Flag no_hg_filter(mapping_opts, "", "Don't use the hypergeometric filtering and instead use the MashMap2 first pass filtering.", {"no-hg-filter"});
args::ValueFlag<double> hg_filter_ani_diff(mapping_opts, "%", "Filter out mappings unlikely to be this ANI less than the best mapping [default: 0.0]", {"hg-filter-ani-diff"});
args::ValueFlag<double> hg_filter_conf(mapping_opts, "%", "Confidence value for the hypergeometric filtering [default: 99.9%]", {"hg-filter-conf"});
//args::Flag window_minimizers(mapping_opts, "", "Use window minimizers rather than world minimizers", {'U', "window-minimizers"});
//args::ValueFlag<std::string> path_high_frequency_kmers(mapping_opts, "FILE", " input file containing list of high frequency kmers", {'H', "high-freq-kmers"});
//args::ValueFlag<std::string> spaced_seed_params(mapping_opts, "spaced-seeds", "Params to generate spaced seeds <weight_of_seed> <number_of_seeds> <similarity> <region_length> e.g \"10 5 0.75 20\"", {'e', "spaced-seeds"});
Expand Down Expand Up @@ -471,12 +470,6 @@ void parse_args(int argc,
map_parameters.kmerSize = 15;
}

if (kmer_pct_threshold) {
map_parameters.kmer_pct_threshold = args::get(kmer_pct_threshold);
} else {
map_parameters.kmer_pct_threshold = 0.001; // in percent! so we keep 99.999% of kmers
}

//if (spaced_seed_params) {
//const std::string foobar = args::get(spaced_seed_params);

Expand Down Expand Up @@ -615,7 +608,7 @@ void parse_args(int argc,

map_parameters.filterLengthMismatches = true;

map_parameters.stage1_topANI_filter = !bool(no_hg_filter);
map_parameters.stage1_topANI_filter = !bool(no_hg_filter);
map_parameters.stage2_full_scan = true;

if (hg_filter_ani_diff)
Expand Down
63 changes: 30 additions & 33 deletions src/map/include/computeMap.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,17 +200,6 @@ namespace skch
p.query_list,
p.target_list))
{
std::cerr << "Initializing Map with parameters:" << std::endl;
std::cerr << "Query sequences: " << p.querySequences.size() << std::endl;
std::cerr << "Reference sequences: " << p.refSequences.size() << std::endl;
std::cerr << "Query prefix: " << (p.query_prefix.empty() ? "None" : p.query_prefix[0]) << std::endl;
std::cerr << "Target prefix: " << (p.target_prefix.empty() ? "None" : p.target_prefix) << std::endl;
std::cerr << "Prefix delimiter: '" << p.prefix_delim << "'" << std::endl;
std::cerr << "Query list: " << (p.query_list.empty() ? "None" : p.query_list) << std::endl;
std::cerr << "Target list: " << (p.target_list.empty() ? "None" : p.target_list) << std::endl;

idManager->dumpState();

if (p.stage1_topANI_filter) {
this->setProbs();
}
Expand Down Expand Up @@ -478,35 +467,43 @@ namespace skch

std::unordered_map<seqno_t, MappingResultsVector_t> combinedMappings;

// Build index for the current subset
// Open the index file once
std::ifstream indexStream;
if (!param.indexFilename.empty() && !param.create_index_only) {
indexStream.open(param.indexFilename.string(), std::ios::binary);
if (!indexStream) {
std::cerr << "Error: Unable to open index file: " << param.indexFilename << std::endl;
exit(1);
}
}

// For each subset of target sequences
uint64_t subset_count = 0;
std::cerr << "[mashmap::skch::Map::mapQuery] Number of target subsets: " << target_subsets.size() << std::endl;
for (const auto& target_subset : target_subsets) {
std::cerr << "processing subset " << subset_count << " of " << target_subsets.size() << std::endl;
std::cerr << "entries: ";
for (const auto& seqName : target_subset) {
std::cerr << seqName << " ";
}
std::cerr << std::endl;
if (target_subset.empty()) {
continue; // Skip empty subsets
}

// Build index for the current subset
if (!param.indexFilename.empty() && !param.create_index_only) {
// load index from file
std::string indexFilename = param.indexFilename.string() + "." + std::to_string(subset_count);
refSketch = new skch::Sketch(param, *idManager, target_subset, indexFilename);
} else {
refSketch = new skch::Sketch(param, *idManager, target_subset);
}

if (param.create_index_only) {
// Save the index to a file
std::string indexFilename = param.indexFilename.string() + "." + std::to_string(subset_count);
refSketch->writeIndex(indexFilename);
std::cerr << "[mashmap::skch::Map::mapQuery] Building and saving index for subset " << subset_count << " with " << target_subset.size() << " sequences" << std::endl;
refSketch = new skch::Sketch(param, *idManager, target_subset);
std::string indexFilename = param.indexFilename.string();
bool append = (subset_count != 0); // Append if not the first subset
refSketch->writeIndex(target_subset, indexFilename, append);
std::cerr << "[mashmap::skch::Map::mapQuery] Index created for subset " << subset_count
<< " and saved to " << indexFilename << std::endl;
} else {
if (!param.indexFilename.empty()) {
// Load index from file
std::cerr << "[mashmap::skch::Map::mapQuery] Loading index for subset " << subset_count << " with " << target_subset.size() << " sequences" << std::endl;
refSketch = new skch::Sketch(param, *idManager, target_subset, &indexStream);
} else {
std::cerr << "[mashmap::skch::Map::mapQuery] Building index for subset " << subset_count << " with " << target_subset.size() << " sequences" << std::endl;
refSketch = new skch::Sketch(param, *idManager, target_subset);
}
processSubset(subset_count, target_subsets.size(), total_seq_length, input_queue, merged_queue,
fragment_queue, reader_done, workers_done, fragments_done, combinedMappings);
}
Expand All @@ -517,6 +514,10 @@ namespace skch
++subset_count;
}

if (indexStream.is_open()) {
indexStream.close();
}

if (param.create_index_only) {
std::cerr << "[mashmap::skch::Map::mapQuery] All indices created successfully. Exiting." << std::endl;
exit(0);
Expand Down Expand Up @@ -1021,11 +1022,7 @@ namespace skch
const double max_hash_01 = (long double)(Q.minmerTableQuery.back().hash) / std::numeric_limits<hash_t>::max();
Q.kmerComplexity = (double(Q.minmerTableQuery.size()) / max_hash_01) / ((Q.len - param.kmerSize + 1)*2);

// TODO remove them from the original sketch instead of removing for each read
auto new_end = std::remove_if(Q.minmerTableQuery.begin(), Q.minmerTableQuery.end(), [&](auto& mi) {
return refSketch->isFreqSeed(mi.hash);
});
Q.minmerTableQuery.erase(new_end, Q.minmerTableQuery.end());
// Removed frequent kmer filtering

Q.sketchSize = Q.minmerTableQuery.size();
#ifdef DEBUG
Expand Down
1 change: 0 additions & 1 deletion src/map/include/map_parameters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ struct ales_params {
struct Parameters
{
int kmerSize; //kmer size for sketching
float kmer_pct_threshold; //use only kmers not in the top kmer_pct_threshold %-ile
offset_t segLength; //For split mapping case, this represents the fragment length
//for noSplit, it represents minimum read length to multimap
offset_t block_length; // minimum (potentially merged) block to keep if we aren't split
Expand Down
9 changes: 0 additions & 9 deletions src/map/include/parseCmdArgs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,6 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir
cmd.defineOption("kmer", "kmer size [default : 19]", ArgvParser::OptionRequiresValue);
cmd.defineOptionAlternative("kmer","k");

cmd.defineOption("kmerThreshold", "ignore the top \% most-frequent kmer window [default: 0.001]", ArgvParser::OptionRequiresValue);
cmd.defineOption("kmerComplexity", "threshold for kmer complexity [0, 1] [default : 0.0]", ArgvParser::OptionRequiresValue);


Expand Down Expand Up @@ -491,14 +490,6 @@ sequences shorter than segment length will be ignored", ArgvParser::OptionRequir
parameters.keep_low_pct_id = true;
}

if (cmd.foundOption("kmerThreshold")) {
str << cmd.optionValue("kmerThreshold");
str >> parameters.kmer_pct_threshold;
} else {
parameters.kmer_pct_threshold = 0.001; // in percent! so we keep 99.999% of kmers
}
str.clear();

if (cmd.foundOption("numMappingsForSegment")) {
uint32_t n;
str << cmd.optionValue("numMappingsForSegment");
Expand Down
29 changes: 0 additions & 29 deletions src/map/include/sequenceIds.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,35 +35,6 @@ class SequenceIdManager {
allPrefixes.insert(allPrefixes.end(), targetPrefixes.begin(), targetPrefixes.end());
populateFromFiles(queryFiles, targetFiles, queryPrefixes, targetPrefixes, prefixDelim, queryList, targetList);
buildRefGroups();
dumpState(); // Add this line to dump the state after initialization
}

// Add this method to dump the state of SequenceIdManager
void dumpState() const {
std::cerr << "SequenceIdManager State:" << std::endl;
std::cerr << "Total sequences: " << metadata.size() << std::endl;
std::cerr << "Query sequences: " << querySequenceNames.size() << std::endl;
std::cerr << "Target sequences: " << targetSequenceNames.size() << std::endl;
std::cerr << "\nSequence details:" << std::endl;
for (size_t i = 0; i < metadata.size(); ++i) {
std::cerr << "ID: " << i
<< ", Name: " << metadata[i].name
<< ", Length: " << metadata[i].len
<< ", Group: " << metadata[i].groupId
<< ", Type: " << (std::find(querySequenceNames.begin(), querySequenceNames.end(), metadata[i].name) != querySequenceNames.end() ? "Query" : "Target")
<< std::endl;
}
std::cerr << "\nGroup details:" << std::endl;
std::unordered_map<int, std::vector<std::string>> groupToSequences;
for (const auto& info : metadata) {
groupToSequences[info.groupId].push_back(info.name);
}
for (const auto& [groupId, sequences] : groupToSequences) {
std::cerr << "Group " << groupId << ": " << sequences.size() << " sequences" << std::endl;
for (const auto& seq : sequences) {
std::cerr << " " << seq << std::endl;
}
}
}

seqno_t getSequenceId(const std::string& sequenceName) const {
Expand Down
Loading
Loading