From 91646ba72c5dceaa034ecabe37c41b633e4dce4d Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Wed, 11 Jun 2025 14:01:38 -0500 Subject: [PATCH 1/3] feat: enhance k-mer frequency filtering and adjust threshold logic --- src/map/include/winSketch.hpp | 66 +++++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 15 deletions(-) diff --git a/src/map/include/winSketch.hpp b/src/map/include/winSketch.hpp index 1d76418e..d6370ef4 100644 --- a/src/map/include/winSketch.hpp +++ b/src/map/include/winSketch.hpp @@ -293,6 +293,54 @@ namespace skch } } + // print frequency map + for (const auto& [hash, freq] : kmer_freqs) { + std::cout << hash << "\t" << freq << std::endl; + } + + // Calculate count_threshold ONCE before parallel section + uint64_t min_occ = 10; + uint64_t max_occ = std::numeric_limits::max(); + uint64_t count_threshold; + + if (param.max_kmer_freq <= 1.0) { + count_threshold = std::min(max_occ, + std::max(min_occ, + (uint64_t)(total_windows * param.max_kmer_freq))); + } else { + count_threshold = std::min(max_occ, + std::max(min_occ, + (uint64_t)param.max_kmer_freq)); + } + + // Safety check to prevent filtering all k-mers + size_t would_filter = 0; + for (const auto& [hash, freq] : kmer_freqs) { + if (freq > count_threshold && freq > min_occ) { + would_filter++; + } + } + + // If we would filter too many k-mers (>70%), adjust threshold + if (would_filter > kmer_freqs.size() * 0.7) { + // Collect all frequencies and sort them + std::vector all_freqs; + all_freqs.reserve(kmer_freqs.size()); + for (const auto& [hash, freq] : kmer_freqs) { + all_freqs.push_back(freq); + } + std::sort(all_freqs.begin(), all_freqs.end()); + + // Find threshold that keeps at least 10% of k-mers + size_t keep_index = kmer_freqs.size() - (kmer_freqs.size() / 10); + count_threshold = all_freqs[keep_index]; + + std::cerr << "[wfmash::mashmap] WARNING: Adjusted k-mer frequency threshold from " + << (uint64_t)(total_windows * param.max_kmer_freq) + << " to " << count_threshold + << " to prevent filtering all k-mers" << std::endl; + } + // Parallel index building std::vector thread_pos_indexes(param.threads); std::vector thread_minmer_indexes(param.threads); @@ -301,7 +349,7 @@ namespace skch std::vector index_threads; for (size_t t = 0; t < param.threads; ++t) { - index_threads.emplace_back([&, t]() { + index_threads.emplace_back([&, t, count_threshold]() { size_t start = t * chunk_size; size_t end = std::min(start + chunk_size, threadOutputs.size()); @@ -315,20 +363,8 @@ namespace skch } uint64_t freq = freq_it->second; - uint64_t min_occ = 10; - uint64_t max_occ = std::numeric_limits::max(); - uint64_t count_threshold; - if (param.max_kmer_freq <= 1.0) { - count_threshold = std::min(max_occ, - std::max(min_occ, - (uint64_t)(total_windows * param.max_kmer_freq))); - } else { - count_threshold = std::min(max_occ, - std::max(min_occ, - (uint64_t)param.max_kmer_freq)); - } - + // Use the captured count_threshold instead of recalculating if (freq > count_threshold && freq > min_occ) { thread_filtered_kmers[t]++; continue; @@ -400,7 +436,7 @@ namespace skch std::cerr << "[wfmash::mashmap] Processed " << totalSeqProcessed << " sequences (" << totalSeqSkipped << " skipped, " << total_seq_length << " total bp), " << minmerPosLookupIndex.size() << " unique hashes, " << minmerIndex.size() << " windows" << std::endl << "[wfmash::mashmap] Filtered " << filtered_kmers << "/" << total_kmers - << " k-mers occurring > " << freq_cutoff << " times" + << " k-mers occurring > " << count_threshold << " times" << " (target: " << (param.max_kmer_freq <= 1.0 ? ([&]() { std::stringstream ss; From 9ea278522256e74c507924ed4bb00c86aa56270a Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Wed, 11 Jun 2025 14:55:16 -0500 Subject: [PATCH 2/3] fix: add --quiet option to test command for cleaner output --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 516c03c1..0adf1e6f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -274,7 +274,7 @@ add_test( add_test( NAME wfmash-mapping-coverage-with-8-yeast-genomes-to-PAF - COMMAND bash -c "${INVOKE} data/scerevisiae8.fa.gz -p 95 -n 7 -m -L -Y \\# > scerevisiae8.paf && ./scripts/test.sh data/scerevisiae8.fa.gz.fai scerevisiae8.paf 0.89" + COMMAND bash -c "${INVOKE} data/scerevisiae8.fa.gz -p 95 -n 7 -m -L -Y \\# --quiet > scerevisiae8.paf && ./scripts/test.sh data/scerevisiae8.fa.gz.fai scerevisiae8.paf 0.89" WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) add_test( From 3b16f62332151862983201d0c05942743e8e4f68 Mon Sep 17 00:00:00 2001 From: AndreaGuarracino Date: Wed, 11 Jun 2025 15:20:46 -0500 Subject: [PATCH 3/3] fix: remove --quiet option from test command for improved output visibility --- CMakeLists.txt | 2 +- scripts/test.sh | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0adf1e6f..516c03c1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -274,7 +274,7 @@ add_test( add_test( NAME wfmash-mapping-coverage-with-8-yeast-genomes-to-PAF - COMMAND bash -c "${INVOKE} data/scerevisiae8.fa.gz -p 95 -n 7 -m -L -Y \\# --quiet > scerevisiae8.paf && ./scripts/test.sh data/scerevisiae8.fa.gz.fai scerevisiae8.paf 0.89" + COMMAND bash -c "${INVOKE} data/scerevisiae8.fa.gz -p 95 -n 7 -m -L -Y \\# > scerevisiae8.paf && ./scripts/test.sh data/scerevisiae8.fa.gz.fai scerevisiae8.paf 0.89" WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) add_test( diff --git a/scripts/test.sh b/scripts/test.sh index 2cbaba0a..411ed7a6 100755 --- a/scripts/test.sh +++ b/scripts/test.sh @@ -6,13 +6,18 @@ COVERAGE=$3 PREFIX=${4:-""} # Optional prefix to filter sequences, defaults to empty string cat $FASTA_FAI | awk -v OFS='\t' '{print($1,"0",$2)}' > $PAF.sequences.bed +sed -n '11239p' $PAF.sequences.bed + cat \ <(cat $PAF | awk -v OFS='\t' '{print $1, $3, $4, "", "", $5}') \ <(cat $PAF | awk -v OFS='\t' '{print $6, $8, $9, "", "", "+"}') \ | bedtools sort | bedtools merge > $PAF.query+target.bed +sed -n '11239p' $PAF.query+target.bed echo "#seq.name" coverage | tr ' ' '\t' > $PAF.coverage.txt bedtools intersect -a $PAF.sequences.bed -b $PAF.query+target.bed -wo > $PAF.overlap.bed +sed -n '11239p' $PAF.overlap.bed + awk 'BEGIN{FS=OFS="\t"}{ if(NR==FNR){ len[$1]=$3-$2; coverage[$1]=0;