8000 Avoid filtering all k-mers by AndreaGuarracino · Pull Request #366 · waveygang/wfmash · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Avoid filtering all k-mers #366

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions scripts/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,18 @@ COVERAGE=$3
PREFIX=${4:-""} # Optional prefix to filter sequences, defaults to empty string

cat $FASTA_FAI | awk -v OFS='\t' '{print($1,"0",$2)}' > $PAF.sequences.bed
sed -n '11239p' $PAF.sequences.bed

cat \
<(cat $PAF | awk -v OFS='\t' '{print $1, $3, $4, "", "", $5}') \
<(cat $PAF | awk -v OFS='\t' '{print $6, $8, $9, "", "", "+"}') \
| bedtools sort | bedtools merge > $PAF.query+target.bed
sed -n '11239p' $PAF.query+target.bed

echo "#seq.name" coverage | tr ' ' '\t' > $PAF.coverage.txt
bedtools intersect -a $PAF.sequences.bed -b $PAF.query+target.bed -wo > $PAF.overlap.bed
sed -n '11239p' $PAF.overlap.bed

awk 'BEGIN{FS=OFS="\t"}{
if(NR==FNR){
len[$1]=$3-$2; coverage[$1]=0;
Expand Down
66 changes: 51 additions & 15 deletions src/map/include/winSketch.hpp
8000
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,54 @@ namespace skch
}
}

// print frequency map
for (const auto& [hash, freq] : kmer_freqs) {
std::cout << hash << "\t" << freq << std::endl;
}

// Calculate count_threshold ONCE before parallel section
uint64_t min_occ = 10;
uint64_t max_occ = std::numeric_limits<uint64_t>::max();
uint64_t count_threshold;

if (param.max_kmer_freq <= 1.0) {
count_threshold = std::min(max_occ,
std::max(min_occ,
(uint64_t)(total_windows * param.max_kmer_freq)));
} else {
count_threshold = std::min(max_occ,
std::max(min_occ,
(uint64_t)param.max_kmer_freq));
}

// Safety check to prevent filtering all k-mers
size_t would_filter = 0;
for (const auto& [hash, freq] : kmer_freqs) {
if (freq > count_threshold && freq > min_occ) {
would_filter++;
}
}

// If we would filter too many k-mers (>70%), adjust threshold
if (would_filter > kmer_freqs.size() * 0.7) {
// Collect all frequencies and sort them
std::vector<uint64_t> all_freqs;
all_freqs.reserve(kmer_freqs.size());
for (const auto& [hash, freq] : kmer_freqs) {
all_freqs.push_back(freq);
}
std::sort(all_freqs.begin(), all_freqs.end());

// Find threshold that keeps at least 10% of k-mers
size_t keep_index = kmer_freqs.size() - (kmer_freqs.size() / 10);
count_threshold = all_freqs[keep_index];

std::cerr << "[wfmash::mashmap] WARNING: Adjusted k-mer frequency threshold from "
<< (uint64_t)(total_windows * param.max_kmer_freq)
<< " to " << count_threshold
<< " to prevent filtering all k-mers" << std::endl;
}

// Parallel index building
std::vector<MI_Map_t> thread_pos_indexes(param.threads);
std::vector<MI_Type> thread_minmer_indexes(param.threads);
Expand All @@ -301,7 +349,7 @@ namespace skch
std::vector<std::thread> index_threads;

for (size_t t = 0; t < param.threads; ++t) {
index_threads.emplace_back([&, t]() {
index_threads.emplace_back([&, t, count_threshold]() {
size_t start = t * chunk_size;
size_t end = std::min(start + chunk_size, threadOutputs.size());

Expand All @@ -315,20 +363,8 @@ namespace skch
}

uint64_t freq = freq_it->second;
uint64_t min_occ = 10;
uint64_t max_occ = std::numeric_limits<uint64_t>::max();
uint64_t count_threshold;

if (param.max_kmer_freq <= 1.0) {
count_threshold = std::min(max_occ,
std::max(min_occ,
(uint64_t)(total_windows * param.max_kmer_freq)));
} else {
count_threshold = std::min(max_occ,
std::max(min_occ,
(uint64_t)param.max_kmer_freq));
}

// Use the captured count_threshold instead of recalculating
if (freq > count_threshold && freq > min_occ) {
thread_filtered_kmers[t]++;
continue;
Expand Down Expand Up @@ -400,7 +436,7 @@ namespace skch
std::cerr << "[wfmash::mashmap] Processed " << totalSeqProcessed << " sequences (" << totalSeqSkipped << " skipped, " << total_seq_length << " total bp), "
<< minmerPosLookupIndex.size() << " unique hashes, " << minmerIndex.size() << " windows" << std::endl
<< "[wfmash::mashmap] Filtered " << filtered_kmers << "/" << total_kmers
<< " k-mers occurring > " << freq_cutoff << " times"
<< " k-mers occurring > " << count_threshold << " times"
<< " (target: " << (param.max_kmer_freq <= 1.0 ?
([&]() {
std::stringstream ss;
Expand Down
Loading
0