Skip to content

Commit 9a67cca

Browse files
authored
Merge pull request #19 from bacpop/aa_hash
Add support for protein (aa) sequences
2 parents e79bdbd + b3896b1 commit 9a67cca

13 files changed

+5644
-343
lines changed

src/bloom_filter.rs

+6
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,12 @@ impl KmerFilter {
108108
}
109109
}
110110

111+
pub fn clear(&mut self) {
112+
self.buffer.clear();
113+
self.counts.clear();
114+
self.init();
115+
}
116+
111117
/// Add an observation of a k-mer and middle base to the filter, and return if it passed
112118
/// minimum count filtering criterion.
113119
pub fn filter(&mut self, hash: u64) -> Ordering {

src/cli.rs

+14-38
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,7 @@
11
//! Command line interface, built using [`crate::clap` with `Derive`](https://docs.rs/clap/latest/clap/_derive/_tutorial/index.html)
22
use clap::{ArgGroup, Parser, Subcommand};
33

4-
/* C++ interface
5-
6-
Usage:
7-
sketchlib sketch <files>... -o <output> [-k <kseq>|--kmer <k>] [-s <size>] [--single-strand] [--codon-phased] [--min-count <count>] [--exact-counter] [--cpus <cpus>] [--gpu <gpu>]
8-
sketchlib sketch -l <file-list> -o <output> [-k <kseq>|--kmer <k>] [-s <size>] [--single-strand] [--codon-phased] [--min-count <count>] [--exact-counter] [--cpus <cpus>] [--gpu <gpu>]
9-
sketchlib query dist <db1> [<db2>] [-o <output>] [--adj-random] [--subset <file>] [--cpus <cpus>] [--gpu <gpu>]
10-
sketchlib query jaccard <db1> [<db2>] [-o <output>] [--kmer <k>] [--adj-random] [--subset <file>] [--cpus <cpus>]
11-
sketchlib query sparse <db1> (--kNN <k>|--threshold <max>) [-o <output>] [--accessory] [--adj-random] [--subset <file>] [--cpus <cpus>] [--gpu <gpu>]
12-
sketchlib query sparse jaccard <db1> --kNN <k> --kmer <k> [-o <output>] [--adj-random] [--subset <file>] [--cpus <cpus>]
13-
sketchlib join <db1> <db2> -o <output>
14-
sketchlib (add|remove) random <db1> [--single-strand] [--cpus <cpus>]
15-
sketchlib (-h | --help)
16-
sketchlib (--version)
17-
18-
Options:
19-
-h --help Show this help.
20-
--version Show version.
21-
22-
-o <output> Output prefix.
23-
-l <file-list> File with a list of input files.
24-
--cpus <cpus> Number of CPU threads to use [default: 1].
25-
--gpu <gpu> Use GPU with specified device ID [default: -1].
26-
27-
-k <kseq> Sequence of k-mers to sketch (min,max,step) [default: 15,31,4].
28-
--kmer <k> Sketch (or distance) at a single k-mer length k.
29-
-s <size> Sketch size [default: 10000].
30-
--single-strand Ignore the reverse complement (e.g. in RNA viruses).
31-
--codon-phased Use codon phased seeds X--X--X
32-
--min-count <count> Minimum coverage count for k-mers from reads to be sketched [default: 20].
33-
--exact-counter Use an exact k-mer count filter for reads (for genomes >10Mb)
34-
35-
--adj-random Adjust query matches for their chance of occurring at random
36-
--subset <file> Only query samples matching names in file
37-
38-
--kNN <k> Use k nearest neighbours to sparsify
39-
--threshold <max> Remove distances over max to sparsify
40-
--accessory Use accessory distances rather than core to sparsify
41-
*/
4+
use super::hashing::{AaLevel, HashType, DEFAULT_LEVEL};
425

436
/// Default single strand (which is equivalent to !rc)
447
pub const DEFAULT_STRAND: bool = false;
@@ -105,6 +68,11 @@ pub enum Commands {
10568
#[arg(short, group = "input")]
10669
file_list: Option<String>,
10770

71+
/// Treat every sequence in an input file as a new sample (aa only)
72+
// TODO: for now, could be extended to dna, but probably no need
73+
#[arg(long, default_value_t = false)]
74+
concat_fasta: bool,
75+
10876
/// Output prefix
10977
#[arg(short)]
11078
output: String,
@@ -121,6 +89,14 @@ pub enum Commands {
12189
#[arg(short, long, default_value_t = DEFAULT_SKETCHSIZE)]
12290
sketch_size: u64,
12391

92+
/// Type of sequence to hash
93+
#[arg(long, value_enum, default_value_t = HashType::DNA)]
94+
seq_type: HashType,
95+
96+
/// aaHash 'level'
97+
#[arg(long, value_enum, default_value_t = DEFAULT_LEVEL)]
98+
level: AaLevel,
99+
124100
/// Ignore reverse complement (all contigs are oriented along same strand)
125101
#[arg(long, default_value_t = DEFAULT_STRAND)]
126102
single_strand: bool,

src/distances.rs

+12-6
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use crate::multisketch::MultiSketch;
88
#[inline(always)]
99
pub fn square_to_condensed(i: usize, j: usize, n: usize) -> usize {
1010
debug_assert!(j > i);
11-
return n * i - ((i * (i + 1)) >> 1) + j - 1 - i;
11+
n * i - ((i * (i + 1)) >> 1) + j - 1 - i
1212
}
1313

1414
#[inline(always)]
@@ -52,9 +52,15 @@ pub enum DistType {
5252

5353
impl fmt::Display for DistType {
5454
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
55-
match self {
56-
&DistType::CoreAcc => write!(f, "Distances: core/accessory regression"),
57-
&DistType::Jaccard(k, ani) => if ani { write!(f, "Distances: ANI at k={k}") } else { write!(f, "Distances: Jaccard distances at k={k}") },
55+
match *self {
56+
DistType::CoreAcc => write!(f, "Distances: core/accessory regression"),
57+
DistType::Jaccard(k, ani) => {
58+
if ani {
59+
write!(f, "Distances: ANI at k={k}")
60+
} else {
61+
write!(f, "Distances: Jaccard distances at k={k}")
62+
}
63+
}
5864
}
5965
}
6066
}
@@ -141,7 +147,7 @@ impl<'a> fmt::Display for DistanceMatrix<'a> {
141147
write!(f, "\t{}", self.distances[dist_idx + 1])?;
142148
dist_idx += 1;
143149
}
144-
write!(f, "\n")?;
150+
writeln!(f)?;
145151
dist_idx += 1;
146152
}
147153
}
@@ -157,7 +163,7 @@ impl<'a> fmt::Display for DistanceMatrix<'a> {
157163
write!(f, "\t{}", self.distances[dist_idx + 1])?;
158164
dist_idx += 1;
159165
}
160-
write!(f, "\n")?;
166+
writeln!(f)?;
161167
dist_idx += 1;
162168
}
163169
}

src/hashing/aahash_iterator.rs

+190
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
use needletail::parse_fastx_file;
2+
use std::cmp::Ordering;
3+
4+
use super::*;
5+
6+
#[inline(always)]
7+
pub fn valid_aa(aa: u8) -> bool {
8+
matches!(aa, b'a' | b'c'..=b'i' | b'k'..=b'n' | b'p'..=b't' | b'v' | b'w' | b'y' | b'A' | b'C'..=b'I' | b'K'..=b'N' | b'P'..=b'T' | b'V' | b'W' | b'Y' )
9+
}
10+
11+
// Split a 64-bit word into 33 and 31-bit sub-words and left-rotate them separately.
12+
// Increases period from 64 to 1023.
13+
#[inline(always)]
14+
pub fn srol(x: u64) -> u64 {
15+
let m: u64 = ((x & 0x8000000000000000) >> 30) | ((x & 0x100000000) >> 32);
16+
((x << 1) & 0xFFFFFFFDFFFFFFFF) | m
17+
}
18+
19+
/// Stores forward and (optionally) reverse complement hashes of k-mers in a nucleotide sequence
20+
#[derive(Debug, Clone)]
21+
pub struct AaHashIterator {
22+
k: usize,
23+
level: AaLevel,
24+
fh: u64,
25+
index: usize,
26+
seq: Vec<u8>,
27+
invalid_count: usize,
28+
}
29+
30+
impl RollHash for AaHashIterator {
31+
fn set_k(&mut self, k: usize) {
32+
self.k = k;
33+
if let Some(new_it) = Self::new_iterator(0, &self.level, &self.seq, k) {
34+
self.fh = new_it.0;
35+
self.index = new_it.1;
36+
} else {
37+
panic!("K-mer larger than smallest valid sequence");
38+
}
39+
}
40+
41+
/// Retrieve the current hash (minimum of forward and reverse complement hashes)
42+
fn curr_hash(&self) -> u64 {
43+
self.fh
44+
}
45+
46+
fn hash_type(&self) -> HashType {
47+
HashType::AA(self.level.clone())
48+
}
49+
50+
fn seq_len(&self) -> usize {
51+
self.seq.len()
52+
}
53+
54+
fn sketch_data(&self) -> (bool, [usize; 4], usize) {
55+
(false, [0, 0, 0, 0], self.invalid_count)
56+
}
57+
}
58+
59+
impl AaHashIterator {
60+
pub fn default(level: AaLevel) -> Self {
61+
Self {
62+
k: 0,
63+
level,
64+
fh: 0,
65+
index: 0,
66+
seq: Vec::new(),
67+
invalid_count: 0,
68+
}
69+
}
70+
71+
pub fn new(file: &str, level: AaLevel, concat_fasta: bool) -> Vec<Self> {
72+
let mut hash_vec = Vec::new();
73+
74+
// Read sequence into memory (as we go through multiple times)
75+
log::debug!("Preprocessing sequence");
76+
let mut reader =
77+
parse_fastx_file(file).unwrap_or_else(|_| panic!("Invalid path/file: {file}"));
78+
loop {
79+
let record_read = reader.next();
80+
let mut seq_hash_it = Self::default(level.clone());
81+
if let Some(record) = record_read {
82+
let seqrec = record.expect("Invalid FASTA/Q record");
83+
if seqrec.qual().is_some() {
84+
panic!("Unexpected quality information with AA sequences in {file}. Correct sequence type set?");
85+
} else {
86+
for aa in seqrec.seq().iter() {
87+
if valid_aa(*aa) {
88+
seq_hash_it.seq.push(*aa)
89+
} else {
90+
seq_hash_it.invalid_count += 1;
91+
seq_hash_it.seq.push(SEQSEP);
92+
}
93+
}
94+
}
95+
if concat_fasta {
96+
hash_vec.push(seq_hash_it);
97+
} else {
98+
seq_hash_it.seq.push(SEQSEP);
99+
}
100+
} else {
101+
if !concat_fasta {
102+
hash_vec.push(seq_hash_it);
103+
}
104+
break;
105+
}
106+
}
107+
hash_vec
108+
}
109+
110+
fn new_iterator(
111+
mut start: usize,
112+
level: &AaLevel,
113+
seq: &[u8],
114+
k: usize,
115+
) -> Option<(u64, usize)> {
116+
let mut fh = 0_u64;
117+
'outer: while start < (seq.len() - k) {
118+
'_inner: for (i, v) in seq[start..(start + k)].iter().enumerate() {
119+
// If invalid seq
120+
if !valid_aa(*v) {
121+
start += i + 1;
122+
if start >= seq.len() {
123+
return None;
124+
}
125+
fh = 0;
126+
continue 'outer; // Try again from new start
127+
}
128+
fh = srol(fh);
129+
fh ^= level.aa_seed_table(*v);
130+
}
131+
break 'outer; // success
132+
}
133+
if start >= (seq.len() - k) {
134+
return None;
135+
}
136+
137+
Some((fh, start + k))
138+
}
139+
140+
/// Move to the next k-mer by adding a new base, removing a base from the end, efficiently updating the hash.
141+
fn roll_fwd(&mut self, old_aa: u8, new_aa: u8) {
142+
self.fh = srol(self.fh);
143+
self.fh ^= self.level.aa_seed_table(new_aa);
144+
self.fh ^= self.level.aa_roll_table(old_aa, self.k);
145+
}
146+
}
147+
148+
impl Iterator for AaHashIterator {
149+
type Item = u64;
150+
151+
fn next(&mut self) -> Option<Self::Item> {
152+
match self.index.cmp(&self.seq_len()) {
153+
Ordering::Less => {
154+
let current = self.curr_hash();
155+
let new_aa = self.seq[self.index];
156+
// Restart hash if invalid base
157+
if !valid_aa(new_aa) {
158+
if let Some(new_it) =
159+
Self::new_iterator(self.index + 1, &self.level, &self.seq, self.k)
160+
{
161+
self.fh = new_it.0;
162+
self.index = new_it.1;
163+
} else {
164+
self.index = self.seq_len(); // End of valid sequence
165+
}
166+
} else {
167+
self.roll_fwd(self.seq[self.index - self.k], new_aa);
168+
self.index += 1;
169+
}
170+
Some(current)
171+
}
172+
Ordering::Equal => {
173+
// Final hash, do not roll forward further
174+
self.index += 1;
175+
Some(self.curr_hash())
176+
}
177+
Ordering::Greater => {
178+
// End of sequence
179+
None
180+
}
181+
}
182+
}
183+
184+
fn size_hint(&self) -> (usize, Option<usize>) {
185+
(self.seq_len(), Some(self.seq_len()))
186+
}
187+
}
188+
189+
// This lets you use collect etc
190+
impl ExactSizeIterator for AaHashIterator {}

0 commit comments

Comments
 (0)