Skip to content

Commit b6f6eca

Browse files
author
Johanna
committed
inverted index
1 parent 94a6745 commit b6f6eca

17 files changed

+582
-290
lines changed

sketch_test

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
18153994266624113052 0
2+
10522786856694395636 0
3+
12746864318965039004 0
4+
218427210448011872 0
5+
17989614872418695534 0
6+
12800905151731317102 0
7+
2470650427243364712 0
8+
10422581833234463478 0
9+
7851966067738306971 0
10+
15775669711182969348 0
11+
2542708038333793128 0
12+
18014948802298000 0
13+
17989546934229245854 0
14+
10594770970814812166 0

sketch_test.skd

112 Bytes
Binary file not shown.

sketch_test.skm

250 Bytes
Binary file not shown.

src/cli.rs

+10-5
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
//! Command line interface, built using [`crate::clap` with `Derive`](https://docs.rs/clap/latest/clap/_derive/_tutorial/index.html)
22
use clap::{ArgGroup, Parser, Subcommand};
33

4+
use crate::DEFAULT_KMER;
5+
46
use super::hashing::{AaLevel, HashType, DEFAULT_LEVEL};
57

68
/// Default single strand (which is equivalent to !rc)
@@ -173,18 +175,21 @@ pub enum Commands {
173175
#[arg(long, default_value_t = DEFAULT_MINQUAL)]
174176
min_qual: u8,
175177

176-
/// Treat every sequence in an input file as a new sample (aa only)
177-
// TODO: for now, could be extended to dna, but probably no need
178-
#[arg(long, default_value_t = false)]
179-
concat_fasta: bool,
180-
181178
/// Number of CPU threads
182179
#[arg(long, value_parser = valid_cpus, default_value_t = 1)]
183180
threads: usize,
184181

185182
/// aaHash 'level'
186183
#[arg(long, value_enum, default_value_t = DEFAULT_LEVEL)]
187184
level: AaLevel,
185+
186+
/// Sketch size
187+
#[arg(short, long, default_value_t = DEFAULT_SKETCHSIZE)]
188+
sketch_size: u64,
189+
190+
/// K-mer size
191+
#[arg(short, long, default_value_t = DEFAULT_KMER)]
192+
kmer_length: usize,
188193
},
189194
/// Merge two sketch files (.skm and .skd pair)
190195
Merge {

src/hashing/mod.rs

-2
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,4 @@ pub trait RollHash: Iterator<Item = u64> {
125125
fn reads(&self) -> bool {
126126
false
127127
}
128-
129-
130128
}

src/inverted.rs

+189
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
use std::sync::mpsc;
2+
3+
extern crate needletail;
4+
use hashbrown::HashMap;
5+
use indicatif::{ParallelProgressIterator, ProgressStyle};
6+
use rayon::prelude::*;
7+
use serde::{Deserialize, Serialize};
8+
9+
use super::hashing::{nthash_iterator::NtHashIterator, HashType, RollHash};
10+
use crate::hashing::aahash_iterator::AaHashIterator;
11+
use crate::io::InputFastx;
12+
use crate::sketch::*;
13+
use anyhow::Error;
14+
use std::fs::File;
15+
use std::io::{BufReader, BufWriter};
16+
17+
/// Bin bits (lowest of 64-bits to keep)
18+
pub const BBITS: u64 = 14;
19+
/// Total width of all bins (used as sign % sign_mod)
20+
pub const SIGN_MOD: u64 = (1 << 61) - 1;
21+
22+
#[derive(Serialize, Deserialize, Debug, Default, Clone, PartialEq)]
23+
pub struct Inverted {
24+
index: Vec<HashMap<u64, Vec<u32>>>,
25+
sample_names: Vec<String>,
26+
}
27+
28+
impl Inverted {
29+
pub fn new(
30+
input_files: &[InputFastx],
31+
k: &usize,
32+
sketch_size: u64,
33+
seq_type: &HashType,
34+
rc: bool,
35+
min_count: u16,
36+
min_qual: u8,
37+
) -> Self {
38+
let (sketches, sample_names) = Self::sketch_files_inverted(
39+
input_files,
40+
k,
41+
sketch_size,
42+
seq_type,
43+
rc,
44+
min_count,
45+
min_qual,
46+
);
47+
let inverted_index = Self::build_inverted_index(&sketches, sketch_size);
48+
Self {
49+
index: inverted_index,
50+
sample_names,
51+
}
52+
}
53+
54+
pub fn save(&self, file_prefix: &str) -> Result<(), Error> {
55+
let filename = format!("{}.ski", file_prefix);
56+
log::info!("Saving inverted index to {filename}");
57+
let serial_file = BufWriter::new(File::create(filename)?);
58+
let mut compress_writer = snap::write::FrameEncoder::new(serial_file);
59+
ciborium::ser::into_writer(self, &mut compress_writer)?;
60+
Ok(())
61+
}
62+
63+
pub fn load(file_prefix: &str) -> Result<Self, Error> {
64+
let filename = format!("{}.ski", file_prefix);
65+
log::info!("Loading inverted index from {filename}");
66+
let ski_file = BufReader::new(File::open(filename)?);
67+
let decompress_reader = snap::read::FrameDecoder::new(ski_file);
68+
let ski_obj: Self = ciborium::de::from_reader(decompress_reader)?;
69+
Ok(ski_obj)
70+
}
71+
72+
pub fn query_against_inverted_index(&self, query_sigs: &[u64], n_samples: usize) -> Vec<u32> {
73+
let mut match_counts = vec![0; n_samples];
74+
75+
for (bin_idx, query_bin_hash) in query_sigs.iter().enumerate() {
76+
if let Some(matching_samples) = self.index[bin_idx].get(query_bin_hash) {
77+
for &sample_idx in matching_samples {
78+
match_counts[sample_idx as usize] += 1;
79+
}
80+
}
81+
}
82+
match_counts
83+
}
84+
// // example for how query against II might work
85+
// // iterate over bins
86+
// for query_bin, inverted_index_bins in query_sigs.zip(inverted_index) { // iterating over u64, HashMap<u64, Vec<String>>
87+
// // look up bin value in the hash map
88+
// if query in inverted_index_bins {
89+
// let same_bin_samples = inverted_index_bins[query_bin]; // sample_bin_samples: Vec<String>
90+
// for sample in same_bin_samples { // for each sample String
91+
// dist_vec[sample] += 1;
92+
// }
93+
// // add one to all samples distances in the vec
94+
// for
95+
// }
96+
// }
97+
98+
fn sketch_files_inverted(
99+
input_files: &[InputFastx],
100+
k: &usize,
101+
sketch_size: u64,
102+
seq_type: &HashType,
103+
rc: bool,
104+
min_count: u16,
105+
min_qual: u8,
106+
) -> (Vec<Vec<u64>>, Vec<String>) {
107+
let (tx, rx) = mpsc::channel();
108+
let sample_names: Vec<String> = input_files
109+
.iter()
110+
.map(|(name, _, _)| name.clone())
111+
.collect();
112+
113+
let bar_style =
114+
ProgressStyle::with_template("{human_pos}/{human_len} {bar:80.cyan/blue} eta:{eta}")
115+
.unwrap();
116+
117+
rayon::scope(|s| {
118+
s.spawn(move |_| {
119+
input_files
120+
.par_iter()
121+
.enumerate()
122+
.progress_with_style(bar_style)
123+
.map(|(genome_idx, (name, fastx1, fastx2))| {
124+
let mut hash_its: Vec<Box<dyn RollHash>> = match seq_type {
125+
HashType::DNA => {
126+
NtHashIterator::new((fastx1, fastx2.as_ref()), rc, min_qual)
127+
.into_iter()
128+
.map(|it| Box::new(it) as Box<dyn RollHash>)
129+
.collect()
130+
}
131+
HashType::AA(level) => {
132+
AaHashIterator::new(fastx1, level.clone(), false)
133+
.into_iter()
134+
.map(|it| Box::new(it) as Box<dyn RollHash>)
135+
.collect()
136+
}
137+
_ => todo!(),
138+
};
139+
140+
if let Some(hash_it) = hash_its.first_mut() {
141+
if hash_it.seq_len() == 0 {
142+
panic!("Genome {} has no valid sequence", genome_idx);
143+
}
144+
(
145+
genome_idx,
146+
Sketch::get_signs(&mut **hash_it, k, sketch_size, min_count),
147+
)
148+
} else {
149+
(genome_idx, Vec::new())
150+
}
151+
})
152+
.for_each_with(tx, |tx, result| {
153+
let _ = tx.send(result);
154+
});
155+
});
156+
});
157+
158+
let mut sketch_results = Vec::new();
159+
while let Ok((genome_idx, sketch)) = rx.recv() {
160+
while sketch_results.len() <= genome_idx {
161+
sketch_results.push(Vec::new());
162+
}
163+
sketch_results[genome_idx] = sketch;
164+
}
165+
166+
(sketch_results, sample_names)
167+
}
168+
169+
fn build_inverted_index(
170+
genome_sketches: &Vec<Vec<u64>>,
171+
sketch_size: u64,
172+
) -> Vec<HashMap<u64, Vec<u32>>> {
173+
// initialize inverted index structure
174+
let mut inverted_index: Vec<HashMap<u64, Vec<u32>>> =
175+
vec![HashMap::new(); sketch_size as usize];
176+
177+
// process each sketch
178+
for (genome_idx, genome_signs) in genome_sketches.into_iter().enumerate() {
179+
for (i, hash) in genome_signs.iter().enumerate() {
180+
// add current genome to the inverted index at the current position
181+
inverted_index[i]
182+
.entry(*hash)
183+
.and_modify(|genome_list| genome_list.push(genome_idx as u32))
184+
.or_insert(vec![genome_idx as u32]);
185+
}
186+
}
187+
inverted_index
188+
}
189+
}

0 commit comments

Comments
 (0)