Skip to content

Commit c3d9647

Browse files
author
Johanna
committed
Add delete function
1 parent 90c6d70 commit c3d9647

15 files changed

+452
-9
lines changed

.github/configs/grcov.yml

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
branch: true
2+
ignore-not-existing: true
3+
llvm: true
4+
filter: covered
5+
output-type: lcov
6+
output-path: ./lcov.info
7+
excl-line: "#\\[cfg\\(test\\)\\]"
8+
excl-start: "mod tests \\{"
9+
excl-stop: "\\}"

.github/workflows/codecov.yml

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
name: Rust codecov
2+
3+
on:
4+
push:
5+
branches: [ "master" ]
6+
pull_request:
7+
branches: [ "master" ]
8+
9+
env:
10+
CARGO_TERM_COLOR: always
11+
12+
jobs:
13+
codecov:
14+
runs-on: ubuntu-latest
15+
steps:
16+
- uses: actions/checkout@v3
17+
- uses: actions-rs/toolchain@v1
18+
with:
19+
toolchain: nightly
20+
override: true
21+
components: llvm-tools-preview
22+
- name: Build
23+
run: cargo build --verbose
24+
- name: Run tests
25+
run: cargo test --verbose --no-fail-fast
26+
env:
27+
CARGO_INCREMENTAL: '0'
28+
RUSTFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests'
29+
RUSTDOCFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests'
30+
- name: rust-grcov
31+
uses: xnuter/grcov@master
32+
with:
33+
config: .github/configs/grcov.yml
34+
- name: Upload coverage to Codecov
35+
uses: codecov/codecov-action@v3
36+
with:
37+
verbose: true
38+
fail_ci_if_error: true

codecov.yml

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
coverage:
2+
precision: 2
3+
round: down
4+
range: "70...100"
5+
6+
ignore:
7+
- "tests/**/*"

scripts/poppunk_extract_distances.py

+137
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
#!/usr/bin/env python
2+
# vim: set fileencoding=<utf-8> :
3+
# Copyright 2018 John Lees and Nick Croucher
4+
5+
import pickle
6+
import sys, os
7+
import numpy as np
8+
import argparse
9+
import dendropy
10+
from scipy import sparse
11+
12+
# command line parsing
13+
def get_options():
14+
15+
parser = argparse.ArgumentParser(description='Extract tab-separated file of distances from pkl and npy files', prog='extract_distances')
16+
17+
# input options
18+
parser.add_argument('--distances', help='Prefix of input pickle (and optionally,'
19+
' numpy file) of pre-calculated distances (required)',
20+
required=True)
21+
parser.add_argument('--sparse', help='Sparse distance matrix file name',
22+
default = None,
23+
required = False)
24+
parser.add_argument('--tree', help='Newick file containing phylogeny of isolates',
25+
required = False,
26+
default = None)
27+
parser.add_argument('--output', help='Name of output file',
28+
required = True)
29+
30+
return parser.parse_args()
31+
32+
def listDistInts(refSeqs, querySeqs, self=True):
33+
"""Gets the ref and query ID for each row of the distance matrix
34+
35+
Returns an iterable with ref and query ID pairs by row.
36+
37+
Args:
38+
refSeqs (list)
39+
List of reference sequence names.
40+
querySeqs (list)
41+
List of query sequence names.
42+
self (bool)
43+
Whether a self-comparison, used when constructing a database.
44+
Requires refSeqs == querySeqs
45+
Default is True
46+
Returns:
47+
ref, query (str, str)
48+
Iterable of tuples with ref and query names for each distMat row.
49+
"""
50+
num_ref = len(refSeqs)
51+
num_query = len(querySeqs)
52+
if self:
53+
if refSeqs != querySeqs:
54+
raise RuntimeError('refSeqs must equal querySeqs for db building (self = true)')
55+
for i in range(num_ref):
56+
for j in range(i + 1, num_ref):
57+
yield(j, i)
58+
else:
59+
comparisons = [(0,0)] * (len(refSeqs) * len(querySeqs))
60+
for i in range(num_query):
61+
for j in range(num_ref):
62+
yield(j, i)
63+
64+
def isolateNameToLabel(names):
65+
"""Function to process isolate names to labels
66+
appropriate for visualisation.
67+
68+
Args:
69+
names (list)
70+
List of isolate names.
71+
Returns:
72+
labels (list)
73+
List of isolate labels.
74+
"""
75+
# useful to have as a function in case we
76+
# want to remove certain characters
77+
labels = [os.path.splitext(os.path.basename(name))[0] for name in names]
78+
return labels
79+
80+
# main code
81+
if __name__ == "__main__":
82+
83+
# Check input ok
84+
args = get_options()
85+
86+
# open stored distances
87+
with open(args.distances + ".pkl", 'rb') as pickle_file:
88+
rlist, qlist, self = pickle.load(pickle_file)
89+
90+
# get names order
91+
r_names = isolateNameToLabel(rlist)
92+
q_names = isolateNameToLabel(qlist)
93+
94+
# parse distances from tree, if supplied
95+
if args.tree is not None:
96+
# only calculate if all v all
97+
assert r_names == q_names, 'Using a phylogeny requires an all-v-all distance matrix'
98+
# load tree
99+
tree = dendropy.Tree.get(path = args.tree, schema = 'newick')
100+
# calculate distance matrix
101+
pdc = tree.phylogenetic_distance_matrix()
102+
# dict for identifying nodes from names
103+
tip_index = {}
104+
for t in tree.taxon_namespace:
105+
taxon_name = t.label.replace(' ','_')
106+
tip_index[r_names.index(taxon_name)] = t
107+
108+
# Load sparse matrix
109+
if args.sparse is not None:
110+
sparse_mat = sparse.load_npz(args.sparse)
111+
else:
112+
X = np.load(args.distances + ".npy")
113+
114+
# open output file
115+
with open(args.output, 'w') as oFile:
116+
# Write header of output file
117+
if args.sparse is not None:
118+
oFile.write("\t".join(['Query', 'Reference', 'Core']))
119+
else:
120+
oFile.write("\t".join(['Query', 'Reference', 'Core', 'Accessory']))
121+
if args.tree is not None:
122+
oFile.write("\t" + 'Patristic')
123+
oFile.write("\n")
124+
# Write distances
125+
if args.sparse is not None:
126+
for (r_index, q_index, dist) in zip(sparse_mat.col, sparse_mat.row, sparse_mat.data):
127+
oFile.write("\t".join([q_names[q_index], r_names[r_index], str(dist)]))
128+
if args.tree is not None:
129+
oFile.write("\t" + str(pdc(tip_index[r_index], tip_index[q_index])))
130+
oFile.write("\n")
131+
else:
132+
for i, (r_index, q_index) in enumerate(listDistInts(r_names, q_names, r_names == q_names)):
133+
oFile.write("\t".join([q_names[q_index], r_names[r_index], str(X[i,0]), str(X[i,1])]))
134+
if args.tree is not None:
135+
oFile.write("\t" + str(pdc(tip_index[r_index], tip_index[q_index])))
136+
oFile.write("\n")
137+

src/cli.rs

+16
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,22 @@ pub enum Commands {
204204
#[arg(long, value_enum, default_value_t = DEFAULT_LEVEL)]
205205
level: AaLevel,
206206
},
207+
208+
/// Delete genome(s) from a database (input: one id per line)
209+
Delete {
210+
/// Sketching database basename (so without .skm or .skd)
211+
#[arg(required = true)]
212+
db: String,
213+
214+
/// Input file with IDs to delete (one ID per line)
215+
#[arg(required = true)]
216+
genome_ids: String,
217+
218+
/// output file name
219+
#[arg(required = true)]
220+
output_file: String,
221+
},
222+
207223
/// Print information about a .skm file
208224
Info {
209225
/// Sketch metadata file (.skm) to describe

src/lib.rs

+36
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ pub mod utils;
4242
use std::fs::{File, OpenOptions};
4343
use std::io::copy;
4444

45+
use std::io::BufRead;
46+
use std::path::Path;
47+
4548
/// Default k-mer size for (genome) sketching
4649
pub const DEFAULT_KMER: usize = 17;
4750
/// Chunk size in parallel distance calculations
@@ -481,6 +484,39 @@ pub fn main() -> Result<(), Error> {
481484
Ok(())
482485
}
483486

487+
Commands::Delete {
488+
db,
489+
genome_ids,
490+
output_file,
491+
} => {
492+
let ref_db = utils::strip_sketch_extension(db);
493+
494+
log::info!("Reading input genomes");
495+
let path = Path::new(genome_ids);
496+
let file = File::open(&path)?;
497+
let reader = std::io::BufReader::new(file);
498+
499+
// Read in genome IDs to delete, one per line
500+
let ids: Vec<String> = reader.lines().filter_map(|line| line.ok()).collect();
501+
502+
log::info!("Reading input metadata");
503+
let mut sketches: MultiSketch = MultiSketch::load(ref_db)
504+
.unwrap_or_else(|_| panic!("Could not read sketch metadata from {}.skm", ref_db));
505+
506+
507+
println!("BLUB");
508+
// write new .skm
509+
sketches.remove_metadata(ref_db, output_file, &ids);
510+
511+
// remove samples from .skd file
512+
log::info!("Removing genomes and writing output");
513+
sketches.remove_genomes(ref_db, output_file, &ids)?;
514+
515+
log::info!("Finished writing filtered sketch data to {}", output_file);
516+
517+
Ok(())
518+
}
519+
484520
Commands::Info {
485521
skm_file,
486522
sample_info,

src/multisketch.rs

+82-6
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
use core::panic;
21
use anyhow::Error;
2+
use core::panic;
33
use std::fmt;
44
use std::fs::File;
55
use std::io::{BufReader, BufWriter};
@@ -147,11 +147,6 @@ impl MultiSketch {
147147
s1_slice
148148
}
149149

150-
pub fn remove_sketches(&self, ids: &[String]) {
151-
// TODO: remove sketch bins which belong to the duplicate ids
152-
todo!();
153-
}
154-
155150
pub fn is_compatible_with(&self, sketch2: &Self) -> bool {
156151
self.kmer_lengths() == sketch2.kmer_lengths()
157152
&& self.sketch_size == sketch2.sketch_size
@@ -196,6 +191,87 @@ impl MultiSketch {
196191

197192
self
198193
}
194+
pub fn remove_metadata(
195+
&mut self,
196+
input_prefix: &str,
197+
output_file_name: &str,
198+
genome_ids_to_remove: &[String],
199+
) -> std::io::Result<()> {
200+
201+
println!("{}", self);
202+
let mut new_sketch_metadata: Vec<Sketch> = Vec::with_capacity(self.sketch_metadata.len());
203+
204+
for sketch in &self.sketch_metadata {
205+
if !genome_ids_to_remove.contains(&(*sketch.name()).to_string()) {
206+
new_sketch_metadata.push(sketch.clone());
207+
}
208+
}
209+
self.sketch_metadata = new_sketch_metadata;
210+
self.save_metadata(output_file_name);
211+
Ok(())
212+
}
213+
214+
pub fn remove_genomes(
215+
&mut self,
216+
input_prefix: &str,
217+
output_file: &str,
218+
genome_ids_to_remove: &[String],
219+
) -> std::io::Result<()> {
220+
// Check if all genome IDs to remove exist and get their positions
221+
let mut positions_to_remove = Vec::new();
222+
let mut missing_ids = Vec::new();
223+
224+
for id in genome_ids_to_remove {
225+
println!("{}",id);
226+
if let Some(&position) = self.name_map.get(id) {
227+
positions_to_remove.push(position);
228+
} else {
229+
missing_ids.push(id);
230+
}
231+
}
232+
if !missing_ids.is_empty() {
233+
panic!("The following genome IDs were not found: {:?}", missing_ids);
234+
}
235+
236+
// Create a list of indices to keep
237+
let indices_to_keep: Vec<usize> = (0..self.sketch_metadata.len())
238+
.filter(|&idx| !positions_to_remove.contains(&idx))
239+
.collect();
240+
241+
let input_filename = format!("{}.skd", input_prefix);
242+
let output_filename = format!("{}.skd", output_file);
243+
SketchArrayFile::write_batch(
244+
&input_filename,
245+
&output_filename,
246+
&indices_to_keep,
247+
self.sample_stride,
248+
)
249+
.unwrap_or_else(|e| {
250+
eprintln!("Error during batch write: {}", e);
251+
std::process::exit(1);
252+
});
253+
println!("Output sketch data written to: {output_filename}",);
254+
255+
Ok(())
256+
}
257+
258+
// pub fn get_genome_positions(&self, genome_ids: &[String], positions: &mut Vec<usize>) {
259+
// let mut missing_ids = Vec::new();
260+
261+
// for id in genome_ids {
262+
// if let Some(&position) = self.name_map.get(id) {
263+
// positions.push(position);
264+
// } else {
265+
// missing_ids.push(id.clone());
266+
// }
267+
// }
268+
269+
// if !missing_ids.is_empty() {
270+
// panic!("The following genome IDs were not found: {:?}", missing_ids);
271+
// }
272+
273+
// positions.sort();
274+
// }
199275

200276
// This function is called when sketches are merged, not when they are
201277
// first sketched (this is handled by sketch::sketch_files())

0 commit comments

Comments
 (0)