Skip to content

Commit c3d9647

Browse files
author
Johanna
committed
Add delete function
1 parent 90c6d70 commit c3d9647

15 files changed

+452
-9
lines changed

.github/configs/grcov.yml

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
branch: true
2+
ignore-not-existing: true
3+
llvm: true
4+
filter: covered
5+
output-type: lcov
6+
output-path: ./lcov.info
7+
excl-line: "#\\[cfg\\(test\\)\\]"
8+
excl-start: "mod tests \\{"
9+
excl-stop: "\\}"

.github/workflows/codecov.yml

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
name: Rust codecov
2+
3+
on:
4+
push:
5+
branches: [ "master" ]
6+
pull_request:
7+
branches: [ "master" ]
8+
9+
env:
10+
CARGO_TERM_COLOR: always
11+
12+
jobs:
13+
codecov:
14+
runs-on: ubuntu-latest
15+
steps:
16+
- uses: actions/checkout@v3
17+
- uses: actions-rs/toolchain@v1
18+
with:
19+
toolchain: nightly
20+
override: true
21+
components: llvm-tools-preview
22+
- name: Build
23+
run: cargo build --verbose
24+
- name: Run tests
25+
run: cargo test --verbose --no-fail-fast
26+
env:
27+
CARGO_INCREMENTAL: '0'
28+
RUSTFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests'
29+
RUSTDOCFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests'
30+
- name: rust-grcov
31+
uses: xnuter/grcov@master
32+
with:
33+
config: .github/configs/grcov.yml
34+
- name: Upload coverage to Codecov
35+
uses: codecov/codecov-action@v3
36+
with:
37+
verbose: true
38+
fail_ci_if_error: true

codecov.yml

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
coverage:
2+
precision: 2
3+
round: down
4+
range: "70...100"
5+
6+
ignore:
7+
- "tests/**/*"

scripts/poppunk_extract_distances.py

+137
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
#!/usr/bin/env python
2+
# vim: set fileencoding=<utf-8> :
3+
# Copyright 2018 John Lees and Nick Croucher
4+
5+
import pickle
6+
import sys, os
7+
import numpy as np
8+
import argparse
9+
import dendropy
10+
from scipy import sparse
11+
12+
# command line parsing
13+
def get_options():
14+
15+
parser = argparse.ArgumentParser(description='Extract tab-separated file of distances from pkl and npy files', prog='extract_distances')
16+
17+
# input options
18+
parser.add_argument('--distances', help='Prefix of input pickle (and optionally,'
19+
' numpy file) of pre-calculated distances (required)',
20+
required=True)
21+
parser.add_argument('--sparse', help='Sparse distance matrix file name',
22+
default = None,
23+
required = False)
24+
parser.add_argument('--tree', help='Newick file containing phylogeny of isolates',
25+
required = False,
26+
default = None)
27+
parser.add_argument('--output', help='Name of output file',
28+
required = True)
29+
30+
return parser.parse_args()
31+
32+
def listDistInts(refSeqs, querySeqs, self=True):
33+
"""Gets the ref and query ID for each row of the distance matrix
34+
35+
Returns an iterable with ref and query ID pairs by row.
36+
37+
Args:
38+
refSeqs (list)
39+
List of reference sequence names.
40+
querySeqs (list)
41+
List of query sequence names.
42+
self (bool)
43+
Whether a self-comparison, used when constructing a database.
44+
Requires refSeqs == querySeqs
45+
Default is True
46+
Returns:
47+
ref, query (str, str)
48+
Iterable of tuples with ref and query names for each distMat row.
49+
"""
50+
num_ref = len(refSeqs)
51+
num_query = len(querySeqs)
52+
if self:
53+
if refSeqs != querySeqs:
54+
raise RuntimeError('refSeqs must equal querySeqs for db building (self = true)')
55+
for i in range(num_ref):
56+
for j in range(i + 1, num_ref):
57+
yield(j, i)
58+
else:
59+
comparisons = [(0,0)] * (len(refSeqs) * len(querySeqs))
60+
for i in range(num_query):
61+
for j in range(num_ref):
62+
yield(j, i)
63+
64+
def isolateNameToLabel(names):
65+
"""Function to process isolate names to labels
66+
appropriate for visualisation.
67+
68+
Args:
69+
names (list)
70+
List of isolate names.
71+
Returns:
72+
labels (list)
73+
List of isolate labels.
74+
"""
75+
# useful to have as a function in case we
76+
# want to remove certain characters
77+
labels = [os.path.splitext(os.path.basename(name))[0] for name in names]
78+
return labels
79+
80+
# main code
81+
if __name__ == "__main__":
82+
83+
# Check input ok
84+
args = get_options()
85+
86+
# open stored distances
87+
with open(args.distances + ".pkl", 'rb') as pickle_file:
88+
rlist, qlist, self = pickle.load(pickle_file)
89+
90+
# get names order
91+
r_names = isolateNameToLabel(rlist)
92+
q_names = isolateNameToLabel(qlist)
93+
94+
# parse distances from tree, if supplied
95+
if args.tree is not None:
96+
# only calculate if all v all
97+
assert r_names == q_names, 'Using a phylogeny requires an all-v-all distance matrix'
98+
# load tree
99+
tree = dendropy.Tree.get(path = args.tree, schema = 'newick')
100+
# calculate distance matrix
101+
pdc = tree.phylogenetic_distance_matrix()
102+
# dict for identifying nodes from names
103+
tip_index = {}
104+
for t in tree.taxon_namespace:
105+
taxon_name = t.label.replace(' ','_')
106+
tip_index[r_names.index(taxon_name)] = t
107+
108+
# Load sparse matrix
109+
if args.sparse is not None:
110+
sparse_mat = sparse.load_npz(args.sparse)
111+
else:
112+
X = np.load(args.distances + ".npy")
113+
114+
# open output file
115+
with open(args.output, 'w') as oFile:
116+
# Write header of output file
117+
if args.sparse is not None:
118+
oFile.write("\t".join(['Query', 'Reference', 'Core']))
119+
else:
120+
oFile.write("\t".join(['Query', 'Reference', 'Core', 'Accessory']))
121+
if args.tree is not None:
122+
oFile.write("\t" + 'Patristic')
123+
oFile.write("\n")
124+
# Write distances
125+
if args.sparse is not None:
126+
for (r_index, q_index, dist) in zip(sparse_mat.col, sparse_mat.row, sparse_mat.data):
127+
oFile.write("\t".join([q_names[q_index], r_names[r_index], str(dist)]))
128+
if args.tree is not None:
129+
oFile.write("\t" + str(pdc(tip_index[r_index], tip_index[q_index])))
130+
oFile.write("\n")
131+
else:
132+
for i, (r_index, q_index) in enumerate(listDistInts(r_names, q_names, r_names == q_names)):
133+
oFile.write("\t".join([q_names[q_index], r_names[r_index], str(X[i,0]), str(X[i,1])]))
134+
if args.tree is not None:
135+
oFile.write("\t" + str(pdc(tip_index[r_index], tip_index[q_index])))
136+
oFile.write("\n")
137+

src/cli.rs

+16
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,22 @@ pub enum Commands {
204204
#[arg(long, value_enum, default_value_t = DEFAULT_LEVEL)]
205205
level: AaLevel,
206206
},
207+
208+
/// Delete genome(s) from a database (input: one id per line)
209+
Delete {
210+
/// Sketching database basename (so without .skm or .skd)
211+
#[arg(required = true)]
212+
db: String,
213+
214+
/// Input file with IDs to delete (one ID per line)
215+
#[arg(required = true)]
216+
genome_ids: String,
217+
218+
/// output file name
219+
#[arg(required = true)]
220+
output_file: String,
221+
},
222+
207223
/// Print information about a .skm file
208224
Info {
209225
/// Sketch metadata file (.skm) to describe

src/lib.rs

+36
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ pub mod utils;
4242
use std::fs::{File, OpenOptions};
4343
use std::io::copy;
4444

45+
use std::io::BufRead;
46+
use std::path::Path;
47+
4548
/// Default k-mer size for (genome) sketching
4649
pub const DEFAULT_KMER: usize = 17;
4750
/// Chunk size in parallel distance calculations
@@ -481,6 +484,39 @@ pub fn main() -> Result<(), Error> {
481484
Ok(())
482485
}
483486

487+
Commands::Delete {
488+
db,
489+
genome_ids,
490+
output_file,
491+
} => {
492+
let ref_db = utils::strip_sketch_extension(db);
493+
494+
log::info!("Reading input genomes");
495+
let path = Path::new(genome_ids);
496+
let file = File::open(&path)?;
497+
let reader = std::io::BufReader::new(file);
498+
499+
// Read in genome IDs to delete, one per line
500+
let ids: Vec<String> = reader.lines().filter_map(|line| line.ok()).collect();
501+
502+
log::info!("Reading input metadata");
503+
let mut sketches: MultiSketch = MultiSketch::load(ref_db)
504+
.unwrap_or_else(|_| panic!("Could not read sketch metadata from {}.skm", ref_db));
505+
506+
507+
println!("BLUB");
508+
// write new .skm
509+
sketches.remove_metadata(ref_db, output_file, &ids);
510+
511+
// remove samples from .skd file
512+
log::info!("Removing genomes and writing output");
513+
sketches.remove_genomes(ref_db, output_file, &ids)?;
514+
515+
log::info!("Finished writing filtered sketch data to {}", output_file);
516+
517+
Ok(())
518+
}
519+
484520
Commands::Info {
485521
skm_file,
486522
sample_info,

src/multisketch.rs

+82-6
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
use core::panic;
21
use anyhow::Error;
2+
use core::panic;
33
use std::fmt;
44
use std::fs::File;
55
use std::io::{BufReader, BufWriter};
@@ -147,11 +147,6 @@ impl MultiSketch {
147147
s1_slice
148148
}
149149

150-
pub fn remove_sketches(&self, ids: &[String]) {
151-
// TODO: remove sketch bins which belong to the duplicate ids
152-
todo!();
153-
}
154-
155150
pub fn is_compatible_with(&self, sketch2: &Self) -> bool {
156151
self.kmer_lengths() == sketch2.kmer_lengths()
157152
&& self.sketch_size == sketch2.sketch_size
@@ -196,6 +191,87 @@ impl MultiSketch {
196191

197192
self
198193
}
194+
pub fn remove_metadata(
195+
&mut self,
196+
input_prefix: &str,
197+
output_file_name: &str,
198+
genome_ids_to_remove: &[String],
199+
) -> std::io::Result<()> {
200+
201+
println!("{}", self);
202+
let mut new_sketch_metadata: Vec<Sketch> = Vec::with_capacity(self.sketch_metadata.len());
203+
204+
for sketch in &self.sketch_metadata {
205+
if !genome_ids_to_remove.contains(&(*sketch.name()).to_string()) {
206+
new_sketch_metadata.push(sketch.clone());
207+
}
208+
}
209+
self.sketch_metadata = new_sketch_metadata;
210+
self.save_metadata(output_file_name);
211+
Ok(())
212+
}
213+
214+
pub fn remove_genomes(
215+
&mut self,
216+
input_prefix: &str,
217+
output_file: &str,
218+
genome_ids_to_remove: &[String],
219+
) -> std::io::Result<()> {
220+
// Check if all genome IDs to remove exist and get their positions
221+
let mut positions_to_remove = Vec::new();
222+
let mut missing_ids = Vec::new();
223+
224+
for id in genome_ids_to_remove {
225+
println!("{}",id);
226+
if let Some(&position) = self.name_map.get(id) {
227+
positions_to_remove.push(position);
228+
} else {
229+
missing_ids.push(id);
230+
}
231+
}
232+
if !missing_ids.is_empty() {
233+
panic!("The following genome IDs were not found: {:?}", missing_ids);
234+
}
235+
236+
// Create a list of indices to keep
237+
let indices_to_keep: Vec<usize> = (0..self.sketch_metadata.len())
238+
.filter(|&idx| !positions_to_remove.contains(&idx))
239+
.collect();
240+
241+
let input_filename = format!("{}.skd", input_prefix);
242+
let output_filename = format!("{}.skd", output_file);
243+
SketchArrayFile::write_batch(
244+
&input_filename,
245+
&output_filename,
246+
&indices_to_keep,
247+
self.sample_stride,
248+
)
249+
.unwrap_or_else(|e| {
250+
eprintln!("Error during batch write: {}", e);
251+
std::process::exit(1);
252+
});
253+
println!("Output sketch data written to: {output_filename}",);
254+
255+
Ok(())
256+
}
257+
258+
// pub fn get_genome_positions(&self, genome_ids: &[String], positions: &mut Vec<usize>) {
259+
// let mut missing_ids = Vec::new();
260+
261+
// for id in genome_ids {
262+
// if let Some(&position) = self.name_map.get(id) {
263+
// positions.push(position);
264+
// } else {
265+
// missing_ids.push(id.clone());
266+
// }
267+
// }
268+
269+
// if !missing_ids.is_empty() {
270+
// panic!("The following genome IDs were not found: {:?}", missing_ids);
271+
// }
272+
273+
// positions.sort();
274+
// }
199275

200276
// This function is called when sketches are merged, not when they are
201277
// first sketched (this is handled by sketch::sketch_files())

0 commit comments

Comments
 (0)