Merge pull request #34 from bacpop/release

johnlees · web-flow · commit 948d24071515 · 2025-01-27T12:40:20.000Z
Prepare for first release
diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml
@@ -17,29 +17,28 @@ jobs:
     - uses: actions/checkout@v3
     - uses: actions-rs/toolchain@v1
       with:
-        toolchain: nightly
+        toolchain: stable
         override: true
         components: llvm-tools-preview  # Required for grcov
 
     - name: Build
       run: cargo build --verbose
 
-    - name: Run tests
-      run: cargo test --verbose --no-fail-fast
+    - name: Install cargo-llvm-cov and run tests
+      run: cargo install cargo-llvm-cov && cargo llvm-cov --lcov --output-path=./lcov.info
       env:
         CARGO_INCREMENTAL: '0'
-        RUSTFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests'
-        RUSTDOCFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests'
+        RUSTFLAGS: '-Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cinstrument-coverage'
+        RUSTDOCFLAGS: '-Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cinstrument-coverage'
 
-    - name: Run grcov
-      run: |
-        cargo install grcov
-        grcov . -s . --binary-path ./target/debug/ -t lcov --branch --ignore-not-existing --ignore "/*" -o lcov.info
-
-    - name: Upload coverage to Codecov
-      uses: codecov/codecov-action@v3
+    - name: Codecov
+      # Uploads the lcov report to codecov.io. You may pin to the exact commit or the version.
+      uses: codecov/codecov-action@v5.1.2
       with:
+        # Repository upload token - get it from codecov.io. Required only for private repositories
         token: ${{ secrets.CODECOV_TOKEN }}
-        files: lcov.info
-        fail_ci_if_error: true
+        files: ./lcov.info  # `files` is the supported input in v5; `file` is deprecated
+        # Specify whether the Codecov output should be verbose
         verbose: true
+        fail_ci_if_error: true
+
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -0,0 +1,93 @@
+name: Make release
+
+on:
+  push:
+    tags:
+      - "v*.*.*"  # runs only when a tag matching this glob is pushed, e.g. v0.1.1
+
+env:
+  CARGO_TERM_COLOR: always
+  GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}  # NOTE(review): no step in this workflow references GITHUB_PAT explicitly - confirm it is needed
+
+jobs:
+
+  build-binaries:  # builds the sketchlib binary and uploads one tarball per matrix entry
+    runs-on: ${{ matrix.config.os }}
+
+    name: Release ${{ matrix.config.os }} (${{ matrix.config.toolchain }})
+
+    strategy:
+      fail-fast: false  # do not cancel the other matrix entry when one build fails
+      matrix:
+        config:  # os/toolchain values are hard-coded in create-release's "Organise files" step; keep in sync
+          - {os: macOS-latest,   toolchain: 'stable'}
+          - {os: ubuntu-latest,  toolchain: 'stable'}
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Install rust toolchain
+        uses: actions-rs/toolchain@v1
+        with:
+            toolchain: ${{ matrix.config.toolchain }}
+            override: true
+
+      # NB see https://github.com/actions-rs/cargo if we ever want to try cross
+      # e.g. for Mac M1/arm64
+      - name: Build and package binary
+        shell: bash
+        run: |
+          cargo install --path .
+          cp $HOME/.cargo/bin/sketchlib .
+          tar czvf sketchlib-${{ github.ref_name }}-${{ matrix.config.os }}-${{ matrix.config.toolchain }}.tar.gz sketchlib LICENSE NOTICE README.md
+
+      - name: Upload package
+        if: success()
+        uses: actions/upload-artifact@v4
+        with:
+          name: sketchlib-${{ github.ref_name }}-${{ matrix.config.os }}-${{ matrix.config.toolchain }}  # create-release copies from build/<this name>/
+          path: sketchlib-${{ github.ref_name }}-${{ matrix.config.os }}-${{ matrix.config.toolchain }}.tar.gz
+
+  create-release:  # collects the tarballs built by build-binaries and publishes the GitHub release
+    runs-on: ubuntu-latest
+
+    needs: build-binaries
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/download-artifact@v4
+        with:
+          path: build  # the copy step below expects each artifact under build/<artifact name>/
+
+      - name: Organise files
+        shell: bash
+        run: |
+          cp build/sketchlib-${{ github.ref_name }}-macOS-latest-stable/sketchlib-${{ github.ref_name }}-macOS-latest-stable.tar.gz .
+          cp build/sketchlib-${{ github.ref_name }}-ubuntu-latest-stable/sketchlib-${{ github.ref_name }}-ubuntu-latest-stable.tar.gz .
+
+      - name: Create release
+        id: create_release
+        uses: softprops/action-gh-release@v1
+        with:
+          name: Release ${{ github.ref_name }}
+          draft: false
+          prerelease: false
+          fail_on_unmatched_files: true
+          generate_release_notes: true
+          files: |
+            sketchlib-*.tar.gz
+
+  push_crate:  # publishes the crate to the registry using CARGO_REGISTRY_TOKEN
+    runs-on: ubuntu-latest
+    needs: build-binaries  # a crates.io publish cannot be undone, so wait until both platform builds pass
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions-rs/toolchain@v1
+        with:
+            toolchain: stable
+            override: true
+      - uses: katyo/publish-crates@v1
+        if: startsWith(github.ref, 'refs/tags/')
+        with:
+            registry-token: ${{ secrets.CARGO_REGISTRY_TOKEN }}
diff --git a/.github/workflows/version.yml b/.github/workflows/version.yml
@@ -1,7 +1,7 @@
 on:
   pull_request:
     branches:
-      - master
+      - main
 
 # This runs on PRs so error can be seen before merging
 name: Version check
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,9 +1,11 @@
 [package]
 name = "sketchlib"
-version = "0.1.0"
+version = "0.1.1"
 authors = [
     "John Lees <jlees@ebi.ac.uk>",
-    "Nicholas Croucher <n.croucher@imperial.ac.uk>"
+    "Nicholas Croucher <n.croucher@imperial.ac.uk>",
+    "Johanna von Wachsmann <wachsmannj@ebi.ac.uk>",
+    "Victor Rodriguez Bouza <vrbouza@ebi.ac.uk>",
 ]
 edition = "2021"
 description = "Genome and amino-acid sketching"
@@ -65,4 +67,4 @@ assert_fs = "1.0.10"
 pretty_assertions = "1.3.0"
 
 [profile.release]
-debug = true
+lto = true
diff --git a/README.md b/README.md
@@ -9,7 +9,64 @@
 ## Description
 
 This is a reimplementation of [pp-sketchlib](https://github.com/bacpop/pp-sketchlib)
-in the rust language.
+in the rust language. This version is optimised for larger sample numbers, particularly
+allowing subsets of samples to be compared.
+
+Sketch databases have two files: `.skm` which is the metadata (sample names, base counts etc.)
+and `.skd` which is the actual sketch data.
+
+## Usage
+With all options we typically recommend using `-v` to see all progress during the run.
+
+### Sketching
+
+Using input fasta/fastq files, create a sketch database. Run `sketchlib sketch -h` to see the help.
+
+- List .fasta files on the command line, or use `-f` to provide a file(s). From file,
+these are one line per sample listing the name and fasta file, or name and two read files
+(fastq). Inputs can be gzipped or not, this is automatically detected.
+- To set the k-mer size in the sketch database you can either give a list of sizes with `--k-vals`
+or a sequence `--k-seq` with start,stop,step. e.g. `--k-seq 17,29,4` would sketch at k=17, 21, 25 and 29.
+- Set the sketch size with `-s`. Typically 1000 is enough for species level resolution, 10000 for within-species/strain
+resolution and 100000-1000000 for SNP level resolution.
+- To sketch amino acid sequences use `--seq-type aa --concat-fasta` if you have the typical case
+of each fasta file being a multifasta with many aa sequences. Each one will then be its own sample.
+- You can also sketch structures with .pdb input, see 'Enabling PDB->3Di' below. This is experimental.
+
+### Distances
+
+To compute internal all-vs-all core and accessory distances use:
+```
+sketchlib dist db_name
+```
+Note the database names can be the prefix, or the full path to the .skm file. The output
+is in pairwise 'long' format, which lists the upper triangle of the distance matrix row-by-row.
+
+To calculate distances between two different sample sets, each in their own sketch database, use:
+```
+sketchlib dist db1 db2
+```
+For example, if you want to query distances of a new sample against an existing database,
+first sketch the new sample with e.g. `sketchlib sketch -o db2 new_sample.fasta`, then
+run the above command.
+
+Modifiers:
+- Use `-k` to calculate Jaccard distance at the given k. Otherwise the default is to
+calculate across multiple k and output core and accessory distances.
+- Use `--ani` with `-k` to transform the Jaccard distance into average nucleotide identity.
+- Use `--subset` to provide a list of sample names to include in the distance calculations,
+only these samples will be loaded from the `.skd` file.
+- Use `-o` to write the distances to a file. The default is to write to stdout, so you can also
+use `>` to redirect to a file (progress messages are written to stderr).
+- Use `--knn` to only keep this many nearest neighbour distances. For very large databases
+it may be useful to keep only ~50 distances. This makes the memory use manageable. This sparse output
+can be used with e.g. [mandrake](https://github.com/bacpop/mandrake).
+
+### Other operations
+
+- `merge` joins two existing sketch databases.
+- `append` sketches new input samples, and adds them to an existing database.
+- `delete` removes samples from a sketch database.
 
 ## Enabling PDB->3Di
 conda doesn't work, so make sure it is deactivated
diff --git a/src/distances.rs b/src/distances.rs
@@ -1,3 +1,4 @@
+//! Functions and traits for calculating and storing distances
 use std::cmp::Ordering;
 use std::fmt;
 
diff --git a/src/hashing/mod.rs b/src/hashing/mod.rs
@@ -1,3 +1,4 @@
+//! [nthash](https://github.com/bcgsc/ntHash) and [aahash](https://github.com/bcgsc/aaHash) iterators
 use clap::ValueEnum;
 use serde::{Deserialize, Serialize};
 
diff --git a/src/io.rs b/src/io.rs
@@ -1,3 +1,4 @@
+//! Functions to read input fasta/fastq files
 use std::fs::File;
 use std::io::{stdout, BufRead, BufReader, BufWriter, Write};
 use std::path::Path;
diff --git a/src/jaccard.rs b/src/jaccard.rs
@@ -1,3 +1,4 @@
+//! Implementation of Jaccard, core and accessory distance calculations
 use crate::multisketch::MultiSketch;
 use crate::sketch::BBITS;
 
diff --git a/src/lib.rs b/src/lib.rs
@@ -1,5 +1,10 @@
-//! DOCS
+//! Fast distance calculations between biological sequences (DNA, AA or structures
+//! via the 3di alphabet). Distances are based on bindash approximations of the Jaccard
+//! distance, with the PopPUNK method to calculate core and accessory distances. nthash/aahash
+//! are used for hash functions to create the sketches
 //!
+//! This package is a work in progress, but is mature enough for research use. See README.md
+//! for current CLI usage.
 
 // #![warn(missing_docs)]
 
diff --git a/src/multisketch.rs b/src/multisketch.rs
@@ -1,3 +1,4 @@
+//! The class to support .skm/.skd reading and writing, containing multiple [`Sketch`] objects
 use anyhow::bail;
 use anyhow::Error;
 use anyhow::{Result, anyhow};
diff --git a/src/sketch.rs b/src/sketch.rs
@@ -1,3 +1,4 @@
+//! Methods to create a single sample's sketch
 use std::cmp::Ordering;
 use std::fmt;
 use std::sync::mpsc;
diff --git a/src/sketch_datafile.rs b/src/sketch_datafile.rs
@@ -1,3 +1,4 @@
+//! I/O support and memory mapping used for lower-level read/write to .skd
 use memmap2::Mmap;
 use std::error::Error;
 use std::fs::File;
diff --git a/src/structures.rs b/src/structures.rs
@@ -1,3 +1,4 @@
+//! Support for .pdb files and the 3di alphabet
 use anyhow::Error;
 use crate::io::InputFastx;
 

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+//! Functions and traits for calculating and storing distances`
`1`	`2`	`use std::cmp::Ordering;`
`2`	`3`	`use std::fmt;`
`3`	`4`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+//! [nthash](https://github.com/bcgsc/ntHash) and [aahash](https://github.com/bcgsc/aaHash) iterators`
`1`	`2`	`use clap::ValueEnum;`
`2`	`3`	`use serde::{Deserialize, Serialize};`
`3`	`4`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+//! Functions to read input fasta/fastq files`
`1`	`2`	`use std::fs::File;`
`2`	`3`	`use std::io::{stdout, BufRead, BufReader, BufWriter, Write};`
`3`	`4`	`use std::path::Path;`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+//! Implementation of Jaccard, core and accessory distance calculations`
`1`	`2`	`use crate::multisketch::MultiSketch;`
`2`	`3`	`use crate::sketch::BBITS;`
`3`	`4`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	+//! The class to support .skm/.skd reading and writing, containing multiple [`Sketch`] objects
`1`	`2`	`use anyhow::bail;`
`2`	`3`	`use anyhow::Error;`
`3`	`4`	`use anyhow::{Result, anyhow};`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+//! Methods to create a single sample's sketch`
`1`	`2`	`use std::cmp::Ordering;`
`2`	`3`	`use std::fmt;`
`3`	`4`	`use std::sync::mpsc;`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+//! I/O support and memory mapping used for lower-level read/write to .skd`
`1`	`2`	`use memmap2::Mmap;`
`2`	`3`	`use std::error::Error;`
`3`	`4`	`use std::fs::File;`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+//! Support for .pdb files and the 3di alphabet`
`1`	`2`	`use anyhow::Error;`
`2`	`3`	`use crate::io::InputFastx;`
`3`	`4`