Skip to content

Commit 948d240

Browse files
authored
Merge pull request #34 from bacpop/release
Prepare for first release
2 parents 37a5879 + 7ba0943 commit 948d240

14 files changed

+184
-20
lines changed

.github/workflows/codecov.yml

+13-14
Original file line numberDiff line numberDiff line change
@@ -17,29 +17,28 @@ jobs:
1717
- uses: actions/checkout@v3
1818
- uses: actions-rs/toolchain@v1
1919
with:
20-
toolchain: nightly
20+
toolchain: stable
2121
override: true
2222
components: llvm-tools-preview # Required for grcov
2323

2424
- name: Build
2525
run: cargo build --verbose
2626

27-
- name: Run tests
28-
run: cargo test --verbose --no-fail-fast
27+
- name: Install cargo-llvm-cov and run tests
28+
run: cargo install cargo-llvm-cov && cargo llvm-cov --lcov --output-path=./lcov.info
2929
env:
3030
CARGO_INCREMENTAL: '0'
31-
RUSTFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests'
32-
RUSTDOCFLAGS: '-Zprofile -Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cpanic=abort -Zpanic_abort_tests'
31+
RUSTFLAGS: '-Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cinstrument-coverage'
32+
RUSTDOCFLAGS: '-Ccodegen-units=1 -Cinline-threshold=0 -Clink-dead-code -Coverflow-checks=off -Cinstrument-coverage'
3333

34-
- name: Run grcov
35-
run: |
36-
cargo install grcov
37-
grcov . -s . --binary-path ./target/debug/ -t lcov --branch --ignore-not-existing --ignore "/*" -o lcov.info
38-
39-
- name: Upload coverage to Codecov
40-
uses: codecov/codecov-action@v3
34+
- name: Codecov
35+
# You may pin to the exact commit or the version.
36+
uses: codecov/[email protected]
4137
with:
38+
# Repository upload token - get it from codecov.io. Required only for private repositories
4239
token: ${{ secrets.CODECOV_TOKEN }}
43-
files: lcov.info
44-
fail_ci_if_error: true
40+
file: ./lcov.info
41+
# Specify whether the Codecov output should be verbose
4542
verbose: true
43+
fail_ci_if_error: true
44+

.github/workflows/release.yml

+93
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
name: Make release
2+
3+
on:
4+
push:
5+
tags:
6+
- "v*.*.*"
7+
8+
env:
9+
CARGO_TERM_COLOR: always
10+
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
11+
12+
jobs:
13+
14+
build-binaries:
15+
runs-on: ${{ matrix.config.os }}
16+
17+
name: Release ${{ matrix.config.os }} (${{ matrix.config.toolchain }})
18+
19+
strategy:
20+
fail-fast: false
21+
matrix:
22+
config:
23+
- {os: macOS-latest, toolchain: 'stable'}
24+
- {os: ubuntu-latest, toolchain: 'stable'}
25+
26+
steps:
27+
- uses: actions/checkout@v3
28+
29+
- name: Install rust toolchain
30+
uses: actions-rs/toolchain@v1
31+
with:
32+
toolchain: ${{ matrix.config.toolchain }}
33+
override: true
34+
35+
# NB see https://github.com/actions-rs/cargo if we ever want to try cross
36+
# e.g. for Mac M1/arm64
37+
- name: Build and package binary
38+
shell: bash
39+
run: |
40+
cargo install --path .
41+
cp $HOME/.cargo/bin/sketchlib .
42+
tar czvf sketchlib-${{ github.ref_name }}-${{ matrix.config.os }}-${{ matrix.config.toolchain }}.tar.gz sketchlib LICENSE NOTICE README.md
43+
44+
- name: Upload package
45+
if: success()
46+
uses: actions/upload-artifact@v4
47+
with:
48+
name: sketchlib-${{ github.ref_name }}-${{ matrix.config.os }}-${{ matrix.config.toolchain }}
49+
path: sketchlib-${{ github.ref_name }}-${{ matrix.config.os }}-${{ matrix.config.toolchain }}.tar.gz
50+
51+
create-release:
52+
runs-on: ubuntu-latest
53+
54+
needs: build-binaries
55+
56+
steps:
57+
- uses: actions/checkout@v2
58+
59+
- uses: actions/download-artifact@v4
60+
with:
61+
path: build
62+
63+
- name: Organise files
64+
shell: bash
65+
run: |
66+
cp build/sketchlib-${{ github.ref_name }}-macOS-latest-stable/sketchlib-${{ github.ref_name }}-macOS-latest-stable.tar.gz .
67+
cp build/sketchlib-${{ github.ref_name }}-ubuntu-latest-stable/sketchlib-${{ github.ref_name }}-ubuntu-latest-stable.tar.gz .
68+
69+
- name: Create release
70+
id: create_release
71+
uses: softprops/action-gh-release@v1
72+
with:
73+
name: Release ${{ github.ref_name }}
74+
draft: false
75+
prerelease: false
76+
fail_on_unmatched_files: true
77+
generate_release_notes: true
78+
files: |
79+
sketchlib-*.tar.gz
80+
81+
push_crate:
82+
runs-on: ubuntu-latest
83+
84+
steps:
85+
- uses: actions/checkout@v2
86+
- uses: actions-rs/toolchain@v1
87+
with:
88+
toolchain: stable
89+
override: true
90+
- uses: katyo/publish-crates@v1
91+
if: startsWith(github.ref, 'refs/tags/')
92+
with:
93+
registry-token: ${{ secrets.CARGO_REGISTRY_TOKEN }}

.github/workflows/version.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
on:
22
pull_request:
33
branches:
4-
- master
4+
- main
55

66
# This runs on PRs so error can be seen before merging
77
name: Version check

Cargo.toml

+5-3
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
[package]
22
name = "sketchlib"
3-
version = "0.1.0"
3+
version = "0.1.1"
44
authors = [
55
"John Lees <[email protected]>",
6-
"Nicholas Croucher <[email protected]>"
6+
"Nicholas Croucher <[email protected]>",
7+
"Johanna von Wachsmann <[email protected]>",
8+
"Victor Rodriguez Bouza <[email protected]>",
79
]
810
edition = "2021"
911
description = "Genome and amino-acid sketching"
@@ -65,4 +67,4 @@ assert_fs = "1.0.10"
6567
pretty_assertions = "1.3.0"
6668

6769
[profile.release]
68-
debug = true
70+
lto = true

README.md

+58-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,64 @@
99
## Description
1010

1111
This is a reimplementation of [pp-sketchlib](https://github.com/bacpop/pp-sketchlib)
12-
in the rust language.
12+
in the rust language. This version is optimised for larger sample numbers, particularly
13+
allowing subsets of samples to be compared.
14+
15+
Sketch databases have two files: `.skm` which is the metadata (sample names, base counts etc)
16+
and `.skd` which is the actual sketch data.
17+
18+
## Usage
19+
With all options we typically recommend using `-v` to see all progress during the run.
20+
21+
### Sketching
22+
23+
Using input fasta/fastq files, create a sketch database. Run `sketchlib sketch -h` to see the help.
24+
25+
- List .fasta files on the command line, or use `-f` to provide a file(s). From file,
26+
these are one line per sample listing the name and fasta file, or name and two read files
27+
(fastq). Inputs can be gzipped or not, this is automatically detected.
28+
- To set the k-mer size in the sketch database you can either give a list of sizes with `--k-vals`
29+
or a sequence `--k-seq` with start,stop,step. e.g. `--k-seq 17,29,4` would sketch at k=17, 21, 25 and 29.
30+
- Set the sketch size with `-s`. Typically 1000 is enough for species level resolution, 10000 for within-species/strain
31+
resolution and 100000-1000000 for SNP level resolution.
32+
- To sketch amino acid sequences use `--seq-type aa --concat-fasta` if you have the typical case
33+
of each fasta file being a multifasta with many aa sequences. Each one will then be its own sample.
34+
- You can also sketch structures with .pdb input, see 'Enabling PDB->3Di' below. This is experimental.
35+
36+
### Distances
37+
38+
To compute internal all-vs-all core and accessory distances use:
39+
```
40+
sketchlib dist db_name
41+
```
42+
Note the database names can be the prefix, or the full path to the .skm file. The output
43+
is in pairwise 'long' format, which lists the upper triangle of the distance matrix row-by-row.
44+
45+
To calculate distances between two different sample sets, each in their own sketch database, use:
46+
```
47+
sketchlib dist db1 db2
48+
```
49+
For example, if you want to query distances of a new sample against an existing database,
50+
first sketch the new sample with e.g. `sketchlib sketch -o db2 new_sample.fasta`, then
51+
run the above command.
52+
53+
Modifiers:
54+
- Use `-k` to calculate Jaccard distance at the given k. Otherwise the default is to
55+
calculate across multiple k and output core and accessory distances.
56+
- Use `--ani` with `-k` to transform the Jaccard distance into average nucleotide identity.
57+
- Use `--subset` to provide a list of sample names to include in the distance calculations,
58+
only these samples will be loaded from the `.skd` file.
59+
- Use `-o` to write the distances to a file. The default is to write to stdout, so you can also
60+
use `>` to redirect to a file (progress messages are written to stderr).
61+
- Use `--knn` to only keep this many nearest neighbour distances. For very large databases
62+
it may be useful to keep only ~50 distances. This makes the memory use manageable. This sparse output
63+
can be used with e.g. [mandrake](https://github.com/bacpop/mandrake).
64+
65+
### Other operations
66+
67+
- `merge` joins two existing sketch databases.
68+
- `append` sketches new input samples, and adds them to an existing database.
69+
- `delete` removes samples from a sketch database.
1370

1471
## Enabling PDB->3Di
1572
conda doesn't work, so make sure it is deactivated

src/distances.rs

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
//! Functions and traits for calculating and storing distances
12
use std::cmp::Ordering;
23
use std::fmt;
34

src/hashing/mod.rs

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
//! [nthash](https://github.com/bcgsc/ntHash) and [aahash](https://github.com/bcgsc/ntHash) iterators
12
use clap::ValueEnum;
23
use serde::{Deserialize, Serialize};
34

src/io.rs

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
//! Functions to read input fasta/fastq files
12
use std::fs::File;
23
use std::io::{stdout, BufRead, BufReader, BufWriter, Write};
34
use std::path::Path;

src/jaccard.rs

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
//! Implementation of Jaccard, core and accessory distance calculations
12
use crate::multisketch::MultiSketch;
23
use crate::sketch::BBITS;
34

src/lib.rs

+6-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
1-
//! DOCS
1+
//! Fast distance calculations between biological sequences (DNA, AA or structures
2+
//! via the 3di alphabet). Distances are based on bindash approximations of the Jaccard
3+
//! distance, with the PopPUNK method to calculate core and accessory distances. nthash/aahash
4+
//! are used as hash functions to create the sketches.
25
//!
6+
//! This package is a work in progress, but is mature enough for research use. See README.md
7+
//! for current CLI usage.
38
49
// #![warn(missing_docs)]
510

src/multisketch.rs

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
//! The class to support .skm/.skd reading and writing, containing multiple [`Sketch`] objects
12
use anyhow::bail;
23
use anyhow::Error;
34
use anyhow::{Result, anyhow};

src/sketch.rs

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
//! Methods to create a single sample's sketch
12
use std::cmp::Ordering;
23
use std::fmt;
34
use std::sync::mpsc;

src/sketch_datafile.rs

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
//! I/O support and memory mapping used for lower level read/write to .skd
12
use memmap2::Mmap;
23
use std::error::Error;
34
use std::fs::File;

src/structures.rs

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
//! Support for .pdb files and the 3di alphabet
12
use anyhow::Error;
23
use crate::io::InputFastx;
34

0 commit comments

Comments
 (0)