Skip to content

Commit 35e4258

Browse files
authored
Merge pull request #22 from bacpop/johanna_dev
Add command to merge sketch databases
2 parents 9a67cca + 40411d5 commit 35e4258

25 files changed

+656
-16
lines changed

.github/workflows/ci.yml

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
name: Cargo Build & Test
2+
3+
on:
4+
push:
5+
6+
env:
7+
CARGO_TERM_COLOR: always
8+
9+
jobs:
10+
build_and_test:
11+
name: Rust project - latest
12+
runs-on: ubuntu-latest
13+
strategy:
14+
matrix:
15+
toolchain:
16+
- stable
17+
- beta
18+
- nightly
19+
steps:
20+
- uses: actions/checkout@v3
21+
- run: rustup update ${{ matrix.toolchain }} && rustup default ${{ matrix.toolchain }}
22+
- run: cargo build --verbose
23+
- run: cargo test --verbose

.github/workflows/clippy.yml

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
on: push
2+
name: Clippy check
3+
jobs:
4+
clippy_check:
5+
runs-on: ubuntu-latest
6+
steps:
7+
- uses: actions/checkout@v1
8+
- uses: actions-rs/toolchain@v1
9+
with:
10+
toolchain: nightly
11+
components: clippy
12+
override: true
13+
- uses: actions-rs/clippy-check@v1
14+
with:
15+
token: ${{ secrets.GITHUB_TOKEN }}
16+
args: --all-features

.github/workflows/version.yml

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
on:
2+
pull_request:
3+
branches:
4+
- master
5+
6+
# This runs on PRs so that errors can be seen before merging
7+
name: Version check
8+
9+
jobs:
10+
all:
11+
runs-on: ubuntu-latest
12+
13+
name: Version check
14+
15+
env:
16+
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
17+
18+
steps:
19+
- uses: actions/checkout@v3
20+
with:
21+
fetch-depth: 0
22+
23+
- name: Check version format and availability
24+
run: ./scripts/version_check.sh

scripts/version_check.sh

+60
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
#!/usr/bin/env bash
2+
set -e
3+
# Usage:
4+
# version_check.sh [VERSION]
5+
#
6+
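# Takes the version from the first argument if given, otherwise reads it from Cargo.toml, then checks that it is in x.y.z format, is not already a git tag, and is greater than the last tag on the default branch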
7+
#
8+
# Credit to @richfitz for this from the dust package:
9+
# https://github.com/mrc-ide/dust/blob/master/scripts/version_check
10+
VERSION=${1:-$(grep '^version' Cargo.toml | sed 's/.*= *//' | sed 's/"//g')}
11+
TAG="v${VERSION}"
12+
13+
echo "Proposed version number '$VERSION'"
14+
15+
if echo "$VERSION" | grep -Eq "[0-9]+[.][0-9]+[.][0-9]+"; then
16+
echo "[OK] Version number in correct format"
17+
else
18+
echo "[ERROR] Invalid format version number '$VERSION' must be in format 'x.y.z'"
19+
exit 1
20+
fi
21+
22+
EXIT_CODE=0
23+
24+
echo "Updating remote git data"
25+
git fetch --quiet
26+
27+
BRANCH_DEFAULT=$(git remote show origin | awk '/HEAD branch/ {print $NF}')
28+
LAST_TAG=$(git describe --tags --abbrev=0 "origin/${BRANCH_DEFAULT}")
29+
30+
echo "Last tag was $LAST_TAG"
31+
32+
if git rev-parse "$TAG" >/dev/null 2>&1; then
33+
echo "[ERROR] Tag $TAG already exists - update version number in Cargo.toml"
34+
exit 1
35+
else
36+
echo "[OK] Version number not yet present as git tag"
37+
fi
38+
39+
MAJOR=$(echo $VERSION | cut -d. -f1)
40+
MINOR=$(echo $VERSION | cut -d. -f2)
41+
PATCH=$(echo $VERSION | cut -d. -f3)
42+
43+
LAST_VERSION=$(echo "$LAST_TAG" | sed 's/^v//')
44+
LAST_MAJOR=$(echo $LAST_VERSION | cut -d. -f1)
45+
LAST_MINOR=$(echo $LAST_VERSION | cut -d. -f2)
46+
LAST_PATCH=$(echo $LAST_VERSION | cut -d. -f3)
47+
48+
if (( $MAJOR > $LAST_MAJOR )); then
49+
echo "[OK] Increasing MAJOR version"
50+
exit $EXIT_CODE
51+
elif (( $MINOR > $LAST_MINOR )); then
52+
echo "[OK] Increasing MINOR version"
53+
exit $EXIT_CODE
54+
elif (( $PATCH > $LAST_PATCH )); then
55+
echo "[OK] Increasing PATCH version"
56+
exit $EXIT_CODE
57+
else
58+
echo "[ERROR] Version number has not increased relative to $LAST_VERSION"
59+
exit 1
60+
fi

src/cli.rs

+14-4
Original file line numberDiff line numberDiff line change
@@ -147,10 +147,20 @@ pub enum Commands {
147147
#[arg(long, value_parser = valid_cpus, default_value_t = 1)]
148148
threads: usize,
149149
},
150-
// TODO add a merge mode
151-
// TODO add a concat mode (add sketch to existing DB)
152-
// TODO add a delete mode
153-
// TODO add a reorder mode
150+
/// Merge two sketch files (.skm and .skd pair)
151+
Merge {
152+
/// The first .skd (sketch data) file
153+
#[arg(required = true)]
154+
db1: String,
155+
156+
/// The second .skd (sketch data) file
157+
#[arg(required = true)]
158+
db2: String,
159+
160+
/// Output filename for the merged sketch
161+
#[arg(required = true, short)]
162+
output: String,
163+
},
154164
/// Print information about a .skm file
155165
Info {
156166
/// Sketch metadata file (.skm) to describe

src/lib.rs

+41-8
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ use std::time::Instant;
1010
#[macro_use]
1111
extern crate arrayref;
1212
extern crate num_cpus;
13+
use anyhow::Error;
1314
use indicatif::{ParallelProgressIterator, ProgressStyle};
1415
use rayon::prelude::*;
1516

@@ -37,13 +38,15 @@ use crate::io::{get_input_list, parse_kmers, read_subset_names, set_ostream};
3738
pub mod bloom_filter;
3839
pub mod hashing;
3940

41+
pub mod utils;
42+
4043
/// Default k-mer size for (genome) sketching
4144
pub const DEFAULT_KMER: usize = 17;
4245
/// Chunk size in parallel distance calculations
4346
pub const CHUNK_SIZE: usize = 1000;
4447

4548
#[doc(hidden)]
46-
pub fn main() {
49+
pub fn main() -> Result<(), Error> {
4750
let args = cli_args();
4851
if args.verbose {
4952
simple_logger::init_with_level(log::Level::Info).unwrap();
@@ -54,7 +57,7 @@ pub fn main() {
5457

5558
let mut print_success = true;
5659
let start = Instant::now();
57-
match &args.command {
60+
let result = match &args.command {
5861
Commands::Sketch {
5962
seq_files,
6063
file_list,
@@ -115,6 +118,7 @@ pub fn main() {
115118
sketch_vec
116119
.save_metadata(output)
117120
.expect("Error saving metadata");
121+
Ok(())
118122
}
119123
Commands::Dist {
120124
ref_db,
@@ -130,11 +134,8 @@ pub fn main() {
130134

131135
let mut output_file = set_ostream(output);
132136

133-
let ref_db_name = if ref_db.ends_with(".skm") || ref_db.ends_with(".skd") {
134-
&ref_db[0..ref_db.len() - 4]
135-
} else {
136-
ref_db.as_str()
137-
};
137+
let ref_db_name = utils::strip_sketch_extension(ref_db);
138+
138139
let mut references = MultiSketch::load(ref_db_name)
139140
.unwrap_or_else(|_| panic!("Could not read sketch metadata from {ref_db}.skm"));
140141

@@ -363,7 +364,37 @@ pub fn main() {
363364
write!(output_file, "{distances}").expect("Error writing output distances");
364365
}
365366
}
367+
Ok(())
366368
}
369+
Commands::Merge { db1, db2, output } => {
370+
let ref_db_name1 = utils::strip_sketch_extension(db1);
371+
let ref_db_name2 = utils::strip_sketch_extension(db2);
372+
373+
log::info!("Reading input metadata");
374+
let mut sketches1: MultiSketch = MultiSketch::load(ref_db_name1).unwrap_or_else(|_| {
375+
panic!("Could not read sketch metadata from {}.skm", ref_db_name1)
376+
});
377+
378+
let sketches2: MultiSketch = MultiSketch::load(ref_db_name2).unwrap_or_else(|_| {
379+
panic!("Could not read sketch metadata from {}.skm", ref_db_name2)
380+
});
381+
// check compatibility
382+
if !sketches1.is_compatible_with(&sketches2) {
383+
panic!("Databases are not compatible for merging.")
384+
}
385+
386+
log::info!("Merging metadata to {}.skm", output);
387+
let merged_sketch = sketches1.merge_sketches(&sketches2);
388+
// merge metadata
389+
merged_sketch
390+
.save_metadata(output)
391+
.unwrap_or_else(|_| panic!("Couldn't save metadata to {}", output));
392+
393+
// merge actual sketch data
394+
log::info!("Merging and saving sketch data to {}.skd", output);
395+
utils::save_sketch_data(ref_db_name1, ref_db_name2, output)
396+
}
397+
367398
Commands::Info {
368399
skm_file,
369400
sample_info,
@@ -384,8 +415,9 @@ pub fn main() {
384415
println!("{sketches:?}");
385416
}
386417
print_success = false; // Turn the final message off
418+
Ok(())
387419
}
388-
}
420+
};
389421
let end = Instant::now();
390422

391423
log::info!("Complete");
@@ -395,4 +427,5 @@ pub fn main() {
395427
end.duration_since(start).as_secs()
396428
);
397429
}
430+
result
398431
}

src/main.rs

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
use anyhow::Error;
22

33
fn main() -> Result<(), Error> {
4-
sketchlib::main();
5-
Ok(())
4+
sketchlib::main()
65
}

0 commit comments

Comments
 (0)