|
| 1 | +//! A layer of on-disk index cache for performance. |
| 2 | +//! |
| 3 | +//! One important aspect of the index is that we want to optimize the "happy |
| 4 | +//! path" as much as possible. Whenever you type `cargo build` Cargo will |
| 5 | +//! *always* reparse the registry and learn about dependency information. This |
| 6 | +//! is done because Cargo needs to learn about the upstream crates.io crates |
| 7 | +//! that you're using and ensure that the preexisting `Cargo.lock` still matches |
| 8 | +//! the current state of the world. |
| 9 | +//! |
| 10 | +//! Consequently, Cargo "null builds" (the overhead that Cargo adds to each build |
| 11 | +//! itself) need to be fast when accessing the index. The primary performance |
| 12 | +//! optimization here is to avoid parsing JSON blobs from the registry if we |
| 13 | +//! don't need them. Most secondary optimizations are centered around removing |
| 14 | +//! allocations and such, but avoiding parsing JSON is the #1 optimization. |
| 15 | +//! |
| 16 | +//! When we get queries from the resolver we're given a [`Dependency`]. This |
| 17 | +//! dependency in turn has a version requirement, and with lock files that |
| 18 | +//! already exist these version requirements are exact version requirements |
| 19 | +//! `=a.b.c`. This means that we in theory only need to parse one line of JSON |
| 20 | +//! per query in the registry, the one that matches version `a.b.c`. |
| 21 | +//! |
| 22 | +//! The crates.io index, however, is not amenable to this form of query. Instead |
| 23 | +//! the crates.io index simply is a file where each line is a JSON blob, aka |
| 24 | +//! [`IndexPackage`]. To learn about the versions in each JSON blob we would |
| 25 | +//! need to parse the JSON via [`IndexSummary::parse`], defeating the purpose |
| 26 | +//! of trying to parse as little as possible. |
| 27 | +//! |
| 28 | +//! > Note that as a small aside even *loading* the JSON from the registry is |
| 29 | +//! > actually pretty slow. For crates.io and [`RemoteRegistry`] we don't |
| 30 | +//! > actually check out the git index on disk because that takes quite some |
| 31 | +//! > time and is quite large. Instead we use `libgit2` to read the JSON from |
| 32 | +//! > the raw git objects. This in turn can be slow (aka show up high in |
| 33 | +//! > profiles) because libgit2 has to do deflate decompression and such. |
| 34 | +//! |
| 35 | +//! To solve all these issues a strategy is employed here where Cargo basically |
| 36 | +//! creates an index into the index. The first time a package is queried about |
| 37 | +//! (first time being for an entire computer) Cargo will load the contents |
| 38 | +//! (slowly via libgit2) from the registry. It will then (slowly) parse every |
| 39 | +//! single line to learn about its versions. Afterwards, however, Cargo will |
| 40 | +//! emit a new file (a cache, represented as [`SummariesCache`]) which is |
| 41 | +//! amenable for speedily parsing in future invocations. |
| 42 | +//! |
| 43 | +//! This cache file is currently organized by basically having the semver |
| 44 | +//! version extracted from each JSON blob. That way Cargo can quickly and |
| 45 | +//! easily parse all versions contained and which JSON blob they're associated |
| 46 | +//! with. The JSON blob then doesn't actually need to get parsed unless its |
| 47 | +//! version matches the query. |
| 48 | +//! |
| 49 | +//! Altogether the initial measurements of this shows a massive improvement for |
| 50 | +//! Cargo null build performance. It's expected that the improvements earned |
| 51 | +//! here will continue to grow over time in the sense that the previous |
| 52 | +//! implementation (parse all lines each time) actually continues to slow down |
| 53 | +//! over time as new versions of a crate are published. In any case when first |
| 54 | +//! implemented a null build of Cargo itself would parse 3700 JSON blobs from |
| 55 | +//! the registry and load 150 blobs from git. Afterwards it parses 150 JSON |
| 56 | +//! blobs and loads 0 files from git. Removing 200ms or more from Cargo's startup |
| 57 | +//! time is certainly nothing to sneeze at! |
| 58 | +//! |
| 59 | +//! Note that this is just a high-level overview, there's of course lots of |
| 60 | +//! details like invalidating caches and whatnot which are handled below, but |
| 61 | +//! hopefully those are more obvious inline in the code itself. |
| 62 | +//! |
| 63 | +//! [`Dependency`]: crate::core::Dependency |
| 64 | +//! [`IndexPackage`]: super::IndexPackage |
| 65 | +//! [`IndexSummary::parse`]: super::IndexSummary::parse |
| 66 | +//! [`RemoteRegistry`]: crate::sources::registry::remote::RemoteRegistry |
| 67 | +
|
| 68 | +use std::fs; |
| 69 | +use std::io; |
| 70 | +use std::path::PathBuf; |
| 71 | +use std::str; |
| 72 | + |
| 73 | +use anyhow::bail; |
| 74 | +use cargo_util::registry::make_dep_path; |
| 75 | +use semver::Version; |
| 76 | + |
| 77 | +use crate::util::cache_lock::CacheLockMode; |
| 78 | +use crate::util::Filesystem; |
| 79 | +use crate::CargoResult; |
| 80 | +use crate::GlobalContext; |
| 81 | + |
| 82 | +use super::split; |
| 83 | +use super::INDEX_V_MAX; |
| 84 | + |
/// Format version written as the first byte of every serialized
/// [`SummariesCache`].
///
/// [`SummariesCache::parse`] rejects any cache whose leading byte differs from
/// this value.
const CURRENT_CACHE_VERSION: u8 = 3;
| 87 | + |
| 88 | +/// A representation of the cache on disk that Cargo maintains of summaries. |
| 89 | +/// |
| 90 | +/// Cargo will initially parse all summaries in the registry and will then |
| 91 | +/// serialize that into this form and place it in a new location on disk, |
| 92 | +/// ensuring that access in the future is much speedier. |
| 93 | +/// |
| 94 | +/// For serialization and deserialization of this on-disk index cache of |
| 95 | +/// summaries, see [`SummariesCache::serialize`] and [`SummariesCache::parse`]. |
| 96 | +/// |
| 97 | +/// # The format of the index cache |
| 98 | +/// |
| 99 | +/// The idea of this format is that it's a very easy file for Cargo to parse in |
| 100 | +/// future invocations. The read from disk should be fast and then afterwards |
| 101 | +/// all we need to know is what versions correspond to which JSON blob. |
| 102 | +/// |
| 103 | +/// Currently the format looks like: |
| 104 | +/// |
| 105 | +/// ```text |
| 106 | +/// +---------------+----------------------+--------------------+---+ |
| 107 | +/// | cache version | index schema version | index file version | 0 | |
| 108 | +/// +---------------+----------------------+--------------------+---+ |
| 109 | +/// ``` |
| 110 | +/// |
| 111 | +/// followed by one or more (version + JSON blob) pairs... |
| 112 | +/// |
| 113 | +/// ```text |
| 114 | +/// +----------------+---+-----------+---+ |
| 115 | +/// | semver version | 0 | JSON blob | 0 | ... |
| 116 | +/// +----------------+---+-----------+---+ |
| 117 | +/// ``` |
| 118 | +/// |
| 119 | +/// Each field represents: |
| 120 | +/// |
| 121 | +/// * _cache version_ --- Intended to ensure that there's some level of |
| 122 | +/// future compatibility against changes to this cache format so if different |
| 123 | +/// versions of Cargo share the same cache they don't get too confused. |
| 124 | +/// * _index schema version_ --- The schema version of the raw index file. |
| 125 | +/// See [`IndexPackage::v`] for the detail. |
| 126 | +/// * _index file version_ --- Tracks when a cache needs to be regenerated. |
| 127 | +/// A cache regeneration is required whenever the index file itself updates. |
| 128 | +/// * _semver version_ --- The version for each JSON blob. Extracted from the |
| 129 | +/// blob for fast queries without parsing the entire blob. |
| 130 | +/// * _JSON blob_ --- The actual metadata for each version of the package. It |
| 131 | +/// has the same representation as [`IndexPackage`]. |
| 132 | +/// |
| 133 | +/// # Changes between each cache version |
| 134 | +/// |
| 135 | +/// * `1`: The original version. |
| 136 | +/// * `2`: Added the "index schema version" field so that if the index schema |
| 137 | +/// changes, different versions of cargo won't get confused reading each |
| 138 | +/// other's caches. |
| 139 | +/// * `3`: Bumped the version to work around an issue where multiple versions of |
| 140 | +/// a package were published that differ only by semver metadata. For |
| 141 | +/// example, openssl-src 110.0.0 and 110.0.0+1.1.0f. Previously, the cache |
| 142 | +/// would be incorrectly populated with two entries, both 110.0.0. After |
| 143 | +/// this, the metadata will be correctly included. This isn't really a format |
| 144 | +/// change, just a version bump to clear the incorrect cache entries. Note: |
| 145 | +/// the index shouldn't allow these, but unfortunately crates.io doesn't |
| 146 | +/// check it. |
| 147 | +/// |
| 148 | +/// See [`CURRENT_CACHE_VERSION`] for the current cache version. |
| 149 | +/// |
| 150 | +/// [`IndexPackage::v`]: super::IndexPackage::v |
| 151 | +/// [`IndexPackage`]: super::IndexPackage |
| 152 | +#[derive(Default)] |
| 153 | +pub struct SummariesCache<'a> { |
| 154 | + /// JSON blobs of the summaries. Each JSON blob has a [`Version`] beside, |
| 155 | + /// so that Cargo can query a version without full JSON parsing. |
| 156 | + pub versions: Vec<(Version, &'a [u8])>, |
| 157 | + /// For cache invalidation, we tracks the index file version to determine |
| 158 | + /// when to regenerate the cache itself. |
| 159 | + pub index_version: &'a str, |
| 160 | +} |
| 161 | + |
| 162 | +impl<'a> SummariesCache<'a> { |
| 163 | + /// Deserializes an on-disk cache. |
| 164 | + pub fn parse(data: &'a [u8]) -> CargoResult<SummariesCache<'a>> { |
| 165 | + // NB: keep this method in sync with `serialize` below |
| 166 | + let (first_byte, rest) = data |
| 167 | + .split_first() |
| 168 | + .ok_or_else(|| anyhow::format_err!("malformed cache"))?; |
| 169 | + if *first_byte != CURRENT_CACHE_VERSION { |
| 170 | + bail!("looks like a different Cargo's cache, bailing out"); |
| 171 | + } |
| 172 | + let index_v_bytes = rest |
| 173 | + .get(..4) |
| 174 | + .ok_or_else(|| anyhow::anyhow!("cache expected 4 bytes for index schema version"))?; |
| 175 | + let index_v = u32::from_le_bytes(index_v_bytes.try_into().unwrap()); |
| 176 | + if index_v != INDEX_V_MAX { |
| 177 | + bail!( |
| 178 | + "index schema version {index_v} doesn't match the version I know ({INDEX_V_MAX})", |
| 179 | + ); |
| 180 | + } |
| 181 | + let rest = &rest[4..]; |
| 182 | + |
| 183 | + let mut iter = split(rest, 0); |
| 184 | + let last_index_update = if let Some(update) = iter.next() { |
| 185 | + str::from_utf8(update)? |
| 186 | + } else { |
| 187 | + bail!("malformed file"); |
| 188 | + }; |
| 189 | + let mut ret = SummariesCache::default(); |
| 190 | + ret.index_version = last_index_update; |
| 191 | + while let Some(version) = iter.next() { |
| 192 | + let version = str::from_utf8(version)?; |
| 193 | + let version = Version::parse(version)?; |
| 194 | + let summary = iter.next().unwrap(); |
| 195 | + ret.versions.push((version, summary)); |
| 196 | + } |
| 197 | + Ok(ret) |
| 198 | + } |
| 199 | + |
| 200 | + /// Serializes itself with a given `index_version`. |
| 201 | + pub fn serialize(&self, index_version: &str) -> Vec<u8> { |
| 202 | + // NB: keep this method in sync with `parse` above |
| 203 | + let size = self |
| 204 | + .versions |
| 205 | + .iter() |
| 206 | + .map(|(_version, data)| (10 + data.len())) |
| 207 | + .sum(); |
| 208 | + let mut contents = Vec::with_capacity(size); |
| 209 | + contents.push(CURRENT_CACHE_VERSION); |
| 210 | + contents.extend(&u32::to_le_bytes(INDEX_V_MAX)); |
| 211 | + contents.extend_from_slice(index_version.as_bytes()); |
| 212 | + contents.push(0); |
| 213 | + for (version, data) in self.versions.iter() { |
| 214 | + contents.extend_from_slice(version.to_string().as_bytes()); |
| 215 | + contents.push(0); |
| 216 | + contents.extend_from_slice(data); |
| 217 | + contents.push(0); |
| 218 | + } |
| 219 | + contents |
| 220 | + } |
| 221 | +} |
| 222 | + |
| 223 | +/// Manages the on-disk index caches. |
| 224 | +pub struct CacheManager<'gctx> { |
| 225 | + /// The root path where caches are located. |
| 226 | + cache_root: Filesystem, |
| 227 | + /// [`GlobalContext`] reference for convenience. |
| 228 | + gctx: &'gctx GlobalContext, |
| 229 | +} |
| 230 | + |
| 231 | +impl<'gctx> CacheManager<'gctx> { |
| 232 | + /// Creates a new instance of the on-disk index cache manager. |
| 233 | + /// |
| 234 | + /// `root` --- The root path where caches are located. |
| 235 | + pub fn new(cache_root: Filesystem, gctx: &'gctx GlobalContext) -> CacheManager<'gctx> { |
| 236 | + CacheManager { cache_root, gctx } |
| 237 | + } |
| 238 | + |
| 239 | + /// Gets the cache associated with the key. |
| 240 | + pub fn get(&self, key: &str) -> Option<Vec<u8>> { |
| 241 | + let cache_path = &self.cache_path(key); |
| 242 | + match fs::read(cache_path) { |
| 243 | + Ok(contents) => Some(contents), |
| 244 | + Err(e) => { |
| 245 | + tracing::debug!(?cache_path, "cache missing: {e}"); |
| 246 | + None |
| 247 | + } |
| 248 | + } |
| 249 | + } |
| 250 | + |
| 251 | + /// Associates the value with the key. |
| 252 | + pub fn put(&self, key: &str, value: &[u8]) { |
| 253 | + let cache_path = &self.cache_path(key); |
| 254 | + if fs::create_dir_all(cache_path.parent().unwrap()).is_ok() { |
| 255 | + let path = Filesystem::new(cache_path.clone()); |
| 256 | + self.gctx |
| 257 | + .assert_package_cache_locked(CacheLockMode::DownloadExclusive, &path); |
| 258 | + if let Err(e) = fs::write(cache_path, value) { |
| 259 | + tracing::info!(?cache_path, "failed to write cache: {e}"); |
| 260 | + } |
| 261 | + } |
| 262 | + } |
| 263 | + |
| 264 | + /// Invalidates the cache associated with the key. |
| 265 | + pub fn invalidate(&self, key: &str) { |
| 266 | + let cache_path = &self.cache_path(key); |
| 267 | + if let Err(e) = fs::remove_file(cache_path) { |
| 268 | + if e.kind() != io::ErrorKind::NotFound { |
| 269 | + tracing::debug!(?cache_path, "failed to remove from cache: {e}"); |
| 270 | + } |
| 271 | + } |
| 272 | + } |
| 273 | + |
| 274 | + fn cache_path(&self, key: &str) -> PathBuf { |
| 275 | + let relative = make_dep_path(key, false); |
| 276 | + // This is the file we're loading from cache or the index data. |
| 277 | + // See module comment in `registry/mod.rs` for why this is structured |
| 278 | + // the way it is. |
| 279 | + self.cache_root.join(relative).into_path_unlocked() |
| 280 | + } |
| 281 | +} |
0 commit comments