diff --git a/.github/workflows/squash.yml b/.github/workflows/squash.yml
index 6c215cf..0aa167a 100644
--- a/.github/workflows/squash.yml
+++ b/.github/workflows/squash.yml
@@ -9,23 +9,52 @@ on:
       - main

 jobs:
-  build:
+  build-docker-24:
     runs-on: ubuntu-20.04
     strategy:
       fail-fast: false
       matrix:
-        python-version: ['3.6', '3.7', '3.8', '3.9', '3.10', '3.11']
+        python-version: ['3.6', '3.7', '3.8', '3.9', '3.10', '3.11', '3.12']
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
       - name: Setup
         run: |
           sudo apt-get update
-          pip install -U pip
-          pip install "tox<4.0.0"
+          pip install "tox<4.0.0" setuptools
+      - name: Info
+        run: |
+          docker version
+          docker info
+      - name: Run tests
+        run: |
+          PV=${{ matrix.python-version }}
+          echo "Running tests for Python version $PV ( ${PV/./} )"
+          make test-py"${PV/./}"
+  build-docker-25:
+    runs-on: ubuntu-20.04
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ['3.6', '3.7', '3.8', '3.9', '3.10', '3.11', '3.12']
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Setup
+        run: |
+          for pkg in containerd runc; do sudo apt-get remove $pkg; done
+          sudo apt-get update
+          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
+          sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu focal stable"
+          apt-cache policy docker-ce
+          sudo apt-get install docker-ce docker-ce-cli containerd.io
+          pip install "tox<4.0.0" setuptools
       - name: Info
         run: |
           docker version
diff --git a/Makefile b/Makefile
index 405a9d7..cba2f08 100644
--- a/Makefile
+++ b/Makefile
@@ -21,6 +21,9 @@ test-py310: prepare
 test-py311: prepare
 	tox -e py311 -- tests

+test-py312: prepare
+	tox -e py312 -- tests
+
 test-unit: prepare
 	tox -- tests/test_unit*
diff --git a/docker_squash/image.py b/docker_squash/image.py
index 8f0c679..298dc48 100644
--- a/docker_squash/image.py
+++ b/docker_squash/image.py
@@ -10,9 +10,9 @@ import tarfile
 import tempfile
 import threading
-from typing import List
+from typing import List, Optional, Union

-import docker
+import docker as docker_library

 from docker_squash.errors import SquashError, SquashUnnecessaryError


@@ -20,8 +20,8 @@ class Chdir(object):

    """Context manager for changing the current working directory"""

-    def __init__(self, newPath):
-        self.newPath = os.path.expanduser(newPath)
+    def __init__(self, new_path):
+        self.newPath = os.path.expanduser(new_path)

    def __enter__(self):
        self.savedPath = os.getcwd()
@@ -43,18 +43,26 @@ class Image(object):
    """ Image format version """

    def __init__(
-        self, log, docker, image, from_layer, tmp_dir=None, tag=None, comment=""
+        self,
+        log,
+        docker,
+        image,
+        from_layer,
+        tmp_dir: Optional[str] = None,
+        tag: Optional[str] = None,
+        comment: Optional[str] = "",
    ):
-        self.log = log
+        self.log: logging.Logger = log
        self.debug = self.log.isEnabledFor(logging.DEBUG)
        self.docker = docker
-        self.image = image
-        self.from_layer = from_layer
-        self.tag = tag
-        self.comment = comment
+        self.image: str = image
+        self.from_layer: str = from_layer
+        self.tag: str = tag
+        self.comment: str = comment
        self.image_name = None
        self.image_tag = None
        self.squash_id = None
+        self.oci_format = False

        # Workaround for https://play.golang.org/p/sCsWMXYxqy
        #
@@ -68,7 +76,7 @@ def __init__(
        )
        """ Date used in metadata, already formatted using the `%Y-%m-%dT%H:%M:%S.%fZ` format """

-        self.tmp_dir = tmp_dir
+        self.tmp_dir: str = tmp_dir
        """ Main temporary directory to save all working files. This is the root directory for all other temporary files. """

    def squash(self):
@@ -95,11 +103,11 @@ def _initialize_directories(self):
            raise SquashError("Preparing temporary directory failed")

        # Temporary location on the disk of the old, unpacked *image*
-        self.old_image_dir = os.path.join(self.tmp_dir, "old")
+        self.old_image_dir: str = os.path.join(self.tmp_dir, "old")
        # Temporary location on the disk of the new, unpacked, squashed *image*
-        self.new_image_dir = os.path.join(self.tmp_dir, "new")
+        self.new_image_dir: str = os.path.join(self.tmp_dir, "new")
        # Temporary location on the disk of the squashed *layer*
-        self.squashed_dir = os.path.join(self.new_image_dir, "squashed")
+        self.squashed_dir: str = os.path.join(self.new_image_dir, "squashed")

        for d in self.old_image_dir, self.new_image_dir:
            os.makedirs(d)
@@ -115,14 +123,12 @@ def _squash_id(self, layer):
            squash_id = self.docker.inspect_image(layer)["Id"]
        except Exception:
            raise SquashError(
-                "Could not get the layer ID to squash, please check provided 'layer' argument: %s"
-                % layer
+                f"Could not get the layer ID to squash, please check provided 'layer' argument: {layer}"
            )

        if squash_id not in self.old_image_layers:
            raise SquashError(
-                "Couldn't find the provided layer (%s) in the %s image"
-                % (layer, self.image)
+                f"Couldn't find the provided layer ({layer}) in the {self.image} image"
            )

        self.log.debug("Layer ID to squash from: %s" % squash_id)
@@ -138,16 +144,14 @@ def _validate_number_of_layers(self, number_of_layers):
        # Only positive numbers are correct
        if number_of_layers <= 0:
            raise SquashError(
-                "Number of layers to squash cannot be less or equal 0, provided: %s"
-                % number_of_layers
+                f"Number of layers to squash cannot be less or equal 0, provided: {number_of_layers}"
            )

        # Do not squash if provided number of layer to squash is bigger
        # than number of actual layers in the image
        if number_of_layers > len(self.old_image_layers):
            raise SquashError(
-                "Cannot squash %s layers, the %s image contains only %s layers"
-                % (number_of_layers, self.image, len(self.old_image_layers))
+                f"Cannot squash {number_of_layers} layers, the {self.image} image contains only {len(self.old_image_layers)} layers"
            )

    def _before_squashing(self):
@@ -164,17 +168,14 @@ def _before_squashing(self):
            self.old_image_id = self.docker.inspect_image(self.image)["Id"]
        except SquashError:
            raise SquashError(
-                "Could not get the image ID to squash, please check provided 'image' argument: %s"
-                % self.image
+                f"Could not get the image ID to squash, please check provided 'image' argument: {self.image}"
            )

        self.old_image_layers = []

        # Read all layers in the image
        self._read_layers(self.old_image_layers, self.old_image_id)
-        self.old_image_layers.reverse()
-
        self.log.info("Old image has %s layers", len(self.old_image_layers))
        self.log.debug("Old layers: %s", self.old_image_layers)
@@ -193,8 +194,7 @@ def _before_squashing(self):

            if not squash_id:
                raise SquashError(
-                    "The %s layer could not be found in the %s image"
-                    % (self.from_layer, self.image)
+                    f"The {self.from_layer} layer could not be found in the {self.image} image"
                )

            number_of_layers = (
@@ -212,7 +212,7 @@ def _before_squashing(self):

        if len(self.layers_to_squash) < 1:
            raise SquashError(
-                "Invalid number of layers to squash: %s" % len(self.layers_to_squash)
+                f"Invalid number of layers to squash: {len(self.layers_to_squash)}"
            )

        if len(self.layers_to_squash) == 1:
@@ -233,6 +233,7 @@ def _before_squashing(self):

    def _after_squashing(self):
        self.log.debug("Removing from disk already squashed layers...")
+        self.log.debug("Cleaning up %s temporary directory" % self.old_image_dir)
        shutil.rmtree(self.old_image_dir, ignore_errors=True)

        self.size_after = self._dir_size(self.new_image_dir)
@@ -281,7 +282,7 @@ def load_squashed_image(self):
                % (self.image_name, self.image_tag)
            )

-    def _files_in_layers(self, layers, directory):
+    def _files_in_layers(self, layers):
        """
        Prepare a list of files in all layers
        """
@@ -289,21 +290,20 @@ def _files_in_layers(self, layers):

        for layer in layers:
            self.log.debug("Generating list of files in layer '%s'..." % layer)
-            tar_file = os.path.join(directory, layer, "layer.tar")
+            tar_file = self._extract_tar_name(layer)
            with tarfile.open(tar_file, "r", format=tarfile.PAX_FORMAT) as tar:
                files[layer] = [self._normalize_path(x) for x in tar.getnames()]
            self.log.debug("Done, found %s files" % len(files[layer]))

        return files

-    def _prepare_tmp_directory(self, tmp_dir):
+    def _prepare_tmp_directory(self, tmp_dir: str) -> str:
        """Creates temporary directory that is used to work on layers"""

        if tmp_dir:
            if os.path.exists(tmp_dir):
                raise SquashError(
-                    "The '%s' directory already exists, please remove it before you proceed"
-                    % tmp_dir
+                    f"The '{tmp_dir}' directory already exists, please remove it before you proceed"
                )
            os.makedirs(tmp_dir)
        else:
@@ -374,9 +374,9 @@ def _save_image(self, image_id, directory):
            try:
                image = self.docker.get_image(image_id)

-                if int(docker.__version__.split(".")[0]) < 3:
+                if int(docker_library.__version__.split(".")[0]) < 3:
                    # Docker library prior to 3.0.0 returned the requests
-                    # object directly which cold be used to read from
+                    # object directly which could be used to read from
                    self.log.debug(
                        "Extracting image using HTTPResponse object directly"
                    )
@@ -408,10 +408,10 @@ def _save_image(self, image_id, directory):
            except Exception as e:
                self.log.exception(e)
                self.log.warning(
-                    "An error occured while saving the %s image, retrying..." % image_id
+                    f"An error occurred while saving the {image_id} image, retrying..."
                )

-        raise SquashError("Couldn't save %s image!" % image_id)
+        raise SquashError(f"Couldn't save {image_id} image!")

    def _unpack(self, tar_file, directory):
        """Unpacks tar archive to selected directory"""
@@ -500,7 +500,7 @@ def _read_old_metadata(self, old_json_file):

        return metadata

-    def _move_layers(self, layers, src, dest):
+    def _move_layers(self, layers, src: str, dest: str):
        """
        This moves all the layers that should be copied as-is.
        In other words - all layers that are not meant to be squashed will be
@@ -530,7 +530,7 @@ def _marker_files(self, tar, members):
        """
        Searches for marker files in the specified archive.

-        Docker marker files are files taht have the .wh. prefix in the name.
+        Docker marker files are files that have the .wh. prefix in the name.
        These files mark the corresponding file to be removed (hidden) when
        we start a container from the image.
        """
@@ -609,7 +609,9 @@ def _add_markers(self, markers, tar, files_in_layers, added_symlinks):
            else:
                self.log.debug("Skipping '%s' marker file..." % marker.name)

-    def _normalize_path(self, path):
+    def _normalize_path(
+        self, path: Union[str, pathlib.Path]
+    ) -> Union[str, pathlib.Path]:
        return os.path.normpath(os.path.join("/", path))

    def _add_hardlinks(self, squashed_tar, squashed_files, to_skip, skipped_hard_links):
@@ -743,17 +745,15 @@ def _add_symlinks(self, squashed_tar, squashed_files, to_skip, skipped_sym_links

        return added_symlinks

-    def _squash_layers(self, layers_to_squash, layers_to_move):
-        self.log.info("Starting squashing...")
+    def _squash_layers(self, layers_to_squash: List[str], layers_to_move: List[str]):
+        self.log.info(f"Starting squashing for {self.squashed_tar}...")

        # Reverse the layers to squash - we begin with the newest one
        # to make the tar lighter
        layers_to_squash.reverse()

        # Find all files in layers that we don't squash
-        files_in_layers_to_move = self._files_in_layers(
-            layers_to_move, self.old_image_dir
-        )
+        files_in_layers_to_move = self._files_in_layers(layers_to_move)

        with tarfile.open(
            self.squashed_tar, "w", format=tarfile.PAX_FORMAT
@@ -770,8 +770,7 @@ def _squash_layers(self, layers_to_squash, layers_to_move):
            reading_layers: List[tarfile.TarFile] = []

            for layer_id in layers_to_squash:
-                layer_tar_file = os.path.join(self.old_image_dir, layer_id, "layer.tar")
-
+                layer_tar_file = self._extract_tar_name(layer_id)
                self.log.info("Squashing file '%s'..." % layer_tar_file)

                # Open the exiting layer to squash
@@ -1028,3 +1027,9 @@ def _path_hierarchy(self, path):
        return itertools.accumulate(
            path.parts[:-1], func=lambda head, tail: str(path.__class__(head, tail))
        )
+
+    def _extract_tar_name(self, path: str) -> str:
+        if self.oci_format:
+            return os.path.join(self.old_image_dir, path)
+        else:
+            return os.path.join(self.old_image_dir, path, "layer.tar")
diff --git a/docker_squash/squash.py b/docker_squash/squash.py
index 1012ee3..47cef97 100644
--- a/docker_squash/squash.py
+++ b/docker_squash/squash.py
@@ -1,11 +1,14 @@
 # -*- coding: utf-8 -*-

 import os
+from logging import Logger
+from typing import Optional

-import docker
+import docker.errors as docker_errors
 from packaging import version as packaging_version

 from docker_squash.errors import SquashError
+from docker_squash.image import Image
 from docker_squash.lib import common
 from docker_squash.v1_image import V1Image
 from docker_squash.v2_image import V2Image
@@ -18,24 +21,24 @@ def __init__(
        log,
        image,
        docker=None,
-        from_layer=None,
-        tag=None,
-        comment="",
-        tmp_dir=None,
-        output_path=None,
-        load_image=True,
-        cleanup=False,
+        from_layer: Optional[str] = None,
+        tag: Optional[str] = None,
+        comment: Optional[str] = "",
+        tmp_dir: Optional[str] = None,
+        output_path: Optional[str] = None,
+        load_image: Optional[bool] = True,
+        cleanup: Optional[bool] = False,
    ):
-        self.log = log
+        self.log: Logger = log
        self.docker = docker
-        self.image = image
-        self.from_layer = from_layer
-        self.tag = tag
-        self.comment = comment
-        self.tmp_dir = tmp_dir
-        self.output_path = output_path
-        self.load_image = load_image
-        self.cleanup = cleanup
+        self.image: str = image
+        self.from_layer: str = from_layer
+        self.tag: str = tag
+        self.comment: str = comment
+        self.tmp_dir: str = tmp_dir
+        self.output_path: str = output_path
+        self.load_image: bool = load_image
+        self.cleanup: bool = cleanup
        self.development = False

        if tmp_dir:
@@ -68,7 +71,7 @@ def run(self):
        if packaging_version.parse(
            docker_version["ApiVersion"]
        ) >= packaging_version.parse("1.22"):
-            image = V2Image(
+            image: Image = V2Image(
                self.log,
                self.docker,
                self.image,
@@ -78,7 +81,7 @@ def run(self):
                self.comment,
            )
        else:
-            image = V1Image(
+            image: Image = V1Image(
                self.log,
                self.docker,
                self.image,
@@ -103,7 +106,7 @@ def run(self):
    def _cleanup(self):
        try:
            image_id = self.docker.inspect_image(self.image)["Id"]
-        except docker.errors.APIError as ex:
+        except docker_errors.APIError as ex:
            self.log.warning(
                "Could not get the image ID for {} image: {}, skipping cleanup after squashing".format(
                    self.image, str(ex)
                )
@@ -116,14 +119,14 @@ def _cleanup(self):
        try:
            self.docker.remove_image(image_id, force=False, noprune=False)
            self.log.info("Image {} removed!".format(self.image))
-        except docker.errors.APIError as ex:
+        except docker_errors.APIError as ex:
            self.log.warning(
                "Could not remove image {}: {}, skipping cleanup after squashing".format(
                    self.image, str(ex)
                )
            )

-    def squash(self, image):
+    def squash(self, image: Image):
        # Do the actual squashing
        new_image_id = image.squash()
diff --git a/docker_squash/v2_image.py b/docker_squash/v2_image.py
index f80a59f..7cd3765 100644
--- a/docker_squash/v2_image.py
+++ b/docker_squash/v2_image.py
@@ -3,7 +3,10 @@
 import os
 import shutil
 from collections import OrderedDict
+from pathlib import Path
+from typing import List, Tuple

+from docker_squash.errors import SquashError
 from docker_squash.image import Image


@@ -14,9 +17,10 @@ def _before_squashing(self):
        super(V2Image, self)._before_squashing()

        # Read old image manifest file
-        self.old_image_manifest = self._read_json_file(
-            os.path.join(self.old_image_dir, "manifest.json")
-        )[0]
+        self.old_image_manifest = self._get_manifest()
+        self.log.debug(
+            f"Retrieved manifest {json.dumps(self.old_image_manifest, indent=4)}"
+        )

        # Read old image config file
        self.old_image_config = self._read_json_file(
@@ -26,6 +30,8 @@ def _before_squashing(self):
        # Read layer paths inside of the tar archive
        # We split it into layers that needs to be squashed
        # and layers that needs to be moved as-is
+        self.layer_paths_to_squash: List[str] = []
+        self.layer_paths_to_move: List[str] = []
        self.layer_paths_to_squash, self.layer_paths_to_move = self._read_layer_paths(
            self.old_image_config, self.old_image_manifest, self.layers_to_move
        )
@@ -53,9 +59,16 @@ def _squash(self):
            # we store the layer data inside of the tar archive
            layer_path_id = self._generate_squashed_layer_path_id()

-            metadata = self._generate_last_layer_metadata(
-                layer_path_id, self.layer_paths_to_squash[0]
-            )
+            if self.oci_format:
+                old_layer_path = self.old_image_manifest["Config"]
+            else:
+                if self.layer_paths_to_squash[0]:
+                    old_layer_path = self.layer_paths_to_squash[0]
+                else:
+                    old_layer_path = layer_path_id
+                old_layer_path = os.path.join(old_layer_path, "json")
+
+            metadata = self._generate_last_layer_metadata(layer_path_id, old_layer_path)
            self._write_squashed_layer_metadata(metadata)

            # Write version file to the squashed layer
@@ -139,12 +152,14 @@ def _generate_manifest_metadata(
    def _read_json_file(self, json_file):
        """Helper function to read JSON file as OrderedDict"""

-        self.log.debug("Reading '%s' JSON file..." % json_file)
+        self.log.debug(f"Reading '{json_file}' JSON file...")

        with open(json_file, "r") as f:
            return json.load(f, object_pairs_hook=OrderedDict)

-    def _read_layer_paths(self, old_image_config, old_image_manifest, layers_to_move):
+    def _read_layer_paths(
+        self, old_image_config, old_image_manifest, layers_to_move: List[str]
+    ) -> Tuple[List[str], List[str]]:
        """
        In case of v2 format, layer id's are not the same as the id's used in
        the exported tar archive to name directories for layers.
@@ -165,9 +180,16 @@ def _read_layer_paths(self, old_image_config, old_image_manifest, layers_to_move
            # (directory name) where the layer's data is
            # stored
            if not layer.get("empty_layer", False):
-                layer_id = old_image_manifest["Layers"][current_manifest_layer].rsplit(
-                    "/"
-                )[0]
+                # Under <25 layers look like
+                # 27f9b97654306a5389e8e48ba3486a11026d34055e1907672231cbd8e1380481/layer.tar
+                # while >=25 layers look like
+                # blobs/sha256/d6a7fc1fb44b63324d3fc67f016e1ef7ecc1a5ae6668ae3072d2e17230e3cfbc
+                if self.oci_format:
+                    layer_id = old_image_manifest["Layers"][current_manifest_layer]
+                else:
+                    layer_id = old_image_manifest["Layers"][
+                        current_manifest_layer
+                    ].rsplit("/")[0]

                # Check if this layer should be moved or squashed
                if len(layers_to_move) > i:
@@ -205,9 +227,7 @@ def _generate_diff_ids(self):
        diff_ids = []

        for path in self.layer_paths_to_move:
-            sha256 = self._compute_sha256(
-                os.path.join(self.old_image_dir, path, "layer.tar")
-            )
+            sha256 = self._compute_sha256(self._extract_tar_name(path))
            diff_ids.append(sha256)

        if self.layer_paths_to_squash:
@@ -291,12 +311,8 @@ def _generate_squashed_layer_path_id(self):

        return sha

-    def _generate_last_layer_metadata(self, layer_path_id, old_layer_path=None):
-        if not old_layer_path:
-            old_layer_path = layer_path_id
-
-        config_file = os.path.join(self.old_image_dir, old_layer_path, "json")
-
+    def _generate_last_layer_metadata(self, layer_path_id, old_layer_path: Path):
+        config_file = os.path.join(self.old_image_dir, old_layer_path)
        with open(config_file, "r") as f:
            config = json.load(f, object_pairs_hook=OrderedDict)
@@ -353,3 +369,29 @@ def _generate_image_metadata(self):
            metadata["config"]["Image"] = ""

        return metadata
+
+    def _get_manifest(self):
+        if os.path.exists(os.path.join(self.old_image_dir, "index.json")):
+            # New OCI Archive format type
+            self.oci_format = True
+            # Not using index.json to extract manifest details as while the config
+            # sha could be extracted via some indirection i.e.
+            #
+            # index.json:manifest/digest::sha256:<sha>
+            # blobs/sha256/<sha>:config/digest::sha256:<sha>
+            #
+            # Docker spec currently will always include a manifest.json so will standardise
+            # on using that. Further we rely upon the original manifest format in order to write
+            # it back.
+            if os.path.exists(os.path.join(self.old_image_dir, "manifest.json")):
+                return (
+                    self._read_json_file(
+                        os.path.join(self.old_image_dir, "manifest.json")
+                    )
+                )[0]
+            else:
+                raise SquashError("Unable to locate manifest.json")
+        else:
+            return (
+                self._read_json_file(os.path.join(self.old_image_dir, "manifest.json"))
+            )[0]
diff --git a/tests/test_integ_squash.py b/tests/test_integ_squash.py
index c4dbcd4..9023d5f 100644
--- a/tests/test_integ_squash.py
+++ b/tests/test_integ_squash.py
@@ -11,6 +11,7 @@

 import mock
 import pytest
+from packaging import version as packaging_version

 from docker_squash.errors import SquashError, SquashUnnecessaryError
 from docker_squash.lib import common
@@ -202,9 +203,16 @@ def _squashed_layer(self):
        self.tar.seek(0)
        with tarfile.open(fileobj=self.tar, mode="r") as tar:
            self.squashed_layer_path = ImageHelper.top_layer_path(tar)
-        return self._extract_file(
-            "%s/layer.tar" % self.squashed_layer_path, self.tar
-        )
+        if packaging_version.parse(
+            self.docker.version().get("Version")
+        ) >= packaging_version.parse("25.0"):
+            return self._extract_file(
+                "blobs/sha256/%s" % self.squashed_layer_path, self.tar
+            )
+        else:
+            return self._extract_file(
+                "%s/layer.tar" % self.squashed_layer_path, self.tar
+            )

    def assertFileExists(self, name):
        self.squashed_layer.seek(0)  # Rewind
diff --git a/tests/test_unit_v2_image.py b/tests/test_unit_v2_image.py
index c8f82e2..5852280 100644
--- a/tests/test_unit_v2_image.py
+++ b/tests/test_unit_v2_image.py
@@ -188,7 +188,7 @@ def test_generate_squashed_layer_metadata(self):
            builtins, "open", mock.mock_open(read_data=layer_config)
        ):
            metadata = self.image._generate_last_layer_metadata(
-                "squashed_layer_path_id"
+                "squashed_layer_path_id", "squashed_layer_path_id"
            )
            self.assertEqual(type(metadata), OrderedDict)
diff --git a/tox.ini b/tox.ini
index 3ffcf32..be55ca2 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,13 +1,12 @@
 [tox]
-envlist = py36,py37,py38,py39,py310,py311
+envlist = py36,py37,py38,py39,py310,py311,py312

 [testenv]
-passenv=CI,HOME
-setenv=
-    ENVNAME={envname}
-    PIPENV_VERBOSITY=-1
-deps=
-    pipenv
+passenv=
+    CI
+    HOME
+setenv=PIPENV_VERBOSITY=-1
+deps=pipenv
 commands=
    pipenv install --dev --ignore-pipfile --skip-lock
    pipenv run pytest -v --cov-report term --cov-report html --cov docker_squash --basetemp={envtmpdir} --junit-xml target/junit-{envname}.xml --junit-prefix {envname} {posargs}
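The layer-path handling introduced above can be illustrated in isolation. The standalone sketch below is not part of the patch; the helper names and the "unpacked-image" directory are made up for this example. It mirrors the idea behind the patch's _get_manifest and _extract_tar_name: the presence of index.json marks the OCI archive layout that Docker >= 25 produces with `docker save`, and the "Layers" entries of manifest.json resolve to different tar paths in each format.

import json
import os


def detect_oci_format(image_dir: str) -> bool:
    # OCI archives (Docker >= 25) always carry an index.json next to manifest.json.
    return os.path.exists(os.path.join(image_dir, "index.json"))


def layer_tar_path(image_dir: str, manifest_layer: str, oci_format: bool) -> str:
    if oci_format:
        # OCI layout: the manifest entry is already the blob path,
        # e.g. blobs/sha256/<digest>
        return os.path.join(image_dir, manifest_layer)
    # Legacy layout: the manifest entry looks like <layer-id>/layer.tar,
    # and the layer id is the directory that holds layer.tar
    layer_id = manifest_layer.rsplit("/")[0]
    return os.path.join(image_dir, layer_id, "layer.tar")


if __name__ == "__main__":
    image_dir = "unpacked-image"  # hypothetical directory holding an extracted `docker save` archive
    oci = detect_oci_format(image_dir)
    with open(os.path.join(image_dir, "manifest.json")) as f:
        manifest = json.load(f)[0]
    for entry in manifest["Layers"]:
        print(layer_tar_path(image_dir, entry, oci))

Either way, manifest.json remains the single source for layer ordering, which is why the patch standardises on it rather than walking index.json.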
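The integration-test change, by contrast, gates on the Docker Engine version rather than on the archive contents. A minimal sketch of that check, assuming a locally reachable Docker daemon and the docker and packaging libraries (the variable names are illustrative, not the test suite's API):

import docker
from packaging import version as packaging_version

client = docker.from_env()
server_version = client.version().get("Version")

# Docker 25.0 switched `docker save` to the OCI archive layout, so squashed
# layer data lives under blobs/sha256/<digest> instead of <layer-id>/layer.tar.
if packaging_version.parse(server_version) >= packaging_version.parse("25.0"):
    member_pattern = "blobs/sha256/%s"
else:
    member_pattern = "%s/layer.tar"

print("Squashed layer member pattern:", member_pattern)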