[Notes] [Git][BuildStream/buildstream][master] 5 commits:

Jürg Billeter pushed to branch master at BuildStream / buildstream

Commits:

b199afe6

by Jürg Billeter at 2018-09-25T09:01:51Z

_artifactcache/cascache.py: Add _ensure_blob helper

This adds directory objects to the local repository before downloading
the files in the directory. However, artifact references are still stored
only after the complete directory has been downloaded, so there won't be
any dangling references. This change is required for partial download
support anyway.

7d199322

by Jürg Billeter at 2018-09-25T09:01:51Z

_artifactcache/cascache.py: Increase payload size limit for uploads

gRPC can handle 1 MiB payloads. Increase size limit from 64 KiB to speed
up uploads.

e2e24015

by Jürg Billeter at 2018-09-25T09:01:51Z

_artifactcache/casserver.py: Harmonize payload size limit

Use 1 MiB as payload size limit on the server side for both individual
downloads and batch uploads.

697d10f2

by Jürg Billeter at 2018-09-25T09:01:51Z

_artifactcache/cascache.py: Use BatchReadBlobs

This uses BatchReadBlobs instead of individual blob download to speed up
artifact pulling, if the server supports it.

Fixes #554.

81c51dbf

by Jürg Billeter at 2018-09-25T09:31:55Z

Merge branch 'juerg/cas-batch' into 'master'

_artifactcache/cascache.py: Use BatchReadBlobs

Closes #554

See merge request BuildStream/buildstream!813

Changes:

buildstream/_artifactcache/cascache.py

@@ -44,6 +44,11 @@ from .._exceptions import ArtifactError
  from . import ArtifactCache
 +# The default limit for gRPC messages is 4 MiB.
 +# Limit payload to 1 MiB to leave sufficient headroom for metadata.
 +_MAX_PAYLOAD_BYTES = 1024 * 1024
++
++
  # A CASCache manages artifacts in a CAS repository as specified in the
  # Remote Execution API.
+ #
@@ -854,6 +859,80 @@ class CASCache(ArtifactCache):
          assert digest.size_bytes == os.fstat(stream.fileno()).st_size
 +    # _ensure_blob():
 +    #
 +    # Fetch and add blob if it's not already local.
 +    #
 +    # Args:
 +    #     remote (Remote): The remote to use.
 +    #     digest (Digest): Digest object for the blob to fetch.
 +    #
 +    # Returns:
 +    #     (str): The path of the object
 +    #
 +    def _ensure_blob(self, remote, digest):
 +        objpath = self.objpath(digest)
 +        if os.path.exists(objpath):
 +            # already in local repository
 +            return objpath
++
 +        with tempfile.NamedTemporaryFile(dir=self.tmpdir) as f:
 +            self._fetch_blob(remote, digest, f)
++
 +            added_digest = self.add_object(path=f.name)
 +            assert added_digest.hash == digest.hash
++
 +        return objpath
++
 +    def _batch_download_complete(self, batch):
 +        for digest, data in batch.send():
 +            with tempfile.NamedTemporaryFile(dir=self.tmpdir) as f:
 +                f.write(data)
 +                f.flush()
++
 +                added_digest = self.add_object(path=f.name)
 +                assert added_digest.hash == digest.hash
++
 +    # Helper function for _fetch_directory().
 +    def _fetch_directory_batch(self, remote, batch, fetch_queue, fetch_next_queue):
 +        self._batch_download_complete(batch)
++
 +        # All previously scheduled directories are now locally available,
 +        # move them to the processing queue.
 +        fetch_queue.extend(fetch_next_queue)
 +        fetch_next_queue.clear()
 +        return _CASBatchRead(remote)
++
 +    # Helper function for _fetch_directory().
 +    def _fetch_directory_node(self, remote, digest, batch, fetch_queue, fetch_next_queue, *, recursive=False):
 +        in_local_cache = os.path.exists(self.objpath(digest))
++
 +        if in_local_cache:
 +            # Skip download, already in local cache.
 +            pass
 +        elif (digest.size_bytes >= remote.max_batch_total_size_bytes or
 +                not remote.batch_read_supported):
 +            # Too large for batch request, download in independent request.
 +            self._ensure_blob(remote, digest)
 +            in_local_cache = True
 +        else:
 +            if not batch.add(digest):
 +                # Not enough space left in batch request.
 +                # Complete pending batch first.
 +                batch = self._fetch_directory_batch(remote, batch, fetch_queue, fetch_next_queue)
 +                batch.add(digest)
++
 +        if recursive:
 +            if in_local_cache:
 +                # Add directory to processing queue.
 +                fetch_queue.append(digest)
 +            else:
 +                # Directory will be available after completing pending batch.
 +                # Add directory to deferred processing queue.
 +                fetch_next_queue.append(digest)
++
 +        return batch
++
      # _fetch_directory():
+     #
      # Fetches remote directory and adds it to content addressable store.
@@ -867,39 +946,32 @@ class CASCache(ArtifactCache):
      #     dir_digest (Digest): Digest object for the directory to fetch.
+     #
      def _fetch_directory(self, remote, dir_digest):
 -        objpath = self.objpath(dir_digest)
 -        if os.path.exists(objpath):
 -            # already in local cache
 -            return
+-
 -        with tempfile.NamedTemporaryFile(dir=self.tmpdir) as out:
 -            self._fetch_blob(remote, dir_digest, out)
+-
 -            directory = remote_execution_pb2.Directory()
 +        fetch_queue = [dir_digest]
 +        fetch_next_queue = []
 +        batch = _CASBatchRead(remote)
 -            with open(out.name, 'rb') as f:
 -                directory.ParseFromString(f.read())
 +        while len(fetch_queue) + len(fetch_next_queue) > 0:
 +            if len(fetch_queue) == 0:
 +                batch = self._fetch_directory_batch(remote, batch, fetch_queue, fetch_next_queue)
 -            for filenode in directory.files:
 -                fileobjpath = self.objpath(filenode.digest)
 -                if os.path.exists(fileobjpath):
 -                    # already in local cache
 -                    continue
 +            dir_digest = fetch_queue.pop(0)
 -                with tempfile.NamedTemporaryFile(dir=self.tmpdir) as f:
 -                    self._fetch_blob(remote, filenode.digest, f)
 +            objpath = self._ensure_blob(remote, dir_digest)
 -                    digest = self.add_object(path=f.name)
 -                    assert digest.hash == filenode.digest.hash
 +            directory = remote_execution_pb2.Directory()
 +            with open(objpath, 'rb') as f:
 +                directory.ParseFromString(f.read())
              for dirnode in directory.directories:
 -                self._fetch_directory(remote, dirnode.digest)
 +                batch = self._fetch_directory_node(remote, dirnode.digest, batch,
 +                                                   fetch_queue, fetch_next_queue, recursive=True)
++
 +            for filenode in directory.files:
 +                batch = self._fetch_directory_node(remote, filenode.digest, batch,
 +                                                   fetch_queue, fetch_next_queue)
 -            # Place directory blob only in final location when we've
 -            # downloaded all referenced blobs to avoid dangling
 -            # references in the repository.
 -            digest = self.add_object(path=out.name)
 -            assert digest.hash == dir_digest.hash
 +        # Fetch final batch
 +        self._fetch_directory_batch(remote, batch, fetch_queue, fetch_next_queue)
      def _fetch_tree(self, remote, digest):
          # download but do not store the Tree object
@@ -914,16 +986,7 @@ class CASCache(ArtifactCache):
              tree.children.extend([tree.root])
              for directory in tree.children:
                  for filenode in directory.files:
 -                    fileobjpath = self.objpath(filenode.digest)
 -                    if os.path.exists(fileobjpath):
 -                        # already in local cache
 -                        continue
+-
 -                    with tempfile.NamedTemporaryFile(dir=self.tmpdir) as f:
 -                        self._fetch_blob(remote, filenode.digest, f)
+-
 -                        added_digest = self.add_object(path=f.name)
 -                        assert added_digest.hash == filenode.digest.hash
 +                    self._ensure_blob(remote, filenode.digest)
                  # place directory blob only in final location when we've downloaded
                  # all referenced blobs to avoid dangling references in the repository
@@ -942,12 +1005,12 @@ class CASCache(ArtifactCache):
              finished = False
              remaining = digest.size_bytes
              while not finished:
 -                chunk_size = min(remaining, 64 * 1024)
 +                chunk_size = min(remaining, _MAX_PAYLOAD_BYTES)
                  remaining -= chunk_size
                  request = bytestream_pb2.WriteRequest()
                  request.write_offset = offset
 -                # max. 64 kB chunks
 +                # max. _MAX_PAYLOAD_BYTES chunks
                  request.data = instream.read(chunk_size)
                  request.resource_name = resname
                  request.finish_write = remaining <= 0
@@ -1035,11 +1098,78 @@ class _CASRemote():
              self.bytestream = bytestream_pb2_grpc.ByteStreamStub(self.channel)
              self.cas = remote_execution_pb2_grpc.ContentAddressableStorageStub(self.channel)
 +            self.capabilities = remote_execution_pb2_grpc.CapabilitiesStub(self.channel)
              self.ref_storage = buildstream_pb2_grpc.ReferenceStorageStub(self.channel)
 +            self.max_batch_total_size_bytes = _MAX_PAYLOAD_BYTES
 +            try:
 +                request = remote_execution_pb2.GetCapabilitiesRequest()
 +                response = self.capabilities.GetCapabilities(request)
 +                server_max_batch_total_size_bytes = response.cache_capabilities.max_batch_total_size_bytes
 +                if 0 < server_max_batch_total_size_bytes < self.max_batch_total_size_bytes:
 +                    self.max_batch_total_size_bytes = server_max_batch_total_size_bytes
 +            except grpc.RpcError as e:
 +                # Simply use the defaults for servers that don't implement GetCapabilities()
 +                if e.code() != grpc.StatusCode.UNIMPLEMENTED:
 +                    raise
++
 +            # Check whether the server supports BatchReadBlobs()
 +            self.batch_read_supported = False
 +            try:
 +                request = remote_execution_pb2.BatchReadBlobsRequest()
 +                response = self.cas.BatchReadBlobs(request)
 +                self.batch_read_supported = True
 +            except grpc.RpcError as e:
 +                if e.code() != grpc.StatusCode.UNIMPLEMENTED:
 +                    raise
++
              self._initialized = True
 +# Represents a batch of blobs queued for fetching.
 +#
 +class _CASBatchRead():
 +    def __init__(self, remote):
 +        self._remote = remote
 +        self._max_total_size_bytes = remote.max_batch_total_size_bytes
 +        self._request = remote_execution_pb2.BatchReadBlobsRequest()
 +        self._size = 0
 +        self._sent = False
++
 +    def add(self, digest):
 +        assert not self._sent
++
 +        new_batch_size = self._size + digest.size_bytes
 +        if new_batch_size > self._max_total_size_bytes:
 +            # Not enough space left in current batch
 +            return False
++
 +        request_digest = self._request.digests.add()
 +        request_digest.hash = digest.hash
 +        request_digest.size_bytes = digest.size_bytes
 +        self._size = new_batch_size
 +        return True
++
 +    def send(self):
 +        assert not self._sent
 +        self._sent = True
++
 +        if len(self._request.digests) == 0:
 +            return
++
 +        batch_response = self._remote.cas.BatchReadBlobs(self._request)
++
 +        for response in batch_response.responses:
 +            if response.status.code != grpc.StatusCode.OK.value[0]:
 +                raise ArtifactError("Failed to download blob {}: {}".format(
 +                    response.digest.hash, response.status.code))
 +            if response.digest.size_bytes != len(response.data):
 +                raise ArtifactError("Failed to download blob {}: expected {} bytes, received {} bytes".format(
 +                    response.digest.hash, response.digest.size_bytes, len(response.data)))
++
 +            yield (response.digest, response.data)
++
++
  def _grouper(iterable, n):
      while True:
          try:

buildstream/_artifactcache/casserver.py

@@ -38,8 +38,9 @@ from .._context import Context
  from .cascache import CASCache
 -# The default limit for gRPC messages is 4 MiB
 -_MAX_BATCH_TOTAL_SIZE_BYTES = 4 * 1024 * 1024
 +# The default limit for gRPC messages is 4 MiB.
 +# Limit payload to 1 MiB to leave sufficient headroom for metadata.
 +_MAX_PAYLOAD_BYTES = 1024 * 1024
  # Trying to push an artifact that is too large
@@ -158,7 +159,7 @@ class _ByteStreamServicer(bytestream_pb2_grpc.ByteStreamServicer):
                  remaining = client_digest.size_bytes - request.read_offset
                  while remaining > 0:
 -                    chunk_size = min(remaining, 64 * 1024)
 +                    chunk_size = min(remaining, _MAX_PAYLOAD_BYTES)
                      remaining -= chunk_size
                      response = bytestream_pb2.ReadResponse()
@@ -242,7 +243,7 @@ class _ContentAddressableStorageServicer(remote_execution_pb2_grpc.ContentAddres
          for digest in request.digests:
              batch_size += digest.size_bytes
 -            if batch_size > _MAX_BATCH_TOTAL_SIZE_BYTES:
 +            if batch_size > _MAX_PAYLOAD_BYTES:
                  context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
                  return response
@@ -269,7 +270,7 @@ class _CapabilitiesServicer(remote_execution_pb2_grpc.CapabilitiesServicer):
          cache_capabilities = response.cache_capabilities
          cache_capabilities.digest_function.append(remote_execution_pb2.SHA256)
          cache_capabilities.action_cache_update_capabilities.update_enabled = False
 -        cache_capabilities.max_batch_total_size_bytes = _MAX_BATCH_TOTAL_SIZE_BYTES
 +        cache_capabilities.max_batch_total_size_bytes = _MAX_PAYLOAD_BYTES
          cache_capabilities.symlink_absolute_path_strategy = remote_execution_pb2.CacheCapabilities.ALLOWED
          response.deprecated_api_version.major = 2

[Notes] [Git][BuildStream/buildstream][master] 5 commits: _artifactcache/cascache.py: Add _ensure_blob helper

Jürg Billeter pushed to branch master at BuildStream / buildstream

Commits:

2 changed files:

Changes: