Raoul Hidalgo Charman pushed to branch raoul/802-refactor-artifactcache at BuildStream / buildstream
Commits:
-
69e643c8
by Raoul Hidalgo Charman at 2018-12-13T12:04:26Z
3 changed files:
Changes:
... | ... | @@ -28,7 +28,7 @@ from ._message import Message, MessageType |
28 | 28 |
from . import utils
|
29 | 29 |
from . import _yaml
|
30 | 30 |
|
31 |
-from ._cas.casremote import CASRemote, CASRemoteSpec
|
|
31 |
+from ._cas.casremote import BlobNotFound, CASRemote, CASRemoteSpec
|
|
32 | 32 |
|
33 | 33 |
|
34 | 34 |
CACHE_SIZE_FILE = "cache_size"
|
... | ... | @@ -644,13 +644,33 @@ class ArtifactCache(): |
644 | 644 |
display_key = element._get_brief_display_key()
|
645 | 645 |
element.status("Pulling artifact {} <- {}".format(display_key, remote.spec.url))
|
646 | 646 |
|
647 |
- if self.cas.pull(ref, remote, progress=progress, subdir=subdir, excluded_subdirs=excluded_subdirs):
|
|
648 |
- element.info("Pulled artifact {} <- {}".format(display_key, remote.spec.url))
|
|
647 |
+ root_digest = remote.get_reference(ref)
|
|
648 |
+ |
|
649 |
+ if root_digest:
|
|
650 |
+ try:
|
|
651 |
+ for blob_digest in remote.yield_blob_digests(
|
|
652 |
+ root_digest, progress=progress, subdir=subdir,
|
|
653 |
+ excluded_subdirs=excluded_subdirs):
|
|
654 |
+ if self.cas.check_blob(blob_digest):
|
|
655 |
+ continue
|
|
656 |
+ path = remote.download_blob(blob_digest)
|
|
657 |
+ self.cas.add_object(path=path, link_directly=True)
|
|
658 |
+ remote
|
|
659 |
+ self.cas.set_ref(ref, root_digest)
|
|
660 |
+ remote.tmp_downloads.clear()
|
|
661 |
+ except BlobNotFound:
|
|
662 |
+ element.info("Remote ({}) is missing blobs for {}".format(
|
|
663 |
+ remote.spec.url, element._get_brief_display_key()
|
|
664 |
+ ))
|
|
665 |
+ continue
|
|
666 |
+ |
|
649 | 667 |
if subdir:
|
650 | 668 |
# Attempt to extract subdir into artifact extract dir if it already exists
|
651 | 669 |
# without containing the subdir. If the respective artifact extract dir does not
|
652 | 670 |
# exist, a complete extraction will be performed.
|
653 | 671 |
self.extract(element, key, subdir)
|
672 |
+ |
|
673 |
+ element.info("Pulled artifact {} <- {}".format(display_key, remote.spec.url))
|
|
654 | 674 |
# no need to pull from additional remotes
|
655 | 675 |
return True
|
656 | 676 |
else:
|
... | ... | @@ -33,7 +33,7 @@ from .._protos.buildstream.v2 import buildstream_pb2 |
33 | 33 |
from .. import utils
|
34 | 34 |
from .._exceptions import CASError
|
35 | 35 |
|
36 |
-from .casremote import BlobNotFound, _CASBatchRead, _CASBatchUpdate
|
|
36 |
+from .casremote import _CASBatchUpdate
|
|
37 | 37 |
|
38 | 38 |
|
39 | 39 |
# A CASCache manages a CAS repository as specified in the Remote Execution API.
|
... | ... | @@ -183,50 +183,6 @@ class CASCache(): |
183 | 183 |
|
184 | 184 |
return modified, removed, added
|
185 | 185 |
|
186 |
- # pull():
|
|
187 |
- #
|
|
188 |
- # Pull a ref from a remote repository.
|
|
189 |
- #
|
|
190 |
- # Args:
|
|
191 |
- # ref (str): The ref to pull
|
|
192 |
- # remote (CASRemote): The remote repository to pull from
|
|
193 |
- # progress (callable): The progress callback, if any
|
|
194 |
- # subdir (str): The optional specific subdir to pull
|
|
195 |
- # excluded_subdirs (list): The optional list of subdirs to not pull
|
|
196 |
- #
|
|
197 |
- # Returns:
|
|
198 |
- # (bool): True if pull was successful, False if ref was not available
|
|
199 |
- #
|
|
200 |
- def pull(self, ref, remote, *, progress=None, subdir=None, excluded_subdirs=None):
|
|
201 |
- try:
|
|
202 |
- remote.init()
|
|
203 |
- |
|
204 |
- request = buildstream_pb2.GetReferenceRequest()
|
|
205 |
- request.key = ref
|
|
206 |
- response = remote.ref_storage.GetReference(request)
|
|
207 |
- |
|
208 |
- tree = remote_execution_pb2.Digest()
|
|
209 |
- tree.hash = response.digest.hash
|
|
210 |
- tree.size_bytes = response.digest.size_bytes
|
|
211 |
- |
|
212 |
- # Check if the element artifact is present, if so just fetch the subdir.
|
|
213 |
- if subdir and os.path.exists(self.objpath(tree)):
|
|
214 |
- self._fetch_subdir(remote, tree, subdir)
|
|
215 |
- else:
|
|
216 |
- # Fetch artifact, excluded_subdirs determined in pullqueue
|
|
217 |
- self._fetch_directory(remote, tree, excluded_subdirs=excluded_subdirs)
|
|
218 |
- |
|
219 |
- self.set_ref(ref, tree)
|
|
220 |
- |
|
221 |
- return True
|
|
222 |
- except grpc.RpcError as e:
|
|
223 |
- if e.code() != grpc.StatusCode.NOT_FOUND:
|
|
224 |
- raise CASError("Failed to pull ref {}: {}".format(ref, e)) from e
|
|
225 |
- else:
|
|
226 |
- return False
|
|
227 |
- except BlobNotFound as e:
|
|
228 |
- return False
|
|
229 |
- |
|
230 | 186 |
# pull_tree():
|
231 | 187 |
#
|
232 | 188 |
# Pull a single Tree rather than a ref.
|
... | ... | @@ -591,6 +547,17 @@ class CASCache(): |
591 | 547 |
reachable = set()
|
592 | 548 |
self._reachable_refs_dir(reachable, tree, update_mtime=True)
|
593 | 549 |
|
550 |
+ # Check to see if a blob is in the local CAS
|
|
551 |
+ # return None if not
|
|
552 |
+ def check_blob(self, digest):
|
|
553 |
+ objpath = self.objpath(digest)
|
|
554 |
+ if os.path.exists(objpath):
|
|
555 |
+ # already in local repository
|
|
556 |
+ return objpath
|
|
557 |
+ else:
|
|
558 |
+ return None
|
|
559 |
+ |
|
560 |
+ |
|
594 | 561 |
################################################
|
595 | 562 |
# Local Private Methods #
|
596 | 563 |
################################################
|
... | ... | @@ -805,126 +772,6 @@ class CASCache(): |
805 | 772 |
|
806 | 773 |
return objpath
|
807 | 774 |
|
808 |
- def _batch_download_complete(self, batch):
|
|
809 |
- for digest, data in batch.send():
|
|
810 |
- with tempfile.NamedTemporaryFile(dir=self.tmpdir) as f:
|
|
811 |
- f.write(data)
|
|
812 |
- f.flush()
|
|
813 |
- |
|
814 |
- added_digest = self.add_object(path=f.name, link_directly=True)
|
|
815 |
- assert added_digest.hash == digest.hash
|
|
816 |
- |
|
817 |
- # Helper function for _fetch_directory().
|
|
818 |
- def _fetch_directory_batch(self, remote, batch, fetch_queue, fetch_next_queue):
|
|
819 |
- self._batch_download_complete(batch)
|
|
820 |
- |
|
821 |
- # All previously scheduled directories are now locally available,
|
|
822 |
- # move them to the processing queue.
|
|
823 |
- fetch_queue.extend(fetch_next_queue)
|
|
824 |
- fetch_next_queue.clear()
|
|
825 |
- return _CASBatchRead(remote)
|
|
826 |
- |
|
827 |
- # Helper function for _fetch_directory().
|
|
828 |
- def _fetch_directory_node(self, remote, digest, batch, fetch_queue, fetch_next_queue, *, recursive=False):
|
|
829 |
- in_local_cache = os.path.exists(self.objpath(digest))
|
|
830 |
- |
|
831 |
- if in_local_cache:
|
|
832 |
- # Skip download, already in local cache.
|
|
833 |
- pass
|
|
834 |
- elif (digest.size_bytes >= remote.max_batch_total_size_bytes or
|
|
835 |
- not remote.batch_read_supported):
|
|
836 |
- # Too large for batch request, download in independent request.
|
|
837 |
- self._ensure_blob(remote, digest)
|
|
838 |
- in_local_cache = True
|
|
839 |
- else:
|
|
840 |
- if not batch.add(digest):
|
|
841 |
- # Not enough space left in batch request.
|
|
842 |
- # Complete pending batch first.
|
|
843 |
- batch = self._fetch_directory_batch(remote, batch, fetch_queue, fetch_next_queue)
|
|
844 |
- batch.add(digest)
|
|
845 |
- |
|
846 |
- if recursive:
|
|
847 |
- if in_local_cache:
|
|
848 |
- # Add directory to processing queue.
|
|
849 |
- fetch_queue.append(digest)
|
|
850 |
- else:
|
|
851 |
- # Directory will be available after completing pending batch.
|
|
852 |
- # Add directory to deferred processing queue.
|
|
853 |
- fetch_next_queue.append(digest)
|
|
854 |
- |
|
855 |
- return batch
|
|
856 |
- |
|
857 |
- # _fetch_directory():
|
|
858 |
- #
|
|
859 |
- # Fetches remote directory and adds it to content addressable store.
|
|
860 |
- #
|
|
861 |
- # Fetches files, symbolic links and recursively other directories in
|
|
862 |
- # the remote directory and adds them to the content addressable
|
|
863 |
- # store.
|
|
864 |
- #
|
|
865 |
- # Args:
|
|
866 |
- # remote (Remote): The remote to use.
|
|
867 |
- # dir_digest (Digest): Digest object for the directory to fetch.
|
|
868 |
- # excluded_subdirs (list): The optional list of subdirs to not fetch
|
|
869 |
- #
|
|
870 |
- def _fetch_directory(self, remote, dir_digest, *, excluded_subdirs=None):
|
|
871 |
- fetch_queue = [dir_digest]
|
|
872 |
- fetch_next_queue = []
|
|
873 |
- batch = _CASBatchRead(remote)
|
|
874 |
- if not excluded_subdirs:
|
|
875 |
- excluded_subdirs = []
|
|
876 |
- |
|
877 |
- while len(fetch_queue) + len(fetch_next_queue) > 0:
|
|
878 |
- if not fetch_queue:
|
|
879 |
- batch = self._fetch_directory_batch(remote, batch, fetch_queue, fetch_next_queue)
|
|
880 |
- |
|
881 |
- dir_digest = fetch_queue.pop(0)
|
|
882 |
- |
|
883 |
- objpath = self._ensure_blob(remote, dir_digest)
|
|
884 |
- |
|
885 |
- directory = remote_execution_pb2.Directory()
|
|
886 |
- with open(objpath, 'rb') as f:
|
|
887 |
- directory.ParseFromString(f.read())
|
|
888 |
- |
|
889 |
- for dirnode in directory.directories:
|
|
890 |
- if dirnode.name not in excluded_subdirs:
|
|
891 |
- batch = self._fetch_directory_node(remote, dirnode.digest, batch,
|
|
892 |
- fetch_queue, fetch_next_queue, recursive=True)
|
|
893 |
- |
|
894 |
- for filenode in directory.files:
|
|
895 |
- batch = self._fetch_directory_node(remote, filenode.digest, batch,
|
|
896 |
- fetch_queue, fetch_next_queue)
|
|
897 |
- |
|
898 |
- # Fetch final batch
|
|
899 |
- self._fetch_directory_batch(remote, batch, fetch_queue, fetch_next_queue)
|
|
900 |
- |
|
901 |
- def _fetch_subdir(self, remote, tree, subdir):
|
|
902 |
- subdirdigest = self._get_subdir(tree, subdir)
|
|
903 |
- self._fetch_directory(remote, subdirdigest)
|
|
904 |
- |
|
905 |
- def _fetch_tree(self, remote, digest):
|
|
906 |
- # download but do not store the Tree object
|
|
907 |
- with tempfile.NamedTemporaryFile(dir=self.tmpdir) as out:
|
|
908 |
- remote._fetch_blob(digest, out)
|
|
909 |
- |
|
910 |
- tree = remote_execution_pb2.Tree()
|
|
911 |
- |
|
912 |
- with open(out.name, 'rb') as f:
|
|
913 |
- tree.ParseFromString(f.read())
|
|
914 |
- |
|
915 |
- tree.children.extend([tree.root])
|
|
916 |
- for directory in tree.children:
|
|
917 |
- for filenode in directory.files:
|
|
918 |
- self._ensure_blob(remote, filenode.digest)
|
|
919 |
- |
|
920 |
- # place directory blob only in final location when we've downloaded
|
|
921 |
- # all referenced blobs to avoid dangling references in the repository
|
|
922 |
- dirbuffer = directory.SerializeToString()
|
|
923 |
- dirdigest = self.add_object(buffer=dirbuffer)
|
|
924 |
- assert dirdigest.size_bytes == len(dirbuffer)
|
|
925 |
- |
|
926 |
- return dirdigest
|
|
927 |
- |
|
928 | 775 |
def _send_directory(self, remote, digest, u_uid=uuid.uuid4()):
|
929 | 776 |
required_blobs = self._required_blobs(digest)
|
930 | 777 |
|
... | ... | @@ -3,6 +3,7 @@ import io |
3 | 3 |
import os
|
4 | 4 |
import multiprocessing
|
5 | 5 |
import signal
|
6 |
+import tempfile
|
|
6 | 7 |
from urllib.parse import urlparse
|
7 | 8 |
import uuid
|
8 | 9 |
|
... | ... | @@ -88,6 +89,14 @@ class CASRemote(): |
88 | 89 |
self.batch_read_supported = None
|
89 | 90 |
self.capabilities = None
|
90 | 91 |
self.max_batch_total_size_bytes = None
|
92 |
+ # TODO change this later
|
|
93 |
+ self.tmpdir = "/home/raoulhidalgo/.cache/buildstream/tmp"
|
|
94 |
+ os.makedirs(self.tmpdir, exist_ok=True)
|
|
95 |
+ |
|
96 |
+ self.tmp_downloads = [] # files in the tmpdir waiting to be added to local caches
|
|
97 |
+ |
|
98 |
+ def __del__(self):
|
|
99 |
+ self.tmp_downloads.clear()
|
|
91 | 100 |
|
92 | 101 |
def init(self):
|
93 | 102 |
if not self._initialized:
|
... | ... | @@ -252,6 +261,73 @@ class CASRemote(): |
252 | 261 |
|
253 | 262 |
return message_digest
|
254 | 263 |
|
264 |
+ # get_reference():
|
|
265 |
+ #
|
|
266 |
+ # Args:
|
|
267 |
+ # ref (str): The ref to request
|
|
268 |
+ #
|
|
269 |
+ def get_reference(self, ref):
|
|
270 |
+ try:
|
|
271 |
+ self.init()
|
|
272 |
+ |
|
273 |
+ request = buildstream_pb2.GetReferenceRequest()
|
|
274 |
+ request.key = ref
|
|
275 |
+ return self.ref_storage.GetReference(request).digest
|
|
276 |
+ except grpc.RpcError as e:
|
|
277 |
+ if e.code() != grpc.StatusCode.NOT_FOUND:
|
|
278 |
+ raise CASError("Failed to find ref {}: {}".format(ref, e)) from e
|
|
279 |
+ else:
|
|
280 |
+ return None
|
|
281 |
+ |
|
282 |
+ # yield_blob_digests():
|
|
283 |
+ #
|
|
284 |
+ # Iterate over the blob digests reachable from a root digest
|
|
285 |
+ #
|
|
286 |
+ # Args:
|
|
287 |
+ # root_digest (Digest): The root digest of the tree to iterate over
|
|
288 |
+ # progress (callable): The progress callback, if any
|
|
289 |
+ # subdir (str): The optional specific subdir to pull
|
|
290 |
+ # excluded_subdirs (list): The optional list of subdirs to not pull
|
|
291 |
+ #
|
|
292 |
+ # Returns:
|
|
293 |
+ # (iter): An iterator over the digests of the blobs to pull
|
|
294 |
+ #
|
|
295 |
+ def yield_blob_digests(self, root_digest, *, progress=None, subdir=None, excluded_subdirs=None):
|
|
296 |
+ try:
|
|
297 |
+ self.init()
|
|
298 |
+ |
|
299 |
+ # Check if the element artifact is present, if so just fetch the subdir.
|
|
300 |
+ if subdir and os.path.exists(self.objpath(tree)):
|
|
301 |
+ yield from self._yield_subdir(root_digest, subdir)
|
|
302 |
+ else:
|
|
303 |
+ # Fetch artifact, excluded_subdirs determined in pullqueue
|
|
304 |
+ yield from self._yield_directory_digests(root_digest, excluded_subdirs=excluded_subdirs)
|
|
305 |
+ |
|
306 |
+ except grpc.RpcError as e:
|
|
307 |
+ if e.code() != grpc.StatusCode.NOT_FOUND:
|
|
308 |
+ raise CASError("Failed to pull ref {}: {}".format(ref, e)) from e
|
|
309 |
+ except BlobNotFound as e:
|
|
310 |
+ raise e
|
|
311 |
+ |
|
312 |
+ def request_blob(self, digest):
|
|
313 |
+ pass
|
|
314 |
+ |
|
315 |
+ # download_blob():
|
|
316 |
+ #
|
|
317 |
+ # Downloads blob immediately and returns path to tmpdir location
|
|
318 |
+ #
|
|
319 |
+ # Returns:
|
|
320 |
+ # path (str): tmpdir location of downloaded blob
|
|
321 |
+ #
|
|
322 |
+ def download_blob(self, digest):
|
|
323 |
+ # TODO expand for adding to batches some other logic
|
|
324 |
+ self.tmp_downloads.clear()
|
|
325 |
+ |
|
326 |
+ f = tempfile.NamedTemporaryFile(dir=self.tmpdir)
|
|
327 |
+ self._fetch_blob(digest, f)
|
|
328 |
+ self.tmp_downloads.append(f)
|
|
329 |
+ return f.name
|
|
330 |
+ |
|
255 | 331 |
################################################
|
256 | 332 |
# Local Private Methods #
|
257 | 333 |
################################################
|
... | ... | @@ -266,6 +342,83 @@ class CASRemote(): |
266 | 342 |
|
267 | 343 |
assert digest.size_bytes == os.fstat(stream.fileno()).st_size
|
268 | 344 |
|
345 |
+ def _batch_download_complete(self, batch):
|
|
346 |
+ for digest, data in batch.send():
|
|
347 |
+ with tempfile.NamedTemporaryFile(dir=self.tmpdir) as f:
|
|
348 |
+ f.write(data)
|
|
349 |
+ f.flush()
|
|
350 |
+ self.tmp_downloads.append(f)
|
|
351 |
+ |
|
352 |
+ # added_digest = self.add_object(path=f.name, link_directly=True)
|
|
353 |
+ # assert added_digest.hash == digest.hash
|
|
354 |
+ |
|
355 |
+ # Helper function for _fetch_directory().
|
|
356 |
+ def _fetch_directory_batch(self, batch, fetch_queue, fetch_next_queue):
|
|
357 |
+ self._batch_download_complete(batch)
|
|
358 |
+ |
|
359 |
+ # All previously scheduled directories are now locally available,
|
|
360 |
+ # move them to the processing queue.
|
|
361 |
+ fetch_queue.extend(fetch_next_queue)
|
|
362 |
+ fetch_next_queue.clear()
|
|
363 |
+ return _CASBatchRead(self)
|
|
364 |
+ |
|
365 |
+ # _yield_directory_digests():
|
|
366 |
+ #
|
|
367 |
+ # Fetches remote directory and adds it to content addressable store.
|
|
368 |
+ #
|
|
369 |
+ # Fetches files, symbolic links and recursively other directories in
|
|
370 |
+ # the remote directory and adds them to the content addressable
|
|
371 |
+ # store.
|
|
372 |
+ #
|
|
373 |
+ # Args:
|
|
374 |
+ # dir_digest (Digest): Digest object for the directory to fetch.
|
|
375 |
+ # excluded_subdirs (list): The optional list of subdirs to not fetch
|
|
376 |
+ #
|
|
377 |
+ def _yield_directory_digests(self, dir_digest, *, excluded_subdirs=[]):
|
|
378 |
+ |
|
379 |
+ objpath = self.download_blob(dir_digest)
|
|
380 |
+ |
|
381 |
+ directory = remote_execution_pb2.Directory()
|
|
382 |
+ |
|
383 |
+ with open(objpath, 'rb') as f:
|
|
384 |
+ directory.ParseFromString(f.read())
|
|
385 |
+ |
|
386 |
+ yield dir_digest
|
|
387 |
+ for filenode in directory.files:
|
|
388 |
+ yield filenode.digest
|
|
389 |
+ |
|
390 |
+ for dirnode in directory.directories:
|
|
391 |
+ if dirnode.name not in excluded_subdirs:
|
|
392 |
+ yield dirnode.digest
|
|
393 |
+ yield from self._yield_directory_digests(dirnode.digest)
|
|
394 |
+ |
|
395 |
+ def _fetch_subdir(self, remote, tree, subdir):
|
|
396 |
+ subdirdigest = self._get_subdir(tree, subdir)
|
|
397 |
+ self._fetch_directory(remote, subdirdigest)
|
|
398 |
+ |
|
399 |
+ def _fetch_tree(self, remote, digest):
|
|
400 |
+ # download but do not store the Tree object
|
|
401 |
+ with tempfile.NamedTemporaryFile(dir=self.tmpdir) as out:
|
|
402 |
+ remote._fetch_blob(digest, out)
|
|
403 |
+ |
|
404 |
+ tree = remote_execution_pb2.Tree()
|
|
405 |
+ |
|
406 |
+ with open(out.name, 'rb') as f:
|
|
407 |
+ tree.ParseFromString(f.read())
|
|
408 |
+ |
|
409 |
+ tree.children.extend([tree.root])
|
|
410 |
+ for directory in tree.children:
|
|
411 |
+ for filenode in directory.files:
|
|
412 |
+ self._ensure_blob(remote, filenode.digest)
|
|
413 |
+ |
|
414 |
+ # place directory blob only in final location when we've downloaded
|
|
415 |
+ # all referenced blobs to avoid dangling references in the repository
|
|
416 |
+ dirbuffer = directory.SerializeToString()
|
|
417 |
+ dirdigest = self.add_object(buffer=dirbuffer)
|
|
418 |
+ assert dirdigest.size_bytes == len(dirbuffer)
|
|
419 |
+ |
|
420 |
+ return dirdigest
|
|
421 |
+ |
|
269 | 422 |
def _send_blob(self, digest, stream, u_uid=uuid.uuid4()):
|
270 | 423 |
resource_name = '/'.join(['uploads', str(u_uid), 'blobs',
|
271 | 424 |
digest.hash, str(digest.size_bytes)])
|