Martin Blanchard pushed to branch mablanch/630-remote-execution-reconn at BuildStream / buildstream
Commits:
- 74c115b9 by Angelos Evripiotis at 2018-10-23T10:07:31Z
- ecb58b42 by Phil Dawson at 2018-10-23T10:33:47Z
- aa0cbf5d by Martin Blanchard at 2018-10-23T10:54:40Z
3 changed files:
- buildstream/plugins/sources/deb.py
- buildstream/plugins/sources/tar.py
- buildstream/sandbox/_sandboxremote.py
Changes:
| ... | ... | @@ -50,7 +50,7 @@ deb - stage files from .deb packages |
| 50 | 50 |
"""
|
| 51 | 51 |
|
| 52 | 52 |
import tarfile
|
| 53 |
-from contextlib import contextmanager, ExitStack
|
|
| 53 |
+from contextlib import contextmanager
|
|
| 54 | 54 |
import arpy # pylint: disable=import-error
|
| 55 | 55 |
|
| 56 | 56 |
from .tar import TarSource
|
| ... | ... | @@ -69,8 +69,7 @@ class DebSource(TarSource): |
| 69 | 69 |
|
| 70 | 70 |
@contextmanager
|
| 71 | 71 |
def _get_tar(self):
|
| 72 |
- with ExitStack() as context:
|
|
| 73 |
- deb_file = context.enter_context(open(self._get_mirror_file(), 'rb'))
|
|
| 72 |
+ with open(self._get_mirror_file(), 'rb') as deb_file:
|
|
| 74 | 73 |
arpy_archive = arpy.Archive(fileobj=deb_file)
|
| 75 | 74 |
arpy_archive.read_all_headers()
|
| 76 | 75 |
data_tar_arpy = [v for k, v in arpy_archive.archived_files.items() if b"data.tar" in k][0]
|
| ... | ... | @@ -57,7 +57,7 @@ tar - stage files from tar archives |
| 57 | 57 |
|
| 58 | 58 |
import os
|
| 59 | 59 |
import tarfile
|
| 60 |
-from contextlib import contextmanager, ExitStack
|
|
| 60 |
+from contextlib import contextmanager
|
|
| 61 | 61 |
from tempfile import TemporaryFile
|
| 62 | 62 |
|
| 63 | 63 |
from buildstream import SourceError
|
| ... | ... | @@ -88,8 +88,7 @@ class TarSource(DownloadableFileSource): |
| 88 | 88 |
def _run_lzip(self):
|
| 89 | 89 |
assert self.host_lzip
|
| 90 | 90 |
with TemporaryFile() as lzip_stdout:
|
| 91 |
- with ExitStack() as context:
|
|
| 92 |
- lzip_file = context.enter_context(open(self._get_mirror_file(), 'r'))
|
|
| 91 |
+ with open(self._get_mirror_file(), 'r') as lzip_file:
|
|
| 93 | 92 |
self.call([self.host_lzip, '-d'],
|
| 94 | 93 |
stdin=lzip_file,
|
| 95 | 94 |
stdout=lzip_stdout)
|
| ... | ... | @@ -76,8 +76,7 @@ class SandboxRemote(Sandbox): |
| 76 | 76 |
# Upload the Command message to the remote CAS server
|
| 77 | 77 |
command_digest = cascache.push_message(self._get_project(), remote_command)
|
| 78 | 78 |
if not command_digest or not cascache.verify_digest_pushed(self._get_project(), command_digest):
|
| 79 |
- # Command push failed
|
|
| 80 |
- return None
|
|
| 79 |
+ raise SandboxError("Failed pushing build command to remote CAS.")
|
|
| 81 | 80 |
|
| 82 | 81 |
# Create and send the action.
|
| 83 | 82 |
action = remote_execution_pb2.Action(command_digest=command_digest,
|
| ... | ... | @@ -88,27 +87,57 @@ class SandboxRemote(Sandbox): |
| 88 | 87 |
# Upload the Action message to the remote CAS server
|
| 89 | 88 |
action_digest = cascache.push_message(self._get_project(), action)
|
| 90 | 89 |
if not action_digest or not cascache.verify_digest_pushed(self._get_project(), action_digest):
|
| 91 |
- # Action push failed
|
|
| 92 |
- return None
|
|
| 90 |
+ raise SandboxError("Failed pushing build action to remote CAS.")
|
|
| 93 | 91 |
|
| 94 | 92 |
# Next, try to create a communication channel to the BuildGrid server.
|
| 95 | 93 |
channel = grpc.insecure_channel(self.server_url)
|
| 96 | 94 |
stub = remote_execution_pb2_grpc.ExecutionStub(channel)
|
| 97 | 95 |
request = remote_execution_pb2.ExecuteRequest(action_digest=action_digest,
|
| 98 | 96 |
skip_cache_lookup=False)
|
| 99 |
- try:
|
|
| 100 |
- operation_iterator = stub.Execute(request)
|
|
| 101 |
- except grpc.RpcError:
|
|
| 102 |
- return None
|
|
| 97 |
+ |
|
| 98 |
+ def __run_remote_command(stub, execute_request=None, running_operation=None):
|
|
| 99 |
+ try:
|
|
| 100 |
+ last_operation = None
|
|
| 101 |
+ if execute_request is not None:
|
|
| 102 |
+ operation_iterator = stub.Execute(execute_request)
|
|
| 103 |
+ else:
|
|
| 104 |
+ request = remote_execution_pb2.WaitExecutionRequest(name=running_operation.name)
|
|
| 105 |
+ operation_iterator = stub.WaitExecution(request)
|
|
| 106 |
+ |
|
| 107 |
+ for operation in operation_iterator:
|
|
| 108 |
+ if operation.done:
|
|
| 109 |
+ return operation
|
|
| 110 |
+ else:
|
|
| 111 |
+ last_operation = operation
|
|
| 112 |
+ except grpc.RpcError as e:
|
|
| 113 |
+ status_code = e.code()
|
|
| 114 |
+ if status_code == grpc.StatusCode.UNAVAILABLE:
|
|
| 115 |
+ raise SandboxError("Failed contacting remote execution server at {}."
|
|
| 116 |
+ .format(self.server_url))
|
|
| 117 |
+ |
|
| 118 |
+ elif status_code in (grpc.StatusCode.INVALID_ARGUMENT,
|
|
| 119 |
+ grpc.StatusCode.FAILED_PRECONDITION,
|
|
| 120 |
+ grpc.StatusCode.RESOURCE_EXHAUSTED,
|
|
| 121 |
+ grpc.StatusCode.INTERNAL,
|
|
| 122 |
+ grpc.StatusCode.DEADLINE_EXCEEDED):
|
|
| 123 |
+ raise SandboxError("{} ({}).".format(e.details(), status_code.name))
|
|
| 124 |
+ |
|
| 125 |
+ elif running_operation and status_code == grpc.StatusCode.UNIMPLEMENTED:
|
|
| 126 |
+ raise SandboxError("Failed trying to recover from connection loss: "
|
|
| 127 |
+ "server does not support operation status polling recovery.")
|
|
| 128 |
+ |
|
| 129 |
+ return last_operation
|
|
| 103 | 130 |
|
| 104 | 131 |
operation = None
|
| 105 | 132 |
with self._get_context().timed_activity("Waiting for the remote build to complete"):
|
| 106 |
- # It is advantageous to check operation_iterator.code() is grpc.StatusCode.OK here,
|
|
| 107 |
- # which will check the server is actually contactable. However, calling it when the
|
|
| 108 |
- # server is available seems to cause .code() to hang forever.
|
|
| 109 |
- for operation in operation_iterator:
|
|
| 110 |
- if operation.done:
|
|
| 111 |
- break
|
|
| 133 |
+ operation = __run_remote_command(stub, execute_request=request)
|
|
| 134 |
+ if operation is None:
|
|
| 135 |
+ return None
|
|
| 136 |
+ elif operation.done:
|
|
| 137 |
+ return operation
|
|
| 138 |
+ |
|
| 139 |
+ while operation is not None and not operation.done:
|
|
| 140 |
+ operation = __run_remote_command(stub, running_operation=operation)
|
|
| 112 | 141 |
|
| 113 | 142 |
return operation
|
| 114 | 143 |
|
| ... | ... | @@ -192,7 +221,6 @@ class SandboxRemote(Sandbox): |
| 192 | 221 |
|
| 193 | 222 |
if operation is None:
|
| 194 | 223 |
# Failure of remote execution, usually due to an error in BuildStream
|
| 195 |
- # NB This error could be raised in __run_remote_command
|
|
| 196 | 224 |
raise SandboxError("No response returned from server")
|
| 197 | 225 |
|
| 198 | 226 |
assert not operation.HasField('error') and operation.HasField('response')
|
