Martin Blanchard pushed to branch mablanch/630-remote-execution-reconn at BuildStream / buildstream
Commits:
-
74c115b9
by Angelos Evripiotis at 2018-10-23T10:07:31Z
-
ecb58b42
by Phil Dawson at 2018-10-23T10:33:47Z
-
aa0cbf5d
by Martin Blanchard at 2018-10-23T10:54:40Z
3 changed files:
- buildstream/plugins/sources/deb.py
- buildstream/plugins/sources/tar.py
- buildstream/sandbox/_sandboxremote.py
Changes:
... | ... | @@ -50,7 +50,7 @@ deb - stage files from .deb packages |
50 | 50 |
"""
|
51 | 51 |
|
52 | 52 |
import tarfile
|
53 |
-from contextlib import contextmanager, ExitStack
|
|
53 |
+from contextlib import contextmanager
|
|
54 | 54 |
import arpy # pylint: disable=import-error
|
55 | 55 |
|
56 | 56 |
from .tar import TarSource
|
... | ... | @@ -69,8 +69,7 @@ class DebSource(TarSource): |
69 | 69 |
|
70 | 70 |
@contextmanager
|
71 | 71 |
def _get_tar(self):
|
72 |
- with ExitStack() as context:
|
|
73 |
- deb_file = context.enter_context(open(self._get_mirror_file(), 'rb'))
|
|
72 |
+ with open(self._get_mirror_file(), 'rb') as deb_file:
|
|
74 | 73 |
arpy_archive = arpy.Archive(fileobj=deb_file)
|
75 | 74 |
arpy_archive.read_all_headers()
|
76 | 75 |
data_tar_arpy = [v for k, v in arpy_archive.archived_files.items() if b"data.tar" in k][0]
|
... | ... | @@ -57,7 +57,7 @@ tar - stage files from tar archives |
57 | 57 |
|
58 | 58 |
import os
|
59 | 59 |
import tarfile
|
60 |
-from contextlib import contextmanager, ExitStack
|
|
60 |
+from contextlib import contextmanager
|
|
61 | 61 |
from tempfile import TemporaryFile
|
62 | 62 |
|
63 | 63 |
from buildstream import SourceError
|
... | ... | @@ -88,8 +88,7 @@ class TarSource(DownloadableFileSource): |
88 | 88 |
def _run_lzip(self):
|
89 | 89 |
assert self.host_lzip
|
90 | 90 |
with TemporaryFile() as lzip_stdout:
|
91 |
- with ExitStack() as context:
|
|
92 |
- lzip_file = context.enter_context(open(self._get_mirror_file(), 'r'))
|
|
91 |
+ with open(self._get_mirror_file(), 'r') as lzip_file:
|
|
93 | 92 |
self.call([self.host_lzip, '-d'],
|
94 | 93 |
stdin=lzip_file,
|
95 | 94 |
stdout=lzip_stdout)
|
... | ... | @@ -76,8 +76,7 @@ class SandboxRemote(Sandbox): |
76 | 76 |
# Upload the Command message to the remote CAS server
|
77 | 77 |
command_digest = cascache.push_message(self._get_project(), remote_command)
|
78 | 78 |
if not command_digest or not cascache.verify_digest_pushed(self._get_project(), command_digest):
|
79 |
- # Command push failed
|
|
80 |
- return None
|
|
79 |
+ raise SandboxError("Failed pushing build command to remote CAS.")
|
|
81 | 80 |
|
82 | 81 |
# Create and send the action.
|
83 | 82 |
action = remote_execution_pb2.Action(command_digest=command_digest,
|
... | ... | @@ -88,27 +87,57 @@ class SandboxRemote(Sandbox): |
88 | 87 |
# Upload the Action message to the remote CAS server
|
89 | 88 |
action_digest = cascache.push_message(self._get_project(), action)
|
90 | 89 |
if not action_digest or not cascache.verify_digest_pushed(self._get_project(), action_digest):
|
91 |
- # Action push failed
|
|
92 |
- return None
|
|
90 |
+ raise SandboxError("Failed pushing build action to remote CAS.")
|
|
93 | 91 |
|
94 | 92 |
# Next, try to create a communication channel to the BuildGrid server.
|
95 | 93 |
channel = grpc.insecure_channel(self.server_url)
|
96 | 94 |
stub = remote_execution_pb2_grpc.ExecutionStub(channel)
|
97 | 95 |
request = remote_execution_pb2.ExecuteRequest(action_digest=action_digest,
|
98 | 96 |
skip_cache_lookup=False)
|
99 |
- try:
|
|
100 |
- operation_iterator = stub.Execute(request)
|
|
101 |
- except grpc.RpcError:
|
|
102 |
- return None
|
|
97 |
+ |
|
98 |
+ def __run_remote_command(stub, execute_request=None, running_operation=None):
|
|
99 |
+ try:
|
|
100 |
+ last_operation = None
|
|
101 |
+ if execute_request is not None:
|
|
102 |
+ operation_iterator = stub.Execute(execute_request)
|
|
103 |
+ else:
|
|
104 |
+ request = remote_execution_pb2.WaitExecutionRequest(name=running_operation.name)
|
|
105 |
+ operation_iterator = stub.WaitExecution(request)
|
|
106 |
+ |
|
107 |
+ for operation in operation_iterator:
|
|
108 |
+ if operation.done:
|
|
109 |
+ return operation
|
|
110 |
+ else:
|
|
111 |
+ last_operation = operation
|
|
112 |
+ except grpc.RpcError as e:
|
|
113 |
+ status_code = e.code()
|
|
114 |
+ if status_code == grpc.StatusCode.UNAVAILABLE:
|
|
115 |
+ raise SandboxError("Failed contacting remote execution server at {}."
|
|
116 |
+ .format(self.server_url))
|
|
117 |
+ |
|
118 |
+ elif status_code in (grpc.StatusCode.INVALID_ARGUMENT,
|
|
119 |
+ grpc.StatusCode.FAILED_PRECONDITION,
|
|
120 |
+ grpc.StatusCode.RESOURCE_EXHAUSTED,
|
|
121 |
+ grpc.StatusCode.INTERNAL,
|
|
122 |
+ grpc.StatusCode.DEADLINE_EXCEEDED):
|
|
123 |
+ raise SandboxError("{} ({}).".format(e.details(), status_code.name))
|
|
124 |
+ |
|
125 |
+ elif running_operation and status_code == grpc.StatusCode.UNIMPLEMENTED:
|
|
126 |
+ raise SandboxError("Failed trying to recover from connection loss: "
|
|
127 |
+ "server does not support operation status polling recovery.")
|
|
128 |
+ |
|
129 |
+ return last_operation
|
|
103 | 130 |
|
104 | 131 |
operation = None
|
105 | 132 |
with self._get_context().timed_activity("Waiting for the remote build to complete"):
|
106 |
- # It is advantageous to check operation_iterator.code() is grpc.StatusCode.OK here,
|
|
107 |
- # which will check the server is actually contactable. However, calling it when the
|
|
108 |
- # server is available seems to cause .code() to hang forever.
|
|
109 |
- for operation in operation_iterator:
|
|
110 |
- if operation.done:
|
|
111 |
- break
|
|
133 |
+ operation = __run_remote_command(stub, execute_request=request)
|
|
134 |
+ if operation is None:
|
|
135 |
+ return None
|
|
136 |
+ elif operation.done:
|
|
137 |
+ return operation
|
|
138 |
+ |
|
139 |
+ while operation is not None and not operation.done:
|
|
140 |
+ operation = __run_remote_command(stub, running_operation=operation)
|
|
112 | 141 |
|
113 | 142 |
return operation
|
114 | 143 |
|
... | ... | @@ -192,7 +221,6 @@ class SandboxRemote(Sandbox): |
192 | 221 |
|
193 | 222 |
if operation is None:
|
194 | 223 |
# Failure of remote execution, usually due to an error in BuildStream
|
195 |
- # NB This error could be raised in __run_remote_command
|
|
196 | 224 |
raise SandboxError("No response returned from server")
|
197 | 225 |
|
198 | 226 |
assert not operation.HasField('error') and operation.HasField('response')
|