Martin Blanchard pushed to branch mablanch/630-remote-execution-reconn at BuildStream / buildstream
Commits:
-
12719f0d
by Jürg Billeter at 2018-10-22T17:05:41Z
-
a7a28d14
by Jürg Billeter at 2018-10-22T17:05:41Z
-
be29e0f5
by Jürg Billeter at 2018-10-22T17:53:26Z
-
b74aca1a
by Jürg Billeter at 2018-10-23T09:22:19Z
-
c7dda150
by Jürg Billeter at 2018-10-23T09:48:00Z
-
797d3010
by Martin Blanchard at 2018-10-23T10:01:38Z
4 changed files:
- buildstream/_artifactcache/artifactcache.py
- buildstream/_artifactcache/cascache.py
- buildstream/sandbox/_sandboxremote.py
- tests/testutils/artifactshare.py
Changes:
... | ... | @@ -228,7 +228,7 @@ class ArtifactCache(): |
228 | 228 |
self._required_elements.update(elements)
|
229 | 229 |
|
230 | 230 |
# For the cache keys which were resolved so far, we bump
|
231 |
- # the atime of them.
|
|
231 |
+ # the mtime of them.
|
|
232 | 232 |
#
|
233 | 233 |
# This is just in case we have concurrent instances of
|
234 | 234 |
# BuildStream running with the same artifact cache, it will
|
... | ... | @@ -240,7 +240,7 @@ class ArtifactCache(): |
240 | 240 |
for key in (strong_key, weak_key):
|
241 | 241 |
if key:
|
242 | 242 |
try:
|
243 |
- self.update_atime(key)
|
|
243 |
+ self.update_mtime(element, key)
|
|
244 | 244 |
except ArtifactError:
|
245 | 245 |
pass
|
246 | 246 |
|
... | ... | @@ -391,15 +391,16 @@ class ArtifactCache(): |
391 | 391 |
def preflight(self):
|
392 | 392 |
pass
|
393 | 393 |
|
394 |
- # update_atime()
|
|
394 |
+ # update_mtime()
|
|
395 | 395 |
#
|
396 |
- # Update the atime of an artifact.
|
|
396 |
+ # Update the mtime of an artifact.
|
|
397 | 397 |
#
|
398 | 398 |
# Args:
|
399 |
+ # element (Element): The Element to update
|
|
399 | 400 |
# key (str): The key of the artifact.
|
400 | 401 |
#
|
401 |
- def update_atime(self, key):
|
|
402 |
- raise ImplError("Cache '{kind}' does not implement contains()"
|
|
402 |
+ def update_mtime(self, element, key):
|
|
403 |
+ raise ImplError("Cache '{kind}' does not implement update_mtime()"
|
|
403 | 404 |
.format(kind=type(self).__name__))
|
404 | 405 |
|
405 | 406 |
# initialize_remotes():
|
... | ... | @@ -538,8 +538,9 @@ class CASCache(ArtifactCache): |
538 | 538 |
except FileNotFoundError as e:
|
539 | 539 |
raise ArtifactError("Attempt to access unavailable artifact: {}".format(e)) from e
|
540 | 540 |
|
541 |
- def update_atime(self, ref):
|
|
541 |
+ def update_mtime(self, element, key):
|
|
542 | 542 |
try:
|
543 |
+ ref = self.get_artifact_fullname(element, key)
|
|
543 | 544 |
os.utime(self._refpath(ref))
|
544 | 545 |
except FileNotFoundError as e:
|
545 | 546 |
raise ArtifactError("Attempt to access unavailable artifact: {}".format(e)) from e
|
... | ... | @@ -76,8 +76,7 @@ class SandboxRemote(Sandbox): |
76 | 76 |
# Upload the Command message to the remote CAS server
|
77 | 77 |
command_digest = cascache.push_message(self._get_project(), remote_command)
|
78 | 78 |
if not command_digest or not cascache.verify_digest_pushed(self._get_project(), command_digest):
|
79 |
- # Command push failed
|
|
80 |
- return None
|
|
79 |
+ raise SandboxError("Failed pushing build command to remote CAS.")
|
|
81 | 80 |
|
82 | 81 |
# Create and send the action.
|
83 | 82 |
action = remote_execution_pb2.Action(command_digest=command_digest,
|
... | ... | @@ -88,27 +87,57 @@ class SandboxRemote(Sandbox): |
88 | 87 |
# Upload the Action message to the remote CAS server
|
89 | 88 |
action_digest = cascache.push_message(self._get_project(), action)
|
90 | 89 |
if not action_digest or not cascache.verify_digest_pushed(self._get_project(), action_digest):
|
91 |
- # Action push failed
|
|
92 |
- return None
|
|
90 |
+ raise SandboxError("Failed pushing build action to remote CAS.")
|
|
93 | 91 |
|
94 | 92 |
# Next, try to create a communication channel to the BuildGrid server.
|
95 | 93 |
channel = grpc.insecure_channel(self.server_url)
|
96 | 94 |
stub = remote_execution_pb2_grpc.ExecutionStub(channel)
|
97 | 95 |
request = remote_execution_pb2.ExecuteRequest(action_digest=action_digest,
|
98 | 96 |
skip_cache_lookup=False)
|
99 |
- try:
|
|
100 |
- operation_iterator = stub.Execute(request)
|
|
101 |
- except grpc.RpcError:
|
|
102 |
- return None
|
|
97 |
+ |
|
98 |
+ def __run_remote_command(stub, execute_request=None, running_operation=None):
|
|
99 |
+ try:
|
|
100 |
+ last_operation = None
|
|
101 |
+ if execute_request is not None:
|
|
102 |
+ operation_iterator = stub.Execute(execute_request)
|
|
103 |
+ else:
|
|
104 |
+ request = remote_execution_pb2.WaitExecutionRequest(name=running_operation.name)
|
|
105 |
+ operation_iterator = stub.WaitExecution(request)
|
|
106 |
+ |
|
107 |
+ for operation in operation_iterator:
|
|
108 |
+ if operation.done:
|
|
109 |
+ return operation
|
|
110 |
+ else:
|
|
111 |
+ last_operation = operation
|
|
112 |
+ except grpc.RpcError as e:
|
|
113 |
+ status_code = e.code()
|
|
114 |
+ if status_code == grpc.StatusCode.UNAVAILABLE:
|
|
115 |
+ raise SandboxError("Failed contacting remote execution server at {}."
|
|
116 |
+ .format(self.server_url))
|
|
117 |
+ |
|
118 |
+ elif status_code in (grpc.StatusCode.INVALID_ARGUMENT,
|
|
119 |
+ grpc.StatusCode.FAILED_PRECONDITION,
|
|
120 |
+ grpc.StatusCode.RESOURCE_EXHAUSTED,
|
|
121 |
+ grpc.StatusCode.INTERNAL,
|
|
122 |
+ grpc.StatusCode.DEADLINE_EXCEEDED):
|
|
123 |
+ raise SandboxError("{} ({}).".format(e.details(), status_code.name))
|
|
124 |
+ |
|
125 |
+ elif running_operation and status_code == grpc.StatusCode.UNIMPLEMENTED:
|
|
126 |
+ raise SandboxError("Failed trying to recover from connection loss: "
|
|
127 |
+ "server does not support operation status polling recovery.")
|
|
128 |
+ |
|
129 |
+ return last_operation
|
|
103 | 130 |
|
104 | 131 |
operation = None
|
105 | 132 |
with self._get_context().timed_activity("Waiting for the remote build to complete"):
|
106 |
- # It is advantageous to check operation_iterator.code() is grpc.StatusCode.OK here,
|
|
107 |
- # which will check the server is actually contactable. However, calling it when the
|
|
108 |
- # server is available seems to cause .code() to hang forever.
|
|
109 |
- for operation in operation_iterator:
|
|
110 |
- if operation.done:
|
|
111 |
- break
|
|
133 |
+ operation = __run_remote_command(stub, execute_request=request)
|
|
134 |
+ if operation is None:
|
|
135 |
+ return None
|
|
136 |
+ elif operation.done:
|
|
137 |
+ return operation
|
|
138 |
+ |
|
139 |
+ while operation is not None and not operation.done:
|
|
140 |
+ operation = __run_remote_command(stub, running_operation=operation)
|
|
112 | 141 |
|
113 | 142 |
return operation
|
114 | 143 |
|
... | ... | @@ -192,7 +221,6 @@ class SandboxRemote(Sandbox): |
192 | 221 |
|
193 | 222 |
if operation is None:
|
194 | 223 |
# Failure of remote execution, usually due to an error in BuildStream
|
195 |
- # NB This error could be raised in __run_remote_command
|
|
196 | 224 |
raise SandboxError("No response returned from server")
|
197 | 225 |
|
198 | 226 |
assert not operation.HasField('error') and operation.HasField('response')
|
... | ... | @@ -122,9 +122,8 @@ class ArtifactShare(): |
122 | 122 |
# same algo for creating an artifact reference
|
123 | 123 |
#
|
124 | 124 |
|
125 |
- # Chop off the .bst suffix first
|
|
126 |
- assert element_name.endswith('.bst')
|
|
127 |
- element_name = element_name[:-4]
|
|
125 |
+ # Replace path separator and chop off the .bst suffix
|
|
126 |
+ element_name = os.path.splitext(element_name.replace(os.sep, '-'))[0]
|
|
128 | 127 |
|
129 | 128 |
valid_chars = string.digits + string.ascii_letters + '-._'
|
130 | 129 |
element_name = ''.join([
|