finn pushed to branch finn/84-bot-errors at BuildGrid / buildgrid
Commits:
-
7baac532
by finnball at 2018-09-24T08:26:59Z
-
704a3a11
by finnball at 2018-09-24T08:27:02Z
-
fb9cedd3
by finnball at 2018-09-24T08:27:02Z
-
b3927da5
by finnball at 2018-09-24T08:27:50Z
-
5e375045
by finnball at 2018-09-24T08:27:54Z
-
50cbe35f
by finnball at 2018-09-24T08:27:54Z
-
4896bb8f
by finnball at 2018-09-24T08:27:54Z
8 changed files:
- buildgrid/_app/bots/buildbox.py
- buildgrid/_app/bots/temp_directory.py
- buildgrid/bot/bot_session.py
- buildgrid/server/bots/instance.py
- buildgrid/server/job.py
- buildgrid/server/scheduler.py
- buildgrid/settings.py
- tests/integration/operations_service.py
Changes:
... | ... | @@ -19,9 +19,11 @@ import tempfile |
19 | 19 |
|
20 | 20 |
from google.protobuf import any_pb2
|
21 | 21 |
|
22 |
+from buildgrid.settings import HASH_LENGTH
|
|
22 | 23 |
from buildgrid.client.cas import upload
|
23 | 24 |
from buildgrid._protos.build.bazel.remote.execution.v2 import remote_execution_pb2
|
24 | 25 |
from buildgrid._protos.google.bytestream import bytestream_pb2_grpc
|
26 |
+from buildgrid._exceptions import BotError
|
|
25 | 27 |
from buildgrid.utils import read_file, write_file, parse_to_pb2_from_fetch
|
26 | 28 |
|
27 | 29 |
|
... | ... | @@ -87,17 +89,30 @@ def work_buildbox(context, lease): |
87 | 89 |
|
88 | 90 |
command_line = subprocess.Popen(command_line,
|
89 | 91 |
stdin=subprocess.PIPE,
|
90 |
- stdout=subprocess.PIPE)
|
|
91 |
- # TODO: Should return the stdout and stderr to the user.
|
|
92 |
- command_line.communicate()
|
|
92 |
+ stdout=subprocess.PIPE,
|
|
93 |
+ stderr=subprocess.PIPE)
|
|
94 |
+ stdout, stderr = command_line.communicate()
|
|
95 |
+ returncode = command_line.returncode
|
|
96 |
+ action_result = remote_execution_pb2.ActionResult()
|
|
97 |
+ # TODO: Upload to CAS or output RAW
|
|
98 |
+ # For now, just pass raw
|
|
99 |
+ # https://gitlab.com/BuildGrid/buildgrid/issues/90
|
|
100 |
+ action_result.stdout_raw = stdout
|
|
101 |
+ action_result.stderr_raw = stderr
|
|
102 |
+ action_result.exit_code = returncode
|
|
103 |
+ |
|
104 |
+ logger.debug("BuildBox stderr: [{}]".format(stderr.encode()))
|
|
105 |
+ logger.debug("BuildBox stdout: [{}]".format(stdout.encode()))
|
|
106 |
+ logger.debug("BuildBox exit code: [{}]".format(returncode))
|
|
93 | 107 |
|
94 | 108 |
output_digest = remote_execution_pb2.Digest()
|
95 | 109 |
output_digest.ParseFromString(read_file(output_digest_file.name))
|
96 | 110 |
|
97 |
- logger.debug("Output root digest: {}".format(output_digest))
|
|
111 |
+ logger.debug("Output root digest: [{}]".format(output_digest))
|
|
98 | 112 |
|
99 |
- if len(output_digest.hash) < 64:
|
|
100 |
- logger.warning("Buildbox command failed - no output root digest present.")
|
|
113 |
+ if len(output_digest.hash) != HASH_LENGTH:
|
|
114 |
+ raise BotError(stdout,
|
|
115 |
+ detail=stderr, reason="Output root digest too small.")
|
|
101 | 116 |
|
102 | 117 |
# TODO: Have BuildBox help us create the Tree instance here
|
103 | 118 |
# See https://gitlab.com/BuildStream/buildbox/issues/7 for details
|
... | ... | @@ -110,7 +125,6 @@ def work_buildbox(context, lease): |
110 | 125 |
output_directory.tree_digest.CopyFrom(output_tree_digest)
|
111 | 126 |
output_directory.path = os.path.relpath(working_directory, start='/')
|
112 | 127 |
|
113 |
- action_result = remote_execution_pb2.ActionResult()
|
|
114 | 128 |
action_result.output_directories.extend([output_directory])
|
115 | 129 |
|
116 | 130 |
action_result_any = any_pb2.Any()
|
... | ... | @@ -77,11 +77,23 @@ def work_temp_directory(context, lease): |
77 | 77 |
universal_newlines=True,
|
78 | 78 |
env=environment,
|
79 | 79 |
stdin=subprocess.PIPE,
|
80 |
- stdout=subprocess.PIPE)
|
|
81 |
- # TODO: Should return the stdout and stderr in the ActionResult.
|
|
82 |
- process.communicate()
|
|
80 |
+ stdout=subprocess.PIPE,
|
|
81 |
+ stderr=subprocess.PIPE)
|
|
82 |
+ |
|
83 |
+ stdout, stderr = process.communicate()
|
|
84 |
+ returncode = process.returncode
|
|
83 | 85 |
|
84 | 86 |
action_result = remote_execution_pb2.ActionResult()
|
87 |
+ # TODO: Upload to CAS or output RAW
|
|
88 |
+ # For now, just pass raw
|
|
89 |
+ # https://gitlab.com/BuildGrid/buildgrid/issues/90
|
|
90 |
+ action_result.stdout_raw = stdout.encode()
|
|
91 |
+ action_result.stderr_raw = stderr.encode()
|
|
92 |
+ action_result.exit_code = returncode
|
|
93 |
+ |
|
94 |
+ logger.debug("Command stderr: [{}]".format(stderr.encode()))
|
|
95 |
+ logger.debug("Command stdout: [{}]".format(stdout.encode()))
|
|
96 |
+ logger.debug("Command exit code: [{}]".format(returncode))
|
|
85 | 97 |
|
86 | 98 |
with upload(context.cas_channel, instance=instance_name) as cas:
|
87 | 99 |
for output_path in command.output_files:
|
... | ... | @@ -12,6 +12,9 @@ |
12 | 12 |
# See the License for the specific language governing permissions and
|
13 | 13 |
# limitations under the License.
|
14 | 14 |
|
15 |
+# Disable broad exception catch
|
|
16 |
+# pylint: disable=broad-except
|
|
17 |
+ |
|
15 | 18 |
|
16 | 19 |
"""
|
17 | 20 |
Bot Session
|
... | ... | @@ -23,10 +26,13 @@ import asyncio |
23 | 26 |
import logging
|
24 | 27 |
import platform
|
25 | 28 |
import uuid
|
26 |
- |
|
27 | 29 |
from enum import Enum
|
28 | 30 |
|
31 |
+import grpc
|
|
32 |
+ |
|
33 |
+from buildgrid._protos.google.rpc import code_pb2
|
|
29 | 34 |
from buildgrid._protos.google.devtools.remoteworkers.v1test2 import bots_pb2, worker_pb2
|
35 |
+from buildgrid._exceptions import BotError
|
|
30 | 36 |
|
31 | 37 |
|
32 | 38 |
class BotStatus(Enum):
|
... | ... | @@ -142,9 +148,22 @@ class BotSession: |
142 | 148 |
|
143 | 149 |
async def create_work(self, lease):
|
144 | 150 |
self.logger.debug("Work created: [{}]".format(lease.id))
|
145 |
- |
|
146 | 151 |
loop = asyncio.get_event_loop()
|
147 |
- lease = await loop.run_in_executor(None, self._work, self._context, lease)
|
|
152 |
+ |
|
153 |
+ try:
|
|
154 |
+ lease = await loop.run_in_executor(None, self._work, self._context, lease)
|
|
155 |
+ |
|
156 |
+ except grpc.RpcError as e:
|
|
157 |
+ self.logger.error("Connection error thrown: [{}]".format(e))
|
|
158 |
+ lease.status.code = e.code()
|
|
159 |
+ |
|
160 |
+ except BotError as e:
|
|
161 |
+ self.logger.error("Internal bot error thrown: [{}]".format(e))
|
|
162 |
+ lease.status.code = code_pb2.INTERNAL
|
|
163 |
+ |
|
164 |
+ except Exception as e:
|
|
165 |
+ self.logger.error("Connection error thrown: [{}]".format(e))
|
|
166 |
+ lease.status.code = code_pb2.INTERNAL
|
|
148 | 167 |
|
149 | 168 |
self.logger.debug("Work complete: [{}]".format(lease.id))
|
150 | 169 |
self.lease_completed(lease)
|
... | ... | @@ -117,7 +117,7 @@ class BotsInterface: |
117 | 117 |
|
118 | 118 |
elif client_state == LeaseState.COMPLETED:
|
119 | 119 |
self._scheduler.update_job_lease_state(client_lease.id, client_lease.state)
|
120 |
- self._scheduler.job_complete(client_lease.id, client_lease.result)
|
|
120 |
+ self._scheduler.job_complete(client_lease.id, client_lease.result, client_lease.status)
|
|
121 | 121 |
return None
|
122 | 122 |
|
123 | 123 |
else:
|
... | ... | @@ -121,10 +121,9 @@ class Job: |
121 | 121 |
self._operation.metadata.CopyFrom(self._pack_any(self.get_operation_meta()))
|
122 | 122 |
if self.result is not None:
|
123 | 123 |
self._operation.done = True
|
124 |
- action_result = remote_execution_pb2.ActionResult()
|
|
125 |
- self.result.Unpack(action_result)
|
|
126 |
- response = remote_execution_pb2.ExecuteResponse(result=action_result,
|
|
127 |
- cached_result=self.result_cached)
|
|
124 |
+ response = remote_execution_pb2.ExecuteResponse(result=self.result,
|
|
125 |
+ cached_result=self.result_cached,
|
|
126 |
+ status=self.lease.status)
|
|
128 | 127 |
self._operation.response.CopyFrom(self._pack_any(response))
|
129 | 128 |
|
130 | 129 |
return self._operation
|
... | ... | @@ -27,6 +27,7 @@ from google.protobuf import any_pb2 |
27 | 27 |
|
28 | 28 |
|
29 | 29 |
from buildgrid.server._exceptions import NotFoundError
|
30 |
+from buildgrid._protos.build.bazel.remote.execution.v2 import remote_execution_pb2
|
|
30 | 31 |
from buildgrid._protos.google.longrunning import operations_pb2
|
31 | 32 |
|
32 | 33 |
from .job import ExecuteStage, LeaseState
|
... | ... | @@ -82,12 +83,16 @@ class Scheduler: |
82 | 83 |
job.n_tries += 1
|
83 | 84 |
self.queue.appendleft(job)
|
84 | 85 |
|
85 |
- def job_complete(self, name, result):
|
|
86 |
+ def job_complete(self, name, result, status):
|
|
86 | 87 |
job = self.jobs[name]
|
87 |
- job.result = result
|
|
88 |
- job.update_execute_stage(ExecuteStage.COMPLETED)
|
|
88 |
+ job.lease.status.CopyFrom(status)
|
|
89 |
+ action_result = remote_execution_pb2.ActionResult()
|
|
90 |
+ result.Unpack(action_result)
|
|
91 |
+ job.result = action_result
|
|
89 | 92 |
if not job.do_not_cache and self._action_cache is not None:
|
90 |
- self._action_cache.update_action_result(job.action_digest, result)
|
|
93 |
+ if not job.lease.status.code:
|
|
94 |
+ self._action_cache.update_action_result(job.action_digest, result)
|
|
95 |
+ job.update_execute_stage(ExecuteStage.COMPLETED)
|
|
91 | 96 |
|
92 | 97 |
def get_operations(self):
|
93 | 98 |
response = operations_pb2.ListOperationsResponse()
|
... | ... | @@ -112,6 +117,6 @@ class Scheduler: |
112 | 117 |
while self.queue:
|
113 | 118 |
job = self.queue.popleft()
|
114 | 119 |
job.update_execute_stage(ExecuteStage.EXECUTING)
|
115 |
- job.lease = job.create_lease()
|
|
120 |
+ job.create_lease()
|
|
116 | 121 |
job.lease.state = LeaseState.PENDING.value
|
117 | 122 |
yield job.lease
|
... | ... | @@ -3,3 +3,4 @@ import hashlib |
3 | 3 |
|
4 | 4 |
# The hash function that CAS uses
|
5 | 5 |
HASH = hashlib.sha256
|
6 |
+HASH_LENGTH = HASH().digest_size * 2
|
... | ... | @@ -33,6 +33,7 @@ from buildgrid.server._exceptions import InvalidArgumentError |
33 | 33 |
|
34 | 34 |
from buildgrid._protos.build.bazel.remote.execution.v2 import remote_execution_pb2
|
35 | 35 |
from buildgrid._protos.google.longrunning import operations_pb2
|
36 |
+from buildgrid._protos.google.rpc import status_pb2
|
|
36 | 37 |
|
37 | 38 |
|
38 | 39 |
server = mock.create_autospec(grpc.server)
|
... | ... | @@ -130,8 +131,10 @@ def test_list_operations_with_result(instance, controller, execute_request, cont |
130 | 131 |
output_file = remote_execution_pb2.OutputFile(path='unicorn')
|
131 | 132 |
action_result.output_files.extend([output_file])
|
132 | 133 |
|
134 |
+ controller.operations_instance._scheduler.jobs[response_execute.name].create_lease()
|
|
133 | 135 |
controller.operations_instance._scheduler.job_complete(response_execute.name,
|
134 |
- _pack_any(action_result))
|
|
136 |
+ _pack_any(action_result),
|
|
137 |
+ status_pb2.Status())
|
|
135 | 138 |
|
136 | 139 |
request = operations_pb2.ListOperationsRequest(name=instance_name)
|
137 | 140 |
response = instance.ListOperations(request, context)
|