Martin Blanchard pushed to branch mablanch/83-executed-action-metadata at BuildGrid / buildgrid
Commits:
- 296cf1ed by Martin Blanchard at 2018-10-26T18:21:16Z
- 7addd61f by Finn at 2018-10-29T10:24:41Z
- f14097b6 by Martin Blanchard at 2018-10-29T10:32:08Z
- afdcd3ae by Martin Blanchard at 2018-10-29T10:32:08Z
- fb0b698f by Martin Blanchard at 2018-10-29T10:32:08Z
- 27123473 by Martin Blanchard at 2018-10-29T10:32:08Z
- fb0e53df by Martin Blanchard at 2018-10-29T10:32:08Z
- 91e3daff by Martin Blanchard at 2018-10-29T10:32:08Z
- 0dc1a374 by Martin Blanchard at 2018-10-29T10:32:08Z
14 changed files:
- .pylintrc
- buildgrid/_app/bots/dummy.py
- buildgrid/_app/bots/host.py
- buildgrid/_app/commands/cmd_operation.py
- + buildgrid/_enums.py
- buildgrid/bot/bot_session.py
- buildgrid/server/job.py
- buildgrid/server/scheduler.py
- buildgrid/utils.py
- + docs/source/architecture.rst
- + docs/source/architecture_overview.rst
- docs/source/conf.py
- docs/source/index.rst
- tests/integration/operations_service.py
Changes:
--- a/.pylintrc
+++ b/.pylintrc
@@ -184,7 +184,8 @@ ignore-on-opaque-inference=yes
 # List of class names for which member attributes should not be checked (useful
 # for classes with dynamically set attributes). This supports the use of
 # qualified names.
-ignored-classes=google.protobuf.any_pb2.Any
+ignored-classes=google.protobuf.any_pb2.Any,
+                google.protobuf.timestamp_pb2.Timestamp
 
 # List of module names for which member attributes should not be checked
 # (useful for modules/projects where namespaces are manipulated during runtime

--- a/buildgrid/_app/bots/dummy.py
+++ b/buildgrid/_app/bots/dummy.py
@@ -17,16 +17,32 @@ import random
 import time
 
 from buildgrid._protos.build.bazel.remote.execution.v2 import remote_execution_pb2
+from buildgrid.utils import get_hostname
 
 
 def work_dummy(context, lease):
     """ Just returns lease after some random time
     """
+    action_result = remote_execution_pb2.ActionResult()
+
     lease.result.Clear()
 
-    time.sleep(random.randint(1, 5))
+    action_result.execution_metadata.worker = get_hostname()
 
-    action_result = remote_execution_pb2.ActionResult()
+    # Simulation input-downloading phase:
+    action_result.execution_metadata.input_fetch_start_timestamp.GetCurrentTime()
+    time.sleep(random.random())
+    action_result.execution_metadata.input_fetch_completed_timestamp.GetCurrentTime()
+
+    # Simulation execution phase:
+    action_result.execution_metadata.execution_start_timestamp.GetCurrentTime()
+    time.sleep(random.random())
+    action_result.execution_metadata.execution_completed_timestamp.GetCurrentTime()
+
+    # Simulation output-uploading phase:
+    action_result.execution_metadata.output_upload_start_timestamp.GetCurrentTime()
+    time.sleep(random.random())
+    action_result.execution_metadata.output_upload_completed_timestamp.GetCurrentTime()
 
     lease.result.Pack(action_result)
 
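
Editor's note: the dummy worker now stamps each simulated phase into the lease's ExecutedActionMetadata. Purely as an illustrative sketch (not part of this branch), a consumer of such an ActionResult could turn those protobuf Timestamp pairs into durations roughly like this:

    # Sketch only: derive per-phase durations from the ExecutedActionMetadata
    # timestamps populated by the worker above.
    def phase_durations(action_result):
        """Return (fetch, execution, upload) durations as datetime.timedelta."""
        metadata = action_result.execution_metadata
        fetch = (metadata.input_fetch_completed_timestamp.ToDatetime() -
                 metadata.input_fetch_start_timestamp.ToDatetime())
        execution = (metadata.execution_completed_timestamp.ToDatetime() -
                     metadata.execution_start_timestamp.ToDatetime())
        upload = (metadata.output_upload_completed_timestamp.ToDatetime() -
                  metadata.output_upload_start_timestamp.ToDatetime())
        return fetch, execution, upload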

--- a/buildgrid/_app/bots/host.py
+++ b/buildgrid/_app/bots/host.py
@@ -19,7 +19,7 @@ import tempfile
 
 from buildgrid.client.cas import download, upload
 from buildgrid._protos.build.bazel.remote.execution.v2 import remote_execution_pb2
-from buildgrid.utils import output_file_maker, output_directory_maker
+from buildgrid.utils import get_hostname, output_file_maker, output_directory_maker
 
 
 def work_host_tools(context, lease):
@@ -29,10 +29,13 @@ def work_host_tools(context, lease):
     logger = context.logger
 
     action_digest = remote_execution_pb2.Digest()
+    action_result = remote_execution_pb2.ActionResult()
 
     lease.payload.Unpack(action_digest)
     lease.result.Clear()
 
+    action_result.execution_metadata.worker = get_hostname()
+
     with tempfile.TemporaryDirectory() as temp_directory:
         with download(context.cas_channel, instance=instance_name) as downloader:
             action = downloader.get_message(action_digest,
@@ -43,8 +46,12 @@ def work_host_tools(context, lease):
             command = downloader.get_message(action.command_digest,
                                              remote_execution_pb2.Command())
 
+            action_result.execution_metadata.input_fetch_start_timestamp.GetCurrentTime()
+
             downloader.download_directory(action.input_root_digest, temp_directory)
 
+            action_result.execution_metadata.input_fetch_completed_timestamp.GetCurrentTime()
+
         environment = os.environ.copy()
         for variable in command.environment_variables:
             if variable.name not in ['PATH', 'PWD']:
@@ -70,6 +77,8 @@ def work_host_tools(context, lease):
 
         logger.debug(' '.join(command_line))
 
+        action_result.execution_metadata.execution_start_timestamp.GetCurrentTime()
+
         process = subprocess.Popen(command_line,
                                    cwd=working_directory,
                                    env=environment,
@@ -80,7 +89,8 @@ def work_host_tools(context, lease):
         stdout, stderr = process.communicate()
         returncode = process.returncode
 
-        action_result = remote_execution_pb2.ActionResult()
+        action_result.execution_metadata.execution_completed_timestamp.GetCurrentTime()
+
         # TODO: Upload to CAS or output RAW
         # For now, just pass raw
         # https://gitlab.com/BuildGrid/buildgrid/issues/90
@@ -92,6 +102,8 @@ def work_host_tools(context, lease):
         logger.debug("Command stdout: [{}]".format(stdout))
         logger.debug("Command exit code: [{}]".format(returncode))
 
+        action_result.execution_metadata.output_upload_start_timestamp.GetCurrentTime()
+
         with upload(context.cas_channel, instance=instance_name) as uploader:
             output_files, output_directories = [], []
 
@@ -121,6 +133,8 @@ def work_host_tools(context, lease):
 
             action_result.output_directories.extend(output_directories)
 
+        action_result.execution_metadata.output_upload_completed_timestamp.GetCurrentTime()
+
         lease.result.Pack(action_result)
 
     return lease

--- a/buildgrid/_app/commands/cmd_operation.py
+++ b/buildgrid/_app/commands/cmd_operation.py
@@ -20,15 +20,21 @@ Operations command
 Check the status of operations
 """
 
+from collections import OrderedDict
 import logging
+from operator import attrgetter
 from urllib.parse import urlparse
 import sys
+from textwrap import indent
 
 import click
+from google.protobuf import json_format
 import grpc
 
+from buildgrid._enums import OperationStage
 from buildgrid._protos.build.bazel.remote.execution.v2 import remote_execution_pb2, remote_execution_pb2_grpc
 from buildgrid._protos.google.longrunning import operations_pb2, operations_pb2_grpc
+from buildgrid._protos.google.rpc import code_pb2
 
 from ..cli import pass_context
 
@@ -65,45 +71,145 @@ def cli(context, remote, instance_name, client_key, client_cert, server_cert):
     context.logger.debug("Starting for remote {}".format(context.remote))
 
 
+def _print_operation_status(operation, print_details=False):
+    metadata = remote_execution_pb2.ExecuteOperationMetadata()
+    # The metadata is expected to be an ExecuteOperationMetadata message:
+    assert operation.metadata.Is(metadata.DESCRIPTOR)
+    operation.metadata.Unpack(metadata)
+
+    stage = OperationStage(metadata.stage)
+
+    if not operation.done:
+        if stage == OperationStage.CACHE_CHECK:
+            click.echo('CacheCheck: {}: Querying action-cache (stage={})'
+                       .format(operation.name, metadata.stage))
+        elif stage == OperationStage.QUEUED:
+            click.echo('Queued: {}: Waiting for execution (stage={})'
+                       .format(operation.name, metadata.stage))
+        elif stage == OperationStage.EXECUTING:
+            click.echo('Executing: {}: Currently running (stage={})'
+                       .format(operation.name, metadata.stage))
+        else:
+            click.echo('Error: {}: In an invalid state (stage={})'
+                       .format(operation.name, metadata.stage), err=True)
+        return
+
+    assert stage == OperationStage.COMPLETED
+
+    response = remote_execution_pb2.ExecuteResponse()
+    # The response is expected to be an ExecuteResponse message:
+    assert operation.response.Is(response.DESCRIPTOR)
+    operation.response.Unpack(response)
+
+    if response.status.code != code_pb2.OK:
+        click.echo('Failure: {}: {} (code={})'
+                   .format(operation.name, response.status.message, response.status.code))
+    else:
+        if response.result.exit_code != 0:
+            click.echo('Success: {}: Completed with failure (stage={}, exit_code={})'
+                       .format(operation.name, metadata.stage, response.result.exit_code))
+        else:
+            click.echo('Success: {}: Completed successfully (stage={}, exit_code={})'
+                       .format(operation.name, metadata.stage, response.result.exit_code))
+
+    if print_details:
+        metadata = response.result.execution_metadata
+        click.echo(indent('worker={}'.format(metadata.worker), '  '))
+
+        queued = metadata.queued_timestamp.ToDatetime()
+        click.echo(indent('queued_at={}'.format(queued), '  '))
+
+        worker_start = metadata.worker_start_timestamp.ToDatetime()
+        worker_completed = metadata.worker_completed_timestamp.ToDatetime()
+        click.echo(indent('work_duration={}'.format(worker_completed - worker_start), '  '))
+
+        fetch_start = metadata.input_fetch_start_timestamp.ToDatetime()
+        fetch_completed = metadata.input_fetch_completed_timestamp.ToDatetime()
+        click.echo(indent('fetch_duration={}'.format(fetch_completed - fetch_start), '  '))
+
+        execution_start = metadata.execution_start_timestamp.ToDatetime()
+        execution_completed = metadata.execution_completed_timestamp.ToDatetime()
+        click.echo(indent('execution_duration={}'.format(execution_completed - execution_start), '  '))
+
+        upload_start = metadata.output_upload_start_timestamp.ToDatetime()
+        upload_completed = metadata.output_upload_completed_timestamp.ToDatetime()
+        click.echo(indent('upload_duration={}'.format(upload_completed - upload_start), '  '))
+
+        click.echo(indent('total_duration={}'.format(worker_completed - queued), '  '))
+
+
 @cli.command('status', short_help="Get the status of an operation.")
 @click.argument('operation-name', nargs=1, type=click.STRING, required=True)
+@click.option('--json', is_flag=True, show_default=True,
+              help="Print operations status in JSON format.")
 @pass_context
-def status(context, operation_name):
-    context.logger.info("Getting operation status...")
+def status(context, operation_name, json):
     stub = operations_pb2_grpc.OperationsStub(context.channel)
-
     request = operations_pb2.GetOperationRequest(name=operation_name)
 
-    response = stub.GetOperation(request)
-    context.logger.info(response)
+    operation = stub.GetOperation(request)
+
+    if not json:
+        _print_operation_status(operation, print_details=True)
+    else:
+        click.echo(json_format.MessageToJson(operation))
 
 
 @cli.command('list', short_help="List operations.")
+@click.option('--json', is_flag=True, show_default=True,
+              help="Print operations list in JSON format.")
 @pass_context
-def lists(context):
-    context.logger.info("Getting list of operations")
+def lists(context, json):
     stub = operations_pb2_grpc.OperationsStub(context.channel)
-
     request = operations_pb2.ListOperationsRequest(name=context.instance_name)
 
     response = stub.ListOperations(request)
 
     if not response.operations:
-        context.logger.warning("No operations to list")
+        click.echo('Error: No operations to list.', err=True)
        return
 
-    for op in response.operations:
-        context.logger.info(op)
+    operations_map = OrderedDict([
+        (OperationStage.CACHE_CHECK, []),
+        (OperationStage.QUEUED, []),
+        (OperationStage.EXECUTING, []),
+        (OperationStage.COMPLETED, [])
+    ])
+
+    for operation in response.operations:
+        metadata = remote_execution_pb2.ExecuteOperationMetadata()
+        # The metadata is expected to be an ExecuteOperationMetadata message:
+        assert operation.metadata.Is(metadata.DESCRIPTOR)
+        operation.metadata.Unpack(metadata)
+
+        stage = OperationStage(metadata.stage)
+
+        operations_map[stage].append(operation)
+
+    for operations in operations_map.values():
+        operations.sort(key=attrgetter('name'))
+        for operation in operations:
+            if not json:
+                _print_operation_status(operation)
+            else:
+                click.echo(json_format.MessageToJson(operation))
 
 
 @cli.command('wait', short_help="Streams an operation until it is complete.")
 @click.argument('operation-name', nargs=1, type=click.STRING, required=True)
+@click.option('--json', is_flag=True, show_default=True,
              help="Print operations statuses in JSON format.")
 @pass_context
-def wait(context, operation_name):
+def wait(context, operation_name, json):
     stub = remote_execution_pb2_grpc.ExecutionStub(context.channel)
     request = remote_execution_pb2.WaitExecutionRequest(name=operation_name)
 
-    response = stub.WaitExecution(request)
+    operation_iterator = stub.WaitExecution(request)
 
-    for stream in response:
-        context.logger.info(stream)
+    for operation in operation_iterator:
+        if not json and operation.done:
+            _print_operation_status(operation, print_details=True)
+        elif not json:
+            _print_operation_status(operation)
+        else:
+            click.echo(json_format.MessageToJson(operation))
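
Editor's note: the new _print_operation_status() helper relies on the protobuf Any-unpacking pattern shown above. A minimal standalone sketch of that same pattern, for reference only (nothing beyond what the diff already does):

    # Sketch of the Any-unpacking done by _print_operation_status().
    from buildgrid._enums import OperationStage
    from buildgrid._protos.build.bazel.remote.execution.v2 import remote_execution_pb2

    def unpack_operation(operation):
        """Return the (stage, response) pair carried by a longrunning Operation."""
        metadata = remote_execution_pb2.ExecuteOperationMetadata()
        operation.metadata.Unpack(metadata)        # metadata travels as an Any
        stage = OperationStage(metadata.stage)

        response = None
        if operation.done:
            response = remote_execution_pb2.ExecuteResponse()
            operation.response.Unpack(response)    # response travels as an Any too
        return stage, response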

--- /dev/null
+++ b/buildgrid/_enums.py
@@ -0,0 +1,58 @@
+# Copyright (C) 2018 Bloomberg LP
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# <http://www.apache.org/licenses/LICENSE-2.0>
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from enum import Enum
+
+from buildgrid._protos.build.bazel.remote.execution.v2 import remote_execution_pb2
+from buildgrid._protos.google.devtools.remoteworkers.v1test2 import bots_pb2
+
+
+class BotStatus(Enum):
+    # Initially unknown state.
+    BOT_STATUS_UNSPECIFIED = bots_pb2.BotStatus.Value('BOT_STATUS_UNSPECIFIED')
+    # The bot is healthy, and will accept leases as normal.
+    OK = bots_pb2.BotStatus.Value('OK')
+    # The bot is unhealthy and will not accept new leases.
+    UNHEALTHY = bots_pb2.BotStatus.Value('UNHEALTHY')
+    # The bot has been asked to reboot the host.
+    HOST_REBOOTING = bots_pb2.BotStatus.Value('HOST_REBOOTING')
+    # The bot has been asked to shut down.
+    BOT_TERMINATING = bots_pb2.BotStatus.Value('BOT_TERMINATING')
+
+
+class LeaseState(Enum):
+    # Initially unknown state.
+    LEASE_STATE_UNSPECIFIED = bots_pb2.LeaseState.Value('LEASE_STATE_UNSPECIFIED')
+    # The server expects the bot to accept this lease.
+    PENDING = bots_pb2.LeaseState.Value('PENDING')
+    # The bot has accepted this lease.
+    ACTIVE = bots_pb2.LeaseState.Value('ACTIVE')
+    # The bot is no longer leased.
+    COMPLETED = bots_pb2.LeaseState.Value('COMPLETED')
+    # The bot should immediately release all resources associated with the lease.
+    CANCELLED = bots_pb2.LeaseState.Value('CANCELLED')
+
+
+class OperationStage(Enum):
+    # Initially unknown stage.
+    UNKNOWN = remote_execution_pb2.ExecuteOperationMetadata.Stage.Value('UNKNOWN')
+    # Checking the result against the cache.
+    CACHE_CHECK = remote_execution_pb2.ExecuteOperationMetadata.Stage.Value('CACHE_CHECK')
+    # Currently idle, awaiting a free machine to execute.
+    QUEUED = remote_execution_pb2.ExecuteOperationMetadata.Stage.Value('QUEUED')
+    # Currently being executed by a worker.
+    EXECUTING = remote_execution_pb2.ExecuteOperationMetadata.Stage.Value('EXECUTING')
+    # Finished execution.
+    COMPLETED = remote_execution_pb2.ExecuteOperationMetadata.Stage.Value('COMPLETED')
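
Editor's note: the new enums are thin wrappers around the raw protobuf values, so conversions work in both directions. A small usage sketch (assuming the module is importable as buildgrid._enums, as the diff indicates):

    from buildgrid._enums import LeaseState, OperationStage
    from buildgrid._protos.build.bazel.remote.execution.v2 import remote_execution_pb2

    metadata = remote_execution_pb2.ExecuteOperationMetadata()
    metadata.stage = OperationStage.QUEUED.value      # enum -> wire value

    stage = OperationStage(metadata.stage)            # wire value -> enum
    assert stage is OperationStage.QUEUED

    assert LeaseState(LeaseState.PENDING.value) is LeaseState.PENDING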

--- a/buildgrid/bot/bot_session.py
+++ b/buildgrid/bot/bot_session.py
@@ -26,49 +26,15 @@ import asyncio
 import logging
 import platform
 import uuid
-from enum import Enum
 
 import grpc
 
+from buildgrid._enums import BotStatus, LeaseState
 from buildgrid._protos.google.rpc import code_pb2
 from buildgrid._protos.google.devtools.remoteworkers.v1test2 import bots_pb2, worker_pb2
 from buildgrid._exceptions import BotError
 
 
-class BotStatus(Enum):
-    # Default value.
-    BOT_STATUS_UNSPECIFIED = bots_pb2.BotStatus.Value('BOT_STATUS_UNSPECIFIED')
-
-    # The bot is healthy, and will accept leases as normal.
-    OK = bots_pb2.BotStatus.Value('OK')
-
-    # The bot is unhealthy and will not accept new leases.
-    UNHEALTHY = bots_pb2.BotStatus.Value('UNHEALTHY')
-
-    # The bot has been asked to reboot the host.
-    HOST_REBOOTING = bots_pb2.BotStatus.Value('HOST_REBOOTING')
-
-    # The bot has been asked to shut down.
-    BOT_TERMINATING = bots_pb2.BotStatus.Value('BOT_TERMINATING')
-
-
-class LeaseState(Enum):
-    # Default value.
-    LEASE_STATE_UNSPECIFIED = bots_pb2.LeaseState.Value('LEASE_STATE_UNSPECIFIED')
-
-    # The server expects the bot to accept this lease.
-    PENDING = bots_pb2.LeaseState.Value('PENDING')
-
-    # The bot has accepted this lease.
-    ACTIVE = bots_pb2.LeaseState.Value('ACTIVE')
-
-    # The bot is no longer leased.
-    COMPLETED = bots_pb2.LeaseState.Value('COMPLETED')
-
-    # The bot should immediately release all resources associated with the lease.
-    CANCELLED = bots_pb2.LeaseState.Value('CANCELLED')
-
-
 class BotSession:
     def __init__(self, parent, interface):
         """ Unique bot ID within the farm used to identify this bot

--- a/buildgrid/server/job.py
+++ b/buildgrid/server/job.py
@@ -15,39 +15,15 @@
 
 import logging
 import uuid
-from enum import Enum
 
+from google.protobuf import timestamp_pb2
+
+from buildgrid._enums import LeaseState, OperationStage
 from buildgrid._protos.build.bazel.remote.execution.v2 import remote_execution_pb2
 from buildgrid._protos.google.devtools.remoteworkers.v1test2 import bots_pb2
 from buildgrid._protos.google.longrunning import operations_pb2
 
 
-class OperationStage(Enum):
-    # Initially unknown stage.
-    UNKNOWN = remote_execution_pb2.ExecuteOperationMetadata.Stage.Value('UNKNOWN')
-    # Checking the result against the cache.
-    CACHE_CHECK = remote_execution_pb2.ExecuteOperationMetadata.Stage.Value('CACHE_CHECK')
-    # Currently idle, awaiting a free machine to execute.
-    QUEUED = remote_execution_pb2.ExecuteOperationMetadata.Stage.Value('QUEUED')
-    # Currently being executed by a worker.
-    EXECUTING = remote_execution_pb2.ExecuteOperationMetadata.Stage.Value('EXECUTING')
-    # Finished execution.
-    COMPLETED = remote_execution_pb2.ExecuteOperationMetadata.Stage.Value('COMPLETED')
-
-
-class LeaseState(Enum):
-    # Initially unknown state.
-    LEASE_STATE_UNSPECIFIED = bots_pb2.LeaseState.Value('LEASE_STATE_UNSPECIFIED')
-    # The server expects the bot to accept this lease.
-    PENDING = bots_pb2.LeaseState.Value('PENDING')
-    # The bot has accepted this lease.
-    ACTIVE = bots_pb2.LeaseState.Value('ACTIVE')
-    # The bot is no longer leased.
-    COMPLETED = bots_pb2.LeaseState.Value('COMPLETED')
-    # The bot should immediately release all resources associated with the lease.
-    CANCELLED = bots_pb2.LeaseState.Value('CANCELLED')
-
-
 class Job:
 
     def __init__(self, action, action_digest):
@@ -60,6 +36,9 @@ class Job:
 
         self.__execute_response = None
         self.__operation_metadata = remote_execution_pb2.ExecuteOperationMetadata()
+        self.__queued_timestamp = timestamp_pb2.Timestamp()
+        self.__worker_start_timestamp = timestamp_pb2.Timestamp()
+        self.__worker_completed_timestamp = timestamp_pb2.Timestamp()
 
         self.__operation_metadata.action_digest.CopyFrom(action_digest)
         self.__operation_metadata.stage = OperationStage.UNKNOWN.value
@@ -177,10 +156,18 @@ class Job:
         self._lease.state = state.value
 
         if self._lease.state == LeaseState.PENDING.value:
+            self.__worker_start_timestamp.Clear()
+            self.__worker_completed_timestamp.Clear()
+
             self._lease.status.Clear()
             self._lease.result.Clear()
 
+        elif self._lease.state == LeaseState.ACTIVE.value:
+            self.__worker_start_timestamp.GetCurrentTime()
+
         elif self._lease.state == LeaseState.COMPLETED.value:
+            self.__worker_completed_timestamp.GetCurrentTime()
+
             action_result = remote_execution_pb2.ActionResult()
 
             # TODO: Make a distinction between build and bot failures!
@@ -191,6 +178,11 @@ class Job:
             assert result.Is(action_result.DESCRIPTOR)
             result.Unpack(action_result)
 
+            action_metadata = action_result.execution_metadata
+            action_metadata.queued_timestamp.CopyFrom(self.__queued_timestamp)
+            action_metadata.worker_start_timestamp.CopyFrom(self.__worker_start_timestamp)
+            action_metadata.worker_completed_timestamp.CopyFrom(self.__worker_completed_timestamp)
+
             self.__execute_response = remote_execution_pb2.ExecuteResponse()
             self.__execute_response.result.CopyFrom(action_result)
             self.__execute_response.cached_result = False
@@ -208,6 +200,8 @@ class Job:
         self.__operation_metadata.stage = stage.value
 
         if self.__operation_metadata.stage == OperationStage.QUEUED.value:
+            if self.__queued_timestamp.ByteSize() == 0:
+                self.__queued_timestamp.GetCurrentTime()
             self._n_tries += 1
 
         elif self.__operation_metadata.stage == OperationStage.COMPLETED.value:
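
Editor's note: with these changes a completed job reports queue and worker timestamps on top of the phase timestamps filled in by the bot. A hedged sanity-check sketch (not part of the commit) of the ordering one would expect:

    # Sketch: the queued/worker timestamps reported by a completed job
    # should be monotonically ordered.
    def check_worker_timestamps(action_result):
        metadata = action_result.execution_metadata
        queued = metadata.queued_timestamp.ToDatetime()
        started = metadata.worker_start_timestamp.ToDatetime()
        completed = metadata.worker_completed_timestamp.ToDatetime()
        assert queued <= started <= completed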

--- a/buildgrid/server/scheduler.py
+++ b/buildgrid/server/scheduler.py
@@ -109,11 +109,16 @@ class Scheduler:
         """
         job = self.jobs[job_name]
 
-        if lease_state != LeaseState.COMPLETED:
-            job.update_lease_state(lease_state)
+        if lease_state == LeaseState.PENDING:
+            job.update_lease_state(LeaseState.PENDING)
+            job.update_operation_stage(OperationStage.QUEUED)
 
-        else:
-            job.update_lease_state(lease_state,
+        elif lease_state == LeaseState.ACTIVE:
+            job.update_lease_state(LeaseState.ACTIVE)
+            job.update_operation_stage(OperationStage.EXECUTING)
+
+        elif lease_state == LeaseState.COMPLETED:
+            job.update_lease_state(LeaseState.COMPLETED,
                                    status=lease_status, result=lease_result)
 
         if self._action_cache is not None and not job.do_not_cache:
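
Editor's note: the scheduler hunk makes the lease-state to operation-stage transition explicit. Restated as a mapping purely for reference (no behaviour beyond the branches above; the COMPLETED branch also records the result and may populate the action cache):

    from buildgrid._enums import LeaseState, OperationStage

    # Transitions encoded by the branches above:
    LEASE_STATE_TO_OPERATION_STAGE = {
        LeaseState.PENDING: OperationStage.QUEUED,    # job (re)queued
        LeaseState.ACTIVE: OperationStage.EXECUTING,  # a worker picked the job up
    }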

--- a/buildgrid/utils.py
+++ b/buildgrid/utils.py
@@ -15,11 +15,21 @@
 
 from operator import attrgetter
 import os
+import socket
 
 from buildgrid.settings import HASH
 from buildgrid._protos.build.bazel.remote.execution.v2 import remote_execution_pb2
 
 
+def get_hostname():
+    """Returns the hostname of the machine executing that function.
+
+    Returns:
+        str: Hostname for the current machine.
+    """
+    return socket.gethostname()
+
+
 def create_digest(bytes_to_digest):
     """Computes the :obj:`Digest` of a piece of data.
 

--- /dev/null
+++ b/docs/source/architecture.rst
@@ -0,0 +1,11 @@
+.. _architecture:
+
+Architecture
+============
+
+This section provides details of the overall BuildGrid architecture.
+
+.. toctree::
+   :maxdepth: 3
+
+   architecture_overview.rst
\ No newline at end of file

--- /dev/null
+++ b/docs/source/architecture_overview.rst
@@ -0,0 +1,200 @@
+.. _architecture-overview:
+
+Remote execution overview
+=========================
+
+Remote execution aims to speed up the build process and relies on two separate
+but related concepts: remote caching and remote execution itself. Remote
+caching allows users to share build outputs, while remote execution allows
+running operations on a remote cluster of machines which may be more powerful
+than what the user has access to locally.
+
+The `Remote Execution API`_ (REAPI) describes a `gRPC`_ + `protocol-buffers`_
+interface that has three main services for remote caching and execution:
+
+- A ``ContentAddressableStorage`` (CAS) service: a remote storage end-point
+  where content is addressed by digests, a digest being a pair of the hash and
+  size of the data stored or retrieved.
+- An ``ActionCache`` (AC) service: a mapping between build actions already
+  performed and their corresponding resulting artifacts.
+- An ``Execution`` service: the main end-point allowing one to request a build
+  job to be performed against the build farm.
+
+The `Remote Worker API`_ (RWAPI) describes another `gRPC`_ + `protocol-buffers`_
+interface that allows a central ``BotsService`` to manage a farm of pluggable workers.
+
+BuildGrid combines these two interfaces in order to provide a complete
+remote caching and execution service. The high-level architecture can be
+represented like this:
+
+.. graphviz::
+   :align: center
+
+   digraph remote_execution_overview {
+      node [shape = record,
+            width=2,
+            height=1];
+
+      ranksep = 2
+      compound=true
+      edge[arrowtail="vee"];
+      edge[arrowhead="vee"];
+
+      client [label = "Client",
+              color="#0342af",
+              fillcolor="#37c1e8",
+              style=filled,
+              shape=box]
+
+      subgraph cluster_controller{
+         label = "Controller";
+         labeljust = "c";
+         fillcolor="#42edae";
+         style=filled;
+         controller [label = "{ExecutionService|BotsInterface\n}",
+                     fillcolor="#17e86a",
+                     style=filled];
+
+      }
+
+      subgraph cluster_worker0 {
+         label = "Worker 1";
+         labeljust = "c";
+         color="#8e7747";
+         fillcolor="#ffda8e";
+         style=filled;
+         bot0 [label = "{Bot|Host-tools}"
+               fillcolor="#ffb214",
+               style=filled];
+      }
+
+      subgraph cluster_worker1 {
+         label = "Worker 2";
+         labeljust = "c";
+         color="#8e7747";
+         fillcolor="#ffda8e";
+         style=filled;
+         bot1 [label = "{Bot|BuildBox}",
+               fillcolor="#ffb214",
+               style=filled];
+      }
+
+      client -> controller [
+         dir = "both",
+         headlabel = "REAPI",
+         labelangle = 20.0,
+         labeldistance = 9,
+         labelfontsize = 15.0,
+         lhead=cluster_controller];
+
+      controller -> bot0 [
+         dir = "both",
+         labelangle= 340.0,
+         labeldistance = 7.5,
+         labelfontsize = 15.0,
+         taillabel = "RWAPI ",
+         lhead=cluster_worker0,
+         ltail=cluster_controller];
+
+      controller -> bot1 [
+         dir = "both",
+         labelangle= 20.0,
+         labeldistance = 7.5,
+         labelfontsize = 15.0,
+         taillabel = " RWAPI",
+         lhead=cluster_worker1,
+         ltail=cluster_controller];
+
+   }
+
+BuildGrid can be split up into separate endpoints. It is possible to have
+a separate ``ActionCache`` and ``CAS`` from the ``Controller``. The
+following diagram shows a typical setup.
+
+.. graphviz::
+   :align: center
+
+   digraph remote_execution_overview {
+
+      node [shape=record,
+            width=2,
+            height=1];
+
+      compound=true
+      graph [nodesep=1,
+             ranksep=2]
+
+      edge[arrowtail="vee"];
+      edge[arrowhead="vee"];
+
+      client [label="Client",
+              color="#0342af",
+              fillcolor="#37c1e8",
+              style=filled,
+              shape=box]
+
+      cas [label="CAS",
+           color="#840202",
+           fillcolor="#c1034c",
+           style=filled,
+           shape=box]
+
+      subgraph cluster_controller{
+         label="Controller";
+         labeljust="c";
+         fillcolor="#42edae";
+         style=filled;
+         controller [label="{ExecutionService|BotsInterface\n}",
+                     fillcolor="#17e86a",
+                     style=filled];
+
+      }
+
+      actioncache [label="ActionCache",
+                   color="#133f42",
+                   fillcolor="#219399",
+                   style=filled,
+                   shape=box]
+
+      subgraph cluster_worker0 {
+         label="Worker";
+         labeljust="c";
+         color="#8e7747";
+         fillcolor="#ffda8e";
+         style=filled;
+         bot0 [label="{Bot}"
+               fillcolor="#ffb214",
+               style=filled];
+      }
+
+      client -> controller [
+         dir="both"];
+
+      client -> cas [
+         dir="both",
+         lhead=cluster_controller];
+
+      controller -> bot0 [
+         dir="both",
+         lhead=cluster_worker0];
+         //ltail=cluster_controller];
+
+      cas -> bot0 [
+         dir="both",
+         lhead=cluster_worker0];
+
+      actioncache -> controller [
+         dir="both"];
+
+      client -> actioncache [
+         dir="both",
+         constraint=false,
+         ];
+
+
+   }
+
+.. _Remote Execution API: https://github.com/bazelbuild/remote-apis/blob/master/build/bazel/remote/execution/v2
+.. _gRPC: https://grpc.io
+.. _protocol-buffers: https://developers.google.com/protocol-buffers
+.. _Remote Worker API: https://github.com/googleapis/googleapis/tree/master/google/devtools/remoteworkers/v1test2
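
Editor's note: the overview names the REAPI Execution service as the entry point for build requests. Purely as an illustrative sketch (stub and request fields follow the public REAPI v2 protos; the endpoint and instance name are made-up placeholders, not anything defined by this branch), a minimal client interaction looks roughly like this:

    # Rough client sketch against the Execution service (assumptions: endpoint
    # 'localhost:50051' and instance name 'main' are placeholders).
    import grpc

    from buildgrid._protos.build.bazel.remote.execution.v2 import (
        remote_execution_pb2, remote_execution_pb2_grpc)

    channel = grpc.insecure_channel('localhost:50051')
    stub = remote_execution_pb2_grpc.ExecutionStub(channel)

    request = remote_execution_pb2.ExecuteRequest(
        instance_name='main',
        action_digest=remote_execution_pb2.Digest(hash='0' * 64, size_bytes=0),
        skip_cache_lookup=False)

    # Execute() streams longrunning Operation messages; the final one carries
    # the ExecuteResponse once the job completes.
    for operation in stub.Execute(request):
        print(operation.name, operation.done)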

--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -46,7 +46,8 @@ extensions = [
     'sphinx.ext.autodoc',
     'sphinx.ext.napoleon',
     'sphinx_click.ext',
-    'sphinxcontrib.apidoc'
+    'sphinxcontrib.apidoc',
+    'sphinx.ext.graphviz',
 ]
 
 # Add any paths that contain templates here, relative to this directory.

--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -19,6 +19,7 @@ Remote execution service implementing Google's REAPI and RWAPI.
    using.rst
    reference.rst
    contributing.rst
+   architecture.rst
    resources.rst
 
 

--- a/tests/integration/operations_service.py
+++ b/tests/integration/operations_service.py
@@ -144,7 +144,8 @@ def test_list_operations_with_result(instance, controller, execute_request, cont
 
     execute_response = remote_execution_pb2.ExecuteResponse()
     response.operations[0].response.Unpack(execute_response)
-    assert execute_response.result == action_result
+
+    assert execute_response.result.output_files == action_result.output_files
 
 
 def test_list_operations_empty(instance, context):