[Notes] [Git][BuildGrid/buildgrid][mablanch/83-executed-action-metadata] 9 commits: docs: Load the graphviz extension for diagrams



Title: GitLab

Martin Blanchard pushed to branch mablanch/83-executed-action-metadata at BuildGrid / buildgrid

Commits:

14 changed files:

Changes:

  • .pylintrc
    ... ... @@ -184,7 +184,8 @@ ignore-on-opaque-inference=yes
    184 184
     # List of class names for which member attributes should not be checked (useful
    
    185 185
     # for classes with dynamically set attributes). This supports the use of
    
    186 186
     # qualified names.
    
    187
    -ignored-classes=google.protobuf.any_pb2.Any
    
    187
    +ignored-classes=google.protobuf.any_pb2.Any,
    
    188
    +                google.protobuf.timestamp_pb2.Timestamp
    
    188 189
     
    
    189 190
     # List of module names for which member attributes should not be checked
    
    190 191
     # (useful for modules/projects where namespaces are manipulated during runtime
    

  • buildgrid/_app/bots/dummy.py
    ... ... @@ -17,16 +17,32 @@ import random
    17 17
     import time
    
    18 18
     
    
    19 19
     from buildgrid._protos.build.bazel.remote.execution.v2 import remote_execution_pb2
    
    20
    +from buildgrid.utils import get_hostname
    
    20 21
     
    
    21 22
     
    
    22 23
     def work_dummy(context, lease):
    
    23 24
         """ Just returns lease after some random time
    
    24 25
         """
    
    26
    +    action_result = remote_execution_pb2.ActionResult()
    
    27
    +
    
    25 28
         lease.result.Clear()
    
    26 29
     
    
    27
    -    time.sleep(random.randint(1, 5))
    
    30
    +    action_result.execution_metadata.worker = get_hostname()
    
    28 31
     
    
    29
    -    action_result = remote_execution_pb2.ActionResult()
    
    32
    +    # Simulation input-downloading phase:
    
    33
    +    action_result.execution_metadata.input_fetch_start_timestamp.GetCurrentTime()
    
    34
    +    time.sleep(random.random())
    
    35
    +    action_result.execution_metadata.input_fetch_completed_timestamp.GetCurrentTime()
    
    36
    +
    
    37
    +    # Simulation execution phase:
    
    38
    +    action_result.execution_metadata.execution_start_timestamp.GetCurrentTime()
    
    39
    +    time.sleep(random.random())
    
    40
    +    action_result.execution_metadata.execution_completed_timestamp.GetCurrentTime()
    
    41
    +
    
    42
    +    # Simulation output-uploading phase:
    
    43
    +    action_result.execution_metadata.output_upload_start_timestamp.GetCurrentTime()
    
    44
    +    time.sleep(random.random())
    
    45
    +    action_result.execution_metadata.output_upload_completed_timestamp.GetCurrentTime()
    
    30 46
     
    
    31 47
         lease.result.Pack(action_result)
    
    32 48
     
    

  • buildgrid/_app/bots/host.py
    ... ... @@ -19,7 +19,7 @@ import tempfile
    19 19
     
    
    20 20
     from buildgrid.client.cas import download, upload
    
    21 21
     from buildgrid._protos.build.bazel.remote.execution.v2 import remote_execution_pb2
    
    22
    -from buildgrid.utils import output_file_maker, output_directory_maker
    
    22
    +from buildgrid.utils import get_hostname, output_file_maker, output_directory_maker
    
    23 23
     
    
    24 24
     
    
    25 25
     def work_host_tools(context, lease):
    
    ... ... @@ -29,10 +29,13 @@ def work_host_tools(context, lease):
    29 29
         logger = context.logger
    
    30 30
     
    
    31 31
         action_digest = remote_execution_pb2.Digest()
    
    32
    +    action_result = remote_execution_pb2.ActionResult()
    
    32 33
     
    
    33 34
         lease.payload.Unpack(action_digest)
    
    34 35
         lease.result.Clear()
    
    35 36
     
    
    37
    +    action_result.execution_metadata.worker = get_hostname()
    
    38
    +
    
    36 39
         with tempfile.TemporaryDirectory() as temp_directory:
    
    37 40
             with download(context.cas_channel, instance=instance_name) as downloader:
    
    38 41
                 action = downloader.get_message(action_digest,
    
    ... ... @@ -43,8 +46,12 @@ def work_host_tools(context, lease):
    43 46
                 command = downloader.get_message(action.command_digest,
    
    44 47
                                                  remote_execution_pb2.Command())
    
    45 48
     
    
    49
    +            action_result.execution_metadata.input_fetch_start_timestamp.GetCurrentTime()
    
    50
    +
    
    46 51
                 downloader.download_directory(action.input_root_digest, temp_directory)
    
    47 52
     
    
    53
    +        action_result.execution_metadata.input_fetch_completed_timestamp.GetCurrentTime()
    
    54
    +
    
    48 55
             environment = os.environ.copy()
    
    49 56
             for variable in command.environment_variables:
    
    50 57
                 if variable.name not in ['PATH', 'PWD']:
    
    ... ... @@ -70,6 +77,8 @@ def work_host_tools(context, lease):
    70 77
     
    
    71 78
             logger.debug(' '.join(command_line))
    
    72 79
     
    
    80
    +        action_result.execution_metadata.execution_start_timestamp.GetCurrentTime()
    
    81
    +
    
    73 82
             process = subprocess.Popen(command_line,
    
    74 83
                                        cwd=working_directory,
    
    75 84
                                        env=environment,
    
    ... ... @@ -80,7 +89,8 @@ def work_host_tools(context, lease):
    80 89
             stdout, stderr = process.communicate()
    
    81 90
             returncode = process.returncode
    
    82 91
     
    
    83
    -        action_result = remote_execution_pb2.ActionResult()
    
    92
    +        action_result.execution_metadata.execution_completed_timestamp.GetCurrentTime()
    
    93
    +
    
    84 94
             # TODO: Upload to CAS or output RAW
    
    85 95
             # For now, just pass raw
    
    86 96
             # https://gitlab.com/BuildGrid/buildgrid/issues/90
    
    ... ... @@ -92,6 +102,8 @@ def work_host_tools(context, lease):
    92 102
             logger.debug("Command stdout: [{}]".format(stdout))
    
    93 103
             logger.debug("Command exit code: [{}]".format(returncode))
    
    94 104
     
    
    105
    +        action_result.execution_metadata.output_upload_start_timestamp.GetCurrentTime()
    
    106
    +
    
    95 107
             with upload(context.cas_channel, instance=instance_name) as uploader:
    
    96 108
                 output_files, output_directories = [], []
    
    97 109
     
    
    ... ... @@ -121,6 +133,8 @@ def work_host_tools(context, lease):
    121 133
     
    
    122 134
                 action_result.output_directories.extend(output_directories)
    
    123 135
     
    
    136
    +        action_result.execution_metadata.output_upload_completed_timestamp.GetCurrentTime()
    
    137
    +
    
    124 138
             lease.result.Pack(action_result)
    
    125 139
     
    
    126 140
         return lease

  • buildgrid/_app/commands/cmd_operation.py
    ... ... @@ -20,15 +20,21 @@ Operations command
    20 20
     Check the status of operations
    
    21 21
     """
    
    22 22
     
    
    23
    +from collections import OrderedDict
    
    23 24
     import logging
    
    25
    +from operator import attrgetter
    
    24 26
     from urllib.parse import urlparse
    
    25 27
     import sys
    
    28
    +from textwrap import indent
    
    26 29
     
    
    27 30
     import click
    
    31
    +from google.protobuf import json_format
    
    28 32
     import grpc
    
    29 33
     
    
    34
    +from buildgrid._enums import OperationStage
    
    30 35
     from buildgrid._protos.build.bazel.remote.execution.v2 import remote_execution_pb2, remote_execution_pb2_grpc
    
    31 36
     from buildgrid._protos.google.longrunning import operations_pb2, operations_pb2_grpc
    
    37
    +from buildgrid._protos.google.rpc import code_pb2
    
    32 38
     
    
    33 39
     from ..cli import pass_context
    
    34 40
     
    
    ... ... @@ -65,45 +71,145 @@ def cli(context, remote, instance_name, client_key, client_cert, server_cert):
    65 71
         context.logger.debug("Starting for remote {}".format(context.remote))
    
    66 72
     
    
    67 73
     
    
    74
    +def _print_operation_status(operation, print_details=False):
    
    75
    +    metadata = remote_execution_pb2.ExecuteOperationMetadata()
    
    76
    +    # The metadata is expected to be an ExecuteOperationMetadata message:
    
    77
    +    assert operation.metadata.Is(metadata.DESCRIPTOR)
    
    78
    +    operation.metadata.Unpack(metadata)
    
    79
    +
    
    80
    +    stage = OperationStage(metadata.stage)
    
    81
    +
    
    82
    +    if not operation.done:
    
    83
    +        if stage == OperationStage.CACHE_CHECK:
    
    84
    +            click.echo('CacheCheck: {}: Querying action-cache (stage={})'
    
    85
    +                       .format(operation.name, metadata.stage))
    
    86
    +        elif stage == OperationStage.QUEUED:
    
    87
    +            click.echo('Queued: {}: Waiting for execution (stage={})'
    
    88
    +                       .format(operation.name, metadata.stage))
    
    89
    +        elif stage == OperationStage.EXECUTING:
    
    90
    +            click.echo('Executing: {}: Currently running (stage={})'
    
    91
    +                       .format(operation.name, metadata.stage))
    
    92
    +        else:
    
    93
    +            click.echo('Error: {}: In an invalid state (stage={})'
    
    94
    +                       .format(operation.name, metadata.stage), err=True)
    
    95
    +        return
    
    96
    +
    
    97
    +    assert stage == OperationStage.COMPLETED
    
    98
    +
    
    99
    +    response = remote_execution_pb2.ExecuteResponse()
    
    100
    +    # The response is expected to be an ExecuteResponse message:
    
    101
    +    assert operation.response.Is(response.DESCRIPTOR)
    
    102
    +    operation.response.Unpack(response)
    
    103
    +
    
    104
    +    if response.status.code != code_pb2.OK:
    
    105
    +        click.echo('Failure: {}: {} (code={})'
    
    106
    +                   .format(operation.name, response.status.message, response.status.code))
    
    107
    +    else:
    
    108
    +        if response.result.exit_code != 0:
    
    109
    +            click.echo('Success: {}: Completed with failure (stage={}, exit_code={})'
    
    110
    +                       .format(operation.name, metadata.stage, response.result.exit_code))
    
    111
    +        else:
    
    112
    +            click.echo('Success: {}: Completed succesfully (stage={}, exit_code={})'
    
    113
    +                       .format(operation.name, metadata.stage, response.result.exit_code))
    
    114
    +
    
    115
    +    if print_details:
    
    116
    +        metadata = response.result.execution_metadata
    
    117
    +        click.echo(indent('worker={}'.format(metadata.worker), '  '))
    
    118
    +
    
    119
    +        queued = metadata.queued_timestamp.ToDatetime()
    
    120
    +        click.echo(indent('queued_at={}'.format(queued), '  '))
    
    121
    +
    
    122
    +        worker_start = metadata.worker_start_timestamp.ToDatetime()
    
    123
    +        worker_completed = metadata.worker_completed_timestamp.ToDatetime()
    
    124
    +        click.echo(indent('work_duration={}'.format(worker_completed - worker_start), '  '))
    
    125
    +
    
    126
    +        fetch_start = metadata.input_fetch_start_timestamp.ToDatetime()
    
    127
    +        fetch_completed = metadata.input_fetch_completed_timestamp.ToDatetime()
    
    128
    +        click.echo(indent('fetch_duration={}'.format(fetch_completed - fetch_start), '    '))
    
    129
    +
    
    130
    +        execution_start = metadata.execution_start_timestamp.ToDatetime()
    
    131
    +        execution_completed = metadata.execution_completed_timestamp.ToDatetime()
    
    132
    +        click.echo(indent('exection_duration={}'.format(execution_completed - execution_start), '    '))
    
    133
    +
    
    134
    +        upload_start = metadata.output_upload_start_timestamp.ToDatetime()
    
    135
    +        upload_completed = metadata.output_upload_completed_timestamp.ToDatetime()
    
    136
    +        click.echo(indent('upload_duration={}'.format(upload_completed - upload_start), '    '))
    
    137
    +
    
    138
    +        click.echo(indent('total_duration={}'.format(worker_completed - queued), '  '))
    
    139
    +
    
    140
    +
    
    68 141
     @cli.command('status', short_help="Get the status of an operation.")
    
    69 142
     @click.argument('operation-name', nargs=1, type=click.STRING, required=True)
    
    143
    +@click.option('--json', is_flag=True, show_default=True,
    
    144
    +              help="Print operations status in JSON format.")
    
    70 145
     @pass_context
    
    71
    -def status(context, operation_name):
    
    72
    -    context.logger.info("Getting operation status...")
    
    146
    +def status(context, operation_name, json):
    
    73 147
         stub = operations_pb2_grpc.OperationsStub(context.channel)
    
    74
    -
    
    75 148
         request = operations_pb2.GetOperationRequest(name=operation_name)
    
    76 149
     
    
    77
    -    response = stub.GetOperation(request)
    
    78
    -    context.logger.info(response)
    
    150
    +    operation = stub.GetOperation(request)
    
    151
    +
    
    152
    +    if not json:
    
    153
    +        _print_operation_status(operation, print_details=True)
    
    154
    +    else:
    
    155
    +        click.echo(json_format.MessageToJson(operation))
    
    79 156
     
    
    80 157
     
    
    81 158
     @cli.command('list', short_help="List operations.")
    
    159
    +@click.option('--json', is_flag=True, show_default=True,
    
    160
    +              help="Print operations list in JSON format.")
    
    82 161
     @pass_context
    
    83
    -def lists(context):
    
    84
    -    context.logger.info("Getting list of operations")
    
    162
    +def lists(context, json):
    
    85 163
         stub = operations_pb2_grpc.OperationsStub(context.channel)
    
    86
    -
    
    87 164
         request = operations_pb2.ListOperationsRequest(name=context.instance_name)
    
    88 165
     
    
    89 166
         response = stub.ListOperations(request)
    
    90 167
     
    
    91 168
         if not response.operations:
    
    92
    -        context.logger.warning("No operations to list")
    
    169
    +        click.echo('Error: No operations to list.', err=True)
    
    93 170
             return
    
    94 171
     
    
    95
    -    for op in response.operations:
    
    96
    -        context.logger.info(op)
    
    172
    +    operations_map = OrderedDict([
    
    173
    +        (OperationStage.CACHE_CHECK, []),
    
    174
    +        (OperationStage.QUEUED, []),
    
    175
    +        (OperationStage.EXECUTING, []),
    
    176
    +        (OperationStage.COMPLETED, [])
    
    177
    +    ])
    
    178
    +
    
    179
    +    for operation in response.operations:
    
    180
    +        metadata = remote_execution_pb2.ExecuteOperationMetadata()
    
    181
    +        # The metadata is expected to be an ExecuteOperationMetadata message:
    
    182
    +        assert operation.metadata.Is(metadata.DESCRIPTOR)
    
    183
    +        operation.metadata.Unpack(metadata)
    
    184
    +
    
    185
    +        stage = OperationStage(metadata.stage)
    
    186
    +
    
    187
    +        operations_map[stage].append(operation)
    
    188
    +
    
    189
    +    for operations in operations_map.values():
    
    190
    +        operations.sort(key=attrgetter('name'))
    
    191
    +        for operation in operations:
    
    192
    +            if not json:
    
    193
    +                _print_operation_status(operation)
    
    194
    +            else:
    
    195
    +                click.echo(json_format.MessageToJson(operation))
    
    97 196
     
    
    98 197
     
    
    99 198
     @cli.command('wait', short_help="Streams an operation until it is complete.")
    
    100 199
     @click.argument('operation-name', nargs=1, type=click.STRING, required=True)
    
    200
    +@click.option('--json', is_flag=True, show_default=True,
    
    201
    +              help="Print operations statuses in JSON format.")
    
    101 202
     @pass_context
    
    102
    -def wait(context, operation_name):
    
    203
    +def wait(context, operation_name, json):
    
    103 204
         stub = remote_execution_pb2_grpc.ExecutionStub(context.channel)
    
    104 205
         request = remote_execution_pb2.WaitExecutionRequest(name=operation_name)
    
    105 206
     
    
    106
    -    response = stub.WaitExecution(request)
    
    207
    +    operation_iterator = stub.WaitExecution(request)
    
    107 208
     
    
    108
    -    for stream in response:
    
    109
    -        context.logger.info(stream)
    209
    +    for operation in operation_iterator:
    
    210
    +        if not json and operation.done:
    
    211
    +            _print_operation_status(operation, print_details=True)
    
    212
    +        elif not json:
    
    213
    +            _print_operation_status(operation)
    
    214
    +        else:
    
    215
    +            click.echo(json_format.MessageToJson(operation))

  • buildgrid/_enums.py
    1
    +# Copyright (C) 2018 Bloomberg LP
    
    2
    +#
    
    3
    +# Licensed under the Apache License, Version 2.0 (the "License");
    
    4
    +# you may not use this file except in compliance with the License.
    
    5
    +# You may obtain a copy of the License at
    
    6
    +#
    
    7
    +#  <http://www.apache.org/licenses/LICENSE-2.0>
    
    8
    +#
    
    9
    +# Unless required by applicable law or agreed to in writing, software
    
    10
    +# distributed under the License is distributed on an "AS IS" BASIS,
    
    11
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    
    12
    +# See the License for the specific language governing permissions and
    
    13
    +# limitations under the License.
    
    14
    +
    
    15
    +
    
    16
    +from enum import Enum
    
    17
    +
    
    18
    +from buildgrid._protos.build.bazel.remote.execution.v2 import remote_execution_pb2
    
    19
    +from buildgrid._protos.google.devtools.remoteworkers.v1test2 import bots_pb2
    
    20
    +
    
    21
    +
    
    22
    +class BotStatus(Enum):
    
    23
    +    # Initially unknown state.
    
    24
    +    BOT_STATUS_UNSPECIFIED = bots_pb2.BotStatus.Value('BOT_STATUS_UNSPECIFIED')
    
    25
    +    # The bot is healthy, and will accept leases as normal.
    
    26
    +    OK = bots_pb2.BotStatus.Value('OK')
    
    27
    +    # The bot is unhealthy and will not accept new leases.
    
    28
    +    UNHEALTHY = bots_pb2.BotStatus.Value('UNHEALTHY')
    
    29
    +    # The bot has been asked to reboot the host.
    
    30
    +    HOST_REBOOTING = bots_pb2.BotStatus.Value('HOST_REBOOTING')
    
    31
    +    # The bot has been asked to shut down.
    
    32
    +    BOT_TERMINATING = bots_pb2.BotStatus.Value('BOT_TERMINATING')
    
    33
    +
    
    34
    +
    
    35
    +class LeaseState(Enum):
    
    36
    +    # Initially unknown state.
    
    37
    +    LEASE_STATE_UNSPECIFIED = bots_pb2.LeaseState.Value('LEASE_STATE_UNSPECIFIED')
    
    38
    +    # The server expects the bot to accept this lease.
    
    39
    +    PENDING = bots_pb2.LeaseState.Value('PENDING')
    
    40
    +    # The bot has accepted this lease.
    
    41
    +    ACTIVE = bots_pb2.LeaseState.Value('ACTIVE')
    
    42
    +    # The bot is no longer leased.
    
    43
    +    COMPLETED = bots_pb2.LeaseState.Value('COMPLETED')
    
    44
    +    # The bot should immediately release all resources associated with the lease.
    
    45
    +    CANCELLED = bots_pb2.LeaseState.Value('CANCELLED')
    
    46
    +
    
    47
    +
    
    48
    +class OperationStage(Enum):
    
    49
    +    # Initially unknown stage.
    
    50
    +    UNKNOWN = remote_execution_pb2.ExecuteOperationMetadata.Stage.Value('UNKNOWN')
    
    51
    +    # Checking the result against the cache.
    
    52
    +    CACHE_CHECK = remote_execution_pb2.ExecuteOperationMetadata.Stage.Value('CACHE_CHECK')
    
    53
    +    # Currently idle, awaiting a free machine to execute.
    
    54
    +    QUEUED = remote_execution_pb2.ExecuteOperationMetadata.Stage.Value('QUEUED')
    
    55
    +    # Currently being executed by a worker.
    
    56
    +    EXECUTING = remote_execution_pb2.ExecuteOperationMetadata.Stage.Value('EXECUTING')
    
    57
    +    # Finished execution.
    
    58
    +    COMPLETED = remote_execution_pb2.ExecuteOperationMetadata.Stage.Value('COMPLETED')

  • buildgrid/bot/bot_session.py
    ... ... @@ -26,49 +26,15 @@ import asyncio
    26 26
     import logging
    
    27 27
     import platform
    
    28 28
     import uuid
    
    29
    -from enum import Enum
    
    30 29
     
    
    31 30
     import grpc
    
    32 31
     
    
    32
    +from buildgrid._enums import BotStatus, LeaseState
    
    33 33
     from buildgrid._protos.google.rpc import code_pb2
    
    34 34
     from buildgrid._protos.google.devtools.remoteworkers.v1test2 import bots_pb2, worker_pb2
    
    35 35
     from buildgrid._exceptions import BotError
    
    36 36
     
    
    37 37
     
    
    38
    -class BotStatus(Enum):
    
    39
    -    # Default value.
    
    40
    -    BOT_STATUS_UNSPECIFIED = bots_pb2.BotStatus.Value('BOT_STATUS_UNSPECIFIED')
    
    41
    -
    
    42
    -    # The bot is healthy, and will accept leases as normal.
    
    43
    -    OK = bots_pb2.BotStatus.Value('OK')
    
    44
    -
    
    45
    -    # The bot is unhealthy and will not accept new leases.
    
    46
    -    UNHEALTHY = bots_pb2.BotStatus.Value('UNHEALTHY')
    
    47
    -
    
    48
    -    # The bot has been asked to reboot the host.
    
    49
    -    HOST_REBOOTING = bots_pb2.BotStatus.Value('HOST_REBOOTING')
    
    50
    -
    
    51
    -    # The bot has been asked to shut down.
    
    52
    -    BOT_TERMINATING = bots_pb2.BotStatus.Value('BOT_TERMINATING')
    
    53
    -
    
    54
    -
    
    55
    -class LeaseState(Enum):
    
    56
    -    # Default value.
    
    57
    -    LEASE_STATE_UNSPECIFIED = bots_pb2.LeaseState.Value('LEASE_STATE_UNSPECIFIED')
    
    58
    -
    
    59
    -    # The server expects the bot to accept this lease.
    
    60
    -    PENDING = bots_pb2.LeaseState.Value('PENDING')
    
    61
    -
    
    62
    -    # The bot has accepted this lease.
    
    63
    -    ACTIVE = bots_pb2.LeaseState.Value('ACTIVE')
    
    64
    -
    
    65
    -    # The bot is no longer leased.
    
    66
    -    COMPLETED = bots_pb2.LeaseState.Value('COMPLETED')
    
    67
    -
    
    68
    -    # The bot should immediately release all resources associated with the lease.
    
    69
    -    CANCELLED = bots_pb2.LeaseState.Value('CANCELLED')
    
    70
    -
    
    71
    -
    
    72 38
     class BotSession:
    
    73 39
         def __init__(self, parent, interface):
    
    74 40
             """ Unique bot ID within the farm used to identify this bot
    

  • buildgrid/server/job.py
    ... ... @@ -15,39 +15,15 @@
    15 15
     
    
    16 16
     import logging
    
    17 17
     import uuid
    
    18
    -from enum import Enum
    
    19 18
     
    
    19
    +from google.protobuf import timestamp_pb2
    
    20
    +
    
    21
    +from buildgrid._enums import LeaseState, OperationStage
    
    20 22
     from buildgrid._protos.build.bazel.remote.execution.v2 import remote_execution_pb2
    
    21 23
     from buildgrid._protos.google.devtools.remoteworkers.v1test2 import bots_pb2
    
    22 24
     from buildgrid._protos.google.longrunning import operations_pb2
    
    23 25
     
    
    24 26
     
    
    25
    -class OperationStage(Enum):
    
    26
    -    # Initially unknown stage.
    
    27
    -    UNKNOWN = remote_execution_pb2.ExecuteOperationMetadata.Stage.Value('UNKNOWN')
    
    28
    -    # Checking the result against the cache.
    
    29
    -    CACHE_CHECK = remote_execution_pb2.ExecuteOperationMetadata.Stage.Value('CACHE_CHECK')
    
    30
    -    # Currently idle, awaiting a free machine to execute.
    
    31
    -    QUEUED = remote_execution_pb2.ExecuteOperationMetadata.Stage.Value('QUEUED')
    
    32
    -    # Currently being executed by a worker.
    
    33
    -    EXECUTING = remote_execution_pb2.ExecuteOperationMetadata.Stage.Value('EXECUTING')
    
    34
    -    # Finished execution.
    
    35
    -    COMPLETED = remote_execution_pb2.ExecuteOperationMetadata.Stage.Value('COMPLETED')
    
    36
    -
    
    37
    -
    
    38
    -class LeaseState(Enum):
    
    39
    -    # Initially unknown state.
    
    40
    -    LEASE_STATE_UNSPECIFIED = bots_pb2.LeaseState.Value('LEASE_STATE_UNSPECIFIED')
    
    41
    -    # The server expects the bot to accept this lease.
    
    42
    -    PENDING = bots_pb2.LeaseState.Value('PENDING')
    
    43
    -    # The bot has accepted this lease.
    
    44
    -    ACTIVE = bots_pb2.LeaseState.Value('ACTIVE')
    
    45
    -    # The bot is no longer leased.
    
    46
    -    COMPLETED = bots_pb2.LeaseState.Value('COMPLETED')
    
    47
    -    # The bot should immediately release all resources associated with the lease.
    
    48
    -    CANCELLED = bots_pb2.LeaseState.Value('CANCELLED')
    
    49
    -
    
    50
    -
    
    51 27
     class Job:
    
    52 28
     
    
    53 29
         def __init__(self, action, action_digest):
    
    ... ... @@ -60,6 +36,9 @@ class Job:
    60 36
     
    
    61 37
             self.__execute_response = None
    
    62 38
             self.__operation_metadata = remote_execution_pb2.ExecuteOperationMetadata()
    
    39
    +        self.__queued_timestamp = timestamp_pb2.Timestamp()
    
    40
    +        self.__worker_start_timestamp = timestamp_pb2.Timestamp()
    
    41
    +        self.__worker_completed_timestamp = timestamp_pb2.Timestamp()
    
    63 42
     
    
    64 43
             self.__operation_metadata.action_digest.CopyFrom(action_digest)
    
    65 44
             self.__operation_metadata.stage = OperationStage.UNKNOWN.value
    
    ... ... @@ -177,10 +156,18 @@ class Job:
    177 156
             self._lease.state = state.value
    
    178 157
     
    
    179 158
             if self._lease.state == LeaseState.PENDING.value:
    
    159
    +            self.__worker_start_timestamp.Clear()
    
    160
    +            self.__worker_completed_timestamp.Clear()
    
    161
    +
    
    180 162
                 self._lease.status.Clear()
    
    181 163
                 self._lease.result.Clear()
    
    182 164
     
    
    165
    +        elif self._lease.state == LeaseState.ACTIVE.value:
    
    166
    +            self.__worker_start_timestamp.GetCurrentTime()
    
    167
    +
    
    183 168
             elif self._lease.state == LeaseState.COMPLETED.value:
    
    169
    +            self.__worker_completed_timestamp.GetCurrentTime()
    
    170
    +
    
    184 171
                 action_result = remote_execution_pb2.ActionResult()
    
    185 172
     
    
    186 173
                 # TODO: Make a distinction between build and bot failures!
    
    ... ... @@ -191,6 +178,11 @@ class Job:
    191 178
                     assert result.Is(action_result.DESCRIPTOR)
    
    192 179
                     result.Unpack(action_result)
    
    193 180
     
    
    181
    +            action_metadata = action_result.execution_metadata
    
    182
    +            action_metadata.queued_timestamp.CopyFrom(self.__worker_start_timestamp)
    
    183
    +            action_metadata.worker_start_timestamp.CopyFrom(self.__worker_start_timestamp)
    
    184
    +            action_metadata.worker_completed_timestamp.CopyFrom(self.__worker_completed_timestamp)
    
    185
    +
    
    194 186
                 self.__execute_response = remote_execution_pb2.ExecuteResponse()
    
    195 187
                 self.__execute_response.result.CopyFrom(action_result)
    
    196 188
                 self.__execute_response.cached_result = False
    
    ... ... @@ -208,6 +200,8 @@ class Job:
    208 200
             self.__operation_metadata.stage = stage.value
    
    209 201
     
    
    210 202
             if self.__operation_metadata.stage == OperationStage.QUEUED.value:
    
    203
    +            if self.__queued_timestamp.ByteSize() == 0:
    
    204
    +                self.__queued_timestamp.GetCurrentTime()
    
    211 205
                 self._n_tries += 1
    
    212 206
     
    
    213 207
             elif self.__operation_metadata.stage == OperationStage.COMPLETED.value:
    

  • buildgrid/server/scheduler.py
    ... ... @@ -109,11 +109,16 @@ class Scheduler:
    109 109
             """
    
    110 110
             job = self.jobs[job_name]
    
    111 111
     
    
    112
    -        if lease_state != LeaseState.COMPLETED:
    
    113
    -            job.update_lease_state(lease_state)
    
    112
    +        if lease_state == LeaseState.PENDING:
    
    113
    +            job.update_lease_state(LeaseState.PENDING)
    
    114
    +            job.update_operation_stage(OperationStage.QUEUED)
    
    114 115
     
    
    115
    -        else:
    
    116
    -            job.update_lease_state(lease_state,
    
    116
    +        elif lease_state == LeaseState.ACTIVE:
    
    117
    +            job.update_lease_state(LeaseState.ACTIVE)
    
    118
    +            job.update_operation_stage(OperationStage.EXECUTING)
    
    119
    +
    
    120
    +        elif lease_state == LeaseState.COMPLETED:
    
    121
    +            job.update_lease_state(LeaseState.COMPLETED,
    
    117 122
                                        status=lease_status, result=lease_result)
    
    118 123
     
    
    119 124
                 if self._action_cache is not None and not job.do_not_cache:
    

  • buildgrid/utils.py
    ... ... @@ -15,11 +15,21 @@
    15 15
     
    
    16 16
     from operator import attrgetter
    
    17 17
     import os
    
    18
    +import socket
    
    18 19
     
    
    19 20
     from buildgrid.settings import HASH
    
    20 21
     from buildgrid._protos.build.bazel.remote.execution.v2 import remote_execution_pb2
    
    21 22
     
    
    22 23
     
    
    24
    +def get_hostname():
    
    25
    +    """Returns the hostname of the machine executing that function.
    
    26
    +
    
    27
    +    Returns:
    
    28
    +        str: Hostname for the current machine.
    
    29
    +    """
    
    30
    +    return socket.gethostname()
    
    31
    +
    
    32
    +
    
    23 33
     def create_digest(bytes_to_digest):
    
    24 34
         """Computes the :obj:`Digest` of a piece of data.
    
    25 35
     
    

  • docs/source/architecture.rst
    1
    +.. _architecture:
    
    2
    +
    
    3
    +Architecture
    
    4
    +============
    
    5
    +
    
    6
    +This section provides details of the overall BuildGrid architecture.
    
    7
    +
    
    8
    +.. toctree::
    
    9
    +   :maxdepth: 3
    
    10
    +
    
    11
    +   architecture_overview.rst
    \ No newline at end of file

  • docs/source/architecture_overview.rst
    1
    +.. _architecture-overview:
    
    2
    +
    
    3
    +Remote execution overview
    
    4
    +=========================
    
    5
    +
    
    6
    +Remote execution aims to speed up a build process and relies on two separate
    
    7
    +but related concepts: remote caching and remote execution itself. Remote
    
    8
    +caching allows users to share build outputs while remote execution allows the running
    
    9
    +of operations on a remote cluster of machines which may be more powerful than what
    
    10
    +the user has access to locally.
    
    11
    +
    
    12
    +The `Remote Execution API`_ (REAPI) describes a `gRPC`_ + `protocol-buffers`_
    
    13
    +interface that has three main services for remote caching and execution:
    
    14
    +
    
    15
    +- A ``ContentAddressableStorage`` (CAS) service: a remote storage end-point
    
    16
    +  where content is addressed by digests, a digest being a pair of the hash and
    
    17
    +  size of the data stored or retrieved.
    
    18
    +- An ``ActionCache`` (AC) service: a mapping between build actions already
    
    19
    +  performed and their corresponding resulting artifacts.
    
    20
    +- An ``Execution`` service: the main end-point allowing one to request build
    
    21
    +  jobs to be performed against the build farm.
    
    22
    +
    
    23
    +The `Remote Worker API`_ (RWAPI) describes another `gRPC`_ + `protocol-buffers`_
    
    24
    +interface that allows a central ``BotsService`` to manage a farm of pluggable workers.
    
    25
    +
    
    26
    +BuildGrid combines these two interfaces in order to provide a complete
    
    27
    +remote caching and execution service. The high level architecture can be
    
    28
    +represented like this:
    
    29
    +
    
    30
    +.. graphviz::
    
    31
    +   :align: center
    
    32
    +
    
    33
    +    digraph remote_execution_overview {
    
    34
    +	node [shape = record,
    
    35
    +	      width=2,
    
    36
    +	      height=1];
    
    37
    +
    
    38
    +	ranksep = 2
    
    39
    +	compound=true
    
    40
    +	edge[arrowtail="vee"];
    
    41
    +	edge[arrowhead="vee"];
    
    42
    +
    
    43
    +	client [label = "Client",
    
    44
    +	color="#0342af",
    
    45
    +	fillcolor="#37c1e8",
    
    46
    +	style=filled,
    
    47
    +	shape=box]
    
    48
    +
    
    49
    +	subgraph cluster_controller{
    
    50
    +	    label = "Controller";
    
    51
    +	    labeljust = "c";
    
    52
    +	    fillcolor="#42edae";
    
    53
    +	    style=filled;
    
    54
    +	    controller [label = "{ExecutionService|BotsInterface\n}",
    
    55
    +			fillcolor="#17e86a",
    
    56
    +			style=filled];
    
    57
    +
    
    58
    +	}
    
    59
    +
    
    60
    +	subgraph cluster_worker0 {
    
    61
    +	    label = "Worker 1";
    
    62
    +	    labeljust = "c";
    
    63
    +	    color="#8e7747";
    
    64
    +	    fillcolor="#ffda8e";
    
    65
    +	    style=filled;
    
    66
    +	    bot0 [label = "{Bot|Host-tools}"
    
    67
    +		  fillcolor="#ffb214",
    
    68
    +		  style=filled];
    
    69
    +	}
    
    70
    +
    
    71
    +	subgraph cluster_worker1 {
    
    72
    +	    label = "Worker 2";
    
    73
    +	    labeljust = "c";
    
    74
    +	    color="#8e7747";
    
    75
    +	    fillcolor="#ffda8e";
    
    76
    +	    style=filled;
    
    77
    +	    bot1 [label = "{Bot|BuildBox}",
    
    78
    +		  fillcolor="#ffb214",
    
    79
    +		  style=filled];
    
    80
    +	}
    
    81
    +
    
    82
    +	client -> controller [
    
    83
    +	    dir = "both",
    
    84
    +	    headlabel = "REAPI",
    
    85
    +	    labelangle = 20.0,
    
    86
    +	    labeldistance = 9,
    
    87
    +	    labelfontsize = 15.0,
    
    88
    +	    lhead=cluster_controller];
    
    89
    +
    
    90
    +	controller -> bot0 [
    
    91
    +	    dir = "both",
    
    92
    +	    labelangle= 340.0,
    
    93
    +		labeldistance = 7.5,
    
    94
    +		labelfontsize = 15.0,
    
    95
    +	    taillabel = "RWAPI     ",
    
    96
    +	    lhead=cluster_worker0,
    
    97
    +	    ltail=cluster_controller];
    
    98
    +
    
    99
    +	controller -> bot1 [
    
    100
    +	    dir = "both",
    
    101
    +	    labelangle= 20.0,
    
    102
    +	    labeldistance = 7.5,
    
    103
    +	    labelfontsize = 15.0,
    
    104
    +		taillabel = "     RWAPI",
    
    105
    +	    lhead=cluster_worker1,
    
    106
    +	    ltail=cluster_controller];
    
    107
    +
    
    108
    +    }
    
    109
    +
    
    110
    +BuildGrid can be split up into separate endpoints. It is possible to have
    
    111
    +a separate ``ActionCache`` and ``CAS`` from the ``Controller``. The
    
    112
    +following diagram shows a typical setup.
    
    113
    +
    
    114
    +.. graphviz::
    
    115
    +   :align: center
    
    116
    +
    
    117
    +    digraph remote_execution_overview {
    
    118
    +
    
    119
    +	node [shape=record,
    
    120
    +	      width=2,
    
    121
    +	      height=1];
    
    122
    +
    
    123
    +	compound=true
    
    124
    +	graph [nodesep=1,
    
    125
    +	       ranksep=2]
    
    126
    +
    
    127
    +	edge[arrowtail="vee"];
    
    128
    +	edge[arrowhead="vee"];
    
    129
    +
    
    130
    +	client [label="Client",
    
    131
    +		color="#0342af",
    
    132
    +		fillcolor="#37c1e8",
    
    133
    +		style=filled,
    
    134
    +		shape=box]
    
    135
    +
    
    136
    +	cas [label="CAS",
    
    137
    +	     color="#840202",
    
    138
    +	     fillcolor="#c1034c",
    
    139
    +	     style=filled,
    
    140
    +	     shape=box]
    
    141
    +
    
    142
    +	subgraph cluster_controller{
    
    143
    +	    label="Controller";
    
    144
    +	    labeljust="c";
    
    145
    +	    fillcolor="#42edae";
    
    146
    +	    style=filled;
    
    147
    +	    controller [label="{ExecutionService|BotsInterface\n}",
    
    148
    +			fillcolor="#17e86a",
    
    149
    +			style=filled];
    
    150
    +
    
    151
    +	}
    
    152
    +
    
    153
    +	actioncache [label="ActionCache",
    
    154
    +		     color="#133f42",
    
    155
    +		     fillcolor="#219399",
    
    156
    +		     style=filled,
    
    157
    +		     shape=box]
    
    158
    +
    
    159
    +	subgraph cluster_worker0 {
    
    160
    +	    label="Worker";
    
    161
    +	    labeljust="c";
    
    162
    +	    color="#8e7747";
    
    163
    +	    fillcolor="#ffda8e";
    
    164
    +	    style=filled;
    
    165
    +	    bot0 [label="{Bot}"
    
    166
    +		  fillcolor="#ffb214",
    
    167
    +		  style=filled];
    
    168
    +	}
    
    169
    +
    
    170
    +	client -> controller [
    
    171
    +	    dir="both"];
    
    172
    +
    
    173
    +	client -> cas [
    
    174
    +	    dir="both",
    
    175
    +	    lhead=cluster_controller];
    
    176
    +
    
    177
    +	controller -> bot0 [
    
    178
    +	    dir="both",
    
    179
    +	    lhead=cluster_worker0];
    
    180
    +	    //ltail=cluster_controller];
    
    181
    +
    
    182
    +	cas -> bot0 [
    
    183
    +	    dir="both",
    
    184
    +	    lhead=cluster_worker0];
    
    185
    +
    
    186
    +	actioncache -> controller [
    
    187
    +	    dir="both"];
    
    188
    +
    
    189
    +	client -> actioncache [
    
    190
    +	    dir="both",
    
    191
    +	    constraint=false,
    
    192
    +    ];
    
    193
    +
    
    194
    +
    
    195
    +    }
    
    196
    +
    
    197
    +.. _Remote Execution API: https://github.com/bazelbuild/remote-apis/blob/master/build/bazel/remote/execution/v2
    
    198
    +.. _gRPC: https://grpc.io
    
    199
    +.. _protocol-buffers: https://developers.google.com/protocol-buffers
    
    200
    +.. _Remote Worker API: https://github.com/googleapis/googleapis/tree/master/google/devtools/remoteworkers/v1test2

  • docs/source/conf.py
    ... ... @@ -46,7 +46,8 @@ extensions = [
    46 46
         'sphinx.ext.autodoc',
    
    47 47
         'sphinx.ext.napoleon',
    
    48 48
         'sphinx_click.ext',
    
    49
    -    'sphinxcontrib.apidoc'
    
    49
    +    'sphinxcontrib.apidoc',
    
    50
    +    'sphinx.ext.graphviz',
    
    50 51
     ]
    
    51 52
     
    
    52 53
     # Add any paths that contain templates here, relative to this directory.
    

  • docs/source/index.rst
    ... ... @@ -19,6 +19,7 @@ Remote execution service implementing Google's REAPI and RWAPI.
    19 19
        using.rst
    
    20 20
        reference.rst
    
    21 21
        contributing.rst
    
    22
    +   architecture.rst
    
    22 23
        resources.rst
    
    23 24
     
    
    24 25
     
    

  • tests/integration/operations_service.py
    ... ... @@ -144,7 +144,8 @@ def test_list_operations_with_result(instance, controller, execute_request, cont
    144 144
     
    
    145 145
         execute_response = remote_execution_pb2.ExecuteResponse()
    
    146 146
         response.operations[0].response.Unpack(execute_response)
    
    147
    -    assert execute_response.result == action_result
    
    147
    +
    
    148
    +    assert execute_response.result.output_files == action_result.output_files
    
    148 149
     
    
    149 150
     
    
    150 151
     def test_list_operations_empty(instance, context):
    



  • [Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]