[Notes] [Git][BuildGrid/buildgrid][santigl/104-platform-matching] 4 commits: cmd_bot and cmd_execute: allow specifying list of Platform attributes



Title: GitLab

Santiago Gil pushed to branch santigl/104-platform-matching at BuildGrid / buildgrid

Commits:

8 changed files:

Changes:

  • buildgrid/_app/commands/cmd_bot.py
    ... ... @@ -59,11 +59,14 @@ from ..cli import pass_context, setup_logging
    59 59
                   help="Time period for bot updates to the server in seconds.")
    
    60 60
     @click.option('--parent', type=click.STRING, default=None, show_default=True,
    
    61 61
                   help="Targeted farm resource.")
    
    62
    +@click.option('-wp', '--worker-property', nargs=2, type=(click.STRING, click.STRING), multiple=True,
    
    63
    +              help="List of key-value pairs of worker properties.")
    
    62 64
     @click.option('-v', '--verbose', count=True,
    
    63 65
                   help='Increase log verbosity level.')
    
    64 66
     @pass_context
    
    65
    -def cli(context, parent, update_period, remote, auth_token, client_key, client_cert, server_cert,
    
    66
    -        remote_cas, cas_client_key, cas_client_cert, cas_server_cert, verbose):
    
    67
    +def cli(context, parent, update_period, remote, auth_token, client_key,
    
    68
    +        client_cert, server_cert, remote_cas, cas_client_key, cas_client_cert,
    
    69
    +        cas_server_cert, worker_property, verbose):
    
    67 70
         setup_logging(verbosity=verbose)
    
    68 71
         # Setup the remote execution server channel:
    
    69 72
         try:
    
    ... ... @@ -90,8 +93,14 @@ def cli(context, parent, update_period, remote, auth_token, client_key, client_c
    90 93
     
    
    91 94
         bot_interface = interface.BotInterface(context.channel)
    
    92 95
     
    
    96
    +    worker_properties_dict = dict()
    
    97
    +    for property_name, property_value in worker_property:
    
    98
    +        if property_name not in worker_properties_dict:
    
    99
    +            worker_properties_dict[property_name] = set()
    
    100
    +        worker_properties_dict[property_name].add(property_value)
    
    101
    +
    
    93 102
         worker = Worker()
    
    94
    -    worker.add_device(Device())
    
    103
    +    worker.add_device(Device(properties=worker_properties_dict))
    
    95 104
         hardware_interface = HardwareInterface(worker)
    
    96 105
     
    
    97 106
         context.bot_interface = bot_interface
    

  • buildgrid/_app/commands/cmd_execute.py
    ... ... @@ -107,23 +107,31 @@ def request_dummy(context, number, wait_for_completion):
    107 107
                   help="Tuple of expected output file and is-executeable flag.")
    
    108 108
     @click.option('--output-directory', default='testing', show_default=True,
    
    109 109
                   help="Output directory for the output files.")
    
    110
    +@click.option('-pp', '--platform-property', nargs=2, type=(click.STRING, click.STRING), multiple=True,
    
    111
    +              help="List of key-value pairs of required platform properties.")
    
    110 112
     @click.argument('input-root', nargs=1, type=click.Path(), required=True)
    
    111 113
     @click.argument('commands', nargs=-1, type=click.STRING, required=True)
    
    112 114
     @pass_context
    
    113
    -def run_command(context, input_root, commands, output_file, output_directory):
    
    115
    +def run_command(context, input_root, commands, output_file, output_directory,
    
    116
    +                platform_property):
    
    114 117
         stub = remote_execution_pb2_grpc.ExecutionStub(context.channel)
    
    115 118
     
    
    116
    -    output_executeables = []
    
    119
    +    output_executables = []
    
    117 120
         with upload(context.channel, instance=context.instance_name) as uploader:
    
    118 121
             command = remote_execution_pb2.Command()
    
    119 122
     
    
    120 123
             for arg in commands:
    
    121 124
                 command.arguments.extend([arg])
    
    122 125
     
    
    123
    -        for file, is_executeable in output_file:
    
    126
    +        for file, is_executable in output_file:
    
    124 127
                 command.output_files.extend([file])
    
    125
    -            if is_executeable:
    
    126
    -                output_executeables.append(file)
    
    128
    +            if is_executable:
    
    129
    +                output_executables.append(file)
    
    130
    +
    
    131
    +        for attribute_name, attribute_value in platform_property:
    
    132
    +            property = command.platform.properties.add()
    
    133
    +            property.name = attribute_name
    
    134
    +            property.value = attribute_value
    
    127 135
     
    
    128 136
             command_digest = uploader.put_message(command, queue=True)
    
    129 137
     
    
    ... ... @@ -165,6 +173,6 @@ def run_command(context, input_root, commands, output_file, output_directory):
    165 173
                 downloader.download_file(output_file_response.digest, path)
    
    166 174
     
    
    167 175
         for output_file_response in execute_response.result.output_files:
    
    168
    -        if output_file_response.path in output_executeables:
    
    176
    +        if output_file_response.path in output_executables:
    
    169 177
                 st = os.stat(path)
    
    170 178
                 os.chmod(path, st.st_mode | stat.S_IXUSR)

  • buildgrid/server/bots/instance.py
    ... ... @@ -50,7 +50,6 @@ class BotsInterface:
    50 50
             register with the service, the old one should be closed along
    
    51 51
             with all its jobs.
    
    52 52
             """
    
    53
    -
    
    54 53
             bot_id = bot_session.bot_id
    
    55 54
     
    
    56 55
             if bot_id == "":
    
    ... ... @@ -100,10 +99,25 @@ class BotsInterface:
    100 99
             return bot_session
    
    101 100
     
    
    102 101
         def _request_leases(self, bot_session):
    
    103
    -        # TODO: Send worker capabilities to the scheduler!
    
    104 102
             # Only send one lease at a time currently.
    
    105 103
             if not bot_session.leases:
    
    106
    -            leases = self._scheduler.request_job_leases({})
    
    104
    +            worker_capabilities = dict()
    
    105
    +
    
    106
    +            # TODO? Fail if there are no devices in the worker?
    
    107
    +            if bot_session.worker.devices:
    
    108
    +                # According to the spec:
    
    109
    +                #   "The first device in the worker is the "primary device" -
    
    110
    +                #   that is, the device running a bot and which is
    
    111
    +                #   responsible for actually executing commands."
    
    112
    +                primary_device = bot_session.worker.devices[0]
    
    113
    +
    
    114
    +                for property in primary_device.properties:
    
    115
    +                    if property.key not in worker_capabilities:
    
    116
    +                        worker_capabilities[property.key] = set()
    
    117
    +                    worker_capabilities[property.key].add(property.value)
    
    118
    +
    
    119
    +            leases = self._scheduler.request_job_leases(worker_capabilities)
    
    120
    +
    
    107 121
                 if leases:
    
    108 122
                     for lease in leases:
    
    109 123
                         self._assigned_leases[bot_session.name].add(lease.id)
    

  • buildgrid/server/execution/instance.py
    ... ... @@ -22,7 +22,7 @@ An instance of the Remote Execution Service.
    22 22
     import logging
    
    23 23
     
    
    24 24
     from buildgrid._exceptions import FailedPreconditionError, InvalidArgumentError, NotFoundError
    
    25
    -from buildgrid._protos.build.bazel.remote.execution.v2.remote_execution_pb2 import Action
    
    25
    +from buildgrid._protos.build.bazel.remote.execution.v2.remote_execution_pb2 import Action, Command
    
    26 26
     from buildgrid.utils import get_hash_type
    
    27 27
     
    
    28 28
     
    
    ... ... @@ -50,11 +50,21 @@ class ExecutionInstance:
    50 50
             this action.
    
    51 51
             """
    
    52 52
             action = self._storage.get_message(action_digest, Action)
    
    53
    -
    
    54 53
             if not action:
    
    55 54
                 raise FailedPreconditionError("Could not get action from storage.")
    
    56 55
     
    
    56
    +        platform_requirements = None
    
    57
    +        if action.command_digest.size_bytes:
    
    58
    +            command = self._storage.get_message(action.command_digest, Command)
    
    59
    +
    
    60
    +            if not command:
    
    61
    +                raise FailedPreconditionError("Could not get command from storage.")
    
    62
    +
    
    63
    +            if command:
    
    64
    +                platform_requirements = command.platform
    
    65
    +
    
    57 66
             return self._scheduler.queue_job_action(action, action_digest,
    
    67
    +                                                platform_requirements,
    
    58 68
                                                     skip_cache_lookup=skip_cache_lookup)
    
    59 69
     
    
    60 70
         def register_job_peer(self, job_name, peer, message_queue):
    

  • buildgrid/server/job.py
    ... ... @@ -29,7 +29,7 @@ from buildgrid._protos.google.rpc import code_pb2
    29 29
     
    
    30 30
     class Job:
    
    31 31
     
    
    32
    -    def __init__(self, action, action_digest, priority=0):
    
    32
    +    def __init__(self, action, action_digest, platform_requirements, priority=0):
    
    33 33
             self.__logger = logging.getLogger(__name__)
    
    34 34
     
    
    35 35
             self._name = str(uuid.uuid4())
    
    ... ... @@ -59,6 +59,8 @@ class Job:
    59 59
             self._do_not_cache = self._action.do_not_cache
    
    60 60
             self._n_tries = 0
    
    61 61
     
    
    62
    +        self._platform_requirements = platform_requirements
    
    63
    +
    
    62 64
             self._done = False
    
    63 65
     
    
    64 66
         def __lt__(self, other):
    
    ... ... @@ -111,6 +113,10 @@ class Job:
    111 113
         def done(self):
    
    112 114
             return self._done
    
    113 115
     
    
    116
    +    @property
    
    117
    +    def platform_requirements(self):
    
    118
    +        return self._platform_requirements
    
    119
    +
    
    114 120
         # --- Public API: REAPI ---
    
    115 121
     
    
    116 122
         @property
    

  • buildgrid/server/scheduler.py
    ... ... @@ -145,7 +145,8 @@ class Scheduler:
    145 145
             if not job.n_peers and job.done and not job.lease:
    
    146 146
                 self._delete_job(job.name)
    
    147 147
     
    
    148
    -    def queue_job_action(self, action, action_digest, priority=0, skip_cache_lookup=False):
    
    148
    +    def queue_job_action(self, action, action_digest, platform_requirements,
    
    149
    +                         priority=0, skip_cache_lookup=False):
    
    149 150
             """Inserts a newly created job into the execution queue.
    
    150 151
     
    
    151 152
             Warning:
    
    ... ... @@ -155,6 +156,9 @@ class Scheduler:
    155 156
             Args:
    
    156 157
                 action (Action): the given action to queue for execution.
    
    157 158
                 action_digest (Digest): the digest of the given action.
    
    159
    +            platform_requirements (dict(list)): platform attributes that a worker
    
    160
    +                must satisfy in order to be assigned the job. (Each key can
    
    161
    +                have multiple values.)
    
    158 162
                 priority (int): the execution job's priority.
    
    159 163
                 skip_cache_lookup (bool): whether or not to look for pre-computed
    
    160 164
                     result for the given action.
    
    ... ... @@ -178,7 +182,8 @@ class Scheduler:
    178 182
     
    
    179 183
                     return job.name
    
    180 184
     
    
    181
    -        job = Job(action, action_digest, priority=priority)
    
    185
    +        job = Job(action, action_digest, platform_requirements,
    
    186
    +                  priority=priority)
    
    182 187
     
    
    183 188
             self.__logger.debug("Job created for action [%s]: [%s]",
    
    184 189
                                 action_digest.hash[:8], job.name)
    
    ... ... @@ -271,7 +276,7 @@ class Scheduler:
    271 276
             """Generates a list of the highest priority leases to be run.
    
    272 277
     
    
    273 278
             Args:
    
    274
    -            worker_capabilities (dict): a set of key-value pairs decribing the
    
    279
    +            worker_capabilities (dict): a set of key-value pairs describing the
    
    275 280
                     worker properties, configuration and state at the time of the
    
    276 281
                     request.
    
    277 282
     
    
    ... ... @@ -280,19 +285,21 @@ class Scheduler:
    280 285
             if not self.__queue:
    
    281 286
                 return []
    
    282 287
     
    
    283
    -        # TODO: Try to match worker_capabilities with jobs properties.
    
    284
    -        job = self.__queue.pop()
    
    288
    +        # For now we only look at the first job in the queue.
    
    289
    +        # TODO: Try finding another job that is suitable for the worker.
    
    290
    +        if self._worker_is_capable(worker_capabilities, self.__queue[0]):
    
    291
    +            job = self.__queue.pop()
    
    285 292
     
    
    286
    -        self.__logger.info("Job scheduled to run: [%s]", job.name)
    
    293
    +            self.__logger.info("Job scheduled to run: [%s]", job.name)
    
    287 294
     
    
    288
    -        lease = job.lease
    
    295
    +            lease = job.lease
    
    289 296
     
    
    290
    -        if not lease:
    
    291
    -            # For now, one lease at a time:
    
    292
    -            lease = job.create_lease()
    
    297
    +            if not lease:
    
    298
    +                # For now, one lease at a time:
    
    299
    +                lease = job.create_lease()
    
    293 300
     
    
    294
    -        if lease:
    
    295
    -            return [lease]
    
    301
    +            if lease:
    
    302
    +                return [lease]
    
    296 303
     
    
    297 304
             return None
    
    298 305
     
    
    ... ... @@ -622,3 +629,28 @@ class Scheduler:
    622 629
     
    
    623 630
                         for message_queue in self.__build_metadata_queues:
    
    624 631
                             message_queue.put(message)
    
    632
    +
    
    633
    +    def _worker_is_capable(self, worker_capabilities, job):
    
    634
    +        """Returns whether the worker is suitable to run the job."""
    
    635
    +        # TODO: Replace this with the logic defined in the Platform msg. standard.
    
    636
    +
    
    637
    +        job_requirements = job.platform_requirements
    
    638
    +        # For now we'll only check OS and ISA properties.
    
    639
    +
    
    640
    +        if not job_requirements:
    
    641
    +            return True
    
    642
    +
    
    643
    +        # OS:
    
    644
    +        worker_oses = worker_capabilities.get('os', set())
    
    645
    +        job_oses = job_requirements.get('os', set())
    
    646
    +        if job_oses and not (job_oses & worker_oses):
    
    647
    +            return False
    
    648
    +
    
    649
    +        # ISAs:
    
    650
    +        worker_isas = worker_capabilities.get('isa', [])
    
    651
    +        job_isas = job_requirements.get('isa', None)
    
    652
    +
    
    653
    +        if job_isas and not (job_isas & worker_isas):
    
    654
    +            return False
    
    655
    +
    
    656
    +        return True

  • tests/integration/bots_service.py
    ... ... @@ -153,11 +153,13 @@ def test_post_bot_event_temp(context, instance):
    153 153
         context.set_code.assert_called_once_with(grpc.StatusCode.UNIMPLEMENTED)
    
    154 154
     
    
    155 155
     
    
    156
    -def _inject_work(scheduler, action=None, action_digest=None):
    
    156
    +def _inject_work(scheduler, action=None, action_digest=None,
    
    157
    +                 platform_requirements=None):
    
    157 158
         if not action:
    
    158 159
             action = remote_execution_pb2.Action()
    
    159 160
     
    
    160 161
         if not action_digest:
    
    161 162
             action_digest = remote_execution_pb2.Digest()
    
    162 163
     
    
    163
    -    scheduler.queue_job_action(action, action_digest, skip_cache_lookup=True)
    164
    +    scheduler.queue_job_action(action, action_digest, platform_requirements,
    
    165
    +                               skip_cache_lookup=True)

  • tests/integration/execution_service.py
    ... ... @@ -107,6 +107,7 @@ def test_no_action_digest_in_storage(instance, context):
    107 107
     def test_wait_execution(instance, controller, context):
    
    108 108
         job_name = controller.execution_instance._scheduler.queue_job_action(action,
    
    109 109
                                                                              action_digest,
    
    110
    +                                                                         platform_requirements={},
    
    110 111
                                                                              skip_cache_lookup=True)
    
    111 112
     
    
    112 113
         message_queue = queue.Queue()
    



  • [Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]