[Notes] [Git][BuildGrid/buildgrid][santigl/104-platform-matching] 3 commits: Add platform_attributes property to Job class



Title: GitLab

Santiago Gil pushed to branch santigl/104-platform-matching at BuildGrid / buildgrid

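Commits: 3 (the commit list is not preserved in this archive; per the subject line, the first is "Add platform_attributes property to Job class")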

6 changed files:

Changes:

  • buildgrid/server/bots/instance.py
    ... ... @@ -50,7 +50,6 @@ class BotsInterface:
    50 50
             register with the service, the old one should be closed along
    
    51 51
             with all its jobs.
    
    52 52
             """
    
    53
    -
    
    54 53
             bot_id = bot_session.bot_id
    
    55 54
     
    
    56 55
             if bot_id == "":
    
    ... ... @@ -100,10 +99,25 @@ class BotsInterface:
    100 99
             return bot_session
    
    101 100
     
    
    102 101
         def _request_leases(self, bot_session):
    
    103
    -        # TODO: Send worker capabilities to the scheduler!
    
    104 102
             # Only send one lease at a time currently.
    
    105 103
             if not bot_session.leases:
    
    106
    -            leases = self._scheduler.request_job_leases({})
    
    104
    +            worker_capabilities = dict()
    
    105
    +
    
    106
    +            # TODO? Fail if there are no devices in the worker?
    
    107
    +            if bot_session.worker.devices:
    
    108
    +                # According to the spec:
    
    109
    +                #   "The first device in the worker is the "primary device" -
    
    110
    +                #   that is, the device running a bot and which is
    
    111
    +                #   responsible for actually executing commands."
    
    112
    +                primary_device = bot_session.worker.devices[0]
    
    113
    +
    
    114
    +                for property in primary_device.properties:
    
    115
    +                    if property.key not in worker_capabilities:
    
    116
    +                        worker_capabilities[property.key] = set()
    
    117
    +                    worker_capabilities[property.key].add(property.value)
    
    118
    +
    
    119
    +            leases = self._scheduler.request_job_leases(worker_capabilities)
    
    120
    +
    
    107 121
                 if leases:
    
    108 122
                     for lease in leases:
    
    109 123
                         self._assigned_leases[bot_session.name].add(lease.id)
    

  • buildgrid/server/execution/instance.py
    ... ... @@ -22,7 +22,7 @@ An instance of the Remote Execution Service.
    22 22
     import logging
    
    23 23
     
    
    24 24
     from buildgrid._exceptions import FailedPreconditionError, InvalidArgumentError, NotFoundError
    
    25
    -from buildgrid._protos.build.bazel.remote.execution.v2.remote_execution_pb2 import Action
    
    25
    +from buildgrid._protos.build.bazel.remote.execution.v2.remote_execution_pb2 import Action, Command
    
    26 26
     from buildgrid.utils import get_hash_type
    
    27 27
     
    
    28 28
     
    
    ... ... @@ -50,11 +50,25 @@ class ExecutionInstance:
    50 50
             this action.
    
    51 51
             """
    
    52 52
             action = self._storage.get_message(action_digest, Action)
    
    53
    -
    
    54 53
             if not action:
    
    55 54
                 raise FailedPreconditionError("Could not get action from storage.")
    
    56 55
     
    
    56
    +        platform_requirements = None
    
    57
    +        if action.command_digest.size_bytes:
    
    58
    +            command = self._storage.get_message(action.command_digest, Command)
    
    59
    +
    
    60
    +            if not command:
    
    61
    +                raise FailedPreconditionError("Could not get command from storage.")
    
    62
    +
    
    63
    +            if command:
    
    64
    +                platform_requirements = dict()
    
    65
    +                for property in command.platform.properties:
    
    66
    +                    if property.name not in platform_requirements:
    
    67
    +                        platform_requirements[property.name] = set()
    
    68
    +                    platform_requirements[property.name].add(property.value)
    
    69
    +
    
    57 70
             return self._scheduler.queue_job_action(action, action_digest,
    
    71
    +                                                platform_requirements,
    
    58 72
                                                     skip_cache_lookup=skip_cache_lookup)
    
    59 73
     
    
    60 74
         def register_job_peer(self, job_name, peer, message_queue):
    

  • buildgrid/server/job.py
    ... ... @@ -29,7 +29,7 @@ from buildgrid._protos.google.rpc import code_pb2
    29 29
     
    
    30 30
     class Job:
    
    31 31
     
    
    32
    -    def __init__(self, action, action_digest, priority=0):
    
    32
    +    def __init__(self, action, action_digest, platform_requirements, priority=0):
    
    33 33
             self.__logger = logging.getLogger(__name__)
    
    34 34
     
    
    35 35
             self._name = str(uuid.uuid4())
    
    ... ... @@ -59,6 +59,8 @@ class Job:
    59 59
             self._do_not_cache = self._action.do_not_cache
    
    60 60
             self._n_tries = 0
    
    61 61
     
    
    62
    +        self._platform_requirements = platform_requirements
    
    63
    +
    
    62 64
             self._done = False
    
    63 65
     
    
    64 66
         def __lt__(self, other):
    
    ... ... @@ -111,6 +113,10 @@ class Job:
    111 113
         def done(self):
    
    112 114
             return self._done
    
    113 115
     
    
    116
    +    @property
    
    117
    +    def platform_requirements(self):
    
    118
    +        return self._platform_requirements
    
    119
    +
    
    114 120
         # --- Public API: REAPI ---
    
    115 121
     
    
    116 122
         @property
    

  • buildgrid/server/scheduler.py
    ... ... @@ -145,7 +145,8 @@ class Scheduler:
    145 145
             if not job.n_peers and job.done and not job.lease:
    
    146 146
                 self._delete_job(job.name)
    
    147 147
     
    
    148
    -    def queue_job_action(self, action, action_digest, priority=0, skip_cache_lookup=False):
    
    148
    +    def queue_job_action(self, action, action_digest, platform_requirements,
    
    149
    +                         priority=0, skip_cache_lookup=False):
    
    149 150
             """Inserts a newly created job into the execution queue.
    
    150 151
     
    
    151 152
             Warning:
    
    ... ... @@ -155,6 +156,9 @@ class Scheduler:
    155 156
             Args:
    
    156 157
                 action (Action): the given action to queue for execution.
    
    157 158
                 action_digest (Digest): the digest of the given action.
    
    159
    +            platform_requirements (dict(set)): platform attributes that a worker
    
    160
    +                must satisfy in order to be assigned the job. (Each key can
    
    161
    +                have multiple values.)
    
    158 162
                 priority (int): the execution job's priority.
    
    159 163
                 skip_cache_lookup (bool): whether or not to look for pre-computed
    
    160 164
                     result for the given action.
    
    ... ... @@ -178,7 +182,8 @@ class Scheduler:
    178 182
     
    
    179 183
                     return job.name
    
    180 184
     
    
    181
    -        job = Job(action, action_digest, priority=priority)
    
    185
    +        job = Job(action, action_digest, platform_requirements,
    
    186
    +                  priority=priority)
    
    182 187
     
    
    183 188
             self.__logger.debug("Job created for action [%s]: [%s]",
    
    184 189
                                 action_digest.hash[:8], job.name)
    
    ... ... @@ -271,7 +276,7 @@ class Scheduler:
    271 276
             """Generates a list of the highest priority leases to be run.
    
    272 277
     
    
    273 278
             Args:
    
    274
    -            worker_capabilities (dict): a set of key-value pairs decribing the
    
    279
    +            worker_capabilities (dict): a set of key-value pairs describing the
    
    275 280
                     worker properties, configuration and state at the time of the
    
    276 281
                     request.
    
    277 282
     
    
    ... ... @@ -280,19 +285,21 @@ class Scheduler:
    280 285
             if not self.__queue:
    
    281 286
                 return []
    
    282 287
     
    
    283
    -        # TODO: Try to match worker_capabilities with jobs properties.
    
    284
    -        job = self.__queue.pop()
    
    288
    +        # For now we only look at the first job in the queue.
    
    289
    +        # TODO: Try finding another job that is suitable for the worker.
    
    290
    +        if self._worker_is_capable(worker_capabilities, self.__queue[0]):
    
    291
    +            job = self.__queue.pop()
    
    285 292
     
    
    286
    -        self.__logger.info("Job scheduled to run: [%s]", job.name)
    
    293
    +            self.__logger.info("Job scheduled to run: [%s]", job.name)
    
    287 294
     
    
    288
    -        lease = job.lease
    
    295
    +            lease = job.lease
    
    289 296
     
    
    290
    -        if not lease:
    
    291
    -            # For now, one lease at a time:
    
    292
    -            lease = job.create_lease()
    
    297
    +            if not lease:
    
    298
    +                # For now, one lease at a time:
    
    299
    +                lease = job.create_lease()
    
    293 300
     
    
    294
    -        if lease:
    
    295
    -            return [lease]
    
    301
    +            if lease:
    
    302
    +                return [lease]
    
    296 303
     
    
    297 304
             return None
    
    298 305
     
    
    ... ... @@ -622,3 +629,28 @@ class Scheduler:
    622 629
     
    
    623 630
                         for message_queue in self.__build_metadata_queues:
    
    624 631
                             message_queue.put(message)
    
    632
    +
    
    633
    +    def _worker_is_capable(self, worker_capabilities, job):
    
    634
    +        """Returns whether the worker is suitable to run the job."""
    
    635
    +        # TODO: Replace this with the logic defined in the Platform msg. standard.
    
    636
    +
    
    637
    +        job_requirements = job.platform_requirements
    
    638
    +        # For now we'll only check OS and ISA properties.
    
    639
    +
    
    640
    +        if not job_requirements:
    
    641
    +            return True
    
    642
    +
    
    643
    +        # OS:
    
    644
    +        worker_oses = worker_capabilities.get('os', set())
    
    645
    +        job_oses = job_requirements.get('os', set())
    
    646
    +        if job_oses and not (job_oses & worker_oses):
    
    647
    +            return False
    
    648
    +
    
    649
    +        # ISAs:
    
    650
    +        worker_isas = worker_capabilities.get('isa', [])
    
    651
    +        job_isas = job_requirements.get('isa', None)
    
    652
    +
    
    653
    +        if job_isas and not (job_isas & worker_isas):
    
    654
    +            return False
    
    655
    +
    
    656
    +        return True

  • tests/integration/bots_service.py
    ... ... @@ -153,11 +153,27 @@ def test_post_bot_event_temp(context, instance):
    153 153
         context.set_code.assert_called_once_with(grpc.StatusCode.UNIMPLEMENTED)
    
    154 154
     
    
    155 155
     
    
    156
    -def _inject_work(scheduler, action=None, action_digest=None):
    
    156
    +def test_unmet_platform_requirements(bot_session, context, instance):
    
    157
    +    request = bots_pb2.CreateBotSessionRequest(parent='',
    
    158
    +                                               bot_session=bot_session)
    
    159
    +
    
    160
    +    action_digest = remote_execution_pb2.Digest(hash='gaff')
    
    161
    +    _inject_work(instance._instances[""]._scheduler,
    
    162
    +                 action_digest=action_digest,
    
    163
    +                 platform_requirements={'os': set('wonderful-os')})
    
    164
    +
    
    165
    +    response = instance.CreateBotSession(request, context)
    
    166
    +
    
    167
    +    assert len(response.leases) == 0
    
    168
    +
    
    169
    +
    
    170
    +def _inject_work(scheduler, action=None, action_digest=None,
    
    171
    +                 platform_requirements=None):
    
    157 172
         if not action:
    
    158 173
             action = remote_execution_pb2.Action()
    
    159 174
     
    
    160 175
         if not action_digest:
    
    161 176
             action_digest = remote_execution_pb2.Digest()
    
    162 177
     
    
    163
    -    scheduler.queue_job_action(action, action_digest, skip_cache_lookup=True)
    178
    +    scheduler.queue_job_action(action, action_digest, platform_requirements,
    
    179
    +                               skip_cache_lookup=True)

  • tests/integration/execution_service.py
    ... ... @@ -107,6 +107,7 @@ def test_no_action_digest_in_storage(instance, context):
    107 107
     def test_wait_execution(instance, controller, context):
    
    108 108
         job_name = controller.execution_instance._scheduler.queue_job_action(action,
    
    109 109
                                                                              action_digest,
    
    110
    +                                                                         platform_requirements={},
    
    110 111
                                                                              skip_cache_lookup=True)
    
    111 112
     
    
    112 113
         message_queue = queue.Queue()
    



  • [Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]