Santiago Gil pushed to branch santigl/104-platform-matching at BuildGrid / buildgrid
Commits:
-
c7c6ebc5
by Santiago Gil at 2019-02-11T12:01:51Z
-
c4303652
by Santiago Gil at 2019-02-11T12:01:51Z
-
a73c5c66
by Santiago Gil at 2019-02-11T12:01:51Z
-
d1ec9cc1
by Santiago Gil at 2019-02-11T12:01:51Z
8 changed files:
- buildgrid/_app/commands/cmd_bot.py
- buildgrid/_app/commands/cmd_execute.py
- buildgrid/server/bots/instance.py
- buildgrid/server/execution/instance.py
- buildgrid/server/job.py
- buildgrid/server/scheduler.py
- tests/integration/bots_service.py
- tests/integration/execution_service.py
Changes:
... | ... | @@ -59,11 +59,14 @@ from ..cli import pass_context, setup_logging |
59 | 59 |
help="Time period for bot updates to the server in seconds.")
|
60 | 60 |
@click.option('--parent', type=click.STRING, default=None, show_default=True,
|
61 | 61 |
help="Targeted farm resource.")
|
62 |
+@click.option('-wp', '--worker-property', nargs=2, type=(click.STRING, click.STRING), multiple=True,
|
|
63 |
+ help="List of key-value pairs of worker properties.")
|
|
62 | 64 |
@click.option('-v', '--verbose', count=True,
|
63 | 65 |
help='Increase log verbosity level.')
|
64 | 66 |
@pass_context
|
65 |
-def cli(context, parent, update_period, remote, auth_token, client_key, client_cert, server_cert,
|
|
66 |
- remote_cas, cas_client_key, cas_client_cert, cas_server_cert, verbose):
|
|
67 |
+def cli(context, parent, update_period, remote, auth_token, client_key,
|
|
68 |
+ client_cert, server_cert, remote_cas, cas_client_key, cas_client_cert,
|
|
69 |
+ cas_server_cert, worker_property, verbose):
|
|
67 | 70 |
setup_logging(verbosity=verbose)
|
68 | 71 |
# Setup the remote execution server channel:
|
69 | 72 |
try:
|
... | ... | @@ -90,8 +93,14 @@ def cli(context, parent, update_period, remote, auth_token, client_key, client_c |
90 | 93 |
|
91 | 94 |
bot_interface = interface.BotInterface(context.channel)
|
92 | 95 |
|
96 |
+ worker_properties_dict = dict()
|
|
97 |
+ for property_name, property_value in worker_property:
|
|
98 |
+ if property_name not in worker_properties_dict:
|
|
99 |
+ worker_properties_dict[property_name] = set()
|
|
100 |
+ worker_properties_dict[property_name].add(property_value)
|
|
101 |
+ |
|
93 | 102 |
worker = Worker()
|
94 |
- worker.add_device(Device())
|
|
103 |
+ worker.add_device(Device(properties=worker_properties_dict))
|
|
95 | 104 |
hardware_interface = HardwareInterface(worker)
|
96 | 105 |
|
97 | 106 |
context.bot_interface = bot_interface
|
... | ... | @@ -107,23 +107,31 @@ def request_dummy(context, number, wait_for_completion): |
107 | 107 |
help="Tuple of expected output file and is-executeable flag.")
|
108 | 108 |
@click.option('--output-directory', default='testing', show_default=True,
|
109 | 109 |
help="Output directory for the output files.")
|
110 |
+@click.option('-pp', '--platform-property', nargs=2, type=(click.STRING, click.STRING), multiple=True,
|
|
111 |
+ help="List of key-value pairs of required platform properties.")
|
|
110 | 112 |
@click.argument('input-root', nargs=1, type=click.Path(), required=True)
|
111 | 113 |
@click.argument('commands', nargs=-1, type=click.STRING, required=True)
|
112 | 114 |
@pass_context
|
113 |
-def run_command(context, input_root, commands, output_file, output_directory):
|
|
115 |
+def run_command(context, input_root, commands, output_file, output_directory,
|
|
116 |
+ platform_property):
|
|
114 | 117 |
stub = remote_execution_pb2_grpc.ExecutionStub(context.channel)
|
115 | 118 |
|
116 |
- output_executeables = []
|
|
119 |
+ output_executables = []
|
|
117 | 120 |
with upload(context.channel, instance=context.instance_name) as uploader:
|
118 | 121 |
command = remote_execution_pb2.Command()
|
119 | 122 |
|
120 | 123 |
for arg in commands:
|
121 | 124 |
command.arguments.extend([arg])
|
122 | 125 |
|
123 |
- for file, is_executeable in output_file:
|
|
126 |
+ for file, is_executable in output_file:
|
|
124 | 127 |
command.output_files.extend([file])
|
125 |
- if is_executeable:
|
|
126 |
- output_executeables.append(file)
|
|
128 |
+ if is_executable:
|
|
129 |
+ output_executables.append(file)
|
|
130 |
+ |
|
131 |
+ for attribute_name, attribute_value in platform_property:
|
|
132 |
+ property = command.platform.properties.add()
|
|
133 |
+ property.name = attribute_name
|
|
134 |
+ property.value = attribute_value
|
|
127 | 135 |
|
128 | 136 |
command_digest = uploader.put_message(command, queue=True)
|
129 | 137 |
|
... | ... | @@ -165,6 +173,6 @@ def run_command(context, input_root, commands, output_file, output_directory): |
165 | 173 |
downloader.download_file(output_file_response.digest, path)
|
166 | 174 |
|
167 | 175 |
for output_file_response in execute_response.result.output_files:
|
168 |
- if output_file_response.path in output_executeables:
|
|
176 |
+ if output_file_response.path in output_executables:
|
|
169 | 177 |
st = os.stat(path)
|
170 | 178 |
os.chmod(path, st.st_mode | stat.S_IXUSR)
|
... | ... | @@ -50,7 +50,6 @@ class BotsInterface: |
50 | 50 |
register with the service, the old one should be closed along
|
51 | 51 |
with all its jobs.
|
52 | 52 |
"""
|
53 |
- |
|
54 | 53 |
bot_id = bot_session.bot_id
|
55 | 54 |
|
56 | 55 |
if bot_id == "":
|
... | ... | @@ -100,10 +99,25 @@ class BotsInterface: |
100 | 99 |
return bot_session
|
101 | 100 |
|
102 | 101 |
def _request_leases(self, bot_session):
|
103 |
- # TODO: Send worker capabilities to the scheduler!
|
|
104 | 102 |
# Only send one lease at a time currently.
|
105 | 103 |
if not bot_session.leases:
|
106 |
- leases = self._scheduler.request_job_leases({})
|
|
104 |
+ worker_capabilities = dict()
|
|
105 |
+ |
|
106 |
+ # TODO? Fail if there are no devices in the worker?
|
|
107 |
+ if bot_session.worker.devices:
|
|
108 |
+ # According to the spec:
|
|
109 |
+ # "The first device in the worker is the "primary device" -
|
|
110 |
+ # that is, the device running a bot and which is
|
|
111 |
+ # responsible for actually executing commands."
|
|
112 |
+ primary_device = bot_session.worker.devices[0]
|
|
113 |
+ |
|
114 |
+ for property in primary_device.properties:
|
|
115 |
+ if property.key not in worker_capabilities:
|
|
116 |
+ worker_capabilities[property.key] = set()
|
|
117 |
+ worker_capabilities[property.key].add(property.value)
|
|
118 |
+ |
|
119 |
+ leases = self._scheduler.request_job_leases(worker_capabilities)
|
|
120 |
+ |
|
107 | 121 |
if leases:
|
108 | 122 |
for lease in leases:
|
109 | 123 |
self._assigned_leases[bot_session.name].add(lease.id)
|
... | ... | @@ -22,7 +22,7 @@ An instance of the Remote Execution Service. |
22 | 22 |
import logging
|
23 | 23 |
|
24 | 24 |
from buildgrid._exceptions import FailedPreconditionError, InvalidArgumentError, NotFoundError
|
25 |
-from buildgrid._protos.build.bazel.remote.execution.v2.remote_execution_pb2 import Action
|
|
25 |
+from buildgrid._protos.build.bazel.remote.execution.v2.remote_execution_pb2 import Action, Command
|
|
26 | 26 |
from buildgrid.utils import get_hash_type
|
27 | 27 |
|
28 | 28 |
|
... | ... | @@ -50,11 +50,21 @@ class ExecutionInstance: |
50 | 50 |
this action.
|
51 | 51 |
"""
|
52 | 52 |
action = self._storage.get_message(action_digest, Action)
|
53 |
- |
|
54 | 53 |
if not action:
|
55 | 54 |
raise FailedPreconditionError("Could not get action from storage.")
|
56 | 55 |
|
56 |
+ platform_requirements = None
|
|
57 |
+ if action.command_digest.size_bytes:
|
|
58 |
+ command = self._storage.get_message(action.command_digest, Command)
|
|
59 |
+ |
|
60 |
+ if not command:
|
|
61 |
+ raise FailedPreconditionError("Could not get command from storage.")
|
|
62 |
+ |
|
63 |
+ if command:
|
|
64 |
+ platform_requirements = command.platform
|
|
65 |
+ |
|
57 | 66 |
return self._scheduler.queue_job_action(action, action_digest,
|
67 |
+ platform_requirements,
|
|
58 | 68 |
skip_cache_lookup=skip_cache_lookup)
|
59 | 69 |
|
60 | 70 |
def register_job_peer(self, job_name, peer, message_queue):
|
... | ... | @@ -29,7 +29,7 @@ from buildgrid._protos.google.rpc import code_pb2 |
29 | 29 |
|
30 | 30 |
class Job:
|
31 | 31 |
|
32 |
- def __init__(self, action, action_digest, priority=0):
|
|
32 |
+ def __init__(self, action, action_digest, platform_requirements, priority=0):
|
|
33 | 33 |
self.__logger = logging.getLogger(__name__)
|
34 | 34 |
|
35 | 35 |
self._name = str(uuid.uuid4())
|
... | ... | @@ -59,6 +59,8 @@ class Job: |
59 | 59 |
self._do_not_cache = self._action.do_not_cache
|
60 | 60 |
self._n_tries = 0
|
61 | 61 |
|
62 |
+ self._platform_requirements = platform_requirements
|
|
63 |
+ |
|
62 | 64 |
self._done = False
|
63 | 65 |
|
64 | 66 |
def __lt__(self, other):
|
... | ... | @@ -111,6 +113,10 @@ class Job: |
111 | 113 |
def done(self):
|
112 | 114 |
return self._done
|
113 | 115 |
|
116 |
+ @property
|
|
117 |
+ def platform_requirements(self):
|
|
118 |
+ return self._platform_requirements
|
|
119 |
+ |
|
114 | 120 |
# --- Public API: REAPI ---
|
115 | 121 |
|
116 | 122 |
@property
|
... | ... | @@ -145,7 +145,8 @@ class Scheduler: |
145 | 145 |
if not job.n_peers and job.done and not job.lease:
|
146 | 146 |
self._delete_job(job.name)
|
147 | 147 |
|
148 |
- def queue_job_action(self, action, action_digest, priority=0, skip_cache_lookup=False):
|
|
148 |
+ def queue_job_action(self, action, action_digest, platform_requirements,
|
|
149 |
+ priority=0, skip_cache_lookup=False):
|
|
149 | 150 |
"""Inserts a newly created job into the execution queue.
|
150 | 151 |
|
151 | 152 |
Warning:
|
... | ... | @@ -155,6 +156,9 @@ class Scheduler: |
155 | 156 |
Args:
|
156 | 157 |
action (Action): the given action to queue for execution.
|
157 | 158 |
action_digest (Digest): the digest of the given action.
|
159 |
+ platform_requirements (dict(list)): platform attributes that a worker
|
|
160 |
+ must satisfy in order to be assigned the job. (Each key can
|
|
161 |
+ have multiple values.)
|
|
158 | 162 |
priority (int): the execution job's priority.
|
159 | 163 |
skip_cache_lookup (bool): whether or not to look for pre-computed
|
160 | 164 |
result for the given action.
|
... | ... | @@ -178,7 +182,8 @@ class Scheduler: |
178 | 182 |
|
179 | 183 |
return job.name
|
180 | 184 |
|
181 |
- job = Job(action, action_digest, priority=priority)
|
|
185 |
+ job = Job(action, action_digest, platform_requirements,
|
|
186 |
+ priority=priority)
|
|
182 | 187 |
|
183 | 188 |
self.__logger.debug("Job created for action [%s]: [%s]",
|
184 | 189 |
action_digest.hash[:8], job.name)
|
... | ... | @@ -271,7 +276,7 @@ class Scheduler: |
271 | 276 |
"""Generates a list of the highest priority leases to be run.
|
272 | 277 |
|
273 | 278 |
Args:
|
274 |
- worker_capabilities (dict): a set of key-value pairs decribing the
|
|
279 |
+ worker_capabilities (dict): a set of key-value pairs describing the
|
|
275 | 280 |
worker properties, configuration and state at the time of the
|
276 | 281 |
request.
|
277 | 282 |
|
... | ... | @@ -280,19 +285,21 @@ class Scheduler: |
280 | 285 |
if not self.__queue:
|
281 | 286 |
return []
|
282 | 287 |
|
283 |
- # TODO: Try to match worker_capabilities with jobs properties.
|
|
284 |
- job = self.__queue.pop()
|
|
288 |
+ # For now we only look at the first job in the queue.
|
|
289 |
+ # TODO: Try finding another job that is suitable for the worker.
|
|
290 |
+ if self._worker_is_capable(worker_capabilities, self.__queue[0]):
|
|
291 |
+ job = self.__queue.pop()
|
|
285 | 292 |
|
286 |
- self.__logger.info("Job scheduled to run: [%s]", job.name)
|
|
293 |
+ self.__logger.info("Job scheduled to run: [%s]", job.name)
|
|
287 | 294 |
|
288 |
- lease = job.lease
|
|
295 |
+ lease = job.lease
|
|
289 | 296 |
|
290 |
- if not lease:
|
|
291 |
- # For now, one lease at a time:
|
|
292 |
- lease = job.create_lease()
|
|
297 |
+ if not lease:
|
|
298 |
+ # For now, one lease at a time:
|
|
299 |
+ lease = job.create_lease()
|
|
293 | 300 |
|
294 |
- if lease:
|
|
295 |
- return [lease]
|
|
301 |
+ if lease:
|
|
302 |
+ return [lease]
|
|
296 | 303 |
|
297 | 304 |
return None
|
298 | 305 |
|
... | ... | @@ -622,3 +629,28 @@ class Scheduler: |
622 | 629 |
|
623 | 630 |
for message_queue in self.__build_metadata_queues:
|
624 | 631 |
message_queue.put(message)
|
632 |
+ |
|
633 |
+ def _worker_is_capable(self, worker_capabilities, job):
|
|
634 |
+ """Returns whether the worker is suitable to run the job."""
|
|
635 |
+ # TODO: Replace this with the logic defined in the Platform msg. standard.
|
|
636 |
+ |
|
637 |
+ job_requirements = job.platform_requirements
|
|
638 |
+ # For now we'll only check OS and ISA properties.
|
|
639 |
+ |
|
640 |
+ if not job_requirements:
|
|
641 |
+ return True
|
|
642 |
+ |
|
643 |
+ # OS:
|
|
644 |
+ worker_oses = worker_capabilities.get('os', set())
|
|
645 |
+ job_oses = job_requirements.get('os', set())
|
|
646 |
+ if job_oses and not (job_oses & worker_oses):
|
|
647 |
+ return False
|
|
648 |
+ |
|
649 |
+ # ISAs:
|
|
650 |
+ worker_isas = worker_capabilities.get('isa', [])
|
|
651 |
+ job_isas = job_requirements.get('isa', None)
|
|
652 |
+ |
|
653 |
+ if job_isas and not (job_isas & worker_isas):
|
|
654 |
+ return False
|
|
655 |
+ |
|
656 |
+ return True
|
... | ... | @@ -153,11 +153,13 @@ def test_post_bot_event_temp(context, instance): |
153 | 153 |
context.set_code.assert_called_once_with(grpc.StatusCode.UNIMPLEMENTED)
|
154 | 154 |
|
155 | 155 |
|
156 |
-def _inject_work(scheduler, action=None, action_digest=None):
|
|
156 |
+def _inject_work(scheduler, action=None, action_digest=None,
|
|
157 |
+ platform_requirements=None):
|
|
157 | 158 |
if not action:
|
158 | 159 |
action = remote_execution_pb2.Action()
|
159 | 160 |
|
160 | 161 |
if not action_digest:
|
161 | 162 |
action_digest = remote_execution_pb2.Digest()
|
162 | 163 |
|
163 |
- scheduler.queue_job_action(action, action_digest, skip_cache_lookup=True)
|
|
164 |
+ scheduler.queue_job_action(action, action_digest, platform_requirements,
|
|
165 |
+ skip_cache_lookup=True)
|
... | ... | @@ -107,6 +107,7 @@ def test_no_action_digest_in_storage(instance, context): |
107 | 107 |
def test_wait_execution(instance, controller, context):
|
108 | 108 |
job_name = controller.execution_instance._scheduler.queue_job_action(action,
|
109 | 109 |
action_digest,
|
110 |
+ platform_requirements={},
|
|
110 | 111 |
skip_cache_lookup=True)
|
111 | 112 |
|
112 | 113 |
message_queue = queue.Queue()
|