Martin Blanchard pushed to branch mablanch/132-gather-state-metrics at BuildGrid / buildgrid
Commits:
-
b5f4c66a
by Martin Blanchard at 2018-11-09T12:54:01Z
-
6e2e93ff
by Martin Blanchard at 2018-11-09T12:54:02Z
-
88c547a3
by Martin Blanchard at 2018-11-09T12:54:02Z
-
58c85cd5
by Martin Blanchard at 2018-11-09T12:54:02Z
-
d1a1b756
by Martin Blanchard at 2018-11-09T12:54:02Z
-
c4647e8c
by Martin Blanchard at 2018-11-09T12:54:02Z
-
6448c8f1
by Martin Blanchard at 2018-11-09T13:14:19Z
7 changed files:
- buildgrid/_app/commands/cmd_server.py
- buildgrid/server/bots/service.py
- buildgrid/server/execution/service.py
- buildgrid/server/instance.py
- buildgrid/server/scheduler.py
- buildgrid/settings.py
- setup.py
Changes:
| ... | ... | @@ -20,7 +20,6 @@ Server command |
| 20 | 20 |
Create a BuildGrid server.
|
| 21 | 21 |
"""
|
| 22 | 22 |
|
| 23 |
-import asyncio
|
|
| 24 | 23 |
import logging
|
| 25 | 24 |
import sys
|
| 26 | 25 |
|
| ... | ... | @@ -52,18 +51,14 @@ def start(context, config): |
| 52 | 51 |
click.echo("ERROR: Could not parse config: {}.\n".format(str(e)), err=True)
|
| 53 | 52 |
sys.exit(-1)
|
| 54 | 53 |
|
| 55 |
- loop = asyncio.get_event_loop()
|
|
| 56 | 54 |
try:
|
| 57 | 55 |
server.start()
|
| 58 |
- loop.run_forever()
|
|
| 59 | 56 |
|
| 60 | 57 |
except KeyboardInterrupt:
|
| 61 | 58 |
pass
|
| 62 | 59 |
|
| 63 | 60 |
finally:
|
| 64 |
- context.logger.info("Stopping server")
|
|
| 65 | 61 |
server.stop()
|
| 66 |
- loop.close()
|
|
| 67 | 62 |
|
| 68 | 63 |
|
| 69 | 64 |
def _create_server_from_config(config):
|
| ... | ... | @@ -23,8 +23,9 @@ import logging |
| 23 | 23 |
|
| 24 | 24 |
import grpc
|
| 25 | 25 |
|
| 26 |
-from google.protobuf.empty_pb2 import Empty
|
|
| 26 |
+from google.protobuf import empty_pb2, timestamp_pb2
|
|
| 27 | 27 |
|
| 28 |
+from buildgrid._enums import BotStatus
|
|
| 28 | 29 |
from buildgrid._exceptions import InvalidArgumentError, OutOfSyncError
|
| 29 | 30 |
from buildgrid._protos.google.devtools.remoteworkers.v1test2 import bots_pb2
|
| 30 | 31 |
from buildgrid._protos.google.devtools.remoteworkers.v1test2 import bots_pb2_grpc
|
| ... | ... | @@ -32,24 +33,60 @@ from buildgrid._protos.google.devtools.remoteworkers.v1test2 import bots_pb2_grp |
| 32 | 33 |
|
| 33 | 34 |
class BotsService(bots_pb2_grpc.BotsServicer):
|
| 34 | 35 |
|
| 35 |
- def __init__(self, server):
|
|
| 36 |
+ def __init__(self, server, monitor=True):
|
|
| 36 | 37 |
self.__logger = logging.getLogger(__name__)
|
| 37 | 38 |
|
| 39 |
+ self.__bots_by_status = {}
|
|
| 40 |
+ self.__bots_by_instance = {}
|
|
| 41 |
+ self.__bots = {}
|
|
| 42 |
+ |
|
| 38 | 43 |
self._instances = {}
|
| 44 |
+ self._is_monitored = True
|
|
| 39 | 45 |
|
| 40 | 46 |
bots_pb2_grpc.add_BotsServicer_to_server(self, server)
|
| 41 | 47 |
|
| 42 |
- def add_instance(self, name, instance):
|
|
| 43 |
- self._instances[name] = instance
|
|
| 48 |
+ if self._is_monitored:
|
|
| 49 |
+ self.__bots_by_status[BotStatus.OK] = set()
|
|
| 50 |
+ self.__bots_by_status[BotStatus.UNHEALTHY] = set()
|
|
| 51 |
+ self.__bots_by_status[BotStatus.HOST_REBOOTING] = set()
|
|
| 52 |
+ self.__bots_by_status[BotStatus.BOT_TERMINATING] = set()
|
|
| 53 |
+ |
|
| 54 |
+ # --- Public API ---
|
|
| 55 |
+ |
|
| 56 |
+ def add_instance(self, instance_name, instance):
|
|
| 57 |
+ self._instances[instance_name] = instance
|
|
| 58 |
+ |
|
| 59 |
+ if self._is_monitored:
|
|
| 60 |
+ self.__bots_by_instance[instance_name] = 0
|
|
| 61 |
+ |
|
| 62 |
+ # --- Public API: Servicer ---
|
|
| 44 | 63 |
|
| 45 | 64 |
def CreateBotSession(self, request, context):
|
| 65 |
+ """Handles CreateBotSessionRequest messages.
|
|
| 66 |
+ |
|
| 67 |
+ Args:
|
|
| 68 |
+ request (CreateBotSessionRequest): The incoming RPC request.
|
|
| 69 |
+ context (grpc.ServicerContext): Context for the RPC call.
|
|
| 70 |
+ """
|
|
| 46 | 71 |
self.__logger.debug("CreateBotSession request from [%s]", context.peer())
|
| 47 | 72 |
|
| 73 |
+ instance_name = request.parent
|
|
| 74 |
+ bot_status = BotStatus(request.bot_session.status)
|
|
| 75 |
+ bot_id = request.bot_session.bot_id
|
|
| 76 |
+ |
|
| 48 | 77 |
try:
|
| 49 |
- parent = request.parent
|
|
| 50 |
- instance = self._get_instance(request.parent)
|
|
| 51 |
- return instance.create_bot_session(parent,
|
|
| 52 |
- request.bot_session)
|
|
| 78 |
+ instance = self._get_instance(instance_name)
|
|
| 79 |
+ bot_session = instance.create_bot_session(instance_name,
|
|
| 80 |
+ request.bot_session)
|
|
| 81 |
+ now = timestamp_pb2.Timestamp()
|
|
| 82 |
+ now.GetCurrentTime()
|
|
| 83 |
+ |
|
| 84 |
+ if self._is_monitored:
|
|
| 85 |
+ self.__bots[bot_id] = now
|
|
| 86 |
+ self.__bots_by_instance[instance_name] += 1
|
|
| 87 |
+ self.__bots_by_status[bot_status].add(bot_id)
|
|
| 88 |
+ |
|
| 89 |
+ return bot_session
|
|
| 53 | 90 |
|
| 54 | 91 |
except InvalidArgumentError as e:
|
| 55 | 92 |
self.__logger.error(e)
|
| ... | ... | @@ -59,17 +96,36 @@ class BotsService(bots_pb2_grpc.BotsServicer): |
| 59 | 96 |
return bots_pb2.BotSession()
|
| 60 | 97 |
|
| 61 | 98 |
def UpdateBotSession(self, request, context):
|
| 99 |
+ """Handles UpdateBotSessionRequest messages.
|
|
| 100 |
+ |
|
| 101 |
+ Args:
|
|
| 102 |
+ request (UpdateBotSessionRequest): The incoming RPC request.
|
|
| 103 |
+ context (grpc.ServicerContext): Context for the RPC call.
|
|
| 104 |
+ """
|
|
| 62 | 105 |
self.__logger.debug("UpdateBotSession request from [%s]", context.peer())
|
| 63 | 106 |
|
| 107 |
+ names = request.name.split("/")
|
|
| 108 |
+ bot_status = BotStatus(request.bot_session.status)
|
|
| 109 |
+ bot_id = request.bot_session.bot_id
|
|
| 110 |
+ |
|
| 64 | 111 |
try:
|
| 65 |
- names = request.name.split("/")
|
|
| 66 |
- # Operation name should be in format:
|
|
| 67 |
- # {instance/name}/{uuid}
|
|
| 68 |
- instance_name = ''.join(names[0:-1])
|
|
| 112 |
+ instance_name = '/'.join(names[:-1])
|
|
| 69 | 113 |
|
| 70 | 114 |
instance = self._get_instance(instance_name)
|
| 71 |
- return instance.update_bot_session(request.name,
|
|
| 72 |
- request.bot_session)
|
|
| 115 |
+ bot_session = instance.update_bot_session(request.name,
|
|
| 116 |
+ request.bot_session)
|
|
| 117 |
+ |
|
| 118 |
+ if self._is_monitored:
|
|
| 119 |
+ self.__bots[bot_id].GetCurrentTime()
|
|
| 120 |
+ if bot_id not in self.__bots_by_status[bot_status]:
|
|
| 121 |
+ self.__bots_by_status[BotStatus.OK].discard(bot_id)
|
|
| 122 |
+ self.__bots_by_status[BotStatus.UNHEALTHY].discard(bot_id)
|
|
| 123 |
+ self.__bots_by_status[BotStatus.HOST_REBOOTING].discard(bot_id)
|
|
| 124 |
+ self.__bots_by_status[BotStatus.BOT_TERMINATING].discard(bot_id)
|
|
| 125 |
+ |
|
| 126 |
+ self.__bots_by_status[bot_status].add(bot_id)
|
|
| 127 |
+ |
|
| 128 |
+ return bot_session
|
|
| 73 | 129 |
|
| 74 | 130 |
except InvalidArgumentError as e:
|
| 75 | 131 |
self.__logger.error(e)
|
| ... | ... | @@ -89,10 +145,40 @@ class BotsService(bots_pb2_grpc.BotsServicer): |
| 89 | 145 |
return bots_pb2.BotSession()
|
| 90 | 146 |
|
| 91 | 147 |
def PostBotEventTemp(self, request, context):
|
| 148 |
+ """Handles PostBotEventTempRequest messages.
|
|
| 149 |
+ |
|
| 150 |
+ Args:
|
|
| 151 |
+ request (PostBotEventTempRequest): The incoming RPC request.
|
|
| 152 |
+ context (grpc.ServicerContext): Context for the RPC call.
|
|
| 153 |
+ """
|
|
| 92 | 154 |
self.__logger.debug("PostBotEventTemp request from [%s]", context.peer())
|
| 93 | 155 |
|
| 94 | 156 |
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
| 95 |
- return Empty()
|
|
| 157 |
+ |
|
| 158 |
+ return empty_pb2.Empty()
|
|
| 159 |
+ |
|
| 160 |
+ # --- Public API: Monitoring ---
|
|
| 161 |
+ |
|
| 162 |
+ @property
|
|
| 163 |
+ def is_monitored(self):
|
|
| 164 |
+ return self._is_monitored
|
|
| 165 |
+ |
|
| 166 |
+ def query_n_bots(self):
|
|
| 167 |
+ return len(self.__bots)
|
|
| 168 |
+ |
|
| 169 |
+ def query_n_bots_for_instance(self, instance_name):
|
|
| 170 |
+ try:
|
|
| 171 |
+ return self.__bots_by_instance[instance_name]
|
|
| 172 |
+ except KeyError:
|
|
| 173 |
+ return 0
|
|
| 174 |
+ |
|
| 175 |
+ def query_n_bots_for_status(self, bot_status):
|
|
| 176 |
+ try:
|
|
| 177 |
+ return len(self.__bots_by_status[bot_status])
|
|
| 178 |
+ except KeyError:
|
|
| 179 |
+ return 0
|
|
| 180 |
+ |
|
| 181 |
+ # --- Private API ---
|
|
| 96 | 182 |
|
| 97 | 183 |
def _get_instance(self, name):
|
| 98 | 184 |
try:
|
| ... | ... | @@ -33,30 +33,57 @@ from buildgrid._protos.google.longrunning import operations_pb2 |
| 33 | 33 |
|
| 34 | 34 |
class ExecutionService(remote_execution_pb2_grpc.ExecutionServicer):
|
| 35 | 35 |
|
| 36 |
- def __init__(self, server):
|
|
| 36 |
+ def __init__(self, server, monitor=True):
|
|
| 37 | 37 |
self.__logger = logging.getLogger(__name__)
|
| 38 | 38 |
|
| 39 |
+ self.__peers_by_instance = {}
|
|
| 40 |
+ self.__peers = {}
|
|
| 41 |
+ |
|
| 39 | 42 |
self._instances = {}
|
| 43 |
+ self._is_monitored = True
|
|
| 44 |
+ |
|
| 40 | 45 |
remote_execution_pb2_grpc.add_ExecutionServicer_to_server(self, server)
|
| 41 | 46 |
|
| 42 |
- def add_instance(self, name, instance):
|
|
| 43 |
- self._instances[name] = instance
|
|
| 47 |
+ # --- Public API ---
|
|
| 48 |
+ |
|
| 49 |
+ def add_instance(self, instance_name, instance):
|
|
| 50 |
+ self._instances[instance_name] = instance
|
|
| 51 |
+ |
|
| 52 |
+ if self._is_monitored:
|
|
| 53 |
+ self.__peers_by_instance[instance_name] = set()
|
|
| 54 |
+ |
|
| 55 |
+ # --- Public API: Servicer ---
|
|
| 44 | 56 |
|
| 45 | 57 |
def Execute(self, request, context):
|
| 58 |
+ """Handles ExecuteRequest messages.
|
|
| 59 |
+ |
|
| 60 |
+ Args:
|
|
| 61 |
+ request (ExecuteRequest): The incoming RPC request.
|
|
| 62 |
+ context (grpc.ServicerContext): Context for the RPC call.
|
|
| 63 |
+ """
|
|
| 46 | 64 |
self.__logger.debug("Execute request from [%s]", context.peer())
|
| 47 | 65 |
|
| 66 |
+ instance_name = request.instance_name
|
|
| 67 |
+ message_queue = queue.Queue()
|
|
| 68 |
+ peer = context.peer()
|
|
| 69 |
+ |
|
| 48 | 70 |
try:
|
| 49 |
- message_queue = queue.Queue()
|
|
| 50 |
- instance = self._get_instance(request.instance_name)
|
|
| 71 |
+ instance = self._get_instance(instance_name)
|
|
| 51 | 72 |
operation = instance.execute(request.action_digest,
|
| 52 | 73 |
request.skip_cache_lookup,
|
| 53 | 74 |
message_queue)
|
| 54 | 75 |
|
| 55 |
- context.add_callback(partial(instance.unregister_message_client,
|
|
| 56 |
- operation.name, message_queue))
|
|
| 76 |
+ context.add_callback(partial(self._rpc_termination_callback,
|
|
| 77 |
+ peer, instance_name, operation.name, message_queue))
|
|
| 78 |
+ |
|
| 79 |
+ if self._is_monitored:
|
|
| 80 |
+ if peer not in self.__peers:
|
|
| 81 |
+ self.__peers_by_instance[instance_name].add(peer)
|
|
| 82 |
+ self.__peers[peer] = 1
|
|
| 83 |
+ else:
|
|
| 84 |
+ self.__peers[peer] += 1
|
|
| 57 | 85 |
|
| 58 |
- instanced_op_name = "{}/{}".format(request.instance_name,
|
|
| 59 |
- operation.name)
|
|
| 86 |
+ instanced_op_name = "{}/{}".format(instance_name, operation.name)
|
|
| 60 | 87 |
|
| 61 | 88 |
self.__logger.info("Operation name: [%s]", instanced_op_name)
|
| 62 | 89 |
|
| ... | ... | @@ -80,23 +107,37 @@ class ExecutionService(remote_execution_pb2_grpc.ExecutionServicer): |
| 80 | 107 |
yield operations_pb2.Operation()
|
| 81 | 108 |
|
| 82 | 109 |
def WaitExecution(self, request, context):
|
| 110 |
+ """Handles WaitExecutionRequest messages.
|
|
| 111 |
+ |
|
| 112 |
+ Args:
|
|
| 113 |
+ request (WaitExecutionRequest): The incoming RPC request.
|
|
| 114 |
+ context (grpc.ServicerContext): Context for the RPC call.
|
|
| 115 |
+ """
|
|
| 83 | 116 |
self.__logger.debug("WaitExecution request from [%s]", context.peer())
|
| 84 | 117 |
|
| 85 |
- try:
|
|
| 86 |
- names = request.name.split("/")
|
|
| 118 |
+ names = request.name.split('/')
|
|
| 119 |
+ instance_name = '/'.join(names[:-1])
|
|
| 120 |
+ operation_name = names[-1]
|
|
| 121 |
+ message_queue = queue.Queue()
|
|
| 122 |
+ peer = context.peer()
|
|
| 87 | 123 |
|
| 88 |
- # Operation name should be in format:
|
|
| 89 |
- # {instance/name}/{operation_id}
|
|
| 90 |
- instance_name = ''.join(names[0:-1])
|
|
| 124 |
+ try:
|
|
| 125 |
+ if instance_name != request.instance_name:
|
|
| 126 |
+ raise InvalidArgumentError("Invalid operation [{}] for instance [{}]"
|
|
| 127 |
+ .format(request.name, instance_name))
|
|
| 91 | 128 |
|
| 92 |
- message_queue = queue.Queue()
|
|
| 93 |
- operation_name = names[-1]
|
|
| 94 | 129 |
instance = self._get_instance(instance_name)
|
| 95 | 130 |
|
| 96 | 131 |
instance.register_message_client(operation_name, message_queue)
|
| 132 |
+ context.add_callback(partial(self._rpc_termination_callback,
|
|
| 133 |
+ peer, instance_name, operation_name, message_queue))
|
|
| 97 | 134 |
|
| 98 |
- context.add_callback(partial(instance.unregister_message_client,
|
|
| 99 |
- operation_name, message_queue))
|
|
| 135 |
+ if self._is_monitored:
|
|
| 136 |
+ if peer not in self.__peers:
|
|
| 137 |
+ self.__peers_by_instance[instance_name].add(peer)
|
|
| 138 |
+ self.__peers[peer] = 1
|
|
| 139 |
+ else:
|
|
| 140 |
+ self.__peers[peer] += 1
|
|
| 100 | 141 |
|
| 101 | 142 |
for operation in instance.stream_operation_updates(message_queue,
|
| 102 | 143 |
operation_name):
|
| ... | ... | @@ -111,6 +152,35 @@ class ExecutionService(remote_execution_pb2_grpc.ExecutionServicer): |
| 111 | 152 |
context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
|
| 112 | 153 |
yield operations_pb2.Operation()
|
| 113 | 154 |
|
| 155 |
+ # --- Public API: Monitoring ---
|
|
| 156 |
+ |
|
| 157 |
+ @property
|
|
| 158 |
+ def is_monitored(self):
|
|
| 159 |
+ return self._is_monitored
|
|
| 160 |
+ |
|
| 161 |
+ def query_n_clients(self):
|
|
| 162 |
+ return len(self.__peers)
|
|
| 163 |
+ |
|
| 164 |
+ def query_n_clients_for_instance(self, instance_name):
|
|
| 165 |
+ try:
|
|
| 166 |
+ return len(self.__peers_by_instance[instance_name])
|
|
| 167 |
+ except KeyError:
|
|
| 168 |
+ return 0
|
|
| 169 |
+ |
|
| 170 |
+ # --- Private API ---
|
|
| 171 |
+ |
|
| 172 |
+ def _rpc_termination_callback(self, peer, instance_name, job_name, message_queue):
|
|
| 173 |
+ instance = self._get_instance(instance_name)
|
|
| 174 |
+ |
|
| 175 |
+ instance.unregister_message_client(job_name, message_queue)
|
|
| 176 |
+ |
|
| 177 |
+ if self._is_monitored:
|
|
| 178 |
+ if self.__peers[peer] > 1:
|
|
| 179 |
+ self.__peers[peer] -= 1
|
|
| 180 |
+ else:
|
|
| 181 |
+ self.__peers_by_instance[instance_name].remove(peer)
|
|
| 182 |
+ del self.__peers[peer]
|
|
| 183 |
+ |
|
| 114 | 184 |
def _get_instance(self, name):
|
| 115 | 185 |
try:
|
| 116 | 186 |
return self._instances[name]
|
| ... | ... | @@ -13,18 +13,21 @@ |
| 13 | 13 |
# limitations under the License.
|
| 14 | 14 |
|
| 15 | 15 |
|
| 16 |
+import asyncio
|
|
| 16 | 17 |
from concurrent import futures
|
| 17 | 18 |
import logging
|
| 18 | 19 |
import os
|
| 20 |
+import time
|
|
| 19 | 21 |
|
| 20 | 22 |
import grpc
|
| 21 | 23 |
|
| 22 |
-from .cas.service import ByteStreamService, ContentAddressableStorageService
|
|
| 23 |
-from .actioncache.service import ActionCacheService
|
|
| 24 |
-from .execution.service import ExecutionService
|
|
| 25 |
-from .operations.service import OperationsService
|
|
| 26 |
-from .bots.service import BotsService
|
|
| 27 |
-from .referencestorage.service import ReferenceStorageService
|
|
| 24 |
+from buildgrid.server.cas.service import ByteStreamService, ContentAddressableStorageService
|
|
| 25 |
+from buildgrid.server.actioncache.service import ActionCacheService
|
|
| 26 |
+from buildgrid.server.execution.service import ExecutionService
|
|
| 27 |
+from buildgrid.server.operations.service import OperationsService
|
|
| 28 |
+from buildgrid.server.bots.service import BotsService
|
|
| 29 |
+from buildgrid.server.referencestorage.service import ReferenceStorageService
|
|
| 30 |
+from buildgrid.settings import MONITORING_PERIOD
|
|
| 28 | 31 |
|
| 29 | 32 |
|
| 30 | 33 |
class BuildGridServer:
|
| ... | ... | @@ -46,9 +49,11 @@ class BuildGridServer: |
| 46 | 49 |
# Use max_workers default from Python 3.5+
|
| 47 | 50 |
max_workers = (os.cpu_count() or 1) * 5
|
| 48 | 51 |
|
| 49 |
- server = grpc.server(futures.ThreadPoolExecutor(max_workers))
|
|
| 52 |
+ self.__grpc_executor = futures.ThreadPoolExecutor(max_workers)
|
|
| 53 |
+ self.__grpc_server = grpc.server(self.__grpc_executor)
|
|
| 50 | 54 |
|
| 51 |
- self._server = server
|
|
| 55 |
+ self.__main_loop = asyncio.get_event_loop()
|
|
| 56 |
+ self.__monitoring_task = None
|
|
| 52 | 57 |
|
| 53 | 58 |
self._execution_service = None
|
| 54 | 59 |
self._bots_service = None
|
| ... | ... | @@ -58,15 +63,32 @@ class BuildGridServer: |
| 58 | 63 |
self._cas_service = None
|
| 59 | 64 |
self._bytestream_service = None
|
| 60 | 65 |
|
| 66 |
+ self._instances = set()
|
|
| 67 |
+ |
|
| 68 |
+ # --- Public API ---
|
|
| 69 |
+ |
|
| 61 | 70 |
def start(self):
|
| 62 |
- """Starts the server.
|
|
| 71 |
+ """Starts the BuildGrid server.
|
|
| 63 | 72 |
"""
|
| 64 |
- self._server.start()
|
|
| 73 |
+ self.__grpc_server.start()
|
|
| 74 |
+ |
|
| 75 |
+ self.__monitoring_task = asyncio.ensure_future(
|
|
| 76 |
+ self._monitoring_worker(period=MONITORING_PERIOD), loop=self.__main_loop)
|
|
| 77 |
+ self.__main_loop.run_forever()
|
|
| 65 | 78 |
|
| 66 | 79 |
def stop(self, grace=0):
|
| 67 |
- """Stops the server.
|
|
| 80 |
+ """Stops the BuildGrid server.
|
|
| 81 |
+ |
|
| 82 |
+ Args:
|
|
| 83 |
+ grace (int, optional): A duration of time in seconds. Defaults to 0.
|
|
| 68 | 84 |
"""
|
| 69 |
- self._server.stop(grace)
|
|
| 85 |
+ if self.__monitoring_task is not None:
|
|
| 86 |
+ self.__monitoring_task.cancel()
|
|
| 87 |
+ |
|
| 88 |
+ self.__grpc_server.stop(grace)
|
|
| 89 |
+ |
|
| 90 |
+ if grace > 0:
|
|
| 91 |
+ time.sleep(grace)
|
|
| 70 | 92 |
|
| 71 | 93 |
def add_port(self, address, credentials):
|
| 72 | 94 |
"""Adds a port to the server.
|
| ... | ... | @@ -80,11 +102,11 @@ class BuildGridServer: |
| 80 | 102 |
"""
|
| 81 | 103 |
if credentials is not None:
|
| 82 | 104 |
self.__logger.info("Adding secure connection on: [%s]", address)
|
| 83 |
- self._server.add_secure_port(address, credentials)
|
|
| 105 |
+ self.__grpc_server.add_secure_port(address, credentials)
|
|
| 84 | 106 |
|
| 85 | 107 |
else:
|
| 86 | 108 |
self.__logger.info("Adding insecure connection on [%s]", address)
|
| 87 |
- self._server.add_insecure_port(address)
|
|
| 109 |
+ self.__grpc_server.add_insecure_port(address)
|
|
| 88 | 110 |
|
| 89 | 111 |
def add_execution_instance(self, instance, instance_name):
|
| 90 | 112 |
"""Adds an :obj:`ExecutionInstance` to the service.
|
| ... | ... | @@ -96,10 +118,11 @@ class BuildGridServer: |
| 96 | 118 |
instance_name (str): Instance name.
|
| 97 | 119 |
"""
|
| 98 | 120 |
if self._execution_service is None:
|
| 99 |
- self._execution_service = ExecutionService(self._server)
|
|
| 100 |
- |
|
| 121 |
+ self._execution_service = ExecutionService(self.__grpc_server)
|
|
| 101 | 122 |
self._execution_service.add_instance(instance_name, instance)
|
| 102 | 123 |
|
| 124 |
+ self._instances.add(instance_name)
|
|
| 125 |
+ |
|
| 103 | 126 |
def add_bots_interface(self, instance, instance_name):
|
| 104 | 127 |
"""Adds a :obj:`BotsInterface` to the service.
|
| 105 | 128 |
|
| ... | ... | @@ -110,10 +133,11 @@ class BuildGridServer: |
| 110 | 133 |
instance_name (str): Instance name.
|
| 111 | 134 |
"""
|
| 112 | 135 |
if self._bots_service is None:
|
| 113 |
- self._bots_service = BotsService(self._server)
|
|
| 114 |
- |
|
| 136 |
+ self._bots_service = BotsService(self.__grpc_server)
|
|
| 115 | 137 |
self._bots_service.add_instance(instance_name, instance)
|
| 116 | 138 |
|
| 139 |
+ self._instances.add(instance_name)
|
|
| 140 |
+ |
|
| 117 | 141 |
def add_operations_instance(self, instance, instance_name):
|
| 118 | 142 |
"""Adds an :obj:`OperationsInstance` to the service.
|
| 119 | 143 |
|
| ... | ... | @@ -124,8 +148,7 @@ class BuildGridServer: |
| 124 | 148 |
instance_name (str): Instance name.
|
| 125 | 149 |
"""
|
| 126 | 150 |
if self._operations_service is None:
|
| 127 |
- self._operations_service = OperationsService(self._server)
|
|
| 128 |
- |
|
| 151 |
+ self._operations_service = OperationsService(self.__grpc_server)
|
|
| 129 | 152 |
self._operations_service.add_instance(instance_name, instance)
|
| 130 | 153 |
|
| 131 | 154 |
def add_reference_storage_instance(self, instance, instance_name):
|
| ... | ... | @@ -138,8 +161,7 @@ class BuildGridServer: |
| 138 | 161 |
instance_name (str): Instance name.
|
| 139 | 162 |
"""
|
| 140 | 163 |
if self._reference_storage_service is None:
|
| 141 |
- self._reference_storage_service = ReferenceStorageService(self._server)
|
|
| 142 |
- |
|
| 164 |
+ self._reference_storage_service = ReferenceStorageService(self.__grpc_server)
|
|
| 143 | 165 |
self._reference_storage_service.add_instance(instance_name, instance)
|
| 144 | 166 |
|
| 145 | 167 |
def add_action_cache_instance(self, instance, instance_name):
|
| ... | ... | @@ -152,8 +174,7 @@ class BuildGridServer: |
| 152 | 174 |
instance_name (str): Instance name.
|
| 153 | 175 |
"""
|
| 154 | 176 |
if self._action_cache_service is None:
|
| 155 |
- self._action_cache_service = ActionCacheService(self._server)
|
|
| 156 |
- |
|
| 177 |
+ self._action_cache_service = ActionCacheService(self.__grpc_server)
|
|
| 157 | 178 |
self._action_cache_service.add_instance(instance_name, instance)
|
| 158 | 179 |
|
| 159 | 180 |
def add_cas_instance(self, instance, instance_name):
|
| ... | ... | @@ -166,8 +187,7 @@ class BuildGridServer: |
| 166 | 187 |
instance_name (str): Instance name.
|
| 167 | 188 |
"""
|
| 168 | 189 |
if self._cas_service is None:
|
| 169 |
- self._cas_service = ContentAddressableStorageService(self._server)
|
|
| 170 |
- |
|
| 190 |
+ self._cas_service = ContentAddressableStorageService(self.__grpc_server)
|
|
| 171 | 191 |
self._cas_service.add_instance(instance_name, instance)
|
| 172 | 192 |
|
| 173 | 193 |
def add_bytestream_instance(self, instance, instance_name):
|
| ... | ... | @@ -180,6 +200,31 @@ class BuildGridServer: |
| 180 | 200 |
instance_name (str): Instance name.
|
| 181 | 201 |
"""
|
| 182 | 202 |
if self._bytestream_service is None:
|
| 183 |
- self._bytestream_service = ByteStreamService(self._server)
|
|
| 184 |
- |
|
| 203 |
+ self._bytestream_service = ByteStreamService(self.__grpc_server)
|
|
| 185 | 204 |
self._bytestream_service.add_instance(instance_name, instance)
|
| 205 |
+ |
|
| 206 |
+ # --- Private API ---
|
|
| 207 |
+ |
|
| 208 |
+ async def _monitoring_worker(self, period=1):
|
|
| 209 |
+ while True:
|
|
| 210 |
+ try:
|
|
| 211 |
+ n_clients = self._execution_service.query_n_clients()
|
|
| 212 |
+ n_bots = self._bots_service.query_n_bots()
|
|
| 213 |
+ |
|
| 214 |
+ print('---')
|
|
| 215 |
+ print('Totals: n_clients={}, n_bots={}'.format(n_clients, n_bots))
|
|
| 216 |
+ print('Per instances:')
|
|
| 217 |
+ for instance_name in self._instances:
|
|
| 218 |
+ n_clients = self._execution_service.query_n_clients_for_instance(instance_name)
|
|
| 219 |
+ n_bots = self._bots_service.query_n_bots_for_instance(instance_name)
|
|
| 220 |
+ |
|
| 221 |
+ instance_name = instance_name or 'empty'
|
|
| 222 |
+ |
|
| 223 |
+ print(' - {}: n_clients={}, n_bots={}'.format(instance_name, n_clients, n_bots))
|
|
| 224 |
+ |
|
| 225 |
+ await asyncio.sleep(period)
|
|
| 226 |
+ |
|
| 227 |
+ except asyncio.CancelledError:
|
|
| 228 |
+ break
|
|
| 229 |
+ |
|
| 230 |
+ self.__main_loop.stop()
|
| ... | ... | @@ -22,22 +22,31 @@ Schedules jobs. |
| 22 | 22 |
from collections import deque
|
| 23 | 23 |
import logging
|
| 24 | 24 |
|
| 25 |
-from buildgrid._exceptions import NotFoundError
|
|
| 25 |
+from google.protobuf import duration_pb2
|
|
| 26 | 26 |
|
| 27 |
-from .job import OperationStage, LeaseState
|
|
| 27 |
+from buildgrid._enums import LeaseState, OperationStage
|
|
| 28 |
+from buildgrid._exceptions import NotFoundError
|
|
| 28 | 29 |
|
| 29 | 30 |
|
| 30 | 31 |
class Scheduler:
|
| 31 | 32 |
|
| 32 | 33 |
MAX_N_TRIES = 5
|
| 33 | 34 |
|
| 34 |
- def __init__(self, action_cache=None):
|
|
| 35 |
+ def __init__(self, action_cache=None, monitor=True):
|
|
| 35 | 36 |
self.__logger = logging.getLogger(__name__)
|
| 36 | 37 |
|
| 38 |
+ self.__queue_times_by_priority = {}
|
|
| 39 |
+ self.__queue_time = duration_pb2.Duration()
|
|
| 40 |
+ self.__retries_by_error = {}
|
|
| 41 |
+ self.__retries_count = 0
|
|
| 42 |
+ |
|
| 37 | 43 |
self._action_cache = action_cache
|
| 44 |
+ self._is_monitored = True
|
|
| 38 | 45 |
self.jobs = {}
|
| 39 | 46 |
self.queue = deque()
|
| 40 | 47 |
|
| 48 |
+ # --- Public API ---
|
|
| 49 |
+ |
|
| 41 | 50 |
def register_client(self, job_name, queue):
|
| 42 | 51 |
self.jobs[job_name].register_client(queue)
|
| 43 | 52 |
|
| ... | ... | @@ -136,3 +145,42 @@ class Scheduler: |
| 136 | 145 |
def get_job_operation(self, job_name):
|
| 137 | 146 |
"""Returns the operation associated to job."""
|
| 138 | 147 |
return self.jobs[job_name].operation
|
| 148 |
+ |
|
| 149 |
+ # --- Public API: Monitoring ---
|
|
| 150 |
+ |
|
| 151 |
+ @property
|
|
| 152 |
+ def is_monitored(self):
|
|
| 153 |
+ return self._is_monitored
|
|
| 154 |
+ |
|
| 155 |
+ def query_n_jobs(self):
|
|
| 156 |
+ return len(self.jobs)
|
|
| 157 |
+ |
|
| 158 |
+ def query_n_operations(self):
|
|
| 159 |
+ return len(self.jobs)
|
|
| 160 |
+ |
|
| 161 |
+ def query_n_operations_by_stage(self):
|
|
| 162 |
+ return len(self.jobs)
|
|
| 163 |
+ |
|
| 164 |
+ def query_n_leases(self):
|
|
| 165 |
+ return len(self.jobs)
|
|
| 166 |
+ |
|
| 167 |
+ def query_n_leases_by_state(self):
|
|
| 168 |
+ return len(self.jobs)
|
|
| 169 |
+ |
|
| 170 |
+ def query_n_retries(self):
|
|
| 171 |
+ return self.__retries_count
|
|
| 172 |
+ |
|
| 173 |
+ def query_n_retries_for_error(self, error_type):
|
|
| 174 |
+ try:
|
|
| 175 |
+ return self.__retries_by_error[error_type]
|
|
| 176 |
+ except KeyError:
|
|
| 177 |
+ return 0
|
|
| 178 |
+ |
|
| 179 |
+ def query_am_queue_time(self):
|
|
| 180 |
+ return self.__average_queue_time
|
|
| 181 |
+ |
|
| 182 |
+ def query_am_queue_time_for_priority(self, priority_level):
|
|
| 183 |
+ try:
|
|
| 184 |
+ return self.__queue_times_by_priority[priority_level]
|
|
| 185 |
+ except KeyError:
|
|
| 186 |
+ return 0
|
| 1 |
+# Copyright (C) 2018 Bloomberg LP
|
|
| 2 |
+#
|
|
| 3 |
+# Licensed under the Apache License, Version 2.0 (the "License");
|
|
| 4 |
+# you may not use this file except in compliance with the License.
|
|
| 5 |
+# You may obtain a copy of the License at
|
|
| 6 |
+#
|
|
| 7 |
+# <http://www.apache.org/licenses/LICENSE-2.0>
|
|
| 8 |
+#
|
|
| 9 |
+# Unless required by applicable law or agreed to in writing, software
|
|
| 10 |
+# distributed under the License is distributed on an "AS IS" BASIS,
|
|
| 11 |
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
| 12 |
+# See the License for the specific language governing permissions and
|
|
| 13 |
+# limitations under the License.
|
|
| 14 |
+ |
|
| 15 |
+ |
|
| 1 | 16 |
import hashlib
|
| 2 | 17 |
|
| 3 | 18 |
|
| 4 |
-# The hash function that CAS uses
|
|
| 19 |
+# Hash function used for computing digests:
|
|
| 5 | 20 |
HASH = hashlib.sha256
|
| 21 |
+ |
|
| 22 |
+# Length in bytes of a hash string returned by HASH:
|
|
| 6 | 23 |
HASH_LENGTH = HASH().digest_size * 2
|
| 24 |
+ |
|
| 25 |
+# Period, in seconds, for the monitoring cycle:
|
|
| 26 |
+MONITORING_PERIOD = 5.0
|
| ... | ... | @@ -112,13 +112,15 @@ setup( |
| 112 | 112 |
license="Apache License, Version 2.0",
|
| 113 | 113 |
description="A remote execution service",
|
| 114 | 114 |
packages=find_packages(),
|
| 115 |
+ python_requires='>= 3.5.3', # janus requirement
|
|
| 115 | 116 |
install_requires=[
|
| 116 |
- 'protobuf',
|
|
| 117 |
- 'grpcio',
|
|
| 118 |
- 'Click',
|
|
| 119 |
- 'PyYAML',
|
|
| 120 | 117 |
'boto3 < 1.8.0',
|
| 121 | 118 |
'botocore < 1.11.0',
|
| 119 |
+ 'click',
|
|
| 120 |
+ 'grpcio',
|
|
| 121 |
+ 'janus',
|
|
| 122 |
+ 'protobuf',
|
|
| 123 |
+ 'pyyaml',
|
|
| 122 | 124 |
],
|
| 123 | 125 |
entry_points={
|
| 124 | 126 |
'console_scripts': [
|
