[Notes] [Git][BuildGrid/buildgrid][mablanch/132-gather-state-metrics] 7 commits: execution/service.py: Expose client counts



Title: GitLab

Martin Blanchard pushed to branch mablanch/132-gather-state-metrics at BuildGrid / buildgrid

Commits:

7 changed files:

Changes:

  • buildgrid/_app/commands/cmd_server.py
    ... ... @@ -20,7 +20,6 @@ Server command
    20 20
     Create a BuildGrid server.
    
    21 21
     """
    
    22 22
     
    
    23
    -import asyncio
    
    24 23
     import logging
    
    25 24
     import sys
    
    26 25
     
    
    ... ... @@ -52,18 +51,14 @@ def start(context, config):
    52 51
             click.echo("ERROR: Could not parse config: {}.\n".format(str(e)), err=True)
    
    53 52
             sys.exit(-1)
    
    54 53
     
    
    55
    -    loop = asyncio.get_event_loop()
    
    56 54
         try:
    
    57 55
             server.start()
    
    58
    -        loop.run_forever()
    
    59 56
     
    
    60 57
         except KeyboardInterrupt:
    
    61 58
             pass
    
    62 59
     
    
    63 60
         finally:
    
    64
    -        context.logger.info("Stopping server")
    
    65 61
             server.stop()
    
    66
    -        loop.close()
    
    67 62
     
    
    68 63
     
    
    69 64
     def _create_server_from_config(config):
    

  • buildgrid/server/bots/service.py
    ... ... @@ -23,8 +23,9 @@ import logging
    23 23
     
    
    24 24
     import grpc
    
    25 25
     
    
    26
    -from google.protobuf.empty_pb2 import Empty
    
    26
    +from google.protobuf import empty_pb2, timestamp_pb2
    
    27 27
     
    
    28
    +from buildgrid._enums import BotStatus
    
    28 29
     from buildgrid._exceptions import InvalidArgumentError, OutOfSyncError
    
    29 30
     from buildgrid._protos.google.devtools.remoteworkers.v1test2 import bots_pb2
    
    30 31
     from buildgrid._protos.google.devtools.remoteworkers.v1test2 import bots_pb2_grpc
    
    ... ... @@ -32,24 +33,60 @@ from buildgrid._protos.google.devtools.remoteworkers.v1test2 import bots_pb2_grp
    32 33
     
    
    33 34
     class BotsService(bots_pb2_grpc.BotsServicer):
    
    34 35
     
    
    35
    -    def __init__(self, server):
    
    36
    +    def __init__(self, server, monitor=True):
    
    36 37
             self.__logger = logging.getLogger(__name__)
    
    37 38
     
    
    39
    +        self.__bots_by_status = {}
    
    40
    +        self.__bots_by_instance = {}
    
    41
    +        self.__bots = {}
    
    42
    +
    
    38 43
             self._instances = {}
    
    44
    +        self._is_monitored = True
    
    39 45
     
    
    40 46
             bots_pb2_grpc.add_BotsServicer_to_server(self, server)
    
    41 47
     
    
    42
    -    def add_instance(self, name, instance):
    
    43
    -        self._instances[name] = instance
    
    48
    +        if self._is_monitored:
    
    49
    +            self.__bots_by_status[BotStatus.OK] = set()
    
    50
    +            self.__bots_by_status[BotStatus.UNHEALTHY] = set()
    
    51
    +            self.__bots_by_status[BotStatus.HOST_REBOOTING] = set()
    
    52
    +            self.__bots_by_status[BotStatus.BOT_TERMINATING] = set()
    
    53
    +
    
    54
    +    # --- Public API ---
    
    55
    +
    
    56
    +    def add_instance(self, instance_name, instance):
    
    57
    +        self._instances[instance_name] = instance
    
    58
    +
    
    59
    +        if self._is_monitored:
    
    60
    +            self.__bots_by_instance[instance_name] = 0
    
    61
    +
    
    62
    +    # --- Public API: Servicer ---
    
    44 63
     
    
    45 64
         def CreateBotSession(self, request, context):
    
    65
    +        """Handles CreateBotSessionRequest messages.
    
    66
    +
    
    67
    +        Args:
    
    68
    +            request (CreateBotSessionRequest): The incoming RPC request.
    
    69
    +            context (grpc.ServicerContext): Context for the RPC call.
    
    70
    +        """
    
    46 71
             self.__logger.debug("CreateBotSession request from [%s]", context.peer())
    
    47 72
     
    
    73
    +        instance_name = request.parent
    
    74
    +        bot_status = BotStatus(request.bot_session.status)
    
    75
    +        bot_id = request.bot_session.bot_id
    
    76
    +
    
    48 77
             try:
    
    49
    -            parent = request.parent
    
    50
    -            instance = self._get_instance(request.parent)
    
    51
    -            return instance.create_bot_session(parent,
    
    52
    -                                               request.bot_session)
    
    78
    +            instance = self._get_instance(instance_name)
    
    79
    +            bot_session = instance.create_bot_session(instance_name,
    
    80
    +                                                      request.bot_session)
    
    81
    +            now = timestamp_pb2.Timestamp()
    
    82
    +            now.GetCurrentTime()
    
    83
    +
    
    84
    +            if self._is_monitored:
    
    85
    +                self.__bots[bot_id] = now
    
    86
    +                self.__bots_by_instance[instance_name] += 1
    
    87
    +                self.__bots_by_status[bot_status].add(bot_id)
    
    88
    +
    
    89
    +            return bot_session
    
    53 90
     
    
    54 91
             except InvalidArgumentError as e:
    
    55 92
                 self.__logger.error(e)
    
    ... ... @@ -59,17 +96,36 @@ class BotsService(bots_pb2_grpc.BotsServicer):
    59 96
             return bots_pb2.BotSession()
    
    60 97
     
    
    61 98
         def UpdateBotSession(self, request, context):
    
    99
    +        """Handles UpdateBotSessionRequest messages.
    
    100
    +
    
    101
    +        Args:
    
    102
    +            request (UpdateBotSessionRequest): The incoming RPC request.
    
    103
    +            context (grpc.ServicerContext): Context for the RPC call.
    
    104
    +        """
    
    62 105
             self.__logger.debug("UpdateBotSession request from [%s]", context.peer())
    
    63 106
     
    
    107
    +        names = request.name.split("/")
    
    108
    +        bot_status = BotStatus(request.bot_session.status)
    
    109
    +        bot_id = request.bot_session.bot_id
    
    110
    +
    
    64 111
             try:
    
    65
    -            names = request.name.split("/")
    
    66
    -            # Operation name should be in format:
    
    67
    -            # {instance/name}/{uuid}
    
    68
    -            instance_name = ''.join(names[0:-1])
    
    112
    +            instance_name = '/'.join(names[:-1])
    
    69 113
     
    
    70 114
                 instance = self._get_instance(instance_name)
    
    71
    -            return instance.update_bot_session(request.name,
    
    72
    -                                               request.bot_session)
    
    115
    +            bot_session = instance.update_bot_session(request.name,
    
    116
    +                                                      request.bot_session)
    
    117
    +
    
    118
    +            if self._is_monitored:
    
    119
    +                self.__bots[bot_id].GetCurrentTime()
    
    120
    +                if bot_id not in self.__bots_by_status[bot_status]:
    
    121
    +                    self.__bots_by_status[BotStatus.OK].discard(bot_id)
    
    122
    +                    self.__bots_by_status[BotStatus.UNHEALTHY].discard(bot_id)
    
    123
    +                    self.__bots_by_status[BotStatus.HOST_REBOOTING].discard(bot_id)
    
    124
    +                    self.__bots_by_status[BotStatus.BOT_TERMINATING].discard(bot_id)
    
    125
    +
    
    126
    +                    self.__bots_by_status[bot_status].add(bot_id)
    
    127
    +
    
    128
    +            return bot_session
    
    73 129
     
    
    74 130
             except InvalidArgumentError as e:
    
    75 131
                 self.__logger.error(e)
    
    ... ... @@ -89,10 +145,40 @@ class BotsService(bots_pb2_grpc.BotsServicer):
    89 145
             return bots_pb2.BotSession()
    
    90 146
     
    
    91 147
         def PostBotEventTemp(self, request, context):
    
    148
    +        """Handles PostBotEventTempRequest messages.
    
    149
    +
    
    150
    +        Args:
    
    151
    +            request (PostBotEventTempRequest): The incoming RPC request.
    
    152
    +            context (grpc.ServicerContext): Context for the RPC call.
    
    153
    +        """
    
    92 154
             self.__logger.debug("PostBotEventTemp request from [%s]", context.peer())
    
    93 155
     
    
    94 156
             context.set_code(grpc.StatusCode.UNIMPLEMENTED)
    
    95
    -        return Empty()
    
    157
    +
    
    158
    +        return empty_pb2.Empty()
    
    159
    +
    
    160
    +    # --- Public API: Monitoring ---
    
    161
    +
    
    162
    +    @property
    
    163
    +    def is_monitored(self):
    
    164
    +        return self._is_monitored
    
    165
    +
    
    166
    +    def query_n_bots(self):
    
    167
    +        return len(self.__bots)
    
    168
    +
    
    169
    +    def query_n_bots_for_instance(self, instance_name):
    
    170
    +        try:
    
    171
    +            return self.__bots_by_instance[instance_name]
    
    172
    +        except KeyError:
    
    173
    +            return 0
    
    174
    +
    
    175
    +    def query_n_bots_for_status(self, bot_status):
    
    176
    +        try:
    
    177
    +            return len(self.__bots_by_status[bot_status])
    
    178
    +        except KeyError:
    
    179
    +            return 0
    
    180
    +
    
    181
    +    # --- Private API ---
    
    96 182
     
    
    97 183
         def _get_instance(self, name):
    
    98 184
             try:
    

  • buildgrid/server/execution/service.py
    ... ... @@ -33,30 +33,57 @@ from buildgrid._protos.google.longrunning import operations_pb2
    33 33
     
    
    34 34
     class ExecutionService(remote_execution_pb2_grpc.ExecutionServicer):
    
    35 35
     
    
    36
    -    def __init__(self, server):
    
    36
    +    def __init__(self, server, monitor=True):
    
    37 37
             self.__logger = logging.getLogger(__name__)
    
    38 38
     
    
    39
    +        self.__peers_by_instance = {}
    
    40
    +        self.__peers = {}
    
    41
    +
    
    39 42
             self._instances = {}
    
    43
    +        self._is_monitored = True
    
    44
    +
    
    40 45
             remote_execution_pb2_grpc.add_ExecutionServicer_to_server(self, server)
    
    41 46
     
    
    42
    -    def add_instance(self, name, instance):
    
    43
    -        self._instances[name] = instance
    
    47
    +    # --- Public API ---
    
    48
    +
    
    49
    +    def add_instance(self, instance_name, instance):
    
    50
    +        self._instances[instance_name] = instance
    
    51
    +
    
    52
    +        if self._is_monitored:
    
    53
    +            self.__peers_by_instance[instance_name] = set()
    
    54
    +
    
    55
    +    # --- Public API: Servicer ---
    
    44 56
     
    
    45 57
         def Execute(self, request, context):
    
    58
    +        """Handles ExecuteRequest messages.
    
    59
    +
    
    60
    +        Args:
    
    61
    +            request (ExecuteRequest): The incoming RPC request.
    
    62
    +            context (grpc.ServicerContext): Context for the RPC call.
    
    63
    +        """
    
    46 64
             self.__logger.debug("Execute request from [%s]", context.peer())
    
    47 65
     
    
    66
    +        instance_name = request.instance_name
    
    67
    +        message_queue = queue.Queue()
    
    68
    +        peer = context.peer()
    
    69
    +
    
    48 70
             try:
    
    49
    -            message_queue = queue.Queue()
    
    50
    -            instance = self._get_instance(request.instance_name)
    
    71
    +            instance = self._get_instance(instance_name)
    
    51 72
                 operation = instance.execute(request.action_digest,
    
    52 73
                                              request.skip_cache_lookup,
    
    53 74
                                              message_queue)
    
    54 75
     
    
    55
    -            context.add_callback(partial(instance.unregister_message_client,
    
    56
    -                                         operation.name, message_queue))
    
    76
    +            context.add_callback(partial(self._rpc_termination_callback,
    
    77
    +                                         peer, instance_name, operation.name, message_queue))
    
    78
    +
    
    79
    +            if self._is_monitored:
    
    80
    +                if peer not in self.__peers:
    
    81
    +                    self.__peers_by_instance[instance_name].add(peer)
    
    82
    +                    self.__peers[peer] = 1
    
    83
    +                else:
    
    84
    +                    self.__peers[peer] += 1
    
    57 85
     
    
    58
    -            instanced_op_name = "{}/{}".format(request.instance_name,
    
    59
    -                                               operation.name)
    
    86
    +            instanced_op_name = "{}/{}".format(instance_name, operation.name)
    
    60 87
     
    
    61 88
                 self.__logger.info("Operation name: [%s]", instanced_op_name)
    
    62 89
     
    
    ... ... @@ -80,23 +107,37 @@ class ExecutionService(remote_execution_pb2_grpc.ExecutionServicer):
    80 107
                 yield operations_pb2.Operation()
    
    81 108
     
    
    82 109
         def WaitExecution(self, request, context):
    
    110
    +        """Handles WaitExecutionRequest messages.
    
    111
    +
    
    112
    +        Args:
    
    113
    +            request (WaitExecutionRequest): The incoming RPC request.
    
    114
    +            context (grpc.ServicerContext): Context for the RPC call.
    
    115
    +        """
    
    83 116
             self.__logger.debug("WaitExecution request from [%s]", context.peer())
    
    84 117
     
    
    85
    -        try:
    
    86
    -            names = request.name.split("/")
    
    118
    +        names = request.name.split('/')
    
    119
    +        instance_name = '/'.join(names[:-1])
    
    120
    +        operation_name = names[-1]
    
    121
    +        message_queue = queue.Queue()
    
    122
    +        peer = context.peer()
    
    87 123
     
    
    88
    -            # Operation name should be in format:
    
    89
    -            # {instance/name}/{operation_id}
    
    90
    -            instance_name = ''.join(names[0:-1])
    
    124
    +        try:
    
    125
    +            if instance_name != request.instance_name:
    
    126
    +                raise InvalidArgumentError("Invalid operation [{}] for instance [{}]"
    
    127
    +                                            .format(request.name, instance_name))
    
    91 128
     
    
    92
    -            message_queue = queue.Queue()
    
    93
    -            operation_name = names[-1]
    
    94 129
                 instance = self._get_instance(instance_name)
    
    95 130
     
    
    96 131
                 instance.register_message_client(operation_name, message_queue)
    
    132
    +            context.add_callback(partial(self._rpc_termination_callback,
    
    133
    +                                         peer, instance_name, operation_name, message_queue))
    
    97 134
     
    
    98
    -            context.add_callback(partial(instance.unregister_message_client,
    
    99
    -                                         operation_name, message_queue))
    
    135
    +            if self._is_monitored:
    
    136
    +                if peer not in self.__peers:
    
    137
    +                    self.__peers_by_instance[instance_name].add(peer)
    
    138
    +                    self.__peers[peer] = 1
    
    139
    +                else:
    
    140
    +                    self.__peers[peer] += 1
    
    100 141
     
    
    101 142
                 for operation in instance.stream_operation_updates(message_queue,
    
    102 143
                                                                    operation_name):
    
    ... ... @@ -111,6 +152,35 @@ class ExecutionService(remote_execution_pb2_grpc.ExecutionServicer):
    111 152
                 context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
    
    112 153
                 yield operations_pb2.Operation()
    
    113 154
     
    
    155
    +    # --- Public API: Monitoring ---
    
    156
    +
    
    157
    +    @property
    
    158
    +    def is_monitored(self):
    
    159
    +        return self._is_monitored
    
    160
    +
    
    161
    +    def query_n_clients(self):
    
    162
    +        return len(self.__peers)
    
    163
    +
    
    164
    +    def query_n_clients_for_instance(self, instance_name):
    
    165
    +        try:
    
    166
    +            return len(self.__peers_by_instance[instance_name])
    
    167
    +        except KeyError:
    
    168
    +            return 0
    
    169
    +
    
    170
    +    # --- Private API ---
    
    171
    +
    
    172
    +    def _rpc_termination_callback(self, peer, instance_name, job_name, message_queue):
    
    173
    +        instance = self._get_instance(instance_name)
    
    174
    +
    
    175
    +        instance.unregister_message_client(job_name, message_queue)
    
    176
    +
    
    177
    +        if self._is_monitored:
    
    178
    +            if self.__peers[peer] > 1:
    
    179
    +                self.__peers[peer] -= 1
    
    180
    +            else:
    
    181
    +                self.__peers_by_instance[instance_name].remove(peer)
    
    182
    +                del self.__peers[peer]
    
    183
    +
    
    114 184
         def _get_instance(self, name):
    
    115 185
             try:
    
    116 186
                 return self._instances[name]
    

  • buildgrid/server/instance.py
    ... ... @@ -13,18 +13,21 @@
    13 13
     # limitations under the License.
    
    14 14
     
    
    15 15
     
    
    16
    +import asyncio
    
    16 17
     from concurrent import futures
    
    17 18
     import logging
    
    18 19
     import os
    
    20
    +import time
    
    19 21
     
    
    20 22
     import grpc
    
    21 23
     
    
    22
    -from .cas.service import ByteStreamService, ContentAddressableStorageService
    
    23
    -from .actioncache.service import ActionCacheService
    
    24
    -from .execution.service import ExecutionService
    
    25
    -from .operations.service import OperationsService
    
    26
    -from .bots.service import BotsService
    
    27
    -from .referencestorage.service import ReferenceStorageService
    
    24
    +from buildgrid.server.cas.service import ByteStreamService, ContentAddressableStorageService
    
    25
    +from buildgrid.server.actioncache.service import ActionCacheService
    
    26
    +from buildgrid.server.execution.service import ExecutionService
    
    27
    +from buildgrid.server.operations.service import OperationsService
    
    28
    +from buildgrid.server.bots.service import BotsService
    
    29
    +from buildgrid.server.referencestorage.service import ReferenceStorageService
    
    30
    +from buildgrid.settings import MONITORING_PERIOD
    
    28 31
     
    
    29 32
     
    
    30 33
     class BuildGridServer:
    
    ... ... @@ -46,9 +49,11 @@ class BuildGridServer:
    46 49
                 # Use max_workers default from Python 3.5+
    
    47 50
                 max_workers = (os.cpu_count() or 1) * 5
    
    48 51
     
    
    49
    -        server = grpc.server(futures.ThreadPoolExecutor(max_workers))
    
    52
    +        self.__grpc_executor = futures.ThreadPoolExecutor(max_workers)
    
    53
    +        self.__grpc_server = grpc.server(self.__grpc_executor)
    
    50 54
     
    
    51
    -        self._server = server
    
    55
    +        self.__main_loop = asyncio.get_event_loop()
    
    56
    +        self.__monitoring_task = None
    
    52 57
     
    
    53 58
             self._execution_service = None
    
    54 59
             self._bots_service = None
    
    ... ... @@ -58,15 +63,32 @@ class BuildGridServer:
    58 63
             self._cas_service = None
    
    59 64
             self._bytestream_service = None
    
    60 65
     
    
    66
    +        self._instances = set()
    
    67
    +
    
    68
    +    # --- Public API ---
    
    69
    +
    
    61 70
         def start(self):
    
    62
    -        """Starts the server.
    
    71
    +        """Starts the BuildGrid server.
    
    63 72
             """
    
    64
    -        self._server.start()
    
    73
    +        self.__grpc_server.start()
    
    74
    +
    
    75
    +        self.__monitoring_task = asyncio.ensure_future(
    
    76
    +            self._monitoring_worker(period=MONITORING_PERIOD), loop=self.__main_loop)
    
    77
    +        self.__main_loop.run_forever()
    
    65 78
     
    
    66 79
         def stop(self, grace=0):
    
    67
    -        """Stops the server.
    
    80
    +        """Stops the BuildGrid server.
    
    81
    +
    
    82
    +        Args:
    
    83
    +            grace (int, optional): A duration of time in seconds. Defaults to 0.
    
    68 84
             """
    
    69
    -        self._server.stop(grace)
    
    85
    +        if self.__monitoring_task is not None:
    
    86
    +            self.__monitoring_task.cancel()
    
    87
    +
    
    88
    +        self.__grpc_server.stop(grace)
    
    89
    +
    
    90
    +        if grace > 0:
    
    91
    +            time.sleep(grace)
    
    70 92
     
    
    71 93
         def add_port(self, address, credentials):
    
    72 94
             """Adds a port to the server.
    
    ... ... @@ -80,11 +102,11 @@ class BuildGridServer:
    80 102
             """
    
    81 103
             if credentials is not None:
    
    82 104
                 self.__logger.info("Adding secure connection on: [%s]", address)
    
    83
    -            self._server.add_secure_port(address, credentials)
    
    105
    +            self.__grpc_server.add_secure_port(address, credentials)
    
    84 106
     
    
    85 107
             else:
    
    86 108
                 self.__logger.info("Adding insecure connection on [%s]", address)
    
    87
    -            self._server.add_insecure_port(address)
    
    109
    +            self.__grpc_server.add_insecure_port(address)
    
    88 110
     
    
    89 111
         def add_execution_instance(self, instance, instance_name):
    
    90 112
             """Adds an :obj:`ExecutionInstance` to the service.
    
    ... ... @@ -96,10 +118,11 @@ class BuildGridServer:
    96 118
                 instance_name (str): Instance name.
    
    97 119
             """
    
    98 120
             if self._execution_service is None:
    
    99
    -            self._execution_service = ExecutionService(self._server)
    
    100
    -
    
    121
    +            self._execution_service = ExecutionService(self.__grpc_server)
    
    101 122
             self._execution_service.add_instance(instance_name, instance)
    
    102 123
     
    
    124
    +        self._instances.add(instance_name)
    
    125
    +
    
    103 126
         def add_bots_interface(self, instance, instance_name):
    
    104 127
             """Adds a :obj:`BotsInterface` to the service.
    
    105 128
     
    
    ... ... @@ -110,10 +133,11 @@ class BuildGridServer:
    110 133
                 instance_name (str): Instance name.
    
    111 134
             """
    
    112 135
             if self._bots_service is None:
    
    113
    -            self._bots_service = BotsService(self._server)
    
    114
    -
    
    136
    +            self._bots_service = BotsService(self.__grpc_server)
    
    115 137
             self._bots_service.add_instance(instance_name, instance)
    
    116 138
     
    
    139
    +        self._instances.add(instance_name)
    
    140
    +
    
    117 141
         def add_operations_instance(self, instance, instance_name):
    
    118 142
             """Adds an :obj:`OperationsInstance` to the service.
    
    119 143
     
    
    ... ... @@ -124,8 +148,7 @@ class BuildGridServer:
    124 148
                 instance_name (str): Instance name.
    
    125 149
             """
    
    126 150
             if self._operations_service is None:
    
    127
    -            self._operations_service = OperationsService(self._server)
    
    128
    -
    
    151
    +            self._operations_service = OperationsService(self.__grpc_server)
    
    129 152
             self._operations_service.add_instance(instance_name, instance)
    
    130 153
     
    
    131 154
         def add_reference_storage_instance(self, instance, instance_name):
    
    ... ... @@ -138,8 +161,7 @@ class BuildGridServer:
    138 161
                 instance_name (str): Instance name.
    
    139 162
             """
    
    140 163
             if self._reference_storage_service is None:
    
    141
    -            self._reference_storage_service = ReferenceStorageService(self._server)
    
    142
    -
    
    164
    +            self._reference_storage_service = ReferenceStorageService(self.__grpc_server)
    
    143 165
             self._reference_storage_service.add_instance(instance_name, instance)
    
    144 166
     
    
    145 167
         def add_action_cache_instance(self, instance, instance_name):
    
    ... ... @@ -152,8 +174,7 @@ class BuildGridServer:
    152 174
                 instance_name (str): Instance name.
    
    153 175
             """
    
    154 176
             if self._action_cache_service is None:
    
    155
    -            self._action_cache_service = ActionCacheService(self._server)
    
    156
    -
    
    177
    +            self._action_cache_service = ActionCacheService(self.__grpc_server)
    
    157 178
             self._action_cache_service.add_instance(instance_name, instance)
    
    158 179
     
    
    159 180
         def add_cas_instance(self, instance, instance_name):
    
    ... ... @@ -166,8 +187,7 @@ class BuildGridServer:
    166 187
                 instance_name (str): Instance name.
    
    167 188
             """
    
    168 189
             if self._cas_service is None:
    
    169
    -            self._cas_service = ContentAddressableStorageService(self._server)
    
    170
    -
    
    190
    +            self._cas_service = ContentAddressableStorageService(self.__grpc_server)
    
    171 191
             self._cas_service.add_instance(instance_name, instance)
    
    172 192
     
    
    173 193
         def add_bytestream_instance(self, instance, instance_name):
    
    ... ... @@ -180,6 +200,31 @@ class BuildGridServer:
    180 200
                 instance_name (str): Instance name.
    
    181 201
             """
    
    182 202
             if self._bytestream_service is None:
    
    183
    -            self._bytestream_service = ByteStreamService(self._server)
    
    184
    -
    
    203
    +            self._bytestream_service = ByteStreamService(self.__grpc_server)
    
    185 204
             self._bytestream_service.add_instance(instance_name, instance)
    
    205
    +
    
    206
    +    # --- Private API ---
    
    207
    +
    
    208
    +    async def _monitoring_worker(self, period=1):
    
    209
    +        while True:
    
    210
    +            try:
    
    211
    +                n_clients = self._execution_service.query_n_clients()
    
    212
    +                n_bots = self._bots_service.query_n_bots()
    
    213
    +
    
    214
    +                print('---')
    
    215
    +                print('Totals: n_clients={}, n_bots={}'.format(n_clients, n_bots))
    
    216
    +                print('Per instances:')
    
    217
    +                for instance_name in self._instances:
    
    218
    +                    n_clients = self._execution_service.query_n_clients_for_instance(instance_name)
    
    219
    +                    n_bots = self._bots_service.query_n_bots_for_instance(instance_name)
    
    220
    +
    
    221
    +                    instance_name = instance_name or 'empty'
    
    222
    +
    
    223
    +                    print(' - {}: n_clients={}, n_bots={}'.format(instance_name, n_clients, n_bots))
    
    224
    +
    
    225
    +                await asyncio.sleep(period)
    
    226
    +
    
    227
    +            except asyncio.CancelledError:
    
    228
    +                break
    
    229
    +
    
    230
    +        self.__main_loop.stop()

  • buildgrid/server/scheduler.py
    ... ... @@ -22,22 +22,31 @@ Schedules jobs.
    22 22
     from collections import deque
    
    23 23
     import logging
    
    24 24
     
    
    25
    -from buildgrid._exceptions import NotFoundError
    
    25
    +from google.protobuf import duration_pb2
    
    26 26
     
    
    27
    -from .job import OperationStage, LeaseState
    
    27
    +from buildgrid._enums import LeaseState, OperationStage
    
    28
    +from buildgrid._exceptions import NotFoundError
    
    28 29
     
    
    29 30
     
    
    30 31
     class Scheduler:
    
    31 32
     
    
    32 33
         MAX_N_TRIES = 5
    
    33 34
     
    
    34
    -    def __init__(self, action_cache=None):
    
    35
    +    def __init__(self, action_cache=None, monitor=True):
    
    35 36
             self.__logger = logging.getLogger(__name__)
    
    36 37
     
    
    38
    +        self.__queue_times_by_priority = {}
    
    39
    +        self.__queue_time = duration_pb2.Duration()
    
    40
    +        self.__retries_by_error = {}
    
    41
    +        self.__retries_count = 0
    
    42
    +
    
    37 43
             self._action_cache = action_cache
    
    44
    +        self._is_monitored = True
    
    38 45
             self.jobs = {}
    
    39 46
             self.queue = deque()
    
    40 47
     
    
    48
    +    # --- Public API ---
    
    49
    +
    
    41 50
         def register_client(self, job_name, queue):
    
    42 51
             self.jobs[job_name].register_client(queue)
    
    43 52
     
    
    ... ... @@ -136,3 +145,42 @@ class Scheduler:
    136 145
         def get_job_operation(self, job_name):
    
    137 146
             """Returns the operation associated to job."""
    
    138 147
             return self.jobs[job_name].operation
    
    148
    +
    
    149
    +    # --- Public API: Monitoring ---
    
    150
    +
    
    151
    +    @property
    
    152
    +    def is_monitored(self):
    
    153
    +        return self._is_monitored
    
    154
    +
    
    155
    +    def query_n_jobs(self):
    
    156
    +        return len(self.jobs)
    
    157
    +
    
    158
    +    def query_n_operations(self):
    
    159
    +        return len(self.jobs)
    
    160
    +
    
    161
    +    def query_n_operations_by_stage(self):
    
    162
    +        return len(self.jobs)
    
    163
    +
    
    164
    +    def query_n_leases(self):
    
    165
    +        return len(self.jobs)
    
    166
    +
    
    167
    +    def query_n_leases_by_state(self):
    
    168
    +        return len(self.jobs)
    
    169
    +
    
    170
    +    def query_n_retries(self):
    
    171
    +        return self.__retries_count
    
    172
    +
    
    173
    +    def query_n_retries_for_error(self, error_type):
    
    174
    +        try:
    
    175
    +            return self.__retries_by_error[error_type]
    
    176
    +        except KeyError:
    
    177
    +            return 0
    
    178
    +
    
    179
    +    def query_am_queue_time(self):
    
    180
    +        return self.__average_queue_time
    
    181
    +
    
    182
    +    def query_am_queue_time_for_priority(self, priority_level):
    
    183
    +        try:
    
    184
    +            return self.__queue_times_by_priority[priority_level]
    
    185
    +        except KeyError:
    
    186
    +            return 0

  • buildgrid/settings.py
    1
    +# Copyright (C) 2018 Bloomberg LP
    
    2
    +#
    
    3
    +# Licensed under the Apache License, Version 2.0 (the "License");
    
    4
    +# you may not use this file except in compliance with the License.
    
    5
    +# You may obtain a copy of the License at
    
    6
    +#
    
    7
    +#  <http://www.apache.org/licenses/LICENSE-2.0>
    
    8
    +#
    
    9
    +# Unless required by applicable law or agreed to in writing, software
    
    10
    +# distributed under the License is distributed on an "AS IS" BASIS,
    
    11
    +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    
    12
    +# See the License for the specific language governing permissions and
    
    13
    +# limitations under the License.
    
    14
    +
    
    15
    +
    
    1 16
     import hashlib
    
    2 17
     
    
    3 18
     
    
    4
    -# The hash function that CAS uses
    
    19
    +# Hash function used for computing digests:
    
    5 20
     HASH = hashlib.sha256
    
    21
    +
    
    22
    +# Length in bytes of a hash string returned by HASH:
    
    6 23
     HASH_LENGTH = HASH().digest_size * 2
    
    24
    +
    
    25
    +# Period, in seconds, for the monitoring cycle:
    
    26
    +MONITORING_PERIOD = 5.0

  • setup.py
    ... ... @@ -112,13 +112,15 @@ setup(
    112 112
         license="Apache License, Version 2.0",
    
    113 113
         description="A remote execution service",
    
    114 114
         packages=find_packages(),
    
    115
    +    python_requires='>= 3.5.3',  # janus requirement
    
    115 116
         install_requires=[
    
    116
    -        'protobuf',
    
    117
    -        'grpcio',
    
    118
    -        'Click',
    
    119
    -        'PyYAML',
    
    120 117
             'boto3 < 1.8.0',
    
    121 118
             'botocore < 1.11.0',
    
    119
    +        'click',
    
    120
    +        'grpcio',
    
    121
    +        'janus',
    
    122
    +        'protobuf',
    
    123
    +        'pyyaml',
    
    122 124
         ],
    
    123 125
         entry_points={
    
    124 126
             'console_scripts': [
    



  • [Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]