[Notes] [Git][BuildGrid/buildgrid][raoul/smarter-bot-calls] Bot: Reconnects and timeouts on the bot side



Title: GitLab

Raoul Hidalgo Charman pushed to branch raoul/smarter-bot-calls at BuildGrid / buildgrid

Commits:

5 changed files:

Changes:

  • buildgrid/_app/commands/cmd_bot.py
    ... ... @@ -33,7 +33,6 @@ from buildgrid.bot.bot_session import BotSession, Device, Worker
    33 33
     
    
    34 34
     from ..bots import buildbox, dummy, host
    
    35 35
     from ..cli import pass_context
    
    36
    -from ...settings import INTERVAL_BUFFER
    
    37 36
     
    
    38 37
     
    
    39 38
     @click.group(name='bot', short_help="Create and register bot clients.")
    
    ... ... @@ -53,7 +52,8 @@ from ...settings import INTERVAL_BUFFER
    53 52
                   help="Public CAS client certificate for TLS (PEM-encoded)")
    
    54 53
     @click.option('--cas-server-cert', type=click.Path(exists=True, dir_okay=False), default=None,
    
    55 54
                   help="Public CAS server certificate for TLS (PEM-encoded)")
    
    56
    -@click.option('--update-period', type=click.FLOAT, default=30, show_default=True,
    
    55
    +# TODO change default to 30
    
    56
    +@click.option('--update-period', type=click.FLOAT, default=5, show_default=True,
    
    57 57
                   help="Time period for bot updates to the server in seconds.")
    
    58 58
     @click.option('--parent', type=click.STRING, default='main', show_default=True,
    
    59 59
                   help="Targeted farm resource.")
    

  • buildgrid/bot/bot.py
    ... ... @@ -37,10 +37,8 @@ class Bot:
    37 37
         def session(self, work, context):
    
    38 38
             loop = asyncio.get_event_loop()
    
    39 39
     
    
    40
    -        self._bot_session.create_bot_session(work, context)
    
    41
    -
    
    42 40
             try:
    
    43
    -            task = asyncio.ensure_future(self._update_bot_session())
    
    41
    +            task = asyncio.ensure_future(self._update_bot_session(work, context))
    
    44 42
                 loop.run_forever()
    
    45 43
             except KeyboardInterrupt:
    
    46 44
                 pass
    
    ... ... @@ -48,9 +46,12 @@ class Bot:
    48 46
                 task.cancel()
    
    49 47
                 loop.close()
    
    50 48
     
    
    51
    -    async def _update_bot_session(self):
    
    49
    +    async def _update_bot_session(self, work, context):
    
    52 50
             """
    
    53 51
             Calls the server periodically to inform the server the client has not died.
    
    54 52
             """
    
    55 53
             while True:
    
    56
    -            self._bot_session.update_bot_session()
    54
    +            sleep = self._bot_session.run_bot_session(work, context)
    
    55
    +            # If you get rid of this it breaks when actually executing commands
    
    56
    +            if sleep:
    
    57
    +                await asyncio.sleep(sleep)

  • buildgrid/bot/bot_interface.py
    ... ... @@ -21,8 +21,10 @@ Interface to grpc
    21 21
     """
    
    22 22
     
    
    23 23
     import logging
    
    24
    +import grpc
    
    24 25
     
    
    25 26
     from buildgrid._protos.google.devtools.remoteworkers.v1test2 import bots_pb2, bots_pb2_grpc
    
    27
    +from ..settings import INTERVAL_BUFFER
    
    26 28
     
    
    27 29
     
    
    28 30
     class BotInterface:
    
    ... ... @@ -39,10 +41,19 @@ class BotInterface:
    39 41
         def create_bot_session(self, parent, bot_session):
    
    40 42
             request = bots_pb2.CreateBotSessionRequest(parent=parent,
    
    41 43
                                                        bot_session=bot_session)
    
    42
    -        return self._stub.CreateBotSession(request)
    
    44
    +        return self._bot_call(self._stub.CreateBotSession, request)
    
    43 45
     
    
    44 46
         def update_bot_session(self, bot_session, update_mask=None):
    
    45 47
             request = bots_pb2.UpdateBotSessionRequest(name=bot_session.name,
    
    46 48
                                                        bot_session=bot_session,
    
    47 49
                                                        update_mask=update_mask)
    
    48
    -        return self._stub.UpdateBotSession(request, timeout=self._interval)
    50
    +        return self._bot_call(self._stub.UpdateBotSession, request)
    
    51
    +
    
    52
    +    def _bot_call(self, call, request):
    
    53
    +        try:
    
    54
    +            response = call(request, timeout=self._interval + INTERVAL_BUFFER)
    
    55
    +            return response
    
    56
    +        except grpc.RpcError as e:
    
    57
    +            if e.code() in grpc.StatusCode:
    
    58
    +                self.logger.warning("Server responded with error: {}".format(e.code()))
    
    59
    +                return None

  • buildgrid/bot/bot_session.py
    ... ... @@ -49,6 +49,7 @@ class BotSession:
    49 49
             self._bot_id = '{}.{}'.format(parent, platform.node())
    
    50 50
             self._context = None
    
    51 51
             self._interface = interface
    
    52
    +        self._connected = False
    
    52 53
             self._leases = {}
    
    53 54
             self._name = None
    
    54 55
             self._parent = parent
    
    ... ... @@ -63,12 +64,29 @@ class BotSession:
    63 64
         def add_worker(self, worker):
    
    64 65
             self._worker = worker
    
    65 66
     
    
    67
    +    def run_bot_session(self, work, context=None):
    
    68
    +        if self._connected is False:
    
    69
    +            self.create_bot_session(work, context)
    
    70
    +        else:
    
    71
    +            self.update_bot_session()
    
    72
    +
    
    73
    +        if self._connected is False:
    
    74
    +            return self._interface._interval
    
    75
    +        elif self._leases:
    
    76
    +            return 0.5
    
    77
    +        else:
    
    78
    +            return None
    
    79
    +
    
    66 80
         def create_bot_session(self, work, context=None):
    
    67 81
             self.logger.debug("Creating bot session")
    
    68 82
             self._work = work
    
    69 83
             self._context = context
    
    70 84
     
    
    71 85
             session = self._interface.create_bot_session(self._parent, self.get_pb2())
    
    86
    +        if session is None:
    
    87
    +            self._connected = False
    
    88
    +            return
    
    89
    +        self._connected = True
    
    72 90
             self._name = session.name
    
    73 91
     
    
    74 92
             self.logger.info("Created bot session with name: [{}]".format(self._name))
    
    ... ... @@ -79,10 +97,15 @@ class BotSession:
    79 97
         def update_bot_session(self):
    
    80 98
             self.logger.debug("Updating bot session: [{}]".format(self._bot_id))
    
    81 99
             session = self._interface.update_bot_session(self.get_pb2())
    
    100
    +        if session is None:
    
    101
    +            self._connected = False
    
    102
    +            return
    
    103
    +        self._connected = True
    
    82 104
             for k, v in list(self._leases.items()):
    
    83 105
                 if v.state == LeaseState.COMPLETED.value:
    
    84 106
                     del self._leases[k]
    
    85 107
     
    
    108
    +        print(self._leases)
    
    86 109
             for lease in session.leases:
    
    87 110
                 self._update_lease_from_server(lease)
    
    88 111
     
    

  • buildgrid/utils.py
    ... ... @@ -16,6 +16,8 @@
    16 16
     from operator import attrgetter
    
    17 17
     import os
    
    18 18
     import socket
    
    19
    +from contextlib import contextmanager
    
    20
    +import signal
    
    19 21
     
    
    20 22
     from buildgrid.settings import HASH
    
    21 23
     from buildgrid._protos.build.bazel.remote.execution.v2 import remote_execution_pb2
    



  • [Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]