[Notes] [Git][BuildStream/buildstream][mablanch/630-remote-execution-reconn] _sandboxremote.py: Try to reopen operation steam on failure



Title: GitLab

Martin Blanchard pushed to branch mablanch/630-remote-execution-reconn at BuildStream / buildstream

Commits:

1 changed file:

Changes:

  • buildstream/sandbox/_sandboxremote.py
    ... ... @@ -76,8 +76,7 @@ class SandboxRemote(Sandbox):
    76 76
             # Upload the Command message to the remote CAS server
    
    77 77
             command_digest = cascache.push_message(self._get_project(), remote_command)
    
    78 78
             if not command_digest or not cascache.verify_digest_pushed(self._get_project(), command_digest):
    
    79
    -            # Command push failed
    
    80
    -            return None
    
    79
    +            raise SandboxError("Failed pushing build command to remote CAS.")
    
    81 80
     
    
    82 81
             # Create and send the action.
    
    83 82
             action = remote_execution_pb2.Action(command_digest=command_digest,
    
    ... ... @@ -88,27 +87,57 @@ class SandboxRemote(Sandbox):
    88 87
             # Upload the Action message to the remote CAS server
    
    89 88
             action_digest = cascache.push_message(self._get_project(), action)
    
    90 89
             if not action_digest or not cascache.verify_digest_pushed(self._get_project(), action_digest):
    
    91
    -            # Action push failed
    
    92
    -            return None
    
    90
    +            raise SandboxError("Failed pushing build action to remote CAS.")
    
    93 91
     
    
    94 92
             # Next, try to create a communication channel to the BuildGrid server.
    
    95 93
             channel = grpc.insecure_channel(self.server_url)
    
    96 94
             stub = remote_execution_pb2_grpc.ExecutionStub(channel)
    
    97 95
             request = remote_execution_pb2.ExecuteRequest(action_digest=action_digest,
    
    98 96
                                                           skip_cache_lookup=False)
    
    99
    -        try:
    
    100
    -            operation_iterator = stub.Execute(request)
    
    101
    -        except grpc.RpcError:
    
    102
    -            return None
    
    97
    +
    
    98
    +        def __run_remote_command(stub, execute_request=None, running_operation=None):
    
    99
    +            try:
    
    100
    +                last_operation = None
    
    101
    +                if execute_request is not None:
    
    102
    +                    operation_iterator = stub.Execute(execute_request)
    
    103
    +                else:
    
    104
    +                    request = remote_execution_pb2.WaitExecutionRequest(name=running_operation.name)
    
    105
    +                    operation_iterator = stub.WaitExecution(request)
    
    106
    +
    
    107
    +                for operation in operation_iterator:
    
    108
    +                    if operation.done:
    
    109
    +                        return operation
    
    110
    +                    else:
    
    111
    +                        last_operation = operation
    
    112
    +            except grpc.RpcError as e:
    
    113
    +                status_code = e.code()
    
    114
    +                if status_code == grpc.StatusCode.UNAVAILABLE:
    
    115
    +                    raise SandboxError("Failed contacting remote execution server at {}."
    
    116
    +                                       .format(self.server_url))
    
    117
    +
    
    118
    +                elif status_code in (grpc.StatusCode.INVALID_ARGUMENT,
    
    119
    +                                     grpc.StatusCode.FAILED_PRECONDITION,
    
    120
    +                                     grpc.StatusCode.RESOURCE_EXHAUSTED,
    
    121
    +                                     grpc.StatusCode.INTERNAL,
    
    122
    +                                     grpc.StatusCode.DEADLINE_EXCEEDED):
    
    123
    +                    raise SandboxError("{} ({}).".format(e.details(), status_code.name))
    
    124
    +
    
    125
    +                elif running_operation and status_code == grpc.StatusCode.UNIMPLEMENTED:
    
    126
    +                    raise SandboxError("Failed trying to recover from connection loss: "
    
    127
    +                                       "server does not support operation status polling recovery.")
    
    128
    +
    
    129
    +            return last_operation
    
    103 130
     
    
    104 131
             operation = None
    
    105 132
             with self._get_context().timed_activity("Waiting for the remote build to complete"):
    
    106
    -            # It is advantageous to check operation_iterator.code() is grpc.StatusCode.OK here,
    
    107
    -            # which will check the server is actually contactable. However, calling it when the
    
    108
    -            # server is available seems to cause .code() to hang forever.
    
    109
    -            for operation in operation_iterator:
    
    110
    -                if operation.done:
    
    111
    -                    break
    
    133
    +            operation = __run_remote_command(stub, execute_request=request)
    
    134
    +            if operation is None:
    
    135
    +                return None
    
    136
    +            elif operation.done:
    
    137
    +                return operation
    
    138
    +
    
    139
    +            while operation is not None and not operation.done:
    
    140
    +                operation = __run_remote_command(stub, running_operation=operation)
    
    112 141
     
    
    113 142
             return operation
    
    114 143
     
    
    ... ... @@ -196,7 +225,6 @@ class SandboxRemote(Sandbox):
    196 225
     
    
    197 226
             if operation is None:
    
    198 227
                 # Failure of remote execution, usually due to an error in BuildStream
    
    199
    -            # NB This error could be raised in __run_remote_command
    
    200 228
                 raise SandboxError("No response returned from server")
    
    201 229
     
    
    202 230
             assert not operation.HasField('error') and operation.HasField('response')
    



  • [Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]