[Notes] [Git][BuildStream/buildstream][tristan/fix-required-artifacts] 6 commits: _artifactcache/artifactcache.py: Changes to mark_required_elements()



Tristan Van Berkom pushed to branch tristan/fix-required-artifacts at BuildStream / buildstream

Commits:

7 changed files:

Changes:

  • buildstream/_artifactcache/artifactcache.py
    @@ -87,7 +87,7 @@ class ArtifactCache():
             self.global_remote_specs = []
             self.project_remote_specs = {}

    -        self._required_artifacts = set()      # The artifacts required for this session
    +        self._required_elements = set()       # The elements required for this session
             self._cache_size = None               # The current cache size, sometimes it's an estimate
             self._cache_quota = None              # The cache quota
             self._cache_lower_threshold = None    # The target cache size for a cleanup

    @@ -189,33 +189,40 @@ class ArtifactCache():
                                       (str(provenance)))
             return cache_specs

    -    # append_required_artifacts():
    +    # mark_required_elements():
         #
    -    # Append to the list of elements whose artifacts are required for
    -    # the current run. Artifacts whose elements are in this list will
    -    # be locked by the artifact cache and not touched for the duration
    -    # of the current pipeline.
    +    # Mark elements whose artifacts are required for the current run.
    +    #
    +    # Artifacts whose elements are in this list will be locked by the artifact
    +    # cache and not touched for the duration of the current pipeline.
         #
         # Args:
         #     elements (iterable): A set of elements to mark as required
         #
    -    def append_required_artifacts(self, elements):
    -        # We lock both strong and weak keys - deleting one but not the
    -        # other won't save space in most cases anyway, but would be a
    -        # user inconvenience.
    +    def mark_required_elements(self, elements):
    +
    +        # We risk calling this function with a generator, so we
    +        # better consume it first.
    +        #
    +        elements = list(elements)
    +
    +        # Mark the elements as required. We cannot know that we know the
    +        # cache keys yet, so we only check that later when deleting.
    +        #
    +        self._required_elements.update(elements)

    +        # For the cache keys which were resolved so far, we bump
    +        # the atime of them.
    +        #
    +        # This is just in case we have concurrent instances of
    +        # BuildStream running with the same artifact cache, it will
    +        # reduce the likelyhood of one instance deleting artifacts
    +        # which are required by the other.
             for element in elements:
                 strong_key = element._get_cache_key(strength=_KeyStrength.STRONG)
                 weak_key = element._get_cache_key(strength=_KeyStrength.WEAK)
    -
                 for key in (strong_key, weak_key):
    -                if key and key not in self._required_artifacts:
    -                    self._required_artifacts.add(key)
    -
    -                    # We also update the usage times of any artifacts
    -                    # we will be using, which helps preventing a
    -                    # buildstream process that runs in parallel with
    -                    # this one from removing artifacts in-use.
    +                if key:
                         try:
                             self.update_atime(key)
                         except ArtifactError:

    @@ -231,6 +238,18 @@ class ArtifactCache():
         def clean(self):
             artifacts = self.list_artifacts()

    +        # Build a set of the cache keys which are required
    +        # based on the required elements at cleanup time
    +        #
    +        # We lock both strong and weak keys - deleting one but not the
    +        # other won't save space, but would be a user inconvenience.
    +        required_artifacts = set()
    +        for element in self._required_elements:
    +            required_artifacts.update([
    +                element._get_cache_key(strength=_KeyStrength.STRONG),
    +                element._get_cache_key(strength=_KeyStrength.WEAK)
    +            ])
    +
             # Do a real computation of the cache size once, just in case
             self.compute_cache_size()

    @@ -256,7 +275,7 @@ class ArtifactCache():
                         break

                 key = to_remove.rpartition('/')[2]
    -            if key not in self._required_artifacts:
    +            if key not in required_artifacts:

                     # Remove the actual artifact, if it's not required.
                     size = self.remove(to_remove)

  • buildstream/_stream.py
    @@ -938,13 +938,10 @@ class Stream():
             # Set the "required" artifacts that should not be removed
             # while this pipeline is active
             #
    -        # FIXME: The set of required artifacts is only really needed
    -        #        for build and pull tasks.
    +        # It must include all the artifacts which are required by the
    +        # final product. Note that this is a superset of the build plan.
             #
    -        #        It must include all the artifacts which are required by the
    -        #        final product. Note that this is a superset of the build plan.
    -        #
    -        self._artifacts.append_required_artifacts((e for e in self._pipeline.dependencies(elements, Scope.ALL)))
    +        self._artifacts.mark_required_elements(self._pipeline.dependencies(elements, Scope.ALL))

             if selection == PipelineSelection.PLAN and dynamic_plan:
                 # We use a dynamic build plan, only request artifacts of top-level targets,

  • buildstream/element.py
    @@ -1486,15 +1486,20 @@ class Element(Plugin):
                 workspace.clear_running_files()
                 self._get_context().get_workspaces().save_config()

    -            # We also need to update the required artifacts, since
    -            # workspaced dependencies do not have a fixed cache key
    -            # when the build starts.
    +            # This element will have already been marked as
    +            # required, but we bump the atime again, in case
    +            # we did not know the cache key until now.
                 #
    -            # This does *not* cause a race condition, because
    -            # _assemble_done is called before a cleanup job may be
    -            # launched.
    +            # FIXME: This is not exactly correct, we should be
    +            #        doing this at the time which we have discovered
    +            #        a new cache key, this just happens to be the
    +            #        last place where that can happen.
                 #
    -            self.__artifacts.append_required_artifacts([self])
    +            #        Ultimately, we should be refactoring
    +            #        Element._update_state() such that we know
    +            #        when a cache key is actually discovered.
    +            #
    +            self.__artifacts.mark_required_elements([self])

         # _assemble():
         #

  • tests/artifactcache/expiry.py
    @@ -24,7 +24,7 @@ import pytest
     from buildstream import _yaml
     from buildstream._exceptions import ErrorDomain, LoadErrorReason

    -from tests.testutils import cli, create_element_size, wait_for_cache_granularity
    +from tests.testutils import cli, create_element_size, update_element_size, wait_for_cache_granularity


     DATA_DIR = os.path.join(

    @@ -93,6 +93,7 @@ def test_artifact_too_large(cli, datafiles, tmpdir, size):
         create_element_size('target.bst', project, element_path, [], size)
         res = cli.run(project=project, args=['build', 'target.bst'])
         res.assert_main_error(ErrorDomain.STREAM, None)
    +    res.assert_task_error(ErrorDomain.ARTIFACT, 'cache-too-full')


     @pytest.mark.datafiles(DATA_DIR)

    @@ -196,24 +197,8 @@ def test_keep_dependencies(cli, datafiles, tmpdir):


     # Assert that we never delete a dependency required for a build tree
    -#
    -# NOTE: This test expects that a build will fail if it attempts to
    -#       put more artifacts in the cache than the quota can hold,
    -#       and expects that the last two elements which don't fit into
    -#       the quota wont even be built.
    -#
    -#       In real life, this will not be the case, since once we reach
    -#       the estimated quota we launch a cache size calculation job and
    -#       only launch a cleanup job when the size is calculated; and
    -#       other build tasks will be scheduled while the cache size job
    -#       is running.
    -#
    -#       This test only passes because we configure `builders` to 1,
    -#       ensuring that the cache size job runs exclusively since it
    -#       also requires a compute resource (a "builder").
    -#
     @pytest.mark.datafiles(DATA_DIR)
    -def test_never_delete_dependencies(cli, datafiles, tmpdir):
    +def test_never_delete_required(cli, datafiles, tmpdir):
         project = os.path.join(datafiles.dirname, datafiles.basename)
         element_path = 'elements'

    @@ -226,37 +211,94 @@ def test_never_delete_dependencies(cli, datafiles, tmpdir):
             }
         })

    -    # Create a build tree
    -    create_element_size('dependency.bst', project,
    -                        element_path, [], 8000000)
    -    create_element_size('related.bst', project,
    -                        element_path, ['dependency.bst'], 8000000)
    -    create_element_size('target.bst', project,
    -                        element_path, ['related.bst'], 8000000)
    -    create_element_size('target2.bst', project,
    -                        element_path, ['target.bst'], 8000000)
    +    # Create a linear build tree
    +    create_element_size('dep1.bst', project, element_path, [], 8000000)
    +    create_element_size('dep2.bst', project, element_path, ['dep1.bst'], 8000000)
    +    create_element_size('dep3.bst', project, element_path, ['dep2.bst'], 8000000)
    +    create_element_size('target.bst', project, element_path, ['dep3.bst'], 8000000)

         # We try to build this pipeline, but it's too big for the
         # cache. Since all elements are required, the build should fail.
    -    res = cli.run(project=project, args=['build', 'target2.bst'])
    +    res = cli.run(project=project, args=['build', 'target.bst'])
         res.assert_main_error(ErrorDomain.STREAM, None)
    +    res.assert_task_error(ErrorDomain.ARTIFACT, 'cache-too-full')

    -    assert cli.get_element_state(project, 'dependency.bst') == 'cached'
    +    # Only the first artifact fits in the cache, but we expect
    +    # that the first *two* artifacts will be cached.
    +    #
    +    # This is because after caching the first artifact we must
    +    # proceed to build the next artifact, and we cannot really
    +    # know how large an artifact will be until we try to cache it.
    +    #
    +    # In this case, we deem it more acceptable to not delete an
    +    # artifact which caused the cache to outgrow the quota.
    +    #
    +    # Note that this test only works because we have forced
    +    # the configuration to build one element at a time, in real
    +    # life there may potentially be N-builders cached artifacts
    +    # which exceed the quota
    +    #
    +    assert cli.get_element_state(project, 'dep1.bst') == 'cached'
    +    assert cli.get_element_state(project, 'dep2.bst') == 'cached'
    +
    +    assert cli.get_element_state(project, 'dep3.bst') != 'cached'
    +    assert cli.get_element_state(project, 'target.bst') != 'cached'
    +
    +
    +# Assert that we never delete a dependency required for a build tree,
    +# even when the artifact cache was previously populated with
    +# artifacts we do not require, and the new build is run with dynamic tracking.
    +#
    +@pytest.mark.datafiles(DATA_DIR)
    +def test_never_delete_required_track(cli, datafiles, tmpdir):
    +    project = os.path.join(datafiles.dirname, datafiles.basename)
    +    element_path = 'elements'
    +
    +    cli.configure({
    +        'cache': {
    +            'quota': 10000000
    +        },
    +        'scheduler': {
    +            'builders': 1
    +        }
    +    })

    -    # This is *technically* above the cache limit. BuildStream accepts
    -    # some fuzziness, since it's hard to assert that we don't create
    -    # an artifact larger than the cache quota. We would have to remove
    -    # the artifact after-the-fact, but since it is required for the
    -    # current build and nothing broke yet, it's nicer to keep it
    -    # around.
    +    # Create a linear build tree
    +    repo_dep1 = create_element_size('dep1.bst', project, element_path, [], 2000000)
    +    repo_dep2 = create_element_size('dep2.bst', project, element_path, ['dep1.bst'], 2000000)
    +    repo_dep3 = create_element_size('dep3.bst', project, element_path, ['dep2.bst'], 2000000)
    +    repo_target = create_element_size('target.bst', project, element_path, ['dep3.bst'], 2000000)
    +
    +    # This should all fit into the artifact cache
    +    res = cli.run(project=project, args=['build', 'target.bst'])
    +    res.assert_success()
    +
    +    # They should all be cached
    +    assert cli.get_element_state(project, 'dep1.bst') == 'cached'
    +    assert cli.get_element_state(project, 'dep2.bst') == 'cached'
    +    assert cli.get_element_state(project, 'dep3.bst') == 'cached'
    +    assert cli.get_element_state(project, 'target.bst') == 'cached'
    +
    +    # Now increase the size of all the elements
         #
    -    # This scenario is quite unlikely, and the cache overflow will be
    -    # resolved if the user does something about it anyway.
    +    update_element_size('dep1.bst', project, repo_dep1, 8000000)
    +    update_element_size('dep2.bst', project, repo_dep2, 8000000)
    +    update_element_size('dep3.bst', project, repo_dep3, 8000000)
    +    update_element_size('target.bst', project, repo_target, 8000000)
    +
    +    # Now repeat the same test we did in test_never_delete_required(),
    +    # except this time let's add dynamic tracking
         #
    -    assert cli.get_element_state(project, 'related.bst') == 'cached'
    +    res = cli.run(project=project, args=['build', '--track-all', 'target.bst'])
    +    res.assert_main_error(ErrorDomain.STREAM, None)
    +    res.assert_task_error(ErrorDomain.ARTIFACT, 'cache-too-full')

    +    # Expect the same result that we did in test_never_delete_required()
    +    #
    +    assert cli.get_element_state(project, 'dep1.bst') == 'cached'
    +    assert cli.get_element_state(project, 'dep2.bst') == 'cached'
    +    assert cli.get_element_state(project, 'dep3.bst') != 'cached'
         assert cli.get_element_state(project, 'target.bst') != 'cached'
    -    assert cli.get_element_state(project, 'target2.bst') != 'cached'


     # Ensure that only valid cache quotas make it through the loading

  • tests/testutils/__init__.py
    @@ -26,6 +26,6 @@
     from .runcli import cli, cli_integration
     from .repo import create_repo, ALL_REPO_KINDS
     from .artifactshare import create_artifact_share
    -from .element_generators import create_element_size
    +from .element_generators import create_element_size, update_element_size
     from .junction import generate_junction
     from .runner_integration import wait_for_cache_granularity

  • tests/testutils/element_generators.py
     import os

     from buildstream import _yaml
    +from buildstream import utils
    +
    +from . import create_repo


     # create_element_size()
     #
    -# This will open a "<name>_data" file for writing and write
    -# <size> MB of urandom (/dev/urandom) "stuff" into the file.
    -# A bst import element file is then created: <name>.bst
    +# Creates an import element with a git repo, using random
    +# data to create a file in that repo of the specified size,
    +# such that building it will add an artifact of the specified
    +# size to the artifact cache.
     #
     # Args:
    -#  name: (str) of the element name (e.g. target.bst)
    -#  path: (str) pathway to the project/elements directory
    -#  dependencies: A list of strings (can also be an empty list)
    -#  size: (int) size of the element in bytes
    +#    name: (str) of the element name (e.g. target.bst)
    +#    project_dir (str): The path to the project
    +#    element_path (str): The element path within the project
    +#    dependencies: A list of strings (can also be an empty list)
    +#    size: (int) size of the element in bytes
     #
     # Returns:
    -#  Nothing (creates a .bst file of specified size)
    +#    (Repo): A git repo which can be used to introduce trackable changes
    +#            by using the update_element_size() function below.
     #
     def create_element_size(name, project_dir, elements_path, dependencies, size):
         full_elements_path = os.path.join(project_dir, elements_path)
         os.makedirs(full_elements_path, exist_ok=True)

    -    # Create a file to be included in this element's artifact
    -    with open(os.path.join(project_dir, name + '_data'), 'wb+') as f:
    -        f.write(os.urandom(size))
    +    # Create a git repo
    +    repodir = os.path.join(project_dir, 'repos')
    +    repo = create_repo('git', repodir, subdir=name)
    +
    +    with utils._tempdir(dir=project_dir) as tmp:
    +
    +        # We use a data/ subdir in the git repo we create,
    +        # and we set the import element to only extract that
    +        # part; this ensures we never include a .git/ directory
    +        # in the cached artifacts for these sized elements.
    +        #
    +        datadir = os.path.join(tmp, 'data')
    +        os.makedirs(datadir)
    +
    +        # Use /dev/urandom to create the sized file in the datadir
    +        with open(os.path.join(datadir, name), 'wb+') as f:
    +            f.write(os.urandom(size))
    +
    +        # Create the git repo from the temp directory
    +        ref = repo.create(tmp)

    -    # Simplest case: We want this file (of specified size) to just
    -    # be an import element.
         element = {
             'kind': 'import',
             'sources': [
    -            {
    -                'kind': 'local',
    -                'path': name + '_data'
    -            }
    +            repo.source_config(ref=ref)
             ],
    +        'config': {
    +            # Extract only the data directory
    +            'source': 'data'
    +        },
             'depends': dependencies
         }
         _yaml.dump(element, os.path.join(project_dir, elements_path, name))
    +
    +    # Return the repo, so that it can later be used to add commits
    +    return repo
    +
    +
    +# update_element_size()
    +#
    +# Updates a repo returned by create_element_size() such that
    +# the newly added commit is completely changed, and has the newly
    +# specified size.
    +#
    +# The name and project_dir arguments must match the arguments
    +# previously given to create_element_size()
    +#
    +# Args:
    +#    name: (str) of the element name (e.g. target.bst)
    +#    project_dir (str): The path to the project
    +#    repo: (Repo) The Repo returned by create_element_size()
    +#    size: (int) The new size which the element generates, in bytes
    +#
    +# Returns:
    +#    (Repo): A git repo which can be used to introduce trackable changes
    +#            by using the update_element_size() function below.
    +#
    +def update_element_size(name, project_dir, repo, size):
    +
    +    with utils._tempdir(dir=project_dir) as tmp:
    +
    +        new_file = os.path.join(tmp, name)
    +
    +        # Use /dev/urandom to create the sized file in the datadir
    +        with open(new_file, 'wb+') as f:
    +            f.write(os.urandom(size))
    +
    +        # Modify the git repo with a new commit to the same path,
    +        # replacing the original file with a new one.
    +        repo.modify_file(new_file, os.path.join('data', name))
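
    Taken together, the two helpers are intended to be used the way the new expiry
    tests use them: create a sized import element once, then grow it so that a
    later --track-all build picks up a new ref. A hedged usage sketch, only
    meaningful inside the BuildStream source tree, against a test project directory
    that already has a project.conf (the path below is illustrative):

        from tests.testutils import create_element_size, update_element_size

        project_dir = '/path/to/test/project'   # hypothetical test project
        element_path = 'elements'

        # Create an import element backed by a git repo holding ~2MB of random data
        repo = create_element_size('dep1.bst', project_dir, element_path, [], 2000000)

        # Later, replace the data with ~8MB; the repo gains a new commit, so a
        # subsequent `bst build --track-all` will track the new ref and rebuild.
        update_element_size('dep1.bst', project_dir, repo, 8000000)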

  • tests/testutils/repo/git.py
    @@ -52,6 +52,13 @@ class Git(Repo):
             self._run_git('commit', '-m', 'Added {}'.format(os.path.basename(filename)))
             return self.latest_commit()

    +    def modify_file(self, new_file, path):
    +        shutil.copy(new_file, os.path.join(self.repo, path))
    +        subprocess.call([
    +            'git', 'commit', path, '-m', 'Modified {}'.format(os.path.basename(path))
    +        ], env=GIT_ENV, cwd=self.repo)
    +        return self.latest_commit()
    +
         def add_submodule(self, subdir, url=None, checkout=None):
             submodule = {}
             if checkout is not None:
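
    For reference, the new modify_file() helper amounts to "overwrite the tracked
    file, then git commit <path>". A minimal standalone equivalent of that pattern
    (throwaway temporary repo, requires git on PATH; the explicit identity
    variables below stand in for whatever GIT_ENV provides in the test utility):

        import os
        import shutil
        import subprocess
        import tempfile

        with tempfile.TemporaryDirectory() as workdir:
            repo = os.path.join(workdir, 'repo')
            os.makedirs(repo)
            env = {**os.environ,
                   'GIT_AUTHOR_NAME': 'tester', 'GIT_AUTHOR_EMAIL': 'tester@example.com',
                   'GIT_COMMITTER_NAME': 'tester', 'GIT_COMMITTER_EMAIL': 'tester@example.com'}
            subprocess.check_call(['git', 'init', '.'], cwd=repo, env=env)

            # Initial commit of a tracked file
            with open(os.path.join(repo, 'data'), 'w') as f:
                f.write('old contents\n')
            subprocess.check_call(['git', 'add', 'data'], cwd=repo, env=env)
            subprocess.check_call(['git', 'commit', '-m', 'Added data'], cwd=repo, env=env)

            # The modify_file() pattern: replace the file, commit only that path
            new_file = os.path.join(workdir, 'replacement')
            with open(new_file, 'w') as f:
                f.write('new contents\n')
            shutil.copy(new_file, os.path.join(repo, 'data'))
            subprocess.check_call(['git', 'commit', 'data', '-m', 'Modified data'],
                                  cwd=repo, env=env)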
    


