[gimp-web/wip/Jehan/fix-ci: 1/2] tools: run the HTTP checks concurrently.




commit 4243faa07e9105ac63f03460a70361676d15b46d
Author: Jehan <jehan girinstud io>
Date:   Sun May 1 16:01:00 2022 +0200

    tools: run the HTTP checks concurrently.
    
    With our now 30 mirrors, the whole check just takes too long, especially
    with checksum verification. It's even worse in the CI, where jobs are
    now limited to 1 hour at the platform level, so our check can basically
    never finish there.
    Running the checks concurrently makes the whole mirror verification much faster.

 tools/downloads/gimp-check-mirrors.py | 122 ++++++++++++++++++----------------
 1 file changed, 66 insertions(+), 56 deletions(-)
---
diff --git a/tools/downloads/gimp-check-mirrors.py b/tools/downloads/gimp-check-mirrors.py
index 8fd31496..2ba16305 100755
--- a/tools/downloads/gimp-check-mirrors.py
+++ b/tools/downloads/gimp-check-mirrors.py
@@ -2,7 +2,7 @@
 
 import os
 import argparse
-#import concurrent.futures
+import concurrent.futures
 import fileinput
 import hashlib
 import re
@@ -93,6 +93,44 @@ def find_latest():
 
     return latest
 
+def verify_remote(uri, checksum):
+  """Check one mirror URI and return True when the check passes.
+
+  If `checksum` (SHA-256 digest bytes) is given, download the file and
+  compare digests; otherwise only require a 200 answer to a HEAD
+  request. Request errors are printed and reported as failure.
+  """
+  success = False
+  try:
+    if checksum is not None:
+      with requests.get(uri, stream=True, timeout=20) as response:
+        m = hashlib.sha256()
+        # 2**16 is just a reasonable upper bound on the chunk size.
+        for chunk in response.iter_content(chunk_size=65536, decode_unicode=False):
+          m.update(chunk)
+        # Compare against the digest passed in, not module-level state.
+        success = (m.digest() == checksum)
+        checksum_text = ' (checksum OK)' if success else ' (checksum KO)'
+        print(str(response.status_code) + ' : ' + uri + checksum_text)
+    else:
+      response = requests.head(url=uri, timeout=20, allow_redirects=True)
+      print(str(response.status_code) + ' : ' + uri)
+      success = (response.status_code == 200)
+  # ConnectTimeout subclasses ConnectionError, so it must come first.
+  except requests.exceptions.ConnectTimeout:
+    print('Connection timed out: ' + uri)
+  except requests.exceptions.ConnectionError:
+    print('Connection error: ' + uri)
+  except requests.exceptions.ReadTimeout:
+    print('Read timed out: ' + uri)
+  except requests.exceptions.TooManyRedirects:
+    print('Too many redirects: ' + uri)
+  except OSError as error:
+    print(str(error.strerror) + ' : ' + uri)
+  sys.stdout.flush()
+
+  return success
+
 if len(args.uris) == 0:
     print("No URIs given as argument. Trying to guess the last packages.")
     local_uris = find_latest()
@@ -111,63 +149,35 @@ check_count = 0
 for local_uri in local_uris:
     print("Checking: {}".format(local_uri))
 
+    origin_checksum = None
     if args.verify_checksum:
-        with requests.get('https://download.gimp.org/pub/gimp/' + local_uri, stream=True) as response:
-          origin_sum = hashlib.sha256()
-          for line in response.iter_content(chunk_size=65536, decode_unicode=False):
-            origin_sum.update(line)
-
-    # read mirrors file
-    # fileinput.
-    #with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
-    with fileinput.input(files=(args.mirrorsfile), mode='r') as f:
+      with requests.get('https://download.gimp.org/pub/gimp/' + local_uri, stream=True) as response:
+        origin_sum = hashlib.sha256()
+        for line in response.iter_content(chunk_size=65536, decode_unicode=False):
+          origin_sum.update(line)
+      origin_checksum = origin_sum.digest()
+
+    test_results = []
+
+    # The docs say: "If max_workers is None or not given, it will default
+    # to the number of processors on the machine, multiplied by 5",
+    # which is fine by us.
+    with concurrent.futures.ThreadPoolExecutor(max_workers=None) as executor:
+      with fileinput.input(files=(args.mirrorsfile), mode='r') as f:
         for line in f:
-            if args.verify_checksum and line.strip() == 'https://download.gimp.org/pub/gimp/':
-              # Passing the main server which is also in the list. It's
-              # our checksum base, we don't need to check it again.
-              continue
-            mirror_uri = line.strip() + local_uri
-            check_count += 1
-
-            try:
-                if args.verify_checksum:
-                  with requests.get(mirror_uri, stream=True) as response:
-                    m = hashlib.sha256()
-                    # I don't think the chunk_size is useful, since docs
-                    # says that "stream=True will read data as it arrives
-                    # in whatever size the chunks are received" which is
-                    # the ideal way. But if it doesn't, let's use 2**16 as
-                    # a reasonable chunk size to process.
-                    for line in response.iter_content(chunk_size=65536, decode_unicode=False):
-                      m.update(line)
-                    if m.digest() == origin_sum.digest():
-                      checksum_text = ' (checksum OK)'
-                    else:
-                      checksum_text = ' (checksum KO)'
-                      error_count += 1
-                    print(str(response.status_code) + ' : ' + mirror_uri + checksum_text)
-                else:
-                  response = requests.head(url=mirror_uri, timeout=20, allow_redirects=True)
-                  print(str(response.status_code) + ' : ' + mirror_uri)
-                  if response.status_code != 200:
-                      error_count += 1
-            except requests.exceptions.ConnectionError as error:
-                error_count += 1
-                print('Connection error: ' + mirror_uri)
-            except requests.exceptions.ConnectTimeout as error:
-                error_count += 1
-                print('Connection timed out: ' + mirror_uri)
-            except requests.exceptions.ReadTimeout as error:
-                error_count += 1
-                print('Read timed out: ' + mirror_uri)
-            except requests.exceptions.TooManyRedirects as error:
-                error_count += 1
-                print('Too many redirects: ' + mirror_uri)
-            except OSError as error:
-                error_count += 1
-                print(str(error.strerror) + ' : ' + mirror_uri)
-            sys.stdout.flush()
-        print()
+          if args.verify_checksum and line.strip() == 'https://download.gimp.org/pub/gimp/':
+            # Passing the main server which is also in the list. It's
+            # our checksum base, we don't need to check it again.
+            continue
+          mirror_uri = line.strip() + local_uri
+          check_count += 1
+
+          future = executor.submit(verify_remote, mirror_uri, origin_checksum)
+          test_results += [future]
+    for future in test_results:
+      if not future.result():
+        error_count += 1
+    print()
 
 if error_count == 0:
     sys.exit(os.EX_OK)


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]