"Regression testing" for sysadmin problems



So, thinking about my previous mail, I came up with the idea that we
could basically just have a directory full of shell scripts, that
get run frequently and either pass or fail.

So, basically, if there were any failures, sysadmins would get a mail:

---
[FAIL] Latest Django version
[FAIL] SSL Certificate expiry
 
Please log into https://sysadmin.gnome.org/checks for more information
---

(I think it would be explicitly minimal like that without detail, since
people are never supposed to try and parse out "is this OK" from the
mail. If you get the mail, it's not OK.)

Then https://sysadmin.gnome.org/checks would be a simple static site,
with the front page being a list of failures and passes, with links to a
page for each test which would have:

 Detailed description of the test
 Output of the test

To get the idea of what scripts would do, I'll attach two scripts here;
one looks at release tags in Django version control and sees if the
version of Django in the local GNOME repository matches what has been
tagged most recently. Another checks the certificates for
extensions.gnome.org and bugzilla.gnome.org to see if they are expiring
in less than a month.

The basic idea of tests is that they are quick-and-dirty code,
designed to "fail closed" - that is, to report an error if the test
itself breaks. At that point, the test _must_ be fixed. These two tests
are written in Python, but you could mix in tests written in shell.

I know we have nagios, and there are all sorts of monitoring solutions
that have real development teams, but the basic idea is to try to put
in absolutely minimal effort and get to a single web page that someone
(like the GNOME board) could look at and see "is the sysadmin team doing
a minimally OK job".

Does this make any sense? If so, I can spend another day on it to create
the driver code that writes out the website and sends mail on failure.

- Owen

#!/usr/bin/python
# Latest Django release
#
# This test checks that the version of Django that we have packaged in our
# local GNOME repositories matches the latest release that upstream has made
# on the branch we are following.

# Upstream release series (major.minor) that this check follows
MAJOR_MINOR = '1.3'

import json
import re
import subprocess
import sys

def get_releases_from_bitbucket():
    """Return the Django release names reported by the Bitbucket tags API.

    Runs curl against the tags URL, decodes the JSON reply and, for every
    entry whose name starts with "releases/", keeps the part after that
    prefix.
    """
    url = 'https://api.bitbucket.org/1.0/repositories/django/django/tags'
    curl = subprocess.Popen(['curl', '-s', url], stdout=subprocess.PIPE)
    stdout, _ = curl.communicate()
    tags = json.loads(stdout)

    matches = [re.match('^releases/(.*)', tag) for tag in tags]
    return [m.group(1) for m in matches if m is not None]

def get_releases_from_svn():
    """Return the Django release names listed in the upstream SVN repository.

    Runs `svn ls` on the tags/releases/ directory and returns every entry
    that ends with a slash, with that trailing slash removed.
    """
    url = 'http://code.djangoproject.com/svn/django/tags/releases/'
    svn = subprocess.Popen(['svn', 'ls', url], stdout=subprocess.PIPE)
    listing = svn.communicate()[0]

    matches = [re.match('^(.*)/$', entry) for entry in listing.split('\n')]
    return [m.group(1) for m in matches if m is not None]

# Only the branch named by MAJOR_MINOR is considered here.  That branch
# running out of its upstream support lifetime is a different problem and
# has to be handled separately.
MAJOR_MINOR_SPLIT = list(map(int, MAJOR_MINOR.split('.')))

def latest_release(releases, branch=None):
    """Return the highest release in `releases` that belongs to `branch`.

    releases: iterable of dotted version strings such as "1.3.1".
    branch: [major, minor] sequence of ints selecting the release series
        to consider; defaults to MAJOR_MINOR_SPLIT.

    Entries with a non-numeric component (for example a tag like
    "1.3-rc-1") are skipped instead of aborting the whole check with a
    ValueError.

    Returns the version as a dotted string, or "0.0.0.0" when `releases`
    contains no release of the requested branch.
    """
    if branch is None:
        branch = MAJOR_MINOR_SPLIT
    branch = list(branch)

    max_release = [0, 0, 0, 0]
    for release in releases:
        try:
            version = [int(v) for v in release.split('.')]
        except ValueError:
            # Not a plain numeric release tag; ignore it
            continue
        if len(version) >= 2 and version[0:2] == branch:
            # Lists compare element by element, so 1.3.10 sorts above 1.3.9
            if version > max_release:
                max_release = version

    return '.'.join(str(x) for x in max_release)

def current_gnome_release():
    """Return the Django version available from the GNOME yum repositories.

    Asks yum to list the available Django package with only the gnome-*
    repositories enabled, skips the output up to and including the
    "Available Packages" line, and returns the version portion (digits and
    dots preceding a "-") of the first following line that matches.
    Returns None when no such line is found.
    """
    command = ['yum', '--disablerepo=*', '--enablerepo=gnome-*',
               'list', 'available', 'Django']
    yum = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, _ = yum.communicate()

    lines = stdout.split('\n')

    # Index of the first line after the "Available Packages" header; it
    # stays past the end when the header is missing so nothing is scanned.
    start = len(lines)
    for index, line in enumerate(lines):
        if re.match('^Available Packages', line):
            start = index + 1
            break

    for line in lines[start:]:
        found = re.match(r'^\S+\s+([\d\.]+)-.*', line)
        if found:
            return found.group(1)

    return None

latest_svn_release = latest_release(get_releases_from_svn())
latest_bitbucket_release = latest_release(get_releases_from_bitbucket())
latest_bitbucket_release = latest_svn_release

if latest_svn_release != latest_bitbucket_release:
    print "Latest tagged release retrieved from SVN is %s and from bitbucket is %s" % (latest_svn_release, latest_bitbucket_release)
    print "This means that the Django project has updated their version control systems"
    print "or our release retrieval code is failing."
    sys.exit(1)

current_release = current_gnome_release()

if current_gnome_release != latest_svn_release:
    print "Current GNOME package version of Django-%s is '%s'." % (MAJOR_MINOR, current_release)
    print "Current upstream released version is '%s'." % (latest_svn_release,)
    print "This mismatch probably means that we are missing security fixes."
    print "Our policy is that we must always package the latest upstream release."
    sys.exit(1)
#!/usr/bin/python
# SSL Certificate Expiry
# 
# This test checks that expiration time for the configured web servers
# is more than 30 days in the future

import re
import rfc822
import subprocess
import sys
import time

# Web servers whose SSL certificates are checked
HOSTS = ['bugzilla.gnome.org', 'extensions.gnome.org']

def get_ssl_expiry(host):
    process = subprocess.Popen(['openssl', 's_client', '-host', host, '-port', '443'],
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE, stderr=subprocess.PIPE)
    output, error = process.communicate("")
    if process.returncode != 0:
        print "Failed to download cert from", host
        print error
        return None

    cert_lines = []

    i = output.split('\n').__iter__()

    for line in i:
        if re.match('-----BEGIN CERTIFICATE-----', line):
            cert_lines.append(line)
            break

    for line in i:
        cert_lines.append(line)
        if re.match('-----END CERTIFICATE-----', line):
            break

    cert = '\n'.join(cert_lines)

    process = subprocess.Popen(['openssl', 'x509', '-dates', '-noout'],
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE, stderr=subprocess.PIPE)
    output, error = process.communicate(cert)
    if process.returncode != 0:
        print "Failed to parse cert"
        print error
        return None

    not_after=None
    for line in output.split('\n'):
        m = re.match('notAfter=(.*)', line)
        if m:
            not_after = m.group(1)

    if not_after is None:
        print "Failed to get date from cert"
        return None

    return not_after

failed = False
for host in HOSTS:
    not_after = get_ssl_expiry(host)
    expiry_time = time.mktime(rfc822.parsedate(not_after))
    if expiry_time < time.time() + 30 * 24 * 60 * 60:
        print "SSL Certificate for %s expires at %s." % (host, not_after)
        print "This is less than 30 days from now."
        print "The certificate needs to be renewed immediately."
        print
        failed = True
    else:
        print "%s: OK - (%s)" % (host, not_after)

if failed:
    sys.exit(1)



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]