[sysadmin-bin] inactive-gitlab-users.py: rewrite and add language and spam checks



commit 296a10b80e01676dd9890b8cd88ed2b9ad560133
Author: Bartłomiej Piotrowski <bpiotrowski gnome org>
Date:   Mon Jul 22 12:06:31 2019 +0200

    inactive-gitlab-users.py: rewrite and add language and spam checks

 gitlab/inactive-gitlab-users.py | 265 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 235 insertions(+), 30 deletions(-)
---
diff --git a/gitlab/inactive-gitlab-users.py b/gitlab/inactive-gitlab-users.py
old mode 100755
new mode 100644
index 78f5eda..088db38
--- a/gitlab/inactive-gitlab-users.py
+++ b/gitlab/inactive-gitlab-users.py
@@ -1,35 +1,240 @@
 #!/usr/bin/python
 
-import dateutil.relativedelta as relativedelta
-import datetime as dt
+from __future__ import print_function
+
+import argparse
+import datetime
+import json
+import os
+import re
+import sys
+
 import gitlab
+import polyglot.detect
+import pytz
+from dateutil.parser import parse as dateparser
+from dateutil.relativedelta import relativedelta
+from gitlab.exceptions import GitlabGetError
+from spam.surbl import SurblChecker
+from spam import DomainInexistentException
+
+
+def timestamp2date(timestamp):
+    if timestamp:
+        return timestamp.split("T")[0]
+    else:
+        return None
+
+
+def check_if_spam(url):
+    if not re.match("http[s]?://", url, re.IGNORECASE):
+        url = "https://{}".format(url)
+
+    try:
+        surblchecker = SurblChecker()
+        surbl = surblchecker.is_spam(url)
+    except:
+        surbl = False
+
+    return surbl
+
+
+def get_inactive_users(gl):
+    trusted_domains = [
+        "canonical.com",
+        "debian.org",
+        "endlessm.com",
+        "fedoraproject.org",
+        "gentoo.org",
+        "igalia.com",
+        "gnome.org",
+        "opensuse.org"
+        "redhat.com",
+        "suse.com",
+        "ubuntu.com",
+    ]
+
+    fields = [
+        "username",
+        "email",
+        "id",
+        "bio",
+        "website_url",
+        "created_at",
+        "current_sign_in_at",
+        "last_activity_on",
+    ]
+
+    trusted_users = gl.users.list(custom_attributes={"trusted": "true"}, all=True)
+    users = gl.users.list(as_list=False, order_by="created_at", sort="asc")
+    results = []
+
+    for user in users:
+        attrs = user.attributes
+        userdata = {field: str(attrs[field]) for field in fields}
+
+        if user in trusted_users:
+            continue
+
+        identities = [identity["provider"] for identity in attrs["identities"]]
+        if "ldapmain" in identities:
+            user.customattributes.set("trusted", "true")
+
+        if attrs["email"].split("@")[1] in trusted_domains:
+            user.customattributes.set("trusted", "true")
+            continue
+
+        if attrs["two_factor_enabled"]:
+            user.customattributes.set("trusted", "true")
+            continue
+
+        # Skip user if registered this month
+        timedelta = datetime.datetime.now(pytz.utc) - relativedelta(months=1)
+        if dateparser(attrs["created_at"]) > timedelta:
+            continue
+
+        created_at = timestamp2date(attrs["created_at"])
+        current_sign_in_at = timestamp2date(attrs["current_sign_in_at"])
+
+        # If user logged in only once or never, check if they made any action.
+        if (created_at == current_sign_in_at) or (not current_sign_in_at):
+            events = user.events.list(all=True)
+            if len(events) == 0:
+                userdata['reason'] = "inactivity"
+                results.append(userdata)
+                continue
+
+        if attrs["bio"] and len(attrs["bio"]) > 0:
+            # Some users set URL in bio, check it against surbl
+            if re.match("http[s]?://", attrs["bio"], re.IGNORECASE):
+                if check_if_spam(attrs["bio"]):
+                    userdata['reason'] = "spam"
+                    results.append(userdata)
+                    continue
+
+            # We have a problem with spam accounts with descriptions in some languages
+            try:
+                unwanted_langs = ["id", "es", "fr", "ms", "vi", "pt"]
+                detector = polyglot.detect.Detector(attrs["bio"])
+                lang = detector.language
+
+                if detector.reliable and lang.code in unwanted_langs and lang.confidence > 95:
+                    userdata['reason'] = "language"
+                    results.append(userdata)
+                    continue
+            except polyglot.detect.base.UnknownLanguage:
+                pass
+
+        if attrs["website_url"] and len(attrs["website_url"]) > 0:
+            if check_if_spam(attrs['website_url']):
+                userdata['reason'] = "spam"
+                results.append(userdata)
+                continue
+
+    return results
+
+
+def trust_user(gl, user_id):
+    user = gl.users.get(user_id, lazy=True)
+    user.customattributes.set("trusted", "true")
+    print(user_id)
+
+
+def untrust_user(gl, user_id):
+    user = gl.users.get(user_id, lazy=True)
+    user.customattributes.delete("trusted")
+    print(user_id)
+
+
+def delete_user(gl, user_id):
+    try:
+        gl.users.delete(user_id)
+        print(user_id)
+    except gitlab.exceptions.GitlabDeleteError:
+        pass
+
+
+def trust_all_groups(gl):
+    groups = gl.groups.list(all=True, visibility="public")
+    parent_groups = [grp for grp in groups if not grp.attributes["parent_id"]]
+    members = set()
+
+    for group in parent_groups:
+        group_members = group.members.all(all=True)
+        members.update(group_members)
+
+    for user in members:
+        trust_user(gl, user.id)
+
+
+def trust_2fa_users(gl):
+    users = gl.users.list(all=True, two_factor='enabled', as_list=False)
+
+    for user in users:
+        trust_user(gl, user.id)
+
+
+if __name__ == "__main__":
+    GITLAB_TOKEN = os.getenv("GITLAB_TOKEN")
+    if GITLAB_TOKEN is None:
+        with open("/home/admin/secret/gitlab_rw") as f:
+            tokenfile = f.readline()
+        GITLAB_TOKEN = tokenfile.rstrip().split("=")[1]
+
+    gl = gitlab.Gitlab(
+        "https://gitlab.gnome.org", private_token=GITLAB_TOKEN, per_page=100
+    )
+    gl.auth()
+
+    parser = argparse.ArgumentParser()
+    subparsers = parser.add_subparsers(dest="command")
+
+    inactive = subparsers.add_parser("get-inactive", help="get inactive users")
+    trust_groups = subparsers.add_parser(
+        "trust-groups", help="mark users which are member of any group as trusted"
+    )
+    trust_2fa = subparsers.add_parser(
+        "trust-2fa", help="mark users which have two factor authenticated enabled as trusted"
+    )
+
+    trust = subparsers.add_parser("trust", help="mark users as trusted")
+    trust.add_argument("user_id", nargs="+", help="user IDs to mark as trusted")
+
+    untrust = subparsers.add_parser("untrust", help="mark users as untrusted")
+    untrust.add_argument("user_id", nargs="+", help="user IDs to mark as untrusted")
+
+    delete = subparsers.add_parser("delete", help="delete users")
+    delete.add_argument("user_id", nargs="+", help="user IDs to delete")
+
+    delete_from_file = subparsers.add_parser(
+        "delete-from-json",
+        help="delete users from json file generated with get-inactive",
+    )
+    delete_from_file.add_argument("filename", help="path to json file")
+
+    args = parser.parse_args()
+
+    if args.command == "get-inactive":
+        inactive = get_inactive_users(gl)
+        print(json.dumps(inactive, indent=4, separators=(",", ": ")))
+    elif args.command == "trust":
+        for id in args.user_id:
+            trust_user(gl, id)
+    elif args.command == "untrust":
+        for id in args.user_id:
+            untrust_user(gl, id)
+    elif args.command == "delete":
+        for id in args.user_id:
+            delete_user(gl, id)
+    elif args.command == "delete-from-json":
+        with open(args.filename, "r") as f:
+            users = json.load(f)
 
-execfile('/home/admin/secret/gitlab_rw')
-gl = gitlab.Gitlab('https://gitlab.gnome.org', GITLAB_PRIVATE_RW_TOKEN, api_version=4)
-
-users = gl.users.list(all=True, per_page=100)
-today = dt.date.today()
-timedelta = relativedelta.relativedelta(months=3)
-
-whitelist = ['debian', 'ubuntu', 'redhat',
-             'canonical', 'suse', 'fedoraproject',
-             'gnome', 'gentoo']
-
-print 'username,email,id,created_at,current_sign_in_at'
-
-is_ldap = False
-for user in users:
-    for index, _ in enumerate(user.attributes['identities']):
-        if user.attributes['identities'][index]['provider'] == 'ldapmain':
-            is_ldap = True
-
-    if not is_ldap:
-        if user.attributes['username'] != 'ghost':
-            if not user.attributes['last_activity_on'] and user.attributes['current_sign_in_at']:
-                if user.attributes['email'].split('@')[1].split('.')[-2] not in whitelist:
-                    if user.attributes['created_at'].split('T')[0] == user.attributes['current_sign_in_at'].split('T')[0]:
-                        if len(user.events.list()) == 0:
-                            if dt.datetime.strptime(user.attributes['created_at'].split('T')[0], '%Y-%m-%d').date() < (today - timedelta):
-                                print '{},{},{},{},{}'.format(user.attributes['username'], user.attributes['email'], user.attributes['id'], user.attributes['created_at'].split('T')[0], user.attributes['current_sign_in_at'].split('T')[0])
+        for user in users:
+            delete_user(gl, user["id"])
+    elif args.command == "trust-groups":
+        trust_all_groups(gl)
+    elif args.command == "trust-2fa":
+        trust_2fa_users(gl)
     else:
-        is_ldap = False
+        parser.print_help()


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]