[nanny] Add initial DansGuardianImporter



commit 9bd50505e28154beae322075371aaf6914ba65c5
Author: Roberto Majadas <roberto majadas openshine com>
Date:   Fri Jan 29 04:26:56 2010 +0100

    Add initial DansGuardianImporter

 daemon/src/DansGuardianImporter.py |  223 ++++++++++++++++++++++++++++++++++++
 daemon/src/Makefile.am             |   19 ++--
 2 files changed, 233 insertions(+), 9 deletions(-)
---
diff --git a/daemon/src/DansGuardianImporter.py b/daemon/src/DansGuardianImporter.py
new file mode 100644
index 0000000..5427201
--- /dev/null
+++ b/daemon/src/DansGuardianImporter.py
@@ -0,0 +1,223 @@
+#!/usr/bin/python
+
+# Copyright (C) 2009 Junta de Andalucia
+# 
+# Authors:
+#   Roberto Majadas <roberto.majadas at openshine.com>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public
+# License as published by the Free Software Foundation; either
+# version 2 of the License, or (at your option) any later version.
+
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Library General Public License for more details.
+
+# You should have received a copy of the GNU General Public
+# License along with this library; if not, write to the Free
+# Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+import os
+import tempfile
+import tarfile
+
+import gobject
+import gio
+
+import sqlite3
+
+from hachoir_regex import PatternMatching
+
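+# DansGuardianImporter converts a DansGuardian-style blacklist tarball
+# (per-category "domains" and "urls" files) into an sqlite database of
+# combined regular expressions, one table each for black/white domains and urls.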
+class DansGuardianImporter (gobject.GObject):
+    __gsignals__ = {
+        'progress-status' : (gobject.SIGNAL_RUN_LAST, gobject.TYPE_NONE,
+                             (gobject.TYPE_PYOBJECT, gobject.TYPE_PYOBJECT,)),
+        }
+
+    def __init__(self, in_url, out_path):
+        gobject.GObject.__init__(self)
+        self.in_url = in_url
+        self.out_path = out_path
+        self.conn = None
+
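+    # Import pipeline: create the output database, copy the tarball to a
+    # local temporary file, then convert its contents.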
+    def run(self):
+        self.__create_sqlite()
+        self.__copy_dansguardian_file()
+        self.__dansguardian_2_sqlite()
+        self.conn.close()
+
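+    # Create the output database (moving any existing file aside as .bak)
+    # with one table per list type.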
+    def __create_sqlite(self):
+        if os.path.exists(self.out_path) :
+            gio.File(self.out_path).move(gio.File(self.out_path + ".bak"))
+
+        self.conn = sqlite3.connect(self.out_path)
+        c = self.conn.cursor()
+        c.execute('create table black_domains (category text, regexp text)')
+        c.execute('create table black_urls (category text, regexp text)')
+        c.execute('create table white_domains (category text, regexp text)')
+        c.execute('create table white_urls (category text, regexp text)')
+        self.conn.commit()
+
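+    # gio copy progress callback; currently a no-op.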
+    def __download_dansguardian_progress_cb(self, current, total, data):
+        pass
+
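+    # Copy the source tarball referenced by in_url into a temporary directory.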
+    def __copy_dansguardian_file(self):
+        self.tmp_dansguardian = os.path.join(tempfile.mkdtemp(), os.path.basename(self.in_url))
+        gio.File(self.in_url).copy(gio.File(self.tmp_dansguardian),
+                                   self.__download_dansguardian_progress_cb,
+                                   0, None, None)
+
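+    # Walk the tarball: every "domains" or "urls" file is imported under its
+    # parent directory name as the category.  Members whose path contains
+    # "whitelist" go to the white tables, everything else to the black ones.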
+    def __dansguardian_2_sqlite(self):
+        tfile = tarfile.open(self.tmp_dansguardian)
+        for member in tfile :
+            m_fd = None
+            if "whitelist" in member.name.lower() :
+                is_black = False
+            else:
+                is_black = True
+
+            if os.path.basename(member.name) == "urls" and member.isfile():
+                m_fd = tfile.extractfile(member.name)
+                itype = "url"
+                category = os.path.basename(os.path.dirname(member.name))
+                self.__add_items_2_sqlite(m_fd, category, is_black, itype)
+                
+            elif os.path.basename(member.name) == "domains" and member.isfile():
+                m_fd = tfile.extractfile(member.name)
+                itype = "domain"
+                category = os.path.basename(os.path.dirname(member.name))
+                self.__add_items_2_sqlite(m_fd, category, is_black, itype)
+                
+            else:
+                continue
+
+    def __add_items_2_sqlite(self, fd, category, is_black, itype):
+        if itype == "domain" :
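+            # Domains are stored with their labels reversed
+            # (www.example.com -> com.example.www) so that entries sharing a
+            # suffix sort next to each other before being merged into a regex.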
+            domains = []
+            for line in fd.readlines() :
+                dg_domain=line.replace("\r","").replace("\n", "").replace(" ","").decode("iso8859-15").lower()
+                tmp_domain=''
+                tmp_domain_item_list = dg_domain.split(".")
+                tmp_domain_item_list.reverse()
+                for x in tmp_domain_item_list:
+                    tmp_domain = tmp_domain + x + "."
+                tmp_domain=tmp_domain[:-1]
+                domains.append(tmp_domain)
+            
+            domains.sort()
+
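+            # Merge the sorted domains into a hachoir_regex PatternMatching
+            # object and flush the combined regex to sqlite in chunks: the
+            # size is checked every 500 entries until it passes ~20000
+            # characters, then every 100 entries, flushing above ~30000.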
+            p = PatternMatching()
+            i = 0
+            step = False
+            
+            for domain in domains :
+                p.addString(str(domain))
+                i = i + 1
+                if step == False and i % 500 == 0 :
+                    if len(str(p.regex)) > 20000 :
+                        step = True
+                        continue
+                    
+                elif step == True and i % 100 == 0 :
+                    if len(str(p.regex)) > 30000 :
+                        print "Domains -> To sqlite!! (%s, %s)"  % (i, len(str(p.regex)))
+                        self.__insert_domain_into_sqlite(category, str(p.regex), is_black)
+                        p = PatternMatching()
+                        step = False
+                        i = 0
+            
+            if len(str(p.regex)) > 0 :
+                print "Domains -> To sqlite!! (%s, %s)"  % (i, len(str(p.regex)))
+                self.__insert_domain_into_sqlite(category, str(p.regex), is_black)
+        else:
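+            # URLs are imported as-is.  For blacklists, the host of every URL
+            # is also collected (reversed) under the special "may_url_blocked"
+            # category, i.e. domains that may contain blocked URLs.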
+            domain_set = set()
+            
+            urls = []
+            for line in fd.readlines() :
+                dg_url = line.replace("\r","").replace("\n", "").replace(" ","").decode("iso8859-15").lower()
+                urls.append(dg_url)
+
+                if is_black == True:
+                    tmp_domain=''
+                    tmp_domain_item_list = dg_url.split("/")[0].split(".")
+                    tmp_domain_item_list.reverse()
+                    for x in tmp_domain_item_list:
+                        tmp_domain = tmp_domain + x + "."
+                    tmp_domain = tmp_domain[:-1]
+                    domain_set.add(tmp_domain)
+
+            urls.sort()
+            
+            p = PatternMatching()
+            i = 0
+
+            for url in urls :
+                p.addString(str(url))
+                i = i + 1
+
+                if i % 100 == 0 :
+                    if len(str(p.regex)) > 30000 :
+                        print "Urls -> To sqlite!! (%s, %s)"  % (i, len(str(p.regex)))
+                        self.__insert_url_into_sqlite(category, str(p.regex), is_black)
+                        p = PatternMatching()
+                        i = 0
+
+            if len(str(p.regex)) > 0 :
+                print "Urls -> To sqlite!! (%s, %s)"  % (i, len(str(p.regex)))
+                self.__insert_url_into_sqlite(category, str(p.regex), is_black)
+                
+            if is_black == True:
+                domains = list(domain_set)
+                domains.sort()
+
+                p = PatternMatching()
+                i = 0
+                step = False
+
+                for domain in domains :
+                    p.addString(str(domain))
+                    i = i + 1
+                    if step == False and i % 500 == 0 :
+                        if len(str(p.regex)) > 20000 :
+                            step = True
+                            continue
+
+                    elif step == True and i % 100 == 0 :
+                        if len(str(p.regex)) > 30000 :
+                            print "May url block -> To sqlite!! (%s, %s)"  % (i, len(str(p.regex)))
+                            self.__insert_domain_into_sqlite("may_url_blocked", str(p.regex), is_black)
+                            p = PatternMatching()
+                            step = False
+                            i = 0
+
+                if len(str(p.regex)) > 0 :
+                    print "May url block -> To sqlite!! (%s, %s)"  % (i, len(str(p.regex)))
+                    self.__insert_domain_into_sqlite("may_url_blocked", str(p.regex), is_black)        
+            
+
+    def __insert_domain_into_sqlite(self, category, regexp, is_black):
+        c = self.conn.cursor()
+        if is_black == True :
+            c.execute('insert into black_domains values (?, ?)', (category, regexp))
+        else:
+            c.execute('insert into white_domains values (?, ?)', (category, regexp))
+
+        self.conn.commit()
+
+    def __insert_url_into_sqlite(self, category, regexp, is_black):
+        c = self.conn.cursor()
+        if is_black == True :
+            c.execute('insert into black_urls values (?, ?)', (category, regexp))
+        else:
+            c.execute('insert into white_urls values (?, ?)', (category, regexp))
+
+        self.conn.commit()
+            
+gobject.type_register(DansGuardianImporter)
+
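+# Ad-hoc manual test: import a local tarball into /tmp/prueba.sqlite.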
+if __name__ == '__main__':
+    d = DansGuardianImporter("/var/www/prueba3.tgz","/tmp/prueba.sqlite")
+    d.run()
diff --git a/daemon/src/Makefile.am b/daemon/src/Makefile.am
index e836108..075bbc7 100644
--- a/daemon/src/Makefile.am
+++ b/daemon/src/Makefile.am
@@ -1,15 +1,16 @@
 SUBDIRS=contrib proxy 
 
 corelibdir = $(pythondir)/nanny/daemon
-corelib_PYTHON =	 __init__.py 		\
-			Chrono.py		\
-                        Daemon.py         	\
-			NannyDBus.py		\
-			QuarterBack.py		\
-			LinuxFiltering.py	\
-			LinuxWebContentFiltering.py \
-			LinuxUsersManager.py	\
-			LinuxSessionFiltering.py
+corelib_PYTHON =	 __init__.py 			\
+			Chrono.py			\
+                        Daemon.py         		\
+			NannyDBus.py			\
+			QuarterBack.py			\
+			LinuxFiltering.py		\
+			LinuxWebContentFiltering.py 	\
+			LinuxUsersManager.py		\
+			LinuxSessionFiltering.py    	\
+			DansGuardianImporter.py
 
 
 INCLUDES =      -I$(top_srcdir)                 \


