[nanny] Some performance improvements in DansGuardian Importer



commit 8c20e346e69147e1396c9003601ecfe44b5549fb
Author: Roberto Majadas <roberto majadas openshine com>
Date:   Sat Jan 30 17:41:15 2010 +0100

    Some performance improvements in DansGuardian Importer

 daemon/src/DansGuardianImporter.py |   85 ++++++++++++++++++++++++++----------
 1 files changed, 61 insertions(+), 24 deletions(-)
---
diff --git a/daemon/src/DansGuardianImporter.py b/daemon/src/DansGuardianImporter.py
index 5427201..4824e6a 100644
--- a/daemon/src/DansGuardianImporter.py
+++ b/daemon/src/DansGuardianImporter.py
@@ -21,6 +21,7 @@
 #
 
 import os
+import time
 import tempfile
 import tarfile
 
@@ -44,10 +45,13 @@ class DansGuardianImporter (gobject.GObject):
         self.conn = None
 
     def run(self):
+        t0 = time.time()
         self.__create_sqlite()
         self.__copy_dansguardian_file()
         self.__dansguardian_2_sqlite()
         self.conn.close()
+        print "---------------------------------"
+        print "Time : %s" % ((time.time() - t0) / 60.0)
 
     def __create_sqlite(self):
         if os.path.exists(self.out_path) :
@@ -83,14 +87,19 @@ class DansGuardianImporter (gobject.GObject):
                 m_fd = tfile.extractfile(member.name)
                 itype = "url"
                 category = os.path.basename(os.path.dirname(member.name))
+                t0 = time.time()
+                print "Importing urls [%s]" % category
                 self.__add_items_2_sqlite(m_fd, category, is_black, itype)
+                print "Imported urls [%s] (t: %s)" % (category, time.time() - t0)
                 
             elif os.path.basename(member.name) == "domains" and member.isfile():
                 m_fd = tfile.extractfile(member.name)
                 itype = "domain"
                 category = os.path.basename(os.path.dirname(member.name))
+                t0 = time.time()
+                print "Importing domains [%s]" % category
                 self.__add_items_2_sqlite(m_fd, category, is_black, itype)
-                
+                print "Imported domains [%s] (t: %s)" % (category, time.time() - t0)
             else:
                 continue
 
@@ -116,21 +125,32 @@ class DansGuardianImporter (gobject.GObject):
             for domain in domains :
                 p.addString(str(domain))
                 i = i + 1
+                if i < 1500 :
+                    continue
+                
                 if step == False and i % 500 == 0 :
                     if len(str(p.regex)) > 20000 :
+                        if len(str(p.regex)) > 24000 :
+                            print "\tDomains -> To sqlite!! (%s, %s)"  % (i, len(str(p.regex)))
+                            self.__insert_domain_into_sqlite(category, str(p.regex), is_black)
+                            p = PatternMatching()
+                            step = False
+                            i = 0
+                            continue
+                        
                         step = True
                         continue
                     
                 elif step == True and i % 100 == 0 :
-                    if len(str(p.regex)) > 30000 :
-                        print "Domains -> To sqlite!! (%s, %s)"  % (i, len(str(p.regex)))
+                    if len(str(p.regex)) > 25000 :
+                        print "\tDomains -> To sqlite!! (%s, %s)"  % (i, len(str(p.regex)))
                         self.__insert_domain_into_sqlite(category, str(p.regex), is_black)
                         p = PatternMatching()
                         step = False
                         i = 0
             
             if len(str(p.regex)) > 0 :
-                print "Domains -> To sqlite!! (%s, %s)"  % (i, len(str(p.regex)))
+                print "\tDomains -> To sqlite!! (%s, %s)"  % (i, len(str(p.regex)))
                 self.__insert_domain_into_sqlite(category, str(p.regex), is_black)
         else:
             domain_set = set()
@@ -159,14 +179,14 @@ class DansGuardianImporter (gobject.GObject):
                 i = i + 1
 
                 if i % 100 == 0 :
-                    if len(str(p.regex)) > 30000 :
-                        print "Urls -> To sqlite!! (%s, %s)"  % (i, len(str(p.regex)))
+                    if len(str(p.regex)) > 25000 :
+                        print "\tUrls -> To sqlite!! (%s, %s)"  % (i, len(str(p.regex)))
                         self.__insert_url_into_sqlite(category, str(p.regex), is_black)
                         p = PatternMatching()
                         i = 0
 
             if len(str(p.regex)) > 0 :
-                print "Urls -> To sqlite!! (%s, %s)"  % (i, len(str(p.regex)))
+                print "\tUrls -> To sqlite!! (%s, %s)"  % (i, len(str(p.regex)))
                 self.__insert_url_into_sqlite(category, str(p.regex), is_black)
                 
             if is_black == True:
@@ -180,44 +200,61 @@ class DansGuardianImporter (gobject.GObject):
                 for domain in domains :
                     p.addString(str(domain))
                     i = i + 1
+                    if i < 1500 :
+                        continue
+                    
                     if step == False and i % 500 == 0 :
                         if len(str(p.regex)) > 20000 :
+                            if len(str(p.regex)) > 24000 :
+                                print "\tMay url block -> To sqlite!! (%s, %s)"  % (i, len(str(p.regex)))
+                                self.__insert_domain_into_sqlite("may_url_blocked", str(p.regex), is_black)
+                                p = PatternMatching()
+                                step = False
+                                i = 0
+                                continue
+                                
                             step = True
                             continue
 
                     elif step == True and i % 100 == 0 :
-                        if len(str(p.regex)) > 30000 :
-                            print "May url block -> To sqlite!! (%s, %s)"  % (i, len(str(p.regex)))
+                        if len(str(p.regex)) > 25000 :
+                            print "\tMay url block -> To sqlite!! (%s, %s)"  % (i, len(str(p.regex)))
                             self.__insert_domain_into_sqlite("may_url_blocked", str(p.regex), is_black)
                             p = PatternMatching()
                             step = False
                             i = 0
 
                 if len(str(p.regex)) > 0 :
-                    print "May url block -> To sqlite!! (%s, %s)"  % (i, len(str(p.regex)))
+                    print "\tMay url block -> To sqlite!! (%s, %s)"  % (i, len(str(p.regex)))
                     self.__insert_domain_into_sqlite("may_url_blocked", str(p.regex), is_black)        
             
 
     def __insert_domain_into_sqlite(self, category, regexp, is_black):
-        c = self.conn.cursor()
-        if is_black == True :
-            c.execute('insert into black_domains values ("%s", "%s")' % (category, regexp))
-        else:
-            c.execute('insert into white_domains values ("%s", "%s")' % (category, regexp))
+        try:
+            c = self.conn.cursor()
+            if is_black == True :
+                c.execute('insert into black_domains values ("%s", "%s")' % (category, regexp))
+            else:
+                c.execute('insert into white_domains values ("%s", "%s")' % (category, regexp))
 
-        self.conn.commit()
+            self.conn.commit()
+        except :
+            print "Something wrong in sqlite inserting domains :\nCategory : %s\nREGEX %s" (category, regexp)
 
     def __insert_url_into_sqlite(self, category, regexp, is_black):
-        c = self.conn.cursor()
-        if is_black == True :
-            c.execute('insert into black_urls values ("%s", "%s")' % (category, regexp))
-        else:
-            c.execute('insert into white_urlss values ("%s", "%s")' % (category, regexp))
-
-        self.conn.commit()
+        try:
+            c = self.conn.cursor()
+            if is_black == True :
+                c.execute('insert into black_urls values ("%s", "%s")' % (category, regexp))
+            else:
+                c.execute('insert into white_urlss values ("%s", "%s")' % (category, regexp))
+        
+            self.conn.commit()
+        except :
+            print "Something wrong in sqlite inserting urls :\nCategory : %s\nREGEX %s" (category, regexp)
             
 gobject.type_register(DansGuardianImporter)
 
 if __name__ == '__main__':
-    d = DansGuardianImporter("/var/www/prueba3.tgz","/tmp/prueba.sqlite")
+    d = DansGuardianImporter("/var/www/pets.tgz","/tmp/pets.sqlite")
     d.run()



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]