[meld] Rework directory-level file comparison (closes bgo#586656)

From: Kai Willadsen <kaiw src gnome org>
To: commits-list gnome org
Cc:
Subject: [meld] Rework directory-level file comparison (closes bgo#586656)
Date: Sat, 5 Feb 2011 22:54:49 +0000 (UTC)
commit 886cf9dc30a7c62ae74d46ec69c6ba6510aa92cd
Author: Kai Willadsen <kai willadsen gmail com>
Date:   Fri Dec 17 18:01:13 2010 +1000

    Rework directory-level file comparison (closes bgo#586656)
    
    In comparison to the previous incarnation, this reworked implementation
    features somewhat simpler logic, progressive file reading, binary file
    filter handling (i.e., we now disable filters if we guess that a file
    is binary) and provides earlier exit in no-filter cases.
    
    We also define and use different exit codes from the function; we can
    now additionally distinguish files that appear to be the
    same/different, but for which a full comparison was not run, and error
    cases.
    
    This commit also adds the Python 2.4 compatible namedtuple from:
        http://code.activestate.com/recipes/500261/
    as a replacement for misc.struct.

 meld/dirdiff.py         |  173 +++++++++++++++++++++++++++++++---------------
 meld/util/namedtuple.py |  151 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 267 insertions(+), 57 deletions(-)
---
diff --git a/meld/dirdiff.py b/meld/dirdiff.py
index 0761335..db617ba 100644
--- a/meld/dirdiff.py
+++ b/meld/dirdiff.py
@@ -1,4 +1,5 @@
 ### Copyright (C) 2002-2006 Stephen Kennedy <stevek gnome org>
+### Copyright (C) 2009-2010 Kai Willadsen <kai willadsen gmail com>
 
 ### This program is free software; you can redistribute it and/or modify
 ### it under the terms of the GNU General Public License as published by
@@ -14,7 +15,6 @@
 ### along with this program; if not, write to the Free Software
 ### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
-import filecmp
 import paths
 from ui import gnomeglade
 import gtk
@@ -31,6 +31,8 @@ import re
 import stat
 import time
 
+from util.namedtuple import namedtuple
+
 import ui.emblemcellrenderer
 
 gdk = gtk.gdk
@@ -41,65 +43,117 @@ gdk = gtk.gdk
 #
 ################################################################################
 
+# For compatibility with Python 2.5, we use the Python 2.4 compatible version of
+# namedtuple. The class is included in the collections module as of Python 2.6.
+class StatItem(namedtuple('StatItem', 'mode size time')):
+    __slots__ = ()
+
+    @classmethod
+    def _make(cls, stat_result):
+        return StatItem(stat.S_IFMT(stat_result.st_mode),
+                        stat_result.st_size, stat_result.st_mtime)
+
+
+CacheResult = namedtuple('CacheResult', 'stats result')
+
+
 _cache = {}
+Same, SameFiltered, DodgySame, DodgyDifferent, Different, FileError = range(6)
+# TODO: Get the block size from os.stat
+CHUNK_SIZE = 4096
+
+
+def all_same(lst):
+    return not lst or lst.count(lst[0]) == len(lst)
+
 
-def _files_same(lof, regexes):
-    """Return 1 if all the files in 'lof' have the same contents.
-       If the files are the same after the regular expression substitution, return 2.
-       Finally, return 0 if the files still differ.
+def _files_same(files, regexes):
+    """Determine whether a list of files are the same.
+
+    Possible results, besides Different (the files differ), are:
+      Same: The files are the same
+      SameFiltered: The files are identical only after filtering with 'regexes'
+      DodgySame: The files are superficially the same (i.e., type, size, mtime)
+      DodgyDifferent: The files are superficially different
+      FileError: There was a problem reading one or more of the files
     """
-    # early out if only one file
-    if len(lof) <= 1:
-        return 1
-    # get sigs
-    lof = tuple(lof)
-    def sig(f):
-        s = os.stat(f)
-        return misc.struct(mode=stat.S_IFMT(s.st_mode), size=s.st_size, time=s.st_mtime)
-    def all_same(l):
-        for i in l[1:]:
-            if l[0] != i:
-                return 0
-        return 1
-    sigs = tuple( [ sig(f) for f in lof ] )
-    # check for directories
-    arefiles = [ stat.S_ISREG(s.mode) for s in sigs ]
-    if arefiles.count(0) == len(arefiles): # all dirs
-        return 1
-    elif arefiles.count(0): # mixture
-        return 0
-    # if no substitutions look for different sizes
-    if len(regexes) == 0 and all_same( [s.size for s in sigs] ) == 0:
-        return 0
-    # try cache
-    try:
-        cache = _cache[ lof ]
-    except KeyError:
-        pass
-    else:
-        if cache.sigs == sigs: # up to date
-            return cache.result
-    # do it
+
+    # One file is the same as itself
+    if len(files) < 2:
+        return Same
+
+    files = tuple(files)
+    stats = tuple([StatItem._make(os.stat(f)) for f in files])
+
+    # If all entries are directories, they are considered to be the same
+    if all([stat.S_ISDIR(s.mode) for s in stats]):
+        return Same
+
+    # If any entries are not regular files, consider them different
+    if not all([stat.S_ISREG(s.mode) for s in stats]):
+        return Different
+
+    # If there are no text filters, unequal sizes imply a difference
+    if not regexes and not all_same([s.size for s in stats]):
+        return Different
+
+    # Check the cache before doing the expensive comparison
+    cache = _cache.get(files)
+    if cache and cache.stats == stats:
+        return cache.result
+
+    # Open files and compare them chunk-by-chunk (CHUNK_SIZE bytes at a time)
+    contents = [[] for f in files]
+    result = None
+
     try:
-        contents = [open(f, "r").read() for f in lof]
-    except (MemoryError, OverflowError): # Files are too large
-        # FIXME: Filters are not current applied in this case. If that was
-        # to be fixed, we could drop the all-at-once loading.
-        for i in range(len(lof) - 1):
-            same = filecmp.cmp(lof[i], lof[i + 1], False)
-            if not same:
-                return 0
-        return 1
+        handles = [open(f, "rb") for f in files]
+        try:
+            data = [h.read(CHUNK_SIZE) for h in handles]
+
+            # Rough test to see whether files are binary. If files are guessed
+            # to be binary, we unset regexes for speed and space reasons.
+            if any(["\0" in d for d in data]):
+                regexes = []
 
-    if all_same(contents):
-        result = 1
-    else:
+            while True:
+                if all_same(data):
+                    if not data[0]:
+                        break
+                else:
+                    result = Different
+                    if not regexes:
+                        break
+
+                if regexes:
+                    for i in range(len(data)):
+                        contents[i].append(data[i])
+
+                data = [h.read(CHUNK_SIZE) for h in handles]
+
+        # Files are too large; we can't apply filters
+        except (MemoryError, OverflowError):
+            result = DodgySame if all_same(stats) else DodgyDifferent
+        finally:
+            for h in handles:
+                h.close()
+    except IOError:
+        # Don't cache generic errors as results
+        return FileError
+
+    if result is None:
+        result = Same
+
+    if result == Different and regexes:
+        contents = ["".join(c) for c in contents]
         for r in regexes:
-            contents = [ re.sub(r, "", c) for c in contents ]
-        result = all_same(contents) and 2
-    _cache[ lof ] = misc.struct(sigs=sigs, result=result)
+            contents = [re.sub(r, "", c) for c in contents]
+        result = SameFiltered if all_same(contents) else Different
+
+    _cache[files] = CacheResult(stats, result)
     return result
 
+
 COL_EMBLEM, COL_END = tree.COL_END, tree.COL_END + 1
 
 ################################################################################
@@ -726,7 +780,7 @@ class DirDiff(melddoc.MeldDoc, gnomeglade.Component):
             is_present = [ os.path.exists( f ) for f in curfiles ]
             all_present = 0 not in is_present
             if all_present:
-                if _files_same( curfiles, self.regexes ):
+                if _files_same(curfiles, self.regexes) in (Same, SameFiltered):
                     state = tree.STATE_NORMAL
                 else:
                     state = tree.STATE_MODIFIED
@@ -759,21 +813,26 @@ class DirDiff(melddoc.MeldDoc, gnomeglade.Component):
             for j in range(len(mod_times)):
                 if mod_times[j]:
                     lof.append( files[j] )
-            all_same = 0
+            all_same = Different
             all_present_same = _files_same( lof, self.regexes )
         different = 1
         one_isdir = [None for i in range(self.model.ntree)]
         for j in range(self.model.ntree):
             if mod_times[j]:
                 isdir = os.path.isdir( files[j] )
-                if all_same == 1:
+                # TODO: Differentiate the DodgySame case
+                if all_same == Same or all_same == DodgySame:
                     self.model.set_state(it, j,  tree.STATE_NORMAL, isdir)
                     different = 0
-                elif all_same == 2:
+                elif all_same == SameFiltered:
                     self.model.set_state(it, j,  tree.STATE_NOCHANGE, isdir)
                     different = 0
-                elif all_present_same:
+                # TODO: Differentiate the SameFiltered and DodgySame cases
+                elif all_present_same in (Same, SameFiltered, DodgySame):
                     self.model.set_state(it, j,  tree.STATE_NEW, isdir)
+                elif all_same == FileError or all_present_same == FileError:
+                    self.model.set_state(it, j,  tree.STATE_ERROR, isdir)
+                # Different and DodgyDifferent
                 else:
                     self.model.set_state(it, j,  tree.STATE_MODIFIED, isdir)
                 self.model.set_value(it,
diff --git a/meld/util/namedtuple.py b/meld/util/namedtuple.py
new file mode 100644
index 0000000..12b5e5e
--- /dev/null
+++ b/meld/util/namedtuple.py
@@ -0,0 +1,151 @@
+
+# Retrieved from http://code.activestate.com/recipes/500261/
+# Licensed under the PSF license
+
+from operator import itemgetter as _itemgetter
+from keyword import iskeyword as _iskeyword
+import sys as _sys
+
+def namedtuple(typename, field_names, verbose=False, rename=False):
+    """Returns a new subclass of tuple with named fields.
+
+    >>> Point = namedtuple('Point', 'x y')
+    >>> Point.__doc__                   # docstring for the new class
+    'Point(x, y)'
+    >>> p = Point(11, y=22)             # instantiate with positional args or keywords
+    >>> p[0] + p[1]                     # indexable like a plain tuple
+    33
+    >>> x, y = p                        # unpack like a regular tuple
+    >>> x, y
+    (11, 22)
+    >>> p.x + p.y                       # fields also accessible by name
+    33
+    >>> d = p._asdict()                 # convert to a dictionary
+    >>> d['x']
+    11
+    >>> Point(**d)                      # convert from a dictionary
+    Point(x=11, y=22)
+    >>> p._replace(x=100)               # _replace() is like str.replace() but targets named fields
+    Point(x=100, y=22)
+
+    """
+
+    # Parse and validate the field names.  Validation serves two purposes,
+    # generating informative error messages and preventing template injection attacks.
+    if isinstance(field_names, basestring):
+        field_names = field_names.replace(',', ' ').split() # names separated by whitespace and/or commas
+    field_names = tuple(map(str, field_names))
+    if rename:
+        names = list(field_names)
+        seen = set()
+        for i, name in enumerate(names):
+            if (not min(c.isalnum() or c=='_' for c in name) or _iskeyword(name)
+                or not name or name[0].isdigit() or name.startswith('_')
+                or name in seen):
+                    names[i] = '_%d' % i
+            seen.add(name)
+        field_names = tuple(names)
+    for name in (typename,) + field_names:
+        if not min(c.isalnum() or c=='_' for c in name):
+            raise ValueError('Type names and field names can only contain alphanumeric characters and underscores: %r' % name)
+        if _iskeyword(name):
+            raise ValueError('Type names and field names cannot be a keyword: %r' % name)
+        if name[0].isdigit():
+            raise ValueError('Type names and field names cannot start with a number: %r' % name)
+    seen_names = set()
+    for name in field_names:
+        if name.startswith('_') and not rename:
+            raise ValueError('Field names cannot start with an underscore: %r' % name)
+        if name in seen_names:
+            raise ValueError('Encountered duplicate field name: %r' % name)
+        seen_names.add(name)
+
+    # Create and fill-in the class template
+    numfields = len(field_names)
+    argtxt = repr(field_names).replace("'", "")[1:-1]   # tuple repr without parens or quotes
+    reprtxt = ', '.join('%s=%%r' % name for name in field_names)
+    template = '''class %(typename)s(tuple):
+        '%(typename)s(%(argtxt)s)' \n
+        __slots__ = () \n
+        _fields = %(field_names)r \n
+        def __new__(_cls, %(argtxt)s):
+            return _tuple.__new__(_cls, (%(argtxt)s)) \n
+        @classmethod
+        def _make(cls, iterable, new=tuple.__new__, len=len):
+            'Make a new %(typename)s object from a sequence or iterable'
+            result = new(cls, iterable)
+            if len(result) != %(numfields)d:
+                raise TypeError('Expected %(numfields)d arguments, got %%d' %% len(result))
+            return result \n
+        def __repr__(self):
+            return '%(typename)s(%(reprtxt)s)' %% self \n
+        def _asdict(self):
+            'Return a new dict which maps field names to their values'
+            return dict(zip(self._fields, self)) \n
+        def _replace(_self, **kwds):
+            'Return a new %(typename)s object replacing specified fields with new values'
+            result = _self._make(map(kwds.pop, %(field_names)r, _self))
+            if kwds:
+                raise ValueError('Got unexpected field names: %%r' %% kwds.keys())
+            return result \n
+        def __getnewargs__(self):
+            return tuple(self) \n\n''' % locals()
+    for i, name in enumerate(field_names):
+        template += '        %s = _property(_itemgetter(%d))\n' % (name, i)
+    if verbose:
+        print template
+
+    # Execute the template string in a temporary namespace
+    namespace = dict(_itemgetter=_itemgetter, __name__='namedtuple_%s' % typename,
+                     _property=property, _tuple=tuple)
+    try:
+        exec template in namespace
+    except SyntaxError, e:
+        raise SyntaxError(e.message + ':\n' + template)
+    result = namespace[typename]
+
+    # For pickling to work, the __module__ variable needs to be set to the frame
+    # where the named tuple is created.  Bypass this step in environments where
+    # sys._getframe is not defined (Jython for example) or sys._getframe is not
+    # defined for arguments greater than 0 (IronPython).
+    try:
+        result.__module__ = _sys._getframe(1).f_globals.get('__name__', '__main__')
+    except (AttributeError, ValueError):
+        pass
+
+    return result
+
+
+
+
+
+
+if __name__ == '__main__':
+    # verify that instances can be pickled
+    from cPickle import loads, dumps
+    Point = namedtuple('Point', 'x, y', True)
+    p = Point(x=10, y=20)
+    assert p == loads(dumps(p, -1))
+
+    # test and demonstrate ability to override methods
+    class Point(namedtuple('Point', 'x y')):
+        @property
+        def hypot(self):
+            return (self.x ** 2 + self.y ** 2) ** 0.5
+        def __str__(self):
+            return 'Point: x=%6.3f y=%6.3f hypot=%6.3f' % (self.x, self.y, self.hypot)
+
+    for p in Point(3,4), Point(14,5), Point(9./7,6):
+        print p
+
+    class Point(namedtuple('Point', 'x y')):
+        'Point class with optimized _make() and _replace() without error-checking'
+        _make = classmethod(tuple.__new__)
+        def _replace(self, _map=map, **kwds):
+            return self._make(_map(kwds.get, ('x', 'y'), self))
+
+    print Point(11, 22)._replace(x=100)
+
+    import doctest
+    TestResults = namedtuple('TestResults', 'failed attempted')
+    print TestResults(*doctest.testmod())
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]