[meld] Rework directory-level file comparison (closes bgo#586656)
- From: Kai Willadsen <kaiw src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [meld] Rework directory-level file comparison (closes bgo#586656)
- Date: Sat, 5 Feb 2011 22:54:49 +0000 (UTC)
commit 886cf9dc30a7c62ae74d46ec69c6ba6510aa92cd
Author: Kai Willadsen <kai willadsen gmail com>
Date: Fri Dec 17 18:01:13 2010 +1000
Rework directory-level file comparison (closes bgo#586656)
In comparison to the previous incarnation, this reworked implementation
features somewhat simpler logic, progressive file reading, binary file
filter handling (i.e., we now disable filters if we guess that a file
is binary) and provides earlier exit in no-filter cases.
We also define and use distinct return codes for the function; in
addition to same/different, we can now distinguish files that only
appear to be the same/different (because a full comparison was not
run), and error cases.
This commit also adds the Python 2.4 compatible namedtuple from:
http://code.activestate.com/recipes/500261/
as a replacement for misc.struct.
meld/dirdiff.py | 173 +++++++++++++++++++++++++++++++---------------
meld/util/namedtuple.py | 151 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 267 insertions(+), 57 deletions(-)
---
diff --git a/meld/dirdiff.py b/meld/dirdiff.py
index 0761335..db617ba 100644
--- a/meld/dirdiff.py
+++ b/meld/dirdiff.py
@@ -1,4 +1,5 @@
### Copyright (C) 2002-2006 Stephen Kennedy <stevek gnome org>
+### Copyright (C) 2009-2010 Kai Willadsen <kai willadsen gmail com>
### This program is free software; you can redistribute it and/or modify
### it under the terms of the GNU General Public License as published by
@@ -14,7 +15,6 @@
### along with this program; if not, write to the Free Software
### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-import filecmp
import paths
from ui import gnomeglade
import gtk
@@ -31,6 +31,8 @@ import re
import stat
import time
+from util.namedtuple import namedtuple
+
import ui.emblemcellrenderer
gdk = gtk.gdk
@@ -41,65 +43,117 @@ gdk = gtk.gdk
#
################################################################################
+# For compatibility with Python 2.5, we use the Python 2.4 compatible version of
+# namedtuple. The class is included in the collections module as of Python 2.6.
+class StatItem(namedtuple('StatItem', 'mode size time')):
+ __slots__ = ()
+
+ @classmethod
+ def _make(cls, stat_result):
+ return StatItem(stat.S_IFMT(stat_result.st_mode),
+ stat_result.st_size, stat_result.st_mtime)
+
+
+CacheResult = namedtuple('CacheResult', 'stats result')
+
+
_cache = {}
+Same, SameFiltered, DodgySame, DodgyDifferent, Different, FileError = range(6)
+# TODO: Get the block size from os.stat
+CHUNK_SIZE = 4096
+
+
+def all_same(lst):
+ return not lst or lst.count(lst[0]) == len(lst)
+
-def _files_same(lof, regexes):
- """Return 1 if all the files in 'lof' have the same contents.
- If the files are the same after the regular expression substitution, return 2.
- Finally, return 0 if the files still differ.
+def _files_same(files, regexes):
+ """Determine whether a list of files are the same.
+
+ Possible results are:
+ Same: The files are the same
+ SameFiltered: The files are identical only after filtering with 'regexes'
+ DodgySame: The files are superficially the same (i.e., type, size, mtime)
+ DodgyDifferent: The files are superficially different
+ Different: The files are different
+ FileError: There was a problem reading one or more of the files
"""
- # early out if only one file
- if len(lof) <= 1:
- return 1
- # get sigs
- lof = tuple(lof)
- def sig(f):
- s = os.stat(f)
- return misc.struct(mode=stat.S_IFMT(s.st_mode), size=s.st_size, time=s.st_mtime)
- def all_same(l):
- for i in l[1:]:
- if l[0] != i:
- return 0
- return 1
- sigs = tuple( [ sig(f) for f in lof ] )
- # check for directories
- arefiles = [ stat.S_ISREG(s.mode) for s in sigs ]
- if arefiles.count(0) == len(arefiles): # all dirs
- return 1
- elif arefiles.count(0): # mixture
- return 0
- # if no substitutions look for different sizes
- if len(regexes) == 0 and all_same( [s.size for s in sigs] ) == 0:
- return 0
- # try cache
- try:
- cache = _cache[ lof ]
- except KeyError:
- pass
- else:
- if cache.sigs == sigs: # up to date
- return cache.result
- # do it
+
+ # One file is the same as itself
+ if len(files) < 2:
+ return Same
+
+ files = tuple(files)
+ stats = tuple([StatItem._make(os.stat(f)) for f in files])
+
+ # If all entries are directories, they are considered to be the same
+ if all([stat.S_ISDIR(s.mode) for s in stats]):
+ return Same
+
+ # If any entries are not regular files, consider them different
+ if not all([stat.S_ISREG(s.mode) for s in stats]):
+ return Different
+
+ # If there are no text filters, unequal sizes imply a difference
+ if not regexes and not all_same([s.size for s in stats]):
+ return Different
+
+ # Check the cache before doing the expensive comparison
+ cache = _cache.get(files)
+ if cache and cache.stats == stats:
+ return cache.result
+
+ # Open files and compare bit-by-bit
+ contents = [[] for f in files]
+ result = None
+
try:
- contents = [open(f, "r").read() for f in lof]
- except (MemoryError, OverflowError): # Files are too large
- # FIXME: Filters are not current applied in this case. If that was
- # to be fixed, we could drop the all-at-once loading.
- for i in range(len(lof) - 1):
- same = filecmp.cmp(lof[i], lof[i + 1], False)
- if not same:
- return 0
- return 1
+ handles = [open(f, "rb") for f in files]
+ try:
+ data = [h.read(CHUNK_SIZE) for h in handles]
+
+ # Rough test to see whether files are binary. If files are guessed
+ # to be binary, we unset regexes for speed and space reasons.
+ if any(["\0" in d for d in data]):
+ regexes = []
- if all_same(contents):
- result = 1
- else:
+ while True:
+ if all_same(data):
+ if not data[0]:
+ break
+ else:
+ result = Different
+ if not regexes:
+ break
+
+ if regexes:
+ for i in range(len(data)):
+ contents[i].append(data[i])
+
+ data = [h.read(CHUNK_SIZE) for h in handles]
+
+ # Files are too large; we can't apply filters
+ except (MemoryError, OverflowError):
+ result = DodgySame if all_same(stats) else DodgyDifferent
+ finally:
+ for h in handles:
+ h.close()
+ except IOError:
+ # Don't cache generic errors as results
+ return FileError
+
+ if result is None:
+ result = Same
+
+ if result == Different and regexes:
+ contents = ["".join(c) for c in contents]
for r in regexes:
- contents = [ re.sub(r, "", c) for c in contents ]
- result = all_same(contents) and 2
- _cache[ lof ] = misc.struct(sigs=sigs, result=result)
+ contents = [re.sub(r, "", c) for c in contents]
+ result = SameFiltered if all_same(contents) else Different
+
+ _cache[files] = CacheResult(stats, result)
return result
+
COL_EMBLEM, COL_END = tree.COL_END, tree.COL_END + 1
################################################################################
@@ -726,7 +780,7 @@ class DirDiff(melddoc.MeldDoc, gnomeglade.Component):
is_present = [ os.path.exists( f ) for f in curfiles ]
all_present = 0 not in is_present
if all_present:
- if _files_same( curfiles, self.regexes ):
+ if _files_same(curfiles, self.regexes) in (Same, SameFiltered):
state = tree.STATE_NORMAL
else:
state = tree.STATE_MODIFIED
@@ -759,21 +813,26 @@ class DirDiff(melddoc.MeldDoc, gnomeglade.Component):
for j in range(len(mod_times)):
if mod_times[j]:
lof.append( files[j] )
- all_same = 0
+ all_same = Different
all_present_same = _files_same( lof, self.regexes )
different = 1
one_isdir = [None for i in range(self.model.ntree)]
for j in range(self.model.ntree):
if mod_times[j]:
isdir = os.path.isdir( files[j] )
- if all_same == 1:
+ # TODO: Differentiate the DodgySame case
+ if all_same == Same or all_same == DodgySame:
self.model.set_state(it, j, tree.STATE_NORMAL, isdir)
different = 0
- elif all_same == 2:
+ elif all_same == SameFiltered:
self.model.set_state(it, j, tree.STATE_NOCHANGE, isdir)
different = 0
- elif all_present_same:
+ # TODO: Differentiate the SameFiltered and DodgySame cases
+ elif all_present_same in (Same, SameFiltered, DodgySame):
self.model.set_state(it, j, tree.STATE_NEW, isdir)
+ elif all_same == FileError or all_present_same == FileError:
+ self.model.set_state(it, j, tree.STATE_ERROR, isdir)
+ # Different and DodgyDifferent
else:
self.model.set_state(it, j, tree.STATE_MODIFIED, isdir)
self.model.set_value(it,
diff --git a/meld/util/namedtuple.py b/meld/util/namedtuple.py
new file mode 100644
index 0000000..12b5e5e
--- /dev/null
+++ b/meld/util/namedtuple.py
@@ -0,0 +1,151 @@
+
+# Retrieved from http://code.activestate.com/recipes/500261/
+# Licensed under the PSF license
+
+from operator import itemgetter as _itemgetter
+from keyword import iskeyword as _iskeyword
+import sys as _sys
+
+def namedtuple(typename, field_names, verbose=False, rename=False):
+ """Returns a new subclass of tuple with named fields.
+
+ >>> Point = namedtuple('Point', 'x y')
+ >>> Point.__doc__ # docstring for the new class
+ 'Point(x, y)'
+ >>> p = Point(11, y=22) # instantiate with positional args or keywords
+ >>> p[0] + p[1] # indexable like a plain tuple
+ 33
+ >>> x, y = p # unpack like a regular tuple
+ >>> x, y
+ (11, 22)
+ >>> p.x + p.y # fields also accessible by name
+ 33
+ >>> d = p._asdict() # convert to a dictionary
+ >>> d['x']
+ 11
+ >>> Point(**d) # convert from a dictionary
+ Point(x=11, y=22)
+ >>> p._replace(x=100) # _replace() is like str.replace() but targets named fields
+ Point(x=100, y=22)
+
+ """
+
+ # Parse and validate the field names. Validation serves two purposes,
+ # generating informative error messages and preventing template injection attacks.
+ if isinstance(field_names, basestring):
+ field_names = field_names.replace(',', ' ').split() # names separated by whitespace and/or commas
+ field_names = tuple(map(str, field_names))
+ if rename:
+ names = list(field_names)
+ seen = set()
+ for i, name in enumerate(names):
+ if (not min(c.isalnum() or c=='_' for c in name) or _iskeyword(name)
+ or not name or name[0].isdigit() or name.startswith('_')
+ or name in seen):
+ names[i] = '_%d' % i
+ seen.add(name)
+ field_names = tuple(names)
+ for name in (typename,) + field_names:
+ if not min(c.isalnum() or c=='_' for c in name):
+ raise ValueError('Type names and field names can only contain alphanumeric characters and underscores: %r' % name)
+ if _iskeyword(name):
+ raise ValueError('Type names and field names cannot be a keyword: %r' % name)
+ if name[0].isdigit():
+ raise ValueError('Type names and field names cannot start with a number: %r' % name)
+ seen_names = set()
+ for name in field_names:
+ if name.startswith('_') and not rename:
+ raise ValueError('Field names cannot start with an underscore: %r' % name)
+ if name in seen_names:
+ raise ValueError('Encountered duplicate field name: %r' % name)
+ seen_names.add(name)
+
+ # Create and fill-in the class template
+ numfields = len(field_names)
+ argtxt = repr(field_names).replace("'", "")[1:-1] # tuple repr without parens or quotes
+ reprtxt = ', '.join('%s=%%r' % name for name in field_names)
+ template = '''class %(typename)s(tuple):
+ '%(typename)s(%(argtxt)s)' \n
+ __slots__ = () \n
+ _fields = %(field_names)r \n
+ def __new__(_cls, %(argtxt)s):
+ return _tuple.__new__(_cls, (%(argtxt)s)) \n
+ @classmethod
+ def _make(cls, iterable, new=tuple.__new__, len=len):
+ 'Make a new %(typename)s object from a sequence or iterable'
+ result = new(cls, iterable)
+ if len(result) != %(numfields)d:
+ raise TypeError('Expected %(numfields)d arguments, got %%d' %% len(result))
+ return result \n
+ def __repr__(self):
+ return '%(typename)s(%(reprtxt)s)' %% self \n
+ def _asdict(self):
+ 'Return a new dict which maps field names to their values'
+ return dict(zip(self._fields, self)) \n
+ def _replace(_self, **kwds):
+ 'Return a new %(typename)s object replacing specified fields with new values'
+ result = _self._make(map(kwds.pop, %(field_names)r, _self))
+ if kwds:
+ raise ValueError('Got unexpected field names: %%r' %% kwds.keys())
+ return result \n
+ def __getnewargs__(self):
+ return tuple(self) \n\n''' % locals()
+ for i, name in enumerate(field_names):
+ template += ' %s = _property(_itemgetter(%d))\n' % (name, i)
+ if verbose:
+ print template
+
+ # Execute the template string in a temporary namespace
+ namespace = dict(_itemgetter=_itemgetter, __name__='namedtuple_%s' % typename,
+ _property=property, _tuple=tuple)
+ try:
+ exec template in namespace
+ except SyntaxError, e:
+ raise SyntaxError(e.message + ':\n' + template)
+ result = namespace[typename]
+
+ # For pickling to work, the __module__ variable needs to be set to the frame
+ # where the named tuple is created. Bypass this step in environments where
+ # sys._getframe is not defined (Jython for example) or sys._getframe is not
+ # defined for arguments greater than 0 (IronPython).
+ try:
+ result.__module__ = _sys._getframe(1).f_globals.get('__name__', '__main__')
+ except (AttributeError, ValueError):
+ pass
+
+ return result
+
+
+
+
+
+
+if __name__ == '__main__':
+ # verify that instances can be pickled
+ from cPickle import loads, dumps
+ Point = namedtuple('Point', 'x, y', True)
+ p = Point(x=10, y=20)
+ assert p == loads(dumps(p, -1))
+
+ # test and demonstrate ability to override methods
+ class Point(namedtuple('Point', 'x y')):
+ @property
+ def hypot(self):
+ return (self.x ** 2 + self.y ** 2) ** 0.5
+ def __str__(self):
+ return 'Point: x=%6.3f y=%6.3f hypot=%6.3f' % (self.x, self.y, self.hypot)
+
+ for p in Point(3,4), Point(14,5), Point(9./7,6):
+ print p
+
+ class Point(namedtuple('Point', 'x y')):
+ 'Point class with optimized _make() and _replace() without error-checking'
+ _make = classmethod(tuple.__new__)
+ def _replace(self, _map=map, **kwds):
+ return self._make(_map(kwds.get, ('x', 'y'), self))
+
+ print Point(11, 22)._replace(x=100)
+
+ import doctest
+ TestResults = namedtuple('TestResults', 'failed attempted')
+ print TestResults(*doctest.testmod())
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]