[refactoring] First round of refactoring/review
author Vincent Michel <vincent.michel@logilab.fr>
date Fri, 09 Nov 2012 13:26:12 +0100
changeset 108 57e172386d5f
parent 107 5de6850d5183
child 109 0b655812245f
[refactoring] First round of refactoring/review
distances.py
minhashing.py
normalize.py
test/data/file2parse
test/test_alignment.py
--- a/distances.py	Mon Nov 12 09:34:36 2012 +0100
+++ b/distances.py	Fri Nov 09 13:26:12 2012 +0100
@@ -22,36 +22,9 @@
 
 from alignment.normalize import tokenize
 
-def levenshtein(stra, strb):
-    """ Compute the Levenshtein distance between stra and strb.
 
-    The Levenshtein distance is defined as the minimal cost to transform stra
-    into strb, where 3 operators are allowed :
-        - Replace one character of stra into a character of strb
-        - Add one character of strb into stra
-        - Remove one character of strb
-
-        If spaces are found in stra or strb, this method returns
-            _handlespaces(stra, strb, levenshtein)
-    """
-
-    if ' ' in stra or ' ' in strb:
-        return _handlespaces(stra, strb, levenshtein)
-
-    lena = len(stra)
-    lenb = len(strb)
-    onerowago = None
-    thisrow = range(1, lenb + 1) + [0]
-    for x in xrange(lena):
-        onerowago, thisrow = thisrow, [0] * lenb + [x+1]
-        for y in xrange(lenb):
-            delcost = onerowago[y] + 1
-            addcost = thisrow[y - 1] + 1
-            subcost = onerowago[y - 1] + (stra[x] != strb[y])
-            thisrow[y] = min(delcost, addcost, subcost)
-    return thisrow[lenb - 1]
-
-def _handlespaces(stra, strb, distance, **args):
+### UTILITY FUNCTIONS #########################################################
+def _handlespaces(stra, strb, distance, tokenizer=None, **kwargs):
     """ Compute the matrix of distances between all tokens of stra and strb
         (with function ``distance``). Extra args are given to the distance
         function
@@ -74,21 +47,62 @@
     if ' ' not in strb:
         strb += ' '
 
-    toka, tokb = stra.split(' '), strb.split(' ')
+    toka = tokenize(stra, tokenizer)
+    tokb = tokenize(strb, tokenizer)
+    # If the token counts differ, pad the shorter list with empty strings
+    if len(toka) != len(tokb):
+        mint = toka if len(toka)<len(tokb) else tokb
+        maxt = toka if len(toka)>len(tokb) else tokb
+        mint.extend(['' for i in range(len(maxt)-len(mint))])
 
     listmatrix = []
     for i in xrange(len(toka)):
-        listmatrix.append([])
-        for j in xrange(len(tokb)):
-            listmatrix[-1].append(distance(toka[i], tokb[j], **args))
+        listmatrix.append([distance(toka[i], tokb[j], **kwargs) for j in xrange(len(tokb))])
     m = matrix(listmatrix)
     minlist = [m[i,:].min() for i in xrange(m.shape[0])]
     minlist.extend([m[:,i].min() for i in xrange(m.shape[1])])
-
     return max(minlist)
 
 
-def soundexcode(word, language = 'french'):
+### NUMERICAL DISTANCES #######################################################
+def euclidean(a, b):
+    """ Simple euclidian distance
+    """
+    try:
+        return abs(a - b)
+    except TypeError:
+        return abs(float(a) - float(b))
+
+
+### STRING DISTANCES ##########################################################
+def levenshtein(stra, strb, tokenizer=None):
+    """ Compute the Levenshtein distance between stra and strb.
+
+    The Levenshtein distance is defined as the minimal cost to transform stra
+    into strb, where 3 operations are allowed:
+        - Replace one character of stra with a character of strb
+        - Add one character of strb into stra
+        - Remove one character of stra
+
+        If spaces are found in stra or strb, this method returns
+            _handlespaces(stra, strb, levenshtein)
+    """
+    if ' ' in stra or ' ' in strb:
+        return _handlespaces(stra, strb, levenshtein, tokenizer)
+
+    lenb = len(strb)
+    onerowago = None
+    thisrow = range(1, lenb + 1) + [0]
+    for x in xrange(len(stra)):
+        onerowago, thisrow = thisrow, [0] * lenb + [x+1]
+        for y in xrange(lenb):
+            delcost = onerowago[y] + 1
+            addcost = thisrow[y - 1] + 1
+            subcost = onerowago[y - 1] + (stra[x] != strb[y])
+            thisrow[y] = min(delcost, addcost, subcost)
+    return thisrow[lenb - 1]
+
+def soundexcode(word, language='french'):
     """ Return the Soundex code of the word ``word``
         For more information about soundex code see wiki_
 
@@ -123,7 +137,8 @@
                           }
     else:
         raise NotImplementedError('Soundex code is not supported (yet ?) for'
-                                  'this language')
+                                  ' this language (%s). '
+                                  'Supported languages are french and english' % language)
     word = word.strip().upper()
     code = word[0]
     #After this ``for`` code is
@@ -146,17 +161,17 @@
     ###First four letters, completed by zeros
     return code[:4] + '0' * (4 - len(code))
 
-def soundex(stra, strb, language = 'french'):
+def soundex(stra, strb, language='french', tokenizer=None):
     """ Return the 1/0 distance between the soundex code of stra and strb.
         0 means they have the same code, 1 they don't
     """
     if ' ' in stra or ' ' in strb:
-        return _handlespaces(stra, strb, soundex, language = language)
+        return _handlespaces(stra, strb, soundex, tokenizer=tokenizer, language=language)
 
     return 0 if (soundexcode(stra, language) == soundexcode(strb, language)) \
              else 1
 
-def jaccard(stra, strb, tokenizer = None):
+def jaccard(stra, strb, tokenizer=None):
     """ Return the jaccard distance between stra and strb, condering the tokens
         set of stra and strb. If no tokenizer is given, it use if
         alignement.normalize.tokenize's default one.
@@ -167,12 +182,12 @@
 
     seta = set(tokenize(stra, tokenizer))
     setb = set(tokenize(strb, tokenizer))
+    return 1.0 - 1.0 * len(seta.intersection(setb)) / len(seta.union(setb))
 
-    jacc = 1.0 * len(seta.intersection(setb)) / len(seta.union(setb))
-    return 1.0 - jacc
 
-def temporal(stra, strb, granularity = u'days', language = u'french',
-             dayfirst = True, yearfirst = False):
+### TEMPORAL DISTANCES ########################################################
+def temporal(stra, strb, granularity=u'days', language=u'french',
+             dayfirst=True, yearfirst=False):
     """ Return the distance between two strings (read as dates).
 
         ``granularity`` can be either ``days`` or ``months`` or ``years``
@@ -205,25 +220,21 @@
                         (u'Ven', u'Vendredi'),
                         (u'Sam', u'Samedi'),
                         (u'Dim', u'Dimanche'),]
-    datea = dateparser.parse(stra, parserinfo = customparserinfo(dayfirst,
-                             yearfirst), fuzzy = True)
-    dateb = dateparser.parse(strb, parserinfo = customparserinfo(dayfirst,
-                             yearfirst), fuzzy = True)
-    diff  = datea - dateb
+    datea = dateparser.parse(stra, parserinfo=customparserinfo(dayfirst,
+                             yearfirst), fuzzy=True)
+    dateb = dateparser.parse(strb, parserinfo=customparserinfo(dayfirst,
+                             yearfirst), fuzzy=True)
+    diff = datea - dateb
     if granularity.lower() == 'years':
         return abs(diff.days / 365.25)
     if granularity.lower() == 'months':
         return abs(diff.days / 30.5)
     return abs(diff.days)
 
-def euclidean(a, b):
-    try:
-        return abs(a - b)
-    except TypeError:
-        return abs(float(a) - float(b))
 
-def geographical(pointa, pointb, inRadians = False, planetRadius = 6371009,
-                 units = 'm'):
+### GEOGRAPHICAL DISTANCES ####################################################
+def geographical(pointa, pointb, inRadians=False, planetRadius=6371009,
+                 units='m'):
     """ Return the geographical distance between two points.
 
         Both points must be tuples (latitude, longitude)
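
A minimal usage sketch of the reorganised distances module (Python 2, matching the package; the import path mirrors the module's own alignment.* imports, and the commented results are either trivial or taken from the test suite):

    # Illustrative sketch only, not part of the patch.
    from alignment.distances import (euclidean, levenshtein, soundex, jaccard,
                                     temporal, geographical)

    print euclidean(4, '7.5')        # 3.5 (non-numeric operands are cast to float)
    print levenshtein('cat', 'cut')  # 1 (one substitution)
    print soundex('Robert', 'Robert')            # 0 (identical soundex codes)
    print jaccard('bonjour', 'bonjour')          # 0.0 (identical token sets)
    print temporal('01/01/2012', '01/02/2012')   # 31 (days, with dayfirst=True)
    # Great-circle distance Paris -> London, in metres by default (units='m')
    print geographical((48.8566, 2.3522), (51.5074, -0.1278))
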
--- a/minhashing.py	Mon Nov 12 09:34:36 2012 +0100
+++ b/minhashing.py	Fri Nov 09 13:26:12 2012 +0100
@@ -24,7 +24,7 @@
 from scipy.sparse import lil_matrix
 from scipy.optimize import bisect
 
-from alignment.normalize import wordgrams
+from alignment.normalize import iter_wordgrams
 
 def randomhashfunction(zr):
     """ Return a random hash function, mapping x in Z to ZR
@@ -82,7 +82,7 @@
         for sent in sentences:
             row = []
             rowdata = []
-            for w in wordgrams(sent, k):
+            for w in iter_wordgrams(sent, k):
                 row.append(universe.setdefault(w, sizeofuniverse))
                 if row[-1] == sizeofuniverse:
                     sizeofuniverse += 1
--- a/normalize.py	Mon Nov 12 09:34:36 2012 +0100
+++ b/normalize.py	Fri Nov 09 13:26:12 2012 +0100
@@ -16,18 +16,10 @@
 # with this program. If not, see <http://www.gnu.org/licenses/>.
 
 import re
-
 from string import punctuation
 from warnings import warn
 from unicodedata import normalize as _uninormalize
 
-try:
-    from nltk.tokenize import WordPunctTokenizer as Tokenizer
-except ImportError:
-    class Tokenizer(object):
-        def tokenize(self, string):
-            return string.split(' ')
-
 
 STOPWORDS = set([u'alors', u'au', u'aucuns', u'aussi', u'autre', u'avant',
 u'avec', u'avoir', u'bon', u'car', u'ce', u'cela', u'ces', u'ceux', u'chaque',
@@ -63,6 +55,22 @@
     u'\xdf': u'ss',   # LATIN SMALL LETTER SHARP S
     }
 
+class Tokenizer(object):
+    """ Simple tokenizer similar to the one in NLTK.
+    """
+    def tokenize(self, string):
+        return [s for s in string.split(' ') if s]
+
+
+class WordTokenizer(Tokenizer):
+    """ Simple punctutation tokenizer similar to the one in NLTK.
+    XXX THIS CANNOT HANDLE UNICODE.
+    """
+    regexp = re.compile(r'\w+|[^\w\s]+')
+    def tokenize(self, string):
+        return [t for t in self.regexp.findall(string) if t]
+
+
 def unormalize(ustring, ignorenonascii=None, substitute=None):
     """replace diacritical characters with their corresponding ascii characters
 
@@ -96,11 +104,11 @@
         res.append(replacement)
     return u''.join(res)
 
-def lunormalize(sentence):
+def lunormalize(sentence, ignorenonascii=None, substitute=None):
     """ Normalize a sentence (ie remove accents, set to lower, etc) """
-    return unormalize(sentence).lower()
+    return unormalize(sentence, ignorenonascii, substitute).lower()
 
-def simplify(sentence, lemmas = None, removeStopWords = True):
+def simplify(sentence, lemmas=None, removeStopWords=True):
     """ Simply the given sentence
         0) If removeStopWords, then remove the stop words
         1) If lemmas are given, the sentence is lemmatized
@@ -110,28 +118,32 @@
     if lemmas:
         sentence = lemmatized(sentence, lemmas)
     sentence = sentence.lower()
-    cleansent = ''
-    for s in sentence:
-        if s not in punctuation:
-            cleansent += s
+    cleansent = ''.join([s for s in sentence if s not in punctuation])
 
     if not removeStopWords:
         return cleansent
     else:
         return ' '.join([w for w in cleansent.split(' ') if w not in STOPWORDS])
 
-
-def tokenize(sentence, tokenizer = None):
+def tokenize(sentence, tokenizer=None):
     """ Tokenize a sentence.
         Use ``tokenizer`` if given; otherwise pick WordTokenizer when the
         sentence allows it, and fall back to the whitespace Tokenizer.
 
         In any case, the tokenizer must have a ``tokenize()`` method.
     """
-    tokenizer = tokenizer or Tokenizer
-    return [w for w in tokenizer().tokenize(sentence)]
+    if not tokenizer and isinstance(sentence, str):
+        tokenizer = WordTokenizer
+    elif not tokenizer and isinstance(sentence, unicode):
+        # XXX unicode input: the WordTokenizer regexp cannot handle non-ascii words
+        if sentence == unormalize(sentence):
+            # No diacritics here, so the WordTokenizer can still be used
+            tokenizer = WordTokenizer
+        else:
+            tokenizer = Tokenizer
+    return tokenizer().tokenize(sentence)
 
-def wordgrams(sentence, k):
+def iter_wordgrams(sentence, k):
     """ Generator of k-wordgrams on the given sentence
     """
     words = sentence.split(' ')
@@ -141,11 +153,10 @@
 def loadlemmas(filename):
     """ Return the default lemmas dictionnary
     """
-    return dict([line.strip().split('\t')
-                 for line in open(filename)
-                     if len(line.strip().split('\t'))==2])
+    return dict([line.strip().split('\t') for line in open(filename)
+                 if len(line.strip().split('\t'))==2])
 
-def lemmatized(sentence, lemmas, tokenizer = None):
+def lemmatized(sentence, lemmas, tokenizer=None):
     """ Return the lemmatized sentence
     """
     tokenized_sent = tokenize(sentence, tokenizer)
@@ -171,14 +182,13 @@
             lemma = _words[0]
     return lemma
 
-def roundstr(number, ndigits = 0):
+def roundstr(number, ndigits=0):
     """Return an unicode string of ``number`` rounded to a given precision
         in decimal digits (default 0 digits)
 
         If ``number`` is not a float, this method casts it to a float. (An
         exception may be raised if it's not possible)
     """
-
     return format(round(float(number), ndigits), '0.%df' % ndigits)
 
 def rgxformat(string, regexp, output):
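
A short sketch of the two new tokenizer classes and of the str/unicode dispatch in tokenize(); the expected lists follow directly from the regexp and the space split above (the plain-ASCII sentence matches the existing test_tokenize expectation):

    # -*- coding: utf-8 -*-
    # Illustrative sketch only, not part of the patch.
    from alignment.normalize import Tokenizer, WordTokenizer, tokenize

    # WordTokenizer splits on \w+|[^\w\s]+ and keeps punctuation as tokens.
    assert WordTokenizer().tokenize("J'aime les frites !") == \
           ['J', "'", 'aime', 'les', 'frites', '!']
    # Tokenizer simply splits on spaces and drops empty tokens.
    assert Tokenizer().tokenize(u'sacré  hubert') == [u'sacré', u'hubert']
    # tokenize() uses WordTokenizer for str and accent-free unicode, and falls
    # back to the whitespace Tokenizer for unicode containing diacritics.
    assert tokenize(u'sacré hubert') == [u'sacré', u'hubert']
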
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/data/file2parse	Fri Nov 09 13:26:12 2012 +0100
@@ -0,0 +1,3 @@
+1, house , 12, 19, apple
+2, horse , 21.9, 19, stramberry
+3, flower, 23, 2.17 , cherry
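
For reference, one way to read this new fixture with the standard library; the parser actually exercised by the test suite is not part of this changeset, so this is only an illustration of the expected columns:

    # Illustrative sketch only, not part of the patch.
    import csv

    with open('test/data/file2parse') as fobj:
        rows = [[field.strip() for field in row] for row in csv.reader(fobj)]
    assert rows[0] == ['1', 'house', '12', '19', 'apple']
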
--- a/test/test_alignment.py	Mon Nov 12 09:34:36 2012 +0100
+++ b/test/test_alignment.py	Fri Nov 09 13:26:12 2012 +0100
@@ -101,7 +101,7 @@
 
         self.assertEqual(jaccard('bonjour', 'bonjour'), 0.0)
         self.assertAlmostEqual(jaccard('boujour', 'bonjour'), 1, 2)
-        self.assertAlmostEqual(jaccard('sacré rubert', 'sacré hubert'), 0.5, 2)
+        self.assertAlmostEqual(jaccard(u'sacré rubert', u'sacré hubert'), 0.667, 2)
 
         #Test symmetry
         self.assertEqual(jaccard('orange', 'morange'),
@@ -161,11 +161,13 @@
         self.assertEqual(simplify(u"J'aime les frites, les pommes et les" \
                                   u" scoubidous !", self.lemmas),
                          u"aimer frites pomme scoubidou")
+
     def test_tokenize(self):
         self.assertEqual(tokenize(u"J'aime les frites !"),
                          [u'J', u"'", u'aime', u'les', u'frites', u'!',])
 
     def test_lemmatizer(self):
+        self.assertEqual(lemmatized(u'sacré rubert', self.lemmas), u'sacré rubert')
         self.assertEqual(lemmatized(u"J'aime les frites !", self.lemmas),
                          u'je aimer le frite')
         self.assertEqual(lemmatized(u", J'aime les frites", self.lemmas),