author Vincent Michel Fri, 09 Nov 2012 13:26:12 +0100 changeset 108 57e172386d5f parent 107 5de6850d5183 child 109 0b655812245f
[refactoring] First round of refactoring/review
 distances.py file | annotate | diff | comparison | revisions minhashing.py file | annotate | diff | comparison | revisions normalize.py file | annotate | diff | comparison | revisions test/data/file2parse file | annotate | diff | comparison | revisions test/test_alignment.py file | annotate | diff | comparison | revisions
```--- a/distances.py	Mon Nov 12 09:34:36 2012 +0100
+++ b/distances.py	Fri Nov 09 13:26:12 2012 +0100
@@ -22,36 +22,9 @@

from alignment.normalize import tokenize

-def levenshtein(stra, strb):
-    """ Compute the Levenshtein distance between stra and strb.

-    The Levenshtein distance is defined as the minimal cost to transform stra
-    into strb, where 3 operators are allowed :
-        - Replace one character of stra into a character of strb
-        - Add one character of strb into stra
-        - Remove one character of strb
-
-        If spaces are found in stra or strb, this method returns
-            _handlespaces(stra, strb, levenshtein)
-    """
-
-    if ' ' in stra or ' ' in strb:
-        return _handlespaces(stra, strb, levenshtein)
-
-    lena = len(stra)
-    lenb = len(strb)
-    onerowago = None
-    thisrow = range(1, lenb + 1) + [0]
-    for x in xrange(lena):
-        onerowago, thisrow = thisrow, [0] * lenb + [x+1]
-        for y in xrange(lenb):
-            delcost = onerowago[y] + 1
-            addcost = thisrow[y - 1] + 1
-            subcost = onerowago[y - 1] + (stra[x] != strb[y])
-            thisrow[y] = min(delcost, addcost, subcost)
-    return thisrow[lenb - 1]
-
-def _handlespaces(stra, strb, distance, **args):
+### UTILITY FUNCTIONS #########################################################
+def _handlespaces(stra, strb, distance, tokenizer=None, **kwargs):
""" Compute the matrix of distances between all tokens of stra and strb
(with function ``distance``). Extra args are given to the distance
function
@@ -74,21 +47,62 @@
if ' ' not in strb:
strb += ' '

-    toka, tokb = stra.split(' '), strb.split(' ')
+    toka = tokenize(stra, tokenizer)
+    tokb = tokenize(strb, tokenizer)
+    # If not same number of tokens, complete the smallest list with empty strings
+    if len(toka) != len(tokb):
+        mint = toka if len(toka)<len(tokb) else tokb
+        maxt = toka if len(toka)>len(tokb) else tokb
+        mint.extend(['' for i in range(len(maxt)-len(mint))])

listmatrix = []
for i in xrange(len(toka)):
-        listmatrix.append([])
-        for j in xrange(len(tokb)):
-            listmatrix[-1].append(distance(toka[i], tokb[j], **args))
+        listmatrix.append([distance(toka[i], tokb[j], **kwargs) for j in xrange(len(tokb))])
m = matrix(listmatrix)
minlist = [m[i,:].min() for i in xrange(m.shape[0])]
minlist.extend([m[:,i].min() for i in xrange(m.shape[1])])
-
return max(minlist)

-def soundexcode(word, language = 'french'):
+### NUMERICAL DISTANCES #######################################################
+def euclidean(a, b):
+    """ Simple euclidean distance
+    """
+    try:
+        return abs(a - b)
+    except TypeError:
+        return abs(float(a) - float(b))
+
+
+### STRING DISTANCES ##########################################################
+def levenshtein(stra, strb, tokenizer=None):
+    """ Compute the Levenshtein distance between stra and strb.
+
+    The Levenshtein distance is defined as the minimal cost to transform stra
+    into strb, where 3 operators are allowed :
+        - Replace one character of stra into a character of strb
+        - Add one character of strb into stra
+        - Remove one character of strb
+
+        If spaces are found in stra or strb, this method returns
+            _handlespaces(stra, strb, levenshtein)
+    """
+    if ' ' in stra or ' ' in strb:
+        return _handlespaces(stra, strb, levenshtein, tokenizer)
+
+    lenb = len(strb)
+    onerowago = None
+    thisrow = range(1, lenb + 1) + [0]
+    for x in xrange(len(stra)):
+        onerowago, thisrow = thisrow, [0] * lenb + [x+1]
+        for y in xrange(lenb):
+            delcost = onerowago[y] + 1
+            addcost = thisrow[y - 1] + 1
+            subcost = onerowago[y - 1] + (stra[x] != strb[y])
+            thisrow[y] = min(delcost, addcost, subcost)
+    return thisrow[lenb - 1]
+
+def soundexcode(word, language='french'):
""" Return the Soundex code of the word ``word``

@@ -123,7 +137,8 @@
}
else:
raise NotImplementedError('Soundex code is not supported (yet ?) for'
-                                  'this language')
+                                  'this language (%s). '
+                                  'Supported languages are french and english' % language)
word = word.strip().upper()
code = word[0]
#After this ``for`` code is
@@ -146,17 +161,17 @@
###First four letters, completed by zeros
return code[:4] + '0' * (4 - len(code))

-def soundex(stra, strb, language = 'french'):
+def soundex(stra, strb, language='french', tokenizer=None):
""" Return the 1/0 distance between the soundex code of stra and strb.
0 means they have the same code, 1 they don't
"""
if ' ' in stra or ' ' in strb:
-        return _handlespaces(stra, strb, soundex, language = language)
+        return _handlespaces(stra, strb, soundex, tokenizer=tokenizer, language=language)

return 0 if (soundexcode(stra, language) == soundexcode(strb, language)) \
else 1

-def jaccard(stra, strb, tokenizer = None):
+def jaccard(stra, strb, tokenizer=None):
""" Return the jaccard distance between stra and strb, considering the tokens
set of stra and strb. If no tokenizer is given, it uses
alignment.normalize.tokenize's default one.
@@ -167,12 +182,12 @@

seta = set(tokenize(stra, tokenizer))
setb = set(tokenize(strb, tokenizer))
+    return 1.0 - 1.0 * len(seta.intersection(setb)) / len(seta.union(setb))

-    jacc = 1.0 * len(seta.intersection(setb)) / len(seta.union(setb))
-    return 1.0 - jacc

-def temporal(stra, strb, granularity = u'days', language = u'french',
-             dayfirst = True, yearfirst = False):
+### TEMPORAL DISTANCES ########################################################
+def temporal(stra, strb, granularity=u'days', language=u'french',
+             dayfirst=True, yearfirst=False):
""" Return the distance between two strings (read as dates).

``granularity`` can be either ``days`` or ``months`` or ``years``
@@ -205,25 +220,21 @@
(u'Ven', u'Vendredi'),
(u'Sam', u'Samedi'),
(u'Dim', u'Dimanche'),]
-    datea = dateparser.parse(stra, parserinfo = customparserinfo(dayfirst,
-                             yearfirst), fuzzy = True)
-    dateb = dateparser.parse(strb, parserinfo = customparserinfo(dayfirst,
-                             yearfirst), fuzzy = True)
-    diff  = datea - dateb
+    datea = dateparser.parse(stra, parserinfo=customparserinfo(dayfirst,
+                             yearfirst), fuzzy=True)
+    dateb = dateparser.parse(strb, parserinfo=customparserinfo(dayfirst,
+                             yearfirst), fuzzy=True)
+    diff = datea - dateb
if granularity.lower() == 'years':
return abs(diff.days / 365.25)
if granularity.lower() == 'months':
return abs(diff.days / 30.5)
return abs(diff.days)

-def euclidean(a, b):
-    try:
-        return abs(a - b)
-    except TypeError:
-        return abs(float(a) - float(b))

-                 units = 'm'):
+### GEOGRAPHICAL DISTANCES ####################################################
+                 units='m'):
""" Return the geographical distance between two points.

Both points must be tuples (latitude, longitude)```
```--- a/minhashing.py	Mon Nov 12 09:34:36 2012 +0100
+++ b/minhashing.py	Fri Nov 09 13:26:12 2012 +0100
@@ -24,7 +24,7 @@
from scipy.sparse import lil_matrix
from scipy.optimize import bisect

-from alignment.normalize import wordgrams
+from alignment.normalize import iter_wordgrams

def randomhashfunction(zr):
""" Return a random hash function, mapping x in Z to ZR
@@ -82,7 +82,7 @@
for sent in sentences:
row = []
rowdata = []
-            for w in wordgrams(sent, k):
+            for w in iter_wordgrams(sent, k):
row.append(universe.setdefault(w, sizeofuniverse))
if row[-1] == sizeofuniverse:
sizeofuniverse += 1```
```--- a/normalize.py	Mon Nov 12 09:34:36 2012 +0100
+++ b/normalize.py	Fri Nov 09 13:26:12 2012 +0100
@@ -16,18 +16,10 @@
# with this program. If not, see <http://www.gnu.org/licenses/>.

import re
-
from string import punctuation
from warnings import warn
from unicodedata import normalize as _uninormalize

-try:
-    from nltk.tokenize import WordPunctTokenizer as Tokenizer
-except ImportError:
-    class Tokenizer(object):
-        def tokenize(self, string):
-            return string.split(' ')
-

STOPWORDS = set([u'alors', u'au', u'aucuns', u'aussi', u'autre', u'avant',
u'avec', u'avoir', u'bon', u'car', u'ce', u'cela', u'ces', u'ceux', u'chaque',
@@ -63,6 +55,22 @@
u'\xdf': u'ss',   # LATIN SMALL LETTER SHARP S
}

+class Tokenizer(object):
+    """ Simple tokenizer similar to the one in NLTK.
+    """
+    def tokenize(self, string):
+        return [s for s in string.split(' ') if s]
+
+
+class WordTokenizer(Tokenizer):
+    """ Simple punctuation tokenizer similar to the one in NLTK.
+    XXX THIS CANNOT HANDLE UNICODE.
+    """
+    regexp = re.compile(r'\w+|[^\w\s]+')
+    def tokenize(self, string):
+        return [t for t in self.regexp.findall(string) if t]
+
+
def unormalize(ustring, ignorenonascii=None, substitute=None):
"""replace diacritical characters with their corresponding ascii characters

@@ -96,11 +104,11 @@
res.append(replacement)
return u''.join(res)

-def lunormalize(sentence):
+def lunormalize(sentence, ignorenonascii=None, substitute=None):
""" Normalize a sentence (ie remove accents, set to lower, etc) """
-    return unormalize(sentence).lower()
+    return unormalize(sentence, ignorenonascii, substitute).lower()

-def simplify(sentence, lemmas = None, removeStopWords = True):
+def simplify(sentence, lemmas=None, removeStopWords=True):
""" Simplify the given sentence
0) If removeStopWords, then remove the stop words
1) If lemmas are given, the sentence is lemmatized
@@ -110,28 +118,32 @@
if lemmas:
sentence = lemmatized(sentence, lemmas)
sentence = sentence.lower()
-    cleansent = ''
-    for s in sentence:
-        if s not in punctuation:
-            cleansent += s
+    cleansent = ''.join([s for s in sentence if s not in punctuation])

if not removeStopWords:
return cleansent
else:
return ' '.join([w for w in cleansent.split(' ') if w not in STOPWORDS])

-
-def tokenize(sentence, tokenizer = None):
+def tokenize(sentence, tokenizer=None):
""" Tokenize a sentence.
Use ``tokenizer`` if given, else try to use the nltk WordPunctTokenizer,
in case of failure, it just split on spaces.

Anyway, tokenizer must have a ``tokenize()`` method
"""
-    tokenizer = tokenizer or Tokenizer
-    return [w for w in tokenizer().tokenize(sentence)]
+    if not tokenizer and isinstance(sentence, str):
+        tokenizer = WordTokenizer
+    elif not tokenizer and isinstance(sentence, unicode):
+        # XXX Unicode, could not use WordTokenizer
+        if sentence == unormalize(sentence):
+            # This may be still used with the WordTokenizer
+            tokenizer = WordTokenizer
+        else:
+            tokenizer = Tokenizer

-def wordgrams(sentence, k):
+def iter_wordgrams(sentence, k):
""" Generator of k-wordgrams on the given sentence
"""
words = sentence.split(' ')
@@ -141,11 +153,10 @@
""" Return the default lemmas dictionary
"""
-    return dict([line.strip().split('\t')
-                 for line in open(filename)
-                     if len(line.strip().split('\t'))==2])
+    return dict([line.strip().split('\t') for line in open(filename)
+                 if len(line.strip().split('\t'))==2])

-def lemmatized(sentence, lemmas, tokenizer = None):
+def lemmatized(sentence, lemmas, tokenizer=None):
""" Return the lemmatized sentence
"""
tokenized_sent = tokenize(sentence, tokenizer)
@@ -171,14 +182,13 @@
lemma = _words[0]
return lemma

-def roundstr(number, ndigits = 0):
+def roundstr(number, ndigits=0):
"""Return an unicode string of ``number`` rounded to a given precision
in decimal digits (default 0 digits)

If ``number`` is not a float, this method casts it to a float. (An
exception may be raised if it's not possible)
"""
-
return format(round(float(number), ndigits), '0.%df' % ndigits)

def rgxformat(string, regexp, output):```
```--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/data/file2parse	Fri Nov 09 13:26:12 2012 +0100
@@ -0,0 +1,3 @@
+1, house , 12, 19, apple
+2, horse , 21.9, 19, stramberry
+3, flower, 23, 2.17 , cherry```
```--- a/test/test_alignment.py	Mon Nov 12 09:34:36 2012 +0100
+++ b/test/test_alignment.py	Fri Nov 09 13:26:12 2012 +0100
@@ -101,7 +101,7 @@

self.assertEqual(jaccard('bonjour', 'bonjour'), 0.0)
self.assertAlmostEqual(jaccard('boujour', 'bonjour'), 1, 2)
-        self.assertAlmostEqual(jaccard('sacré rubert', 'sacré hubert'), 0.5, 2)
+        self.assertAlmostEqual(jaccard(u'sacré rubert', u'sacré hubert'), 0.667, 2)

#Test symmetry
self.assertEqual(jaccard('orange', 'morange'),
@@ -161,11 +161,13 @@
self.assertEqual(simplify(u"J'aime les frites, les pommes et les" \
u" scoubidous !", self.lemmas),
u"aimer frites pomme scoubidou")
+
def test_tokenize(self):
self.assertEqual(tokenize(u"J'aime les frites !"),
[u'J', u"'", u'aime', u'les', u'frites', u'!',])

def test_lemmatizer(self):
+        self.assertEqual(lemmatized(u'sacré rubert', self.lemmas), u'sacré rubert')
self.assertEqual(lemmatized(u"J'aime les frites !", self.lemmas),
u'je aimer le frite')
self.assertEqual(lemmatized(u", J'aime les frites", self.lemmas),```