[distances] Add soundex code (related #128982)
author Simon Chabot <simon.chabot@logilab.fr>
Wed, 17 Oct 2012 16:42:48 +0200
changeset 2 0b047ac20259
parent 1 f191df5b5d37
child 3 9bfce523ec42
[distances] Add soundex code (related #128982)
distances.py
--- a/distances.py	Wed Oct 17 16:40:51 2012 +0200
+++ b/distances.py	Wed Oct 17 16:42:48 2012 +0200
@@ -22,3 +22,61 @@
             subcost = onerowago[y - 1] + (stra[x] != strb[y])
             thisrow[y] = min(delcost, addcost, subcost)
     return thisrow[lenb - 1]
+
+def _soundex_table(groups):
+    """Invert a ``{digit: letters}`` mapping into ``{letter: digit}``."""
+    return dict((letter, digit)
+                for digit, letters in groups.items()
+                for letter in letters)
+
+# Letter -> digit tables, keyed by lower-cased language name. Letters that are
+# absent from a table (vowels, H, W, Y, anything non-alphabetic) have no code.
+_SOUNDEX_TABLES = {
+    'french': _soundex_table({'1': 'BP', '2': 'CKQ', '3': 'DT', '4': 'L',
+                              '5': 'MN', '6': 'R', '7': 'GJ', '8': 'XZS',
+                              '9': 'FV'}),
+    'english': _soundex_table({'1': 'BFPV', '2': 'CGJKQSXZ', '3': 'DT',
+                               '4': 'L', '5': 'MN', '6': 'R'}),
+}
+
+def soundex(word, language='french'):
+    """Return the four-character Soundex code of ``word``.
+
+    The code is the first letter of the stripped, upper-cased word followed
+    by the digits of the next coded consonants, right-padded with zeros.
+    A letter whose digit equals that of the letter just before it is dropped,
+    H and W being transparent for that comparison; any other uncoded
+    character (vowel, Y, digit, punctuation...) separates two letters, so an
+    equal digit on each side of it is kept twice. See wiki_ for details.
+
+    ``language`` selects the letter table and can be 'french' or 'english'
+    (case-insensitive); anything else raises ``NotImplementedError``.
+    An empty or blank ``word`` has no code and yields ``''``.
+
+    >>> soundex('Robert', 'english')
+    'R163'
+    >>> soundex('Ashcraft', 'english')
+    'A261'
+
+    .. _wiki: https://en.wikipedia.org/wiki/Soundex
+    """
+    table = _SOUNDEX_TABLES.get(language.lower())
+    if table is None:
+        raise NotImplementedError('Soundex code is not supported (yet ?) for '
+                                  'this language')
+
+    word = word.strip().upper()
+    if not word:
+        return ''
+    code = [word[0]]
+    # The first letter is emitted as-is, but its digit still absorbs an
+    # identically-coded neighbour (e.g. the F of "Pfister").
+    previous = table.get(word[0])
+    for letter in word[1:]:
+        if letter in 'HW':
+            continue  # transparent: ``previous`` is left untouched
+        digit = table.get(letter)
+        if digit is not None and digit != previous:
+            code.append(digit)
+        previous = digit  # None after a vowel, so the next consonant counts
+    return ''.join(code)[:4].ljust(4, '0')