[aligner] Set the parsefile function into aligner module
author Simon Chabot <simon.chabot@logilab.fr>
Tue, 06 Nov 2012 16:27:12 +0100
changeset 73 0ada0a18de12
parent 72 547a199eec21
child 74 20ef4e9c9f26
[aligner] Set the parsefile function into aligner module
aligner.py
demo.py
test/test_alignment.py
--- a/aligner.py	Tue Nov 06 14:05:31 2012 +0100
+++ b/aligner.py	Tue Nov 06 16:27:12 2012 +0100
@@ -17,6 +17,8 @@
 
 from os.path import exists as fileexists
 
+import csv
+
 import alignment.distances as d
 import alignment.normalize as n
 import alignment.matrix as m
@@ -110,3 +112,53 @@
                      dist
                     ))
     return mat, True
+
+def parsefile(filename, indexes=[], nbmax=None, delimiter='\t'):
+    """ Parse the file (``nbmax`` line as maximum if given). Each
+        line is split according to ``delimiter`` and ``indexes`` are kept
+
+        eg : The file is :
+                1, house, 12, 19, apple
+                2, horse, 21.9, 19, stramberry
+                3, flower, 23, 2.17, cherry
+
+            data = parsefile('myfile', [0, (2, 3), 4, 1], delimiter=',')
+
+            The result will be :
+            data = [[1, (12,   19), 'apple', 'house'],
+                    [2, (21.9, 19), 'stramberry', 'horse'],
+                    [3, (23,   2.17), 'cherry', 'flower']]
+
+    """
+    def str2number(word):
+        try:
+            return int(word)
+        except ValueError:
+            try:
+                return float(word)
+            except ValueError:
+                return word
+
+    result = []
+    with open(filename, 'r') as csvfile:
+        reader = csv.reader(csvfile, delimiter=delimiter)
+        for ind, row in enumerate(reader):
+            data = []
+            if nbmax and ind > nbmax:
+                break
+            row = [str2number(r.strip()) for r in row]
+            if not indexes:
+                data = row
+            else:
+                for ind in indexes:
+                    if isinstance(ind, tuple):
+                        data.append(tuple([row[i] for i in ind]))
+                        if '' in data[-1]:
+                            data[-1] = None
+                    elif row[ind]:
+                        data.append(row[ind])
+                    else:
+                        data.append(None)
+
+            result.append(data)
+    return result
--- a/demo.py	Tue Nov 06 14:05:31 2012 +0100
+++ b/demo.py	Tue Nov 06 16:27:12 2012 +0100
@@ -3,36 +3,7 @@
 
 import alignment.distances as d
 import alignment.normalize as n
-from alignment.aligner import align
-
-
-def parsefile(filename, indexes = [], nbmax = None, fielddelimiter = '\t'):
-    """ Read filename line by line, (``nbmax`` line as maximum if given). Each
-        line is splitted according ``fielddelimiter`` and keep ``indexes``
-    """
-    result = []
-    with open(filename) as fobj:
-        for ind, line in enumerate(fobj):
-            data = []
-            if nbmax and ind > nbmax:
-                break
-            line = line.strip().decode('utf-8')
-            line = line.split(fielddelimiter)
-            if not indexes:
-                data = line
-            else:
-                for ind in indexes:
-                    try:
-                        if isinstance(ind, tuple):
-                            data.append(tuple([line[i] for i in ind]))
-                        else:
-                            data.append(line[ind])
-                    except IndexError:
-                        data.append(None)
-            result.append(data)
-    return result
-
-
+from alignment.aligner import align, parsefile
 
 if __name__ == '__main__':
     targetset = parsefile('data/FR.txt', indexes = [0, 1, (4, 5)], nbmax = 2000)
--- a/test/test_alignment.py	Tue Nov 06 14:05:31 2012 +0100
+++ b/test/test_alignment.py	Tue Nov 06 16:27:12 2012 +0100
@@ -50,6 +50,7 @@
                                  roundstr, rgxformat, tokenize, simplify)
 from alignment.matrix import Distancematrix
 from alignment.minhashing import Minlsh
+from alignment.aligner import parsefile
 
 class DistancesTest(unittest2.TestCase):
     def test_levenshtein(self):
@@ -236,5 +237,12 @@
 
         self.assertEqual(minlsh.findsimilarsentences(0.65), set([(0, 1), (2, 4)]))
 
+class AlignerTestCase(unittest2.TestCase):
+    def test_parser(self):
+        data = parsefile('data/file2parse', [0, (2, 3), 4, 1], delimiter=',')
+        self.assertEqual(data, [[1, (12, 19), 'apple', 'house'],
+                                [2, (21.9, 19), 'stramberry', 'horse'],
+                                [3, (23, 2.17), 'cherry', 'flower']])
+
 if __name__ == '__main__':
     unittest2.main()