Respect (or try to respect) pep8
author: Simon Chabot <simon.chabot@logilab.fr>
Thu, 15 Nov 2012 14:42:05 +0100
changeset 139 f29c1b937abb
parent 138 1e3d51f347d5
child 140 7c5fbad69680
Respect (or try to respect) pep8
aligner.py
dataio.py
demo.py
distances.py
matrix.py
minhashing.py
--- a/aligner.py	Thu Nov 15 09:38:52 2012 +0100
+++ b/aligner.py	Thu Nov 15 14:42:05 2012 +0100
@@ -171,12 +171,12 @@
         that contains the treatments to do on the different attributs.
         Each dictionnary is built as the following:
 
-            treatment = { 'normalization': [f1, f2, f3],
-                          'norm_params': { 'arg1': arg01, 'arg2': arg02},
-                          'metric': d1,
-                          'metric_params': { 'arg1': arg11 },
-                          'weighting': w,
-                          'matrix_normalize': True
+            treatment = {'normalization': [f1, f2, f3],
+                         'norm_params': {'arg1': arg01, 'arg2': arg02},
+                         'metric': d1,
+                         'metric_params': {'arg1': arg11},
+                         'weighting': w,
+                         'matrix_normalize': True
                         }
 
             `normalization` is the list of functions called to normalize the
--- a/dataio.py	Thu Nov 15 09:38:52 2012 +0100
+++ b/dataio.py	Thu Nov 15 14:42:05 2012 +0100
@@ -35,7 +35,7 @@
                 return data.decode(encoding)
             return data
 
-def sparqlquery(endpoint, query, indexes=[]):
+def sparqlquery(endpoint, query, indexes=None):
     """ Run the sparql query on the given endpoint, and wrap the items in the
     indexes form. If indexes is empty, keep raw output"""
 
@@ -47,6 +47,7 @@
     rawresults = sparql.query().convert()
     labels = rawresults['head']['vars']
     results = []
+    indexes = indexes or []
 
     for raw in rawresults["results"]["bindings"]:
         data = []
@@ -61,7 +62,7 @@
         results.append(data)
     return results
 
-def parsefile(filename, indexes=[], nbmax=None, delimiter='\t',
+def parsefile(filename, indexes=None, nbmax=None, delimiter='\t',
               encoding='utf-8', field_size_limit=None):
     """ Parse the file (read ``nbmax`` line at maximum if given). Each
         line is splitted according ``delimiter`` and only ``indexes`` are kept
@@ -90,6 +91,7 @@
 
 
     result = []
+    indexes = indexes or []
     for ind, row in enumerate(formatedoutput(filename)):
         data = []
         if nbmax and ind > nbmax:
--- a/demo.py	Thu Nov 15 09:38:52 2012 +0100
+++ b/demo.py	Thu Nov 15 14:42:05 2012 +0100
@@ -26,18 +26,18 @@
     """
 
     targetset = sparqlquery('http://dbpedia.org/sparql', query)
-    alignset = parsefile(path.join(DEMODIR, 'demo','prixgoncourt'), indexes = [1, 1])
+    alignset = parsefile(path.join(DEMODIR, 'demo','prixgoncourt'), indexes=[1, 1])
 
     def removeparenthesis(string):
         if '(' in string:
             return string[:string.index('(')]
         return string
 
-    tr_name = { 'normalization' : [removeparenthesis, n.simplify],
-                'metric': d.levenshtein
+    tr_name = {'normalization': [removeparenthesis, n.simplify],
+               'metric': d.levenshtein
               }
 
-    treatments = { 1: tr_name }
+    treatments = {1: tr_name }
 
     dmatrix, hasmatched = align(alignset, targetset, 0.4, treatments,
                                 'demo0_results')
@@ -53,25 +53,25 @@
     # position (longitude, latitude)
     # ``nbmax`` is the number of locations to load
 
-    targetset = parsefile(path.join(DEMODIR, 'demo', 'FR.txt'), indexes = [0, 1, (4, 5)],
+    targetset = parsefile(path.join(DEMODIR, 'demo', 'FR.txt'), indexes=[0, 1, (4, 5)],
                           nbmax = 2000)
     alignset = parsefile(path.join(DEMODIR, 'demo', 'frenchbnf'),
-                         indexes = [0, 2, (14, 12)], nbmax = 1000)
+                         indexes = [0, 2, (14, 12)], nbmax=1000)
 
 
     # Let's define the treatments to apply on the location's name
-    tr_name = { 'normalization': [n.simplify], # Simply all the names (remove
-                                               #   punctuation, lower case, etc)
-                'metric': d.levenshtein,       # Use the levenshtein distance
-                'weighting': 1                 # Use 1 a name-distance matrix
-                                               #   weighting coefficient
+    tr_name = {'normalization': [n.simplify], # Simply all the names (remove
+                                              #   punctuation, lower case, etc)
+               'metric': d.levenshtein,       # Use the levenshtein distance
+               'weighting': 1                 # Use 1 a name-distance matrix
+                                              #   weighting coefficient
               }
-    tr_geo = { 'normalization': [],              # No normalization needed
-               'metric': d.geographical,         # Use the geographical distance
-               'metric_params': {'units' : 'km'},# Arguments given the
-                                                 #   distance function. Here,
-                                                 #   the unit to use
-               'weighting': 1
+    tr_geo = {'normalization': [],              # No normalization needed
+              'metric': d.geographical,         # Use the geographical distance
+              'metric_params': {'units': 'km'},# Arguments given the
+                                                #   distance function. Here,
+                                                #   the unit to use
+              'weighting': 1
              }
 
     treatments = {1: tr_name, 2: tr_geo}
@@ -100,13 +100,13 @@
                                mode='minibatch')
 
     # Let's define the treatments to apply on the location's name
-    tr_name = { 'normalization': [lambda x: str(x),#Some names are casted to
-                                                   #int/float, just correct it
-                                  n.simplify], # Simply all the names (remove
-                                               #   punctuation, lower case, etc)
-                'metric': d.levenshtein,       # Use the levenshtein distance
-                'weighting': 1                 # Use 1 a name-distance matrix
-                                               #   weighting coefficient
+    tr_name = {'normalization': [lambda x: str(x),#Some names are casted to
+                                                  #int/float, just correct it
+                                 n.simplify], # Simply all the names (remove
+                                              #   punctuation, lower case, etc)
+               'metric': d.levenshtein,       # Use the levenshtein distance
+               'weighting': 1                 # Use 1 a name-distance matrix
+                                              #   weighting coefficient
               }
 
     treatments = {1: tr_name}
@@ -141,4 +141,4 @@
         demo_2()
 
     print "Demo terminated"
-    print "Took %d min" % ((time() - t) / 60)
+    print "Took %d min" % ((time() - t)/60)
--- a/distances.py	Thu Nov 15 09:38:52 2012 +0100
+++ b/distances.py	Thu Nov 15 14:42:05 2012 +0100
@@ -94,7 +94,7 @@
     onerowago = None
     thisrow = range(1, lenb + 1) + [0]
     for x in xrange(len(stra)):
-        onerowago, thisrow = thisrow, [0] * lenb + [x+1]
+        onerowago, thisrow = thisrow, [0]*lenb + [x+1]
         for y in xrange(lenb):
             delcost = onerowago[y] + 1
             addcost = thisrow[y - 1] + 1
@@ -111,29 +111,29 @@
         .:: wiki_ : https://en.wikipedia.org/wiki/Soundex
 
         If spaces are found in stra or strb, this method returns
-            _handlespaces(stra, strb), soundex, language = language)
+            _handlespaces(stra, strb), soundex, language=language)
     """
 
     vowels = 'AEHIOUWY'
     if language.lower() == 'french' :
-        consonnantscode = { 'B' : '1', 'P' : '1',
-                            'C' : '2', 'K' : '2', 'Q' : '2',
-                            'D' : '3', 'T' : '3',
-                            'L' : '4',
-                            'M' : '5', 'N' : '5',
-                            'R' : '6',
-                            'G' : '7', 'J' : '7',
-                            'X' : '8', 'Z' : '8', 'S' : '8',
-                            'F' : '9', 'V' : '9'
+        consonnantscode = {'B': '1', 'P': '1',
+                           'C': '2', 'K': '2', 'Q': '2',
+                           'D': '3', 'T': '3',
+                           'L': '4',
+                           'M': '5', 'N': '5',
+                           'R': '6',
+                           'G': '7', 'J': '7',
+                           'X': '8', 'Z': '8', 'S': '8',
+                           'F': '9', 'V': '9'
                           }
     elif language.lower() == 'english':
-        consonnantscode = { 'B' : '1', 'F' : '1', 'P' : '1', 'V' : '1',
-                            'C' : '2', 'G' : '2', 'J' : '2', 'K' : '2',
-                            'Q' : '2', 'S' : '2', 'X' : '2', 'Z' : '2',
-                            'D' : '3', 'T' : '3',
-                            'L' : '4',
-                            'M' : '5', 'N' : '5',
-                            'R' : '6'
+        consonnantscode = {'B': '1', 'F': '1', 'P': '1', 'V': '1',
+                           'C': '2', 'G': '2', 'J': '2', 'K': '2',
+                           'Q': '2', 'S': '2', 'X': '2', 'Z': '2',
+                           'D': '3', 'T': '3',
+                           'L': '4',
+                           'M': '5', 'N': '5',
+                           'R': '6'
                           }
     else:
         raise NotImplementedError('Soundex code is not supported (yet ?) for'
@@ -159,7 +159,7 @@
     #Replace according to the codes
     code = code[0] + ''.join([consonnantscode[c] for c in code[1:]])
     ###First four letters, completed by zeros
-    return code[:4] + '0' * (4 - len(code))
+    return code[:4] + '0'*(4 - len(code))
 
 def soundex(stra, strb, language='french', tokenizer=None):
     """ Return the 1/0 distance between the soundex code of stra and strb.
@@ -176,13 +176,13 @@
         set of stra and strb. If no tokenizer is given, it use if
         alignement.normalize.tokenize's default one.
 
-        J(A, B) = (A \cap B) / (A \cup B)
+        J(A, B) = (A \cap B)/(A \cup B)
         d(A, B) = 1 - J(A, B)
     """
 
     seta = set(tokenize(stra, tokenizer))
     setb = set(tokenize(strb, tokenizer))
-    return 1.0 - 1.0 * len(seta.intersection(setb)) / len(seta.union(setb))
+    return 1.0 - 1.0*len(seta.intersection(setb))/len(seta.union(setb))
 
 
 ### TEMPORAL DISTANCES ########################################################
@@ -202,33 +202,34 @@
 
     class customparserinfo(dateparser.parserinfo):
         if language.lower() == u'french':
-            HMS      = [(u'h', u'heure', u'heures'),
-                        (u'm', u'minute', u'minutes'),
+            HMS = [(u'h', u'heure', u'heures'),
+                   (u'm', u'minute', u'minutes'),
                         (u's', u'seconde', u'seconde'),]
-            JUMP     = [u' ', u'.', u',', u';', u'-', u'/', u"'",
-                        u'a', u'le', u'et', u'er']
-            MONTHS   = [(u'Jan', u'Janvier'), (u'Fev', u'Fevrier'), (u'Mar', u'Mars'),
-                       (u'Avr', u'Avril'), (u'Mai', u'Mai'), (u'Jun', u'Juin'),
-                       (u'Jui', u'Juillet'), (u'Aou', u'Aout'),
-                       (u'Sep', u'Septembre'), (u'Oct', u'Octobre'),
-                       (u'Nov', u'Novembre'), (u'Dec', u'Decembre'),]
-            PERTAIN  = [u'de']
+            JUMP = [u' ', u'.', u',', u';', u'-', u'/', u"'",
+                   u'a', u'le', u'et', u'er']
+            MONTHS = [(u'Jan', u'Janvier'), (u'Fev', u'Fevrier'),
+                      (u'Mar', u'Mars'), (u'Avr', u'Avril'), (u'Mai', u'Mai'),
+                      (u'Jun', u'Juin'), (u'Jui', u'Juillet'),
+                      (u'Aou', u'Aout'), (u'Sep', u'Septembre'),
+                      (u'Oct', u'Octobre'), (u'Nov', u'Novembre'),
+                      (u'Dec', u'Decembre')]
+            PERTAIN = [u'de']
             WEEKDAYS = [(u'Lun', u'Lundi'),
                         (u'Mar', u'Mardi'),
                         (u'Mer', u'Mercredi'),
                         (u'Jeu', u'Jeudi'),
                         (u'Ven', u'Vendredi'),
                         (u'Sam', u'Samedi'),
-                        (u'Dim', u'Dimanche'),]
+                        (u'Dim', u'Dimanche')]
     datea = dateparser.parse(stra, parserinfo=customparserinfo(dayfirst,
                              yearfirst), fuzzy=True)
     dateb = dateparser.parse(strb, parserinfo=customparserinfo(dayfirst,
                              yearfirst), fuzzy=True)
     diff = datea - dateb
     if granularity.lower() == 'years':
-        return abs(diff.days / 365.25)
+        return abs(diff.days/365.25)
     if granularity.lower() == 'months':
-        return abs(diff.days / 30.5)
+        return abs(diff.days/30.5)
     return abs(diff.days)
 
 
@@ -254,9 +255,9 @@
     meanlat = (pointa[0] + pointb[0])/2.0
 
     if not in_radians:
-        difflat *= pi / 180.0
-        difflong *= pi / 180.0
-        meanlat *= pi / 180.0
+        difflat *= pi/180.0
+        difflong *= pi/180.0
+        meanlat *= pi/180.0
 
     coef = 1. if units == 'm' else 0.001
-    return coef * planetRadius * sqrt(difflat**2 + (cos(meanlat) * difflong)**2)
+    return coef*planetRadius*sqrt(difflat**2 + (cos(meanlat)*difflong)**2)
--- a/matrix.py	Thu Nov 15 09:38:52 2012 +0100
+++ b/matrix.py	Thu Nov 15 14:42:05 2012 +0100
@@ -54,7 +54,7 @@
             if X[i] and X[j]:
                 d = metric(X[i], X[j], **(metric_params or {}))
                 if matrix_normalized:
-                    d = 1 - (1.0 / (1.0 + d))
+                    d = 1 - (1.0/(1.0 + d))
             values.append(d)
     return values
 
@@ -70,11 +70,11 @@
             if X[i] and Y[j]:
                 d = metric(X[i], Y[j], **(metric_params or {}))
                 if matrix_normalized:
-                    d = 1 - (1.0 / (1.0 + d))
+                    d = 1 - (1.0/(1.0 + d))
             distmatrix[i, j] = d
     return distmatrix
 
-def matched(distmatrix, cutoff = 0, normalized = False):
+def matched(distmatrix, cutoff=0, normalized=False):
     """ Return the matched elements within a dictionnary,
     each key being the indice from X, and the corresponding
     values being a list of couple (indice from Y, distance)
--- a/minhashing.py	Thu Nov 15 09:38:52 2012 +0100
+++ b/minhashing.py	Thu Nov 15 14:42:05 2012 +0100
@@ -36,7 +36,7 @@
     b = randint(1, zr - 1)
 
     def hashfunc(x):
-        return ((a * x + b) % zr)
+        return ((a*x + b)%zr)
 
     return hashfunc
 
@@ -141,10 +141,10 @@
 
         ### t ~ (1/b)^(1/r), where t is the threshold, b the number of
         ### bands, and r the number of rows per band. And nbrows (the length
-        ### of the matrix is nbrows = b * r, so t ~ (r / L)^(1 / r). So, let's
-        ### find the root of f(x) = (x / L)^(1/r) - t.
+        ### of the matrix is nbrows = b*r, so t ~ (r/L)^(1/r). So, let's
+        ### find the root of f(x) = (x/L)^(1/r) - t.
         def f(x):
-            y = pow(x / nbrows, 1. /x) - threshold
+            y = pow(x/nbrows, 1. /x) - threshold
             return y
 
         ## Solve f(x) = 0, with x having values in [1, nbrows]
@@ -186,8 +186,7 @@
     from scipy import polyfit
 
     sentences = [s[0] for s in parsefile('data/US.txt', indexes=[1],
-                               field_size_limit=1000000000) if s[0]]
-    print sentences[:10]
+                               field_size_limit=1000000000, nbmax=None) if s[0]]
 
 
     lemmas = loadlemmas('data/french_lemmas.txt')
@@ -196,7 +195,7 @@
     def compute_complexite(size):
         print "%d%%" % size
         t0 = time()
-        length = int(size * len(sentences) / 100)
+        length = int(size*len(sentences)/100)
         minlsh.train((simplify(s, lemmas) for s in sentences[:length]), 1, 100)
         t1 = time()
         r = minlsh.predict(0.3)