wip
author Simon Chabot <simon.chabot@logilab.fr>
Wed, 24 Oct 2012 19:10:54 +0200
changeset 45 9f4669700221
parent 44 905240fb0f8d
child 46 4c2f7553490b
wip
matrix.py
test.py
--- a/matrix.py	Wed Oct 24 15:39:38 2012 +0200
+++ b/matrix.py	Wed Oct 24 19:10:54 2012 +0200
@@ -113,18 +113,8 @@
         if normalized:
             cutoff *= self._maxdist
 
-        row, col = self._matrix.nonzero()
-        rowcol = zip(row, col)
-
-        #Get those that exactly matched
-        allindexes = ((i, j) for i in xrange(self.size[0])
-                                 for j in xrange(self.size[1]))
-        zeros = [index for index in allindexes if index not in rowcol]
-        for (i, j) in zeros:
-            match[i].append((j, 0))
-
-        if cutoff > 0: #If more is wanted, return it too
-            for (i, j) in rowcol:
+        for i in xrange(self._matrix.shape[0]):
+            for j in xrange(self._matrix.shape[1]):
                 if self._matrix[i, j] <= cutoff:
                     if normalized:
                         match[i].append((j, self._matrix[i, j]/self._maxdist))
@@ -165,5 +155,7 @@
     """
     globalmatrix = items[0][0] * Distancematrix(*items[0][1:])
     for item in items[1:]:
-        globalmatrix +=  item[0] * Distancematrix(*item[1:])
+        tmp =  item[0] * Distancematrix(*item[1:])
+        print tmp._maxdist
+        globalmatrix +=  tmp
     return globalmatrix
--- a/test.py	Wed Oct 24 15:39:38 2012 +0200
+++ b/test.py	Wed Oct 24 19:10:54 2012 +0200
@@ -1,5 +1,8 @@
 # -*- coding:utf-8 -*-
 
+import cubes.alignment.distances as d
+from cubes.alignment.dbpedia import dbparse
+
 def dbpediasent(filename, maxind = None, enco = 'unicode_escape'):
     fobj = open(filename)
     fobj.readline()
@@ -18,3 +21,111 @@
         if i in ind:
             print s.encode('utf-8')
             print
+            
+def builtItemsFromData(datafile):
+    """
+        uri;given_name;family_name;birthdate;birthplace;deathdate;deathplace
+    """
+
+    def gettuples(datafile):
+        def none2None(mylist):
+            cleanlist = []
+            for e in mylist:
+                if e == 'None':
+                    cleanlist.append(None)
+                else:
+                    cleanlist.append(e)
+            return cleanlist
+
+        fobj = open(datafile)
+        for line in fobj:
+            line = line.strip().decode('utf-8')
+            yield none2None(line.split(';'))
+
+    fieldsopencat = { 'given' : [],
+                      'family' : [],
+                      'birthdate' : [],
+                      'birthplace' : [],
+                      'deathdate' : [],
+                      'deathplace' : [],
+                      'uri' : []
+                    }
+    fieldsdbpedia = { 'givenName' : [],
+                      'surname' : [],
+                      'birthDate' : [],
+                      'birthPlace' : [],
+                      'deathDate' : [],
+                      'deathPlace' : [],
+                      'uri' : []
+                    }
+
+    for uri, g, f, bd, bp, dd, dp in gettuples(datafile):
+        fieldsopencat['given'].append(g)
+        fieldsopencat['family'].append(f)
+        fieldsopencat['birthdate'].append(bd)
+        fieldsopencat['birthplace'].append(bp)
+        fieldsopencat['deathdate'].append(dd)
+        fieldsopencat['deathplace'].append(dp)
+        fieldsopencat['uri'].append(uri)
+
+    olduri = None
+    for uri, attr, val in dbparse('data/dbpedia_data.nt', 
+                         attributs = set(fieldsdbpedia.keys()), uri = False):
+        maxlen = max([len(v) for v in fieldsdbpedia.values()])
+        if olduri and uri != olduri:
+            for key in fieldsdbpedia.keys():
+                if key == attr or key == 'uri':
+                    continue
+                diff = maxlen - len(fieldsdbpedia[key])
+
+                while diff > 0:
+                    print "missing : ", olduri, key
+                    fieldsdbpedia[key].append(None)
+                    diff -= 1
+        if olduri == uri and maxlen and len(fieldsdbpedia[attr]) == maxlen:
+            continue
+        olduri = uri
+        print "add : ", uri, attr, val
+        fieldsdbpedia[attr].append(val)
+        if not fieldsdbpedia['uri'] or fieldsdbpedia['uri'][-1] != uri:
+            fieldsdbpedia['uri'].append(uri)
+
+
+    for key in fieldsdbpedia.keys():
+        if key == attr or key == 'uri':
+            continue
+        diff = maxlen - len(fieldsdbpedia[key])
+
+        while diff > 0:
+            print "missing : ", olduri, key
+            fieldsdbpedia[key].append(None)
+            diff -= 1
+    items = [
+        (1, fieldsopencat['given'],
+            fieldsdbpedia['givenName'],
+            d.jaccard, 1, {}),
+        (1, fieldsopencat['family'],
+            fieldsdbpedia['surname'],
+            d.jaccard, 1, {}),
+        (20, fieldsopencat['birthdate'],
+              fieldsdbpedia['birthDate'],
+              d.temporal, 20000, {'granularity' : 'months',
+                              'dayfirst' : False,
+                              'yearfirst' : True,
+                              }),
+        (0.1, fieldsopencat['birthplace'],
+               fieldsdbpedia['birthPlace'], 
+               d.jaccard, 10, {}),
+        (20, fieldsopencat['deathdate'],
+              fieldsdbpedia['deathDate'], 
+              d.temporal, 20000, {'granularity' : 'months',
+                              'dayfirst' : False,
+                              'yearfirst' : True,
+                             }),
+        (0.1, fieldsopencat['deathplace'],
+               fieldsdbpedia['deathPlace'],
+               d.jaccard, 10, {}),
+    ]
+
+    return items, fieldsopencat['uri'], fieldsdbpedia['uri'] 
+