Extract the alignment process from the cube to be independent
author Simon Chabot <simon.chabot@logilab.fr>
Tue, 06 Nov 2012 17:37:36 +0100
changeset 71 22c93ceb0163
parent 70 e9de54adb8f7
child 72 547a199eec21
Extract the alignment process from the cube to be independent *** amends 7d398efa1ab38937d1a6aae63ec13fcbfcad1d3f
MANIFEST.in
README
__init__.py
__pkginfo__.py
aligner.py
distances.py
entities.py
hooks.py
matrix.py
migration/postcreate.py
minhashing.py
normalize.py
schema.py
setup.py
test.py
test/test_alignment.py
views.py
--- a/MANIFEST.in	Tue Nov 06 10:51:43 2012 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,5 +0,0 @@
-include *.py
-include */*.py
-recursive-include data *.gif *.png *.ico *.css *.js
-recursive-include i18n *.po
-recursive-include wdoc *
--- a/README	Tue Nov 06 10:51:43 2012 +0100
+++ b/README	Tue Nov 06 17:37:36 2012 +0100
@@ -1,3 +0,0 @@
-Summary
--------
-Cube for aligning data
--- a/__init__.py	Tue Nov 06 10:51:43 2012 +0100
+++ b/__init__.py	Tue Nov 06 17:37:36 2012 +0100
@@ -1,4 +0,0 @@
-"""cubicweb-alignment application package
-
-Cube for aligning data
-"""
--- a/__pkginfo__.py	Tue Nov 06 10:51:43 2012 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,42 +0,0 @@
-# pylint: disable=W0622
-"""cubicweb-alignment application packaging information"""
-
-modname = 'alignment'
-distname = 'cubicweb-alignment'
-
-numversion = (0, 1, 0)
-version = '.'.join(str(num) for num in numversion)
-
-license = 'LGPL'
-author = 'LOGILAB S.A. (Paris, FRANCE)'
-author_email = 'contact@logilab.fr'
-description = 'Cube for aligning data'
-web = 'http://www.cubicweb.org/project/%s' % distname
-
-__depends__ =  {'cubicweb': '>= 3.15.4'}
-__recommends__ = {}
-
-
-from os import listdir as _listdir
-from os.path import join, isdir
-from glob import glob
-
-THIS_CUBE_DIR = join('share', 'cubicweb', 'cubes', modname)
-
-def listdir(dirpath):
-    return [join(dirpath, fname) for fname in _listdir(dirpath)
-            if fname[0] != '.' and not fname.endswith('.pyc')
-            and not fname.endswith('~')
-            and not isdir(join(dirpath, fname))]
-
-data_files = [
-    # common files
-    [THIS_CUBE_DIR, [fname for fname in glob('*.py') if fname != 'setup.py']],
-    ]
-# check for possible extended cube layout
-for dname in ('entities', 'views', 'sobjects', 'hooks', 'schema', 'data', 'wdoc', 'i18n', 'migration'):
-    if isdir(dname):
-        data_files.append([join(THIS_CUBE_DIR, dname), listdir(dname)])
-# Note: here, you'll need to add subdirectories if you want
-# them to be included in the debian package
-
--- a/aligner.py	Tue Nov 06 10:51:43 2012 +0100
+++ b/aligner.py	Tue Nov 06 17:37:36 2012 +0100
@@ -110,20 +110,3 @@
                      dist
                     ))
     return mat, True
-
-if __name__ == '__main__':
-    alignquery = 'Any P, BP ORDERBY(RANDOM()) LIMIT 100 WHERE P is Person, ' \
-                 'P birthplace BP, NOT BP is NULL'
-    targetquery = 'Any GID, N ORDERBY(RANDOM()) LIMIT 1000 WHERE L is Location, ' \
-                  'L name N, L geoid GID'
-
-    lemmas = n.loadlemmas('data/french_lemmas.txt')
-    tr = { 'normalization': [n.simplify],
-           'norm_args': { 'lemmas' : lemmas, 'removeStopWords': False },
-           'distance':  d.levenshtein,
-         }
-
-    align(session.execute(alignquery),
-          session.execute(targetquery),
-          [tr], 0.3, 'alignment_results')
-
--- a/distances.py	Tue Nov 06 10:51:43 2012 +0100
+++ b/distances.py	Tue Nov 06 17:37:36 2012 +0100
@@ -20,7 +20,7 @@
 
 from scipy import matrix
 
-from alignement.normalize import tokenize
+from alignment.normalize import tokenize
 
 def levenshtein(stra, strb):
     """ Compute the Levenshtein distance between stra and strb.
--- a/entities.py	Tue Nov 06 10:51:43 2012 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,18 +0,0 @@
-# -*- coding: utf-8 -*-
-# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-
-"""cubicweb-alignment entity's classes"""
--- a/hooks.py	Tue Nov 06 10:51:43 2012 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,18 +0,0 @@
-# -*- coding: utf-8 -*-
-# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-
-"""cubicweb-alignment specific hooks and operations"""
--- a/matrix.py	Tue Nov 06 10:51:43 2012 +0100
+++ b/matrix.py	Tue Nov 06 17:37:36 2012 +0100
@@ -21,9 +21,6 @@
 from scipy import array, empty
 from scipy import where
 
-from cubes.alignment.distances import (levenshtein, soundex, \
-                                       jaccard, temporal, euclidean)
-
 class Distancematrix(object):
     """ Construct and compute a matrix of distance given a distance function.
 
--- a/migration/postcreate.py	Tue Nov 06 10:51:43 2012 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,26 +0,0 @@
-# -*- coding: utf-8 -*-
-# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-
-"""cubicweb-alignment postcreate script, executed at instance creation time or when
-the cube is added to an existing instance.
-
-You could setup site properties or a workflow here for example.
-"""
-
-# Example of site property change
-#set_property('ui.site-title', "<sitename>")
-
--- a/minhashing.py	Tue Nov 06 10:51:43 2012 +0100
+++ b/minhashing.py	Tue Nov 06 17:37:36 2012 +0100
@@ -17,7 +17,6 @@
 
 import cPickle
 
-from time import time
 from random import randint
 from collections import defaultdict
 
@@ -25,7 +24,7 @@
 from scipy.sparse import lil_matrix
 from scipy.optimize import bisect
 
-from cubes.alignment.normalize import wordgrams
+from alignment.normalize import wordgrams
 
 def randomhashfunction(zr):
     """ Return a random hash function, mapping x in Z to ZR
@@ -189,7 +188,7 @@
         return set(tuple(v) for v in buckets.itervalues() if len(v) > 1)
 
 if __name__ == '__main__':
-    from cubes.alignment.normalize import (loadlemmas, simplify)
+    from alignment.normalize import (loadlemmas, simplify)
 
     sentences = ["j'aime le poisson", "le poisson c'est bon",
                  "je cuis le poisson", "je fais du sport",
--- a/normalize.py	Tue Nov 06 10:51:43 2012 +0100
+++ b/normalize.py	Tue Nov 06 17:37:36 2012 +0100
@@ -18,9 +18,10 @@
 import re
 
 from string import punctuation
+from warnings import warn
+from unicodedata import normalize as _uninormalize
 
 from nltk.tokenize import WordPunctTokenizer
-from logilab.common.textutils import unormalize
 
 
 STOPWORDS = set([u'alors', u'au', u'aucuns', u'aussi', u'autre', u'avant',
@@ -40,6 +41,56 @@
 u'voient', u'vont', u'votre', u'vous', u'vu', u'ça', u'étaient', u'état',
 u'étions', u'été', u'être'])
 
+MANUAL_UNICODE_MAP = {
+    u'\xa1': u'!',    # INVERTED EXCLAMATION MARK
+    u'\u0142': u'l',  # LATIN SMALL LETTER L WITH STROKE
+    u'\u2044': u'/',  # FRACTION SLASH
+    u'\xc6': u'AE',   # LATIN CAPITAL LETTER AE
+    u'\xa9': u'(c)',  # COPYRIGHT SIGN
+    u'\xab': u'"',    # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
+    u'\xe6': u'ae',   # LATIN SMALL LETTER AE
+    u'\xae': u'(r)',  # REGISTERED SIGN
+    u'\u0153': u'oe', # LATIN SMALL LIGATURE OE
+    u'\u0152': u'OE', # LATIN CAPITAL LIGATURE OE
+    u'\xd8': u'O',    # LATIN CAPITAL LETTER O WITH STROKE
+    u'\xf8': u'o',    # LATIN SMALL LETTER O WITH STROKE
+    u'\xbb': u'"',    # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
+    u'\xdf': u'ss',   # LATIN SMALL LETTER SHARP S
+    }
+
+def unormalize(ustring, ignorenonascii=None, substitute=None):
+    """replace diacritical characters with their corresponding ascii characters
+
+    Convert the unicode string to its long normalized form (unicode character
+    will be transform into several characters) and keep the first one only.
+    The normal form KD (NFKD) will apply the compatibility decomposition, i.e.
+    replace all compatibility characters with their equivalents.
+
+    :type substitute: str
+    :param substitute: replacement character to use if decomposition fails
+
+    :see: Another project about ASCII transliterations of Unicode text
+          http://pypi.python.org/pypi/Unidecode
+    """
+    # backward compatibility, ignorenonascii was a boolean
+    if ignorenonascii is not None:
+        warn("ignorenonascii is deprecated, use substitute named parameter instead",
+             DeprecationWarning, stacklevel=2)
+        if ignorenonascii:
+            substitute = ''
+    res = []
+    for letter in ustring[:]:
+        try:
+            replacement = MANUAL_UNICODE_MAP[letter]
+        except KeyError:
+            replacement = _uninormalize('NFKD', letter)[0]
+            if ord(replacement) >= 2 ** 7:
+                if substitute is None:
+                    raise ValueError("can't deal with non-ascii based characters")
+                replacement = substitute
+        res.append(replacement)
+    return u''.join(res)
+
 def lunormalize(sentence):
     """ Normalize a sentence (ie remove accents, set to lower, etc) """
     return unormalize(sentence).lower()
--- a/schema.py	Tue Nov 06 10:51:43 2012 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,18 +0,0 @@
-# -*- coding: utf-8 -*-
-# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-
-"""cubicweb-alignment schema"""
--- a/setup.py	Tue Nov 06 10:51:43 2012 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,200 +0,0 @@
-#!/usr/bin/env python
-# pylint: disable=W0142,W0403,W0404,W0613,W0622,W0622,W0704,R0904,C0103,E0611
-#
-# copyright 2003-2010 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr/ -- mailto:contact@logilab.fr
-#
-# This file is part of CubicWeb tag cube.
-#
-# CubicWeb is free software: you can redistribute it and/or modify it under the
-# terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# CubicWeb is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with CubicWeb.  If not, see <http://www.gnu.org/licenses/>.
-"""Generic Setup script, takes package info from __pkginfo__.py file
-"""
-__docformat__ = "restructuredtext en"
-
-import os
-import sys
-import shutil
-from os.path import isdir, exists, join, walk
-
-try:
-    if os.environ.get('NO_SETUPTOOLS'):
-        raise ImportError() # do as there is no setuptools
-    from setuptools import setup
-    from setuptools.command import install_lib
-    USE_SETUPTOOLS = True
-except ImportError:
-    from distutils.core import setup
-    from distutils.command import install_lib
-    USE_SETUPTOOLS = False
-from distutils.command import install_data
-
-# import required features
-from __pkginfo__ import modname, version, license, description, web, \
-     author, author_email
-
-if exists('README'):
-    long_description = file('README').read()
-else:
-    long_description = ''
-
-# import optional features
-import __pkginfo__
-if USE_SETUPTOOLS:
-    requires = {}
-    for entry in ("__depends__",): # "__recommends__"):
-        requires.update(getattr(__pkginfo__, entry, {}))
-    install_requires = [("%s %s" % (d, v and v or "")).strip()
-                       for d, v in requires.iteritems()]
-else:
-    install_requires = []
-
-distname = getattr(__pkginfo__, 'distname', modname)
-scripts = getattr(__pkginfo__, 'scripts', ())
-include_dirs = getattr(__pkginfo__, 'include_dirs', ())
-data_files = getattr(__pkginfo__, 'data_files', None)
-ext_modules = getattr(__pkginfo__, 'ext_modules', None)
-dependency_links = getattr(__pkginfo__, 'dependency_links', ())
-
-BASE_BLACKLIST = ('CVS', '.svn', '.hg', 'debian', 'dist', 'build')
-IGNORED_EXTENSIONS = ('.pyc', '.pyo', '.elc', '~')
-
-
-def ensure_scripts(linux_scripts):
-    """
-    Creates the proper script names required for each platform
-    (taken from 4Suite)
-    """
-    from distutils import util
-    if util.get_platform()[:3] == 'win':
-        scripts_ = [script + '.bat' for script in linux_scripts]
-    else:
-        scripts_ = linux_scripts
-    return scripts_
-
-def export(from_dir, to_dir,
-           blacklist=BASE_BLACKLIST,
-           ignore_ext=IGNORED_EXTENSIONS,
-           verbose=True):
-    """make a mirror of from_dir in to_dir, omitting directories and files
-    listed in the black list
-    """
-    def make_mirror(arg, directory, fnames):
-        """walk handler"""
-        for norecurs in blacklist:
-            try:
-                fnames.remove(norecurs)
-            except ValueError:
-                pass
-        for filename in fnames:
-            # don't include binary files
-            if filename[-4:] in ignore_ext:
-                continue
-            if filename[-1] == '~':
-                continue
-            src = join(directory, filename)
-            dest = to_dir + src[len(from_dir):]
-            if verbose:
-                sys.stderr.write('%s -> %s\n' % (src, dest))
-            if os.path.isdir(src):
-                if not exists(dest):
-                    os.mkdir(dest)
-            else:
-                if exists(dest):
-                    os.remove(dest)
-                shutil.copy2(src, dest)
-    try:
-        os.mkdir(to_dir)
-    except OSError, ex:
-        # file exists ?
-        import errno
-        if ex.errno != errno.EEXIST:
-            raise
-    walk(from_dir, make_mirror, None)
-
-
-class MyInstallLib(install_lib.install_lib):
-    """extend install_lib command to handle  package __init__.py and
-    include_dirs variable if necessary
-    """
-    def run(self):
-        """overridden from install_lib class"""
-        install_lib.install_lib.run(self)
-        # manually install included directories if any
-        if include_dirs:
-            base = modname
-            for directory in include_dirs:
-                dest = join(self.install_dir, base, directory)
-                export(directory, dest, verbose=False)
-
-# re-enable copying data files in sys.prefix
-old_install_data = install_data.install_data
-if USE_SETUPTOOLS:
-    # overwrite InstallData to use sys.prefix instead of the egg directory
-    class MyInstallData(old_install_data):
-        """A class that manages data files installation"""
-        def run(self):
-            _old_install_dir = self.install_dir
-            if self.install_dir.endswith('egg'):
-                self.install_dir = sys.prefix
-            old_install_data.run(self)
-            self.install_dir = _old_install_dir
-    try:
-        import setuptools.command.easy_install # only if easy_install avaible
-        # monkey patch: Crack SandboxViolation verification
-        from setuptools.sandbox import DirectorySandbox as DS
-        old_ok = DS._ok
-        def _ok(self, path):
-            """Return True if ``path`` can be written during installation."""
-            out = old_ok(self, path) # here for side effect from setuptools
-            realpath = os.path.normcase(os.path.realpath(path))
-            allowed_path = os.path.normcase(sys.prefix)
-            if realpath.startswith(allowed_path):
-                out = True
-            return out
-        DS._ok = _ok
-    except ImportError:
-        pass
-
-def install(**kwargs):
-    """setup entry point"""
-    if USE_SETUPTOOLS:
-        if '--force-manifest' in sys.argv:
-            sys.argv.remove('--force-manifest')
-    # install-layout option was introduced in 2.5.3-1~exp1
-    elif sys.version_info < (2, 5, 4) and '--install-layout=deb' in sys.argv:
-        sys.argv.remove('--install-layout=deb')
-    cmdclass = {'install_lib': MyInstallLib}
-    if USE_SETUPTOOLS:
-        kwargs['install_requires'] = install_requires
-        kwargs['dependency_links'] = dependency_links
-        kwargs['zip_safe'] = False
-        cmdclass['install_data'] = MyInstallData
-
-    return setup(name = distname,
-                 version = version,
-                 license = license,
-                 description = description,
-                 long_description = long_description,
-                 author = author,
-                 author_email = author_email,
-                 url = web,
-                 scripts = ensure_scripts(scripts),
-                 data_files = data_files,
-                 ext_modules = ext_modules,
-                 cmdclass = cmdclass,
-                 **kwargs
-                 )
-
-if __name__ == '__main__' :
-    install()
--- a/test.py	Tue Nov 06 10:51:43 2012 +0100
+++ b/test.py	Tue Nov 06 17:37:36 2012 +0100
@@ -1,10 +1,8 @@
 # -*- coding:utf-8 -*-
 
-import cubes.alignment.distances as d
+import alignment.distances as d
 import rdflib
 
-from cubes.alignment.dbpedia import dbparse
-
 def dbpediasent(filename, maxind = None, enco = 'unicode_escape'):
     fobj = open(filename)
     fobj.readline()
--- a/test/test_alignment.py	Tue Nov 06 10:51:43 2012 +0100
+++ b/test/test_alignment.py	Tue Nov 06 17:37:36 2012 +0100
@@ -43,13 +43,13 @@
 import random
 random.seed(42) ### Make sure tests are repeatable
 
-from cubes.alignment.distances import (levenshtein, soundex, soundexcode,   \
-                                       jaccard, temporal, euclidean,        \
-                                       geographical)
-from cubes.alignment.normalize import (lunormalize, loadlemmas, lemmatized, \
-                                       roundstr, rgxformat, tokenize, simplify)
-from cubes.alignment.matrix import Distancematrix
-from cubes.alignment.minhashing import Minlsh
+from alignment.distances import (levenshtein, soundex, soundexcode,   \
+                                 jaccard, temporal, euclidean,        \
+                                 geographical)
+from alignment.normalize import (lunormalize, loadlemmas, lemmatized, \
+                                 roundstr, rgxformat, tokenize, simplify)
+from alignment.matrix import Distancematrix
+from alignment.minhashing import Minlsh
 
 class DistancesTest(unittest2.TestCase):
     def test_levenshtein(self):
@@ -147,7 +147,7 @@
 
 class NormalizerTestCase(unittest2.TestCase):
     def setUp(self):
-        self.lemmas = loadlemmas('../data/french_lemmas.txt')
+        self.lemmas = loadlemmas('data/french_lemmas.txt')
 
     def test_unormalize(self):
         self.assertEqual(lunormalize(u'bépoèàÀêùï'),
@@ -231,7 +231,7 @@
                       "pour la santé, faîtes du sport"
                     ]
         minlsh = Minlsh()
-        lemmas = loadlemmas('../data/french_lemmas.txt')
+        lemmas = loadlemmas('data/french_lemmas.txt')
         minlsh.train((simplify(s, lemmas) for s in sentences), 1, 200)
 
         self.assertEqual(minlsh.findsimilarsentences(0.65), set([(0, 1), (2, 4)]))
--- a/views.py	Tue Nov 06 10:51:43 2012 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,18 +0,0 @@
-# -*- coding: utf-8 -*-
-# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-
-"""cubicweb-alignment views/forms/actions/components for web ui"""