aligner.py
author Simon Chabot <simon.chabot@logilab.fr>
Tue, 06 Nov 2012 10:49:10 +0100
changeset 69 477442625b03
parent 65 alignment.py@5efd92896bbb
child 71 22c93ceb0163
permissions -rw-r--r--
Correct some spelling

# -*- coding:utf-8 -*-
# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
# contact http://www.logilab.fr -- mailto:contact@logilab.fr
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 2.1 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License along
# with this program. If not, see <http://www.gnu.org/licenses/>.

from os.path import exists as fileexists

import alignment.distances as d
import alignment.normalize as n
import alignment.matrix as m


def _acceptedargs(func, available):
    """ Return the entries of `available` whose key is a parameter of `func`

        Only the declared parameters of `func` are considered. The code object
        lists them first in `co_varnames`, followed by the other local
        variables of the function, so the tuple is cut after the parameters.
        Otherwise a key of `available` matching a mere local variable of `func`
        would be passed as an unexpected keyword argument.
    """
    code = func.__code__
    # `co_kwonlyargcount` only exists on interpreters having keyword-only
    # arguments; they directly follow the positional ones in `co_varnames`
    nbparams = code.co_argcount + getattr(code, 'co_kwonlyargcount', 0)
    return dict((name, available[name])
                for name in code.co_varnames[:nbparams] if name in available)


def align(alignset, targetset, treatments, threshold, resultfile):
    """ Try to align the items of `alignset` onto the ones of `targetset`

        `alignset` and `targetset` are the sets to align. Each set contains
        lists where the first column is the identifier of the item, and the
        others are the attributes to align. (Note that the order is
        important!) Both must have the same number of columns. The rows are
        normalized in place, so they must support item assignment.

        `treatments` is a list of dictionaries. Each dictionary contains the
        treatments to do on one attribute. The first dictionary is for the
        first attribute (not the identifier!), the second for the second, etc.
        Each dictionary is built as the following:

            treatment = { 'normalization': [f1, f2, f3],
                          'norm_args': { 'arg1': arg01, 'arg2': arg02},
                          'distance': d1,
                          'distance_args': { 'arg1': arg11 },
                          'weighting': w,
                          'matrix_normalize': True
                        }

            `normalization` is the list of functions called to normalize the
            given attribute (in order). Each function is called with the
            current value as first argument, plus the entries of `norm_args`
            whose name matches one of its parameters. Empty values are left
            untouched.

            `distance`, `distance_args` and `matrix_normalize` are handed
            over, untouched, to the alignment matrix builder.

            `weighting` is the weighting of the current attribute in regard to
            the others

            Missing optional keys are added to the given dictionaries with
            their default value: `norm_args` and `distance_args` are empty,
            `weighting` is 1 and `matrix_normalize` is True.

        `threshold` is given to the `matched` method of the global alignment
        matrix to select the matching pairs.

        `resultfile` is the name of the output csv. It is created, with a
        header line, if it does not exist; otherwise the results are appended.

        Return a couple `(matrix, found)` where `matrix` is the global
        alignment matrix and `found` tells whether something has been matched
        (and thus written into `resultfile`).
    """

    def normalizerset(rset):
        """ Apply all the normalization functions to the given rset, in place
        """
        for row in rset:
            for ind, attribut in enumerate(row[1:]):
                treat = treatments[ind]
                if not attribut:
                    continue
                for f in treat['normalization']:
                    # Only give to f the arguments it is able to receive
                    attribut = f(attribut,
                                 **_acceptedargs(f, treat['norm_args']))
                # +1 because the first column is the identifier
                row[ind + 1] = attribut
        return rset

    ## Just to be certain we have all the keys
    for t in treatments:
        t.setdefault('norm_args', {})
        t.setdefault('distance_args', {})
        t.setdefault('weighting', 1)
        t.setdefault('matrix_normalize', True)

    ralignset = normalizerset(alignset)
    rtargetset = normalizerset(targetset)

    # One item per attribute: its weighting, its column in both sets, and
    # the way to compute the distances between the values of these columns
    items = [(tr['weighting'],
              [row[ind + 1] for row in ralignset],
              [row[ind + 1] for row in rtargetset],
              tr['distance'],
              tr['matrix_normalize'],
              tr['distance_args'])
             for ind, tr in enumerate(treatments)]

    mat = m.globalalignmentmatrix(items)
    matched = mat.matched(threshold)

    if not matched:
        return mat, False

    # Append to an already existing file, so the header is written only once
    openmode = 'a' if fileexists(resultfile) else 'w'
    with open(resultfile, openmode) as fobj:
        if openmode == 'w':
            fobj.write('aligned;targetted;distance\n')
        for aligned in matched:
            for target, dist in matched[aligned]:
                fobj.write('%s;%s;%s\n' %
                    (ralignset[aligned][0],
                     rtargetset[target][0],
                     dist
                    ))
    return mat, True

if __name__ == '__main__':
    # Demonstration: align the birthplace of some random persons onto the
    # name of some random locations, and write the matches into
    # 'alignment_results'.
    # NOTE(review): `session` is not defined in this file; presumably it is
    # provided by the environment running this script -- to be confirmed.
    alignquery = ('Any P, BP ORDERBY(RANDOM()) LIMIT 100 WHERE P is Person, '
                  'P birthplace BP, NOT BP is NULL')
    targetquery = ('Any GID, N ORDERBY(RANDOM()) LIMIT 1000 WHERE L is Location, '
                   'L name N, L geoid GID')

    frenchlemmas = n.loadlemmas('data/french_lemmas.txt')

    # A single treatment, as there is a single attribute next to the identifier
    placetreatment = {
        'normalization': [n.simplify],
        'norm_args': {'lemmas': frenchlemmas, 'removeStopWords': False},
        'distance': d.levenshtein,
    }

    persons = session.execute(alignquery)
    locations = session.execute(targetquery)
    align(persons, locations, [placetreatment], 0.3, 'alignment_results')