author Vincent Michel <>
Tue, 23 Apr 2013 17:01:00 +0200
changeset 254 f627c58bf1c1
parent 158 a6449ca99bbf
child 182 59d328a2a35f
child 265 7451e60dd4a8
child 285 28156ee4d13f
child 301 50a25080aa33
permissions -rw-r--r--
Added tag nazca-version-0.2.1 for changeset 458190cbe7d4

# -*- coding:utf-8 -*-
# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
# contact --
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 2.1 of the License, or (at your option)
# any later version.
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
# details.
# You should have received a copy of the GNU Lesser General Public License along
# with this program. If not, see <http://www.gnu.org/licenses/>.

import re
from string import punctuation
from warnings import warn
from unicodedata import normalize as _uninormalize

# Common French function words (articles, pronouns, auxiliaries, ...), kept as
# a set of unicode strings so that membership tests are cheap.
STOPWORDS = set(u"""
    alors au aux aucuns aussi autre avant avec avoir bon car ce cela ces ceux
    chaque ci comme comment dans de des du dedans dehors depuis deux devrait
    doit donc dos droite début elle elles en encore essai est et eu eux fait
    faites fois font force haut hors ici il ils je juste la le les leur lui là
    ma maintenant mais me mes moi moins mon mot même ne ni nommés nos notre
    nous nouveaux on ou où par parce parole pas personnes peut peu pièce
    plupart pour pourquoi quand que quel quelle quelles quels qui sa sans se
    ses seulement si sien son sont sous soyez sujet sur ta tandis tellement te
    tels tes toi ton tous tout trop très tu un une valeur voie voient vont vos
    votre vous vu ça étaient état étions été être
""".split())

    u'\xa1': u'!',    # INVERTED EXCLAMATION MARK
    u'\u0142': u'l',  # LATIN SMALL LETTER L WITH STROKE
    u'\u2044': u'/',  # FRACTION SLASH
    u'\xc6': u'AE',   # LATIN CAPITAL LETTER AE
    u'\xa9': u'(c)',  # COPYRIGHT SIGN
    u'\xe6': u'ae',   # LATIN SMALL LETTER AE
    u'\xae': u'(r)',  # REGISTERED SIGN
    u'\u0153': u'oe', # LATIN SMALL LIGATURE OE
    u'\u0152': u'OE', # LATIN CAPITAL LIGATURE OE
    u'\xf8': u'o',    # LATIN SMALL LETTER O WITH STROKE
    u'\xdf': u'ss',   # LATIN SMALL LETTER SHARP S

def unormalize(ustring, ignorenonascii=None, substitute=None):
    """replace diacritical characters with their corresponding ascii characters

    Convert the unicode string to its long normalized form (unicode character
    will be transform into several characters) and keep the first one only.
    The normal form KD (NFKD) will apply the compatibility decomposition, i.e.
    replace all compatibility characters with their equivalents.

    Characters listed in ``MANUAL_UNICODE_MAP`` are replaced by their mapped
    value instead of being decomposed.

    :type ustring: unicode
    :param ustring: the string to convert

    :param ignorenonascii: deprecated boolean; when true it behaves like
      ``substitute=''`` (characters that cannot be converted are dropped)

    :type substitute: str
    :param substitute: replacement character to use if decomposition fails

    :raise ValueError: if a character has no ascii equivalent and no
      ``substitute`` was given

    :see: Another project about ASCII transliterations of Unicode text
    """
    # backward compatibility, ignorenonascii was a boolean
    if ignorenonascii is not None:
        warn("ignorenonascii is deprecated, use substitute named parameter instead",
             DeprecationWarning, stacklevel=2)
        if ignorenonascii:
            substitute = ''
    res = []
    for letter in ustring:
        try:
            # hand-written replacements take precedence over decomposition
            replacement = MANUAL_UNICODE_MAP[letter]
        except KeyError:
            # first code point of the decomposition is the base character
            replacement = _uninormalize('NFKD', letter)[0]
            if ord(replacement) >= 2 ** 7:
                # still outside the 7-bit ascii range
                if substitute is None:
                    raise ValueError("can't deal with non-ascii based characters")
                replacement = substitute
        res.append(replacement)
    return u''.join(res)

def lunormalize(sentence, ignorenonascii=None, substitute=None):
    """Return ``sentence`` processed by ``unormalize`` and set to lower case.

    ``ignorenonascii`` and ``substitute`` are handed over to ``unormalize``
    unchanged.
    """
    unaccented = unormalize(sentence, ignorenonascii, substitute)
    return unaccented.lower()

def simplify(sentence, lemmas=None, remove_stopwords=True):
    """ Simplify the given sentence

    The following steps are applied, in this order:
        1) If lemmas are given, the sentence is lemmatized
        2) Set the sentence to lower case
        3) Replace punctuation by spaces, then collapse every run of
           whitespace into a single space (leading/trailing ones are dropped)
        4) If remove_stopwords, remove the words found in ``STOPWORDS``

    :param sentence: the sentence to simplify
    :param lemmas: optional mapping used to lemmatize the words
    :param remove_stopwords: whether stop words must be discarded
    :return: the simplified sentence, words separated by a single space
    """
    if lemmas:
        sentence = lemmatized(sentence, lemmas)
    sentence = sentence.lower()
    cleansent = ''.join([s if s not in punctuation
                           else ' ' for s in sentence])
    # punctuation followed by a space yields several spaces in a row:
    # splitting on any whitespace run keeps exactly one between two words
    words = cleansent.split()
    if remove_stopwords:
        words = [w for w in words if w not in STOPWORDS]
    return ' '.join(words)

def tokenize(sentence, tokenizer=None, regexp=re.compile(r"[^\s]+")):
    """ Tokenize a sentence.
        Use ``tokenizer`` if given, else split the sentence with ``regexp``
        (by default on whitespaces) and cut the resulting chunks after each
        apostrophe, which is kept at the end of the left-hand token
        (e.g. ``J'aime`` gives ``J'`` and ``aime``).

        ``tokenizer`` is called without argument and the object it returns
        must have a ``tokenize()`` method.

        :return: the list of tokens
    """
    if tokenizer:
        return tokenizer().tokenize(sentence)
    # XXX Unicode, could not use WorkTokenizer.
    # Instead split on whitespaces
    chunks = []
    for chunk in regexp.findall(sentence):
        if not chunk:
            continue
        # Deals with '
        if "'" in chunk:
            pieces = chunk.split("'")
            chunks.extend([piece + "'" for piece in pieces[:-1]])
            # nothing follows the last apostrophe when the chunk ends with it
            if pieces[-1]:
                chunks.append(pieces[-1])
        else:
            chunks.append(chunk)
    return chunks

def iter_wordgrams(sentence, k):
    """ Generator of k-wordgrams on the given sentence

    The sentence is split on single spaces and one gram is yielded per start
    position, so the last ``k - 1`` grams hold fewer than ``k`` words.

    :param sentence: the sentence to cut into wordgrams
    :param k: maximum number of words in a gram
    """
    words = sentence.split(' ')
    #XXX Call tokenizer
    # ``range`` rather than ``xrange`` so this also runs on Python 3
    for r in range(len(words)):
        yield ' '.join(words[r:r + k])

def loadlemmas(filename, encoding='utf-8'):
    """ Return the lemmas dictionary read from ``filename``

    Each line of the file is expected to hold a word and its lemma separated
    by a tabulation; lines that do not have exactly two fields are ignored.

    :param filename: path of the lemmas file
    :param encoding: encoding of the file
    :return: a dictionary mapping each word to its lemma
    """
    lemmas = {}
    # binary mode: lines are raw bytes on Python 2 and 3 alike, so the
    # explicit ``decode`` below works on both
    with open(filename, 'rb') as fobj:
        for line in fobj:
            fields = line.decode(encoding).strip().split('\t')
            if len(fields) == 2:
                lemmas[fields[0]] = fields[1]
    return lemmas

def lemmatized(sentence, lemmas, tokenizer=None):
    """ Return the lemmatized sentence

    The sentence is tokenized with ``tokenize``; each remaining token is then
    replaced by ``lemmatized_word`` and the results are joined with spaces.

    :param sentence: the sentence to lemmatize
    :param lemmas: mapping from a lower-cased word to its lemma(s)
    :param tokenizer: optional tokenizer, handed over to ``tokenize``
    """
    tokenized_sent = tokenize(sentence, tokenizer)
    tokenized_sentformated = []
    for w in tokenized_sent:
        if w in ".,'" and len(tokenized_sentformated) > 0:
            # glue a dot, comma or apostrophe to the preceding token
            tokenized_sentformated[-1] += w
        elif w not in punctuation:
            # keep the token, other punctuation is dropped
            tokenized_sentformated.append(w)
    return u' '.join([lemmatized_word(w, lemmas) for w in tokenized_sentformated])

def lemmatized_word(word, lemmas):
    """ Return the lemmatized word

    The lower-cased word is looked up in ``lemmas``; a word without entry is
    returned unchanged. An entry may list several lemmas separated by ``|``:
    the lower-cased word itself is returned if it is one of them, else the
    first one is.

    :param word: the word to lemmatize
    :param lemmas: mapping from a lower-cased word to its lemma(s)
    """
    lemma = lemmas.get(word.lower(), word)
    if '|' in lemma:
        _words = lemma.split('|')
        if word.lower() in _words:
            lemma = word.lower()
        else:
            lemma = _words[0]
    return lemma

def roundstr(number, ndigits=0):
    """Return a string of ``number`` rounded to a given precision
        in decimal digits (default 0 digits)

        If ``number`` is not a float, this method casts it to a float. (An
        exception may be raised if it's not possible)

        :param number: the value to round
        :param ndigits: number of digits kept after the decimal point
    """
    return format(round(float(number), ndigits), '0.%df' % ndigits)

def rgxformat(string, regexp, output):
    r""" Apply the regexp to the ``string`` and return a formatted string
    according to ``output``

    ``regexp`` is matched at the beginning of ``string`` and its named groups
    fill the ``%(name)s`` placeholders of ``output``.

    eg :
        rgxformat(u'[Victor Hugo - 26 fev 1802 / 22 mai 1885]',
                  r'\[(?P<firstname>\w+) (?P<lastname>\w+) - '
                  r'(?P<birthdate>.*?) / (?P<deathdate>.*?)\]',
                  u'%(lastname)s, %(firstname)s (%(birthdate)s - %(deathdate)s)')

     would return u'Hugo, Victor (26 fev 1802 - 22 mai 1885)'

    :raise AttributeError: if ``regexp`` does not match ``string``
    """
    match = re.match(regexp, string)
    return output % match.groupdict()