[rename] Rename modules with shorter names, related to #187461
author Vincent Michel <vincent.michel@logilab.fr>
Thu, 19 Dec 2013 14:45:56 +0000
changeset 375 343a4304a259
parent 374 102c6331f3f6
child 376 e04ca0af1491
data/__init__.py
data/countries.py
data/countries_iso_3166.txt
data/stopwords.py
data/us_states.py
demo.py
examples/demo.py
named_entities/__init__.py
named_entities/filters.py
named_entities/preprocessors.py
named_entities/sources.py
ner/__init__.py
ner/filters.py
ner/preprocessors.py
ner/sources.py
record_linkage/__init__.py
record_linkage/aligner.py
record_linkage/blocking.py
reference_data/__init__.py
reference_data/countries.py
reference_data/countries_iso_3166.txt
reference_data/stopwords.py
reference_data/us_states.py
rl/__init__.py
rl/aligner.py
rl/blocking.py
test/test_alignment.py
test/test_blocking.py
test/test_dataio.py
test/test_filters.py
test/test_named_entities.py
test/test_ner.py
test/test_preprocessors.py
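The renames shorten the three top-level packages: named_entities becomes ner, record_linkage becomes rl, and reference_data becomes data. A minimal sketch of the import change this implies for client code, using only module names that appear in the file list above:

    # old layout
    from named_entities import filters, preprocessors, sources
    from record_linkage import aligner, blocking
    from reference_data import countries, stopwords, us_states

    # new layout after this changeset
    from ner import filters, preprocessors, sources
    from rl import aligner, blocking
    from data import countries, stopwords, us_states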
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data/countries.py	Thu Dec 19 14:45:56 2013 +0000
@@ -0,0 +1,994 @@
+
+# Countries list (ISO-3166)
+COUNTRIES = {'##': 'non renseign\xc3\xa9',
+             '..': 'non renseign\xc3\xa9',
+             'aa': 'aire g\xc3\xa9ographique ancienne',
+             'ad': 'Andorre',
+             'ae': '\xc3\x89mirats arabes unis',
+             'af': 'Afghanistan',
+             'ag': 'Antigua-et-Barbuda',
+             'ai': 'Anguilla',
+             'al': 'Albanie',
+             'am': 'Arm\xc3\xa9nie',
+             'an': 'Antilles n\xc3\xa9erlandaises',
+             'ao': 'Angola',
+             'aq': 'Antarctique',
+             'ar': 'Argentine',
+             'as': 'Samoa am\xc3\xa9ricaines',
+             'at': 'Autriche',
+             'au': 'Australie',
+             'aw': 'Aruba',
+             'ax': 'Aland (\xc3\xaeles)',
+             'az': 'Azerba\xc3\xafdjan',
+             'ba': 'Bosnie-Herz\xc3\xa9govine',
+             'bb': 'Barbade',
+             'bd': 'Bangladesh',
+             'be': 'Belgique',
+             'bf': 'Burkina',
+             'bg': 'Bulgarie',
+             'bh': 'Bahre\xc3\xafn',
+             'bi': 'Burundi',
+             'bj': 'B\xc3\xa9nin',
+             'bl': 'Saint-Barth\xc3\xa9lemy',
+             'bm': 'Bermudes',
+             'bn': 'Brun\xc3\xa9i',
+             'bo': 'Bolivie',
+             'bq': 'Bonaire, Saint-Eustache et Saba,Saba###Saint-Eustache',
+             'br': 'Br\xc3\xa9sil',
+             'bs': 'Bahamas',
+             'bt': 'Bhoutan',
+             'bv': 'Bouvet (\xc3\xaele)',
+             'bw': 'Botswana',
+             'by': 'Bi\xc3\xa9lorussie,B\xc3\xa9larus',
+             'bz': 'Belize',
+             'ca': 'Canada',
+             'cc': 'Cocos (\xc3\xaeles),Keeling (\xc3\xaeles)',
+             'cd': 'Congo (R\xc3\xa9publique d\xc3\xa9mocratique),Za\xc3\xafre',
+             'cf': 'Centrafrique,R\xc3\xa9publique centrafricaine',
+             'cg': 'Congo,Congo (R\xc3\xa9publique)',
+             'ch': 'Suisse,Conf\xc3\xa9d\xc3\xa9ration helv\xc3\xa9tique',
+             'ci': "C\xc3\xb4te d'Ivoire",
+             'ck': 'Cook (\xc3\xaeles)',
+             'cl': 'Chili',
+             'cm': 'Cameroun',
+             'cn': 'Chine,Chine (R\xc3\xa9publique populaire)',
+             'co': 'Colombie',
+             'cr': 'Costa Rica',
+             'cs': 'Serbie-et-Mont\xc3\xa9n\xc3\xa9gro',
+             'cu': 'Cuba',
+             'cv': 'Cap-Vert',
+             'cw': 'Cura\xc3\xa7ao',
+             'cx': 'Christmas (\xc3\xaele)',
+             'cy': 'Chypre',
+             'cz': 'R\xc3\xa9publique tch\xc3\xa8que,Tch\xc3\xa8que, R\xc3\xa9publique',
+             'dd': 'Allemagne (R\xc3\xa9publique d\xc3\xa9mocratique)',
+             'de': 'Allemagne,Allemagne (R\xc3\xa9publique f\xc3\xa9d\xc3\xa9rale)',
+             'dj': 'Djibouti',
+             'dk': 'Danemark',
+             'dm': 'Dominique',
+             'do': 'R\xc3\xa9publique dominicaine,Dominicaine, R\xc3\xa9publique',
+             'dz': 'Alg\xc3\xa9rie',
+             'ec': '\xc3\x89quateur',
+             'ee': 'Estonie',
+             'eg': '\xc3\x89gypte',
+             'eh': 'Sahara occidental',
+             'er': '\xc3\x89rythr\xc3\xa9e',
+             'es': 'Espagne',
+             'et': '\xc3\x89thiopie',
+             'fi': 'Finlande',
+             'fj': 'Fidji',
+             'fk': 'Malouines (\xc3\xaeles),Falkland (\xc3\xaeles)',
+             'fm': 'Micron\xc3\xa9sie,\xc3\x89tats f\xc3\xa9d\xc3\xa9r\xc3\xa9s de Micron\xc3\xa9sie',
+             'fo': 'F\xc3\xa9ro\xc3\xa9 (\xc3\xaeles)',
+             'fr': 'France',
+             'ga': 'Gabon',
+             'gb': 'Grande-Bretagne,Royaume-Uni',
+             'gd': 'Grenade',
+             'ge': 'G\xc3\xa9orgie',
+             'gf': 'Guyane fran\xc3\xa7aise',
+             'gg': 'Guernesey',
+             'gh': 'Ghana',
+             'gi': 'Gibraltar',
+             'gl': 'Groenland',
+             'gm': 'Gambie',
+             'gn': 'Guin\xc3\xa9e',
+             'gp': 'Guadeloupe',
+             'gq': 'Guin\xc3\xa9e \xc3\xa9quatoriale',
+             'gr': 'Gr\xc3\xa8ce',
+             'gs': 'G\xc3\xa9orgie du Sud et les \xc3\xaeles Sandwich du Sud',
+             'gt': 'Guatemala',
+             'gu': 'Guam',
+             'gw': 'Guin\xc3\xa9e-Bissau',
+             'gy': 'Guyana',
+             'hk': 'Hong Kong',
+             'hm': 'Heard (\xc3\xaele) et \xc3\xaeles McDonald',
+             'hn': 'Honduras',
+             'hr': 'Croatie',
+             'ht': 'Ha\xc3\xafti',
+             'hu': 'Hongrie',
+             'id': 'Indon\xc3\xa9sie',
+             'ie': 'Irlande',
+             'ii': 'intergouvernemental',
+             'il': 'Isra\xc3\xabl',
+             'im': '\xc3\x8ele de Man,Man, \xc3\x8ele de',
+             'in': 'Inde',
+             'io': "Territoire britannique de l'Oc\xc3\xa9an indien,Chagos (\xc3\xaeles)###Oc\xc3\xa9an indien, Territoire britannique de l'",
+             'iq': 'Irak',
+             'ir': 'Iran',
+             'is': 'Islande',
+             'it': 'Italie',
+             'je': 'Jersey',
+             'jm': 'Jama\xc3\xafque',
+             'jo': 'Jordanie',
+             'jp': 'Japon',
+             'ke': 'Kenya',
+             'kg': 'Kirghizistan',
+             'kh': 'Cambodge',
+             'ki': 'Kiribati',
+             'km': 'Comores',
+             'kn': 'Saint-Kitts-et-Nevis,Saint-Christophe-et-Nevis',
+             'ko': 'Kosovo',
+             'kp': 'Cor\xc3\xa9e (R\xc3\xa9publique populaire d\xc3\xa9mocratique),Cor\xc3\xa9e du Nord',
+             'kr': 'Cor\xc3\xa9e (R\xc3\xa9publique),Cor\xc3\xa9e du Sud',
+             'kw': 'Kowe\xc3\xaft',
+             'ky': 'Cayman,Ca\xc3\xafmanes, \xc3\x8eles###Ca\xc3\xafman (\xc3\xaeles)',
+             'kz': 'Kazakhstan',
+             'la': 'Laos',
+             'lb': 'Liban',
+             'lc': 'Sainte-Lucie',
+             'li': 'Liechtenstein',
+             'lk': 'Sri Lanka',
+             'lr': 'Liberia',
+             'ls': 'Lesotho',
+             'lt': 'Lituanie',
+             'lu': 'Luxembourg',
+             'lv': 'Lettonie',
+             'ly': 'Libye',
+             'ma': 'Maroc',
+             'mc': 'Monaco',
+             'md': 'Moldavie,Moldova, R\xc3\xa9publique de',
+             'me': 'Mont\xc3\xa9n\xc3\xa9gro',
+             'mf': 'Saint-Martin (partie fran\xc3\xa7aise)',
+             'mg': 'Madagascar',
+             'mh': 'Marshall (\xc3\xaeles)',
+             'mk': 'Mac\xc3\xa9doine (R\xc3\xa9publique)',
+             'ml': 'Mali',
+             'mm': 'Myanmar,Birmanie',
+             'mn': 'Mongolie',
+             'mo': 'Macao',
+             'mp': 'Mariannes du Nord (\xc3\xaeles)',
+             'mq': 'Martinique',
+             'mr': 'Mauritanie',
+             'ms': 'Montserrat',
+             'mt': 'Malte',
+             'mu': 'Maurice',
+             'mv': 'Maldives',
+             'mw': 'Malawi',
+             'mx': 'Mexique',
+             'my': 'Malaisie',
+             'mz': 'Mozambique',
+             'na': 'Namibie',
+             'nc': 'Nouvelle-Cal\xc3\xa9donie',
+             'ne': 'Niger',
+             'nf': 'Norfolk (\xc3\xaele)',
+             'ng': 'Nigeria',
+             'ni': 'Nicaragua',
+             'nl': 'Pays-Bas',
+             'no': 'Norv\xc3\xa8ge',
+             'np': 'N\xc3\xa9pal',
+             'nr': 'Nauru',
+             'nu': 'Niue',
+             'nz': 'Nouvelle-Z\xc3\xa9lande',
+             'om': 'Oman',
+             'oo': 'code non adapt\xc3\xa9',
+             'pa': 'Panama',
+             'pe': 'P\xc3\xa9rou',
+             'pf': 'Polyn\xc3\xa9sie fran\xc3\xa7aise',
+             'pg': 'Papouasie-Nouvelle-Guin\xc3\xa9e',
+             'ph': 'Philippines',
+             'pk': 'Pakistan',
+             'pl': 'Pologne',
+             'pm': 'Saint-Pierre-et-Miquelon',
+             'pn': 'Pitcairn',
+             'pr': 'Porto Rico',
+             'ps': 'Autorit\xc3\xa9 palestinienne,Palestine',
+             'pt': 'Portugal',
+             'pw': 'Palau,Palaos',
+             'py': 'Paraguay',
+             'qa': 'Qatar',
+             're': 'R\xc3\xa9union',
+             'ro': 'Roumanie',
+             'rs': 'Serbie',
+             'ru': 'Russie (F\xc3\xa9d\xc3\xa9ration),Russie',
+             'rw': 'Rwanda',
+             'sa': 'Arabie saoudite',
+             'sb': 'Salomon (\xc3\xaeles)',
+             'sc': 'Seychelles',
+             'sd': 'Soudan',
+             'se': 'Su\xc3\xa8de',
+             'sg': 'Singapour',
+             'sh': 'Sainte-H\xc3\xa9l\xc3\xa8ne,Ascension (\xc3\xaele)###Tristan da Cunha (\xc3\xaele)',
+             'si': 'Slov\xc3\xa9nie',
+             'sj': 'Svalbard et \xc3\xaele Jan Mayen',
+             'sk': 'Slovaquie',
+             'sl': 'Sierra Leone',
+             'sm': 'Saint-Marin',
+             'sn': 'S\xc3\xa9n\xc3\xa9gal',
+             'so': 'Somalie',
+             'sr': 'Suriname',
+             'ss': 'Soudan du Sud,Sud Soudan',
+             'st': 'Sao Tom\xc3\xa9-et-Principe',
+             'su': 'URSS',
+             'sv': 'El Salvador,Salvador',
+             'sx': 'Saint-Martin (partie n\xc3\xa9erlandaise),Sint Maarten',
+             'sy': 'Syrie',
+             'sz': 'Swaziland',
+             'tc': 'Turks et Ca\xc3\xafques (\xc3\xaeles)',
+             'td': 'Tchad',
+             'tf': 'Terres australes fran\xc3\xa7aises',
+             'tg': 'Togo',
+             'th': 'Tha\xc3\xaflande',
+             'tj': 'Tadjikistan',
+             'tk': 'Tokelau',
+             'tl': 'Timor oriental',
+             'tm': 'Turkm\xc3\xa9nistan',
+             'tn': 'Tunisie',
+             'to': 'Tonga',
+             'tr': 'Turquie',
+             'tt': 'Trinit\xc3\xa9-et-Tobago',
+             'tv': 'Tuvalu',
+             'tw': 'Ta\xc3\xafwan,Chine (R\xc3\xa9publique)',
+             'tz': 'Tanzanie',
+             'ua': 'Ukraine',
+             'ug': 'Ouganda',
+             'um': '\xc3\x8eles mineures \xc3\xa9loign\xc3\xa9es des \xc3\x89tats-Unis',
+             'us': '\xc3\x89tats-Unis',
+             'uy': 'Uruguay',
+             'uz': 'Ouzb\xc3\xa9kistan',
+             'va': 'Vatican,Saint-Si\xc3\xa8ge',
+             'vc': 'Saint-Vincent-et-les Grenadines',
+             've': 'Venezuela',
+             'vg': '\xc3\x8eles Vierges britanniques,Vierges (\xc3\xaeles) britanniques',
+             'vi': '\xc3\x8eles Vierges am\xc3\xa9ricaines,Vierges (\xc3\xaeles) am\xc3\xa9ricaines',
+             'vn': 'Viet Nam',
+             'vu': 'Vanuatu',
+             'wf': 'Wallis et Futuna (\xc3\xaeles)',
+             'ws': 'Samoa,Samoa occidentales',
+             'xc': 'Tch\xc3\xa9coslovaquie',
+             'xd': 'Allemagne avant 1945',
+             'xe': 'Europe,Union europ\xc3\xa9enne',
+             'xk': 'Cor\xc3\xa9e avant 1948',
+             'xn': 'Pays-Bas avant 1830,Belgique avant 1830',
+             'xx': 'inconnu',
+             'yd': 'Y\xc3\xa9men (R\xc3\xa9publique d\xc3\xa9mocratique populaire),Sud Y\xc3\xa9men',
+             'ye': 'Y\xc3\xa9men',
+             'yt': 'Mayotte',
+             'yu': 'Yougoslavie',
+             'yy': "ne s'applique pas",
+             'za': 'Afrique du Sud',
+             'zm': 'Zambie',
+             'zw': 'Zimbabwe',
+             'zz': 'multiple'}
+
+
+# REGIONS TO COUNTRIES MAPPING
+REGIONS_TO_COUNTRIES = {u'Abruzzes': u'Italie',
+                        u'Acha\xefe': u'Gr\xe8ce',
+                        u'Acre': u'Br\xe9sil',
+                        u'Afghanistan': u'Afghanistan',
+                        u'Afrique du Sud': u'Afrique du Sud',
+                        u'Aguascalientes': u'Mexique',
+                        u'Ain': u'France',
+                        u'Aisne': u'France',
+                        u'Alabama': u'\xc9tats-Unis',
+                        u'Alagoas': u'Br\xe9sil',
+                        u'Aland (\xeeles)': u'Aland (\xeeles)',
+                        u'Alaska': u'\xc9tats-Unis',
+                        u'Albanie': u'Albanie',
+                        u'Alberta': u'Canada',
+                        u'Alg\xe9rie': u'Alg\xe9rie',
+                        u'Allemagne': u'Allemagne',
+                        u'Allemagne (R\xe9publique d\xe9mocratique)': u'Allemagne (R\xe9publique d\xe9mocratique)',
+                        u'Allemagne avant 1945': u'Allemagne avant 1945',
+                        u'Allier': u'France',
+                        u'Alpes-Maritimes': u'France',
+                        u'Alpes-de-Haute-Provence': u'France',
+                        u'Alsace': u'France',
+                        u'Amapa': u'Br\xe9sil',
+                        u'Amazonas': u'Br\xe9sil',
+                        u'Andalousie': u'Espagne',
+                        u'Andorre': u'Andorre',
+                        u'Angola': u'Angola',
+                        u'Anguilla': u'Anguilla',
+                        u'Antarctique': u'Antarctique',
+                        u'Antigua-et-Barbuda': u'Antigua-et-Barbuda',
+                        u'Antilles n\xe9erlandaises': u'Antilles n\xe9erlandaises',
+                        u'Anvers': u'Belgique',
+                        u'Appenzell-Rhodes-Ext\xe9rieures': u'Suisse',
+                        u'Appenzell-Rhodes-Int\xe9rieures': u'Suisse',
+                        u'Aquitaine': u'France',
+                        u'Arabie saoudite': u'Arabie saoudite',
+                        u'Aragon': u'Espagne',
+                        u'Arcadie': u'Gr\xe8ce',
+                        u'Ardennes': u'France',
+                        u'Ard\xe8che': u'France',
+                        u'Argentine': u'Argentine',
+                        u'Argolide': u'Gr\xe8ce',
+                        u'Argovie': u'Suisse',
+                        u'Arizona': u'\xc9tats-Unis',
+                        u'Ari\xe8ge': u'France',
+                        u'Arkansas': u'\xc9tats-Unis',
+                        u'Arm\xe9nie': u'Arm\xe9nie',
+                        u'Aruba': u'Aruba',
+                        u'Asturies': u'Espagne',
+                        u'Ath\xe8nes et agglom\xe9ration': u'Gr\xe8ce',
+                        u'Attique': u'Gr\xe8ce',
+                        u'Aube': u'France',
+                        u'Aude': u'France',
+                        u'Australie': u'Australie',
+                        u'Australie-M\xe9ridionale': u'Australie',
+                        u'Australie-Occidentale': u'Australie',
+                        u'Autorit\xe9 palestinienne': u'Autorit\xe9 palestinienne',
+                        u'Autriche': u'Autriche',
+                        u'Auvergne': u'France',
+                        u'Aveyron': u'France',
+                        u'Azerba\xefdjan': u'Azerba\xefdjan',
+                        u'Bade-Wurtemberg': u'Allemagne',
+                        u'Bahamas': u'Bahamas',
+                        u'Bahia': u'Br\xe9sil',
+                        u'Bahre\xefn': u'Bahre\xefn',
+                        u'Baja California Norte': u'Mexique',
+                        u'Baja California Sur': u'Mexique',
+                        u'Bangladesh': u'Bangladesh',
+                        u'Barbade': u'Barbade',
+                        u'Bas-Rhin': u'France',
+                        u'Basilicate': u'Italie',
+                        u'Basse-Autriche': u'Autriche',
+                        u'Basse-Normandie': u'France',
+                        u'Basse-Saxe': u'Allemagne',
+                        u'Bavi\xe8re': u'Allemagne',
+                        u'Belgique': u'Belgique',
+                        u'Belize': u'Belize',
+                        u'Berlin': u'Allemagne',
+                        u'Bermudes': u'Bermudes',
+                        u'Berne': u'Suisse',
+                        u'Bhoutan': u'Bhoutan',
+                        u'Bi\xe9lorussie': u'Bi\xe9lorussie',
+                        u'Bolivie': u'Bolivie',
+                        u'Bonaire, Saint-Eustache et Saba': u'Bonaire, Saint-Eustache et Saba',
+                        u'Bosnie-Herz\xe9govine': u'Bosnie-Herz\xe9govine',
+                        u'Botswana': u'Botswana',
+                        u'Bouches-du-Rh\xf4ne': u'France',
+                        u'Bourgogne': u'France',
+                        u'Bouvet (\xeele)': u'Bouvet (\xeele)',
+                        u'Brabant': u'Belgique',
+                        u'Brabant flamand': u'Belgique',
+                        u'Brabant wallon': u'Belgique',
+                        u'Brabant-Septentrional': u'Pays-Bas',
+                        u'Brandebourg': u'Allemagne',
+                        u'Bretagne': u'France',
+                        u'Brun\xe9i': u'Brun\xe9i',
+                        u'Bruxelles': u'Belgique',
+                        u'Br\xe9sil': u'Br\xe9sil',
+                        u'Br\xeame': u'Allemagne',
+                        u'Buenos Aires': u'Argentine',
+                        u'Bulgarie': u'Bulgarie',
+                        u'Burgenland': u'Autriche',
+                        u'Burkina': u'Burkina',
+                        u'Burundi': u'Burundi',
+                        u'B\xe2le-Campagne': u'Suisse',
+                        u'B\xe2le-Ville': u'Suisse',
+                        u'B\xe9nin': u'B\xe9nin',
+                        u'B\xe9otie': u'Gr\xe8ce',
+                        u'Calabre': u'Italie',
+                        u'Californie': u'\xc9tats-Unis',
+                        u'Calvados': u'France',
+                        u'Cambodge': u'Cambodge',
+                        u'Cameroun': u'Cameroun',
+                        u'Campanie': u'Italie',
+                        u'Campeche': u'Mexique',
+                        u'Canada': u'Canada',
+                        u'Canaries': u'Espagne',
+                        u'Cantabrie': u'Espagne',
+                        u'Cantal': u'France',
+                        u'Cap-Vert': u'Cap-Vert',
+                        u'Capitale f\xe9d\xe9rale': u'Argentine',
+                        u'Carinthie': u'Autriche',
+                        u'Caroline du Nord': u'\xc9tats-Unis',
+                        u'Caroline du Sud': u'\xc9tats-Unis',
+                        u'Castille et L\xe9on': u'Espagne',
+                        u'Castille-la Manche': u'Espagne',
+                        u'Catalogne': u'Espagne',
+                        u'Catamarca': u'Argentine',
+                        u'Cayman': u'Cayman',
+                        u'Cear\xe1': u'Br\xe9sil',
+                        u'Centrafrique': u'Centrafrique',
+                        u'Centre': u'France',
+                        u'Ceuta': u'Espagne',
+                        u'Chaco': u'Argentine',
+                        u'Chalcidique': u'Gr\xe8ce',
+                        u'Champagne-Ardenne': u'France',
+                        u'Charente': u'France',
+                        u'Charente-Maritime': u'France',
+                        u'Cher': u'France',
+                        u'Chiapas': u'Mexique',
+                        u'Chihuahua': u'Mexique',
+                        u'Chili': u'Chili',
+                        u'Chine': u'Chine',
+                        u'Christmas (\xeele)': u'Christmas (\xeele)',
+                        u'Chubut': u'Argentine',
+                        u'Chypre': u'Chypre',
+                        u'Ch\xedos': u'Gr\xe8ce',
+                        u'Coahuila': u'Mexique',
+                        u'Cocos (\xeeles)': u'Cocos (\xeeles)',
+                        u'Colima': u'Mexique',
+                        u'Colombie': u'Colombie',
+                        u'Colombie britannique': u'Canada',
+                        u'Colorado': u'\xc9tats-Unis',
+                        u'Communaut\xe9 de Madrid': u'Espagne',
+                        u'Communaut\xe9 de Valence': u'Espagne',
+                        u'Comores': u'Comores',
+                        u'Congo': u'Congo',
+                        u'Congo (R\xe9publique d\xe9mocratique)': u'Congo (R\xe9publique d\xe9mocratique)',
+                        u'Connecticut': u'\xc9tats-Unis',
+                        u'Cook (\xeeles)': u'Cook (\xeeles)',
+                        u'Corfou': u'Gr\xe8ce',
+                        u'Corinthie': u'Gr\xe8ce',
+                        u'Corrientes': u'Argentine',
+                        u'Corr\xe8ze': u'France',
+                        u'Corse': u'France',
+                        u'Corse-du-Sud': u'France',
+                        u'Cor\xe9e (R\xe9publique populaire d\xe9mocratique)': u'Cor\xe9e (R\xe9publique populaire d\xe9mocratique)',
+                        u'Cor\xe9e (R\xe9publique)': u'Cor\xe9e (R\xe9publique)',
+                        u'Cor\xe9e avant 1948': u'Cor\xe9e avant 1948',
+                        u'Costa Rica': u'Costa Rica',
+                        u'Creuse': u'France',
+                        u'Croatie': u'Croatie',
+                        u'Cr\xe8te': u'Gr\xe8ce',
+                        u'Cuba': u'Cuba',
+                        u'Cura\xe7ao': u'Cura\xe7ao',
+                        u'Cyclades': u'Gr\xe8ce',
+                        u'C\xe9phalonie': u'Gr\xe8ce',
+                        u'C\xf3rdoba': u'Argentine',
+                        u"C\xf4te d'Ivoire": u"C\xf4te d'Ivoire",
+                        u"C\xf4te-d'Or": u'France',
+                        u"C\xf4tes-d'Armor": u'France',
+                        u'Dakota du Nord': u'\xc9tats-Unis',
+                        u'Dakota du Sud': u'\xc9tats-Unis',
+                        u'Danemark': u'Danemark',
+                        u'Delaware': u'\xc9tats-Unis',
+                        u'Deux-S\xe8vres': u'France',
+                        u'District de Columbia': u'\xc9tats-Unis',
+                        u'District f\xe9d\xe9ral': u'Br\xe9sil',
+                        u'Djibouti': u'Djibouti',
+                        u'Dod\xe9can\xe8se': u'Gr\xe8ce',
+                        u'Dominique': u'Dominique',
+                        u'Dordogne': u'France',
+                        u'Doubs': u'France',
+                        u'Drenthe': u'Pays-Bas',
+                        u'Dr\xe1ma': u'Gr\xe8ce',
+                        u'Dr\xf4me': u'France',
+                        u'Durango': u'Mexique',
+                        u'D\xe9pendance de Ross (Nouvelle-Z\xe9lande)': u'Antarctique',
+                        u'El Salvador': u'El Salvador',
+                        u'Entre-Rios': u'Argentine',
+                        u'Espagne': u'Espagne',
+                        u'Espirito Santo': u'Br\xe9sil',
+                        u'Essonne': u'France',
+                        u'Estonie': u'Estonie',
+                        u'Estr\xe9madure': u'Espagne',
+                        u'Eub\xe9e': u'Gr\xe8ce',
+                        u'Eure': u'France',
+                        u'Eure-et-Loir': u'France',
+                        u'Eurytanie': u'Gr\xe8ce',
+                        u'Fidji': u'Fidji',
+                        u'Finist\xe8re': u'France',
+                        u'Finlande': u'Finlande',
+                        u'Flandre occidentale': u'Belgique',
+                        u'Flandre orientale': u'Belgique',
+                        u'Floride': u'\xc9tats-Unis',
+                        u'Fl\xf3rina': u'Gr\xe8ce',
+                        u'Formosa': u'Argentine',
+                        u'France': u'France',
+                        u'Franche-Comt\xe9': u'France',
+                        u'Fribourg': u'Suisse',
+                        u'Frioul-V\xe9n\xe9tie-Julienne': u'Italie',
+                        u'Frise': u'Pays-Bas',
+                        u'F\xe9ro\xe9 (\xeeles)': u'F\xe9ro\xe9 (\xeeles)',
+                        u'Gabon': u'Gabon',
+                        u'Galice': u'Espagne',
+                        u'Gambie': u'Gambie',
+                        u'Gard': u'France',
+                        u'Gen\xe8ve': u'Suisse',
+                        u'Gers': u'France',
+                        u'Ghana': u'Ghana',
+                        u'Gibraltar': u'Gibraltar',
+                        u'Gironde': u'France',
+                        u'Glaris': u'Suisse',
+                        u'Goi\xe1s': u'Br\xe9sil',
+                        u'Grande-Bretagne': u'Grande-Bretagne',
+                        u'Grenade': u'Grenade',
+                        u'Greven\xe1': u'Gr\xe8ce',
+                        u'Grisons': u'Suisse',
+                        u'Groenland': u'Groenland',
+                        u'Groningue': u'Pays-Bas',
+                        u'Gr\xe8ce': u'Gr\xe8ce',
+                        u'Gr\xe8ce centrale': u'Gr\xe8ce',
+                        u'Gr\xe8ce occidentale': u'Gr\xe8ce',
+                        u'Guadeloupe': u'Guadeloupe',
+                        u'Guam': u'Guam',
+                        u'Guanajuato': u'Mexique',
+                        u'Guatemala': u'Guatemala',
+                        u'Gueldre': u'Pays-Bas',
+                        u'Guernesey': u'Guernesey',
+                        u'Guerrero': u'Mexique',
+                        u'Guin\xe9e': u'Guin\xe9e',
+                        u'Guin\xe9e \xe9quatoriale': u'Guin\xe9e \xe9quatoriale',
+                        u'Guin\xe9e-Bissau': u'Guin\xe9e-Bissau',
+                        u'Guyana': u'Guyana',
+                        u'Guyane fran\xe7aise': u'Guyane fran\xe7aise',
+                        u'G\xe9orgie': u'\xc9tats-Unis',
+                        u'G\xe9orgie du Sud et les \xeeles Sandwich du Sud': u'G\xe9orgie du Sud et les \xeeles Sandwich du Sud',
+                        u'Hainaut': u'Belgique',
+                        u'Hambourg': u'Allemagne',
+                        u'Haut-Rhin': u'France',
+                        u'Haute-Autriche': u'Autriche',
+                        u'Haute-Corse': u'France',
+                        u'Haute-Garonne': u'France',
+                        u'Haute-Loire': u'France',
+                        u'Haute-Marne': u'France',
+                        u'Haute-Normandie': u'France',
+                        u'Haute-Savoie': u'France',
+                        u'Haute-Sa\xf4ne': u'France',
+                        u'Haute-Vienne': u'France',
+                        u'Hautes-Alpes': u'France',
+                        u'Hautes-Pyr\xe9n\xe9es': u'France',
+                        u'Hauts-de-Seine': u'France',
+                        u'Hawaii': u'\xc9tats-Unis',
+                        u'Ha\xefti': u'Ha\xefti',
+                        u'Heard (\xeele) et \xeeles McDonald': u'Heard (\xeele) et \xeeles McDonald',
+                        u'Hesse': u'Allemagne',
+                        u'Hidalgo': u'Mexique',
+                        u'Hollande-M\xe9ridionale': u'Pays-Bas',
+                        u'Hollande-Septentrionale': u'Pays-Bas',
+                        u'Honduras': u'Honduras',
+                        u'Hong Kong': u'Hong Kong',
+                        u'Hongrie': u'Hongrie',
+                        u'H\xe9rault': u'France',
+                        u'Idaho': u'\xc9tats-Unis',
+                        u'Ille-et-Vilaine': u'France',
+                        u'Illinois': u'\xc9tats-Unis',
+                        u'Inde': u'Inde',
+                        u'Indiana': u'\xc9tats-Unis',
+                        u'Indon\xe9sie': u'Indon\xe9sie',
+                        u'Indre': u'France',
+                        u'Indre-et-Loire': u'France',
+                        u'Iowa': u'\xc9tats-Unis',
+                        u'Io\xe1nnina': u'Gr\xe8ce',
+                        u'Irak': u'Irak',
+                        u'Iran': u'Iran',
+                        u'Irlande': u'Irlande',
+                        u'Ir\xe1kleion': u'Gr\xe8ce',
+                        u'Islande': u'Islande',
+                        u'Isra\xebl': u'Isra\xebl',
+                        u'Is\xe8re': u'France',
+                        u'Italie': u'Italie',
+                        u'Jalisco': u'Mexique',
+                        u'Jama\xefque': u'Jama\xefque',
+                        u'Japon': u'Japon',
+                        u'Jersey': u'Jersey',
+                        u'Jordanie': u'Jordanie',
+                        u'Jujuy': u'Argentine',
+                        u'Jura': u'France',
+                        u'Kansas': u'\xc9tats-Unis',
+                        u'Kard\xedtsa': u'Gr\xe8ce',
+                        u'Kastori\xe1': u'Gr\xe8ce',
+                        u'Kav\xe1la': u'Gr\xe8ce',
+                        u'Kazakhstan': u'Kazakhstan',
+                        u'Kentucky': u'\xc9tats-Unis',
+                        u'Kenya': u'Kenya',
+                        u'Kilk\xeds': u'Gr\xe8ce',
+                        u'Kirghizistan': u'Kirghizistan',
+                        u'Kiribati': u'Kiribati',
+                        u'Kosovo': u'Kosovo',
+                        u'Kowe\xeft': u'Kowe\xeft',
+                        u'Koz\xe1ni': u'Gr\xe8ce',
+                        u'La Can\xe9e': u'Gr\xe8ce',
+                        u'Laconie': u'Gr\xe8ce',
+                        u'Landes': u'France',
+                        u'Languedoc-Roussillon': u'France',
+                        u'Laos': u'Laos',
+                        u'Las\xedthi': u'Gr\xe8ce',
+                        u'Latium': u'Italie',
+                        u'Le Pir\xe9e': u'Gr\xe8ce',
+                        u'Lesotho': u'Lesotho',
+                        u'Lettonie': u'Lettonie',
+                        u'Leucade': u'Gr\xe8ce',
+                        u'Liban': u'Liban',
+                        u'Liberia': u'Liberia',
+                        u'Libye': u'Libye',
+                        u'Liechtenstein': u'Liechtenstein',
+                        u'Ligurie': u'Italie',
+                        u'Limbourg': u'Pays-Bas',
+                        u'Limousin': u'France',
+                        u'Lituanie': u'Lituanie',
+                        u'Li\xe8ge': u'Belgique',
+                        u'Loir-et-Cher': u'France',
+                        u'Loire': u'France',
+                        u'Loire-Atlantique': u'France',
+                        u'Loiret': u'France',
+                        u'Lombardie': u'Italie',
+                        u'Lorraine': u'France',
+                        u'Lot': u'France',
+                        u'Lot-et-Garonne': u'France',
+                        u'Louisiane': u'\xc9tats-Unis',
+                        u'Loz\xe8re': u'France',
+                        u'Lucerne': u'Suisse',
+                        u'Luxembourg': u'Belgique',
+                        u'L\xe1risa': u'Gr\xe8ce',
+                        u'L\xe9svos': u'Gr\xe8ce',
+                        u'Macao': u'Macao',
+                        u'Mac\xe9doine (R\xe9publique)': u'Mac\xe9doine (R\xe9publique)',
+                        u'Mac\xe9doine centrale': u'Gr\xe8ce',
+                        u'Mac\xe9doine occidentale': u'Gr\xe8ce',
+                        u'Mac\xe9doine orientale et Thrace': u'Gr\xe8ce',
+                        u'Madagascar': u'Madagascar',
+                        u'Magn\xe9sie': u'Gr\xe8ce',
+                        u'Maine': u'\xc9tats-Unis',
+                        u'Maine-et-Loire': u'France',
+                        u'Malaisie': u'Malaisie',
+                        u'Malawi': u'Malawi',
+                        u'Maldives': u'Maldives',
+                        u'Mali': u'Mali',
+                        u'Malouines (\xeeles)': u'Malouines (\xeeles)',
+                        u'Malte': u'Malte',
+                        u'Manche': u'France',
+                        u'Manitoba': u'Canada',
+                        u'Maranh\xe3o': u'Br\xe9sil',
+                        u'Marches': u'Italie',
+                        u'Mariannes du Nord (\xeeles)': u'Mariannes du Nord (\xeeles)',
+                        u'Marne': u'France',
+                        u'Maroc': u'Maroc',
+                        u'Marshall (\xeeles)': u'Marshall (\xeeles)',
+                        u'Martinique': u'Martinique',
+                        u'Maryland': u'\xc9tats-Unis',
+                        u'Massachusetts': u'\xc9tats-Unis',
+                        u'Mato grosso': u'Br\xe9sil',
+                        u'Mato grosso do Sul': u'Br\xe9sil',
+                        u'Maurice': u'Maurice',
+                        u'Mauritanie': u'Mauritanie',
+                        u'Mayenne': u'France',
+                        u'Mayotte': u'Mayotte',
+                        u'Mecklembourg-Pom\xe9ranie ant\xe9rieure': u'Allemagne',
+                        u'Melilla': u'Espagne',
+                        u'Mendoza': u'Argentine',
+                        u'Mess\xe9nie': u'Gr\xe8ce',
+                        u'Meurthe-et-Moselle': u'France',
+                        u'Meuse': u'France',
+                        u'Mexico': u'Mexique',
+                        u'Mexique': u'Mexique',
+                        u'Michigan': u'\xc9tats-Unis',
+                        u'Michoac\xe1n': u'Mexique',
+                        u'Micron\xe9sie': u'Micron\xe9sie',
+                        u'Midi-Pyr\xe9n\xe9es': u'France',
+                        u'Minas Gerais': u'Br\xe9sil',
+                        u'Minnesota': u'\xc9tats-Unis',
+                        u'Misiones': u'Argentine',
+                        u'Mississippi': u'\xc9tats-Unis',
+                        u'Missouri': u'\xc9tats-Unis',
+                        u'Moldavie': u'Moldavie',
+                        u'Molise': u'Italie',
+                        u'Monaco': u'Monaco',
+                        u'Mongolie': u'Mongolie',
+                        u'Montana': u'\xc9tats-Unis',
+                        u'Montserrat': u'Montserrat',
+                        u'Mont\xe9n\xe9gro': u'Mont\xe9n\xe9gro',
+                        u'Morbihan': u'France',
+                        u'Morelos': u'Mexique',
+                        u'Moselle': u'France',
+                        u'Mozambique': u'Mozambique',
+                        u'Murcie': u'Espagne',
+                        u'Myanmar': u'Myanmar',
+                        u'Namibie': u'Namibie',
+                        u'Namur': u'Belgique',
+                        u'Nauru': u'Nauru',
+                        u'Navarre': u'Espagne',
+                        u'Nayarit': u'Mexique',
+                        u'Nebraska': u'\xc9tats-Unis',
+                        u'Neuch\xe2tel': u'Suisse',
+                        u'Neuqu\xe9n': u'Argentine',
+                        u'Nevada': u'\xc9tats-Unis',
+                        u'New Hampshire': u'\xc9tats-Unis',
+                        u'New Jersey': u'\xc9tats-Unis',
+                        u'New York': u'\xc9tats-Unis',
+                        u'Nicaragua': u'Nicaragua',
+                        u'Nidwald': u'Suisse',
+                        u'Niger': u'Niger',
+                        u'Nigeria': u'Nigeria',
+                        u'Niue': u'Niue',
+                        u'Ni\xe8vre': u'France',
+                        u'Nord': u'France',
+                        u'Nord-Pas-de-Calais': u'France',
+                        u'Norfolk (\xeele)': u'Norfolk (\xeele)',
+                        u'Norv\xe8ge': u'Norv\xe8ge',
+                        u'Nouveau Mexique': u'\xc9tats-Unis',
+                        u'Nouveau-Brunswick': u'Canada',
+                        u'Nouvelle-Cal\xe9donie': u'Nouvelle-Cal\xe9donie',
+                        u'Nouvelle-Galles-du-Sud': u'Australie',
+                        u'Nouvelle-Z\xe9lande': u'Nouvelle-Z\xe9lande',
+                        u'Nouvelle-\xc9cosse': u'Canada',
+                        u'Nuevo Le\xf3n': u'Mexique',
+                        u'N\xe9pal': u'N\xe9pal',
+                        u'Oaxaca': u'Mexique',
+                        u'Obwald': u'Suisse',
+                        u'Ohio': u'\xc9tats-Unis',
+                        u'Oise': u'France',
+                        u'Oklahoma': u'\xc9tats-Unis',
+                        u'Oman': u'Oman',
+                        u'Ombrie': u'Italie',
+                        u'Ontario': u'Canada',
+                        u'Oregon': u'\xc9tats-Unis',
+                        u'Orne': u'France',
+                        u'Ouganda': u'Ouganda',
+                        u'Ouzb\xe9kistan': u'Ouzb\xe9kistan',
+                        u'Overijssel': u'Pays-Bas',
+                        u'Pakistan': u'Pakistan',
+                        u'Palau': u'Palau',
+                        u'Pampa': u'Argentine',
+                        u'Panama': u'Panama',
+                        u'Papouasie-Nouvelle-Guin\xe9e': u'Papouasie-Nouvelle-Guin\xe9e',
+                        u'Paraguay': u'Paraguay',
+                        u'Paraiba': u'Br\xe9sil',
+                        u'Paran\xe1': u'Br\xe9sil',
+                        u'Paris': u'France',
+                        u'Par\xe1': u'Br\xe9sil',
+                        u'Pas-de-Calais': u'France',
+                        u'Pays Basque': u'Espagne',
+                        u'Pays-Bas': u'Pays-Bas',
+                        u'Pays-Bas avant 1830': u'Pays-Bas avant 1830',
+                        u'Pays-de-la-Loire': u'France',
+                        u'Pennsylvanie': u'\xc9tats-Unis',
+                        u'Pernambouc': u'Br\xe9sil',
+                        u'Philippines': u'Philippines',
+                        u'Phocide': u'Gr\xe8ce',
+                        u'Phtiotide': u'Gr\xe8ce',
+                        u'Piau\xed': u'Br\xe9sil',
+                        u'Picardie': u'France',
+                        u'Pitcairn': u'Pitcairn',
+                        u'Pi\xe9mont': u'Italie',
+                        u'Pi\xe9rie': u'Gr\xe8ce',
+                        u'Poitou-Charentes': u'France',
+                        u'Pologne': u'Pologne',
+                        u'Polyn\xe9sie fran\xe7aise': u'Polyn\xe9sie fran\xe7aise',
+                        u'Porto Rico': u'Porto Rico',
+                        u'Portugal': u'Portugal',
+                        u'Pouilles': u'Italie',
+                        u"Provence-Alpes-C\xf4te d'Azur": u'France',
+                        u'Pr\xe9veza': u'Gr\xe8ce',
+                        u'Puebla': u'Mexique',
+                        u'Puy-de-D\xf4me': u'France',
+                        u'Pyr\xe9n\xe9es-Atlantiques': u'France',
+                        u'Pyr\xe9n\xe9es-Orientales': u'France',
+                        u'P\xe9lla': u'Gr\xe8ce',
+                        u'P\xe9loponn\xe8se': u'Gr\xe8ce',
+                        u'P\xe9rou': u'P\xe9rou',
+                        u'Qatar': u'Qatar',
+                        u'Queensland': u'Australie',
+                        u'Quer\xe9taro': u'Mexique',
+                        u'Quintana Roo': u'Mexique',
+                        u'Qu\xe9bec': u'Canada',
+                        u'Rhode Island': u'\xc9tats-Unis',
+                        u'Rhodope': u'Gr\xe8ce',
+                        u'Rh\xe9nanie-Palatinat': u'Allemagne',
+                        u'Rh\xe9nanie-du-Nord-Westphalie': u'Allemagne',
+                        u'Rh\xf4ne': u'France',
+                        u'Rh\xf4ne-Alpes': u'France',
+                        u'Rio Grande do Norte': u'Br\xe9sil',
+                        u'Rio Grande do Sul': u'Br\xe9sil',
+                        u'Rio Negro': u'Argentine',
+                        u'Rio de Janeiro': u'Br\xe9sil',
+                        u'Rioja': u'Argentine',
+                        u'Rond\xf4nia': u'Br\xe9sil',
+                        u'Roraima': u'Br\xe9sil',
+                        u'Roumanie': u'Roumanie',
+                        u'Royaume-Uni': u'Grande-Bretagne',
+                        u'Russie (F\xe9d\xe9ration)': u'Russie (F\xe9d\xe9ration)',
+                        u'Rwanda': u'Rwanda',
+                        u'R\xe9publique Tch\xe8que': u'R\xe9publique tch\xe8que',
+                        u'R\xe9publique dominicaine': u'R\xe9publique dominicaine',
+                        u'R\xe9publique tch\xe8que': u'R\xe9publique tch\xe8que',
+                        u'R\xe9thymnon': u'Gr\xe8ce',
+                        u'R\xe9union': u'R\xe9union',
+                        u'Sahara occidental': u'Sahara occidental',
+                        u'Saint-Barth\xe9lemy': u'Saint-Barth\xe9lemy',
+                        u'Saint-Gall': u'Suisse',
+                        u'Saint-Kitts-et-Nevis': u'Saint-Kitts-et-Nevis',
+                        u'Saint-Marin': u'Saint-Marin',
+                        u'Saint-Martin (partie fran\xe7aise)': u'Saint-Martin (partie fran\xe7aise)',
+                        u'Saint-Martin (partie n\xe9erlandaise)': u'Saint-Martin (partie n\xe9erlandaise)',
+                        u'Saint-Pierre-et-Miquelon': u'Saint-Pierre-et-Miquelon',
+                        u'Saint-Vincent-et-les Grenadines': u'Saint-Vincent-et-les Grenadines',
+                        u'Sainte-H\xe9l\xe8ne': u'Sainte-H\xe9l\xe8ne',
+                        u'Sainte-Lucie': u'Sainte-Lucie',
+                        u'Salomon (\xeeles)': u'Salomon (\xeeles)',
+                        u'Salta': u'Argentine',
+                        u'Salzbourg': u'Autriche',
+                        u'Samoa': u'Samoa',
+                        u'Samoa am\xe9ricaines': u'Samoa am\xe9ricaines',
+                        u'San Juan': u'Argentine',
+                        u'San Luis': u'Argentine',
+                        u'San Luis Potos\xed': u'Mexique',
+                        u'Santa Catarina': u'Br\xe9sil',
+                        u'Santa Cruz': u'Argentine',
+                        u'Santa Fe': u'Argentine',
+                        u'Santiago del Estero': u'Argentine',
+                        u'Sao Tom\xe9-et-Principe': u'Sao Tom\xe9-et-Principe',
+                        u'Sardaigne': u'Italie',
+                        u'Sarre': u'Allemagne',
+                        u'Sarthe': u'France',
+                        u'Saskatchewan': u'Canada',
+                        u'Savoie': u'France',
+                        u'Saxe': u'Allemagne',
+                        u'Saxe-Anhalt': u'Allemagne',
+                        u'Sa\xf4ne-et-Loire': u'France',
+                        u'Schaffhouse': u'Suisse',
+                        u'Schleswig-Holstein': u'Allemagne',
+                        u'Schwyz': u'Suisse',
+                        u'Seine-Maritime': u'France',
+                        u'Seine-Saint-Denis': u'France',
+                        u'Seine-et-Marne': u'France',
+                        u'Serbie': u'Serbie',
+                        u'Serbie-et-Mont\xe9n\xe9gro': u'Serbie-et-Mont\xe9n\xe9gro',
+                        u'Sergipe': u'Br\xe9sil',
+                        u'Seychelles': u'Seychelles',
+                        u'Sicile': u'Italie',
+                        u'Sierra Leone': u'Sierra Leone',
+                        u'Sinaloa': u'Mexique',
+                        u'Singapour': u'Singapour',
+                        u'Slovaquie': u'Slovaquie',
+                        u'Slov\xe9nie': u'Slov\xe9nie',
+                        u'Soleure': u'Suisse',
+                        u'Somalie': u'Somalie',
+                        u'Somme': u'France',
+                        u'Sonora': u'Mexique',
+                        u'Soudan': u'Soudan',
+                        u'Soudan du Sud': u'Soudan du Sud',
+                        u'Sri Lanka': u'Sri Lanka',
+                        u'Styrie': u'Autriche',
+                        u'Suisse': u'Suisse',
+                        u'Suriname': u'Suriname',
+                        u'Su\xe8de': u'Su\xe8de',
+                        u'Svalbard et \xeele Jan Mayen': u'Svalbard et \xeele Jan Mayen',
+                        u'Swaziland': u'Swaziland',
+                        u'Syrie': u'Syrie',
+                        u'S\xe1mos': u'Gr\xe8ce',
+                        u'S\xe3o Paulo': u'Br\xe9sil',
+                        u'S\xe9n\xe9gal': u'S\xe9n\xe9gal',
+                        u'S\xe9rrai': u'Gr\xe8ce',
+                        u'Tabasco': u'Mexique',
+                        u'Tadjikistan': u'Tadjikistan',
+                        u'Tamaulipas': u'Mexique',
+                        u'Tanzanie': u'Tanzanie',
+                        u'Tarn': u'France',
+                        u'Tarn-et-Garonne': u'France',
+                        u'Tasmanie': u'Australie',
+                        u'Ta\xefwan': u'Ta\xefwan',
+                        u'Tchad': u'Tchad',
+                        u'Tch\xe9coslovaquie': u'Tch\xe9coslovaquie',
+                        u'Tennessee': u'\xc9tats-Unis',
+                        u'Terre de Feu': u'Argentine',
+                        u'Terre de la Reine-Maud (Norv\xe8ge)': u'Antarctique',
+                        u'Terre-Neuve': u'Canada',
+                        u'Terres australes et antarctiques fran\xe7aises': u'Antarctique',
+                        u'Terres australes fran\xe7aises': u'Terres australes fran\xe7aises',
+                        u'Territoire antarctique australien': u'Antarctique',
+                        u'Territoire antarctique britannique': u'Antarctique',
+                        u"Territoire britannique de l'Oc\xe9an indien": u"Territoire britannique de l'Oc\xe9an indien",
+                        u'Territoire de la capitale australienne': u'Australie',
+                        u'Territoire du Nord': u'Australie',
+                        u'Territoire du Yukon': u'Canada',
+                        u'Territoire-de-Belfort': u'France',
+                        u'Territoires du Nord-Ouest': u'Canada',
+                        u'Tessin': u'Suisse',
+                        u'Texas': u'\xc9tats-Unis',
+                        u'Tha\xeflande': u'Tha\xeflande',
+                        u'Thesprotie': u'Gr\xe8ce',
+                        u'Thessalie': u'Gr\xe8ce',
+                        u'Thessalonique': u'Gr\xe8ce',
+                        u'Thurgovie': u'Suisse',
+                        u'Thuringe': u'Allemagne',
+                        u'Timor oriental': u'Timor oriental',
+                        u'Tlaxcala': u'Mexique',
+                        u'Togo': u'Togo',
+                        u'Tokelau': u'Tokelau',
+                        u'Tonga': u'Tonga',
+                        u'Toscane': u'Italie',
+                        u'Trentin-Haut-Adige': u'Italie',
+                        u'Trinit\xe9-et-Tobago': u'Trinit\xe9-et-Tobago',
+                        u'Tr\xedkala': u'Gr\xe8ce',
+                        u'Tucum\xe1n': u'Argentine',
+                        u'Tunisie': u'Tunisie',
+                        u'Turkm\xe9nistan': u'Turkm\xe9nistan',
+                        u'Turks et Ca\xefques (\xeeles)': u'Turks et Ca\xefques (\xeeles)',
+                        u'Turquie': u'Turquie',
+                        u'Tuvalu': u'Tuvalu',
+                        u'Tyrol': u'Autriche',
+                        u'URSS': u'URSS',
+                        u'US': u'\xc9tats-Unis',
+                        u'USA': u'\xc9tats-Unis',
+                        u'Ukraine': u'Ukraine',
+                        u'Uri': u'Suisse',
+                        u'Uruguay': u'Uruguay',
+                        u'Utah': u'\xc9tats-Unis',
+                        u'Utrecht': u'Pays-Bas',
+                        u"Val d'Aoste": u'Italie',
+                        u"Val-d'Oise": u'France',
+                        u'Val-de-Marne': u'France',
+                        u'Valais': u'Suisse',
+                        u'Vanuatu': u'Vanuatu',
+                        u'Var': u'France',
+                        u'Vatican': u'Vatican',
+                        u'Vaucluse': u'France',
+                        u'Vaud': u'Suisse',
+                        u'Vend\xe9e': u'France',
+                        u'Venezuela': u'Venezuela',
+                        u'Veracruz': u'Mexique',
+                        u'Vermont': u'\xc9tats-Unis',
+                        u'Victoria': u'Australie',
+                        u'Vienne': u'Autriche',
+                        u'Viet Nam': u'Viet Nam',
+                        u'Virginie': u'\xc9tats-Unis',
+                        u'Virginie occidentale': u'\xc9tats-Unis',
+                        u'Vorarlberg': u'Autriche',
+                        u'Vosges': u'France',
+                        u'V\xe9n\xe9tie': u'Italie',
+                        u'Wallis et Futuna (\xeeles)': u'Wallis et Futuna (\xeeles)',
+                        u'Washington': u'\xc9tats-Unis',
+                        u'Wisconsin': u'\xc9tats-Unis',
+                        u'Wyoming': u'\xc9tats-Unis',
+                        u'X\xe1nthi': u'Gr\xe8ce',
+                        u'Yonne': u'France',
+                        u'Yougoslavie': u'Yougoslavie',
+                        u'Yucat\xe1n': u'Mexique',
+                        u'Yvelines': u'France',
+                        u'Y\xe9men': u'Y\xe9men',
+                        u'Y\xe9men (R\xe9publique d\xe9mocratique populaire)': u'Y\xe9men (R\xe9publique d\xe9mocratique populaire)',
+                        u'Zacatecas': u'Mexique',
+                        u'Zambie': u'Zambie',
+                        u'Zimbabwe': u'Zimbabwe',
+                        u'Zoug': u'Suisse',
+                        u'Zurich': u'Suisse',
+                        u'Z\xe1kynthos': u'Gr\xe8ce',
+                        u'Z\xe9lande': u'Pays-Bas',
+                        u'aire g\xe9ographique ancienne': u'aire g\xe9ographique ancienne',
+                        u'code non adapt\xe9': u'code non adapt\xe9',
+                        u'inconnu': u'inconnu',
+                        u'intergouvernemental': u'intergouvernemental',
+                        u'multiple': u'multiple',
+                        u"ne s'applique pas": u"ne s'applique pas",
+                        u'non renseign\xe9': u'non renseign\xe9',
+                        u'\xc1rta': u'Gr\xe8ce',
+                        u'\xc9gypte': u'\xc9gypte',
+                        u'\xc9lide': u'Gr\xe8ce',
+                        u'\xc9mathie': u'Gr\xe8ce',
+                        u'\xc9milie-Romagne': u'Italie',
+                        u'\xc9mirats arabes unis': u'\xc9mirats arabes unis',
+                        u'\xc9pire': u'Gr\xe8ce',
+                        u'\xc9quateur': u'\xc9quateur',
+                        u'\xc9rythr\xe9e': u'\xc9rythr\xe9e',
+                        u'\xc9tats-Unis': u'\xc9tats-Unis',
+                        u'\xc9thiopie': u'\xc9thiopie',
+                        u'\xc9tolie-et-Acarnanie': u'Gr\xe8ce',
+                        u'\xc9vros': u'Gr\xe8ce',
+                        u'\xcele Pierre 1er (Norv\xe8ge)': u'Antarctique',
+                        u'\xcele de Man': u'\xcele de Man',
+                        u'\xcele du Prince-\xc9douard': u'Canada',
+                        u'\xcele-de-France': u'France',
+                        u'\xceles Bal\xe9ares': u'Espagne',
+                        u'\xceles Ioniennes': u'Gr\xe8ce',
+                        u'\xceles Vierges am\xe9ricaines': u'\xceles Vierges am\xe9ricaines',
+                        u'\xceles Vierges britanniques': u'\xceles Vierges britanniques',
+                        u'\xceles de la Mer \xc9g\xe9e m\xe9ridionale': u'Gr\xe8ce',
+                        u'\xceles de la Mer \xc9g\xe9e septentrionale': u'Gr\xe8ce',
+                        u'\xceles mineures \xe9loign\xe9es des \xc9tats-Unis': u'\xceles mineures \xe9loign\xe9es des \xc9tats-Unis'
+                        }
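The two mappings are meant to chain: REGIONS_TO_COUNTRIES normalizes a region or country label to a country label, and COUNTRIES maps ISO-3166 codes to label strings in which a comma separates alternative labels and '###' separates further variants (a few labels, such as the one for bq, also contain literal commas, so comma-splitting is only approximate). A minimal lookup sketch, assuming the module is imported as data.countries; the iso_codes helper is illustrative, not part of the file:

    # -*- coding: utf-8 -*-
    from data.countries import COUNTRIES, REGIONS_TO_COUNTRIES

    # Normalize a region label to its country label.
    country = REGIONS_TO_COUNTRIES[u'Qu\xe9bec']   # -> u'Canada'

    def iso_codes(label):
        """Illustrative reverse lookup of ISO-3166 codes by label.

        COUNTRIES values are UTF-8 byte strings packing alternatives
        with ',' and '###'; the split is approximate (see above).
        """
        for code, labels in COUNTRIES.items():
            alternatives = labels.decode('utf-8').replace(u'###', u',').split(u',')
            if label in alternatives:
                yield code

    print(list(iso_codes(country)))   # -> ['ca']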
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data/countries_iso_3166.txt	Thu Dec 19 14:45:56 2013 +0000
@@ -0,0 +1,269 @@
+##,non renseigné
+..,non renseigné
+aa,aire géographique ancienne
+ad,Andorre
+ae,Émirats arabes unis
+af,Afghanistan
+ag,Antigua-et-Barbuda
+ai,Anguilla
+al,Albanie
+am,Arménie
+an,Antilles néerlandaises
+ao,Angola
+aq,Antarctique
+ar,Argentine
+as,Samoa américaines
+at,Autriche
+au,Australie
+aw,Aruba
+ax,Aland (îles)
+az,Azerbaïdjan
+ba,Bosnie-Herzégovine
+bb,Barbade
+bd,Bangladesh
+be,Belgique
+bf,Burkina
+bg,Bulgarie
+bh,Bahreïn
+bi,Burundi
+bj,Bénin
+bl,Saint-Barthélemy
+bm,Bermudes
+bn,Brunéi
+bo,Bolivie
+bq,Bonaire, Saint-Eustache et Saba,Saba###Saint-Eustache
+br,Brésil
+bs,Bahamas
+bt,Bhoutan
+bv,Bouvet (île)
+bw,Botswana
+by,Biélorussie,Bélarus
+bz,Belize
+ca,Canada
+cc,Cocos (îles),Keeling (îles)
+cd,Congo (République démocratique),Zaïre
+cf,Centrafrique,République centrafricaine
+cg,Congo,Congo (République)
+ch,Suisse,Confédération helvétique
+ci,Côte d'Ivoire
+ck,Cook (îles)
+cl,Chili
+cm,Cameroun
+cn,Chine,Chine (République populaire)
+co,Colombie
+cr,Costa Rica
+cs,Serbie-et-Monténégro
+cu,Cuba
+cv,Cap-Vert
+cw,Curaçao
+cx,Christmas (île)
+cy,Chypre
+cz,République tchèque,Tchèque, République
+dd,Allemagne (République démocratique)
+de,Allemagne,Allemagne (République fédérale)
+dj,Djibouti
+dk,Danemark
+dm,Dominique
+do,République dominicaine,Dominicaine, République
+dz,Algérie
+ec,Équateur
+ee,Estonie
+eg,Égypte
+eh,Sahara occidental
+er,Érythrée
+es,Espagne
+et,Éthiopie
+fi,Finlande
+fj,Fidji
+fk,Malouines (îles),Falkland (îles)
+fm,Micronésie,États fédérés de Micronésie
+fo,Féroé (îles)
+fr,France
+ga,Gabon
+gb,Grande-Bretagne,Royaume-Uni
+gd,Grenade
+ge,Géorgie
+gf,Guyane française
+gg,Guernesey
+gh,Ghana
+gi,Gibraltar
+gl,Groenland
+gm,Gambie
+gn,Guinée
+gp,Guadeloupe
+gq,Guinée équatoriale
+gr,Grèce
+gs,Géorgie du Sud et les îles Sandwich du Sud
+gt,Guatemala
+gu,Guam
+gw,Guinée-Bissau
+gy,Guyana
+hk,Hong Kong
+hm,Heard (île) et îles McDonald
+hn,Honduras
+hr,Croatie
+ht,Haïti
+hu,Hongrie
+id,Indonésie
+ie,Irlande
+ii,intergouvernemental
+il,Israël
+im,Île de Man,Man, Île de
+in,Inde
+io,Territoire britannique de l'Océan indien,Chagos (îles)###Océan indien, Territoire britannique de l'
+iq,Irak
+ir,Iran
+is,Islande
+it,Italie
+je,Jersey
+jm,Jamaïque
+jo,Jordanie
+jp,Japon
+ke,Kenya
+kg,Kirghizistan
+kh,Cambodge
+ki,Kiribati
+km,Comores
+kn,Saint-Kitts-et-Nevis,Saint-Christophe-et-Nevis
+ko,Kosovo
+kp,Corée (République populaire démocratique),Corée du Nord
+kr,Corée (République),Corée du Sud
+kw,Koweït
+ky,Cayman,Caïmanes, Îles###Caïman (îles)
+kz,Kazakhstan
+la,Laos
+lb,Liban
+lc,Sainte-Lucie
+li,Liechtenstein
+lk,Sri Lanka
+lr,Liberia
+ls,Lesotho
+lt,Lituanie
+lu,Luxembourg
+lv,Lettonie
+ly,Libye
+ma,Maroc
+mc,Monaco
+md,Moldavie,Moldova, République de
+me,Monténégro
+mf,Saint-Martin (partie française)
+mg,Madagascar
+mh,Marshall (îles)
+mk,Macédoine (République)
+ml,Mali
+mm,Myanmar,Birmanie
+mn,Mongolie
+mo,Macao
+mp,Mariannes du Nord (îles)
+mq,Martinique
+mr,Mauritanie
+ms,Montserrat
+mt,Malte
+mu,Maurice
+mv,Maldives
+mw,Malawi
+mx,Mexique
+my,Malaisie
+mz,Mozambique
+na,Namibie
+nc,Nouvelle-Calédonie
+ne,Niger
+nf,Norfolk (île)
+ng,Nigeria
+ni,Nicaragua
+nl,Pays-Bas
+no,Norvège
+np,Népal
+nr,Nauru
+nu,Niue
+nz,Nouvelle-Zélande
+om,Oman
+oo,code non adapté
+pa,Panama
+pe,Pérou
+pf,Polynésie française
+pg,Papouasie-Nouvelle-Guinée
+ph,Philippines
+pk,Pakistan
+pl,Pologne
+pm,Saint-Pierre-et-Miquelon
+pn,Pitcairn
+pr,Porto Rico
+ps,Autorité palestinienne,Palestine
+pt,Portugal
+pw,Palau,Palaos
+py,Paraguay
+qa,Qatar
+re,Réunion
+ro,Roumanie
+rs,Serbie
+ru,Russie (Fédération),Russie
+rw,Rwanda
+sa,Arabie saoudite
+sb,Salomon (îles)
+sc,Seychelles
+sd,Soudan
+se,Suède
+sg,Singapour
+sh,Sainte-Hélène,Ascension (île)###Tristan da Cunha (île)
+si,Slovénie
+sj,Svalbard et île Jan Mayen
+sk,Slovaquie
+sl,Sierra Leone
+sm,Saint-Marin
+sn,Sénégal
+so,Somalie
+sr,Suriname
+ss,Soudan du Sud,Sud Soudan
+st,Sao Tomé-et-Principe
+su,URSS
+sv,El Salvador,Salvador
+sx,Saint-Martin (partie néerlandaise),Sint Maarten
+sy,Syrie
+sz,Swaziland
+tc,Turks et Caïques (îles)
+td,Tchad
+tf,Terres australes françaises
+tg,Togo
+th,Thaïlande
+tj,Tadjikistan
+tk,Tokelau
+tl,Timor oriental
+tm,Turkménistan
+tn,Tunisie
+to,Tonga
+tr,Turquie
+tt,Trinité-et-Tobago
+tv,Tuvalu
+tw,Taïwan,Chine (République)
+tz,Tanzanie
+ua,Ukraine
+ug,Ouganda
+um,Îles mineures éloignées des États-Unis
+us,États-Unis
+uy,Uruguay
+uz,Ouzbékistan
+va,Vatican,Saint-Siège
+vc,Saint-Vincent-et-les Grenadines
+ve,Venezuela
+vg,Îles Vierges britanniques,Vierges (îles) britanniques
+vi,Îles Vierges américaines,Vierges (îles) américaines
+vn,Viet Nam
+vu,Vanuatu
+wf,Wallis et Futuna (îles)
+ws,Samoa,Samoa occidentales
+xc,Tchécoslovaquie
+xd,Allemagne avant 1945
+xe,Europe,Union européenne
+xk,Corée avant 1948
+xn,Pays-Bas avant 1830,Belgique avant 1830
+xx,inconnu
+yd,Yémen (République démocratique populaire),Sud Yémen
+ye,Yémen
+yt,Mayotte
+yu,Yougoslavie
+yy,ne s'applique pas
+za,Afrique du Sud
+zm,Zambie
+zw,Zimbabwe
+zz,multiple
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data/stopwords.py	Thu Dec 19 14:45:56 2013 +0000
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+"""
+Stopwords in different languages.
+"""
+
+FRENCH_STOPWORDS = set(['alors', 'au', 'aucuns', 'aussi', 'autre', 'aux', 'avant', 'avec', 'avoir', 'bon', 'car', 'ce', 'cela', 'ces', 'ceux', 'chaque', 'ci', 'comme', 'comment', 'dans', 'de', 'dedans', 'dehors', 'depuis', 'des', 'deux', 'devrait', 'doit', 'donc', 'dos', 'droite', 'du', 'début', 'elle', 'elles', 'en', 'encore', 'essai', 'est', 'et', 'eu', 'eux', 'fait', 'faites', 'fois', 'font', 'force', 'haut', 'hors', 'ici', 'il', 'ils', 'je', 'juste', 'la', 'le', 'les', 'leur', 'lui', 'là', 'ma', 'maintenant', 'mais', 'me', 'meme', 'mes', 'mine', 'moi', 'moins', 'mon', 'mot', 'ne', 'ni', 'nommés', 'nos', 'notre', 'nous', 'nouveaux', 'on', 'ou', 'où', 'par', 'parce', 'parole', 'pas', 'personnes', 'peu', 'peut', 'pièce', 'plupart', 'pour', 'pourquoi', 'qu', 'quand', 'que', 'quel', 'quelle', 'quelles', 'quels', 'qui', 'sa', 'sans', 'se', 'ses', 'seulement', 'si', 'sien', 'son', 'sont', 'sous', 'soyez', 'sujet', 'sur', 'ta', 'tandis', 'te', 'tellement', 'tels', 'tes', 'toi', 'ton', 'tous', 'tout', 'trop', 'très', 'tu', 'un', 'une', 'valeur', 'voie', 'voient', 'vont', 'vos', 'votre', 'vous', 'vu', 'ça', 'étaient', 'état', 'étions', 'été', 'être'])
+
+
+ENGLISH_STOPWORDS = set(['a', 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', "ain't", 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', "aren't", 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'bill', 'both', 'bottom', 'brief', 'but', 'by', 'call', 'came', 'can', 'cannot', 'cant', "can't", 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'computer', 'con', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'containing', 'contains', 'corresponding', 'could', 'couldnt', "couldn't", 'course', 'cry', 'currently', "c'mon", "c's", 'de', 'definitely', 'describe', 'described', 'despite', 'detail', 'did', "didn't", 'different', 'do', 'does', "doesn't", 'doing', 'done', "don't", 'down', 'downwards', 'due', 'during', 'each', 'edu', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'exactly', 'example', 'except', 'far', 'few', 'fifteen', 'fifth', 'fifty', 'fill', 'find', 'fire', 'first', 'five', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'furthermore', 'get', 'gets', 'getting', 'give', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got', 'gotten', 'greetings', 'had', "hadn't", 'happens', 'hardly', 'has', 'hasnt', "hasn't", 'have', "haven't", 'having', 'he', 'hello', 'help', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', "here's", 'hers', 'herself', "he's", 'hi', 'him', 'himself', 'his', 'hither', 'hopefully', 'how', 'howbeit', 'however', 'hundred', 'i', "i'd", "i'll", "i'm", "i've", 'ie', 'if', 'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'indeed', 'indicate', 'indicated', 'indicates', 'inner', 'insofar', 'instead', 'interest', 'into', 'inward', 'is', "isn't", 'it', 'its', 'itself', "it'd", "it'll", "it's", 'just', 'keep', 'keeps', 'kept', 'know', 'known', 'knows', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', "let's", 'like', 'liked', 'likely', 'little', 'look', 'looking', 'looks', 'ltd', 'made', 'mainly', 'many', 'may', 'maybe', 'me', 'mean', 'meanwhile', 'merely', 'might', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'my', 'myself', 'name', 'namely', 'nd', 'near', 'nearly', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'no', 'nobody', 'non', 'none', 'noone', 'nor', 'normally', 'not', 'nothing', 'novel', 'now', 'nowhere', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'own', 'part', 'particular', 'particularly', 'per', 'perhaps', 'placed', 'please', 'plus', 'possible', 'presumably', 'probably', 'provides', 'put', 'que', 'quite', 'qv', 'rather', 'rd', 're', 'really', 'reasonably', 'regarding', 'regardless', 'regards', 'relatively', 'respectively', 'right', 'said', 'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall', 'she', 'should', "shouldn't", 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', 'sure', 'system', 'take', 'taken', 'tell', 'ten', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', "that's", 'thats', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'theres', 'thereupon', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'thick', 'thin', 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'took', 'top', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'twelve', 'twenty', 'twice', 'two', "t's", 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', 'used', 'useful', 'uses', 'using', 'usually', 'value', 'various', 'very', 'via', 'viz', 'vs', 'want', 'wants', 'was', "wasn't", 'way', 'we', 'welcome', 'well', 'went', 'were', "weren't", "we'd", "we'll", "we're", "we've", 'what', 'whatever', "what's", 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', "where's", 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', "who's", 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'wonder', "won't", 'would', "wouldn't", 'yes', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves', "you'd", "you'll", "you're", "you've", 'zero'])
+
+
+ENGLISH_REGULAR_VERBS = set(['accept', 'add', 'admire', 'admit', 'advise', 'afford', 'agree', 'alert', 'allow', 'amuse', 'analyse', 'announce', 'annoy', 'answer', 'apologise', 'appear', 'applaud', 'appreciate', 'approve', 'argue', 'arrange', 'arrest', 'arrive', 'ask', 'attach', 'attack', 'attempt', 'attend', 'attract', 'avoid', 'back', 'bake', 'balance', 'ban', 'bang', 'bare', 'bat', 'bathe', 'battle', 'beam', 'beg', 'behave', 'belong', 'bleach', 'bless', 'blind', 'blink', 'blot', 'blush', 'boast', 'boil', 'bolt', 'bomb', 'book', 'bore', 'borrow', 'bounce', 'bow', 'box', 'brake', 'branch', 'breathe', 'bruise', 'brush', 'bubble', 'bump', 'burn', 'bury', 'buzz', 'calculate', 'call', 'camp', 'care', 'carry', 'carve', 'cause', 'challenge', 'change', 'charge', 'chase', 'cheat', 'check', 'cheer', 'chew', 'choke', 'chop', 'claim', 'clap', 'clean', 'clear', 'clip', 'close', 'coach', 'coil', 'collect', 'colour', 'comb', 'command', 'communicate', 'compare', 'compete', 'complain', 'complete', 'concentrate', 'concern', 'confess', 'confuse', 'connect', 'consider', 'consist', 'contain', 'continue', 'copy', 'correct', 'cough', 'count', 'cover', 'crack', 'crash', 'crawl', 'cross', 'crush', 'cry', 'cure', 'curl', 'curve', 'cycle', 'dam', 'damage', 'dance', 'dare', 'decay', 'deceive', 'decide', 'decorate', 'delay', 'delight', 'deliver', 'depend', 'describe', 'desert', 'deserve', 'destroy', 'detect', 'develop', 'disagree', 'disappear', 'disapprove', 'disarm', 'discover', 'dislike', 'divide', 'double', 'doubt', 'drag', 'drain', 'dream', 'dress', 'drip', 'drop', 'drown', 'drum', 'dry', 'dust', 'earn', 'educate', 'embarrass', 'employ', 'empty', 'encourage', 'end', 'enjoy', 'enter', 'entertain', 'escape', 'examine', 'excite', 'excuse', 'exercise', 'exist', 'expand', 'expect', 'explain', 'explode', 'extend', 'face', 'fade', 'fail', 'fancy', 'fasten', 'fax', 'fear', 'fence', 'fetch', 'file', 'fill', 'film', 'fire', 'fit', 'fix', 'flap', 'flash', 'float', 'flood', 'flow', 'flower', 'fold', 'follow', 'fool', 'force', 'form', 'found', 'frame', 'frighten', 'fry', 'gather', 'gaze', 'glow', 'glue', 'grab', 'grate', 'grease', 'greet', 'grin', 'grip', 'groan', 'guarantee', 'guard', 'guess', 'guide', 'hammer', 'hand', 'handle', 'hang', 'happen', 'harass', 'harm', 'hate', 'haunt', 'head', 'heal', 'heap', 'heat', 'help', 'hook', 'hop', 'hope', 'hover', 'hug', 'hum', 'hunt', 'hurry', 'identify', 'ignore', 'imagine', 'impress', 'improve', 'include', 'increase', 'influence', 'inform', 'inject', 'injure', 'instruct', 'intend', 'interest', 'interfere', 'interrupt', 'introduce', 'invent', 'invite', 'irritate', 'itch', 'jail', 'jam', 'jog', 'join', 'joke', 'judge', 'juggle', 'jump', 'kick', 'kill', 'kiss', 'kneel', 'knit', 'knock', 'knot', 'label', 'land', 'last', 'laugh', 'launch', 'learn', 'level', 'license', 'lick', 'lie', 'lighten', 'like', 'list', 'listen', 'live', 'load', 'lock', 'long', 'look', 'love', 'man', 'manage', 'march', 'mark', 'marry', 'match', 'mate', 'matter', 'measure', 'meddle', 'melt', 'memorise', 'mend', 'mess up', 'milk', 'mine', 'miss', 'mix', 'moan', 'moor', 'mourn', 'move', 'muddle', 'mug', 'multiply', 'murder', 'nail', 'name', 'need', 'nest', 'nod', 'note', 'notice', 'number', 'obey', 'object', 'observe', 'obtain', 'occur', 'offend', 'offer', 'open', 'order', 'overflow', 'owe', 'own', 'pack', 'paddle', 'paint', 'park', 'part', 'pass', 'paste', 'pat', 'pause', 'peck', 'pedal', 'peel', 'peep', 'perform', 'permit', 'phone', 'pick', 'pinch', 'pine', 'place', 'plan', 'plant', 'play', 'please', 'plug', 'point', 'poke', 'polish', 'pop', 'possess', 'post', 'pour', 'practise', 'pray', 'preach', 'precede', 'prefer', 'prepare', 'present', 'preserve', 'press', 'pretend', 'prevent', 'prick', 'print', 'produce', 'program', 'promise', 'protect', 'provide', 'pull', 'pump', 'punch', 'puncture', 'punish', 'push', 'question', 'queue', 'race', 'radiate', 'rain', 'raise', 'reach', 'realise', 'receive', 'recognise', 'record', 'reduce', 'reflect', 'refuse', 'regret', 'reign', 'reject', 'rejoice', 'relax', 'release', 'rely', 'remain', 'remember', 'remind', 'remove', 'repair', 'repeat', 'replace', 'reply', 'report', 'reproduce', 'request', 'rescue', 'retire', 'return', 'rhyme', 'rinse', 'risk', 'rob', 'rock', 'roll', 'rot', 'rub', 'ruin', 'rule', 'rush', 'sack', 'sail', 'satisfy', 'save', 'saw', 'scare', 'scatter', 'scold', 'scorch', 'scrape', 'scratch', 'scream', 'screw', 'scribble', 'scrub', 'seal', 'search', 'separate', 'serve', 'settle', 'shade', 'share', 'shave', 'shelter', 'shiver', 'shock', 'shop', 'shrug', 'sigh', 'sign', 'signal', 'sin', 'sip', 'ski', 'skip', 'slap', 'slip', 'slow', 'smash', 'smell', 'smile', 'smoke', 'snatch', 'sneeze', 'sniff', 'snore', 'snow', 'soak', 'soothe', 'sound', 'spare', 'spark', 'sparkle', 'spell', 'spill', 'spoil', 'spot', 'spray', 'sprout', 'squash', 'squeak', 'squeal', 'squeeze', 'stain', 'stamp', 'stare', 'start', 'stay', 'steer', 'step', 'stir', 'stitch', 'stop', 'store', 'strap', 'strengthen', 'stretch', 'strip', 'stroke', 'stuff', 'subtract', 'succeed', 'suck', 'suffer', 'suggest', 'suit', 'supply', 'support', 'suppose', 'surprise', 'surround', 'suspect', 'suspend', 'switch', 'talk', 'tame', 'tap', 'taste', 'tease', 'telephone', 'tempt', 'terrify', 'test', 'thank', 'thaw', 'tick', 'tickle', 'tie', 'time', 'tip', 'tire', 'touch', 'tour', 'tow', 'trace', 'trade', 'train', 'transport', 'trap', 'travel', 'treat', 'tremble', 'trick', 'trip', 'trot', 'trouble', 'trust', 'try', 'tug', 'tumble', 'turn', 'twist', 'type', 'undress', 'unfasten', 'unite', 'unlock', 'unpack', 'untidy', 'use', 'vanish', 'visit', 'wail', 'wait', 'walk', 'wander', 'want', 'warm', 'warn', 'wash', 'waste', 'watch', 'water', 'wave', 'weigh', 'welcome', 'whine', 'whip', 'whirl', 'whisper', 'whistle', 'wink', 'wipe', 'wish', 'wobble', 'wonder', 'work', 'worry', 'wrap', 'wreck', 'wrestle', 'wriggle', 'x-ray', 'yawn', 'yell', 'zip', 'zoom'])
+
+
+ENGLISH_IRREGULAR_VERBS = set(['arise', 'arisen', 'arose', 'ate', 'awake', 'awakened', 'awoke', 'awoken', 'backslid', 'backslidden', 'backslide', 'bade', 'be', 'bear', 'beat', 'beaten', 'became', 'become', 'been', 'began', 'begin', 'begun', 'bend', 'bent', 'bet', 'betted', 'bid', 'bidden', 'bind', 'bit', 'bite', 'bitten', 'bled', 'bleed', 'blew', 'blow', 'blown', 'bore', 'born', 'borne', 'bought', 'bound', 'break', 'bred', 'breed', 'bring', 'broadcast', 'broadcasted', 'broke', 'broken', 'brought', 'build', 'built', 'burn', 'burned', 'burnt', 'burst', 'bust', 'busted', 'buy', 'came', 'cast', 'catch', 'caught', 'choose', 'chose', 'chosen', 'clad', 'cling', 'clothe', 'clothed', 'clung', 'come', 'cost', 'creep', 'crept', 'cut', 'daydream', 'daydreamed', 'daydreamt', 'deal', 'dealt', 'did', 'dig', 'disprove', 'disproved', 'disproven', 'dive', 'dived', 'do', 'done', 'dove', 'drank', 'draw', 'drawn', 'dream', 'dreamed', 'dreamt', 'drew', 'drink', 'drive', 'driven', 'drove', 'drunk', 'dug', 'dwell', 'dwelled', 'dwelt', 'eat', 'eaten', 'fall', 'fallen', 'fed', 'feed', 'feel', 'fell', 'felt', 'fight', 'find', 'fit', 'fitted', 'fled', 'flee', 'flew', 'fling', 'flown', 'flung', 'fly', 'forbade', 'forbid', 'forbidden', 'forecast', 'forego', 'foregone', 'foresaw', 'foresee', 'foreseen', 'foretell', 'foretold', 'forewent', 'forgave', 'forget', 'forgive', 'forgiven', 'forgot', 'forgotten', 'forsake', 'forsaken', 'forsook', 'fought', 'found', 'freeze', 'froze', 'frozen', 'gave', 'get', 'give', 'given', 'go', 'gone', 'got', 'gotten', 'grew', 'grind', 'ground', 'grow', 'grown', 'had', 'hang', 'have', 'hear', 'heard', 'held', 'hew', 'hewed', 'hewn', 'hid', 'hidden', 'hide', 'hit', 'hold', 'hung', 'hurt', 'keep', 'kept', 'kneel', 'kneeled', 'knelt', 'knew', 'knit', 'knitted', 'know', 'known', 'laid', 'lain', 'lay', 'lead', 'lean', 'leaned', 'leant', 'leap', 'leaped', 'leapt', 'learn', 'learned', 'learnt', 'leave', 'led', 'left', 'lend', 'lent', 'let', 'lie', 'lied', 'light', 'lighted', 'lit', 'lose', 'lost', 'made', 'make', 'mean', 'meant', 'meet', 'met', 'misunderstand', 'misunderstood', 'mow', 'mowed', 'mown', 'paid', 'partake', 'partaken', 'partook', 'pay', 'plead', 'pleaded', 'pled', 'proofread', 'prove', 'proved', 'proven', 'put', 'quick-freeze', 'quick-froze', 'quick-frozen', 'quit', 'quitted', 'ran', 'rang', 'read', 'rid', 'ridden', 'ride', 'ring', 'rise', 'risen', 'rode', 'rose', 'run', 'rung', 'said', 'sang', 'sank', 'sat', 'saw', 'sawed', 'sawn', 'say', 'see', 'seek', 'seen', 'sell', 'send', 'sent', 'set', 'sew', 'sewed', 'sewn', 'shake', 'shaken', 'shave', 'shaved', 'shaven', 'shear', 'sheared', 'shed', 'shine', 'shined', 'shone', 'shook', 'shoot', 'shorn', 'shot', 'show', 'showed', 'shown', 'shrank', 'shrink', 'shrunk', 'shut', 'sing', 'sink', 'sit', 'slain', 'slay', 'slayed', 'sleep', 'slept', 'slew', 'slid', 'slide', 'sling', 'slink', 'slinked', 'slit', 'slung', 'slunk', 'smell', 'smelled', 'smelt', 'sneak', 'sneaked', 'snuck', 'sold', 'sought', 'sow', 'sowed', 'sown', 'spat', 'speak', 'sped', 'speed', 'speeded', 'spell', 'spelled', 'spelt', 'spend', 'spent', 'spill', 'spilled', 'spilt', 'spin', 'spit', 'split', 'spoil', 'spoiled', 'spoilt', 'spoke', 'spoken', 'sprang', 'spread', 'spring', 'sprung', 'spun', 'stand', 'stank', 'steal', 'stick', 'sting', 'stink', 'stole', 'stolen', 'stood', 'strew', 'strewed', 'strewn', 'stricken', 'stridden', 'stride', 'strike', 'string', 'strive', 'strived', 'striven', 'strode', 'strove', 'struck', 'strung', 'stuck', 'stung', 'stunk', 'sublet', 'sunburn', 'sunburned', 'sunburnt', 'sung', 'sunk', 'swam', 'swear', 'sweat', 'sweated', 'sweep', 'swell', 'swelled', 'swept', 'swim', 'swing', 'swollen', 'swore', 'sworn', 'swum', 'swung', 'take', 'taken', 'taught', 'teach', 'tear', 'telecast', 'tell', 'test-drive', 'test-driven', 'test-drove', 'test-flew', 'test-flown', 'test-fly', 'think', 'thought', 'threw', 'throw', 'thrown', 'thrust', 'told', 'took', 'tore', 'torn', 'tread', 'trod', 'trodden', 'understand', 'understood', 'undertake', 'undertaken', 'undertook', 'undid', 'undo', 'undone', 'wake', 'waked', 'was', 'were', 'waylaid', 'waylay', 'wear', 'weave', 'weaved', 'wed', 'wedded', 'weep', 'went', 'wept', 'wet', 'wetted', 'whet', 'whetted', 'win', 'wind', 'withdraw', 'withdrawn', 'withdrew', 'withheld', 'withhold', 'withstand', 'withstood', 'woke', 'woken', 'won', 'wore', 'worn', 'wound', 'wove', 'woven', 'wring', 'write', 'written', 'wrote', 'wrung'])
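+
+# Usage sketch: keep only the non-stopword tokens of a French sentence
+# >>> [w for w in ['le', 'chat', 'est', 'sur', 'le', 'toit'] if w not in FRENCH_STOPWORDS]
+# ['chat', 'toit']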
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data/us_states.py	Thu Dec 19 14:45:56 2013 +0000
@@ -0,0 +1,215 @@
+# -*- coding: utf-8 -*-
+
+# See http://en.wikipedia.org/wiki/List_of_U.S._state_abbreviations
+# WARNING: The name of each state should be in French
+# (e.g. "Floride", not "Florida")
+US_STATES = {'AK': 'Alaska',
+             'AL': 'Alabama',
+             'AR': 'Arkansas',
+             'AZ': 'Arizona',
+             'Ala.': 'Alabama',
+             'Alas.': 'Alaska',
+             'Alaska': 'Alaska',
+             'Ariz.': 'Arizona',
+             'Ark.': 'Arkansas',
+             'Az.': 'Arizona',
+             'CA': 'Californie',
+             'CF': 'Californie',
+             'CL': 'Colorado',
+             'CO': 'Colorado',
+             'CT': 'Connecticut',
+             'Ca.': 'Californie',
+             'Cal.': 'Californie',
+             'Cali.': 'Californie',
+             'Calif.': 'Californie',
+             'Col.': 'Colorado',
+             'Colo.': 'Colorado',
+             'Conn.': 'Connecticut',
+             'Ct.': 'Connecticut',
+             'D.C.': 'District de Columbia',
+             'DC': 'District de Columbia',
+             'DE': 'Delaware',
+             'DL': 'Delaware',
+             'De.': 'Delaware',
+             'Del.': 'Delaware',
+             'FL': 'Floride',
+             'Fl.': 'Floride',
+             'Fla.': 'Floride',
+             'Flor.': 'Floride',
+             'GA': u'Géorgie',
+             'Ga.': u'Géorgie',
+             'H.I.': 'Hawaii',
+             'HA': 'Hawaii',
+             'HI': 'Hawaii',
+             'Hawaii': 'Hawaii',
+             'IA': 'Iowa',
+             'ID': 'Idaho',
+             'IL': 'Illinois',
+             'IN': 'Indiana',
+             'Ia.': 'Iowa',
+             'Id.': 'Idaho',
+             'Ida.': 'Idaho',
+             'Idaho': 'Idaho',
+             'Il.': 'Illinois',
+             "Ill's": 'Illinois',
+             'Ill.': 'Illinois',
+             'Ills.': 'Illinois',
+             'In.': 'Indiana',
+             'Ind.': 'Indiana',
+             'Ioa.': 'Iowa',
+             'Iowa': 'Iowa',
+             'KA': 'Kansas',
+             'KS': 'Kansas',
+             'KY': 'Kentucky',
+             'Ka.': 'Kansas',
+             'Kan.': 'Kansas',
+             'Kans.': 'Kansas',
+             'Ks.': 'Kansas',
+             'Ky.': 'Kentucky',
+             'LA': 'Louisiane',
+             'La.': 'Louisiane',
+             'MA': 'Massachusetts',
+             'MC': 'Michigan',
+             'MD': 'Maryland',
+             'ME': 'Maine',
+             'MI': 'Michigan',
+             'MN': 'Minnesota',
+             'MO': 'Missouri',
+             'MS': 'Mississippi',
+             'MT': 'Montana',
+             'Maine': 'Maine',
+             'Mass.': 'Massachusetts',
+             'Md.': 'Maryland',
+             'Me.': 'Maine',
+             'Mich.': 'Michigan',
+             'Minn.': 'Minnesota',
+             'Miss.': 'Mississippi',
+             'Mn.': 'Minnesota',
+             'Mo.': 'Missouri',
+             'Mont.': 'Montana',
+             'N. Car.': 'Caroline du Nord',
+             'N. Dak.': 'Dakota du Nord',
+             'N. Mex.': 'Nouveau-Mexique',
+             'N. York': 'New York',
+             'N.C.': 'Caroline du Nord',
+             'N.D.': 'Dakota du Nord',
+             'N.H.': 'New Hampshire',
+             'N.J.': 'New Jersey',
+             'N.M.': 'Nouveau-Mexique',
+             'N.Y.': 'New York',
+             'NB': 'Nebraska',
+             'NC': 'Caroline du Nord',
+             'ND': 'Dakota du Nord',
+             'NE': 'Nebraska',
+             'NH': 'New Hampshire',
+             'NJ': 'New Jersey',
+             'NM': 'Nouveau-Mexique',
+             'NV': 'Nevada',
+             'NY': 'New York',
+             'Neb.': 'Nebraska',
+             'Nebr.': 'Nebraska',
+             'Nev.': 'Nevada',
+             'New M.': 'Nouveau-Mexique',
+             'NoDak': 'Dakota du Nord',
+             'Nv.': 'Nevada',
+             'O.': 'Ohio',
+             'OH': 'Ohio',
+             'OK': 'Oklahoma',
+             'OR': 'Oregon',
+             'Oh.': 'Ohio',
+             'Ohio': 'Ohio',
+             'Ok.': 'Oklahoma',
+             'Okla.': 'Oklahoma',
+             'Or.': 'Oregon',
+             'Ore.': 'Oregon',
+             'Oreg.': 'Oregon',
+             'PA': 'Pennsylvanie',
+             'Pa.': 'Pennsylvanie',
+             'R.I.': 'Rhode Island',
+             'R.I. & P.P.': 'Rhode Island',
+             'RI': 'Rhode Island',
+             'S. Car.': 'Caroline du Sud',
+             'S. Dak.': 'Dakota du Sud',
+             'S.C.': 'Caroline du Sud',
+             'S.D.': 'Dakota du Sud',
+             'SC': 'Caroline du Sud',
+             'SD': 'Dakota du Sud',
+             'SoDak': 'Dakota du Sud',
+             'State': 'Utah',
+             'TN': 'Tennessee',
+             'TX': 'Texas',
+             'Tenn.': 'Tennessee',
+             'Tex.': 'Texas',
+             'Texas': 'Texas',
+             'Tn.': 'Tennessee',
+             'Tx.': 'Texas',
+             'US-AL': 'Alabama',
+             'US-AR': 'Arkansas',
+             'US-AZ': 'Arizona',
+             'US-CA': 'Californie',
+             'US-CO': 'Colorado',
+             'US-CT': 'Connecticut',
+             'US-DC': 'District de Columbia',
+             'US-DE': 'Delaware',
+             'US-FL': 'Floride',
+             'US-GA': u'Géorgie',
+             'US-IL': 'Illinois',
+             'US-IN': 'Indiana',
+             'US-KY': 'Kentucky',
+             'US-LA': 'Louisiane',
+             'US-MA': 'Massachusetts',
+             'US-MD': 'Maryland',
+             'US-MI': 'Michigan',
+             'US-MN': 'Minnesota',
+             'US-MO': 'Missouri',
+             'US-MS': 'Mississippi',
+             'US-MT': 'Montana',
+             'US-NC': 'Caroline du Nord',
+             'US-ND': 'Dakota du Nord',
+             'US-NE': 'Nebraska',
+             'US-NH': 'New Hampshire',
+             'US-NJ': 'New Jersey',
+             'US-NM': 'Nouveau-Mexique',
+             'US-NY': 'New York',
+             'US-OK': 'Oklahoma',
+             'US-PA': 'Pennsylvanie',
+             'US-RI': 'Rhode Island',
+             'US-SC': 'Caroline du Sud',
+             'US-SD': 'Dakota du Sud',
+             'US-TN': 'Tennessee',
+             'US-VA': 'Virginie',
+             'US-VT': 'Vermont',
+             'US-WA': 'Washington',
+             'US-WI': 'Wisconsin',
+             'US-WV': 'Virginie occidentale',
+             'US-WY': 'Wyoming',
+             'UT': 'Utah',
+             'Ut.': 'Utah',
+             'Utah': 'Utah',
+             'VA': 'Virginie',
+             'VT': 'Vermont',
+             'Va.': 'Virginie',
+             'Vt.': 'Vermont',
+             'W. Va.': 'Virginie occidentale',
+             'W. Virg.': 'Virginie occidentale',
+             'W.V.': 'Virginie occidentale',
+             'W.Va.': 'Virginie occidentale',
+             'WA': 'Washington',
+             'WI': 'Wisconsin',
+             'WN': 'Washington',
+             'WS': 'Wisconsin',
+             'WV': 'Virginie occidentale',
+             'WY': 'Wyoming',
+             'Wa.': 'Washington',
+             'Wash.': 'Washington',
+             'Wash. D.C.': 'District de Columbia',
+             'Wi.': 'Wisconsin',
+             'Wis.': 'Wisconsin',
+             'Wisc.': 'Wisconsin',
+             'Wn.': 'Washington',
+             'Wy.': 'Wyoming',
+             'Wyo.': 'Wyoming'}
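+
+# Usage sketch: normalize any recognized abbreviation to the French state name
+# >>> US_STATES.get('Fla.'), US_STATES.get('US-NC')
+# ('Floride', 'Caroline du Nord')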
--- a/demo.py	Thu Dec 19 14:45:43 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,197 +0,0 @@
-#!/usr/bin/python
-#-*- coding:utf-8 -*-
-
-from os import path
-
-import urllib
-
-import nazca.distances as ald
-import nazca.normalize as aln
-from nazca.aligner import align, subalign, findneighbours, alignall
-from nazca.dataio import parsefile, sparqlquery, write_results
-
-DEMODIR = path.dirname(__file__)
-
-def dpath(filename):
-    return path.join(DEMODIR, 'demo', filename)
-
-def remove_after(string, sub):
-    try:
-        return string[:string.lower().index(sub)].strip()
-    except ValueError:
-        return string
-
-def parserql(host, rql):
-    filehandle = urllib.urlopen('%(host)sview?'
-                                'rql=%(rql)s&vid=csvexport'
-                                % {'rql': rql, 'host': host})
-    filehandle.readline()
-    rset = [[e.decode('utf-8') for e in line.strip().split(';')]
-            for line in filehandle]
-    return rset
-
-def demo_0():
-    # prixgoncourt is the list of Goncourt Prize winners, extracted
-    # from Wikipedia
-
-    # We try to align Goncourt winners onto DBpedia results
-
-    query = """
-       SELECT ?writer, ?name WHERE {
-          ?writer  <http://purl.org/dc/terms/subject> <http://dbpedia.org/resource/Category:French_novelists>.
-          ?writer rdfs:label ?name.
-          FILTER(lang(?name) = 'fr')
-       }
-    """
-
-    print "Sending query to dbpedia"
-    targetset = sparqlquery('http://dbpedia.org/sparql', query)
-    print "Reading the prixgoncourt file"
-    alignset = parsefile(dpath('prixgoncourt'), indexes=[1, 1])
-
-    tr_name = {'normalization': [lambda x:remove_after(x, '('),
-                                 aln.simplify],
-               'metric': ald.levenshtein
-              }
-
-    processings = {1: tr_name}
-
-    print "Alignment started"
-    align(alignset, targetset, 0.4, processings,
-          dpath('demo0_results'))
-
-    print "Done, see the resuls in %s" % dpath('demo0_results')
-
-def demo_1():
-    # FR.txt is an extract of Geonames, where locations have been sorted by name
-    # frenchbnf is an extract of the French BNF's locations, also sorted by name
-
-    # For each line (i.e. location) we keep the identifier, the name and the
-    # position (longitude, latitude)
-    # ``nbmax`` is the number of locations to load
-
-    print "Parsing the input files"
-    targetset = parsefile(dpath('FR.txt'), indexes=[0, 1, (4, 5)],
-                          nbmax=2000)
-    alignset = parsefile(dpath('frenchbnf'),
-                         indexes=[0, 2, (14, 12)], nbmax=1000)
-
-
-    # Let's define the processings to apply on the location's name
-    tr_name = {'normalization': [aln.simplify], # Simplify all the names (remove
-                                              #   punctuation, lower case, etc.)
-               'metric': ald.levenshtein,       # Use the levenshtein distance
-               'weighting': 1                 # Use 1 as the name-distance matrix
-                                              #   weighting coefficient
-              }
-    tr_geo = {'normalization': [],              # No normalization needed
-              'metric': ald.geographical,         # Use the geographical distance
-              'metric_params': {'units': 'km'},# Arguments given to the
-                                                #   distance function. Here,
-                                                #   the unit to use
-              'weighting': 1
-             }
-
-    processings = {1: tr_name, 2: tr_geo}
-
-    print "Alignment started"
-    align(alignset,           # The dataset to align
-          targetset,          # The target dataset
-          0.4,                # The maximal distance
-                              #   threshold
-          processings,         # The list of processings to
-                              #   apply.
-          dpath('demo1_results'))
-                              # Filename of the output
-                              #   result file
-    # the ``align()`` function returns two items
-    # 0. the computed distance matrix
-    # 1. a boolean, True if at least one alignment has been done, False
-    #    otherwise
-    print "Done, see the results in %s" % dpath('demo1_results')
-
-def demo_2():
-    targetset = parsefile(dpath('FR.txt'), indexes=[0, 1, (4, 5)],
-                          formatopt={1:lambda x:x.decode('utf-8')})
-    alignset = parsefile(dpath('frenchbnf'), indexes=[0, 2, (14, 12)],
-                         formatopt={2:lambda x:x.decode('utf-8')}, nbmax=30000)
-
-    print "Finding neighbours"
-    neighbours = findneighbours(alignset, targetset, indexes=(2, 2),
-                               mode='minibatch')
-
-    # Let's define the processings to apply on the location's name
-    tr_name = {'normalization': [aln.simplify], # Simplify all the names (remove
-                                              #   punctuation, lower case, etc.)
-               'metric': ald.levenshtein,     # Use the levenshtein distance
-               'weighting': 1                 # Use 1 as the name-distance matrix
-                                              #   weighting coefficient
-              }
-
-    processings = {1: tr_name}
-    print "Start computation"
-    for ind, (alignid, targetid) in enumerate(neighbours):
-        print '%3d' % ind, len(alignid), 'x', len(targetid)
-        _, matched = subalign(alignset,   # The dataset to align
-                              targetset,  # The target dataset
-                              alignid,
-                              targetid,
-                              0.3,
-                              processings)
-        write_results(matched, alignset, targetset, dpath('demo2_results'))
-    print "Done, see the results in %s" % dpath('demo2_results')
-
-def demo_3():
-    print "Parsing files"
-    alignset = parserql(host='http://demo.cubicweb.org/elections/',
-                        rql='Any E, N WHERE X is Commune, X eid E, X label N')
-    targetset = parsefile(dpath('FR.txt'), indexes=[0, 1])
-    print '%s×%s' % (len(alignset), len(targetset))
-
-    tr_name = {'normalization': [aln.simplify],
-               'metric': 'levenshtein'
-              }
-
-    print "Alignment started"
-    results = alignall(alignset, targetset, 0.75, processings={1: tr_name},
-                       indexes=(1,1), mode='minhashing', kwordsgram=1, siglen=200,
-                       uniq=True)
-    dicresults = dict([(a, b) for (a, b) in results])
-
-    print "Done, writing output"
-
-    with open(dpath('demo3_results'), 'w') as fout:
-        for line in alignset:
-            sent = u'http://demo.cubicweb.org/elections/commune/%s;'\
-                   u'http://www.geonames.org/%s\n' \
-                   % (line[0], dicresults.get(line[0], 'not_found'))
-            fout.write(sent.encode('utf-8'))
-
-    print "See the results in %s" % dpath('demo3_results')
-
-if __name__ == '__main__':
-    import sys
-    from time import time
-    runall = (len(sys.argv) == 1)
-
-    t = time()
-    if runall or '0' in sys.argv:
-        print "Running demo_0"
-        demo_0()
-
-    if runall or '1' in sys.argv:
-        print "Running demo_1"
-        demo_1()
-
-    if runall or '2' in sys.argv:
-        print "Running demo_2"
-        ## Same as demo_1, but in a more efficient way, using a method to find
-        ## neighbours
-        demo_2()
-
-    if runall or '3' in sys.argv:
-        print "Running demo_3"
-        demo_3()
-
-    print "Demo terminated"
-    print "Took %d min" % ((time() - t)/60.)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/examples/demo.py	Thu Dec 19 14:45:56 2013 +0000
@@ -0,0 +1,199 @@
+#!/usr/bin/python
+#-*- coding:utf-8 -*-
+
+from os import path
+
+import urllib
+
+import nazca.distances as ald
+import nazca.normalize as aln
+from nazca.aligner import align, subalign, findneighbours, alignall
+from nazca.dataio import parsefile, sparqlquery, write_results
+
+DEMODIR = path.dirname(__file__)
+
+def dpath(filename):
+    return path.join(DEMODIR, 'demo', filename)
+
+def remove_after(string, sub):
+    try:
+        return string[:string.lower().index(sub)].strip()
+    except ValueError:
+        return string
+
+def parserql(host, rql):
+    filehandle = urllib.urlopen('%(host)sview?'
+                                'rql=%(rql)s&vid=csvexport'
+                                % {'rql': rql, 'host': host})
+    filehandle.readline()
+    rset = [[e.decode('utf-8') for e in line.strip().split(';')]
+            for line in filehandle]
+    return rset
+
+def demo_0():
+    # prixgoncourt is the list of Goncourt Prize winners, extracted
+    # from Wikipedia
+
+    # We try to align Goncourt winners onto DBpedia results
+
+    query = """
+       SELECT ?writer, ?name WHERE {
+          ?writer  <http://purl.org/dc/terms/subject> <http://dbpedia.org/resource/Category:French_novelists>.
+          ?writer rdfs:label ?name.
+          FILTER(lang(?name) = 'fr')
+       }
+    """
+
+    print "Sending query to dbpedia"
+    targetset = sparqlquery('http://dbpedia.org/sparql', query)
+    print "Reading the prixgoncourt file"
+    alignset = parsefile(dpath('prixgoncourt'), indexes=[1, 1])
+
+    tr_name = {'normalization': [lambda x:remove_after(x, '('),
+                                 aln.simplify],
+               'metric': ald.levenshtein
+              }
+
+    processings = {1: tr_name}
+
+    print "Alignment started"
+    align(alignset, targetset, 0.4, processings,
+          dpath('demo0_results'))
+
+    print "Done, see the resuls in %s" % dpath('demo0_results')
+
+def demo_1():
+    # FR.txt is an extract of Geonames, where locations have been sorted by name
+    # frenchbnf is an extract of the French BNF's locations, also sorted by name
+
+    # For each line (i.e. location) we keep the identifier, the name and the
+    # position (longitude, latitude)
+    # ``nbmax`` is the number of locations to load
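+    # e.g. indexes=[0, 1, (4, 5)] keeps column 0 (the identifier), column 1
+    # (the name) and the column pair (4, 5) as a single (longitude, latitude) item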
+
+    print "Parsing the input files"
+    targetset = parsefile(dpath('FR.txt'), indexes=[0, 1, (4, 5)],
+                          nbmax=2000)
+    alignset = parsefile(dpath('frenchbnf'),
+                         indexes=[0, 2, (14, 12)], nbmax=1000)
+
+
+    # Let's define the processings to apply on the location's name
+    tr_name = {'normalization': [aln.simplify], # Simplify all the names (remove
+                                              #   punctuation, lower case, etc.)
+               'metric': ald.levenshtein,       # Use the levenshtein distance
+               'weighting': 1                 # Use 1 as the name-distance matrix
+                                              #   weighting coefficient
+              }
+    tr_geo = {'normalization': [],              # No normalization needed
+              'metric': ald.geographical,         # Use the geographical distance
+              'metric_params': {'units': 'km'},# Arguments given to the
+                                                #   distance function. Here,
+                                                #   the unit to use
+              'weighting': 1
+             }
+
+    processings = {1: tr_name, 2: tr_geo}
+
+    print "Alignment started"
+    align(alignset,           # The dataset to align
+          targetset,          # The target dataset
+          0.4,                # The maximal distance
+                              #   threshold
+          processings,         # The list of processings to
+                              #   apply.
+          dpath('demo1_results'))
+                              # Filename of the output
+                              #   result file
+    # the ``align()`` function returns two items
+    # 0. the computed distance matrix
+    # 1. a boolean, True if at least one alignment has been done, False
+    #    otherwise
+    print "Done, see the results in %s" % dpath('demo1_results')
+
+def demo_2():
+    targetset = parsefile(dpath('FR.txt'), indexes=[0, 1, (4, 5)],
+                          formatopt={1:lambda x:x.decode('utf-8')})
+    alignset = parsefile(dpath('frenchbnf'), indexes=[0, 2, (14, 12)],
+                         formatopt={2:lambda x:x.decode('utf-8')}, nbmax=30000)
+
+    print "Finding neighbours"
+    neighbours = findneighbours(alignset, targetset, indexes=(2, 2),
+                               mode='minibatch')
+
+    # Let's define the processings to apply on the location's name
+    tr_name = {'normalization': [aln.simplify], # Simplify all the names (remove
+                                              #   punctuation, lower case, etc.)
+               'metric': ald.levenshtein,     # Use the levenshtein distance
+               'weighting': 1                 # Use 1 as the name-distance matrix
+                                              #   weighting coefficient
+              }
+
+    processings = {1: tr_name}
+    print "Start computation"
+    for ind, (alignid, targetid) in enumerate(neighbours):
+        print '%3d' % ind, len(alignid), 'x', len(targetid)
+        _, matched = subalign(alignset,   # The dataset to align
+                              targetset,  # The target dataset
+                              alignid,
+                              targetid,
+                              0.3,
+                              processings)
+        write_results(matched, alignset, targetset, dpath('demo2_results'))
+    print "Done, see the results in %s" % dpath('demo2_results')
+
+def demo_3():
+    print "Parsing files"
+    alignset = parserql(host='http://demo.cubicweb.org/elections/',
+                        rql='Any E, N WHERE X is Commune, X eid E, X label N')
+    targetset = parsefile(dpath('FR.txt'), indexes=[0, 1])
+    print '%s×%s' % (len(alignset), len(targetset))
+
+    tr_name = {'normalization': [aln.simplify],
+               'metric': 'levenshtein'
+              }
+
+    print "Alignment started"
+    results = alignall(alignset, targetset, 0.75, processings={1: tr_name},
+                       indexes=(1,1), mode='minhashing', kwordsgram=1, siglen=200,
+                       uniq=True)
+    dicresults = dict([(a, b) for (a, b) in results])
+
+    print "Done, writing output"
+
+    with open(dpath('demo3_results'), 'w') as fout:
+        for line in alignset:
+            sent = u'http://demo.cubicweb.org/elections/commune/%s;'\
+                   u'http://www.geonames.org/%s\n' \
+                   % (line[0], dicresults.get(line[0], 'not_found'))
+            fout.write(sent.encode('utf-8'))
+
+    print "See the results in %s" % dpath('demo3_results')
+
+if __name__ == '__main__':
+    import sys
+    from time import time
+    runall = (len(sys.argv) == 1)
+
+    t = time()
+    if runall or '0' in sys.argv:
+        print "Running demo_0"
+        demo_0()
+
+    if runall or '1' in sys.argv:
+        print "Running demo_1"
+        demo_1()
+
+    if runall or '2' in sys.argv:
+        print "Running demo_2"
+        ## Same as demo_1, but in a more efficient way, using a method to find
+        ## neighbours
+        demo_2()
+
+    if runall or '3' in sys.argv:
+        print "Running demo_3"
+        demo_3()
+
+    print "Demo terminated"
+    print "Took %d min" % ((time() - t)/60.)
--- a/named_entities/__init__.py	Thu Dec 19 14:45:43 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,83 +0,0 @@
-# -*- coding: utf-8 -*-
-""" Process/Core functions for Named Entities Recognition.
-"""
-from nazca.utils.tokenizer import RichStringTokenizer
-
-
-###############################################################################
-### NER PROCESS ###############################################################
-###############################################################################
-class NerProcess(object):
-    """ High-level process for Named Entities Recognition
-    """
-
-    def __init__(self, ner_sources, preprocessors=None, filters=None, unique=False):
-        """ Initialise the class.
-
-        :ner_sources: a list of NER sources to query for each token
-        :preprocessors: an optional list of preprocessors applied to each token
-        :filters: an optional list of filters applied to the recognized entities
-        :unique: if True, only keep the first URI found for each token
-        """
-        self.ner_sources = list(ner_sources)
-        self.preprocessors = preprocessors or []
-        self.filters = filters or []
-        self.unique = unique
-
-    def add_ner_source(self, process):
-        """ Add a ner process
-        """
-        self.ner_sources.append(process)
-
-    def add_preprocessors(self, preprocessor):
-        """ Add a preprocessor
-        """
-        self.preprocessors.append(preprocessor)
-
-    def add_filters(self, filter):
-        """ Add a filter
-        """
-        self.filters.append(filter)
-
-    def process_text(self, text):
-        """ High level function for analyzing a text
-        """
-        tokenizer = RichStringTokenizer(text)
-        return self.recognize_tokens(tokenizer)
-
-    def recognize_tokens(self, tokens):
-        """ Recognize Named Entities from a tokenizer or
-        an iterator yielding tokens.
-        """
-        last_stop = 0
-        named_entities = []
-        for token in tokens:
-            if token.start < last_stop:
-                continue # this token overlaps with a previous match
-            word = token.word
-            # Applies preprocessors
-            # XXX Preprocessors may be source dependent
-            for preprocessor in self.preprocessors:
-                token = preprocessor(token)
-                if not token:
-                    break
-            if not token:
-                continue
-            recognized = False
-            for process in self.ner_sources:
-                for uri in process.recognize_token(token):
-                    named_entities.append((uri, process.name, token))
-                    recognized = True
-                    last_stop = token.end
-                    if self.unique:
-                        break
-                if recognized and self.unique:
-                    break
-        # XXX Postprocessing/filters may be source dependent
-        return self.postprocess(named_entities)
-
-    def postprocess(self, named_entities):
-        """ Postprocess the results by applying filters """
-        for filter in self.filters:
-            named_entities = filter(named_entities)
-        return named_entities
--- a/named_entities/filters.py	Thu Dec 19 14:45:43 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,103 +0,0 @@
-# -*- coding: utf-8 -*-
-""" Filters for Named Entities Recognition.
-"""
-from nazca.utils.dataio import sparqlquery
-
-
-###############################################################################
-### NER FILTERS ###############################################################
-###############################################################################
-class AbstractNerFilter(object):
-    """ A filter used for cleaning named entities results
-    """
-
-    def __call__(self, named_entities):
-        raise NotImplementedError
-
-
-class NerOccurenceFilter(object):
-    """ A filter based on the number of occurence of
-    named entities in the results.
-    """
-    def __init__(self, min_occ=None, max_occ=None):
-        self.min_occ = min_occ
-        self.max_occ = max_occ
-
-    def __call__(self, named_entities):
-        uris = [u for u, p, t in named_entities]
-        counts = dict([(u, uris.count(u)) for u in set(uris)])
-        return [n for n in named_entities if not ((self.min_occ and counts[n[0]]<self.min_occ)
-                                              or (self.max_occ and counts[n[0]]>self.max_occ))]
-
-
-class NerRDFTypeFilter(object):
-    """ A filter based on the RDF type on entity
-    E.g.
-
-    filter = NerRDFTypeFilter('http://dbpedia.org/sparql',
-                                ('http://schema.org/Place',
-                                'http://dbpedia.org/ontology/Agent',
-                                'http://dbpedia.org/ontology/Place'))
-
-    """
-    def __init__(self, endpoint, accepted_types):
-        self.endpoint = endpoint
-        self.accepted_types = accepted_types
-        self.query = 'SELECT ?type WHERE{<%(uri)s> rdf:type ?type}'
-
-    def __call__(self, named_entities):
-        filtered_named_entities = []
-        seen_uris = {}
-        for uri, p, t in named_entities:
-            if uri in seen_uris:
-                if seen_uris[uri]:
-                    filtered_named_entities.append((uri, p, t))
-            else:
-                results = sparqlquery(self.endpoint, self.query % {'uri': uri})
-                types = set([r['type']['value'] for r in results])
-                if not len(types.intersection(self.accepted_types)):
-                    seen_uris[uri] = False
-                else:
-                    seen_uris[uri] = True
-                    filtered_named_entities.append((uri, p, t))
-        return filtered_named_entities
-
-
-class NerDisambiguationWordParts(object):
-    """ Disambiguate named entities based on the words parts.
-    E.g.:
-          'toto tutu': 'http://example.com/toto_tutu',
-          'toto': 'http://example.com/toto'
-
-          Then if 'toto' is found in the text, replace the URI 'http://example.com/toto'
-          by 'http://example.com/toto_tutu'
-    """
-    def __call__(self, named_entities):
-        # Create the parts dictionary
-        parts = {}
-        for uri, peid, token in named_entities:
-            if ' ' in token.word:
-                for part in token.word.split(' '):
-                    parts[part.lower()] = uri
-        # Replace named entities
-        filtered_named_entities = []
-        for uri, peid, token in named_entities:
-            if token.word.lower() in parts:
-                # Change URI (parts keys are lower-cased)
-                uri = parts[token.word.lower()]
-            filtered_named_entities.append((uri, peid, token))
-        return filtered_named_entities
-
-
-class NerReplacementRulesFilter(object):
-    """ Allow to define replacement rules for Named Entities
-    """
-    def __init__(self, rules):
-        self.rules = rules
-
-    def __call__(self, named_entities):
-        filtered_named_entities = []
-        for uri, peid, token in named_entities:
-            uri = self.rules.get(uri, uri)
-            filtered_named_entities.append((uri, peid, token))
-        return filtered_named_entities
--- a/named_entities/preprocessors.py	Thu Dec 19 14:45:43 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,83 +0,0 @@
-# -*- coding: utf-8 -*-
-""" Preprocessors for Named Entities Recognition.
-"""
-from nazca.utils.tokenizer import Token
-from nazca.reference_data.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
-
-STOPWORDS = {'fr': FRENCH_STOPWORDS,
-             'en': ENGLISH_STOPWORDS}
-
-
-###############################################################################
-### NER PREPROCESSORS #########################################################
-###############################################################################
-class AbstractNerPreprocessor(object):
-    """ Preprocessor
-    """
-
-    def __call__(self, token):
-        raise NotImplementedError
-
-
-class NerWordSizeFilterPreprocessor(AbstractNerPreprocessor):
-    """ Remove token based on the size of the word
-    """
-    def __init__(self, min_size=None, max_size=None):
-        self.min_size = min_size
-        self.max_size = max_size
-
-    def __call__(self, token):
-        if ((self.min_size and len(token.word)<self.min_size)
-            or (self.max_size and len(token.word)>self.max_size)):
-            return None
-        return token
-
-
-class NerLowerCaseFilterPreprocessor(AbstractNerPreprocessor):
-    """ Remove token with word in lower case
-    """
-
-    def __call__(self, token):
-        return None if token.word.islower() else token
-
-
-class NerLowerFirstWordPreprocessor(AbstractNerPreprocessor):
-    """ Lower the first word of each sentence if it is a stopword.
-    """
-    def __init__(self, lang='en'):
-        self.lang = lang
-
-    def __call__(self, token):
-        if (token.start == token.sentence.start and
-            token.word.split()[0].lower() in STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)):
-            word = token.word[0].lower() + token.word[1:]
-            return Token(word, token.start, token.end, token.sentence)
-        return token
-
-
-class NerStopwordsFilterPreprocessor(AbstractNerPreprocessor):
-    """ Remove stopwords
-    """
-    def __init__(self, split_words=False, lang='en'):
-        self.split_words = split_words
-        self.lang = lang
-
-    def __call__(self, token):
-        stopwords = STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)
-        if self.split_words and not [w for w in token.word.split() if w.lower() not in stopwords]:
-            return None
-        if not self.split_words and token.word.lower() in stopwords:
-            return None
-        return token
-
-
-class NerHashTagPreprocessor(AbstractNerPreprocessor):
-    """ Cleanup hashtag
-    """
-    def __call__(self, token):
-        if token.word.startswith('@'):
-            # XXX Split on capitalized letters?
-            # @BarackObama -> Barack Obama
-            word = token.word[1:].replace('_', ' ')
-            return Token(word, token.start, token.end, token.sentence)
-        return token
--- a/named_entities/sources.py	Thu Dec 19 14:45:43 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,124 +0,0 @@
-# -*- coding: utf-8 -*-
-""" Sources for Named Entities Recognition.
-"""
-from nazca.utils.tokenizer import Token
-from nazca.utils.dataio import sparqlquery, rqlquery
-
-
-###############################################################################
-### NER SOURCE ################################################################
-###############################################################################
-class AbstractNerSource(object):
-    """ High-level source for Named Entities Recognition
-    """
-
-    def __init__(self, endpoint, query, name=None, use_cache=True, preprocessors=None):
-        """ Initialise the class.
-        """
-        self.endpoint = endpoint
-        self.query = query
-        self.name = name
-        self.preprocessors = preprocessors or []
-        self.use_cache = use_cache
-        self._recognized_cache = {}
-
-    def add_preprocessors(self, preprocessor):
-        """ Add a preprocessor
-        """
-        self.preprocessors.append(preprocessor)
-
-    def recognize_token(self, token):
-        """ Recognize a token
-        """
-        # Applies source specific preprocessors
-        for preprocessor in self.preprocessors:
-            token = preprocessor(token)
-            if not token:
-                return []
-        if self.use_cache and token.word in self._recognized_cache:
-            return self._recognized_cache[token.word]
-        uris = self.query_word(token.word) if token.word else []
-        if self.use_cache:
-            self._recognized_cache[token.word] = uris
-        return uris
-
-    def query_word(self, word):
-        """ Query a word for a Named Entities Recognition process
-        """
-        raise NotImplementedError
-
-
-class NerSourceLexicon(AbstractNerSource):
-    """ Source based on a (pre-computed) dictionnary of words (token, uri)
-    """
-    def __init__(self, lexicon, name=None, use_cache=True, preprocessors=None):
-        self.lexicon = lexicon
-        self.name = name
-        self.preprocessors = preprocessors or []
-        self.use_cache = use_cache
-        self._recognized_cache = {}
-
-    def query_word(self, word):
-        uri = self.lexicon.get(word)
-        return [uri,] if uri else []
-
-
-class NerSourceLocalRql(AbstractNerSource):
-    """ High-level source for Named Entities Recognition
-    Local RQL version
-    """
-
-    def __init__(self, session, query, name=None, use_cache=True, preprocessors=None):
-        """ Initialise the class.
-        """
-        self.query = query
-        self.session = session
-        self.name = name
-        self.preprocessors = preprocessors or []
-        self.use_cache = use_cache
-        self._recognized_cache = {}
-
-    def query_word(self, word):
-        """ Query a word for a Named Entities Recognition process
-        """
-        return [r[0] for r in self.session.execute(self.query, dict(word=word))]
-
-
-class NerSourceRql(AbstractNerSource):
-    """ High-level source for Named Entities Recognition
-    URL version (remote source)
-    """
-
-    def query_word(self, word):
-        """ Query a word for a Named Entities Recognition process
-        """
-        if self.endpoint.startswith('http://'):
-            # url
-            return [r[0] for r in rqlquery(self.endpoint, self.query % {'word': word})]
-        else:
-            return [r[0] for r in rqlquery(self.endpoint, self.query, word=word)]
-
-
-class NerSourceSparql(AbstractNerSource):
-    """ High-level source for Named Entities Recognition
-    SPARQL version
-
-   >>> from nazca.named_entities.sources import NerSourceSparql
-   >>> ner_source = NerSourceSparql('http://dbpedia.org/sparql',
-   ...                              '''SELECT ?uri
-   ...                                 WHERE{
-   ...                                 ?uri rdfs:label "%(word)s"@en}''')
-   >>> print ner_source.query_word('Victor Hugo')
-   ['http://dbpedia.org/resource/Category:Victor_Hugo',
-    'http://dbpedia.org/resource/Victor_Hugo',
-    'http://dbpedia.org/class/yago/VictorHugo',
-    'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
-    'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
-    'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
-
-    """
-
-    def query_word(self, word):
-        """ Query a word for a Named Entities Recognition process
-        """
-        return [r[0] for r in sparqlquery(self.endpoint, self.query % {'word': word})]
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ner/__init__.py	Thu Dec 19 14:45:56 2013 +0000
@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+""" Process/Core functions for Named Entities Recognition.
+"""
+from nazca.utils.tokenizer import RichStringTokenizer
+
+
+###############################################################################
+### NER PROCESS ###############################################################
+###############################################################################
+class NerProcess(object):
+    """ High-level process for Named Entities Recognition
+    """
+
+    def __init__(self, ner_sources, preprocessors=None, filters=None, unique=False):
+        """ Initialise the class.
+
+        :ner_sources: a list of NER sources to query for each token
+        :preprocessors: an optional list of preprocessors applied to each token
+        :filters: an optional list of filters applied to the recognized entities
+        :unique: if True, only keep the first URI found for each token
+        """
+        self.ner_sources = list(ner_sources)
+        self.preprocessors = preprocessors or []
+        self.filters = filters or []
+        self.unique = unique
+
+    def add_ner_source(self, process):
+        """ Add a ner process
+        """
+        self.ner_sources.append(process)
+
+    def add_preprocessors(self, preprocessor):
+        """ Add a preprocessor
+        """
+        self.preprocessors.append(preprocessor)
+
+    def add_filters(self, ner_filter):
+        """ Add a filter
+        """
+        self.filters.append(ner_filter)
+
+    def process_text(self, text):
+        """ High level function for analyzing a text
+        """
+        tokenizer = RichStringTokenizer(text)
+        return self.recognize_tokens(tokenizer)
+
+    def recognize_tokens(self, tokens):
+        """ Recognize Named Entities from a tokenizer or
+        an iterator yielding tokens.
+        """
+        last_stop = 0
+        named_entities = []
+        for token in tokens:
+            if token.start < last_stop:
+                continue # this token overlaps with a previous match
+            word = token.word
+            # Apply preprocessors
+            # XXX Preprocessors may be source dependent
+            for preprocessor in self.preprocessors:
+                token = preprocessor(token)
+                if not token:
+                    break
+            if not token:
+                continue
+            recognized = False
+            for process in self.ner_sources:
+                for uri in process.recognize_token(token):
+                    named_entities.append((uri, process.name, token))
+                    recognized = True
+                    last_stop = token.end
+                    if self.unique:
+                        break
+                if recognized and self.unique:
+                    break
+        # XXX Postprocessing/filters may be source dependent
+        return self.postprocess(named_entities)
+
+    def postprocess(self, named_entities):
+        """ Postprocess the results by applying filters """
+        for ner_filter in self.filters:
+            named_entities = ner_filter(named_entities)
+        return named_entities
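+
+
+# A minimal usage sketch (the lexicon mapping and sample text below are
+# hypothetical; NerSourceLexicon lives in nazca.ner.sources):
+#
+#     from nazca.ner.sources import NerSourceLexicon
+#     lexicon = {'Victor Hugo': 'http://example.org/victor_hugo'}
+#     process = NerProcess([NerSourceLexicon(lexicon, name='lexicon')])
+#     for uri, source_name, token in process.process_text(u'Victor Hugo was born in 1802'):
+#         print uri, source_name, token.word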
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ner/filters.py	Thu Dec 19 14:45:56 2013 +0000
@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+""" Filters for Named Entities Recognition.
+"""
+from nazca.utils.dataio import sparqlquery
+
+
+###############################################################################
+### NER FILTERS ###############################################################
+###############################################################################
+class AbstractNerFilter(object):
+    """ A filter used for cleaning named entities results
+    """
+
+    def __call__(self, named_entities):
+        raise NotImplementedError
+
+
+class NerOccurenceFilter(object):
+    """ A filter based on the number of occurence of
+    named entities in the results.
+    """
+    def __init__(self, min_occ=None, max_occ=None):
+        self.min_occ = min_occ
+        self.max_occ = max_occ
+
+    def __call__(self, named_entities):
+        uris = [u for u, p, t in named_entities]
+        counts = dict([(u, uris.count(u)) for u in set(uris)])
+        return [n for n in named_entities if not ((self.min_occ and counts[n[0]]<self.min_occ)
+                                              or (self.max_occ and counts[n[0]]>self.max_occ))]
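+
+# Sketch: with min_occ=2, URIs recognized only once in the results are
+# dropped; with max_occ set, overly frequent URIs are dropped as well.
+#
+#     named_entities = NerOccurenceFilter(min_occ=2)(named_entities)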
+
+
+class NerRDFTypeFilter(object):
+    """ A filter based on the RDF type on entity
+    E.g.
+
+    filter = NerRDFTypeFilter('http://dbpedia.org/sparql',
+                                ('http://schema.org/Place',
+                                'http://dbpedia.org/ontology/Agent',
+                                'http://dbpedia.org/ontology/Place'))
+
+    """
+    def __init__(self, endpoint, accepted_types):
+        self.endpoint = endpoint
+        self.accepted_types = accepted_types
+        self.query = 'SELECT ?type WHERE{<%(uri)s> rdf:type ?type}'
+
+    def __call__(self, named_entities):
+        filtered_named_entities = []
+        seen_uris = {}
+        for uri, p, t in named_entities:
+            if uri in seen_uris:
+                if seen_uris[uri]:
+                    filtered_named_entities.append((uri, p, t))
+            else:
+                results = sparqlquery(self.endpoint, self.query % {'uri': uri})
+                types = set([r['type']['value'] for r in results])
+                if not len(types.intersection(self.accepted_types)):
+                    seen_uris[uri] = False
+                else:
+                    seen_uris[uri] = True
+                    filtered_named_entities.append((uri, p, t))
+        return filtered_named_entities
+
+
+class NerDisambiguationWordParts(object):
+    """ Disambiguate named entities based on the words parts.
+    E.g.:
+          'toto tutu': 'http://example.com/toto_tutu',
+          'toto': 'http://example.com/toto'
+
+          Then if 'toto' is found in the text, replace the URI 'http://example.com/toto'
+          by 'http://example.com/toto_tutu'
+    """
+    def __call__(self, named_entities):
+        # Create parts dictionary
+        parts = {}
+        for uri, peid, token in named_entities:
+            if ' ' in token.word:
+                for part in token.word.split(' '):
+                    parts[part.lower()] = uri
+        # Replace named entities
+        filtered_named_entities = []
+        for uri, peid, token in named_entities:
+            if token.word.lower() in parts:
+                # Change URI (parts keys are lower-cased)
+                uri = parts[token.word.lower()]
+            filtered_named_entities.append((uri, peid, token))
+        return filtered_named_entities
+
+
+class NerReplacementRulesFilter(object):
+    """ Allow to define replacement rules for Named Entities
+    """
+    def __init__(self,rules):
+        self.rules = rules
+
+    def __call__(self, named_entities):
+        filtered_named_entities = []
+        for uri, peid, token in named_entities:
+            uri = self.rules.get(uri, uri)
+            filtered_named_entities.append((uri, peid, token))
+        return filtered_named_entities
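+
+
+# Sketch: rules is a plain {uri: replacement_uri} mapping; the URIs below are
+# hypothetical.
+#
+#     rules = {'http://example.com/toto': 'http://example.com/toto_tutu'}
+#     named_entities = NerReplacementRulesFilter(rules)(named_entities)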
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ner/preprocessors.py	Thu Dec 19 14:45:56 2013 +0000
@@ -0,0 +1,83 @@
+# -*- coding: utf-8 -*-
+""" Preprocessors for Named Entities Recognition.
+"""
+from nazca.utils.tokenizer import Token
+from nazca.data.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
+
+STOPWORDS = {'fr': FRENCH_STOPWORDS,
+             'en': ENGLISH_STOPWORDS}
+
+
+###############################################################################
+### NER PREPROCESSORS #########################################################
+###############################################################################
+class AbstractNerPreprocessor(object):
+    """ Preprocessor
+    """
+
+    def __call__(self, token):
+        raise NotImplementedError
+
+
+class NerWordSizeFilterPreprocessor(AbstractNerPreprocessor):
+    """ Remove token based on the size of the word
+    """
+    def __init__(self, min_size=None, max_size=None):
+        self.min_size = min_size
+        self.max_size = max_size
+
+    def __call__(self, token):
+        if ((self.min_size and len(token.word)<self.min_size)
+            or (self.max_size and len(token.word)>self.max_size)):
+            return None
+        return token
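+
+# Sketch: NerWordSizeFilterPreprocessor(min_size=2, max_size=30) discards
+# (returns None for) one-letter tokens and tokens longer than 30 characters.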
+
+
+class NerLowerCaseFilterPreprocessor(AbstractNerPreprocessor):
+    """ Remove token with word in lower case
+    """
+
+    def __call__(self, token):
+        return None if token.word.islower() else token
+
+
+class NerLowerFirstWordPreprocessor(AbstractNerPreprocessor):
+    """ Lower the first word of each sentence if it is a stopword.
+    """
+    def __init__(self, lang='en'):
+        self.lang = lang
+
+    def __call__(self, token):
+        if (token.start == token.sentence.start and
+            token.word.split()[0].lower() in STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)):
+            word = token.word[0].lower() + token.word[1:]
+            return Token(word, token.start, token.end, token.sentence)
+        return token
+
+
+class NerStopwordsFilterPreprocessor(AbstractNerPreprocessor):
+    """ Remove stopwords
+    """
+    def __init__(self, split_words=False, lang='en'):
+        self.split_words = split_words
+        self.lang = lang
+
+    def __call__(self, token):
+        stopwords = STOPWORDS.get(self.lang, ENGLISH_STOPWORDS)
+        if self.split_words and not [w for w in token.word.split() if w.lower() not in stopwords]:
+            return None
+        if not self.split_words and token.word.lower() in stopwords:
+            return None
+        return token
+
+
+class NerHashTagPreprocessor(AbstractNerPreprocessor):
+    """ Cleanup hashtag
+    """
+    def __call__(self, token):
+        if token.word.startswith('@'):
+            # XXX Also split on capital letters?
+            # @BarackObama -> Barack Obama
+            word = token.word[1:].replace('_', ' ')
+            return Token(word, token.start, token.end, token.sentence)
+        return token
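+
+
+# Sketch: a preprocessor takes a Token and returns either a (possibly
+# modified) Token or None to discard it; preprocessors can be chained on a
+# source or on the whole NER process, e.g.:
+#
+#     preprocessors = [NerStopwordsFilterPreprocessor(lang='en'),
+#                      NerLowerCaseFilterPreprocessor()]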
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ner/sources.py	Thu Dec 19 14:45:56 2013 +0000
@@ -0,0 +1,124 @@
+# -*- coding: utf-8 -*-
+""" Sources for Named Entities Recognition.
+"""
+from nazca.utils.tokenizer import Token
+from nazca.utils.dataio import sparqlquery, rqlquery
+
+
+###############################################################################
+### NER SOURCE ################################################################
+###############################################################################
+class AbstractNerSource(object):
+    """ High-level source for Named Entities Recognition
+    """
+
+    def __init__(self, endpoint, query, name=None, use_cache=True, preprocessors=None):
+        """ Initialise the class.
+        """
+        self.endpoint = endpoint
+        self.query = query
+        self.name = name
+        self.preprocessors = preprocessors or []
+        self.use_cache = use_cache
+        self._recognized_cache = {}
+
+    def add_preprocessors(self, preprocessor):
+        """ Add a preprocessor
+        """
+        self.preprocessors.append(preprocessor)
+
+    def recognize_token(self, token):
+        """ Recognize a token
+        """
+        # Apply source-specific preprocessors
+        for preprocessor in self.preprocessors:
+            token = preprocessor(token)
+            if not token:
+                return []
+        if self.use_cache and token.word in self._recognized_cache:
+            return self._recognized_cache[token.word]
+        uris = self.query_word(token.word) if token.word else []
+        if self.use_cache:
+            self._recognized_cache[token.word] = uris
+        return uris
+
+    def query_word(self, word):
+        """ Query a word for a Named Entities Recognition process
+        """
+        raise NotImplementedError
+
+
+class NerSourceLexicon(AbstractNerSource):
+    """ Source based on a (pre-computed) dictionnary of words (token, uri)
+    """
+    def __init__(self, lexicon, name=None, use_cache=True, preprocessors=None):
+        self.lexicon = lexicon
+        self.name = name
+        self.preprocessors = preprocessors or []
+        self.use_cache = use_cache
+        self._recognized_cache = {}
+
+    def query_word(self, word):
+        uri = self.lexicon.get(word)
+        return [uri] if uri else []
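+
+    # Sketch: lexicon keys are exact token words, values the URIs to return:
+    #     source = NerSourceLexicon({'Paris': 'http://example.org/paris'})
+    #     source.query_word('Paris')   # -> ['http://example.org/paris']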
+
+
+class NerSourceLocalRql(AbstractNerSource):
+    """ High-level source for Named Entities Recognition
+    Local RQL version
+    """
+
+    def __init__(self, session, query, name=None, use_cache=True, preprocessors=None):
+        """ Initialise the class.
+        """
+        self.query = query
+        self.session = session
+        self.name = name
+        self.preprocessors = preprocessors or []
+        self.use_cache = use_cache
+        self._recognized_cache = {}
+
+    def query_word(self, word):
+        """ Query a word for a Named Entities Recognition process
+        """
+        return [r[0] for r in self.session.execute(self.query, dict(word=word))]
+
+
+class NerSourceRql(AbstractNerSource):
+    """ High-level source for Named Entities Recognition
+    URL version (remote source)
+    """
+
+    def query_word(self, word):
+        """ Query a word for a Named Entities Recognition process
+        """
+        if self.endpoint.startswith('http://'):
+            # Remote endpoint, queried over HTTP
+            return [r[0] for r in rqlquery(self.endpoint, self.query % {'word': word})]
+        else:
+            return [r[0] for r in rqlquery(self.endpoint, self.query, word=word)]
+
+
+class NerSourceSparql(AbstractNerSource):
+    """ High-level source for Named Entities Recognition
+    SPARQL version
+
+   >>> from ner.core import NerSourceSparql
+   >>> ner_source = NerSourceSparql('''SELECT ?uri
+                                         WHERE{
+                                         ?uri rdfs:label "%(word)s"@en}''',
+			                 'http://dbpedia.org/sparql')
+   >>> print ner_source.recognize_token('Victor Hugo')
+		... ['http://dbpedia.org/resource/Category:Victor_Hugo',
+		     'http://dbpedia.org/resource/Victor_Hugo',
+		     'http://dbpedia.org/class/yago/VictorHugo',
+		     'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
+		     'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
+		     'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
+
+    """
+
+    def query_word(self, word):
+        """ Query a word for a Named Entities Recognition process
+        """
+        return [r[0] for r in sparqlquery(self.endpoint, self.query % {'word': word})]
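+
+
+# Sketch: the query attribute is a %-template interpolated with the token
+# word, so (whitespace aside) the query sent for 'Victor Hugo' above is:
+#
+#     SELECT ?uri WHERE{ ?uri rdfs:label "Victor Hugo"@en}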
--- a/record_linkage/aligner.py	Thu Dec 19 14:45:43 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,324 +0,0 @@
-# -*- coding:utf-8 -*-
-# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-import time
-import logging
-from collections import defaultdict
-
-from scipy import zeros
-from scipy.sparse import lil_matrix
-
-from nazca.utils.dataio import parsefile
-
-
-###############################################################################
-### UTILITY FUNCTIONS #########################################################
-###############################################################################
-def iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique=True):
-    """ Return the aligned pairs
-    """
-    if unique:
-        for refid in global_matched:
-            bestid, _ = sorted(global_matched[refid], key=lambda x:x[1])[0]
-            ref_record = refset[refid]
-            target_record = targetset[bestid]
-            distance = global_mat[refid, bestid] if global_mat is not None else None
-            yield (ref_record[0], refid), (target_record[0], bestid), distance
-    else:
-        for refid in global_matched:
-            for targetid, _ in global_matched[refid]:
-                ref_record = refset[refid]
-                target_record = targetset[targetid]
-                distance = global_mat[refid, targetid] if global_mat is not None else None
-                yield (ref_record[0], refid), (target_record[0], targetid), distance
-
-
-###############################################################################
-### BASE ALIGNER OBJECT #######################################################
-###############################################################################
-class BaseAligner(object):
-
-    def __init__(self, threshold, processings, normalize_matrix=False):
-        self.threshold = threshold
-        self.processings = processings
-        self.normalize_matrix = normalize_matrix
-        self.ref_normalizer = None
-        self.target_normalizer = None
-        self.target_normalizer = None
-        self.blocking = None
-        self.alignments_done = 0
-        self.pairs_found = 0
-        self.nb_comparisons = 0
-        self.nb_blocks = 0
-        self.refset_size = None
-        self.targetset_size = None
-        self.time = None
-        self.logger = logging.getLogger('nazca.aligner')
-
-    def register_ref_normalizer(self, normalizer):
-        """ Register normalizers to be applied
-        before alignment """
-        self.ref_normalizer = normalizer
-
-    def register_target_normalizer(self, normalizer):
-        """ Register normalizers to be applied
-        before alignment """
-        self.target_normalizer = normalizer
-
-    def register_blocking(self, blocking):
-        self.blocking = blocking
-
-    def apply_normalization(self, dataset, normalizer):
-        if normalizer:
-            return normalizer.normalize_dataset(dataset)
-        return dataset
-
-    def compute_distance_matrix(self, refset, targetset,
-                                ref_indexes, target_indexes):
-        """ Compute and return the global alignment matrix.
-        For each `processing` a `Distancematrix` is built, then all the
-        matrices are summed with their own weighting and the result is the global
-        alignment matrix, which is returned.
-        """
-        distmatrix = zeros((len(ref_indexes), len(target_indexes)), dtype='float32')
-        for processing in self.processings:
-            distmatrix += processing.cdist(refset, targetset,
-                                          ref_indexes, target_indexes)
-        return distmatrix
-
-    def threshold_matched(self, distmatrix):
-        """ Return the matched elements within a dictionnary,
-        each key being the indice from X, and the corresponding
-        values being a list of couple (indice from Y, distance)
-        """
-        match = defaultdict(list)
-        if self.normalize_matrix:
-            distmatrix /= distmatrix.max()
-        ind = (distmatrix <= self.threshold).nonzero()
-        indrow = ind[0].tolist()
-        indcol = ind[1].tolist()
-        for (i, j) in zip(indrow, indcol):
-            match[i].append((j, distmatrix[i, j]))
-        return match
-
-    def _get_match(self, refset, targetset, ref_indexes=None, target_indexes=None):
-        # Build items
-        items = []
-        ref_indexes = ref_indexes or xrange(len(refset))
-        target_indexes = target_indexes or xrange(len(targetset))
-        # Apply alignments
-        mat = self.compute_distance_matrix(refset, targetset,
-                                           ref_indexes=ref_indexes,
-                                           target_indexes=target_indexes)
-        matched = self.threshold_matched(mat)
-        # Reapply matched to global indexes
-        new_matched = {}
-        for k, values in matched.iteritems():
-            new_matched[ref_indexes[k]] = [(target_indexes[i], d) for i, d in values]
-        return mat, new_matched
-
-    def align(self, refset, targetset, get_matrix=True):
-        """ Perform the alignment on the referenceset
-        and the targetset
-        """
-        start_time = time.time()
-        refset = self.apply_normalization(refset, self.ref_normalizer)
-        targetset = self.apply_normalization(targetset, self.target_normalizer)
-        self.refset_size = len(refset)
-        self.targetset_size = len(targetset)
-        # If no blocking
-        if not self.blocking:
-            return self._get_match(refset, targetset)
-        # Blocking == conquer_and_divide
-        global_matched = {}
-        global_mat = lil_matrix((len(refset), len(targetset)))
-        self.blocking.fit(refset, targetset)
-        for refblock, targetblock in self.blocking.iter_blocks():
-            self.nb_blocks += 1
-            ref_index = [r[0] for r in refblock]
-            target_index = [r[0] for r in targetblock]
-            self.nb_comparisons += len(ref_index)*len(target_index)
-            _, matched = self._get_match(refset, targetset, ref_index, target_index)
-            for k, values in matched.iteritems():
-                subdict = global_matched.setdefault(k, set())
-                for v, d in values:
-                    subdict.add((v, d))
-                    self.alignments_done += 1
-                    if get_matrix:
-                        # XXX avoid issue in sparse matrix
-                        global_mat[k, v] = d or 10**(-10)
-        self.time = time.time() - start_time
-        return global_mat, global_matched
-
-    def get_aligned_pairs(self, refset, targetset, unique=True):
-        """ Get the pairs of aligned elements
-        """
-        global_mat, global_matched = self.align(refset, targetset, get_matrix=False)
-        for pair in iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique):
-            self.pairs_found += 1
-            yield pair
-        self.log_infos()
-
-    def align_from_files(self, reffile, targetfile,
-                         ref_indexes=None, target_indexes=None,
-                         ref_encoding=None, target_encoding=None,
-                         ref_separator='\t', target_separator='\t',
-                         get_matrix=True):
-        """ Align data from files
-
-        Parameters
-        ----------
-
-        reffile: name of the reference file
-
-        targetfile: name of the target file
-
-        ref_encoding: if given (e.g. 'utf-8' or 'latin-1'), it will
-                      be used to read the files.
-
-        target_encoding: if given (e.g. 'utf-8' or 'latin-1'), it will
-                         be used to read the files.
-
-        ref_separator: separator of the reference file
-
-        target_separator: separator of the target file
-        """
-        refset = parsefile(reffile, indexes=ref_indexes,
-                           encoding=ref_encoding, delimiter=ref_separator)
-        targetset = parsefile(targetfile, indexes=target_indexes,
-                              encoding=target_encoding, delimiter=target_separator)
-        return self.align(refset, targetset, get_matrix=get_matrix)
-
-    def get_aligned_pairs_from_files(self, reffile, targetfile,
-                         ref_indexes=None, target_indexes=None,
-                         ref_encoding=None, target_encoding=None,
-                         ref_separator='\t', target_separator='\t',
-                         unique=True):
-        """ Get the pairs of aligned elements
-        """
-        refset = parsefile(reffile, indexes=ref_indexes,
-                           encoding=ref_encoding, delimiter=ref_separator)
-        targetset = parsefile(targetfile, indexes=target_indexes,
-                              encoding=target_encoding, delimiter=target_separator)
-        global_mat, global_matched = self.align(refset, targetset, get_matrix=False)
-        for pair in iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique):
-            yield pair
-
-    def log_infos(self):
-        """ Display some info on the aligner process
-        """
-        self.logger.info('Computation time : %s' % self.time)
-        self.logger.info('Size reference set : %s' % self.refset_size)
-        self.logger.info('Size target set : %s' % self.targetset_size)
-        self.logger.info('Comparisons done : %s' % self.nb_comparisons)
-        self.logger.info('Alignments done : %s' % self.alignments_done)
-        self.logger.info('Pairs found : %s' % self.pairs_found)
-        self.logger.info('Ratio reference set/alignments done : %s'
-                         % (self.alignments_done/float(self.refset_size)))
-        self.logger.info('Ratio target set/alignments done : %s'
-                         % (self.alignments_done/float(self.targetset_size)))
-        self.logger.info('Ratio reference set/pairs found : %s'
-                         % (self.pairs_found/float(self.refset_size)))
-        self.logger.info('Ratio target set/pairs found : %s'
-                         % (self.pairs_found/float(self.targetset_size)))
-        self.logger.info('Maximum comparisons : %s'
-                         % (self.refset_size * self.targetset_size))
-        self.logger.info('Number of blocks : %s' % self.nb_blocks)
-        if self.nb_blocks:
-            self.logger.info('Ratio comparisons/block : %s'
-                             % (float(self.nb_comparisons)/self.nb_blocks))
-        self.logger.info('Blocking reduction : %s'
-                         % (self.nb_comparisons/float(self.refset_size * self.targetset_size)))
-
-
-###############################################################################
-### PIPELINE ALIGNER OBJECT ##################################################
-###############################################################################
-class PipelineAligner(object):
-    """ This pipeline will perform iterative alignments, removing each time
-    the aligned results from the previous aligner.
-    """
-
-    def __init__(self, aligners):
-        self.aligners = aligners
-        self.pairs = {}
-        self.nb_comparisons = 0
-        self.nb_blocks = 0
-        self.alignments_done = 0
-        self.pairs_found = 0
-        self.refset_size = None
-        self.targetset_size = None
-        self.time = None
-        self.logger = logging.getLogger('nazca.aligner')
-
-    def get_aligned_pairs(self, refset, targetset, unique=True):
-        """ Get the pairs of aligned elements
-        """
-        start_time = time.time()
-        ref_index = range(len(refset))
-        target_index = range(len(targetset))
-        self.refset_size = len(refset)
-        self.targetset_size = len(targetset)
-        global_matched = {}
-        global_mat = lil_matrix((len(refset), len(targetset)))
-        seen_refset = set()
-        # Iteration over aligners
-        for ind_aligner, aligner in enumerate(self.aligners):
-            # Perform alignment
-            _refset = [refset[i] for i in ref_index]
-            _targetset = [targetset[i] for i in target_index]
-            for pair in aligner.get_aligned_pairs(_refset, _targetset, unique):
-                self.pairs_found += 1
-                pair = ((pair[0][0], ref_index[pair[0][1]]),
-                        (pair[1][0], target_index[pair[1][1]]))
-                yield pair
-                seen_refset.add(pair[0][1])
-            # Store stats
-            self.nb_blocks += aligner.nb_blocks
-            self.nb_comparisons += aligner.nb_comparisons
-            # Update indexes if necessary
-            # For now, we remove all the reference set that are already matched
-            if ind_aligner < len(self.aligners) - 1:
-                # There are other aligners after this one
-                ref_index = [i for i in ref_index if i not in seen_refset]
-        self.time = time.time() - start_time
-        self.log_infos()
-
-    def log_infos(self):
-        """ Display some info on the aligner process
-        """
-        self.logger.info('Computation time : %s' % self.time)
-        self.logger.info('Size reference set : %s' % self.refset_size)
-        self.logger.info('Size target set : %s' % self.targetset_size)
-        self.logger.info('Comparisons done : %s' % self.nb_comparisons)
-        self.logger.info('Alignments done : %s' % self.alignments_done)
-        self.logger.info('Pairs found : %s' % self.pairs_found)
-        self.logger.info('Ratio reference set/alignments done : %s'
-                         % (self.alignments_done/float(self.refset_size)))
-        self.logger.info('Ratio target set/alignments done : %s'
-                         % (self.alignments_done/float(self.targetset_size)))
-        self.logger.info('Ratio reference set/pairs found : %s'
-                         % (self.pairs_found/float(self.refset_size)))
-        self.logger.info('Ratio target set/pairs found : %s'
-                         % (self.pairs_found/float(self.targetset_size)))
-        self.logger.info('Maximum comparisons : %s'
-                         % (self.refset_size * self.targetset_size))
-        self.logger.info('Number of blocks : %s' % self.nb_blocks)
-        if self.nb_blocks:
-            self.logger.info('Ratio comparisons/block : %s'
-                             % (float(self.nb_comparisons)/self.nb_blocks))
-        self.logger.info('Blocking reduction : %s'
-                         % (self.nb_comparisons/float(self.refset_size * self.targetset_size)))
--- a/record_linkage/blocking.py	Thu Dec 19 14:45:43 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,666 +0,0 @@
-# -*- coding:utf-8 -*-
-# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-
-
-""" Blocking techniques.
-
-This module implements a set of blocking techniques used to split
-datasets in smaller subsets that will be aligned in more details.
-
-Additional information:
-
-   P. Christen, Data Matching, Data-Centric Systems and Applications,
-
-
-"""
-from functools import partial
-import warnings
-
-from scipy.spatial import KDTree
-
-from nazca.utils.minhashing import Minlsh
-from nazca.utils.distances import soundexcode
-
-
-###############################################################################
-### GENERAL BLOCKING ##########################################################
-###############################################################################
-class BaseBlocking(object):
-    """ An abstract general blocking object that exposes
-    the API that should be common to all blockings object
-    """
-    def __init__(self, ref_attr_index, target_attr_index):
-        """ Build the blocking object
-
-        Parameters
-        ----------
-
-        ref_attr_index: index of the attribute of interest in a record
-                        for the reference dataset
-                        (i.e. attribute to be used for key computation)
-
-        target_attr_index: index of the attribute of interest in a record
-                           for the target dataset
-                           (i.e. attribute to be used for key computation)
-        """
-        self.ref_attr_index = ref_attr_index
-        self.target_attr_index = target_attr_index
-        self.refids = None
-        self.targetids = None
-        self.is_fitted = False
-
-    def _fit(self, refset, targetset):
-        raise NotImplementedError
-
-    def _iter_blocks(self):
-        """ Internal iteration function over blocks
-        """
-        raise NotImplementedError
-
-    def _cleanup(self):
-        """ Internal cleanup blocking for further use (e.g. in pipeline)
-        """
-        raise NotImplementedError
-
-    def fit(self, refset, targetset):
-        """ Fit the blocking technique on the reference and target datasets
-
-        Parameters
-        ----------
-        refset: a dataset (list of records)
-
-        targetset: a dataset (list of records)
-        """
-        self._fit(refset, targetset)
-        # Keep ids for blocks building
-        self.refids = [(i, r[0]) for i, r in enumerate(refset)]
-        self.targetids = [(i, r[0]) for i, r in enumerate(targetset)]
-        self.is_fitted = True
-
-    def iter_blocks(self):
-        """ Iterator over the different possible blocks.
-
-        Returns
-        -------
-
-        (block1, block2): The blocks are always (reference_block, target_block)
-                          and contains the pair (index, id) of the record in the
-                          corresponding dataset.
-        """
-        assert self.is_fitted
-        return self._iter_blocks()
-
-    def iter_indice_blocks(self):
-        """ Iterator over the different possible blocks.
-
-        Returns
-        -------
-
-        (block1, block2): The blocks are always (reference_block, target_block)
-                          and contains the indexes of the record in the
-                          corresponding dataset.
-        """
-        assert self.is_fitted
-        for block1, block2 in self._iter_blocks():
-            yield [r[0] for r in block1], [r[0] for r in block2]
-
-    def iter_id_blocks(self):
-        """ Iterator over the different possible blocks.
-
-        Returns
-        -------
-
-        (block1, block2): The blocks are always (reference_block, target_block)
-                          and contains the ids of the record in the
-                          corresponding dataset.
-        """
-        assert self.is_fitted
-        for block1, block2 in self._iter_blocks():
-            yield [r[1] for r in block1], [r[1] for r in block2]
-
-    def iter_pairs(self):
-        """ Iterator over the different possible pairs.
-
-        Returns
-        -------
-
-        (pair1, pari2): The pairs are always ((ind_reference, id_reference),
-                                              (ind_target, id_target))
-                        and are the ids of the record in the corresponding dataset.
-        """
-        assert self.is_fitted
-        for block1, block2 in self.iter_blocks():
-            for val1 in block1:
-                for val2 in block2:
-                    yield val1, val2
-
-    def iter_indice_pairs(self):
-        """ Iterator over the different possible pairs.
-
-        Returns
-        -------
-
-        (pair1, pari2): The pairs are always (ind_reference, ind_target)
-                        and are the ids of the record in the corresponding dataset.
-        """
-        assert self.is_fitted
-        for block1, block2 in self.iter_indice_blocks():
-            for val1 in block1:
-                for val2 in block2:
-                    yield val1, val2
-
-    def iter_id_pairs(self):
-        """ Iterator over the different possible pairs.
-
-        Returns
-        -------
-
-        (pair1, pari2): The pairs are always (id_reference, id_target)
-                        and are the ids of the record in the corresponding dataset.
-        """
-        assert self.is_fitted
-        for block1, block2 in self.iter_id_blocks():
-            for val1 in block1:
-                for val2 in block2:
-                    yield val1, val2
-
-    def cleanup(self):
-        """ Cleanup blocking for further use (e.g. in pipeline)
-        """
-        self.is_fitted = True
-        self._cleanup()
-
-
-###############################################################################
-### KEY BLOCKING ##############################################################
-###############################################################################
-class KeyBlocking(BaseBlocking):
-    """ This blocking technique is based on a a blocking criteria
-    (or blocking key), that will be used to divide the datasets.
-
-    The main idea here is:
-
-    1 - to create an index of f(x) for each x in the reference set.
-
-    2 - to create an index of f(y) for each y in the target set.
-
-    3 - to iterate on each distinct value of f(x) and to return
-        the identifiers of the records of the both sets for this value.
-    """
-
-    def __init__(self, ref_attr_index, target_attr_index, callback, ignore_none=False):
-        super(KeyBlocking, self).__init__(ref_attr_index, target_attr_index)
-        self.callback = callback
-        self.ignore_none = ignore_none
-        self.reference_index = {}
-        self.target_index = {}
-
-    def _fit(self, refset, targetset):
-        """ Fit a dataset in an index using the callback
-        """
-        for ind, rec in enumerate(refset):
-            key = self.callback(rec[self.ref_attr_index])
-            if not key and self.ignore_none:
-                continue
-            self.reference_index.setdefault(key, []).append((ind, rec[0]))
-        for ind, rec in enumerate(targetset):
-            key = self.callback(rec[self.target_attr_index])
-            if not key and self.ignore_none:
-                continue
-            self.target_index.setdefault(key, []).append((ind, rec[0]))
-
-    def _iter_blocks(self):
-        """ Iterator over the different possible blocks.
-
-        Returns
-        -------
-
-        (block1, block2): The blocks are always (reference_block, target_block)
-                          and containts the indexes of the record in the
-                          corresponding dataset.
-        """
-        for key, block1 in self.reference_index.iteritems():
-            block2 = self.target_index.get(key)
-            if block1 and block2:
-                yield (block1, block2)
-
-    def _cleanup(self):
-        """ Cleanup blocking for further use (e.g. in pipeline)
-        """
-        self.reference_index = {}
-        self.target_index = {}
-
-
-class SoundexBlocking(KeyBlocking):
-
-    def __init__(self, ref_attr_index, target_attr_index, language='french',):
-        super(SoundexBlocking, self).__init__(ref_attr_index, target_attr_index,
-                                              partial(soundexcode, language=language))
-
-
-###############################################################################
-### BIGRAM BLOCKING ###########################################################
-###############################################################################
-class NGramBlocking(BaseBlocking):
-    """ This blocking technique is based on a a n-gram key.
-    """
-
-    def __init__(self, ref_attr_index, target_attr_index, ngram_size=2, depth=2):
-        super(NGramBlocking, self).__init__(ref_attr_index, target_attr_index)
-        self.ngram_size = ngram_size
-        self.depth = depth
-        self.reference_index = {}
-        self.target_index = {}
-
-    def _fit_dataset(self, dataset, cur_index, attr_index):
-        """ Fit a dataset
-        """
-        for ind, r in enumerate(dataset):
-            cur_dict = cur_index
-            text = r[attr_index]
-            for i in range(self.depth):
-                ngram = text[i*self.ngram_size:(i+1)*self.ngram_size]
-                if i < self.depth - 1:
-                    cur_dict = cur_dict.setdefault(ngram, {})
-            cur_dict.setdefault(ngram, []).append((ind, r[0]))
-
-    def _fit(self, refset, targetset):
-        """ Fit the two sets (reference set and target set)
-        """
-        self._fit_dataset(refset, self.reference_index, self.ref_attr_index)
-        self._fit_dataset(targetset, self.target_index, self.target_attr_index)
-
-    def _iter_dict(self, ref_cur_dict, target_cur_dict):
-        """ Iterative function used to create blocks from dicts
-        """
-        for key, sub_dict in ref_cur_dict.iteritems():
-            if key in target_cur_dict:
-                if isinstance(sub_dict, dict):
-                    # There is another dict layer
-                    for block1, block2 in self._iter_dict(sub_dict, target_cur_dict[key]):
-                        yield block1, block2
-                else:
-                    # This is a list
-                    yield sub_dict, target_cur_dict[key]
-
-    def _iter_blocks(self):
-        """ Iterator over the different possible blocks.
-
-        Returns
-        -------
-
-        (block1, block2): The blocks are always (reference_block, target_block)
-                          and containts the indexes of the record in the
-                          corresponding dataset.
-        """
-        for block1, block2 in self._iter_dict(self.reference_index, self.target_index):
-            if block1 and block2:
-                yield block1, block2
-
-    def _cleanup(self):
-        """ Cleanup blocking for further use (e.g. in pipeline)
-        """
-        self.reference_index = {}
-        self.target_index = {}
-
-
-###############################################################################
-### SORTKEY BLOCKING ##########################################################
-###############################################################################
-class SortedNeighborhoodBlocking(BaseBlocking):
-    """ This blocking technique is based on a a sorting blocking criteria
-    (or blocking key), that will be used to divide the datasets.
-    """
-
-    def __init__(self, ref_attr_index, target_attr_index, key_func=lambda x: x, window_width=20):
-        super(SortedNeighborhoodBlocking, self).__init__(ref_attr_index, target_attr_index)
-        self.key_func = key_func
-        self.window_width = window_width
-        self.sorted_dataset = None
-
-    def _fit(self, refset, targetset):
-        """ Fit a dataset in an index using the callback
-        """
-        self.sorted_dataset = [((ind, r[0]), r[self.ref_attr_index], 0)
-                               for ind, r in enumerate(refset)]
-        self.sorted_dataset.extend([((ind, r[0]), r[self.target_attr_index], 1)
-                                    for ind, r in enumerate(targetset)])
-        self.sorted_dataset.sort(key=lambda x: self.key_func(x[1]))
-
-    def _iter_blocks(self):
-        """ Iterator over the different possible blocks.
-        """
-        for ind, (rid, record, dset) in enumerate(self.sorted_dataset):
-            # Only keep reference set record
-            if dset == 1:
-                continue
-            block1 = [rid,]
-            minind = (ind - self.window_width)
-            minind = minind if minind >=0 else 0
-            maxind = (ind + self.window_width + 1)
-            block2 = [ri for ri, re, d in self.sorted_dataset[minind:maxind]
-                      if d == 1]
-            if block1 and block2:
-                yield (block1, block2)
-
-    def _cleanup(self):
-        """ Cleanup blocking for further use (e.g. in pipeline)
-        """
-        self.sorted_dataset = None
-
-
-###############################################################################
-### MERGE BLOCKING ############################################################
-###############################################################################
-class MergeBlocking(BaseBlocking):
-    """ This blocking technique keep only one appearance of one given values,
-    and removes all the other records having this value.
-    The merge is based on a score function
-
-    E.g.
-      ('http://fr.wikipedia.org/wiki/Paris_%28Texas%29', 'Paris', 25898)
-      ('http://fr.wikipedia.org/wiki/Paris', 'Paris', 12223100)
-
-    could be (with a score function based on the population (third value):
-
-      ('http://fr.wikipedia.org/wiki/Paris', 'Paris', 12223100)
-
-    !!! WARNING !!! This is only done on ONE set (the one with a non null attr index)
-    """
-
-    def __init__(self, ref_attr_index, target_attr_index, score_func):
-        super(MergeBlocking, self).__init__(ref_attr_index, target_attr_index)
-        self.score_func = score_func
-        self.merged_dataset = None
-        self.other_dataset = None
-        if ref_attr_index is None and target_attr_index is None:
-            raise ValueError('At least one of ref_attr_index or target_attr_index '
-                             'should not be None')
-
-    def _fit(self, refset, targetset):
-        """ Fit a dataset in an index using the callback
-        """
-        if self.ref_attr_index is not None:
-            # Merge refset
-            self.merged_dataset = self._merge_dataset(refset, self.ref_attr_index)
-            self.other_dataset = [(ind, r[0]) for ind, r in enumerate(targetset)]
-        else:
-            # Merge targetset
-            self.merged_dataset = self._merge_dataset(targetset, self.target_attr_index)
-            self.other_dataset = [(ind, r[0]) for ind, r in enumerate(refset)]
-
-    def _merge_dataset(self, dataset, attr_index):
-        """ Merge a dataset
-        """
-        merged_dataset_dict = {}
-        for ind, record in enumerate(dataset):
-            score = self.score_func(record)
-            if record[attr_index] not in merged_dataset_dict:
-                # Create new entry
-                merged_dataset_dict[record[attr_index]] = (ind, record, score)
-            elif (record[attr_index] in merged_dataset_dict
-                  and merged_dataset_dict[record[attr_index]][2] < score):
-                # Change current score
-                merged_dataset_dict[record[attr_index]] = (ind, record, score)
-        return [(ind, r[0]) for ind, r, score in merged_dataset_dict.itervalues()]
-
-    def _iter_blocks(self):
-        """ Iterator over the different possible blocks.
-        """
-        if self.ref_attr_index is not None:
-            yield self.merged_dataset, self.other_dataset
-        else:
-            # self.target_attr_index is not None
-            yield self.other_dataset, self.merged_dataset
-
-    def _cleanup(self):
-        """ Cleanup blocking for further use (e.g. in pipeline)
-        """
-        self.merged_dataset = None
-        self.other_dataset = None
-
-
-###############################################################################
-### CLUSTERING-BASED BLOCKINGS ################################################
-###############################################################################
-class KmeansBlocking(BaseBlocking):
-    """ A blocking technique based on Kmeans
-    """
-
-    def __init__(self, ref_attr_index, target_attr_index, n_clusters=None):
-        super(KmeansBlocking, self).__init__(ref_attr_index, target_attr_index)
-        self.n_clusters = n_clusters
-        self.kmeans = None
-        self.predicted = None
-        from sklearn import cluster
-        self.cluster_class = cluster.KMeans
-
-    def _fit(self, refset, targetset):
-        """ Fit the reference dataset.
-        """
-        # If an element is None (missing), use instead the identity element.
-        # The identity element is defined as the 0-vector
-        idelement = tuple([0 for _ in xrange(len(refset[0][self.ref_attr_index]))])
-        # We assume here that there are at least 2 elements in the refset
-        n_clusters = self.n_clusters or (len(refset)/10 or len(refset)/2)
-        kmeans =  self.cluster_class(n_clusters=n_clusters)
-        kmeans.fit([elt[self.ref_attr_index] or idelement for elt in refset])
-        self.kmeans = kmeans
-        # Predict on targetset
-        self.predicted = self.kmeans.predict([elt[self.target_attr_index]
-                                              or idelement for elt in targetset])
-
-    def _iter_blocks(self):
-        """ Iterator over the different possible blocks.
-
-        Returns
-        -------
-
-        (block1, block2): The blocks are always (reference_block, target_block)
-                          and containts the indexes of the record in the
-                          corresponding dataset.
-        """
-        neighbours = [[[], []] for _ in xrange(self.kmeans.n_clusters)]
-        for ind, li in enumerate(self.predicted):
-            neighbours[li][1].append(self.targetids[ind])
-        for ind, li in enumerate(self.kmeans.labels_):
-            neighbours[li][0].append(self.refids[ind])
-        for block1, block2 in neighbours:
-            if len(block1) and len(block2):
-                yield block1, block2
-
-    def _cleanup(self):
-        """ Cleanup blocking for further use (e.g. in pipeline)
-        """
-        self.kmeans = None
-        self.predicted = None
-
-
-###############################################################################
-### KDTREE BLOCKINGS ##########################################################
-###############################################################################
-class KdTreeBlocking(BaseBlocking):
-    """ A blocking technique based on KdTree
-    """
-    def __init__(self, ref_attr_index, target_attr_index, threshold=0.1):
-        super(KdTreeBlocking, self).__init__(ref_attr_index, target_attr_index)
-        self.threshold = threshold
-        self.reftree = None
-        self.targettree = None
-        self.nb_elements = None
-
-    def _fit(self, refset, targetset):
-        """ Fit the blocking
-        """
-        firstelement = refset[0][self.ref_attr_index]
-        self.nb_elements = len(refset)
-        idsize = len(firstelement) if isinstance(firstelement, (tuple, list)) else 1
-        idelement = (0,) * idsize
-        # KDTree is expecting a two-dimensional array
-        if idsize == 1:
-            self.reftree  = KDTree([(elt[self.ref_attr_index],) or idelement for elt in refset])
-            self.targettree = KDTree([(elt[self.target_attr_index],) or idelement for elt in targetset])
-        else:
-            self.reftree = KDTree([elt[self.ref_attr_index] or idelement for elt in refset])
-            self.targettree = KDTree([elt[self.target_attr_index] or idelement for elt in targetset])
-
-    def _iter_blocks(self):
-        """ Iterator over the different possible blocks.
-
-        Returns
-        -------
-
-        (block1, block2): The blocks are always (reference_block, target_block)
-                          and containts the indexes of the record in the
-                          corresponding dataset.
-        """
-        extraneighbours = self.reftree.query_ball_tree(self.targettree, self.threshold)
-        neighbours = []
-        for ind in xrange(self.nb_elements):
-            if not extraneighbours[ind]:
-                continue
-            _ref = [self.refids[ind],]
-            _target = [self.targetids[v] for v in extraneighbours[ind]]
-            neighbours.append((_ref, _target))
-        for block1, block2 in neighbours:
-            if len(block1) and len(block2):
-                yield block1, block2
-
-    def _cleanup(self):
-        """ Cleanup blocking for further use (e.g. in pipeline)
-        """
-        self.reftree = None
-        self.targettree = None
-        self.nb_elements = None
-
-
-###############################################################################
-### MINHASHING BLOCKINGS ######################################################
-###############################################################################
-class MinHashingBlocking(BaseBlocking):
-    """ A blocking technique based on MinHashing
-    """
-    def __init__(self, ref_attr_index, target_attr_index,
-                 threshold=0.1, kwordsgram=1, siglen=200):
-        super(MinHashingBlocking, self).__init__(ref_attr_index, target_attr_index)
-        self.threshold = threshold
-        self.kwordsgram = kwordsgram
-        self.siglen = siglen
-        self.minhasher = Minlsh()
-        self.nb_elements = None
-
-    def _fit(self, refset, targetset):
-        """ Find the blocking using minhashing
-        """
-        # If an element is None (missing), use instead the identity element.
-        idelement = ''
-        self.minhasher.train([elt[self.ref_attr_index] or idelement for elt in refset] +
-                        [elt[self.target_attr_index] or idelement for elt in targetset],
-                        self.kwordsgram, self.siglen)
-        self.nb_elements = len(refset)
-
-    def _iter_blocks(self):
-        """ Iterator over the different possible blocks.
-
-        Returns
-        -------
-
-        (block1, block2): The blocks are always (reference_block, target_block)
-                          and containts the indexes of the record in the
-                          corresponding dataset.
-        """
-        rawneighbours = self.minhasher.predict(self.threshold)
-        neighbours = []
-        for data in rawneighbours:
-            neighbours.append([[], []])
-            for i in data:
-                if i >= self.nb_elements:
-                    neighbours[-1][1].append(self.targetids[i - self.nb_elements])
-                else:
-                    neighbours[-1][0].append(self.refids[i])
-            if len(neighbours[-1][0]) == 0 or len(neighbours[-1][1]) == 0:
-                neighbours.pop()
-        for block1, block2 in neighbours:
-            if len(block1) and len(block2):
-                yield block1, block2
-
-    def _cleanup(self):
-        """ Cleanup blocking for further use (e.g. in pipeline)
-        """
-        self.minhasher = Minlsh()
-        self.nb_elements = None
-
-
-###############################################################################
-### BLOCKING PIPELINE #########################################################
-###############################################################################
-class PipelineBlocking(BaseBlocking):
-    """ Pipeline multiple blocking techniques
-    """
-
-    def __init__(self, blockings, collect_stats=False):
-        """ Build the blocking object
-
-        Parameters
-        ----------
-
-        blockings: ordered list of blocking objects
-        """
-        self.blockings = blockings
-        self.stored_blocks = []
-        self.collect_stats = collect_stats
-        self.stats = {}
-
-    def _fit(self, refset, targetset):
-        """ Internal fit of the pipeline """
-        self._recursive_fit(refset, targetset, range(len(refset)), range(len(targetset)), 0)
-
-    def _recursive_fit(self, refset, targetset, ref_index, target_index, ind):
-        """ Recursive fit of the blockings.
-        Blocks are stored in the stored_blocks attribute.
-        """
-        if ind < len(self.blockings) - 1:
-            # There are other blockings after this one
-            blocking = self.blockings[ind]
-            blocking.cleanup()
-            blocking.fit([refset[i] for i in ref_index],
-                         [targetset[i] for i in target_index])
-            for block1, block2 in blocking.iter_indice_blocks():
-                ind_block1 = [ref_index[i] for i in block1]
-                ind_block2 = [target_index[i] for i in block2]
-                if self.collect_stats:
-                    self.stats.setdefault(ind, []).append((len(block1), len(block2)))
-                self._recursive_fit(refset, targetset, ind_block1, ind_block2, ind+1)
-        else:
-            # This is the final blocking
-            blocking = self.blockings[ind]
-            blocking.cleanup()
-            blocking.fit([refset[i] for i in ref_index],
-                         [targetset[i] for i in target_index])
-            for block1, block2 in blocking.iter_blocks():
-                ind_block1 = [(ref_index[i], _id) for i, _id in block1]
-                ind_block2 = [(target_index[i], _id) for i, _id in block2]
-                if self.collect_stats:
-                    self.stats.setdefault(ind, []).append((len(block1), len(block2)))
-                self.stored_blocks.append((ind_block1, ind_block2))
-
-    def _iter_blocks(self):
-        """ Internal iteration function over blocks
-        """
-        for block1, block2 in self.stored_blocks:
-            if block1 and block2:
-                yield block1, block2
--- a/reference_data/countries.py	Thu Dec 19 14:45:43 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,994 +0,0 @@
-
-# Countries list (ISO-3166)
-COUNTRIES = {'##': 'non renseign\xc3\xa9',
-             '..': 'non renseign\xc3\xa9',
-             'aa': 'aire g\xc3\xa9ographique ancienne',
-             'ad': 'Andorre',
-             'ae': '\xc3\x89mirats arabes unis',
-             'af': 'Afghanistan',
-             'ag': 'Antigua-et-Barbuda',
-             'ai': 'Anguilla',
-             'al': 'Albanie',
-             'am': 'Arm\xc3\xa9nie',
-             'an': 'Antilles n\xc3\xa9erlandaises',
-             'ao': 'Angola',
-             'aq': 'Antarctique',
-             'ar': 'Argentine',
-             'as': 'Samoa am\xc3\xa9ricaines',
-             'at': 'Autriche',
-             'au': 'Australie',
-             'aw': 'Aruba',
-             'ax': 'Aland (\xc3\xaeles)',
-             'az': 'Azerba\xc3\xafdjan',
-             'ba': 'Bosnie-Herz\xc3\xa9govine',
-             'bb': 'Barbade',
-             'bd': 'Bangladesh',
-             'be': 'Belgique',
-             'bf': 'Burkina',
-             'bg': 'Bulgarie',
-             'bh': 'Bahre\xc3\xafn',
-             'bi': 'Burundi',
-             'bj': 'B\xc3\xa9nin',
-             'bl': 'Saint-Barth\xc3\xa9lemy',
-             'bm': 'Bermudes',
-             'bn': 'Brun\xc3\xa9i',
-             'bo': 'Bolivie',
-             'bq': 'Bonaire, Saint-Eustache et Saba,Saba###Saint-Eustache',
-             'br': 'Br\xc3\xa9sil',
-             'bs': 'Bahamas',
-             'bt': 'Bhoutan',
-             'bv': 'Bouvet (\xc3\xaele)',
-             'bw': 'Botswana',
-             'by': 'Bi\xc3\xa9lorussie,B\xc3\xa9larus',
-             'bz': 'Belize',
-             'ca': 'Canada',
-             'cc': 'Cocos (\xc3\xaeles),Keeling (\xc3\xaeles)',
-             'cd': 'Congo (R\xc3\xa9publique d\xc3\xa9mocratique),Za\xc3\xafre',
-             'cf': 'Centrafrique,R\xc3\xa9publique centrafricaine',
-             'cg': 'Congo,Congo (R\xc3\xa9publique)',
-             'ch': 'Suisse,Conf\xc3\xa9d\xc3\xa9ration helv\xc3\xa9tique',
-             'ci': "C\xc3\xb4te d'Ivoire\n",
-             'ck': 'Cook (\xc3\xaeles)',
-             'cl': 'Chili',
-             'cm': 'Cameroun',
-             'cn': 'Chine,Chine (R\xc3\xa9publique populaire)',
-             'co': 'Colombie',
-             'cr': 'Costa Rica',
-             'cs': 'Serbie-et-Mont\xc3\xa9n\xc3\xa9gro',
-             'cu': 'Cuba',
-             'cv': 'Cap-Vert',
-             'cw': 'Cura\xc3\xa7ao',
-             'cx': 'Christmas (\xc3\xaele)',
-             'cy': 'Chypre',
-             'cz': 'R\xc3\xa9publique tch\xc3\xa8que,Tch\xc3\xa8que, R\xc3\xa9publique',
-             'dd': 'Allemagne (R\xc3\xa9publique d\xc3\xa9mocratique)',
-             'de': 'Allemagne,Allemagne (R\xc3\xa9publique f\xc3\xa9d\xc3\xa9rale)',
-             'dj': 'Djibouti',
-             'dk': 'Danemark',
-             'dm': 'Dominique',
-             'do': 'R\xc3\xa9publique dominicaine,Dominicaine, R\xc3\xa9publique',
-             'dz': 'Alg\xc3\xa9rie',
-             'ec': '\xc3\x89quateur',
-             'ee': 'Estonie',
-             'eg': '\xc3\x89gypte',
-             'eh': 'Sahara occidental',
-             'er': '\xc3\x89rythr\xc3\xa9e',
-             'es': 'Espagne',
-             'et': '\xc3\x89thiopie',
-             'fi': 'Finlande',
-             'fj': 'Fidji',
-             'fk': 'Malouines (\xc3\xaeles),Falkland (\xc3\xaeles)',
-             'fm': 'Micron\xc3\xa9sie,\xc3\x89tats f\xc3\xa9d\xc3\xa9r\xc3\xa9s de Micron\xc3\xa9sie',
-             'fo': 'F\xc3\xa9ro\xc3\xa9 (\xc3\xaeles)',
-             'fr': 'France',
-             'ga': 'Gabon',
-             'gb': 'Grande-Bretagne,Royaume-Uni',
-             'gd': 'Grenade',
-             'ge': 'G\xc3\xa9orgie',
-             'gf': 'Guyane fran\xc3\xa7aise',
-             'gg': 'Guernesey',
-             'gh': 'Ghana',
-             'gi': 'Gibraltar',
-             'gl': 'Groenland',
-             'gm': 'Gambie',
-             'gn': 'Guin\xc3\xa9e',
-             'gp': 'Guadeloupe',
-             'gq': 'Guin\xc3\xa9e \xc3\xa9quatoriale',
-             'gr': 'Gr\xc3\xa8ce',
-             'gs': 'G\xc3\xa9orgie du Sud et les \xc3\xaeles Sandwich du Sud',
-             'gt': 'Guatemala',
-             'gu': 'Guam',
-             'gw': 'Guin\xc3\xa9e-Bissau',
-             'gy': 'Guyana',
-             'hk': 'Hong Kong',
-             'hm': 'Heard (\xc3\xaele) et \xc3\xaeles McDonald',
-             'hn': 'Honduras',
-             'hr': 'Croatie',
-             'ht': 'Ha\xc3\xafti',
-             'hu': 'Hongrie',
-             'id': 'Indon\xc3\xa9sie',
-             'ie': 'Irlande',
-             'ii': 'intergouvernemental',
-             'il': 'Isra\xc3\xabl',
-             'im': '\xc3\x8ele de Man,Man, \xc3\x8ele de',
-             'in': 'Inde',
-             'io': "Territoire britannique de l'Oc\xc3\xa9an indien,Chagos (\xc3\xaeles)###Oc\xc3\xa9an indien, Territoire britannique de l'\n",
-             'iq': 'Irak',
-             'ir': 'Iran',
-             'is': 'Islande',
-             'it': 'Italie',
-             'je': 'Jersey',
-             'jm': 'Jama\xc3\xafque',
-             'jo': 'Jordanie',
-             'jp': 'Japon',
-             'ke': 'Kenya',
-             'kg': 'Kirghizistan',
-             'kh': 'Cambodge',
-             'ki': 'Kiribati',
-             'km': 'Comores',
-             'kn': 'Saint-Kitts-et-Nevis,Saint-Christophe-et-Nevis',
-             'ko': 'Kosovo',
-             'kp': 'Cor\xc3\xa9e (R\xc3\xa9publique populaire d\xc3\xa9mocratique),Cor\xc3\xa9e du Nord',
-             'kr': 'Cor\xc3\xa9e (R\xc3\xa9publique),Cor\xc3\xa9e du Sud',
-             'kw': 'Kowe\xc3\xaft',
-             'ky': 'Cayman,Ca\xc3\xafmanes, \xc3\x8eles###Ca\xc3\xafman (\xc3\xaeles)',
-             'kz': 'Kazakhstan',
-             'la': 'Laos',
-             'lb': 'Liban',
-             'lc': 'Sainte-Lucie',
-             'li': 'Liechtenstein',
-             'lk': 'Sri Lanka',
-             'lr': 'Liberia',
-             'ls': 'Lesotho',
-             'lt': 'Lituanie',
-             'lu': 'Luxembourg',
-             'lv': 'Lettonie',
-             'ly': 'Libye',
-             'ma': 'Maroc',
-             'mc': 'Monaco',
-             'md': 'Moldavie,Moldova, R\xc3\xa9publique de',
-             'me': 'Mont\xc3\xa9n\xc3\xa9gro',
-             'mf': 'Saint-Martin (partie fran\xc3\xa7aise)',
-             'mg': 'Madagascar',
-             'mh': 'Marshall (\xc3\xaeles)',
-             'mk': 'Mac\xc3\xa9doine (R\xc3\xa9publique)',
-             'ml': 'Mali',
-             'mm': 'Myanmar,Birmanie',
-             'mn': 'Mongolie',
-             'mo': 'Macao',
-             'mp': 'Mariannes du Nord (\xc3\xaeles)',
-             'mq': 'Martinique',
-             'mr': 'Mauritanie',
-             'ms': 'Montserrat',
-             'mt': 'Malte',
-             'mu': 'Maurice',
-             'mv': 'Maldives',
-             'mw': 'Malawi',
-             'mx': 'Mexique',
-             'my': 'Malaisie',
-             'mz': 'Mozambique',
-             'na': 'Namibie',
-             'nc': 'Nouvelle-Cal\xc3\xa9donie',
-             'ne': 'Niger',
-             'nf': 'Norfolk (\xc3\xaele)',
-             'ng': 'Nigeria',
-             'ni': 'Nicaragua',
-             'nl': 'Pays-Bas',
-             'no': 'Norv\xc3\xa8ge',
-             'np': 'N\xc3\xa9pal',
-             'nr': 'Nauru',
-             'nu': 'Niue',
-             'nz': 'Nouvelle-Z\xc3\xa9lande',
-             'om': 'Oman',
-             'oo': 'code non adapt\xc3\xa9',
-             'pa': 'Panama',
-             'pe': 'P\xc3\xa9rou',
-             'pf': 'Polyn\xc3\xa9sie fran\xc3\xa7aise',
-             'pg': 'Papouasie-Nouvelle-Guin\xc3\xa9e',
-             'ph': 'Philippines',
-             'pk': 'Pakistan',
-             'pl': 'Pologne',
-             'pm': 'Saint-Pierre-et-Miquelon',
-             'pn': 'Pitcairn',
-             'pr': 'Porto Rico',
-             'ps': 'Autorit\xc3\xa9 palestinienne,Palestine',
-             'pt': 'Portugal',
-             'pw': 'Palau,Palaos',
-             'py': 'Paraguay',
-             'qa': 'Qatar',
-             're': 'R\xc3\xa9union',
-             'ro': 'Roumanie',
-             'rs': 'Serbie',
-             'ru': 'Russie (F\xc3\xa9d\xc3\xa9ration),Russie',
-             'rw': 'Rwanda',
-             'sa': 'Arabie saoudite',
-             'sb': 'Salomon (\xc3\xaeles)',
-             'sc': 'Seychelles',
-             'sd': 'Soudan',
-             'se': 'Su\xc3\xa8de',
-             'sg': 'Singapour',
-             'sh': 'Sainte-H\xc3\xa9l\xc3\xa8ne,Ascension (\xc3\xaele)###Tristan da Cunha (\xc3\xaele)',
-             'si': 'Slov\xc3\xa9nie',
-             'sj': 'Svalbard et \xc3\xaele Jan Mayen',
-             'sk': 'Slovaquie',
-             'sl': 'Sierra Leone',
-             'sm': 'Saint-Marin',
-             'sn': 'S\xc3\xa9n\xc3\xa9gal',
-             'so': 'Somalie',
-             'sr': 'Suriname',
-             'ss': 'Soudan du Sud,Sud Soudan',
-             'st': 'Sao Tom\xc3\xa9-et-Principe',
-             'su': 'URSS',
-             'sv': 'El Salvador,Salvador',
-             'sx': 'Saint-Martin (partie n\xc3\xa9erlandaise),Sint Maarten',
-             'sy': 'Syrie',
-             'sz': 'Swaziland',
-             'tc': 'Turks et Ca\xc3\xafques (\xc3\xaeles)',
-             'td': 'Tchad',
-             'tf': 'Terres australes fran\xc3\xa7aises',
-             'tg': 'Togo',
-             'th': 'Tha\xc3\xaflande',
-             'tj': 'Tadjikistan',
-             'tk': 'Tokelau',
-             'tl': 'Timor oriental',
-             'tm': 'Turkm\xc3\xa9nistan',
-             'tn': 'Tunisie',
-             'to': 'Tonga',
-             'tr': 'Turquie',
-             'tt': 'Trinit\xc3\xa9-et-Tobago',
-             'tv': 'Tuvalu',
-             'tw': 'Ta\xc3\xafwan,Chine (R\xc3\xa9publique)',
-             'tz': 'Tanzanie',
-             'ua': 'Ukraine',
-             'ug': 'Ouganda',
-             'um': '\xc3\x8eles mineures \xc3\xa9loign\xc3\xa9es des \xc3\x89tats-Unis',
-             'us': '\xc3\x89tats-Unis',
-             'uy': 'Uruguay',
-             'uz': 'Ouzb\xc3\xa9kistan',
-             'va': 'Vatican,Saint-Si\xc3\xa8ge',
-             'vc': 'Saint-Vincent-et-les Grenadines',
-             've': 'Venezuela',
-             'vg': '\xc3\x8eles Vierges britanniques,Vierges (\xc3\xaeles) britanniques',
-             'vi': '\xc3\x8eles Vierges am\xc3\xa9ricaines,Vierges (\xc3\xaeles) am\xc3\xa9ricaines',
-             'vn': 'Viet Nam',
-             'vu': 'Vanuatu',
-             'wf': 'Wallis et Futuna (\xc3\xaeles)',
-             'ws': 'Samoa,Samoa occidentales',
-             'xc': 'Tch\xc3\xa9coslovaquie',
-             'xd': 'Allemagne avant 1945',
-             'xe': 'Europe,Union europ\xc3\xa9enne',
-             'xk': 'Cor\xc3\xa9e avant 1948',
-             'xn': 'Pays-Bas avant 1830,Belgique avant 1830',
-             'xx': 'inconnu',
-             'yd': 'Y\xc3\xa9men (R\xc3\xa9publique d\xc3\xa9mocratique populaire),Sud Y\xc3\xa9men',
-             'ye': 'Y\xc3\xa9men',
-             'yt': 'Mayotte',
-             'yu': 'Yougoslavie',
-             'yy': "ne s'applique pas\n",
-             'za': 'Afrique du Sud',
-             'zm': 'Zambie',
-             'zw': 'Zimbabwe',
-             'zz': 'multiple\n'}
-
-
-# REGIONS TO COUNTRIES MAPPING
-REGIONS_TO_COUNTRIES = {u'Abruzzes': u'Italie',
-                        u'Acha\xefe': u'Gr\xe8ce',
-                        u'Acre': u'Br\xe9sil',
-                        u'Afghanistan': u'Afghanistan',
-                        u'Afrique du Sud': u'Afrique du Sud',
-                        u'Aguascalientes': u'Mexique',
-                        u'Ain': u'France',
-                        u'Aisne': u'France',
-                        u'Alabama': u'\xc9tats-Unis',
-                        u'Alagoas': u'Br\xe9sil',
-                        u'Aland (\xeeles)': u'Aland (\xeeles)',
-                        u'Alaska': u'\xc9tats-Unis',
-                        u'Albanie': u'Albanie',
-                        u'Alberta': u'Canada',
-                        u'Alg\xe9rie': u'Alg\xe9rie',
-                        u'Allemagne': u'Allemagne',
-                        u'Allemagne (R\xe9publique d\xe9mocratique)': u'Allemagne (R\xe9publique d\xe9mocratique)',
-                        u'Allemagne avant 1945': u'Allemagne avant 1945',
-                        u'Allier': u'France',
-                        u'Alpes-Maritimes': u'France',
-                        u'Alpes-de-Haute-Provence': u'France',
-                        u'Alsace': u'France',
-                        u'Amapa': u'Br\xe9sil',
-                        u'Amazonas': u'Br\xe9sil',
-                        u'Andalousie': u'Espagne',
-                        u'Andorre': u'Andorre',
-                        u'Angola': u'Angola',
-                        u'Anguilla': u'Anguilla',
-                        u'Antarctique': u'Antarctique',
-                        u'Antigua-et-Barbuda': u'Antigua-et-Barbuda',
-                        u'Antilles n\xe9erlandaises': u'Antilles n\xe9erlandaises',
-                        u'Anvers': u'Belgique',
-                        u'Appenzell-Rhodes-Ext\xe9rieures': u'Suisse',
-                        u'Appenzell-Rhodes-Int\xe9rieures': u'Suisse',
-                        u'Aquitaine': u'France',
-                        u'Arabie saoudite': u'Arabie saoudite',
-                        u'Aragon': u'Espagne',
-                        u'Arcadie': u'Gr\xe8ce',
-                        u'Ardennes': u'France',
-                        u'Ard\xe8che': u'France',
-                        u'Argentine': u'Argentine',
-                        u'Argolide': u'Gr\xe8ce',
-                        u'Argovie': u'Suisse',
-                        u'Arizona': u'\xc9tats-Unis',
-                        u'Ari\xe8ge': u'France',
-                        u'Arkansas': u'\xc9tats-Unis',
-                        u'Arm\xe9nie': u'Arm\xe9nie',
-                        u'Aruba': u'Aruba',
-                        u'Asturies': u'Espagne',
-                        u'Ath\xe8nes et agglom\xe9ration': u'Gr\xe8ce',
-                        u'Attique': u'Gr\xe8ce',
-                        u'Aube': u'France',
-                        u'Aude': u'France',
-                        u'Australie': u'Australie',
-                        u'Australie-M\xe9ridionale': u'Australie',
-                        u'Australie-Occidentale': u'Australie',
-                        u'Autorit\xe9 palestinienne': u'Autorit\xe9 palestinienne',
-                        u'Autriche': u'Autriche',
-                        u'Auvergne': u'France',
-                        u'Aveyron': u'France',
-                        u'Azerba\xefdjan': u'Azerba\xefdjan',
-                        u'Bade-Wurtemberg': u'Allemagne',
-                        u'Bahamas': u'Bahamas',
-                        u'Bahia': u'Br\xe9sil',
-                        u'Bahre\xefn': u'Bahre\xefn',
-                        u'Baja California Norte': u'Mexique',
-                        u'Baja California Sur': u'Mexique',
-                        u'Bangladesh': u'Bangladesh',
-                        u'Barbade': u'Barbade',
-                        u'Bas-Rhin': u'France',
-                        u'Basilicate': u'Italie',
-                        u'Basse-Autriche': u'Autriche',
-                        u'Basse-Normandie': u'France',
-                        u'Basse-Saxe': u'Allemagne',
-                        u'Bavi\xe8re': u'Allemagne',
-                        u'Belgique': u'Belgique',
-                        u'Belize': u'Belize',
-                        u'Berlin': u'Allemagne',
-                        u'Bermudes': u'Bermudes',
-                        u'Berne': u'Suisse',
-                        u'Bhoutan': u'Bhoutan',
-                        u'Bi\xe9lorussie': u'Bi\xe9lorussie',
-                        u'Bolivie': u'Bolivie',
-                        u'Bonaire, Saint-Eustache et Saba': u'Bonaire, Saint-Eustache et Saba',
-                        u'Bosnie-Herz\xe9govine': u'Bosnie-Herz\xe9govine',
-                        u'Botswana': u'Botswana',
-                        u'Bouches-du-Rh\xf4ne': u'France',
-                        u'Bourgogne': u'France',
-                        u'Bouvet (\xeele)': u'Bouvet (\xeele)',
-                        u'Brabant': u'Belgique',
-                        u'Brabant flamand': u'Belgique',
-                        u'Brabant wallon': u'Belgique',
-                        u'Brabant-Septentrional': u'Pays-Bas',
-                        u'Brandebourg': u'Allemagne',
-                        u'Bretagne': u'France',
-                        u'Brun\xe9i': u'Brun\xe9i',
-                        u'Bruxelles': u'Belgique',
-                        u'Br\xe9sil': u'Br\xe9sil',
-                        u'Br\xeame': u'Allemagne',
-                        u'Buenos Aires': u'Argentine',
-                        u'Bulgarie': u'Bulgarie',
-                        u'Burgenland': u'Autriche',
-                        u'Burkina': u'Burkina',
-                        u'Burundi': u'Burundi',
-                        u'B\xe2le-Campagne': u'Suisse',
-                        u'B\xe2le-Ville': u'Suisse',
-                        u'B\xe9nin': u'B\xe9nin',
-                        u'B\xe9otie': u'Gr\xe8ce',
-                        u'Calabre': u'Italie',
-                        u'Californie': u'\xc9tats-Unis',
-                        u'Calvados': u'France',
-                        u'Cambodge': u'Cambodge',
-                        u'Cameroun': u'Cameroun',
-                        u'Campanie': u'Italie',
-                        u'Campeche': u'Mexique',
-                        u'Canada': u'Canada',
-                        u'Canaries': u'Espagne',
-                        u'Cantabrie': u'Espagne',
-                        u'Cantal': u'France',
-                        u'Cap-Vert': u'Cap-Vert',
-                        u'Capitale f\xe9d\xe9rale': u'Argentine',
-                        u'Carinthie': u'Autriche',
-                        u'Caroline du Nord': u'\xc9tats-Unis',
-                        u'Caroline du Sud': u'\xc9tats-Unis',
-                        u'Castille et L\xe9on': u'Espagne',
-                        u'Castille-la Manche': u'Espagne',
-                        u'Catalogne': u'Espagne',
-                        u'Catamarca': u'Argentine',
-                        u'Cayman': u'Cayman',
-                        u'Cear\xe1': u'Br\xe9sil',
-                        u'Centrafrique': u'Centrafrique',
-                        u'Centre': u'France',
-                        u'Ceuta': u'Espagne',
-                        u'Chaco': u'Argentine',
-                        u'Chalcidique': u'Gr\xe8ce',
-                        u'Champagne-Ardenne': u'France',
-                        u'Charente': u'France',
-                        u'Charente-Maritime': u'France',
-                        u'Cher': u'France',
-                        u'Chiapas': u'Mexique',
-                        u'Chihuahua': u'Mexique',
-                        u'Chili': u'Chili',
-                        u'Chine': u'Chine',
-                        u'Christmas (\xeele)': u'Christmas (\xeele)',
-                        u'Chubut': u'Argentine',
-                        u'Chypre': u'Chypre',
-                        u'Ch\xedos': u'Gr\xe8ce',
-                        u'Coahuila': u'Mexique',
-                        u'Cocos (\xeeles)': u'Cocos (\xeeles)',
-                        u'Colima': u'Mexique',
-                        u'Colombie': u'Colombie',
-                        u'Colombie britannique': u'Canada',
-                        u'Colorado': u'\xc9tats-Unis',
-                        u'Communaut\xe9 de Madrid': u'Espagne',
-                        u'Communaut\xe9 de Valence': u'Espagne',
-                        u'Comores': u'Comores',
-                        u'Congo': u'Congo',
-                        u'Congo (R\xe9publique d\xe9mocratique)': u'Congo (R\xe9publique d\xe9mocratique)',
-                        u'Connecticut': u'\xc9tats-Unis',
-                        u'Cook (\xeeles)': u'Cook (\xeeles)',
-                        u'Corfou': u'Gr\xe8ce',
-                        u'Corinthie': u'Gr\xe8ce',
-                        u'Corrientes': u'Argentine',
-                        u'Corr\xe8ze': u'France',
-                        u'Corse': u'France',
-                        u'Corse-du-Sud': u'France',
-                        u'Cor\xe9e (R\xe9publique populaire d\xe9mocratique)': u'Cor\xe9e (R\xe9publique populaire d\xe9mocratique)',
-                        u'Cor\xe9e (R\xe9publique)': u'Cor\xe9e (R\xe9publique)',
-                        u'Cor\xe9e avant 1948': u'Cor\xe9e avant 1948',
-                        u'Costa Rica': u'Costa Rica',
-                        u'Creuse': u'France',
-                        u'Croatie': u'Croatie',
-                        u'Cr\xe8te': u'Gr\xe8ce',
-                        u'Cuba': u'Cuba',
-                        u'Cura\xe7ao': u'Cura\xe7ao',
-                        u'Cyclades': u'Gr\xe8ce',
-                        u'C\xe9phalonie': u'Gr\xe8ce',
-                        u'C\xf3rdoba': u'Argentine',
-                        u"C\xf4te d'Ivoire": u"C\xf4te d'Ivoire",
-                        u"C\xf4te-d'Or": u'France',
-                        u"C\xf4tes-d'Armor": u'France',
-                        u'Dakota du Nord': u'\xc9tats-Unis',
-                        u'Dakota du Sud': u'\xc9tats-Unis',
-                        u'Danemark': u'Danemark',
-                        u'Delaware': u'\xc9tats-Unis',
-                        u'Deux-S\xe8vres': u'France',
-                        u'District de Columbia': u'\xc9tats-Unis',
-                        u'District f\xe9d\xe9ral': u'Br\xe9sil',
-                        u'Djibouti': u'Djibouti',
-                        u'Dod\xe9can\xe8se': u'Gr\xe8ce',
-                        u'Dominique': u'Dominique',
-                        u'Dordogne': u'France',
-                        u'Doubs': u'France',
-                        u'Drenthe': u'Pays-Bas',
-                        u'Dr\xe1ma': u'Gr\xe8ce',
-                        u'Dr\xf4me': u'France',
-                        u'Durango': u'Mexique',
-                        u'D\xe9pendance de Ross (Nouvelle-Z\xe9lande)': u'Antarctique',
-                        u'El Salvador': u'El Salvador',
-                        u'Entre-Rios': u'Argentine',
-                        u'Espagne': u'Espagne',
-                        u'Espirito Santo': u'Br\xe9sil',
-                        u'Essonne': u'France',
-                        u'Estonie': u'Estonie',
-                        u'Estr\xe9madure': u'Espagne',
-                        u'Eub\xe9e': u'Gr\xe8ce',
-                        u'Eure': u'France',
-                        u'Eure-et-Loir': u'France',
-                        u'Eurytanie': u'Gr\xe8ce',
-                        u'Fidji': u'Fidji',
-                        u'Finist\xe8re': u'France',
-                        u'Finlande': u'Finlande',
-                        u'Flandre occidentale': u'Belgique',
-                        u'Flandre orientale': u'Belgique',
-                        u'Floride': u'\xc9tats-Unis',
-                        u'Fl\xf3rina': u'Gr\xe8ce',
-                        u'Formosa': u'Argentine',
-                        u'France': u'France',
-                        u'Franche-Comt\xe9': u'France',
-                        u'Fribourg': u'Suisse',
-                        u'Frioul-V\xe9n\xe9tie-Julienne': u'Italie',
-                        u'Frise': u'Pays-Bas',
-                        u'F\xe9ro\xe9 (\xeeles)': u'F\xe9ro\xe9 (\xeeles)',
-                        u'Gabon': u'Gabon',
-                        u'Galice': u'Espagne',
-                        u'Gambie': u'Gambie',
-                        u'Gard': u'France',
-                        u'Gen\xe8ve': u'Suisse',
-                        u'Gers': u'France',
-                        u'Ghana': u'Ghana',
-                        u'Gibraltar': u'Gibraltar',
-                        u'Gironde': u'France',
-                        u'Glaris': u'Suisse',
-                        u'Goi\xe1s': u'Br\xe9sil',
-                        u'Grande-Bretagne': u'Grande-Bretagne',
-                        u'Grenade': u'Grenade',
-                        u'Greven\xe1': u'Gr\xe8ce',
-                        u'Grisons': u'Suisse',
-                        u'Groenland': u'Groenland',
-                        u'Groningue': u'Pays-Bas',
-                        u'Gr\xe8ce': u'Gr\xe8ce',
-                        u'Gr\xe8ce centrale': u'Gr\xe8ce',
-                        u'Gr\xe8ce occidentale': u'Gr\xe8ce',
-                        u'Guadeloupe': u'Guadeloupe',
-                        u'Guam': u'Guam',
-                        u'Guanajuato': u'Mexique',
-                        u'Guatemala': u'Guatemala',
-                        u'Gueldre': u'Pays-Bas',
-                        u'Guernesey': u'Guernesey',
-                        u'Guerrero': u'Mexique',
-                        u'Guin\xe9e': u'Guin\xe9e',
-                        u'Guin\xe9e \xe9quatoriale': u'Guin\xe9e \xe9quatoriale',
-                        u'Guin\xe9e-Bissau': u'Guin\xe9e-Bissau',
-                        u'Guyana': u'Guyana',
-                        u'Guyane fran\xe7aise': u'Guyane fran\xe7aise',
-                        u'G\xe9orgie': u'\xc9tats-Unis',
-                        u'G\xe9orgie du Sud et les \xeeles Sandwich du Sud': u'G\xe9orgie du Sud et les \xeeles Sandwich du Sud',
-                        u'Hainaut': u'Belgique',
-                        u'Hambourg': u'Allemagne',
-                        u'Haut-Rhin': u'France',
-                        u'Haute-Autriche': u'Autriche',
-                        u'Haute-Corse': u'France',
-                        u'Haute-Garonne': u'France',
-                        u'Haute-Loire': u'France',
-                        u'Haute-Marne': u'France',
-                        u'Haute-Normandie': u'France',
-                        u'Haute-Savoie': u'France',
-                        u'Haute-Sa\xf4ne': u'France',
-                        u'Haute-Vienne': u'France',
-                        u'Hautes-Alpes': u'France',
-                        u'Hautes-Pyr\xe9n\xe9es': u'France',
-                        u'Hauts-de-Seine': u'France',
-                        u'Hawaii': u'\xc9tats-Unis',
-                        u'Ha\xefti': u'Ha\xefti',
-                        u'Heard (\xeele) et \xeeles McDonald': u'Heard (\xeele) et \xeeles McDonald',
-                        u'Hesse': u'Allemagne',
-                        u'Hidalgo': u'Mexique',
-                        u'Hollande-M\xe9ridionale': u'Pays-Bas',
-                        u'Hollande-Septentrionale': u'Pays-Bas',
-                        u'Honduras': u'Honduras',
-                        u'Hong Kong': u'Hong Kong',
-                        u'Hongrie': u'Hongrie',
-                        u'H\xe9rault': u'France',
-                        u'Idaho': u'\xc9tats-Unis',
-                        u'Ille-et-Vilaine': u'France',
-                        u'Illinois': u'\xc9tats-Unis',
-                        u'Inde': u'Inde',
-                        u'Indiana': u'\xc9tats-Unis',
-                        u'Indon\xe9sie': u'Indon\xe9sie',
-                        u'Indre': u'France',
-                        u'Indre-et-Loire': u'France',
-                        u'Iowa': u'\xc9tats-Unis',
-                        u'Io\xe1nnina': u'Gr\xe8ce',
-                        u'Irak': u'Irak',
-                        u'Iran': u'Iran',
-                        u'Irlande': u'Irlande',
-                        u'Ir\xe1kleion': u'Gr\xe8ce',
-                        u'Islande': u'Islande',
-                        u'Isra\xebl': u'Isra\xebl',
-                        u'Is\xe8re': u'France',
-                        u'Italie': u'Italie',
-                        u'Jalisco': u'Mexique',
-                        u'Jama\xefque': u'Jama\xefque',
-                        u'Japon': u'Japon',
-                        u'Jersey': u'Jersey',
-                        u'Jordanie': u'Jordanie',
-                        u'Jujuy': u'Argentine',
-                        u'Jura': u'France',
-                        u'Kansas': u'\xc9tats-Unis',
-                        u'Kard\xedtsa': u'Gr\xe8ce',
-                        u'Kastori\xe1': u'Gr\xe8ce',
-                        u'Kav\xe1la': u'Gr\xe8ce',
-                        u'Kazakhstan': u'Kazakhstan',
-                        u'Kentucky': u'\xc9tats-Unis',
-                        u'Kenya': u'Kenya',
-                        u'Kilk\xeds': u'Gr\xe8ce',
-                        u'Kirghizistan': u'Kirghizistan',
-                        u'Kiribati': u'Kiribati',
-                        u'Kosovo': u'Kosovo',
-                        u'Kowe\xeft': u'Kowe\xeft',
-                        u'Koz\xe1ni': u'Gr\xe8ce',
-                        u'La Can\xe9e': u'Gr\xe8ce',
-                        u'Laconie': u'Gr\xe8ce',
-                        u'Landes': u'France',
-                        u'Languedoc-Roussillon': u'France',
-                        u'Laos': u'Laos',
-                        u'Las\xedthi': u'Gr\xe8ce',
-                        u'Latium': u'Italie',
-                        u'Le Pir\xe9e': u'Gr\xe8ce',
-                        u'Lesotho': u'Lesotho',
-                        u'Lettonie': u'Lettonie',
-                        u'Leucade': u'Gr\xe8ce',
-                        u'Liban': u'Liban',
-                        u'Liberia': u'Liberia',
-                        u'Libye': u'Libye',
-                        u'Liechtenstein': u'Liechtenstein',
-                        u'Ligurie': u'Italie',
-                        u'Limbourg': u'Pays-Bas',
-                        u'Limousin': u'France',
-                        u'Lituanie': u'Lituanie',
-                        u'Li\xe8ge': u'Belgique',
-                        u'Loir-et-Cher': u'France',
-                        u'Loire': u'France',
-                        u'Loire-Atlantique': u'France',
-                        u'Loiret': u'France',
-                        u'Lombardie': u'Italie',
-                        u'Lorraine': u'France',
-                        u'Lot': u'France',
-                        u'Lot-et-Garonne': u'France',
-                        u'Louisiane': u'\xc9tats-Unis',
-                        u'Loz\xe8re': u'France',
-                        u'Lucerne': u'Suisse',
-                        u'Luxembourg': u'Belgique',
-                        u'L\xe1risa': u'Gr\xe8ce',
-                        u'L\xe9svos': u'Gr\xe8ce',
-                        u'Macao': u'Macao',
-                        u'Mac\xe9doine (R\xe9publique)': u'Mac\xe9doine (R\xe9publique)',
-                        u'Mac\xe9doine centrale': u'Gr\xe8ce',
-                        u'Mac\xe9doine occidentale': u'Gr\xe8ce',
-                        u'Mac\xe9doine orientale et Thrace': u'Gr\xe8ce',
-                        u'Madagascar': u'Madagascar',
-                        u'Magn\xe9sie': u'Gr\xe8ce',
-                        u'Maine': u'\xc9tats-Unis',
-                        u'Maine-et-Loire': u'France',
-                        u'Malaisie': u'Malaisie',
-                        u'Malawi': u'Malawi',
-                        u'Maldives': u'Maldives',
-                        u'Mali': u'Mali',
-                        u'Malouines (\xeeles)': u'Malouines (\xeeles)',
-                        u'Malte': u'Malte',
-                        u'Manche': u'France',
-                        u'Manitoba': u'Canada',
-                        u'Maranh\xe3o': u'Br\xe9sil',
-                        u'Marches': u'Italie',
-                        u'Mariannes du Nord (\xeeles)': u'Mariannes du Nord (\xeeles)',
-                        u'Marne': u'France',
-                        u'Maroc': u'Maroc',
-                        u'Marshall (\xeeles)': u'Marshall (\xeeles)',
-                        u'Martinique': u'Martinique',
-                        u'Maryland': u'\xc9tats-Unis',
-                        u'Massachusetts': u'\xc9tats-Unis',
-                        u'Mato grosso': u'Br\xe9sil',
-                        u'Mato grosso do Sul': u'Br\xe9sil',
-                        u'Maurice': u'Maurice',
-                        u'Mauritanie': u'Mauritanie',
-                        u'Mayenne': u'France',
-                        u'Mayotte': u'Mayotte',
-                        u'Mecklembourg-Pom\xe9ranie ant\xe9rieure': u'Allemagne',
-                        u'Melilla': u'Espagne',
-                        u'Mendoza': u'Argentine',
-                        u'Mess\xe9nie': u'Gr\xe8ce',
-                        u'Meurthe-et-Moselle': u'France',
-                        u'Meuse': u'France',
-                        u'Mexico': u'Mexique',
-                        u'Mexique': u'Mexique',
-                        u'Michigan': u'\xc9tats-Unis',
-                        u'Michoac\xe1n': u'Mexique',
-                        u'Micron\xe9sie': u'Micron\xe9sie',
-                        u'Midi-Pyr\xe9n\xe9es': u'France',
-                        u'Minas Gerais': u'Br\xe9sil',
-                        u'Minnesota': u'\xc9tats-Unis',
-                        u'Misiones': u'Argentine',
-                        u'Mississippi': u'\xc9tats-Unis',
-                        u'Missouri': u'\xc9tats-Unis',
-                        u'Moldavie': u'Moldavie',
-                        u'Molise': u'Italie',
-                        u'Monaco': u'Monaco',
-                        u'Mongolie': u'Mongolie',
-                        u'Montana': u'\xc9tats-Unis',
-                        u'Montserrat': u'Montserrat',
-                        u'Mont\xe9n\xe9gro': u'Mont\xe9n\xe9gro',
-                        u'Morbihan': u'France',
-                        u'Morelos': u'Mexique',
-                        u'Moselle': u'France',
-                        u'Mozambique': u'Mozambique',
-                        u'Murcie': u'Espagne',
-                        u'Myanmar': u'Myanmar',
-                        u'Namibie': u'Namibie',
-                        u'Namur': u'Belgique',
-                        u'Nauru': u'Nauru',
-                        u'Navarre': u'Espagne',
-                        u'Nayarit': u'Mexique',
-                        u'Nebraska': u'\xc9tats-Unis',
-                        u'Neuch\xe2tel': u'Suisse',
-                        u'Neuqu\xe9n': u'Argentine',
-                        u'Nevada': u'\xc9tats-Unis',
-                        u'New Hampshire': u'\xc9tats-Unis',
-                        u'New Jersey': u'\xc9tats-Unis',
-                        u'New York': u'\xc9tats-Unis',
-                        u'Nicaragua': u'Nicaragua',
-                        u'Nidwald': u'Suisse',
-                        u'Niger': u'Niger',
-                        u'Nigeria': u'Nigeria',
-                        u'Niue': u'Niue',
-                        u'Ni\xe8vre': u'France',
-                        u'Nord': u'France',
-                        u'Nord-Pas-de-Calais': u'France',
-                        u'Norfolk (\xeele)': u'Norfolk (\xeele)',
-                        u'Norv\xe8ge': u'Norv\xe8ge',
-                        u'Nouveau Mexique': u'\xc9tats-Unis',
-                        u'Nouveau-Brunswick': u'Canada',
-                        u'Nouvelle-Cal\xe9donie': u'Nouvelle-Cal\xe9donie',
-                        u'Nouvelle-Galles-du-Sud': u'Australie',
-                        u'Nouvelle-Z\xe9lande': u'Nouvelle-Z\xe9lande',
-                        u'Nouvelle-\xc9cosse': u'Canada',
-                        u'Nuevo Le\xf3n': u'Mexique',
-                        u'N\xe9pal': u'N\xe9pal',
-                        u'Oaxaca': u'Mexique',
-                        u'Obwald': u'Suisse',
-                        u'Ohio': u'\xc9tats-Unis',
-                        u'Oise': u'France',
-                        u'Oklahoma': u'\xc9tats-Unis',
-                        u'Oman': u'Oman',
-                        u'Ombrie': u'Italie',
-                        u'Ontario': u'Canada',
-                        u'Oregon': u'\xc9tats-Unis',
-                        u'Orne': u'France',
-                        u'Ouganda': u'Ouganda',
-                        u'Ouzb\xe9kistan': u'Ouzb\xe9kistan',
-                        u'Overijssell': u'Pays-Bas',
-                        u'Pakistan': u'Pakistan',
-                        u'Palau': u'Palau',
-                        u'Pampa': u'Argentine',
-                        u'Panama': u'Panama',
-                        u'Papouasie-Nouvelle-Guin\xe9e': u'Papouasie-Nouvelle-Guin\xe9e',
-                        u'Paraguay': u'Paraguay',
-                        u'Paraiba': u'Br\xe9sil',
-                        u'Param\xe1': u'Br\xe9sil',
-                        u'Paris': u'France',
-                        u'Par\xe1': u'Br\xe9sil',
-                        u'Pas-de-Calais': u'France',
-                        u'Pays Basque': u'Espagne',
-                        u'Pays-Bas': u'Pays-Bas',
-                        u'Pays-Bas avant 1830': u'Pays-Bas avant 1830',
-                        u'Pays-de-la-Loire': u'France',
-                        u'Pennsylvanie': u'\xc9tats-Unis',
-                        u'Pernambouc': u'Br\xe9sil',
-                        u'Philippines': u'Philippines',
-                        u'Phocide': u'Gr\xe8ce',
-                        u'Phtiotide': u'Gr\xe8ce',
-                        u'Piau\xed': u'Br\xe9sil',
-                        u'Picardie': u'France',
-                        u'Pitcairn': u'Pitcairn',
-                        u'Pi\xe9mont': u'Italie',
-                        u'Pi\xe9rie': u'Gr\xe8ce',
-                        u'Poitou-Charentes': u'France',
-                        u'Pologne': u'Pologne',
-                        u'Polyn\xe9sie fran\xe7aise': u'Polyn\xe9sie fran\xe7aise',
-                        u'Porto Rico': u'Porto Rico',
-                        u'Portugal': u'Portugal',
-                        u'Pouilles': u'Italie',
-                        u"Provence-Alpes-C\xf4te d'Azur": u'France',
-                        u'Pr\xe9veza': u'Gr\xe8ce',
-                        u'Puebla': u'Mexique',
-                        u'Puy-de-D\xf4me': u'France',
-                        u'Pyr\xe9n\xe9es-Atlantiques': u'France',
-                        u'Pyr\xe9n\xe9es-Orientales': u'France',
-                        u'P\xe9lla': u'Gr\xe8ce',
-                        u'P\xe9loponn\xe8se': u'Gr\xe8ce',
-                        u'P\xe9rou': u'P\xe9rou',
-                        u'Qatar': u'Qatar',
-                        u'Queensland': u'Australie',
-                        u'Quer\xe9taro': u'Mexique',
-                        u'Quintana Roo': u'Mexique',
-                        u'Qu\xe9bec': u'Canada',
-                        u'Rhode Island': u'\xc9tats-Unis',
-                        u'Rhodope': u'Gr\xe8ce',
-                        u'Rh\xe9nanie-Palatinat': u'Allemagne',
-                        u'Rh\xe9nanie-du-Nord-Westphalie': u'Allemagne',
-                        u'Rh\xf4ne': u'France',
-                        u'Rh\xf4ne-Alpes': u'France',
-                        u'Rio Grande do Norte': u'Br\xe9sil',
-                        u'Rio Grande do Sul': u'Br\xe9sil',
-                        u'Rio Negro': u'Argentine',
-                        u'Rio de Janeiro': u'Br\xe9sil',
-                        u'Rioja': u'Argentine',
-                        u'Rond\xf4nia': u'Br\xe9sil',
-                        u'Roraima': u'Br\xe9sil',
-                        u'Roumanie': u'Roumanie',
-                        u'Royaume-Uni': u'Grande-Bretagne',
-                        u'Russie (F\xe9d\xe9ration)': u'Russie (F\xe9d\xe9ration)',
-                        u'Rwanda': u'Rwanda',
-                        u'R\xe9publique Tch\xe8que': u'R\xe9publique tch\xe8que',
-                        u'R\xe9publique dominicaine': u'R\xe9publique dominicaine',
-                        u'R\xe9publique tch\xe8que': u'R\xe9publique tch\xe8que',
-                        u'R\xe9thymnon': u'Gr\xe8ce',
-                        u'R\xe9union': u'R\xe9union',
-                        u'Sahara occidental': u'Sahara occidental',
-                        u'Saint-Barth\xe9lemy': u'Saint-Barth\xe9lemy',
-                        u'Saint-Gall': u'Suisse',
-                        u'Saint-Kitts-et-Nevis': u'Saint-Kitts-et-Nevis',
-                        u'Saint-Marin': u'Saint-Marin',
-                        u'Saint-Martin (partie fran\xe7aise)': u'Saint-Martin (partie fran\xe7aise)',
-                        u'Saint-Martin (partie n\xe9erlandaise)': u'Saint-Martin (partie n\xe9erlandaise)',
-                        u'Saint-Pierre-et-Miquelon': u'Saint-Pierre-et-Miquelon',
-                        u'Saint-Vincent-et-les Grenadines': u'Saint-Vincent-et-les Grenadines',
-                        u'Sainte-H\xe9l\xe8ne': u'Sainte-H\xe9l\xe8ne',
-                        u'Sainte-Lucie': u'Sainte-Lucie',
-                        u'Salomon (\xeeles)': u'Salomon (\xeeles)',
-                        u'Salta': u'Argentine',
-                        u'Salzbourg': u'Autriche',
-                        u'Samoa': u'Samoa',
-                        u'Samoa am\xe9ricaines': u'Samoa am\xe9ricaines',
-                        u'San Juan': u'Argentine',
-                        u'San Luis': u'Argentine',
-                        u'San Luis Potos\xed': u'Mexique',
-                        u'Santa Catarina': u'Br\xe9sil',
-                        u'Santa Cruz': u'Argentine',
-                        u'Santa Fe': u'Argentine',
-                        u'Santiago del Estero': u'Argentine',
-                        u'Sao Tom\xe9-et-Principe': u'Sao Tom\xe9-et-Principe',
-                        u'Sardaigne': u'Italie',
-                        u'Sarre': u'Allemagne',
-                        u'Sarthe': u'France',
-                        u'Saskatchewan': u'Canada',
-                        u'Savoie': u'France',
-                        u'Saxe': u'Allemagne',
-                        u'Saxe-Anhalt': u'Allemagne',
-                        u'Sa\xf4ne-et-Loire': u'France',
-                        u'Schaffhouse': u'Suisse',
-                        u'Schleswig-Holstein': u'Allemagne',
-                        u'Schwyz': u'Suisse',
-                        u'Seine-Maritime': u'France',
-                        u'Seine-Saint-Denis': u'France',
-                        u'Seine-et-Marne': u'France',
-                        u'Serbie': u'Serbie',
-                        u'Serbie-et-Mont\xe9n\xe9gro': u'Serbie-et-Mont\xe9n\xe9gro',
-                        u'Sergipe': u'Br\xe9sil',
-                        u'Seychelles': u'Seychelles',
-                        u'Sicile': u'Italie',
-                        u'Sierra Leone': u'Sierra Leone',
-                        u'Sinaloa': u'Mexique',
-                        u'Singapour': u'Singapour',
-                        u'Slovaquie': u'Slovaquie',
-                        u'Slov\xe9nie': u'Slov\xe9nie',
-                        u'Soleure': u'Suisse',
-                        u'Somalie': u'Somalie',
-                        u'Somme': u'France',
-                        u'Sonora': u'Mexique',
-                        u'Soudan': u'Soudan',
-                        u'Soudan du Sud': u'Soudan du Sud',
-                        u'Sri Lanka': u'Sri Lanka',
-                        u'Styrie': u'Autriche',
-                        u'Suisse': u'Suisse',
-                        u'Suriname': u'Suriname',
-                        u'Su\xe8de': u'Su\xe8de',
-                        u'Svalbard et \xeele Jan Mayen': u'Svalbard et \xeele Jan Mayen',
-                        u'Swaziland': u'Swaziland',
-                        u'Syrie': u'Syrie',
-                        u'S\xe1mos': u'Gr\xe8ce',
-                        u'S\xe3o Paulo': u'Br\xe9sil',
-                        u'S\xe9n\xe9gal': u'S\xe9n\xe9gal',
-                        u'S\xe9rrai': u'Gr\xe8ce',
-                        u'Tabasco': u'Mexique',
-                        u'Tadjikistan': u'Tadjikistan',
-                        u'Tamaulipas': u'Mexique',
-                        u'Tanzanie': u'Tanzanie',
-                        u'Tarn': u'France',
-                        u'Tarn-et-Garonne': u'France',
-                        u'Tasmanie': u'Australie',
-                        u'Ta\xefwan': u'Ta\xefwan',
-                        u'Tchad': u'Tchad',
-                        u'Tch\xe9coslovaquie': u'Tch\xe9coslovaquie',
-                        u'Tennessee': u'\xc9tats-Unis',
-                        u'Terre de Feu': u'Argentine',
-                        u'Terre de la Reine-Maud (Norv\xe8ge)': u'Antarctique',
-                        u'Terre-Neuve': u'Canada',
-                        u'Terres australes et antarctiques fran\xe7aises': u'Antarctique',
-                        u'Terres australes fran\xe7aises': u'Terres australes fran\xe7aises',
-                        u'Territoire antarctique australien': u'Antarctique',
-                        u'Territoire antarctique britannique': u'Antarctique',
-                        u"Territoire britannique de l'Oc\xe9an indien": u"Territoire britannique de l'Oc\xe9an indien",
-                        u'Territoire de la capitale australienne': u'Australie',
-                        u'Territoire du Nord': u'Australie',
-                        u'Territoire du Yukon': u'Canada',
-                        u'Territoire-de-Belfort': u'France',
-                        u'Territoires du Nord-Ouest': u'Canada',
-                        u'Tessin': u'Suisse',
-                        u'Texas': u'\xc9tats-Unis',
-                        u'Tha\xeflande': u'Tha\xeflande',
-                        u'Thesprotie': u'Gr\xe8ce',
-                        u'Thessalie': u'Gr\xe8ce',
-                        u'Thessalonique': u'Gr\xe8ce',
-                        u'Thurgovie': u'Suisse',
-                        u'Thuringe': u'Allemagne',
-                        u'Timor oriental': u'Timor oriental',
-                        u'Tlaxcala': u'Mexique',
-                        u'Togo': u'Togo',
-                        u'Tokelau': u'Tokelau',
-                        u'Tonga': u'Tonga',
-                        u'Toscane': u'Italie',
-                        u'Trentin-Haut-Adige': u'Italie',
-                        u'Trinit\xe9-et-Tobago': u'Trinit\xe9-et-Tobago',
-                        u'Tr\xedkala': u'Gr\xe8ce',
-                        u'Tucum\xe1n': u'Argentine',
-                        u'Tunisie': u'Tunisie',
-                        u'Turkm\xe9nistan': u'Turkm\xe9nistan',
-                        u'Turks et Ca\xefques (\xeeles)': u'Turks et Ca\xefques (\xeeles)',
-                        u'Turquie': u'Turquie',
-                        u'Tuvalu': u'Tuvalu',
-                        u'Tyrol': u'Autriche',
-                        u'URSS': u'URSS',
-                        u'US': u'\xc9tats-Unis',
-                        'USA': u'\xc9tats-Unis',
-                        u'Ukraine': u'Ukraine',
-                        u'Uri': u'Suisse',
-                        u'Uruguay': u'Uruguay',
-                        u'Utah': u'\xc9tats-Unis',
-                        u'Utrecht': u'Pays-Bas',
-                        u"Val d'Aoste": u'Italie',
-                        u"Val-d'Oise": u'France',
-                        u'Val-de-Marne': u'France',
-                        u'Valais': u'Suisse',
-                        u'Vanuatu': u'Vanuatu',
-                        u'Var': u'France',
-                        u'Vatican': u'Vatican',
-                        u'Vaucluse': u'France',
-                        u'Vaud': u'Suisse',
-                        u'Vend\xe9e': u'France',
-                        u'Venezuela': u'Venezuela',
-                        u'Veracruz': u'Mexique',
-                        u'Vermont': u'\xc9tats-Unis',
-                        u'Victoria': u'Australie',
-                        u'Vienne': u'Autriche',
-                        u'Viet Nam': u'Viet Nam',
-                        u'Virginie': u'\xc9tats-Unis',
-                        u'Virginie occidentale': u'\xc9tats-Unis',
-                        u'Vorarlberg': u'Autriche',
-                        u'Vosges': u'France',
-                        u'V\xe9n\xe9tie': u'Italie',
-                        u'Wallis et Futuna (\xeeles)': u'Wallis et Futuna (\xeeles)',
-                        u'Washington': u'\xc9tats-Unis',
-                        u'Wisconsin': u'\xc9tats-Unis',
-                        u'Wyoming': u'\xc9tats-Unis',
-                        u'X\xe1nthi': u'Gr\xe8ce',
-                        u'Yonne': u'France',
-                        u'Yougoslavie': u'Yougoslavie',
-                        u'Yucat\xe1n': u'Mexique',
-                        u'Yvelines': u'France',
-                        u'Y\xe9men': u'Y\xe9men',
-                        u'Y\xe9men (R\xe9publique d\xe9mocratique populaire)': u'Y\xe9men (R\xe9publique d\xe9mocratique populaire)',
-                        u'Zacatecas': u'Mexique',
-                        u'Zambie': u'Zambie',
-                        u'Zimbabwe': u'Zimbabwe',
-                        u'Zoug': u'Suisse',
-                        u'Zurich': u'Suisse',
-                        u'Z\xe1kynthos': u'Gr\xe8ce',
-                        u'Z\xe9lande': u'Pays-Bas',
-                        u'aire g\xe9ographique ancienne': u'aire g\xe9ographique ancienne',
-                        u'code non adapt\xe9': u'code non adapt\xe9',
-                        u'inconnu': u'inconnu',
-                        u'intergouvernemental': u'intergouvernemental',
-                        u'multiple': u'multiple',
-                        u"ne s'applique pas": u"ne s'applique pas",
-                        u'non renseign\xe9': u'non renseign\xe9',
-                        u'\xc1rta': u'Gr\xe8ce',
-                        u'\xc9gypte': u'\xc9gypte',
-                        u'\xc9lide': u'Gr\xe8ce',
-                        u'\xc9mathie': u'Gr\xe8ce',
-                        u'\xc9milie-Romagne': u'Italie',
-                        u'\xc9mirats arabes unis': u'\xc9mirats arabes unis',
-                        u'\xc9pire': u'Gr\xe8ce',
-                        u'\xc9quateur': u'\xc9quateur',
-                        u'\xc9rythr\xe9e': u'\xc9rythr\xe9e',
-                        u'\xc9tats-Unis': u'\xc9tats-Unis',
-                        u'\xc9thiopie': u'\xc9thiopie',
-                        u'\xc9tolie-et-Acarnanie': u'Gr\xe8ce',
-                        u'\xc9vros': u'Gr\xe8ce',
-                        u'\xcele Pierre 1er (Norv\xe8ge)': u'Antarctique',
-                        u'\xcele de Man': u'\xcele de Man',
-                        u'\xcele du Prince-\xc9douard': u'Canada',
-                        u'\xcele-de-France': u'France',
-                        u'\xceles Bal\xe9ares': u'Espagne',
-                        u'\xceles Ioniennes': u'Gr\xe8ce',
-                        u'\xceles Vierges am\xe9ricaines': u'\xceles Vierges am\xe9ricaines',
-                        u'\xceles Vierges britanniques': u'\xceles Vierges britanniques',
-                        u'\xceles de la Mer \xc9g\xe9e m\xe9ridionale': u'Gr\xe8ce',
-                        u'\xceles de la Mer \xc9g\xe9e septentrionale': u'Gr\xe8ce',
-                        u'\xceles mineures \xe9loign\xe9es des \xc9tats-Unis': u'\xceles mineures \xe9loign\xe9es des \xc9tats-Unis'
-                        }
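
Both mappings above move unchanged to data/countries.py. Two conventions in them are easy to miss: multi-label COUNTRIES values separate aliases with commas and further alternates with '###' (see the 'sh' or 'ky' entries), and REGIONS_TO_COUNTRIES maps country names to themselves, so lookups can fall back to the input. A small sketch under those assumptions; the helper names are hypothetical, the entries are excerpted, and comma-splitting is only approximate for names such as 'bq' that themselves contain a comma:

# excerpted stand-ins for the full dicts above
COUNTRIES = {'sh': 'Sainte-H\xc3\xa9l\xc3\xa8ne,Ascension (\xc3\xaele)###Tristan da Cunha (\xc3\xaele)'}
REGIONS_TO_COUNTRIES = {u'Qu\xe9bec': u'Canada', u'Canada': u'Canada'}

def aliases(label):
    """Split a COUNTRIES value into its alternate labels."""
    names = []
    for chunk in label.split(','):
        names.extend(part.strip() for part in chunk.split('###') if part.strip())
    return names

def country_of(place):
    """Resolve a region label to its country; country names map to
    themselves in the data, hence the fallback to the input."""
    return REGIONS_TO_COUNTRIES.get(place, place)

print aliases(COUNTRIES['sh'])   # 3 labels for Saint Helena
print country_of(u'Qu\xe9bec')   # u'Canada'
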
--- a/reference_data/countries_iso_3166.txt	Thu Dec 19 14:45:43 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,269 +0,0 @@
-##,non renseigné
-..,non renseigné
-aa,aire géographique ancienne
-ad,Andorre
-ae,Émirats arabes unis
-af,Afghanistan
-ag,Antigua-et-Barbuda
-ai,Anguilla
-al,Albanie
-am,Arménie
-an,Antilles néerlandaises
-ao,Angola
-aq,Antarctique
-ar,Argentine
-as,Samoa américaines
-at,Autriche
-au,Australie
-aw,Aruba
-ax,Aland (îles)
-az,Azerbaïdjan
-ba,Bosnie-Herzégovine
-bb,Barbade
-bd,Bangladesh
-be,Belgique
-bf,Burkina
-bg,Bulgarie
-bh,Bahreïn
-bi,Burundi
-bj,Bénin
-bl,Saint-Barthélemy
-bm,Bermudes
-bn,Brunéi
-bo,Bolivie
-bq,Bonaire, Saint-Eustache et Saba,Saba###Saint-Eustache
-br,Brésil
-bs,Bahamas
-bt,Bhoutan
-bv,Bouvet (île)
-bw,Botswana
-by,Biélorussie,Bélarus
-bz,Belize
-ca,Canada
-cc,Cocos (îles),Keeling (îles)
-cd,Congo (République démocratique),Zaïre
-cf,Centrafrique,République centrafricaine
-cg,Congo,Congo (République)
-ch,Suisse,Confédération helvétique
-ci,Côte d'Ivoire
-ck,Cook (îles)
-cl,Chili
-cm,Cameroun
-cn,Chine,Chine (République populaire)
-co,Colombie
-cr,Costa Rica
-cs,Serbie-et-Monténégro
-cu,Cuba
-cv,Cap-Vert
-cw,Curaçao
-cx,Christmas (île)
-cy,Chypre
-cz,République tchèque,Tchèque, République
-dd,Allemagne (République démocratique)
-de,Allemagne,Allemagne (République fédérale)
-dj,Djibouti
-dk,Danemark
-dm,Dominique
-do,République dominicaine,Dominicaine, République
-dz,Algérie
-ec,Équateur
-ee,Estonie
-eg,Égypte
-eh,Sahara occidental
-er,Érythrée
-es,Espagne
-et,Éthiopie
-fi,Finlande
-fj,Fidji
-fk,Malouines (îles),Falkland (îles)
-fm,Micronésie,États fédérés de Micronésie
-fo,Féroé (îles)
-fr,France
-ga,Gabon
-gb,Grande-Bretagne,Royaume-Uni
-gd,Grenade
-ge,Géorgie
-gf,Guyane française
-gg,Guernesey
-gh,Ghana
-gi,Gibraltar
-gl,Groenland
-gm,Gambie
-gn,Guinée
-gp,Guadeloupe
-gq,Guinée équatoriale
-gr,Grèce
-gs,Géorgie du Sud et les îles Sandwich du Sud
-gt,Guatemala
-gu,Guam
-gw,Guinée-Bissau
-gy,Guyana
-hk,Hong Kong
-hm,Heard (île) et îles McDonald
-hn,Honduras
-hr,Croatie
-ht,Haïti
-hu,Hongrie
-id,Indonésie
-ie,Irlande
-ii,intergouvernemental
-il,Israël
-im,Île de Man,Man, Île de
-in,Inde
-io,Territoire britannique de l'Océan indien,Chagos (îles)###Océan indien, Territoire britannique de l'
-iq,Irak
-ir,Iran
-is,Islande
-it,Italie
-je,Jersey
-jm,Jamaïque
-jo,Jordanie
-jp,Japon
-ke,Kenya
-kg,Kirghizistan
-kh,Cambodge
-ki,Kiribati
-km,Comores
-kn,Saint-Kitts-et-Nevis,Saint-Christophe-et-Nevis
-ko,Kosovo
-kp,Corée (République populaire démocratique),Corée du Nord
-kr,Corée (République),Corée du Sud
-kw,Koweït
-ky,Cayman,Caïmanes, Îles###Caïman (îles)
-kz,Kazakhstan
-la,Laos
-lb,Liban
-lc,Sainte-Lucie
-li,Liechtenstein
-lk,Sri Lanka
-lr,Liberia
-ls,Lesotho
-lt,Lituanie
-lu,Luxembourg
-lv,Lettonie
-ly,Libye
-ma,Maroc
-mc,Monaco
-md,Moldavie,Moldova, République de
-me,Monténégro
-mf,Saint-Martin (partie française)
-mg,Madagascar
-mh,Marshall (îles)
-mk,Macédoine (République)
-ml,Mali
-mm,Myanmar,Birmanie
-mn,Mongolie
-mo,Macao
-mp,Mariannes du Nord (îles)
-mq,Martinique
-mr,Mauritanie
-ms,Montserrat
-mt,Malte
-mu,Maurice
-mv,Maldives
-mw,Malawi
-mx,Mexique
-my,Malaisie
-mz,Mozambique
-na,Namibie
-nc,Nouvelle-Calédonie
-ne,Niger
-nf,Norfolk (île)
-ng,Nigeria
-ni,Nicaragua
-nl,Pays-Bas
-no,Norvège
-np,Népal
-nr,Nauru
-nu,Niue
-nz,Nouvelle-Zélande
-om,Oman
-oo,code non adapté
-pa,Panama
-pe,Pérou
-pf,Polynésie française
-pg,Papouasie-Nouvelle-Guinée
-ph,Philippines
-pk,Pakistan
-pl,Pologne
-pm,Saint-Pierre-et-Miquelon
-pn,Pitcairn
-pr,Porto Rico
-ps,Autorité palestinienne,Palestine
-pt,Portugal
-pw,Palau,Palaos
-py,Paraguay
-qa,Qatar
-re,Réunion
-ro,Roumanie
-rs,Serbie
-ru,Russie (Fédération),Russie
-rw,Rwanda
-sa,Arabie saoudite
-sb,Salomon (îles)
-sc,Seychelles
-sd,Soudan
-se,Suède
-sg,Singapour
-sh,Sainte-Hélène,Ascension (île)###Tristan da Cunha (île)
-si,Slovénie
-sj,Svalbard et île Jan Mayen
-sk,Slovaquie
-sl,Sierra Leone
-sm,Saint-Marin
-sn,Sénégal
-so,Somalie
-sr,Suriname
-ss,Soudan du Sud,Sud Soudan
-st,Sao Tomé-et-Principe
-su,URSS
-sv,El Salvador,Salvador
-sx,Saint-Martin (partie néerlandaise),Sint Maarten
-sy,Syrie
-sz,Swaziland
-tc,Turks et Caïques (îles)
-td,Tchad
-tf,Terres australes françaises
-tg,Togo
-th,Thaïlande
-tj,Tadjikistan
-tk,Tokelau
-tl,Timor oriental
-tm,Turkménistan
-tn,Tunisie
-to,Tonga
-tr,Turquie
-tt,Trinité-et-Tobago
-tv,Tuvalu
-tw,Taïwan,Chine (République)
-tz,Tanzanie
-ua,Ukraine
-ug,Ouganda
-um,Îles mineures éloignées des États-Unis
-us,États-Unis
-uy,Uruguay
-uz,Ouzbékistan
-va,Vatican,Saint-Siège
-vc,Saint-Vincent-et-les Grenadines
-ve,Venezuela
-vg,Îles Vierges britanniques,Vierges (îles) britanniques
-vi,Îles Vierges américaines,Vierges (îles) américaines
-vn,Viet Nam
-vu,Vanuatu
-wf,Wallis et Futuna (îles)
-ws,Samoa,Samoa occidentales
-xc,Tchécoslovaquie
-xd,Allemagne avant 1945
-xe,Europe,Union européenne
-xk,Corée avant 1948
-xn,Pays-Bas avant 1830,Belgique avant 1830
-xx,inconnu
-yd,Yémen (République démocratique populaire),Sud Yémen
-ye,Yémen
-yt,Mayotte
-yu,Yougoslavie
-yy,ne s'applique pas
-za,Afrique du Sud
-zm,Zambie
-zw,Zimbabwe
-zz,multiple
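
The text file above is the flat source behind the COUNTRIES dictionary, one 'code,label' line per entry. A hypothetical loader sketch: splitting on the first comma only keeps the alias lists intact inside the label (e.g. 'by' yields 'Biélorussie,Bélarus'), matching the COUNTRIES values seen earlier; the function name is an assumption, not the module's API:

def load_countries(path):
    """Load 'code,label' lines into a dict, splitting on the first
    comma only since labels may contain commas themselves."""
    countries = {}
    for line in open(path):
        line = line.strip()
        if line:
            code, label = line.split(',', 1)
            countries[code] = label
    return countries

# COUNTRIES = load_countries('countries_iso_3166.txt')
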
--- a/reference_data/stopwords.py	Thu Dec 19 14:45:43 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,15 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-Stopwords in different languages.
-"""
-
-FRENCH_STOPWORDS = set(['alors', 'au', 'aucuns', 'aussi', 'autre', 'aux', 'avant', 'avec', 'avoir', 'bon', 'car', 'ce', 'cela', 'ces', 'ceux', 'chaque', 'ci', 'comme', 'comment', 'dans', 'de', 'dedans', 'dehors', 'depuis', 'des', 'deux', 'devrait', 'doit', 'donc', 'dos', 'droite', 'du', 'début', 'elle', 'elles', 'en', 'encore', 'essai', 'est', 'et', 'eu', 'eux', 'fait', 'faites', 'fois', 'font', 'force', 'haut', 'hors', 'ici', 'il', 'ils', 'je', 'juste', 'la', 'le', 'les', 'leur', 'lui', 'là', 'ma', 'maintenant', 'mais', 'me', 'meme', 'mes', 'mine', 'moi', 'moins', 'mon', 'mot', 'ne', 'ni', 'nommés', 'nos', 'notre', 'nous', 'nouveaux', 'on', 'ou', 'où', 'par', 'parce', 'parole', 'pas', 'personnes', 'peu', 'peut', 'pièce', 'plupart', 'pour', 'pourquoi', 'qu', 'quand', 'que', 'quel', 'quelle', 'quelles', 'quels', 'qui', 'sa', 'sans', 'se', 'ses', 'seulement', 'si', 'sien', 'son', 'sont', 'sous', 'soyez', 'sujet', 'sur', 'ta', 'tandis', 'te', 'tellement', 'tels', 'tes', 'toi', 'ton', 'tous', 'tout', 'trop', 'très', 'tu', 'un', 'une', 'valeur', 'voie', 'voient', 'vont', 'vos', 'votre', 'vous', 'vu', 'ça', 'étaient', 'état', 'étions', 'été', 'être'])
-
-
-ENGLISH_STOPWORDS = set(['a', 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', "ain't", 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', "aren't", 'around', 'as', 'aside', 'ask', 'asking', 'associated', 'at', 'available', 'away', 'awfully', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'best', 'better', 'between', 'beyond', 'bill', 'both', 'bottom', 'brief', 'but', 'by', 'call', 'came', 'can', 'cannot', 'cant', "can't", 'cause', 'causes', 'certain', 'certainly', 'changes', 'clearly', 'co', 'com', 'come', 'comes', 'computer', 'con', 'concerning', 'consequently', 'consider', 'considering', 'contain', 'containing', 'contains', 'corresponding', 'could', 'couldnt', "couldn't", 'course', 'cry', 'currently', "c'mon", "c's", 'de', 'definitely', 'describe', 'described', 'despite', 'detail', 'did', "didn't", 'different', 'do', 'does', "doesn't", 'doing', 'done', "don't", 'down', 'downwards', 'due', 'during', 'each', 'edu', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'entirely', 'especially', 'et', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'exactly', 'example', 'except', 'far', 'few', 'fifteen', 'fifth', 'fify', 'fill', 'find', 'fire', 'first', 'five', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'furthermore', 'get', 'gets', 'getting', 'give', 'given', 'gives', 'go', 'goes', 'going', 'gone', 'got', 'gotten', 'greetings', 'had', "hadn't", 'happens', 'hardly', 'has', 'hasnt', "hasn't", 'have', "haven't", 'having', 'he', 'hello', 'help', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', "here's", 'hers', 'herself', "he's", 'hi', 'him', 'himself', 'his', 'hither', 'hopefully', 'how', 'howbeit', 'however', 'hundred', 'i', "i'd", "i'll", "i'm", "i've", 'ie', 'if', 'ignored', 'immediate', 'in', 'inasmuch', 'inc', 'indeed', 'indicate', 'indicated', 'indicates', 'inner', 'insofar', 'instead', 'interest', 'into', 'inward', 'is', "isn't", 'it', 'its', 'itself', "it'd", "it'll", "it's'", "i'd", "i'll", "i'm", "i've", 'just', 'keep', 'keeps', 'kept', 'know', 'known', 'knows', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', "let's", 'like', 'liked', 'likely', 'little', 'look', 'looking', 'looks', 'ltd', 'made', 'mainly', 'many', 'may', 'maybe', 'me', 'mean', 'meanwhile', 'merely', 'might', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'my', 'myself', 'name', 'namely', 'nd', 'near', 'nearly', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'no', 'nobody', 'non', 'none', 'noone', 'nor', 'normally', 'not', 'nothing', 'novel', 'now', 'nowhere', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'own', 'part', 'particular', 'particularly', 'per', 'perhaps', 'placed', 'please', 'plus', 'possible', 'presumably', 'probably', 
'provides', 'put', 'que', 'quite', 'qv', 'rather', 'rd', 're', 'really', 'reasonably', 'regarding', 'regardless', 'regards', 'relatively', 'respectively', 'right', 'said', 'same', 'saw', 'say', 'saying', 'says', 'second', 'secondly', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sensible', 'sent', 'serious', 'seriously', 'seven', 'several', 'shall', 'she', 'should', "shouldn't", 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some', 'somebody', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specified', 'specify', 'specifying', 'still', 'sub', 'such', 'sup', 'sure', 'system', 'take', 'taken', 'tell', 'ten', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', "that's", 'thats', "that's", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'theres', 'thereupon', "there's", 'these', 'they', "they'd", "they'll", "they're", "they've", 'thick', 'thin', 'think', 'third', 'this', 'thorough', 'thoroughly', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'took', 'top', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'twelve', 'twenty', 'twice', 'two', "t's", 'un', 'under', 'unfortunately', 'unless', 'unlikely', 'until', 'unto', 'up', 'upon', 'us', 'use', 'used', 'useful', 'uses', 'using', 'usually', 'value', 'various', 'very', 'via', 'viz', 'vs', 'want', 'wants', 'was', "wasn't", 'way', 'we', 'welcome', 'well', 'went', 'were', "weren't", "we'd", "we'll", "we're", "we've", 'what', 'whatever', "what's", 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', "where's", 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', "who's", 'why', 'will', 'willing', 'wish', 'with', 'within', 'without', 'wonder', "won't", 'would', "wouldn't", 'yes', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves', "you'd", "you'll", "you're", "you've", 'zero'])
-
-
-ENGLISH_REGULAR_VERBS = set(['accept', 'add', 'admire', 'admit', 'advise', 'afford', 'agree', 'alert', 'allow', 'amuse', 'analyse', 'announce', 'annoy', 'answer', 'apologise', 'appear', 'applaud', 'appreciate', 'approve', 'argue', 'arrange', 'arrest', 'arrive', 'ask', 'attach', 'attack', 'attempt', 'attend', 'attract', 'avoid', 'back', 'bake', 'balance', 'ban', 'bang', 'bare', 'bat', 'bathe', 'battle', 'beam', 'beg', 'behave', 'belong', 'bleach', 'bless', 'blind', 'blink', 'blot', 'blush', 'boast', 'boil', 'bolt', 'bomb', 'book', 'bore', 'borrow', 'bounce', 'bow', 'box', 'brake', 'branch', 'breathe', 'bruise', 'brush', 'bubble', 'bump', 'burn', 'bury', 'buzz', 'calculate', 'call', 'camp', 'care', 'carry', 'carve', 'cause', 'challenge', 'change', 'charge', 'chase', 'cheat', 'check', 'cheer', 'chew', 'choke', 'chop', 'claim', 'clap', 'clean', 'clear', 'clip', 'close', 'coach', 'coil', 'collect', 'colour', 'comb', 'command', 'communicate', 'compare', 'compete', 'complain', 'complete', 'concentrate', 'concern', 'confess', 'confuse', 'connect', 'consider', 'consist', 'contain', 'continue', 'copy', 'correct', 'cough', 'count', 'cover', 'crack', 'crash', 'crawl', 'cross', 'crush', 'cry', 'cure', 'curl', 'curve', 'cycle', 'dam', 'damage', 'dance', 'dare', 'decay', 'deceive', 'decide', 'decorate', 'delay', 'delight', 'deliver', 'depend', 'describe', 'desert', 'deserve', 'destroy', 'detect', 'develop', 'disagree', 'disappear', 'disapprove', 'disarm', 'discover', 'dislike', 'divide', 'double', 'doubt', 'drag', 'drain', 'dream', 'dress', 'drip', 'drop', 'drown', 'drum', 'dry', 'dust', 'earn', 'educate', 'embarrass', 'employ', 'empty', 'encourage', 'end', 'enjoy', 'enter', 'entertain', 'escape', 'examine', 'excite', 'excuse', 'exercise', 'exist', 'expand', 'expect', 'explain', 'explode', 'extend', 'face', 'fade', 'fail', 'fancy', 'fasten', 'fax', 'fear', 'fence', 'fetch', 'file', 'fill', 'film', 'fire', 'fit', 'fix', 'flap', 'flash', 'float', 'flood', 'flow', 'flower', 'fold', 'follow', 'fool', 'force', 'form', 'found', 'frame', 'frighten', 'fry', 'gather', 'gaze', 'glow', 'glue', 'grab', 'grate', 'grease', 'greet', 'grin', 'grip', 'groan', 'guarantee', 'guard', 'guess', 'guide', 'hammer', 'hand', 'handle', 'hang', 'happen', 'harass', 'harm', 'hate', 'haunt', 'head', 'heal', 'heap', 'heat', 'help', 'hook', 'hop', 'hope', 'hover', 'hug', 'hum', 'hunt', 'hurry', 'identify', 'ignore', 'imagine', 'impress', 'improve', 'include', 'increase', 'influence', 'inform', 'inject', 'injure', 'instruct', 'intend', 'interest', 'interfere', 'interrupt', 'introduce', 'invent', 'invite', 'irritate', 'itch', 'jail', 'jam', 'jog', 'join', 'joke', 'judge', 'juggle', 'jump', 'kick', 'kill', 'kiss', 'kneel', 'knit', 'knock', 'knot', 'label', 'land', 'last', 'laugh', 'launch', 'learn', 'level', 'license', 'lick', 'lie', 'lighten', 'like', 'list', 'listen', 'live', 'load', 'lock', 'long', 'look', 'love', 'man', 'manage', 'march', 'mark', 'marry', 'match', 'mate', 'matter', 'measure', 'meddle', 'melt', 'memorise', 'mend', 'mess up', 'milk', 'mine', 'miss', 'mix', 'moan', 'moor', 'mourn', 'move', 'muddle', 'mug', 'multiply', 'murder', 'nail', 'name', 'need', 'nest', 'nod', 'note', 'notice', 'number', 'obey', 'object', 'observe', 'obtain', 'occur', 'offend', 'offer', 'open', 'order', 'overflow', 'owe', 'own', 'pack', 'paddle', 'paint', 'park', 'part', 'pass', 'paste', 'pat', 'pause', 'peck', 'pedal', 'peel', 'peep', 'perform', 'permit', 'phone', 'pick', 'pinch', 'pine', 'place', 'plan', 'plant', 'play', 'please', 'plug', 
'point', 'poke', 'polish', 'pop', 'possess', 'post', 'pour', 'practise', 'pray', 'preach', 'precede', 'prefer', 'prepare', 'present', 'preserve', 'press', 'pretend', 'prevent', 'prick', 'print', 'produce', 'program', 'promise', 'protect', 'provide', 'pull', 'pump', 'punch', 'puncture', 'punish', 'push', 'question', 'queue', 'race', 'radiate', 'rain', 'raise', 'reach', 'realise', 'receive', 'recognise', 'record', 'reduce', 'reflect', 'refuse', 'regret', 'reign', 'reject', 'rejoice', 'relax', 'release', 'rely', 'remain', 'remember', 'remind', 'remove', 'repair', 'repeat', 'replace', 'reply', 'report', 'reproduce', 'request', 'rescue', 'retire', 'return', 'rhyme', 'rinse', 'risk', 'rob', 'rock', 'roll', 'rot', 'rub', 'ruin', 'rule', 'rush', 'sack', 'sail', 'satisfy', 'save', 'saw', 'scare', 'scatter', 'scold', 'scorch', 'scrape', 'scratch', 'scream', 'screw', 'scribble', 'scrub', 'seal', 'search', 'separate', 'serve', 'settle', 'shade', 'share', 'shave', 'shelter', 'shiver', 'shock', 'shop', 'shrug', 'sigh', 'sign', 'signal', 'sin', 'sip', 'ski', 'skip', 'slap', 'slip', 'slow', 'smash', 'smell', 'smile', 'smoke', 'snatch', 'sneeze', 'sniff', 'snore', 'snow', 'soak', 'soothe', 'sound', 'spare', 'spark', 'sparkle', 'spell', 'spill', 'spoil', 'spot', 'spray', 'sprout', 'squash', 'squeak', 'squeal', 'squeeze', 'stain', 'stamp', 'stare', 'start', 'stay', 'steer', 'step', 'stir', 'stitch', 'stop', 'store', 'strap', 'strengthen', 'stretch', 'strip', 'stroke', 'stuff', 'subtract', 'succeed', 'suck', 'suffer', 'suggest', 'suit', 'supply', 'support', 'suppose', 'surprise', 'surround', 'suspect', 'suspend', 'switch', 'talk', 'tame', 'tap', 'taste', 'tease', 'telephone', 'tempt', 'terrify', 'test', 'thank', 'thaw', 'tick', 'tickle', 'tie', 'time', 'tip', 'tire', 'touch', 'tour', 'tow', 'trace', 'trade', 'train', 'transport', 'trap', 'travel', 'treat', 'tremble', 'trick', 'trip', 'trot', 'trouble', 'trust', 'try', 'tug', 'tumble', 'turn', 'twist', 'type', 'undress', 'unfasten', 'unite', 'unlock', 'unpack', 'untidy', 'use', 'vanish', 'visit', 'wail', 'wait', 'walk', 'wander', 'want', 'warm', 'warn', 'wash', 'waste', 'watch', 'water', 'wave', 'weigh', 'welcome', 'whine', 'whip', 'whirl', 'whisper', 'whistle', 'wink', 'wipe', 'wish', 'wobble', 'wonder', 'work', 'worry', 'wrap', 'wreck', 'wrestle', 'wriggle', 'x-ray', 'yawn', 'yell', 'zip', 'zoom'])
-
-
-ENGLISH_IRREGULAR_VERBS = set(['arise ', 'arisen', 'arose ', 'ate', 'awake', 'awakened', 'awoke', 'awoken', 'backslid', 'backslidden', 'backslide', 'bade', 'be', 'bear', 'beat', 'beaten', 'became', 'become', 'been', 'began', 'begin', 'begun', 'bend', 'bent', 'bet', 'betted', 'bid', 'bidden', 'bind', 'bit', 'bite', 'bitten', 'bled', 'bleed', 'blew', 'blow', 'blown', 'bore', 'born', 'borne', 'bought', 'bound', 'break', 'bred', 'breed', 'bring', 'broadcast', 'broadcasted', 'broke', 'broken', 'brought', 'build', 'built', 'burn', 'burned', 'burnt', 'burst', 'bust', 'busted', 'buy', 'came', 'cast', 'catch', 'caught', 'choose', 'chose', 'chosen', 'clad', 'cling', 'clothe', 'clothed', 'clung', 'come', 'cost', 'creep', 'crept', 'cut', 'daydream', 'daydreamed', 'daydreamt', 'deal', 'dealt', 'did', 'dig', 'disprove', 'disproved', 'disproven', 'dive', 'dived', 'do', 'done', 'dove', 'drank', 'draw', 'drawn', 'dream', 'dreamed', 'dreamt', 'drew', 'drink', 'drive', 'driven', 'drove', 'drunk', 'dug', 'dwell', 'dwelled', 'dwelt', 'eat', 'eaten', 'fall', 'fallen', 'fed', 'feed', 'feel', 'fell', 'felt', 'fight', 'find', 'fit', 'fitted', 'fled', 'flee', 'flew', 'fling', 'flown', 'flung', 'fly', 'forbade', 'forbid', 'forbidden', 'forecast', 'forego', 'foregone', 'foresaw', 'foresee', 'foreseen', 'foretell', 'foretold', 'forewent', 'forgave', 'forget', 'forgive', 'forgiven', 'forgot', 'forgotten', 'forsake', 'forsaken', 'forsook', 'fought', 'found', 'freeze', 'froze', 'frozen', 'gave', 'get', 'give', 'given', 'go', 'gone', 'got', 'gotten', 'grew', 'grind', 'ground', 'grow', 'grown', 'had', 'hang', 'have', 'hear', 'heard', 'held', 'hew', 'hewed', 'hewn', 'hid', 'hidden', 'hide', 'hit', 'hold', 'hung', 'hurt', 'keep', 'kept', 'kneel', 'kneeled', 'knelt', 'knew', 'knit', 'knitted', 'know', 'known', 'laid', 'lain', 'lay', 'lead', 'lean', 'leaned', 'leant', 'leap', 'leaped', 'leapt', 'learn', 'learned', 'learnt', 'leave', 'led', 'left', 'lend', 'lent', 'let', 'lie', 'lied', 'light', 'lighted', 'lit', 'lose', 'lost', 'made', 'make', 'mean', 'meant', 'meet', 'met', 'misunderstand', 'misunderstood', 'mow', 'mowed', 'mown', 'paid', 'partake', 'partaken', 'partook', 'pay', 'plead', 'pleaded', 'pled', 'proofread', 'prove', 'proved', 'proven', 'put', 'quick-freeze', 'quick-froze', 'quick-frozen', 'quit', 'quitted', 'ran', 'rang', 'read', 'rid', 'ridden', 'ride', 'ring', 'rise', 'risen', 'rode', 'rose', 'run', 'rung', 'said', 'sang', 'sank', 'sat', 'saw', 'sawed', 'sawn', 'say', 'see', 'seek', 'seen', 'sell', 'send', 'sent', 'set', 'sew', 'sewed', 'sewn', 'shake', 'shaken', 'shave', 'shaved', 'shaven', 'shear', 'sheared', 'shed', 'shine', 'shined', 'shone', 'shook', 'shoot', 'shorn', 'shot', 'show', 'showed', 'shown', 'shrank', 'shrink', 'shrunk', 'shut', 'sing', 'sink', 'sit', 'slain', 'slay', 'slayed', 'sleep', 'slept', 'slew', 'slid', 'slide', 'sling', 'slink', 'slinked', 'slit', 'slung', 'slunk', 'smell', 'smelled', 'smelt', 'sneak', 'sneaked', 'snuck', 'sold', 'sought', 'sow', 'sowed', 'sown', 'spat', 'speak', 'sped', 'speed', 'speeded', 'spell', 'spelled', 'spelt', 'spend', 'spent', 'spill', 'spilled', 'spilt', 'spin', 'spit', 'split', 'spoil', 'spoiled', 'spoilt', 'spoke', 'spoken', 'sprang', 'spread', 'spring', 'sprung', 'spun', 'stand ', 'stank', 'steal', 'stick', 'sting', 'stink', 'stole', 'stolen', 'stood', 'strew', 'strewed', 'strewn', 'stricken', 'stridden', 'stride', 'strike', 'string', 'strive', 'strived', 'striven', 'strode', 'strove', 'struck', 'strung', 'stuck', 'stung', 'stunk', 'sublet', 'sunburn', 
'sunburned', 'sunburnt', 'sung', 'sunk', 'swam', 'swear', 'sweat', 'sweated', 'sweep', 'swell', 'swelled', 'swept', 'swim', 'swing', 'swollen', 'swore', 'sworn', 'swum', 'swung', 'take', 'taken', 'taught', 'teach', 'tear', 'telecast', 'tell', 'test-drive', 'test-driven', 'test-drove', 'test-flew', 'test-flown', 'test-fly', 'think', 'thought', 'threw', 'throw', 'thrown', 'thrust', 'told', 'took', 'tore', 'torn', 'tread', 'trod', 'trodden', 'understand', 'understood', 'undertake', 'undertaken', 'undertook', 'undid', 'undo', 'undone', 'wake', 'waked', 'was, were', 'waylaid', 'waylay', 'wear', 'weave', 'weaved', 'wed', 'wedded', 'weep', 'went', 'wept', 'wet', 'wetted', 'whet', 'whetted', 'win', 'wind', 'withdraw', 'withdrawn', 'withdrew', 'withheld', 'withhold', 'withstand', 'withstood', 'woke', 'woken', 'won', 'wore', 'worn', 'wound', 'wove', 'woven', 'wring', 'write', 'written', 'wrote', 'wrung'])
--- a/reference_data/us_states.py	Thu Dec 19 14:45:43 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,211 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# See http://en.wikipedia.org/wiki/List_of_U.S._state_abbreviations
-# WARNING: The name of each state should be in French
-# (e.g. "Floride", not "Florida")
-US_STATES = {'AK': 'Alaska',
-             'AL': 'Alabama',
-             'AR': 'Arkansas',
-             'AZ': 'Arizona',
-             'Ala.': 'Alabama',
-             'Alas.': 'Alaska',
-             'Alaska': 'Alaska',
-             'Ariz.': 'Arizona',
-             'Ark.': 'Arkansas',
-             'Az.': 'Arizona',
-             'CA': 'Californie',
-             'CF': 'Californie',
-             'CL': 'Colorado',
-             'CO': 'Colorado',
-             'CT': 'Connecticut',
-             'Ca.': 'Californie',
-             'Cal.': 'Californie',
-             'Cali.': 'Californie',
-             'Calif.': 'Californie',
-             'Col.': 'Colorado',
-             'Colo.': 'Colorado',
-             'Conn.': 'Connecticut',
-             'Ct.': 'Connecticut',
-             'D.C.': 'District de Columbia',
-             'DC': 'District de Columbia',
-             'DE': 'Delaware',
-             'DL': 'Delaware',
-             'De.': 'Delaware',
-             'Del.': 'Delaware',
-             'FL': 'Floride',
-             'Fl.': 'Floride',
-             'Fla.': 'Floride',
-             'Flor.': 'Floride',
-             'GA': u'Géorgie',
-             'Ga.': u'Géorgie',
-             'H.I.': 'Hawaii',
-             'HA': 'Hawaii',
-             'HI': 'Hawaii',
-             'Hawaii': 'Hawaii',
-             'IA': 'Iowa',
-             'ID': 'Idaho',
-             'IL': 'Illinois',
-             'IN': 'Indiana',
-             'Ia.': 'Iowa',
-             'Id.': 'Idaho',
-             'Ida.': 'Idaho',
-             'Idaho': 'Idaho',
-             'Il.': 'Illinois',
-             "Ill's": 'Illinois',
-             'Ill.': 'Illinois',
-             'Ills.': 'Illinois',
-             'In.': 'Indiana',
-             'Ind.': 'Indiana',
-             'Ioa.': 'Iowa',
-             'Iowa': 'Iowa',
-             'KA': 'Kansas',
-             'KS': 'Kansas',
-             'KY': 'Kentucky',
-             'Ka.': 'Kansas',
-             'Kan.': 'Kansas',
-             'Kans.': 'Kansas',
-             'Ks.': 'Kansas',
-             'Ky.': 'Kentucky',
-             'LA': 'Louisiane',
-             'La.': 'Louisiane',
-             'MA': 'Massachusetts',
-             'MC': 'Michigan',
-             'MD': 'Maryland',
-             'ME': 'Maine',
-             'MI': 'Michigan',
-             'MN': 'Minnesota',
-             'MO': 'Missouri',
-             'MS': 'Mississippi',
-             'MT': 'Montana',
-             'Maine': 'Maine',
-             'Mass.': 'Massachusetts',
-             'Md.': 'Maryland',
-             'Me.': 'Maine',
-             'Mich.': 'Michigan',
-             'Minn.': 'Minnesota',
-             'Miss.': 'Mississippi',
-             'Mn.': 'Minnesota',
-             'Mo.': 'Missouri',
-             'Mont.': 'Montana',
-             'N. Car.': 'Caroline du Nord',
-             'N. Dak.': 'Dakota du Nord',
-             'N. Mex.': 'Nouveau-Mexique',
-             'N. York': 'New York',
-             'N.C.': 'Caroline du Nord',
-             'N.D.': 'Dakota du Nord',
-             'N.H.': 'New Hampshire',
-             'N.J.': 'New Jersey',
-             'N.M.': 'Nouveau-Mexique',
-             'N.Y.': 'New York',
-             'NB': 'Nebraska',
-             'NC': 'Caroline du Nord',
-             'ND': 'Dakota du Nord',
-             'NE': 'Nebraska',
-             'NH': 'New Hampshire',
-             'NJ': 'New Jersey',
-             'NM': 'Nouveau-Mexique',
-             'NV': 'Nevada',
-             'NY': 'New York',
-             'Neb.': 'Nebraska',
-             'Nebr.': 'Nebraska',
-             'Nev.': 'Nevada',
-             'New M.': 'Nouveau-Mexique',
-             'NoDak': 'Dakota du Nord',
-             'Nv.': 'Nevada',
-             'O.': 'Ohio',
-             'OH': 'Ohio',
-             'OK': 'Oklahoma',
-             'OR': 'Oregon',
-             'Oh.': 'Ohio',
-             'Ohio': 'Ohio',
-             'Ok.': 'Oklahoma',
-             'Okla.': 'Oklahoma',
-             'Or.': 'Oregon',
-             'Ore.': 'Oregon',
-             'Oreg.': 'Oregon',
-             'PA': 'Pennsylvanie',
-             'Pa.': 'Pennsylvanie',
-             'R.I.': 'Rhode Island',
-             'R.I. & P.P.': 'Rhode Island',
-             'RI': 'Rhode Island',
-             'S. Car.': 'Caroline du Sud',
-             'S. Dak.': 'Dakota du Sud',
-             'S.C.': 'Caroline du Sud',
-             'S.D.': 'Dakota du Sud',
-             'SC': 'Caroline du Sud',
-             'SD': 'Dakota du Sud',
-             'SoDak': 'Dakota du Sud',
-             'State': 'Utah',
-             'TN': 'Tennessee',
-             'TX': 'Texas',
-             'Tenn.': 'Tennessee',
-             'Tex.': 'Texas',
-             'Texas': 'Texas',
-             'Tn.': 'Tennessee',
-             'Tx.': 'Texas',
-             'US-AL': 'Alabama',
-             'US-AR': 'Arkansas',
-             'US-AZ': 'Arizona',
-             'US-CA': 'Californie',
-             'US-CO': 'Colorado',
-             'US-CT': 'Connecticut',
-             'US-DC': 'District de Columbia',
-             'US-DE': 'Delaware',
-             'US-FL': 'Floride',
-             'US-GA': u'Géorgie',
-             'US-IL': 'Illinois',
-             'US-IN': 'Indiana',
-             'US-KY': 'Kentucky',
-             'US-LA': 'Louisiane',
-             'US-MA': 'Massachusetts',
-             'US-MD': 'Maryland',
-             'US-MI': 'Michigan',
-             'US-MN': 'Minnesota',
-             'US-MO': 'Missouri',
-             'US-MS': 'Mississippi',
-             'US-MT': 'Montana',
-             'US-NC': 'Caroline du Nord',
-             'US-ND': 'Dakota du Nord',
-             'US-NE': 'Nebraska',
-             'US-NH': 'New Hampshire',
-             'US-NJ': 'New Jersey',
-             'US-NM': 'Nouveau-Mexique',
-             'US-NY': 'New York',
-             'US-OK': 'Oklahoma',
-             'US-PA': 'Pennsylvanie',
-             'US-RI': 'Rhode Island',
-             'US-SC': 'Caroline du Sud',
-             'US-SD': 'Dakota du Sud',
-             'US-TN': 'Tennessee',
-             'US-VA': 'Virginie',
-             'US-VT': 'Vermont',
-             'US-WA': 'Washington',
-             'US-WI': 'Wisconsin',
-             'US-WV': 'Virginie occidentale',
-             'US-WY': 'Wyoming',
-             'UT': 'Utah',
-             'Ut.': 'Utah',
-             'Utah': 'Utah',
-             'VA': 'Virginie',
-             'VT': 'Vermont',
-             'Va.': 'Virginie',
-             'Vt.': 'Vermont',
-             'W. Va.': 'Virginie occidentale',
-             'W. Virg.': 'Virginie occidentale',
-             'W.V.': 'Virginie occidentale',
-             'W.Va.': 'Virginie occidentale',
-             'WA': 'Washington',
-             'WI': 'Wisconsin',
-             'WN': 'Washington',
-             'WS': 'Wisconsin',
-             'WV': 'Virginie occidentale',
-             'WY': 'Wyoming',
-             'Wa.': 'Washington',
-             'Wash.': 'Washington',
-             'Wash. D.C.': 'District de Columbia',
-             'Wi.': 'Wisconsin',
-             'Wis.': 'Wisconsin',
-             'Wisc.': 'Wisconsin',
-             'Wn.': 'Washington',
-             'Wy.': 'Wyoming',
-             'Wyo.': 'Wyoming'}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rl/aligner.py	Thu Dec 19 14:45:56 2013 +0000
@@ -0,0 +1,324 @@
+# -*- coding:utf-8 -*-
+# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+import time
+import logging
+from collections import defaultdict
+
+from scipy import zeros
+from scipy.sparse import lil_matrix
+
+from nazca.utils.dataio import parsefile
+
+
+###############################################################################
+### UTILITY FUNCTIONS #########################################################
+###############################################################################
+def iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique=True):
+    """ Return the aligned pairs
+    """
+    if unique:
+        for refid in global_matched:
+            bestid, _ = sorted(global_matched[refid], key=lambda x:x[1])[0]
+            ref_record = refset[refid]
+            target_record = targetset[bestid]
+            distance = global_mat[refid, bestid] if global_mat is not None else None
+            yield (ref_record[0], refid), (target_record[0], bestid), distance
+    else:
+        for refid in global_matched:
+            for targetid, _ in global_matched[refid]:
+                ref_record = refset[refid]
+                target_record = targetset[targetid]
+                distance = global_mat[refid, targetid] if global_mat is not None else None
+                yield (ref_record[0], refid), (target_record[0], targetid), distance
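+# Note (illustrative): each yielded item is a triple
+# ((ref_id, ref_index), (target_id, target_index), distance), e.g.
+# (('R1', 0), ('T4', 3), 0.25), assuming the record id is stored in the
+# first column of each record.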
+
+
+###############################################################################
+### BASE ALIGNER OBJECT #######################################################
+###############################################################################
+class BaseAligner(object):
+
+    def __init__(self, threshold, processings, normalize_matrix=False):
+        self.threshold = threshold
+        self.processings = processings
+        self.normalize_matrix = normalize_matrix
+        self.ref_normalizer = None
+        self.target_normalizer = None
+        self.blocking = None
+        self.alignments_done = 0
+        self.pairs_found = 0
+        self.nb_comparisons = 0
+        self.nb_blocks = 0
+        self.refset_size = None
+        self.targetset_size = None
+        self.time = None
+        self.logger = logging.getLogger('nazca.aligner')
+
+    def register_ref_normalizer(self, normalizer):
+        """ Register normalizers to be applied
+        before alignment """
+        self.ref_normalizer = normalizer
+
+    def register_target_normalizer(self, normalizer):
+        """ Register normalizers to be applied
+        before alignment """
+        self.target_normalizer = normalizer
+
+    def register_blocking(self, blocking):
+        self.blocking = blocking
+
+    def apply_normalization(self, dataset, normalizer):
+        if normalizer:
+            return normalizer.normalize_dataset(dataset)
+        return dataset
+
+    def compute_distance_matrix(self, refset, targetset,
+                                ref_indexes, target_indexes):
+        """ Compute and return the global alignment matrix.
+        For each `processing` a distance matrix is built, then all the
+        matrices are summed with their own weighting and the result is the
+        global alignment matrix, which is returned.
+        """
+        distmatrix = zeros((len(ref_indexes), len(target_indexes)), dtype='float32')
+        for processing in self.processings:
+            distmatrix += processing.cdist(refset, targetset,
+                                          ref_indexes, target_indexes)
+        return distmatrix
+
+    def threshold_matched(self, distmatrix):
+        """ Return the matched elements within a dictionnary,
+        each key being the indice from X, and the corresponding
+        values being a list of couple (indice from Y, distance)
+        """
+        match = defaultdict(list)
+        if self.normalize_matrix:
+            distmatrix /= distmatrix.max()
+        ind = (distmatrix <= self.threshold).nonzero()
+        indrow = ind[0].tolist()
+        indcol = ind[1].tolist()
+        for (i, j) in zip(indrow, indcol):
+            match[i].append((j, distmatrix[i, j]))
+        return match
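+    # Example (illustrative values, with normalize_matrix left False): with
+    # threshold 0.5, a distance matrix [[0.2, 0.9], [0.6, 0.1]] yields
+    # {0: [(0, 0.2)], 1: [(1, 0.1)]}.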
+
+    def _get_match(self, refset, targetset, ref_indexes=None, target_indexes=None):
+        # Build items
+        items = []
+        ref_indexes = ref_indexes or xrange(len(refset))
+        target_indexes = target_indexes or xrange(len(targetset))
+        # Apply alignments
+        mat = self.compute_distance_matrix(refset, targetset,
+                                           ref_indexes=ref_indexes,
+                                           target_indexes=target_indexes)
+        matched = self.threshold_matched(mat)
+        # Reapply matched to global indexes
+        new_matched = {}
+        for k, values in matched.iteritems():
+            new_matched[ref_indexes[k]] = [(target_indexes[i], d) for i, d in values]
+        return mat, new_matched
+
+    def align(self, refset, targetset, get_matrix=True):
+        """ Perform the alignment on the referenceset
+        and the targetset
+        """
+        start_time = time.time()
+        refset = self.apply_normalization(refset, self.ref_normalizer)
+        targetset = self.apply_normalization(targetset, self.target_normalizer)
+        self.refset_size = len(refset)
+        self.targetset_size = len(targetset)
+        # If no blocking
+        if not self.blocking:
+            return self._get_match(refset, targetset)
+        # Blocking == divide and conquer
+        global_matched = {}
+        global_mat = lil_matrix((len(refset), len(targetset)))
+        self.blocking.fit(refset, targetset)
+        for refblock, targetblock in self.blocking.iter_blocks():
+            self.nb_blocks += 1
+            ref_index = [r[0] for r in refblock]
+            target_index = [r[0] for r in targetblock]
+            self.nb_comparisons += len(ref_index)*len(target_index)
+            _, matched = self._get_match(refset, targetset, ref_index, target_index)
+            for k, values in matched.iteritems():
+                subdict = global_matched.setdefault(k, set())
+                for v, d in values:
+                    subdict.add((v, d))
+                    self.alignments_done += 1
+                    if get_matrix:
+                        # XXX avoid issue in sparse matrix
+                        global_mat[k, v] = d or 10**(-10)
+        self.time = time.time() - start_time
+        return global_mat, global_matched
+
+    def get_aligned_pairs(self, refset, targetset, unique=True):
+        """ Get the pairs of aligned elements
+        """
+        global_mat, global_matched = self.align(refset, targetset, get_matrix=False)
+        for pair in iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique):
+            self.pairs_found += 1
+            yield pair
+        self.log_infos()
+
+    def align_from_files(self, reffile, targetfile,
+                         ref_indexes=None, target_indexes=None,
+                         ref_encoding=None, target_encoding=None,
+                         ref_separator='\t', target_separator='\t',
+                         get_matrix=True):
+        """ Align data from files
+
+        Parameters
+        ----------
+
+        reffile: name of the reference file
+
+        targetfile: name of the target file
+
+        ref_encoding: if given (e.g. 'utf-8' or 'latin-1'), it will
+                      be used to read the reference file.
+
+        target_encoding: if given (e.g. 'utf-8' or 'latin-1'), it will
+                         be used to read the target file.
+
+        ref_separator: separator of the reference file
+
+        target_separator: separator of the target file
+        """
+        refset = parsefile(reffile, indexes=ref_indexes,
+                           encoding=ref_encoding, delimiter=ref_separator)
+        targetset = parsefile(targetfile, indexes=target_indexes,
+                              encoding=target_encoding, delimiter=target_separator)
+        return self.align(refset, targetset, get_matrix=get_matrix)
+
+    def get_aligned_pairs_from_files(self, reffile, targetfile,
+                         ref_indexes=None, target_indexes=None,
+                         ref_encoding=None, target_encoding=None,
+                         ref_separator='\t', target_separator='\t',
+                         unique=True):
+        """ Get the pairs of aligned elements
+        """
+        refset = parsefile(reffile, indexes=ref_indexes,
+                           encoding=ref_encoding, delimiter=ref_separator)
+        targetset = parsefile(targetfile, indexes=target_indexes,
+                              encoding=target_encoding, delimiter=target_separator)
+        global_mat, global_matched = self.align(refset, targetset, get_matrix=False)
+        for pair in iter_aligned_pairs(refset, targetset, global_mat, global_matched, unique):
+            yield pair
+
+    def log_infos(self):
+        """ Display some info on the aligner process
+        """
+        self.logger.info('Computation time : %s' % self.time)
+        self.logger.info('Size reference set : %s' % self.refset_size)
+        self.logger.info('Size target set : %s' % self.targetset_size)
+        self.logger.info('Comparisons done : %s' % self.nb_comparisons)
+        self.logger.info('Alignments done : %s' % self.alignments_done)
+        self.logger.info('Pairs found : %s' % self.pairs_found)
+        self.logger.info('Ratio reference set/alignments done : %s'
+                         % (self.alignments_done/float(self.refset_size)))
+        self.logger.info('Ratio target set/alignments done : %s'
+                         % (self.alignments_done/float(self.targetset_size)))
+        self.logger.info('Ratio reference set/pairs found : %s'
+                         % (self.pairs_found/float(self.refset_size)))
+        self.logger.info('Ratio target set/pairs found : %s'
+                         % (self.pairs_found/float(self.targetset_size)))
+        self.logger.info('Maximum comparisons : %s'
+                         % (self.refset_size * self.targetset_size))
+        self.logger.info('Number of blocks : %s' % self.nb_blocks)
+        if self.nb_blocks:
+            self.logger.info('Ratio comparisons/block : %s'
+                             % (float(self.nb_comparisons)/self.nb_blocks))
+        self.logger.info('Blocking reduction : %s'
+                         % (self.nb_comparisons/float(self.refset_size * self.targetset_size)))
+
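+# Minimal usage sketch (illustrative, not part of this changeset). It assumes
+# a processing object exposing the cdist() API used above, e.g. one of the
+# distance processings from nazca.utils.distances:
+#
+#     refset = [['R1', 'Paris'], ['R2', 'London']]
+#     targetset = [['T1', 'Paris'], ['T2', 'Berlin']]
+#     aligner = BaseAligner(threshold=0.5, processings=(processing,))
+#     for (refid, refind), (targetid, targetind), dist in \
+#             aligner.get_aligned_pairs(refset, targetset):
+#         print refid, targetid, dist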
+
+###############################################################################
+### PIPELINE ALIGNER OBJECT ##################################################
+###############################################################################
+class PipelineAligner(object):
+    """ This pipeline will perform iterative alignments, removing each time
+    the aligned results from the previous aligner.
+    """
+
+    def __init__(self, aligners):
+        self.aligners = aligners
+        self.pairs = {}
+        self.nb_comparisons = 0
+        self.nb_blocks = 0
+        self.alignments_done = 0
+        self.pairs_found = 0
+        self.refset_size = None
+        self.targetset_size = None
+        self.time = None
+        self.logger = logging.getLogger('nazca.aligner')
+
+    def get_aligned_pairs(self, refset, targetset, unique=True):
+        """ Get the pairs of aligned elements
+        """
+        start_time = time.time()
+        ref_index = range(len(refset))
+        target_index = range(len(targetset))
+        self.refset_size = len(refset)
+        self.targetset_size = len(targetset)
+        global_matched = {}
+        global_mat = lil_matrix((len(refset), len(targetset)))
+        seen_refset = set()
+        # Iteration over aligners
+        for ind_aligner, aligner in enumerate(self.aligners):
+            # Perform alignment
+            _refset = [refset[i] for i in ref_index]
+            _targetset = [targetset[i] for i in target_index]
+            for pair in aligner.get_aligned_pairs(_refset, _targetset, unique):
+                self.pairs_found += 1
+                pair = ((pair[0][0], ref_index[pair[0][1]]),
+                        (pair[1][0], target_index[pair[1][1]]))
+                yield pair
+                seen_refset.add(pair[0][1])
+            # Store stats
+            self.nb_blocks += aligner.nb_blocks
+            self.nb_comparisons += aligner.nb_comparisons
+            # Update indexes if necessary
+            # For now, we remove all the reference records that are already matched
+            if ind_aligner < len(self.aligners) - 1:
+                # There are other aligners after this one
+                ref_index = [i for i in ref_index if i not in seen_refset]
+        self.time = time.time() - start_time
+        self.log_infos()
+
+    def log_infos(self):
+        """ Display some info on the aligner process
+        """
+        self.logger.info('Computation time : %s' % self.time)
+        self.logger.info('Size reference set : %s' % self.refset_size)
+        self.logger.info('Size target set : %s' % self.targetset_size)
+        self.logger.info('Comparisons done : %s' % self.nb_comparisons)
+        self.logger.info('Alignments done : %s' % self.alignments_done)
+        self.logger.info('Pairs found : %s' % self.pairs_found)
+        self.logger.info('Ratio reference set/alignments done : %s'
+                         % (self.alignments_done/float(self.refset_size)))
+        self.logger.info('Ratio target set/alignments done : %s'
+                         % (self.alignments_done/float(self.targetset_size)))
+        self.logger.info('Ratio reference set/pairs found : %s'
+                         % (self.pairs_found/float(self.refset_size)))
+        self.logger.info('Ratio target set/pairs found : %s'
+                         % (self.pairs_found/float(self.targetset_size)))
+        self.logger.info('Maximum comparisons : %s'
+                         % (self.refset_size * self.targetset_size))
+        self.logger.info('Number of blocks : %s' % self.nb_blocks)
+        if self.nb_blocks:
+            self.logger.info('Ratio comparisons/block : %s'
+                             % (float(self.nb_comparisons)/self.nb_blocks))
+        self.logger.info('Blocking reduction : %s'
+                         % (self.nb_comparisons/float(self.refset_size * self.targetset_size)))
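+# Usage sketch (illustrative): chain a strict aligner with a looser fallback;
+# reference records matched by the first aligner are removed before the second
+# one runs. Both aligners are hypothetical BaseAligner instances:
+#
+#     pipeline = PipelineAligner([strict_aligner, loose_aligner])
+#     for pair in pipeline.get_aligned_pairs(refset, targetset):
+#         print pair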
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rl/blocking.py	Thu Dec 19 14:45:56 2013 +0000
@@ -0,0 +1,666 @@
+# -*- coding:utf-8 -*-
+# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+""" Blocking techniques.
+
+This module implements a set of blocking techniques used to split
+datasets into smaller subsets that will be aligned in more detail.
+
+Additional information:
+
+   P. Christen, Data Matching, Data-Centric Systems and Applications,
+   Springer, 2012.
+"""
+from functools import partial
+import warnings
+
+from scipy.spatial import KDTree
+
+from nazca.utils.minhashing import Minlsh
+from nazca.utils.distances import soundexcode
+
+
+###############################################################################
+### GENERAL BLOCKING ##########################################################
+###############################################################################
+class BaseBlocking(object):
+    """ An abstract general blocking object that exposes
+    the API that should be common to all blocking objects
+    """
+    def __init__(self, ref_attr_index, target_attr_index):
+        """ Build the blocking object
+
+        Parameters
+        ----------
+
+        ref_attr_index: index of the attribute of interest in a record
+                        for the reference dataset
+                        (i.e. attribute to be used for key computation)
+
+        target_attr_index: index of the attribute of interest in a record
+                           for the target dataset
+                           (i.e. attribute to be used for key computation)
+        """
+        self.ref_attr_index = ref_attr_index
+        self.target_attr_index = target_attr_index
+        self.refids = None
+        self.targetids = None
+        self.is_fitted = False
+
+    def _fit(self, refset, targetset):
+        raise NotImplementedError
+
+    def _iter_blocks(self):
+        """ Internal iteration function over blocks
+        """
+        raise NotImplementedError
+
+    def _cleanup(self):
+        """ Internal cleanup blocking for further use (e.g. in pipeline)
+        """
+        raise NotImplementedError
+
+    def fit(self, refset, targetset):
+        """ Fit the blocking technique on the reference and target datasets
+
+        Parameters
+        ----------
+        refset: a dataset (list of records)
+
+        targetset: a dataset (list of records)
+        """
+        self._fit(refset, targetset)
+        # Keep ids for blocks building
+        self.refids = [(i, r[0]) for i, r in enumerate(refset)]
+        self.targetids = [(i, r[0]) for i, r in enumerate(targetset)]
+        self.is_fitted = True
+
+    def iter_blocks(self):
+        """ Iterator over the different possible blocks.
+
+        Returns
+        -------
+
+        (block1, block2): The blocks are always (reference_block, target_block)
+                          and contain the (index, id) pairs of the records in the
+                          corresponding dataset.
+        """
+        assert self.is_fitted
+        return self._iter_blocks()
+
+    def iter_indice_blocks(self):
+        """ Iterator over the different possible blocks.
+
+        Returns
+        -------
+
+        (block1, block2): The blocks are always (reference_block, target_block)
+                          and contain the indexes of the records in the
+                          corresponding dataset.
+        """
+        assert self.is_fitted
+        for block1, block2 in self._iter_blocks():
+            yield [r[0] for r in block1], [r[0] for r in block2]
+
+    def iter_id_blocks(self):
+        """ Iterator over the different possible blocks.
+
+        Returns
+        -------
+
+        (block1, block2): The blocks are always (reference_block, target_block)
+                          and contain the ids of the records in the
+                          corresponding dataset.
+        """
+        assert self.is_fitted
+        for block1, block2 in self._iter_blocks():
+            yield [r[1] for r in block1], [r[1] for r in block2]
+
+    def iter_pairs(self):
+        """ Iterator over the different possible pairs.
+
+        Returns
+        -------
+
+        (pair1, pair2): The pairs are always ((ind_reference, id_reference),
+                                              (ind_target, id_target))
+                        i.e. the (index, id) of the records in the corresponding dataset.
+        """
+        assert self.is_fitted
+        for block1, block2 in self.iter_blocks():
+            for val1 in block1:
+                for val2 in block2:
+                    yield val1, val2
+
+    def iter_indice_pairs(self):
+        """ Iterator over the different possible pairs.
+
+        Returns
+        -------
+
+        (pair1, pair2): The pairs are always (ind_reference, ind_target)
+                        and are the indexes of the records in the corresponding dataset.
+        """
+        assert self.is_fitted
+        for block1, block2 in self.iter_indice_blocks():
+            for val1 in block1:
+                for val2 in block2:
+                    yield val1, val2
+
+    def iter_id_pairs(self):
+        """ Iterator over the different possible pairs.
+
+        Returns
+        -------
+
+        (pair1, pair2): The pairs are always (id_reference, id_target)
+                        and are the ids of the records in the corresponding dataset.
+        """
+        assert self.is_fitted
+        for block1, block2 in self.iter_id_blocks():
+            for val1 in block1:
+                for val2 in block2:
+                    yield val1, val2
+
+    def cleanup(self):
+        """ Cleanup blocking for further use (e.g. in pipeline)
+        """
+        self.is_fitted = False  # a new fit is required after cleanup
+        self._cleanup()
+
+
+###############################################################################
+### KEY BLOCKING ##############################################################
+###############################################################################
+class KeyBlocking(BaseBlocking):
+    """ This blocking technique is based on a a blocking criteria
+    (or blocking key), that will be used to divide the datasets.
+
+    The main idea here is:
+
+    1 - to create an index of f(x) for each x in the reference set.
+
+    2 - to create an index of f(y) for each y in the target set.
+
+    3 - to iterate on each distinct value of f(x) and to return
+        the identifiers of the records of both sets for this value.
+    """
+
+    def __init__(self, ref_attr_index, target_attr_index, callback, ignore_none=False):
+        super(KeyBlocking, self).__init__(ref_attr_index, target_attr_index)
+        self.callback = callback
+        self.ignore_none = ignore_none
+        self.reference_index = {}
+        self.target_index = {}
+
+    def _fit(self, refset, targetset):
+        """ Fit a dataset in an index using the callback
+        """
+        for ind, rec in enumerate(refset):
+            key = self.callback(rec[self.ref_attr_index])
+            if not key and self.ignore_none:
+                continue
+            self.reference_index.setdefault(key, []).append((ind, rec[0]))
+        for ind, rec in enumerate(targetset):
+            key = self.callback(rec[self.target_attr_index])
+            if not key and self.ignore_none:
+                continue
+            self.target_index.setdefault(key, []).append((ind, rec[0]))
+
+    def _iter_blocks(self):
+        """ Iterator over the different possible blocks.
+
+        Returns
+        -------
+
+        (block1, block2): The blocks are always (reference_block, target_block)
+                          and contain the indexes of the records in the
+                          corresponding dataset.
+        """
+        for key, block1 in self.reference_index.iteritems():
+            block2 = self.target_index.get(key)
+            if block1 and block2:
+                yield (block1, block2)
+
+    def _cleanup(self):
+        """ Cleanup blocking for further use (e.g. in pipeline)
+        """
+        self.reference_index = {}
+        self.target_index = {}
+
+
+class SoundexBlocking(KeyBlocking):
+
+    def __init__(self, ref_attr_index, target_attr_index, language='french',):
+        super(SoundexBlocking, self).__init__(ref_attr_index, target_attr_index,
+                                              partial(soundexcode, language=language))
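+# Usage sketch (illustrative): block records on the phonetic code of the name
+# attribute, assumed here to be at index 1 of each record:
+#
+#     blocking = SoundexBlocking(ref_attr_index=1, target_attr_index=1)
+#     blocking.fit(refset, targetset)
+#     for refblock, targetblock in blocking.iter_blocks():
+#         ...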
+
+
+###############################################################################
+### BIGRAM BLOCKING ###########################################################
+###############################################################################
+class NGramBlocking(BaseBlocking):
+    """ This blocking technique is based on a a n-gram key.
+    """
+
+    def __init__(self, ref_attr_index, target_attr_index, ngram_size=2, depth=2):
+        super(NGramBlocking, self).__init__(ref_attr_index, target_attr_index)
+        self.ngram_size = ngram_size
+        self.depth = depth
+        self.reference_index = {}
+        self.target_index = {}
+
+    def _fit_dataset(self, dataset, cur_index, attr_index):
+        """ Fit a dataset
+        """
+        for ind, r in enumerate(dataset):
+            cur_dict = cur_index
+            text = r[attr_index]
+            for i in range(self.depth):
+                ngram = text[i*self.ngram_size:(i+1)*self.ngram_size]
+                if i < self.depth - 1:
+                    cur_dict = cur_dict.setdefault(ngram, {})
+            cur_dict.setdefault(ngram, []).append((ind, r[0]))
+
+    def _fit(self, refset, targetset):
+        """ Fit the two sets (reference set and target set)
+        """
+        self._fit_dataset(refset, self.reference_index, self.ref_attr_index)
+        self._fit_dataset(targetset, self.target_index, self.target_attr_index)
+
+    def _iter_dict(self, ref_cur_dict, target_cur_dict):
+        """ Iterative function used to create blocks from dicts
+        """
+        for key, sub_dict in ref_cur_dict.iteritems():
+            if key in target_cur_dict:
+                if isinstance(sub_dict, dict):
+                    # There is another dict layer
+                    for block1, block2 in self._iter_dict(sub_dict, target_cur_dict[key]):
+                        yield block1, block2
+                else:
+                    # This is a list
+                    yield sub_dict, target_cur_dict[key]
+
+    def _iter_blocks(self):
+        """ Iterator over the different possible blocks.
+
+        Returns
+        -------
+
+        (block1, block2): The blocks are always (reference_block, target_block)
+                          and contain the indexes of the records in the
+                          corresponding dataset.
+        """
+        for block1, block2 in self._iter_dict(self.reference_index, self.target_index):
+            if block1 and block2:
+                yield block1, block2
+
+    def _cleanup(self):
+        """ Cleanup blocking for further use (e.g. in pipeline)
+        """
+        self.reference_index = {}
+        self.target_index = {}
+
+
+###############################################################################
+### SORTKEY BLOCKING ##########################################################
+###############################################################################
+class SortedNeighborhoodBlocking(BaseBlocking):
+    """ This blocking technique is based on a a sorting blocking criteria
+    (or blocking key), that will be used to divide the datasets.
+    """
+
+    def __init__(self, ref_attr_index, target_attr_index, key_func=lambda x: x, window_width=20):
+        super(SortedNeighborhoodBlocking, self).__init__(ref_attr_index, target_attr_index)
+        self.key_func = key_func
+        self.window_width = window_width
+        self.sorted_dataset = None
+
+    def _fit(self, refset, targetset):
+        """ Fit a dataset in an index using the callback
+        """
+        self.sorted_dataset = [((ind, r[0]), r[self.ref_attr_index], 0)
+                               for ind, r in enumerate(refset)]
+        self.sorted_dataset.extend([((ind, r[0]), r[self.target_attr_index], 1)
+                                    for ind, r in enumerate(targetset)])
+        self.sorted_dataset.sort(key=lambda x: self.key_func(x[1]))
+
+    def _iter_blocks(self):
+        """ Iterator over the different possible blocks.
+        """
+        for ind, (rid, record, dset) in enumerate(self.sorted_dataset):
+            # Only keep reference set record
+            if dset == 1:
+                continue
+            block1 = [rid,]
+            minind = max(ind - self.window_width, 0)
+            maxind = ind + self.window_width + 1
+            block2 = [ri for ri, re, d in self.sorted_dataset[minind:maxind]
+                      if d == 1]
+            if block1 and block2:
+                yield (block1, block2)
+
+    def _cleanup(self):
+        """ Cleanup blocking for further use (e.g. in pipeline)
+        """
+        self.sorted_dataset = None
+
+
+###############################################################################
+### MERGE BLOCKING ############################################################
+###############################################################################
+class MergeBlocking(BaseBlocking):
+    """ This blocking technique keep only one appearance of one given values,
+    and removes all the other records having this value.
+    The merge is based on a score function
+
+    E.g.
+      ('http://fr.wikipedia.org/wiki/Paris_%28Texas%29', 'Paris', 25898)
+      ('http://fr.wikipedia.org/wiki/Paris', 'Paris', 12223100)
+
+    could be (with a score function based on the population (third value):
+
+      ('http://fr.wikipedia.org/wiki/Paris', 'Paris', 12223100)
+
+    !!! WARNING !!! This is only done on ONE set (the one with a non null attr index)
+    """
+
+    def __init__(self, ref_attr_index, target_attr_index, score_func):
+        super(MergeBlocking, self).__init__(ref_attr_index, target_attr_index)
+        self.score_func = score_func
+        self.merged_dataset = None
+        self.other_dataset = None
+        if ref_attr_index is None and target_attr_index is None:
+            raise ValueError('At least one of ref_attr_index or target_attr_index '
+                             'should not be None')
+
+    def _fit(self, refset, targetset):
+        """ Fit a dataset in an index using the callback
+        """
+        if self.ref_attr_index is not None:
+            # Merge refset
+            self.merged_dataset = self._merge_dataset(refset, self.ref_attr_index)
+            self.other_dataset = [(ind, r[0]) for ind, r in enumerate(targetset)]
+        else:
+            # Merge targetset
+            self.merged_dataset = self._merge_dataset(targetset, self.target_attr_index)
+            self.other_dataset = [(ind, r[0]) for ind, r in enumerate(refset)]
+
+    def _merge_dataset(self, dataset, attr_index):
+        """ Merge a dataset
+        """
+        merged_dataset_dict = {}
+        for ind, record in enumerate(dataset):
+            score = self.score_func(record)
+            if record[attr_index] not in merged_dataset_dict:
+                # Create new entry
+                merged_dataset_dict[record[attr_index]] = (ind, record, score)
+            elif merged_dataset_dict[record[attr_index]][2] < score:
+                # Keep the record with the highest score
+                merged_dataset_dict[record[attr_index]] = (ind, record, score)
+        return [(ind, r[0]) for ind, r, score in merged_dataset_dict.itervalues()]
+
+    def _iter_blocks(self):
+        """ Iterator over the different possible blocks.
+        """
+        if self.ref_attr_index is not None:
+            yield self.merged_dataset, self.other_dataset
+        else:
+            # self.target_attr_index is not None
+            yield self.other_dataset, self.merged_dataset
+
+    def _cleanup(self):
+        """ Cleanup blocking for further use (e.g. in pipeline)
+        """
+        self.merged_dataset = None
+        self.other_dataset = None
+
+
+###############################################################################
+### CLUSTERING-BASED BLOCKINGS ################################################
+###############################################################################
+class KmeansBlocking(BaseBlocking):
+    """ A blocking technique based on Kmeans
+    """
+
+    def __init__(self, ref_attr_index, target_attr_index, n_clusters=None):
+        super(KmeansBlocking, self).__init__(ref_attr_index, target_attr_index)
+        self.n_clusters = n_clusters
+        self.kmeans = None
+        self.predicted = None
+        from sklearn import cluster
+        self.cluster_class = cluster.KMeans
+
+    def _fit(self, refset, targetset):
+        """ Fit the reference dataset.
+        """
+        # If an element is None (missing), use instead the identity element.
+        # The identity element is defined as the 0-vector
+        idelement = tuple([0 for _ in xrange(len(refset[0][self.ref_attr_index]))])
+        # We assume here that there are at least 2 elements in the refset
+        n_clusters = self.n_clusters or (len(refset)/10 or len(refset)/2)
+        kmeans = self.cluster_class(n_clusters=n_clusters)
+        kmeans.fit([elt[self.ref_attr_index] or idelement for elt in refset])
+        self.kmeans = kmeans
+        # Predict on targetset
+        self.predicted = self.kmeans.predict([elt[self.target_attr_index]
+                                              or idelement for elt in targetset])
+
+    def _iter_blocks(self):
+        """ Iterator over the different possible blocks.
+
+        Returns
+        -------
+
+        (block1, block2): The blocks are always (reference_block, target_block)
+                          and contain the indexes of the records in the
+                          corresponding dataset.
+        """
+        neighbours = [[[], []] for _ in xrange(self.kmeans.n_clusters)]
+        for ind, li in enumerate(self.predicted):
+            neighbours[li][1].append(self.targetids[ind])
+        for ind, li in enumerate(self.kmeans.labels_):
+            neighbours[li][0].append(self.refids[ind])
+        for block1, block2 in neighbours:
+            if len(block1) and len(block2):
+                yield block1, block2
+
+    def _cleanup(self):
+        """ Cleanup blocking for further use (e.g. in pipeline)
+        """
+        self.kmeans = None
+        self.predicted = None
+
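+# Example (illustrative sketch; assumed layout: id at column 0, a numeric
+# vector at column 1): records are clustered on the vector attribute and
+# candidate pairs are only drawn from within a cluster:
+#
+#     refset = [('R1', (0., 0.)), ('R2', (10., 10.)), ('R3', (0.2, 0.1))]
+#     targetset = [('T1', (0.1, 0.1)), ('T2', (9.9, 9.8))]
+#     blocking = KmeansBlocking(ref_attr_index=1, target_attr_index=1,
+#                               n_clusters=2)
+#     blocking.fit(refset, targetset)
+#     blocks = list(blocking.iter_blocks())  # [(ref_block, target_block), ...]
+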
+
+###############################################################################
+### KDTREE BLOCKINGS ##########################################################
+###############################################################################
+class KdTreeBlocking(BaseBlocking):
+    """ A blocking technique based on KdTree
+    """
+    def __init__(self, ref_attr_index, target_attr_index, threshold=0.1):
+        super(KdTreeBlocking, self).__init__(ref_attr_index, target_attr_index)
+        self.threshold = threshold
+        self.reftree = None
+        self.targettree = None
+        self.nb_elements = None
+
+    def _fit(self, refset, targetset):
+        """ Fit the blocking
+        """
+        firstelement = refset[0][self.ref_attr_index]
+        self.nb_elements = len(refset)
+        idsize = len(firstelement) if isinstance(firstelement, (tuple, list)) else 1
+        idelement = (0,) * idsize
+        # KDTree is expecting a two-dimensional array
+        if idsize == 1:
+            # NB: a one-element tuple is always truthy, so test for None
+            # explicitly to fall back on the identity element
+            self.reftree = KDTree([(elt[self.ref_attr_index],)
+                                   if elt[self.ref_attr_index] is not None
+                                   else idelement for elt in refset])
+            self.targettree = KDTree([(elt[self.target_attr_index],)
+                                      if elt[self.target_attr_index] is not None
+                                      else idelement for elt in targetset])
+        else:
+            self.reftree = KDTree([elt[self.ref_attr_index] or idelement for elt in refset])
+            self.targettree = KDTree([elt[self.target_attr_index] or idelement for elt in targetset])
+
+    def _iter_blocks(self):
+        """ Iterator over the different possible blocks.
+
+        Returns
+        -------
+
+        (block1, block2): The blocks are always (reference_block, target_block)
+                          and contain the indexes of the records in the
+                          corresponding dataset.
+        """
+        extraneighbours = self.reftree.query_ball_tree(self.targettree, self.threshold)
+        neighbours = []
+        for ind in xrange(self.nb_elements):
+            if not extraneighbours[ind]:
+                continue
+            _ref = [self.refids[ind],]
+            _target = [self.targetids[v] for v in extraneighbours[ind]]
+            neighbours.append((_ref, _target))
+        for block1, block2 in neighbours:
+            if len(block1) and len(block2):
+                yield block1, block2
+
+    def _cleanup(self):
+        """ Cleanup blocking for further use (e.g. in pipeline)
+        """
+        self.reftree = None
+        self.targettree = None
+        self.nb_elements = None
+
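+# Example (illustrative sketch, assuming numeric attributes such as
+# geographical coordinates at column 1): every target record within the
+# distance threshold of a reference record ends up in its block:
+#
+#     refset = [('R1', (48.85, 2.35)), ('R2', (45.75, 4.85))]
+#     targetset = [('T1', (48.86, 2.36)), ('T2', (45.76, 4.84))]
+#     blocking = KdTreeBlocking(ref_attr_index=1, target_attr_index=1,
+#                               threshold=0.1)
+#     blocking.fit(refset, targetset)
+#     blocks = list(blocking.iter_blocks())
+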
+
+###############################################################################
+### MINHASHING BLOCKINGS ######################################################
+###############################################################################
+class MinHashingBlocking(BaseBlocking):
+    """ A blocking technique based on MinHashing
+    """
+    def __init__(self, ref_attr_index, target_attr_index,
+                 threshold=0.1, kwordsgram=1, siglen=200):
+        super(MinHashingBlocking, self).__init__(ref_attr_index, target_attr_index)
+        self.threshold = threshold
+        self.kwordsgram = kwordsgram
+        self.siglen = siglen
+        self.minhasher = Minlsh()
+        self.nb_elements = None
+
+    def _fit(self, refset, targetset):
+        """ Find the blocking using minhashing
+        """
+        # If an element is None (missing), use instead the identity element.
+        idelement = ''
+        self.minhasher.train([elt[self.ref_attr_index] or idelement for elt in refset] +
+                             [elt[self.target_attr_index] or idelement for elt in targetset],
+                             self.kwordsgram, self.siglen)
+        self.nb_elements = len(refset)
+
+    def _iter_blocks(self):
+        """ Iterator over the different possible blocks.
+
+        Returns
+        -------
+
+        (block1, block2): The blocks are always (reference_block, target_block)
+                          and contain the indexes of the records in the
+                          corresponding dataset.
+        """
+        rawneighbours = self.minhasher.predict(self.threshold)
+        neighbours = []
+        for data in rawneighbours:
+            neighbours.append([[], []])
+            for i in data:
+                if i >= self.nb_elements:
+                    neighbours[-1][1].append(self.targetids[i - self.nb_elements])
+                else:
+                    neighbours[-1][0].append(self.refids[i])
+            if len(neighbours[-1][0]) == 0 or len(neighbours[-1][1]) == 0:
+                neighbours.pop()
+        for block1, block2 in neighbours:
+            if len(block1) and len(block2):
+                yield block1, block2
+
+    def _cleanup(self):
+        """ Cleanup blocking for further use (e.g. in pipeline)
+        """
+        self.minhasher = Minlsh()
+        self.nb_elements = None
+
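+# Example (illustrative sketch, assuming textual attributes at column 1):
+# records whose minhash signatures are similar enough end up in the same
+# block; the threshold value here is only indicative:
+#
+#     refset = [('R1', 'victor hugo'), ('R2', 'albert camus')]
+#     targetset = [('T1', 'victor hugo'), ('T2', 'a camus')]
+#     blocking = MinHashingBlocking(ref_attr_index=1, target_attr_index=1,
+#                                   threshold=0.4)
+#     blocking.fit(refset, targetset)
+#     blocks = list(blocking.iter_blocks())
+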
+
+###############################################################################
+### BLOCKING PIPELINE #########################################################
+###############################################################################
+class PipelineBlocking(BaseBlocking):
+    """ Pipeline multiple blocking techniques
+    """
+
+    def __init__(self, blockings, collect_stats=False):
+        """ Build the blocking object
+
+        Parameters
+        ----------
+
+        blockings: ordered list of blocking objects
+
+        collect_stats: if True, the sizes of the blocks yielded by each
+                       blocking step are collected in the stats dictionary,
+                       keyed by the step index
+        """
+        self.blockings = blockings
+        self.stored_blocks = []
+        self.collect_stats = collect_stats
+        self.stats = {}
+
+    def _fit(self, refset, targetset):
+        """ Internal fit of the pipeline """
+        self._recursive_fit(refset, targetset, range(len(refset)), range(len(targetset)), 0)
+
+    def _recursive_fit(self, refset, targetset, ref_index, target_index, ind):
+        """ Recursive fit of the blockings.
+        Blocks are stored in the stored_blocks attribute.
+        """
+        if ind < len(self.blockings) - 1:
+            # There are other blockings after this one
+            blocking = self.blockings[ind]
+            blocking.cleanup()
+            blocking.fit([refset[i] for i in ref_index],
+                         [targetset[i] for i in target_index])
+            for block1, block2 in blocking.iter_indice_blocks():
+                ind_block1 = [ref_index[i] for i in block1]
+                ind_block2 = [target_index[i] for i in block2]
+                if self.collect_stats:
+                    self.stats.setdefault(ind, []).append((len(block1), len(block2)))
+                self._recursive_fit(refset, targetset, ind_block1, ind_block2, ind+1)
+        else:
+            # This is the final blocking
+            blocking = self.blockings[ind]
+            blocking.cleanup()
+            blocking.fit([refset[i] for i in ref_index],
+                         [targetset[i] for i in target_index])
+            for block1, block2 in blocking.iter_blocks():
+                ind_block1 = [(ref_index[i], _id) for i, _id in block1]
+                ind_block2 = [(target_index[i], _id) for i, _id in block2]
+                if self.collect_stats:
+                    self.stats.setdefault(ind, []).append((len(block1), len(block2)))
+                self.stored_blocks.append((ind_block1, ind_block2))
+
+    def _iter_blocks(self):
+        """ Internal iteration function over blocks
+        """
+        for block1, block2 in self.stored_blocks:
+            if block1 and block2:
+                yield block1, block2
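+
+
+# Example (illustrative sketch; the exact constructor parameters of
+# NGramBlocking are assumed here, not taken from its definition): chain a
+# coarse blocking with a finer one, collecting intermediate block sizes:
+#
+#     blocking = PipelineBlocking((NGramBlocking(1, 1, ngram_size=3),
+#                                  MinHashingBlocking(1, 1, threshold=0.2)),
+#                                 collect_stats=True)
+#     blocking.fit(refset, targetset)
+#     for ref_block, target_block in blocking.iter_blocks():
+#         pass  # each block element is an (index, record_id) pair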
--- a/test/test_alignment.py	Thu Dec 19 14:45:43 2013 +0000
+++ b/test/test_alignment.py	Thu Dec 19 14:45:56 2013 +0000
@@ -22,8 +22,8 @@
 from os import path
 
 from nazca.utils.normalize import simplify
-import nazca.record_linkage.aligner as alig
-import nazca.record_linkage.blocking as blo
+import nazca.rl.aligner as alig
+import nazca.rl.blocking as blo
 from nazca.utils.distances import LevenshteinProcessing, GeographicalProcessing
 
 
--- a/test/test_blocking.py	Thu Dec 19 14:45:43 2013 +0000
+++ b/test/test_blocking.py	Thu Dec 19 14:45:56 2013 +0000
@@ -23,11 +23,11 @@
 
 from nazca.utils.distances import (levenshtein, soundex, soundexcode,   \
                                        jaccard, euclidean, geographical)
-from nazca.record_linkage.blocking import (KeyBlocking, SortedNeighborhoodBlocking,
-                                           MergeBlocking,
-                                           NGramBlocking, PipelineBlocking,
-                                           SoundexBlocking, KmeansBlocking,
-                                           MinHashingBlocking, KdTreeBlocking)
+from nazca.rl.blocking import (KeyBlocking, SortedNeighborhoodBlocking,
+                               MergeBlocking,
+                               NGramBlocking, PipelineBlocking,
+                               SoundexBlocking, KmeansBlocking,
+                               MinHashingBlocking, KdTreeBlocking)
 from nazca.utils.normalize import SimplifyNormalizer, loadlemmas
 
 
--- a/test/test_dataio.py	Thu Dec 19 14:45:43 2013 +0000
+++ b/test/test_dataio.py	Thu Dec 19 14:45:56 2013 +0000
@@ -25,8 +25,8 @@
 from nazca.utils.dataio import (HTMLPrettyPrint, ValidXHTMLPrettyPrint,
                                 sparqlquery, rqlquery, parsefile,
                                 autocast, split_file)
-from nazca.named_entities import NerProcess
-from nazca.named_entities.sources import NerSourceLexicon
+from nazca.ner import NerProcess
+from nazca.ner.sources import NerSourceLexicon
 
 TESTDIR = path.dirname(__file__)
 
--- a/test/test_filters.py	Thu Dec 19 14:45:43 2013 +0000
+++ b/test/test_filters.py	Thu Dec 19 14:45:56 2013 +0000
@@ -17,8 +17,11 @@
 # with this program. If not, see <http://www.gnu.org/licenses/>.
 import unittest2
 
-from nazca.named_entities import named_entities as core, filters
-from nazca.named_entities.sources import NerSourceLexicon
+from nazca.ner import NerProcess
+from nazca.ner.filters import (NerOccurenceFilter,
+                               NerReplacementRulesFilter,
+                               NerDisambiguationWordParts)
+from nazca.ner.sources import NerSourceLexicon
 from nazca.utils.tokenizer import Token, Sentence
 
 
@@ -31,8 +34,8 @@
         source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
                                     'me': 'http://example.com/me'})
         source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
-        _filter = filters.NerOccurenceFilter(min_occ=2)
-        ner = core.NerProcess((source1, source2), filters=(_filter,))
+        _filter = NerOccurenceFilter(min_occ=2)
+        ner = NerProcess((source1, source2), filters=(_filter,))
         named_entities = ner.process_text(text)
         self.assertEqual(named_entities,
                          [('http://example.com/me', None,
@@ -54,8 +57,8 @@
         source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
                                     'me': 'http://example.com/me'})
         source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
-        _filter = filters.NerOccurenceFilter(max_occ=1)
-        ner = core.NerProcess((source1, source2), filters=(_filter,))
+        _filter = NerOccurenceFilter(max_occ=1)
+        ner = NerProcess((source1, source2), filters=(_filter,))
         named_entities = ner.process_text(text)
         self.assertEqual(named_entities,
                          [('http://example.com/everyone', None,
@@ -67,8 +70,8 @@
         text = 'Hello toto tutu. And toto.'
         source = NerSourceLexicon({'toto tutu': 'http://example.com/toto_tutu',
                                    'toto': 'http://example.com/toto'})
-        _filter = filters.NerDisambiguationWordParts()
-        ner = core.NerProcess((source,), filters=(_filter,))
+        _filter = NerDisambiguationWordParts()
+        ner = NerProcess((source,), filters=(_filter,))
         named_entities = ner.process_text(text)
         self.assertEqual(named_entities,
                          [('http://example.com/toto_tutu', None,
@@ -84,8 +87,8 @@
         source = NerSourceLexicon({'toto tutu': 'http://example.com/toto_tutu',
                                    'toto': 'http://example.com/toto'})
         rules = {'http://example.com/toto': 'http://example.com/tata'}
-        _filter = filters.NerReplacementRulesFilter(rules)
-        ner = core.NerProcess((source,), filters=(_filter,))
+        _filter = NerReplacementRulesFilter(rules)
+        ner = NerProcess((source,), filters=(_filter,))
         named_entities = ner.process_text(text)
         self.assertEqual(named_entities,
                          [('http://example.com/toto_tutu', None,
--- a/test/test_named_entities.py	Thu Dec 19 14:45:43 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,230 +0,0 @@
-# -*- coding:utf-8 -*-
-#
-# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
-# contact http://www.logilab.fr -- mailto:contact@logilab.fr
-#
-# This program is free software: you can redistribute it and/or modify it under
-# the terms of the GNU Lesser General Public License as published by the Free
-# Software Foundation, either version 2.1 of the License, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
-# details.
-#
-# You should have received a copy of the GNU Lesser General Public License along
-# with this program. If not, see <http://www.gnu.org/licenses/>.
-import unittest2
-
-from nazca.named_entities.sources import (NerSourceLexicon,
-                                          NerSourceSparql,
-                                          NerSourceRql)
-from nazca.named_entities import NerProcess
-from nazca.utils.tokenizer import Token, Sentence
-from nazca.named_entities.preprocessors import NerStopwordsFilterPreprocessor
-
-
-class NerTest(unittest2.TestCase):
-    """ Test of Ner """
-
-    def test_lexicon_source(self):
-        """ Test lexicon source """
-        lexicon = {'everyone': 'http://example.com/everyone',
-                   'me': 'http://example.com/me'}
-        source = NerSourceLexicon(lexicon)
-        self.assertEqual(source.query_word('me'), ['http://example.com/me',])
-        self.assertEqual(source.query_word('everyone'), ['http://example.com/everyone',])
-        self.assertEqual(source.query_word('me everyone'), [])
-        self.assertEqual(source.query_word('toto'), [])
-        # Token
-        token = Token('me', 0, 2, None)
-        self.assertEqual(source.recognize_token(token), ['http://example.com/me',])
-        token = Token('ma', 0, 2, None)
-        self.assertEqual(source.recognize_token(token), [])
-
-    def test_rql_source(self):
-        """ Test rql source """
-        source = NerSourceRql('http://www.cubicweb.org',
-                              'Any U LIMIT 1 WHERE X cwuri U, X name "%(word)s"')
-        self.assertEqual(source.query_word('apycot'), [u'http://www.cubicweb.org/1310453',])
-
-    def test_sparql_source(self):
-        """ Test sparql source """
-        source = NerSourceSparql(u'http://dbpedia.org/sparql',
-                                 u'''SELECT DISTINCT ?uri
-                                     WHERE{
-                                     ?uri rdfs:label "%(word)s"@en .
-                                     ?uri rdf:type ?type}''')
-        self.assertEqual(source.query_word('Python'),
-                         [u'http://dbpedia.org/resource/Python',
-                          u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
-                          u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'])
-
-    def test_ner_process(self):
-        """ Test ner process """
-        text = 'Hello everyone, this is   me speaking. And me.'
-        source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
-                                   'me': 'http://example.com/me'})
-        ner = NerProcess((source,))
-        named_entities = ner.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone', None,
-                           Token(word='everyone', start=6, end=14,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
-
-    def test_ner_process_multisources(self):
-        """ Test ner process """
-        text = 'Hello everyone, this is   me speaking. And me.'
-        source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
-                                    'me': 'http://example.com/me'})
-        source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
-        # Two sources, not unique
-        ner = NerProcess((source1, source2))
-        named_entities = ner.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone', None,
-                           Token(word='everyone', start=6, end=14,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
-        # Two sources, unique
-        ner = NerProcess((source1, source2), unique=True)
-        named_entities = ner.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone', None,
-                           Token(word='everyone', start=6, end=14,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
-        # Two sources inversed, unique
-        ner = NerProcess((source2, source1), unique=True)
-        named_entities = ner.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone', None,
-                           Token(word='everyone', start=6, end=14,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
-
-    def test_ner_process_add_sources(self):
-        """ Test ner process """
-        text = 'Hello everyone, this is   me speaking. And me.'
-        source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
-                                    'me': 'http://example.com/me'})
-        source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
-        ner = NerProcess((source1,))
-        named_entities = ner.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone', None,
-                           Token(word='everyone', start=6, end=14,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46))),])
-        # Two sources, not unique
-        ner.add_ner_source(source2)
-        named_entities = ner.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone', None,
-                           Token(word='everyone', start=6, end=14,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=26, end=28,
-                                           sentence=Sentence(indice=0, start=0, end=38))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46))),
-                          ('http://example2.com/me', None,
-                           Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
-
-    def test_ner_process_preprocess(self):
-        """ Test ner process """
-        text = 'Hello Toto, this is   me speaking. And me.'
-        source = NerSourceLexicon({'Toto': 'http://example.com/toto',
-                                   'me': 'http://example.com/me'})
-        preprocessor = NerStopwordsFilterPreprocessor()
-        ner = NerProcess((source,),
-                                  preprocessors=(preprocessor,))
-        named_entities = ner.process_text(text)
-        self.assertEqual(named_entities, [('http://example.com/toto', None,
-                                           Token(word='Toto', start=6, end=10,
-                                                 sentence=Sentence(indice=0, start=0, end=34)))])
-
-    def test_ner_process_add_preprocess(self):
-        """ Test ner process """
-        text = 'Hello Toto, this is   me speaking. And me.'
-        source = NerSourceLexicon({'Toto': 'http://example.com/toto',
-                                   'me': 'http://example.com/me'})
-        preprocessor = NerStopwordsFilterPreprocessor()
-        ner = NerProcess((source,),)
-        named_entities = ner.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/toto', None,
-                           Token(word='Toto', start=6, end=10,
-                                 sentence=Sentence(indice=0, start=0, end=34))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=22, end=24,
-                                 sentence=Sentence(indice=0, start=0, end=34))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=39, end=41,
-                                 sentence=Sentence(indice=1, start=34, end=42)))])
-        ner.add_preprocessors(preprocessor)
-        named_entities = ner.process_text(text)
-        self.assertEqual(named_entities, [('http://example.com/toto', None,
-                                           Token(word='Toto', start=6, end=10,
-                                                 sentence=Sentence(indice=0, start=0, end=34)))])
-
-    def test_ner_process_chained_word(self):
-        """ Test ner process """
-        text = 'Hello everyone me, this is   me speaking. And me.'
-        source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
-                                   'everyone me': 'http://example.com/everyone_me',
-                                   'me': 'http://example.com/me'})
-        ner = NerProcess((source,))
-        named_entities = ner.process_text(text)
-        self.assertEqual(named_entities,
-                         [('http://example.com/everyone_me', None,
-                           Token(word='everyone me', start=6, end=17,
-                                 sentence=Sentence(indice=0, start=0, end=41))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=29, end=31,
-                                 sentence=Sentence(indice=0, start=0, end=41))),
-                          ('http://example.com/me', None,
-                           Token(word='me', start=46, end=48, sentence=Sentence(indice=1, start=41, end=49)))])
-
-
-if __name__ == '__main__':
-    unittest2.main()
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_ner.py	Thu Dec 19 14:45:56 2013 +0000
@@ -0,0 +1,230 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+import unittest2
+
+from nazca.ner.sources import (NerSourceLexicon,
+                               NerSourceSparql,
+                               NerSourceRql)
+from nazca.ner import NerProcess
+from nazca.utils.tokenizer import Token, Sentence
+from nazca.ner.preprocessors import NerStopwordsFilterPreprocessor
+
+
+class NerTest(unittest2.TestCase):
+    """ Test of Ner """
+
+    def test_lexicon_source(self):
+        """ Test lexicon source """
+        lexicon = {'everyone': 'http://example.com/everyone',
+                   'me': 'http://example.com/me'}
+        source = NerSourceLexicon(lexicon)
+        self.assertEqual(source.query_word('me'), ['http://example.com/me',])
+        self.assertEqual(source.query_word('everyone'), ['http://example.com/everyone',])
+        self.assertEqual(source.query_word('me everyone'), [])
+        self.assertEqual(source.query_word('toto'), [])
+        # Token
+        token = Token('me', 0, 2, None)
+        self.assertEqual(source.recognize_token(token), ['http://example.com/me',])
+        token = Token('ma', 0, 2, None)
+        self.assertEqual(source.recognize_token(token), [])
+
+    def test_rql_source(self):
+        """ Test rql source """
+        source = NerSourceRql('http://www.cubicweb.org',
+                              'Any U LIMIT 1 WHERE X cwuri U, X name "%(word)s"')
+        self.assertEqual(source.query_word('apycot'), [u'http://www.cubicweb.org/1310453',])
+
+    def test_sparql_source(self):
+        """ Test sparql source """
+        source = NerSourceSparql(u'http://dbpedia.org/sparql',
+                                 u'''SELECT DISTINCT ?uri
+                                     WHERE{
+                                     ?uri rdfs:label "%(word)s"@en .
+                                     ?uri rdf:type ?type}''')
+        self.assertEqual(source.query_word('Python'),
+                         [u'http://dbpedia.org/resource/Python',
+                          u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
+                          u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'])
+
+    def test_ner_process(self):
+        """ Test ner process """
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
+                                   'me': 'http://example.com/me'})
+        ner = NerProcess((source,))
+        named_entities = ner.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+
+    def test_ner_process_multisources(self):
+        """ Test ner process """
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
+                                    'me': 'http://example.com/me'})
+        source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
+        # Two sources, not unique
+        ner = NerProcess((source1, source2))
+        named_entities = ner.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+        # Two sources, unique
+        ner = NerProcess((source1, source2), unique=True)
+        named_entities = ner.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+        # Two sources inversed, unique
+        ner = NerProcess((source2, source1), unique=True)
+        named_entities = ner.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+
+    def test_ner_process_add_sources(self):
+        """ Test ner process """
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
+                                    'me': 'http://example.com/me'})
+        source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
+        ner = NerProcess((source1,))
+        named_entities = ner.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46))),])
+        # Two sources, not unique
+        ner.add_ner_source(source2)
+        named_entities = ner.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+
+    def test_ner_process_preprocess(self):
+        """ Test ner process """
+        text = 'Hello Toto, this is   me speaking. And me.'
+        source = NerSourceLexicon({'Toto': 'http://example.com/toto',
+                                   'me': 'http://example.com/me'})
+        preprocessor = NerStopwordsFilterPreprocessor()
+        ner = NerProcess((source,),
+                         preprocessors=(preprocessor,))
+        named_entities = ner.process_text(text)
+        self.assertEqual(named_entities, [('http://example.com/toto', None,
+                                           Token(word='Toto', start=6, end=10,
+                                                 sentence=Sentence(indice=0, start=0, end=34)))])
+
+    def test_ner_process_add_preprocess(self):
+        """ Test ner process """
+        text = 'Hello Toto, this is   me speaking. And me.'
+        source = NerSourceLexicon({'Toto': 'http://example.com/toto',
+                                   'me': 'http://example.com/me'})
+        preprocessor = NerStopwordsFilterPreprocessor()
+        ner = NerProcess((source,))
+        named_entities = ner.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/toto', None,
+                           Token(word='Toto', start=6, end=10,
+                                 sentence=Sentence(indice=0, start=0, end=34))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=22, end=24,
+                                 sentence=Sentence(indice=0, start=0, end=34))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=39, end=41,
+                                 sentence=Sentence(indice=1, start=34, end=42)))])
+        ner.add_preprocessors(preprocessor)
+        named_entities = ner.process_text(text)
+        self.assertEqual(named_entities, [('http://example.com/toto', None,
+                                           Token(word='Toto', start=6, end=10,
+                                                 sentence=Sentence(indice=0, start=0, end=34)))])
+
+    def test_ner_process_chained_word(self):
+        """ Test ner process """
+        text = 'Hello everyone me, this is   me speaking. And me.'
+        source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
+                                   'everyone me': 'http://example.com/everyone_me',
+                                   'me': 'http://example.com/me'})
+        ner = NerProcess((source,))
+        named_entities = ner.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone_me', None,
+                           Token(word='everyone me', start=6, end=17,
+                                 sentence=Sentence(indice=0, start=0, end=41))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=29, end=31,
+                                 sentence=Sentence(indice=0, start=0, end=41))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=46, end=48, sentence=Sentence(indice=1, start=41, end=49)))])
+
+
+if __name__ == '__main__':
+    unittest2.main()
+
--- a/test/test_preprocessors.py	Thu Dec 19 14:45:43 2013 +0000
+++ b/test/test_preprocessors.py	Thu Dec 19 14:45:56 2013 +0000
@@ -18,7 +18,7 @@
 import unittest2
 
 from nazca.utils import tokenizer
-from nazca.named_entities import preprocessors
+from nazca.ner import preprocessors
 
 
 class PreprocessorTest(unittest2.TestCase):