[pandas] Add support for Pandas, closes #248556 draft
author Vincent Michel <vincent.michel@logilab.fr>
Thu, 20 Nov 2014 11:13:24 +0100
changeset 476 004224904efa
parent 469 9d9d8c4f2bab
[pandas] Add support for Pandas, closes #248556
notebooks/Record linkage with Nazca - part 4 - Using Pandas.ipynb
test/test_pandas.py
utils/distances.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/notebooks/Record linkage with Nazca - part 4 - Using Pandas.ipynb	Thu Nov 20 11:13:24 2014 +0100
@@ -0,0 +1,413 @@
+{
+ "metadata": {
+  "name": "Record linkage with Nazca - part 4 - Using Pandas"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+  {
+   "cells": [
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "<h1>Record linkage with Nazca - part 4 - Using Pandas</h1>"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "import urllib\n",
+      "from StringIO import StringIO\n",
+      "from zipfile import ZipFile\n",
+      "\n",
+      "import pandas\n",
+      "\n",
+      "import nazca.utils.dataio as nio\n",
+      "import nazca.utils.distances as nud\n",
+      "import nazca.rl.aligner as nrla"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 54
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "<h3>Importing data from Geonames</h3>"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "url = 'http://download.geonames.org/export/dump/cities15000.zip'\n",
+      "zipdata = StringIO(urllib.urlopen(url).read())\n",
+      "zf = ZipFile(zipdata)\n",
+      "geonames = StringIO(zf.read('cities15000.txt'))\n",
+      "names = ('geonameid', 'name', 'asciiname', 'alternatenames'\n",
+      "        'latitude', 'longitude', 'feature class',\n",
+      "        'feature code', 'country code', 'cc2',\n",
+      "        'admin1 code', 'admin2 code', 'admin3 code',\n",
+      "        'admin4 code', 'population', 'elevation',\n",
+      "        'dem', 'timezone', 'modification date')\n",
+      "pgeonames = pandas.io.parsers.read_csv(geonames, sep='\\t', header=None, names=names)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 31
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "print len(pgeonames)\n",
+      "print pgeonames.name\n",
+      "print pgeonames.irow(slice(1,2)).values"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "22852\n",
+        "3040051        les Escaldes\n",
+        "3041563    Andorra la Vella\n",
+        "290594       Umm al Qaywayn\n",
+        "291074       Ras al-Khaimah\n",
+        "291696         Khawr Fakkan\n",
+        "292223                Dubai\n",
+        "292239        Dibba Al-Hisn\n",
+        "292672              Sharjah\n",
+        "292688            Ar Ruways\n",
+        "292878          Al Fujayrah\n",
+        "292913               Al Ain\n",
+        "292932                Ajman\n",
+        "292953            Adh Dhayd\n",
+        "292968            Abu Dhabi\n",
+        "1120985              Zaranj\n",
+        "...\n",
+        "889215          Kariba\n",
+        "889453          Kadoma\n",
+        "889942          Hwange\n",
+        "890299          Harare\n",
+        "890422           Gweru\n",
+        "890983           Gokwe\n",
+        "893485        Chiredzi\n",
+        "893549        Chipinge\n",
+        "893697        Chinhoyi\n",
+        "894239         Chegutu\n",
+        "894701        Bulawayo\n",
+        "895061         Bindura\n",
+        "895269      Beitbridge\n",
+        "1085510        Epworth\n",
+        "1106542    Chitungwiza\n",
+        "Name: name, Length: 22852\n",
+        "[[Andorra la Vella Andorra la Vella\n",
+        "  ALV,Ando-la-Vyey,Andora,Andora la Vela,Andora la Velja,Andora lja Vehl'ja,Andoro Malnova,Andorra,Andorra Tuan,Andorra a Vella,Andorra la Biella,Andorra la Vella,Andorra la Vielha,Andorra-a-Velha,Andorra-la-Vel'ja,Andorra-la-Vielye,Andorre-la-Vieille,And\u00f2-la-Vy\u00e8y,And\u00f2rra la Vi\u00e8lha,an dao er cheng,andolalabeya,andwra la fyla,\u0391\u03bd\u03b4\u03cc\u03c1\u03c1\u03b1,\u0410\u043d\u0434\u043e\u0440\u0430 \u043b\u0430 \u0412\u0435\u043b\u044f,\u0410\u043d\u0434\u043e\u0440\u0430 \u043b\u0430 \u0412\u0435\u0459\u0430,\u0410\u043d\u0434\u043e\u0440\u0430 \u043b\u044f \u0412\u044d\u043b\u044c\u044f,\u0410\u043d\u0434\u043e\u0440\u0440\u0430-\u043b\u0430-\u0412\u0435\u043b\u044c\u044f,\u05d0\u05e0\u05d3\u05d5\u05e8\u05d4 \u05dc\u05d4 \u05d5\u05d5\u05dc\u05d4,\u0623\u0646\u062f\u0648\u0631\u0627 \u0644\u0627 \u0641\u064a\u0644\u0627,\u12a0\u1295\u12f6\u122b \u120b \u126c\u120b,\u30a2\u30f3\u30c9\u30e9\u30fb\u30e9\u30fb\u30f4\u30a7\u30ea\u30e3,\u5b89\u9053\u723e\u57ce,\uc548\ub3c4\ub77c\ub77c\ubca0\uc57c\n",
+        "  42.50779 1.52109 P PPLC AD nan 07 nan nan nan 20430 nan 1037\n",
+        "  Europe/Andorra 2010-05-30]]\n"
+       ]
+      }
+     ],
+     "prompt_number": 74
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "<h3>Importing data from Dbpedia</h3>"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "dbpedia = nio.sparqlquery('http://demo.cubicweb.org/sparql',\n",
+      "                          '''PREFIX dbonto:<http://dbpedia.org/ontology/>\n",
+      "                             PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>\n",
+      "                             SELECT ?p ?n ?c ?la ?lo\n",
+      "                             WHERE {?p a dbonto:PopulatedPlace.\n",
+      "                                    ?p dbonto:country dbpedia:France.\n",
+      "                                    ?p foaf:name ?n.\n",
+      "                                    ?p dbpprop:insee ?c.\n",
+      "                                    ?p geo:lat ?la.\n",
+      "                                    ?p geo:long ?lo.}''',\n",
+      "                          autocast_data=True)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 46
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "print len(dbpedia)\n",
+      "print dbpedia[:10]"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "3702\n",
+        "[[u'http://dbpedia.org/resource/Ajaccio', u'Ajaccio', 2, 41.92670059204102, 8.736900329589844], [u'http://dbpedia.org/resource/Ajaccio', u'Aiacciu', 2, 41.92670059204102, 8.736900329589844], [u'http://dbpedia.org/resource/Bastia', u'Bastia', 2, 42.70080184936523, 9.450300216674805], [u'http://dbpedia.org/resource/Sart%C3%A8ne', u'Sart\\xe8ne', 2, 41.62170028686523, 8.974200248718262], [u'http://dbpedia.org/resource/Corte', u'Corte', 2, 42.30640029907227, 9.151399612426758], [u'http://dbpedia.org/resource/Corte', u'Corti', 2, 42.30640029907227, 9.151399612426758], [u'http://dbpedia.org/resource/Bonifacio,_Corse-du-Sud', u'Bonifacio', 2, 41.38869857788086, 9.156100273132324], [u'http://dbpedia.org/resource/Bonifacio,_Corse-du-Sud', u'Bunifaziu', 2, 41.38869857788086, 9.156100273132324], [u'http://dbpedia.org/resource/Calvi,_Haute-Corse', u'Calvi', 2, 42.56859970092773, 8.7568998336792], [u'http://dbpedia.org/resource/Vescovato,_Haute-Corse', u'Vescovato', 2, 42.49470138549805, 9.440600395202637]]\n"
+       ]
+      }
+     ],
+     "prompt_number": 47
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "<h3>Creating the alignment</h3>"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "We create here two distances:\n",
+      "\n",
+      "  * one difflib distance on the name - this is the column \"name\" in the geonames dataframe,\n",
+      "    and the second column (index 1) in the dbpedia result set;\n",
+      "\n",
+      "  * one geographical distance on the couple (latitude, longitude);"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "processing_name = nud.DifflibProcessing(\"name\", 1)\n",
+      "processing_name.distance(pgeonames.irow(slice(1,2)), dbpedia[0])"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "ename": "KeyError",
+       "evalue": "0",
+       "output_type": "pyerr",
+       "traceback": [
+        "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
+        "\u001b[0;32m<ipython-input-75-27da59eb5e3d>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mprocessing_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnud\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDifflibProcessing\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"name\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mprocessing_name\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdistance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpgeonames\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mirow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mslice\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdbpedia\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+        "\u001b[0;32m/home/vmic/docs/pivot/transfert_comps/nazca/nazca/utils/distances.pyc\u001b[0m in \u001b[0;36mdistance\u001b[0;34m(self, reference_record, target_record)\u001b[0m\n\u001b[1;32m    395\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    396\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mrecord\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mrecord\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 397\u001b[0;31m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    398\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mdistance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreference_record\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget_record\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    399\u001b[0m         \"\"\" Compute the distance between two records\n",
+        "\u001b[0;32m/home/vmic/docs/pivot/transfert_comps/nazca/nazca/utils/distances.pyc\u001b[0m in \u001b[0;36mdifflib_match\u001b[0;34m(stra, strb)\u001b[0m\n\u001b[1;32m    250\u001b[0m     \u001b[0mExtract\u001b[0m \u001b[0mof\u001b[0m \u001b[0mSequenceMatched\u001b[0m \u001b[0mdocumentation\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    251\u001b[0m     \u001b[0;31m'\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0mThe\u001b[0m \u001b[0mbasic\u001b[0m \u001b[0malgorithm\u001b[0m \u001b[0mpredates\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0ma\u001b[0m \u001b[0mlittle\u001b[0m \u001b[0mfancier\u001b[0m \u001b[0mthan\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0man\u001b[0m \u001b[0malgorithm\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 252\u001b[0;31m     \u001b[0mpublished\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mlate\u001b[0m \u001b[0;36m1980\u001b[0m\u001b[0;31m'\u001b[0m\u001b[0ms\u001b[0m \u001b[0mby\u001b[0m \u001b[0mRatcliff\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mObershelp\u001b[0m \u001b[0munder\u001b[0m \u001b[0mthe\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    253\u001b[0m     \u001b[0mhyperbolic\u001b[0m \u001b[0mname\u001b[0m \u001b[0;34m\"gestalt pattern matching\"\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;31m'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    254\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+        "\u001b[0;32m/usr/lib/python2.7/difflib.pyc\u001b[0m in \u001b[0;36mratio\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    657\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    658\u001b[0m         matches = reduce(lambda sum, triple: sum + triple[-1],\n\u001b[0;32m--> 659\u001b[0;31m                          self.get_matching_blocks(), 0)\n\u001b[0m\u001b[1;32m    660\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0m_calculate_ratio\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmatches\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    661\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+        "\u001b[0;32m/usr/lib/python2.7/difflib.pyc\u001b[0m in \u001b[0;36mget_matching_blocks\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    491\u001b[0m         \u001b[0;32mwhile\u001b[0m \u001b[0mqueue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    492\u001b[0m             \u001b[0malo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mahi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mblo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbhi\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mqueue\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 493\u001b[0;31m             \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfind_longest_match\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0malo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mahi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mblo\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbhi\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    494\u001b[0m             \u001b[0;31m# a[alo:i] vs b[blo:j] unknown\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    495\u001b[0m             \u001b[0;31m# a[i:i+k] same as b[j:j+k]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+        "\u001b[0;32m/usr/lib/python2.7/difflib.pyc\u001b[0m in \u001b[0;36mfind_longest_match\u001b[0;34m(self, alo, ahi, blo, bhi)\u001b[0m\n\u001b[1;32m    416\u001b[0m             \u001b[0mj2lenget\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mj2len\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    417\u001b[0m             \u001b[0mnewj2len\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 418\u001b[0;31m             \u001b[0;32mfor\u001b[0m \u001b[0mj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mb2j\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnothing\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    419\u001b[0m                 \u001b[0;31m# a[i] matches b[j]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    420\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0mj\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0mblo\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+        "\u001b[0;32m/usr/lib/pymodules/python2.7/pandas/core/series.pyc\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m    427\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m__getitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    428\u001b[0m         \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 429\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_value\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    430\u001b[0m         \u001b[0;32mexcept\u001b[0m \u001b[0mInvalidIndexError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    431\u001b[0m             \u001b[0;32mpass\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+        "\u001b[0;32m/usr/lib/pymodules/python2.7/pandas/core/index.pyc\u001b[0m in \u001b[0;36mget_value\u001b[0;34m(self, series, key)\u001b[0m\n\u001b[1;32m    639\u001b[0m         \"\"\"\n\u001b[1;32m    640\u001b[0m         \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 641\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_value\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseries\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    642\u001b[0m         \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0me1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    643\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minferred_type\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'integer'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+        "\u001b[0;32m/usr/lib/pymodules/python2.7/pandas/lib.so\u001b[0m in \u001b[0;36mpandas.lib.IndexEngine.get_value (pandas/src/tseries.c:104257)\u001b[0;34m()\u001b[0m\n",
+        "\u001b[0;32m/usr/lib/pymodules/python2.7/pandas/lib.so\u001b[0m in \u001b[0;36mpandas.lib.IndexEngine.get_value (pandas/src/tseries.c:104085)\u001b[0;34m()\u001b[0m\n",
+        "\u001b[0;32m/usr/lib/pymodules/python2.7/pandas/lib.so\u001b[0m in \u001b[0;36mpandas.lib.IndexEngine.get_loc (pandas/src/tseries.c:104794)\u001b[0;34m()\u001b[0m\n",
+        "\u001b[0;32m/usr/lib/pymodules/python2.7/pandas/lib.so\u001b[0m in \u001b[0;36mpandas.lib.Int64HashTable.get_item (pandas/src/tseries.c:15561)\u001b[0;34m()\u001b[0m\n",
+        "\u001b[0;32m/usr/lib/pymodules/python2.7/pandas/lib.so\u001b[0m in \u001b[0;36mpandas.lib.Int64HashTable.get_item (pandas/src/tseries.c:15515)\u001b[0;34m()\u001b[0m\n",
+        "\u001b[0;31mKeyError\u001b[0m: 0"
+       ]
+      }
+     ],
+     "prompt_number": 75
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "processings = (GeographicalProcessing(2, 2, units='km'),)\n",
+      "aligner = BaseAligner(threshold=30, processings=processings)\n",
+      "mat, matched = aligner.align(refset, targetset)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "[[   4.55325174  107.09278107   29.12484169]\n",
+        " [  33.33169937  133.59967041   11.39668941]\n",
+        " [ 141.97203064   31.38606644  162.75946045]\n",
+        " [ 126.65346527   15.69240952  147.18429565]]\n",
+        "{0: [(0, 4.5532517), (2, 29.124842)], 1: [(2, 11.396689)], 3: [(1, 15.69241)]}\n"
+       ]
+      }
+     ],
+     "prompt_number": 2
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "The `get_aligned_pairs()` directly yield the found aligned pairs and the distance"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "aligner = BaseAligner(threshold=30, processings=processings)\n",
+      "for pair in aligner.get_aligned_pairs(refset, targetset):\n",
+      "    print pair"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "(('R1', 0), ('T1', 0), 4.5532517)\n",
+        "(('R2', 1), ('T3', 2), 11.396689)\n",
+        "(('R4', 3), ('T2', 1), 15.69241)\n"
+       ]
+      }
+     ],
+     "prompt_number": 5
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "<h4>Plugging preprocessings and blocking</h4>\n",
+      "\n",
+      "We can plug the preprocessings using `register_ref_normalizer()` and `register_target_normalizer`, and the blocking using `register_blocking()`. Only ONE blocking is allowed, thus you should use PipelineBlocking for multiple blockings."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "import nazca.utils.normalize as nno\n",
+      "from nazca.rl import blocking as nrb\n",
+      "\n",
+      "normalizer = nno.SimplifyNormalizer(attr_index=1)\n",
+      "blocking = nrb.KdTreeBlocking(ref_attr_index=2, target_attr_index=2, threshold=0.3)\n",
+      "aligner = BaseAligner(threshold=30, processings=processings)\n",
+      "aligner.register_ref_normalizer(normalizer)\n",
+      "aligner.register_target_normalizer(normalizer)\n",
+      "aligner.register_blocking(blocking)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 8
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "for pair in aligner.get_aligned_pairs(refset, targetset):\n",
+      "    print pair"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "(('R1', 0), ('T1', 0), 0)\n",
+        "(('R2', 1), ('T3', 2), 0)\n",
+        "(('R4', 3), ('T2', 1), 0)\n"
+       ]
+      }
+     ],
+     "prompt_number": 9
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "An `unique` boolean could be set to False to get all the alignments and not just the one unique on the target set."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "for pair in aligner.get_aligned_pairs(refset, targetset, unique=False):\n",
+      "    print pair"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "(('R1', 0), ('T3', 2), 0)\n",
+        "(('R1', 0), ('T1', 0), 0)\n",
+        "(('R2', 1), ('T3', 2), 0)\n",
+        "(('R4', 3), ('T2', 1), 0)\n"
+       ]
+      }
+     ],
+     "prompt_number": 14
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "<h3>Aligner - nazca.rl.aligner</h3>\n",
+      "\n",
+      "A pipeline of aligners could be created using `PipelineAligner`."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "from nazca.utils.distances import LevenshteinProcessing, GeographicalProcessing\n",
+      "from nazca.rl.aligner import PipelineAligner\n",
+      "\n",
+      "processings = (GeographicalProcessing(2, 2, units='km'),)\n",
+      "aligner_1 = BaseAligner(threshold=30, processings=processings)\n",
+      "processings = (LevenshteinProcessing(1, 1),)\n",
+      "aligner_2 = BaseAligner(threshold=1, processings=processings)\n",
+      "\n",
+      "pipeline = PipelineAligner((aligner_1, aligner_2))"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 11
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "for pair in pipeline.get_aligned_pairs(refset, targetset):\n",
+      "    print pair"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "(('R1', 0), ('T1', 0))\n",
+        "(('R2', 1), ('T3', 2))\n",
+        "(('R4', 3), ('T2', 1))\n"
+       ]
+      }
+     ],
+     "prompt_number": 13
+    }
+   ],
+   "metadata": {}
+  }
+ ]
+}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_pandas.py	Thu Nov 20 11:13:24 2014 +0100
@@ -0,0 +1,101 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2012 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import sys
+if sys.version_info >= (2, 7):
+    import unittest
+else:
+    import unittest2 as unittest
+import random
+random.seed(6) ### Make sure tests are repeatable
+from os import path
+
+from nazca.utils.normalize import simplify
+import nazca.rl.aligner as alig
+import nazca.rl.blocking as blo
+from nazca.utils.distances import (LevenshteinProcessing,
+                                   GeographicalProcessing,
+                                   check_dataset_idx)
+
+TESTDIR = path.dirname(__file__)
+
+try:
+    import pandas as pd
+    with_pd = True
+except ImportError:
+    with_pd = False
+
+
+@unittest.skipUnless(with_pd, 'Pandas is not installed')
+class PandasTestCase(unittest.TestCase):
+
+    def build_data(self):
+        refframe = pd.DataFrame({'Id': ('V1', 'V2', 'V3', 'V4'),
+                                 'Label': ('label1', 'label2', 'label3', 'label4'),
+                                 'Geo': ((6.14194444444, 48.67),
+                                         (6.2, 49),
+                                         (5.1, 48),
+                                         (5.2, 48.1))})
+        tarframe = pd.DataFrame({'Id': ('T1', 'T2', 'T3'),
+                                 'Label': ('labelt1', 'labelt2', 'labelt3'),
+                                 'Geo': ((6.17, 48.7),
+                                         (5.3, 48.2),
+                                         (6.25, 48.91))})
+        return refframe, tarframe
+
+    def test_check_dataset_idx(self):
+        refframe, tarframe = self.build_data()
+        with self.assertRaises(ValueError):
+            check_dataset_idx(refframe, (1,))
+
+    def test_check_dataset_idx_2(self):
+        refframe, tarframe = self.build_data()
+        with self.assertRaises(ValueError):
+            check_dataset_idx(refframe, (1, 'Label'))
+
+    def test_check_dataset_idx_3(self):
+        refset = [['V1', 'label1', (6.14194444444, 48.67)],
+                  ['V2', 'label2', (6.2, 49)],
+                  ['V3', 'label3', (5.1, 48)],
+                  ['V4', 'label4', (5.2, 48.1)],
+                  ]
+        with self.assertRaises(ValueError):
+            check_dataset_idx(refset, (1, 'Label'))
+
+    def test_check_dataset_idx_4(self):
+        refset = [['V1', 'label1', (6.14194444444, 48.67)],
+                  ['V2', 'label2', (6.2, 49)],
+                  ['V3', 'label3', (5.1, 48)],
+                  ['V4', 'label4', (5.2, 48.1)],
+                  ]
+        self.assertIsNone(check_dataset_idx(refset, (1,)))
+
+    def test_align(self):
+        refframe, tarframe = self.build_data()
+        processings = (GeographicalProcessing('Geo', 'Geo', units='km'),)
+        aligner = alig.BaseAligner(threshold=30, processings=processings)
+        mat, matched = aligner.align(refframe, tarframe)
+        true_matched = [(0, 0), (0, 2), (1, 2), (3, 1)]
+        for k, values in matched.iteritems():
+            for v, distance in values:
+                self.assertIn((k, v), true_matched)
+
+
+if __name__ == '__main__':
+    unittest.main()
+
--- a/utils/distances.py	Fri Aug 01 12:25:26 2014 +0200
+++ b/utils/distances.py	Thu Nov 20 11:13:24 2014 +0100
@@ -25,6 +25,12 @@
     DATEUTIL_ENABLED = False
 from scipy import matrix, empty
 
+try:
+    import pandas as pd
+    with_pd = True
+except ImportError:
+    with_pd = False
+
 from nazca.utils.normalize import tokenize
 
 
@@ -417,6 +423,9 @@
         A distance matrix, of shape (len(refset), len(targetset))
         with the distance of each element in it.
         """
+        # Check that ref_indexes and target_indexes are consistent with the
+        # dataset type (strings for pandas DataFrames, integers otherwise)
+        check_dataset_idx(refset, ref_indexes)
+        check_dataset_idx(targetset, target_indexes)
         return cdist(self.distance, refset, targetset,
                      matrix_normalized=self.matrix_normalized,
                      ref_indexes=ref_indexes, target_indexes=target_indexes)
@@ -448,6 +457,19 @@
         return values
 
 
+def check_dataset_idx(dataset, indices):
+    """check indices wrt dataset type consistency
+
+    if dataset is a DataFrame, all indices should be strings,
+    otherwise they should be integers
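+
+    Illustrative sketch (mirrors test/test_pandas.py; `dataframe` and
+    `records` are hypothetical variables):
+
+        check_dataset_idx(dataframe, ('Label',))  # OK: string index
+        check_dataset_idx(dataframe, (1,))        # ValueError on a DataFrame
+        check_dataset_idx(records, (1,))          # OK: integer index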
+    """
+    if with_pd and isinstance(dataset, pd.DataFrame):
+        if any(isinstance(idx, int) for idx in indices):
+            raise ValueError('Indices for a pandas DataFrame should be strings, not integers')
+    elif any(isinstance(idx, basestring) for idx in indices):
+        raise ValueError('Indices of the dataset should be integers')
+
+
 ###############################################################################
 ### CONCRETE PROCESSINGS #######################################################
 ###############################################################################
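
Usage sketch (illustrative, based on test/test_pandas.py above, not part of the
patch): with the pandas support added here, both datasets can be DataFrames and
processings take column labels instead of integer positions. Variable names
below are hypothetical.

    import pandas as pd

    import nazca.rl.aligner as alig
    from nazca.utils.distances import GeographicalProcessing

    # Reference and target sets as DataFrames; 'Geo' holds coordinate pairs
    refframe = pd.DataFrame({'Id': ('V1', 'V2'),
                             'Label': ('label1', 'label2'),
                             'Geo': ((6.14194444444, 48.67), (6.2, 49))})
    tarframe = pd.DataFrame({'Id': ('T1', 'T2'),
                             'Label': ('labelt1', 'labelt2'),
                             'Geo': ((6.17, 48.7), (6.25, 48.91))})

    # Column labels (not integer positions) identify the attribute to compare
    processings = (GeographicalProcessing('Geo', 'Geo', units='km'),)
    aligner = alig.BaseAligner(threshold=30, processings=processings)
    mat, matched = aligner.align(refframe, tarframe)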