notebooks/Record linkage with Nazca - part 3 - Putting it all together.ipynb
author Vincent Michel <vincent.michel@logilab.fr>
Mon, 30 Jun 2014 14:11:18 +0000
changeset 453 7debe5d58a19
parent 449 80c0a2d8624a
child 456 d93286fdd149
permissions -rw-r--r--
preparing 0.6.0

{
 "metadata": {
  "name": "Record linkage with Nazca - part 3 - Putting it all together"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "<h1>Record linkage with Nazca - part 3 - Putting it all together</h1>"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "<h3>Aligner - nazca.rl.aligner</h3>\n",
      "\n",
      "Once you have created your datasets, and define your preprocessings and blockings, you can use the `BaseAligner` object to perform the alignment.\n",
      "\n",
      "The `BaseAligner` is defined as:\n",
      "\n",
      "    class BaseAligner(object):\n",
      "\n",
      "        def register_ref_normalizer(self, normalizer):\n",
      "            \"\"\" Register normalizers to be applied before alignment \"\"\"\n",
      "\n",
      "        def register_target_normalizer(self, normalizer):\n",
      "            \"\"\" Register normalizers to be applied before alignment \"\"\"\n",
      "\n",
      "        def register_blocking(self, blocking):\n",
      "            self.blocking = blocking\n",
      "\n",
      "        def align(self, refset, targetset, get_matrix=True):\n",
      "            \"\"\" Perform the alignment on the referenceset and the targetset \"\"\"\n",
      "\n",
      "        def get_aligned_pairs(self, refset, targetset, unique=True):\n",
      "            \"\"\" Get the pairs of aligned elements \"\"\""
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "The `align()` function return the global distance matrix and the matched elements as a dictionnary, with key the index of reference records, and values the list of aligned target set records."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from nazca.utils.distances import GeographicalProcessing \n",
      "from nazca.rl.aligner import BaseAligner\n",
      "\n",
      "refset = [['R1', 'ref1', (6.14194444444, 48.67)],\n",
      "          ['R2', 'ref2', (6.2, 49)],\n",
      "          ['R3', 'ref3', (5.1, 48)],\n",
      "          ['R4', 'ref4', (5.2, 48.1)]]\n",
      "targetset = [['T1', 'target1', (6.17, 48.7)],\n",
      "             ['T2', 'target2', (5.3, 48.2)],\n",
      "             ['T3', 'target3', (6.25, 48.91)]]\n",
      "processings = (GeographicalProcessing(2, 2, units='km'),)\n",
      "aligner = BaseAligner(threshold=30, processings=processings)\n",
      "mat, matched = aligner.align(refset, targetset)\n",
      "print mat\n",
      "print matched"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[[   4.55325174  107.09278107   29.12484169]\n",
        " [  33.33169937  133.59967041   11.39668941]\n",
        " [ 141.97203064   31.38606644  162.75946045]\n",
        " [ 126.65346527   15.69240952  147.18429565]]\n",
        "{0: [(0, 4.5532517), (2, 29.124842)], 1: [(2, 11.396689)], 3: [(1, 15.69241)]}\n"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "The `get_aligned_pairs()` directly yield the found aligned pairs and the distance"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "aligner = BaseAligner(threshold=30, processings=processings)\n",
      "for pair in aligner.get_aligned_pairs(refset, targetset):\n",
      "    print pair"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "(('R1', 0), ('T1', 0), 4.5532517)\n",
        "(('R2', 1), ('T3', 2), 11.396689)\n",
        "(('R4', 3), ('T2', 1), 15.69241)\n"
       ]
      }
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "<h4>Plugging preprocessings and blocking</h4>\n",
      "\n",
      "We can plug the preprocessings using `register_ref_normalizer()` and `register_target_normalizer`, and the blocking using `register_blocking()`. Only ONE blocking is allowed, thus you should use PipelineBlocking for multiple blockings."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import nazca.utils.normalize as nno\n",
      "from nazca.rl import blocking as nrb\n",
      "\n",
      "normalizer = nno.SimplifyNormalizer(attr_index=1)\n",
      "blocking = nrb.KdTreeBlocking(ref_attr_index=2, target_attr_index=2, threshold=0.3)\n",
      "aligner = BaseAligner(threshold=30, processings=processings)\n",
      "aligner.register_ref_normalizer(normalizer)\n",
      "aligner.register_target_normalizer(normalizer)\n",
      "aligner.register_blocking(blocking)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "for pair in aligner.get_aligned_pairs(refset, targetset):\n",
      "    print pair"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "(('R1', 0), ('T1', 0), 0)\n",
        "(('R2', 1), ('T3', 2), 0)\n",
        "(('R4', 3), ('T2', 1), 0)\n"
       ]
      }
     ],
     "prompt_number": 9
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "An `unique` boolean could be set to False to get all the alignments and not just the one unique on the target set."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "for pair in aligner.get_aligned_pairs(refset, targetset, unique=False):\n",
      "    print pair"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "(('R1', 0), ('T3', 2), 0)\n",
        "(('R1', 0), ('T1', 0), 0)\n",
        "(('R2', 1), ('T3', 2), 0)\n",
        "(('R4', 3), ('T2', 1), 0)\n"
       ]
      }
     ],
     "prompt_number": 14
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "<h3>Aligner - nazca.rl.aligner</h3>\n",
      "\n",
      "A pipeline of aligners could be created using `PipelineAligner`."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from nazca.utils.distances import LevenshteinProcessing, GeographicalProcessing\n",
      "from nazca.rl.aligner import PipelineAligner\n",
      "\n",
      "processings = (GeographicalProcessing(2, 2, units='km'),)\n",
      "aligner_1 = BaseAligner(threshold=30, processings=processings)\n",
      "processings = (LevenshteinProcessing(1, 1),)\n",
      "aligner_2 = BaseAligner(threshold=1, processings=processings)\n",
      "\n",
      "pipeline = PipelineAligner((aligner_1, aligner_2))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 11
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "for pair in pipeline.get_aligned_pairs(refset, targetset):\n",
      "    print pair"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "(('R1', 0), ('T1', 0))\n",
        "(('R2', 1), ('T3', 2))\n",
        "(('R4', 3), ('T2', 1))\n"
       ]
      }
     ],
     "prompt_number": 13
    }
   ],
   "metadata": {}
  }
 ]
}