[Distance] Temporal distance supports ambiguity and fuzziness
author Simon Chabot <simon.chabot@logilab.fr>
Thu, 18 Oct 2012 13:55:25 +0200
changeset 18 64f8789a2951
parent 17 8cd8b96f9333
child 19 fa22f8965c4a
[Distance] Temporal distance supports ambiguity and fuzziness - You can specify whether the day or the year comes first (day/month/year, year/month/day or month/day/year format). By default, it assumes the commonly used French format, i.e. day/month/year - You can give fuzzy sentences and compare dates: temporal('Jean est né le 1er octobre 1958', 'Le 01-10-1958, Jean est né') yields 0!
distances.py
test/test_alignment.py
--- a/distances.py	Thu Oct 18 12:22:30 2012 +0200
+++ b/distances.py	Thu Oct 18 13:55:25 2012 +0200
@@ -115,12 +115,16 @@
     jab = 1.0 * len(seta.intersection(setb)) / len(seta.union(setb))
     return 1.0 - jab
 
-def temporal(stra, strb, granularity = u'days', language = u'french'):
+def temporal(stra, strb, granularity = u'days', language = u'french',
+             dayfirst = True, yearfirst = False):
     """ Return the distance between two strings (read as dates).
 
         ``granularity`` can be either ``days`` or ``months`` or ``years``
         (be careful to the plural form !)
         ``language`` can be either french or english
+
+        ``dayfirst`` and ``yearfirst`` are used in case of ambiguity, for
+        instance 09/09/09, by default it assumes it's day/month/year
     """
     class customparserinfo(dateparser.parserinfo):
         if language.lower() == u'french':
@@ -128,7 +132,7 @@
                         (u'm', u'minute', u'minutes'),
                         (u's', u'seconde', u'seconde'),]
             JUMP     = [u' ', u'.', u',', u';', u'-', u'/', u"'",
-                        u'a', u'le', u'et',]
+                        u'a', u'le', u'et', u'er']
             MONTHS   = [(u'Jan', u'Janvier'), (u'Fev', u'Fevrier'), (u'Mar', u'Mars'),
                        (u'Avr', u'Avril'), (u'Mai', u'Mai'), (u'Jun', u'Juin'),
                        (u'Jui', u'Juillet'), (u'Aou', u'Aout'),
@@ -142,8 +146,10 @@
                         (u'Ven', u'Vendredi'),
                         (u'Sam', u'Samedi'),
                         (u'Dim', u'Dimanche'),]
-    datea = dateparser.parse(stra, parserinfo = customparserinfo())
-    dateb = dateparser.parse(strb, parserinfo = customparserinfo())
+    datea = dateparser.parse(stra, parserinfo = customparserinfo(dayfirst,
+                             yearfirst), fuzzy = True)
+    dateb = dateparser.parse(strb, parserinfo = customparserinfo(dayfirst,
+                             yearfirst), fuzzy = True)
     diff  = datea - dateb
     if granularity.lower() == 'years':
         return abs(diff.days / 365.25)
--- a/test/test_alignment.py	Thu Oct 18 12:22:30 2012 +0200
+++ b/test/test_alignment.py	Thu Oct 18 13:55:25 2012 +0200
@@ -94,12 +94,20 @@
         self.assertEqual(temporal('14 aout 1991', '14/08/1991'), 0)
         self.assertEqual(temporal('14 aout 1991', '08/14/1991'), 0)
         self.assertEqual(temporal('14 aout 1991', '08/15/1992'), 367)
+        #Test a case of ambiguity
+        self.assertEqual(temporal('1er mai 2012', '01/05/2012'), 0)
+        self.assertEqual(temporal('1er mai 2012', '05/01/2012', dayfirst = False), 0)
+        #Test the different granularities available
         self.assertAlmostEqual(temporal('14 aout 1991', '08/15/1992', 'years'), 1.0, 1)
         self.assertAlmostEqual(temporal('1991', '1992', 'years'), 1.0, 1)
         self.assertAlmostEqual(temporal('13 mars', '13 mai', 'months'), 2.0, 1)
         self.assertAlmostEqual(temporal('13 march', '13 may', 'months',
                                         'english'), 2.0, 1)
 
+        #Test fuzzyness
+        self.assertEqual(temporal('Jean est né le 1er octobre 1958',
+                                  'Le 01-10-1958, Jean est né'), 0)
+
     def test_euclidean(self):
         self.assertEqual(euclidean(10, 11), 1)
         self.assertEqual(euclidean(-10, 11), 21)