[utils] Use sentence delimiter from NLTK
author Denis Laxalde <denis.laxalde@logilab.fr>
Fri, 01 Aug 2014 12:26:40 +0200
changeset 477 30af4456d4b0
parent 469 9d9d8c4f2bab
child 478 1382bb5550ca
[utils] Use sentence delimiter from NLTK

The `nltk.tokenize.punkt` module has a more powerful sentence tokenizer; in particular, it handles single capital letters followed by a dot. NLTK is added as a dependency because the new sentence delimiter does not yield the same results as the old one, even in simple cases. In particular, leading spaces between sentences are no longer part of the tokenized sentences. All tests have been adjusted accordingly. Note that there is currently no Debian package for NLTK, though there seems to be some hope: see https://bugs.debian.org/279422. Closes #198624.
debian/control
python-nazca.spec
test/test_filters.py
test/test_ner.py
test/test_tokenizer.py
utils/tokenizer.py
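
Purely for illustration (not part of the changeset), here is a minimal sketch of the behavioural difference described above, run on the text used throughout the test suite; it assumes python-nltk >= 3.0 is importable. The old regex keeps the inter-sentence whitespace in the second span, while punkt's span_tokenize starts it after the space:

    # Illustrative comparison of the old regex-based splitting with NLTK's
    # punkt tokenizer; the expected spans are the ones asserted in the tests.
    import re
    from nltk.tokenize.punkt import PunktSentenceTokenizer

    text = 'Hello everyone, this is   me speaking. And me.'

    # Old behaviour: the second sentence span starts on the space at offset 38.
    old_spans = [m.span() for m in
                 re.finditer(r'[^.!?]+(?:[.!?]|$)', text, re.UNICODE)]
    assert old_spans == [(0, 38), (38, 46)]

    # New behaviour: punkt excludes the leading space, so the span starts at 39.
    new_spans = list(PunktSentenceTokenizer().span_tokenize(text))
    assert new_spans == [(0, 38), (39, 46)]
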
--- a/debian/control	Fri Aug 01 12:25:26 2014 +0200
+++ b/debian/control	Fri Aug 01 12:26:40 2014 +0200
@@ -9,6 +9,8 @@
 Package: nazca
 Architecture: all
 Depends: ${python:Depends}
+Recommends:
+  python-nltk (>= 3.0)
 Description: Python library for data alignment.
  Nazca is a python library that provides a set of alignment helpers
  .
--- a/python-nazca.spec	Fri Aug 01 12:25:26 2014 +0200
+++ b/python-nazca.spec	Fri Aug 01 12:26:40 2014 +0200
@@ -50,4 +50,4 @@
 
 %files
 %defattr(-,root,root,-)
-/*
\ No newline at end of file
+/*
--- a/test/test_filters.py	Fri Aug 01 12:25:26 2014 +0200
+++ b/test/test_filters.py	Fri Aug 01 12:26:40 2014 +0200
@@ -51,10 +51,10 @@
                                            sentence=Sentence(indice=0, start=0, end=38))),
                           ('http://example.com/me', None,
                            Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46))),
+                                           sentence=Sentence(indice=1, start=39, end=46))),
                           ('http://example2.com/me', None,
                            Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
+                                           sentence=Sentence(indice=1, start=39, end=46)))])
 
     def test_occurence_filter_max_occ(self):
         """ Test occurence filter """
@@ -84,7 +84,7 @@
                                  sentence=Sentence(indice=0, start=0, end=16))),
                           ('http://example.com/toto_tutu', None,
                            Token(word='toto', start=21, end=25,
-                                 sentence=Sentence(indice=1, start=16, end=26)))])
+                                 sentence=Sentence(indice=1, start=17, end=26)))])
 
     def test_disambiguation_word_case(self):
         """ Test occurence filter """
@@ -100,7 +100,7 @@
                                  sentence=Sentence(indice=0, start=0, end=16))),
                           ('http://example.com/toto_tutu', None,
                            Token(word='Toto', start=21, end=25,
-                                 sentence=Sentence(indice=1, start=16, end=26)))])
+                                 sentence=Sentence(indice=1, start=17, end=26)))])
 
     def test_rules_filter(self):
         """ Test rules filter """
@@ -117,7 +117,7 @@
                                  sentence=Sentence(indice=0, start=0, end=16))),
                           ('http://example.com/tata', None,
                            Token(word='toto', start=21, end=25,
-                                 sentence=Sentence(indice=1, start=16, end=26)))])
+                                 sentence=Sentence(indice=1, start=17, end=26)))])
 
 if __name__ == '__main__':
     unittest.main()
--- a/test/test_ner.py	Fri Aug 01 12:25:26 2014 +0200
+++ b/test/test_ner.py	Fri Aug 01 12:26:40 2014 +0200
@@ -26,7 +26,7 @@
                                           NerSourceSparql,
                                           NerSourceRql)
 from nazca.ner import NerProcess
-from nazca.utils.tokenizer import Token, Sentence
+from nazca.utils.tokenizer import Token, Sentence, NLTK_AVAILABLE
 from nazca.ner.preprocessors import NerStopwordsFilterPreprocessor
 
 
@@ -66,6 +66,7 @@
                           u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
                           u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'])
 
+    @unittest.skipUnless(NLTK_AVAILABLE, 'nltk is not available')
     def test_ner_process(self):
         """ Test ner process """
         text = 'Hello everyone, this is   me speaking. And me.'
@@ -82,8 +83,9 @@
                                            sentence=Sentence(indice=0, start=0, end=38))),
                           ('http://example.com/me', None,
                            Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
+                                           sentence=Sentence(indice=1, start=39, end=46)))])
 
+    @unittest.skipUnless(NLTK_AVAILABLE, 'nltk is not available')
     def test_ner_process_multisources(self):
         """ Test ner process """
         text = 'Hello everyone, this is   me speaking. And me.'
@@ -105,10 +107,10 @@
                                            sentence=Sentence(indice=0, start=0, end=38))),
                           ('http://example.com/me', None,
                            Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46))),
+                                           sentence=Sentence(indice=1, start=39, end=46))),
                           ('http://example2.com/me', None,
                            Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
+                                           sentence=Sentence(indice=1, start=39, end=46)))])
         # Two sources, unique
         ner = NerProcess((source1, source2), unique=True)
         named_entities = ner.process_text(text)
@@ -121,7 +123,7 @@
                                            sentence=Sentence(indice=0, start=0, end=38))),
                           ('http://example.com/me', None,
                            Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
+                                           sentence=Sentence(indice=1, start=39, end=46)))])
         # Two sources inversed, unique
         ner = NerProcess((source2, source1), unique=True)
         named_entities = ner.process_text(text)
@@ -134,8 +136,9 @@
                                            sentence=Sentence(indice=0, start=0, end=38))),
                           ('http://example2.com/me', None,
                            Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
+                                           sentence=Sentence(indice=1, start=39, end=46)))])
 
+    @unittest.skipUnless(NLTK_AVAILABLE, 'nltk is not available')
     def test_ner_process_add_sources(self):
         """ Test ner process """
         text = 'Hello everyone, this is   me speaking. And me.'
@@ -153,7 +156,7 @@
                                            sentence=Sentence(indice=0, start=0, end=38))),
                           ('http://example.com/me', None,
                            Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46))),])
+                                           sentence=Sentence(indice=1, start=39, end=46))),])
         # Two sources, not unique
         ner.add_ner_source(source2)
         named_entities = ner.process_text(text)
@@ -169,11 +172,12 @@
                                            sentence=Sentence(indice=0, start=0, end=38))),
                           ('http://example.com/me', None,
                            Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46))),
+                                           sentence=Sentence(indice=1, start=39, end=46))),
                           ('http://example2.com/me', None,
                            Token(word='me', start=43, end=45,
-                                           sentence=Sentence(indice=1, start=38, end=46)))])
+                                           sentence=Sentence(indice=1, start=39, end=46)))])
 
+    @unittest.skipUnless(NLTK_AVAILABLE, 'nltk is not available')
     def test_ner_process_preprocess(self):
         """ Test ner process """
         text = 'Hello Toto, this is   me speaking. And me.'
@@ -187,6 +191,7 @@
                                            Token(word='Toto', start=6, end=10,
                                                  sentence=Sentence(indice=0, start=0, end=34)))])
 
+    @unittest.skipUnless(NLTK_AVAILABLE, 'nltk is not available')
     def test_ner_process_add_preprocess(self):
         """ Test ner process """
         text = 'Hello Toto, this is   me speaking. And me.'
@@ -204,13 +209,14 @@
                                  sentence=Sentence(indice=0, start=0, end=34))),
                           ('http://example.com/me', None,
                            Token(word='me', start=39, end=41,
-                                 sentence=Sentence(indice=1, start=34, end=42)))])
+                                 sentence=Sentence(indice=1, start=35, end=42)))])
         ner.add_preprocessors(preprocessor)
         named_entities = ner.process_text(text)
         self.assertEqual(named_entities, [('http://example.com/toto', None,
                                            Token(word='Toto', start=6, end=10,
                                                  sentence=Sentence(indice=0, start=0, end=34)))])
 
+    @unittest.skipUnless(NLTK_AVAILABLE, 'nltk is not available')
     def test_ner_process_chained_word(self):
         """ Test ner process """
         text = 'Hello everyone me, this is   me speaking. And me.'
@@ -227,7 +233,8 @@
                            Token(word='me', start=29, end=31,
                                  sentence=Sentence(indice=0, start=0, end=41))),
                           ('http://example.com/me', None,
-                           Token(word='me', start=46, end=48, sentence=Sentence(indice=1, start=41, end=49)))])
+                           Token(word='me', start=46, end=48,
+                                 sentence=Sentence(indice=1, start=42, end=49)))])
 
 
 if __name__ == '__main__':
--- a/test/test_tokenizer.py	Fri Aug 01 12:25:26 2014 +0200
+++ b/test/test_tokenizer.py	Fri Aug 01 12:26:40 2014 +0200
@@ -28,6 +28,12 @@
 class TokenizerTest(unittest.TestCase):
     """ Test of tokenizer """
 
+    def test_find_sentences(self):
+        text = 'Hello everyone, this is   me speaking. And me.'
+        sentences = RichStringTokenizer.find_sentences(text)
+        self.assertEqual(sentences[0], Sentence(indice=0, start=0, end=38))
+        self.assertEqual(sentences[1], Sentence(indice=1, start=39, end=46))
+
     def test_richstringtokenizer(self):
         text = 'Hello everyone, this is   me speaking. And me.'
         tokenizer = RichStringTokenizer(text,
@@ -37,7 +43,7 @@
         self.assertEqual(len(tokens), 18)
         t1 = Token(word='Hello everyone this', start=0, end=20, sentence=Sentence(indice=0, start=0, end=38))
         self.assertEqual(tokens[0], t1)
-        t2 = Token(word='And', start=39, end=42, sentence=Sentence(indice=1, start=38, end=46))
+        t2 = Token(word='And', start=39, end=42, sentence=Sentence(indice=1, start=39, end=46))
         self.assertEqual(tokens[16], t2)
 
     def test_richstringtokenizer_loadtext(self):
@@ -68,11 +74,11 @@
                                         token_max_size=4)
         tokens = list(tokenizer)
         self.assertEqual(len(tokens), 21)
-        t1 = Token(word='And me', start=39, end=45, sentence=Sentence(indice=1, start=38, end=46))
+        t1 = Token(word='And me', start=39, end=45, sentence=Sentence(indice=1, start=39, end=46))
         self.assertEqual(tokens[18], t1)
 
     def test_richstringtokenizer_sentences(self):
-        text = 'Hello everyone, this is   me speaking. And me !Why not me ? Blup'
+        text = 'Hello everyone, this is   me speaking. And me ! Why not me ? Blup'
         tokenizer = RichStringTokenizer(text,
                                         token_min_size=1,
                                         token_max_size=4)
@@ -81,11 +87,11 @@
         self.assertEqual(text[sentences[0].start:sentences[0].end],
                          'Hello everyone, this is   me speaking.')
         self.assertEqual(text[sentences[1].start:sentences[1].end],
-                         ' And me !')
+                         'And me !')
         self.assertEqual(text[sentences[2].start:sentences[2].end],
                          'Why not me ?')
         self.assertEqual(text[sentences[3].start:sentences[3].end],
-                         ' Blup')
+                         'Blup')
 
 
 if __name__ == '__main__':
--- a/utils/tokenizer.py	Fri Aug 01 12:25:26 2014 +0200
+++ b/utils/tokenizer.py	Fri Aug 01 12:26:40 2014 +0200
@@ -5,6 +5,12 @@
 import collections
 import re
 
+try:
+    from nltk.tokenize.punkt import PunktSentenceTokenizer
+except ImportError:
+    NLTK_AVAILABLE = False
+else:
+    NLTK_AVAILABLE = True
 
 Token = collections.namedtuple('Token', ['word', 'start', 'end', 'sentence'])
 Sentence = collections.namedtuple('Sentence', ['indice', 'start', 'end'])
@@ -50,11 +56,15 @@
                 yield Token(normalized_word, _words[0].start(), _words[-1].end(), current_sentence)
             indice += 1
 
-    def find_sentences(self, text):
+    @staticmethod
+    def find_sentences(text):
         """ Find the sentences
         """
-        return [Sentence(ind, s.start(), s.end()) for ind, s in
-                enumerate(re.finditer(r'[^.!?]+(?:[.!?]|$)', text, re.UNICODE))]
+        if not NLTK_AVAILABLE:
+            raise RuntimeError("find_sentences requires NLTK to be installed")
+        sentences = PunktSentenceTokenizer().span_tokenize(text)
+        return [Sentence(ind, start, end)
+                for ind, (start, end) in enumerate(sentences)]
 
     def load_text(self, text):
         """ Load the text to be tokenized