[test] Add tests
author vincent.michel@logilab.fr
date Fri, 31 May 2013 12:47:31 +0200
changeset 328 efeed3940aa0
parent 327 eebaf821d5df
child 329 2cc2ba9a5532
test/test_core.py
test/test_dataio.py
test/test_filter.py
test/test_preprocessor.py
test/test_tokenizer.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_core.py	Fri May 31 12:47:31 2013 +0200
@@ -0,0 +1,207 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+import unittest2
+
+from nerdy import core
+from nerdy.tokenizer import Token, Sentence
+
+
+class CoreTest(unittest2.TestCase):
+    """ Test of core """
+
+    def test_lexical_source(self):
+        """ Test lexical source """
+        lexicon = {'everyone': 'http://example.com/everyone',
+                   'me': 'http://example.com/me'}
+        source = core.NerdySourceLexical(lexicon)
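+        # Only exact matches against the lexicon keys return a URI; any other string yields an empty list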
+        self.assertEqual(source.query_word('me'), ['http://example.com/me',])
+        self.assertEqual(source.query_word('everyone'), ['http://example.com/everyone',])
+        self.assertEqual(source.query_word('me everyone'), [])
+        self.assertEqual(source.query_word('toto'), [])
+        # Token
+        token = Token('me', 0, 2, None)
+        self.assertEqual(source.recognize_token(token), ['http://example.com/me',])
+        token = Token('ma', 0, 2, None)
+        self.assertEqual(source.recognize_token(token), [])
+
+    def test_rql_source(self):
+        """ Test rql source """
+        source = core.NerdySourceUrlRql('Any U LIMIT 1 WHERE X cwuri U, X name "%(word)s"',
+                                       'http://www.cubicweb.org')
+        self.assertEqual(source.query_word('apycot'), [u'http://www.cubicweb.org/1310453',])
+
+    def test_sparql_source(self):
+        """ Test sparql source """
+        source = core.NerdySourceSparql(u'''SELECT ?uri
+                                            WHERE{
+                                            ?uri rdfs:label "Python"@en .
+                                            ?uri rdf:type ?type}''',
+                                        u'http://dbpedia.org/sparql')
+        self.assertEqual(source.query_word('cubicweb'),
+                         [u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
+                          u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'])
+
+    def test_nerdy_process(self):
+        """ Test nerdy process """
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
+                                          'me': 'http://example.com/me'})
+        nerdy = core.NerdyProcess((source,))
+        named_entities = nerdy.process_text(text)
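+        # One (uri, None, Token) triple is returned per recognized entity, in order of appearance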
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+
+    def test_nerdy_process_multisources(self):
+        """ Test nerdy process """
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
+                                          'me': 'http://example.com/me'})
+        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
+        # Two sources, not unique
+        nerdy = core.NerdyProcess((source1, source2))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+        # Two sources, unique
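+        # (with unique=True only the first source that recognizes a token contributes its URI)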
+        nerdy = core.NerdyProcess((source1, source2), unique=True)
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+        # Two sources inversed, unique
+        nerdy = core.NerdyProcess((source2, source1), unique=True)
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+
+    def test_nerdy_process_add_sources(self):
+        """ Test nerdy process """
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
+                                          'me': 'http://example.com/me'})
+        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
+        nerdy = core.NerdyProcess((source1,))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46))),])
+        # Two sources, not unique
+        nerdy.add_ner_source(source2)
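+        # After adding the second source, each 'me' occurrence is reported once per source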
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+
+    def test_nerdy_process_preprocess(self):
+        """ Test nerdy process """
+        text = 'Hello Toto, this is   me speaking. And me.'
+        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
+                                          'me': 'http://example.com/me'})
+        preprocessor = core.NerdyStopwordsFilterPreprocessor()
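+        # 'me' is a stopword and is filtered out, so only 'Toto' should be recognized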
+        nerdy = core.NerdyProcess((source,),
+                                  preprocessors=(preprocessor,))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities, [('http://example.com/toto', None,
+                                           Token(word='Toto', start=6, end=10,
+                                                 sentence=Sentence(indice=0, start=0, end=34)))])
+
+    def test_nerdy_process_add_preprocess(self):
+        """ Test nerdy process """
+        text = 'Hello Toto, this is   me speaking. And me.'
+        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
+                                          'me': 'http://example.com/me'})
+        preprocessor = core.NerdyStopwordsFilterPreprocessor()
+        nerdy = core.NerdyProcess((source,))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/toto', None,
+                           Token(word='Toto', start=6, end=10,
+                                 sentence=Sentence(indice=0, start=0, end=34))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=22, end=24,
+                                 sentence=Sentence(indice=0, start=0, end=34))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=39, end=41,
+                                 sentence=Sentence(indice=1, start=34, end=42)))])
+        nerdy.add_preprocessors(preprocessor)
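+        # Once the stopwords preprocessor is added, the 'me' tokens are no longer reported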
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities, [('http://example.com/toto', None,
+                                           Token(word='Toto', start=6, end=10,
+                                                 sentence=Sentence(indice=0, start=0, end=34)))])
+
+
+if __name__ == '__main__':
+    unittest2.main()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_dataio.py	Fri May 31 12:47:31 2013 +0200
@@ -0,0 +1,60 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+import unittest2
+
+from nerdy import dataio, core
+
+
+class DataioTest(unittest2.TestCase):
+    """ Test of dataio """
+
+    def test_sparql_query(self):
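+        # Note: this queries the live DBpedia SPARQL endpoint, so the expected bindings depend on remote data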
+        results = dataio.sparql_query(query=u'''SELECT ?uri
+                                                WHERE{
+                                                ?uri rdfs:label "Python"@en .
+                                                ?uri rdf:type ?type}''',
+                                      endpoint=u'http://dbpedia.org/sparql')
+        truth = [{u'uri':
+                  {u'type': u'uri',
+                   u'value': u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage'}},
+                 {u'uri':
+                  {u'type': u'uri',
+                   u'value': u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'}}]
+        self.assertEqual(results, truth)
+
+    def test_rql_url_query(self):
+        results = dataio.rql_url_query('Any U LIMIT 1 WHERE X cwuri U, X name "apycot"',
+                                       'http://www.cubicweb.org')
+        self.assertEqual(results, [[u'http://www.cubicweb.org/1310453']])
+
+    def test_prettyprint(self):
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
+                                          'me': 'http://example.com/me'})
+        nerdy = core.NerdyProcess((source,))
+        named_entities = nerdy.process_text(text)
+        html = dataio.NerdyHTMLPrettyPrint().pprint_text(text, named_entities)
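+        # Each recognized entity is wrapped in an <a> tag pointing to its URI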
+        self.assertEqual(html, (u'Hello <a href="http://example.com/everyone">everyone</a>, '
+                                u'this is   <a href="http://example.com/me">me</a> speaking. '
+                                u'And <a href="http://example.com/me">me</a>.'))
+
+
+
+if __name__ == '__main__':
+    unittest2.main()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_filter.py	Fri May 31 12:47:31 2013 +0200
@@ -0,0 +1,68 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+import unittest2
+
+from nerdy import core
+from nerdy.tokenizer import Token, Sentence
+
+
+class FilterTest(unittest2.TestCase):
+    """ Test of filters """
+
+    def test_occurence_filter_min_occ(self):
+        """ Test occurence filter """
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
+                                          'me': 'http://example.com/me'})
+        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
+        _filter = core.NerdyOccurenceFilter(min_occ=2)
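+        # min_occ=2 keeps only entities recognized at least twice: 'everyone' occurs once and is dropped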
+        nerdy = core.NerdyProcess((source1, source2), filters=(_filter,))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=26, end=28,
+                                           sentence=Sentence(indice=0, start=0, end=38))),
+                          ('http://example.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46))),
+                          ('http://example2.com/me', None,
+                           Token(word='me', start=43, end=45,
+                                           sentence=Sentence(indice=1, start=38, end=46)))])
+
+    def test_occurence_filter_max_occ(self):
+        """ Test occurence filter """
+        text = 'Hello everyone, this is   me speaking. And me.'
+        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
+                                          'me': 'http://example.com/me'})
+        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
+        _filter = core.NerdyOccurenceFilter(max_occ=1)
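+        # max_occ=1 keeps only entities recognized at most once: the 'me' URIs occur twice and are dropped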
+        nerdy = core.NerdyProcess((source1, source2), filters=(_filter,))
+        named_entities = nerdy.process_text(text)
+        self.assertEqual(named_entities,
+                         [('http://example.com/everyone', None,
+                           Token(word='everyone', start=6, end=14,
+                                           sentence=Sentence(indice=0, start=0, end=38))),])
+
+
+
+if __name__ == '__main__':
+    unittest2.main()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_preprocessor.py	Fri May 31 12:47:31 2013 +0200
@@ -0,0 +1,97 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+import unittest2
+
+from nerdy import core, tokenizer
+
+
+class PreprocessorTest(unittest2.TestCase):
+    """ Test of preprocessors """
+
+    def test_lowercasefilter(self):
+        preprocessor = core.NerdyLowerCaseFilterPreprocessor()
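+        # Tokens made only of lowercase words are discarded (None); tokens containing an uppercase letter pass through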
+        token = tokenizer.Token('toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+        token = tokenizer.Token('toto Tata', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        token = tokenizer.Token('toto tata', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+
+    def test_wordsizefilter(self):
+        preprocessor = core.NerdyWordSizeFilterPreprocessor()
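+        # min_size / max_size bound the word length in characters: 'to' is too short for min_size=3, 'toto' too long for max_size=3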
+        token = tokenizer.Token('toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        preprocessor = core.NerdyWordSizeFilterPreprocessor(min_size=3)
+        token = tokenizer.Token('toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        token = tokenizer.Token('to', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+        preprocessor = core.NerdyWordSizeFilterPreprocessor(max_size=3)
+        token = tokenizer.Token('toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+        token = tokenizer.Token('to', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+
+    def test_lowerfirstword(self):
+        preprocessor = core.NerdyLowerFirstWordPreprocessor()
+        sentence = tokenizer.Sentence(0, 0, 20)
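+        # Only tokens at the very start of the sentence are candidates for lowercasing; 'Us' is lowered
+        # while 'Toto' is kept, apparently because only common dictionary words are lowercased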
+        # Start of the sentence
+        token1 = tokenizer.Token('Toto tata', 0, 4, sentence)
+        token2 = tokenizer.Token('Toto tata', 0, 4, sentence)
+        self.assertEqual(preprocessor(token1), token2)
+        token1 = tokenizer.Token('Us tata', 0, 4, sentence)
+        token2 = tokenizer.Token('us tata', 0, 4, sentence)
+        self.assertEqual(preprocessor(token1), token2)
+        # Not start of the sentence
+        token1 = tokenizer.Token('Toto tata', 12, 16, sentence)
+        token2 = tokenizer.Token('Toto tata', 12, 16, sentence)
+        self.assertEqual(preprocessor(token1), token2)
+        token1 = tokenizer.Token('Us tata', 12, 16, sentence)
+        token2 = tokenizer.Token('Us tata', 12, 16, sentence)
+        self.assertEqual(preprocessor(token1), token2)
+
+    def test_stopwordsfilter(self):
+        preprocessor = core.NerdyStopwordsFilterPreprocessor()
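+        # A token made of a single stopword ('Us') is discarded; multi-word tokens are kept by default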
+        token = tokenizer.Token('Toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        token = tokenizer.Token('Us', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+        token = tokenizer.Token('Us there', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        # Split words
+        preprocessor = core.NerdyStopwordsFilterPreprocessor(split_words=True)
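+        # With split_words=True a token is discarded only when all of its words are stopwords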
+        token = tokenizer.Token('Us there', 0, 4, None)
+        self.assertEqual(preprocessor(token), None)
+        token = tokenizer.Token('Us there toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+
+    def test_hashtag(self):
+        preprocessor = core.NerdyHashTagPreprocessor()
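+        # The leading '@' is stripped and underscores become spaces, so '@Barack_Obama' matches 'Barack Obama'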
+        token = tokenizer.Token('Toto', 0, 4, None)
+        self.assertEqual(preprocessor(token), token)
+        token1 = tokenizer.Token('@BarackObama', 0, 4, None)
+        token2 = tokenizer.Token('BarackObama', 0, 4, None)
+        self.assertEqual(preprocessor(token1), token2)
+        token1 = tokenizer.Token('@Barack_Obama', 0, 4, None)
+        token2 = tokenizer.Token('Barack Obama', 0, 4, None)
+        self.assertEqual(preprocessor(token1), token2)
+
+
+if __name__ == '__main__':
+    unittest2.main()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_tokenizer.py	Fri May 31 12:47:31 2013 +0200
@@ -0,0 +1,88 @@
+# -*- coding:utf-8 -*-
+#
+# copyright 2013 LOGILAB S.A. (Paris, FRANCE), all rights reserved.
+# contact http://www.logilab.fr -- mailto:contact@logilab.fr
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation, either version 2.1 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License along
+# with this program. If not, see <http://www.gnu.org/licenses/>.
+import unittest2
+
+from nerdy.tokenizer import RichStringTokenizer, Token, Sentence
+
+
+class TokenizerTest(unittest2.TestCase):
+    """ Test of tokenizer """
+
+    def test_richstringtokenizer(self):
+        text = 'Hello everyone, this is   me speaking. And me.'
+        tokenizer = RichStringTokenizer(text,
+                                        token_min_size=1,
+                                        token_max_size=3)
+        tokens = list(tokenizer)
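+        # 6 words in the first sentence and 2 in the second give (6+5+4) + (2+1) = 18 windows of 1 to 3 words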
+        self.assertEqual(len(tokens), 18)
+        t1 = Token(word='Hello everyone this', start=0, end=20, sentence=Sentence(indice=0, start=0, end=38))
+        self.assertEqual(tokens[0], t1)
+        t2 = Token(word='And', start=39, end=42, sentence=Sentence(indice=1, start=38, end=46))
+        self.assertEqual(tokens[16], t2)
+
+    def test_richstringtokenizer_loadtext(self):
+        text = 'Hello everyone, this is   me speaking. And me.'
+        tokenizer = RichStringTokenizer(text,
+                                        token_min_size=1,
+                                        token_max_size=3)
+        tokens = list(tokenizer)
+        self.assertEqual(len(tokens), 18)
+        tokenizer.load_text('Hello everyone')
+        tokens = list(tokenizer)
+        self.assertEqual(len(tokens), 3)
+
+    def test_richstringtokenizer_minsize(self):
+        text = 'Hello everyone, this is   me speaking. And me.'
+        tokenizer = RichStringTokenizer(text,
+                                        token_min_size=2,
+                                        token_max_size=3)
+        tokens = list(tokenizer)
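+        # Dropping the single-word windows leaves (5+4) + 1 = 10 tokens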
+        self.assertEqual(len(tokens), 10)
+        t1 =  Token(word='me speaking', start=26, end=37, sentence=Sentence(indice=0, start=0, end=38))
+        self.assertEqual(tokens[8], t1)
+
+    def test_richstringtokenizer_maxsize(self):
+        text = 'Hello everyone, this is   me speaking. And me.'
+        tokenizer = RichStringTokenizer(text,
+                                        token_min_size=1,
+                                        token_max_size=4)
+        tokens = list(tokenizer)
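+        # Allowing windows of up to 4 words gives (6+5+4+3) + (2+1) = 21 tokens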
+        self.assertEqual(len(tokens), 21)
+        t1 = Token(word='And me', start=39, end=45, sentence=Sentence(indice=1, start=38, end=46))
+        self.assertEqual(tokens[18], t1)
+
+    def test_richstringtokenizer_sentences(self):
+        text = 'Hello everyone, this is   me speaking. And me !Why not me ? Blup'
+        tokenizer = RichStringTokenizer(text,
+                                        token_min_size=1,
+                                        token_max_size=4)
+        sentences = tokenizer.find_sentences(text)
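+        # Sentences are split on '.', '!' and '?', with each delimiter kept in its sentence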
+        self.assertEqual(len(sentences), 4)
+        self.assertEqual(text[sentences[0].start:sentences[0].end],
+                         'Hello everyone, this is   me speaking.')
+        self.assertEqual(text[sentences[1].start:sentences[1].end],
+                         ' And me !')
+        self.assertEqual(text[sentences[2].start:sentences[2].end],
+                         'Why not me ?')
+        self.assertEqual(text[sentences[3].start:sentences[3].end],
+                         ' Blup')
+
+
+if __name__ == '__main__':
+    unittest2.main()
+