[named entities] Move tokenizer to utils and create a sources module for named entities, related to #187461
author vincent.michel@logilab.fr
Thu, 19 Dec 2013 14:45:26 +0000
changeset 373 77a3a4107f5c
parent 372 4ef3109eab7a
child 374 102c6331f3f6
[named entities] Move tokenizer to utils and create a sources module for named entities, related to #187461
named_entities/named_entities.py
named_entities/sources.py
named_entities/tokenizer.py
test/test_named_entities.py
test/test_tokenizer.py
utils/dataio.py
utils/tokenizer.py
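
After this changeset the tokenizer lives under nazca.utils and the NER sources under nazca.named_entities. A minimal import sketch, using only the paths that appear in the diffs below:

    from nazca.utils.tokenizer import RichStringTokenizer, Token, Sentence
    from nazca.utils.dataio import sparqlquery, rqlquery
    from nazca.named_entities.sources import (NerSourceLexicon,
                                              NerSourceRql,
                                              NerSourceSparql)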
--- a/named_entities/named_entities.py	Thu Dec 19 14:45:21 2013 +0000
+++ b/named_entities/named_entities.py	Thu Dec 19 14:45:26 2013 +0000
@@ -1,141 +1,13 @@
 # -*- coding: utf-8 -*-
 """ Core functions for Named Entities Recognition.
 """
-from nerdy.tokenizer import RichStringTokenizer, Token
-from nerdy.dataio import sparql_query, rql_url_query, rql_appid_query
-from nerdy.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
+from nazca.utils.tokenizer import RichStringTokenizer, Token
+from nazca.utils.dataio import sparqlquery
+from nazca.reference_data.stopwords import FRENCH_STOPWORDS, ENGLISH_STOPWORDS
 
 STOPWORDS = {'fr': FRENCH_STOPWORDS,
              'en': ENGLISH_STOPWORDS}
 
-# XXX Add SQL source ?
-# XXX NER preprocessor
-
-###############################################################################
-### NER SOURCE ################################################################
-###############################################################################
-class AbstractNerdySource(object):
-    """ High-level source for Named Entities Recognition
-    """
-
-    def __init__(self, query, endpoint, name=None, use_cache=True, preprocessors=None):
-        """ Initialise the class.
-        """
-        self.query = query
-        self.endpoint = endpoint
-        self.name = name
-        self.preprocessors = preprocessors or []
-        self.use_cache = use_cache
-        self._recognized_cache = {}
-
-    def add_preprocessors(self, preprocessor):
-        """ Add a preprocessor
-        """
-        self.preprocessors.append(preprocessor)
-
-    def recognize_token(self, token):
-        """ Recognize a token
-        """
-        # Applies source specific preprocessors
-        for preprocessor in self.preprocessors:
-            token = preprocessor(token)
-            if not token:
-                return []
-        if self.use_cache and token.word in self._recognized_cache:
-            return self._recognized_cache[token.word]
-        uris = self.query_word(token.word) if token.word else []
-        if self.use_cache:
-            self._recognized_cache[token.word] = uris
-        return uris
-
-    def query_word(self, word):
-        """ Query a word for a Named Entities Recognition process
-        """
-        raise NotImplementedError
-
-
-class NerdySourceLexical(AbstractNerdySource):
-    """ Source based on a (pre-computed) dictionnary of words (token, uri)
-    """
-    def __init__(self, lexicon, name=None, use_cache=True, preprocessors=None):
-        self.lexicon = lexicon
-        self.name = name
-        self.preprocessors = preprocessors or []
-        self.use_cache = use_cache
-        self._recognized_cache = {}
-
-    def query_word(self, word):
-        uri = self.lexicon.get(word)
-        return [uri,] if uri else []
-
-
-class NerdySourceLocalRql(AbstractNerdySource):
-    """ High-level source for Named Entities Recognition
-    Local RQL version
-    """
-
-    def __init__(self, query, session, name=None, use_cache=True, preprocessors=None):
-        """ Initialise the class.
-        """
-        self.query = query
-        self.session = session
-        self.name = name
-        self.preprocessors = preprocessors or []
-        self.use_cache = use_cache
-        self._recognized_cache = {}
-
-    def query_word(self, word):
-        """ Query a word for a Named Entities Recognition process
-        """
-        return [r[0] for r in self.session.execute(self.query, dict(word=word))]
-
-
-class NerdySourceAppidRql(AbstractNerdySource):
-    """ High-level source for Named Entities Recognition
-    Appid RQL version
-    """
-
-    def query_word(self, word):
-        """ Query a word for a Named Entities Recognition process
-        """
-        return [r[0] for r in rql_appid_query(self.query, self.endpoint, word=word)]
-
-
-class NerdySourceUrlRql(AbstractNerdySource):
-    """ High-level source for Named Entities Recognition
-    Url RQL version
-    """
-
-    def query_word(self, word):
-        """ Query a word for a Named Entities Recognition process
-        """
-        return [r[0] for r in rql_url_query(self.query % {'word': word}, self.endpoint)]
-
-
-class NerdySourceSparql(AbstractNerdySource):
-    """ High-level source for Named Entities Recognition
-    SPARQL version
-
-   >>> from nerdy.core import NerdySourceSparql
-   >>> ner_source = NerdySourceSparql('''SELECT ?uri
-                                         WHERE{
-                                         ?uri rdfs:label "%(word)s"@en}''',
-			                 'http://dbpedia.org/sparql')
-   >>> print ner_source.recognize_token('Victor Hugo')
-		... ['http://dbpedia.org/resource/Category:Victor_Hugo',
-		     'http://dbpedia.org/resource/Victor_Hugo',
-		     'http://dbpedia.org/class/yago/VictorHugo',
-		     'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
-		     'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
-		     'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
-
-    """
-
-    def query_word(self, word):
-        """ Query a word for a Named Entities Recognition process
-        """
-        return [r['uri']['value'] for r in sparql_query(self.query % {'word': word}, self.endpoint)]
-
 
 ###############################################################################
 ### NER PREPROCESSORS #########################################################
@@ -261,7 +133,7 @@
                 if seen_uris[uri]:
                     filtered_named_entities.append((uri, p, t))
             else:
-                results = sparql_query(self.query % {'uri': uri}, self.endpoint)
+                results = sparqlquery(self.endpoint, self.query % {'uri': uri})
                 types = set([r['type']['value'] for r in results])
                 if not len(types.intersection(self.accepted_types)):
                     seen_uris[uri] = False
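
The hunk above also reflects the new dataio calling convention, with the endpoint passed before the query. A small sketch of calling sparqlquery directly; the endpoint and query are illustrative, and the result shape with r['type']['value'] follows the filter code above:

    from nazca.utils.dataio import sparqlquery

    # endpoint first, then the query string; JSON-style bindings are returned
    results = sparqlquery('http://dbpedia.org/sparql',
                          'SELECT ?type WHERE { <http://dbpedia.org/resource/Victor_Hugo> rdf:type ?type }')
    types = set(r['type']['value'] for r in results)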
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/named_entities/sources.py	Thu Dec 19 14:45:26 2013 +0000
@@ -0,0 +1,124 @@
+# -*- coding: utf-8 -*-
+""" Core functions for Named Entities Recognition.
+"""
+from nazca.utils.tokenizer import Token
+from nazca.utils.dataio import sparqlquery, rqlquery
+
+
+###############################################################################
+### NER SOURCE ################################################################
+###############################################################################
+class AbstractNerSource(object):
+    """ High-level source for Named Entities Recognition
+    """
+
+    def __init__(self, endpoint, query, name=None, use_cache=True, preprocessors=None):
+        """ Initialise the class.
+        """
+        self.endpoint = endpoint
+        self.query = query
+        self.name = name
+        self.preprocessors = preprocessors or []
+        self.use_cache = use_cache
+        self._recognized_cache = {}
+
+    def add_preprocessors(self, preprocessor):
+        """ Add a preprocessor
+        """
+        self.preprocessors.append(preprocessor)
+
+    def recognize_token(self, token):
+        """ Recognize a token
+        """
+        # Applies source specific preprocessors
+        for preprocessor in self.preprocessors:
+            token = preprocessor(token)
+            if not token:
+                return []
+        if self.use_cache and token.word in self._recognized_cache:
+            return self._recognized_cache[token.word]
+        uris = self.query_word(token.word) if token.word else []
+        if self.use_cache:
+            self._recognized_cache[token.word] = uris
+        return uris
+
+    def query_word(self, word):
+        """ Query a word for a Named Entities Recognition process
+        """
+        raise NotImplementedError
+
+
+class NerSourceLexicon(AbstractNerSource):
+    """ Source based on a (pre-computed) dictionnary of words (token, uri)
+    """
+    def __init__(self, lexicon, name=None, use_cache=True, preprocessors=None):
+        self.lexicon = lexicon
+        self.name = name
+        self.preprocessors = preprocessors or []
+        self.use_cache = use_cache
+        self._recognized_cache = {}
+
+    def query_word(self, word):
+        uri = self.lexicon.get(word)
+        return [uri,] if uri else []
+
+
+class NerSourceLocalRql(AbstractNerSource):
+    """ High-level source for Named Entities Recognition
+    Local RQL version
+    """
+
+    def __init__(self, session, query, name=None, use_cache=True, preprocessors=None):
+        """ Initialise the class.
+        """
+        self.query = query
+        self.session = session
+        self.name = name
+        self.preprocessors = preprocessors or []
+        self.use_cache = use_cache
+        self._recognized_cache = {}
+
+    def query_word(self, word):
+        """ Query a word for a Named Entities Recognition process
+        """
+        return [r[0] for r in self.session.execute(self.query, dict(word=word))]
+
+
+class NerSourceRql(AbstractNerSource):
+    """ High-level source for Named Entities Recognition
+    Url version (distant source)
+    """
+
+    def query_word(self, word):
+        """ Query a word for a Named Entities Recognition process
+        """
+        if self.endpoint.startswith('http://'):
+            # url
+            return [r[0] for r in rqlquery(self.endpoint, self.query % {'word': word})]
+        else:
+            return [r[0] for r in rqlquery(self.endpoint, self.query, word=word)]
+
+
+class NerSourceSparql(AbstractNerSource):
+    """ High-level source for Named Entities Recognition
+    SPARQL version
+
+   >>> from nazca.named_entities.sources import NerSourceSparql
+   >>> ner_source = NerSourceSparql('http://dbpedia.org/sparql',
+   ...                              '''SELECT ?uri
+   ...                                 WHERE{
+   ...                                 ?uri rdfs:label "%(word)s"@en}''')
+   >>> print ner_source.query_word('Victor Hugo')
+   ['http://dbpedia.org/resource/Category:Victor_Hugo',
+    'http://dbpedia.org/resource/Victor_Hugo',
+    'http://dbpedia.org/class/yago/VictorHugo',
+    'http://dbpedia.org/class/yago/VictorHugo(ParisM%C3%A9tro)',
+    'http://sw.opencyc.org/2008/06/10/concept/en/VictorHugo',
+    'http://sw.opencyc.org/2008/06/10/concept/Mx4rve1ZXJwpEbGdrcN5Y29ycA']
+
+    """
+
+    def query_word(self, word):
+        """ Query a word for a Named Entities Recognition process
+        """
+        return [r['uri']['value'] for r in sparqlquery(self.endpoint, self.query % {'word': word})]
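
A short usage sketch for the new sources module; the lexicon entries mirror the tests further below and the DBpedia endpoint is only illustrative:

    from nazca.named_entities.sources import NerSourceLexicon, NerSourceSparql

    # Lexicon source: a pre-computed mapping from token to URI
    lexicon_source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
                                       'me': 'http://example.com/me'})
    print lexicon_source.query_word('me')        # ['http://example.com/me']

    # SPARQL source: the endpoint now comes before the query
    sparql_source = NerSourceSparql('http://dbpedia.org/sparql',
                                    '''SELECT ?uri
                                       WHERE { ?uri rdfs:label "%(word)s"@en }''')
    uris = sparql_source.query_word('Victor Hugo')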
--- a/named_entities/tokenizer.py	Thu Dec 19 14:45:21 2013 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,66 +0,0 @@
-# -*- coding: utf-8 -*-
-""" Tokenizer for sentences/words segmentation.
-"""
-import itertools
-import collections
-import re
-
-
-Token = collections.namedtuple('Token', ['word', 'start', 'end', 'sentence'])
-Sentence = collections.namedtuple('Sentence', ['indice', 'start', 'end'])
-
-
-class RichStringTokenizer(object):
-    """Tokenizer for Yams' RichString content.
-
-    The tokenizer uses a variable-length sliding window, i.e. a sliding
-    window yielding tokens of N words.
-    """
-
-    def __init__(self, text, token_min_size=1, token_max_size=3):
-        """
-        :token_min_size: minimum number of words required to be a valid token
-        :token_max_size: minimum number of words required to be a valid token
-        """
-        self.text = text
-        self.token_min_size = token_min_size
-        self.token_max_size = token_max_size
-
-    def iter_tokens(self, text):
-        """ Iterate tokens over a text
-        """
-        # Compute sentences
-        sentences = self.find_sentences(text)
-        # Compute words
-        words = list([m for m in re.finditer(r'[\w@-]+', text, re.UNICODE)])
-        indice = 0
-        while indice < len(words):
-            # Choose the current sentence of the first word
-            current_sentence = [s for s in sentences if s.start<=words[indice].start()][-1]
-            # Sliding windows over the different words for each sentence
-            remaining = len(words) - indice
-            for length in range(min(self.token_max_size, remaining), self.token_min_size-1, -1):
-                _words = words[indice:indice+length]
-                if _words[-1].start() > current_sentence.end:
-                    # The last word in not in the same sentence anymore, split
-                    continue
-                normalized_word = ' '.join([w.group() for w in _words]).strip()
-                yield Token(normalized_word, _words[0].start(), _words[-1].end(), current_sentence)
-            indice += 1
-
-    def find_sentences(self, text):
-        """ Find the sentences
-        """
-        return [Sentence(ind, s.start(), s.end()) for ind, s in
-                enumerate(re.finditer(r'[^.!?]+(?:[.!?]|$)', text, re.UNICODE))]
-
-    def load_text(self, text):
-        """ Load the text to be tokenized
-        """
-        self.text = text
-
-    def __iter__(self):
-        """ Iterator over the text given in the object instantiation
-        """
-        for t in self.iter_tokens(self.text):
-            yield t
--- a/test/test_named_entities.py	Thu Dec 19 14:45:21 2013 +0000
+++ b/test/test_named_entities.py	Thu Dec 19 14:45:26 2013 +0000
@@ -17,18 +17,21 @@
 # with this program. If not, see <http://www.gnu.org/licenses/>.
 import unittest2
 
-from nerdy import core
-from nerdy.tokenizer import Token, Sentence
+from nazca.named_entities.sources import (NerSourceLexicon,
+                                          NerSourceSparql,
+                                          NerSourceRql)
+from nazca.named_entities import named_entities as core
+from nazca.utils.tokenizer import Token, Sentence
 
 
 class CoreTest(unittest2.TestCase):
     """ Test of core """
 
-    def test_lexical_source(self):
-        """ Test lexical source """
+    def test_lexicon_source(self):
+        """ Test lexicon source """
         lexicon = {'everyone': 'http://example.com/everyone',
                    'me': 'http://example.com/me'}
-        source = core.NerdySourceLexical(lexicon)
+        source = NerSourceLexicon(lexicon)
         self.assertEqual(source.query_word('me'), ['http://example.com/me',])
         self.assertEqual(source.query_word('everyone'), ['http://example.com/everyone',])
         self.assertEqual(source.query_word('me everyone'), [])
@@ -41,17 +44,17 @@
 
     def test_rql_source(self):
         """ Test rql source """
-        source = core.NerdySourceUrlRql('Any U LIMIT 1 WHERE X cwuri U, X name "%(word)s"',
-                                       'http://www.cubicweb.org')
+        source = NerSourceRql('http://www.cubicweb.org',
+                              'Any U LIMIT 1 WHERE X cwuri U, X name "%(word)s"')
         self.assertEqual(source.query_word('apycot'), [u'http://www.cubicweb.org/1310453',])
 
     def test_sparql_source(self):
         """ Test sparql source """
-        source = core.NerdySourceSparql(u'''SELECT ?uri
-                                            WHERE{
-                                            ?uri rdfs:label "Python"@en .
-                                            ?uri rdf:type ?type}''',
-                                        u'http://dbpedia.org/sparql')
+        source = NerSourceSparql(u'http://dbpedia.org/sparql',
+                                 u'''SELECT ?uri
+                                     WHERE{
+                                     ?uri rdfs:label "Python"@en .
+                                     ?uri rdf:type ?type}''')
         self.assertEqual(source.query_word('cubicweb'),
                          [u'http://sw.opencyc.org/2008/06/10/concept/en/Python_ProgrammingLanguage',
                           u'http://sw.opencyc.org/2008/06/10/concept/Mx4r74UIARqkEdac2QACs0uFOQ'])
@@ -59,8 +62,8 @@
     def test_nerdy_process(self):
         """ Test nerdy process """
         text = 'Hello everyone, this is   me speaking. And me.'
-        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'me': 'http://example.com/me'})
+        source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
+                                   'me': 'http://example.com/me'})
         nerdy = core.NerdyProcess((source,))
         named_entities = nerdy.process_text(text)
         self.assertEqual(named_entities,
@@ -77,9 +80,9 @@
     def test_nerdy_process_multisources(self):
         """ Test nerdy process """
         text = 'Hello everyone, this is   me speaking. And me.'
-        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'me': 'http://example.com/me'})
-        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
+        source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
+                                    'me': 'http://example.com/me'})
+        source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
         # Two sources, not unique
         nerdy = core.NerdyProcess((source1, source2))
         named_entities = nerdy.process_text(text)
@@ -129,9 +132,9 @@
     def test_nerdy_process_add_sources(self):
         """ Test nerdy process """
         text = 'Hello everyone, this is   me speaking. And me.'
-        source1 = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'me': 'http://example.com/me'})
-        source2 = core.NerdySourceLexical({'me': 'http://example2.com/me'})
+        source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
+                                    'me': 'http://example.com/me'})
+        source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
         nerdy = core.NerdyProcess((source1,))
         named_entities = nerdy.process_text(text)
         self.assertEqual(named_entities,
@@ -167,8 +170,8 @@
     def test_nerdy_process_preprocess(self):
         """ Test nerdy process """
         text = 'Hello Toto, this is   me speaking. And me.'
-        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
-                                          'me': 'http://example.com/me'})
+        source = NerSourceLexicon({'Toto': 'http://example.com/toto',
+                                   'me': 'http://example.com/me'})
         preprocessor = core.NerdyStopwordsFilterPreprocessor()
         nerdy = core.NerdyProcess((source,),
                                   preprocessors=(preprocessor,))
@@ -180,8 +183,8 @@
     def test_nerdy_process_add_preprocess(self):
         """ Test nerdy process """
         text = 'Hello Toto, this is   me speaking. And me.'
-        source = core.NerdySourceLexical({'Toto': 'http://example.com/toto',
-                                          'me': 'http://example.com/me'})
+        source = NerSourceLexicon({'Toto': 'http://example.com/toto',
+                                   'me': 'http://example.com/me'})
         preprocessor = core.NerdyStopwordsFilterPreprocessor()
         nerdy = core.NerdyProcess((source,),)
         named_entities = nerdy.process_text(text)
@@ -204,9 +207,9 @@
     def test_nerdy_process_chained_word(self):
         """ Test nerdy process """
         text = 'Hello everyone me, this is   me speaking. And me.'
-        source = core.NerdySourceLexical({'everyone': 'http://example.com/everyone',
-                                          'everyone me': 'http://example.com/everyone_me',
-                                          'me': 'http://example.com/me'})
+        source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
+                                   'everyone me': 'http://example.com/everyone_me',
+                                   'me': 'http://example.com/me'})
         nerdy = core.NerdyProcess((source,))
         named_entities = nerdy.process_text(text)
         self.assertEqual(named_entities,
--- a/test/test_tokenizer.py	Thu Dec 19 14:45:21 2013 +0000
+++ b/test/test_tokenizer.py	Thu Dec 19 14:45:26 2013 +0000
@@ -17,7 +17,7 @@
 # with this program. If not, see <http://www.gnu.org/licenses/>.
 import unittest2
 
-from nerdy.tokenizer import RichStringTokenizer, Token, Sentence
+from nazca.utils.tokenizer import RichStringTokenizer, Token, Sentence
 
 
 class TokenizerTest(unittest2.TestCase):
--- a/utils/dataio.py	Thu Dec 19 14:45:21 2013 +0000
+++ b/utils/dataio.py	Thu Dec 19 14:45:26 2013 +0000
@@ -103,7 +103,6 @@
     if not SPARQL_ENABLED:
         raise ImportError("You have to install SPARQLWrapper and JSON modules to"
                           "used this function")
-
     sparql = SPARQLWrapper(endpoint)
     sparql.setQuery(query)
     sparql.setReturnFormat(JSON)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/tokenizer.py	Thu Dec 19 14:45:26 2013 +0000
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+""" Tokenizer for sentences/words segmentation.
+"""
+import itertools
+import collections
+import re
+
+
+Token = collections.namedtuple('Token', ['word', 'start', 'end', 'sentence'])
+Sentence = collections.namedtuple('Sentence', ['indice', 'start', 'end'])
+
+
+class RichStringTokenizer(object):
+    """Tokenizer for Yams' RichString content.
+
+    The tokenizer uses a variable-length sliding window, i.e. a sliding
+    window yielding tokens of N words.
+    """
+
+    def __init__(self, text, token_min_size=1, token_max_size=3):
+        """
+        :token_min_size: minimum number of words required to be a valid token
+        :token_max_size: maximum number of words allowed in a valid token
+        """
+        self.text = text
+        self.token_min_size = token_min_size
+        self.token_max_size = token_max_size
+
+    def iter_tokens(self, text):
+        """ Iterate tokens over a text
+        """
+        # Compute sentences
+        sentences = self.find_sentences(text)
+        # Compute words
+        words = list([m for m in re.finditer(r'[\w@-]+', text, re.UNICODE)])
+        indice = 0
+        while indice < len(words):
+            # Choose the current sentence of the first word
+            current_sentence = [s for s in sentences if s.start<=words[indice].start()][-1]
+            # Sliding windows over the different words for each sentence
+            remaining = len(words) - indice
+            for length in range(min(self.token_max_size, remaining), self.token_min_size-1, -1):
+                _words = words[indice:indice+length]
+                if _words[-1].start() > current_sentence.end:
+                    # The last word is not in the same sentence anymore, split
+                    continue
+                normalized_word = ' '.join([w.group() for w in _words]).strip()
+                yield Token(normalized_word, _words[0].start(), _words[-1].end(), current_sentence)
+            indice += 1
+
+    def find_sentences(self, text):
+        """ Find the sentences
+        """
+        return [Sentence(ind, s.start(), s.end()) for ind, s in
+                enumerate(re.finditer(r'[^.!?]+(?:[.!?]|$)', text, re.UNICODE))]
+
+    def load_text(self, text):
+        """ Load the text to be tokenized
+        """
+        self.text = text
+
+    def __iter__(self):
+        """ Iterator over the text given in the object instantiation
+        """
+        for t in self.iter_tokens(self.text):
+            yield t
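
Finally, a minimal sketch of the relocated tokenizer in use; the sample text is illustrative:

    from nazca.utils.tokenizer import RichStringTokenizer

    tokenizer = RichStringTokenizer('Hello everyone, this is me speaking. And me.',
                                    token_min_size=1, token_max_size=3)
    for token in tokenizer:
        # each Token carries the normalized word, its start/end offsets and its Sentence
        print token.word, token.start, token.end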