[utils] Extract words splitting regular expression for easier overriding
author Denis Laxalde <denis.laxalde@logilab.fr>
Fri, 01 Aug 2014 12:25:26 +0200
changeset 469 9d9d8c4f2bab
parent 466 a507ff7a2ced
child 476 004224904efa
child 477 30af4456d4b0
[utils] Extract words splitting regular expression for easier overriding
utils/tokenizer.py
--- a/utils/tokenizer.py	Tue Jul 15 17:38:12 2014 +0000
+++ b/utils/tokenizer.py	Fri Aug 01 12:25:26 2014 +0200
@@ -26,13 +26,15 @@
         self.token_min_size = token_min_size
         self.token_max_size = token_max_size
 
+    words_re = r'[\w@-]+'
+
     def iter_tokens(self, text):
         """ Iterate tokens over a text
         """
         # Compute sentences
         sentences = self.find_sentences(text)
         # Compute words
-        words = list([m for m in re.finditer(r'[\w@-]+', text, re.UNICODE)])
+        words = [m for m in re.finditer(self.words_re, text, re.UNICODE)]
         indice = 0
         while indice < len(words):
             # Choose the current sentence of the first word