support sklearn 0.10->0.14 versions for minhashing nazca-centos-version-0.7.0-1 nazca-debian-version-0.7.0-1 nazca-version-0.7.0
author Adrien Di Mascio <Adrien.DiMascio@logilab.fr>
Sun, 23 Nov 2014 19:26:09 +0100
changeset 484 36e8adaa0b18
parent 483 57db633dd297
child 485 fcc69759856e
support sklearn 0.10->0.14 versions for minhashing
utils/minhashing.py
--- a/utils/minhashing.py	Sun Nov 23 19:02:07 2014 +0100
+++ b/utils/minhashing.py	Sun Nov 23 19:26:09 2014 +0100
@@ -44,8 +44,20 @@
 def count_vectorizer_func(sentences, min_n, max_n):
     """ Perform a tokenization using scikit learn
     """
-    from sklearn.feature_extraction.text import CountVectorizer
-    count_vec = CountVectorizer(min_n=min_n, max_n=max_n)
+    import sklearn
+    import sklearn.feature_extraction.text as sklt
+    # Only (major, minor) is compared: the n-gram API differs between releases
+    skversion = tuple(int(x) for x in sklearn.__version__.split('.')[:2])
+    if skversion < (0, 11):
+        # sklearn 0.10: the n-gram bounds are given to a separate analyzer
+        word_ngram = sklt.WordNGramAnalyzer(min_n=min_n, max_n=max_n)
+        count_vec = sklt.CountVectorizer(analyzer=word_ngram)
+    elif skversion < (0, 14):
+        # sklearn 0.11 -> 0.13: min_n / max_n keyword arguments
+        count_vec = sklt.CountVectorizer(min_n=min_n, max_n=max_n)
+    else:
+        # sklearn >= 0.14: a single ngram_range tuple
+        count_vec = sklt.CountVectorizer(ngram_range=(min_n, max_n))
     # Transform and convert to lil to get rows
     data = count_vec.fit_transform(sentences).tolil()
     return [list(l) for l in data.rows], data.shape