# -*- coding: utf-8 -*-
from collections.abc import Mapping
import re
import pytest
from scipy import sparse
from sklearn.feature_extraction.text import strip_tags
from sklearn.feature_extraction.text import strip_accents_unicode
from sklearn.feature_extraction.text import strip_accents_ascii
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.base import clone
import numpy as np
from numpy.testing import assert_array_almost_equal
from numpy.testing import assert_array_equal
from sklearn.utils import IS_PYPY
from sklearn.utils._testing import (assert_almost_equal,
assert_warns_message, assert_raise_message,
assert_no_warnings,
fails_if_pypy,
assert_allclose_dense_sparse,
skip_if_32bit)
from collections import defaultdict
from functools import partial
import pickle
from io import StringIO
JUNK_FOOD_DOCS = (
"the pizza pizza beer copyright",
"the pizza burger beer copyright",
"the the pizza beer beer copyright",
"the burger beer beer copyright",
"the coke burger coke copyright",
"the coke burger burger",
)
NOTJUNK_FOOD_DOCS = (
"the salad celeri copyright",
"the salad salad sparkling water copyright",
"the the celeri celeri copyright",
"the tomato tomato salad water",
"the tomato salad water copyright",
)
ALL_FOOD_DOCS = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS
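# Small helper callables used throughout the tests below as custom
# preprocessor, strip_accents, tokenizer and analyzer arguments.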
def uppercase(s):
return strip_accents_unicode(s).upper()
def strip_eacute(s):
return s.replace('é', 'e')
def split_tokenize(s):
return s.split()
def lazy_analyze(s):
return ['the_ultimate_feature']
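# Illustrative sketch (not exercised as a test): how the helper callables
# above plug into CountVectorizer's analysis pipeline. The sample string and
# the described output are assumptions for demonstration only.
def _demo_custom_analysis_pipeline():
    vect = CountVectorizer(preprocessor=uppercase, tokenizer=split_tokenize)
    analyze = vect.build_analyzer()
    # uppercase() strips accents and upper-cases the raw document, then
    # split_tokenize() splits on whitespace; with the default ngram_range of
    # (1, 1) the analyzer should return something like ['CAFE', 'AU', 'LAIT'].
    return analyze("café au lait")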
def test_strip_accents():
# check some classical latin accentuated symbols
a = 'àáâãäåçèéêë'
expected = 'aaaaaaceeee'
assert strip_accents_unicode(a) == expected
a = 'ìíîïñòóôõöùúûüý'
expected = 'iiiinooooouuuuy'
assert strip_accents_unicode(a) == expected
# check some arabic
a = '\u0625' # alef with a hamza below: إ
expected = '\u0627' # simple alef: ا
assert strip_accents_unicode(a) == expected
# mix letters accentuated and not
a = "this is à test"
expected = 'this is a test'
assert strip_accents_unicode(a) == expected
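    # strip_accents_unicode NFKD-decomposes its input and drops the combining
    # marks, so the already-decomposed strings and bare combining marks below
    # normalize cleanly as well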
# strings that are already decomposed
    a = "o\u0308"  # o with diaeresis
expected = "o"
assert strip_accents_unicode(a) == expected
# combining marks by themselves
a = "\u0300\u0301\u0302\u0303"
expected = ""
assert strip_accents_unicode(a) == expected
# Multiple combining marks on one character
a = "o\u0308\u0304"
expected = "o"
assert strip_accents_unicode(a) == expected
def test_to_ascii():
# check some classical latin accentuated symbols
a = 'àáâãäåçèéêë'
expected = 'aaaaaaceeee'
assert strip_accents_ascii(a) == expected
a = "ìíîïñòóôõöùúûüý"
expected = 'iiiinooooouuuuy'
assert strip_accents_ascii(a) == expected
# check some arabic
    a = '\u0625'  # alef with a hamza below
    expected = ''  # alef has no direct ascii match
assert strip_accents_ascii(a) == expected
# mix letters accentuated and not
a = "this is à test"
expected = 'this is a test'
assert strip_accents_ascii(a) == expected
@pytest.mark.parametrize('Vectorizer', (CountVectorizer, HashingVectorizer))
def test_word_analyzer_unigrams(Vectorizer):
wa = Vectorizer(strip_accents='ascii').build_analyzer()
text = ("J'ai mangé du kangourou ce midi, "
"c'était pas très bon.")
expected = ['ai', 'mange', 'du', 'kangourou', 'ce', 'midi',
'etait', 'pas', 'tres', 'bon']
assert wa(text) == expected
text = "This is a test, really.\n\n I met Harry yesterday."
expected = ['this', 'is', 'test', 'really', 'met', 'harry',
'yesterday']
assert wa(text) == expected
wa = Vectorizer(input='file').build_analyzer()
text = StringIO("This is a test with a file-like object!")
expected = ['this', 'is', 'test', 'with', 'file', 'like',
'object']
assert wa(text) == expected
# with custom preprocessor
wa = Vectorizer(preprocessor=uppercase).build_analyzer()
text = ("J'ai mangé du kangourou ce midi, "
" c'était pas très bon.")
expected = ['AI', 'MANGE', 'DU', 'KANGOUROU', 'CE', 'MIDI',
'ETAIT', 'PAS', 'TRES', 'BON']
assert wa(text) == expected
# with custom tokenizer
wa = Vectorizer(tokenizer=split_tokenize,
strip_accents='ascii').build_analyzer()
text = ("J'ai mangé du kangourou ce midi, "
"c'était pas très bon.")
expected = ["j'ai", 'mange', 'du', 'kangourou', 'ce', 'midi,',
"c'etait", 'pas', 'tres', 'bon.']
assert wa(text) == expected
def test_word_analyzer_unigrams_and_bigrams():
wa = CountVectorizer(analyzer="word", strip_accents='unicode',
ngram_range=(1, 2)).build_analyzer()
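    # with ngram_range=(1, 2) the analyzer emits every unigram first and then
    # the bigrams, in document order (see the expected list below)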
text = "J'ai mangé du kangourou ce midi, c'était pas très bon."
expected = ['ai', 'mange', 'du', 'kangourou', 'ce', 'midi',
'etait', 'pas', 'tres', 'bon', 'ai mange', 'mange du',
'du kangourou', 'kangourou ce', 'ce midi', 'midi etait',
'etait pas', 'pas tres', 'tres bon']
assert wa(text) == expected
def test_unicode_decode_error():
    # decode_error defaults to 'strict', so this should fail
# First, encode (as bytes) a unicode string.
text = "J'ai mangé du kangourou ce midi, c'était pas très bon."
text_bytes = text.encode('utf-8')
# Then let the Analyzer try to decode it as ascii. It should fail,
# because we have given it an incorrect encoding.
wa = CountVectorizer(ngram_range=(1, 2), encoding='ascii').build_analyzer()
with pytest.raises(UnicodeDecodeError):
wa(text_bytes)
ca = CountVectorizer(analyzer='char', ngram_range=(3, 6),
encoding='ascii').build_analyzer()
with pytest.raises(UnicodeDecodeError):
ca(text_bytes)
def test_char_ngram_analyzer():
cnga = CountVectorizer(analyzer='char', strip_accents='unicode',
ngram_range=(3, 6)).build_analyzer()
text = "J'ai mangé du kangourou ce midi, c'était pas très bon"
expected = ["j'a", "'ai", 'ai ', 'i m', ' ma']
assert cnga(text)[:5] == expected
expected = ['s tres', ' tres ', 'tres b', 'res bo', 'es bon']
assert cnga(text)[-5:] == expected
text = "This \n\tis a test, really.\n\n I met Harry yesterday"
expected = ['thi', 'his', 'is ', 's i', ' is']
assert cnga(text)[:5] == expected
expected = [' yeste', 'yester', 'esterd', 'sterda', 'terday']
assert cnga(text)[-5:] == expected
cnga = CountVectorizer(input='file', analyzer='char',
ngram_range=(3, 6)).build_analyzer()
text = StringIO("This is a test with a file-like object!")
expected = ['thi', 'his', 'is ', 's i', ' is']
assert cnga(text)[:5] == expected
def test_char_wb_ngram_analyzer():
cnga = CountVectorizer(analyzer='char_wb', strip_accents='unicode',
ngram_range=(3, 6)).build_analyzer()
text = "This \n\tis a test, really.\n\n I met Harry yesterday"
expected = [' th', 'thi', 'his', 'is ', ' thi']
assert cnga(text)[:5] == expected
expected = ['yester', 'esterd', 'sterda', 'terday', 'erday ']
assert cnga(text)[-5:] == expected
cnga = CountVectorizer(input='file', analyzer='char_wb',
ngram_range=(3, 6)).build_analyzer()
text = StringIO("A test with a file-like object!")
expected = [' a ', ' te', 'tes', 'est', 'st ', ' tes']
assert cnga(text)[:6] == expected
def test_word_ngram_analyzer():
cnga = CountVectorizer(analyzer='word', strip_accents='unicode',
ngram_range=(3, 6)).build_analyzer()
text = "This \n\tis a test, really.\n\n I met Harry yesterday"
expected = ['this is test', 'is test really', 'test really met']
assert cnga(text)[:3] == expected
expected = ['test really met harry yesterday',
'this is test really met harry',
'is test really met harry yesterday']
assert cnga(text)[-3:] == expected
cnga_file = CountVectorizer(input='file', analyzer='word',
ngram_range=(3, 6)).build_analyzer()
file = StringIO(text)
assert cnga_file(file) == cnga(text)
def test_countvectorizer_custom_vocabulary():
vocab = {"pizza": 0, "beer": 1}
terms = set(vocab.keys())
# Try a few of the supported types.
for typ in [dict, list, iter, partial(defaultdict, int)]:
v = typ(vocab)
vect = CountVectorizer(vocabulary=v)
vect.fit(JUNK_FOOD_DOCS)
if isinstance(v, Mapping):
assert vect.vocabulary_ == vocab
else:
assert set(vect.vocabulary_) == terms
X = vect.transform(JUNK_FOOD_DOCS)
assert X.shape[1] == len(terms)
v = typ(vocab)
vect = CountVectorizer(vocabulary=v)
inv = vect.inverse_transform(X)
assert len(inv) == X.shape[0]
def test_countvectorizer_custom_vocabulary_pipeline():
what_we_like = ["pizza", "beer"]
pipe = Pipeline([
('count', CountVectorizer(vocabulary=what_we_like)),
('tfidf', TfidfTransformer())])
X = pipe.fit_transform(ALL_FOOD_DOCS)
assert (set(pipe.named_steps['count'].vocabulary_) ==
set(what_we_like))
assert X.shape[1] == len(what_we_like)
def test_countvectorizer_custom_vocabulary_repeated_indices():
vocab = {"pizza": 0, "beer": 0}
msg = "Vocabulary contains repeated indices"
with pytest.raises(ValueError, match=msg):
vect = CountVectorizer(vocabulary=vocab)
vect.fit(["pasta_siziliana"])
def test_countvectorizer_custom_vocabulary_gap_index():
vocab = {"pizza": 1, "beer": 2}
with pytest.raises(ValueError, match="doesn't contain index"):
vect = CountVectorizer(vocabulary=vocab)
vect.fit(['pasta_verdura'])
def test_countvectorizer_stop_words():
cv = CountVectorizer()
cv.set_params(stop_words='english')
assert cv.get_stop_words() == ENGLISH_STOP_WORDS
cv.set_params(stop_words='_bad_str_stop_')
with pytest.raises(ValueError):
cv.get_stop_words()
cv.set_params(stop_words='_bad_unicode_stop_')
with pytest.raises(ValueError):
cv.get_stop_words()
stoplist = ['some', 'other', 'words']
cv.set_params(stop_words=stoplist)
assert cv.get_stop_words() == set(stoplist)
def test_countvectorizer_empty_vocabulary():
with pytest.raises(ValueError, match="empty vocabulary"):
vect = CountVectorizer(vocabulary=[])
vect.fit(["foo"])
with pytest.raises(ValueError, match="empty vocabulary"):
v = CountVectorizer(max_df=1.0, stop_words="english")
# fit on stopwords only
v.fit(["to be or not to be", "and me too", "and so do you"])
def test_fit_countvectorizer_twice():
cv = CountVectorizer()
X1 = cv.fit_transform(ALL_FOOD_DOCS[:5])
X2 = cv.fit_transform(ALL_FOOD_DOCS[5:])
assert X1.shape[1] != X2.shape[1]
def test_countvectorizer_custom_token_pattern():
"""Check `get_feature_names()` when a custom token pattern is passed.
Non-regression test for:
https://github.com/scikit-learn/scikit-learn/issues/12971
"""
corpus = [
'This is the 1st document in my corpus.',
'This document is the 2nd sample.',
'And this is the 3rd one.',
'Is this the 4th document?',
]
token_pattern = r"[0-9]{1,3}(?:st|nd|rd|th)\s\b(\w{2,})\b"
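    # with a single capturing group in token_pattern, only the group's match
    # is kept as the token, e.g. "1st document" yields the token "document"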
vectorizer = CountVectorizer(token_pattern=token_pattern)
vectorizer.fit_transform(corpus)
expected = ['document', 'one', 'sample']
assert vectorizer.get_feature_names() == expected
def test_countvectorizer_custom_token_pattern_with_several_group():
    """Check that we raise an error if the token pattern captures more than
    one group.
Non-regression test for:
https://github.com/scikit-learn/scikit-learn/issues/12971
"""
corpus = [
'This is the 1st document in my corpus.',
'This document is the 2nd sample.',
'And this is the 3rd one.',
'Is this the 4th document?',
]
token_pattern = r"([0-9]{1,3}(?:st|nd|rd|th))\s\b(\w{2,})\b"
err_msg = "More than 1 capturing group in token pattern"
vectorizer = CountVectorizer(token_pattern=token_pattern)
with pytest.raises(ValueError, match=err_msg):
vectorizer.fit(corpus)
def test_tf_idf_smoothing():
X = [[1, 1, 1],
[1, 1, 0],
[1, 0, 0]]
tr = TfidfTransformer(smooth_idf=True, norm='l2')
tfidf = tr.fit_transform(X).toarray()
assert (tfidf >= 0).all()
# check normalization
assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.])
# this is robust to features with only zeros
X = [[1, 1, 0],
[1, 1, 0],
[1, 0, 0]]
tr = TfidfTransformer(smooth_idf=True, norm='l2')
tfidf = tr.fit_transform(X).toarray()
assert (tfidf >= 0).all()
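# Minimal sketch (not a test) of the smoothed idf used above, assuming the
# formula idf(t) = ln((1 + n_samples) / (1 + df(t))) + 1; TfidfTransformer
# multiplies it with the term frequencies and then normalizes each row.
def _manual_smooth_idf(X):
    X = np.asarray(X)
    n_samples = X.shape[0]
    # document frequency: number of documents in which each feature occurs
    df = np.count_nonzero(X, axis=0)
    return np.log((1 + n_samples) / (1 + df)) + 1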
def test_tfidf_no_smoothing():
X = [[1, 1, 1],
[1, 1, 0],
[1, 0, 0]]
tr = TfidfTransformer(smooth_idf=False, norm='l2')
tfidf = tr.fit_transform(X).toarray()
assert (tfidf >= 0).all()
# check normalization
assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.])
    # the lack of smoothing makes idf fragile in the presence of features with
    # only zeros
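    # with smooth_idf=False the formula is idf(t) = ln(n_samples / df(t)) + 1,
    # so a feature with df == 0 triggers a division by zero below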
X = [[1, 1, 0],
[1, 1, 0],
[1, 0, 0]]
tr = TfidfTransformer(smooth_idf=False, norm='l2')
in_warning_message = 'divide by zero'
assert_warns_message(RuntimeWarning, in_warning_message,
tr.fit_transform, X).toarray()
def test_sublinear_tf():
X = [[1], [2], [3]]
tr = TfidfTransformer(sublinear_tf=True, use_idf=False, norm=None)
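    # sublinear_tf replaces tf with 1 + log(tf), so raw counts 1, 2, 3 map to
    # 1, 1 + ln(2), 1 + ln(3): still increasing, but sub-linearly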
tfidf = tr.fit_transform(X).toarray()
assert tfidf[0] == 1
assert tfidf[1] > tfidf[0]
assert tfidf[2] > tfidf[1]
assert tfidf[1] < 2
assert tfidf[2] < 3
def test_vectorizer():
# raw documents as an iterator
train_data = iter(ALL_FOOD_DOCS[:-1])
test_data = [ALL_FOOD_DOCS[-1]]
n_train = len(ALL_FOOD_DOCS) - 1
# test without vocabulary
v1 = CountVectorizer(max_df=0.5)
counts_train = v1.fit_transform(train_data)
if hasattr(counts_train, 'tocsr'):
counts_train = counts_train.tocsr()
assert counts_train[0, v1.vocabulary_["pizza"]] == 2
    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
v2 = CountVectorizer(vocabulary=v1.vocabulary_)
    # check that the two vectorizers give the same output on the test sample
for v in (v1, v2):
counts_test = v.transform(test_data)
if hasattr(counts_test, 'tocsr'):
counts_test = counts_test.tocsr()
vocabulary = v.vocabulary_
assert counts_test[0, vocabulary["salad"]] == 1
assert counts_test[0, vocabulary["tomato"]] == 1
assert counts_test[0, vocabulary["water"]] == 1
# stop word from the fixed list
assert "the" not in vocabulary
        # stop word found automatically by the vectorizer via DF thresholding:
        # words that are highly frequent across the complete corpus are likely
        # to be uninformative (either real stop words or extraction artifacts)
assert "copyright" not in vocabulary
# not present in the sample
assert counts_test[0, vocabulary["coke"]] == 0
assert counts_test[0, vocabulary["burger"]] == 0
assert counts_test[0, vocabulary["beer"]] == 0
assert counts_test[0, vocabulary["pizza"]] == 0
# test tf-idf
t1 = TfidfTransformer(norm='l1')
tfidf = t1.fit(counts_train).transform(counts_train).toarray()
assert len(t1.idf_) == len(v1.vocabulary_)
assert tfidf.shape == (n_train, len(v1.vocabulary_))
# test tf-idf with new data
tfidf_test = t1.transform(counts_test).toarray()
assert tfidf_test.shape == (len(test_data), len(v1.vocabulary_))
# test tf alone
t2 = TfidfTransformer(norm='l1', use_idf=False)
tf = t2.fit(counts_train).transform(counts_train).toarray()
assert not hasattr(t2, "idf_")
# test idf transform with unlearned idf vector
t3 = TfidfTransformer(use_idf=True)
with pytest.raises(ValueError):
t3.transform(counts_train)
# test idf transform with incompatible n_features
X = [[1, 1, 5],
[1, 1, 0]]
t3.fit(X)
X_incompt = [[1, 3],
[1, 3]]
with pytest.raises(ValueError):
t3.transform(X_incompt)
# L1-normalized term frequencies sum to one
assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)
# test the direct tfidf vectorizer
# (equivalent to term count vectorizer + tfidf transformer)
train_data = iter(ALL_FOOD_DOCS[:-1])
tv = TfidfVectorizer(norm='l1')
tv.max_df = v1.max_df
tfidf2 = tv.fit_transform(train_data).toarray()
assert not tv.fixed_vocabulary_
assert_array_almost_equal(tfidf, tfidf2)
# test the direct tfidf vectorizer with new data
tfidf_test2 = tv.transform(test_data).toarray()
assert_array_almost_equal(tfidf_test, tfidf_test2)
# test transform on unfitted vectorizer with empty vocabulary
v3 = CountVectorizer(vocabulary=None)
with pytest.raises(ValueError):
v3.transform(train_data)
# ascii preprocessor?
v3.set_params(strip_accents='ascii', lowercase=False)
processor = v3.build_preprocessor()
text = ("J'ai mangé du kangourou ce midi, "
"c'était pas très bon.")
expected = strip_accents_ascii(text)
result = processor(text)
assert expected == result
# error on bad strip_accents param
v3.set_params(strip_accents='_gabbledegook_', preprocessor=None)
with pytest.raises(ValueError):
v3.build_preprocessor()
# error with bad analyzer type
    v3.set_params(analyzer='_invalid_analyzer_type_')
with pytest.raises(ValueError):
v3.build_analyzer()
def test_tfidf_vectorizer_setters():
tv = TfidfVectorizer(norm='l2', use_idf=False, smooth_idf=False,
sublinear_tf=False)
tv.norm = 'l1'
assert tv._tfidf.norm == 'l1'
tv.use_idf = True
assert tv._tfidf.use_idf
tv.smooth_idf = True
assert tv._tfidf.smooth_idf
tv.sublinear_tf = True
assert tv._tfidf.sublinear_tf
@fails_if_pypy
def test_hashing_vectorizer():
v = HashingVectorizer()
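    # the default hashing space has n_features = 2 ** 20 columns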
X = v.transform(ALL_FOOD_DOCS)
token_nnz = X.nnz
assert X.shape == (len(ALL_FOOD_DOCS), v.n_features)
assert X.dtype == v.dtype
# By default the hashed values receive a random sign and l2 normalization
# makes the feature values bounded
assert np.min(X.data) > -1
assert np.min(X.data) < 0
assert np.max(X.data) > 0
assert np.max(X.data) < 1
# Check that the rows are normalized
for i in range(X.shape[0]):
        assert_almost_equal(np.linalg.norm(X[i].data, 2), 1.0)
# Check vectorization with some non-default parameters
v = HashingVectorizer(ngram_range=(1, 2), norm='l1')
X = v.transform(ALL_FOOD_DOCS)
assert X.shape == (len(ALL_FOOD_DOCS), v.n_features)
assert X.dtype == v.dtype
# ngrams generate more non zeros
ngrams_nnz = X.nnz
assert ngrams_nnz > token_nnz
assert ngrams_nnz < 2 * token_nnz
# makes the feature values bounded
assert np.min(X.data) > -1
assert np.max(X.data) < 1
# Check that the rows are normalized
for i in range(X.shape[0]):
        assert_almost_equal(np.linalg.norm(X[i].data, 1), 1.0)
def test_feature_names():
cv = CountVectorizer(max_df=0.5)
    # test for ValueError on an unfitted/empty vocabulary
with pytest.raises(ValueError):
cv.get_feature_names()
assert not cv.fixed_vocabulary_
# test for vocabulary learned from data
X = cv.fit_transform(ALL_FOOD_DOCS)
n_samples, n_features = X.shape
assert len(cv.vocabulary_) == n_features
feature_names = cv.get_feature_names()
assert len(feature_names) == n_features
assert_array_equal(['beer', 'burger', 'celeri', 'coke', 'pizza',
'salad', 'sparkling', 'tomato', 'water'],
feature_names)
for idx, name in enumerate(feature_names):
assert idx == cv.vocabulary_.get(name)
# test for custom vocabulary
vocab = ['beer', 'burger', 'celeri', 'coke', 'pizza',
'salad', 'sparkling', 'tomato', 'water']
cv = CountVectorizer(vocabulary=vocab)
feature_names = cv.get_feature_names()
assert_array_equal(['beer', 'burger', 'celeri', 'coke', 'pizza', 'salad',
'sparkling', 'tomato', 'water'], feature_names)
assert cv.fixed_vocabulary_
for idx, name in enumerate(feature_names):
assert idx == cv.vocabulary_.get(name)
@pytest.mark.parametrize('Vectorizer', (CountVectorizer, TfidfVectorizer))
def test_vectorizer_max_features(Vectorizer):
expected_vocabulary = {'burger', 'beer', 'salad', 'pizza'}
expected_stop_words = {'celeri', 'tomato', 'copyright', 'coke',
'sparkling', 'water', 'the'}
# test bounded number of extracted features
vectorizer = Vectorizer(max_df=0.6, max_features=4)
vectorizer.fit(ALL_FOOD_DOCS)
assert set(vectorizer.vocabulary_) == expected_vocabulary
assert vectorizer.stop_words_ == expected_stop_words
def test_count_vectorizer_max_features():
# Regression test: max_features didn't work correctly in 0.14.
cv_1 = CountVectorizer(max_features=1)
cv_3 = CountVectorizer(max_features=3)
cv_None = CountVectorizer(max_features=None)
counts_1 = cv_1.fit_transform(JUNK_FOOD_DOCS).sum(axis=0)
counts_3 = cv_3.fit_transform(JUNK_FOOD_DOCS).sum(axis=0)
counts_None = cv_None.fit_transform(JUNK_FOOD_DOCS).sum(axis=0)
features_1 = cv_1.get_feature_names()
features_3 = cv_3.get_feature_names()
features_None = cv_None.get_feature_names()
# The most common feature is "the", with frequency 7.
assert 7 == counts_1.max()
assert 7 == counts_3.max()
assert 7 == counts_None.max()
# The most common feature should be the same
assert "the" == features_1[np.argmax(counts_1)]
assert "the" == features_3[np.argmax(counts_3)]
assert "the" == features_None[np.argmax(counts_None)]
def test_vectorizer_max_df():
test_data = ['abc', 'dea', 'eat']
vect = CountVectorizer(analyzer='char', max_df=1.0)
vect.fit(test_data)
assert 'a' in vect.vocabulary_.keys()
assert len(vect.vocabulary_.keys()) == 6
assert len(vect.stop_words_) == 0
vect.max_df = 0.5 # 0.5 * 3 documents -> max_doc_count == 1.5
vect.fit(test_data)
assert 'a' not in vect.vocabulary_.keys() # {ae} ignored
assert len(vect.vocabulary_.keys()) == 4 # {bcdt} remain
assert 'a' in vect.stop_words_
assert len(vect.stop_words_) == 2
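    # an integer max_df is an absolute document count: with max_df=1, terms
    # occurring in more than one of the 3 documents are removed, which gives
    # the same result as the 0.5 proportion above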
vect.max_df = 1
vect.fit(test_data)
assert 'a' not in vect.vocabulary_.keys() # {ae} ignored
assert len(vect.vocabulary_.keys()) == 4 # {bcdt} remain
assert 'a' in vect.stop_words_
assert len(vect.stop_words_) == 2
def test_vectorizer_min_df():
test_data = ['abc', 'dea', 'eat']
vect = CountVectorizer(analyzer='char', min_df=1)
vect.fit(test_data)
assert 'a' in vect.vocabulary_.keys()
assert len(vect.vocabulary_.keys()) == 6
assert len(vect.stop_words_) == 0
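    # an integer min_df is an absolute document count: keep only terms that
    # appear in at least 2 of the 3 documents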
vect.min_df = 2
vect.fit(test_data)
assert 'c' not in vect.vocabulary_.keys() # {bcdt} ignored
assert len(vect.vocabulary_.keys()) == 2 # {ae} remain
assert 'c' in vect.stop_words_
assert len(vect.stop_words_) == 4
vect.min_df = 0.8 # 0.8 * 3 documents -> min_doc_count == 2.4
vect.fit(test_data)
assert 'c' not in vect.vocabulary_.keys() # {bcdet} ignored
assert len(vect.vocabulary_.keys()) == 1 # {a} remains
assert 'c' in vect.stop_words_
assert len(vect.stop_words_) == 5
def test_count_binary_occurrences():
    # by default, multiple occurrences of a token are counted
test_data = ['aaabc', 'abbde']
vect = CountVectorizer(analyzer='char', max_df=1.0)
X = vect.fit_transform(test_data).toarray()
assert_array_equal(['a', 'b', 'c', 'd', 'e'], vect.get_feature_names())
assert_array_equal([[3, 1, 1, 0, 0],
[1, 2, 0, 1, 1]], X)
# using boolean features, we can fetch the binary occurrence info
# instead.
vect = CountVectorizer(analyzer='char', max_df=1.0, binary=True)
X = vect.fit_transform(test_data).toarray()
assert_array_equal([[1, 1, 1, 0, 0],
[1, 1, 0, 1, 1]], X)
# check the ability to change the dtype
vect = CountVectorizer(analyzer='char', max_df=1.0,
binary=True, dtype=np.float32)
X_sparse = vect.fit_transform(test_data)
assert X_sparse.dtype == np.float32
@fails_if_pypy
def test_hashed_binary_occurrences():
    # by default, multiple occurrences of a token are counted
test_data = ['aaabc', 'abbde']
vect = HashingVectorizer(alternate_sign=False, analyzer='char', norm=None)
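    # alternate_sign=False disables the random +/-1 sign of the hashed
    # features, so with norm=None the stored values are plain term counts
    # (up to hash collisions)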
X = vect.transform(test_data)
assert np.max(X[0:1].data) == 3
assert np.max(X[1:2].data) == 2
assert X.dtype == np.float64
# using boolean features, we can fetch the binary occurrence info
# instead.
vect = HashingVectorizer(analyzer='char', alternate_sign=False,
binary=True, norm=None)
X = vect.transform(test_data)
assert np.max(X.data) == 1
assert X.dtype == np.float64
# check the ability to change the dtype
vect = HashingVectorizer(analyzer='char', alternate_sign=False,
binary=True, norm=None, dtype=np.float64)
X = vect.transform(test_data)
assert X.dtype == np.float64
@pytest.mark.parametrize('Vectorizer', (CountVectorizer, TfidfVectorizer))
def test_vectorizer_inverse_transform(Vectorizer):
# raw documents
data = ALL_FOOD_DOCS
vectorizer = Vectorizer()
transformed_data = vectorizer.fit_transform(data)
inversed_data = vectorizer.inverse_transform(transformed_data)
assert isinstance(inversed_data, list)
analyze = vectorizer.build_analyzer()
for doc, inversed_terms in zip(data, inversed_data):
terms = np.sort(np.unique(analyze(doc)))
inversed_terms = np.sort(np.unique(inversed_terms))
assert_array_equal(terms, inversed_terms)
assert sparse.issparse(transformed_data)
assert transformed_data.format == "csr"
# Test that inverse_transform also works with numpy arrays and
# scipy
transformed_data2 = transformed_data.toarray()
inversed_data2 = vectorizer.inverse_transform(transformed_data2)
for terms, terms2 in zip(inversed_data, inversed_data2):
assert_array_equal(np.sort(terms), np.sort(terms2))
# Check that inverse_transform also works on non CSR sparse data:
transformed_data3 = transformed_data.tocsc()
inversed_data3 = vectorizer.inverse_transform(transformed_data3)
for terms, terms3 in zip(inversed_data, inversed_data3):
assert_array_equal(np.sort(terms), np.sort(terms3))
def test_count_vectorizer_pipeline_grid_selection():
# raw documents
data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS
# label junk food as -1, the others as +1
target = [-1] * len(JUNK_FOOD_DOCS) + [1] * len(NOTJUNK_FOOD_DOCS)
# split the dataset for model development and final evaluation
train_data, test_data, target_train, target_test = train_test_split(
data, target, test_size=.2, random_state=0)
pipeline = Pipeline([('vect', CountVectorizer()),
('svc', LinearSVC())])
parameters = {
'vect__ngram_range': [(1, 1), (1, 2)],
'svc__loss': ('hinge', 'squared_hinge')
}
# find the best parameters for both the feature extraction and the
# classifier
grid_search = GridSearchCV(pipeline, parameters, n_jobs=1, cv=3)
# Check that the best model found by grid search is 100% correct on the
# held out evaluation set.
pred = grid_search.fit(train_data, target_train).predict(test_data)
assert_array_equal(pred, target_test)
    # on this toy dataset every candidate model converges to 100% accuracy,
    # so the tie is broken in favour of the first candidate in the grid,
    # which uses the unigram representation
assert grid_search.best_score_ == 1.0
best_vectorizer = grid_search.best_estimator_.named_steps['vect']
assert best_vectorizer.ngram_range == (1, 1)
def test_vectorizer_pipeline_grid_selection():
# raw documents
data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS
# label junk food as -1, the others as +1
target = [-1] * len(JUNK_FOOD_DOCS) + [1] * len(NOTJUNK_FOOD_DOCS)
# split the dataset for model development and final evaluation
train_data, test_data, target_train, target_test = train_test_split(
data, target, test_size=.1, random_state=0)
pipeline = Pipeline([('vect', TfidfVectorizer()),
('svc', LinearSVC())])
parameters = {
'vect__ngram_range': [(1, 1), (1, 2)],
'vect__norm': ('l1', 'l2'),
'svc__loss': ('hinge', 'squared_hinge'),
}
# find the best parameters for both the feature extraction and the
# classifier
grid_search = GridSearchCV(pipeline, parameters, n_jobs=1)
# Check that the best model found by grid search is 100% correct on the
# held out evaluation set.
pred = grid_search.fit(train_data, target_train).predict(test_data)
assert_array_equal(pred, target_test)
    # on this toy dataset every candidate model converges to 100% accuracy,
    # so the tie is broken in favour of the first candidate in the grid,
    # which uses the unigram representation
assert grid_search.best_score_ == 1.0
best_vectorizer = grid_search.best_estimator_.named_steps['vect']
assert best_vectorizer.ngram_range == (1, 1)
assert best_vectorizer.norm == 'l2'
assert not best_vectorizer.fixed_vocabulary_
def test_vectorizer_pipeline_cross_validation():
# raw documents
data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS
# label junk food as -1, the others as +1
target = [-1] * len(JUNK_FOOD_DOCS) + [1] * len(NOTJUNK_FOOD_DOCS)
pipeline = Pipeline([('vect', TfidfVectorizer()),
('svc', LinearSVC())])
cv_scores = cross_val_score(pipeline, data, target, cv=3)
assert_array_equal(cv_scores, [1., 1., 1.])
@fails_if_pypy
def test_vectorizer_unicode():
    # test that the count vectorizer works with Cyrillic text
document = (
"Машинное обучение — обширный подраздел искусственного "
"интеллекта, изучающий методы построения алгоритмов, "
"способных обучаться."
)
vect = CountVectorizer()
X_counted = vect.fit_transform([document])
assert X_counted.shape == (1, 12)
vect = HashingVectorizer(norm=None, alternate_sign=False)
X_hashed = vect.transform([document])
assert X_hashed.shape == (1, 2 ** 20)
# No collisions on such a small dataset
assert X_counted.nnz == X_hashed.nnz
    # with norm=None and alternate_sign=False, the tokens are counted up to
    # hash collisions
assert_array_equal(np.sort(X_counted.data), np.sort(X_hashed.data))
def test_tfidf_vectorizer_with_fixed_vocabulary():
# non regression smoke test for inheritance issues
vocabulary = ['pizza', 'celeri']
vect = TfidfVectorizer(vocabulary=vocabulary)
X_1 = vect.fit_transform(ALL_FOOD_DOCS)
X_2 = vect.transform(ALL_FOOD_DOCS)
assert_array_almost_equal(X_1.toarray(), X_2.toarray())
assert vect.fixed_vocabulary_
def test_pickling_vectorizer():
instances = [
HashingVectorizer(),
HashingVectorizer(norm='l1'),
HashingVectorizer(binary=True),
HashingVectorizer(ngram_range=(1, 2)),
CountVectorizer(),
CountVectorizer(preprocessor=strip_tags),
CountVectorizer(analyzer=lazy_analyze),
CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS),
TfidfVectorizer(),
TfidfVectorizer(analyzer=lazy_analyze),
TfidfVectorizer().fit(JUNK_FOOD_DOCS),
]
for orig in instances:
s = pickle.dumps(orig)
copy = pickle.loads(s)
assert type(copy) == orig.__class__
assert copy.get_params() == orig.get_params()
if IS_PYPY and isinstance(orig, HashingVectorizer):
continue
else:
assert_array_equal(
copy.fit_transform(JUNK_FOOD_DOCS).toarray(),
orig.fit_transform(JUNK_FOOD_DOCS).toarray())
@pytest.mark.parametrize('factory', [
CountVectorizer.build_analyzer,
CountVectorizer.build_preprocessor,
CountVectorizer.build_tokenizer,
])
def test_pickling_built_processors(factory):
    """Check that the built tokenizer/preprocessor/analyzer can be pickled.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12833
    """
vec = CountVectorizer()
function = factory(vec)
text = ("J'ai mangé du kangourou ce midi, "
"c'était pas très bon.")
roundtripped_function = pickle.loads(pickle.dumps(function))
expected = function(text)
result = roundtripped_function(text)
assert result == expected
def test_countvectorizer_vocab_sets_when_pickling():
# ensure that vocabulary of type set is coerced to a list to
# preserve iteration ordering after deserialization
rng = np.random.RandomState(0)
vocab_words = np.array(['beer', 'burger', 'celeri', 'coke', 'pizza',
'salad', 'sparkling', 'tomato', 'water'])
for x in range(0, 100):
vocab_set = set(rng.choice(vocab_words, size=5, replace=False))
cv = CountVectorizer(vocabulary=vocab_set)
unpickled_cv = pickle.loads(pickle.dumps(cv))
cv.fit(ALL_FOOD_DOCS)
unpickled_cv.fit(ALL_FOOD_DOCS)
assert cv.get_feature_names() == unpickled_cv.get_feature_names()
def test_countvectorizer_vocab_dicts_when_pickling():
rng = np.random.RandomState(0)
vocab_words = np.array(['beer', 'burger', 'celeri', 'coke', 'pizza',
'salad', 'sparkling', 'tomato', 'water'])
for x in range(0, 100):
vocab_dict = dict()
words = rng.choice(vocab_words, size=5, replace=False)
for y in range(0, 5):
vocab_dict[words[y]] = y
cv = CountVectorizer(vocabulary=vocab_dict)
unpickled_cv = pickle.loads(pickle.dumps(cv))
cv.fit(ALL_FOOD_DOCS)
unpickled_cv.fit(ALL_FOOD_DOCS)
assert cv.get_feature_names() == unpickled_cv.get_feature_names()
def test_stop_words_removal():
# Ensure that deleting the stop_words_ attribute doesn't affect transform
fitted_vectorizers = (
TfidfVectorizer().fit(JUNK_FOOD_DOCS),
CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS)
)
for vect in fitted_vectorizers:
vect_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
vect.stop_words_ = None
stop_None_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
delattr(vect, 'stop_words_')
stop_del_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
assert_array_equal(stop_None_transform, vect_transform)
assert_array_equal(stop_del_transform, vect_transform)
def test_pickling_transformer():
X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
orig = TfidfTransformer().fit(X)
s = pickle.dumps(orig)
copy = pickle.loads(s)
assert type(copy) == orig.__class__
assert_array_equal(
copy.fit_transform(X).toarray(),
orig.fit_transform(X).toarray())
def test_transformer_idf_setter():
X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
orig = TfidfTransformer().fit(X)
copy = TfidfTransformer()
copy.idf_ = orig.idf_
assert_array_equal(
copy.transform(X).toarray(),
orig.transform(X).toarray())
def test_tfidf_vectorizer_setter():
orig = TfidfVectorizer(use_idf=True)
orig.fit(JUNK_FOOD_DOCS)
copy = TfidfVectorizer(vocabulary=orig.vocabulary_, use_idf=True)
copy.idf_ = orig.idf_
assert_array_equal(
copy.transform(JUNK_FOOD_DOCS).toarray(),
orig.transform(JUNK_FOOD_DOCS).toarray())
def test_tfidfvectorizer_invalid_idf_attr():
vect = TfidfVectorizer(use_idf=True)
vect.fit(JUNK_FOOD_DOCS)
copy = TfidfVectorizer(vocabulary=vect.vocabulary_, use_idf=True)
expected_idf_len = len(vect.idf_)
invalid_idf = [1.0] * (expected_idf_len + 1)
with pytest.raises(ValueError):
setattr(copy, 'idf_', invalid_idf)
def test_non_unique_vocab():
vocab = ['a', 'b', 'c', 'a', 'a']
vect = CountVectorizer(vocabulary=vocab)
with pytest.raises(ValueError):
vect.fit([])
@fails_if_pypy
def test_hashingvectorizer_nan_in_docs():
# np.nan can appear when using pandas to load text fields from a csv file
# with missing values.
message = "np.nan is an invalid document, expected byte or unicode string."
exception = ValueError
def func():
hv = HashingVectorizer()
hv.fit_transform(['hello world', np.nan, 'hello hello'])
assert_raise_message(exception, message, func)
def test_tfidfvectorizer_binary():
# Non-regression test: TfidfVectorizer used to ignore its "binary" param.
v = TfidfVectorizer(binary=True, use_idf=False, norm=None)
assert v.binary
X = v.fit_transform(['hello world', 'hello hello']).toarray()
assert_array_equal(X.ravel(), [1, 1, 1, 0])
X2 = v.transform(['hello world', 'hello hello']).toarray()
assert_array_equal(X2.ravel(), [1, 1, 1, 0])
def test_tfidfvectorizer_export_idf():
vect = TfidfVectorizer(use_idf=True)
vect.fit(JUNK_FOOD_DOCS)
assert_array_almost_equal(vect.idf_, vect._tfidf.idf_)
def test_vectorizer_vocab_clone():
vect_vocab = TfidfVectorizer(vocabulary=["the"])
vect_vocab_clone = clone(vect_vocab)
vect_vocab.fit(ALL_FOOD_DOCS)
vect_vocab_clone.fit(ALL_FOOD_DOCS)
assert vect_vocab_clone.vocabulary_ == vect_vocab.vocabulary_
@pytest.mark.parametrize('Vectorizer',
(CountVectorizer, TfidfVectorizer, HashingVectorizer))
def test_vectorizer_string_object_as_input(Vectorizer):
message = ("Iterable over raw text documents expected, "
"string object received.")
vec = Vectorizer()
assert_raise_message(
ValueError, message, vec.fit_transform, "hello world!")
assert_raise_message(ValueError, message, vec.fit, "hello world!")
vec.fit(["some text", "some other text"])
assert_raise_message(ValueError, message, vec.transform, "hello world!")
@pytest.mark.parametrize("X_dtype", [np.float32, np.float64])
def test_tfidf_transformer_type(X_dtype):
X = sparse.rand(10, 20000, dtype=X_dtype, random_state=42)
X_trans = TfidfTransformer().fit_transform(X)
assert X_trans.dtype == X.dtype
def test_tfidf_transformer_sparse():
X = sparse.rand(10, 20000, dtype=np.float64, random_state=42)
X_csc = sparse.csc_matrix(X)
X_csr = sparse.csr_matrix(X)
X_trans_csc = TfidfTransformer().fit_transform(X_csc)
X_trans_csr = TfidfTransformer().fit_transform(X_csr)
assert_allclose_dense_sparse(X_trans_csc, X_trans_csr)
assert X_trans_csc.format == X_trans_csr.format
@pytest.mark.parametrize(
"vectorizer_dtype, output_dtype, warning_expected",
[(np.int32, np.float64, True),
(np.int64, np.float64, True),
(np.float32, np.float32, False),
(np.float64, np.float64, False)]
)
def test_tfidf_vectorizer_type(vectorizer_dtype, output_dtype,
warning_expected):
X = np.array(["numpy", "scipy", "sklearn"])
vectorizer = TfidfVectorizer(dtype=vectorizer_dtype)
warning_msg_match = "'dtype' should be used."
warning_cls = UserWarning
expected_warning_cls = warning_cls if warning_expected else None
with pytest.warns(expected_warning_cls,
match=warning_msg_match) as record:
X_idf = vectorizer.fit_transform(X)
if expected_warning_cls is None:
relevant_warnings = [w for w in record
if isinstance(w, warning_cls)]
assert len(relevant_warnings) == 0
assert X_idf.dtype == output_dtype
@pytest.mark.parametrize("vec", [
HashingVectorizer(ngram_range=(2, 1)),
CountVectorizer(ngram_range=(2, 1)),
TfidfVectorizer(ngram_range=(2, 1))
])
def test_vectorizers_invalid_ngram_range(vec):
    # vectorizers can be constructed with an invalid ngram_range;
    # check that fitting raises an informative error message
invalid_range = vec.ngram_range
message = ("Invalid value for ngram_range=%s "
"lower boundary larger than the upper boundary."
% str(invalid_range))
if isinstance(vec, HashingVectorizer) and IS_PYPY:
pytest.xfail(reason='HashingVectorizer is not supported on PyPy')
assert_raise_message(
ValueError, message, vec.fit, ["good news everyone"])
assert_raise_message(
ValueError, message, vec.fit_transform, ["good news everyone"])
if isinstance(vec, HashingVectorizer):
assert_raise_message(
ValueError, message, vec.transform, ["good news everyone"])
def _check_stop_words_consistency(estimator):
stop_words = estimator.get_stop_words()
tokenize = estimator.build_tokenizer()
preprocess = estimator.build_preprocessor()
return estimator._check_stop_words_consistency(stop_words, preprocess,
tokenize)
@fails_if_pypy
def test_vectorizer_stop_words_inconsistent():
lstr = "['and', 'll', 've']"
message = ('Your stop_words may be inconsistent with your '
'preprocessing. Tokenizing the stop words generated '
'tokens %s not in stop_words.' % lstr)
for vec in [CountVectorizer(),
TfidfVectorizer(), HashingVectorizer()]:
vec.set_params(stop_words=["you've", "you", "you'll", 'AND'])
assert_warns_message(UserWarning, message, vec.fit_transform,
['hello world'])
# reset stop word validation
del vec._stop_words_id
assert _check_stop_words_consistency(vec) is False
# Only one warning per stop list
assert_no_warnings(vec.fit_transform, ['hello world'])
assert _check_stop_words_consistency(vec) is None
# Test caching of inconsistency assessment
vec.set_params(stop_words=["you've", "you", "you'll", 'blah', 'AND'])
assert_warns_message(UserWarning, message, vec.fit_transform,
['hello world'])
@skip_if_32bit
def test_countvectorizer_sort_features_64bit_sparse_indices():
"""
Check that CountVectorizer._sort_features preserves the dtype of its sparse
feature matrix.
This test is skipped on 32bit platforms, see:
https://github.com/scikit-learn/scikit-learn/pull/11295
for more details.
"""
X = sparse.csr_matrix((5, 5), dtype=np.int64)
# force indices and indptr to int64.
INDICES_DTYPE = np.int64
X.indices = X.indices.astype(INDICES_DTYPE)
X.indptr = X.indptr.astype(INDICES_DTYPE)
vocabulary = {
"scikit-learn": 0,
"is": 1,
"great!": 2
}
Xs = CountVectorizer()._sort_features(X, vocabulary)
assert INDICES_DTYPE == Xs.indices.dtype
@fails_if_pypy
@pytest.mark.parametrize('Estimator',
[CountVectorizer, TfidfVectorizer, HashingVectorizer])
def test_stop_word_validation_custom_preprocessor(Estimator):
data = [{'text': 'some text'}]
vec = Estimator()
assert _check_stop_words_consistency(vec) is True
vec = Estimator(preprocessor=lambda x: x['text'],
stop_words=['and'])
assert _check_stop_words_consistency(vec) == 'error'
# checks are cached
assert _check_stop_words_consistency(vec) is None
vec.fit_transform(data)
class CustomEstimator(Estimator):
def build_preprocessor(self):
return lambda x: x['text']
vec = CustomEstimator(stop_words=['and'])
assert _check_stop_words_consistency(vec) == 'error'
vec = Estimator(tokenizer=lambda doc: re.compile(r'\w{1,}')
.findall(doc),
stop_words=['and'])
assert _check_stop_words_consistency(vec) is True
@pytest.mark.parametrize(
'Estimator',
[CountVectorizer,
TfidfVectorizer,
HashingVectorizer]
)
@pytest.mark.parametrize(
'input_type, err_type, err_msg',
[('filename', FileNotFoundError, ''),
('file', AttributeError, "'str' object has no attribute 'read'")]
)
def test_callable_analyzer_error(Estimator, input_type, err_type, err_msg):
if issubclass(Estimator, HashingVectorizer):
pytest.xfail('HashingVectorizer is not supported on PyPy')
data = ['this is text, not file or filename']
with pytest.raises(err_type, match=err_msg):
Estimator(analyzer=lambda x: x.split(),
input=input_type).fit_transform(data)
@pytest.mark.parametrize(
'Estimator',
[CountVectorizer,
TfidfVectorizer,
pytest.param(HashingVectorizer, marks=fails_if_pypy)]
)
@pytest.mark.parametrize(
'analyzer', [lambda doc: open(doc, 'r'), lambda doc: doc.read()]
)
@pytest.mark.parametrize('input_type', ['file', 'filename'])
def test_callable_analyzer_change_behavior(Estimator, analyzer, input_type):
data = ['this is text, not file or filename']
with pytest.raises((FileNotFoundError, AttributeError)):
Estimator(analyzer=analyzer, input=input_type).fit_transform(data)
@pytest.mark.parametrize(
'Estimator',
[CountVectorizer,
TfidfVectorizer,
HashingVectorizer]
)
def test_callable_analyzer_reraise_error(tmpdir, Estimator):
# check if a custom exception from the analyzer is shown to the user
def analyzer(doc):
raise Exception("testing")
if issubclass(Estimator, HashingVectorizer):
pytest.xfail('HashingVectorizer is not supported on PyPy')
f = tmpdir.join("file.txt")
f.write("sample content\n")
with pytest.raises(Exception, match="testing"):
Estimator(analyzer=analyzer, input='file').fit_transform([f])
@pytest.mark.parametrize(
'Vectorizer',
[CountVectorizer, HashingVectorizer, TfidfVectorizer]
)
@pytest.mark.parametrize(
'stop_words, tokenizer, preprocessor, ngram_range, token_pattern,'
'analyzer, unused_name, ovrd_name, ovrd_msg',
[(["you've", "you'll"], None, None, (1, 1), None, 'char',
"'stop_words'", "'analyzer'", "!= 'word'"),
(None, lambda s: s.split(), None, (1, 1), None, 'char',
"'tokenizer'", "'analyzer'", "!= 'word'"),
(None, lambda s: s.split(), None, (1, 1), r'\w+', 'word',
"'token_pattern'", "'tokenizer'", "is not None"),
     (None, None, lambda s: s.upper(), (1, 1), r'\w+', lambda s: s.upper(),
"'preprocessor'", "'analyzer'", "is callable"),
     (None, None, None, (1, 2), None, lambda s: s.upper(),
"'ngram_range'", "'analyzer'", "is callable"),
(None, None, None, (1, 1), r'\w+', 'char',
"'token_pattern'", "'analyzer'", "!= 'word'")]
)
def test_unused_parameters_warn(Vectorizer, stop_words,
tokenizer, preprocessor,
ngram_range, token_pattern,
analyzer, unused_name, ovrd_name,
ovrd_msg):
train_data = JUNK_FOOD_DOCS
# setting parameter and checking for corresponding warning messages
vect = Vectorizer()
vect.set_params(stop_words=stop_words, tokenizer=tokenizer,
preprocessor=preprocessor, ngram_range=ngram_range,
token_pattern=token_pattern, analyzer=analyzer)
msg = ("The parameter %s will not be used"
" since %s %s" % (unused_name, ovrd_name, ovrd_msg)
)
with pytest.warns(UserWarning, match=msg):
vect.fit(train_data)
@pytest.mark.parametrize('Vectorizer, X', (
(HashingVectorizer, [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]),
(CountVectorizer, JUNK_FOOD_DOCS))
)
def test_n_features_in(Vectorizer, X):
# For vectorizers, n_features_in_ does not make sense
vectorizer = Vectorizer()
assert not hasattr(vectorizer, 'n_features_in_')
vectorizer.fit(X)
assert not hasattr(vectorizer, 'n_features_in_')
def test_tie_breaking_sample_order_invariance():
# Checks the sample order invariance when setting max_features
# non-regression test for #17939
vec = CountVectorizer(max_features=1)
vocab1 = vec.fit(['hello', 'world']).vocabulary_
vocab2 = vec.fit(['world', 'hello']).vocabulary_
assert vocab1 == vocab2