projektAI/venv/Lib/site-packages/sklearn/feature_extraction/tests/test_dict_vectorizer.py

168 lines
5.1 KiB
Python
Raw Normal View History

2021-06-06 22:13:05 +02:00
# Authors: Lars Buitinck
# Dan Blanchard <dblanchard@ets.org>
# License: BSD 3 clause
from random import Random
import numpy as np
import scipy.sparse as sp
from numpy.testing import assert_array_equal
import pytest
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2
@pytest.mark.parametrize('sparse', (True, False))
@pytest.mark.parametrize('dtype', (int, np.float32, np.int16))
@pytest.mark.parametrize('sort', (True, False))
@pytest.mark.parametrize('iterable', (True, False))
def test_dictvectorizer(sparse, dtype, sort, iterable):
    # Round-trip check of DictVectorizer for every combination of the
    # parametrized `sparse`, `dtype`, `sort` and `iterable` settings:
    # fit_transform output type/shape/sum, inverse_transform, and agreement
    # between fit_transform and a later transform on the same data.
    D = [{"foo": 1, "bar": 3},
         {"bar": 4, "baz": 2},
         {"bar": 1, "quux": 1, "quuux": 2}]

    def samples():
        # When `iterable` is set the data is passed as a one-shot iterator,
        # so a fresh one is built for each call into the vectorizer.
        return iter(D) if iterable else D

    v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
    X = v.fit_transform(samples())

    assert sp.issparse(X) == sparse
    assert X.shape == (3, 5)
    assert X.sum() == 14
    assert v.inverse_transform(X) == D

    X_again = v.transform(samples())
    if sparse:
        # CSR matrices can't be compared for equality, so densify both
        # sides. The explicit toarray() method is used instead of the `.A`
        # shorthand, in line with the other tests in this file.
        assert_array_equal(X.toarray(), X_again.toarray())
    else:
        assert_array_equal(X, X_again)

    if sort:
        # With sort=True the learned feature names must already be ordered.
        assert v.feature_names_ == sorted(v.feature_names_)
def test_feature_selection():
    # Build two feature dicts that share twenty identical "useless" entries
    # and differ only in two "useful" ones, so that chi2 should rank the
    # useful pair highest.
    noise = {"useless%d" % i: 10 for i in range(20)}
    first = dict(noise, useful1=1, useful2=20)
    second = dict(noise, useful1=20, useful2=1)
    samples = [first, second]

    # restrict() is exercised both with integer indices and a boolean mask.
    for indices in (True, False):
        vectorizer = DictVectorizer().fit(samples)
        X = vectorizer.transform(samples)
        selector = SelectKBest(chi2, k=2).fit(X, [0, 1])

        support = selector.get_support(indices=indices)
        vectorizer.restrict(support, indices=indices)
        assert vectorizer.get_feature_names() == ["useful1", "useful2"]
def test_one_of_k():
    # String-valued entries are expected to show up as "key=value" features
    # rather than under the bare key.
    records = [
        {"version": "1", "ham": 2},
        {"version": "2", "spam": .3},
        {"version=3": True, "spam": -1},
    ]
    vectorizer = DictVectorizer()
    X = vectorizer.fit_transform(records)
    assert X.shape == (3, 5)

    recovered = vectorizer.inverse_transform(X)
    assert recovered[0] == {"version=1": 1, "ham": 2}

    feature_names = vectorizer.get_feature_names()
    assert "version=2" in feature_names
    assert "version" not in feature_names
def test_iterable_value():
    # A list of strings as a feature value is expected to be counted per
    # distinct string ("1" twice, "2" once in the first sample).
    records = [
        {"version": ["1", "2", "1"], "ham": 2},
        {"version": "2", "spam": .3},
        {"version=3": True, "spam": -1},
    ]
    expected_names = ['ham', 'spam', 'version=1', 'version=2', 'version=3']
    expected_matrix = [
        [2.0, 0.0, 2.0, 1.0, 0.0],
        [0.0, 0.3, 0.0, 1.0, 0.0],
        [0.0, -1.0, 0.0, 0.0, 1.0],
    ]

    vectorizer = DictVectorizer()
    dense = vectorizer.fit_transform(records).toarray()
    assert_array_equal(dense, expected_matrix)

    # The inverse transform is fed the dense array, not the sparse matrix.
    recovered = vectorizer.inverse_transform(dense)
    assert recovered[0] == {"version=1": 2, "version=2": 1, "ham": 2}

    assert vectorizer.get_feature_names() == expected_names
def test_iterable_not_string_error():
    # Fitting on an iterable value that contains a non-string element (an
    # int here) must raise TypeError with exactly this message.
    expected_message = ("Unsupported type <class 'int'> in iterable value. "
                        "Only iterables of string are supported.")
    samples = [
        {'foo': '1', 'bar': '2'},
        {'foo': '3', 'baz': '1'},
        {'foo': [1, 'three']},
    ]
    vectorizer = DictVectorizer(sparse=False)

    with pytest.raises(TypeError) as excinfo:
        vectorizer.fit(samples)

    # Checked after the context manager exits so the assertion actually runs.
    assert str(excinfo.value) == expected_message
def test_mapping_error():
    # Fitting on a sample whose feature value is itself a dict must raise
    # TypeError with exactly this message.
    expected_message = ("Unsupported value type <class 'dict'> "
                        "for foo: {'one': 1, 'three': 3}.\n"
                        "Mapping objects are not supported.")
    samples = [
        {'foo': '1', 'bar': '2'},
        {'foo': '3', 'baz': '1'},
        {'foo': {'one': 1, 'three': 3}},
    ]
    vectorizer = DictVectorizer(sparse=False)

    with pytest.raises(TypeError) as excinfo:
        vectorizer.fit(samples)

    # Checked after the context manager exits so the assertion actually runs.
    assert str(excinfo.value) == expected_message
def test_unseen_or_no_features():
    # Transforming a sample with only unseen features, or with no features
    # at all, must yield an all-zero row; this is checked for both sparse
    # and dense output.
    training = [{"camelot": 0, "spamalot": 1}]
    zero_row = np.zeros((1, 2))

    for sparse in [True, False]:
        vectorizer = DictVectorizer(sparse=sparse).fit(training)

        # First an unseen feature, then an empty sample.
        for sample in ({"push the pram a lot": 2}, {}):
            X = vectorizer.transform(sample)
            if sparse:
                X = X.toarray()
            assert_array_equal(X, zero_row)

        # An empty sample sequence: if a ValueError is raised its message
        # must mention "empty". NOTE: when no exception is raised at all,
        # nothing is asserted here.
        try:
            vectorizer.transform([])
        except ValueError as exc:
            assert "empty" in str(exc)
def test_deterministic_vocabulary():
    # Two dicts with equal content but different insertion order must lead
    # to the same fitted vocabulary.
    ordered_pairs = [("%03d" % i, i) for i in range(1000)]

    # Shuffle a copy with a fixed seed so the test is reproducible.
    shuffled_pairs = list(ordered_pairs)
    Random(42).shuffle(shuffled_pairs)

    d_sorted = dict(ordered_pairs)
    d_shuffled = dict(shuffled_pairs)

    vocab_sorted = DictVectorizer().fit([d_sorted]).vocabulary_
    vocab_shuffled = DictVectorizer().fit([d_shuffled]).vocabulary_

    assert vocab_sorted == vocab_shuffled
def test_n_features_in():
    # For vectorizers, n_features_in_ does not make sense and does not exist:
    # the attribute must be absent both before and after fitting.
    vectorizer = DictVectorizer()
    assert not hasattr(vectorizer, 'n_features_in_')

    samples = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
    vectorizer.fit(samples)
    assert not hasattr(vectorizer, 'n_features_in_')