from bz2 import BZ2File import gzip from io import BytesIO import numpy as np import scipy.sparse as sp import os import shutil from tempfile import NamedTemporaryFile import pytest from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import fails_if_pypy import sklearn from sklearn.datasets import (load_svmlight_file, load_svmlight_files, dump_svmlight_file) currdir = os.path.dirname(os.path.abspath(__file__)) datafile = os.path.join(currdir, "data", "svmlight_classification.txt") multifile = os.path.join(currdir, "data", "svmlight_multilabel.txt") invalidfile = os.path.join(currdir, "data", "svmlight_invalid.txt") invalidfile2 = os.path.join(currdir, "data", "svmlight_invalid_order.txt") pytestmark = fails_if_pypy def test_load_svmlight_file(): X, y = load_svmlight_file(datafile) # test X's shape assert X.indptr.shape[0] == 7 assert X.shape[0] == 6 assert X.shape[1] == 21 assert y.shape[0] == 6 # test X's non-zero values for i, j, val in ((0, 2, 2.5), (0, 10, -5.2), (0, 15, 1.5), (1, 5, 1.0), (1, 12, -3), (2, 20, 27)): assert X[i, j] == val # tests X's zero values assert X[0, 3] == 0 assert X[0, 5] == 0 assert X[1, 8] == 0 assert X[1, 16] == 0 assert X[2, 18] == 0 # test can change X's values X[0, 2] *= 2 assert X[0, 2] == 5 # test y assert_array_equal(y, [1, 2, 3, 4, 1, 2]) def test_load_svmlight_file_fd(): # test loading from file descriptor X1, y1 = load_svmlight_file(datafile) fd = os.open(datafile, os.O_RDONLY) try: X2, y2 = load_svmlight_file(fd) assert_array_almost_equal(X1.data, X2.data) assert_array_almost_equal(y1, y2) finally: os.close(fd) def test_load_svmlight_file_multilabel(): X, y = load_svmlight_file(multifile, multilabel=True) assert y == [(0, 1), (2,), (), (1, 2)] def test_load_svmlight_files(): X_train, y_train, X_test, y_test = load_svmlight_files([datafile] * 2, dtype=np.float32) assert_array_equal(X_train.toarray(), X_test.toarray()) assert_array_almost_equal(y_train, y_test) assert X_train.dtype == np.float32 assert X_test.dtype == np.float32 X1, y1, X2, y2, X3, y3 = load_svmlight_files([datafile] * 3, dtype=np.float64) assert X1.dtype == X2.dtype assert X2.dtype == X3.dtype assert X3.dtype == np.float64 def test_load_svmlight_file_n_features(): X, y = load_svmlight_file(datafile, n_features=22) # test X'shape assert X.indptr.shape[0] == 7 assert X.shape[0] == 6 assert X.shape[1] == 22 # test X's non-zero values for i, j, val in ((0, 2, 2.5), (0, 10, -5.2), (1, 5, 1.0), (1, 12, -3)): assert X[i, j] == val # 21 features in file with pytest.raises(ValueError): load_svmlight_file(datafile, n_features=20) def test_load_compressed(): X, y = load_svmlight_file(datafile) with NamedTemporaryFile(prefix="sklearn-test", suffix=".gz") as tmp: tmp.close() # necessary under windows with open(datafile, "rb") as f: with gzip.open(tmp.name, "wb") as fh_out: shutil.copyfileobj(f, fh_out) Xgz, ygz = load_svmlight_file(tmp.name) # because we "close" it manually and write to it, # we need to remove it manually. os.remove(tmp.name) assert_array_almost_equal(X.toarray(), Xgz.toarray()) assert_array_almost_equal(y, ygz) with NamedTemporaryFile(prefix="sklearn-test", suffix=".bz2") as tmp: tmp.close() # necessary under windows with open(datafile, "rb") as f: with BZ2File(tmp.name, "wb") as fh_out: shutil.copyfileobj(f, fh_out) Xbz, ybz = load_svmlight_file(tmp.name) # because we "close" it manually and write to it, # we need to remove it manually. os.remove(tmp.name) assert_array_almost_equal(X.toarray(), Xbz.toarray()) assert_array_almost_equal(y, ybz) def test_load_invalid_file(): with pytest.raises(ValueError): load_svmlight_file(invalidfile) def test_load_invalid_order_file(): with pytest.raises(ValueError): load_svmlight_file(invalidfile2) def test_load_zero_based(): f = BytesIO(b"-1 4:1.\n1 0:1\n") with pytest.raises(ValueError): load_svmlight_file(f, zero_based=False) def test_load_zero_based_auto(): data1 = b"-1 1:1 2:2 3:3\n" data2 = b"-1 0:0 1:1\n" f1 = BytesIO(data1) X, y = load_svmlight_file(f1, zero_based="auto") assert X.shape == (1, 3) f1 = BytesIO(data1) f2 = BytesIO(data2) X1, y1, X2, y2 = load_svmlight_files([f1, f2], zero_based="auto") assert X1.shape == (1, 4) assert X2.shape == (1, 4) def test_load_with_qid(): # load svmfile with qid attribute data = b""" 3 qid:1 1:0.53 2:0.12 2 qid:1 1:0.13 2:0.1 7 qid:2 1:0.87 2:0.12""" X, y = load_svmlight_file(BytesIO(data), query_id=False) assert_array_equal(y, [3, 2, 7]) assert_array_equal(X.toarray(), [[.53, .12], [.13, .1], [.87, .12]]) res1 = load_svmlight_files([BytesIO(data)], query_id=True) res2 = load_svmlight_file(BytesIO(data), query_id=True) for X, y, qid in (res1, res2): assert_array_equal(y, [3, 2, 7]) assert_array_equal(qid, [1, 1, 2]) assert_array_equal(X.toarray(), [[.53, .12], [.13, .1], [.87, .12]]) @pytest.mark.skip("testing the overflow of 32 bit sparse indexing requires a" " large amount of memory") def test_load_large_qid(): """ load large libsvm / svmlight file with qid attribute. Tests 64-bit query ID """ data = b"\n".join(("3 qid:{0} 1:0.53 2:0.12\n2 qid:{0} 1:0.13 2:0.1" .format(i).encode() for i in range(1, 40*1000*1000))) X, y, qid = load_svmlight_file(BytesIO(data), query_id=True) assert_array_equal(y[-4:], [3, 2, 3, 2]) assert_array_equal(np.unique(qid), np.arange(1, 40*1000*1000)) def test_load_invalid_file2(): with pytest.raises(ValueError): load_svmlight_files([datafile, invalidfile, datafile]) def test_not_a_filename(): # in python 3 integers are valid file opening arguments (taken as unix # file descriptors) with pytest.raises(TypeError): load_svmlight_file(.42) def test_invalid_filename(): with pytest.raises(IOError): load_svmlight_file("trou pic nic douille") def test_dump(): X_sparse, y_dense = load_svmlight_file(datafile) X_dense = X_sparse.toarray() y_sparse = sp.csr_matrix(y_dense) # slicing a csr_matrix can unsort its .indices, so test that we sort # those correctly X_sliced = X_sparse[np.arange(X_sparse.shape[0])] y_sliced = y_sparse[np.arange(y_sparse.shape[0])] for X in (X_sparse, X_dense, X_sliced): for y in (y_sparse, y_dense, y_sliced): for zero_based in (True, False): for dtype in [np.float32, np.float64, np.int32, np.int64]: f = BytesIO() # we need to pass a comment to get the version info in; # LibSVM doesn't grok comments so they're not put in by # default anymore. if (sp.issparse(y) and y.shape[0] == 1): # make sure y's shape is: (n_samples, n_labels) # when it is sparse y = y.T # Note: with dtype=np.int32 we are performing unsafe casts, # where X.astype(dtype) overflows. The result is # then platform dependent and X_dense.astype(dtype) may be # different from X_sparse.astype(dtype).asarray(). X_input = X.astype(dtype) dump_svmlight_file(X_input, y, f, comment="test", zero_based=zero_based) f.seek(0) comment = f.readline() comment = str(comment, "utf-8") assert "scikit-learn %s" % sklearn.__version__ in comment comment = f.readline() comment = str(comment, "utf-8") assert ["one", "zero"][zero_based] + "-based" in comment X2, y2 = load_svmlight_file(f, dtype=dtype, zero_based=zero_based) assert X2.dtype == dtype assert_array_equal(X2.sorted_indices().indices, X2.indices) X2_dense = X2.toarray() if sp.issparse(X_input): X_input_dense = X_input.toarray() else: X_input_dense = X_input if dtype == np.float32: # allow a rounding error at the last decimal place assert_array_almost_equal( X_input_dense, X2_dense, 4) assert_array_almost_equal( y_dense.astype(dtype, copy=False), y2, 4) else: # allow a rounding error at the last decimal place assert_array_almost_equal( X_input_dense, X2_dense, 15) assert_array_almost_equal( y_dense.astype(dtype, copy=False), y2, 15) def test_dump_multilabel(): X = [[1, 0, 3, 0, 5], [0, 0, 0, 0, 0], [0, 5, 0, 1, 0]] y_dense = [[0, 1, 0], [1, 0, 1], [1, 1, 0]] y_sparse = sp.csr_matrix(y_dense) for y in [y_dense, y_sparse]: f = BytesIO() dump_svmlight_file(X, y, f, multilabel=True) f.seek(0) # make sure it dumps multilabel correctly assert f.readline() == b"1 0:1 2:3 4:5\n" assert f.readline() == b"0,2 \n" assert f.readline() == b"0,1 1:5 3:1\n" def test_dump_concise(): one = 1 two = 2.1 three = 3.01 exact = 1.000000000000001 # loses the last decimal place almost = 1.0000000000000001 X = [[one, two, three, exact, almost], [1e9, 2e18, 3e27, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]] y = [one, two, three, exact, almost] f = BytesIO() dump_svmlight_file(X, y, f) f.seek(0) # make sure it's using the most concise format possible assert (f.readline() == b"1 0:1 1:2.1 2:3.01 3:1.000000000000001 4:1\n") assert f.readline() == b"2.1 0:1000000000 1:2e+18 2:3e+27\n" assert f.readline() == b"3.01 \n" assert f.readline() == b"1.000000000000001 \n" assert f.readline() == b"1 \n" f.seek(0) # make sure it's correct too :) X2, y2 = load_svmlight_file(f) assert_array_almost_equal(X, X2.toarray()) assert_array_almost_equal(y, y2) def test_dump_comment(): X, y = load_svmlight_file(datafile) X = X.toarray() f = BytesIO() ascii_comment = "This is a comment\nspanning multiple lines." dump_svmlight_file(X, y, f, comment=ascii_comment, zero_based=False) f.seek(0) X2, y2 = load_svmlight_file(f, zero_based=False) assert_array_almost_equal(X, X2.toarray()) assert_array_almost_equal(y, y2) # XXX we have to update this to support Python 3.x utf8_comment = b"It is true that\n\xc2\xbd\xc2\xb2 = \xc2\xbc" f = BytesIO() with pytest.raises(UnicodeDecodeError): dump_svmlight_file(X, y, f, comment=utf8_comment) unicode_comment = utf8_comment.decode("utf-8") f = BytesIO() dump_svmlight_file(X, y, f, comment=unicode_comment, zero_based=False) f.seek(0) X2, y2 = load_svmlight_file(f, zero_based=False) assert_array_almost_equal(X, X2.toarray()) assert_array_almost_equal(y, y2) f = BytesIO() with pytest.raises(ValueError): dump_svmlight_file(X, y, f, comment="I've got a \0.") def test_dump_invalid(): X, y = load_svmlight_file(datafile) f = BytesIO() y2d = [y] with pytest.raises(ValueError): dump_svmlight_file(X, y2d, f) f = BytesIO() with pytest.raises(ValueError): dump_svmlight_file(X, y[:-1], f) def test_dump_query_id(): # test dumping a file with query_id X, y = load_svmlight_file(datafile) X = X.toarray() query_id = np.arange(X.shape[0]) // 2 f = BytesIO() dump_svmlight_file(X, y, f, query_id=query_id, zero_based=True) f.seek(0) X1, y1, query_id1 = load_svmlight_file(f, query_id=True, zero_based=True) assert_array_almost_equal(X, X1.toarray()) assert_array_almost_equal(y, y1) assert_array_almost_equal(query_id, query_id1) def test_load_with_long_qid(): # load svmfile with longint qid attribute data = b""" 1 qid:0 0:1 1:2 2:3 0 qid:72048431380967004 0:1440446648 1:72048431380967004 2:236784985 0 qid:-9223372036854775807 0:1440446648 1:72048431380967004 2:236784985 3 qid:9223372036854775807 0:1440446648 1:72048431380967004 2:236784985""" X, y, qid = load_svmlight_file(BytesIO(data), query_id=True) true_X = [[1, 2, 3], [1440446648, 72048431380967004, 236784985], [1440446648, 72048431380967004, 236784985], [1440446648, 72048431380967004, 236784985]] true_y = [1, 0, 0, 3] trueQID = [0, 72048431380967004, -9223372036854775807, 9223372036854775807] assert_array_equal(y, true_y) assert_array_equal(X.toarray(), true_X) assert_array_equal(qid, trueQID) f = BytesIO() dump_svmlight_file(X, y, f, query_id=qid, zero_based=True) f.seek(0) X, y, qid = load_svmlight_file(f, query_id=True, zero_based=True) assert_array_equal(y, true_y) assert_array_equal(X.toarray(), true_X) assert_array_equal(qid, trueQID) f.seek(0) X, y = load_svmlight_file(f, query_id=False, zero_based=True) assert_array_equal(y, true_y) assert_array_equal(X.toarray(), true_X) def test_load_zeros(): f = BytesIO() true_X = sp.csr_matrix(np.zeros(shape=(3, 4))) true_y = np.array([0, 1, 0]) dump_svmlight_file(true_X, true_y, f) for zero_based in ['auto', True, False]: f.seek(0) X, y = load_svmlight_file(f, n_features=4, zero_based=zero_based) assert_array_almost_equal(y, true_y) assert_array_almost_equal(X.toarray(), true_X.toarray()) @pytest.mark.parametrize('sparsity', [0, 0.1, .5, 0.99, 1]) @pytest.mark.parametrize('n_samples', [13, 101]) @pytest.mark.parametrize('n_features', [2, 7, 41]) def test_load_with_offsets(sparsity, n_samples, n_features): rng = np.random.RandomState(0) X = rng.uniform(low=0.0, high=1.0, size=(n_samples, n_features)) if sparsity: X[X < sparsity] = 0.0 X = sp.csr_matrix(X) y = rng.randint(low=0, high=2, size=n_samples) f = BytesIO() dump_svmlight_file(X, y, f) f.seek(0) size = len(f.getvalue()) # put some marks that are likely to happen anywhere in a row mark_0 = 0 mark_1 = size // 3 length_0 = mark_1 - mark_0 mark_2 = 4 * size // 5 length_1 = mark_2 - mark_1 # load the original sparse matrix into 3 independent CSR matrices X_0, y_0 = load_svmlight_file(f, n_features=n_features, offset=mark_0, length=length_0) X_1, y_1 = load_svmlight_file(f, n_features=n_features, offset=mark_1, length=length_1) X_2, y_2 = load_svmlight_file(f, n_features=n_features, offset=mark_2) y_concat = np.concatenate([y_0, y_1, y_2]) X_concat = sp.vstack([X_0, X_1, X_2]) assert_array_almost_equal(y, y_concat) assert_array_almost_equal(X.toarray(), X_concat.toarray()) def test_load_offset_exhaustive_splits(): rng = np.random.RandomState(0) X = np.array([ [0, 0, 0, 0, 0, 0], [1, 2, 3, 4, 0, 6], [1, 2, 3, 4, 0, 6], [0, 0, 0, 0, 0, 0], [1, 0, 3, 0, 0, 0], [0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0], ]) X = sp.csr_matrix(X) n_samples, n_features = X.shape y = rng.randint(low=0, high=2, size=n_samples) query_id = np.arange(n_samples) // 2 f = BytesIO() dump_svmlight_file(X, y, f, query_id=query_id) f.seek(0) size = len(f.getvalue()) # load the same data in 2 parts with all the possible byte offsets to # locate the split so has to test for particular boundary cases for mark in range(size): f.seek(0) X_0, y_0, q_0 = load_svmlight_file(f, n_features=n_features, query_id=True, offset=0, length=mark) X_1, y_1, q_1 = load_svmlight_file(f, n_features=n_features, query_id=True, offset=mark, length=-1) q_concat = np.concatenate([q_0, q_1]) y_concat = np.concatenate([y_0, y_1]) X_concat = sp.vstack([X_0, X_1]) assert_array_almost_equal(y, y_concat) assert_array_equal(query_id, q_concat) assert_array_almost_equal(X.toarray(), X_concat.toarray()) def test_load_with_offsets_error(): with pytest.raises(ValueError, match="n_features is required"): load_svmlight_file(datafile, offset=3, length=3)