# Build a TF-IDF representation of the training texts and print the learned
# vocabulary.
import lzma
import sys
from io import StringIO

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

pathX = "./train/in.tsv.xz"
pathY = "./train/expected.tsv"

# Decompress the xz-compressed TSV into memory. The context manager makes sure
# the underlying file handle is closed even if decoding fails part-way.
with lzma.open(pathX, mode='rt', encoding='utf-8') as compressed_file:
    data = compressed_file.read()

# Wrap the decoded text in a file-like object so pandas can parse it.
stringIO = StringIO(data)
df = pd.read_csv(stringIO, sep="\t", header=None)

# Discard the second column (positional index 1); the remaining column(s) are
# what gets vectorized below.
df = df.drop(df.columns[[1]], axis=1)

# Labels file, read as a headerless TSV.
# NOTE(review): `topics` is not used in the visible part of the script.
topics = pd.read_csv(pathY, sep='\t', header=None)

# Flatten the remaining cells into a 1-D array of documents and fit TF-IDF.
# NOTE(review): presumably exactly one text column remains after the drop; if
# more remain, ravel() interleaves their cells as separate documents - confirm.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df.to_numpy().ravel())

print(vectorizer.get_feature_names_out())