22 lines
546 B
Python
22 lines
546 B
Python
import lzma
|
|
import sys
|
|
from io import StringIO
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
|
|
import pandas as pd
|
|
|
|
|
|
|
|
# Input files: an xz-compressed, tab-separated table and a plain
# tab-separated table (neither has a header row).
pathX = "./train/in.tsv.xz"
pathY = "./train/expected.tsv"

# Decompress and parse in a single pass. The context manager guarantees the
# xz handle is closed even if parsing raises, and handing the open text
# stream straight to pandas avoids holding a second full copy of the
# decompressed file in memory.
with lzma.open(pathX, mode='rt', encoding='utf-8') as in_file:
    df = pd.read_csv(in_file, sep="\t", header=None)

# Discard the second column; everything that remains is vectorized below.
df = df.drop(df.columns[[1]], axis=1)

topics = pd.read_csv(pathY, sep='\t', header=None)

# Fit TF-IDF over every remaining cell, flattened into a 1-D array so each
# cell is treated as one document.
# NOTE(review): assumes every remaining cell is a non-missing string; a NaN
# produced by an empty field would likely break the vectorizer -- confirm.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df.to_numpy().ravel())

# Show the vocabulary learned by the vectorizer.
print(vectorizer.get_feature_names_out())