s444417-paranormal-or-skept.../run.py
2022-05-10 23:41:58 +02:00

22 lines
546 B
Python

import lzma
import sys
from io import StringIO
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
# Script body: load the training inputs, fit a TF-IDF vocabulary on them and
# print the learned feature names.

# Locations of the xz-compressed training inputs and of the expected outputs.
pathX = "./train/in.tsv.xz"
pathY = "./train/expected.tsv"

# Hand the decompressed text stream straight to pandas. The context manager
# closes the LZMA handle even if parsing raises, and streaming avoids holding
# the whole decompressed file in memory as a string plus a StringIO copy.
with lzma.open(pathX, mode="rt", encoding="utf-8") as train_file:
    df = pd.read_csv(train_file, sep="\t", header=None)

# Drop the second column (positional index 1); every remaining cell is fed to
# the vectorizer below.
df = df.drop(df.columns[[1]], axis=1)

# Expected outputs for the training rows. Loaded here but not consumed by the
# rest of this script.
topics = pd.read_csv(pathY, sep="\t", header=None)

# Fit TF-IDF on the remaining cells, flattened into a 1-D sequence of
# documents.
# NOTE(review): assumes every remaining cell is a string; an empty field
# parsed as NaN would make the vectorizer raise -- confirm against the data.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df.to_numpy().ravel())
print(vectorizer.get_feature_names_out())