diff --git a/run.py b/run.py
index 2eafc5c..ddff716 100644
--- a/run.py
+++ b/run.py
@@ -1,22 +1,84 @@
+# %%
 import lzma
 import sys
 from io import StringIO
 from sklearn.feature_extraction.text import TfidfVectorizer
-
-
 import pandas as pd
-
-
+import numpy
 pathX = "./train/in.tsv.xz"
+# pathX = "./train/in.tsv"
 pathY = "./train/expected.tsv"
-
-data = lzma.open(pathX, mode='rt', encoding='utf-8').read()
-stringIO = StringIO(data)
-df = pd.read_csv(stringIO, sep="\t", header=None)
-df = df.drop(df.columns[[1]], axis=1)
-topics = pd.read_csv(pathY, sep='\t', header=None)
+nrows = 10000
 
-vectorizer = TfidfVectorizer()
+# %%
+# data = lzma.open(pathX, mode='rt', encoding='utf-8').read()
+# stringIO = StringIO(data)
+# df = pd.read_csv(stringIO, sep="\t", header=None)
+df = pd.read_csv(pathX, sep='\t', nrows=nrows, header=None)
+df = df.drop(df.columns[1], axis=1)
+topics = pd.read_csv(pathY, sep='\t', nrows=nrows, header=None)
+
+# %%
+print(len(df.index))
+
+print(len(topics.index))
+
+
+# %%
+df.sample()
+
+# %%
+vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
 X = vectorizer.fit_transform(df.to_numpy().ravel())
-print(vectorizer.get_feature_names_out())
\ No newline at end of file
+vectorizer.get_feature_names_out()
+
+
+# %%
+# vectorizer.transform("Ala ma kotka".lower().split())
+
+# %%
+df = df.reset_index()
+
+# %%
+tfidfVector = vectorizer.transform(df[0])
+
+
+
+# %%
+from sklearn.model_selection import train_test_split
+from sklearn.naive_bayes import GaussianNB
+
+gnb = GaussianNB()
+gnb.fit(tfidfVector.toarray(), topics[0])
+
+# %%
+testXPath = "./dev-0/in.tsv.xz"
+testYPath = "./dev-0/expected.tsv"
+
+testX = pd.read_csv(testXPath, sep='\t', nrows=nrows, header=None)
+
+testY = pd.read_csv(testYPath, sep='\t', nrows=nrows, header=None)
+testXtfidfVector = vectorizer.transform(testX[0])
+
+
+# %%
+testXPath = "./test-A/in.tsv.xz"
+testYPath = "./test-A/expected.tsv"
+
+testX = pd.read_csv(testXPath, sep='\t', nrows=nrows, header=None)
+
+# testY = pd.read_csv(testYPath, sep='\t', nrows=nrows, header=None)
+testXtfidfVector = vectorizer.transform(testX[0])
+
+
+# %%
+pred = gnb.predict(testXtfidfVector.toarray())
+print(pred)
+
+import csv
+with open(testYPath, 'w', newline='') as f_output:
+    tsv_output = csv.writer(f_output, delimiter='\n')
+    tsv_output.writerow(pred)
+
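
A possible follow-up sketch, not part of the diff above: the script reads the dev-0 expected labels into testY but never scores against them. Assuming the fitted vectorizer and gnb objects from run.py are still in scope, and the same dev-0 file layout and nrows cap apply, the dev-0 accuracy could be checked roughly like this:

# Hypothetical evaluation sketch; reuses `vectorizer` and `gnb` from run.py.
import pandas as pd
from sklearn.metrics import accuracy_score

devX = pd.read_csv("./dev-0/in.tsv.xz", sep='\t', nrows=10000, header=None)
devY = pd.read_csv("./dev-0/expected.tsv", sep='\t', nrows=10000, header=None)

# Transform with the already-fitted vectorizer; densify because GaussianNB
# does not accept sparse input.
devPred = gnb.predict(vectorizer.transform(devX[0]).toarray())
print(accuracy_score(devY[0], devPred))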