#!/usr/bin/python3
"""Train a TF-IDF + k-nearest-neighbours text classifier and write predictions.

The script reads the training set from ``train/``, fits the model, and writes
class probabilities for ``dev-0/`` and ``test-A/`` into their ``out.tsv`` files.
All paths are relative to the current working directory.
"""
import csv

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier

# Module-level instance kept as in the original file; program() creates and
# uses its own local CountVectorizer, so this one is not touched by it.
vectorizer = CountVectorizer()

# Column names assigned to the header-less input files.
_COLUMNS = ["document", "date"]


def _read_documents(path):
    """Return the ``document`` column of the tab-separated file at *path*.

    Malformed rows are skipped instead of aborting the run.

    NOTE(review): skipped rows mean the written predictions can have fewer
    rows than the input file, and unlike the training file these files are
    parsed with pandas' default quoting rather than ``csv.QUOTE_NONE`` --
    confirm that both are intended.
    """
    options = dict(sep="\t", header=None, names=_COLUMNS)
    try:
        # ``on_bad_lines`` replaced ``error_bad_lines`` (removed in pandas 2.0).
        frame = pd.read_csv(path, on_bad_lines="skip", **options)
    except TypeError:
        # pandas < 1.3 does not know ``on_bad_lines``; use the old spelling.
        frame = pd.read_csv(path, error_bad_lines=False, **options)
    return frame["document"]


def program():
    """Fit the classifier on the training data and write probability files.

    Steps:
      1. Read documents from ``train/in.tsv.xz`` and labels from
         ``train/expected.tsv``.
      2. Fit ``CountVectorizer`` -> ``TfidfTransformer`` on the training
         documents and a ``KNeighborsClassifier`` (8 neighbours) on the result.
      3. Transform the ``dev-0`` and ``test-A`` documents with the fitted
         vectorizer/transformer and write ``predict_proba`` output to
         ``test-A/out.tsv`` and ``dev-0/out.tsv`` using the ``%f`` format.

    Returns:
        None. The results are the two files written to disk.
    """
    # The training file is a TSV like the dev/test files, so it needs an
    # explicit tab separator; the default comma would split the text itself.
    train = pd.read_csv(
        "train/in.tsv.xz",
        sep="\t",
        header=None,
        names=_COLUMNS,
        quoting=csv.QUOTE_NONE,
    )
    documents = train["document"]

    labels = pd.read_csv("train/expected.tsv", header=None, sep=" ")
    # A single label column is handed to scikit-learn as a 1-D Series rather
    # than an (n, 1) frame; a multi-column frame is passed through unchanged.
    y = labels.squeeze(axis="columns")

    # Local vectorizer (shadows the module-level one) fitted on training text.
    vectorizer = CountVectorizer()
    tfidf_transformer = TfidfTransformer()
    x_counts = vectorizer.fit_transform(documents)
    x_tfidf = tfidf_transformer.fit_transform(x_counts)

    clf = KNeighborsClassifier(n_neighbors=8).fit(x_tfidf, y)

    dev0 = _read_documents("dev-0/in.tsv.xz")
    test_a = _read_documents("test-A/in.tsv.xz")

    # Reuse the vocabulary and IDF weights learned from the training set.
    dev0_tfidf = tfidf_transformer.transform(vectorizer.transform(dev0))
    test_a_tfidf = tfidf_transformer.transform(vectorizer.transform(test_a))

    y_dev = clf.predict_proba(dev0_tfidf)
    y_test = clf.predict_proba(test_a_tfidf)

    np.savetxt('test-A/out.tsv', y_test, '%f')
    np.savetxt('dev-0/out.tsv', y_dev, '%f')


# Runs on execution and on import, exactly as the original script did.
program()