knn tf
This commit is contained in:
parent
0f31b5f89d
commit
fcd3461f16
10544
dev-0/out.tsv
10544
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
38
program_knn.py
Normal file
38
program_knn.py
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import csv
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
|
from sklearn.feature_extraction.text import TfidfTransformer
|
||||||
|
from sklearn.neighbors import KNeighborsClassifier
|
||||||
|
# NOTE(review): program() constructs its own local CountVectorizer, so this
# module-level instance is not the one that gets fitted there.
vectorizer = CountVectorizer()
|
||||||
|
|
||||||
|
def _read_documents(path):
    """Return the ``document`` column of a tab-separated ``in.tsv`` file.

    Args:
        path: Path to the TSV file (possibly compressed; pandas infers the
            compression from the file extension).

    Returns:
        A pandas Series holding the first column of every parsed row.
    """
    frame = pd.read_csv(
        path,
        sep="\t",  # the inputs are TSV; the default "," would split the text
        header=None,
        names=["document", "date"],
        usecols=["document"],  # the date column is never used
        # Treat quote characters as ordinary text so a stray '"' inside a
        # document cannot merge several input rows into one field.
        quoting=csv.QUOTE_NONE,
    )
    return frame["document"]


def program():
    """Train a TF-IDF + k-NN classifier and write class probabilities.

    Reads the training documents from ``train/in.tsv.xz`` and their labels
    from ``train/expected.tsv``, fits ``CountVectorizer`` ->
    ``TfidfTransformer`` -> ``KNeighborsClassifier(n_neighbors=8)``, and
    writes the ``predict_proba`` output for ``dev-0/in.tsv.xz`` and
    ``test-A/in.tsv.xz`` to ``dev-0/out.tsv`` and ``test-A/out.tsv``.

    All three input files are parsed with the same settings so that the
    features seen at prediction time are built the same way as the features
    the model was trained on. Malformed rows are not skipped: dropping them
    would silently yield fewer predictions than input lines.
    """
    documents = _read_documents("train/in.tsv.xz")

    # Collapse the single-column frame to a 1-D Series; scikit-learn warns
    # when a column-vector y is passed to fit().
    labels = pd.read_csv(
        "train/expected.tsv", header=None, sep=" "
    ).squeeze(axis=1)

    vectorizer = CountVectorizer()
    tfidf_transformer = TfidfTransformer()
    x_tfidf = tfidf_transformer.fit_transform(
        vectorizer.fit_transform(documents)
    )

    clf = KNeighborsClassifier(n_neighbors=8).fit(x_tfidf, labels)

    for split in ("dev-0", "test-A"):
        texts = _read_documents(split + "/in.tsv.xz")
        # Re-use the fitted vocabulary and IDF weights; never re-fit here.
        features = tfidf_transformer.transform(vectorizer.transform(texts))
        # predict_proba yields one row per document and one column per class.
        probabilities = clf.predict_proba(features)
        np.savetxt(split + "/out.tsv", probabilities, fmt="%f")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Run the pipeline only when executed as a script, so that importing this
    # module does not trigger training and file writes.
    program()
|
10304
test-A/out.tsv
10304
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user