59 lines
2.2 KiB
Python
59 lines
2.2 KiB
Python
from sklearn.feature_extraction.text import CountVectorizer
|
|
from sklearn.feature_extraction.text import TfidfTransformer
|
|
from sklearn.neighbors import KNeighborsClassifier
|
|
import pandas as pd
|
|
import csv
|
|
|
|
|
|
def generate_output(predicted_proba, path, lower=0.05, upper=0.95):
    """Write clipped probabilities to *path*, one value per line.

    For every row of ``predicted_proba`` the element at index 1 is written
    after being limited to the range ``[lower, upper]``.

    Args:
        predicted_proba: Array-like with a ``clip`` method (e.g. a NumPy
            array) whose rows can be indexed with ``[1]``. It is not
            modified; clipping is done on a copy.
        path: Destination file. It is created or overwritten.
        lower: Smallest value that will be written (default 0.05).
        upper: Largest value that will be written (default 0.95).
    """
    # ``clip`` returns a new array, so the caller's data stays untouched.
    clipped = predicted_proba.clip(lower, upper)

    # The context manager guarantees the file is flushed and closed even if
    # formatting a row raises.
    with open(path, "w") as out_file:
        out_file.writelines(f"{row[1]}\n" for row in clipped)
|
|
|
|
|
|
def _load_texts(path):
    """Return column 0 of a tab-separated input file as unicode strings.

    The file is parsed the same way as the training input: tab separator and
    no quote handling, so quote characters inside the text are kept literally
    instead of merging fields or lines.
    """
    frame = pd.read_csv(
        path,
        header=None,
        sep='\t',
        quoting=csv.QUOTE_NONE,
        # Replacement for the removed ``error_bad_lines=False`` option.
        # NOTE(review): skipped lines produce no prediction, so the output
        # can end up with fewer rows than the input file.
        on_bad_lines='skip',
    )
    # Cast to unicode so missing cells become the string 'nan' instead of
    # making the vectorizer fail, mirroring how the training text is handled.
    return frame[0].values.astype('U')


def _predict_split(split, vectorizer, tfidf, classifier):
    """Predict probabilities for ``<split>/in.tsv.xz`` and write ``<split>/out.tsv``."""
    texts = _load_texts(f"{split}/in.tsv.xz")
    # Only ``transform`` is used here: both steps were already fitted on the
    # training data.
    features = tfidf.transform(vectorizer.transform(texts))

    print(f"Generating {split} output...")
    generate_output(classifier.predict_proba(features), f"{split}/out.tsv")
    print(f"Generated {split} output!")


# The training input is a .tsv file, so it has to be split on tabs just like
# the dev/test inputs (the pandas default separator is a comma).
training_data = pd.read_csv(
    'train/in.tsv.xz',
    header=None,
    names=["text", "date"],
    sep='\t',
    quoting=csv.QUOTE_NONE,
)
y_train = pd.read_csv('train/expected.tsv', header=None, sep=' ')

# Bag-of-words counts for the training text.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(training_data['text'].values.astype('U'))

# Transform the count matrix to a normalized tf-idf representation.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

knn = KNeighborsClassifier(n_neighbors=15)

# Train the classifier. The labels are flattened to 1-D because passing the
# single-column frame directly makes scikit-learn emit a DataConversionWarning.
clf = knn.fit(X_train_tfidf, y_train.values.ravel())

# Produce predictions for every evaluation split with the fitted pipeline.
for split_name in ("dev-0", "test-A"):
    _predict_split(split_name, count_vect, tfidf_transformer, knn)
|