diff --git a/geval b/geval
old mode 100644
new mode 100755
diff --git a/main.py b/main.py
index 545f5a4..e0ba702 100644
--- a/main.py
+++ b/main.py
@@ -1,34 +1,36 @@
 import numpy as np
 from sklearn.preprocessing import LabelEncoder
 from sklearn.naive_bayes import MultinomialNB
-from sklearn.pipeline import Pipeline
+from sklearn.pipeline import make_pipeline
 from sklearn.feature_extraction.text import TfidfVectorizer
 
 
-def train_model(train_in, train_expected):
-    with open(train_expected, 'r', encoding='utf-8') as f:
-        exp = f.readlines()
+def Create_model(X_tsv, Y_tsv):
 
-    with open(train_in, 'r', encoding='utf-8') as f:
-        train_data = f.readlines()
+    with open(X_tsv) as f:
+        X = f.readlines()
 
-    exp_encoded = LabelEncoder().fit_transform(exp)
-    pipeline = Pipeline(steps=[
-        ('tfidf', TfidfVectorizer()),
-        ('naive-bayes', MultinomialNB())
-    ])
+    with open(Y_tsv) as f:
+        Y = f.readlines()
 
-    return pipeline.fit(train_data, exp_encoded)
+    Y = LabelEncoder().fit_transform(Y)
+    pipeline = make_pipeline(TfidfVectorizer(),MultinomialNB())
+
+    return pipeline.fit(X, Y)
 
 
-def predict(model, in_file, out_file):
-    with open(in_file, 'r', encoding='utf-8') as f:
-        lines = f.readlines()
-    prediction = model.predict(lines)
-    np.savetxt(out_file, prediction, fmt='%d')
+def predict(model, X_tsv, file_name):
+
+    with open(X_tsv) as f:
+        X = f.readlines()
+
+    prediction = model.predict(X)
+    np.savetxt(file_name, prediction, fmt='%d')
 
 
 def main():
-    model = train_model("train/in.tsv", "train/expected.tsv")
+
+    model = Create_model("train/in.tsv", "train/expected.tsv")
+
     predict(model, "dev-0/in.tsv", "dev-0/out.tsv")
     predict(model, "test-A/in.tsv", "test-A/out.tsv")