2021-05-08 19:02:05 +02:00
|
|
|
import numpy as np
|
|
|
|
from sklearn.preprocessing import LabelEncoder
|
|
|
|
from sklearn.naive_bayes import MultinomialNB
|
2021-05-08 22:45:55 +02:00
|
|
|
from sklearn.pipeline import make_pipeline
|
2021-05-08 19:02:05 +02:00
|
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
|
2021-05-08 22:45:55 +02:00
|
|
|
def Create_model(X_tsv, Y_tsv):
    """Train a TF-IDF + multinomial naive Bayes text classifier.

    Args:
        X_tsv: Path to a text file; every line is used as one training
            sample (trailing newlines are kept, as ``readlines`` returns
            them).
        Y_tsv: Path to a text file; every line is used as the label of the
            sample on the same line of ``X_tsv``.

    Returns:
        The fitted scikit-learn pipeline. The labels are integer-encoded by
        a ``LabelEncoder`` that is not kept, so the returned model predicts
        the encoded integer codes, not the original label strings.
    """
    # Decode explicitly as UTF-8: without it, open() falls back to the
    # platform's locale encoding and the features would differ per machine.
    with open(X_tsv, encoding="utf-8") as f:
        X = f.readlines()

    with open(Y_tsv, encoding="utf-8") as f:
        Y = f.readlines()

    Y = LabelEncoder().fit_transform(Y)
    pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())

    return pipeline.fit(X, Y)
|
2021-05-08 19:02:05 +02:00
|
|
|
|
2021-05-08 22:45:55 +02:00
|
|
|
|
|
|
|
def predict(model, X_tsv, file_name):
    """Run ``model`` on every line of ``X_tsv`` and save the predictions.

    Args:
        model: Object with a ``predict`` method that accepts the list of
            input lines and returns one numeric prediction per line.
        X_tsv: Path to the input text file; every line (trailing newline
            included) is passed to the model as one sample.
        file_name: Path of the output file. Predictions are written one per
            line, formatted as integers (``%d``).
    """
    # Decode explicitly as UTF-8: without it, open() falls back to the
    # platform's locale encoding and the input text would differ per machine.
    with open(X_tsv, encoding="utf-8") as f:
        X = f.readlines()

    prediction = model.predict(X)
    np.savetxt(file_name, prediction, fmt="%d")
|
2021-05-08 19:02:05 +02:00
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Train on the ``train`` split, then write predictions for each split.

    The model is fitted once from ``train/in.tsv`` and
    ``train/expected.tsv`` and then applied, in order, to the ``dev-0`` and
    ``test-A`` inputs; each result goes to the ``out.tsv`` in that
    directory.
    """
    classifier = Create_model("train/in.tsv", "train/expected.tsv")

    # (input path, output path) pairs, processed in this order.
    targets = (
        ("dev-0/in.tsv", "dev-0/out.tsv"),
        ("test-A/in.tsv", "test-A/out.tsv"),
    )
    for source_path, output_path in targets:
        predict(classifier, source_path, output_path)
|
|
|
|
|
|
|
|
|
|
|
|
# Run the train-and-predict workflow only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
|