diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..b607810
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,7 @@
+# Official Python image: the stock ubuntu image does not provide pip.
+FROM python:3.11
+WORKDIR /isi
+COPY . /isi
+RUN pip install --no-cache-dir -r /isi/requirements.txt
+# Exec form does not use a shell, so the interpreter must be named explicitly.
+CMD ["python", "program.py"]
\ No newline at end of file
diff --git a/program.py b/program.py
new file mode 100644
index 0000000..68599c6
--- /dev/null
+++ b/program.py
@@ -0,0 +1,39 @@
+"""Train a bag-of-words XGBoost classifier and write per-split predictions."""
+import pandas as pd
+import csv
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.preprocessing import LabelEncoder
+from xgboost import XGBClassifier
+import numpy as np
+
+
+def load_data(path):
+    """Return the lines of the UTF-8 file at ``path``, stripped of whitespace."""
+    with open(path, encoding='utf-8') as f:
+        return [line.strip() for line in f]
+
+
+def save_data(path, data):
+    """Write element 1 of every row in ``data`` to ``path``, one per line."""
+    with open(path, "w", encoding='utf-8') as out:
+        out.writelines(str(row[1]) + "\n" for row in data)
+
+
+def program():
+    """Fit on train/ and write predictions for dev-0, dev-1 and test-A."""
+    train = load_data("train/in.tsv")
+    # Labels are read as strings; current XGBoost releases only accept integer
+    # classes 0..n-1, so encode them first (sorted order, so "0"/"1" -> 0/1).
+    y = LabelEncoder().fit_transform(load_data("train/expected.tsv"))
+
+    vectorizer = CountVectorizer()
+    clf = XGBClassifier().fit(vectorizer.fit_transform(train), y)
+
+    # Same transform -> predict_proba -> save pipeline for every evaluation split.
+    for split in ("dev-0", "dev-1", "test-A"):
+        features = vectorizer.transform(load_data(f"{split}/in.tsv"))
+        save_data(f"{split}/out.tsv", clf.predict_proba(features))
+
+
+if __name__ == "__main__":
+    program()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..20de7f8
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+pandas
+scikit-learn
+xgboost
+numpy
\ No newline at end of file