Compare commits

...

2 Commits

Author SHA1 Message Date
nlitkowski
511acc7aa7 Remove unused import 2021-05-12 23:43:02 +02:00
nlitkowski
86940a2dd9 Add out files 2021-05-12 23:42:43 +02:00
3 changed files with 10944 additions and 0 deletions

5452
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

47
main.py Normal file
View File

@@ -0,0 +1,47 @@
import pandas as pd
import numpy as np
import gzip
import os
import sys
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
# File names resolved relative to the working directory passed to main():
# the input file that is read and the predictions file that is written.
IN_FILE_NAME = "in.tsv"
OUT_FILE_NAME = "out.tsv"
def _load_training_data(train_path):
    """Read the gzipped training TSV and return ``(texts, labels)``.

    Each non-empty line is parsed as an integer label, a tab, and the
    document text (everything after the first tab). Blank lines are skipped.

    Raises:
        ValueError: If a label cannot be converted to ``int``.
    """
    texts = []
    labels = []
    # newline="\n" splits lines on "\n" only, matching binary-mode iteration.
    with gzip.open(train_path, "rt", encoding="utf-8", newline="\n") as f:
        for raw_line in f:
            line = raw_line.rstrip("\n")
            if not line:
                continue
            # Keep the text as a plain string; stringifying the list of
            # fields would add brackets, quotes and escapes to the features.
            label, _, text = line.partition("\t")
            labels.append(int(label))
            texts.append(text)
    return texts, labels


def main(dirname, train_path="train/train.tsv.gz"):
    """Train a TF-IDF + multinomial naive Bayes model and write predictions.

    Reads ``IN_FILE_NAME`` from *dirname*, fits the model on the training
    data, predicts a label for the first column of every input row and
    writes the predictions, one per line, to ``OUT_FILE_NAME`` in *dirname*.

    Args:
        dirname: Working directory containing the input file; the output
            file is written there as well.
        train_path: Path to the gzipped training TSV.

    Raises:
        FileNotFoundError: If the input file does not exist in *dirname*.
    """
    in_path = os.path.join(dirname, IN_FILE_NAME)
    if not os.path.exists(in_path):
        raise FileNotFoundError(f"Path {in_path} does not exist!")

    # on_bad_lines="skip" replaces error_bad_lines=False, which was removed
    # in pandas 2.0.
    # NOTE(review): skipped rows get no prediction, so the output may have
    # fewer lines than the input - confirm this is acceptable.
    data = pd.read_table(in_path, on_bad_lines="skip", header=None)

    train_texts, train_labels = _load_training_data(train_path)

    model = make_pipeline(TfidfVectorizer(), MultinomialNB())
    # The texts stay a list: a NumPy string array would pad every document
    # to the length of the longest one.
    model.fit(train_texts, np.asarray(train_labels))

    pred = model.predict(data[0].values)
    pred.tofile(os.path.join(dirname, OUT_FILE_NAME), sep="\n")
if __name__ == "__main__":
    # The first command-line argument names the working directory.
    cli_args = sys.argv[1:]
    if not cli_args:
        raise Exception("Name of working dir not specified!")
    main(cli_args[0])

5445
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff