sport-text-classification-b.../main.py

49 lines
1.2 KiB
Python
Raw Normal View History

2021-05-12 23:42:43 +02:00
import pandas as pd
import numpy as np
import gzip
import os
import sys
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
# Name of the tab-separated input file that main() reads from the working directory.
IN_FILE_NAME = "in.tsv"
# Name of the file, in the same working directory, that main() writes the predictions to.
OUT_FILE_NAME = "out.tsv"
def _read_training_data(train_path):
    """Load texts and labels from a gzipped, tab-separated training file.

    Every non-empty line must hold an integer label in its first field,
    followed by the text field(s).

    Args:
        train_path: Path to the gzip-compressed, UTF-8 encoded TSV file.

    Returns:
        A ``(texts, labels)`` pair of NumPy arrays of equal length.

    Raises:
        ValueError: If the first field of a line is not an integer.
    """
    texts = []
    labels = []
    # newline="\n" splits lines only on "\n", exactly like iterating the
    # file in binary mode would.
    with gzip.open(train_path, "rt", encoding="utf-8", newline="\n") as f:
        for raw_line in f:
            line = raw_line.rstrip("\n")
            if not line:
                # A blank (e.g. trailing) line carries no example.
                continue
            label, *text_fields = line.split("\t")
            labels.append(int(label))
            # Join the text fields back into one string; calling str() on the
            # list would feed its repr (brackets, quotes, escape sequences)
            # to the vectorizer instead of the text itself.
            texts.append("\t".join(text_fields))
    return np.asarray(texts), np.asarray(labels)


def _read_input_table(in_path):
    """Read the headerless TSV input, skipping malformed rows with a warning."""
    # NOTE(review): pandas' default quote handling is left as in the original;
    # confirm whether the input may contain unbalanced double quotes.
    try:
        # Spelling for pandas >= 1.3. "warn" mirrors the old
        # error_bad_lines=False behaviour of reporting each skipped row.
        return pd.read_table(in_path, on_bad_lines="warn", header=None)
    except TypeError:
        # Older pandas releases do not know the on_bad_lines keyword.
        return pd.read_table(in_path, error_bad_lines=False, header=None)


def main(dirname, train_path="train/train.tsv.gz"):
    """Train a TF-IDF + multinomial naive Bayes model and label the input.

    Reads ``IN_FILE_NAME`` from ``dirname``, fits the classifier on the
    training file, predicts a label for the first column of every input row
    and writes the predictions, separated by newlines, to ``OUT_FILE_NAME``
    in ``dirname``.

    Args:
        dirname: Working directory holding the input file; the output file
            is written there as well.
        train_path: Path to the gzipped training TSV. Defaults to the
            location the script has always used, relative to the current
            working directory.

    Raises:
        FileNotFoundError: If the input file does not exist in ``dirname``.
    """
    in_path = os.path.join(dirname, IN_FILE_NAME)
    if not os.path.exists(in_path):
        raise FileNotFoundError(f"Path {in_path} does not exist!")

    test_data = _read_input_table(in_path)
    X_train, y_train = _read_training_data(train_path)

    # pandas turns empty cells into NaN and may infer a numeric dtype; the
    # vectorizer only accepts strings, so normalise every cell to str.
    X = ["" if pd.isna(value) else str(value) for value in test_data[0]]

    model = make_pipeline(TfidfVectorizer(), MultinomialNB())
    model.fit(X_train, y_train)
    pred = model.predict(X)
    pred.tofile(os.path.join(dirname, OUT_FILE_NAME), sep='\n')
if __name__ == "__main__":
    # The working directory is expected as the first command-line argument;
    # any further arguments are ignored.
    cli_args = sys.argv[1:]
    if not cli_args:
        raise Exception("Name of working dir not specified!")
    main(cli_args[0])