# sport-text-classification-ball/run.py
#
# 44 lines
# 1.1 KiB
# Python

import lzma
import pandas as pd
import numpy as np
import gzip
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
#test
def readFile(filename):
    """Return the first tab-separated field of every line in a file.

    Args:
        filename: Path to a UTF-8 encoded, tab-separated text file.

    Returns:
        A list of strings, one per input line, each holding the first
        column with surrounding whitespace removed. Blank lines yield
        empty strings; an empty file yields an empty list.
    """
    with open(filename, 'r', encoding="utf-8") as tsv_in:
        # Only the first column is used; any further columns are ignored.
        return [line.partition("\t")[0].strip() for line in tsv_in]
def writePred(filename, predictions):
    """Write predictions to a file, one per line.

    Args:
        filename: Path of the output file; it is created or overwritten.
        predictions: Iterable of predicted labels. Each one is converted
            with ``str()`` and followed by a newline.
    """
    # Explicit UTF-8 keeps the output independent of the platform default
    # encoding and matches the encoding readFile uses for its input.
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.writelines(str(pred) + "\n" for pred in predictions)
# Load the gzipped, tab-separated training set; the two columns are named
# 'isBall' (used as the label) and 'text' (used as the input).
with gzip.open('train/train.tsv.gz', 'rb') as f:
    # on_bad_lines='warn' replaces error_bad_lines=False, which was
    # deprecated in pandas 1.3 and removed in pandas 2.0. Malformed rows are
    # still skipped with a warning instead of aborting the load.
    data = pd.read_csv(
        f,
        sep='\t',
        on_bad_lines='warn',
        names=['isBall', 'text'],
    )

x = np.asarray(data['text'])
y = np.asarray(data['isBall'])

# TF-IDF features fed into a multinomial naive Bayes classifier.
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(x, y)

# Predict for each evaluation split and write the labels next to its input.
for split in ('dev-0', 'test-A'):
    dev = readFile(split + '/in.tsv')
    pred = model.predict(dev)
    writePred(split + '/out.tsv', pred)