s444386 bayes
This commit is contained in:
parent
9cb2fb2612
commit
0de016faa0
5452
dev-0/out.tsv
Normal file
5452
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
43
run.py
Normal file
43
run.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
import lzma
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import gzip
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
|
from sklearn.pipeline import make_pipeline
|
||||||
|
from sklearn.metrics import accuracy_score
|
||||||
|
|
||||||
|
def readFile(filename):
    """Return the first tab-separated field of every line of a UTF-8 file.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one string per input line: the text before the first
        tab character, with leading and trailing whitespace removed.
        Lines without a tab contribute the whole (stripped) line; blank
        lines contribute an empty string.
    """
    with open(filename, 'r', encoding="utf-8") as in_file:
        # Only the first column is kept; any further columns are ignored.
        return [line.split("\t")[0].strip() for line in in_file]
|
||||||
|
|
||||||
|
def writePred(filename, predictions):
    """Write each prediction on its own line of a text file.

    Args:
        filename: Path of the output file; it is created or overwritten.
        predictions: Iterable of values; each is converted with ``str``
            and followed by a newline.
    """
    # Explicit UTF-8 keeps the output independent of the platform's default
    # encoding, consistent with how readFile opens its input.
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.writelines(str(pred) + "\n" for pred in predictions)
|
||||||
|
|
||||||
|
# Load the gzip-compressed training data: a headerless, tab-separated file
# whose two columns are named 'isBall' (label) and 'text' here.
with gzip.open('train/train.tsv.gz', 'rb') as f:
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
    # `on_bad_lines='warn'` is the replacement that keeps the old behaviour
    # of warning about a malformed row and skipping it.
    data = pd.read_csv(f, sep='\t', on_bad_lines='warn', names=['isBall', 'text'])

x = np.asarray(data['text'])
y = np.asarray(data['isBall'])

# TF-IDF features fed into a multinomial naive Bayes classifier.
model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(x, y)

# Predict for each evaluation split and write the result next to its input.
for split_dir in ('dev-0', 'test-A'):
    dev = readFile(split_dir + '/in.tsv')
    pred = model.predict(dev)
    writePred(split_dir + '/out.tsv', pred)
|
||||||
|
|
||||||
|
|
5447
test-A/out.tsv
Normal file
5447
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user