Rozwiazanie zadania bayes2.
This commit is contained in:
parent
9cb2fb2612
commit
43d80423a4
5452
dev-0/out.tsv
Normal file
5452
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
67
rozwiazanie.py
Normal file
67
rozwiazanie.py
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
#import numpy as np
|
||||||
|
import gzip
|
||||||
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
|
||||||
|
from sklearn import metrics
|
||||||
|
|
||||||
|
#df = pd.read_csv('sport-text-classification-ball-ISI-public/train/train.tsv.gz', compression='gzip', header=None, sep='\t', error_bad_lines=False)
|
||||||
|
# Load the gzipped training set. Each row is "<label>\t<text column(s)>".
train_X = []
train_y = []
# newline='\n' keeps line splitting identical to iterating the binary stream
# (only '\n' ends a row; a stray '\r' inside the text does not).
with gzip.open('train/train.tsv.gz', 'rt', encoding='utf-8', newline='\n') as fin:
    for line in fin:
        # Split on the first tab only: the label goes to train_y and the rest
        # of the row is the document. Tabs inside the document are kept, so
        # the words on either side of a column break are not fused into one
        # token; the dev-0 / test-A rows keep their tabs in the same way.
        label, _, text = line.rstrip('\n').partition('\t')
        train_y.append(label)
        train_X.append(text)
|
||||||
|
|
||||||
|
# Read the dev-0 documents, one per line, with the trailing newline removed.
# The encoding is explicit so the text is decoded the same way as the
# training data (UTF-8) instead of depending on the platform's locale default.
with open('dev-0/in.tsv', 'r', encoding='utf-8') as test_in_file:
    test_X = [line.rstrip('\n') for line in test_in_file]
|
||||||
|
|
||||||
|
# Gold labels for dev-0: one label per line, newline stripped.
with open('dev-0/expected.tsv', 'r') as test_expected_file:
    test_y = [row.rstrip('\n') for row in test_expected_file]
|
||||||
|
|
||||||
|
# Fit the TF-IDF vectorizer on the training documents only, then project the
# dev-0 documents into that same feature space.
vectorizer = TfidfVectorizer(lowercase=True)

X_train_tf = vectorizer.fit_transform(train_X)
train_shape = X_train_tf.shape
print("n_samples: %d, n_features: %d" % train_shape)

X_test_tf = vectorizer.transform(test_X)
test_shape = X_test_tf.shape
print("n_samples: %d, n_features: %d" % test_shape)
|
||||||
|
|
||||||
|
# Train a multinomial Naive Bayes model on the TF-IDF training matrix
# (fit() returns the estimator itself), then label the dev-0 documents.
naive_bayes_classifier = MultinomialNB().fit(X_train_tf, train_y)
y_pred = naive_bayes_classifier.predict(X_test_tf)
|
||||||
|
|
||||||
|
# Evaluate the dev-0 predictions against the expected labels.
score1 = metrics.accuracy_score(test_y, y_pred)
print("accuracy: %0.3f" % score1)

# No target_names: classification_report lists classes in sorted label order
# and applies target_names positionally, so the previous ['1', '0'] would
# attach the wrong name to each row (presumably the labels are '0' and '1' --
# confirm against expected.tsv). Without it, rows carry the real label values.
print(metrics.classification_report(test_y, y_pred))

print("confusion matrix:")
print(metrics.confusion_matrix(test_y, y_pred))

print('------------------------------')
|
||||||
|
|
||||||
|
# Write the dev-0 predictions, one label per line. The context manager
# guarantees the file is flushed and closed even if a write fails.
with open('dev-0/out.tsv', "w", encoding='utf-8') as out_file:
    out_file.writelines("{}\n".format(label) for label in y_pred)
|
||||||
|
|
||||||
|
# Read the test-A documents, one per line, with the trailing newline removed.
# The encoding is explicit so the text is decoded the same way as the
# training data (UTF-8) instead of depending on the platform's locale default.
with open('test-A/in.tsv', 'r', encoding='utf-8') as test_in_file:
    val_X = [line.rstrip('\n') for line in test_in_file]

# Reuse the vectorizer fitted on the training set; it is not refitted here.
X_val_tf = vectorizer.transform(val_X)
print("n_samples: %d, n_features: %d" % X_val_tf.shape)

val_y_pred = naive_bayes_classifier.predict(X_val_tf)
|
||||||
|
|
||||||
|
# Write the test-A predictions, one label per line. The context manager
# guarantees the file is flushed and closed even if a write fails.
with open('test-A/out.tsv', "w", encoding='utf-8') as out_file:
    out_file.writelines("{}\n".format(label) for label in val_y_pred)
|
5447
test-A/out.tsv
Normal file
5447
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user