solution
This commit is contained in:
parent
6916e0e54b
commit
f4414f094d
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
68
run.py
68
run.py
|
@ -1,48 +1,30 @@
|
|||
from naivebayes import NaiveBayesTextClassifier
|
||||
from spacy.lang.en.stop_words import STOP_WORDS as en_stop
|
||||
import pandas as pd
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.naive_bayes import MultinomialNB
|
||||
|
||||
naive_bayes = NaiveBayesTextClassifier(
|
||||
categories=[0, 1],
|
||||
stop_words=en_stop
|
||||
)
|
||||
with open('train/in.tsv', 'r', encoding='utf-8') as f:
|
||||
x_train = pd.DataFrame([line.strip().split('\t') for line in f.readlines()], columns=['text', 'text_id'])
|
||||
with open('dev-0/in.tsv', 'r', encoding='utf-8') as f:
|
||||
x_dev = pd.DataFrame([line.strip().split('\t') for line in f.readlines()], columns=['text', 'text_id'])
|
||||
with open('train/in.tsv', 'r', encoding='utf-8') as f:
|
||||
x_test = pd.DataFrame([line.strip().split('\t') for line in f.readlines()], columns=['text', 'text_id'])
|
||||
|
||||
with open('train/in.tsv', 'r', encoding='utf8') as f:
|
||||
train = f.readlines()
|
||||
y_train = pd.read_csv('train/expected.tsv', sep='\t', names=['paranormal'], encoding='utf-8')
|
||||
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, max_features=500)
|
||||
x_train_vectorized = tfidf_vectorizer.fit_transform(x_train['text'].values)
|
||||
|
||||
with open('train/expected.tsv', 'r', encoding='utf8') as f:
|
||||
expected = f.readlines()
|
||||
mnb_model = MultinomialNB().fit(x_train_vectorized, y_train.values.ravel())
|
||||
|
||||
for i in range(0, len(expected)):
|
||||
expected[i] = int(expected[i])
|
||||
# Dev data
|
||||
x_dev_prepared = tfidf_vectorizer.transform(x_dev['text'].values)
|
||||
predictions = mnb_model.predict(x_dev_prepared)
|
||||
with open('dev-0/out.tsv', 'w') as f:
|
||||
for pred in predictions:
|
||||
f.write(f'{pred}\n')
|
||||
|
||||
step = 20000
|
||||
start, end = 0, step
|
||||
|
||||
for i in range(0, len(expected), step):
|
||||
naive_bayes.train(train[start:end], expected[start:end])
|
||||
if start + step < len(expected):
|
||||
start += step
|
||||
else:
|
||||
start = 0
|
||||
end = min(start + step, len(expected))
|
||||
|
||||
|
||||
with open('dev-0/in.tsv', 'r', encoding='utf8') as f:
|
||||
dev_0 = f.readlines()
|
||||
|
||||
predicted_dev_0 = naive_bayes.classify(dev_0)
|
||||
|
||||
with open('dev-0/out.tsv', 'wt') as f:
|
||||
for p in predicted_dev_0:
|
||||
f.write(str(p) + '\n')
|
||||
f.close()
|
||||
|
||||
with open('test-A/in.tsv', 'r', encoding='utf8') as f:
|
||||
test_A = f.readlines()
|
||||
|
||||
predicted_test_A = naive_bayes.classify(test_A)
|
||||
|
||||
with open('test-A/out.tsv', 'wt') as f:
|
||||
for p in predicted_test_A:
|
||||
f.write(str(p) + '\n')
|
||||
f.close()
|
||||
# Test data
|
||||
x_test_vectorized = tfidf_vectorizer.transform(x_test['text'].values)
|
||||
predictions = mnb_model.predict(x_test_vectorized)
|
||||
with open('test-A/out.tsv', 'w') as f:
|
||||
for pred in predictions:
|
||||
f.write(f'{pred}\n')
|
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue