Add initial implementation
This commit is contained in:
parent
756ef4277a
commit
5bb9042abf
58
run.py
Normal file
58
run.py
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
import lzma
|
||||||
|
from naivebayes import NaiveBayesTextClassifier
|
||||||
|
from spacy.lang.en.stop_words import STOP_WORDS as en_stop
|
||||||
|
|
||||||
|
|
||||||
|
def get_data(fname):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Every returned line keeps its trailing newline when the file had one.
    """
    with open(fname, mode='r', encoding='utf8') as handle:
        # Iterating a text file yields the same lines readlines() would.
        return list(handle)
|
||||||
|
|
||||||
|
|
||||||
|
def get_data_zipped(fname):
    """Read an xz/LZMA-compressed file and return its decompressed lines.

    The archive is opened in binary mode ('r'), so each returned line is a
    ``bytes`` object with its line terminator still attached.
    """
    with lzma.open(fname, mode='r') as archive:
        lines = archive.readlines()
    return lines
|
||||||
|
|
||||||
|
|
||||||
|
def train_bayes(model, x, y, step=15000):
    """Train ``model`` incrementally on consecutive batches of the data.

    Args:
        model: Object exposing ``train(samples, labels)``; it is called once
            per batch, in order.
        x: Sequence of samples (must support ``len()`` and slicing).
        y: Sequence of labels, sliced with the same bounds as ``x``.
        step: Maximum number of samples per batch.

    Nothing is trained when ``x`` is empty.
    """
    total = len(x)
    # range() yields the start offset of every batch, so no manual
    # start/end bookkeeping is needed.
    for start in range(0, total, step):
        # Clamp to the sample count so the final (possibly shorter) batch
        # never pulls in labels beyond the last sample.
        stop = min(start + step, total)
        model.train(x[start:stop], y[start:stop])
|
||||||
|
|
||||||
|
|
||||||
|
def write_file(fname, data):
    """Write each item of ``data`` to ``fname`` as one line of UTF-8 text.

    Args:
        fname: Path of the output file; existing content is overwritten.
        data: Iterable of values; each one is converted with ``str()`` and
            followed by a newline.
    """
    # An explicit encoding keeps the output independent of the platform
    # default and matches the 'utf8' used for reading in get_data.
    with open(fname, 'wt', encoding='utf8') as f:
        f.writelines(f'{d!s}\n' for d in data)
|
||||||
|
|
||||||
|
def main():
    """Train the classifier on the train set and write dev/test predictions.

    Reads inputs from the ``train``, ``test-A`` and ``dev-0`` directories and
    writes one prediction per line to ``dev-0/out.tsv`` and
    ``test-A/out.tsv``.
    """
    # Training samples are xz-compressed; the labels are plain text.
    samples = get_data_zipped('train/in.tsv.xz')
    raw_labels = get_data('train/expected.tsv')
    # preprocessing: one integer label per line
    labels = list(map(int, raw_labels))

    test_samples = get_data_zipped('test-A/in.tsv.xz')

    dev_samples = get_data_zipped('dev-0/in.tsv.xz')

    classifier = NaiveBayesTextClassifier(categories=[0, 1], stop_words=en_stop)

    # Feed the training data to the model in batches.
    train_bayes(classifier, samples, labels)

    dev_predictions = classifier.classify(dev_samples)
    test_predictions = classifier.classify(test_samples)

    write_file('dev-0/out.tsv', dev_predictions)
    write_file('test-A/out.tsv', test_predictions)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the pipeline only when this file is executed as a script, so that
# importing the module does not trigger training and file writes.
if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue
Block a user