change script for fine-tuning alpha

This commit is contained in:
Łukasz Jędyk 2022-04-09 14:54:19 +02:00
parent 1b0c901e32
commit ca339fcfcc

18
run.py
View File

@ -1,5 +1,6 @@
import pandas as pd import pandas as pd
import csv import csv
import sys
import regex as re import regex as re
from collections import Counter, defaultdict from collections import Counter, defaultdict
from nltk import trigrams, word_tokenize from nltk import trigrams, word_tokenize
@ -17,7 +18,7 @@ class Model():
self.vocab = set() self.vocab = set()
def train(self, data): def train(self, data):
for _, row in data.iterrows(): for index, row in data.iterrows():
text = clean_text(str(row['text'])) text = clean_text(str(row['text']))
words = word_tokenize(text) words = word_tokenize(text)
for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True): for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
@ -26,6 +27,9 @@ class Model():
self.vocab.add(w2) self.vocab.add(w2)
self.vocab.add(w3) self.vocab.add(w3)
self.probs[(w1, w3)][w2] += 1 self.probs[(w1, w3)][w2] += 1
# limit number of data rows used for training
if index > 10000:
break
for w1_w3 in self.probs: for w1_w3 in self.probs:
total_count = float(sum(self.probs[w1_w3].values())) total_count = float(sum(self.probs[w1_w3].values()))
@ -46,15 +50,19 @@ class Model():
str_prediction += f'{word}:{prob} ' str_prediction += f'{word}:{prob} '
remaining_prob = 1 - total_prob remaining_prob = 1 - total_prob
if remaining_prob == 0:
remaining_prob = 0.01
str_prediction += f':{remaining_prob}' str_prediction += f':{remaining_prob}'
return str_prediction return str_prediction
# check arguments: exactly one is expected, the alpha smoothing parameter
if len(sys.argv) != 2:
    # sys.exit with a message writes it to stderr and exits with status 1,
    # so a usage error is not reported as success to the calling shell
    sys.exit('Wrong number of arguments. Expected 1 - alpha smoothing parameter.')
try:
    # command-line arguments are strings; convert so alpha is numeric
    alpha = float(sys.argv[1])
except ValueError:
    sys.exit(f'Invalid alpha smoothing parameter: {sys.argv[1]!r}. Expected a number.')
# load training data # load training data
train_data = pd.read_csv('train/in.tsv.xz', sep='\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE) train_data = pd.read_csv('train/in.tsv.xz', sep='\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)
train_labels = pd.read_csv('train/expected.tsv', sep='\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE) train_labels = pd.read_csv('train/expected.tsv', sep='\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)
@ -66,7 +74,7 @@ train_data['text'] = train_data[6] + train_data[0] + train_data[7]
train_data = train_data[['text']] train_data = train_data[['text']]
# init model with given alpha # init model with given alpha
model = Model(0.01) model = Model(alpha)
# train model probs # train model probs
model.train(train_data) model.train(train_data)