This commit is contained in:
ZarebaMichal 2022-04-03 19:28:02 +02:00
parent 3d96a41f40
commit 206774da84
3 changed files with 10132 additions and 10144 deletions

File diff suppressed because it is too large Load Diff

28
run.py
View File

@ -69,30 +69,18 @@ def prepare_output(file_path):
def train_model(training_data): def train_model(training_data):
for _, row in training_data.iterrows(): for index, row in training_data.iterrows():
text = preprocess_text(str(row["final"])) text = preprocess_text(str(row["final"]))
words = word_tokenize(text) words = word_tokenize(text)
for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True): for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
if all([w1, w2]): if w1 and w2 and w3:
model[(w1, w2)][w2] += 1 model[(w2, w3)][w1] += 1
total_count = 0 model[(w1, w2)][w3] += 1
for w1, w2 in model:
total_count = float(sum(model[(w1, w2)].values()))
for w3 in model[(w1, w2)]:
model[(w1, w2)][w3] /= total_count
# for index, row in training_data.iterrows():
# text = preprocess_text(str(row['final']))
# words = word_tokenize(text)
# for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
# if w1 and w2 and w3:
# model[(w1, w2)][w3] += 1
#
# for w1, w2 in model:
# total_count = float(sum(model[(w1, w2)].values()))
# for w3 in model:
# model[(w1, w2)][w3] /= total_count
# print(model) for word_pair in model:
num_n_grams = float(sum(model[word_pair].values()))
for word in model[word_pair]:
model[word_pair][word] /= num_n_grams
data = pd.read_csv( data = pd.read_csv(

File diff suppressed because it is too large Load Diff