s430705
This commit is contained in:
parent
3d96a41f40
commit
206774da84
10124
dev-0/out.tsv
10124
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
28
run.py
28
run.py
@ -69,30 +69,18 @@ def prepare_output(file_path):
|
||||
|
||||
|
||||
def train_model(training_data):
|
||||
for _, row in training_data.iterrows():
|
||||
for index, row in training_data.iterrows():
|
||||
text = preprocess_text(str(row["final"]))
|
||||
words = word_tokenize(text)
|
||||
for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
|
||||
if all([w1, w2]):
|
||||
model[(w1, w2)][w2] += 1
|
||||
total_count = 0
|
||||
for w1, w2 in model:
|
||||
total_count = float(sum(model[(w1, w2)].values()))
|
||||
for w3 in model[(w1, w2)]:
|
||||
model[(w1, w2)][w3] /= total_count
|
||||
# for index, row in training_data.iterrows():
|
||||
# text = preprocess_text(str(row['final']))
|
||||
# words = word_tokenize(text)
|
||||
# for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
|
||||
# if w1 and w2 and w3:
|
||||
# model[(w1, w2)][w3] += 1
|
||||
#
|
||||
# for w1, w2 in model:
|
||||
# total_count = float(sum(model[(w1, w2)].values()))
|
||||
# for w3 in model:
|
||||
# model[(w1, w2)][w3] /= total_count
|
||||
if w1 and w2 and w3:
|
||||
model[(w2, w3)][w1] += 1
|
||||
model[(w1, w2)][w3] += 1
|
||||
|
||||
# print(model)
|
||||
for word_pair in model:
|
||||
num_n_grams = float(sum(model[word_pair].values()))
|
||||
for word in model[word_pair]:
|
||||
model[word_pair][word] /= num_n_grams
|
||||
|
||||
|
||||
data = pd.read_csv(
|
||||
|
10124
test-A/out.tsv
10124
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user