s430705
This commit is contained in:
parent
3d96a41f40
commit
206774da84
10124
dev-0/out.tsv
10124
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
28
run.py
28
run.py
@ -69,30 +69,18 @@ def prepare_output(file_path):
|
|||||||
|
|
||||||
|
|
||||||
def train_model(training_data):
|
def train_model(training_data):
|
||||||
for _, row in training_data.iterrows():
|
for index, row in training_data.iterrows():
|
||||||
text = preprocess_text(str(row["final"]))
|
text = preprocess_text(str(row["final"]))
|
||||||
words = word_tokenize(text)
|
words = word_tokenize(text)
|
||||||
for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
|
for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
|
||||||
if all([w1, w2]):
|
if w1 and w2 and w3:
|
||||||
model[(w1, w2)][w2] += 1
|
model[(w2, w3)][w1] += 1
|
||||||
total_count = 0
|
model[(w1, w2)][w3] += 1
|
||||||
for w1, w2 in model:
|
|
||||||
total_count = float(sum(model[(w1, w2)].values()))
|
|
||||||
for w3 in model[(w1, w2)]:
|
|
||||||
model[(w1, w2)][w3] /= total_count
|
|
||||||
# for index, row in training_data.iterrows():
|
|
||||||
# text = preprocess_text(str(row['final']))
|
|
||||||
# words = word_tokenize(text)
|
|
||||||
# for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
|
|
||||||
# if w1 and w2 and w3:
|
|
||||||
# model[(w1, w2)][w3] += 1
|
|
||||||
#
|
|
||||||
# for w1, w2 in model:
|
|
||||||
# total_count = float(sum(model[(w1, w2)].values()))
|
|
||||||
# for w3 in model:
|
|
||||||
# model[(w1, w2)][w3] /= total_count
|
|
||||||
|
|
||||||
# print(model)
|
for word_pair in model:
|
||||||
|
num_n_grams = float(sum(model[word_pair].values()))
|
||||||
|
for word in model[word_pair]:
|
||||||
|
model[word_pair][word] /= num_n_grams
|
||||||
|
|
||||||
|
|
||||||
data = pd.read_csv(
|
data = pd.read_csv(
|
||||||
|
10124
test-A/out.tsv
10124
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user