commit 4807f6e442
parent c9d855a803

.gitignore (vendored, new file, 8 lines)
@@ -0,0 +1,8 @@
*~
*.swp
*.bak
*.pyc
*.o
.DS_Store
.token
dev-0/out.tsv (new file, 10519 lines)
(file diff suppressed because it is too large)
generate_model.py (new file, 7 lines)
@@ -0,0 +1,7 @@
import subprocess

# Path to the local KenLM build.
KENLM_BUILD_PATH = '/home/ladislaus_iii/kenlm/build'

# Train a 5-gram language model on the training text (ARPA format).
subprocess.run(f'{KENLM_BUILD_PATH}/bin/lmplz -o 5 < train/in.txt > model.arpa',
               shell=True, check=True)

# Compile the ARPA model into KenLM's binary format for fast loading.
subprocess.run(f'{KENLM_BUILD_PATH}/bin/build_binary model.arpa model.binary',
               shell=True, check=True)
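After training, the binary model can be sanity-checked from Python. A minimal sketch, assuming the kenlm Python bindings are installed and model.binary was produced by the script above:

import kenlm

m = kenlm.Model('model.binary')
print(m.order)                      # n-gram order, 5 here
print(m.score('a test sentence'))  # log10 probability of the sentence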
predict.py (new file, 79 lines)
@@ -0,0 +1,79 @@
import csv

import kenlm


def predict_probability(sentence):
    # Thin wrapper around the global KenLM model (unused by the code below).
    return model.score(sentence)


def load_candidate_words(file_path):
    # One candidate word per line; a set avoids duplicates.
    with open(file_path, 'r', encoding='utf-8') as file:
        candidate_words = {line.strip() for line in file}
    return candidate_words


def predict_word_between(text1, text2, model, candidate_words):
    # Score "text1 <word> text2" for every candidate and return the
    # word with the highest KenLM log-probability.
    max_prob = float("-inf")
    best_word = None

    for word in candidate_words:
        sentence = f"{text1} {word} {text2}"
        prob = model.score(sentence)

        if prob > max_prob:
            max_prob = prob
            best_word = word

    return best_word


dev = []
test = []

with open('dev-0/in_1.csv', 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file, delimiter=',')
    for row in reader:
        dev.append(row)

with open('test-A/in_1.csv', 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file, delimiter=',')
    for row in reader:
        test.append(row)

model_path = "model.binary"
model = kenlm.Model(model_path)

candidate_words_file = "words_3.txt"
candidate_words = load_candidate_words(candidate_words_file)

predicted_dev = []
predicted_test = []

i = 0
for row in dev:
    text1 = row[0]
    text2 = row[1]
    predicted_word = predict_word_between(text1, text2, model, candidate_words)
    predicted_dev.append(predicted_word)
    if i % 500 == 0:
        print(f'{i / len(dev) * 100:.1f}%')
    i += 1

with open('dev-0/out.tsv', 'w', newline='') as tsv_file:
    tsv_writer = csv.writer(tsv_file, delimiter='\t')
    for word in predicted_dev:
        # writerow expects an iterable of fields; wrapping the word in a
        # list writes it as one column instead of one field per character.
        tsv_writer.writerow([word])

i = 0
for row in test:
    text1 = row[0]
    text2 = row[1]
    predicted_word = predict_word_between(text1, text2, model, candidate_words)
    predicted_test.append(predicted_word)
    if i % 500 == 0:
        print(f'{i / len(test) * 100:.1f}%')
    i += 1

with open('test-A/out.tsv', 'w', newline='') as tsv_file:
    tsv_writer = csv.writer(tsv_file, delimiter='\t')
    for word in predicted_test:
        tsv_writer.writerow([word])
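A minimal usage sketch of the selection routine above, with hypothetical contexts (model.binary and words_3.txt as produced by the other scripts):

model = kenlm.Model('model.binary')
candidates = load_candidate_words('words_3.txt')
# Returns whichever candidate makes the full sentence most probable.
print(predict_word_between('the cat sat on the', 'and purred', model, candidates))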
prep_data.py (new file, 149 lines)
@@ -0,0 +1,149 @@
import csv
import re

from gensim.models import Word2Vec
import gensim.downloader as api
import numpy as np
from spellchecker import SpellChecker
import pandas as pd


folder = 'test-A'
filename = f"{folder}/in_1.csv"

# Read the tab-separated challenge input; QUOTE_NONE keeps embedded quotes intact.
data = pd.read_csv(f'{folder}/in.tsv', delimiter='\t', header=None, encoding='utf-8',
                   quoting=csv.QUOTE_NONE, engine='python').values.tolist()

data_a = []
data_b = []
data_pair = []

# Columns 6 and 7 hold the text before and after the gap; some rows lack column 7.
for i in range(len(data)):
    data_a.append(data[i][6])
    try:
        data_b.append(data[i][7])
    except IndexError:
        data_b.append('')

for i in range(len(data)):
    data_pair.append([data_a[i], data_b[i]])

data_tabs = []

# Un-escape literal \t and \n sequences and strip surrounding brackets.
for x, y in data_pair:
    cleaned_text_a = x.replace('\\t', '\t').replace('\\n', '\n').strip("[]")
    cleaned_text_b = y.replace('\\t', '\t').replace('\\n', '\n').strip("[]")
    data_tabs.append([cleaned_text_a, cleaned_text_b])


def clean_text(t):
    # Join hyphenated line breaks, drop leftover newlines and hyphens,
    # remove non-alphanumeric characters, and collapse runs of whitespace.
    t = re.sub(r'(?<!-)\n', ' ', t)
    t = re.sub(r'[\n-]', '', t)
    t = re.sub(r'[^a-zA-Z0-9\s]', '', t)
    t = re.sub(r'\s+', ' ', t)
    return t


data_removed = [[clean_text(x), clean_text(y)] for x, y in data_tabs]

model = api.load("word2vec-google-news-300")


def is_close_to_actual(word, threshold=0.5):
    # A word counts as "close to a real word" if any of its nearest
    # word2vec neighbours exceeds the similarity threshold.
    if word in model:
        similarities = model.similar_by_word(word)
        return any(similarity > threshold for _, similarity in similarities)
    else:
        return False


def remove_words(text, words_to_destroy):
    # Delete whole-word, case-insensitive matches of the given words.
    if not words_to_destroy:
        return text
    pattern = r'\b(?:{})\b'.format('|'.join(words_to_destroy))
    return re.sub(pattern, '', text, flags=re.IGNORECASE)


spell = SpellChecker()

data_cleared = []

i = 0
for x, y in data_removed:
    words = x.split()
    words_2 = y.split()

    # spell.unknown() returns the words the dictionary does not recognise.
    misspelled = spell.unknown(words + words_2)

    text = remove_words(x, list(misspelled))
    text_2 = remove_words(y, list(misspelled))

    data_cleared.append([text, text_2])

    if i % 20000 == 0:
        print(f'{i / 430000 * 100}%')  # hard-coded row-count estimate for progress display
    i += 1

# Re-run the regex cleanup to normalise whitespace left behind by the word removal.
data_cleared_2 = [[clean_text(x), clean_text(y)] for x, y in data_cleared]

with open(filename, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(data_cleared_2)

# Abandoned spell-correction experiment, kept commented out:
"""import wordninja

from spellchecker import SpellChecker

spell = SpellChecker()

concatenated_misspelled = []

for x, y in data_removed:
    words = x.split()
    words_2 = y.split()

    misspelled = spell.unknown(words + words_2)

    concatenated_misspelled.append(list(misspelled))

data_corrected = []

i = 0
for x, y in data_removed:
    text = x
    text_2 = y

    for word in flattened_concatenated_misspelled:
        if is_close_to_actual(word, model):
            corrected_word = spell.correction(word)
            if corrected_word is not None:
                text = text.replace(word, corrected_word)
                text_2 = text_2.replace(word, corrected_word)
        else:
            if len(word) > 6:
                tokens = wordninja.split(word)
                my_string = ' '.join(tokens)
                text = text.replace(word, my_string)
                text_2 = text_2.replace(word, my_string)
            else:
                text = text.replace(word, '')
                text_2 = text_2.replace(word, '')

    if i % 20000 == 0:
        print(f'{i / 430000 * 100}%')
    i += 1

    data_corrected.append([text, text_2])"""
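A minimal sketch of the spell-filter step above, on a hypothetical input string (assumes the pyspellchecker package, as imported in prep_data.py):

spell = SpellChecker()
sample = 'the qick brown fox'
bad = spell.unknown(sample.split())     # likely {'qick'}
print(remove_words(sample, list(bad)))  # the unknown word is dropped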
prep_txt.py (new file, 33 lines)
@@ -0,0 +1,33 @@
import csv

tr = []
tr_r = []

folder = 'dev-0'

# Cleaned left/right contexts produced by prep_data.py.
with open(f'{folder}/in_1.csv', 'r', encoding='utf-8') as file:
    csv_reader = csv.reader(file, delimiter=',')
    for row in csv_reader:
        tr.append(row)

# Gold answers: the expected gap words.
with open(f'{folder}/expected.tsv', 'r', encoding='utf-8') as file:
    csv_reader = csv.reader(file, delimiter='\t')
    for row in csv_reader:
        tr_r.append(row)

data = []

# Stitch each example back together as [left context, expected word, right context];
# rows missing the right-hand column get an empty string instead.
for i in range(len(tr)):
    try:
        data.append([tr[i][0], tr_r[i], tr[i][1]])
    except IndexError:
        try:
            data.append([tr[i][0], tr_r[i], ''])
        except IndexError:
            pass

with open(f'{folder}/in.txt', 'w', encoding='utf-8') as f:
    for item in data:
        f.write(str(item[0]) + ' ' + str(item[1][0]) + ' ' + str(item[2]) + '\n')
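Why item[1][0] in the final write: csv.reader yields each expected.tsv row as a list, so the gold word is its first (and only) field. A sketch with hypothetical rows:

row_left = ['the cat sat', 'the mat']  # one row of in_1.csv
row_gold = ['on']                      # matching row of expected.tsv
print(row_left[0] + ' ' + row_gold[0] + ' ' + row_left[1])
# -> 'the cat sat on the mat'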
test-A/out.tsv (new file, 7414 lines)
(file diff suppressed because it is too large)