# Synthetic error generation for Polish text (parallel clean/corrupted corpus builder).
import random
|
|
import morfeusz2
|
|
import spacy
|
|
from tokenizer import Tokenizer
|
|
from scipy.stats import norm
|
|
import regex as re
|
|
import sys
|
|
from collections import OrderedDict
|
|
|
|
|
|
class SyntheticErrorsGenerator:
    """Generate synthetic spelling and word-form errors for Polish text.

    Produces a parallel corpus: for each input sentence, a cleaned "correct"
    version and a corrupted version with character-level typos, diacritic
    misspellings, and morphological substitutions (via Morfeusz).
    """

    def __init__(self):
        # Threshold for morphological substitution vs. character error
        # (compared against a uniform draw in introduce_error).
        self.substitution_prob = 0.7
        # Threshold below which a selected word only gets a diacritic
        # spelling error instead of a substitution.
        self.remain_prob = 0.4
        # NOTE(review): the loaded spaCy pipeline is discarded — this call
        # only verifies 'pl_core_news_lg' is installed (and is expensive).
        # Confirm whether it is still needed.
        spacy.load('pl_core_news_lg')
        self.morf = morfeusz2.Morfeusz()
        self.tokenizer = Tokenizer()

    def read_input_file(self, input_filename):
        """Lazily yield lines from *input_filename* (UTF-8)."""
        with open(input_filename, encoding="utf-8", mode='r') as source:
            yield from source

    def remove_unused_whitespaces(self, text):
        """Normalize whitespace/punctuation artifacts in *text* and return it.

        Strips runs of stray punctuation, removes characters outside a small
        allowed set, collapses repeated whitespace, and tightens spacing
        around punctuation and brackets.
        """
        new_text = re.sub(r'(?<=[^!.,>$%&-][!.,>$%&-])[!.,>$%& -]+(?<! )', '', text)
        new_text = re.sub(r'[^\w\s?.!,:;()[\]]', '', new_text)
        new_text = re.sub(r'\s\s+', ' ', new_text)
        # No space before closing punctuation...
        new_text = re.sub(r'\s+([?.!,:;\])}”])', r'\1', new_text)
        # ...and none after opening brackets/quotes.
        new_text = re.sub(r'([\[({„])\s+', r'\1', new_text)
        new_text = re.sub(r'_ ', '', new_text)
        new_text = re.sub(r',, ', '', new_text)
        return new_text

    # --- character-level mutation helpers ---------------------------------

    def delete_character(self, str, idx):
        """Return *str* with the character at *idx* removed."""
        return str[:idx] + str[idx+1:]

    def swap_characters(self, str, idx):
        """Return *str* with the character at *idx* swapped with a neighbour.

        Swaps with the following character; when *idx* is the last position,
        swaps with the preceding one.  (Bug fix: the original last-position
        branch assigned the pair to itself — a no-op.)
        """
        strlst = list(str)
        if idx == len(str) - 1:
            if idx > 0:  # a single-character string has nothing to swap
                strlst[idx-1], strlst[idx] = strlst[idx], strlst[idx-1]
        else:
            strlst[idx], strlst[idx+1] = strlst[idx+1], strlst[idx]
        return "".join(strlst)

    def duplicate_character(self, str, idx):
        """Return *str* with the character at *idx* doubled."""
        return str[:idx] + str[idx] + str[idx:]

    def introduce_spelling_error(self, tokens):
        """Replace one Polish diacritic in *tokens* with a common misspelling.

        All occurrences of one randomly chosen diacritic present in the word
        are replaced (str.replace is global).  Words without any mapped
        letter are returned unchanged.
        """
        errors_matrix = {
            'ą': 'a',
            'ć': 'c',
            'ę': 'e',
            'ł': 'l',
            'ń': 'n',
            "ó": 'u',
            "u": 'ó',
            'ś': 's',
            'ź': 'z',
            'ż': 'z'
        }

        # random.choice is uniform over the candidates, so the original
        # shuffle + OrderedDict was redundant and has been dropped.
        letters_existing_in_word = [letter for letter in errors_matrix
                                    if letter in tokens]

        if letters_existing_in_word:
            letter_to_replace = random.choice(letters_existing_in_word)
            tokens = tokens.replace(letter_to_replace, errors_matrix[letter_to_replace])

        return tokens

    def substitute_delete_add(self, tokens, token_idx, operation):
        """Dispatch *operation* on the word *tokens* at character *token_idx*.

        Returns the mutated word, or None for an unknown operation (matching
        the original fall-through behavior).
        """
        if operation == 'SWAP_CHARACTERS':
            return self.swap_characters(tokens, token_idx)
        elif operation == 'DELETE_CHARACTER':
            return self.delete_character(tokens, token_idx)
        elif operation == 'DUPLICATE_CHARACTER':
            return self.duplicate_character(tokens, token_idx)
        elif operation == 'SPELLING_ERROR':
            return self.introduce_spelling_error(tokens)

    def introduce_character_error(self, tokens, word, idx):
        """Apply one random character-level error to ``tokens[idx]``.

        Bug fix: *tokens* is now returned unconditionally — the original
        returned None (clobbering the caller's token list) when *word* was
        empty.
        """
        if len(word) >= 1:
            random_operation = random.choice(
                ['DELETE_CHARACTER', 'SWAP_CHARACTERS', 'DUPLICATE_CHARACTER', 'SPELLING_ERROR'])
            random_idx = random.randrange(len(word))
            tokens[idx] = self.substitute_delete_add(word, random_idx, random_operation)
        return tokens

    def introduce_error(self, line):
        """Return *line* with synthetic errors introduced, space-joined."""
        tokens = self.tokenizer.tokenize(line)
        # NOTE(review): norm.mean(0.15, 0.2) and norm.std(0.15, 0.2) return
        # the distribution constants 0.15 and 0.2, so this is a fixed 3% of
        # the CHARACTER count (len(line), not len(tokens)).  A random draw
        # (norm.rvs) was presumably intended — kept as-is to preserve output.
        num_words_to_change = round(abs(norm.mean(0.15, 0.2) * norm.std(0.15, 0.2)) * len(line))
        # Sample from a sorted list: random.sample() on a set was removed in
        # Python 3.11, and sorting keeps the candidate order deterministic.
        unique_tokens = sorted(set(tokens))
        if num_words_to_change > len(unique_tokens):
            num_words_to_change = 1
        words_to_change = random.sample(unique_tokens, num_words_to_change)
        num_words_to_change_letters = round(len(tokens) * 0.1)

        # Indices of tokens NOT scheduled for substitution.  (Bug fix: the
        # original used tokens.index(word), which repeats the first
        # occurrence's index for duplicated tokens.)
        words_for_spelling_errors = [i for i, word in enumerate(tokens)
                                     if word not in words_to_change]
        if len(words_for_spelling_errors) >= num_words_to_change_letters:
            for idx in random.sample(words_for_spelling_errors, num_words_to_change_letters):
                word = tokens[idx]
                if word.isalnum():
                    tokens = self.introduce_character_error(tokens, word, idx)

        for word_to_change in words_to_change:
            idx = tokens.index(word_to_change)
            random_number = random.random()
            if random_number <= self.remain_prob:
                # Diacritic misspelling only; the word otherwise remains.
                tokens[idx] = self.introduce_spelling_error(tokens[idx])
            elif random_number <= self.substitution_prob:
                try:
                    # analyse() entries are (start, end, (orth, lemma, tag, ...));
                    # take the lemma without its Morfeusz qualifier suffix.
                    basic_form = self.morf.analyse(word_to_change)[0][2][1].split(":")[0]
                    forms_to_choose_from = self.morf.generate(basic_form)
                    if len(forms_to_choose_from) > 0:
                        choice = random.choice(forms_to_choose_from)[0]
                        if idx == 0:
                            # Preserve sentence-initial capitalization.
                            choice = choice[0].upper() + choice[1:]
                        tokens[idx] = choice
                    else:
                        tokens = self.introduce_character_error(tokens, tokens[idx], idx)
                except Exception:
                    # Morfeusz could not analyse/generate this word;
                    # degrade to a character-level error instead.
                    print('Form not found')
                    tokens = self.introduce_character_error(tokens, tokens[idx], idx)

        return ' '.join(tokens)

    def generate_synthetic_errors_from_file(self, source_filename, input_filename, output_filename):
        """Build a parallel corpus from *source_filename*.

        Writes the cleaned original sentences to *input_filename* and their
        corrupted counterparts to *output_filename*, one per line.  Only
        lines with more than 7 whitespace-separated words are used.
        """
        with open(input_filename, encoding="utf-8", mode="w") as clean_file, \
                open(output_filename, encoding="utf-8", mode="w") as error_file:
            for line in self.read_input_file(source_filename):
                if len(line.split()) > 7:
                    new_line = line.strip()
                    # Upper-case the first character only.
                    new_line = new_line[0].capitalize() + new_line[1:]
                    new_line_with_error = self.introduce_error(new_line)
                    clean_file.write(self.remove_unused_whitespaces(new_line).strip() + "\n")
                    error_file.write(self.remove_unused_whitespaces(new_line_with_error).strip() + "\n")
if __name__ == "__main__":
    # Script entry point, guarded so importing this module does not trigger
    # corpus generation (the original ran unconditionally at import time).
    # Usage: python <script> <source_corpus> <clean_output> <errors_output>
    synthetic_errors_generator = SyntheticErrorsGenerator()
    synthetic_errors_generator.generate_synthetic_errors_from_file(sys.argv[1], sys.argv[2], sys.argv[3])