synthetic_errors/generate_errors.py
2022-04-26 19:24:23 +02:00

152 lines
6.3 KiB
Python

import random
import morfeusz2
import spacy
from tokenizer import Tokenizer
from scipy.stats import norm
import regex as re
import sys
from collections import OrderedDict
class SyntheticErrorsGenerator:
    """Corrupt sentences with synthetic character-level and inflection errors.

    For every sentence a few tokens receive a random character-level edit
    (delete / swap / duplicate / diacritic confusion) and a few other tokens
    are either given a diacritic confusion or replaced by another form
    produced by Morfeusz for the same lemma.
    """

    # Letter confusions applied by ``introduce_spelling_error``
    # (Polish diacritics dropped, plus the "ó"/"u" confusion in both directions).
    _SPELLING_CONFUSIONS = {
        'ą': 'a',
        'ć': 'c',
        'ę': 'e',
        'ł': 'l',
        'ń': 'n',
        "ó": 'u',
        "u": 'ó',
        'ś': 's',
        'ź': 'z',
        'ż': 'z',
    }

    # Operations ``introduce_character_error`` picks from uniformly.
    _CHARACTER_OPERATIONS = (
        'DELETE_CHARACTER',
        'SWAP_CHARACTERS',
        'DUPLICATE_CHARACTER',
        'SPELLING_ERROR',
    )

    # (pattern, replacement) pairs applied in this exact order by
    # ``remove_unused_whitespaces``; the order matters because later rules
    # operate on the output of earlier ones.
    _CLEANUP_RULES = (
        # Drop a run of punctuation/spaces that directly follows a punctuation
        # mark (the first mark is kept, a trailing space is kept).
        (r'(?<=[^!.,>$%&-][!.,>$%&-])[!.,>$%& -]+(?<! )', ''),
        # Drop everything except word characters, whitespace and ?.!,:;()[]
        (r'[^\w\s?.!,:;()[\]]', ''),
        # Collapse whitespace runs into a single space.
        (r'\s\s+', ' '),
        # No whitespace before closing punctuation.
        (r'\s+([?.!,:;\])}”])', r'\1'),
        # No whitespace after an opening bracket/quote.
        (r'([\[({„])\s+', r'\1'),
        (r'_ ', ''),
        (r',, ', ''),
    )

    def __init__(self):
        """Set the branch probabilities and load the external NLP resources."""
        # A random draw <= remain_prob -> diacritic confusion;
        # remain_prob < draw <= substitution_prob -> inflected-form swap;
        # anything above substitution_prob leaves the selected word untouched.
        self.substitution_prob = 0.7
        self.remain_prob = 0.4
        # NOTE(review): the loaded pipeline is discarded and no method of this
        # class uses spaCy; kept only to preserve the original start-up
        # behaviour (it fails early when the model is missing) - consider removing.
        spacy.load('pl_core_news_lg')
        self.morf = morfeusz2.Morfeusz()
        self.tokenizer = Tokenizer()

    def read_input_file(self, input_filename):
        """Lazily yield the lines (newline included) of a UTF-8 text file."""
        with open(input_filename, encoding="utf-8", mode='r') as source_file:
            yield from source_file

    def remove_unused_whitespaces(self, text):
        """Normalise punctuation and spacing by applying ``_CLEANUP_RULES`` in order.

        Returns the cleaned copy of ``text``.
        """
        new_text = text
        for pattern, replacement in self._CLEANUP_RULES:
            new_text = re.sub(pattern, replacement, new_text)
        return new_text

    # Functions to modify characters in words
    # (the parameter name ``str`` shadows the builtin; it is kept so that
    # existing keyword callers keep working)
    def delete_character(self, str, idx):
        """Return ``str`` without the character at position ``idx``."""
        return str[:idx] + str[idx+1:]

    def swap_characters(self, str, idx):
        """Return ``str`` with the character at ``idx`` swapped with a neighbour.

        The right-hand neighbour is used, except for the last character, which
        is swapped with the one before it. Strings shorter than two characters
        are returned unchanged.
        """
        chars = list(str)
        if len(chars) < 2:
            return str
        if idx == len(chars) - 1:
            # Last position has no right neighbour: swap with the previous one.
            # (The original assigned the pair to itself, so nothing changed.)
            chars[idx - 1], chars[idx] = chars[idx], chars[idx - 1]
        else:
            chars[idx], chars[idx + 1] = chars[idx + 1], chars[idx]
        return "".join(chars)

    def introduce_spelling_error(self, tokens):
        """Replace every occurrence of one randomly chosen confusable letter.

        ``tokens`` is a single word here (the name is kept for compatibility).
        The letter is picked uniformly among the keys of
        ``_SPELLING_CONFUSIONS`` present in the word; the word is returned
        unchanged when none is present.
        """
        present = [letter for letter in self._SPELLING_CONFUSIONS if letter in tokens]
        if not present:
            return tokens
        letter_to_replace = random.choice(present)
        return tokens.replace(letter_to_replace, self._SPELLING_CONFUSIONS[letter_to_replace])

    def duplicate_character(self, str, idx):
        """Return ``str`` with the character at ``idx`` doubled ('' stays '')."""
        if not str:
            return str
        return str[:idx] + str[idx] + str[idx:]

    def substitute_delete_add(self, tokens, token_idx, operation):
        """Apply the named character operation to the word ``tokens``.

        ``operation`` is one of 'SWAP_CHARACTERS', 'DELETE_CHARACTER',
        'DUPLICATE_CHARACTER' or 'SPELLING_ERROR'; ``token_idx`` is the
        character position and is ignored by 'SPELLING_ERROR'.

        Raises:
            ValueError: for any other operation name (the original silently
                returned ``None``, which later broke ``' '.join``).
        """
        if operation == 'SPELLING_ERROR':
            return self.introduce_spelling_error(tokens)
        positional_operations = {
            'SWAP_CHARACTERS': self.swap_characters,
            'DELETE_CHARACTER': self.delete_character,
            'DUPLICATE_CHARACTER': self.duplicate_character,
        }
        try:
            handler = positional_operations[operation]
        except KeyError:
            raise ValueError(f"Unknown operation: {operation!r}") from None
        return handler(tokens, token_idx)

    def introduce_character_error(self, tokens, word, idx):
        """Overwrite ``tokens[idx]`` with a randomly corrupted copy of ``word``.

        The operation and the character position are drawn uniformly. An empty
        ``word`` leaves the list untouched. The list is modified in place and
        also returned.
        """
        if word:
            random_operation = random.choice(self._CHARACTER_OPERATIONS)
            random_idx = random.randrange(len(word))
            tokens[idx] = self.substitute_delete_add(word, random_idx, random_operation)
        return tokens

    def _replace_with_inflected_form(self, tokens, word, idx):
        """Swap ``tokens[idx]`` for a random Morfeusz form of the lemma of ``word``.

        Falls back to a random character error when no form can be produced.
        """
        try:
            # Presumably analyse() yields (start, end, (orth, lemma, tag, ...))
            # and the part before ':' is the bare lemma - confirm against the
            # Morfeusz 2 API. generate() entries carry the surface form first.
            basic_form = self.morf.analyse(word)[0][2][1].split(":")[0]
            forms_to_choose_from = self.morf.generate(basic_form)
            choice = random.choice(forms_to_choose_from)[0] if forms_to_choose_from else ''
        except Exception:
            # Best effort, as in the original: the failure modes of the
            # analyser are not visible from here, so any error means "no form".
            print('Form not found')
            choice = ''
        if not choice:
            return self.introduce_character_error(tokens, tokens[idx], idx)
        if idx == 0:
            # Keep the sentence-initial capital letter.
            choice = choice[0].upper() + choice[1:]
        tokens[idx] = choice
        return tokens

    def introduce_error(self, line):
        """Return ``line`` re-joined with spaces after corrupting some tokens.

        Two disjoint groups of tokens are modified:

        * about 10% of the tokens (alphanumeric ones only) get a random
          character-level error;
        * a second group of distinct words gets, depending on a random draw,
          a diacritic confusion, another inflected form, or no change at all.
        """
        tokens = self.tokenizer.tokenize(line)
        if not tokens:
            # Nothing to corrupt; the original crashed sampling from an empty set.
            return ' '.join(tokens)

        # NOTE(review): norm.mean(0.15, 0.2) * norm.std(0.15, 0.2) is a constant
        # (about 0.15 * 0.2), and it is scaled by the number of *characters* in
        # the line, not the number of tokens. Possibly a random variate was
        # intended - kept as is to preserve the current output distribution.
        num_words_to_change = round(abs(norm.mean(0.15, 0.2) * norm.std(0.15, 0.2)) * len(line))

        # Order-preserving de-duplication: random.sample() no longer accepts a
        # set (TypeError since Python 3.11), and a list keeps seeded runs stable.
        unique_tokens = list(dict.fromkeys(tokens))
        if num_words_to_change > len(unique_tokens):
            num_words_to_change = 1
        words_to_change = random.sample(unique_tokens, num_words_to_change)
        selected_words = set(words_to_change)

        # Remember where each word first occurs *before* anything is mutated,
        # so later replacements cannot redirect the lookup to another token.
        first_position = {}
        for position, token in enumerate(tokens):
            first_position.setdefault(token, position)

        num_words_to_change_letters = round(len(tokens) * 0.1)
        # Positions of every token outside the selected group (enumerate gives
        # each duplicate its own index, unlike list.index()).
        words_for_spelling_errors = [
            position for position, token in enumerate(tokens)
            if token not in selected_words
        ]
        if len(words_for_spelling_errors) >= num_words_to_change_letters:
            for idx in random.sample(words_for_spelling_errors, num_words_to_change_letters):
                word = tokens[idx]
                if word.isalnum():
                    tokens = self.introduce_character_error(tokens, word, idx)

        for word_to_change in words_to_change:
            idx = first_position[word_to_change]
            random_number = random.random()
            if random_number <= self.remain_prob:
                # The character index is irrelevant for 'SPELLING_ERROR'.
                tokens[idx] = self.substitute_delete_add(tokens[idx], 0, 'SPELLING_ERROR')
            elif random_number <= self.substitution_prob:
                tokens = self._replace_with_inflected_form(tokens, word_to_change, idx)
            # Otherwise the word is deliberately left unchanged.
        return ' '.join(tokens)

    def generate_synthetic_errors_from_file(self, source_filename, input_filename, output_filename):
        """Write parallel clean / corrupted files from ``source_filename``.

        Only lines with more than 7 whitespace-separated words are used. Each
        is stripped and its first character capitalised; the cleaned original
        is written to ``input_filename`` and the cleaned corrupted version to
        ``output_filename``, one sentence per line. Both files are overwritten.

        NOTE(review): the clean text goes to the "input" file and the noisy
        text to the "output" file - confirm this matches the consumer.
        """
        with open(input_filename, encoding="utf-8", mode="w") as clean_file, \
                open(output_filename, encoding="utf-8", mode="w") as noisy_file:
            for line in self.read_input_file(source_filename):
                if len(line.split()) > 7:
                    new_line = line.strip()
                    new_line = new_line[0].capitalize() + new_line[1:]
                    new_line_with_error = self.introduce_error(new_line)
                    clean_file.write(self.remove_unused_whitespaces(new_line).strip() + "\n")
                    noisy_file.write(self.remove_unused_whitespaces(new_line_with_error).strip() + "\n")
# Script entry point: guarded so that importing this module does not start a
# generation run (and does not fail on a missing sys.argv entry).
if __name__ == "__main__":
    if len(sys.argv) < 4:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit(
            "usage: generate_errors.py SOURCE_FILE CLEAN_OUTPUT_FILE ERRONEOUS_OUTPUT_FILE"
        )
    synthetic_errors_generator = SyntheticErrorsGenerator()
    # argv[1]: source sentences; argv[2]: file receiving the cleaned originals;
    # argv[3]: file receiving the sentences with synthetic errors.
    synthetic_errors_generator.generate_synthetic_errors_from_file(sys.argv[1], sys.argv[2], sys.argv[3])