From d4d77d4f178ada552c2fae764038bd72ff8a6c5d Mon Sep 17 00:00:00 2001
From: Wojciech Jarmosz
Date: Sun, 24 Apr 2022 20:48:00 +0200
Subject: [PATCH] Add synthetic data generator

---
 __init__.py        |   0
 generate_errors.py | 168 +++++++++++++++++++++++++++++++++++++++++++++
 install.sh         |  14 ++++
 requirements.txt   |  77 +++++++++++++++++++++
 tokenizer.py       |   8 +++
 5 files changed, 267 insertions(+)
 create mode 100644 __init__.py
 create mode 100644 generate_errors.py
 create mode 100644 install.sh
 create mode 100644 requirements.txt
 create mode 100644 tokenizer.py

diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/generate_errors.py b/generate_errors.py
new file mode 100644
index 0000000..d19a29d
--- /dev/null
+++ b/generate_errors.py
@@ -0,0 +1,168 @@
+import random
+import hunspell
+import spacy
+from tokenizer import Tokenizer
+from scipy.stats import norm
+import pandas as pd
+import regex as re
+import glob
+import threading
+
+# pip install hunspell spacy
+# python -m spacy download pl_core_news_lg
+# sed -ri '/^\s*$/d' ./datasets_original/oscar/oscar_pl_test.txt
+# unzip -p oscar_filtered.zip | sed -r '/^\s*$/d' | gawk 'NF>6' > oscar_filtered.txt
+
+
+class SyntheticErrorsGenerator:
+    def __init__(self):
+        self.substitution_prob = 0.7
+        self.remain_prob = 0.3
+        self.input_dataframe = pd.DataFrame([], columns=['sentence'])
+        self.output_dataframe = pd.DataFrame([], columns=['sentence'])
+        spacy.load('pl_core_news_lg')
+        self.spellchecker = hunspell.HunSpell('./pl.dic', './pl.aff')
+        self.tokenizer = Tokenizer()
+
+    def read_input_file(self, input_filename):
+        with open(input_filename, encoding="utf-8", mode='r') as input:
+            yield from input.readlines()
+
+    def delete_character(self, str, idx):
+        return str[:idx] + str[idx+1:]
+
+    def delete(self, tokens, idx):
+        tokens.pop(idx)
+        return tokens
+
+    def swap_characters(self, str, idx):
+        strlst = list(str)
+        if not (len(str) - 1) == idx:
+            strlst[idx], strlst[idx+1] = strlst[idx+1], strlst[idx]
+            return "".join(strlst)
+        else:
+            strlst[idx-1], strlst[idx] = strlst[idx], strlst[idx-1]
+            return "".join(strlst)
+
+    def spelling_error(self, tokens):
+        errors_matrix = {
+            'ą': 'a',
+            'ć': 'c',
+            'ę': 'e',
+            'ł': 'l',
+            'ń': 'n',
+            "ó": 'u',
+            "u": 'ó',
+            'ś': 's',
+            'ź': 'z',
+            'ż': 'z'
+        }
+
+        letters_existing_in_word = []
+        for letter, _ in errors_matrix.items():
+            if letter in tokens:
+                letters_existing_in_word.append(letter)
+
+        if len(letters_existing_in_word) > 0:
+            letter_to_replace = random.choice(letters_existing_in_word)
+            tokens = tokens.replace(letter_to_replace, errors_matrix[letter_to_replace])
+
+        return tokens
+
+    def swap(self, tokens, idx):
+        if not (len(tokens) - 1) == idx:
+            tokens[idx], tokens[idx + 1] = tokens[idx + 1], tokens[idx]
+        else:
+            tokens[idx - 1], tokens[idx] = tokens[idx], tokens[idx - 1]
+        return tokens
+
+    def add_random(self, tokens, idx):
+        confusion_set = self.spellchecker.suggest(tokens[idx])[:3]
+        if len(confusion_set) > 1:
+            word_to_replace = random.sample(confusion_set, 1)[0]
+            tokens.insert(idx + 1, word_to_replace)
+        return tokens
+
+    def add_random_character(self, str, idx):
+        confusion_set = self.spellchecker.suggest(str[idx])[:3]
+        if len(confusion_set) > 1:
+            char_to_replace = random.sample(confusion_set, 1)[0].lower()
+            return str[:idx] + char_to_replace + str[idx:]
+        return str
+
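+    # Applies one corruption operation. Token-level operations (DELETE, SWAP,
+    # ADD_RANDOM) receive the full token list; character-level operations
+    # (DELETE_CHARACTER, SWAP_CHARACTERS, ADD_RANDOM_CHARACTER, SPELLING_ERROR) receive a single word.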
+    def substitute_delete_add(self, tokens, token_idx, operation):
+        if operation == 'DELETE':
+            return self.delete(tokens, token_idx)
+        elif operation == 'SWAP':
+            return self.swap(tokens, token_idx)
+        elif operation == 'ADD_RANDOM':
+            return self.add_random(tokens, token_idx)
+        elif operation == 'SWAP_CHARACTERS':
+            return self.swap_characters(tokens, token_idx)
+        elif operation == 'DELETE_CHARACTER':
+            return self.delete_character(tokens, token_idx)
+        elif operation == 'ADD_RANDOM_CHARACTER':
+            return self.add_random_character(tokens, token_idx)
+        elif operation == 'SPELLING_ERROR':
+            return self.spelling_error(tokens)
+
+    def introduce_error(self, line):
+        tokens = self.tokenizer.tokenize(line)
+        num_words_to_change = round(abs(norm.mean(0.15, 0.2) * norm.std(0.15, 0.2)) * len(line))
+        if num_words_to_change > len(set(tokens)):
+            num_words_to_change = 1
+        words_to_change = random.sample(list(set(tokens)), num_words_to_change)
+        num_words_to_change_letters = round(len(tokens) * 0.1)
+
+        words_for_spelling_errors = [tokens.index(word) for word in tokens if word not in words_to_change]
+        for idx in random.sample(words_for_spelling_errors, min(num_words_to_change_letters, len(words_for_spelling_errors))):
+            word = tokens[idx]
+            if word.isalnum():
+                random_number = random.random()
+                random_operation = random.sample(['DELETE_CHARACTER', 'SWAP_CHARACTERS', 'ADD_RANDOM_CHARACTER', 'SPELLING_ERROR'], 1)[0]
+                random_idx = random.sample(range(0, len(word)), 1)[0]
+                tokens[idx] = self.substitute_delete_add(word, random_idx, random_operation)
+
+        for word_to_change in words_to_change:
+            idx = tokens.index(word_to_change)
+            random_number = random.random()
+            random_operation = random.sample(['DELETE', 'SWAP', 'ADD_RANDOM'], 1)[0]
+            if random_number < self.remain_prob:
+                tokens = self.substitute_delete_add(tokens, idx, random_operation)
+            elif random_number < self.substitution_prob:
+                word_to_replace = ''
+                confusion_set = self.spellchecker.suggest(word_to_change)[:3]
+                if len(confusion_set) > 1:
+                    word_to_replace = random.sample(confusion_set, 1)[0]
+                    while word_to_replace == word_to_change:
+                        word_to_replace = random.sample(confusion_set, 1)[0]
+                    tokens[idx] = word_to_replace
+            else:
+                tokens = self.substitute_delete_add(tokens, idx, random_operation)
+        return ' '.join(tokens)
+
+    def generate_synthetic_errors_from_folder(self, folder_path):
+        for idx, path in enumerate(glob.glob(folder_path)[:11]):
+            t = threading.Thread(target=self.generate_synthetic_errors_from_file, args=(path, f'./datasets_original/oscar/splitted_oscar/input{idx}.txt', f'./datasets_original/oscar/splitted_oscar/output{idx}.txt'))
+            t.start()
+
+
+    def generate_synthetic_errors_from_file(self, source_filename, input_filename, output_filename):
+        with open(input_filename, encoding="utf-8", mode="w") as input:
+            with open(output_filename, encoding="utf-8", mode="w") as output:
+                for line in self.read_input_file(source_filename):
+                    if len(line.split()) > 7:
+                        new_line = line.strip()
+                        new_line = new_line[0].capitalize() + new_line[1:]
+                        new_line_with_error = self.introduce_error(new_line)
+                        input.write(new_line + "\n")
+                        output.write(new_line_with_error + "\n")
+
+
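+# Example run: corrupt one pre-split OSCAR chunk, writing the clean sentences to
+# the results "input" file and their synthetically corrupted versions to "output".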
+synthetic_errors_generator = SyntheticErrorsGenerator()
+synthetic_errors_generator.generate_synthetic_errors_from_file('./datasets_original/oscar/splitted_oscar/output_fileaa', './datasets_original/oscar/splitted_oscar/results/input1.txt', './datasets_original/oscar/splitted_oscar/results/output1.txt')
+
+# synthetic_errors_generator.generate_synthetic_errors_from_folder(folder_path='./datasets_original/oscar/splitted_oscar/*')
+# synthetic_errors_generator.generate_synthetic_errors('./datasets_original/oscar/splitted_oscar/output_fileaa', './datasets_original/oscar/splitted_oscar/input1.txt', './datasets_original/oscar/splitted_oscar/output1.txt')
+# synthetic_errors_generator.generate_synthetic_errors('./datasets_original/lektury/input2_lektury.txt', './datasets_original/lektury/input2.txt', './datasets_original/lektury/output2.txt')
+
diff --git a/install.sh b/install.sh
new file mode 100644
index 0000000..cc1076b
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+apt update -y
+apt upgrade -y
+
+apt-get install -y python3 python3-pip python3-venv unzip gawk
+python3 -m venv env
+source ./env/bin/activate
+pip install -r requirements.txt
+python -m spacy download pl_core_news_lg
+
+mkdir data && cd data
+wget https://minio.clarin-pl.eu/ermlab/public/PoLitBert/corpus-oscar/corpus_oscar_2020-04-10_64M_lines.zip
+unzip -p corpus_oscar_2020-04-10_64M_lines.zip | sed -r '/^\s*$/d' | gawk 'NF>6' > oscar_filtered.txt
+split -l 1000000 --numeric-suffixes=1 --suffix-length=1 --additional-suffix=".txt" oscar_filtered.txt ""
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..d210a0f
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,77 @@
+alabaster==0.7.12
+arrow==1.2.2
+async-generator==1.10
+attrs==21.4.0
+Babel==2.9.1
+beautifulsoup4==4.11.1
+bibtexparser==1.2.0
+blis==0.7.7
+catalogue==2.0.7
+certifi==2021.10.8
+charset-normalizer==2.0.12
+click==8.0.4
+cymem==2.0.6
+Deprecated==1.2.13
+docutils==0.17.1
+fake-useragent==0.1.11
+free-proxy==1.0.6
+future==0.18.2
+h11==0.13.0
+hunspell==0.5.5
+idna==3.3
+imagesize==1.3.0
+importlib-metadata==4.11.3
+Jinja2==3.1.1
+langcodes==3.3.0
+lxml==4.8.0
+MarkupSafe==2.1.1
+morfeusz2==1.99.4
+murmurhash==1.0.6
+numpy==1.22.3
+outcome==1.1.0
+packaging==21.3
+pathy==0.6.1
+# pl-core-news-lg==3.2.0 (not on PyPI; installed via "python -m spacy download pl_core_news_lg")
+preshed==3.0.6
+pydantic==1.8.2
+Pygments==2.11.2
+pyMorfologik==0.2.2
+pyparsing==3.0.7
+PySocks==1.7.1
+python-dateutil==2.8.2
+python-dotenv==0.20.0
+pytz==2022.1
+regex==2022.3.15
+requests==2.27.1
+scholarly==1.6.0
+scipy==1.8.0
+selenium==4.1.3
+six==1.16.0
+smart-open==5.2.1
+sniffio==1.2.0
+snowballstemmer==2.2.0
+sortedcontainers==2.4.0
+soupsieve==2.3.2
+spacy==3.2.4
+spacy-legacy==3.0.9
+spacy-loggers==1.0.2
+Sphinx==4.5.0
+sphinx-rtd-theme==1.0.0
+sphinxcontrib-applehelp==1.0.2
+sphinxcontrib-devhelp==1.0.2
+sphinxcontrib-htmlhelp==2.0.0
+sphinxcontrib-jsmath==1.0.1
+sphinxcontrib-qthelp==1.0.3
+sphinxcontrib-serializinghtml==1.1.5
+srsly==2.4.2
+thinc==8.0.15
+tqdm==4.64.0
+trio==0.20.0
+trio-websocket==0.9.2
+typer==0.4.1
+typing-extensions==4.1.1
+urllib3==1.26.9
+wasabi==0.9.1
+wrapt==1.14.0
+wsproto==1.1.0
+zipp==3.8.0
\ No newline at end of file
diff --git a/tokenizer.py b/tokenizer.py
new file mode 100644
index 0000000..40e4575
--- /dev/null
+++ b/tokenizer.py
@@ -0,0 +1,8 @@
+import spacy
+
+class Tokenizer:
+    def __init__(self):
+        self.polish_tokenizer = spacy.load('pl_core_news_lg')
+
+    def tokenize(self, text):
+        return [tok.text for tok in self.polish_tokenizer.tokenizer(text)]