From d4d77d4f178ada552c2fae764038bd72ff8a6c5d Mon Sep 17 00:00:00 2001
From: Wojciech Jarmosz
Date: Sun, 24 Apr 2022 20:48:00 +0200
Subject: [PATCH] Add synthetic data generator

---
 __init__.py        |   0
 generate_errors.py | 168 +++++++++++++++++++++++++++++++++++++++++++++
 install.sh         |  14 ++++
 requirements.txt   |  77 +++++++++++++++++++++
 tokenizer.py       |   8 +++
 5 files changed, 267 insertions(+)
 create mode 100644 __init__.py
 create mode 100644 generate_errors.py
 create mode 100644 install.sh
 create mode 100644 requirements.txt
 create mode 100644 tokenizer.py

diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/generate_errors.py b/generate_errors.py
new file mode 100644
index 0000000..d19a29d
--- /dev/null
+++ b/generate_errors.py
@@ -0,0 +1,168 @@
+import random
+import hunspell
+import spacy
+from tokenizer import Tokenizer
+from scipy.stats import norm
+import pandas as pd
+import regex as re
+import glob
+import threading
+
+# pip install hunspell spacy
+# python -m spacy download pl_core_news_lg
+# sed -ri '/^\s*$/d' ./datasets_original/oscar/oscar_pl_test.txt
+# unzip -p oscar_filtered.zip | sed -r '/^\s*$/d' | gawk 'NF>6' > oscar_filtered.txt
+
+
+class SyntheticErrorsGenerator:
+    def __init__(self):
+        self.substitution_prob = 0.7
+        self.remain_prob = 0.3
+        self.input_dataframe = pd.DataFrame([], columns=['sentence'])
+        self.output_dataframe = pd.DataFrame([], columns=['sentence'])
+        spacy.load('pl_core_news_lg')
+        self.spellchecker = hunspell.HunSpell('./pl.dic', './pl.aff')
+        self.tokenizer = Tokenizer()
+
+    def read_input_file(self, input_filename):
+        with open(input_filename, encoding="utf-8", mode='r') as input:
+            yield from input.readlines()
+
+    def delete_character(self, str, idx):
+        return str[:idx] + str[idx+1:]
+
+    def delete(self, tokens, idx):
+        tokens.pop(idx)
+        return tokens
+
+    def swap_characters(self, str, idx):
+        strlst = list(str)
+        if not (len(str) - 1) == idx:
+            strlst[idx], strlst[idx+1] = strlst[idx+1], strlst[idx]
+            return "".join(strlst)
+        else:
+            strlst[idx-1], strlst[idx] = strlst[idx], strlst[idx-1]
+            return "".join(strlst)
+
+    def spelling_error(self, tokens):
+        errors_matrix = {
+            'ą': 'a',
+            'ć': 'c',
+            'ę': 'e',
+            'ł': 'l',
+            'ń': 'n',
+            "ó": 'u',
+            "u": 'ó',
+            'ś': 's',
+            'ź': 'z',
+            'ż': 'z'
+        }
+
+        letters_existing_in_word = []
+        for letter, _ in errors_matrix.items():
+            if letter in tokens:
+                letters_existing_in_word.append(letter)
+
+        if len(letters_existing_in_word) > 0:
+            letter_to_replace = random.choice(letters_existing_in_word)
+            tokens = tokens.replace(letter_to_replace, errors_matrix[letter_to_replace])
+
+        return tokens
+
+    def swap(self, tokens, idx):
+        if not (len(tokens) - 1) == idx:
+            tokens[idx], tokens[idx + 1] = tokens[idx + 1], tokens[idx]
+        else:
+            tokens[idx - 1], tokens[idx] = tokens[idx], tokens[idx - 1]
+        return tokens
+
+    def add_random(self, tokens, idx):
+        confusion_set = self.spellchecker.suggest(tokens[idx])[:3]
+        if len(confusion_set) > 1:
+            word_to_replace = random.sample(confusion_set, 1)[0]
+            tokens.insert(idx + 1, word_to_replace)
+        return tokens
+
+    def add_random_character(self, str, idx):
+        confusion_set = self.spellchecker.suggest(str[idx])[:3]
+        if len(confusion_set) > 1:
+            char_to_replace = random.sample(confusion_set, 1)[0].lower()
+            return str[:idx] + char_to_replace + str[idx:]
+        return str
+
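+    # Applies one corruption operation. Token-level operations (DELETE, SWAP,
+    # ADD_RANDOM) receive the full token list; character-level operations
+    # (DELETE_CHARACTER, SWAP_CHARACTERS, ADD_RANDOM_CHARACTER, SPELLING_ERROR) receive a single word.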
+    def substitute_delete_add(self, tokens, token_idx, operation):
+        if operation == 'DELETE':
+            return self.delete(tokens, token_idx)
+        elif operation == 'SWAP':
+            return self.swap(tokens, token_idx)
+        elif operation == 'ADD_RANDOM':
+            return self.add_random(tokens, token_idx)
+        elif operation == 'SWAP_CHARACTERS':
+            return self.swap_characters(tokens, token_idx)
+        elif operation == 'DELETE_CHARACTER':
+            return self.delete_character(tokens, token_idx)
+        elif operation == 'ADD_RANDOM_CHARACTER':
+            return self.add_random_character(tokens, token_idx)
+        elif operation == 'SPELLING_ERROR':
+            return self.spelling_error(tokens)
+
+    def introduce_error(self, line):
+        tokens = self.tokenizer.tokenize(line)
+        num_words_to_change = round(abs(norm.mean(0.15, 0.2) * norm.std(0.15, 0.2)) * len(line))
+        if num_words_to_change > len(set(tokens)):
+            num_words_to_change = 1
+        words_to_change = random.sample(list(set(tokens)), num_words_to_change)
+        num_words_to_change_letters = round(len(tokens) * 0.1)
+
+        words_for_spelling_errors = [tokens.index(word) for word in tokens if word not in words_to_change]
+        for idx in random.sample(words_for_spelling_errors, min(num_words_to_change_letters, len(words_for_spelling_errors))):
+            word = tokens[idx]
+            if word.isalnum():
+                random_number = random.random()
+                random_operation = random.sample(['DELETE_CHARACTER', 'SWAP_CHARACTERS', 'ADD_RANDOM_CHARACTER', 'SPELLING_ERROR'], 1)[0]
+                random_idx = random.sample(range(0, len(word)), 1)[0]
+                tokens[idx] = self.substitute_delete_add(word, random_idx, random_operation)
+
+        for word_to_change in words_to_change:
+            idx = tokens.index(word_to_change)
+            random_number = random.random()
+            random_operation = random.sample(['DELETE', 'SWAP', 'ADD_RANDOM'], 1)[0]
+            if random_number < self.remain_prob:
+                tokens = self.substitute_delete_add(tokens, idx, random_operation)
+            elif random_number < self.substitution_prob:
+                word_to_replace = ''
+                confusion_set = self.spellchecker.suggest(word_to_change)[:3]
+                if len(confusion_set) > 1:
+                    word_to_replace = random.sample(confusion_set, 1)[0]
+                    while word_to_replace == word_to_change:
+                        word_to_replace = random.sample(confusion_set, 1)[0]
+                    tokens[idx] = word_to_replace
+            else:
+                tokens = self.substitute_delete_add(tokens, idx, random_operation)
+        return ' '.join(tokens)
+
+    def generate_synthetic_errors_from_folder(self, folder_path):
+        for idx, path in enumerate(glob.glob(folder_path)[:11]):
+            t = threading.Thread(target=self.generate_synthetic_errors_from_file, args=(path, f'./datasets_original/oscar/splitted_oscar/input{idx}.txt', f'./datasets_original/oscar/splitted_oscar/output{idx}.txt'))
+            t.start()
+
+
+    def generate_synthetic_errors_from_file(self, source_filename, input_filename, output_filename):
+        with open(input_filename, encoding="utf-8", mode="w") as input:
+            with open(output_filename, encoding="utf-8", mode="w") as output:
+                for line in self.read_input_file(source_filename):
+                    if len(line.split()) > 7:
+                        new_line = line.strip()
+                        new_line = new_line[0].capitalize() + new_line[1:]
+                        new_line_with_error = self.introduce_error(new_line)
+                        input.write(new_line + "\n")
+                        output.write(new_line_with_error + "\n")
+
+
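+# Example run: corrupt one pre-split OSCAR chunk, writing the clean sentences to
+# the results "input" file and their synthetically corrupted versions to "output".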
+synthetic_errors_generator = SyntheticErrorsGenerator()
+synthetic_errors_generator.generate_synthetic_errors_from_file('./datasets_original/oscar/splitted_oscar/output_fileaa', './datasets_original/oscar/splitted_oscar/results/input1.txt', './datasets_original/oscar/splitted_oscar/results/output1.txt')
+
+# synthetic_errors_generator.generate_synthetic_errors_from_folder(folder_path='./datasets_original/oscar/splitted_oscar/*')
+# synthetic_errors_generator.generate_synthetic_errors('./datasets_original/oscar/splitted_oscar/output_fileaa', './datasets_original/oscar/splitted_oscar/input1.txt', './datasets_original/oscar/splitted_oscar/output1.txt')
+# synthetic_errors_generator.generate_synthetic_errors('./datasets_original/lektury/input2_lektury.txt', './datasets_original/lektury/input2.txt', './datasets_original/lektury/output2.txt')
+
diff --git a/install.sh b/install.sh
new file mode 100644
index 0000000..cc1076b
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+apt update -y
+apt upgrade -y
+
+apt-get install -y python3 python3-pip python3-venv unzip gawk
+python3 -m venv env
+source ./env/bin/activate
+pip install -r requirements.txt
+python -m spacy download pl_core_news_lg
+
+mkdir data && cd data
+wget https://minio.clarin-pl.eu/ermlab/public/PoLitBert/corpus-oscar/corpus_oscar_2020-04-10_64M_lines.zip
+unzip -p corpus_oscar_2020-04-10_64M_lines.zip | sed -r '/^\s*$/d' | gawk 'NF>6' > oscar_filtered.txt
+split -l 1000000 --numeric-suffixes=1 --suffix-length=1 --additional-suffix=".txt" oscar_filtered.txt ""
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..d210a0f
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,77 @@
+alabaster==0.7.12
+arrow==1.2.2
+async-generator==1.10
+attrs==21.4.0
+Babel==2.9.1
+beautifulsoup4==4.11.1
+bibtexparser==1.2.0
+blis==0.7.7
+catalogue==2.0.7
+certifi==2021.10.8
+charset-normalizer==2.0.12
+click==8.0.4
+cymem==2.0.6
+Deprecated==1.2.13
+docutils==0.17.1
+fake-useragent==0.1.11
+free-proxy==1.0.6
+future==0.18.2
+h11==0.13.0
+hunspell==0.5.5
+idna==3.3
+imagesize==1.3.0
+importlib-metadata==4.11.3
+Jinja2==3.1.1
+langcodes==3.3.0
+lxml==4.8.0
+MarkupSafe==2.1.1
+morfeusz2==1.99.4
+murmurhash==1.0.6
+numpy==1.22.3
+outcome==1.1.0
+packaging==21.3
+pathy==0.6.1
+# pl-core-news-lg==3.2.0 (not on PyPI; installed via "python -m spacy download pl_core_news_lg")
+preshed==3.0.6
+pydantic==1.8.2
+Pygments==2.11.2
+pyMorfologik==0.2.2
+pyparsing==3.0.7
+PySocks==1.7.1
+python-dateutil==2.8.2
+python-dotenv==0.20.0
+pytz==2022.1
+regex==2022.3.15
+requests==2.27.1
+scholarly==1.6.0
+scipy==1.8.0
+selenium==4.1.3
+six==1.16.0
+smart-open==5.2.1
+sniffio==1.2.0
+snowballstemmer==2.2.0
+sortedcontainers==2.4.0
+soupsieve==2.3.2
+spacy==3.2.4
+spacy-legacy==3.0.9
+spacy-loggers==1.0.2
+Sphinx==4.5.0
+sphinx-rtd-theme==1.0.0
+sphinxcontrib-applehelp==1.0.2
+sphinxcontrib-devhelp==1.0.2
+sphinxcontrib-htmlhelp==2.0.0
+sphinxcontrib-jsmath==1.0.1
+sphinxcontrib-qthelp==1.0.3
+sphinxcontrib-serializinghtml==1.1.5
+srsly==2.4.2
+thinc==8.0.15
+tqdm==4.64.0
+trio==0.20.0
+trio-websocket==0.9.2
+typer==0.4.1
+typing-extensions==4.1.1
+urllib3==1.26.9
+wasabi==0.9.1
+wrapt==1.14.0
+wsproto==1.1.0
+zipp==3.8.0
\ No newline at end of file
diff --git a/tokenizer.py b/tokenizer.py
new file mode 100644
index 0000000..40e4575
--- /dev/null
+++ b/tokenizer.py
@@ -0,0 +1,8 @@
+import spacy
+
+class Tokenizer:
+    def __init__(self):
+        self.polish_tokenizer = spacy.load('pl_core_news_lg')
+
+    def tokenize(self, text):
+        return [tok.text for tok in self.polish_tokenizer.tokenizer(text)]