Add train datasets

2022-04-26 19:24:23 +02:00 · 2022-04-26 19:24:23 +02:00 · 24ecd25e72
commit 24ecd25e72
parent 15c97ebee3
10 changed files with 273 additions and 351041 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,160 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 # C extensions
 *.so
 # Distribution / packaging
 .Python
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 share/python-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 MANIFEST
 # PyInstaller
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .nox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *.cover
 *.py,cover
 .hypothesis/
 .pytest_cache/
 cover/
 # Translations
 *.mo
 *.pot
 # Django stuff:
 *.log
 local_settings.py
 db.sqlite3
 db.sqlite3-journal
 # Flask stuff:
 instance/
 .webassets-cache
 # Scrapy stuff:
 .scrapy
 # Sphinx documentation
 docs/_build/
 # PyBuilder
 .pybuilder/
 target/
 # Jupyter Notebook
 .ipynb_checkpoints
 # IPython
 profile_default/
 ipython_config.py
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
 # .python-version
 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 #   install all needed dependencies.
 #Pipfile.lock
 # poetry
 #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 #   This is especially recommended for binary packages to ensure reproducibility, and is more
 #   commonly ignored for libraries.
 #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
 #poetry.lock
 # pdm
 #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
 #pdm.lock
 #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
 #   in version control.
 #   https://pdm.fming.dev/#use-with-ide
 .pdm.toml
 # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
 __pypackages__/
 # Celery stuff
 celerybeat-schedule
 celerybeat.pid
 # SageMath parsed files
 *.sage.py
 # Environments
 .env
 .venv
 env/
 venv/
 ENV/
 env.bak/
 venv.bak/
 # Spyder project settings
 .spyderproject
 .spyproject
 # Rope project settings
 .ropeproject
 # mkdocs documentation
 /site
 # mypy
 .mypy_cache/
 .dmypy.json
 dmypy.json
 # Pyre type checker
 .pyre/
 # pytype static type analyzer
 .pytype/
 # Cython debug symbols
 cython_debug/
 # PyCharm
 #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
--- a/README.md
+++ b/README.md
@ -0,0 +1,3 @@
 # GEC system for the Polish language
 ## Synthetic errors generator
--- a/generate_errors.py
+++ b/generate_errors.py
@ -1,35 +1,40 @@
 import random
-import hunspell
+import morfeusz2
 import spacy
 from tokenizer import Tokenizer
 from scipy.stats import norm
 import pandas as pd
 import regex as re
 import glob
 import sys
-import threading
+from collections import OrderedDict
 class SyntheticErrorsGenerator:
    def __init__(self):
        self.substitution_prob = 0.7
-        self.remain_prob = 0.3
+        self.remain_prob = 0.4
        self.input_dataframe = pd.DataFrame([], columns=['sentence'])
        self.output_dataframe = pd.DataFrame([], columns=['sentence'])
        spacy.load('pl_core_news_lg')
-        self.spellchecker = hunspell.HunSpell('./pl.dic', './pl.aff')
+        self.morf = morfeusz2.Morfeusz()
        self.tokenizer = Tokenizer()
    def read_input_file(self, input_filename):
        with open(input_filename, encoding="utf-8", mode='r') as input:
-            yield from input.readlines()
+            yield from input
    def remove_unused_whitespaces(self, text):
        """Clean up punctuation and spacing in ``text``.

        Runs a fixed sequence of regex substitutions over ``text`` and
        returns the resulting string. The rules are order-dependent: each
        one operates on the output of the previous one.
        """
        cleanup_rules = (
            # Drop a run of punctuation/spaces that directly follows a punctuation mark.
            (r'(?<=[^!.,>$%&-][!.,>$%&-])[!.,>$%& -]+(?<! )', ''),
            # Keep only word characters, whitespace and the listed punctuation.
            (r'[^\w\s?.!,:;()[\]]', ''),
            # Collapse runs of whitespace into a single space.
            (r'\s\s+', ' '),
            # No whitespace before closing punctuation.
            (r'\s+([?.!,:;\])}”])', r'\1'),
            # No whitespace after opening brackets/quotes.
            (r'([\[({„])\s+', r'\1'),
            (r'_ ', ''),
            (r',, ', ''),
        )
        cleaned = text
        for pattern, replacement in cleanup_rules:
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned
    # Functions to modify characters in words
    def delete_character(self, str, idx):
        """Return ``str`` with the character at position ``idx`` left out."""
        head, tail = str[:idx], str[idx + 1:]
        return head + tail
    def delete(self, tokens, idx):
        """Remove the token at ``idx`` from ``tokens`` in place and return the same list."""
        del tokens[idx]
        return tokens
    def swap_characters(self, str, idx):
        strlst = list(str)
        if not (len(str) - 1) == idx:
@ -39,7 +44,7 @@ class SyntheticErrorsGenerator:
            strlst[idx-1], strlst[idx] = strlst[idx-1], strlst[idx]
            return "".join(strlst)
-    def spelling_error(self, tokens):
+    def introduce_spelling_error(self, tokens):
        errors_matrix = {
            'ą': 'a',
            'ć': 'c',
@ -53,6 +58,10 @@ class SyntheticErrorsGenerator:
            'ż': 'z'
        }
        items = list(errors_matrix.items())
        random.shuffle(items)
        errors_matrix = OrderedDict(items)
        letters_existing_in_word = []
        for letter, _ in errors_matrix.items():
            if letter in tokens:
@ -64,42 +73,25 @@ class SyntheticErrorsGenerator:
        return tokens
-    def swap(self, tokens, idx):
+    def duplicate_character(self, str, idx):
-        if not (len(tokens) - 1) == idx:
+        return str[:idx] + str[idx] + str[idx:]
            tokens[idx], tokens[idx + 1] = tokens[idx + 1], tokens[idx]
        else:
            tokens[idx - 1], tokens[idx] = tokens[idx], tokens[idx - 1]
        return tokens
    def add_random(self, tokens, idx):
        confusion_set = self.spellchecker.suggest(tokens[idx])[:3]
        if len(confusion_set) > 1:
            word_to_replace = random.sample(confusion_set, 1)[0]
            tokens.insert(idx + 1, word_to_replace)
        return tokens
    def add_random_character(self, str, idx):
        confusion_set = self.spellchecker.suggest(str[idx])[:3]
        if len(confusion_set) > 1:
            char_to_replace = random.sample(confusion_set, 1)[0].lower()
            return str[:idx] + char_to_replace + str[idx:]
        return str
    def substitute_delete_add(self, tokens, token_idx, operation):
-        if operation == 'DELETE':
+        if operation == 'SWAP_CHARACTERS':
            return self.delete(tokens, token_idx)
        elif operation == 'SWAP':
            return self.swap(tokens, token_idx)
        elif operation == 'ADD_RANDOM':
            return self.add_random(tokens, token_idx)
        elif operation == 'SWAP_CHARACTERS':
            return self.swap_characters(tokens, token_idx)
        elif operation == 'DELETE_CHARACTER':
            return self.delete_character(tokens, token_idx)
-        elif operation == 'ADD_RANDOM_CHARACTER':
+        elif operation == 'DUPLICATE_CHARACTER':
-            return self.add_random_character(tokens, token_idx)
+            return self.duplicate_character(tokens, token_idx)
        elif operation == 'SPELLING_ERROR':
-            return self.spelling_error(tokens)
+            return self.introduce_spelling_error(tokens)
    def introduce_character_error(self, tokens, word, idx):
        """Replace ``tokens[idx]`` with a character-level corruption of ``word``.

        A random operation and a random character position inside ``word``
        are drawn and handed to ``substitute_delete_add``; its result is
        stored at ``tokens[idx]``. An empty ``word`` leaves ``tokens``
        untouched. The (possibly modified) list is returned.
        """
        if len(word) < 1:
            return tokens
        operations = ['DELETE_CHARACTER', 'SWAP_CHARACTERS', 'DUPLICATE_CHARACTER', 'SPELLING_ERROR']
        # Draw the operation first, then the position, so the RNG is consumed in a fixed order.
        (chosen_operation,) = random.sample(operations, 1)
        (char_position,) = random.sample(range(0, len(word)), 1)
        tokens[idx] = self.substitute_delete_add(word, char_position, chosen_operation)
        return tokens
    def introduce_error(self, line):
        tokens = self.tokenizer.tokenize(line)
@ -110,38 +102,40 @@ class SyntheticErrorsGenerator:
        num_words_to_change_letters = round(len(tokens) * 0.1)
        words_for_spelling_errors = [tokens.index(word) for word in tokens if word not in words_to_change]
-        for idx in random.sample(words_for_spelling_errors, num_words_to_change_letters):
+        if len(words_for_spelling_errors) >= num_words_to_change_letters:
-            word = tokens[idx]
+            for idx in random.sample(words_for_spelling_errors, num_words_to_change_letters):
-            if word.isalnum():
+                word = tokens[idx]
-                random_number = random.random()
+                if word.isalnum():
-                random_operation = random.sample(['DELETE_CHARACTER', 'SWAP_CHARACTERS', 'ADD_RANDOM_CHARACTER', 'SPELLING_ERROR'], 1)[0]
+                    random_number = random.random()
-                random_idx = random.sample(range(0, len(word)), 1)[0]
+                    tokens = self.introduce_character_error(tokens, word, idx)
                tokens[idx] = self.substitute_delete_add(word, random_idx, random_operation)
        for word_to_change in words_to_change:
            idx = tokens.index(word_to_change)
            random_number = random.random()
-            random_operation = random.sample(['DELETE', 'SWAP', 'ADD_RANDOM'], 1)[0]
+            if random_number <= self.remain_prob:
-            if random_number < self.remain_prob:
+                word = tokens[idx]
-                tokens = self.substitute_delete_add(tokens, idx, random_operation)
+                random_idx = random.sample(range(0, len(word)), 1)[0]
-            elif random_number < self.substitution_prob:
+                tokens[idx] = self.substitute_delete_add(word, random_idx, 'SPELLING_ERROR')
-                word_to_replace = ''
+            elif random_number <= self.substitution_prob:
-                confusion_set = self.spellchecker.suggest(word_to_change)[:3]
+                try:
-                if len(confusion_set) > 1:
+                    basic_form = self.morf.analyse(word_to_change)[0][2][1].split(":")[0]
-                    word_to_replace = random.sample(confusion_set, 1)[0]
+                    forms_to_choose_from = self.morf.generate(basic_form)
-                    while(word_to_replace == word_to_change):
+                    if len(forms_to_choose_from) > 0:
-                        word_to_replace = random.sample(confusion_set, 1)[0]
+                        choice = word_to_change
-                    tokens[idx] = word_to_replace
+                        choice = random.choice(forms_to_choose_from)[0]
-                else:
+                        if idx == 0:
-                    tokens = self.substitute_delete_add(tokens, idx, random_operation)
+                            choice = choice[0].upper() + choice[1:]
                        tokens[idx] = choice
                    else:
                        word = tokens[idx]
                        tokens = self.introduce_character_error(tokens, word, idx)      
                except Exception:
                    print('Form not found')
                    word = tokens[idx]
                    tokens = self.introduce_character_error(tokens, word, idx)
        return ' '.join(tokens)
    def generate_synthetic_errors_from_folder(self, folder_path):
        """Start one worker thread per file matched by ``folder_path``.

        ``folder_path`` is handed to ``glob.glob``, so it is treated as a
        glob pattern. Only the first 11 matches are processed. The thread for
        match number ``idx`` runs ``generate_synthetic_errors_from_file`` on
        that path and writes to ``input{idx}.txt`` / ``output{idx}.txt``
        under ``./datasets_original/oscar/splitted_oscar/``. Threads are
        started but not joined; nothing is returned.
        """
        # Imported here so the method does not depend on a file-level
        # ``import threading`` being present; without it the first matched
        # file would raise NameError.
        import threading

        output_dir = './datasets_original/oscar/splitted_oscar'
        for idx, path in enumerate(glob.glob(folder_path)[:11]):
            worker = threading.Thread(
                target=self.generate_synthetic_errors_from_file,
                args=(path, f'{output_dir}/input{idx}.txt', f'{output_dir}/output{idx}.txt'),
            )
            worker.start()
    def generate_synthetic_errors_from_file(self, source_filename, input_filename, output_filename):
        with open(input_filename, encoding="utf-8", mode="w") as input:
            with open(output_filename, encoding="utf-8", mode="w") as output:
@ -150,9 +144,9 @@ class SyntheticErrorsGenerator:
                        new_line = line.strip()
                        new_line = new_line[0].capitalize() + new_line[1:]
                        new_line_with_error = self.introduce_error(new_line)
-                        input.write(new_line + "\n")
+                        input.write(self.remove_unused_whitespaces(new_line).strip() + "\n")
-                        output.write(new_line_with_error + "\n")
+                        output.write(self.remove_unused_whitespaces(new_line_with_error).strip() + "\n")
 synthetic_errors_generator = SyntheticErrorsGenerator()
-synthetic_errors_generator.generate_synthetic_errors_from_file(sys.argv[1], 'input.txt', 'output.txt')
+synthetic_errors_generator.generate_synthetic_errors_from_file(sys.argv[1], sys.argv[2], sys.argv[3])
--- a/install.sh
+++ b/install.sh
@ -5,7 +5,7 @@ apt upgrade -y
 apt-get install python3 python3-pip python3-venv unzip gawk screen python-dev libhunspell-dev -y
 python3 -m venv env
 source ./env/bin/activate
-pip install pandas spacy regex scipy hunspell
+pip install pandas spacy regex scipy wheel hunspell
 python -m spacy download pl_core_news_lg
 mkdir data && cd data
--- a/pl.aff
+++ b/pl.aff
--- a/pl.dic
+++ b/pl.dic
--- a/preprocess_dataset.py
+++ b/preprocess_dataset.py
@ -0,0 +1,22 @@
 from sklearn.model_selection import train_test_split
 import pandas as pd
 def read_input_file(input_filename):
     """Lazily yield the lines of the UTF-8 text file ``input_filename``.

     Lines are yielded as read, including their trailing newline.
     """
     with open(input_filename, encoding="utf-8", mode='r') as handle:
         for line in handle:
             yield line
 def save_file(input, file_name):
     """Write every string from the iterable ``input`` to ``file_name`` (UTF-8).

     Items are written as they are; no newline is appended. An existing
     file is overwritten.
     """
     with open(file_name, encoding="utf-8", mode='w') as file:
         file.writelines(input)
 # Load both input files fully; train_test_split needs sized sequences.
 erroneous_lines = list(read_input_file('./train.er'))
 corrected_lines = list(read_input_file('./train.co'))
 # First carve out the test split, then carve the dev split out of what remains.
 # The fixed random_state keeps both splits reproducible.
 X_train, X_test, y_train, y_test = train_test_split(
     erroneous_lines, corrected_lines, test_size=0.001, random_state=1)
 X_train, X_val, y_train, y_val = train_test_split(
     X_train, y_train, test_size=0.001, random_state=1)
 for split_lines, destination in (
     (X_train, "./data/train.er"),
     (y_train, "./data/train.co"),
     (X_test, "./data/test.er"),
     (y_test, "./data/test.co"),
     (X_val, "./data/dev.er"),
     (y_val, "./data/dev.co"),
 ):
     save_file(split_lines, destination)
--- a/preprocess_plewi.py
+++ b/preprocess_plewi.py
@ -0,0 +1,20 @@
 import regex as re
 # Inputs read from ./plewic/ and, position by position, the file each one is
 # written to: every *.co input goes to plewi_co.txt, every *.er input to
 # plewi_er.txt.
 filenames = ['test.co', 'test.er', 'train.co', 'train.er', 'tune.co', 'tune.er']
 output_filenames = ['./plewi_co.txt',
                     './plewi_er.txt',
                     './plewi_co.txt',
                     './plewi_er.txt',
                     './plewi_co.txt',
                     './plewi_er.txt']
 # Group the inputs by destination so that each output file is opened (and
 # truncated) exactly once. Reopening an output with mode 'w' for every input
 # would keep only the lines of the last input written to it.
 inputs_by_output = {}
 for filename, output_filename in zip(filenames, output_filenames):
     inputs_by_output.setdefault(output_filename, []).append(filename)
 for output_filename, input_names in inputs_by_output.items():
     with open(output_filename, encoding="utf-8", mode='w') as f2:
         # Inputs are written in their listed order, so the two outputs stay
         # aligned as long as the paired inputs have equal line counts.
         for filename in input_names:
             with open('./plewic/' + filename, encoding="utf-8", mode='r') as f:
                 for line in f:
                     new_line = line.replace("\n", "").replace("\t", " ")
                     # Lines of the form  ! '...'  or  ! "..."  keep only the quoted text.
                     if re.match(r"^\!\s\'.*\'$", new_line) or re.match(r"^\!\s\".*\"$", new_line):
                         new_line = new_line[3:-1]
                     f2.write(new_line.strip() + "\n")
--- a/train.co.xz
+++ b/train.co.xz
--- a/train.er.xz
+++ b/train.er.xz
		`@ -0,0 +1,3 @@`
							`# GEC system for the Polish language`

							`## Synthetic errors generator`