Add train datasets
This commit is contained in:
parent 15c97ebee3
commit 24ecd25e72
160 .gitignore vendored Normal file
@@ -0,0 +1,160 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
3 README.md Normal file
@@ -0,0 +1,3 @@
# GEC system for the Polish language

## Synthetic errors generator
@@ -1,35 +1,40 @@
import random
import hunspell
import morfeusz2
import spacy
from tokenizer import Tokenizer
from scipy.stats import norm
import pandas as pd
import regex as re
import glob
import sys
import threading
from collections import OrderedDict


class SyntheticErrorsGenerator:
    def __init__(self):
        self.substitution_prob = 0.7
        self.remain_prob = 0.3
        self.input_dataframe = pd.DataFrame([], columns=['sentence'])
        self.output_dataframe = pd.DataFrame([], columns=['sentence'])
        self.remain_prob = 0.4
        spacy.load('pl_core_news_lg')
        self.spellchecker = hunspell.HunSpell('./pl.dic', './pl.aff')
        self.morf = morfeusz2.Morfeusz()
        self.tokenizer = Tokenizer()

    def read_input_file(self, input_filename):
        with open(input_filename, encoding="utf-8", mode='r') as input:
            yield from input.readlines()
            yield from input
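    # Note: iterating the file object directly ("yield from input") streams the
    # corpus line by line, while the older readlines() call shown above it
    # materialised the whole file in memory first.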
    def remove_unused_whitespaces(self, text):
        new_text = re.sub(r'(?<=[^!.,>$%&-][!.,>$%&-])[!.,>$%& -]+(?<! )', '', text)
        new_text = re.sub(r'[^\w\s?.!,:;()[\]]', '', new_text)
        new_text = re.sub(r'\s\s+', ' ', new_text)
        new_text = re.sub(r'\s+([?.!,:;\])}”])', r'\1', new_text)
        new_text = re.sub(r'([\[({„])\s+', r'\1', new_text)
        new_text = re.sub(r'_ ', '', new_text)
        new_text = re.sub(r',, ', '', new_text)
        return new_text
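    # Illustrative effect on a hypothetical input: 'Ala ma  kota , prawda ?'
    # comes back as 'Ala ma kota, prawda?' (whitespace runs collapsed, spaces
    # before closing punctuation removed).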
    # Functions to modify characters in words

    def delete_character(self, str, idx):
        return str[:idx] + str[idx+1:]

    def delete(self, tokens, idx):
        tokens.pop(idx)
        return tokens

    def swap_characters(self, str, idx):
        strlst = list(str)
        if not (len(str) - 1) == idx:
@@ -39,7 +44,7 @@ class SyntheticErrorsGenerator:
            strlst[idx-1], strlst[idx] = strlst[idx], strlst[idx-1]
        return "".join(strlst)
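    # For reference, hypothetical calls: delete_character('kota', 1) == 'kta';
    # delete(['Ala', 'ma', 'kota'], 1) == ['Ala', 'kota'].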
    def spelling_error(self, tokens):
    def introduce_spelling_error(self, tokens):
        errors_matrix = {
            'ą': 'a',
            'ć': 'c',
@@ -53,6 +58,10 @@ class SyntheticErrorsGenerator:
            'ż': 'z'
        }

        items = list(errors_matrix.items())
        random.shuffle(items)
        errors_matrix = OrderedDict(items)

        letters_existing_in_word = []
        for letter, _ in errors_matrix.items():
            if letter in tokens:
@@ -64,42 +73,25 @@ class SyntheticErrorsGenerator:

        return tokens
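    # Intended effect, judging by the visible mappings: one Polish diacritic is
    # replaced by its ASCII base letter, e.g. 'żaba' -> 'zaba'.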
    def swap(self, tokens, idx):
        if not (len(tokens) - 1) == idx:
            tokens[idx], tokens[idx + 1] = tokens[idx + 1], tokens[idx]
        else:
            tokens[idx - 1], tokens[idx] = tokens[idx], tokens[idx - 1]
        return tokens

    def add_random(self, tokens, idx):
        confusion_set = self.spellchecker.suggest(tokens[idx])[:3]
        if len(confusion_set) > 1:
            word_to_replace = random.sample(confusion_set, 1)[0]
            tokens.insert(idx + 1, word_to_replace)
        return tokens

    def add_random_character(self, str, idx):
        confusion_set = self.spellchecker.suggest(str[idx])[:3]
        if len(confusion_set) > 1:
            char_to_replace = random.sample(confusion_set, 1)[0].lower()
            return str[:idx] + char_to_replace + str[idx:]
        return str
    def duplicate_character(self, str, idx):
        return str[:idx] + str[idx] + str[idx:]

    def substitute_delete_add(self, tokens, token_idx, operation):
        if operation == 'DELETE':
            return self.delete(tokens, token_idx)
        elif operation == 'SWAP':
            return self.swap(tokens, token_idx)
        elif operation == 'ADD_RANDOM':
            return self.add_random(tokens, token_idx)
        elif operation == 'SWAP_CHARACTERS':
        if operation == 'SWAP_CHARACTERS':
            return self.swap_characters(tokens, token_idx)
        elif operation == 'DELETE_CHARACTER':
            return self.delete_character(tokens, token_idx)
        elif operation == 'ADD_RANDOM_CHARACTER':
            return self.add_random_character(tokens, token_idx)
        elif operation == 'DUPLICATE_CHARACTER':
            return self.duplicate_character(tokens, token_idx)
        elif operation == 'SPELLING_ERROR':
            return self.spelling_error(tokens)
            return self.introduce_spelling_error(tokens)
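    # Dispatch sketch, hypothetical call: substitute_delete_add('kota', 2,
    # 'DUPLICATE_CHARACTER') doubles the character at index 2 and returns
    # 'kotta'; token-level operations take the whole token list instead.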
    def introduce_character_error(self, tokens, word, idx):
        if len(word) >= 1:
            random_operation = random.sample(['DELETE_CHARACTER', 'SWAP_CHARACTERS', 'DUPLICATE_CHARACTER', 'SPELLING_ERROR'], 1)[0]
            random_idx = random.sample(range(0, len(word)), 1)[0]
            tokens[idx] = self.substitute_delete_add(word, random_idx, random_operation)
        return tokens
    def introduce_error(self, line):
        tokens = self.tokenizer.tokenize(line)
@@ -110,38 +102,40 @@ class SyntheticErrorsGenerator:
        num_words_to_change_letters = round(len(tokens) * 0.1)

        words_for_spelling_errors = [tokens.index(word) for word in tokens if word not in words_to_change]
        for idx in random.sample(words_for_spelling_errors, num_words_to_change_letters):
            word = tokens[idx]
            if word.isalnum():
                random_number = random.random()
                random_operation = random.sample(['DELETE_CHARACTER', 'SWAP_CHARACTERS', 'ADD_RANDOM_CHARACTER', 'SPELLING_ERROR'], 1)[0]
                random_idx = random.sample(range(0, len(word)), 1)[0]
                tokens[idx] = self.substitute_delete_add(word, random_idx, random_operation)
        if len(words_for_spelling_errors) >= num_words_to_change_letters:
            for idx in random.sample(words_for_spelling_errors, num_words_to_change_letters):
                word = tokens[idx]
                if word.isalnum():
                    random_number = random.random()
                    tokens = self.introduce_character_error(tokens, word, idx)

        for word_to_change in words_to_change:
            idx = tokens.index(word_to_change)
            random_number = random.random()
            random_operation = random.sample(['DELETE', 'SWAP', 'ADD_RANDOM'], 1)[0]
            if random_number < self.remain_prob:
                tokens = self.substitute_delete_add(tokens, idx, random_operation)
            elif random_number < self.substitution_prob:
                word_to_replace = ''
                confusion_set = self.spellchecker.suggest(word_to_change)[:3]
                if len(confusion_set) > 1:
                    word_to_replace = random.sample(confusion_set, 1)[0]
                    while(word_to_replace == word_to_change):
                        word_to_replace = random.sample(confusion_set, 1)[0]
                    tokens[idx] = word_to_replace
            else:
                tokens = self.substitute_delete_add(tokens, idx, random_operation)
            if random_number <= self.remain_prob:
                word = tokens[idx]
                random_idx = random.sample(range(0, len(word)), 1)[0]
                tokens[idx] = self.substitute_delete_add(word, random_idx, 'SPELLING_ERROR')
            elif random_number <= self.substitution_prob:
                try:
                    basic_form = self.morf.analyse(word_to_change)[0][2][1].split(":")[0]
                    forms_to_choose_from = self.morf.generate(basic_form)
                    if len(forms_to_choose_from) > 0:
                        choice = word_to_change
                        choice = random.choice(forms_to_choose_from)[0]
                        if idx == 0:
                            choice = choice[0].upper() + choice[1:]
                        tokens[idx] = choice
                    else:
                        word = tokens[idx]
                        tokens = self.introduce_character_error(tokens, word, idx)
                except Exception:
                    print('Form not found')
                    word = tokens[idx]
                    tokens = self.introduce_character_error(tokens, word, idx)

        return ' '.join(tokens)
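    # Sketch of the decision bands above, with the thresholds set in __init__
    # (remain_prob = 0.4, substitution_prob = 0.7): r <= 0.4 introduces a
    # spelling error in the chosen token; 0.4 < r <= 0.7 swaps the token for
    # another inflected form generated by Morfeusz, falling back to a random
    # character edit when no form can be generated.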
    def generate_synthetic_errors_from_folder(self, folder_path):
        for idx, path in enumerate(glob.glob(folder_path)[:11]):
            t = threading.Thread(target=self.generate_synthetic_errors_from_file, args=(path, f'./datasets_original/oscar/splitted_oscar/input{idx}.txt', f'./datasets_original/oscar/splitted_oscar/output{idx}.txt'))
            t.start()
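    # Note: the worker threads above are started fire-and-forget; keeping the
    # Thread handles in a list and join()-ing them afterwards would let the
    # caller block until all output files are fully written.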
    def generate_synthetic_errors_from_file(self, source_filename, input_filename, output_filename):
        with open(input_filename, encoding="utf-8", mode="w") as input:
            with open(output_filename, encoding="utf-8", mode="w") as output:
@@ -150,9 +144,9 @@ class SyntheticErrorsGenerator:
                new_line = line.strip()
                new_line = new_line[0].capitalize() + new_line[1:]
                new_line_with_error = self.introduce_error(new_line)
                input.write(new_line + "\n")
                output.write(new_line_with_error + "\n")
                input.write(self.remove_unused_whitespaces(new_line).strip() + "\n")
                output.write(self.remove_unused_whitespaces(new_line_with_error).strip() + "\n")


synthetic_errors_generator = SyntheticErrorsGenerator()
synthetic_errors_generator.generate_synthetic_errors_from_file(sys.argv[1], 'input.txt', 'output.txt')
synthetic_errors_generator.generate_synthetic_errors_from_file(sys.argv[1], sys.argv[2], sys.argv[3])
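The entry point now takes all three paths from the command line: a source corpus, a file receiving the cleaned original sentences, and a file receiving their errorful counterparts. A hypothetical invocation (the script's own filename is not shown in this diff):

python <script>.py corpus_part.txt train.co train.er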
@@ -5,7 +5,7 @@ apt upgrade -y
apt-get install python3 python3-pip python3-venv unzip gawk screen python-dev libhunspell-dev -y
python3 -m venv env
source ./env/bin/activate
pip install pandas spacy regex scipy hunspell
pip install pandas spacy regex scipy wheel hunspell
python -m spacy download pl_core_news_lg

mkdir data && cd data
22 preprocess_dataset.py Normal file
@@ -0,0 +1,22 @@
from sklearn.model_selection import train_test_split
import pandas as pd

def read_input_file(input_filename):
    with open(input_filename, encoding="utf-8", mode='r') as input:
        yield from input

def save_file(input, file_name):
    with open(file_name, encoding="utf-8", mode='w') as file:
        for line in input:
            file.write(line)

X_train, X_test, y_train, y_test = train_test_split(list(read_input_file('./train.er')), list(read_input_file('./train.co')), test_size=0.001, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.001, random_state=1)

save_file(X_train, "./data/train.er")
save_file(y_train, "./data/train.co")
save_file(X_test, "./data/test.er")
save_file(y_test, "./data/test.co")
save_file(X_val, "./data/dev.er")
save_file(y_val, "./data/dev.co")
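For scale: test_size=0.001 is applied twice, so the test and dev sets each hold roughly a tenth of a percent of the pairs. A back-of-the-envelope check, assuming a hypothetical corpus of one million parallel lines:

N = 1_000_000
n_test = round(N * 0.001)            # 1,000 pairs held out for test
n_dev = round((N - n_test) * 0.001)  # ~999 pairs held out for dev
n_train = N - n_test - n_dev         # 998,001 pairs left for training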
20 preprocess_plewi.py Normal file
@@ -0,0 +1,20 @@
import regex as re

filenames = ['test.co', 'test.er', 'train.co', 'train.er', 'tune.co', 'tune.er']
output_filenames = ['./plewi_co.txt',
                    './plewi_er.txt',
                    './plewi_co.txt',
                    './plewi_er.txt',
                    './plewi_co.txt',
                    './plewi_er.txt']

for idx, filename in enumerate(filenames):
    with open('./plewic/' + filename, encoding="utf-8", mode='r') as f:
        # append rather than overwrite: test/train/tune all merge into the same
        # two output files, so mode='w' here would keep only the last split
        with open(output_filenames[idx], encoding="utf-8", mode='a') as f2:
            for line in f:
                new_line = line.replace("\n", "").replace("\t", " ")
                if re.match(r"^\!\s\'.*\'$", new_line):
                    new_line = new_line[3:len(new_line)-1]
                elif re.match(r"^\!\s\".*\"$", new_line):
                    new_line = new_line[3:len(new_line)-1]
                f2.write(new_line.strip() + "\n")
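The two regex branches strip what looks like PlEWi's `! '...'` / `! "..."` wrapping from a line; a small check of the slicing, assuming a line of that shape:

import regex as re

line = "! 'Ala ma kota.'"
if re.match(r"^\!\s\'.*\'$", line):
    line = line[3:len(line) - 1]
print(line)  # -> Ala ma kota.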
BIN train.co.xz Normal file
Binary file not shown.
BIN train.er.xz Normal file
Binary file not shown.