Add train datasets

Wojciech Jarmosz 2022-04-26 19:24:23 +02:00
parent 15c97ebee3
commit 24ecd25e72
10 changed files with 273 additions and 351041 deletions

160
.gitignore vendored Normal file

@@ -0,0 +1,160 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

3
README.md Normal file

@@ -0,0 +1,3 @@
# GEC system for the Polish language
## Synthetic errors generator
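A minimal usage sketch for the synthetic errors generator (the module name and file paths below are hypothetical; the class and method names are taken from the generator script changed in this commit, where the first file receives the cleaned sentences and the second the versions with injected errors):

# Hypothetical usage sketch: the module name and file names are illustrative only.
from synthetic_errors_generator import SyntheticErrorsGenerator

generator = SyntheticErrorsGenerator()
# corpus.txt -> input.txt (clean sentences) and output.txt (sentences with injected errors)
generator.generate_synthetic_errors_from_file("corpus.txt", "input.txt", "output.txt")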


@@ -1,35 +1,40 @@
import random
import hunspell
import morfeusz2
import spacy
from tokenizer import Tokenizer
from scipy.stats import norm
import pandas as pd
import regex as re
import glob
import sys
import threading
from collections import OrderedDict
class SyntheticErrorsGenerator:
def __init__(self):
self.substitution_prob = 0.7
self.remain_prob = 0.3
self.input_dataframe = pd.DataFrame([], columns=['sentence'])
self.output_dataframe = pd.DataFrame([], columns=['sentence'])
self.remain_prob = 0.4
spacy.load('pl_core_news_lg')
self.spellchecker = hunspell.HunSpell('./pl.dic', './pl.aff')
self.morf = morfeusz2.Morfeusz()
self.tokenizer = Tokenizer()
def read_input_file(self, input_filename):
with open(input_filename, encoding="utf-8", mode='r') as input:
yield from input.readlines()
yield from input
def remove_unused_whitespaces(self, text):
new_text = re.sub(r'(?<=[^!.,>$%&-][!.,>$%&-])[!.,>$%& -]+(?<! )', '', text)
new_text = re.sub(r'[^\w\s?.!,:;()[\]]', '', new_text)
new_text = re.sub(r'\s\s+', ' ', new_text)
new_text = re.sub(r'\s+([?.!,:;\])}”])', r'\1', new_text)
new_text = re.sub(r'([\[({„])\s+', r'\1', new_text)
new_text = re.sub(r'_ ', '', new_text)
new_text = re.sub(r',, ', '', new_text)
return new_text
# Functions to modify characters in words
def delete_character(self, str, idx):
return str[:idx] + str[idx+1:]
def delete(self, tokens, idx):
tokens.pop(idx)
return tokens
def swap_characters(self, str, idx):
strlst = list(str)
if not (len(str) - 1) == idx:
@@ -39,7 +44,7 @@ class SyntheticErrorsGenerator:
strlst[idx-1], strlst[idx] = strlst[idx], strlst[idx-1]
return "".join(strlst)
def spelling_error(self, tokens):
def introduce_spelling_error(self, tokens):
errors_matrix = {
'ą': 'a',
'ć': 'c',
@@ -53,6 +58,10 @@ class SyntheticErrorsGenerator:
'ż': 'z'
}
items = list(errors_matrix.items())
random.shuffle(items)
errors_matrix = OrderedDict(items)
letters_existing_in_word = []
for letter, _ in errors_matrix.items():
if letter in tokens:
@@ -64,42 +73,25 @@ class SyntheticErrorsGenerator:
return tokens
def swap(self, tokens, idx):
if not (len(tokens) - 1) == idx:
tokens[idx], tokens[idx + 1] = tokens[idx + 1], tokens[idx]
else:
tokens[idx - 1], tokens[idx] = tokens[idx], tokens[idx - 1]
return tokens
def add_random(self, tokens, idx):
confusion_set = self.spellchecker.suggest(tokens[idx])[:3]
if len(confusion_set) > 1:
word_to_replace = random.sample(confusion_set, 1)[0]
tokens.insert(idx + 1, word_to_replace)
return tokens
def add_random_character(self, str, idx):
confusion_set = self.spellchecker.suggest(str[idx])[:3]
if len(confusion_set) > 1:
char_to_replace = random.sample(confusion_set, 1)[0].lower()
return str[:idx] + char_to_replace + str[idx:]
return str
def duplicate_character(self, str, idx):
return str[:idx] + str[idx] + str[idx:]
def substitute_delete_add(self, tokens, token_idx, operation):
if operation == 'DELETE':
return self.delete(tokens, token_idx)
elif operation == 'SWAP':
return self.swap(tokens, token_idx)
elif operation == 'ADD_RANDOM':
return self.add_random(tokens, token_idx)
elif operation == 'SWAP_CHARACTERS':
if operation == 'SWAP_CHARACTERS':
return self.swap_characters(tokens, token_idx)
elif operation == 'DELETE_CHARACTER':
return self.delete_character(tokens, token_idx)
elif operation == 'ADD_RANDOM_CHARACTER':
return self.add_random_character(tokens, token_idx)
elif operation == 'DUPLICATE_CHARACTER':
return self.duplicate_character(tokens, token_idx)
elif operation == 'SPELLING_ERROR':
return self.spelling_error(tokens)
return self.introduce_spelling_error(tokens)
def introduce_character_error(self, tokens, word, idx):
if len(word) >= 1:
random_operation = random.sample(['DELETE_CHARACTER', 'SWAP_CHARACTERS', 'DUPLICATE_CHARACTER', 'SPELLING_ERROR'], 1)[0]
random_idx = random.sample(range(0, len(word)), 1)[0]
tokens[idx] = self.substitute_delete_add(word, random_idx, random_operation)
return tokens
def introduce_error(self, line):
tokens = self.tokenizer.tokenize(line)
@@ -110,38 +102,40 @@ class SyntheticErrorsGenerator:
num_words_to_change_letters = round(len(tokens) * 0.1)
words_for_spelling_errors = [tokens.index(word) for word in tokens if word not in words_to_change]
if len(words_for_spelling_errors) >= num_words_to_change_letters:
for idx in random.sample(words_for_spelling_errors, num_words_to_change_letters):
word = tokens[idx]
if word.isalnum():
random_number = random.random()
random_operation = random.sample(['DELETE_CHARACTER', 'SWAP_CHARACTERS', 'ADD_RANDOM_CHARACTER', 'SPELLING_ERROR'], 1)[0]
random_idx = random.sample(range(0, len(word)), 1)[0]
tokens[idx] = self.substitute_delete_add(word, random_idx, random_operation)
tokens = self.introduce_character_error(tokens, word, idx)
for word_to_change in words_to_change:
idx = tokens.index(word_to_change)
random_number = random.random()
random_operation = random.sample(['DELETE', 'SWAP', 'ADD_RANDOM'], 1)[0]
if random_number < self.remain_prob:
tokens = self.substitute_delete_add(tokens, idx, random_operation)
elif random_number < self.substitution_prob:
word_to_replace = ''
confusion_set = self.spellchecker.suggest(word_to_change)[:3]
if len(confusion_set) > 1:
word_to_replace = random.sample(confusion_set, 1)[0]
while(word_to_replace == word_to_change):
word_to_replace = random.sample(confusion_set, 1)[0]
tokens[idx] = word_to_replace
if random_number <= self.remain_prob:
word = tokens[idx]
random_idx = random.sample(range(0, len(word)), 1)[0]
tokens[idx] = self.substitute_delete_add(word, random_idx, 'SPELLING_ERROR')
elif random_number <= self.substitution_prob:
try:
basic_form = self.morf.analyse(word_to_change)[0][2][1].split(":")[0]
forms_to_choose_from = self.morf.generate(basic_form)
if len(forms_to_choose_from) > 0:
choice = word_to_change
choice = random.choice(forms_to_choose_from)[0]
if idx == 0:
choice = choice[0].upper() + choice[1:]
tokens[idx] = choice
else:
tokens = self.substitute_delete_add(tokens, idx, random_operation)
word = tokens[idx]
tokens = self.introduce_character_error(tokens, word, idx)
except Exception:
print('Form not found')
word = tokens[idx]
tokens = self.introduce_character_error(tokens, word, idx)
return ' '.join(tokens)
def generate_synthetic_errors_from_folder(self, folder_path):
for idx, path in enumerate(glob.glob(folder_path)[:11]):
t = threading.Thread(target=self.generate_synthetic_errors_from_file, args=(path, f'./datasets_original/oscar/splitted_oscar/input{idx}.txt', f'./datasets_original/oscar/splitted_oscar/output{idx}.txt'))
t.start()
def generate_synthetic_errors_from_file(self, source_filename, input_filename, output_filename):
with open(input_filename, encoding="utf-8", mode="w") as input:
with open(output_filename, encoding="utf-8", mode="w") as output:
@@ -150,9 +144,9 @@ class SyntheticErrorsGenerator:
new_line = line.strip()
new_line = new_line[0].capitalize() + new_line[1:]
new_line_with_error = self.introduce_error(new_line)
input.write(new_line + "\n")
output.write(new_line_with_error + "\n")
input.write(self.remove_unused_whitespaces(new_line).strip() + "\n")
output.write(self.remove_unused_whitespaces(new_line_with_error).strip() + "\n")
synthetic_errors_generator = SyntheticErrorsGenerator()
synthetic_errors_generator.generate_synthetic_errors_from_file(sys.argv[1], 'input.txt', 'output.txt')
synthetic_errors_generator.generate_synthetic_errors_from_file(sys.argv[1], sys.argv[2], sys.argv[3])
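The new substitution branch leans on Morfeusz for lemmatisation and form generation. Below is a hedged sketch of that round-trip, mirroring the analyse/generate calls used in the diff above; the example word is illustrative and the tuple layout follows the morfeusz2 bindings as exercised by this script.

# Hedged sketch of the Morfeusz analyse/generate round-trip used by the
# substitution branch; the example word is illustrative.
import random
import morfeusz2

morf = morfeusz2.Morfeusz()

word = "kotem"                              # illustrative inflected token
analyses = morf.analyse(word)               # [(start, end, (orth, lemma, tag, ...)), ...]
lemma = analyses[0][2][1].split(":")[0]     # drop a homonym marker such as "zamek:s1"
forms = morf.generate(lemma)                # inflected forms of the lemma
if forms:
    replacement = random.choice(forms)[0]   # first tuple element is the surface form
    print(word, "->", replacement)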


@@ -5,7 +5,7 @@ apt upgrade -y
apt-get install python3 python3-pip python3-venv unzip gawk screen python-dev libhunspell-dev -y
python3 -m venv env
source ./env/bin/activate
pip install pandas spacy regex scipy hunspell
pip install pandas spacy regex scipy wheel hunspell
python -m spacy download pl_core_news_lg
mkdir data && cd data

7574
pl.aff

File diff suppressed because it is too large.

343393
pl.dic

File diff suppressed because it is too large.

22
preprocess_dataset.py Normal file

@@ -0,0 +1,22 @@
from sklearn.model_selection import train_test_split
import pandas as pd
def read_input_file(input_filename):
with open(input_filename, encoding="utf-8", mode='r') as input:
yield from input
def save_file(input, file_name):
with open(file_name, encoding="utf-8", mode='w') as file:
for line in input:
file.write(line)
X_train, X_test, y_train, y_test = train_test_split(list(read_input_file('./train.er')), list(read_input_file('./train.co')), test_size=0.001, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.001, random_state=1)
save_file(X_train, "./data/train.er")
save_file(y_train, "./data/train.co")
save_file(X_test, "./data/test.er")
save_file(y_test, "./data/test.co")
save_file(X_val, "./data/dev.er")
save_file(y_val, "./data/dev.co")
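A back-of-the-envelope check of the split proportions above (the corpus size here is hypothetical): test_size=0.001 is applied twice, so almost the entire corpus stays in train.

# Hypothetical corpus size, used only to illustrate the two 0.001 splits above.
total = 1_000_000
test = round(total * 0.001)             # 1,000 pairs held out for test
dev = round((total - test) * 0.001)     # 999 pairs of the remainder become dev
train = total - test - dev              # 998,001 pairs remain for training
print(train, dev, test)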

20
preprocess_plewi.py Normal file

@@ -0,0 +1,20 @@
import regex as re
filenames = ['test.co', 'test.er', 'train.co', 'train.er', 'tune.co', 'tune.er']
output_filenames = ['./plewi_co.txt',
'./plewi_er.txt',
'./plewi_co.txt',
'./plewi_er.txt',
'./plewi_co.txt',
'./plewi_er.txt']
for idx, filename in enumerate(filenames):
with open('./plewic/' + filename, encoding="utf-8", mode='r') as f:
with open(output_filenames[idx], encoding="utf-8", mode='w') as f2:
for line in f.readlines():
new_line = line.replace("\n", "").replace("\t", " ")
if re.match(r"^\!\s\'.*\'$", new_line):
new_line = new_line[3:len(new_line)-1]
elif re.match(r"^\!\s\".*\"$", new_line):
new_line = new_line[3:len(new_line)-1]
f2.write(new_line.strip() + "\n")
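A hedged illustration of the wrapper-stripping above: PlEWi lines of the form ! 'text' (or the double-quoted variant) keep only the inner text. The example sentence is made up.

# Illustrative only: shows what the regex branch above does to a quoted line.
import regex as re

line = "! 'Przykładowe zdanie.'"
if re.match(r"^\!\s\'.*\'$", line):
    line = line[3:len(line) - 1]
print(line)    # -> Przykładowe zdanie.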

BIN
train.co.xz Normal file

Binary file not shown.

BIN
train.er.xz Normal file

Binary file not shown.