Add synthetic data generator
This commit is contained in:
commit d4d77d4f17
0    __init__.py    Normal file
168  generate_errors.py    Normal file
@@ -0,0 +1,168 @@
import random
import hunspell
import spacy
from tokenizer import Tokenizer
from scipy.stats import norm
import pandas as pd
import regex as re
import glob
import threading

# pip install hunspell spacy
# python -m spacy download pl_core_news_lg
# sed -ri '/^\s*$/d' ./datasets_original/oscar/oscar_pl_test.txt
# unzip -p oscar_filtered.zip | sed -r '/^\s*$/d' | gawk 'NF>6' > oscar_filtered.txt

class SyntheticErrorsGenerator:
    def __init__(self):
        self.substitution_prob = 0.7
        self.remain_prob = 0.3
        self.input_dataframe = pd.DataFrame([], columns=['sentence'])
        self.output_dataframe = pd.DataFrame([], columns=['sentence'])
        spacy.load('pl_core_news_lg')
        self.spellchecker = hunspell.HunSpell('./pl.dic', './pl.aff')
        self.tokenizer = Tokenizer()

    def read_input_file(self, input_filename):
        with open(input_filename, encoding="utf-8", mode='r') as input:
            yield from input.readlines()

    def delete_character(self, str, idx):
        # Remove the character at position idx.
        return str[:idx] + str[idx+1:]

    def delete(self, tokens, idx):
        # Remove the token at position idx.
        tokens.pop(idx)
        return tokens

    def swap_characters(self, str, idx):
        # Swap the character at idx with the next one, or with the previous one
        # when idx is the last position in the word.
        strlst = list(str)
        if not (len(str) - 1) == idx:
            strlst[idx], strlst[idx+1] = strlst[idx+1], strlst[idx]
            return "".join(strlst)
        else:
            strlst[idx-1], strlst[idx] = strlst[idx], strlst[idx-1]
            return "".join(strlst)

    def spelling_error(self, tokens):
        # Operates on a single word: swaps one Polish diacritic for a common misspelling.
        errors_matrix = {
            'ą': 'a',
            'ć': 'c',
            'ę': 'e',
            'ł': 'l',
            'ń': 'n',
            'ó': 'u',
            'u': 'ó',
            'ś': 's',
            'ź': 'z',
            'ż': 'z'
        }

        letters_existing_in_word = []
        for letter, _ in errors_matrix.items():
            if letter in tokens:
                letters_existing_in_word.append(letter)

        if len(letters_existing_in_word) > 0:
            letter_to_replace = random.choice(letters_existing_in_word)
            tokens = tokens.replace(letter_to_replace, errors_matrix[letter_to_replace])

        return tokens

    def swap(self, tokens, idx):
        # Swap the token at idx with the next one, or with the previous one
        # when idx is the last position in the sentence.
        if not (len(tokens) - 1) == idx:
            tokens[idx], tokens[idx + 1] = tokens[idx + 1], tokens[idx]
        else:
            tokens[idx - 1], tokens[idx] = tokens[idx], tokens[idx - 1]
        return tokens

    def add_random(self, tokens, idx):
        # Insert one of the top hunspell suggestions for the token right after it.
        confusion_set = self.spellchecker.suggest(tokens[idx])[:3]
        if len(confusion_set) > 1:
            word_to_replace = random.sample(confusion_set, 1)[0]
            tokens.insert(idx + 1, word_to_replace)
        return tokens

    def add_random_character(self, str, idx):
        # Insert a lowercased hunspell suggestion for the character at idx before that position.
        confusion_set = self.spellchecker.suggest(str[idx])[:3]
        if len(confusion_set) > 1:
            char_to_replace = random.sample(confusion_set, 1)[0].lower()
            return str[:idx] + char_to_replace + str[idx:]
        return str

    def substitute_delete_add(self, tokens, token_idx, operation):
        # Dispatch: word-level operations receive the token list, character-level
        # operations (and SPELLING_ERROR) receive a single word.
        if operation == 'DELETE':
            return self.delete(tokens, token_idx)
        elif operation == 'SWAP':
            return self.swap(tokens, token_idx)
        elif operation == 'ADD_RANDOM':
            return self.add_random(tokens, token_idx)
        elif operation == 'SWAP_CHARACTERS':
            return self.swap_characters(tokens, token_idx)
        elif operation == 'DELETE_CHARACTER':
            return self.delete_character(tokens, token_idx)
        elif operation == 'ADD_RANDOM_CHARACTER':
            return self.add_random_character(tokens, token_idx)
        elif operation == 'SPELLING_ERROR':
            return self.spelling_error(tokens)

    def introduce_error(self, line):
        tokens = self.tokenizer.tokenize(line)
        # norm.mean(0.15, 0.2) and norm.std(0.15, 0.2) are the constants 0.15 and 0.2,
        # so this selects roughly 3% of the line's character length.
        num_words_to_change = round(abs(norm.mean(0.15, 0.2) * norm.std(0.15, 0.2)) * len(line))
        if num_words_to_change > len(set(tokens)):
            num_words_to_change = 1
        words_to_change = random.sample(list(set(tokens)), num_words_to_change)
        num_words_to_change_letters = round(len(tokens) * 0.1)

        # Character-level errors are applied only to words not already picked for word-level errors;
        # clamp so random.sample never asks for more indices than are available.
        words_for_spelling_errors = [tokens.index(word) for word in tokens if word not in words_to_change]
        num_words_to_change_letters = min(num_words_to_change_letters, len(words_for_spelling_errors))
        for idx in random.sample(words_for_spelling_errors, num_words_to_change_letters):
            word = tokens[idx]
            if word.isalnum():
                random_number = random.random()
                random_operation = random.sample(['DELETE_CHARACTER', 'SWAP_CHARACTERS', 'ADD_RANDOM_CHARACTER', 'SPELLING_ERROR'], 1)[0]
                random_idx = random.sample(range(0, len(word)), 1)[0]
                tokens[idx] = self.substitute_delete_add(word, random_idx, random_operation)

        for word_to_change in words_to_change:
            idx = tokens.index(word_to_change)
            random_number = random.random()
            random_operation = random.sample(['DELETE', 'SWAP', 'ADD_RANDOM'], 1)[0]
            if random_number < self.remain_prob:
                tokens = self.substitute_delete_add(tokens, idx, random_operation)
            elif random_number < self.substitution_prob:
                # Replace the word with one of its top hunspell suggestions (never the word itself).
                word_to_replace = ''
                confusion_set = self.spellchecker.suggest(word_to_change)[:3]
                if len(confusion_set) > 1:
                    word_to_replace = random.sample(confusion_set, 1)[0]
                    while word_to_replace == word_to_change:
                        word_to_replace = random.sample(confusion_set, 1)[0]
                    tokens[idx] = word_to_replace
            else:
                tokens = self.substitute_delete_add(tokens, idx, random_operation)

        return ' '.join(tokens)

    def generate_synthetic_errors_from_folder(self, folder_path):
        # Process up to 11 files from the folder, one worker thread per file.
        for idx, path in enumerate(glob.glob(folder_path)[:11]):
            t = threading.Thread(target=self.generate_synthetic_errors_from_file, args=(path, f'./datasets_original/oscar/splitted_oscar/input{idx}.txt', f'./datasets_original/oscar/splitted_oscar/output{idx}.txt'))
            t.start()

    def generate_synthetic_errors_from_file(self, source_filename, input_filename, output_filename):
        # For every sufficiently long line, write the clean sentence to input_filename
        # and its corrupted counterpart to output_filename.
        with open(input_filename, encoding="utf-8", mode="w") as input:
            with open(output_filename, encoding="utf-8", mode="w") as output:
                for line in self.read_input_file(source_filename):
                    if len(line.split()) > 7:
                        new_line = line.strip()
                        new_line = new_line[0].capitalize() + new_line[1:]
                        new_line_with_error = self.introduce_error(new_line)
                        input.write(new_line + "\n")
                        output.write(new_line_with_error + "\n")


synthetic_errors_generator = SyntheticErrorsGenerator()
synthetic_errors_generator.generate_synthetic_errors_from_file('./datasets_original/oscar/splitted_oscar/output_fileaa', './datasets_original/oscar/splitted_oscar/results/input1.txt', './datasets_original/oscar/splitted_oscar/results/output1.txt')

# synthetic_errors_generator.generate_synthetic_errors_from_folder(folder_path='./datasets_original/oscar/splitted_oscar/*')
# synthetic_errors_generator.generate_synthetic_errors('./datasets_original/oscar/splitted_oscar/output_fileaa', './datasets_original/oscar/splitted_oscar/input1.txt', './datasets_original/oscar/splitted_oscar/output1.txt')
# synthetic_errors_generator.generate_synthetic_errors('./datasets_original/lektury/input2_lektury.txt', './datasets_original/lektury/input2.txt', './datasets_original/lektury/output2.txt')
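A minimal, illustrative sketch of how SyntheticErrorsGenerator might be exercised on a tiny sample file. The sample_* file names are placeholders rather than paths from this commit; it assumes ./pl.dic, ./pl.aff and the pl_core_news_lg model are available as set up by install.sh below, and that the module-level call at the bottom of generate_errors.py is commented out so the import has no side effects.

# Hypothetical smoke test; sample_* paths are illustrative placeholders.
from generate_errors import SyntheticErrorsGenerator

generator = SyntheticErrorsGenerator()

# One clean Polish sentence per line; lines with 7 or fewer words are skipped.
with open('sample_pl.txt', encoding='utf-8', mode='w') as sample:
    sample.write('To jest przykładowe zdanie, które zawiera zdecydowanie więcej niż siedem słów.\n')

# Writes the clean line to sample_input.txt and its corrupted counterpart to sample_output.txt.
generator.generate_synthetic_errors_from_file('sample_pl.txt', 'sample_input.txt', 'sample_output.txt')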
14  install.sh  Normal file
@@ -0,0 +1,14 @@
#!/bin/bash
apt update -y
apt upgrade -y

apt-get install -y python3 python3-pip python3-venv unzip gawk coreutils  # split is provided by coreutils
python3 -m venv env
source ./env/bin/activate
pip install -r requirements.txt
python -m spacy download pl_core_news_lg

mkdir data && cd data
wget https://minio.clarin-pl.eu/ermlab/public/PoLitBert/corpus-oscar/corpus_oscar_2020-04-10_64M_lines.zip
unzip -p corpus_oscar_2020-04-10_64M_lines.zip | sed -r '/^\s*$/d' | gawk 'NF>6' > oscar_filtered.txt
split -l 1000000 --numeric-suffixes=1 --suffix-length=1 --additional-suffix=".txt" oscar_filtered.txt ""
77  requirements.txt  Normal file
@@ -0,0 +1,77 @@
alabaster==0.7.12
arrow==1.2.2
async-generator==1.10
attrs==21.4.0
Babel==2.9.1
beautifulsoup4==4.11.1
bibtexparser==1.2.0
blis==0.7.7
catalogue==2.0.7
certifi==2021.10.8
charset-normalizer==2.0.12
click==8.0.4
cymem==2.0.6
Deprecated==1.2.13
docutils==0.17.1
fake-useragent==0.1.11
free-proxy==1.0.6
future==0.18.2
h11==0.13.0
hunspell==0.5.5
idna==3.3
imagesize==1.3.0
importlib-metadata==4.11.3
Jinja2==3.1.1
langcodes==3.3.0
lxml==4.8.0
MarkupSafe==2.1.1
morfeusz2==1.99.4
murmurhash==1.0.6
numpy==1.22.3
outcome==1.1.0
packaging==21.3
pathy==0.6.1
pl-core-news-lg==3.2.0
preshed==3.0.6
pydantic==1.8.2
Pygments==2.11.2
pyMorfologik==0.2.2
pyparsing==3.0.7
PySocks==1.7.1
python-dateutil==2.8.2
python-dotenv==0.20.0
pytz==2022.1
regex==2022.3.15
requests==2.27.1
scholarly==1.6.0
scipy==1.8.0
selenium==4.1.3
six==1.16.0
smart-open==5.2.1
sniffio==1.2.0
snowballstemmer==2.2.0
sortedcontainers==2.4.0
soupsieve==2.3.2
spacy==3.2.4
spacy-legacy==3.0.9
spacy-loggers==1.0.2
Sphinx==4.5.0
sphinx-rtd-theme==1.0.0
sphinxcontrib-applehelp==1.0.2
sphinxcontrib-devhelp==1.0.2
sphinxcontrib-htmlhelp==2.0.0
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==1.0.3
sphinxcontrib-serializinghtml==1.1.5
srsly==2.4.2
thinc==8.0.15
tqdm==4.64.0
trio==0.20.0
trio-websocket==0.9.2
typer==0.4.1
typing-extensions==4.1.1
urllib3==1.26.9
wasabi==0.9.1
wrapt==1.14.0
wsproto==1.1.0
zipp==3.8.0
8  tokenizer.py  Normal file
@@ -0,0 +1,8 @@
import spacy

class Tokenizer:
    def __init__(self):
        self.polish_tokenizer = spacy.load('pl_core_news_lg')

    def tokenize(self, text):
        return [tok.text for tok in self.polish_tokenizer.tokenizer(text)]
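For reference, a small illustration of the Tokenizer wrapper, assuming pl_core_news_lg is installed; the example sentence is arbitrary.

# Illustrative only: tokenize a single Polish sentence with the spaCy-backed wrapper.
from tokenizer import Tokenizer

tokenizer = Tokenizer()
print(tokenizer.tokenize('Ala ma kota i psa.'))
# Expected: a list of token strings, e.g. ['Ala', 'ma', 'kota', 'i', 'psa', '.']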