add todos

This commit is contained in:
jakubknczny 2021-06-21 20:29:44 +02:00
parent bcb0975e8c
commit 500b8eb019

View File

@ -5,6 +5,7 @@ import europarl
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
# TODO: add a parameter for the probability of injection
def inject_translations(corpus, dictionary): def inject_translations(corpus, dictionary):
llist = [] llist = []
corpus = strip_lower(corpus) corpus = strip_lower(corpus)
@ -51,10 +52,7 @@ def space_wrap(word):
return ' ' + word + ' ' return ' ' + word + ' '
mark_start = 'ssss '
mark_end = ' eeee'
language_code = 'pl' language_code = 'pl'
europarl.maybe_download_and_extract(language_code=language_code) europarl.maybe_download_and_extract(language_code=language_code)
data_src = europarl.load_data(english=True, language_code=language_code) data_src = europarl.load_data(english=True, language_code=language_code)
data_dest = europarl.load_data(english=False, data_dest = europarl.load_data(english=False,
@ -63,6 +61,7 @@ data_dest = europarl.load_data(english=False,
test_size = 0.25 test_size = 0.25
df_dict = pd.read_csv('kompendium.tsv', sep='\t', header=None, index_col=0) df_dict = pd.read_csv('kompendium.tsv', sep='\t', header=None, index_col=0)
# TODO: divide the dictionary into train/test subsets using hashing
dtr, dts = train_test_split(df_dict, test_size=test_size, shuffle=True, random_state=42) dtr, dts = train_test_split(df_dict, test_size=test_size, shuffle=True, random_state=42)
print('dictionary len: ', len(df_dict)) print('dictionary len: ', len(df_dict))
print('train dictionary len: ', len(dtr)) print('train dictionary len: ', len(dtr))
@ -83,6 +82,7 @@ for row in dict_reader_ts:
k, v = row k, v = row
dictionary_test[k] = v dictionary_test[k] = v
# TODO: divide the data into train/test subsets using hashing
data_src_train, data_src_test, data_dest_train, data_dest_test = \ data_src_train, data_src_test, data_dest_train, data_dest_test = \
train_test_split(data_src, data_dest, test_size=test_size, random_state=42) train_test_split(data_src, data_dest, test_size=test_size, random_state=42)