add todos
This commit is contained in:
parent
bcb0975e8c
commit
500b8eb019
@ -5,6 +5,7 @@ import europarl
|
|||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: add a parameter for the probability of injection
|
||||||
def inject_translations(corpus, dictionary):
|
def inject_translations(corpus, dictionary):
|
||||||
llist = []
|
llist = []
|
||||||
corpus = strip_lower(corpus)
|
corpus = strip_lower(corpus)
|
||||||
@ -51,10 +52,7 @@ def space_wrap(word):
|
|||||||
return ' ' + word + ' '
|
return ' ' + word + ' '
|
||||||
|
|
||||||
|
|
||||||
mark_start = 'ssss '
|
|
||||||
mark_end = ' eeee'
|
|
||||||
language_code = 'pl'
|
language_code = 'pl'
|
||||||
|
|
||||||
europarl.maybe_download_and_extract(language_code=language_code)
|
europarl.maybe_download_and_extract(language_code=language_code)
|
||||||
data_src = europarl.load_data(english=True, language_code=language_code)
|
data_src = europarl.load_data(english=True, language_code=language_code)
|
||||||
data_dest = europarl.load_data(english=False,
|
data_dest = europarl.load_data(english=False,
|
||||||
@ -63,6 +61,7 @@ data_dest = europarl.load_data(english=False,
|
|||||||
test_size = 0.25
|
test_size = 0.25
|
||||||
|
|
||||||
df_dict = pd.read_csv('kompendium.tsv', sep='\t', header=None, index_col=0)
|
df_dict = pd.read_csv('kompendium.tsv', sep='\t', header=None, index_col=0)
|
||||||
|
# TODO: split the dictionary into train/test sets using hashing
|
||||||
dtr, dts = train_test_split(df_dict, test_size=test_size, shuffle=True, random_state=42)
|
dtr, dts = train_test_split(df_dict, test_size=test_size, shuffle=True, random_state=42)
|
||||||
print('dictionary len: ', len(df_dict))
|
print('dictionary len: ', len(df_dict))
|
||||||
print('train dictionary len: ', len(dtr))
|
print('train dictionary len: ', len(dtr))
|
||||||
@ -83,6 +82,7 @@ for row in dict_reader_ts:
|
|||||||
k, v = row
|
k, v = row
|
||||||
dictionary_test[k] = v
|
dictionary_test[k] = v
|
||||||
|
|
||||||
|
# TODO: split the data into train/test sets using hashing
|
||||||
data_src_train, data_src_test, data_dest_train, data_dest_test = \
|
data_src_train, data_src_test, data_dest_train, data_dest_test = \
|
||||||
train_test_split(data_src, data_dest, test_size=test_size, random_state=42)
|
train_test_split(data_src, data_dest, test_size=test_size, random_state=42)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user