64 lines
1.3 KiB
Python
64 lines
1.3 KiB
Python
|
|
|
|
|
|
import random
|
|
import string
|
|
import numpy as np
|
|
|
|
|
|
# Biased binary source: 100000 symbols with weight 0.1 for '0' and 0.9 for '1'.
# One random.choices call with k= draws the whole sample; there is no need to
# build and index a one-element list for every symbol.
text = ''.join(random.choices(['0', '1'], weights=[0.1, 0.9], k=100000))
with open('100k_dwupunkt_p_09', 'w') as f:
    f.write(text)
|
|
|
|
# ==========
|
|
|
|
# Fair binary source: each of the 100000 symbols is picked uniformly
# from '0' and '1', then the whole sequence is written out in one go.
text = ''.join(
    random.choice('01')
    for _ in range(100000)
)
with open('100k_dwupunkt_p_05', mode='w') as out_file:
    out_file.write(text)
|
|
|
|
# ==========
|
|
|
|
# Geometric source over the alphabet of ASCII letters, digits and space.
vocab = list(string.ascii_letters + string.digits + ' ')
vocab_len = len(vocab)

# Draw all 100000 samples in one vectorized call.
# np.random.geometric returns trial counts in {1, 2, ...}, so a sample r maps
# to vocab[r - 1]; indexing with r directly would never emit vocab[0].
samples = np.random.geometric(p=0.3, size=100000)

# The bound is derived from the alphabet size rather than a hand-computed
# constant, so it stays correct if the alphabet changes.
largest = int(samples.max())
if largest > vocab_len:
    raise ValueError(f'Value is: {largest}')

text = ''.join([vocab[r - 1] for r in samples.tolist()])

# The output file name is kept exactly as it was so that anything opening it
# by this name keeps working.
with open('100k_gemotric', 'w') as f:
    f.write(text)
|
|
|
|
|
|
# ==========
|
|
|
|
# Uniform source over ASCII letters, digits and space.
# The alphabet is built once (it was re-concatenated for every character) and
# the whole sample is drawn with a single random.choices call.
alphabet = string.ascii_letters + string.digits + ' '
text = ''.join(random.choices(alphabet, k=100000))

with open('100k_jednostajny', 'w') as f:
    f.write(text)
|
|
# ==========
|
|
|
|
|
|
|
|
# Copy a prefix of 'News-Commentary-v16' into '100k_corpus'.
#
# Each input line is stripped of trailing whitespace and written back followed
# by '\n'. `counter` tracks the characters written so far, counting one newline
# per line. Copying stops once 99999 characters have been counted: the line
# that crosses the limit is cut and terminated with a final '\n', giving
# 100000 characters in total.
#
# Both files are managed by the `with` statement, so the output is flushed and
# closed even if reading fails part-way. The output is opened first, as before,
# so it is created/truncated even when the input is missing.
# NOTE(review): both files use the platform default encoding - confirm that it
# matches the corpus.
counter = 0
with open('100k_corpus', 'w') as f, open('News-Commentary-v16', 'r') as file:
    for line in file:
        line = line.rstrip()

        # Characters still allowed before the closing newline. Comparing the
        # remaining room (instead of testing `counter == 99999`) cannot step
        # over the limit when it is reached exactly on a line's newline.
        room = 99999 - counter
        if len(line) >= room:
            if room:
                f.write(line[:room] + '\n')
            break

        f.write(line + '\n')
        counter += len(line) + 1
|
|
|