# Python script, 64 lines, 1.3 KiB
|
|
||
|
|
||
|
|
||
|
import random
|
||
|
import string
|
||
|
import numpy as np
|
||
|
|
||
|
|
||
|
# Biased binary source: 100 000 symbols with P('0') = 0.1 and P('1') = 0.9,
# written to '100k_dwupunkt_p_09'.
# A single random.choices(..., k=N) call draws every symbol at once instead of
# one call (and one rebuild of the cumulative weights) per symbol.
text = ''.join(random.choices(['0', '1'], weights=[0.1, 0.9], k=100000))
with open('100k_dwupunkt_p_09', 'w') as f:
    f.write(text)
|
||
|
|
||
|
# ==========
|
||
|
|
||
|
# Fair binary source: 100 000 symbols, each picked uniformly from '0' and '1',
# written to '100k_dwupunkt_p_05'.
symbols = (random.choice('01') for _ in range(100000))
text = ''.join(symbols)
with open('100k_dwupunkt_p_05', 'w') as f:
    f.write(text)
|
||
|
|
||
|
# ==========
|
||
|
|
||
|
# Geometric source: 100 000 symbols whose index into `vocab` is drawn from a
# geometric distribution with success probability 0.3, written to
# '100k_gemotric'.
vocab = list(string.ascii_letters + string.digits + ' ')
vocab_len = len(vocab)

# Draw every sample in one vectorised call instead of 100 000 size-1 calls.
samples = np.random.geometric(p=0.3, size=100000)

# Guard against indexing past the alphabet. The bound is derived from the
# alphabet itself rather than a hard-coded number, so it stays correct if
# `vocab` changes.
out_of_range = samples[samples >= vocab_len]
if out_of_range.size:
    raise Exception('Value is: ', out_of_range[0])

# NOTE(review): geometric samples start at 1, so vocab[0] is never emitted.
# Kept as in the original mapping -- confirm whether vocab[r - 1] was intended.
text = ''.join(vocab[r] for r in samples)

with open('100k_gemotric', 'w') as f:
    f.write(text)
|
||
|
|
||
|
|
||
|
# ==========
|
||
|
|
||
|
# Uniform source: 100 000 symbols drawn with equal probability from ASCII
# letters, digits and the space character, written to '100k_jednostajny'.
# The alphabet is built once here instead of being re-concatenated per draw.
alphabet = string.ascii_letters + string.digits + ' '
text = ''.join(random.choice(alphabet) for _ in range(100000))

with open('100k_jednostajny', 'w') as f:
    f.write(text)
|
||
|
# ==========
|
||
|
|
||
|
|
||
|
|
||
|
# Corpus sample: copy the beginning of 'News-Commentary-v16' into
# '100k_corpus'. Trailing whitespace is stripped from every line and each
# line is written followed by '\n'. Copying stops once `limit` characters
# (line breaks included) have been written; a line that crosses the limit is
# cut at the limit and terminated with a final '\n'.
limit = 99999
counter = 0  # characters written so far, counting one per line break

# Both files are managed by `with`, so the output handle is always closed
# (the output is still opened first, as before).
with open('100k_corpus', 'w') as f, open('News-Commentary-v16', 'r') as file:
    for line in file:
        line = line.rstrip()

        remaining = limit - counter
        if len(line) >= remaining:
            # This line reaches the limit: keep only the part that fits.
            f.write(line[:remaining] + '\n')
            counter = limit
            break

        f.write(line + '\n')
        counter += len(line) + 1

        # The limit can be hit exactly by the line break; stop here, otherwise
        # the counter would step past the limit and the whole corpus would be
        # copied.
        if counter == limit:
            break
|
||
|
|