"""Generate 100 000-character sample text files from several symbol sources.

Running this script writes five files into the current working directory:

* ``100k_dwupunkt_p_09`` - symbols '0'/'1' drawn with weights 0.1 / 0.9
* ``100k_dwupunkt_p_05`` - symbols '0'/'1' drawn with equal probability
* ``100k_gemotric``      - vocabulary symbols picked by geometric(p=0.3) draws
* ``100k_jednostajny``   - vocabulary symbols drawn uniformly
* ``100k_corpus``        - the leading characters of ``News-Commentary-v16``

The vocabulary is ASCII letters, digits and the space character.
"""

import random
import string

import numpy as np

SAMPLE_COUNT = 100000  # characters written to each output file
GEOMETRIC_P = 0.3      # success probability of the geometric source


def _write_text(path, text):
    """Write *text* to *path*, replacing any existing file."""
    with open(path, 'w', encoding='utf-8') as out:
        out.write(text)


def _copy_prefix(src, dst, limit):
    """Copy the start of *src* to *dst*, writing at most *limit* characters.

    Every input line is right-stripped and written followed by a single
    ``'\\n'``.  The line that would exceed the budget is truncated so that,
    together with its newline, the total is exactly *limit* characters.

    Args:
        src: Iterable of text lines (e.g. an open text file).
        dst: Object with a ``write(str)`` method.
        limit: Maximum number of characters to write.

    Returns:
        The number of characters written (less than *limit* only when the
        input runs out first).
    """
    if limit < 1:
        return 0
    written = 0
    for raw_line in src:
        line = raw_line.rstrip()
        # One character of the budget is always reserved for the newline.
        room = limit - written - 1
        if len(line) >= room:
            dst.write(line[:room] + '\n')
            written += room + 1
            break
        dst.write(line + '\n')
        written += len(line) + 1
    return written


# ---------- two-symbol source, P('1') = 0.9 ----------
text = ''.join(random.choices(['0', '1'], weights=[0.1, 0.9], k=SAMPLE_COUNT))
_write_text('100k_dwupunkt_p_09', text)

# ---------- two-symbol source, equal probabilities ----------
text = ''.join(random.choices('01', k=SAMPLE_COUNT))
_write_text('100k_dwupunkt_p_05', text)

# ---------- geometric source over the vocabulary ----------
vocab = list(string.ascii_letters + string.digits + ' ')
vocab_len = len(vocab)

# np.random.geometric returns values starting at 1, so a draw k maps to
# vocab[k - 1]; this keeps vocab[0] reachable and makes every draw up to
# vocab_len a valid index.
draws = np.random.geometric(p=GEOMETRIC_P, size=SAMPLE_COUNT)
out_of_range = draws[draws > vocab_len]
if out_of_range.size:
    # Fail before anything is written rather than emit a truncated file.
    raise Exception('Value is: ', out_of_range[0])
text = ''.join(vocab[k - 1] for k in draws)
# NOTE: the file name spelling ('gemotric') is kept as-is on purpose so the
# output path does not change.
_write_text('100k_gemotric', text)

# ---------- uniform source over the vocabulary ----------
text = ''.join(random.choices(vocab, k=SAMPLE_COUNT))
_write_text('100k_jednostajny', text)

# ---------- natural-language corpus prefix ----------
# The source is opened first so that a missing corpus does not leave an empty
# output file behind, and both handles are closed by the context managers.
# NOTE(review): assumes the corpus file is UTF-8 encoded - confirm.
with open('News-Commentary-v16', 'r', encoding='utf-8') as corpus:
    with open('100k_corpus', 'w', encoding='utf-8') as f:
        counter = _copy_prefix(corpus, f, SAMPLE_COUNT)