moj_zad_3/generate_100k_corpus.py

64 lines
1.3 KiB
Python

import random
import string
import numpy as np
# Biased two-symbol source: '1' is drawn with weight 0.9 and '0' with
# weight 0.1. All 100000 symbols are requested from random.choices in a
# single call (k=100000) and joined into one string.
biased_symbols = random.choices(['0', '1'], weights=[0.1, 0.9], k=100000)
text = ''.join(biased_symbols)
with open('100k_dwupunkt_p_09', 'w') as out_file:
    out_file.write(text)
# ==========
# Fair two-symbol source: each of the 100000 characters is an independent
# random.choice between '0' and '1'.
binary_alphabet = '01'
text = ''.join(random.choice(binary_alphabet) for _ in range(100000))
with open('100k_dwupunkt_p_05', 'w') as out_file:
    out_file.write(text)
# ==========
# Geometric source: 100000 symbols whose index into `vocab` follows a
# geometric distribution with success probability p=0.3.
vocab = list(string.ascii_letters + string.digits + ' ')
vocab_len = len(vocab)
# Draw every index in one vectorised call instead of 100000 size-1 calls.
draws = np.random.geometric(p=0.3, size=100000)
# NOTE(review): geometric draws start at 1, so vocab[0] is never emitted,
# and the bound below rejects index 62 although vocab has 63 entries.
# Confirm whether `vocab[r - 1]` / `r >= vocab_len` was intended; the
# original mapping and bound are kept here so the output is unchanged.
out_of_range = draws[draws > 61]
if out_of_range.size:
    # Abort before anything is written, reporting the first offending draw.
    raise Exception(f'Value is: {out_of_range[0]}')
# Build the string with a single join rather than repeated `+=`.
text = ''.join([vocab[r] for r in draws])
with open('100k_gemotric', 'w') as f:
    f.write(text)
# ==========
# Uniform source: each of the 100000 characters is an independent
# random.choice over ASCII letters, digits and the space character.
uniform_alphabet = string.ascii_letters + string.digits + ' '
text = ''.join(random.choice(uniform_alphabet) for _ in range(100000))
with open('100k_jednostajny', 'w') as out_file:
    out_file.write(text)
# ==========
# Copy the beginning of 'News-Commentary-v16' into '100k_corpus'. Each input
# line is right-stripped and written followed by '\n'; copying stops once
# 100000 characters (newlines included) have been written, truncating the
# last line so that the final character is the newline.
counter = 0  # characters written so far, newlines included
# Both files are managed by `with`, so the output is always closed, and the
# input is opened first so a missing corpus does not leave an empty output.
# NOTE(review): both files use the platform default encoding, as before;
# confirm whether an explicit encoding is needed for the corpus.
with open('News-Commentary-v16', 'r') as file, open('100k_corpus', 'w') as f:
    for line in file:
        line = line.rstrip()
        # Characters of this line that still fit while leaving one slot for
        # the trailing newline. Comparing lengths (rather than testing the
        # counter for equality with 99999 after each increment) cannot step
        # over the limit when the previous newline already brought the
        # counter to 99999.
        room = 99999 - counter
        if len(line) >= room:
            f.write(line[:room] + '\n')
            counter += room + 1
            break
        f.write(line + '\n')
        counter += len(line) + 1