retro-gap/train.py

39 lines
1.3 KiB
Python
Raw Normal View History

2020-12-08 12:01:14 +01:00
#!/usr/bin/python3
import pickle
import sys
from collections import Counter

import regex as re
def into_words(sentence):
    """Split *sentence* into lowercase word and punctuation tokens.

    The sentence is lowercased first. Every single Unicode punctuation
    character (``\\p{P}``) becomes its own token; every maximal run of
    characters that are neither punctuation nor whitespace becomes one
    token. Whitespace is discarded.

    Note: ``re`` in this file is the third-party ``regex`` module, which is
    required for the ``\\p{P}`` Unicode property class.

    Returns:
        A list of token strings in order of appearance.
    """
    return re.findall(r'\p{P}|[^\p{P}\s]+', sentence.lower())
def main():
    """Count unigrams and bigrams read from stdin and pickle the top ones.

    Every stdin line is split on tab characters and the field at index 4
    (the fifth column) is tokenized with ``into_words``. All n-grams of
    order 1 and 2 are counted as tuples of tokens. After the input is
    exhausted, only the 1000 most frequent unigrams and the 100000 most
    frequent bigrams are kept, and the resulting ``{1: {...}, 2: {...}}``
    dictionary (n-gram tuple -> count) is pickled to ``ngrams_2.pkl`` in
    the current working directory.

    A progress message is printed to stdout for every 1000th input line.

    Raises:
        IndexError: if an input line has fewer than five tab-separated
            fields.
    """
    lowest_ngram = 1
    highest_ngram = 2
    # Number of most frequent n-grams to keep for each n-gram order.
    keep_top = {1: 1000, 2: 100000}
    counts = {n: Counter() for n in range(lowest_ngram, highest_ngram + 1)}

    # NOTE(review): the progress report is assumed to be per input line;
    # the original indentation was not available to confirm this.
    for counter, line in enumerate(sys.stdin):
        text = line.split('\t')[4]  # fifth tab-separated column
        tokens = into_words(text)
        for n, ngram_counts in counts.items():
            # Slide a window of width n over the tokens; a line shorter
            # than n tokens yields an empty range and contributes nothing.
            ngram_counts.update(
                tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)
            )
        if counter % 1000 == 0:
            print('counter = ', counter)

    # most_common(k) orders by descending count and keeps first-seen order
    # among ties, matching a stable sorted(..., reverse=True)[:k].
    ngrams = {
        n: dict(ngram_counts.most_common(keep_top[n]))
        for n, ngram_counts in counts.items()
    }

    # Use a context manager so the output file is flushed and closed.
    with open('ngrams_2.pkl', 'wb') as output:
        pickle.dump(ngrams, output)
# Run the n-gram extraction only when executed as a script, not on import.
if __name__ == '__main__':
    main()