retro-gap/train.py
Michal Maciaszek 07afcec739 improved
2020-12-08 14:14:52 +01:00

37 lines
1.2 KiB
Python

#!/usr/bin/python3
import sys
import regex as re
import pickle
def into_words(sentence):
    """Split *sentence* into lower-cased tokens.

    Every punctuation character (Unicode category ``P``) becomes a token of
    its own; every other maximal run of characters that are neither
    punctuation nor whitespace becomes one token. Whitespace is dropped.

    Note: ``re`` in this file is the third-party ``regex`` package, which is
    what provides the ``\\p{P}`` property class used in the pattern.
    """
    lowered = sentence.lower()
    return re.findall(r'\p{P}|[^\p{P}\s]+', lowered)
def main():
    """Count unigrams and bigrams from TSV on stdin and pickle the top ones.

    Reads tab-separated lines from standard input, tokenizes the fifth
    column (index 4) with ``into_words`` and counts every 1-gram and 2-gram,
    each represented as a tuple of tokens. The 1000 most frequent unigrams
    and the 100000 most frequent bigrams are kept and written to
    ``ngrams_2.pkl`` as ``{1: {ngram: count}, 2: {ngram: count}}``.

    Progress is printed to stdout every 1000 input lines. Lines with fewer
    than five columns are skipped (and reported on stderr) rather than
    aborting the whole run with an ``IndexError``.
    """
    lowest_ngram = 1
    highest_ngram = 2
    text_column = 4  # zero-based index of the column holding the text
    # How many of the most frequent n-grams to keep for each order n.
    keep_top = {1: 1000, 2: 100000}

    ngrams = {n: {} for n in range(lowest_ngram, highest_ngram + 1)}
    skipped = 0

    for counter, line in enumerate(sys.stdin):
        if counter % 1000 == 0:
            print('counter = ', counter)

        columns = line.split('\t')
        if len(columns) <= text_column:
            # Malformed or blank row: no text column to train on.
            skipped += 1
            continue

        tokens = into_words(columns[text_column])
        number_of_tokens = len(tokens)
        for n, counts in ngrams.items():
            # Slide a window of width n over the tokens; when the line has
            # fewer than n tokens the range is empty and nothing is counted.
            for i in range(number_of_tokens - n + 1):
                ngram = tuple(tokens[i:i + n])
                counts[ngram] = counts.get(ngram, 0) + 1

    if skipped:
        print('skipped', skipped, 'line(s) with fewer than',
              text_column + 1, 'columns', file=sys.stderr)

    # Keep only the most frequent entries. sorted() is stable, so n-grams
    # with equal counts stay in first-seen order.
    top_ngrams = {
        n: dict(sorted(counts.items(),
                       key=lambda item: item[1],
                       reverse=True)[:keep_top[n]])
        for n, counts in ngrams.items()
    }

    # Use a context manager so the file is flushed and closed deterministically.
    with open('ngrams_2.pkl', 'wb') as output:
        pickle.dump(top_ngrams, output)
# Run the training only when executed as a script, not when imported.
if __name__ == "__main__":
    main()