#!/usr/bin/python3

import sys
import regex as re
import pickle


def into_words(sentence):
    # Lowercase the sentence and tokenize it: each punctuation character is a
    # separate token, and maximal runs of non-punctuation, non-space characters
    # are word tokens, e.g. "Hello, world!" -> ['hello', ',', 'world', '!'].
    return re.findall(r'\p{P}|[^\p{P}\s]+', sentence.lower())


def main():
    ngrams = {1: {}, 2: {}}
    lowest_ngram = 1
    highest_ngram = 2
    counter = 0
    for line in sys.stdin:  # for each line of the input
        line = line.split('\t')[4]  # take the fifth tab-separated field (index 4)
        tokens = into_words(line)  # split it into words
        number_of_tokens = len(tokens)  # how many words?
        for n in range(lowest_ngram, highest_ngram + 1):  # for each n-gram order
            for i in range(number_of_tokens - n + 1):  # slide a window of n tokens
                ngram = tuple(tokens[i:i + n])
                if ngram in ngrams[n]:
                    ngrams[n][ngram] += 1
                else:
                    ngrams[n][ngram] = 1
        if counter % 1000 == 0:  # report progress every 1000 lines
            print('counter = ', counter)
        counter += 1
    # Keep only the most frequent n-grams: the top 1000 unigrams and the top
    # 100000 bigrams.
    ngrams[1] = dict(sorted(ngrams[1].items(), key=lambda item: item[1], reverse=True)[:1000])
    ngrams[2] = dict(sorted(ngrams[2].items(), key=lambda item: item[1], reverse=True)[:100000])
    # ngrams[3] = dict(sorted(ngrams[3].items(), key=lambda item: item[1], reverse=True)[:120000])
    with open('ngrams_2.pkl', 'wb') as f:
        pickle.dump(ngrams, f)


if __name__ == '__main__':
    main()
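# Usage sketch (added here as an illustration, not part of the original
# script): the pickled counts can be loaded back and queried by tuple key,
# e.g. from a separate script or an interactive session. The example keys
# below are placeholders, not values known to be in the data.
#
#     import pickle
#
#     with open('ngrams_2.pkl', 'rb') as f:
#         ngrams = pickle.load(f)
#
#     print(ngrams[1].get(('example',), 0))       # count of a unigram
#     print(ngrams[2].get(('an', 'example'), 0))  # count of a bigram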