#!/usr/bin/python3
"""Count the most frequent word n-grams in tab-separated text read from stdin.

Each input line is split on tabs and the field at index ``TEXT_COLUMN`` is
tokenized.  For every n-gram order listed in ``NGRAM_LIMITS`` the most
frequent n-grams are kept, and the result is pickled to ``OUTPUT_PATH`` as
``{n: {ngram_tuple: count}}`` with each inner dict ordered by descending
count.
"""
import pickle
import sys
from collections import Counter

import regex as re

# One token is either a single punctuation character (\p{P}) or a maximal run
# of characters that are neither punctuation nor whitespace.  \p{P} is a
# feature of the third-party ``regex`` module; the stdlib ``re`` lacks it.
_TOKEN_RE = re.compile(r'\p{P}|[^\p{P}\s]+')

# Zero-based index of the tab-separated field that holds the text.
TEXT_COLUMN = 4

# n-gram order -> how many of the most frequent n-grams to keep.
# (A trigram entry was previously sketched as ``3: 120000``.)
NGRAM_LIMITS = {1: 1000, 2: 100000}

OUTPUT_PATH = 'ngrams_2.pkl'

# Print a progress message every this many input lines.
PROGRESS_EVERY = 1000


def into_words(sentence):
    """Return the lower-cased tokens of *sentence* as a list of strings.

    Punctuation characters become individual tokens; whitespace only
    separates tokens and is never returned.
    """
    return _TOKEN_RE.findall(sentence.lower())


def _count_ngrams(tokens, n, counts):
    """Add every contiguous *n*-gram of *tokens* (as a tuple) to *counts*."""
    # A token list shorter than n yields an empty range, so nothing is added.
    counts.update(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))


def main():
    """Read stdin, count n-grams, and pickle the most frequent ones.

    Lines with too few tab-separated fields are skipped rather than aborting
    the whole run; the number of skipped lines is reported on stderr.
    """
    counts = {n: Counter() for n in NGRAM_LIMITS}
    skipped = 0

    for counter, line in enumerate(sys.stdin):
        if counter % PROGRESS_EVERY == 0:
            print('counter = ', counter)

        fields = line.split('\t')
        if len(fields) <= TEXT_COLUMN:
            # Malformed or blank line: no text column to read.
            skipped += 1
            continue

        tokens = into_words(fields[TEXT_COLUMN])
        for n, ngram_counts in counts.items():
            _count_ngrams(tokens, n, ngram_counts)

    if skipped:
        print('skipped', skipped, 'line(s) with fewer than',
              TEXT_COLUMN + 1, 'tab-separated fields', file=sys.stderr)

    # most_common() sorts by descending count and keeps first-seen order for
    # ties; converting back to plain dicts keeps the pickled structure as
    # ordinary dicts for whoever loads the file.
    ngrams = {
        n: dict(counts[n].most_common(limit))
        for n, limit in NGRAM_LIMITS.items()
    }

    with open(OUTPUT_PATH, 'wb') as out:
        pickle.dump(ngrams, out)


if __name__ == '__main__':
    main()