import sys
import pickle
from collections import Counter
from operator import itemgetter

import regex as re

# Default input/output locations and cut-offs used when main() is called
# without arguments (identical to the values previously hard-coded).
TRAIN_PATH = 'C:/Users/eryk6/PycharmProjects/retro-gap/train/train.tsv'
OUTPUT_PATH = 'ngrams.pkl'
TEXT_COLUMN = 4
TOP_UNIGRAMS = 1000
TOP_BIGRAMS = 120000

# A token is either one punctuation character or a maximal run of characters
# that are neither punctuation nor whitespace. `\p{P}` requires the `regex`
# package; the stdlib `re` module does not support it.
TOKEN_RE = re.compile(r'\p{P}|[^\p{P}\s]+')


def _count_ngrams(lines, text_column=TEXT_COLUMN, orders=(1, 2)):
    """Count n-grams of the given orders in one column of tab-separated lines."""
    counts = {order: Counter() for order in orders}
    for line in lines:
        columns = line.split('\t')
        if len(columns) <= text_column:
            # Row is too short to contain the text column (e.g. a blank
            # line); skip it instead of failing with IndexError.
            continue
        tokens = TOKEN_RE.findall(columns[text_column].lower())
        for order, counter in counts.items():
            # The range is empty when the line has fewer tokens than `order`.
            counter.update(
                tuple(tokens[i:i + order])
                for i in range(len(tokens) - order + 1)
            )
    return counts


def _most_frequent(counter, limit):
    """Return a plain dict holding the `limit` highest-count entries."""
    # sorted() is stable, so entries with equal counts keep first-seen order.
    ranked = sorted(counter.items(), key=itemgetter(1), reverse=True)
    return dict(ranked[:limit])


def main(train_path=TRAIN_PATH, output_path=OUTPUT_PATH,
         top_unigrams=TOP_UNIGRAMS, top_bigrams=TOP_BIGRAMS):
    """Build unigram and bigram frequency tables and pickle them.

    Reads `train_path` as UTF-8, takes the column at index 4 of every
    tab-separated row, lowercases it, tokenizes it and counts every unigram
    and bigram (stored as tuples of tokens). Only the `top_unigrams` most
    frequent unigrams and the `top_bigrams` most frequent bigrams are kept.

    The result, a dict ``{1: {unigram: count}, 2: {bigram: count}}`` with
    entries ordered by descending count, is written to `output_path` with
    `pickle.dump`. Rows with fewer than five columns are skipped.

    Args:
        train_path: Path of the tab-separated training file.
        output_path: Path of the pickle file to write.
        top_unigrams: Maximum number of unigrams to keep.
        top_bigrams: Maximum number of bigrams to keep.
    """
    # Context managers guarantee both files are closed (and the pickle is
    # fully flushed) even if processing raises.
    with open(train_path, encoding='utf-8') as train_file:
        counts = _count_ngrams(train_file)

    ngrams = {
        1: _most_frequent(counts[1], top_unigrams),
        2: _most_frequent(counts[2], top_bigrams),
    }

    with open(output_path, 'wb') as output_file:
        pickle.dump(ngrams, output_file)


if __name__ == '__main__':
    main()