34 lines
983 B
Python
34 lines
983 B
Python
import sys
|
|
import pickle
|
|
import regex as re
|
|
|
|
|
|
def main(
    train_path='C:/Users/eryk6/PycharmProjects/retro-gap/train/train.tsv',
    output_path='ngrams.pkl',
    text_column=4,
    max_unigrams=1000,
    max_bigrams=120000,
):
    """Count unigrams and bigrams in a TSV corpus and pickle the most frequent.

    Each line of ``train_path`` is split on tabs and the field at
    ``text_column`` is lower-cased and tokenized: every punctuation character
    becomes its own token, and every run of characters that are neither
    punctuation nor whitespace becomes one token. N-grams never span lines.

    The pickled object is a dict ``{1: {...}, 2: {...}}`` whose inner dicts
    map token tuples (length 1 and 2 respectively) to their occurrence
    counts, ordered from most to least frequent.

    Args:
        train_path: Path of the UTF-8 encoded, tab-separated input file.
        output_path: Path of the pickle file to write.
        text_column: Zero-based index of the tab-separated field holding
            the text.
        max_unigrams: Number of most frequent unigrams to keep.
        max_bigrams: Number of most frequent bigrams to keep.
    """
    # Local import keeps this function self-contained without touching the
    # file-level import block.
    from collections import Counter

    # ``re`` is the third-party ``regex`` package (see the file's imports);
    # the ``\p{P}`` Unicode property class is not supported by stdlib ``re``.
    # Compiled once here instead of being looked up for every line.
    token_pattern = re.compile(r'\p{P}|[^\p{P}\s]+')

    unigram_counts = Counter()
    bigram_counts = Counter()

    # ``with`` guarantees the input handle is closed even if a line fails.
    with open(train_path, encoding='utf-8') as train_file:
        for line in train_file:
            columns = line.split('\t')
            if len(columns) <= text_column:
                # Blank or truncated row: there is no text field to read,
                # so skip it rather than abort the whole run on IndexError.
                continue

            tokens = token_pattern.findall(columns[text_column].lower())

            # Keys are tuples for both orders so the two tables share a shape.
            unigram_counts.update((token,) for token in tokens)
            bigram_counts.update(zip(tokens, tokens[1:]))

    # ``most_common`` orders by descending count and keeps first-seen order
    # among ties, matching a stable ``sorted(..., reverse=True)`` + slice.
    ngrams = {
        1: dict(unigram_counts.most_common(max_unigrams)),
        2: dict(bigram_counts.most_common(max_bigrams)),
    }

    # Close (and therefore flush) the output file deterministically.
    with open(output_path, 'wb') as output_file:
        pickle.dump(ngrams, output_file)
|
|
|
|
|
|
# Build and save the n-gram tables only when this file is executed as a
# script; importing it as a module has no side effects.
if __name__ == '__main__':
    main()