retro-gap/train.py

34 lines
983 B
Python

import sys
import pickle
import regex as re
def main(
    train_path: str = 'C:/Users/eryk6/PycharmProjects/retro-gap/train/train.tsv',
    output_path: str = 'ngrams.pkl',
    max_unigrams: int = 1000,
    max_bigrams: int = 120000,
) -> None:
    """Count unigrams and bigrams in a TSV file and pickle the most frequent.

    Each input line is split on tabs; the fifth field (index 4) is
    lower-cased and tokenized into single punctuation characters and runs
    of non-punctuation, non-whitespace characters. Every token (as a
    1-tuple) and every adjacent token pair (as a 2-tuple) is counted.
    The counts are then trimmed to the most frequent entries and written
    with ``pickle`` as ``{1: {ngram: count}, 2: {ngram: count}}``, each
    inner dict ordered by descending count.

    The defaults reproduce the previously hard-coded values, so calling
    ``main()`` with no arguments behaves as before.

    Args:
        train_path: UTF-8 encoded, tab-separated training file.
        output_path: Destination of the pickled n-gram table.
        max_unigrams: Number of most frequent unigrams to keep.
        max_bigrams: Number of most frequent bigrams to keep.

    Raises:
        OSError: If either file cannot be opened.
        IndexError: If an input line has fewer than five tab-separated
            fields.
    """
    text_column = 4
    limits = {1: max_unigrams, 2: max_bigrams}
    ngrams = {order: {} for order in limits}

    # ``\p{P}`` is a Unicode property class, so ``re`` has to be the
    # third-party ``regex`` package this file imports under that name.
    # The pattern is loop-invariant, hence compiled once up front.
    token_pattern = re.compile(r'\p{P}|[^\p{P}\s]+')

    # ``with`` guarantees the input handle is closed even if a line fails.
    with open(train_path, encoding='utf-8') as train_file:
        for line in train_file:
            text = line.split('\t')[text_column]
            tokens = token_pattern.findall(text.lower())
            for order, counts in ngrams.items():
                # Slide a window of ``order`` tokens; the range is empty
                # when the line has fewer tokens than the window size.
                for start in range(len(tokens) - order + 1):
                    gram = tuple(tokens[start:start + order])
                    counts[gram] = counts.get(gram, 0) + 1

    for order, limit in limits.items():
        # ``sorted`` is stable, so equally frequent n-grams keep their
        # first-seen order, exactly as with the original key function.
        ranked = sorted(
            ngrams[order].items(), key=lambda item: item[1], reverse=True
        )
        ngrams[order] = dict(ranked[:limit])

    # Close the output explicitly so the pickle is fully flushed to disk.
    with open(output_path, 'wb') as output_file:
        pickle.dump(ngrams, output_file)
# Run the n-gram extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()