challenging-america-word-ga.../run.py

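"""Trigram-based predictor for what appears to be the Challenging America
word-gap task.

Outline (inferred from the code below): read the xz-compressed training TSV,
joining columns 6 and 7 (presumably the left and right context around the gap),
count trigrams, convert the counts into add-alpha smoothed probabilities, then
predict the gap word for each dev/test line from the first two tokens of its
right-hand context and write "word:prob ... :rest" lines to out.tsv.
"""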
import lzma
from collections import Counter, defaultdict

import regex as re
from nltk import trigrams
from nltk.tokenize import RegexpTokenizer

class WordPred:
    def __init__(self):
        self.tokenizer = RegexpTokenizer(r"\w+")
        # trigram counts keyed by the two words *after* the gap:
        # (w2, w3) -> {gap word w1: count}
        self.model = defaultdict(lambda: defaultdict(lambda: 0))
        self.vocab = set()
        self.alpha = 0.001  # add-alpha smoothing constant
    def read_file(self, file):
        # training input: join columns 6 and 7 (presumably the left and right
        # context of the gap), normalise whitespace and strip punctuation
        for line in file:
            text = line.split("\t")
            yield re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', ' '.join([text[6], text[7]]).replace("\\n", " ").replace("\n", "").lower()))

    def read_file_7(self, file):
        # prediction input: only column 7, the right-hand context
        for line in file:
            text = line.split("\t")
            yield re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', text[7].replace("\\n", " ").replace("\n", "").lower()))
    def read_train_data(self, file_path):
        with lzma.open(file_path, mode='rt') as file:
            for index, text in enumerate(self.read_file(file)):
                tokens = self.tokenizer.tokenize(text)
                for w1, w2, w3 in trigrams(tokens, pad_right=True, pad_left=True):
                    if w1 and w2 and w3:
                        # count the gap word w1 given the two words that follow it
                        self.model[(w2, w3)][w1] += 1
                        self.vocab.add(w1)
                        self.vocab.add(w2)
                        self.vocab.add(w3)
                if index == 300000:
                    # cap training at the first 300k lines
                    break

        # turn counts into add-alpha smoothed probabilities:
        # P(w1 | w2, w3) = (count(w1, w2, w3) + alpha) / (count(w2, w3) + alpha * |V|)
        for word_pair in self.model:
            num_n_grams = float(sum(self.model[word_pair].values()))
            for word in self.model[word_pair]:
                self.model[word_pair][word] = (self.model[word_pair][word] + self.alpha) / (num_n_grams + self.alpha * len(self.vocab))
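    # Worked example of the smoothing above, with made-up numbers (alpha = 0.001,
    # a vocabulary of 600,000 types, and counts {"york": 3, "jersey": 1} for the
    # following-word pair ("city", "in")):
    #   P("york" | "city", "in")   = (3 + 0.001) / (4 + 0.001 * 600000) ~= 0.0050
    #   P("jersey" | "city", "in") = (1 + 0.001) / (4 + 0.001 * 600000) ~= 0.0017
    # Because alpha * |V| dominates the denominator, most of the probability mass
    # is left over and ends up in the trailing ":<rest>" bucket of the output line.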
    def generate_outputs(self, input_file, output_file):
        with open(output_file, 'w') as outputf:
            with lzma.open(input_file, mode='rt') as file:
                for index, text in enumerate(self.read_file_7(file)):
                    tokens = self.tokenizer.tokenize(text)
                    if len(tokens) < 4:
                        # too little right context: fall back to a fixed guess
                        prediction = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
                    else:
                        prediction = self.predict_probs(tokens[0], tokens[1])
                    outputf.write(prediction + '\n')
    def predict_probs(self, word1, word2):
        # candidates for the gap word given the two words that follow it
        predictions = dict(self.model[word1, word2])
        most_common = dict(Counter(predictions).most_common(6))

        total_prob = 0.0
        str_prediction = ''
        for word, prob in most_common.items():
            total_prob += prob
            str_prediction += f'{word}:{prob} '

        if total_prob == 0.0:
            # context never seen in training: fall back to the fixed guess
            return 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'

        # assign the remaining probability mass to the empty (unknown-word) slot
        if 1 - total_prob >= 0.01:
            str_prediction += f':{1 - total_prob}'
        else:
            str_prediction += ':0.01'
        return str_prediction
if __name__ == '__main__':
    wp = WordPred()
    wp.read_train_data('train/in.tsv.xz')
    wp.generate_outputs('dev-0/in.tsv.xz', 'dev-0/out.tsv')
    wp.generate_outputs('test-A/in.tsv.xz', 'test-A/out.tsv')
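# A minimal sanity-check sketch (not part of the pipeline above; the probe words
# 'of' and 'the' and the printed line are purely illustrative):
#
#   wp = WordPred()
#   wp.read_train_data('train/in.tsv.xz')
#   print(wp.predict_probs('of', 'the'))
#   # -> e.g. "one:0.004 all:0.003 ... :0.97"
#   # i.e. up to six candidate gap words with their smoothed probabilities,
#   # followed by the leftover mass in the unnamed ":<rest>" slot.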