challenging-america-word-ga.../run.py

import lzma
import regex as re
import kenlm
from collections import Counter
from nltk.tokenize import RegexpTokenizer


class WordPred:
    def __init__(self):
        # Word-level tokenizer, a pre-built KenLM binary language model and a
        # vocabulary set filled by fill_words()/read_words().
        self.tokenizer = RegexpTokenizer(r"\w+")
        self.model = kenlm.Model("model.binary")
        self.words = set()

    def read_file(self, file):
        # Each TSV row holds the left context in column 6 and the right context
        # in column 7; join them, flatten escaped newlines, lowercase and strip
        # punctuation.
        for line in file:
            text = line.split("\t")
            yield re.sub(r"[^\w\d'\s]+", '',
                         re.sub(' +', ' ', ' '.join([text[6], text[7]]).replace("\\n", " ").replace("\n", "").lower()))

    def read_file_7(self, file):
        # Same preprocessing as read_file, but keeps only column 7 (the right
        # context of the gap).
        for line in file:
            text = line.split("\t")
            yield re.sub(r"[^\w\d'\s]+", '',
                         re.sub(' +', ' ', text[7].replace("\\n", " ").replace("\n", "").lower()))

    def fill_words(self, file_path, output_file):
        # Build a vocabulary file with one unique word per line.
        with open(output_file, 'w') as out:
            with lzma.open(file_path, mode='rt') as file:
                for text in self.read_file(file):
                    for word in text.split(" "):
                        if word not in self.words:
                            out.write(word + "\n")
                            self.words.add(word)

    def read_words(self, file_path):
        # Load the vocabulary file back into memory, one word per line
        # (iterate over all lines, not over the characters of the first one).
        with open(file_path, 'r') as fin:
            for word in fin:
                self.words.add(word.rstrip("\n"))

    def create_train_file(self, file_path, output_path, rows=10000):
        # Dump the first `rows` preprocessed documents to a plain-text file,
        # one document per line, as training data for the KenLM model.
        with open(output_path, 'w') as outputfile:
            with lzma.open(file_path, mode='rt') as file:
                for index, text in enumerate(self.read_file(file)):
                    outputfile.write(text + "\n")
                    if index == rows:
                        break

    def generate_outputs(self, input_file, output_file):
        # For every input row, predict the gap word from the first two tokens of
        # the right context; fall back to a fixed distribution when the context
        # is too short.
        with open(output_file, 'w') as outputf:
            with lzma.open(input_file, mode='rt') as file:
                for index, text in enumerate(self.read_file_7(file)):
                    tokens = self.tokenizer.tokenize(text)
                    if len(tokens) < 4:
                        prediction = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
                    else:
                        prediction = self.predict_probs(tokens[0], tokens[1])
                    outputf.write(prediction + '\n')

    def predict_probs(self, word1, word2):
        # Rank vocabulary words as gap candidates by the KenLM score of
        # "<candidate> word1 word2" (the original left `most_common` undefined;
        # scoring over self.words is one plausible reconstruction).
        candidates = Counter()
        for word in self.words:
            candidates[word] = 10 ** self.model.score(f"{word} {word1} {word2}", bos=False, eos=False)
        most_common = dict(candidates.most_common(5))
        norm = sum(most_common.values())
        total_prob = 0.0
        str_prediction = ''
        for word, prob in most_common.items():
            # Keep roughly 0.1 of the mass for the trailing unknown-word entry.
            prob = 0.9 * prob / norm if norm else 0.0
            total_prob += prob
            str_prediction += f'{word}:{prob:.4f} '
        if total_prob == 0.0:
            return 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
        if 1 - total_prob >= 0.01:
            str_prediction += f":{1 - total_prob:.4f}"
        else:
            str_prediction += ":0.01"
        return str_prediction


if __name__ == "__main__":
    wp = WordPred()
    # wp.create_train_file("train/in.tsv.xz", "train/in.txt")
    # wp.fill_words("train/in.tsv.xz", "words.txt")
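    # A minimal usage sketch, assuming model.binary was built offline with the
    # standard KenLM tools (e.g. `lmplz -o 3 < train/in.txt > model.arpa`, then
    # `build_binary model.arpa model.binary`) and that the dev/test inputs use
    # the same TSV layout as train/in.tsv.xz; the dev-0/test-A paths below are
    # illustrative, not taken from this file:
    # wp.read_words("words.txt")
    # wp.generate_outputs("dev-0/in.tsv.xz", "dev-0/out.tsv")
    # wp.generate_outputs("test-A/in.tsv.xz", "test-A/out.tsv")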