challenging-america-word-ga.../main.ipynb

import pandas as pd  # imported but not used in the code below

dev_data = list()
directory = 'dev-0'
data_path = directory + '/in.tsv'
expected_path = directory + '/expected.tsv'
out_path = directory + '/out.tsv'

# Keep only the second-to-last tab-separated field of each line (the left-context text).
with open(data_path, "r") as f:
    for line in f.readlines():
        dev_data.append(line.split('\t')[-2])

# Read the expected gap words, one per line.
dev_expected = list()
with open(expected_path, "r") as f:
    for line in f.readlines():
        dev_expected.append(line.replace('\n', ''))
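A quick, optional sanity check: print the first loaded context and expected word to confirm the column choice. The slicing below is arbitrary and only for display.

print(len(dev_data), len(dev_expected))  # both lists should have the same length
print(repr(dev_data[0][-80:]))           # tail of the first left context
print(repr(dev_expected[0]))             # the word expected in that gap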
def interpolate(bigram, unigramCounts, bigramCounts, listOfProb):
    # Linear interpolation of the bigram and unigram estimates with weight lambdaValue.
    lambdaValue = 0.4
    word1 = bigram[0]
    word2 = bigram[1]
    totalUnigrams = sum(unigramCounts.values())
    bigramProb = bigramCounts.get(bigram, 0) / max(unigramCounts.get(word1, 0), 1)
    unigramProb = unigramCounts.get(word2, 0) / max(totalUnigrams, 1)
    listOfProb[bigram] = lambdaValue * bigramProb + (1 - lambdaValue) * unigramProb
def calcProbability(bigram, unigramCounts, bigramCounts, listOfProb):
    # Maximum-likelihood estimate: P(word2 | word1) = count(word1, word2) / count(word1).
    word1 = bigram[0]
    word2 = bigram[1]
    listOfProb[bigram] = bigramCounts.get(bigram, 0) / max(unigramCounts.get(word1, 0), 1)
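As a rough illustration of how the two estimators differ, the toy counts below are invented and not drawn from the dataset.

# Hypothetical toy counts, purely for illustration.
toyUnigrams = {'the': 3, 'cat': 1, 'dog': 1}
toyBigrams = {('the', 'cat'): 1, ('the', 'dog'): 1}

mle, interp = {}, {}
calcProbability(('the', 'cat'), toyUnigrams, toyBigrams, mle)
interpolate(('the', 'cat'), toyUnigrams, toyBigrams, interp)
print(mle[('the', 'cat')])     # pure MLE bigram estimate
print(interp[('the', 'cat')])  # MLE smoothed with the unigram probability of 'cat'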
from nltk.tokenize import word_tokenize  # requires the NLTK 'punkt' tokenizer data
import re

def createBigram(data, expected):
    listOfBigrams = []
    bigramCounts = {}
    unigramCounts = {}

    for i in range(len(data)):
        # The last token of the left context is the word directly before the gap.
        tokenized = word_tokenize(data[i])
        word = tokenized[-1]
        word = word.lower()
        word = re.sub(r'\W+', '', word)
        exp = expected[i].lower()
        listOfBigrams.append((word, exp))
        if (word, exp) in bigramCounts:
            bigramCounts[(word, exp)] += 1
        else:
            bigramCounts[(word, exp)] = 1
        if word in unigramCounts:
            unigramCounts[word] += 1
        else:
            unigramCounts[word] = 1

    return listOfBigrams, unigramCounts, bigramCounts

def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):
    # Score every observed (previous word, gap word) pair with the MLE estimate.
    listOfProb = {}
    for bigram in listOfBigrams:
        calcProbability(bigram, unigramCounts, bigramCounts, listOfProb)
    return listOfProb
# Build counts and bigram probabilities from the dev-0 data.
bigrams, uniCounts, biCounts = createBigram(dev_data, dev_expected)
probs = calcBigramProb(bigrams, uniCounts, biCounts)
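To inspect what the model assigns to a particular preceding word, the probability table can be filtered and sorted; `sample_word` below is an arbitrary, illustrative choice.

sample_word = 'of'  # arbitrary example word, not prescribed by the task
candidates = {big: p for big, p in probs.items() if big[0] == sample_word}
for (prev, nxt), p in sorted(candidates.items(), key=lambda item: item[1], reverse=True)[:5]:
    print(f'{prev} -> {nxt}: {p:.4f}')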
def save_results(probs, in_data):
    with open(out_path, 'w') as f:
        for i in range(len(in_data)):
            # Use the last word of the left context as the conditioning word.
            tokenized = word_tokenize(in_data[i])
            word = tokenized[-1]
            word = word.lower()
            word = re.sub(r'\W+', '', word)
            # Collect all bigrams starting with that word, sorted by probability.
            word_probs = dict(filter(lambda elem: elem[0][0] == word, probs.items()))
            word_probs = dict(sorted(word_probs.items(), key=lambda item: item[1], reverse=True))
            # The remaining probability mass goes to the catch-all ':' entry.
            rest = 1.0 - sum(word_probs.values())
            word_probs = list(map(lambda elem: elem[0][1] + ":" + '{:.7f}'.format(elem[1]), word_probs.items()))
            word_probs.append(':' + '{:.7f}'.format(rest))
            f.write(' '.join(word_probs) + '\n')
save_results(probs, dev_data)
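A minimal check of the written file, assuming `dev-0/out.tsv` now exists, is to print the start of its first line and confirm it has the `word:prob word:prob ... :rest` shape produced above.

with open(out_path, "r") as f:   # assumes save_results has already run
    first_line = f.readline().strip()
print(first_line[:120])          # each line: 'word:prob word:prob ... :rest_prob'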