challenging-america-word-ga.../main.ipynb


import pandas as pd

dev_data = list()
directory = 'test-A'
data_path = directory+'/in.tsv'
expected_path = directory+'/expected.tsv'
out_path = directory+'/out.tsv'

with open(data_path, "r") as f:
    for line in f:
        # The second-to-last tab-separated column holds the text preceding the gap.
        dev_data.append(line.split('\t')[-2])
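For reference, each in.tsv line carries several tab-separated columns ending with the text before and after the gap, so index [-2] keeps the left context. A minimal sketch with a made-up row (the exact column layout here is an assumption, not taken from the data):

# Hypothetical in.tsv row: metadata columns, then left context, then right context.
sample = "doc-1\t1.0\tsome metadata\tthe cat sat on the\tand purred loudly"
left_context = sample.split('\t')[-2]   # -> "the cat sat on the"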

# expected.tsv holds the gold gap words; they are needed below to build the bigram counts.
dev_expected = list()
with open(expected_path, "r") as f:
    for line in f:
        dev_expected.append(line.replace('\n', ''))
def interpolate(bigram, unigramCounts, bigramCounts, listOfProb):
    # Linear interpolation: lambda * count(w1,w2)/count(w1) + (1-lambda) * count(w2)/total.
    # (Unused in the pipeline below; kept as an alternative to calcProbability.)
    lambdaValue = 0.4
    word1, word2 = bigram
    total = sum(unigramCounts.values())
    bigramProb = bigramCounts.get(bigram, 0) / unigramCounts.get(word1, 1)
    unigramProb = unigramCounts.get(word2, 0) / total if total else 0.0
    listOfProb[bigram] = lambdaValue * bigramProb + (1 - lambdaValue) * unigramProb
def calcProbability(bigram, unigramCounts, bigramCounts, listOfProb):
    # Maximum-likelihood estimate: P(word2 | word1) = count(word1, word2) / count(word1).
    word1 = bigram[0]
    listOfProb[bigram] = bigramCounts.get(bigram, 0) / unigramCounts.get(word1, 1)
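A quick sanity check of the estimate on toy counts (values invented for illustration only):

# Toy counts: 'the' seen 3 times, followed by 'cat' twice and 'dog' once.
_uni = {'the': 3}
_bi = {('the', 'cat'): 2, ('the', 'dog'): 1}
_probs = {}
calcProbability(('the', 'cat'), _uni, _bi, _probs)
print(_probs[('the', 'cat')])   # 2/3 ~ 0.6667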
from nltk.tokenize import word_tokenize 
import re
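word_tokenize relies on NLTK's punkt tokenizer models; if they are not already installed, a one-time download is needed (newer NLTK releases may additionally ask for the 'punkt_tab' resource):

import nltk
nltk.download('punkt', quiet=True)   # tokenizer models used by word_tokenize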

def createBigram(data, expected):
    # Pair the last word of each left context with the expected gap word,
    # and count unigram/bigram occurrences along the way.
    listOfBigrams = []
    bigramCounts = {}
    unigramCounts = {}

    for i in range(len(data)):
        tokenized = word_tokenize(data[i])
        # Last token of the left context, lowercased and stripped of non-word characters.
        word = re.sub(r'\W+', '', tokenized[-1].lower())
        exp = expected[i].lower()
        listOfBigrams.append((word, exp))
        bigramCounts[(word, exp)] = bigramCounts.get((word, exp), 0) + 1
        unigramCounts[word] = unigramCounts.get(word, 0) + 1

    return listOfBigrams, unigramCounts, bigramCounts

def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):
    listOfProb = {}
    for bigram in listOfBigrams:
        calcProbability(bigram, unigramCounts, bigramCounts, listOfProb)
    return listOfProb
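A small end-to-end check of the two functions above, on made-up data:

# Invented example: left contexts and the gold gap words that follow them.
_toy_data = ["he sat on the", "she walked to the", "they sat on the"]
_toy_expected = ["mat", "store", "mat"]
_toy_bigrams, _toy_uni, _toy_bi = createBigram(_toy_data, _toy_expected)
_toy_probs = calcBigramProb(_toy_bigrams, _toy_uni, _toy_bi)
print(_toy_probs[('the', 'mat')])   # 2/3: 'the' is followed by 'mat' in two of its three occurrences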
bigrams, uniCounts, biCounts = createBigram(dev_data, dev_expected)
probs = calcBigramProb(bigrams, uniCounts, biCounts)
def save_results(probs, in_data):
    with open(out_path, 'w') as f:
        for i in range(len(in_data)):
            tokenized = word_tokenize(in_data[i])
            # Last token of the left context, normalised the same way as in createBigram.
            word = re.sub(r'\W+', '', tokenized[-1].lower())
            # All bigrams starting with this word, sorted by probability (highest first).
            word_probs = dict(filter(lambda elem: elem[0][0] == word, probs.items()))
            word_probs = dict(sorted(word_probs.items(), key=lambda item: item[1], reverse=True))
            # Remaining probability mass goes to the empty (unknown-word) entry ':'.
            rest = max(0.0, 1.0 - sum(word_probs.values()))
            word_probs = [w2 + ':' + '{:.7f}'.format(p) for (_, w2), p in word_probs.items()]
            word_probs.append(':' + '{:.7f}'.format(rest))
            f.write(' '.join(word_probs) + '\n')
save_results(probs, dev_data)
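The resulting out.tsv then has one line per input row, listing candidate gap words with their probabilities and a final ':' entry for the leftover mass. With the toy counts from the sketch above, a written line would look like (values are illustrative):

mat:0.6666667 store:0.3333333 :0.0000000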