4.9 KiB
4.9 KiB
import pandas as pd
# Locations of the dev-set input, the expected answers and the output file.
directory = 'dev-0'
data_path = directory + '/in.tsv'
expected_path = directory + '/expected.tsv'
out_path = directory + '/out.tsv'

# Keep only the second-to-last tab-separated field of every input row.
with open(data_path, "r") as f:
    dev_data = [row.split('\t')[-2] for row in f]

# One expected entry per line, with newline characters removed.
with open(expected_path, "r") as f:
    dev_expected = [row.replace('\n', '') for row in f]
def interpolate(bigram, unigramCounts, bigramCounts, listOfProb):
    """Store a linearly interpolated probability for ``bigram``.

    The stored value is

        lambda * count(word1, word2) / count(word1)
        + (1 - lambda) * count(word2) / total

    where ``total`` is the sum of all values in ``unigramCounts`` and
    ``lambda`` is fixed at 0.4. A term whose denominator is zero contributes
    0.0 instead of raising ``ZeroDivisionError``.

    Args:
        bigram: ``(word1, word2)`` pair.
        unigramCounts: Mapping of word -> count.
        bigramCounts: Mapping of ``(word1, word2)`` -> count.
        listOfProb: Dict updated in place; ``listOfProb[bigram]`` receives
            the interpolated value.
    """
    lambdaValue = 0.4
    word1 = bigram[0]
    word2 = bigram[1]

    contextCount = unigramCounts.get(word1, 0)
    totalCount = sum(unigramCounts.values())

    # Guard both denominators: an unseen context word or an empty count table
    # must not crash the estimate.
    bigramProb = bigramCounts.get(bigram, 0) / contextCount if contextCount else 0.0
    unigramProb = unigramCounts.get(word2, 0) / totalCount if totalCount else 0.0

    # Both weights are applied so the mixture stays within [0, 1].
    listOfProb[bigram] = lambdaValue * bigramProb + (1 - lambdaValue) * unigramProb
def calcProbability(bigram, unigramCounts, bigramCounts, listOfProb):
    """Store the relative frequency ``count(bigram) / count(word1)``.

    Args:
        bigram: ``(word1, word2)`` pair.
        unigramCounts: Mapping of word -> count, used for the denominator.
        bigramCounts: Mapping of ``(word1, word2)`` -> count.
        listOfProb: Dict updated in place; ``listOfProb[bigram]`` receives
            the computed value.
    """
    contextCount = unigramCounts.get(bigram[0], 0)
    if contextCount:
        listOfProb[bigram] = bigramCounts.get(bigram, 0) / contextCount
    else:
        # Unseen context word: store 0.0 rather than dividing by zero.
        listOfProb[bigram] = 0.0
from nltk.tokenize import word_tokenize
import re
def createBigram(data, expected):
    """Pair the last word of each text with its expected word and count them.

    For every entry of ``data`` the last token returned by ``word_tokenize``
    is lower-cased and stripped of non-word characters (``\\W``). It is then
    paired with the lower-cased entry of ``expected`` at the same index.

    Args:
        data: Sequence of text strings.
        expected: Sequence of strings, indexed in parallel with ``data``.

    Returns:
        A tuple ``(listOfBigrams, unigramCounts, bigramCounts)``:
        ``listOfBigrams`` holds one ``(word, expected_word)`` pair per entry
        of ``data``; ``unigramCounts`` maps each cleaned last word to its
        number of occurrences; ``bigramCounts`` maps each pair to its number
        of occurrences.

    Raises:
        IndexError: If ``expected`` has fewer entries than ``data``.
    """
    listOfBigrams = []
    bigramCounts = {}
    unigramCounts = {}
    for i, text in enumerate(data):
        tokenized = word_tokenize(text)
        # A text that produces no tokens falls back to the empty word, so the
        # row still lines up with its expected entry instead of raising.
        word = tokenized[-1].lower() if tokenized else ''
        word = re.sub(r'\W+', '', word)
        pair = (word, expected[i].lower())
        listOfBigrams.append(pair)
        bigramCounts[pair] = bigramCounts.get(pair, 0) + 1
        unigramCounts[word] = unigramCounts.get(word, 0) + 1
    return listOfBigrams, unigramCounts, bigramCounts
def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):
    """Return a dict holding the value ``calcProbability`` stores per bigram.

    Args:
        listOfBigrams: Iterable of ``(word1, word2)`` pairs.
        unigramCounts: Mapping of word -> count, forwarded unchanged.
        bigramCounts: Mapping of pair -> count, forwarded unchanged.

    Returns:
        Dict keyed by bigram; repeated bigrams are simply written again.
    """
    probabilityByBigram = dict()
    for pair in listOfBigrams:
        calcProbability(pair, unigramCounts, bigramCounts, probabilityByBigram)
    return probabilityByBigram
# Collect the (last word, expected word) pairs and their counts from the dev
# set, then compute one probability per pair.
bigrams, uniCounts, biCounts = createBigram(dev_data, dev_expected)
probs = calcBigramProb(bigrams, uniCounts, biCounts)
def save_results(probs, in_data):
    """Write one line of ranked candidates per input text to ``out_path``.

    For each text the last token from ``word_tokenize`` is lower-cased and
    stripped of non-word characters. Every entry of ``probs`` whose first
    element equals that word becomes a ``candidate:probability`` item, ordered
    by descending probability. The line ends with a ``:remainder`` item holding
    ``1.0`` minus the sum of the listed probabilities.

    Args:
        probs: Mapping of ``(word1, word2)`` -> probability.
        in_data: Iterable of text strings, one output line each.
    """
    # Group the candidates by context word once, instead of scanning every
    # entry of ``probs`` again for each input row.
    candidatesByWord = {}
    for key, prob in probs.items():
        candidatesByWord.setdefault(key[0], []).append((key[1], prob))

    # Rendered output line per context word; rows sharing a word reuse it.
    lineByWord = {}

    with open(out_path, 'w') as f:
        for text in in_data:
            tokenized = word_tokenize(text)
            # A text that produces no tokens falls back to the empty word.
            word = tokenized[-1].lower() if tokenized else ''
            word = re.sub(r'\W+', '', word)

            if word not in lineByWord:
                ranked = sorted(
                    candidatesByWord.get(word, []),
                    key=lambda item: item[1],
                    reverse=True,
                )
                # Float rounding can push the sum a hair above 1.0; clamp so
                # the remainder is never written as a negative value.
                rest = max(0.0, 1.0 - sum(prob for _, prob in ranked))
                entries = [
                    '{}:{:.7f}'.format(candidate, prob)
                    for candidate, prob in ranked
                ]
                entries.append(':{:.7f}'.format(rest))
                # The space before the newline matches the existing output
                # format of this script.
                lineByWord[word] = ' '.join(entries) + ' \n'

            f.write(lineByWord[word])
save_results(probs, dev_data)