challenging-america-word-gap-prediction/main.ipynb at a23511bd83a9e09ef4d419d3659b10a26c9f264e

4.2 KiB

Raw Blame History

import pandas as pd
# Column names for the rows of in.tsv; only the disabled pandas loader below
# refers to them.
columns = ['FileId','Paper', 'Idk1', 'Year','Idk2','Idk3', 'LeftContext', 'RightContext']

# Disabled pandas-based loading, kept for reference:
# dev_data = pd.read_csv('dev-0/in.tsv', sep='\t', names=columns, engine='python', quotechar='"', error_bad_lines=False)
# dev_expected = pd.read_csv('dev-0/expected.tsv', sep='\t', engine='python', quotechar='"', error_bad_lines=False)

data_path = 'dev-0/in.tsv'
expected_path = 'dev-0/expected.tsv'
out_path = 'dev-0/out.tsv'

# Keep only the second-to-last tab-separated field of every input row
# (LeftContext, if the rows follow `columns`).
with open(data_path, "r") as f:
    dev_data = [row.split('\t')[-2] for row in f]

# One expected value per line, with the line break removed.
with open(expected_path, "r") as f:
    dev_expected = [row.replace('\n', '') for row in f]

from nltk.tokenize import word_tokenize 
import re

def createBigram(data, expected):
    """Pair the last word of each context with its expected word and count them.

    Every entry of ``data`` is tokenized with ``word_tokenize``; its final
    token is lower-cased and stripped of non-word characters to form the
    context word.  That word is paired with the lower-cased entry of
    ``expected`` at the same index.

    Args:
        data: Sequence of context strings.
        expected: Sequence of expected words, index-aligned with ``data``.
            Entries beyond ``len(data)`` are ignored.

    Returns:
        A 3-tuple ``(listOfBigrams, unigramCounts, bigramCounts)``: the
        ``(context_word, expected_word)`` pairs in input order, a dict with
        the number of occurrences of each context word, and a dict with the
        number of occurrences of each pair.

    Raises:
        IndexError: If ``expected`` is shorter than ``data``.
    """
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    non_word = re.compile(r'\W+')

    listOfBigrams = []
    bigramCounts = {}
    unigramCounts = {}

    for i, context in enumerate(data):
        tokens = word_tokenize(context)
        # A context without tokens has no last word.  Use '' (which is also
        # what a punctuation-only last token reduces to) so the entry stays
        # aligned with `expected` instead of raising IndexError.
        word = non_word.sub('', tokens[-1].lower()) if tokens else ''
        bigram = (word, expected[i].lower())

        listOfBigrams.append(bigram)
        bigramCounts[bigram] = bigramCounts.get(bigram, 0) + 1
        unigramCounts[word] = unigramCounts.get(word, 0) + 1

    return listOfBigrams, unigramCounts, bigramCounts

def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):
    """Compute the relative frequency of every bigram.

    Each bigram's count is divided by the sum of all unigram counts, so the
    value is the bigram's share of all observations, not a probability
    conditioned on its first word.

    NOTE(review): a conditional bigram probability would divide by
    ``unigramCounts[first_word]`` instead -- confirm that normalising by the
    grand total is intended.

    Args:
        listOfBigrams: Iterable of ``(first_word, second_word)`` tuples;
            duplicates are allowed.
        unigramCounts: Dict mapping a word to its number of occurrences.
        bigramCounts: Dict mapping a bigram tuple to its number of
            occurrences.  A bigram missing from this dict counts as 0.

    Returns:
        Dict mapping each distinct bigram, in order of first appearance, to
        its relative frequency.

    Raises:
        ZeroDivisionError: If ``listOfBigrams`` is non-empty while the
            unigram counts sum to zero.
    """
    # Loop-invariant: compute the denominator once instead of per bigram.
    total = sum(unigramCounts.values())
    return {
        bigram: bigramCounts.get(bigram, 0) / total
        for bigram in listOfBigrams
    }

# Collect the (context word, expected word) pairs of the dev set together
# with the unigram and bigram counts.
bigrams, uniCounts, biCounts = createBigram(dev_data, dev_expected)

# Turn the counts into one relative frequency per distinct bigram.
probs = calcBigramProb(bigrams, uniCounts, biCounts)

def save_results(probs, in_data):
    """Write one line of word predictions per input context to ``out_path``.

    For every entry of ``in_data`` the context word is derived the same way
    as during counting: the last token from ``word_tokenize``, lower-cased
    and stripped of non-word characters.  Each output line lists the
    candidates known for that context word as ``predicted:probability``
    pairs separated by spaces, followed by ``:rest`` where ``rest`` is 1.0
    minus the sum of the listed probabilities.

    Args:
        probs: Dict mapping ``(context_word, predicted_word)`` tuples to a
            probability.
        in_data: Sequence of context strings, one per output line.

    Returns:
        None.  The file named by the module-level ``out_path`` is
        overwritten.
    """
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    non_word = re.compile(r'\W+')

    # Group the candidates by context word once, instead of filtering the
    # whole table again for every input row.  Insertion order is preserved.
    by_context = {}
    for (context_word, predicted), prob in probs.items():
        by_context.setdefault(context_word, {})[predicted] = prob

    with open(out_path, 'w') as f:
        for context in in_data:
            tokens = word_tokenize(context)
            # A context without tokens has no last word; fall back to ''.
            word = non_word.sub('', tokens[-1].lower()) if tokens else ''

            candidates = by_context.get(word, {})
            rest = 1.0 - sum(candidates.values())

            # Emit the predicted word (second element of the bigram); the
            # context word is the same for every candidate on the line.
            entries = [
                predicted + ":" + str(prob)
                for predicted, prob in candidates.items()
            ]
            entries.append(':' + str(rest))

            # Terminate each prediction so rows do not run together.
            f.write(' '.join(entries) + '\n')

# Write the predictions for the dev-set contexts to out_path.
save_results(probs, dev_data)

4.2 KiB Raw Blame History

4.2 KiB

Raw Blame History