def _conditional_probability(bigram, unigramCounts, bigramCounts):
    """Return count(bigram) / count(first word), or 0.0 when that count is 0."""
    history_count = unigramCounts.get(bigram[0], 0)
    if not history_count:
        # An unseen first word carries no evidence for the pair; report 0.0
        # instead of dividing by zero.
        return 0.0
    return bigramCounts.get(bigram, 0) / history_count


def interpolate(bigram, unigramCounts, bigramCounts, listOfProb, lambdaValue=0.4):
    """Store a linearly interpolated probability for ``bigram`` in ``listOfProb``.

    The stored value is::

        lambdaValue * count(w1, w2) / count(w1)
            + (1 - lambdaValue) * count(w2) / sum(unigramCounts.values())

    Args:
        bigram: Two-item sequence ``(w1, w2)``; also used as the key in
            ``bigramCounts`` and ``listOfProb``, so it must be hashable.
        unigramCounts: Mapping from word to its count.
        bigramCounts: Mapping from bigram to its count.
        listOfProb: Mapping that is updated in place with the result.
        lambdaValue: Weight given to the bigram estimate; the unigram
            estimate receives ``1 - lambdaValue``.

    Returns:
        None. The result is written to ``listOfProb[bigram]``.
    """
    # Recomputed on every call to keep the signature unchanged; callers scoring
    # many bigrams against large vocabularies may want to cache this sum.
    total_tokens = sum(unigramCounts.values())
    if total_tokens:
        unigram_probability = unigramCounts.get(bigram[1], 0) / total_tokens
    else:
        # Empty counts: there is nothing to back off to.
        unigram_probability = 0.0

    bigram_probability = _conditional_probability(bigram, unigramCounts, bigramCounts)
    listOfProb[bigram] = (
        lambdaValue * bigram_probability
        + (1 - lambdaValue) * unigram_probability
    )


def calcProbability(bigram, unigramCounts, bigramCounts, listOfProb):
    """Store count(bigram) / count(first word) for ``bigram`` in ``listOfProb``.

    Bigrams missing from ``bigramCounts``, or whose first word has no count in
    ``unigramCounts``, are stored with probability 0.0.

    Args:
        bigram: Two-item sequence ``(w1, w2)``; must be hashable.
        unigramCounts: Mapping from word to its count.
        bigramCounts: Mapping from bigram to its count.
        listOfProb: Mapping that is updated in place with the result.

    Returns:
        None. The result is written to ``listOfProb[bigram]``.
    """
    listOfProb[bigram] = _conditional_probability(bigram, unigramCounts, bigramCounts)


def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):
    """Return a dict mapping each bigram in ``listOfBigrams`` to its probability.

    Each probability is computed by ``calcProbability``.

    Args:
        listOfBigrams: Iterable of hashable two-item bigrams.
        unigramCounts: Mapping from word to its count.
        bigramCounts: Mapping from bigram to its count.

    Returns:
        A new dict keyed by bigram. A bigram that occurs more than once in
        ``listOfBigrams`` still yields a single entry.
    """
    listOfProb = {}
    for bigram in listOfBigrams:
        calcProbability(bigram, unigramCounts, bigramCounts, listOfProb)
    return listOfProb
}, { "cell_type": "code", - "execution_count": 3, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -103,7 +123,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -112,7 +132,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -135,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 33, "metadata": { "tags": [] },