interpolate

This commit is contained in:
Ramon Dyzman 2022-04-11 13:02:50 +02:00
parent 7fd6b19be4
commit db6c660600

View File

@ -24,17 +24,14 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 26,
"metadata": { "metadata": {
"tags": [] "tags": []
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"import pandas as pd\n", "import pandas as pd\n",
"columns = ['FileId','Paper', 'Idk1', 'Year','Idk2','Idk3', 'LeftContext', 'RightContext']\n",
"\n", "\n",
"# dev_data = pd.read_csv('dev-0/in.tsv', sep='\\t', names=columns, engine='python', quotechar='\"', error_bad_lines=False)\n",
"# dev_expected = pd.read_csv('dev-0/expected.tsv', sep='\\t', engine='python', quotechar='\"', error_bad_lines=False)\n",
"dev_data = list()\n", "dev_data = list()\n",
"directory = 'dev-0'\n", "directory = 'dev-0'\n",
"data_path = directory+'/in.tsv'\n", "data_path = directory+'/in.tsv'\n",
@ -53,7 +50,32 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"def interpolate(bigram, unigramCounts, bigramCounts, listOfProb):\n",
" lambdaValue = 0.4\n",
" word1 = bigram[0]\n",
" word2 = bigram[1]\n",
" listOfProb[bigram] = (bigramCounts.get(bigram, 0))/(unigramCounts.get(word1, 0)) + (1-lambdaValue)*(unigramCounts.get(word2, 0))/(unigramCounts.get(word1, 0))\n"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"def calcProbability(bigram, unigramCounts, bigramCounts, listOfProb):\n",
" word1 = bigram[0]\n",
" word2 = bigram[1]\n",
" listOfProb[bigram] = (bigramCounts.get(bigram, 0))/(unigramCounts.get(word1, 0))\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -86,15 +108,13 @@
"def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):\n", "def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):\n",
" listOfProb = {}\n", " listOfProb = {}\n",
" for bigram in listOfBigrams:\n", " for bigram in listOfBigrams:\n",
" word1 = bigram[0]\n", " calcProbability(bigram, unigramCounts, bigramCounts, listOfProb)\n",
" word2 = bigram[1]\n",
" listOfProb[bigram] = (bigramCounts.get(bigram))/(sum(unigramCounts.values()))\n",
" return listOfProb" " return listOfProb"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 30,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -103,7 +123,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 31,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -112,7 +132,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 51, "execution_count": 32,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -135,7 +155,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 52, "execution_count": 33,
"metadata": { "metadata": {
"tags": [] "tags": []
}, },