interpolate

This commit is contained in:
Ramon Dyzman 2022-04-11 13:02:50 +02:00
parent 7fd6b19be4
commit db6c660600
1 changed file with 32 additions and 12 deletions

View File

@ -24,17 +24,14 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 26,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import pandas as pd\n",
"columns = ['FileId','Paper', 'Idk1', 'Year','Idk2','Idk3', 'LeftContext', 'RightContext']\n",
"\n",
"# dev_data = pd.read_csv('dev-0/in.tsv', sep='\\t', names=columns, engine='python', quotechar='\"', error_bad_lines=False)\n",
"# dev_expected = pd.read_csv('dev-0/expected.tsv', sep='\\t', engine='python', quotechar='\"', error_bad_lines=False)\n",
"dev_data = list()\n",
"directory = 'dev-0'\n",
"data_path = directory+'/in.tsv'\n",
@ -53,7 +50,32 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"def interpolate(bigram, unigramCounts, bigramCounts, listOfProb, lambdaValue=0.4, totalCount=None):\n",
"    \"\"\"Store the linearly interpolated probability of `bigram` in `listOfProb`.\n",
"\n",
"    The stored value is\n",
"\n",
"        lambdaValue * c(word1, word2) / c(word1)\n",
"        + (1 - lambdaValue) * c(word2) / N\n",
"\n",
"    where c(.) are the counts from `bigramCounts` / `unigramCounts` and N is\n",
"    the sum of all unigram counts.\n",
"\n",
"    Args:\n",
"        bigram: indexable pair (word1, word2); also used as the dict key.\n",
"        unigramCounts: dict mapping word -> count.\n",
"        bigramCounts: dict mapping bigram -> count.\n",
"        listOfProb: dict updated in place with listOfProb[bigram].\n",
"        lambdaValue: weight of the bigram estimate; the unigram estimate\n",
"            gets 1 - lambdaValue.\n",
"        totalCount: optional precomputed sum of unigramCounts.values();\n",
"            pass it when calling in a loop to avoid re-summing every time.\n",
"    \"\"\"\n",
"    word1 = bigram[0]\n",
"    word2 = bigram[1]\n",
"    if totalCount is None:\n",
"        totalCount = sum(unigramCounts.values())\n",
"    word1Count = unigramCounts.get(word1, 0)\n",
"    # An unseen word1 or an empty count table would divide by zero;\n",
"    # treat the affected estimate as 0 so the other term still contributes.\n",
"    bigramProb = bigramCounts.get(bigram, 0) / word1Count if word1Count else 0.0\n",
"    unigramProb = unigramCounts.get(word2, 0) / totalCount if totalCount else 0.0\n",
"    listOfProb[bigram] = lambdaValue * bigramProb + (1 - lambdaValue) * unigramProb\n"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"def calcProbability(bigram, unigramCounts, bigramCounts, listOfProb):\n",
"    \"\"\"Store the relative-frequency estimate of `bigram` in `listOfProb`.\n",
"\n",
"    The stored value is c(word1, word2) / c(word1), with counts taken from\n",
"    `bigramCounts` and `unigramCounts`; a missing bigram counts as 0.\n",
"\n",
"    Args:\n",
"        bigram: indexable pair (word1, word2); also used as the dict key.\n",
"        unigramCounts: dict mapping word -> count.\n",
"        bigramCounts: dict mapping bigram -> count.\n",
"        listOfProb: dict updated in place with listOfProb[bigram].\n",
"    \"\"\"\n",
"    word1Count = unigramCounts.get(bigram[0], 0)\n",
"    # word1 absent from unigramCounts: store 0 instead of dividing by zero.\n",
"    if not word1Count:\n",
"        listOfProb[bigram] = 0.0\n",
"        return\n",
"    listOfProb[bigram] = bigramCounts.get(bigram, 0) / word1Count\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
@ -86,15 +108,13 @@
"def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):\n",
"    \"\"\"Return a dict mapping each bigram in `listOfBigrams` to its probability.\n",
"\n",
"    Each entry is filled in by calcProbability, which writes\n",
"    listOfProb[bigram] from the given count tables.\n",
"    \"\"\"\n",
"    listOfProb = {}\n",
"    for bigram in listOfBigrams:\n",
"        # The stale per-bigram computation that used to sit here was\n",
"        # overwritten by this call and could fail on bigrams missing\n",
"        # from bigramCounts, so only the helper call remains.\n",
"        calcProbability(bigram, unigramCounts, bigramCounts, listOfProb)\n",
"    return listOfProb"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
@ -103,7 +123,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
@ -112,7 +132,7 @@
},
{
"cell_type": "code",
"execution_count": 51,
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
@ -135,7 +155,7 @@
},
{
"cell_type": "code",
"execution_count": 52,
"execution_count": 33,
"metadata": {
"tags": []
},