{
    "metadata": {
        "language_info": {
            "codemirror_mode": {
                "name": "ipython",
                "version": 3
            },
            "file_extension": ".py",
            "mimetype": "text/x-python",
            "name": "python",
            "nbconvert_exporter": "python",
            "pygments_lexer": "ipython3",
            "version": "3.8.10-final"
        },
        "orig_nbformat": 2,
        "kernelspec": {
            "name": "python3",
            "display_name": "Python 3 (ipykernel)",
            "language": "python"
        }
    },
    "nbformat": 4,
    "nbformat_minor": 2,
    "cells": [
        {
            "cell_type": "code",
            "execution_count": 1,
            "metadata": {
                "tags": []
            },
            "outputs": [],
            "source": [
                "import pandas as pd\n",
                "\n",
                "columns = ['FileId', 'Paper', 'Idk1', 'Year', 'Idk2', 'Idk3', 'LeftContext', 'RightContext']\n",
                "\n",
                "# Alternative: load the TSVs with pandas (on_bad_lines replaces the deprecated error_bad_lines flag).\n",
                "# dev_data = pd.read_csv('dev-0/in.tsv', sep='\\t', names=columns, engine='python', quotechar='\"', on_bad_lines='skip')\n",
                "# dev_expected = pd.read_csv('dev-0/expected.tsv', sep='\\t', engine='python', quotechar='\"', on_bad_lines='skip')\n",
                "\n",
                "dev_data = list()\n",
                "directory = 'dev-0'\n",
                "data_path = directory + '/in.tsv'\n",
                "expected_path = directory + '/expected.tsv'\n",
                "out_path = directory + '/out.tsv'\n",
                "\n",
                "# Keep only the LeftContext field (second-to-last TSV column) of each input line.\n",
                "with open(data_path, \"r\") as f:\n",
                "    for line in f.readlines():\n",
                "        dev_data.append(line.split('\\t')[-2])\n",
                "\n",
                "# One expected word per line; drop the trailing newline.\n",
                "dev_expected = list()\n",
                "with open(expected_path, \"r\") as f:\n",
                "    for line in f.readlines():\n",
                "        dev_expected.append(line.replace('\\n', ''))"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 2,
            "metadata": {},
            "outputs": [],
            "source": [
                "from nltk.tokenize import word_tokenize  # needs the 'punkt' models: nltk.download('punkt')\n",
                "import re\n",
                "\n",
                "def createBigram(data, expected):\n",
                "    listOfBigrams = []\n",
                "    bigramCounts = {}\n",
                "    unigramCounts = {}\n",
                "\n",
                "    for i in range(len(data)):\n",
                "        # The last token of the left context is the word directly before the gap.\n",
                "        tokenized = word_tokenize(data[i])\n",
                "        word = tokenized[-1]\n",
                "        word = word.lower()\n",
                "        word = re.sub(r'\\W+', '', word)\n",
                "        exp = expected[i].lower()\n",
                "        listOfBigrams.append((word, exp))\n",
                "        if (word, exp) in bigramCounts:\n",
                "            bigramCounts[(word, exp)] += 1\n",
                "        else:\n",
                "            bigramCounts[(word, exp)] = 1\n",
                "        if word in unigramCounts:\n",
                "            unigramCounts[word] += 1\n",
                "        else:\n",
                "            unigramCounts[word] = 1\n",
                "\n",
                "    return listOfBigrams, unigramCounts, bigramCounts\n",
                "\n",
                "def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):\n",
                "    listOfProb = {}\n",
                "    for bigram in listOfBigrams:\n",
                "        word1 = bigram[0]\n",
                "        # Conditional bigram probability: P(word2 | word1) = count(word1, word2) / count(word1),\n",
                "        # so the probabilities for a given word1 form a distribution that sums to 1.\n",
                "        listOfProb[bigram] = bigramCounts[bigram] / unigramCounts[word1]\n",
                "    return listOfProb"
            ]
        },
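        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "A minimal illustration on made-up toy strings (not challenge data): `createBigram` pairs the last token of each left context with its expected word and counts both."
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "metadata": {},
            "outputs": [],
            "source": [
                "# Toy inputs, invented purely for illustration.\n",
                "createBigram(['we propose a'], ['model'])\n",
                "# -> ([('a', 'model')], {'a': 1}, {('a', 'model'): 1})"
            ]
        },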
        {
            "cell_type": "code",
            "execution_count": 3,
            "metadata": {},
            "outputs": [],
            "source": [
                "bigrams, uniCounts, biCounts = createBigram(dev_data, dev_expected)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 4,
            "metadata": {},
            "outputs": [],
            "source": [
                "probs = calcBigramProb(bigrams, uniCounts, biCounts)"
            ]
        },
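        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "A quick sanity check, added alongside the original pipeline: for one left-context word, list its most probable continuations from `probs`. The word `'the'` is an arbitrary illustrative choice, not anything prescribed by the data."
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "metadata": {},
            "outputs": [],
            "source": [
                "# Illustrative check only (assumes `probs` from the cell above); 'the' is a made-up example word.\n",
                "def top_continuations(probs, word, k=5):\n",
                "    cands = [(w2, p) for (w1, w2), p in probs.items() if w1 == word]\n",
                "    return sorted(cands, key=lambda item: item[1], reverse=True)[:k]\n",
                "\n",
                "top_continuations(probs, 'the')"
            ]
        },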
        {
            "cell_type": "code",
            "execution_count": 51,
            "metadata": {},
            "outputs": [],
            "source": [
                "def save_results(probs, in_data):\n",
                "    with open(out_path, 'w') as f:\n",
                "        for i in range(len(in_data)):\n",
                "            tokenized = word_tokenize(in_data[i])\n",
                "            word = tokenized[-1]\n",
                "            word = word.lower()\n",
                "            word = re.sub(r'\\W+', '', word)\n",
                "            # All continuations seen after this word, most probable first.\n",
                "            word_probs = dict(filter(lambda elem: elem[0][0] == word, probs.items()))\n",
                "            word_probs = dict(sorted(word_probs.items(), key=lambda item: item[1], reverse=True))\n",
                "            # Whatever probability mass is left goes to the catch-all ':' entry.\n",
                "            rest = 1.0 - sum(word_probs.values())\n",
                "            word_probs = list(map(lambda elem: elem[0][1] + \":\" + '{:.7f}'.format(elem[1]), list(word_probs.items())))\n",
                "            word_probs.append(':' + '{:.7f}'.format(rest))\n",
                "            f.write(' '.join(word_probs) + '\\n')"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 52,
            "metadata": {
                "tags": []
            },
            "outputs": [],
            "source": [
                "save_results(probs, dev_data)"
            ]
        },
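        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "An optional check on the written file (uses the `out_path` defined earlier): every line should encode `word:prob` pairs plus a `:rest` entry whose probabilities sum to roughly 1."
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "metadata": {},
            "outputs": [],
            "source": [
                "# Optional verification of dev-0/out.tsv; the tolerance absorbs the 7-digit rounding.\n",
                "with open(out_path) as f:\n",
                "    for i, line in enumerate(f):\n",
                "        total = sum(float(pair.rsplit(':', 1)[1]) for pair in line.split())\n",
                "        assert abs(total - 1.0) < 1e-3, (i, total)"
            ]
        }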
    ]
}