{ "metadata": { "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10-final" }, "orig_nbformat": 2, "kernelspec": { "name": "python3", "display_name": "Python 3 (ipykernel)", "language": "python" } }, "nbformat": 4, "nbformat_minor": 2, "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "tags": [] }, "outputs": [], "source": [ "import pandas as pd\n", "columns = ['FileId','Paper', 'Idk1', 'Year','Idk2','Idk3', 'LeftContext', 'RightContext']\n", "\n", "# dev_data = pd.read_csv('dev-0/in.tsv', sep='\\t', names=columns, engine='python', quotechar='\"', error_bad_lines=False)\n", "# dev_expected = pd.read_csv('dev-0/expected.tsv', sep='\\t', engine='python', quotechar='\"', error_bad_lines=False)\n", "dev_data = list()\n", "directory = 'dev-0'\n", "data_path = directory+'/in.tsv'\n", "expected_path = directory+'/expected.tsv'\n", "out_path = directory+'/out.tsv'\n", "\n", "with open(data_path, \"r\") as f:\n", " for line in f.readlines():\n", " dev_data.append(line.split('\\t')[-2])\n", "\n", "dev_expected = list()\n", "with open(expected_path, \"r\") as f:\n", " for line in f.readlines():\n", " dev_expected.append(line.replace('\\n',''))" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from nltk.tokenize import word_tokenize \n", "import re\n", "\n", "def createBigram(data, expected):\n", " listOfBigrams = []\n", " bigramCounts = {}\n", " unigramCounts = {}\n", "\n", " for i in range(len(data)):\n", " tokenized = word_tokenize(data[i])\n", " word = tokenized[-1]\n", " word = word.lower()\n", " word = re.sub('\\W+','', word)\n", " exp = expected[i].lower()\n", " listOfBigrams.append((word, exp))\n", " if (word, exp) in bigramCounts:\n", " bigramCounts[(word, exp)] += 1\n", " else:\n", " bigramCounts[(word, exp)] = 1\n", " if word in unigramCounts:\n", " unigramCounts[word] += 1\n", " else:\n", " unigramCounts[word] = 1\n", " \n", " return listOfBigrams, unigramCounts, bigramCounts\n", "\n", "def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):\n", " listOfProb = {}\n", " for bigram in listOfBigrams:\n", " word1 = bigram[0]\n", " word2 = bigram[1]\n", " listOfProb[bigram] = (bigramCounts.get(bigram))/(sum(unigramCounts.values()))\n", " return listOfProb" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "bigrams, uniCounts, biCounts = createBigram(dev_data, dev_expected)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "probs = calcBigramProb(bigrams, uniCounts, biCounts)" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "def save_results(probs, in_data):\n", " with open(out_path, 'w') as f:\n", " for i in range(len(in_data)):\n", " tokenized = word_tokenize(in_data[i])\n", " word = tokenized[-1]\n", " word = word.lower()\n", " word = re.sub('\\W+','', word)\n", " word_probs = dict(filter(lambda elem: elem[0][0] == word, probs.items()))\n", " word_probs = dict(sorted(word_probs.items(), key=lambda item: item[1], reverse=True))\n", " rest = 1.0 - sum(word_probs.values())\n", " word_probs = list(map(lambda elem: elem[0][1] + \":\" + '{:.7f}'.format(elem[1]), list(word_probs.items())))\n", " word_probs.append(':'+ '{:.7f}'.format(rest))\n", " word_probs.append('\\n')\n", " word_probs = ' '.join(word_probs)\n", " 
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_results(probs, in_data):\n",
    "    with open(out_path, 'w') as f:\n",
    "        for context in in_data:\n",
    "            # Normalize the word preceding the gap exactly as in createBigram.\n",
    "            word = word_tokenize(context)[-1].lower()\n",
    "            word = re.sub(r'\\W+', '', word)\n",
    "            # Candidate gap words seen after this word, most probable first.\n",
    "            word_probs = {w2: p for (w1, w2), p in probs.items() if w1 == word}\n",
    "            word_probs = sorted(word_probs.items(), key=lambda item: item[1], reverse=True)\n",
    "            # Reserve any remaining probability mass for unseen words.\n",
    "            rest = max(0.0, 1.0 - sum(p for _, p in word_probs))\n",
    "            fields = ['{}:{:.7f}'.format(w2, p) for w2, p in word_probs]\n",
    "            fields.append(':{:.7f}'.format(rest))\n",
    "            f.write(' '.join(fields) + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": { "tags": [] },
   "outputs": [],
   "source": [
    "save_results(probs, dev_data)"
   ]
  },
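  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A rough top-1 accuracy check (an addition, not in the original notebook). Since the counts were estimated from `dev-0` itself, this only verifies the pipeline's plumbing, not generalization."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical evaluation: how often the top-ranked candidate equals the expected word.\n",
    "hits = 0\n",
    "for context, exp in zip(dev_data, dev_expected):\n",
    "    word = re.sub(r'\\W+', '', word_tokenize(context)[-1].lower())\n",
    "    cands = {w2: p for (w1, w2), p in probs.items() if w1 == word}\n",
    "    if cands and max(cands, key=cands.get) == exp.lower():\n",
    "        hits += 1\n",
    "print('top-1 accuracy on dev-0: {:.3f}'.format(hits / len(dev_data)))"
   ]
  }
 ]
}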