{ "metadata": { "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10-final" }, "orig_nbformat": 2, "kernelspec": { "name": "python3", "display_name": "Python 3 (ipykernel)", "language": "python" } }, "nbformat": 4, "nbformat_minor": 2, "cells": [ { "cell_type": "code", "execution_count": 17, "metadata": { "tags": [] }, "outputs": [], "source": [ "import pandas as pd\n", "\n", "dev_data = list()\n", "directory = 'test-A'\n", "data_path = directory+'/in.tsv'\n", "expected_path = directory+'/expected.tsv'\n", "out_path = directory+'/out.tsv'\n", "\n", "with open(data_path, \"r\") as f:\n", " for line in f.readlines():\n", " dev_data.append(line.split('\\t')[-2])\n", "\n", "# dev_expected = list()\n", "# with open(expected_path, \"r\") as f:\n", "# for line in f.readlines():\n", "# dev_expected.append(line.replace('\\n',''))" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "def interpolate(bigram, unigramCounts, bigramCounts, listOfProb):\n", " lambdaValue = 0.4\n", " word1 = bigram[0]\n", " word2 = bigram[1]\n", " listOfProb[bigram] = (bigramCounts.get(bigram, 0))/(unigramCounts.get(word1, 0)) + (1-lambdaValue)*(unigramCounts.get(word2, 0))/(unigramCounts.get(word1, 0))\n" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "def calcProbability(bigram, unigramCounts, bigramCounts, listOfProb):\n", " word1 = bigram[0]\n", " word2 = bigram[1]\n", " listOfProb[bigram] = ((bigramCounts.get(bigram, 0))/len(bigramCounts.items()))/((unigramCounts.get(word1, 0))/len(unigramCounts.items()))\n" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "from nltk.tokenize import word_tokenize \n", "import re\n", "\n", "def createBigram(data, expected):\n", " listOfBigrams = []\n", " bigramCounts = {}\n", " unigramCounts = {}\n", "\n", " for i in range(len(data)):\n", " tokenized = word_tokenize(data[i])\n", " word = tokenized[-1]\n", " word = word.lower()\n", " word = re.sub('\\W+','', word)\n", " exp = expected[i].lower()\n", " listOfBigrams.append((word, exp))\n", " if (word, exp) in bigramCounts:\n", " bigramCounts[(word, exp)] += 1\n", " else:\n", " bigramCounts[(word, exp)] = 1\n", " if word in unigramCounts:\n", " unigramCounts[word] += 1\n", " else:\n", " unigramCounts[word] = 1\n", " \n", " return listOfBigrams, unigramCounts, bigramCounts\n", "\n", "def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):\n", " listOfProb = {}\n", " for bigram in listOfBigrams:\n", " calcProbability(bigram, unigramCounts, bigramCounts, listOfProb)\n", " return listOfProb" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "bigrams, uniCounts, biCounts = createBigram(dev_data, dev_expected)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "probs = calcBigramProb(bigrams, uniCounts, biCounts)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "def save_results(probs, in_data):\n", " with open(out_path, 'w') as f:\n", " for i in range(len(in_data)):\n", " tokenized = word_tokenize(in_data[i])\n", " word = tokenized[-1]\n", " word = word.lower()\n", " word = re.sub('\\W+','', word)\n", " word_probs = dict(filter(lambda elem: elem[0][0] == word, probs.items()))\n", " word_probs = dict(sorted(word_probs.items(), key=lambda item: item[1], reverse=True))\n", " rest = 1.0 - sum(word_probs.values())\n", " word_probs = list(map(lambda elem: elem[0][1] + \":\" + '{:.7f}'.format(elem[1]), list(word_probs.items())))\n", " word_probs.append(':'+'{:.7f}'.format(rest))\n", " word_probs = ' '.join(word_probs)\n", " word_probs += '\\n'\n", " f.write(word_probs)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "tags": [] }, "outputs": [], "source": [ "save_results(probs, dev_data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ] }