challenging-america-word-ga.../main.ipynb

{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3 (ipykernel)",
   "language": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Column layout of the input TSV; the loader below keeps only the\n",
    "# second-to-last field, i.e. 'LeftContext'.\n",
    "columns = ['FileId','Paper', 'Idk1', 'Year','Idk2','Idk3', 'LeftContext', 'RightContext']\n",
    "\n",
    "data_path = 'dev-0/in.tsv'\n",
    "expected_path = 'dev-0/expected.tsv'\n",
    "out_path = 'dev-0/out.tsv'\n",
    "\n",
    "# The encoding is pinned so the result does not depend on the platform locale.\n",
    "# NOTE(review): assumes both TSV files are UTF-8 - confirm against the dataset.\n",
    "with open(data_path, 'r', encoding='utf-8') as f:\n",
    "    dev_data = [line.split('\\t')[-2] for line in f]\n",
    "\n",
    "# One expected value per line, with the line break removed.\n",
    "with open(expected_path, 'r', encoding='utf-8') as f:\n",
    "    dev_expected = [line.replace('\\n', '') for line in f]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import word_tokenize \n",
    "import re\n",
    "\n",
    "def createBigram(data, expected):\n",
    "    \"\"\"Collect (context word, expected word) pairs and their frequencies.\n",
    "\n",
    "    The context word is the last token of ``data[i]``, lower-cased and with\n",
    "    non-word characters removed; the expected word is ``expected[i]`` lower-cased.\n",
    "\n",
    "    Args:\n",
    "        data: sequence of context strings.\n",
    "        expected: sequence of strings aligned index-by-index with ``data``.\n",
    "\n",
    "    Returns:\n",
    "        Tuple ``(listOfBigrams, unigramCounts, bigramCounts)``: every pair in\n",
    "        input order, a count per context word, and a count per pair.\n",
    "    \"\"\"\n",
    "    listOfBigrams = []\n",
    "    bigramCounts = {}\n",
    "    unigramCounts = {}\n",
    "\n",
    "    for i in range(len(data)):\n",
    "        tokenized = word_tokenize(data[i])\n",
    "        # An empty context yields no tokens; use the empty word instead of\n",
    "        # failing on tokenized[-1].\n",
    "        word = re.sub(r'\\W+', '', tokenized[-1].lower()) if tokenized else ''\n",
    "        exp = expected[i].lower()\n",
    "        bigram = (word, exp)\n",
    "        listOfBigrams.append(bigram)\n",
    "        bigramCounts[bigram] = bigramCounts.get(bigram, 0) + 1\n",
    "        unigramCounts[word] = unigramCounts.get(word, 0) + 1\n",
    "\n",
    "    return listOfBigrams, unigramCounts, bigramCounts\n",
    "\n",
    "def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):\n",
    "    \"\"\"Turn bigram counts into relative frequencies.\n",
    "\n",
    "    Each bigram's count is divided by the sum of all values in\n",
    "    ``unigramCounts``.\n",
    "\n",
    "    Args:\n",
    "        listOfBigrams: iterable of bigram tuples (duplicates are allowed).\n",
    "        unigramCounts: dict of counts whose total is the denominator.\n",
    "        bigramCounts: dict mapping a bigram to its count.\n",
    "\n",
    "    Returns:\n",
    "        dict mapping each bigram from ``listOfBigrams`` to count / total;\n",
    "        a bigram missing from ``bigramCounts`` maps to 0.0.\n",
    "    \"\"\"\n",
    "    # NOTE(review): the denominator is the total over all unigrams, not the\n",
    "    # count of the bigram's first word, so these are not conditional\n",
    "    # probabilities P(second | first) - confirm this is intended.\n",
    "    total = sum(unigramCounts.values())  # loop-invariant, computed once\n",
    "    listOfProb = {}\n",
    "    for bigram in listOfBigrams:\n",
    "        listOfProb[bigram] = bigramCounts.get(bigram, 0) / total\n",
    "    return listOfProb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every (context word, expected word) pair from the dev set, plus both count tables.\n",
    "bigrams, uniCounts, biCounts = createBigram(data=dev_data, expected=dev_expected)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relative frequency of each collected bigram.\n",
    "probs = calcBigramProb(listOfBigrams=bigrams, unigramCounts=uniCounts, bigramCounts=biCounts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_results(probs, in_data):\n",
    "    \"\"\"Write one prediction line per input context to ``out_path``.\n",
    "\n",
    "    The last token of every entry in ``in_data`` is lower-cased and stripped of\n",
    "    non-word characters. All bigrams in ``probs`` that start with that word are\n",
    "    written as ``second_word:probability`` pairs, followed by ``:rest`` holding\n",
    "    1.0 minus the sum of the listed probabilities.\n",
    "\n",
    "    Args:\n",
    "        probs: dict mapping ``(first_word, second_word)`` to a probability.\n",
    "        in_data: sequence of context strings; one output line is written per entry.\n",
    "    \"\"\"\n",
    "    # Index the candidates by first word once instead of scanning every bigram\n",
    "    # for every input row.\n",
    "    by_first_word = {}\n",
    "    for (first, second), prob in probs.items():\n",
    "        by_first_word.setdefault(first, []).append((second, prob))\n",
    "\n",
    "    with open(out_path, 'w', encoding='utf-8') as f:\n",
    "        for context in in_data:\n",
    "            tokenized = word_tokenize(context)\n",
    "            # An empty context has no last token; fall back to the empty word.\n",
    "            word = re.sub(r'\\W+', '', tokenized[-1].lower()) if tokenized else ''\n",
    "            candidates = by_first_word.get(word, [])\n",
    "            rest = 1.0 - sum(prob for _, prob in candidates)\n",
    "            # Emit the predicted (second) word, not the context word it follows.\n",
    "            fields = [second + ':' + str(prob) for second, prob in candidates]\n",
    "            fields.append(':' + str(rest))\n",
    "            # Terminate each prediction so output rows line up with input rows.\n",
    "            f.write(' '.join(fields) + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Write the predictions for the dev contexts to out_path.\n",
    "save_results(probs=probs, in_data=dev_data)"
   ]
  }
 ]
}
415366 2022-03-28 11:04:33 +02:00			`{`
			`"metadata": {`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.8.10-final"`
			`},`
			`"orig_nbformat": 2,`
			`"kernelspec": {`
			`"name": "python3",`
			`"display_name": "Python 3 (ipykernel)",`
			`"language": "python"`
			`}`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 2,`
			`"cells": [`
			`{`
			`"cell_type": "code",`
smthing 2022-04-11 10:54:53 +02:00			`"execution_count": 88,`
415366 2022-03-28 11:04:33 +02:00			`"metadata": {`
			`"tags": []`
			`},`
			`"outputs": [],`
			`"source": [`
			`"import pandas as pd\n",`
			`"columns = ['FileId','Paper', 'Idk1', 'Year','Idk2','Idk3', 'LeftContext', 'RightContext']\n",`
			`"\n",`
			`"# dev_data = pd.read_csv('dev-0/in.tsv', sep='\\t', names=columns, engine='python', quotechar='\"', error_bad_lines=False)\n",`
			`"# dev_expected = pd.read_csv('dev-0/expected.tsv', sep='\\t', engine='python', quotechar='\"', error_bad_lines=False)\n",`
			`"dev_data = list()\n",`
smthing 2022-04-11 10:54:53 +02:00			`"data_path = 'dev-0/in.tsv'\n",`
			`"expected_path = 'dev-0/expected.tsv'\n",`
			`"out_path = 'dev-0/out.tsv'\n",`
			`"\n",`
			`"with open(data_path, \"r\") as f:\n",`
415366 2022-03-28 11:04:33 +02:00			`" for line in f.readlines():\n",`
			`" dev_data.append(line.split('\\t')[-2])\n",`
			`"\n",`
			`"dev_expected = list()\n",`
smthing 2022-04-11 10:54:53 +02:00			`"with open(expected_path, \"r\") as f:\n",`
415366 2022-03-28 11:04:33 +02:00			`" for line in f.readlines():\n",`
			`" dev_expected.append(line.replace('\\n',''))"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
smthing 2022-04-11 10:54:53 +02:00			`"execution_count": 89,`
415366 2022-03-28 11:04:33 +02:00			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"from nltk.tokenize import word_tokenize \n",`
smthing 2022-04-11 10:54:53 +02:00			`"import re\n",`
415366 2022-03-28 11:04:33 +02:00			`"\n",`
			`"def createBigram(data, expected):\n",`
			`" listOfBigrams = []\n",`
			`" bigramCounts = {}\n",`
			`" unigramCounts = {}\n",`
			`"\n",`
			`" for i in range(len(data)):\n",`
			`" tokenized = word_tokenize(data[i])\n",`
			`" word = tokenized[-1]\n",`
smthing 2022-04-11 10:54:53 +02:00			`" word = word.lower()\n",`
			`" word = re.sub('\\W+','', word)\n",`
			`" exp = expected[i].lower()\n",`
			`" listOfBigrams.append((word, exp))\n",`
			`" if (word, exp) in bigramCounts:\n",`
			`" bigramCounts[(word, exp)] += 1\n",`
415366 2022-03-28 11:04:33 +02:00			`" else:\n",`
smthing 2022-04-11 10:54:53 +02:00			`" bigramCounts[(word, exp)] = 1\n",`
			`" if word in unigramCounts:\n",`
415366 2022-03-28 11:04:33 +02:00			`" unigramCounts[word] += 1\n",`
			`" else:\n",`
			`" unigramCounts[word] = 1\n",`
			`" \n",`
			`" return listOfBigrams, unigramCounts, bigramCounts\n",`
			`"\n",`
			`"def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):\n",`
			`" listOfProb = {}\n",`
			`" for bigram in listOfBigrams:\n",`
			`" word1 = bigram[0]\n",`
			`" word2 = bigram[1]\n",`
smthing 2022-04-11 10:54:53 +02:00			`" listOfProb[bigram] = (bigramCounts.get(bigram))/(sum(unigramCounts.values()))\n",`
415366 2022-03-28 11:04:33 +02:00			`" return listOfProb"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
smthing 2022-04-11 10:54:53 +02:00			`"execution_count": 90,`
415366 2022-03-28 11:04:33 +02:00			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"bigrams, uniCounts, biCounts = createBigram(dev_data, dev_expected)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
smthing 2022-04-11 10:54:53 +02:00			`"execution_count": 91,`
415366 2022-03-28 11:04:33 +02:00			`"metadata": {},`
smthing 2022-04-11 10:54:53 +02:00			`"outputs": [],`
			`"source": [`
			`"probs = calcBigramProb(bigrams, uniCounts, biCounts)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 112,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def save_results(probs, in_data):\n",`
			`" with open(out_path, 'w') as f:\n",`
			`" for i in range(len(in_data)):\n",`
			`" tokenized = word_tokenize(in_data[i])\n",`
			`" word = tokenized[-1]\n",`
			`" word = word.lower()\n",`
			`" word = re.sub('\\W+','', word)\n",`
			`" word_probs = dict(filter(lambda elem: elem[0][0] == word, probs.items()))\n",`
			`" rest = 1.0 - sum(word_probs.values())\n",`
			`" word_probs = list(map(lambda elem: elem[0][0] + \":\" + str(elem[1]), list(word_probs.items())))\n",`
			`" word_probs.append(':'+str(rest))\n",`
			`" word_probs = ' '.join(word_probs)\n",`
			`" f.write(word_probs)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 113,`
			`"metadata": {`
			`"tags": []`
			`},`
			`"outputs": [],`
415366 2022-03-28 11:04:33 +02:00			`"source": [`
smthing 2022-04-11 10:54:53 +02:00			`"save_results(probs, dev_data)"`
415366 2022-03-28 11:04:33 +02:00			`]`
			`}`
			`]`
			`}`