s434732 2022-04-10 21:13:49 +02:00
parent e362ba345c
commit bd9a546fdb
6 changed files with 14177 additions and 13744 deletions

File diff suppressed because it is too large

BIN
geval Normal file

Binary file not shown.

215
run.ipynb Normal file

@@ -0,0 +1,215 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 136,
"id": "e1ae390b",
"metadata": {},
"outputs": [],
"source": [
"import lzma\n",
"import nltk\n",
"\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk import trigrams\n",
"from nltk.stem import PorterStemmer\n",
"from nltk.tokenize import word_tokenize\n",
"from statistics import mean\n",
"from wordcloud import WordCloud,STOPWORDS\n",
"from collections import defaultdict, Counter\n",
"import plotly.express as px\n",
"import pandas as pd\n",
"from tqdm import tqdm\n",
"from nltk import ngrams\n",
"import pandas as pd\n",
"import csv\n",
"import cleantext\n",
"import re\n",
"import string"
]
},
{
"cell_type": "code",
"execution_count": 172,
"id": "32ece3fd",
"metadata": {},
"outputs": [],
"source": [
"model = defaultdict(lambda: defaultdict(lambda: 0))"
]
},
{
"cell_type": "code",
"execution_count": 173,
"id": "9174b2bb",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_755/1610768154.py:1: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" train_file_in = pd.read_csv(\"train/in.tsv.xz\", sep=\"\\t\", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE, nrows=200000)\n",
"/tmp/ipykernel_755/1610768154.py:2: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" train_file_out = pd.read_csv(\"train/expected.tsv\", sep=\"\\t\", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE, nrows=200000)\n"
]
}
],
"source": [
"train_file_in = pd.read_csv(\"train/in.tsv.xz\", sep=\"\\t\", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE, nrows=200000)\n",
"train_file_out = pd.read_csv(\"train/expected.tsv\", sep=\"\\t\", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE, nrows=200000)"
]
},
{
"cell_type": "code",
"execution_count": 174,
"id": "52848cf3",
"metadata": {},
"outputs": [],
"source": [
"stop_words= nltk.corpus.stopwords.words('english')\n",
"\n",
"def get_20common_2grams(text, n):\n",
" outputTrigrams = []\n",
" n_grams = ngrams(nltk.tokenize.word_tokenize(text), n)\n",
" for grams in n_grams:\n",
" outputTrigrams.append(grams)\n",
" return outputTrigrams\n",
"\n",
"def get_20common_2grams_no_stop(text, n):\n",
" tokenized_world = nltk.tokenize.word_tokenize(text)\n",
" stop_words= nltk.corpus.stopwords.words('english') \n",
" tokenized_no_stop = [i for i in tokenized_world if i not in stop_words]\n",
" n_grams = ngrams(tokenized_no_stop, n)\n",
" return n_grams\n",
"\n",
"def predict(word_before, word_after):\n",
" print(\"tu jestem\")\n",
" prob_list = dict(Counter(model[(word_before, word_after)]).most_common(6)).items()\n",
" predictions = []\n",
" prob_sum = 0.0\n",
" for key, value in prob_list:\n",
" print(\"tu jestem .................................\")\n",
" prob_sum += value\n",
" predictions.append(f'{key}:{value}')\n",
" if prob_sum == 0.0:\n",
" print(\"a teraz tu\")\n",
" return 'the:0:2 be:0.2 to:0.2 of:0.15 and:0.15 :0.1'\n",
" remaining_prob = 1 - prob_sum\n",
" if remaining_prob < 0.01:\n",
" predictions.append(f':{0.01}')\n",
" return ' '.join(predictions)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 175,
"id": "b7757d06",
"metadata": {},
"outputs": [],
"source": [
"train = train_file_in[[6, 7]]\n",
"train = pd.concat([train, train_file_out], axis=1)\n",
"\n",
"train[\"result\"] = train[6] + train[0] + train[7]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b9ce1a1",
"metadata": {},
"outputs": [],
"source": [
"for index, row in train.iterrows():\n",
" lower= str(row[\"result\"]).lower()\n",
" new_doc = re.sub(\"s+\",\" \", lower)\n",
" text_clean = \"\".join([i for i in new_doc if i not in string.punctuation])\n",
" words = word_tokenize(text_clean)\n",
" for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):\n",
" if w1 and w2 and w3:\n",
" model[(w2, w3)][w1] += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "57c08749",
"metadata": {},
"outputs": [],
"source": [
"for key in model:\n",
" total_count = float(sum(model[key].values()))\n",
" for value in model[key]:\n",
" model[key][value] /= total_count"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f0ad2b1a",
"metadata": {},
"outputs": [],
"source": [
"dev_data = pd.read_csv('dev-0/in.tsv.xz', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n",
"test_a_data = pd.read_csv('test-A/in.tsv.xz', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e91e51e5",
"metadata": {},
"outputs": [],
"source": [
"with open('dev-0/out.tsv', 'w') as file:\n",
" for index, row in dev_data.iterrows():\n",
" lower= str(row[7]).lower()\n",
" new_doc = re.sub(\"s+\",\" \", lower)\n",
" text_clean = \"\".join([i for i in new_doc if i not in string.punctuation])\n",
" words = word_tokenize(text_clean)\n",
" if len(words) < 4:\n",
" print(words)\n",
" prediction = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'\n",
" else:\n",
" prediction = predict(words[0], words[1])\n",
" file.write(prediction + '\\n')\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "01e3f7e8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
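Condensed, the approach in run.ipynb is: count candidate gap words keyed by the two words that follow the gap, normalize the counts to relative frequencies per context, and emit the top candidates as "word:prob" pairs with the leftover mass assigned to the unknown token. A minimal standalone sketch of that flow (`texts` is an illustrative placeholder for the cleaned training rows, not a name from the repository):

from collections import defaultdict, Counter
from nltk import trigrams
from nltk.tokenize import word_tokenize

model = defaultdict(Counter)

def train_on(texts):
    # Count each word keyed by the two words that follow it, then
    # convert the counts to relative frequencies per context.
    for text in texts:
        for w1, w2, w3 in trigrams(word_tokenize(text.lower()), pad_left=True, pad_right=True):
            if w1 and w2 and w3:
                model[(w2, w3)][w1] += 1
    for counts in model.values():
        total = sum(counts.values())
        for w in counts:
            counts[w] /= total

def predict(w2, w3):
    # Top six candidates for the gap preceding (w2, w3); the residual
    # probability mass goes to the unknown token (empty word before ':').
    best = model[(w2, w3)].most_common(6)
    if not best:
        return 'the:0.2 be:0.2 to:0.2 of:0.15 and:0.15 :0.1'
    rest = 1 - sum(p for _, p in best)
    pairs = ' '.join(f'{w}:{p}' for w, p in best)
    return pairs + (f' :{rest}' if rest > 0 else '')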

17
run.py

@@ -15,7 +15,8 @@ import re
model = defaultdict(lambda: defaultdict(lambda: 0))
setOf = set()
alpha = 0.02
# In[173]:
@@ -76,6 +77,14 @@ for index, row in train.iterrows():
for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
if w1 and w2 and w3:
model[(w2, w3)][w1] += 1
setOf.add(w1)
setOf.add(w2)
setOf.add(w3)
for words in model:
num_n_grams = float(sum(model[words].values()))
for word in model[words]:
model[words][word] = (model[words][word] + alpha) / (num_n_grams + alpha * len(setOf))
# In[ ]:
@@ -122,6 +131,6 @@ with open('test-A/out.tsv', 'w', encoding="utf-8") as file:
# In[ ]:
# test for alpha 0.01 = 918.59
# test for alpha 0.001 = 919.14
# test for alpha 0.02 = 917.99
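The hunk above adds add-alpha (Lidstone) smoothing: every trigram count is inflated by alpha before normalizing, so a word never seen in a given context still gets a small non-zero probability instead of zero. Plugging in assumed figures (a context seen 10 times in total, a vocabulary of 100,000 words; both numbers are for illustration only, not from the repository):

alpha = 0.02
total, vocab_size = 10, 100_000                      # assumed figures
seen = (3 + alpha) / (total + alpha * vocab_size)    # count 3 -> ~0.0015
unseen = (0 + alpha) / (total + alpha * vocab_size)  # count 0 -> ~1e-05
print(seen, unseen)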

209
run_plusaplha.ipynb Normal file

@@ -0,0 +1,209 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "e1ae390b",
"metadata": {},
"outputs": [],
"source": [
"import lzma\n",
"import nltk\n",
"\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk import trigrams\n",
"from nltk.stem import PorterStemmer\n",
"from nltk.tokenize import word_tokenize\n",
"from statistics import mean\n",
"from wordcloud import WordCloud,STOPWORDS\n",
"from collections import defaultdict, Counter\n",
"import plotly.express as px\n",
"import pandas as pd\n",
"from tqdm import tqdm\n",
"from nltk import ngrams\n",
"import pandas as pd\n",
"import csv\n",
"import re\n",
"import string"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "32ece3fd",
"metadata": {},
"outputs": [],
"source": [
"model = defaultdict(lambda: defaultdict(lambda: 0))\n",
"setOf = set()\n",
"alpha = 0.01"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "9174b2bb",
"metadata": {},
"outputs": [],
"source": [
"train_file_in = pd.read_csv(\"train/in.tsv.xz\", sep=\"\\t\", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE, nrows=200000)\n",
"train_file_out = pd.read_csv(\"train/expected.tsv\", sep=\"\\t\", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE, nrows=200000)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52848cf3",
"metadata": {},
"outputs": [],
"source": [
"stop_words= nltk.corpus.stopwords.words('english')\n",
"\n",
"def get_20common_2grams(text, n):\n",
" outputTrigrams = []\n",
" n_grams = ngrams(nltk.tokenize.word_tokenize(text), n)\n",
" for grams in n_grams:\n",
" outputTrigrams.append(grams)\n",
" return outputTrigrams\n",
"\n",
"def get_20common_2grams_no_stop(text, n):\n",
" tokenized_world = nltk.tokenize.word_tokenize(text)\n",
" stop_words= nltk.corpus.stopwords.words('english') \n",
" tokenized_no_stop = [i for i in tokenized_world if i not in stop_words]\n",
" n_grams = ngrams(tokenized_no_stop, n)\n",
" return n_grams\n",
"\n",
"def predict(word_before, word_after):\n",
" print(\"tu jestem\")\n",
" prob_list = dict(Counter(model[(word_before, word_after)]).most_common(6)).items()\n",
" predictions = []\n",
" prob_sum = 0.0\n",
" for key, value in prob_list:\n",
" print(\"tu jestem .................................\")\n",
" prob_sum += value\n",
" predictions.append(f'{key}:{value}')\n",
" if prob_sum == 0.0:\n",
" print(\"a teraz tu\")\n",
" return 'the:0:2 be:0.2 to:0.2 of:0.15 and:0.15 :0.1'\n",
" remaining_prob = 1 - prob_sum\n",
" if remaining_prob < 0.01:\n",
" predictions.append(f':{0.01}')\n",
" return ' '.join(predictions)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 175,
"id": "b7757d06",
"metadata": {},
"outputs": [],
"source": [
"train = train_file_in[[6, 7]]\n",
"train = pd.concat([train, train_file_out], axis=1)\n",
"\n",
"train[\"result\"] = train[6] + train[0] + train[7]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b9ce1a1",
"metadata": {},
"outputs": [],
"source": [
"for index, row in train.iterrows():\n",
" lower= str(row[\"result\"]).lower()\n",
" new_doc = re.sub(\"s+\",\" \", lower)\n",
" text_clean = \"\".join([i for i in new_doc if i not in string.punctuation])\n",
" words = word_tokenize(text_clean)\n",
" for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):\n",
" if w1 and w2 and w3:\n",
" model[(w2, w3)][w1] += 1\n",
" setOf.add(w1)\n",
" setOf.add(w2)\n",
" setOf.add(w3)\n",
" \n",
" for words in model:\n",
" num_n_grams = float(sum(model[words].values()))\n",
" for word in model[words]:\n",
" model[words][word] = (model[words][word] + alpha) / (num_n_grams + alpha*len(vocab))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "57c08749",
"metadata": {},
"outputs": [],
"source": [
"for key in model:\n",
" total_count = float(sum(model[key].values()))\n",
" for value in model[key]:\n",
" model[key][value] /= total_count"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f0ad2b1a",
"metadata": {},
"outputs": [],
"source": [
"dev_data = pd.read_csv('dev-0/in.tsv.xz', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n",
"test_a_data = pd.read_csv('test-A/in.tsv.xz', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e91e51e5",
"metadata": {},
"outputs": [],
"source": [
"with open('dev-0/out.tsv', 'w') as file:\n",
" for index, row in dev_data.iterrows():\n",
" lower= str(row[7]).lower()\n",
" new_doc = re.sub(\"s+\",\" \", lower)\n",
" text_clean = \"\".join([i for i in new_doc if i not in string.punctuation])\n",
" words = word_tokenize(text_clean)\n",
" if len(words) < 4:\n",
" print(words)\n",
" prediction = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'\n",
" else:\n",
" prediction = predict(words[0], words[1])\n",
" file.write(prediction + '\\n')\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "01e3f7e8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
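Both notebooks only write dev-0/out.tsv; the second hunk of run.py shows test-A/out.tsv being produced the same way. A sketch of that step, mirroring the dev-0 loop and assuming the test_a_data frame, predict(), and cleaning steps defined above:

with open('test-A/out.tsv', 'w', encoding='utf-8') as file:
    for _, row in test_a_data.iterrows():
        # Same cleaning as the dev-0 loop: lowercase, collapse whitespace,
        # strip punctuation, then tokenize the right context (column 7).
        lower = str(row[7]).lower()
        text_clean = ''.join(ch for ch in re.sub(r'\s+', ' ', lower) if ch not in string.punctuation)
        words = word_tokenize(text_clean)
        if len(words) < 4:
            prediction = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
        else:
            prediction = predict(words[0], words[1])
        file.write(prediction + '\n')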

File diff suppressed because it is too large