diff --git a/.gitignore b/.gitignore
index 1c18d74..cbe754f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,5 @@
 *.o
 .DS_Store
 .token
+geval
+*in.tsv
\ No newline at end of file
diff --git a/notebook.ipynb b/notebook.ipynb
new file mode 100644
index 0000000..da6f02d
--- /dev/null
+++ b/notebook.ipynb
@@ -0,0 +1,163 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "unxz: challenging-america-word-gap-prediction/train/in.tsv.xz: No such file or directory\n",
+      "unxz: challenging-america-word-gap-prediction/test-A/in.tsv.xz: No such file or directory\n",
+      "unxz: challenging-america-word-gap-prediction/dev-0/in.tsv.xz: No such file or directory\n"
+     ]
+    }
+   ],
+   "source": [
+    "!unxz challenging-america-word-gap-prediction/train/in.tsv.xz --keep\n",
+    "!unxz challenging-america-word-gap-prediction/test-A/in.tsv.xz --keep\n",
+    "!unxz challenging-america-word-gap-prediction/dev-0/in.tsv.xz --keep"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "expected.tsv in.tsv\n"
+     ]
+    }
+   ],
+   "source": [
+    "!ls challenging-america-word-gap-prediction/train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "All texts: 10\n",
+      "All labels: 10\n"
+     ]
+    }
+   ],
+   "source": [
+    "import nltk\n",
+    "def get_texts():\n",
+    "    with open(\"challenging-america-word-gap-prediction/train/in.tsv\", \"r\", encoding=\"UTF-8\") as f:\n",
+    "        i = 0\n",
+    "        while True:\n",
+    "            i += 1\n",
+    "            text = f.readline()\n",
+    "            if not text or i > 10:\n",
+    "                break\n",
+    "            text = text.split('\\t')[6]\n",
+    "            text = text.replace(\"-\\n\", \"\").replace(\"\\n\", \" \")\n",
+    "            yield text\n",
+    "\n",
+    "# def get_words():\n",
+    "#     for text in get_texts():\n",
+    "#         for word in nltk.word_tokenize(text):\n",
+    "#             yield word\n",
+    "\n",
+    "def get_labels():\n",
+    "    with open(\"challenging-america-word-gap-prediction/train/expected.tsv\", \"r\", encoding=\"UTF-8\") as f:\n",
+    "        yield from f.readlines()[0:10]\n",
+    "\n",
+    "texts_sum = sum(1 for text in get_texts())\n",
+    "labels_sum = sum(1 for label in get_labels())\n",
+    "# words_sum = sum(1 for word in get_words())\n",
+    "print(f\"All texts: {texts_sum}\")\n",
+    "print(f\"All labels: {labels_sum}\")\n",
+    "# print(f\"All words: {words_sum}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "None\n",
+      "None\n",
+      "None\n",
+      "None\n",
+      "None\n",
+      "None\n",
+      "None\n",
+      "None\n",
+      "None\n",
+      "None\n"
+     ]
+    }
+   ],
+   "source": [
+    "for text in get_texts():\n",
+    "    print(text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Model bigramowy odwrotny"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Model():\n",
+    "    def __init__(self, vocab_size, UNK_token=''):\n",
+    "        pass\n",
+    "    \n",
+    "    def train(self, corpus: list) -> None:\n",
+    "        pass\n",
+    "    \n",
+    "    def predict(self, text: list, probs: str) -> float:\n",
+    "        pass"
+   ]
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.8.5 64-bit",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}