diff --git a/dev-0/.ipynb_checkpoints/model-checkpoint.ipynb b/dev-0/.ipynb_checkpoints/model-checkpoint.ipynb
new file mode 100644
index 0000000..363fcab
--- /dev/null
+++ b/dev-0/.ipynb_checkpoints/model-checkpoint.ipynb
@@ -0,0 +1,6 @@
+{
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/dev-0/.jovianrc b/dev-0/.jovianrc
new file mode 100644
index 0000000..2f4547b
--- /dev/null
+++ b/dev-0/.jovianrc
@@ -0,0 +1,3 @@
+{
+  "notebooks": {}
+}
\ No newline at end of file
diff --git a/dev-0/model.ipynb b/dev-0/model.ipynb
new file mode 100644
index 0000000..d2af76b
--- /dev/null
+++ b/dev-0/model.ipynb
@@ -0,0 +1,257 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "id": "5b55a105",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import csv"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 91,
+   "id": "9364cf2c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Column 0 of in.tsv is treated below as the document text; QUOTE_NONE keeps quote characters literal.\n",
+    "tsv_data = pd.read_csv('in.tsv', sep='\\t', header=None, quoting=csv.QUOTE_NONE)[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 139,
+   "id": "9d3f7db9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Column 0 of expected.tsv holds the labels that the 0/1 predictions are scored against.\n",
+    "expected = pd.read_csv('expected.tsv', sep='\\t', header=None)[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 94,
+   "id": "5062478d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "137314\n",
+      "137314\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Sanity check: labels and documents should have the same number of rows.\n",
+    "print(len(expected))\n",
+    "print(len(tsv_data))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 158,
+   "id": "5eca7aab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keyword sets for the two classes; a document with more `male` hits is predicted 1.\n",
+    "male = {'silnik', 'windows', 'gb', 'mb', 'mecz', 'pc', 'opony', 'apple', 'iphone', 'zwiastuny', 'hd', 'ubuntu', 'system', 'serwer'}\n",
+    "female = {'ciąża', 'miesiączki', 'ciasto', 'ciąże', 'zadowolona', 'antykoncepcyjne', 'ginekologia', 'tabletki', 'porodzie', 'mąż', 'krwawienie', 'ciasta'}\n",
+    "# Keep only the first 6 characters (lower-cased) so keywords line up with the\n",
+    "# equally truncated document tokens built in the next cell.\n",
+    "male = {keyword[:6].lower() for keyword in male}\n",
+    "female = {keyword[:6].lower() for keyword in female}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 159,
+   "id": "0bdd1845",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Lower-case each document, split it on whitespace and cut every token to 6 characters.\n",
+    "# NOTE(review): punctuation is not stripped, so tokens such as 'usb.' keep it.\n",
+    "trimmed_docs = [\n",
+    "    [word[:6] for word in str(document).lower().split()]\n",
+    "    for document in tsv_data\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 160,
+   "id": "b36bbd92",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Per document: (distinct male keywords found, distinct female keywords found).\n",
+    "male_or_female = [\n",
+    "    (len(male & tokens), len(female & tokens))\n",
+    "    for tokens in map(set, trimmed_docs)\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 161,
+   "id": "ccbad95c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Predict 1 only when male hits strictly outnumber female hits; ties (including\n",
+    "# documents with no hits at all) fall through to 0.\n",
+    "answers = [\n",
+    "    1 if male_hits > female_hits else 0\n",
+    "    for male_hits, female_hits in male_or_female\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 162,
+   "id": "02ee0acf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 1 where the prediction equals the expected label, 0 otherwise.\n",
+    "result = [\n",
+    "    1 if guess == expected[idx] else 0\n",
+    "    for idx, guess in enumerate(answers)\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 163,
+   "id": "db803a58",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Predykcja modelu wynosi 51.007909%\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Share of predictions that match the expected labels, as a percentage.\n",
+    "print(f'Predykcja modelu wynosi {sum(result)/len(result)*100:.6f}%')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 164,
+   "id": "e1a15db7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['cierpi',\n",
+       " 'na',\n",
+       " 'strasz',\n",
+       " 'lagi',\n",
+       " '–',\n",
+       " 'kilkan',\n",
+       " 'sekund',\n",
+       " 'lub',\n",
+       " 'dłużej',\n",
+       " 'czarne',\n",
+       " 'ekranu',\n",
+       " 'przy',\n",
+       " 'próbie',\n",
+       " 'przełą',\n",
+       " 'się',\n",
+       " '/',\n",
+       " 'urucho',\n",
+       " 'prawie',\n",
+       " 'każdej',\n",
+       " 'aplika',\n",
+       " 'dodatk',\n",
+       " 'telefo',\n",
+       " 'mi',\n",
+       " 'się',\n",
+       " 'wyłącz',\n",
+       " 'czasem',\n",
+       " 'bez',\n",
+       " 'powodu',\n",
+       " '–',\n",
+       " 'sam',\n",
+       " 'z',\n",
+       " 'siebie',\n",
+       " 'albo',\n",
+       " 'reseto',\n",
+       " 'ostatn',\n",
+       " 'nawet',\n",
+       " 'przegl',\n",
+       " 'zaczęł',\n",
+       " 'się',\n",
+       " 'często',\n",
+       " 'zawies',\n",
+       " 'i',\n",
+       " 'androi',\n",
+       " 'propon',\n",
+       " 'wymusz',\n",
+       " 'zamkni',\n",
+       " 'do',\n",
+       " 'tego',\n",
+       " 'te',\n",
+       " 'proble',\n",
+       " 'z',\n",
+       " 'połącz',\n",
+       " 'do',\n",
+       " 'komput',\n",
+       " 'przez',\n",
+       " 'usb.']"
+      ]
+     },
+     "execution_count": 164,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Inspect the tokens of the first document.\n",
+    "trimmed_docs[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7403c1bb",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}