{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "md-overview",
   "metadata": {},
   "source": [
    "# Keyword-based document classifier\n",
    "\n",
    "Every document in `in.tsv` gets label `1` when it contains more distinct keywords from the `male` set than from the `female` set, and `0` otherwise (ties and documents without any keyword included). The predictions are then compared position by position with the labels in `expected.tsv`.\n",
    "\n",
    "Words are compared on their first `PREFIX_LEN` characters only.\n",
    "\n",
    "The notebook is meant to be executed top to bottom on a fresh kernel (Restart Kernel → Run All)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b55a105",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import re\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-load",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9364cf2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# QUOTE_NONE makes the parser treat quotation marks inside the text as ordinary characters.\n",
    "# Only the first tab-separated column is used.\n",
    "tsv_data = pd.read_csv('in.tsv', sep='\\t', header=None, quoting=csv.QUOTE_NONE)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d3f7db9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reference labels, one per row; only the first column is used.\n",
    "expected = pd.read_csv('expected.tsv', sep='\\t', header=None)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5062478d",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(len(expected))\n",
    "print(len(tsv_data))\n",
    "\n",
    "# Predictions are scored against the labels row by row, so a length mismatch\n",
    "# would silently pair documents with the wrong labels.\n",
    "assert len(tsv_data) == len(expected), f'{len(tsv_data)} documents vs {len(expected)} labels'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-keywords",
   "metadata": {},
   "source": [
    "## Keyword sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5eca7aab",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of leading characters kept from every keyword and every document word.\n",
    "# Presumably a crude stemmer so that inflected forms share one key.\n",
    "PREFIX_LEN = 6\n",
    "\n",
    "male = {'silnik', 'windows', 'gb', 'mb', 'mecz', 'pc', 'opony', 'apple', 'iphone', 'zwiastuny', 'hd', 'ubuntu', 'system', 'serwer'}\n",
    "female = {'ciąża', 'miesiączki', 'ciasto', 'ciąże', 'zadowolona', 'antykoncepcyjne', 'ginekologia', 'tabletki', 'porodzie', 'mąż', 'krwawienie', 'ciasta'}\n",
    "\n",
    "# Truncating is idempotent, so re-running this cell is safe.\n",
    "male = {x[:PREFIX_LEN].lower() for x in male}\n",
    "female = {x[:PREFIX_LEN].lower() for x in female}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-tokenize",
   "metadata": {},
   "source": [
    "## Tokenise documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0bdd1845",
   "metadata": {},
   "outputs": [],
   "source": [
    "def tokenize(text):\n",
    "    \"\"\"Return the lower-cased word tokens of `text`, each cut to PREFIX_LEN characters.\n",
    "\n",
    "    A token is a maximal run of word characters, so punctuation attached to a\n",
    "    word, as in 'usb.' or '(windows', does not block a keyword match.\n",
    "    Non-string input such as NaN is converted with `str()` first.\n",
    "    \"\"\"\n",
    "    return [word[:PREFIX_LEN] for word in re.findall(r'\\w+', str(text).lower())]\n",
    "\n",
    "\n",
    "trimmed_docs = [tokenize(document) for document in tsv_data]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-classify",
   "metadata": {},
   "source": [
    "## Classify"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b36bbd92",
   "metadata": {},
   "outputs": [],
   "source": [
    "# For every document: (number of distinct male keywords, number of distinct female keywords).\n",
    "male_or_female = []\n",
    "for doc in trimmed_docs:\n",
    "    doc_words = set(doc)\n",
    "    male_or_female.append((len(male & doc_words), len(female & doc_words)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ccbad95c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1 only when strictly more male than female keywords were found;\n",
    "# ties, including documents with no keyword at all, get 0.\n",
    "answers = [1 if male_hits > female_hits else 0 for male_hits, female_hits in male_or_female]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-evaluate",
   "metadata": {},
   "source": [
    "## Evaluate\n",
    "\n",
    "The printed figure is the share of documents whose predicted label equals the expected one.\n",
    "\n",
    "An earlier saved run of this notebook, which split documents on whitespace only (so punctuation stayed attached to words), printed `51.007909%`. Re-run the notebook to obtain the figure for the current tokenisation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "02ee0acf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1 where the prediction matches the label at the same position, 0 otherwise.\n",
    "result = [int(answer == label) for answer, label in zip(answers, expected)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db803a58",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f'Predykcja modelu wynosi {sum(result)/len(result)*100:.6f}%')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1a15db7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Spot check: tokens of the first document.\n",
    "trimmed_docs[0]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}