{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "md-title",
   "metadata": {},
   "source": [
    "# Keyword-prefix baseline classifier\n",
    "\n",
    "Predicts `1` for a document in `in.tsv` when it contains more distinct prefixes from the `male` keyword set than from the `female` set, and `0` otherwise (ties included). Predictions are scored against `expected.tsv` and written to `out.tsv`.\n",
    "\n",
    "**TODO (review):** confirm that label `1` in `expected.tsv` corresponds to the `male` class; the recorded accuracy is close to 50%."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "7dc5e391",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "cfg-constants",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Paths are resolved relative to the notebook's working directory.\n",
    "IN_PATH = 'in.tsv'\n",
    "EXPECTED_PATH = 'expected.tsv'\n",
    "OUT_PATH = 'out.tsv'\n",
    "\n",
    "# Keywords and document tokens are both cut to this many leading characters before matching\n",
    "# (presumably so that inflected forms of a word share one prefix).\n",
    "PREFIX_LEN = 6"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-load",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a0825c64",
   "metadata": {},
   "outputs": [],
   "source": [
    "# QUOTE_NONE: quote characters in the text are kept as ordinary characters instead of delimiting fields.\n",
    "# Only the first column is used.\n",
    "tsv_data = pd.read_csv(IN_PATH, sep='\\t', header=None, quoting=csv.QUOTE_NONE)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "4b9092a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One label per row, no header; only the first column is used.\n",
    "expected = pd.read_csv(EXPECTED_PATH, sep='\\t', header=None)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "56c39aa1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "137314\n",
      "137314\n"
     ]
    }
   ],
   "source": [
    "print(len(expected))\n",
    "print(len(tsv_data))\n",
    "# Scoring below pairs rows by position, so the two files must line up.\n",
    "assert len(expected) == len(tsv_data), 'in.tsv and expected.tsv must have the same number of rows'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-keywords",
   "metadata": {},
   "source": [
    "## Keyword prefixes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d7b300ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "male_keywords = {'silnik', 'windows', 'gb', 'mb', 'mecz', 'pc', 'opony', 'apple', 'iphone', 'zwiastuny', 'hd', 'ubuntu', 'system', 'serwer'}\n",
    "female_keywords = {'ciąża', 'miesiączki', 'ciasto', 'ciąże', 'zadowolona', 'antykoncepcyjne', 'ginekologia', 'tabletki', 'porodzie', 'mąż', 'krwawienie', 'ciasta'}\n",
    "\n",
    "# Cut keywords to the same prefix length as the document tokens so they can be matched by set intersection.\n",
    "male = {x[:PREFIX_LEN].lower() for x in male_keywords}\n",
    "female = {x[:PREFIX_LEN].lower() for x in female_keywords}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-classify",
   "metadata": {},
   "source": [
    "## Classify"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "31b5864b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lower-case each document, split on whitespace and keep the first PREFIX_LEN characters of every token.\n",
    "# str() guards against non-string cells (e.g. a missing value parsed as NaN).\n",
    "trimmed_docs = [\n",
    "    [word[:PREFIX_LEN] for word in str(document).lower().split()]\n",
    "    for document in tsv_data\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "c1f02d77",
   "metadata": {},
   "outputs": [],
   "source": [
    "# For every document: (number of distinct male prefixes present, number of distinct female prefixes present).\n",
    "male_or_female = []\n",
    "for doc in trimmed_docs:\n",
    "    doc_prefixes = set(doc)  # built once and reused for both intersections\n",
    "    male_or_female.append((len(male & doc_prefixes), len(female & doc_prefixes)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "6edfd944",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1 only when male prefixes strictly outnumber female ones; ties (including no hits at all) give 0.\n",
    "answers = [\n",
    "    1 if male_hits > female_hits else 0\n",
    "    for male_hits, female_hits in male_or_female\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-evaluate",
   "metadata": {},
   "source": [
    "## Evaluate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "40369c2b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-row correctness flag: 1 where the prediction equals the expected label, else 0.\n",
    "result = [\n",
    "    1 if answer == label else 0\n",
    "    for answer, label in zip(answers, expected)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "e296921c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Predykcja modelu wynosi 51.007909%\n"
     ]
    }
   ],
   "source": [
    "# Share of rows whose prediction matches the expected label, as a percentage.\n",
    "print(f'Predykcja modelu wynosi {sum(result)/len(result)*100:.6f}%')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-write",
   "metadata": {},
   "source": [
    "## Write predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "fee431a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the predictions themselves (`answers`), not the per-row correctness flags (`result`).\n",
    "# index=False / header=False give one label per line, the same layout in which expected.tsv is read.\n",
    "df = pd.DataFrame(answers)\n",
    "df.to_csv(OUT_PATH, sep='\\t', index=False, header=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}