249 lines
5.1 KiB
Plaintext
249 lines
5.1 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 90,
|
||
"id": "5b55a105",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import csv"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 91,
|
||
"id": "9364cf2c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"tsv_data = pd.read_csv('in.tsv', sep='\\t',header=None, quoting=csv.QUOTE_NONE)[0]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 139,
|
||
"id": "9d3f7db9",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"expected = pd.read_csv('expected.tsv', sep='\\t',header=None)[0]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 94,
|
||
"id": "5062478d",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"137314\n",
|
||
"137314\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"print(len(expected))\n",
|
||
"print(len(tsv_data))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 158,
|
||
"id": "5eca7aab",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"male={'silnik', 'windows', 'gb', 'mb', 'mecz', 'pc', 'opony', 'apple', 'iphone', 'zwiastuny', 'hd', 'ubuntu', 'system', 'serwer'}\n",
|
||
"female={'ciąża', 'miesiączki', 'ciasto', 'ciąże', 'zadowolona', 'antykoncepcyjne', 'ginekologia', 'tabletki', 'porodzie', 'mąż', 'krwawienie', 'ciasta'}\n",
|
||
"male = {x[:6].lower() for x in male}\n",
|
||
"female = {x[:6].lower() for x in female}"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 159,
|
||
"id": "0bdd1845",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"trimmed_docs=[]\n",
|
||
"for document in tsv_data:\n",
|
||
" new_doc=[]\n",
|
||
" for word in str(document).lower().split():\n",
|
||
" new_doc.append(word[:6])\n",
|
||
" trimmed_docs.append(new_doc)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 160,
|
||
"id": "b36bbd92",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"male_or_female=[]\n",
|
||
"for doc in trimmed_docs:\n",
|
||
" male_or_female.append((len(male&set(doc)), len(female&set(doc))))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 161,
|
||
"id": "ccbad95c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"answers=[]\n",
|
||
"for i in male_or_female:\n",
|
||
" if i[0]>i[1]:\n",
|
||
" answers.append(1)\n",
|
||
" else:\n",
|
||
" answers.append(0)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 162,
|
||
"id": "02ee0acf",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"result=[]\n",
|
||
"for i in range(len(answers)):\n",
|
||
" if answers[i]==expected[i]:\n",
|
||
" result.append(1)\n",
|
||
" else:\n",
|
||
" result.append(0)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 163,
|
||
"id": "db803a58",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Predykcja modelu wynosi 51.007909%\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"print(f'Predykcja modelu wynosi {sum(result)/len(result)*100:.6f}%')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 164,
|
||
"id": "e1a15db7",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"['cierpi',\n",
|
||
" 'na',\n",
|
||
" 'strasz',\n",
|
||
" 'lagi',\n",
|
||
" '–',\n",
|
||
" 'kilkan',\n",
|
||
" 'sekund',\n",
|
||
" 'lub',\n",
|
||
" 'dłużej',\n",
|
||
" 'czarne',\n",
|
||
" 'ekranu',\n",
|
||
" 'przy',\n",
|
||
" 'próbie',\n",
|
||
" 'przełą',\n",
|
||
" 'się',\n",
|
||
" '/',\n",
|
||
" 'urucho',\n",
|
||
" 'prawie',\n",
|
||
" 'każdej',\n",
|
||
" 'aplika',\n",
|
||
" 'dodatk',\n",
|
||
" 'telefo',\n",
|
||
" 'mi',\n",
|
||
" 'się',\n",
|
||
" 'wyłącz',\n",
|
||
" 'czasem',\n",
|
||
" 'bez',\n",
|
||
" 'powodu',\n",
|
||
" '–',\n",
|
||
" 'sam',\n",
|
||
" 'z',\n",
|
||
" 'siebie',\n",
|
||
" 'albo',\n",
|
||
" 'reseto',\n",
|
||
" 'ostatn',\n",
|
||
" 'nawet',\n",
|
||
" 'przegl',\n",
|
||
" 'zaczęł',\n",
|
||
" 'się',\n",
|
||
" 'często',\n",
|
||
" 'zawies',\n",
|
||
" 'i',\n",
|
||
" 'androi',\n",
|
||
" 'propon',\n",
|
||
" 'wymusz',\n",
|
||
" 'zamkni',\n",
|
||
" 'do',\n",
|
||
" 'tego',\n",
|
||
" 'te',\n",
|
||
" 'proble',\n",
|
||
" 'z',\n",
|
||
" 'połącz',\n",
|
||
" 'do',\n",
|
||
" 'komput',\n",
|
||
" 'przez',\n",
|
||
" 'usb.']"
|
||
]
|
||
},
|
||
"execution_count": 164,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"trimmed_docs[0]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "7403c1bb",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.9.7"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|