projekt
This commit is contained in:
commit
a49e52ee6c
5452
expected.tsv
Normal file
5452
expected.tsv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
projekt.docx
Normal file
BIN
projekt.docx
Normal file
Binary file not shown.
146
projekt.ipynb
Normal file
146
projekt.ipynb
Normal file
@ -0,0 +1,146 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\Users\\szymo\\AppData\\Local\\Temp\\ipykernel_17472\\3947148253.py:28: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" data = pd.read_csv(f, sep='\\t',error_bad_lines=False,names=['isBall','text'])\n",
|
||||
"b'Skipping line 25706: expected 2 fields, saw 3\\nSkipping line 58881: expected 2 fields, saw 3\\nSkipping line 73761: expected 2 fields, saw 3\\n'\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Naiwny bayes:\n",
|
||||
"recal score = 0.9939463822427212\n",
|
||||
"acuracy score = 0.9889948642699926\n",
|
||||
"precision score = 0.9888156008029825\n",
|
||||
"f score score = 0.9913743530764807\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"c:\\Users\\szymo\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:814: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
|
||||
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
|
||||
"\n",
|
||||
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
|
||||
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
|
||||
"Please also refer to the documentation for alternative solver options:\n",
|
||||
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
|
||||
" n_iter_i = _check_optimize_result(\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"Regresja logistyczna:\n",
|
||||
"recal score = 0.9979821274142404\n",
|
||||
"acuracy score = 0.9891782831988262\n",
|
||||
"precision score = 0.9852020489470689\n",
|
||||
"f score score = 0.9915509093512818\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import lzma\n",
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import gzip\n",
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"from sklearn.naive_bayes import MultinomialNB\n",
|
||||
"from sklearn.pipeline import make_pipeline\n",
|
||||
"from sklearn.metrics import recall_score\n",
|
||||
"from sklearn.metrics import precision_score\n",
|
||||
"from sklearn.metrics import accuracy_score\n",
|
||||
"from sklearn.metrics import f1_score\n",
|
||||
"from sklearn.linear_model import LogisticRegression\n",
|
||||
"\n",
|
||||
"def readFile(filename):\n",
|
||||
"    \"\"\"Return the first tab-separated field of each line of a UTF-8 text file.\n",
|
||||
"\n",
|
||||
"    Every line of `filename` yields exactly one list entry: the text before\n",
|
||||
"    the first tab (or the whole line when there is no tab), stripped of\n",
|
||||
"    surrounding whitespace. Blank lines therefore produce empty strings.\n",
|
||||
"    \"\"\"\n",
|
||||
"    with open(filename, mode='r', encoding=\"utf-8\") as handle:\n",
|
||||
"        return [row.partition(\"\\t\")[0].strip() for row in handle]\n",
|
||||
" \n",
|
||||
"def writePred(filename, predictions):\n",
|
||||
"    \"\"\"Write one prediction per line to `filename`, replacing existing content.\n",
|
||||
"\n",
|
||||
"    Each item of `predictions` is converted with str() and followed by a\n",
|
||||
"    newline. The file is written as UTF-8, the same encoding readFile uses,\n",
|
||||
"    so the result does not depend on the platform's default encoding.\n",
|
||||
"    \"\"\"\n",
|
||||
"    with open(filename, \"w\", encoding=\"utf-8\") as out_file:\n",
|
||||
"        out_file.writelines(str(pred) + \"\\n\" for pred in predictions)\n",
|
||||
"\n",
|
||||
"# Training data: gzip-compressed TSV read without a header row as (isBall, text).\n",
|
||||
"with gzip.open('train.tsv.gz', 'rb') as f:\n",
|
||||
"    # on_bad_lines='warn' supersedes the deprecated error_bad_lines=False:\n",
|
||||
"    # rows with an unexpected field count are still skipped and reported.\n",
|
||||
"    data = pd.read_csv(f, sep='\\t', on_bad_lines='warn', names=['isBall', 'text'])\n",
|
||||
"\n",
|
||||
"x = np.asarray(data['text'])\n",
|
||||
"y = np.asarray(data['isBall'])\n",
|
||||
"\n",
|
||||
"# Evaluation set: texts to classify and the labels they are scored against.\n",
|
||||
"dev = readFile('in.tsv')\n",
|
||||
"trueClass = [int(label) for label in readFile('expected.tsv')]\n",
|
||||
"\n",
|
||||
"def printScores(header, predictions):\n",
|
||||
"    \"\"\"Print `header`, then recall, accuracy, precision and F1 of `predictions`.\n",
|
||||
"\n",
|
||||
"    The scores are computed against the module-level `trueClass` labels.\n",
|
||||
"    \"\"\"\n",
|
||||
"    predictions = list(predictions)\n",
|
||||
"    print(header)\n",
|
||||
"    print(\"recal score = \", recall_score(trueClass, predictions))\n",
|
||||
"    print(\"acuracy score = \", accuracy_score(trueClass, predictions))\n",
|
||||
"    print(\"precision score = \", precision_score(trueClass, predictions))\n",
|
||||
"    print(\"f score score = \", f1_score(trueClass, predictions))\n",
|
||||
"\n",
|
||||
"model = make_pipeline(TfidfVectorizer(), MultinomialNB())\n",
|
||||
"model.fit(x, y)\n",
|
||||
"pred = model.predict(dev)\n",
|
||||
"printScores('Naiwny bayes:', pred)\n",
|
||||
"\n",
|
||||
"# The default iteration cap made lbfgs stop with a ConvergenceWarning on this\n",
|
||||
"# data, so give the solver more iterations to converge.\n",
|
||||
"model = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))\n",
|
||||
"model.fit(x, y)\n",
|
||||
"pred = model.predict(dev)\n",
|
||||
"printScores('\\nRegresja logistyczna:', pred)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"interpreter": {
|
||||
"hash": "393784674bcf6e74f2fc9b1b5fb3713f9bd5fc6f8172c594e5cfa8e8c12849bc"
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.9.2 64-bit",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.2"
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
BIN
train.tsv.gz
Normal file
BIN
train.tsv.gz
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user