135 lines
3.0 KiB
Plaintext
135 lines
3.0 KiB
Plaintext
{
|
|
"cells": [
|
|
{
 "cell_type": "code",
 "execution_count": 9,
 "outputs": [],
 "source": [
  "#!/usr/bin/env python\n",
  "# coding: utf-8\n",
  "\n",
  "from sklearn.naive_bayes import MultinomialNB\n",
  "from sklearn.metrics import accuracy_score\n",
  "from sklearn.feature_extraction.text import CountVectorizer\n",
  "import lzma\n",
  "\n",
  "\n",
  "def read_lines(path, opener=open):\n",
  "    \"\"\"Return every line of the UTF-8 text file at path, line endings kept.\n",
  "\n",
  "    opener is the callable used to open the file; pass lzma.open for the\n",
  "    xz-compressed inputs. The handle is closed before the function returns,\n",
  "    so no descriptors stay open in the long-lived notebook kernel.\n",
  "    \"\"\"\n",
  "    with opener(path, mode='rt', encoding='utf-8') as handle:\n",
  "        return handle.readlines()\n",
  "\n",
  "\n",
  "# Line endings are kept on purpose: the label strings read here are used\n",
  "# as class labels and a later cell writes predictions out with writelines().\n",
  "X_train = read_lines(\"train/in.tsv.xz\", opener=lzma.open)\n",
  "y_train = read_lines('train/expected.tsv')\n",
  "X_dev0 = read_lines(\"dev-0/in.tsv.xz\", opener=lzma.open)\n",
  "y_expected_dev0 = read_lines(\"dev-0/expected.tsv\")\n",
  "X_test = read_lines(\"test-A/in.tsv.xz\", opener=lzma.open)"
 ],
 "metadata": {
  "collapsed": false,
  "pycharm": {
   "name": "#%%\n"
  }
 }
},
|
|
{
 "cell_type": "code",
 "execution_count": 10,
 "outputs": [],
 "source": [
  "# The vocabulary is fitted on the training documents only; dev-0 and test-A\n",
  "# are then transformed with that same fitted vectorizer.\n",
  "count_vect = CountVectorizer()\n",
  "X_train_counts = count_vect.fit_transform(X_train)\n",
  "X_dev0_counts, X_test_counts = (\n",
  "    count_vect.transform(documents) for documents in (X_dev0, X_test)\n",
  ")"
 ],
 "metadata": {
  "collapsed": false,
  "pycharm": {
   "name": "#%%\n"
  }
 }
},
|
|
{
 "cell_type": "code",
 "execution_count": 11,
 "outputs": [],
 "source": [
  "# Fit multinomial Naive Bayes on the training counts, then label dev-0 and\n",
  "# test-A (in that order) with the fitted model.\n",
  "clf = MultinomialNB()\n",
  "clf.fit(X_train_counts, y_train)\n",
  "\n",
  "y_predicted_dev0_MNB, y_predicted_test_MNB = (\n",
  "    clf.predict(counts) for counts in (X_dev0_counts, X_test_counts)\n",
  ")"
 ],
 "metadata": {
  "collapsed": false,
  "pycharm": {
   "name": "#%%\n"
  }
 }
},
|
|
{
 "cell_type": "code",
 "execution_count": 12,
 "outputs": [
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
    "Accuracy dev0: 0.8025417298937785\n"
   ]
  }
 ],
 "source": [
  "# Compare the dev-0 predictions against the expected dev-0 labels.\n",
  "accuracy_dev0_MNB = accuracy_score(\n",
  "    y_true=y_expected_dev0,\n",
  "    y_pred=y_predicted_dev0_MNB,\n",
  ")\n",
  "print(f\"Accuracy dev0: {accuracy_dev0_MNB}\")\n"
 ],
 "metadata": {
  "collapsed": false,
  "pycharm": {
   "name": "#%%\n"
  }
 }
},
|
|
{
 "cell_type": "code",
 "execution_count": 13,
 "outputs": [],
 "source": [
  "# Predictions are written verbatim with writelines(), which adds no\n",
  "# separators. Presumably every label already ends with a newline because\n",
  "# the training labels were read with readlines() -- verify on the data.\n",
  "# The with-blocks make sure each file is flushed and closed right away\n",
  "# instead of whenever the file object happens to be garbage-collected.\n",
  "with open(\"dev-0/out.tsv\", mode='w', encoding='utf-8') as dev0_out:\n",
  "    dev0_out.writelines(y_predicted_dev0_MNB)\n",
  "\n",
  "with open(\"test-A/out.tsv\", mode='w', encoding='utf-8') as test_out:\n",
  "    test_out.writelines(y_predicted_test_MNB)"
 ],
 "metadata": {
  "collapsed": false,
  "pycharm": {
   "name": "#%%\n"
  }
 }
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [],
|
|
"metadata": {
|
|
"collapsed": false,
|
|
"pycharm": {
|
|
"name": "#%%\n"
|
|
}
|
|
}
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 2
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython2",
|
|
"version": "2.7.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 0
|
|
} |