multinomial naive bayes

This commit is contained in:
Dominik Zaleśny 2022-04-22 16:59:51 +02:00
commit bb4e135e1c
19 changed files with 5184167 additions and 0 deletions

CHANGELOG.md (new file, 4 lines)

@@ -0,0 +1,4 @@
<a name="2.0.0"></a>
## 2.0.0 (2020-05-22)
* Switch to probabilities as the main metric
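With probabilities as the main metric, submissions are presumably scored with log loss (an assumption here; the challenge's `config.txt` defines the actual metric). A minimal stdlib sketch of binary log loss:

```python
import math

def log_loss(y_true, p_pred, eps=1e-15):
    """Mean negative log-likelihood of predicted P(class = 1)."""
    total = 0.0
    for y, p in zip(y_true, p_pred):
        p = min(max(p, eps), 1.0 - eps)  # clip so log() stays finite
        total += -(y * math.log(p) + (1 - y) * math.log(1.0 - p))
    return total / len(y_true)
```

Confident correct predictions cost almost nothing, while a confident wrong one is penalised heavily, which is why clipping away exact 0 and 1 matters.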

README.md (new file, 38 lines)

@@ -0,0 +1,38 @@
"He Said She Said" classification challenge (2nd edition)
=========================================================
Give the probability that a text in Polish was written by a man.
This challenge is based on the "He Said She Said" corpus for Polish.
The corpus was created by grepping gender-specific first-person
expressions (e.g. "zrobiłem/zrobiłam", "jestem zadowolony/zadowolona",
"będę robił/robiła") in the Common Crawl corpus. Such expressions were
normalised here into masculine forms.
Classes
-------
* `0` — text written by a woman
* `1` — text written by a man
Directory structure
-------------------
* `README.md` — this file
* `config.txt` — configuration file
* `train/` — directory with training data
* `train/train.tsv.gz` — train set (gzipped), the class is given in the first column,
a text fragment in the second one
* `train/meta.tsv.gz` — metadata (do not use during training)
* `dev-0/` — directory with dev (test) data
* `dev-0/in.tsv` — input data for the dev set (text fragments)
* `dev-0/expected.tsv` — expected (reference) data for the dev set
* `dev-0/meta.tsv` — metadata (not used during testing)
* `dev-1/` — directory with extra dev (test) data
* `dev-1/in.tsv` — input data for the extra dev set (text fragments)
* `dev-1/expected.tsv` — expected (reference) data for the extra dev set
* `dev-1/meta.tsv` — metadata (not used during testing)
* `test-A/` — directory with test data
* `test-A/in.tsv` — input data for the test set (text fragments)
* `test-A/expected.tsv` — expected (reference) data for the test set (hidden)
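The commit title names the approach. For readers skimming the diff, here is a minimal, dependency-free sketch of multinomial naive Bayes with Laplace smoothing, on toy data standing in for the corpus; the notebooks below do the real work with scikit-learn's `CountVectorizer` and `MultinomialNB`:

```python
import math
from collections import Counter

def train_mnb(texts, labels, alpha=1.0):
    """Fit multinomial naive Bayes with Laplace smoothing; returns P(class | text)."""
    classes = sorted(set(labels))
    prior = {c: labels.count(c) / len(labels) for c in classes}
    counts = {c: Counter() for c in classes}
    for text, c in zip(texts, labels):
        counts[c].update(text.split())
    vocab = {w for cnt in counts.values() for w in cnt}
    totals = {c: sum(counts[c].values()) for c in classes}

    def predict_proba(text):
        # work in log space to avoid underflow, then normalise
        logp = {}
        for c in classes:
            lp = math.log(prior[c])
            for w in text.split():
                if w in vocab:  # out-of-vocabulary tokens are skipped
                    lp += math.log((counts[c][w] + alpha)
                                   / (totals[c] + alpha * len(vocab)))
            logp[c] = lp
        m = max(logp.values())
        z = sum(math.exp(v - m) for v in logp.values())
        return {c: math.exp(logp[c] - m) / z for c in classes}

    return predict_proba

predict_proba = train_mnb(
    ["zrobiłem to wczoraj", "zrobiłem zakupy",    # masculine forms -> 1
     "byłam zadowolona", "byłam w domu"],         # feminine forms  -> 0
    [1, 1, 0, 0])
```

Skipping out-of-vocabulary tokens at prediction time mirrors what `CountVectorizer.transform` effectively does with words unseen during `fit`.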

The following large data files were added with diffs suppressed (too large to display):

* dev-0/expected.tsv (new file, 137314 lines)
* dev-0/in.tsv (new file, 137314 lines)
* dev-0/meta.tsv (new file, 137314 lines)
* dev-0/out.tsv (new file, 137314 lines)
* dev-1/expected.tsv (new file, 156606 lines)
* dev-1/in.tsv (new file, 156606 lines)
* dev-1/meta.tsv (new file, 156606 lines)
* dev-1/out.tsv (new file, 156606 lines)

run-Copy1.ipynb (new file, 254 lines)

@@ -0,0 +1,254 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "quality-quebec",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:35: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" eps=np.finfo(np.float).eps,\n",
"/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:597: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,\n",
"/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:836: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,\n",
"/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:862: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" eps=np.finfo(np.float).eps, positive=False):\n",
"/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:1097: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,\n",
"/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:1344: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,\n",
"/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:1480: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" eps=np.finfo(np.float).eps, copy_X=True, positive=False):\n",
"/usr/lib/python3/dist-packages/sklearn/linear_model/randomized_l1.py:152: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" precompute=False, eps=np.finfo(np.float).eps,\n",
"/usr/lib/python3/dist-packages/sklearn/linear_model/randomized_l1.py:320: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" eps=np.finfo(np.float).eps, random_state=None,\n",
"/usr/lib/python3/dist-packages/sklearn/linear_model/randomized_l1.py:580: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" eps=4 * np.finfo(np.float).eps, n_jobs=None,\n",
"/usr/lib/python3/dist-packages/sklearn/feature_extraction/image.py:167: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" dtype=np.int):\n",
"ERROR:root:Internal Python error in the inspect module.\n",
"Below is the traceback from this internal error.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Traceback (most recent call last):\n",
" File \"/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py\", line 3427, in run_code\n",
" exec(code_obj, self.user_global_ns, self.user_ns)\n",
" File \"<ipython-input-1-a756e6b5ff53>\", line 22, in <module>\n",
" X_train_counts = count_vect.fit_transform(X_train_raw)\n",
" File \"/usr/lib/python3/dist-packages/sklearn/feature_extraction/text.py\", line 1032, in fit_transform\n",
" self.fixed_vocabulary_)\n",
" File \"/usr/lib/python3/dist-packages/sklearn/feature_extraction/text.py\", line 944, in _count_vocab\n",
" feature_idx = vocabulary[feature]\n",
"KeyboardInterrupt\n",
"\n",
"During handling of the above exception, another exception occurred:\n",
"\n",
"Traceback (most recent call last):\n",
" File \"/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py\", line 2054, in showtraceback\n",
" stb = value._render_traceback_()\n",
"AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'\n",
"\n",
"During handling of the above exception, another exception occurred:\n",
"\n",
"Traceback (most recent call last):\n",
" File \"/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py\", line 1101, in get_records\n",
" return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)\n",
" File \"/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py\", line 248, in wrapped\n",
" return f(*args, **kwargs)\n",
" File \"/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py\", line 281, in _fixed_getinnerframes\n",
" records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))\n",
" File \"/usr/lib/python3.7/inspect.py\", line 1502, in getinnerframes\n",
" frameinfo = (tb.tb_frame,) + getframeinfo(tb, context)\n",
" File \"/usr/lib/python3.7/inspect.py\", line 1460, in getframeinfo\n",
" filename = getsourcefile(frame) or getfile(frame)\n",
" File \"/usr/lib/python3.7/inspect.py\", line 696, in getsourcefile\n",
" if getattr(getmodule(object, filename), '__loader__', None) is not None:\n",
" File \"/usr/lib/python3.7/inspect.py\", line 732, in getmodule\n",
" for modname, module in list(sys.modules.items()):\n",
"KeyboardInterrupt\n"
]
},
{
"ename": "TypeError",
"evalue": "object of type 'NoneType' has no len()",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
" \u001b[0;31m[... skipping hidden 1 frame]\u001b[0m\n",
"\u001b[0;32m<ipython-input-1-a756e6b5ff53>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mcount_vect\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mCountVectorizer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 22\u001b[0;31m \u001b[0mX_train_counts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcount_vect\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train_raw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 23\u001b[0m \u001b[0mX_dev0_counts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcount_vect\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_dev0_cleaned\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/lib/python3/dist-packages/sklearn/feature_extraction/text.py\u001b[0m in \u001b[0;36mfit_transform\u001b[0;34m(self, raw_documents, y)\u001b[0m\n\u001b[1;32m 1031\u001b[0m vocabulary, X = self._count_vocab(raw_documents,\n\u001b[0;32m-> 1032\u001b[0;31m self.fixed_vocabulary_)\n\u001b[0m\u001b[1;32m 1033\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/lib/python3/dist-packages/sklearn/feature_extraction/text.py\u001b[0m in \u001b[0;36m_count_vocab\u001b[0;34m(self, raw_documents, fixed_vocab)\u001b[0m\n\u001b[1;32m 943\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 944\u001b[0;31m \u001b[0mfeature_idx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvocabulary\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mfeature\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 945\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfeature_idx\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mfeature_counter\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: ",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py\u001b[0m in \u001b[0;36mshowtraceback\u001b[0;34m(self, exc_tuple, filename, tb_offset, exception_only, running_compiled_code)\u001b[0m\n\u001b[1;32m 2053\u001b[0m \u001b[0;31m# in the engines. This should return a list of strings.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2054\u001b[0;31m \u001b[0mstb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_render_traceback_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2055\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mAttributeError\u001b[0m: 'KeyboardInterrupt' object has no attribute '_render_traceback_'",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
" \u001b[0;31m[... skipping hidden 1 frame]\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py\u001b[0m in \u001b[0;36mshowtraceback\u001b[0;34m(self, exc_tuple, filename, tb_offset, exception_only, running_compiled_code)\u001b[0m\n\u001b[1;32m 2055\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2056\u001b[0m stb = self.InteractiveTB.structured_traceback(etype,\n\u001b[0;32m-> 2057\u001b[0;31m value, tb, tb_offset=tb_offset)\n\u001b[0m\u001b[1;32m 2058\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2059\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_showtraceback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0metype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py\u001b[0m in \u001b[0;36mstructured_traceback\u001b[0;34m(self, etype, value, tb, tb_offset, number_of_lines_of_context)\u001b[0m\n\u001b[1;32m 1366\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtb\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1367\u001b[0m return FormattedTB.structured_traceback(\n\u001b[0;32m-> 1368\u001b[0;31m self, etype, value, tb, tb_offset, number_of_lines_of_context)\n\u001b[0m\u001b[1;32m 1369\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1370\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py\u001b[0m in \u001b[0;36mstructured_traceback\u001b[0;34m(self, etype, value, tb, tb_offset, number_of_lines_of_context)\u001b[0m\n\u001b[1;32m 1266\u001b[0m \u001b[0;31m# Verbose modes need a full traceback\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1267\u001b[0m return VerboseTB.structured_traceback(\n\u001b[0;32m-> 1268\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0metype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtb_offset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnumber_of_lines_of_context\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1269\u001b[0m )\n\u001b[1;32m 1270\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'Minimal'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py\u001b[0m in \u001b[0;36mstructured_traceback\u001b[0;34m(self, etype, evalue, etb, tb_offset, number_of_lines_of_context)\u001b[0m\n\u001b[1;32m 1123\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1124\u001b[0m formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,\n\u001b[0;32m-> 1125\u001b[0;31m tb_offset)\n\u001b[0m\u001b[1;32m 1126\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1127\u001b[0m \u001b[0mcolors\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mColors\u001b[0m \u001b[0;31m# just a shorthand + quicker name lookup\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py\u001b[0m in \u001b[0;36mformat_exception_as_a_whole\u001b[0;34m(self, etype, evalue, etb, number_of_lines_of_context, tb_offset)\u001b[0m\n\u001b[1;32m 1080\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1081\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1082\u001b[0;31m \u001b[0mlast_unique\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrecursion_repeat\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfind_recursion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0morig_etype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mevalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrecords\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1083\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1084\u001b[0m \u001b[0mframes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat_records\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrecords\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlast_unique\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrecursion_repeat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py\u001b[0m in \u001b[0;36mfind_recursion\u001b[0;34m(etype, value, records)\u001b[0m\n\u001b[1;32m 380\u001b[0m \u001b[0;31m# first frame (from in to out) that looks different.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 381\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mis_recursion_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0metype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrecords\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 382\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrecords\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 383\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 384\u001b[0m \u001b[0;31m# Select filename, lineno, func_name to track frames with\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mTypeError\u001b[0m: object of type 'NoneType' has no len()"
]
}
],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.neural_network import MLPClassifier\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"import lzma\n",
"import re\n",
"\n",
"X_train_raw = lzma.open(\"train/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()\n",
"y_train_raw = open('train/expected.tsv').readlines()\n",
"X_dev0_raw = open(\"dev-0/in.tsv\", \"r\").readlines()\n",
"y_expected_dev0_raw = open(\"dev-0/expected.tsv\", \"r\").readlines()\n",
"X_dev1_raw = open(\"dev-1/in.tsv\", \"r\").readlines()\n",
"y_expected_dev1_raw = open(\"dev-1/expected.tsv\", \"r\").readlines()\n",
"X_test_raw = open(\"test-A/in.tsv\", \"r\").readlines()\n",
"\n",
"X_dev0_cleaned = [re.sub('\\t(not-)?for-humans\\t(not-)?contaminated\\n', '', line) for line in X_dev0_raw]\n",
"X_dev1_cleaned = [re.sub('\\t(not-)?for-humans\\t(not-)?contaminated\\n', '', line) for line in X_dev1_raw]\n",
"X_test_cleaned = [re.sub('\\t(not-)?for-humans\\t(not-)?contaminated\\n', '', line) for line in X_test_raw]\n",
"\n",
"count_vect = CountVectorizer()\n",
"X_train_counts = count_vect.fit_transform(X_train_raw)\n",
"X_dev0_counts = count_vect.transform(X_dev0_cleaned)\n",
"X_dev1_counts = count_vect.transform(X_dev1_cleaned)\n",
"X_test_counts = count_vect.transform(X_test_cleaned)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "deluxe-inventory",
"metadata": {},
"outputs": [],
"source": [
"# clf = MLPClassifier().fit(X_train_counts, y_train_raw)\n",
"\n",
"# y_predicted_dev0_mlp = clf.predict(X_dev0_counts)\n",
"# y_predicted_dev1_mlp = clf.predict(X_dev1_counts)\n",
"\n",
"# accuracy_dev0_mlp = accuracy_score(y_expected_dev0_raw, y_predicted_dev0_mlp)\n",
"# print(f\"Accuracy dev0: {accuracy_dev0_mlp}\")\n",
"# accuracy_dev1_mlp = accuracy_score(y_expected_dev1_raw, y_predicted_dev1_mlp)\n",
"# print(f\"Accuracy dev1: {accuracy_dev1_mlp}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "affected-expansion",
"metadata": {},
"outputs": [],
"source": [
"model = LogisticRegression(max_iter=300).fit(X_train_counts, y_train_raw)\n",
"\n",
"y_predicted_dev0_logreg = model.predict(X_dev0_counts)\n",
"y_predicted_dev1_logreg = model.predict(X_dev1_counts)\n",
"\n",
"accuracy_dev0_logreg = accuracy_score(y_expected_dev0_raw, y_predicted_dev0_logreg)\n",
"print(f\"Accuracy dev0: {accuracy_dev0_logreg}\")\n",
"accuracy_dev1_logreg = accuracy_score(y_expected_dev1_raw, y_predicted_dev1_logreg)\n",
"print(f\"Accuracy dev1: {accuracy_dev1_logreg}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "associate-benjamin",
"metadata": {},
"outputs": [],
"source": [
"# from sklearn.naive_bayes import MultinomialNB\n",
"# clf2 = MultinomialNB().fit(X_train_counts, y_train_raw)\n",
"\n",
"# y_predicted_dev0_MNB = clf2.predict(X_dev0_counts)\n",
"# y_predicted_dev1_MNB = clf2.predict(X_dev1_counts)\n",
"\n",
"# accuracy_dev0_MNB = accuracy_score(y_expected_dev0_raw, y_predicted_dev0_MNB)\n",
"# print(f\"Accuracy dev0: {accuracy_dev0_MNB}\")\n",
"# accuracy_dev1_MNB = accuracy_score(y_expected_dev1_raw, y_predicted_dev1_MNB)\n",
"# print(f\"Accuracy dev1: {accuracy_dev1_MNB}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "treated-neighborhood",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 4,
"id": "independent-humor",
"metadata": {},
"outputs": [],
"source": [
"lista = [1, 0, 1, 0, 0]\n",
"open(\"out.tsv\", mode='w').writelines([str(i)+\"\\n\" for i in lista])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "baking-rhythm",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
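The interrupted cell in `run-Copy1.ipynb` above stalled while `CountVectorizer.fit_transform` was building its vocabulary over the full training corpus. One way to bound memory (scikit-learn packages the idea as `HashingVectorizer`) is to hash tokens into a fixed number of columns so no vocabulary dictionary is ever built; a stdlib sketch, with `n_features` chosen arbitrarily:

```python
import zlib

def hash_vectorize(text, n_features=2**18):
    """Token counts hashed into a fixed-width sparse vector (dict: column -> count)."""
    vec = {}
    for token in text.lower().split():
        # crc32 as a cheap deterministic hash; HashingVectorizer uses MurmurHash3
        idx = zlib.crc32(token.encode("utf-8")) % n_features
        vec[idx] = vec.get(idx, 0) + 1
    return vec
```

Distinct tokens may collide in the same column, which in practice costs little accuracy; paired with `MultinomialNB.partial_fit`, the corpus could then be vectorised and fitted in batches instead of all at once.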

run.ipynb (new file, 171 lines)

@@ -0,0 +1,171 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "instructional-fellow",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"(sklearn DeprecationWarning output about the deprecated np.float/np.int aliases, identical to the block shown in run-Copy1.ipynb above, trimmed here)\n"
]
}
],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.neural_network import MLPClassifier\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"import lzma\n",
"import re\n",
"\n",
"X_train_raw = lzma.open(\"train/in.tsv.xz\", mode='rt', encoding='utf-8').readlines()\n",
"y_train_raw = open('train/expected.tsv').readlines()\n",
"X_dev0_raw = open(\"dev-0/in.tsv\", \"r\").readlines()\n",
"y_expected_dev0_raw = open(\"dev-0/expected.tsv\", \"r\").readlines()\n",
"X_dev1_raw = open(\"dev-1/in.tsv\", \"r\").readlines()\n",
"y_expected_dev1_raw = open(\"dev-1/expected.tsv\", \"r\").readlines()\n",
"X_test_raw = open(\"test-A/in.tsv\", \"r\").readlines()\n",
"\n",
"X_dev0_cleaned = [re.sub('\\t(not-)?for-humans\\t(not-)?contaminated\\n', '', line) for line in X_dev0_raw]\n",
"X_dev1_cleaned = [re.sub('\\t(not-)?for-humans\\t(not-)?contaminated\\n', '', line) for line in X_dev1_raw]\n",
"X_test_cleaned = [re.sub('\\t(not-)?for-humans\\t(not-)?contaminated\\n', '', line) for line in X_test_raw]\n",
"\n",
"count_vect = CountVectorizer()\n",
"X_train_counts = count_vect.fit_transform(X_train_raw)\n",
"X_dev0_counts = count_vect.transform(X_dev0_cleaned)\n",
"X_dev1_counts = count_vect.transform(X_dev1_cleaned)\n",
"X_test_counts = count_vect.transform(X_test_cleaned)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "widespread-chick",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy dev0: 0.6688975632491952\n",
"Accuracy dev1: 0.6502815984061914\n"
]
}
],
"source": [
"from sklearn.naive_bayes import MultinomialNB\n",
"clf2 = MultinomialNB().fit(X_train_counts, y_train_raw)\n",
"\n",
"y_predicted_dev0_MNB = clf2.predict(X_dev0_counts)\n",
"y_predicted_dev1_MNB = clf2.predict(X_dev1_counts)\n",
"y_predicted_test_MNB = clf2.predict(X_test_counts)\n",
"\n",
"# accuracy_dev0_MNB = accuracy_score(y_expected_dev0_raw, y_predicted_dev0_MNB)\n",
"# print(f\"Accuracy dev0: {accuracy_dev0_MNB}\")\n",
"# accuracy_dev1_MNB = accuracy_score(y_expected_dev1_raw, y_predicted_dev1_MNB)\n",
"# print(f\"Accuracy dev1: {accuracy_dev1_MNB}\")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "looking-thomson",
"metadata": {},
"outputs": [],
"source": [
"open(\"dev-0/out.tsv\", mode='w').writelines(y_predicted_dev0_MNB)\n",
"open(\"dev-1/out.tsv\", mode='w').writelines(y_predicted_dev1_MNB)\n",
"open(\"test-A/out.tsv\", mode='w').writelines(y_predicted_test_MNB)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "experienced-sympathy",
"metadata": {},
"outputs": [],
"source": [
"# y_proba_dev0_MNB = clf2.predict_proba(X_dev0_counts)\n",
"# y_proba_dev1_MNB = clf2.predict_proba(X_dev1_counts)\n",
"# y_proba_test_MNB = clf2.predict_proba(X_test_counts)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "drawn-yeast",
"metadata": {},
"outputs": [],
"source": [
"# open(\"dev-0/out.tsv\", mode='w').writelines([str(i[1])+'\\n' for i in y_proba_dev0_MNB])\n",
"# open(\"dev-1/out.tsv\", mode='w').writelines([str(i[1])+'\\n' for i in y_proba_dev1_MNB])\n",
"# open(\"test-A/out.tsv\", mode='w').writelines([str(i[1])+'\\n' for i in y_proba_test_MNB])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "organized-hawaii",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

46
run.py Normal file

@@ -0,0 +1,46 @@
#!/usr/bin/env python
# coding: utf-8
from sklearn.naive_bayes import MultinomialNB
#from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import lzma
import re
# Load the xz-compressed training texts and their labels; readlines()
# keeps the trailing '\n' on every line (including the labels).
X_train_raw = lzma.open("train/in.tsv.xz", mode='rt', encoding='utf-8').readlines()
y_train_raw = open('train/expected.tsv').readlines()
X_dev0_raw = open("dev-0/in.tsv", "r").readlines()
y_expected_dev0_raw = open("dev-0/expected.tsv", "r").readlines()
X_dev1_raw = open("dev-1/in.tsv", "r").readlines()
y_expected_dev1_raw = open("dev-1/expected.tsv", "r").readlines()
X_test_raw = open("test-A/in.tsv", "r").readlines()
# The dev/test inputs carry two extra tab-separated metadata columns
# ("(not-)for-humans", "(not-)contaminated"); strip them before vectorising.
X_dev0_cleaned = [re.sub(r'\t(not-)?for-humans\t(not-)?contaminated\n', '', line) for line in X_dev0_raw]
X_dev1_cleaned = [re.sub(r'\t(not-)?for-humans\t(not-)?contaminated\n', '', line) for line in X_dev1_raw]
X_test_cleaned = [re.sub(r'\t(not-)?for-humans\t(not-)?contaminated\n', '', line) for line in X_test_raw]
# Bag-of-words counts: fit the vocabulary on the training set only,
# then reuse it for the dev and test sets.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train_raw)
X_dev0_counts = count_vect.transform(X_dev0_cleaned)
X_dev1_counts = count_vect.transform(X_dev1_cleaned)
X_test_counts = count_vect.transform(X_test_cleaned)
# Multinomial naive Bayes on the raw token counts.
clf2 = MultinomialNB().fit(X_train_counts, y_train_raw)
y_predicted_dev0_MNB = clf2.predict(X_dev0_counts)
y_predicted_dev1_MNB = clf2.predict(X_dev1_counts)
y_predicted_test_MNB = clf2.predict(X_test_counts)
# accuracy_dev0_MNB = accuracy_score(y_expected_dev0_raw, y_predicted_dev0_MNB)
# print(f"Accuracy dev0: {accuracy_dev0_MNB}")
# accuracy_dev1_MNB = accuracy_score(y_expected_dev1_raw, y_predicted_dev1_MNB)
# print(f"Accuracy dev1: {accuracy_dev1_MNB}")
# The predicted labels inherit the trailing '\n' from the training labels,
# so they can be written out directly, one label per line.
with open("dev-0/out.tsv", mode='w') as f:
    f.writelines(y_predicted_dev0_MNB)
with open("dev-1/out.tsv", mode='w') as f:
    f.writelines(y_predicted_dev1_MNB)
with open("test-A/out.tsv", mode='w') as f:
    f.writelines(y_predicted_test_MNB)
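Note that `run.py` writes hard labels, while the 2.0.0 changelog switched the challenge metric to probabilities (the probability that the text was written by a man), which is what the commented-out `predict_proba` cells in the notebook compute. A minimal, self-contained sketch of that probability output on a tiny invented corpus (the texts and labels below are illustrative, not from the real `train/in.tsv.xz`):

```python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Toy stand-ins for the real training data (labels: "1" = man, "0" = woman).
X_train = ["zrobilem to wczoraj", "zrobilam to wczoraj",
           "jestem zadowolony", "jestem zadowolona"]
y_train = ["1", "0", "1", "0"]

vect = CountVectorizer()
clf = MultinomialNB().fit(vect.fit_transform(X_train), y_train)

X_new = ["jestem zadowolony", "zrobilam to wczoraj"]
# predict_proba columns follow clf.classes_; pick the column for class "1".
col = list(clf.classes_).index("1")
proba = clf.predict_proba(vect.transform(X_new))
# One probability per line, matching the out.tsv format used above.
lines = [f"{row[col]}\n" for row in proba]
```

The `lines` list can then be written with `writelines()` exactly as the label outputs are, producing one P(class `1`) value per input line.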

134618
test-A/in.tsv Normal file

File diff suppressed because it is too large

134618
test-A/out.tsv Normal file

File diff suppressed because it is too large

3601424
train/expected.tsv Normal file

File diff suppressed because it is too large

BIN
train/in.tsv.xz Normal file

Binary file not shown.

BIN
train/meta.tsv.gz Normal file

Binary file not shown.