petite-difference-challenge2/run-Copy1.ipynb at master

Dominik Zaleśny bb4e135e1c multinomial naive bayes

2022-04-22 16:59:51 +02:00

23 KiB

Raw Permalink Blame History

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import  accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import lzma
import re

def _read_lines(path, opener=open):
    """Return all lines of the text file at *path*, line endings kept.

    *opener* is the callable used to open the file: the builtin ``open``
    for plain files, or ``lzma.open`` for ``.xz`` archives. The file is
    decoded as UTF-8 and closed before the function returns.
    """
    with opener(path, mode="rt", encoding="utf-8") as handle:
        return handle.readlines()


# Every split is decoded as UTF-8 so that dev/test text is tokenized from the
# same characters as the (explicitly UTF-8) training text, regardless of the
# platform's default locale encoding.
X_train_raw = _read_lines("train/in.tsv.xz", opener=lzma.open)
y_train_raw = _read_lines("train/expected.tsv")
X_dev0_raw = _read_lines("dev-0/in.tsv")
y_expected_dev0_raw = _read_lines("dev-0/expected.tsv")
X_dev1_raw = _read_lines("dev-1/in.tsv")
y_expected_dev1_raw = _read_lines("dev-1/expected.tsv")
X_test_raw = _read_lines("test-A/in.tsv")

# Matches the two trailing flag columns ("for-humans"/"not-for-humans" and
# "contaminated"/"not-contaminated") together with the newline after them.
_TRAILING_FLAGS = re.compile('\t(not-)?for-humans\t(not-)?contaminated\n')


def _strip_trailing_flags(lines):
    """Return *lines* with every match of the trailing flag columns removed.

    Lines that do not contain the pattern are returned unchanged.
    """
    return [_TRAILING_FLAGS.sub('', text) for text in lines]


X_dev0_cleaned = _strip_trailing_flags(X_dev0_raw)
X_dev1_cleaned = _strip_trailing_flags(X_dev1_raw)
X_test_cleaned = _strip_trailing_flags(X_test_raw)

# Learn the vocabulary from the training lines and build their count matrix.
# NOTE(review): the training lines are vectorized as read, while the dev/test
# lines are cleaned first -- confirm the training file carries no trailing
# flag columns that would end up in the vocabulary.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train_raw)

# Map each cleaned evaluation split onto the vocabulary fitted above.
X_dev0_counts, X_dev1_counts, X_test_counts = (
    count_vect.transform(documents)
    for documents in (X_dev0_cleaned, X_dev1_cleaned, X_test_cleaned)
)

/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:35: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
eps=np.finfo(np.float).eps,
/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:597: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:836: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:862: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
eps=np.finfo(np.float).eps, positive=False):
/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:1097: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,
/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:1344: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,
/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:1480: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
eps=np.finfo(np.float).eps, copy_X=True, positive=False):
/usr/lib/python3/dist-packages/sklearn/linear_model/randomized_l1.py:152: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
precompute=False, eps=np.finfo(np.float).eps,
/usr/lib/python3/dist-packages/sklearn/linear_model/randomized_l1.py:320: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
eps=np.finfo(np.float).eps, random_state=None,
/usr/lib/python3/dist-packages/sklearn/linear_model/randomized_l1.py:580: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
eps=4 * np.finfo(np.float).eps, n_jobs=None,
/usr/lib/python3/dist-packages/sklearn/feature_extraction/image.py:167: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
dtype=np.int):
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 3427, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-1-a756e6b5ff53>", line 22, in <module>
    X_train_counts = count_vect.fit_transform(X_train_raw)
  File "/usr/lib/python3/dist-packages/sklearn/feature_extraction/text.py", line 1032, in fit_transform
    self.fixed_vocabulary_)
  File "/usr/lib/python3/dist-packages/sklearn/feature_extraction/text.py", line 944, in _count_vocab
    feature_idx = vocabulary[feature]
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 2054, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py", line 1101, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py", line 248, in wrapped
    return f(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py", line 281, in _fixed_getinnerframes
    records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))
  File "/usr/lib/python3.7/inspect.py", line 1502, in getinnerframes
    frameinfo = (tb.tb_frame,) + getframeinfo(tb, context)
  File "/usr/lib/python3.7/inspect.py", line 1460, in getframeinfo
    filename = getsourcefile(frame) or getfile(frame)
  File "/usr/lib/python3.7/inspect.py", line 696, in getsourcefile
    if getattr(getmodule(object, filename), '__loader__', None) is not None:
  File "/usr/lib/python3.7/inspect.py", line 732, in getmodule
    for modname, module in list(sys.modules.items()):
KeyboardInterrupt

[0;31m---------------------------------------------------------------------------[0m
[0;31mKeyboardInterrupt[0m                         Traceback (most recent call last)
    [0;31m[... skipping hidden 1 frame][0m

[0;32m<ipython-input-1-a756e6b5ff53>[0m in [0;36m<module>[0;34m[0m
[1;32m     21[0m [0mcount_vect[0m [0;34m=[0m [0mCountVectorizer[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;32m---> 22[0;31m [0mX_train_counts[0m [0;34m=[0m [0mcount_vect[0m[0;34m.[0m[0mfit_transform[0m[0;34m([0m[0mX_train_raw[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m     23[0m [0mX_dev0_counts[0m [0;34m=[0m [0mcount_vect[0m[0;34m.[0m[0mtransform[0m[0;34m([0m[0mX_dev0_cleaned[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m

[0;32m/usr/lib/python3/dist-packages/sklearn/feature_extraction/text.py[0m in [0;36mfit_transform[0;34m(self, raw_documents, y)[0m
[1;32m   1031[0m         vocabulary, X = self._count_vocab(raw_documents,
[0;32m-> 1032[0;31m                                           self.fixed_vocabulary_)
[0m[1;32m   1033[0m [0;34m[0m[0m

[0;32m/usr/lib/python3/dist-packages/sklearn/feature_extraction/text.py[0m in [0;36m_count_vocab[0;34m(self, raw_documents, fixed_vocab)[0m
[1;32m    943[0m                 [0;32mtry[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0;32m--> 944[0;31m                     [0mfeature_idx[0m [0;34m=[0m [0mvocabulary[0m[0;34m[[0m[0mfeature[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m    945[0m                     [0;32mif[0m [0mfeature_idx[0m [0;32mnot[0m [0;32min[0m [0mfeature_counter[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m

[0;31mKeyboardInterrupt[0m: 

During handling of the above exception, another exception occurred:

[0;31mAttributeError[0m                            Traceback (most recent call last)
[0;32m/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py[0m in [0;36mshowtraceback[0;34m(self, exc_tuple, filename, tb_offset, exception_only, running_compiled_code)[0m
[1;32m   2053[0m                         [0;31m# in the engines. This should return a list of strings.[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0;32m-> 2054[0;31m                         [0mstb[0m [0;34m=[0m [0mvalue[0m[0;34m.[0m[0m_render_traceback_[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m   2055[0m                     [0;32mexcept[0m [0mException[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m

[0;31mAttributeError[0m: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

[0;31mTypeError[0m                                 Traceback (most recent call last)
    [0;31m[... skipping hidden 1 frame][0m

[0;32m/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py[0m in [0;36mshowtraceback[0;34m(self, exc_tuple, filename, tb_offset, exception_only, running_compiled_code)[0m
[1;32m   2055[0m                     [0;32mexcept[0m [0mException[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m   2056[0m                         stb = self.InteractiveTB.structured_traceback(etype,
[0;32m-> 2057[0;31m                                             value, tb, tb_offset=tb_offset)
[0m[1;32m   2058[0m [0;34m[0m[0m
[1;32m   2059[0m                     [0mself[0m[0;34m.[0m[0m_showtraceback[0m[0;34m([0m[0metype[0m[0;34m,[0m [0mvalue[0m[0;34m,[0m [0mstb[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m

[0;32m/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py[0m in [0;36mstructured_traceback[0;34m(self, etype, value, tb, tb_offset, number_of_lines_of_context)[0m
[1;32m   1366[0m             [0mself[0m[0;34m.[0m[0mtb[0m [0;34m=[0m [0mtb[0m[0;34m[0m[0;34m[0m[0m
[1;32m   1367[0m         return FormattedTB.structured_traceback(
[0;32m-> 1368[0;31m             self, etype, value, tb, tb_offset, number_of_lines_of_context)
[0m[1;32m   1369[0m [0;34m[0m[0m
[1;32m   1370[0m [0;34m[0m[0m

[0;32m/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py[0m in [0;36mstructured_traceback[0;34m(self, etype, value, tb, tb_offset, number_of_lines_of_context)[0m
[1;32m   1266[0m             [0;31m# Verbose modes need a full traceback[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m   1267[0m             return VerboseTB.structured_traceback(
[0;32m-> 1268[0;31m                 [0mself[0m[0;34m,[0m [0metype[0m[0;34m,[0m [0mvalue[0m[0;34m,[0m [0mtb[0m[0;34m,[0m [0mtb_offset[0m[0;34m,[0m [0mnumber_of_lines_of_context[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m   1269[0m             )
[1;32m   1270[0m         [0;32melif[0m [0mmode[0m [0;34m==[0m [0;34m'Minimal'[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m

[0;32m/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py[0m in [0;36mstructured_traceback[0;34m(self, etype, evalue, etb, tb_offset, number_of_lines_of_context)[0m
[1;32m   1123[0m [0;34m[0m[0m
[1;32m   1124[0m         formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,
[0;32m-> 1125[0;31m                                                                tb_offset)
[0m[1;32m   1126[0m [0;34m[0m[0m
[1;32m   1127[0m         [0mcolors[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mColors[0m  [0;31m# just a shorthand + quicker name lookup[0m[0;34m[0m[0;34m[0m[0m

[0;32m/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py[0m in [0;36mformat_exception_as_a_whole[0;34m(self, etype, evalue, etb, number_of_lines_of_context, tb_offset)[0m
[1;32m   1080[0m [0;34m[0m[0m
[1;32m   1081[0m [0;34m[0m[0m
[0;32m-> 1082[0;31m         [0mlast_unique[0m[0;34m,[0m [0mrecursion_repeat[0m [0;34m=[0m [0mfind_recursion[0m[0;34m([0m[0morig_etype[0m[0;34m,[0m [0mevalue[0m[0;34m,[0m [0mrecords[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m   1083[0m [0;34m[0m[0m
[1;32m   1084[0m         [0mframes[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mformat_records[0m[0;34m([0m[0mrecords[0m[0;34m,[0m [0mlast_unique[0m[0;34m,[0m [0mrecursion_repeat[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m

[0;32m/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py[0m in [0;36mfind_recursion[0;34m(etype, value, records)[0m
[1;32m    380[0m     [0;31m# first frame (from in to out) that looks different.[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m    381[0m     [0;32mif[0m [0;32mnot[0m [0mis_recursion_error[0m[0;34m([0m[0metype[0m[0;34m,[0m [0mvalue[0m[0;34m,[0m [0mrecords[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0;32m--> 382[0;31m         [0;32mreturn[0m [0mlen[0m[0;34m([0m[0mrecords[0m[0;34m)[0m[0;34m,[0m [0;36m0[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m    383[0m [0;34m[0m[0m
[1;32m    384[0m     [0;31m# Select filename, lineno, func_name to track frames with[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m

[0;31mTypeError[0m: object of type 'NoneType' has no len()

# clf = MLPClassifier().fit(X_train_counts, y_train_raw)

# y_predicted_dev0_mlp = clf.predict(X_dev0_counts)
# y_predicted_dev1_mlp = clf.predict(X_dev1_counts)

# accuracy_dev0_mlp = accuracy_score(y_expected_dev0_raw, y_predicted_dev0_mlp)
# print(f"Accuracy dev0: {accuracy_dev0_mlp}")
# accuracy_dev1_mlp = accuracy_score(y_expected_dev1_raw, y_predicted_dev1_mlp)
# print(f"Accuracy dev1: {accuracy_dev1_mlp}")

# Train a logistic-regression classifier on the training count matrix.
# max_iter is a constructor hyper-parameter; fit() only takes the data (and
# optional sample weights), so passing max_iter to fit() raises TypeError.
model = LogisticRegression(max_iter=300).fit(X_train_counts, y_train_raw)

# Predict labels for both development splits.
y_predicted_dev0_logreg = model.predict(X_dev0_counts)
y_predicted_dev1_logreg = model.predict(X_dev1_counts)

# Report accuracy against the expected labels. Both the expected and the
# training labels are raw lines from readlines(), so they compare like for like.
accuracy_dev0_logreg = accuracy_score(y_expected_dev0_raw, y_predicted_dev0_logreg)
print(f"Accuracy dev0: {accuracy_dev0_logreg}")
accuracy_dev1_logreg = accuracy_score(y_expected_dev1_raw, y_predicted_dev1_logreg)
print(f"Accuracy dev1: {accuracy_dev1_logreg}")

# from sklearn.naive_bayes import MultinomialNB
# clf2 = MultinomialNB().fit(X_train_counts, y_train_raw)

# y_predicted_dev0_MNB = clf2.predict(X_dev0_counts)
# y_predicted_dev1_MNB = clf2.predict(X_dev1_counts)

# accuracy_dev0_MNB = accuracy_score(y_expected_dev0_raw, y_predicted_dev0_MNB)
# print(f"Accuracy dev0: {accuracy_dev0_MNB}")
# accuracy_dev1_MNB = accuracy_score(y_expected_dev1_raw, y_predicted_dev1_MNB)
# print(f"Accuracy dev1: {accuracy_dev1_MNB}")

# Hard-coded values written one per line to out.tsv.
# NOTE(review): these are literals, not model predictions -- presumably a
# placeholder for the real output; confirm before submitting.
lista = [1, 0, 1, 0, 0]

# The context manager guarantees the file is flushed and closed.
with open("out.tsv", mode='w') as out_file:
    out_file.writelines(f"{value}\n" for value in lista)

23 KiB Raw Permalink Blame History Unescape Escape

23 KiB

Raw Permalink Blame History