23 KiB
23 KiB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import lzma
import re
# Load every data split into memory as a list of raw lines (each line keeps
# its trailing newline, exactly as returned by readlines()).
#
# Each file is opened in a `with` block so its handle is closed as soon as the
# lines are read instead of being left for the garbage collector. The test
# input used to be read twice in a row; it is now read once.
#
# NOTE(review): the label/dev/test files are now decoded explicitly as UTF-8
# to match the encoding already used for the compressed training input,
# rather than relying on the platform default -- confirm the whole dataset is
# UTF-8.
with lzma.open("train/in.tsv.xz", mode='rt', encoding='utf-8') as f:
    X_train_raw = f.readlines()
with open('train/expected.tsv', encoding='utf-8') as f:
    y_train_raw = f.readlines()
with open("dev-0/in.tsv", "r", encoding='utf-8') as f:
    X_dev0_raw = f.readlines()
with open("dev-0/expected.tsv", "r", encoding='utf-8') as f:
    y_expected_dev0_raw = f.readlines()
with open("dev-1/in.tsv", "r", encoding='utf-8') as f:
    X_dev1_raw = f.readlines()
with open("dev-1/expected.tsv", "r", encoding='utf-8') as f:
    y_expected_dev1_raw = f.readlines()
with open("test-A/in.tsv", "r", encoding='utf-8') as f:
    X_test_raw = f.readlines()
# Remove the "<tab>[not-]for-humans<tab>[not-]contaminated<newline>" suffix
# from every dev/test line. The pattern is compiled once and applied to each
# of the three splits in turn; the newline is part of the match, so cleaned
# lines that matched no longer end with one.
_flag_columns = re.compile('\t(not-)?for-humans\t(not-)?contaminated\n')
X_dev0_cleaned, X_dev1_cleaned, X_test_cleaned = (
    [_flag_columns.sub('', row) for row in rows]
    for rows in (X_dev0_raw, X_dev1_raw, X_test_raw)
)
# Build the bag-of-words representation: the vocabulary is learned from the
# training lines only, then the same fitted vectorizer encodes the cleaned
# dev-0, dev-1 and test splits.
# NOTE(review): the training lines are vectorized as read, without the
# suffix-stripping applied to the dev/test lines -- confirm that is intended.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train_raw)
X_dev0_counts, X_dev1_counts, X_test_counts = map(
    count_vect.transform,
    (X_dev0_cleaned, X_dev1_cleaned, X_test_cleaned),
)
/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:35: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations eps=np.finfo(np.float).eps, /usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:597: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, /usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:836: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, /usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:862: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. 
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations eps=np.finfo(np.float).eps, positive=False): /usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:1097: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps, /usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:1344: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps, /usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:1480: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations eps=np.finfo(np.float).eps, copy_X=True, positive=False): /usr/lib/python3/dist-packages/sklearn/linear_model/randomized_l1.py:152: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. 
If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations precompute=False, eps=np.finfo(np.float).eps, /usr/lib/python3/dist-packages/sklearn/linear_model/randomized_l1.py:320: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations eps=np.finfo(np.float).eps, random_state=None, /usr/lib/python3/dist-packages/sklearn/linear_model/randomized_l1.py:580: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations eps=4 * np.finfo(np.float).eps, n_jobs=None, /usr/lib/python3/dist-packages/sklearn/feature_extraction/image.py:167: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations dtype=np.int): ERROR:root:Internal Python error in the inspect module. Below is the traceback from this internal error.
Traceback (most recent call last): File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 3427, in run_code exec(code_obj, self.user_global_ns, self.user_ns) File "<ipython-input-1-a756e6b5ff53>", line 22, in <module> X_train_counts = count_vect.fit_transform(X_train_raw) File "/usr/lib/python3/dist-packages/sklearn/feature_extraction/text.py", line 1032, in fit_transform self.fixed_vocabulary_) File "/usr/lib/python3/dist-packages/sklearn/feature_extraction/text.py", line 944, in _count_vocab feature_idx = vocabulary[feature] KeyboardInterrupt During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 2054, in showtraceback stb = value._render_traceback_() AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_' During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py", line 1101, in get_records return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset) File "/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py", line 248, in wrapped return f(*args, **kwargs) File "/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py", line 281, in _fixed_getinnerframes records = fix_frame_records_filenames(inspect.getinnerframes(etb, context)) File "/usr/lib/python3.7/inspect.py", line 1502, in getinnerframes frameinfo = (tb.tb_frame,) + getframeinfo(tb, context) File "/usr/lib/python3.7/inspect.py", line 1460, in getframeinfo filename = getsourcefile(frame) or getfile(frame) File "/usr/lib/python3.7/inspect.py", line 696, in getsourcefile if getattr(getmodule(object, filename), '__loader__', None) is not None: File "/usr/lib/python3.7/inspect.py", line 732, in getmodule for modname, module in list(sys.modules.items()): KeyboardInterrupt
[0;31m---------------------------------------------------------------------------[0m [0;31mKeyboardInterrupt[0m Traceback (most recent call last) [0;31m[... skipping hidden 1 frame][0m [0;32m<ipython-input-1-a756e6b5ff53>[0m in [0;36m<module>[0;34m[0m [1;32m 21[0m [0mcount_vect[0m [0;34m=[0m [0mCountVectorizer[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0;32m---> 22[0;31m [0mX_train_counts[0m [0;34m=[0m [0mcount_vect[0m[0;34m.[0m[0mfit_transform[0m[0;34m([0m[0mX_train_raw[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 23[0m [0mX_dev0_counts[0m [0;34m=[0m [0mcount_vect[0m[0;34m.[0m[0mtransform[0m[0;34m([0m[0mX_dev0_cleaned[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0;32m/usr/lib/python3/dist-packages/sklearn/feature_extraction/text.py[0m in [0;36mfit_transform[0;34m(self, raw_documents, y)[0m [1;32m 1031[0m vocabulary, X = self._count_vocab(raw_documents, [0;32m-> 1032[0;31m self.fixed_vocabulary_) [0m[1;32m 1033[0m [0;34m[0m[0m [0;32m/usr/lib/python3/dist-packages/sklearn/feature_extraction/text.py[0m in [0;36m_count_vocab[0;34m(self, raw_documents, fixed_vocab)[0m [1;32m 943[0m [0;32mtry[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m [0;32m--> 944[0;31m [0mfeature_idx[0m [0;34m=[0m [0mvocabulary[0m[0;34m[[0m[0mfeature[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 945[0m [0;32mif[0m [0mfeature_idx[0m [0;32mnot[0m [0;32min[0m [0mfeature_counter[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m [0;31mKeyboardInterrupt[0m: During handling of the above exception, another exception occurred: [0;31mAttributeError[0m Traceback (most recent call last) [0;32m/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py[0m in [0;36mshowtraceback[0;34m(self, exc_tuple, filename, tb_offset, exception_only, running_compiled_code)[0m [1;32m 2053[0m [0;31m# in the engines. 
This should return a list of strings.[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m [0;32m-> 2054[0;31m [0mstb[0m [0;34m=[0m [0mvalue[0m[0;34m.[0m[0m_render_traceback_[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 2055[0m [0;32mexcept[0m [0mException[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m [0;31mAttributeError[0m: 'KeyboardInterrupt' object has no attribute '_render_traceback_' During handling of the above exception, another exception occurred: [0;31mTypeError[0m Traceback (most recent call last) [0;31m[... skipping hidden 1 frame][0m [0;32m/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py[0m in [0;36mshowtraceback[0;34m(self, exc_tuple, filename, tb_offset, exception_only, running_compiled_code)[0m [1;32m 2055[0m [0;32mexcept[0m [0mException[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m [1;32m 2056[0m stb = self.InteractiveTB.structured_traceback(etype, [0;32m-> 2057[0;31m value, tb, tb_offset=tb_offset) [0m[1;32m 2058[0m [0;34m[0m[0m [1;32m 2059[0m [0mself[0m[0;34m.[0m[0m_showtraceback[0m[0;34m([0m[0metype[0m[0;34m,[0m [0mvalue[0m[0;34m,[0m [0mstb[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0;32m/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py[0m in [0;36mstructured_traceback[0;34m(self, etype, value, tb, tb_offset, number_of_lines_of_context)[0m [1;32m 1366[0m [0mself[0m[0;34m.[0m[0mtb[0m [0;34m=[0m [0mtb[0m[0;34m[0m[0;34m[0m[0m [1;32m 1367[0m return FormattedTB.structured_traceback( [0;32m-> 1368[0;31m self, etype, value, tb, tb_offset, number_of_lines_of_context) [0m[1;32m 1369[0m [0;34m[0m[0m [1;32m 1370[0m [0;34m[0m[0m [0;32m/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py[0m in [0;36mstructured_traceback[0;34m(self, etype, value, tb, tb_offset, number_of_lines_of_context)[0m [1;32m 1266[0m [0;31m# Verbose modes need a full traceback[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m [1;32m 1267[0m return VerboseTB.structured_traceback( [0;32m-> 1268[0;31m [0mself[0m[0;34m,[0m [0metype[0m[0;34m,[0m [0mvalue[0m[0;34m,[0m [0mtb[0m[0;34m,[0m 
[0mtb_offset[0m[0;34m,[0m [0mnumber_of_lines_of_context[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 1269[0m ) [1;32m 1270[0m [0;32melif[0m [0mmode[0m [0;34m==[0m [0;34m'Minimal'[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m [0;32m/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py[0m in [0;36mstructured_traceback[0;34m(self, etype, evalue, etb, tb_offset, number_of_lines_of_context)[0m [1;32m 1123[0m [0;34m[0m[0m [1;32m 1124[0m formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context, [0;32m-> 1125[0;31m tb_offset) [0m[1;32m 1126[0m [0;34m[0m[0m [1;32m 1127[0m [0mcolors[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mColors[0m [0;31m# just a shorthand + quicker name lookup[0m[0;34m[0m[0;34m[0m[0m [0;32m/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py[0m in [0;36mformat_exception_as_a_whole[0;34m(self, etype, evalue, etb, number_of_lines_of_context, tb_offset)[0m [1;32m 1080[0m [0;34m[0m[0m [1;32m 1081[0m [0;34m[0m[0m [0;32m-> 1082[0;31m [0mlast_unique[0m[0;34m,[0m [0mrecursion_repeat[0m [0;34m=[0m [0mfind_recursion[0m[0;34m([0m[0morig_etype[0m[0;34m,[0m [0mevalue[0m[0;34m,[0m [0mrecords[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 1083[0m [0;34m[0m[0m [1;32m 1084[0m [0mframes[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mformat_records[0m[0;34m([0m[0mrecords[0m[0;34m,[0m [0mlast_unique[0m[0;34m,[0m [0mrecursion_repeat[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m [0;32m/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py[0m in [0;36mfind_recursion[0;34m(etype, value, records)[0m [1;32m 380[0m [0;31m# first frame (from in to out) that looks different.[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m [1;32m 381[0m [0;32mif[0m [0;32mnot[0m [0mis_recursion_error[0m[0;34m([0m[0metype[0m[0;34m,[0m [0mvalue[0m[0;34m,[0m [0mrecords[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m [0;32m--> 382[0;31m [0;32mreturn[0m [0mlen[0m[0;34m([0m[0mrecords[0m[0;34m)[0m[0;34m,[0m [0;36m0[0m[0;34m[0m[0;34m[0m[0m [0m[1;32m 383[0m [0;34m[0m[0m [1;32m 384[0m [0;31m# Select filename, 
lineno, func_name to track frames with[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m [0;31mTypeError[0m: object of type 'NoneType' has no len()
# clf = MLPClassifier().fit(X_train_counts, y_train_raw)
# y_predicted_dev0_mlp = clf.predict(X_dev0_counts)
# y_predicted_dev1_mlp = clf.predict(X_dev1_counts)
# accuracy_dev0_mlp = accuracy_score(y_expected_dev0_raw, y_predicted_dev0_mlp)
# print(f"Accuracy dev0: {accuracy_dev0_mlp}")
# accuracy_dev1_mlp = accuracy_score(y_expected_dev1_raw, y_predicted_dev1_mlp)
# print(f"Accuracy dev1: {accuracy_dev1_mlp}")
# Train a logistic-regression classifier on the training count matrix and
# report its accuracy on both development splits.
#
# `max_iter` is a hyper-parameter of the LogisticRegression constructor;
# fit() does not accept it, so the previous `fit(..., max_iter=300)` call
# raised a TypeError before any training happened.
#
# Labels are the raw lines of the expected.tsv files (newline included) on
# both the training and the evaluation side, so they compare consistently.
model = LogisticRegression(max_iter=300).fit(X_train_counts, y_train_raw)
y_predicted_dev0_logreg = model.predict(X_dev0_counts)
y_predicted_dev1_logreg = model.predict(X_dev1_counts)
accuracy_dev0_logreg = accuracy_score(y_expected_dev0_raw, y_predicted_dev0_logreg)
print(f"Accuracy dev0: {accuracy_dev0_logreg}")
accuracy_dev1_logreg = accuracy_score(y_expected_dev1_raw, y_predicted_dev1_logreg)
print(f"Accuracy dev1: {accuracy_dev1_logreg}")
# from sklearn.naive_bayes import MultinomialNB
# clf2 = MultinomialNB().fit(X_train_counts, y_train_raw)
# y_predicted_dev0_MNB = clf2.predict(X_dev0_counts)
# y_predicted_dev1_MNB = clf2.predict(X_dev1_counts)
# accuracy_dev0_MNB = accuracy_score(y_expected_dev0_raw, y_predicted_dev0_MNB)
# print(f"Accuracy dev0: {accuracy_dev0_MNB}")
# accuracy_dev1_MNB = accuracy_score(y_expected_dev1_raw, y_predicted_dev1_MNB)
# print(f"Accuracy dev1: {accuracy_dev1_MNB}")
# Write one value per line to out.tsv.
# NOTE(review): `lista` is a hard-coded list, not model predictions -- this
# looks like a placeholder; confirm what should actually be written.
lista = [1, 0, 1, 0, 0]
# The `with` block guarantees the file is flushed and closed once the lines
# are written, instead of leaving the handle open.
with open("out.tsv", mode='w') as out_file:
    out_file.writelines(f"{i}\n" for i in lista)