petite-difference-challenge2/run-Copy1.ipynb

23 KiB
Raw Permalink Blame History

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import  accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
import lzma
import re

def _read_lines(path, opener=open):
    """Return all lines of the text file at *path*, closing the file afterwards.

    Args:
        path: Location of the file to read.
        opener: Callable with an ``open``-compatible signature; pass
            ``lzma.open`` for xz-compressed files.

    Returns:
        A list with one string per line, line terminators preserved.
    """
    # Decode every split as UTF-8 (as the training input already was) so the
    # result does not depend on the machine's locale.
    with opener(path, mode='rt', encoding='utf-8') as handle:
        return handle.readlines()


# The training input is xz-compressed; every other file is plain text.
X_train_raw = _read_lines("train/in.tsv.xz", opener=lzma.open)
y_train_raw = _read_lines('train/expected.tsv')
X_dev0_raw = _read_lines("dev-0/in.tsv")
y_expected_dev0_raw = _read_lines("dev-0/expected.tsv")
X_dev1_raw = _read_lines("dev-1/in.tsv")
y_expected_dev1_raw = _read_lines("dev-1/expected.tsv")
# Read once: the original loaded this same file twice in a row.
X_test_raw = _read_lines("test-A/in.tsv")

# Remove the trailing "<tab>[not-]for-humans<tab>[not-]contaminated<newline>"
# columns from each dev/test line so only the leading text is kept.
# The same substitution is applied to all three splits in a single pass.
X_dev0_cleaned, X_dev1_cleaned, X_test_cleaned = (
    [
        re.sub('\t(not-)?for-humans\t(not-)?contaminated\n', '', row)
        for row in split_rows
    ]
    for split_rows in (X_dev0_raw, X_dev1_raw, X_test_raw)
)

# Build the token-count vocabulary from the training lines only, then encode
# the dev and test splits with that same fitted vocabulary.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train_raw)
X_dev0_counts, X_dev1_counts, X_test_counts = (
    count_vect.transform(documents)
    for documents in (X_dev0_cleaned, X_dev1_cleaned, X_test_cleaned)
)
/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:35: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps,
/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:597: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:836: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:862: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, positive=False):
/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:1097: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,
/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:1344: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,
/usr/lib/python3/dist-packages/sklearn/linear_model/least_angle.py:1480: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, positive=False):
/usr/lib/python3/dist-packages/sklearn/linear_model/randomized_l1.py:152: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  precompute=False, eps=np.finfo(np.float).eps,
/usr/lib/python3/dist-packages/sklearn/linear_model/randomized_l1.py:320: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, random_state=None,
/usr/lib/python3/dist-packages/sklearn/linear_model/randomized_l1.py:580: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=4 * np.finfo(np.float).eps, n_jobs=None,
/usr/lib/python3/dist-packages/sklearn/feature_extraction/image.py:167: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int):
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 3427, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-1-a756e6b5ff53>", line 22, in <module>
    X_train_counts = count_vect.fit_transform(X_train_raw)
  File "/usr/lib/python3/dist-packages/sklearn/feature_extraction/text.py", line 1032, in fit_transform
    self.fixed_vocabulary_)
  File "/usr/lib/python3/dist-packages/sklearn/feature_extraction/text.py", line 944, in _count_vocab
    feature_idx = vocabulary[feature]
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 2054, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py", line 1101, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py", line 248, in wrapped
    return f(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py", line 281, in _fixed_getinnerframes
    records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))
  File "/usr/lib/python3.7/inspect.py", line 1502, in getinnerframes
    frameinfo = (tb.tb_frame,) + getframeinfo(tb, context)
  File "/usr/lib/python3.7/inspect.py", line 1460, in getframeinfo
    filename = getsourcefile(frame) or getfile(frame)
  File "/usr/lib/python3.7/inspect.py", line 696, in getsourcefile
    if getattr(getmodule(object, filename), '__loader__', None) is not None:
  File "/usr/lib/python3.7/inspect.py", line 732, in getmodule
    for modname, module in list(sys.modules.items()):
KeyboardInterrupt
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
    [... skipping hidden 1 frame]

<ipython-input-1-a756e6b5ff53> in <module>
     21 count_vect = CountVectorizer()
---> 22 X_train_counts = count_vect.fit_transform(X_train_raw)
     23 X_dev0_counts = count_vect.transform(X_dev0_cleaned)

/usr/lib/python3/dist-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
   1031         vocabulary, X = self._count_vocab(raw_documents,
-> 1032                                           self.fixed_vocabulary_)
   1033 

/usr/lib/python3/dist-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
    943                 try:
--> 944                     feature_idx = vocabulary[feature]
    945                     if feature_idx not in feature_counter:

KeyboardInterrupt: 

During handling of the above exception, another exception occurred:

AttributeError                            Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py in showtraceback(self, exc_tuple, filename, tb_offset, exception_only, running_compiled_code)
   2053                         # in the engines. This should return a list of strings.
-> 2054                         stb = value._render_traceback_()
   2055                     except Exception:

AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
    [... skipping hidden 1 frame]

/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py in showtraceback(self, exc_tuple, filename, tb_offset, exception_only, running_compiled_code)
   2055                     except Exception:
   2056                         stb = self.InteractiveTB.structured_traceback(etype,
-> 2057                                             value, tb, tb_offset=tb_offset)
   2058 
   2059                     self._showtraceback(etype, value, stb)

/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py in structured_traceback(self, etype, value, tb, tb_offset, number_of_lines_of_context)
   1366             self.tb = tb
   1367         return FormattedTB.structured_traceback(
-> 1368             self, etype, value, tb, tb_offset, number_of_lines_of_context)
   1369 
   1370 

/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py in structured_traceback(self, etype, value, tb, tb_offset, number_of_lines_of_context)
   1266             # Verbose modes need a full traceback
   1267             return VerboseTB.structured_traceback(
-> 1268                 self, etype, value, tb, tb_offset, number_of_lines_of_context
   1269             )
   1270         elif mode == 'Minimal':

/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py in structured_traceback(self, etype, evalue, etb, tb_offset, number_of_lines_of_context)
   1123 
   1124         formatted_exception = self.format_exception_as_a_whole(etype, evalue, etb, number_of_lines_of_context,
-> 1125                                                                tb_offset)
   1126 
   1127         colors = self.Colors  # just a shorthand + quicker name lookup

/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py in format_exception_as_a_whole(self, etype, evalue, etb, number_of_lines_of_context, tb_offset)
   1080 
   1081 
-> 1082         last_unique, recursion_repeat = find_recursion(orig_etype, evalue, records)
   1083 
   1084         frames = self.format_records(records, last_unique, recursion_repeat)

/usr/local/lib/python3.7/dist-packages/IPython/core/ultratb.py in find_recursion(etype, value, records)
    380     # first frame (from in to out) that looks different.
    381     if not is_recursion_error(etype, value, records):
--> 382         return len(records), 0
    383 
    384     # Select filename, lineno, func_name to track frames with

TypeError: object of type 'NoneType' has no len()
# clf = MLPClassifier().fit(X_train_counts, y_train_raw)

# y_predicted_dev0_mlp = clf.predict(X_dev0_counts)
# y_predicted_dev1_mlp = clf.predict(X_dev1_counts)

# accuracy_dev0_mlp = accuracy_score(y_expected_dev0_raw, y_predicted_dev0_mlp)
# print(f"Accuracy dev0: {accuracy_dev0_mlp}")
# accuracy_dev1_mlp = accuracy_score(y_expected_dev1_raw, y_predicted_dev1_mlp)
# print(f"Accuracy dev1: {accuracy_dev1_mlp}")
# Train a logistic-regression classifier on the token counts.
# max_iter is an estimator hyper-parameter and must be given to the
# constructor; fit() does not accept it and would raise a TypeError.
model = LogisticRegression(max_iter=300).fit(X_train_counts, y_train_raw)

# Predict labels for both development splits.
y_predicted_dev0_logreg = model.predict(X_dev0_counts)
y_predicted_dev1_logreg = model.predict(X_dev1_counts)

# Report accuracy of the predictions against the expected labels of each split.
accuracy_dev0_logreg = accuracy_score(y_expected_dev0_raw, y_predicted_dev0_logreg)
print(f"Accuracy dev0: {accuracy_dev0_logreg}")
accuracy_dev1_logreg = accuracy_score(y_expected_dev1_raw, y_predicted_dev1_logreg)
print(f"Accuracy dev1: {accuracy_dev1_logreg}")
# from sklearn.naive_bayes import MultinomialNB
# clf2 = MultinomialNB().fit(X_train_counts, y_train_raw)

# y_predicted_dev0_MNB = clf2.predict(X_dev0_counts)
# y_predicted_dev1_MNB = clf2.predict(X_dev1_counts)

# accuracy_dev0_MNB = accuracy_score(y_expected_dev0_raw, y_predicted_dev0_MNB)
# print(f"Accuracy dev0: {accuracy_dev0_MNB}")
# accuracy_dev1_MNB = accuracy_score(y_expected_dev1_raw, y_predicted_dev1_MNB)
# print(f"Accuracy dev1: {accuracy_dev1_MNB}")
# Values written to out.tsv, one per line.
# NOTE(review): these are hard-coded rather than model predictions — confirm
# this placeholder is intentional.
lista = [1, 0, 1, 0, 0]
# The context manager flushes and closes the file as soon as writing is done,
# instead of leaving that to garbage collection of an anonymous handle.
with open("out.tsv", mode='w') as out_file:
    out_file.writelines(f"{value}\n" for value in lista)