18 KiB
18 KiB
import numpy as np
from collections import Counter
import pandas as pd
# Load the training split. The file has no header row: the first column is read as the
# label `y` and the second as the text `x`. Malformed rows are dropped silently
# (on_bad_lines='skip').
# The path uses forward slashes so it resolves on every platform; the earlier
# backslash-separated literal is only treated as a directory path on Windows.
df = pd.read_csv('train/train.tsv', sep='\t', on_bad_lines='skip', names=['y', 'x'])
# Put the text column first and the label second.
df = df.loc[:, ['x', 'y']]
df
[1;31m---------------------------------------------------------------------------[0m [1;31mFileNotFoundError[0m Traceback (most recent call last) Input [1;32mIn [2][0m, in [0;36m<cell line: 1>[1;34m()[0m [1;32m----> 1[0m df [38;5;241m=[39m [43mpd[49m[38;5;241;43m.[39;49m[43mread_csv[49m[43m([49m[38;5;124;43m'[39;49m[38;5;124;43mtrain[39;49m[38;5;130;43;01m\\\\[39;49;00m[38;5;124;43mtrain.tsv[39;49m[38;5;124;43m'[39;49m[43m,[49m[43m [49m[43msep[49m[43m [49m[38;5;241;43m=[39;49m[43m [49m[38;5;124;43m'[39;49m[38;5;130;43;01m\t[39;49;00m[38;5;124;43m'[39;49m[43m,[49m[43m [49m[43mon_bad_lines[49m[38;5;241;43m=[39;49m[38;5;124;43m'[39;49m[38;5;124;43mskip[39;49m[38;5;124;43m'[39;49m[43m,[49m[43m [49m[43mnames[49m[38;5;241;43m=[39;49m[43m[[49m[38;5;124;43m'[39;49m[38;5;124;43my[39;49m[38;5;124;43m'[39;49m[43m,[49m[43m [49m[38;5;124;43m'[39;49m[38;5;124;43mx[39;49m[38;5;124;43m'[39;49m[43m][49m[43m)[49m [0;32m 2[0m df [38;5;241m=[39m df[38;5;241m.[39mloc[:,[[38;5;124m'[39m[38;5;124mx[39m[38;5;124m'[39m,[38;5;124m'[39m[38;5;124my[39m[38;5;124m'[39m]] [0;32m 3[0m df File [1;32m~\AppData\Roaming\Python\Python310\site-packages\pandas\util\_decorators.py:311[0m, in [0;36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper[1;34m(*args, **kwargs)[0m [0;32m 305[0m [38;5;28;01mif[39;00m [38;5;28mlen[39m(args) [38;5;241m>[39m num_allow_args: [0;32m 306[0m warnings[38;5;241m.[39mwarn( [0;32m 307[0m msg[38;5;241m.[39mformat(arguments[38;5;241m=[39marguments), [0;32m 308[0m [38;5;167;01mFutureWarning[39;00m, [0;32m 309[0m stacklevel[38;5;241m=[39mstacklevel, [0;32m 310[0m ) [1;32m--> 311[0m [38;5;28;01mreturn[39;00m func([38;5;241m*[39margs, [38;5;241m*[39m[38;5;241m*[39mkwargs) File [1;32m~\AppData\Roaming\Python\Python310\site-packages\pandas\io\parsers\readers.py:680[0m, in [0;36mread_csv[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, 
skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)[0m [0;32m 665[0m kwds_defaults [38;5;241m=[39m _refine_defaults_read( [0;32m 666[0m dialect, [0;32m 667[0m delimiter, [1;32m (...)[0m [0;32m 676[0m defaults[38;5;241m=[39m{[38;5;124m"[39m[38;5;124mdelimiter[39m[38;5;124m"[39m: [38;5;124m"[39m[38;5;124m,[39m[38;5;124m"[39m}, [0;32m 677[0m ) [0;32m 678[0m kwds[38;5;241m.[39mupdate(kwds_defaults) [1;32m--> 680[0m [38;5;28;01mreturn[39;00m [43m_read[49m[43m([49m[43mfilepath_or_buffer[49m[43m,[49m[43m [49m[43mkwds[49m[43m)[49m File [1;32m~\AppData\Roaming\Python\Python310\site-packages\pandas\io\parsers\readers.py:575[0m, in [0;36m_read[1;34m(filepath_or_buffer, kwds)[0m [0;32m 572[0m _validate_names(kwds[38;5;241m.[39mget([38;5;124m"[39m[38;5;124mnames[39m[38;5;124m"[39m, [38;5;28;01mNone[39;00m)) [0;32m 574[0m [38;5;66;03m# Create the parser.[39;00m [1;32m--> 575[0m parser [38;5;241m=[39m TextFileReader(filepath_or_buffer, [38;5;241m*[39m[38;5;241m*[39mkwds) [0;32m 577[0m [38;5;28;01mif[39;00m chunksize [38;5;129;01mor[39;00m iterator: [0;32m 578[0m [38;5;28;01mreturn[39;00m parser File [1;32m~\AppData\Roaming\Python\Python310\site-packages\pandas\io\parsers\readers.py:933[0m, in [0;36mTextFileReader.__init__[1;34m(self, f, engine, **kwds)[0m [0;32m 930[0m [38;5;28mself[39m[38;5;241m.[39moptions[[38;5;124m"[39m[38;5;124mhas_index_names[39m[38;5;124m"[39m] [38;5;241m=[39m kwds[[38;5;124m"[39m[38;5;124mhas_index_names[39m[38;5;124m"[39m] [0;32m 932[0m [38;5;28mself[39m[38;5;241m.[39mhandles: IOHandles [38;5;241m|[39m [38;5;28;01mNone[39;00m [38;5;241m=[39m 
[38;5;28;01mNone[39;00m [1;32m--> 933[0m [38;5;28mself[39m[38;5;241m.[39m_engine [38;5;241m=[39m [38;5;28;43mself[39;49m[38;5;241;43m.[39;49m[43m_make_engine[49m[43m([49m[43mf[49m[43m,[49m[43m [49m[38;5;28;43mself[39;49m[38;5;241;43m.[39;49m[43mengine[49m[43m)[49m File [1;32m~\AppData\Roaming\Python\Python310\site-packages\pandas\io\parsers\readers.py:1217[0m, in [0;36mTextFileReader._make_engine[1;34m(self, f, engine)[0m [0;32m 1213[0m mode [38;5;241m=[39m [38;5;124m"[39m[38;5;124mrb[39m[38;5;124m"[39m [0;32m 1214[0m [38;5;66;03m# error: No overload variant of "get_handle" matches argument types[39;00m [0;32m 1215[0m [38;5;66;03m# "Union[str, PathLike[str], ReadCsvBuffer[bytes], ReadCsvBuffer[str]]"[39;00m [0;32m 1216[0m [38;5;66;03m# , "str", "bool", "Any", "Any", "Any", "Any", "Any"[39;00m [1;32m-> 1217[0m [38;5;28mself[39m[38;5;241m.[39mhandles [38;5;241m=[39m [43mget_handle[49m[43m([49m[43m [49m[38;5;66;43;03m# type: ignore[call-overload][39;49;00m [0;32m 1218[0m [43m [49m[43mf[49m[43m,[49m [0;32m 1219[0m [43m [49m[43mmode[49m[43m,[49m [0;32m 1220[0m [43m [49m[43mencoding[49m[38;5;241;43m=[39;49m[38;5;28;43mself[39;49m[38;5;241;43m.[39;49m[43moptions[49m[38;5;241;43m.[39;49m[43mget[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mencoding[39;49m[38;5;124;43m"[39;49m[43m,[49m[43m [49m[38;5;28;43;01mNone[39;49;00m[43m)[49m[43m,[49m [0;32m 1221[0m [43m [49m[43mcompression[49m[38;5;241;43m=[39;49m[38;5;28;43mself[39;49m[38;5;241;43m.[39;49m[43moptions[49m[38;5;241;43m.[39;49m[43mget[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mcompression[39;49m[38;5;124;43m"[39;49m[43m,[49m[43m [49m[38;5;28;43;01mNone[39;49;00m[43m)[49m[43m,[49m [0;32m 1222[0m [43m [49m[43mmemory_map[49m[38;5;241;43m=[39;49m[38;5;28;43mself[39;49m[38;5;241;43m.[39;49m[43moptions[49m[38;5;241;43m.[39;49m[43mget[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mmemory_map[39;49m[38;5;124;43m"[39;49m[43m,[49m[43m [49m[38;5;28;43;01mFalse[39;49;00m[43m)[49m[43m,[49m [0;32m 1223[0m [43m 
[49m[43mis_text[49m[38;5;241;43m=[39;49m[43mis_text[49m[43m,[49m [0;32m 1224[0m [43m [49m[43merrors[49m[38;5;241;43m=[39;49m[38;5;28;43mself[39;49m[38;5;241;43m.[39;49m[43moptions[49m[38;5;241;43m.[39;49m[43mget[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mencoding_errors[39;49m[38;5;124;43m"[39;49m[43m,[49m[43m [49m[38;5;124;43m"[39;49m[38;5;124;43mstrict[39;49m[38;5;124;43m"[39;49m[43m)[49m[43m,[49m [0;32m 1225[0m [43m [49m[43mstorage_options[49m[38;5;241;43m=[39;49m[38;5;28;43mself[39;49m[38;5;241;43m.[39;49m[43moptions[49m[38;5;241;43m.[39;49m[43mget[49m[43m([49m[38;5;124;43m"[39;49m[38;5;124;43mstorage_options[39;49m[38;5;124;43m"[39;49m[43m,[49m[43m [49m[38;5;28;43;01mNone[39;49;00m[43m)[49m[43m,[49m [0;32m 1226[0m [43m[49m[43m)[49m [0;32m 1227[0m [38;5;28;01massert[39;00m [38;5;28mself[39m[38;5;241m.[39mhandles [38;5;129;01mis[39;00m [38;5;129;01mnot[39;00m [38;5;28;01mNone[39;00m [0;32m 1228[0m f [38;5;241m=[39m [38;5;28mself[39m[38;5;241m.[39mhandles[38;5;241m.[39mhandle File [1;32m~\AppData\Roaming\Python\Python310\site-packages\pandas\io\common.py:789[0m, in [0;36mget_handle[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)[0m [0;32m 784[0m [38;5;28;01melif[39;00m [38;5;28misinstance[39m(handle, [38;5;28mstr[39m): [0;32m 785[0m [38;5;66;03m# Check whether the filename is to be opened in binary mode.[39;00m [0;32m 786[0m [38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.[39;00m [0;32m 787[0m [38;5;28;01mif[39;00m ioargs[38;5;241m.[39mencoding [38;5;129;01mand[39;00m [38;5;124m"[39m[38;5;124mb[39m[38;5;124m"[39m [38;5;129;01mnot[39;00m [38;5;129;01min[39;00m ioargs[38;5;241m.[39mmode: [0;32m 788[0m [38;5;66;03m# Encoding[39;00m [1;32m--> 789[0m handle [38;5;241m=[39m [38;5;28;43mopen[39;49m[43m([49m [0;32m 790[0m [43m [49m[43mhandle[49m[43m,[49m [0;32m 791[0m [43m [49m[43mioargs[49m[38;5;241;43m.[39;49m[43mmode[49m[43m,[49m [0;32m 792[0m [43m 
[49m[43mencoding[49m[38;5;241;43m=[39;49m[43mioargs[49m[38;5;241;43m.[39;49m[43mencoding[49m[43m,[49m [0;32m 793[0m [43m [49m[43merrors[49m[38;5;241;43m=[39;49m[43merrors[49m[43m,[49m [0;32m 794[0m [43m [49m[43mnewline[49m[38;5;241;43m=[39;49m[38;5;124;43m"[39;49m[38;5;124;43m"[39;49m[43m,[49m [0;32m 795[0m [43m [49m[43m)[49m [0;32m 796[0m [38;5;28;01melse[39;00m: [0;32m 797[0m [38;5;66;03m# Binary mode[39;00m [0;32m 798[0m handle [38;5;241m=[39m [38;5;28mopen[39m(handle, ioargs[38;5;241m.[39mmode) [1;31mFileNotFoundError[0m: [Errno 2] No such file or directory: 'train\\\\train.tsv'
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB

# Labels and raw documents are taken straight from the training frame.
# (Earlier experiments -- a 20000-row subsample and a DictVectorizer-based
# featurisation -- were disabled in favour of the TF-IDF pipeline below.)
Y = df['y']
data = df['x']
data

# Learn the TF-IDF vocabulary from the training text and vectorise it in one call.
dv = TfidfVectorizer()
X = dv.fit_transform(data)
Y, X

# Fit a Bernoulli naive Bayes classifier on the TF-IDF matrix; `t` keeps the
# value returned by fit().
algorithm = BernoulliNB()
t = algorithm.fit(X, Y)
from sklearn.metrics import log_loss, accuracy_score

# Dev split: input texts and expected labels live in two separate header-less TSV files.
# Forward slashes keep the relative paths valid on every platform; the earlier
# backslash-separated literals are only treated as directory paths on Windows.
# NOTE(review): on_bad_lines='warn' skips malformed rows in each file independently, so
# `test` and `Y_t` can end up with different lengths (rows 1983 and 5199 were once
# dropped by hand here) -- confirm the two frames stay aligned before trusting the metrics.
test = pd.read_csv('dev-0/in.tsv', sep='\t', on_bad_lines='warn', names=['x'])
Y_t = pd.read_csv('dev-0/expected.tsv', sep='\t', on_bad_lines='warn', names=['y'])

# Reuse the vocabulary fitted on the training data; never refit on dev text.
X_t = dv.transform(test.x)

# Score the dev set once and derive both outputs from the same matrix.
# Column 1 is the probability of the second entry of `algorithm.classes_`
# (presumably the positive label -- verify against the training labels).
dev_proba = algorithm.predict_proba(X_t)
prediction = list(dev_proba[:, 1])
prediction
# Hard decisions are the rounded probabilities, kept as 0.0 / 1.0 floats.
prediction_bin = list(dev_proba.round()[:, 1])
prediction_bin

# One prediction per line, no header and no index column.
out = pd.DataFrame(prediction_bin)
out.to_csv('out_2.tsv', sep='\t', index=False, header=False)

print(log_loss(Y_t, prediction))
print(accuracy_score(Y_t, prediction_bin))
# Unlabelled test-A split: read the texts, vectorise them with the vocabulary fitted on
# the training data, and write one hard prediction per line.
# Forward slashes keep the relative path valid on every platform; the earlier
# backslash-separated literal is only treated as a directory path on Windows.
# NOTE(review): on_bad_lines='warn' skips malformed rows, so the output file may hold
# fewer lines than the input -- confirm this is acceptable for the submission format.
test_a_raw = pd.read_csv('test-A/in.tsv', sep='\t', on_bad_lines='warn', names=['x'])
# `test_a` ends up bound to the feature matrix rather than the raw frame.
test_a = dv.transform(test_a_raw.x)
test_a

# Score once and derive both the probabilities and the rounded decisions from it.
# Column 1 is the probability of the second entry of `algorithm.classes_`.
test_a_proba = algorithm.predict_proba(test_a)
prediction = list(test_a_proba[:, 1])
prediction_bin = list(test_a_proba.round()[:, 1])
prediction_bin

out = pd.DataFrame(prediction_bin)
out.to_csv('out_t.tsv', sep='\t', index=False, header=False)