sport-text-classification-b.../sport-text-classification-b...

18 KiB
Raw Permalink Blame History

import numpy as np
from collections import Counter
import pandas as pd
# Load the training set from a headerless TSV; the first column is named 'y'
# and the second 'x'. Rows the parser cannot handle are skipped silently.
# Forward slashes keep the relative path valid on every OS, whereas a
# backslash-separated literal only resolves on Windows.
df = pd.read_csv(
    'train/train.tsv',
    sep='\t',
    on_bad_lines='skip',
    names=['y', 'x'],
)
# Reorder the columns so that 'x' comes before 'y'.
df = df.loc[:, ['x', 'y']]
df
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Input In [2], in <cell line: 1>()
----> 1 df = pd.read_csv('train\\\\train.tsv', sep = '\t', on_bad_lines='skip', names=['y', 'x'])
      2 df = df.loc[:,['x','y']]
      3 df

File ~\AppData\Roaming\Python\Python310\site-packages\pandas\util\_decorators.py:311, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    305 if len(args) > num_allow_args:
    306     warnings.warn(
    307         msg.format(arguments=arguments),
    308         FutureWarning,
    309         stacklevel=stacklevel,
    310     )
--> 311 return func(*args, **kwargs)

File ~\AppData\Roaming\Python\Python310\site-packages\pandas\io\parsers\readers.py:680, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
    665 kwds_defaults = _refine_defaults_read(
    666     dialect,
    667     delimiter,
   (...)
    676     defaults={"delimiter": ","},
    677 )
    678 kwds.update(kwds_defaults)
--> 680 return _read(filepath_or_buffer, kwds)

File ~\AppData\Roaming\Python\Python310\site-packages\pandas\io\parsers\readers.py:575, in _read(filepath_or_buffer, kwds)
    572 _validate_names(kwds.get("names", None))
    574 # Create the parser.
--> 575 parser = TextFileReader(filepath_or_buffer, **kwds)
    577 if chunksize or iterator:
    578     return parser

File ~\AppData\Roaming\Python\Python310\site-packages\pandas\io\parsers\readers.py:933, in TextFileReader.__init__(self, f, engine, **kwds)
    930     self.options["has_index_names"] = kwds["has_index_names"]
    932 self.handles: IOHandles | None = None
--> 933 self._engine = self._make_engine(f, self.engine)

File ~\AppData\Roaming\Python\Python310\site-packages\pandas\io\parsers\readers.py:1217, in TextFileReader._make_engine(self, f, engine)
   1213     mode = "rb"
   1214 # error: No overload variant of "get_handle" matches argument types
   1215 # "Union[str, PathLike[str], ReadCsvBuffer[bytes], ReadCsvBuffer[str]]"
   1216 # , "str", "bool", "Any", "Any", "Any", "Any", "Any"
-> 1217 self.handles = get_handle(  # type: ignore[call-overload]
   1218     f,
   1219     mode,
   1220     encoding=self.options.get("encoding", None),
   1221     compression=self.options.get("compression", None),
   1222     memory_map=self.options.get("memory_map", False),
   1223     is_text=is_text,
   1224     errors=self.options.get("encoding_errors", "strict"),
   1225     storage_options=self.options.get("storage_options", None),
   1226 )
   1227 assert self.handles is not None
   1228 f = self.handles.handle

File ~\AppData\Roaming\Python\Python310\site-packages\pandas\io\common.py:789, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    784 elif isinstance(handle, str):
    785     # Check whether the filename is to be opened in binary mode.
    786     # Binary mode does not support 'encoding' and 'newline'.
    787     if ioargs.encoding and "b" not in ioargs.mode:
    788         # Encoding
--> 789         handle = open(
    790             handle,
    791             ioargs.mode,
    792             encoding=ioargs.encoding,
    793             errors=errors,
    794             newline="",
    795         )
    796     else:
    797         # Binary mode
    798         handle = open(handle, ioargs.mode)

FileNotFoundError: [Errno 2] No such file or directory: 'train\\\\train.tsv'
# Pull the two columns of the training frame apart: 'y' becomes the target,
# 'x' the raw documents handed to the vectorizer.
Y = df['y']
data = df['x']
data

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training documents and transform them in
# the same call; `dv` is kept so later data can be transformed with the
# vocabulary learned here.
dv = TfidfVectorizer()
X = dv.fit_transform(data)
Y, X
from sklearn.naive_bayes import BernoulliNB

# Train a Bernoulli naive Bayes classifier on the TF-IDF features.
# fit() returns the estimator itself, so `algorithm` and `t` both refer to
# the same fitted model.
algorithm = t = BernoulliNB().fit(X, Y)
# Load the dev-0 inputs ('x') and their expected labels ('y') from headerless
# TSV files. Forward slashes keep the relative paths valid on every OS.
test = pd.read_csv('dev-0/in.tsv', sep='\t', on_bad_lines='warn', names=['x'])
Y_t = pd.read_csv('dev-0/expected.tsv', sep='\t', on_bad_lines='warn', names=['y'])
# NOTE(review): the two files are parsed independently with
# on_bad_lines='warn', which drops malformed rows; if a row is dropped from
# one file but not the other, inputs and labels no longer line up. Confirm
# that len(test) == len(Y_t) before trusting the metrics.

# Vectorize with the vocabulary fitted on the training data.
X_t = dv.transform(test.x)

# Run the classifier once and derive both outputs from the same probability
# matrix instead of calling predict_proba twice.
dev_proba = algorithm.predict_proba(X_t)

# Column 1 of the probability matrix — presumably the positive class;
# verify against algorithm.classes_.
prediction = [item[1] for item in dev_proba]
prediction
# Hard decisions obtained by rounding the probabilities to 0.0 / 1.0.
prediction_bin = [item[1] for item in dev_proba.round()]
prediction_bin
# Write the rounded dev-0 predictions to disk: a single tab-separated column
# with neither row index nor header line.
out = pd.DataFrame(data=prediction_bin)
out.to_csv(
    'out_2.tsv',
    header=False,
    index=False,
    sep='\t',
)
from sklearn.metrics import accuracy_score, log_loss

# Report dev-0 quality: log loss is computed on the probabilities, accuracy
# on the rounded predictions. Printed in that order, one value per line.
for metric, y_pred in ((log_loss, prediction), (accuracy_score, prediction_bin)):
    print(metric(Y_t, y_pred))
# Load the test-A inputs from a headerless, single-column TSV. Forward
# slashes keep the relative path valid on every OS.
test_a = pd.read_csv('test-A/in.tsv', sep='\t', on_bad_lines='warn', names=['x'])

# From here on `test_a` holds the vectorized feature matrix rather than the
# raw frame; it is transformed with the vocabulary fitted on the training data.
test_a = dv.transform(test_a.x)
test_a

# Run the classifier once and reuse the probability matrix for both the soft
# and the rounded predictions instead of calling predict_proba twice.
test_proba = algorithm.predict_proba(test_a)
# Column 1 of the probability matrix — presumably the positive class;
# verify against algorithm.classes_.
prediction = [item[1] for item in test_proba]
prediction_bin = [item[1] for item in test_proba.round()]
prediction_bin

# Write the rounded predictions as a single column without index or header.
out = pd.DataFrame(prediction_bin)
out.to_csv('out_t.tsv', sep='\t', index=False, header=False)