retroc2/run.ipynb
Adrian Charkiewicz f69ed316f2 s444354 larger_set
2022-05-14 03:41:15 +02:00

15 KiB
Raw Permalink Blame History

import lzma
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
import pandas as pd
# Read the xz-compressed training TSV as UTF-8 text. Column names are passed
# explicitly, so pandas treats every line of the file as data.
with lzma.open('train/train.tsv.xz', mode='rt', encoding="utf-8") as f:
    df = pd.read_csv(
        f,
        sep='\t',
        names=['Begin', 'End', 'Title', 'Publisher', 'Text'],
    )
def readFile(filename):
    """Return the first tab-separated field of every line in *filename*.

    The file is read as UTF-8. Each line is cut at its first tab, the part
    before the tab is stripped of surrounding whitespace, and the resulting
    strings are returned as a list in file order (one entry per line).
    """
    with open(filename, 'r', encoding="utf-8") as source:
        return [row.partition("\t")[0].strip() for row in source]
def predict(filename, predictions):
    """Write *predictions* to *filename*, one value per line.

    Args:
        filename: Path of the output file; it is created or truncated.
        predictions: Iterable of values; each is converted with ``str``.

    The file is written as UTF-8 explicitly, matching ``readFile``, so the
    output does not depend on the platform's default encoding.
    """
    with open(filename, "w", encoding="utf-8") as f:
        # A single writelines call replaces one write call per prediction.
        f.writelines(str(p) + "\n" for p in predictions)
# Keep only the document text (the feature) and the 'Begin' column (the
# regression target).
df = df[['Text', 'Begin']]
X_train, y_train = df['Text'], df['Begin']

# TF-IDF features feed an ordinary least-squares linear regression.
# NOTE(review): the recorded output of this cell shows the fit being stopped
# by a KeyboardInterrupt inside the LinearRegression step — confirm the model
# is actually fitted before predictions are generated.
model = make_pipeline(TfidfVectorizer(), LinearRegression())
model.fit(X_train, y_train)
[Pipeline] ... (step 1 of 2) Processing tfidfvectorizer, total= 1.3min
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_21668/3545253539.py in <module>
      4 y_train = df['Begin']
      5 model = make_pipeline(TfidfVectorizer(), LinearRegression(), verbose=2)
----> 6 model.fit(X_train, y_train)

~\anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
    392             if self._final_estimator != "passthrough":
    393                 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 394                 self._final_estimator.fit(Xt, y, **fit_params_last_step)
    395 
    396         return self

~\anaconda3\lib\site-packages\sklearn\linear_model\_base.py in fit(self, X, y, sample_weight)
    704 
    705             if y.ndim < 2:
--> 706                 out = sparse_lsqr(X_centered, y)
    707                 self.coef_ = out[0]
    708                 self._residues = out[3]

~\anaconda3\lib\site-packages\scipy\sparse\linalg\isolve\lsqr.py in lsqr(A, b, damp, atol, btol, conlim, iter_lim, show, calc_var, x0)
    411         #     beta*u  =  a*v   -  alfa*u,
    412         #     alfa*v  =  A'*u  -  beta*v.
--> 413         u = A.matvec(v) - alfa * u
    414         beta = np.linalg.norm(u)
    415 

~\anaconda3\lib\site-packages\scipy\sparse\linalg\interface.py in matvec(self, x)
    230             raise ValueError('dimension mismatch')
    231 
--> 232         y = self._matvec(x)
    233 
    234         if isinstance(x, np.matrix):

~\anaconda3\lib\site-packages\scipy\sparse\linalg\interface.py in _matvec(self, x)
    528 
    529     def _matvec(self, x):
--> 530         return self.__matvec_impl(x)
    531 
    532     def _rmatvec(self, x):

~\anaconda3\lib\site-packages\sklearn\linear_model\_base.py in matvec(b)
    694 
    695             def matvec(b):
--> 696                 return X.dot(b) - b.dot(X_offset_scale)
    697 
    698             def rmatvec(b):

~\anaconda3\lib\site-packages\scipy\sparse\base.py in dot(self, other)
    357 
    358         """
--> 359         return self * other
    360 
    361     def power(self, n, dtype=None):

~\anaconda3\lib\site-packages\scipy\sparse\base.py in __mul__(self, other)
    465             # Fast path for the most common case
    466             if other.shape == (N,):
--> 467                 return self._mul_vector(other)
    468             elif other.shape == (N, 1):
    469                 return self._mul_vector(other.ravel()).reshape(M, 1)

~\anaconda3\lib\site-packages\scipy\sparse\compressed.py in _mul_vector(self, other)
    476         # csr_matvec or csc_matvec
    477         fn = getattr(_sparsetools, self.format + '_matvec')
--> 478         fn(M, N, self.indptr, self.indices, self.data, other, result)
    479 
    480         return result

KeyboardInterrupt: 
# (input path, output path) pairs for every split that needs predictions.
filenames = [
    ('dev-0/in.tsv', 'dev-0/out.tsv'),
    ('dev-1/in.tsv', 'dev-1/out.tsv'),
    ('test-A/in.tsv', 'test-A/out.tsv'),
]
for in_path, out_path in filenames:
    # Read the texts, run them through the fitted pipeline, and save the
    # predicted values next to the input file.
    texts = readFile(in_path)
    predict(out_path, model.predict(texts))