diff --git a/run.ipynb b/run.ipynb index d190601..0bf04d2 100644 --- a/run.ipynb +++ b/run.ipynb @@ -132,7 +132,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "3bc8418b-64f1-4163-a0ec-8e3293032341", "metadata": {}, "outputs": [], @@ -152,10 +152,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "a18aea56-7fa1-40bd-8aa3-bbaf9d66d6b7", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[NbConvertApp] Converting notebook run.ipynb to script\n", + "[NbConvertApp] Writing 1608 bytes to run.py\n" + ] + } + ], "source": [ "!jupyter nbconvert --to script run.ipynb" ] diff --git a/run.py b/run.py index 821fffd..9afc0dc 100644 --- a/run.py +++ b/run.py @@ -19,7 +19,7 @@ from sklearn.pipeline import make_pipeline train = pd.read_csv('train/train.tsv', header=None, sep='\t', error_bad_lines=False) print(len(train)) -train = train.head(20000) +train = train.head(100000) # In[3]: @@ -39,14 +39,14 @@ x_dev[20000] = "a ten tekst jest najbardziej testowy" y_dev = pd.read_csv('dev-0/expected.tsv', header=None, sep='\t') -# In[5]: +# In[ ]: model = make_pipeline(TfidfVectorizer(), LinearRegression()) model.fit(x_train, y_train) -# In[6]: +# In[ ]: dev_predicted = model.predict(x_dev) @@ -59,7 +59,7 @@ dev_out = pd.read_csv('dev-0/out.tsv', header=None, sep='\t') dev_expected = pd.read_csv('dev-0/expected.tsv', header=None, sep='\t') -# In[7]: +# In[ ]: print(mean_squared_error(dev_out, dev_expected))