retroc2/run.ipynb

4.4 KiB

from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
import lzma
import numpy as np

def _read_lines(path, opener=open):
    """Return all lines of the text file at *path*, closing it afterwards.

    *opener* is the callable used to open the file: the builtin ``open`` for
    plain files, or ``lzma.open`` for ``.xz`` archives. Lines keep their
    trailing newline, exactly as ``readlines()`` returns them.
    """
    with opener(path, mode="rt") as handle:
        return handle.readlines()


# Raw inputs. Every handle is closed as soon as its lines have been read.
X_train_raw = _read_lines("train/train.tsv.xz", opener=lzma.open)
X_dev0 = _read_lines("dev-0/in.tsv")
X_dev1 = _read_lines("dev-1/in.tsv")
X_test = _read_lines("test-A/in.tsv")

# Targets are converted to floats explicitly. Passing the raw strings makes
# scikit-learn emit a FutureWarning ("Arrays of bytes/strings is being
# converted to decimal numbers") and the implicit conversion is removed in
# scikit-learn 1.1. float() ignores the trailing newline.
y_expected_dev0 = [float(line) for line in _read_lines("dev-0/expected.tsv")]
y_expected_dev1 = [float(line) for line in _read_lines("dev-1/expected.tsv")]

# Each training row is tab-separated: column 0 is used as the regression
# target and column 4 as the document text.
X = [row.split('\t') for row in X_train_raw]
X_train = [fields[4] for fields in X]
y_expected_train = [float(fields[0]) for fields in X]

# The TF-IDF vocabulary is learned from the training texts only; the other
# splits are projected onto that same vocabulary.
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_dev0_tfidf = vectorizer.transform(X_dev0)
X_dev1_tfidf = vectorizer.transform(X_dev1)
X_test_tfidf = vectorizer.transform(X_test)

model = LinearRegression()
model.fit(X_train_tfidf, y_expected_train)

y_predicted_dev0 = model.predict(X_dev0_tfidf)
y_predicted_dev1 = model.predict(X_dev1_tfidf)
y_predicted_test = model.predict(X_test_tfidf)

# RMSE is taken as the square root of the MSE rather than through the
# `squared=False` flag, which newer scikit-learn releases deprecate and remove.
rmse_dev0 = np.sqrt(mean_squared_error(y_expected_dev0, y_predicted_dev0))
rmse_dev1 = np.sqrt(mean_squared_error(y_expected_dev1, y_predicted_dev1))
/Users/Dominik/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py:63: FutureWarning: Arrays of bytes/strings is being converted to decimal numbers if dtype='numeric'. This behavior is deprecated in 0.24 and will be removed in 1.1 (renaming of 0.26). Please convert your data to numeric values explicitly instead.
  return f(*args, **kwargs)
/Users/Dominik/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py:63: FutureWarning: Arrays of bytes/strings is being converted to decimal numbers if dtype='numeric'. This behavior is deprecated in 0.24 and will be removed in 1.1 (renaming of 0.26). Please convert your data to numeric values explicitly instead.
  return f(*args, **kwargs)
# Show the RMSE of both development sets on one line, separated by a space.
print(" ".join(map(str, (rmse_dev0, rmse_dev1))))
24.077488520623103 22.447122551358966
# Write one prediction per line to the out.tsv of each split. The `with`
# block guarantees each file is flushed and closed before the next one is
# opened, instead of relying on garbage collection to do so.
for out_path, predictions in (
    ("dev-0/out.tsv", y_predicted_dev0),
    ("dev-1/out.tsv", y_predicted_dev1),
    ("test-A/out.tsv", y_predicted_test),
):
    with open(out_path, mode='w') as out_file:
        out_file.writelines(str(value) + '\n' for value in predictions)