"""Train a TF-IDF + linear-regression baseline and write its predictions.

The script reads the training set from ``train/train.tsv.xz`` (target in
column 0, text in column 4), fits a ``TfidfVectorizer`` and a
``LinearRegression`` on it, prints the RMSE on ``dev-0`` and ``dev-1``, and
writes one prediction per line to ``out.tsv`` in ``dev-0``, ``dev-1`` and
``test-A``.
"""
import lzma

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# NOTE(review): the data files are assumed to be UTF-8 -- confirm. Being
# explicit keeps the result independent of the platform's default encoding.
_ENCODING = "utf-8"


def _read_lines(path):
    """Return all lines of the text file at *path* (newlines kept)."""
    with open(path, "r", encoding=_ENCODING) as handle:
        return handle.readlines()


def _parse_targets(raw_values):
    """Convert an iterable of textual numbers to a float64 array.

    ``float`` tolerates surrounding whitespace, so lines that still carry
    their trailing newline are parsed correctly.
    """
    return np.array([float(value) for value in raw_values], dtype=float)


def _rmse(y_true, y_pred):
    """Return the root mean squared error between *y_true* and *y_pred*.

    The square root is taken explicitly rather than via ``squared=False``,
    because newer scikit-learn releases deprecate and then drop that keyword.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


def _write_predictions(path, predictions):
    """Write *predictions* to *path*, one value per line."""
    with open(path, mode="w", encoding=_ENCODING) as handle:
        handle.writelines(str(value) + "\n" for value in predictions)


# --- Load data -------------------------------------------------------------
with lzma.open("train/train.tsv.xz", mode="rt", encoding=_ENCODING) as train_file:
    X_train_raw = train_file.readlines()

X_dev0 = _read_lines("dev-0/in.tsv")
y_expected_dev0 = _parse_targets(_read_lines("dev-0/expected.tsv"))
X_dev1 = _read_lines("dev-1/in.tsv")
y_expected_dev1 = _parse_targets(_read_lines("dev-1/expected.tsv"))
X_test = _read_lines("test-A/in.tsv")

# Training rows are tab-separated: column 0 is the target, column 4 the text.
X = [line.split("\t") for line in X_train_raw]
X_train = [row[4] for row in X]
# Targets must be numeric for the regressor, not the raw strings from the file.
y_expected_train = _parse_targets(row[0] for row in X)

# --- Vectorize -------------------------------------------------------------
# The vocabulary is learned on the training text only and reused as-is for
# the dev/test inputs (whole input lines are vectorized).
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_dev0_tfidf = vectorizer.transform(X_dev0)
X_dev1_tfidf = vectorizer.transform(X_dev1)
X_test_tfidf = vectorizer.transform(X_test)

# --- Fit and predict -------------------------------------------------------
model = LinearRegression()
model.fit(X_train_tfidf, y_expected_train)

y_predicted_dev0 = model.predict(X_dev0_tfidf)
y_predicted_dev1 = model.predict(X_dev1_tfidf)
y_predicted_test = model.predict(X_test_tfidf)

# --- Evaluate --------------------------------------------------------------
rmse_dev0 = _rmse(y_expected_dev0, y_predicted_dev0)
rmse_dev1 = _rmse(y_expected_dev1, y_predicted_dev1)
print(rmse_dev0, rmse_dev1)

# --- Write predictions -----------------------------------------------------
_write_predictions("dev-0/out.tsv", y_predicted_dev0)
_write_predictions("dev-1/out.tsv", y_predicted_dev1)
_write_predictions("test-A/out.tsv", y_predicted_test)