from functools import lru_cache

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression


@lru_cache(maxsize=4)
def _fit_model(train_path):
    """Fit the TF-IDF vectorizer and linear model on the file at *train_path*.

    The result is cached per ``train_path`` so that several prediction runs
    against the same training file read and fit it only once per process.
    The cache is keyed on the path only; it does not notice if the file's
    contents change while the process is running.

    Returns:
        A ``(vectorizer, model)`` tuple, both already fitted.
    """
    # The training file is tab-separated with no header row: column 4 is
    # used as the text to vectorize and column 0 as the regression target.
    # NOTE(review): read_csv runs with its default quoting here; if the text
    # column can contain stray '"' characters, rows may be merged -- confirm
    # whether quoting=csv.QUOTE_NONE is needed for this data.
    train = pd.read_csv(train_path, sep='\t', header=None)
    texts = train[4]
    targets = train[0]

    vectorizer = TfidfVectorizer()
    features = vectorizer.fit_transform(texts)

    model = LinearRegression()
    model.fit(features, targets)
    return vectorizer, model


def linear_regression(train_path, predict_path, out_path):
    """Train a TF-IDF + linear regression model and write predictions.

    Args:
        train_path: Tab-separated training file without a header row.
            Column 4 supplies the text and column 0 the target value.
        predict_path: UTF-8 text file; every line is treated as one
            document to predict for.
        out_path: Destination file. One predicted value is written per
            row, with no header and no index column.
    """
    vectorizer, model = _fit_model(train_path)

    # Each whole line (trailing newline included) is one input document.
    # NOTE(review): training uses only column 4 of the TSV, while here the
    # full line is vectorized -- confirm the prediction files hold text only.
    with open(predict_path, encoding='utf-8') as f:
        pred_x = list(f)

    pred_y = model.predict(vectorizer.transform(pred_x))
    pd.DataFrame(pred_y).to_csv(out_path, header=False, index=False)


linear_regression("train/train.tsv", "dev-0/in.tsv", "dev-0/out.tsv")
linear_regression("train/train.tsv", "dev-1/in.tsv", "dev-1/out.tsv")
linear_regression("train/train.tsv", "test-A/in.tsv", "test-A/out.tsv")