"""Train a TF-IDF + logistic-regression text classifier and write predictions.

Training texts are read from ``train/in.tsv.xz`` and integer labels from
``train/expected.tsv``. The fitted model is then applied to
``dev-0/in.tsv`` and ``test-A/in.tsv``; one predicted label per line is
written to ``dev-0/out.tsv`` and ``test-A/out.tsv`` respectively.
"""

import lzma

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


def _read_train_texts(path):
    """Return every line of the xz-compressed file *path*, stripped."""
    # NOTE(review): training keeps the whole line, while dev/test keep only
    # the first tab-separated column -- confirm that train/in.tsv.xz really
    # has a single column, otherwise the features differ between phases.
    with lzma.open(path, "rt", encoding="utf-8") as train_in:
        return [line.strip() for line in train_in]


def _read_labels(path):
    """Return one ``int`` label per line of *path*.

    Raises:
        ValueError: if a line (including a blank one) is not an integer.
    """
    with open(path, "rt", encoding="utf-8") as train_expected:
        return [int(line.strip()) for line in train_expected]


def _read_first_column(path):
    """Return the stripped first tab-separated field of each line of *path*."""
    with open(path, "r", encoding="utf-8") as in_file:
        return [line.split("\t")[0].strip() for line in in_file]


def _predict_to_file(vectorizer, model, in_path, out_path):
    """Classify the texts in *in_path* and write one label per line to *out_path*.

    *vectorizer* and *model* must already be fitted.
    """
    features = vectorizer.transform(_read_first_column(in_path))
    predictions = model.predict(features)
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(str(pred) + "\n" for pred in predictions)


def main():
    """Run the full pipeline: load data, train, predict dev-0 and test-A."""
    print("Reading train_in...")
    train_texts = _read_train_texts("train/in.tsv.xz")

    print("Reading train_expected")
    train_labels = _read_labels("train/expected.tsv")

    # Fail fast: without this the mismatch only surfaces inside model.fit(),
    # after the (expensive) TF-IDF fit has already been done.
    if len(train_texts) != len(train_labels):
        raise ValueError(
            "train/in.tsv.xz has %d lines but train/expected.tsv has %d"
            % (len(train_texts), len(train_labels))
        )

    print("Training TFIDF...")
    vectorizer = TfidfVectorizer(
        decode_error="replace",
        stop_words="english",
        max_df=0.8,
        sublinear_tf=True,
    )
    train_features = vectorizer.fit_transform(train_texts)
    # The raw texts are no longer needed once vectorized; drop them to keep
    # peak memory down while the model is being fitted.
    del train_texts

    print("Training...")
    model = LogisticRegression()
    model.fit(train_features, train_labels)

    print("Predicting dev...")
    _predict_to_file(vectorizer, model, "dev-0/in.tsv", "dev-0/out.tsv")

    print("Predicting test...")
    _predict_to_file(vectorizer, model, "test-A/in.tsv", "test-A/out.tsv")


if __name__ == "__main__":
    main()