"""Train a TF-IDF + logistic-regression text classifier and write predictions.

The script reads training texts from ``train/in.tsv.xz`` and integer labels
from ``train/expected.tsv``, fits a ``TfidfVectorizer`` followed by a
``LogisticRegression`` model, and then writes one predicted label per line to
``out.tsv`` in each of the ``dev-0``, ``dev-1`` and ``test-A`` directories.

All work happens at module top level, so running (or importing) this file
performs the full train/predict cycle.
"""
import lzma

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# (label used in the progress message, directory holding in.tsv / out.tsv)
EVAL_SPLITS = (
    ("dev-0", "dev-0"),
    ("dev-1", "dev-1"),
    ("test", "test-A"),
)


def read_first_column(path):
    """Return the stripped first tab-separated field of every line in *path*.

    The file is read as UTF-8 text.
    """
    with open(path, "r", encoding="utf-8") as in_file:
        return [line.split("\t")[0].strip() for line in in_file]


def write_predictions(path, predictions):
    """Write each item of *predictions* to *path*, one per line, as UTF-8."""
    with open(path, "w", encoding="utf-8") as out_file:
        out_file.writelines(str(pred) + "\n" for pred in predictions)


def predict_split(directory, vectorizer, model):
    """Predict labels for ``<directory>/in.tsv`` into ``<directory>/out.tsv``.

    *vectorizer* and *model* must already be fitted; the texts are only
    transformed (never re-fitted) before prediction.
    """
    texts = read_first_column(directory + "/in.tsv")
    features = vectorizer.transform(texts)
    write_predictions(directory + "/out.tsv", model.predict(features))


print("Reading train_in...")
# NOTE(review): training keeps the whole stripped line (every tab-separated
# column), while prediction below uses only the first column. Confirm that
# train/in.tsv.xz has a single column, or that this difference is intended.
with lzma.open('train/in.tsv.xz', 'rt', encoding="utf-8") as train_in:
    X_train = [line.strip() for line in train_in]

print("Reading train_expected")
# Explicit encoding so the read does not depend on the platform locale.
with open('train/expected.tsv', 'rt', encoding="utf-8") as train_expected:
    Y_train = [int(line.strip()) for line in train_expected]

# Fail fast: without this check the mismatch is only reported by model.fit,
# after the expensive TF-IDF fit has already run.
if len(X_train) != len(Y_train):
    raise ValueError(
        f"train/in.tsv.xz has {len(X_train)} rows but "
        f"train/expected.tsv has {len(Y_train)} labels"
    )

print("Training TFIDF...")
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    decode_error="replace",
    stop_words="english",
    max_df=0.3,
    max_features=500000,
)
# Rebinding X_train lets the raw text list be freed once it is vectorized.
X_train = vectorizer.fit_transform(X_train)

print("Training...")
model = LogisticRegression()
model.fit(X_train, Y_train)

for label, directory in EVAL_SPLITS:
    print(f"Predicting {label}...")
    predict_split(directory, vectorizer, model)