"""Train a TF-IDF + logistic-regression classifier and predict the test-A set.

The script reads the xz-compressed training texts and their integer labels,
fits a ``TfidfVectorizer`` followed by a ``LogisticRegression`` model, and
writes one predicted label per line of ``test-A/in.tsv`` to
``test-A/out.tsv``.
"""
import lzma

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

TRAIN_INPUT_PATH = 'train/in.tsv.xz'
TRAIN_EXPECTED_PATH = 'train/expected.tsv'
TEST_INPUT_PATH = 'test-A/in.tsv'
TEST_OUTPUT_PATH = "test-A/out.tsv"


def main():
    """Run the whole pipeline: load data, fit the model, write predictions.

    Progress is reported on stdout as "step 1" .. "step 6".

    Raises:
        OSError: if any of the input files cannot be opened or the output
            file cannot be written.
        UnicodeDecodeError: if an input text is not valid UTF-8.
        ValueError: if a line of the labels file is not an integer.
    """
    # Each line of the compressed file is treated as one training sample.
    # Binary iteration splits rows on "\n" only; bytes are decoded once per row.
    with lzma.open(TRAIN_INPUT_PATH, 'r') as file:
        X = [raw_line.strip().decode("utf-8") for raw_line in file]
    print("step 1")

    with open(TRAIN_EXPECTED_PATH, 'r', encoding="utf-8") as file:
        Y = [int(line.strip()) for line in file]
    print("step 2")

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(X)
    print("step 3")

    model = LogisticRegression()
    model.fit(X, Y)
    print("step 4")

    # Read the test texts the same way the training texts are read: explicit
    # UTF-8 (not the platform default) and rows split on "\n" only, so a stray
    # "\r" inside a row cannot produce extra rows and misalign the output.
    with open(TEST_INPUT_PATH, 'r', encoding="utf-8", newline="\n") as file:
        X_dev = [line.strip() for line in file]
    print("step 5")

    # Reuse the vocabulary fitted on the training data; never refit here.
    X_dev = vectorizer.transform(X_dev)
    prediction = model.predict(X_dev)
    print("step 6")

    # "w" (not "a"): a rerun must replace earlier predictions rather than
    # append to them, otherwise the output stops lining up with the input.
    # The context manager closes the file even if a write fails.
    with open(TEST_OUTPUT_PATH, "w", encoding="utf-8") as out_file:
        out_file.writelines(str(p) + '\n' for p in prediction)


if __name__ == "__main__":
    main()