#!/usr/bin/env python
"""Train a TF-IDF + logistic-regression text classifier and write predictions.

The script fits on ``train/in.tsv.xz`` (inputs) and ``train/expected.tsv``
(labels), then writes one predicted label per line to ``out.tsv`` inside each
of the ``dev-0``, ``dev-1`` and ``test-A`` directories.
"""
import os
from lzma import open as open_xz
from typing import Optional

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score  # noqa: F401  (kept; currently unused)

# Name of the single column holding one stripped input line per row.
TEXT_COLUMN = 'col_name'

# Maximum number of training lines read from each training file.
TRAIN_CUTOFF = 500000


def _read_stripped_lines(handle, cutoff=None):
    """Collect whitespace-stripped lines from an open text handle.

    Args:
        handle: Iterable of text lines (an open text-mode file object).
        cutoff: Maximum number of lines to read. A falsy value (``None`` or
            ``0``) means "no limit".

    Returns:
        A one-column ``DataFrame`` (column ``TEXT_COLUMN``) with one row per
        line that was read.
    """
    lines = []
    for index, line in enumerate(handle):
        # Stop once ``cutoff`` lines have been collected; skipped when no
        # cutoff was requested.
        if cutoff and index >= cutoff:
            break
        lines.append(line.strip())
    return pd.DataFrame(lines, columns=[TEXT_COLUMN])


def read_xz_file(path: str, filename: str,
                 cutoff: Optional[int] = None) -> pd.DataFrame:
    """Read an xz-compressed UTF-8 text file into a one-column DataFrame.

    Args:
        path: Directory containing the file.
        filename: Name of the ``.xz`` file inside ``path``.
        cutoff: Maximum number of lines to read; falsy means read everything.

    Returns:
        ``DataFrame`` with a single ``'col_name'`` column, one stripped line
        per row.
    """
    with open_xz(os.path.join(path, filename), 'rt', encoding='utf-8') as file:
        return _read_stripped_lines(file, cutoff)


def read_tsv(path: str, filename: str,
             cutoff: Optional[int] = None) -> pd.DataFrame:
    """Read a plain UTF-8 text file into a one-column DataFrame.

    Each line is kept whole (it is not split on tabs) and stripped of
    surrounding whitespace.

    Args:
        path: Directory containing the file.
        filename: Name of the file inside ``path``.
        cutoff: Maximum number of lines to read; falsy means read everything.

    Returns:
        ``DataFrame`` with a single ``'col_name'`` column, one stripped line
        per row.
    """
    with open(os.path.join(path, filename), 'r', encoding='utf-8') as file:
        return _read_stripped_lines(file, cutoff)


def evaluate_and_save(path, file, model, vectorizer):
    """Predict labels for ``path/file`` and write them to ``path/out.tsv``.

    Args:
        path: Directory holding the input file; ``out.tsv`` is written here.
        file: Name of the input file inside ``path``.
        model: Fitted estimator exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
    """
    inputs = read_tsv(path, file)
    features = vectorizer.transform(inputs[TEXT_COLUMN].values)
    predicted = model.predict(features)

    # Write with an explicit encoding so the output matches the UTF-8 inputs
    # regardless of the platform's default locale.
    with open(os.path.join(path, 'out.tsv'), 'w', encoding='utf-8') as f:
        f.writelines(f'{value}\n' for value in predicted)


def main():
    """Fit the classifier on the training split and predict every eval split."""
    train_x = read_xz_file('train', 'in.tsv.xz', TRAIN_CUTOFF)
    train_y = read_tsv('train', 'expected.tsv', TRAIN_CUTOFF)

    tfidf_vectorizer = TfidfVectorizer()
    train_x_vectorized = tfidf_vectorizer.fit_transform(
        train_x[TEXT_COLUMN].values)

    model = LogisticRegression()
    model.fit(train_x_vectorized, train_y[TEXT_COLUMN].values)

    for path, file in (('dev-0', 'in.tsv'),
                       ('dev-1', 'in.tsv'),
                       ('test-A', 'in.tsv')):
        evaluate_and_save(path, file, model, tfidf_vectorizer)


if __name__ == '__main__':
    main()