petite-difference-challenge2/run.py
2022-04-26 22:13:43 +02:00

68 lines
2.0 KiB
Python

#!/usr/bin/env python
import os
import pandas as pd
from sklearn.metrics import accuracy_score
from lzma import open as open_xz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
def read_xz_file(path, filename, cutoff=None):
    """Read an xz-compressed UTF-8 text file into a one-column DataFrame.

    Every line has its leading and trailing whitespace removed with
    ``str.strip`` and becomes one row of the ``col_name`` column.

    Args:
        path: Directory containing the file.
        filename: Name of the xz-compressed file inside ``path``.
        cutoff: Maximum number of lines to read. A falsy value (``None``
            or ``0``) reads the whole file.

    Returns:
        A ``pandas.DataFrame`` with the single column ``col_name`` holding
        the stripped lines in file order.
    """
    rows = []
    with open_xz(os.path.join(path, filename), 'rt', encoding='utf-8') as file:
        for index, line in enumerate(file):
            # A falsy cutoff disables the limit entirely.
            if cutoff and index >= cutoff:
                break
            rows.append(line.strip())
    return pd.DataFrame(rows, columns=['col_name'])
def read_tsv(path, filename, cutoff=None):
    """Read a plain UTF-8 text file into a one-column DataFrame.

    The file is not split on tabs: every line has its leading and trailing
    whitespace removed with ``str.strip`` and becomes one row of the
    ``col_name`` column.

    Args:
        path: Directory containing the file.
        filename: Name of the file inside ``path``.
        cutoff: Maximum number of lines to read. A falsy value (``None``
            or ``0``) reads the whole file.

    Returns:
        A ``pandas.DataFrame`` with the single column ``col_name`` holding
        the stripped lines in file order.
    """
    rows = []
    with open(os.path.join(path, filename), 'r', encoding='utf-8') as file:
        for index, line in enumerate(file):
            # A falsy cutoff disables the limit entirely.
            if cutoff and index >= cutoff:
                break
            rows.append(line.strip())
    return pd.DataFrame(rows, columns=['col_name'])
def evaluate_and_save(path, file, model, vectorizer):
    """Predict a label for every line of one input file and save the result.

    The lines of ``path/file`` are loaded with ``read_tsv``, turned into
    features with ``vectorizer.transform`` and passed to ``model.predict``.
    The predictions are written, one per line, to ``path/out.tsv``
    (overwriting any existing file).

    Args:
        path: Directory holding the input file; ``out.tsv`` is written here.
        file: Name of the input file inside ``path``.
        model: Classifier exposing ``predict(features)``.
        vectorizer: Feature extractor exposing ``transform(texts)``; only
            ``transform`` is called here, so it is expected to be fitted
            by the caller.

    Returns:
        None.
    """
    texts = read_tsv(path, file)['col_name'].values
    features = vectorizer.transform(texts)
    predicted = model.predict(features)
    # Explicit encoding keeps the output independent of the platform
    # default and consistent with how the inputs are read.
    with open(os.path.join(path, 'out.tsv'), 'w', encoding='utf-8') as f:
        f.writelines(f'{value}\n' for value in predicted)
def main():
    """Train a TF-IDF + logistic-regression classifier and write predictions.

    Loads at most ``train_cutoff`` lines of texts from ``train/in.tsv.xz``
    and of labels from ``train/expected.tsv``, fits a ``TfidfVectorizer``
    and a ``LogisticRegression`` on them, then writes predictions for the
    ``dev-0``, ``dev-1`` and ``test-A`` inputs via ``evaluate_and_save``.

    Raises:
        ValueError: If the numbers of training texts and labels differ.
    """
    # One shared limit so texts and labels are truncated identically.
    train_cutoff = 500000
    train_x = read_xz_file('train', 'in.tsv.xz', train_cutoff)
    train_y = read_tsv('train', 'expected.tsv', train_cutoff)

    # Fail before the expensive vectorisation instead of inside model.fit.
    if len(train_x) != len(train_y):
        raise ValueError(
            f'train/in.tsv.xz has {len(train_x)} rows but '
            f'train/expected.tsv has {len(train_y)}'
        )

    tfidf_vectorizer = TfidfVectorizer()
    train_x_vectorized = tfidf_vectorizer.fit_transform(train_x['col_name'].values)

    model = LogisticRegression()
    model.fit(train_x_vectorized, train_y['col_name'].values)

    for path in ('dev-0', 'dev-1', 'test-A'):
        evaluate_and_save(path, 'in.tsv', model, tfidf_vectorizer)
# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()