68 lines
2.0 KiB
Python
68 lines
2.0 KiB
Python
#!/usr/bin/env python
|
|
|
|
import os
|
|
import pandas as pd
|
|
from sklearn.metrics import accuracy_score
|
|
from lzma import open as open_xz
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.linear_model import LogisticRegression
|
|
|
|
|
|
def read_xz_file(path, filename, cutoff=None):
    """Read an xz-compressed UTF-8 text file into a one-column DataFrame.

    Every line has its leading and trailing whitespace stripped and becomes
    one row of the column ``col_name``.

    Args:
        path: Directory containing the file.
        filename: Name of the xz-compressed file inside *path*.
        cutoff: Maximum number of lines to read. ``None`` (the default)
            reads the whole file; ``0`` or a negative value reads nothing.

    Returns:
        A ``pandas.DataFrame`` with the single column ``col_name``.
    """
    rows = []
    with open_xz(os.path.join(path, filename), 'rt', encoding='utf-8') as file:
        for index, line in enumerate(file):
            # Compare against None explicitly so that cutoff=0 means
            # "read no lines" rather than being mistaken for "no limit".
            if cutoff is not None and index >= cutoff:
                break
            rows.append(line.strip())
    return pd.DataFrame(rows, columns=['col_name'])
|
|
|
|
|
|
def read_tsv(path, filename, cutoff=None):
    """Read a plain UTF-8 text file into a one-column DataFrame.

    Lines are not split into fields: every line has its leading and trailing
    whitespace stripped and becomes one row of the column ``col_name``.

    Args:
        path: Directory containing the file.
        filename: Name of the file inside *path*.
        cutoff: Maximum number of lines to read. ``None`` (the default)
            reads the whole file; ``0`` or a negative value reads nothing.

    Returns:
        A ``pandas.DataFrame`` with the single column ``col_name``.
    """
    rows = []
    with open(os.path.join(path, filename), 'r', encoding='utf-8') as file:
        for index, line in enumerate(file):
            # Compare against None explicitly so that cutoff=0 means
            # "read no lines" rather than being mistaken for "no limit".
            if cutoff is not None and index >= cutoff:
                break
            rows.append(line.strip())
    return pd.DataFrame(rows, columns=['col_name'])
|
|
|
|
|
|
def evaluate_and_save(path, file, model, vectorizer):
    """Predict labels for one input file and write them to ``out.tsv``.

    The lines of ``path/file`` are passed through ``vectorizer.transform``
    and then ``model.predict``; one prediction per line is written to
    ``path/out.tsv``, overwriting any existing file. No score is computed
    here, only the predictions are saved.

    Args:
        path: Directory holding the input file; ``out.tsv`` is written here.
        file: Name of the input file inside *path*, one document per line.
        model: Object exposing ``predict(features)``.
        vectorizer: Object exposing ``transform(texts)``.
    """
    documents = read_tsv(path, file)['col_name'].values
    features = vectorizer.transform(documents)
    predicted = model.predict(features)

    # Explicit encoding: the inputs are read as UTF-8, so the output should
    # not depend on the platform's default text encoding.
    with open(os.path.join(path, 'out.tsv'), 'w', encoding='utf-8') as f:
        f.writelines(f'{value}\n' for value in predicted)
|
|
|
|
|
|
def main(cutoff=500000):
    """Train a TF-IDF + logistic-regression model and write predictions.

    Fits the vectorizer and the classifier on ``train/in.tsv.xz`` and
    ``train/expected.tsv``, then writes predictions for ``in.tsv`` in the
    ``dev-0``, ``dev-1`` and ``test-A`` directories.

    Args:
        cutoff: Maximum number of training lines read from each of the two
            training files. Pass ``None`` to train on the full files.
    """
    # The same cutoff is used for both files so that the documents and the
    # labels handed to ``model.fit`` have the same number of rows.
    train_x = read_xz_file('train', 'in.tsv.xz', cutoff)
    train_y = read_tsv('train', 'expected.tsv', cutoff)

    tfidf_vectorizer = TfidfVectorizer()
    train_x_vectorized = tfidf_vectorizer.fit_transform(train_x['col_name'].values)

    model = LogisticRegression()
    model.fit(train_x_vectorized, train_y['col_name'].values)

    for path in ('dev-0', 'dev-1', 'test-A'):
        evaluate_and_save(path, 'in.tsv', model, tfidf_vectorizer)
|
|
|
|
|
|
# Run the training/prediction pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
|