42 lines
1.1 KiB
Python
42 lines
1.1 KiB
Python
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.linear_model import LogisticRegression
|
|
from sklearn.metrics import accuracy_score
|
|
import pickle
|
|
|
|
|
|
|
|
# Training texts: for every line of in.tsv keep only what follows the last
# tab (the whole line when there is no tab), with trailing whitespace removed.
with open('../train/in.tsv') as source:
    train_in = [row.rpartition('\t')[2].rstrip() for row in source]

# Training labels: one per line of expected.tsv, trailing whitespace removed.
# Texts and labels are later passed to the classifier together, so the two
# files are expected to line up one-to-one.
with open('../train/expected.tsv') as source:
    train_expected = [row.rstrip() for row in source]
|
|
|
|
# Label vocabulary: the distinct class names in sorted order. A class's
# integer id is its position in this list, which is also how predictions
# are mapped back to names.
classes = sorted(set(train_expected))

# Learn the TF-IDF vocabulary and weights from the training texts and turn
# those texts into the feature matrix in one pass.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_in)

# Encode labels as integer ids. The dict is built once so each lookup is
# constant-time; calling classes.index() per sample would rescan the class
# list for every training example. The ids are identical either way because
# `classes` contains no duplicates.
class_ids = {name: idx for idx, name in enumerate(classes)}
y_train = [class_ids[label] for label in train_expected]

model = LogisticRegression().fit(X_train, y_train)

# Accuracy measured on the training data itself (not a held-out estimate).
print(accuracy_score(y_train, model.predict(X_train)))
|
|
|
|
|
|
|
|
# Label each evaluation split with the trained model and write the predicted
# class names, one per line, to that split's out.tsv.
for DATASET in ('dev-0', 'test-A', 'test-B'):
    # Same preprocessing as for training: text after the last tab, trailing
    # whitespace removed.
    with open(f'../{DATASET}/in.tsv') as source:
        texts = [row.rpartition('\t')[2].rstrip() for row in source]

    # Reuse the already-fitted vectorizer (transform only, no refit).
    predicted_ids = model.predict(vectorizer.transform(texts))

    # Map each predicted integer id back to its class name.
    with open(f'../{DATASET}/out.tsv', 'w') as sink:
        sink.writelines(classes[class_id] + '\n' for class_id in predicted_ids)
|
|
|
|
# Serialize the fitted vectorizer and then the classifier with pickle, each
# into its own file in the current working directory.
for path, artifact in (('vectorizer.pickle', vectorizer), ('model.pickle', model)):
    with open(path, 'wb') as sink:
        pickle.dump(artifact, sink)
|