ireland-news-headlines/solutions/1.py
Jakub Pokrywka c5d9b752dc tfidf + lr
2023-11-16 16:41:27 +01:00

42 lines
1.1 KiB
Python

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle
# TF-IDF + logistic-regression baseline: fit on ../train, write predictions
# for each evaluation split, then pickle the fitted vectorizer and model.

# Text files are opened with an explicit encoding so that decoding does not
# depend on the platform's locale default.
ENCODING = 'utf-8'


def _read_last_column(path):
    """Return the last tab-separated field of each line in *path*.

    Trailing whitespace (including the newline) is stripped from every
    returned field. One list entry is produced per line of the file.
    """
    with open(path, encoding=ENCODING) as f:
        return [line.split('\t')[-1].rstrip() for line in f]


train_in = _read_last_column('../train/in.tsv')
with open('../train/expected.tsv', encoding=ENCODING) as f:
    train_expected = [line.rstrip() for line in f]

# Sorted so that the class -> integer id mapping is deterministic.
classes = sorted(set(train_expected))
# Built once: a dict lookup replaces a linear `classes.index(...)` scan per
# training label while yielding exactly the same indices.
class_to_index = {name: index for index, name in enumerate(classes)}

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_in)
y_train = [class_to_index[label] for label in train_expected]
model = LogisticRegression().fit(X_train, y_train)

# Accuracy on the training data itself (not a held-out estimate).
print(accuracy_score(y_train, model.predict(X_train)))

for DATASET in 'dev-0', 'test-A', 'test-B':
    d_in = _read_last_column(f'../{DATASET}/in.tsv')
    # Reuse the vocabulary/IDF weights fitted on the training set.
    X_d = vectorizer.transform(d_in)
    out = model.predict(X_d)
    with open(f'../{DATASET}/out.tsv', 'w', encoding=ENCODING) as f:
        # Map each predicted integer id back to its class name, one per line.
        f.writelines(classes[sample] + '\n' for sample in out)

# Persist the fitted objects to the current working directory.
with open('vectorizer.pickle', 'wb') as f:
    pickle.dump(vectorizer, f)
with open('model.pickle', 'wb') as f:
    pickle.dump(model, f)