tfidf + lr

This commit is contained in:
Jakub Pokrywka 2023-11-16 16:41:27 +01:00
parent 29bb18a1f2
commit c5d9b752dc
6 changed files with 89081 additions and 158229 deletions

File diff suppressed because it is too large Load Diff

41
solutions/1.py Normal file
View File

@ -0,0 +1,41 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle
def _read_last_column(path):
    """Return the right-stripped last tab-separated field of each line in *path*."""
    # Explicit UTF-8 so decoding does not depend on the platform's locale default.
    with open(path, encoding='utf-8') as f:
        return [line.split('\t')[-1].rstrip() for line in f]


# Training inputs: only the last TSV column of each row is used as the text.
train_in = _read_last_column('../train/in.tsv')
# Training labels: one label per line, trailing whitespace removed.
with open('../train/expected.tsv', encoding='utf-8') as f:
    train_expected = [line.rstrip() for line in f]

# A label is encoded as its position in the sorted list of distinct labels.
classes = sorted(set(train_expected))
# Dict lookup replaces a linear `classes.index(...)` scan per training row.
class_to_index = {name: index for index, name in enumerate(classes)}

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_in)
y_train = [class_to_index[label] for label in train_expected]
model = LogisticRegression().fit(X_train, y_train)
# Accuracy measured on the training data itself (not a held-out estimate).
print(accuracy_score(y_train, model.predict(X_train)))

# Predict for every evaluation split and write one label name per line.
for DATASET in 'dev-0', 'test-A', 'test-B':
    d_in = _read_last_column(f'../{DATASET}/in.tsv')
    X_d = vectorizer.transform(d_in)
    out = model.predict(X_d)
    with open(f'../{DATASET}/out.tsv', 'w', encoding='utf-8') as f:
        # Map each predicted index back to its original label string.
        f.writelines(classes[sample] + '\n' for sample in out)

# Persist the fitted vectorizer and model next to the script.
with open('vectorizer.pickle', 'wb') as f:
    pickle.dump(vectorizer, f)
with open('model.pickle', 'wb') as f:
    pickle.dump(model, f)

BIN
solutions/model.pickle Normal file

Binary file not shown.

BIN
solutions/vectorizer.pickle Normal file

Binary file not shown.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff