Add solution
This commit is contained in:
commit
142f0ca72e
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
.idea
|
||||||
|
train.tsv
|
25
README.md
Normal file
25
README.md
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
|
||||||
|
Sport Texts Classification Challenge - Ball
|
||||||
|
======================
|
||||||
|
|
||||||
|
Guess whether the sport described in a Polish-language article involves a ball. Evaluation metrics: Accuracy, Likelihood.
|
||||||
|
|
||||||
|
Classes
|
||||||
|
-------
|
||||||
|
|
||||||
|
* `1` — ball
|
||||||
|
* `0` — no-ball
|
||||||
|
|
||||||
|
Directory structure
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
* `README.md` — this file
|
||||||
|
* `config.txt` — configuration file
|
||||||
|
* `train/` — directory with training data
|
||||||
|
* `train/train.tsv` — sample train set
|
||||||
|
* `dev-0/` — directory with dev (test) data
|
||||||
|
* `dev-0/in.tsv` — input data for the dev set
|
||||||
|
* `dev-0/expected.tsv` — expected (reference) data for the dev set
|
||||||
|
* `test-A` — directory with test data
|
||||||
|
* `test-A/in.tsv` — input data for the test set
|
||||||
|
* `test-A/expected.tsv` — expected (reference) data for the test set
|
BIN
__pycache__/model.cpython-39.pyc
Normal file
BIN
__pycache__/model.cpython-39.pyc
Normal file
Binary file not shown.
1
config.txt
Normal file
1
config.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
--metric Likelihood --metric Accuracy --precision 5
|
5452
dev-0/expected.tsv
Normal file
5452
dev-0/expected.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5452
dev-0/in.tsv
Normal file
5452
dev-0/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5452
dev-0/out.tsv
Normal file
5452
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
38
model.py
Normal file
38
model.py
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
import torch
import numpy as np

# Model hyperparameters: n_features matches the Doc2Vec vector size used by
# the training pipeline; batch_size is shared by training and evaluation.
n_features = 1000
batch_size = 5


class NeuralNetworkModel(torch.nn.Module):
    """Two-layer feed-forward binary classifier.

    Maps an ``n_features``-dimensional document vector to a probability in
    (0, 1) via Linear -> ReLU -> Linear -> Sigmoid.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(n_features, 500)
        self.fc2 = torch.nn.Linear(500, 1)

    def forward(self, x):
        """Return P(class == 1) for each row of ``x``.

        ``x`` is expected to be a float tensor of shape (batch, n_features);
        the result has shape (batch, 1) with values in (0, 1).
        """
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x
|
||||||
|
|
||||||
|
def get_loss_acc(model, X_dataset, Y_dataset, criterion, optimizer, batch_size=5):
    """Evaluate ``model`` on a dataset and return ``(mean_loss, accuracy)``.

    Bug fix: the original version called ``loss.backward()`` and
    ``optimizer.step()`` in this loop, so every "evaluation" silently kept
    training the model.  Evaluation is now side-effect free: the model is
    put in eval mode and the whole pass runs under ``torch.no_grad()``.

    Parameters
    ----------
    model : torch.nn.Module producing probabilities in (0, 1).
    X_dataset : numpy array of input vectors, shape (N, n_features).
    Y_dataset : numpy array of 0/1 labels, shape (N,) or (N, 1).
    criterion : loss function (e.g. ``torch.nn.BCELoss``).
    optimizer : unused; retained so existing call sites keep working.
    batch_size : minibatch size (default matches the module-level constant).
    """
    loss_score = 0.0
    acc_score = 0
    items_total = 0
    model.eval()
    with torch.no_grad():
        for i in range(0, Y_dataset.shape[0], batch_size):
            X = torch.tensor(X_dataset[i:i + batch_size].astype(np.float32))
            Y = torch.tensor(
                Y_dataset[i:i + batch_size].astype(np.float32)
            ).reshape(-1, 1)
            Y_predictions = model(X)
            # A prediction counts as class 1 when the probability exceeds 0.5.
            acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
            items_total += Y.shape[0]
            loss = criterion(Y_predictions, Y)
            # Weight the batch loss by its size so the mean is exact even
            # when the last batch is short.
            loss_score += loss.item() * Y.shape[0]
    return (loss_score / items_total), (acc_score / items_total)
|
82
solution.py
Normal file
82
solution.py
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import torch
import numpy as np

from model import n_features, NeuralNetworkModel, get_loss_acc, batch_size

# Polish stopword list, one word per line, loaded into a set so the
# per-word membership test in preprocess() is O(1) instead of O(len(list)).
# Source: https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
with open('stopwords') as f:
    stopwords = {line.rstrip() for line in f}
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess(doc):
    """Tokenize one document for Doc2Vec.

    Lowercases ``doc``, splits on single spaces (matching the original
    tokenization exactly), and drops empty tokens and Polish stopwords.
    Returns the remaining tokens as a list of strings.
    """
    tokens = doc.lower().split(' ')
    return [word for word in tokens if word and word not in stopwords]
|
||||||
|
|
||||||
|
|
||||||
|
# Shared pipeline objects.  ``vectorizer`` starts as an untrained placeholder
# and is rebound to a trained Doc2Vec instance inside train(); the classifier,
# loss, and optimizer are used by both train() and classify().
vectorizer = Doc2Vec()
model = NeuralNetworkModel()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
|
||||||
|
|
||||||
|
def train():
    """Train the Doc2Vec vectorizer and the neural classifier.

    Reads ``train/train.tsv`` (one ``label<TAB>document`` per line, capped
    at the first 10 000 rows), fits Doc2Vec on the preprocessed documents,
    then runs 5 epochs of minibatch SGD on the resulting document vectors.
    Rebinds the module-level ``vectorizer`` as a side effect and prints
    the epoch number plus (loss, accuracy) after each epoch.

    Note: the original loop also accumulated ``loss_score``/``acc_score``/
    ``items_total`` during training but never used them (reporting goes
    through ``get_loss_acc``); that dead work has been removed.
    """
    with open('train/train.tsv') as f:
        docs = [line.rstrip() for line in f]

    docs_preprocessed = []
    y = []
    # Cap at 10 000 documents to keep Doc2Vec/SGD training time manageable.
    for line in docs[:10000]:
        fields = line.split('\t')
        y.append(fields[0])
        docs_preprocessed.append(preprocess(fields[1]))
    y = np.reshape([int(value) for value in y], (len(y), 1))

    tagged_documents = [
        TaggedDocument(doc, [i]) for i, doc in enumerate(docs_preprocessed)
    ]
    global vectorizer
    vectorizer = Doc2Vec(
        tagged_documents, min_count=10, epochs=300, dm=0, vector_size=n_features
    )
    # Vectors are stored in training order, so rows of x align with rows of y.
    x = vectorizer.dv.vectors

    for epoch in range(5):
        model.train()
        for i in range(0, y.shape[0], batch_size):
            X = torch.tensor(x[i:i + batch_size].astype(np.float32))
            Y = torch.tensor(y[i:i + batch_size].astype(np.float32)).reshape(-1, 1)
            Y_predictions = model(X)
            optimizer.zero_grad()
            loss = criterion(Y_predictions, Y)
            loss.backward()
            optimizer.step()
        print(epoch)
        print(get_loss_acc(model, x, y, criterion, optimizer))
|
||||||
|
|
||||||
|
def classify(path):
    """Classify the documents in ``path + 'in.tsv'``.

    Infers a Doc2Vec vector for each preprocessed input line, runs the
    classifier, thresholds the probabilities at 0.5, and writes one 0/1
    label per line to ``path + 'out.tsv'``.  Uses the trained module-level
    ``vectorizer`` and ``model`` — call train() first.
    """
    with open(path + 'in.tsv') as f:
        docs = [preprocess(line.rstrip()) for line in f]

    test_x = np.array(
        [vectorizer.infer_vector(doc) for doc in docs], np.float32
    )
    predictions = model(torch.tensor(test_x)).detach().numpy()[:, 0]

    labels = [1 if prediction >= 0.5 else 0 for prediction in predictions]
    with open(path + 'out.tsv', 'w') as out:
        for label in labels:
            out.write("%i\n" % label)
|
||||||
|
|
||||||
|
|
||||||
|
# Guard the entry point so importing this module (e.g. for testing) does not
# trigger a full retrain; behavior when run as a script is unchanged.
if __name__ == '__main__':
    train()
    classify('dev-0/')
    # classify('test-A/')
|
350
stopwords
Normal file
350
stopwords
Normal file
@ -0,0 +1,350 @@
|
|||||||
|
a
|
||||||
|
aby
|
||||||
|
ach
|
||||||
|
acz
|
||||||
|
aczkolwiek
|
||||||
|
aj
|
||||||
|
albo
|
||||||
|
ale
|
||||||
|
alez
|
||||||
|
ależ
|
||||||
|
ani
|
||||||
|
az
|
||||||
|
aż
|
||||||
|
bardziej
|
||||||
|
bardzo
|
||||||
|
beda
|
||||||
|
bedzie
|
||||||
|
bez
|
||||||
|
deda
|
||||||
|
będą
|
||||||
|
bede
|
||||||
|
będę
|
||||||
|
będzie
|
||||||
|
bo
|
||||||
|
bowiem
|
||||||
|
by
|
||||||
|
byc
|
||||||
|
być
|
||||||
|
byl
|
||||||
|
byla
|
||||||
|
byli
|
||||||
|
bylo
|
||||||
|
byly
|
||||||
|
był
|
||||||
|
była
|
||||||
|
było
|
||||||
|
były
|
||||||
|
bynajmniej
|
||||||
|
cala
|
||||||
|
cali
|
||||||
|
caly
|
||||||
|
cała
|
||||||
|
cały
|
||||||
|
ci
|
||||||
|
cie
|
||||||
|
ciebie
|
||||||
|
cię
|
||||||
|
co
|
||||||
|
cokolwiek
|
||||||
|
cos
|
||||||
|
coś
|
||||||
|
czasami
|
||||||
|
czasem
|
||||||
|
czemu
|
||||||
|
czy
|
||||||
|
czyli
|
||||||
|
daleko
|
||||||
|
dla
|
||||||
|
dlaczego
|
||||||
|
dlatego
|
||||||
|
do
|
||||||
|
dobrze
|
||||||
|
dokad
|
||||||
|
dokąd
|
||||||
|
dosc
|
||||||
|
dość
|
||||||
|
duzo
|
||||||
|
dużo
|
||||||
|
dwa
|
||||||
|
dwaj
|
||||||
|
dwie
|
||||||
|
dwoje
|
||||||
|
dzis
|
||||||
|
dzisiaj
|
||||||
|
dziś
|
||||||
|
gdy
|
||||||
|
gdyby
|
||||||
|
gdyz
|
||||||
|
gdyż
|
||||||
|
gdzie
|
||||||
|
gdziekolwiek
|
||||||
|
gdzies
|
||||||
|
gdzieś
|
||||||
|
go
|
||||||
|
i
|
||||||
|
ich
|
||||||
|
ile
|
||||||
|
im
|
||||||
|
inna
|
||||||
|
inne
|
||||||
|
inny
|
||||||
|
innych
|
||||||
|
iz
|
||||||
|
iż
|
||||||
|
ja
|
||||||
|
jak
|
||||||
|
jakas
|
||||||
|
jakaś
|
||||||
|
jakby
|
||||||
|
jaki
|
||||||
|
jakichs
|
||||||
|
jakichś
|
||||||
|
jakie
|
||||||
|
jakis
|
||||||
|
jakiś
|
||||||
|
jakiz
|
||||||
|
jakiż
|
||||||
|
jakkolwiek
|
||||||
|
jako
|
||||||
|
jakos
|
||||||
|
jakoś
|
||||||
|
ją
|
||||||
|
je
|
||||||
|
jeden
|
||||||
|
jedna
|
||||||
|
jednak
|
||||||
|
jednakze
|
||||||
|
jednakże
|
||||||
|
jedno
|
||||||
|
jego
|
||||||
|
jej
|
||||||
|
jemu
|
||||||
|
jesli
|
||||||
|
jest
|
||||||
|
jestem
|
||||||
|
jeszcze
|
||||||
|
jeśli
|
||||||
|
jezeli
|
||||||
|
jeżeli
|
||||||
|
juz
|
||||||
|
już
|
||||||
|
kazdy
|
||||||
|
każdy
|
||||||
|
kiedy
|
||||||
|
kilka
|
||||||
|
kims
|
||||||
|
kimś
|
||||||
|
kto
|
||||||
|
ktokolwiek
|
||||||
|
ktora
|
||||||
|
ktore
|
||||||
|
ktorego
|
||||||
|
ktorej
|
||||||
|
ktory
|
||||||
|
ktorych
|
||||||
|
ktorym
|
||||||
|
ktorzy
|
||||||
|
ktos
|
||||||
|
ktoś
|
||||||
|
która
|
||||||
|
które
|
||||||
|
którego
|
||||||
|
której
|
||||||
|
który
|
||||||
|
których
|
||||||
|
którym
|
||||||
|
którzy
|
||||||
|
ku
|
||||||
|
lat
|
||||||
|
lecz
|
||||||
|
lub
|
||||||
|
ma
|
||||||
|
mają
|
||||||
|
mało
|
||||||
|
mam
|
||||||
|
mi
|
||||||
|
miedzy
|
||||||
|
między
|
||||||
|
mimo
|
||||||
|
mna
|
||||||
|
mną
|
||||||
|
mnie
|
||||||
|
moga
|
||||||
|
mogą
|
||||||
|
moi
|
||||||
|
moim
|
||||||
|
moj
|
||||||
|
moja
|
||||||
|
moje
|
||||||
|
moze
|
||||||
|
mozliwe
|
||||||
|
mozna
|
||||||
|
może
|
||||||
|
możliwe
|
||||||
|
można
|
||||||
|
mój
|
||||||
|
mu
|
||||||
|
musi
|
||||||
|
my
|
||||||
|
na
|
||||||
|
nad
|
||||||
|
nam
|
||||||
|
nami
|
||||||
|
nas
|
||||||
|
nasi
|
||||||
|
nasz
|
||||||
|
nasza
|
||||||
|
nasze
|
||||||
|
naszego
|
||||||
|
naszych
|
||||||
|
natomiast
|
||||||
|
natychmiast
|
||||||
|
nawet
|
||||||
|
nia
|
||||||
|
nią
|
||||||
|
nic
|
||||||
|
nich
|
||||||
|
nie
|
||||||
|
niech
|
||||||
|
niego
|
||||||
|
niej
|
||||||
|
niemu
|
||||||
|
nigdy
|
||||||
|
nim
|
||||||
|
nimi
|
||||||
|
niz
|
||||||
|
niż
|
||||||
|
no
|
||||||
|
o
|
||||||
|
obok
|
||||||
|
od
|
||||||
|
około
|
||||||
|
on
|
||||||
|
ona
|
||||||
|
one
|
||||||
|
oni
|
||||||
|
ono
|
||||||
|
oraz
|
||||||
|
oto
|
||||||
|
owszem
|
||||||
|
pan
|
||||||
|
pana
|
||||||
|
pani
|
||||||
|
po
|
||||||
|
pod
|
||||||
|
podczas
|
||||||
|
pomimo
|
||||||
|
ponad
|
||||||
|
poniewaz
|
||||||
|
ponieważ
|
||||||
|
powinien
|
||||||
|
powinna
|
||||||
|
powinni
|
||||||
|
powinno
|
||||||
|
poza
|
||||||
|
prawie
|
||||||
|
przeciez
|
||||||
|
przecież
|
||||||
|
przed
|
||||||
|
przede
|
||||||
|
przedtem
|
||||||
|
przez
|
||||||
|
przy
|
||||||
|
roku
|
||||||
|
rowniez
|
||||||
|
również
|
||||||
|
sam
|
||||||
|
sama
|
||||||
|
są
|
||||||
|
sie
|
||||||
|
się
|
||||||
|
skad
|
||||||
|
skąd
|
||||||
|
soba
|
||||||
|
sobą
|
||||||
|
sobie
|
||||||
|
sposob
|
||||||
|
sposób
|
||||||
|
swoje
|
||||||
|
ta
|
||||||
|
tak
|
||||||
|
taka
|
||||||
|
taki
|
||||||
|
takie
|
||||||
|
takze
|
||||||
|
także
|
||||||
|
tam
|
||||||
|
te
|
||||||
|
tego
|
||||||
|
tej
|
||||||
|
ten
|
||||||
|
teraz
|
||||||
|
też
|
||||||
|
to
|
||||||
|
toba
|
||||||
|
tobą
|
||||||
|
tobie
|
||||||
|
totez
|
||||||
|
toteż
|
||||||
|
totobą
|
||||||
|
trzeba
|
||||||
|
tu
|
||||||
|
tutaj
|
||||||
|
twoi
|
||||||
|
twoim
|
||||||
|
twoj
|
||||||
|
twoja
|
||||||
|
twoje
|
||||||
|
twój
|
||||||
|
twym
|
||||||
|
ty
|
||||||
|
tych
|
||||||
|
tylko
|
||||||
|
tym
|
||||||
|
u
|
||||||
|
w
|
||||||
|
wam
|
||||||
|
wami
|
||||||
|
was
|
||||||
|
wasz
|
||||||
|
wasza
|
||||||
|
wasze
|
||||||
|
we
|
||||||
|
według
|
||||||
|
wiele
|
||||||
|
wielu
|
||||||
|
więc
|
||||||
|
więcej
|
||||||
|
wlasnie
|
||||||
|
właśnie
|
||||||
|
wszyscy
|
||||||
|
wszystkich
|
||||||
|
wszystkie
|
||||||
|
wszystkim
|
||||||
|
wszystko
|
||||||
|
wtedy
|
||||||
|
wy
|
||||||
|
z
|
||||||
|
za
|
||||||
|
zaden
|
||||||
|
zadna
|
||||||
|
zadne
|
||||||
|
zadnych
|
||||||
|
zapewne
|
||||||
|
zawsze
|
||||||
|
ze
|
||||||
|
zeby
|
||||||
|
zeznowu
|
||||||
|
zł
|
||||||
|
znow
|
||||||
|
znowu
|
||||||
|
znów
|
||||||
|
zostal
|
||||||
|
został
|
||||||
|
żaden
|
||||||
|
żadna
|
||||||
|
żadne
|
||||||
|
żadnych
|
||||||
|
że
|
||||||
|
żeby
|
5447
test-A/in.tsv
Normal file
5447
test-A/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
train/train.tsv.gz
Normal file
BIN
train/train.tsv.gz
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user