Add solution
This commit is contained in:
commit
142f0ca72e
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
.idea
|
||||
train.tsv
|
25
README.md
Normal file
25
README.md
Normal file
@ -0,0 +1,25 @@
|
||||
|
||||
Sport Texts Classification Challenge - Ball
|
||||
======================
|
||||
|
||||
Guess whether the sport is connected to the ball for a Polish article. Evaluation metrics: Accuracy, Likelihood.
|
||||
|
||||
Classes
|
||||
-------
|
||||
|
||||
* `1` — ball
|
||||
* `0` — no-ball
|
||||
|
||||
Directory structure
|
||||
-------------------
|
||||
|
||||
* `README.md` — this file
|
||||
* `config.txt` — configuration file
|
||||
* `train/` — directory with training data
|
||||
* `train/train.tsv` — sample train set
|
||||
* `dev-0/` — directory with dev (test) data
|
||||
* `dev-0/in.tsv` — input data for the dev set
|
||||
* `dev-0/expected.tsv` — expected (reference) data for the dev set
|
||||
* `test-A/` — directory with test data
|
||||
* `test-A/in.tsv` — input data for the test set
|
||||
* `test-A/expected.tsv` — expected (reference) data for the test set
|
BIN
__pycache__/model.cpython-39.pyc
Normal file
BIN
__pycache__/model.cpython-39.pyc
Normal file
Binary file not shown.
1
config.txt
Normal file
1
config.txt
Normal file
@ -0,0 +1 @@
|
||||
--metric Likelihood --metric Accuracy --precision 5
|
5452
dev-0/expected.tsv
Normal file
5452
dev-0/expected.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5452
dev-0/in.tsv
Normal file
5452
dev-0/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5452
dev-0/out.tsv
Normal file
5452
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
38
model.py
Normal file
38
model.py
Normal file
@ -0,0 +1,38 @@
|
||||
import torch
|
||||
import numpy as np
|
||||
# Dimensionality of the Doc2Vec document vectors (also the input width of the net).
n_features = 1000
# Mini-batch size used for both training and evaluation loops.
batch_size = 5
|
||||
|
||||
class NeuralNetworkModel(torch.nn.Module):
    """Two-layer feed-forward binary classifier.

    Maps an ``n_features``-dimensional document vector through one hidden
    ReLU layer of width 500 to a single sigmoid output in [0, 1].
    """

    def __init__(self):
        super().__init__()
        # n_features -> 500 -> 1
        self.fc1 = torch.nn.Linear(n_features, 500)
        self.fc2 = torch.nn.Linear(500, 1)

    def forward(self, x):
        """Return the predicted probability of class 1 for each row of *x*."""
        hidden = torch.relu(self.fc1(x))
        return torch.sigmoid(self.fc2(hidden))
|
||||
|
||||
def get_loss_acc(model, X_dataset, Y_dataset, criterion, optimizer, batch_size=5):
    """Evaluate *model* on a dataset without updating its weights.

    Bug fix: the original version called ``optimizer.zero_grad()`` /
    ``loss.backward()`` / ``optimizer.step()`` inside this loop, so every
    "evaluation" silently trained the model further.  Evaluation is now a
    pure read-only pass under ``torch.no_grad()``.

    Args:
        model: torch.nn.Module producing probabilities in [0, 1].
        X_dataset: numpy array of features, shape (n_samples, n_features).
        Y_dataset: numpy array of binary labels, shape (n_samples,) or (n_samples, 1).
        criterion: loss function, e.g. ``torch.nn.BCELoss()``.
        optimizer: unused; retained so existing call sites keep working.
        batch_size: mini-batch size (default matches the module-level constant).

    Returns:
        Tuple ``(mean_loss, accuracy)`` averaged over the whole dataset.

    Raises:
        ValueError: if the dataset is empty.
    """
    n_samples = Y_dataset.shape[0]
    if n_samples == 0:
        raise ValueError("cannot evaluate on an empty dataset")

    loss_score = 0.0
    acc_score = 0
    items_total = 0
    model.eval()
    with torch.no_grad():  # evaluation only: no gradients, no weight updates
        for i in range(0, n_samples, batch_size):
            X = torch.tensor(X_dataset[i:i + batch_size].astype(np.float32))
            Y = torch.tensor(Y_dataset[i:i + batch_size].astype(np.float32)).reshape(-1, 1)
            Y_predictions = model(X)
            acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
            items_total += Y.shape[0]
            loss = criterion(Y_predictions, Y)
            # weight by batch size so the final division gives a per-item mean
            loss_score += loss.item() * Y.shape[0]
    return (loss_score / items_total), (acc_score / items_total)
|
82
solution.py
Normal file
82
solution.py
Normal file
@ -0,0 +1,82 @@
|
||||
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
|
||||
import torch
|
||||
import numpy as np
|
||||
from model import n_features, NeuralNetworkModel, get_loss_acc, batch_size
|
||||
|
||||
stopwords = set()
# stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
# Explicit UTF-8: the list contains Polish diacritics, and the platform default
# encoding is not guaranteed to be UTF-8.  Stored as a set (instead of a list)
# so the per-token membership test in preprocess() is O(1).
with open('stopwords', encoding='utf-8') as f:
    stopwords = {line.rstrip() for line in f}
|
||||
|
||||
|
||||
def preprocess(doc):
    """Tokenize *doc*: lower-case, split on single spaces, and drop empty
    tokens as well as Polish stopwords.  Returns a list of tokens."""
    tokens = doc.lower().split(' ')
    return [token for token in tokens if token and token not in stopwords]
|
||||
|
||||
|
||||
# Placeholder Doc2Vec instance; train() rebinds it (via `global vectorizer`)
# with a model fitted on the training corpus, and classify() reuses it.
vectorizer = Doc2Vec()
# Binary classifier over the document vectors (defined in model.py).
model = NeuralNetworkModel()
# Binary cross-entropy matches the network's single sigmoid output.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
|
||||
|
||||
def train(limit=10000):
    """Fit the Doc2Vec vectorizer and the neural classifier on train/train.tsv.

    Each line of the TSV is ``<label>\t<text>``.  Only the first *limit* lines
    are used (default keeps the original hard-coded cap of 10000).  Rebinds
    the module-level ``vectorizer`` and trains ``model`` in place for
    5 epochs of mini-batch SGD, printing the epoch number and the
    (loss, accuracy) reported by get_loss_acc after each epoch.
    """
    # Explicit UTF-8: the corpus is Polish text.
    with open('train/train.tsv', encoding='utf-8') as f:
        docs = [line.rstrip() for line in f]

    docs_preprocessed = []
    y = []
    for doc in docs[:limit]:
        # first tab-separated field is the label, second is the article text
        fields = doc.split('\t')
        y.append(fields[0])
        docs_preprocessed.append(preprocess(fields[1]))

    y = np.reshape([int(value) for value in y], (len(y), 1))

    tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs_preprocessed)]
    # Re-fit the shared vectorizer on this corpus; classify() infers vectors with it.
    global vectorizer
    vectorizer = Doc2Vec(tagged_documents, min_count=10, epochs=300, dm=0, vector_size=n_features)
    x = vectorizer.dv.vectors

    for epoch in range(5):
        model.train()
        for i in range(0, y.shape[0], batch_size):
            X = torch.tensor(x[i:i + batch_size].astype(np.float32))
            Y = torch.tensor(y[i:i + batch_size].astype(np.float32)).reshape(-1, 1)
            Y_predictions = model(X)
            optimizer.zero_grad()
            loss = criterion(Y_predictions, Y)
            loss.backward()
            optimizer.step()
        print(epoch)
        # Report loss/accuracy over the full training set after this epoch.
        print(get_loss_acc(model, x, y, criterion, optimizer))
|
||||
|
||||
def classify(path):
    """Label the documents in ``<path>in.tsv`` and write 0/1 predictions to
    ``<path>out.tsv``, one per line.

    Relies on the module-level ``vectorizer`` and ``model`` being fitted —
    call train() first.

    Args:
        path: directory prefix ending in '/', e.g. 'dev-0/'.
    """
    # Explicit UTF-8: the input is Polish text.
    with open(path + 'in.tsv', encoding='utf-8') as f:
        docs = [preprocess(line.rstrip()) for line in f]

    test_x = np.array([vectorizer.infer_vector(doc) for doc in docs], np.float32)
    predictions = model(torch.tensor(test_x)).detach().numpy()[:, 0]

    # Threshold the sigmoid outputs at 0.5: >= 0.5 -> ball (1), else no-ball (0).
    labels = [1 if prediction >= 0.5 else 0 for prediction in predictions]
    with open(path + 'out.tsv', 'w') as file:
        for label in labels:
            file.write("%i\n" % label)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Guarded so importing this module (e.g. for testing) does not kick off
    # a full training run as a side effect.
    train()
    classify('dev-0/')
    # classify('test-A/')
|
350
stopwords
Normal file
350
stopwords
Normal file
@ -0,0 +1,350 @@
|
||||
a
|
||||
aby
|
||||
ach
|
||||
acz
|
||||
aczkolwiek
|
||||
aj
|
||||
albo
|
||||
ale
|
||||
alez
|
||||
ależ
|
||||
ani
|
||||
az
|
||||
aż
|
||||
bardziej
|
||||
bardzo
|
||||
beda
|
||||
bedzie
|
||||
bez
|
||||
deda
|
||||
będą
|
||||
bede
|
||||
będę
|
||||
będzie
|
||||
bo
|
||||
bowiem
|
||||
by
|
||||
byc
|
||||
być
|
||||
byl
|
||||
byla
|
||||
byli
|
||||
bylo
|
||||
byly
|
||||
był
|
||||
była
|
||||
było
|
||||
były
|
||||
bynajmniej
|
||||
cala
|
||||
cali
|
||||
caly
|
||||
cała
|
||||
cały
|
||||
ci
|
||||
cie
|
||||
ciebie
|
||||
cię
|
||||
co
|
||||
cokolwiek
|
||||
cos
|
||||
coś
|
||||
czasami
|
||||
czasem
|
||||
czemu
|
||||
czy
|
||||
czyli
|
||||
daleko
|
||||
dla
|
||||
dlaczego
|
||||
dlatego
|
||||
do
|
||||
dobrze
|
||||
dokad
|
||||
dokąd
|
||||
dosc
|
||||
dość
|
||||
duzo
|
||||
dużo
|
||||
dwa
|
||||
dwaj
|
||||
dwie
|
||||
dwoje
|
||||
dzis
|
||||
dzisiaj
|
||||
dziś
|
||||
gdy
|
||||
gdyby
|
||||
gdyz
|
||||
gdyż
|
||||
gdzie
|
||||
gdziekolwiek
|
||||
gdzies
|
||||
gdzieś
|
||||
go
|
||||
i
|
||||
ich
|
||||
ile
|
||||
im
|
||||
inna
|
||||
inne
|
||||
inny
|
||||
innych
|
||||
iz
|
||||
iż
|
||||
ja
|
||||
jak
|
||||
jakas
|
||||
jakaś
|
||||
jakby
|
||||
jaki
|
||||
jakichs
|
||||
jakichś
|
||||
jakie
|
||||
jakis
|
||||
jakiś
|
||||
jakiz
|
||||
jakiż
|
||||
jakkolwiek
|
||||
jako
|
||||
jakos
|
||||
jakoś
|
||||
ją
|
||||
je
|
||||
jeden
|
||||
jedna
|
||||
jednak
|
||||
jednakze
|
||||
jednakże
|
||||
jedno
|
||||
jego
|
||||
jej
|
||||
jemu
|
||||
jesli
|
||||
jest
|
||||
jestem
|
||||
jeszcze
|
||||
jeśli
|
||||
jezeli
|
||||
jeżeli
|
||||
juz
|
||||
już
|
||||
kazdy
|
||||
każdy
|
||||
kiedy
|
||||
kilka
|
||||
kims
|
||||
kimś
|
||||
kto
|
||||
ktokolwiek
|
||||
ktora
|
||||
ktore
|
||||
ktorego
|
||||
ktorej
|
||||
ktory
|
||||
ktorych
|
||||
ktorym
|
||||
ktorzy
|
||||
ktos
|
||||
ktoś
|
||||
która
|
||||
które
|
||||
którego
|
||||
której
|
||||
który
|
||||
których
|
||||
którym
|
||||
którzy
|
||||
ku
|
||||
lat
|
||||
lecz
|
||||
lub
|
||||
ma
|
||||
mają
|
||||
mało
|
||||
mam
|
||||
mi
|
||||
miedzy
|
||||
między
|
||||
mimo
|
||||
mna
|
||||
mną
|
||||
mnie
|
||||
moga
|
||||
mogą
|
||||
moi
|
||||
moim
|
||||
moj
|
||||
moja
|
||||
moje
|
||||
moze
|
||||
mozliwe
|
||||
mozna
|
||||
może
|
||||
możliwe
|
||||
można
|
||||
mój
|
||||
mu
|
||||
musi
|
||||
my
|
||||
na
|
||||
nad
|
||||
nam
|
||||
nami
|
||||
nas
|
||||
nasi
|
||||
nasz
|
||||
nasza
|
||||
nasze
|
||||
naszego
|
||||
naszych
|
||||
natomiast
|
||||
natychmiast
|
||||
nawet
|
||||
nia
|
||||
nią
|
||||
nic
|
||||
nich
|
||||
nie
|
||||
niech
|
||||
niego
|
||||
niej
|
||||
niemu
|
||||
nigdy
|
||||
nim
|
||||
nimi
|
||||
niz
|
||||
niż
|
||||
no
|
||||
o
|
||||
obok
|
||||
od
|
||||
około
|
||||
on
|
||||
ona
|
||||
one
|
||||
oni
|
||||
ono
|
||||
oraz
|
||||
oto
|
||||
owszem
|
||||
pan
|
||||
pana
|
||||
pani
|
||||
po
|
||||
pod
|
||||
podczas
|
||||
pomimo
|
||||
ponad
|
||||
poniewaz
|
||||
ponieważ
|
||||
powinien
|
||||
powinna
|
||||
powinni
|
||||
powinno
|
||||
poza
|
||||
prawie
|
||||
przeciez
|
||||
przecież
|
||||
przed
|
||||
przede
|
||||
przedtem
|
||||
przez
|
||||
przy
|
||||
roku
|
||||
rowniez
|
||||
również
|
||||
sam
|
||||
sama
|
||||
są
|
||||
sie
|
||||
się
|
||||
skad
|
||||
skąd
|
||||
soba
|
||||
sobą
|
||||
sobie
|
||||
sposob
|
||||
sposób
|
||||
swoje
|
||||
ta
|
||||
tak
|
||||
taka
|
||||
taki
|
||||
takie
|
||||
takze
|
||||
także
|
||||
tam
|
||||
te
|
||||
tego
|
||||
tej
|
||||
ten
|
||||
teraz
|
||||
też
|
||||
to
|
||||
toba
|
||||
tobą
|
||||
tobie
|
||||
totez
|
||||
toteż
|
||||
totobą
|
||||
trzeba
|
||||
tu
|
||||
tutaj
|
||||
twoi
|
||||
twoim
|
||||
twoj
|
||||
twoja
|
||||
twoje
|
||||
twój
|
||||
twym
|
||||
ty
|
||||
tych
|
||||
tylko
|
||||
tym
|
||||
u
|
||||
w
|
||||
wam
|
||||
wami
|
||||
was
|
||||
wasz
|
||||
wasza
|
||||
wasze
|
||||
we
|
||||
według
|
||||
wiele
|
||||
wielu
|
||||
więc
|
||||
więcej
|
||||
wlasnie
|
||||
właśnie
|
||||
wszyscy
|
||||
wszystkich
|
||||
wszystkie
|
||||
wszystkim
|
||||
wszystko
|
||||
wtedy
|
||||
wy
|
||||
z
|
||||
za
|
||||
zaden
|
||||
zadna
|
||||
zadne
|
||||
zadnych
|
||||
zapewne
|
||||
zawsze
|
||||
ze
|
||||
zeby
|
||||
zeznowu
|
||||
zł
|
||||
znow
|
||||
znowu
|
||||
znów
|
||||
zostal
|
||||
został
|
||||
żaden
|
||||
żadna
|
||||
żadne
|
||||
żadnych
|
||||
że
|
||||
żeby
|
5447
test-A/in.tsv
Normal file
5447
test-A/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
train/train.tsv.gz
Normal file
BIN
train/train.tsv.gz
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user