test
This commit is contained in:
parent
7ddece883e
commit
a3db769933
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
word2vec_100_3_polish.bin
|
||||||
|
word2vec_100_3_polish.bin.syn0.npy
|
25
README.md
Normal file
25
README.md
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
|
||||||
|
Sport Texts Classification Challenge - Ball
|
||||||
|
======================
|
||||||
|
|
||||||
|
Given a Polish sports article, guess whether the sport it describes is played with a ball. Evaluation metrics: Accuracy, Likelihood.
|
||||||
|
|
||||||
|
Classes
|
||||||
|
-------
|
||||||
|
|
||||||
|
* `1` — ball
|
||||||
|
* `0` — no-ball
|
||||||
|
|
||||||
|
Directory structure
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
* `README.md` — this file
|
||||||
|
* `config.txt` — configuration file
|
||||||
|
* `train/` — directory with training data
|
||||||
|
* `train/train.tsv` — sample train set
|
||||||
|
* `dev-0/` — directory with dev (test) data
|
||||||
|
* `dev-0/in.tsv` — input data for the dev set
|
||||||
|
* `dev-0/expected.tsv` — expected (reference) data for the dev set
|
||||||
|
* `test-A/` — directory with test data
|
||||||
|
* `test-A/in.tsv` — input data for the test set
|
||||||
|
* `test-A/expected.tsv` — expected (reference) data for the test set
|
1
config.txt
Normal file
1
config.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
--metric Likelihood --metric Accuracy --precision 5
|
5452
dev-0/expected.tsv
Normal file
5452
dev-0/expected.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5452
dev-0/in.tsv
Normal file
5452
dev-0/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5452
dev-0/out.tsv
Normal file
5452
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
101
run.py
Normal file
101
run.py
Normal file
@ -0,0 +1,101 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from gensim.models import KeyedVectors
|
||||||
|
import tensorflow as tf
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def read_data():
    """Load the training, dev-0 and test-A data sets.

    Returns:
        tuple: ``(train_dataset, dev_0_dataset, test_A_dataset)`` data frames.
        The training frame has the columns ``Class`` and ``Text``.  The other
        two have a single ``Text`` column holding exactly one row per line of
        the corresponding ``in.tsv`` file (the whole line is used as text).
    """
    import csv

    def read_input_lines(path):
        # One prediction is written per row, so every input line must yield
        # exactly one row.  ``pd.read_csv`` can silently drop rows (blank
        # lines, ``on_bad_lines="skip"``) or merge them (a stray quote
        # character opens a multi-line field), which would misalign the
        # predictions with the input file.  Reading line by line avoids that.
        with open(path, encoding="utf-8") as handle:
            return pd.DataFrame({"Text": [line.rstrip("\n") for line in handle]})

    train_dataset = pd.read_csv(
        "train/train.tsv.gz",
        sep="\t",
        compression="gzip",
        names=["Class", "Text"],
        # Quote characters are ordinary text in these articles; without this
        # an unbalanced quote would glue several training rows together.
        quoting=csv.QUOTE_NONE,
        # Malformed training rows can be dropped safely: nothing has to stay
        # aligned with the training file.
        on_bad_lines="skip",
    )
    dev_0_dataset = read_input_lines("dev-0/in.tsv")
    test_A_dataset = read_input_lines("test-A/in.tsv")

    return train_dataset, dev_0_dataset, test_A_dataset
|
||||||
|
|
||||||
|
|
||||||
|
def text_to_vector(text, word2vec, vector_size):
    """Embed a text as the averaged word vectors of its tokens.

    The text is split on whitespace.  Vectors of the tokens found in
    ``word2vec`` are summed and the sum is divided by the total number of
    tokens (tokens missing from ``word2vec`` count towards the divisor).

    Args:
        text: The text to embed.  Anything that is not a ``str`` (for example
            the NaN pandas uses for a missing field) is treated as empty.
        word2vec: Mapping-like object supporting ``word in word2vec`` and
            ``word2vec[word]``.
        vector_size: Length of the vectors stored in ``word2vec``.

    Returns:
        numpy.ndarray: Vector of length ``vector_size``; all zeros when the
        text contains no tokens.
    """
    words = text.split() if isinstance(text, str) else []
    text_vector = np.zeros(vector_size)
    if not words:
        # Dividing the zero vector by 0 would yield NaNs, which would then
        # propagate into training and prediction.
        return text_vector
    for word in words:
        if word in word2vec:
            text_vector += word2vec[word]
    return text_vector / len(words)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the whole pipeline: load data, embed texts, train, write predictions.

    Reads the data sets via ``read_data``, turns every text into an averaged
    word vector, fits a small feed-forward binary classifier on the training
    vectors and writes thresholded 0/1 predictions to ``test-A/out.tsv`` and
    ``dev-0/out.tsv``.
    """
    train_dataset, dev_0_dataset, test_A_dataset = read_data()

    # Must match the length of the vectors stored in the loaded model.
    vector_size = 100

    # Load the stored word vectors from disk (nothing is trained here).
    word2vec = KeyedVectors.load("word2vec_100_3_polish.bin")

    def embed(frame):
        # One averaged word vector per row of the frame's "Text" column.
        rows = []
        for text in frame["Text"]:
            rows.append(text_to_vector(text, word2vec, vector_size))
        return np.array(rows)

    train_vectors = embed(train_dataset)
    dev_0_vectors = embed(dev_0_dataset)
    test_A_vectors = embed(test_A_dataset)

    # Small fully connected network with a single sigmoid output unit.
    layers = tf.keras.layers
    model = tf.keras.Sequential(
        [
            layers.Input(shape=(vector_size,)),
            layers.Dense(64, activation="relu"),
            layers.Dense(32, activation="relu"),
            layers.Dense(1, activation="sigmoid"),
        ]
    )
    model.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    model.fit(
        train_vectors,
        train_dataset["Class"],
        epochs=10,
        batch_size=16,
    )

    # Predict and save, test set first and dev set second.
    outputs = (
        (test_A_vectors, "test-A/out.tsv"),
        (dev_0_vectors, "dev-0/out.tsv"),
    )
    for vectors, out_path in outputs:
        probabilities = model.predict(vectors)
        # Turn the sigmoid output into a hard 0/1 label.
        labels = (probabilities > 0.5).astype(int)
        pd.DataFrame(labels).to_csv(out_path, index=False, header=False)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
5447
test-A/in.tsv
Normal file
5447
test-A/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5445
test-A/out.tsv
Normal file
5445
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
98132
train/train.tsv
Normal file
98132
train/train.tsv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
train/train.tsv.gz
Normal file
BIN
train/train.tsv.gz
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user