This commit is contained in:
Mateusz 2024-05-15 20:16:07 +02:00
parent 7ddece883e
commit a3db769933
12 changed files with 125509 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
word2vec_100_3_polish.bin
word2vec_100_3_polish.bin.syn0.npy

25
README.md Normal file
View File

@ -0,0 +1,25 @@
Sport Texts Classification Challenge - Ball
======================
Given a Polish sports article, guess whether the sport it describes involves a ball. Evaluation metrics: Accuracy, Likelihood.
Classes
-------
* `1` — ball
* `0` — no-ball
Directory structure
-------------------
* `README.md` — this file
* `config.txt` — configuration file
* `train/` — directory with training data
* `train/train.tsv` — sample train set
* `dev-0/` — directory with dev (test) data
* `dev-0/in.tsv` — input data for the dev set
* `dev-0/expected.tsv` — expected (reference) data for the dev set
* `test-A/` — directory with test data
* `test-A/in.tsv` — input data for the test set
* `test-A/expected.tsv` — expected (reference) data for the test set

1
config.txt Normal file
View File

@ -0,0 +1 @@
--metric Likelihood --metric Accuracy --precision 5

5452
dev-0/expected.tsv Normal file

File diff suppressed because it is too large Load Diff

5452
dev-0/in.tsv Normal file

File diff suppressed because it is too large Load Diff

5452
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

View File

101
run.py Normal file
View File

@ -0,0 +1,101 @@
import pandas as pd
from gensim.models import KeyedVectors
import tensorflow as tf
import numpy as np
def read_data():
    """Load the train, dev-0 and test-A datasets from disk.

    Returns:
        A 3-tuple of pandas DataFrames ``(train, dev_0, test_A)``.
        The train set has "Class" and "Text" columns; the two
        evaluation sets have only "Text". Malformed lines are skipped.
    """
    def _load_tsv(path, columns, **extra):
        # All three files share the same TSV layout; only the path,
        # column names and (for train) compression differ.
        return pd.read_csv(
            path, sep="\t", names=columns, on_bad_lines="skip", **extra
        )

    train = _load_tsv("train/train.tsv.gz", ["Class", "Text"], compression="gzip")
    dev_0 = _load_tsv("dev-0/in.tsv", ["Text"])
    test_a = _load_tsv("test-A/in.tsv", ["Text"])
    return train, dev_0, test_a
def text_to_vector(text, word2vec, vector_size):
    """Embed *text* as the average of its words' word2vec vectors.

    Args:
        text: Whitespace-separated words.
        word2vec: Mapping from word to embedding vector (supports
            ``in`` and ``[]``, e.g. gensim KeyedVectors or a dict).
        vector_size: Dimensionality of the embedding vectors.

    Returns:
        A numpy array of shape ``(vector_size,)``. Out-of-vocabulary
        words contribute zero but still count in the divisor (the sum
        is divided by the total word count, preserving the original
        behavior). Returns the zero vector for empty/whitespace-only
        text instead of raising ZeroDivisionError.
    """
    words = text.split()
    text_vector = np.zeros(vector_size)
    if not words:
        # Guard: empty input would otherwise divide by zero below.
        return text_vector
    for word in words:
        if word in word2vec:
            text_vector += word2vec[word]
    return text_vector / len(words)
def main():
    """End-to-end pipeline: load data, embed texts with pretrained
    word2vec, train a small feed-forward classifier, and write 0/1
    predictions for the test-A and dev-0 sets."""
    train_dataset, dev_0_dataset, test_A_dataset = read_data()

    # Dimensionality of the pretrained embeddings loaded below.
    vector_size = 100

    # Load a pretrained Polish word2vec model from disk (the original
    # comment said "Training" — no training of embeddings happens here).
    word2vec = KeyedVectors.load("word2vec_100_3_polish.bin")

    # Embed every text as the mean of its word vectors.
    def embed(texts):
        return np.array(
            [text_to_vector(text, word2vec, vector_size) for text in texts]
        )

    train_vectors = embed(train_dataset["Text"])
    dev_0_vectors = embed(dev_0_dataset["Text"])
    test_A_vectors = embed(test_A_dataset["Text"])

    # Small feed-forward binary classifier over the averaged embeddings.
    model = tf.keras.Sequential(
        [
            tf.keras.layers.Input(shape=(vector_size,)),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(32, activation="relu"),
            tf.keras.layers.Dense(1, activation="sigmoid"),
        ]
    )
    model.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    model.fit(
        train_vectors,
        train_dataset["Class"],
        epochs=10,
        batch_size=16,
    )

    # Write predictions (test-A first, then dev-0, matching the
    # original save order). The duplicated predict/threshold/save
    # code is factored into one helper.
    _predict_and_save(model, test_A_vectors, "test-A/out.tsv")
    _predict_and_save(model, dev_0_vectors, "dev-0/out.tsv")


def _predict_and_save(model, vectors, path):
    """Predict probabilities, threshold at 0.5, and write one 0/1
    label per line (no index, no header) to *path*."""
    predictions = (model.predict(vectors) > 0.5).astype(int)
    pd.DataFrame(predictions).to_csv(path, index=False, header=False)


if __name__ == "__main__":
    main()

5447
test-A/in.tsv Normal file

File diff suppressed because it is too large Load Diff

5445
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

98132
train/train.tsv Normal file

File diff suppressed because it is too large Load Diff

BIN
train/train.tsv.gz Normal file

Binary file not shown.