initial commit

This commit is contained in:
Franciszek Czajka 2024-05-20 07:11:19 +02:00
commit ed502c693f
12 changed files with 125499 additions and 0 deletions

7
.gitignore vendored Normal file
View File

@ -0,0 +1,7 @@
fasttext_100_3_polish.bin
fasttext_100_3_polish.bin.trainables.syn1neg.npy
fasttext_100_3_polish.bin.trainables.vectors_ngrams_lockf.npy
fasttext_100_3_polish.bin.trainables.vectors_vocab_lockf.npy
fasttext_100_3_polish.bin.wv.vectors_ngrams.npy
fasttext_100_3_polish.bin.wv.vectors_vocab.npy
fasttext_100_3_polish.bin.wv.vectors.npy

25
README.md Normal file
View File

@ -0,0 +1,25 @@
Sport Texts Classification Challenge - Ball
======================
Given a Polish article, guess whether the sport it is about involves a ball. Evaluation metrics: Accuracy, Likelihood.
Classes
-------
* `1` — ball
* `0` — no-ball
Directory structure
-------------------
* `README.md` — this file
* `config.txt` — configuration file
* `train/` — directory with training data
* `train/train.tsv` — sample train set
* `dev-0/` — directory with dev (test) data
* `dev-0/in.tsv` — input data for the dev set
* `dev-0/expected.tsv` — expected (reference) data for the dev set
* `test-A/` — directory with test data
* `test-A/in.tsv` — input data for the test set
* `test-A/expected.tsv` — expected (reference) data for the test set

1
config.txt Normal file
View File

@ -0,0 +1 @@
--metric Likelihood --metric Accuracy --precision 5

5452
dev-0/expected.tsv Normal file

File diff suppressed because it is too large Load Diff

5452
dev-0/in.tsv Normal file

File diff suppressed because it is too large Load Diff

5452
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

1
link.tsv Normal file
View File

@ -0,0 +1 @@
Link: https://git.wmi.amu.edu.pl/s464861/Word2Vec
1 Link: https://git.wmi.amu.edu.pl/s464861/Word2Vec

83
main.py Normal file
View File

@ -0,0 +1,83 @@
import csv

import gensim
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models import KeyedVectors
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
class TextClassifier:
    """Binary text classifier built on averaged word embeddings.

    Each text is tokenized, the embeddings of its known tokens are averaged
    into a single ``vector_size``-dimensional vector, and a small dense
    network with a sigmoid output is trained on those vectors.
    """

    def __init__(self, vector_size=100, model_path="fasttext_100_3_polish.bin"):
        """Load the embedding model and build the (untrained) network.

        Args:
            vector_size: Dimensionality of the embedding vectors; also the
                input width of the network.
            model_path: Path of the saved gensim object passed to
                ``KeyedVectors.load``.
        """
        self.vector_size = vector_size
        self.word2vec = KeyedVectors.load(model_path)
        self.model = self._build_model()

    def _build_model(self):
        """Return a compiled dense network: 64 -> 32 -> 1 (sigmoid)."""
        model = tf.keras.Sequential([
            tf.keras.layers.Input(shape=(self.vector_size,)),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(32, activation="relu"),
            tf.keras.layers.Dense(1, activation="sigmoid"),
        ])
        model.compile(
            # Use the optimizer from the same Keras implementation as the
            # model; handing a standalone-`keras` optimizer instance to a
            # `tf.keras` model is rejected by some TensorFlow/Keras pairings.
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
            loss="binary_crossentropy",
            metrics=["accuracy"],
        )
        return model

    @staticmethod
    def _read_inputs(path):
        """Read a one-column input TSV, yielding exactly one row per line."""
        return pd.read_csv(
            path,
            sep="\t",
            names=["Text"],
            # Quote characters are ordinary text here; default quoting would
            # merge lines (or fail) on an unbalanced '"'.
            quoting=csv.QUOTE_NONE,
            # Keep blank lines as NaN rows so that the number of predictions
            # written later matches the number of input lines.
            skip_blank_lines=False,
        )

    def load_data(self, train_path, dev_path, test_path):
        """Load the train, dev and test TSV files into data frames.

        Args:
            train_path: Tab-separated file with columns ``Class`` and ``Text``.
            dev_path: Tab-separated file with a single ``Text`` column.
            test_path: Tab-separated file with a single ``Text`` column.

        Sets ``self.train_data``, ``self.dev_data`` and ``self.test_data``.
        """
        self.train_data = pd.read_csv(
            train_path,
            sep="\t",
            names=["Class", "Text"],
            quoting=csv.QUOTE_NONE,  # treat '"' as plain text, see _read_inputs
        )
        self.dev_data = self._read_inputs(dev_path)
        self.test_data = self._read_inputs(test_path)

    def preprocess_text(self, text):
        """Return the mean embedding of the known tokens in ``text``.

        Args:
            text: The raw text. Non-string values (for example the NaN that
                pandas produces for an empty cell) are treated as empty text.

        Returns:
            A NumPy array of length ``vector_size``; all zeros when no token
            of ``text`` has an embedding.
        """
        vector = np.zeros(self.vector_size)
        if not isinstance(text, str):
            return vector

        # The loaded object may be a full model exposing `.wv` or the keyed
        # vectors themselves; accept both.
        vectors = getattr(self.word2vec, "wv", self.word2vec)

        count = 0
        for token in gensim.utils.simple_preprocess(text):
            if token in vectors:
                vector += vectors[token]
                count += 1
        if count > 0:
            vector /= count
        return vector

    def prepare_datasets(self):
        """Vectorize all loaded texts and split off a validation set.

        Sets ``train_vectors``, ``dev_vectors`` and ``test_vectors``, then
        holds out 10% of the training vectors (fixed ``random_state=42``) as
        ``X_val`` / ``y_val``; the rest becomes ``X_train`` / ``y_train``.
        """
        self.train_vectors = np.array(
            [self.preprocess_text(text) for text in self.train_data["Text"]]
        )
        self.dev_vectors = np.array(
            [self.preprocess_text(text) for text in self.dev_data["Text"]]
        )
        self.test_vectors = np.array(
            [self.preprocess_text(text) for text in self.test_data["Text"]]
        )
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            self.train_vectors,
            self.train_data["Class"],
            test_size=0.1,
            random_state=42,
        )

    def train_model(self, epochs=30, batch_size=16):
        """Fit the network on the training split, validating on the held-out split.

        Args:
            epochs: Number of training epochs.
            batch_size: Mini-batch size.
        """
        self.model.fit(
            self.X_train,
            self.y_train,
            validation_data=(self.X_val, self.y_val),
            epochs=epochs,
            batch_size=batch_size,
        )

    def predict_and_save(self, data, output_path):
        """Predict hard 0/1 labels for ``data`` and write them to ``output_path``.

        Args:
            data: Array of text vectors as produced by ``preprocess_text``.
            output_path: Destination file; one label per row, no header and
                no index column.
        """
        # Threshold the sigmoid output at 0.5 to obtain class labels.
        predictions = (self.model.predict(data) > 0.5).astype(int)
        pd.DataFrame(predictions).to_csv(output_path, index=False, header=False)

    def run(self):
        """Run the whole pipeline: load, vectorize, train, predict, save."""
        self.load_data("train/train.tsv", "dev-0/in.tsv", "test-A/in.tsv")
        self.prepare_datasets()
        self.train_model()
        self.predict_and_save(self.test_vectors, "test-A/out.tsv")
        self.predict_and_save(self.dev_vectors, "dev-0/out.tsv")
if __name__ == "__main__":
    # Script entry point: build the classifier with its defaults and run the
    # full load / train / predict pipeline.
    TextClassifier().run()

5447
test-A/in.tsv Normal file

File diff suppressed because it is too large Load Diff

5447
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

98132
train/train.tsv Normal file

File diff suppressed because it is too large Load Diff

BIN
train/train.tsv.gz Normal file

Binary file not shown.