initial commit
This commit is contained in:
commit
ed502c693f
7
.gitignore
vendored
Normal file
7
.gitignore
vendored
Normal file
@ -0,0 +1,7 @@
|
||||
fasttext_100_3_polish.bin
|
||||
fasttext_100_3_polish.bin.trainables.syn1neg.npy
|
||||
fasttext_100_3_polish.bin.trainables.vectors_ngrams_lockf.npy
|
||||
fasttext_100_3_polish.bin.trainables.vectors_vocab_lockf.npy
|
||||
fasttext_100_3_polish.bin.wv.vectors_ngrams.npy
|
||||
fasttext_100_3_polish.bin.wv.vectors_vocab.npy
|
||||
fasttext_100_3_polish.bin.wv.vectors.npy
|
25
README.md
Normal file
25
README.md
Normal file
@ -0,0 +1,25 @@
|
||||
|
||||
Sport Texts Classification Challenge - Ball
|
||||
======================
|
||||
|
||||
For a given Polish article, guess whether the sport it covers involves a ball. Evaluation metrics: Accuracy, Likelihood.
|
||||
|
||||
Classes
|
||||
-------
|
||||
|
||||
* `1` — ball
|
||||
* `0` — no-ball
|
||||
|
||||
Directory structure
|
||||
-------------------
|
||||
|
||||
* `README.md` — this file
|
||||
* `config.txt` — configuration file
|
||||
* `train/` — directory with training data
|
||||
* `train/train.tsv` — sample train set
|
||||
* `dev-0/` — directory with dev (test) data
|
||||
* `dev-0/in.tsv` — input data for the dev set
|
||||
* `dev-0/expected.tsv` — expected (reference) data for the dev set
|
||||
* `test-A/` — directory with test data
|
||||
* `test-A/in.tsv` — input data for the test set
|
||||
* `test-A/expected.tsv` — expected (reference) data for the test set
|
1
config.txt
Normal file
1
config.txt
Normal file
@ -0,0 +1 @@
|
||||
--metric Likelihood --metric Accuracy --precision 5
|
5452
dev-0/expected.tsv
Normal file
5452
dev-0/expected.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5452
dev-0/in.tsv
Normal file
5452
dev-0/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5452
dev-0/out.tsv
Normal file
5452
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
1
link.tsv
Normal file
1
link.tsv
Normal file
@ -0,0 +1 @@
|
||||
Link: https://git.wmi.amu.edu.pl/s464861/Word2Vec
|
|
83
main.py
Normal file
83
main.py
Normal file
@ -0,0 +1,83 @@
|
||||
import pandas as pd
|
||||
from gensim.models import KeyedVectors
|
||||
import gensim
|
||||
import numpy as np
|
||||
from sklearn.model_selection import train_test_split
|
||||
import tensorflow as tf
|
||||
from keras.optimizers import Adam
|
||||
|
||||
class TextClassifier:
    """Binary text classifier built on averaged word embeddings.

    Each text is tokenized, every token found in the embedding model is
    looked up, and the mean of those vectors is fed to a small dense
    network with a single sigmoid output.
    """

    def __init__(self, vector_size=100, model_path="fasttext_100_3_polish.bin"):
        """Load the embedding model and build the (untrained) network.

        Args:
            vector_size: Dimensionality of the embedding vectors; also the
                input width of the network.
            model_path: Path handed to ``KeyedVectors.load``.
        """
        self.vector_size = vector_size
        self.word2vec = KeyedVectors.load(model_path)
        self.model = self._build_model()

    def _build_model(self):
        """Return a compiled 64-32-1 dense network for binary classification."""
        model = tf.keras.Sequential([
            tf.keras.layers.Input(shape=(self.vector_size,)),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(32, activation="relu"),
            tf.keras.layers.Dense(1, activation="sigmoid"),
        ])
        model.compile(
            # Take the optimizer from the same namespace as the model:
            # handing a standalone-`keras` optimizer to a `tf.keras` model is
            # rejected on installations where the two are distinct packages.
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
            loss="binary_crossentropy",
            metrics=["accuracy"],
        )
        return model

    def load_data(self, train_path, dev_path, test_path):
        """Read the train, dev and test TSV files into data frames.

        The files hold free text, so they are parsed without CSV quoting
        (a text starting with ``"`` would otherwise swallow the following
        lines) and without NA detection (texts such as ``NA`` or an empty
        text stay strings instead of becoming NaN). Blank lines in the dev
        and test inputs are kept as empty texts so that predictions stay
        aligned line-by-line with the input files.

        Args:
            train_path: TSV with two columns, read as ``Class`` and ``Text``.
            dev_path: TSV with one column, read as ``Text``.
            test_path: TSV with one column, read as ``Text``.
        """
        import csv  # local import: only needed for the quoting constant

        common = {"sep": "\t", "quoting": csv.QUOTE_NONE, "keep_default_na": False}
        self.train_data = pd.read_csv(train_path, names=["Class", "Text"], **common)
        self.dev_data = pd.read_csv(
            dev_path, names=["Text"], skip_blank_lines=False, **common
        )
        self.test_data = pd.read_csv(
            test_path, names=["Text"], skip_blank_lines=False, **common
        )

    def _embeddings(self):
        """Return the object used for token lookups.

        ``KeyedVectors.load`` returns whatever object was saved; when that
        object exposes its vectors under ``.wv`` use it, otherwise fall back
        to the loaded object itself.
        """
        return getattr(self.word2vec, "wv", self.word2vec)

    def _mean_vector(self, tokens):
        """Average the embeddings of the known tokens.

        Returns a zero vector of length ``vector_size`` when no token is
        known.
        """
        embeddings = self._embeddings()
        vector = np.zeros(self.vector_size)
        count = 0
        for token in tokens:
            if token in embeddings:
                vector += embeddings[token]
                count += 1
        if count > 0:
            vector /= count
        return vector

    def preprocess_text(self, text):
        """Tokenize ``text`` and return the mean embedding of its tokens."""
        return self._mean_vector(gensim.utils.simple_preprocess(text))

    def _vectorize(self, texts):
        """Turn an iterable of texts into an array with one row per text."""
        return np.array([self.preprocess_text(text) for text in texts])

    def prepare_datasets(self):
        """Vectorize all loaded texts and split off a validation set.

        10% of the training rows are held out for validation, with a fixed
        random seed so the split is reproducible.
        """
        self.train_vectors = self._vectorize(self.train_data["Text"])
        self.dev_vectors = self._vectorize(self.dev_data["Text"])
        self.test_vectors = self._vectorize(self.test_data["Text"])

        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            self.train_vectors, self.train_data["Class"], test_size=0.1, random_state=42
        )

    def train_model(self, epochs=30, batch_size=16):
        """Fit the network on the training split, validating on the held-out split."""
        self.model.fit(
            self.X_train,
            self.y_train,
            validation_data=(self.X_val, self.y_val),
            epochs=epochs,
            batch_size=batch_size,
        )

    def predict_and_save(self, data, output_path):
        """Predict labels for ``data`` and write them to ``output_path``.

        Model outputs above 0.5 become ``1``, everything else ``0``; the
        labels are written without header or index.
        """
        predictions = (self.model.predict(data) > 0.5).astype(int)
        pd.DataFrame(predictions).to_csv(output_path, index=False, header=False)

    def run(self):
        """Run the full pipeline: load, vectorize, train, then predict test and dev."""
        self.load_data("train/train.tsv", "dev-0/in.tsv", "test-A/in.tsv")
        self.prepare_datasets()
        self.train_model()
        self.predict_and_save(self.test_vectors, "test-A/out.tsv")
        self.predict_and_save(self.dev_vectors, "dev-0/out.tsv")
|
||||
if __name__ == "__main__":
    # Script entry point: build the classifier with its default settings and
    # run the complete train-and-predict pipeline.
    TextClassifier().run()
|
5447
test-A/in.tsv
Normal file
5447
test-A/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5447
test-A/out.tsv
Normal file
5447
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
98132
train/train.tsv
Normal file
98132
train/train.tsv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
train/train.tsv.gz
Normal file
BIN
train/train.tsv.gz
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user