initial commit
This commit is contained in:
commit
ed502c693f
7
.gitignore
vendored
Normal file
7
.gitignore
vendored
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
fasttext_100_3_polish.bin
|
||||||
|
fasttext_100_3_polish.bin.trainables.syn1neg.npy
|
||||||
|
fasttext_100_3_polish.bin.trainables.vectors_ngrams_lockf.npy
|
||||||
|
fasttext_100_3_polish.bin.trainables.vectors_vocab_lockf.npy
|
||||||
|
fasttext_100_3_polish.bin.wv.vectors_ngrams.npy
|
||||||
|
fasttext_100_3_polish.bin.wv.vectors_vocab.npy
|
||||||
|
fasttext_100_3_polish.bin.wv.vectors.npy
|
25
README.md
Normal file
25
README.md
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
|
||||||
|
Sport Texts Classification Challenge - Ball
|
||||||
|
======================
|
||||||
|
|
||||||
|
Given a Polish sports article, guess whether the sport it describes is played with a ball. Evaluation metrics: Accuracy, Likelihood.
|
||||||
|
|
||||||
|
Classes
|
||||||
|
-------
|
||||||
|
|
||||||
|
* `1` — ball
|
||||||
|
* `0` — no-ball
|
||||||
|
|
||||||
|
Directory structure
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
* `README.md` — this file
|
||||||
|
* `config.txt` — configuration file
|
||||||
|
* `train/` — directory with training data
|
||||||
|
* `train/train.tsv` — sample train set
|
||||||
|
* `dev-0/` — directory with dev (test) data
|
||||||
|
* `dev-0/in.tsv` — input data for the dev set
|
||||||
|
* `dev-0/expected.tsv` — expected (reference) data for the dev set
|
||||||
|
* `test-A/` — directory with test data
|
||||||
|
* `test-A/in.tsv` — input data for the test set
|
||||||
|
* `test-A/expected.tsv` — expected (reference) data for the test set
|
1
config.txt
Normal file
1
config.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
--metric Likelihood --metric Accuracy --precision 5
|
5452
dev-0/expected.tsv
Normal file
5452
dev-0/expected.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5452
dev-0/in.tsv
Normal file
5452
dev-0/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5452
dev-0/out.tsv
Normal file
5452
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
1
link.tsv
Normal file
1
link.tsv
Normal file
@ -0,0 +1 @@
|
|||||||
|
Link: https://git.wmi.amu.edu.pl/s464861/Word2Vec
|
|
83
main.py
Normal file
83
main.py
Normal file
@ -0,0 +1,83 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from gensim.models import KeyedVectors
|
||||||
|
import gensim
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
import tensorflow as tf
|
||||||
|
from keras.optimizers import Adam
|
||||||
|
|
||||||
|
class TextClassifier:
    """Binary text classifier over averaged word-embedding vectors.

    Each text is tokenised, the embedding vectors of its known tokens are
    averaged into a single ``vector_size``-dimensional vector, and a small
    dense Keras network with a sigmoid output is trained on those vectors.
    """

    def __init__(self, vector_size=100, model_path="fasttext_100_3_polish.bin"):
        """Load the embedding model and build the (untrained) network.

        Args:
            vector_size: Length of one embedding vector; also used as the
                width of the network's input layer.
            model_path: Path handed to ``KeyedVectors.load``.
        """
        self.vector_size = vector_size
        self.word2vec = KeyedVectors.load(model_path)
        self.model = self._build_model()

    def _build_model(self):
        """Return a compiled 64-32-1 dense network for binary classification."""
        model = tf.keras.Sequential([
            tf.keras.layers.Input(shape=(self.vector_size,)),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(32, activation="relu"),
            tf.keras.layers.Dense(1, activation="sigmoid"),
        ])
        model.compile(
            optimizer=Adam(learning_rate=0.001),
            loss="binary_crossentropy",
            metrics=["accuracy"],
        )
        return model

    def load_data(self, train_path, dev_path, test_path):
        """Read the three tab-separated files into data frames.

        Sets ``train_data`` (columns ``Class`` and ``Text``) and
        ``dev_data`` / ``test_data`` (single column ``Text``).

        Args:
            train_path: TSV with the label in the first column and the text
                in the second.
            dev_path: TSV with one text per line.
            test_path: TSV with one text per line.
        """
        # Imported locally so this method stays self-contained.
        import csv

        # The texts are free prose, so a '"' is ordinary data. With the
        # default quoting a field that starts with '"' would swallow the
        # following lines until the next quote and silently drop rows.
        self.train_data = pd.read_csv(
            train_path,
            sep="\t",
            names=["Class", "Text"],
            quoting=csv.QUOTE_NONE,
        )

        # One prediction is written per row of these files, so every input
        # line must yield exactly one row: keep blank lines, and do not let
        # strings such as "NA" or "null" be turned into NaN.
        input_options = {
            "sep": "\t",
            "names": ["Text"],
            "quoting": csv.QUOTE_NONE,
            "skip_blank_lines": False,
            "keep_default_na": False,
        }
        self.dev_data = pd.read_csv(dev_path, **input_options)
        self.test_data = pd.read_csv(test_path, **input_options)

    def preprocess_text(self, text):
        """Turn one text into the mean embedding vector of its tokens.

        Args:
            text: The raw text. Anything that is not a ``str`` (for example
                a NaN produced by a missing field) is treated as empty.

        Returns:
            A NumPy array of length ``vector_size``; all zeros when the text
            is empty, not a string, or contains no token known to the
            embedding model.
        """
        vector = np.zeros(self.vector_size)
        if not isinstance(text, str):
            # Missing values reach this point as float NaN; there is
            # nothing to tokenise.
            return vector

        # ``load`` returns whatever object was saved. A full model exposes
        # its vectors as ``.wv``; a bare KeyedVectors object may not have
        # that attribute, in which case the object itself is the lookup.
        vectors = getattr(self.word2vec, "wv", self.word2vec)

        count = 0
        for token in gensim.utils.simple_preprocess(text):
            if token in vectors:
                vector += vectors[token]
                count += 1
        if count > 0:
            vector /= count
        return vector

    def _vectorize(self, texts):
        """Return an array with one ``preprocess_text`` vector per text."""
        return np.array([self.preprocess_text(text) for text in texts])

    def prepare_datasets(self):
        """Vectorise all loaded texts and carve out a validation split.

        Sets ``train_vectors``, ``dev_vectors`` and ``test_vectors``, then
        splits the training vectors and labels 90/10 into ``X_train`` /
        ``X_val`` and ``y_train`` / ``y_val`` with a fixed random seed.
        Requires ``load_data`` to have been called first.
        """
        self.train_vectors = self._vectorize(self.train_data["Text"])
        self.dev_vectors = self._vectorize(self.dev_data["Text"])
        self.test_vectors = self._vectorize(self.test_data["Text"])

        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            self.train_vectors,
            self.train_data["Class"],
            test_size=0.1,
            random_state=42,
        )

    def train_model(self, epochs=30, batch_size=16):
        """Fit the network on the training split, validating on the held-out part.

        Args:
            epochs: Number of passes over the training split.
            batch_size: Number of samples per gradient update.
        """
        self.model.fit(
            self.X_train,
            self.y_train,
            validation_data=(self.X_val, self.y_val),
            epochs=epochs,
            batch_size=batch_size,
        )

    def predict_and_save(self, data, output_path):
        """Predict a 0/1 label for every row of ``data`` and write them out.

        Args:
            data: Array of text vectors as produced by ``prepare_datasets``.
            output_path: Destination file; one label per line, no header
                and no index column.
        """
        # The sigmoid output is thresholded at 0.5 to get hard labels.
        predictions = (self.model.predict(data) > 0.5).astype(int)
        pd.DataFrame(predictions).to_csv(output_path, index=False, header=False)

    def run(self):
        """Run the whole pipeline: load, vectorise, train, write predictions."""
        self.load_data("train/train.tsv", "dev-0/in.tsv", "test-A/in.tsv")
        self.prepare_datasets()
        self.train_model()
        self.predict_and_save(self.test_vectors, "test-A/out.tsv")
        self.predict_and_save(self.dev_vectors, "dev-0/out.tsv")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: build a classifier with the default embedding
    # model and run the full train-and-predict pipeline.
    TextClassifier().run()
|
5447
test-A/in.tsv
Normal file
5447
test-A/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
5447
test-A/out.tsv
Normal file
5447
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
98132
train/train.tsv
Normal file
98132
train/train.tsv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
train/train.tsv.gz
Normal file
BIN
train/train.tsv.gz
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user