import pandas as pd
from gensim.models import KeyedVectors
import gensim
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.optimizers import Adam


class TextClassifier:
    def __init__(self, vector_size=100, model_path="fasttext_100_3_polish.bin"):
        self.vector_size = vector_size
        # KeyedVectors.load() returns the keyed-vector store itself, so it is
        # queried directly below (it has no .wv attribute of its own).
        self.word2vec = KeyedVectors.load(model_path)
        self.model = self._build_model()

    def _build_model(self):
        # Simple feed-forward binary classifier over averaged word embeddings.
        model = tf.keras.Sequential([
            tf.keras.layers.Input(shape=(self.vector_size,)),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(32, activation="relu"),
            tf.keras.layers.Dense(1, activation="sigmoid")
        ])
        model.compile(
            optimizer=Adam(learning_rate=0.001),
            loss="binary_crossentropy",
            metrics=["accuracy"]
        )
        return model

    def load_data(self, train_path, dev_path, test_path):
        # Training data is tab-separated with a label column; dev/test files contain text only.
        self.train_data = pd.read_csv(train_path, sep="\t", names=["Class", "Text"])
        self.dev_data = pd.read_csv(dev_path, sep="\t", names=["Text"])
        self.test_data = pd.read_csv(test_path, sep="\t", names=["Text"])

    def preprocess_text(self, text):
        # Tokenize and average the embeddings of all in-vocabulary tokens.
        tokens = gensim.utils.simple_preprocess(text)
        vector = np.zeros(self.vector_size)
        count = 0
        for token in tokens:
            if token in self.word2vec:
                vector += self.word2vec[token]
                count += 1
        if count > 0:
            vector /= count
        return vector

    def prepare_datasets(self):
        self.train_vectors = np.array(
            [self.preprocess_text(text) for text in self.train_data["Text"]]
        )
        self.dev_vectors = np.array(
            [self.preprocess_text(text) for text in self.dev_data["Text"]]
        )
        self.test_vectors = np.array(
            [self.preprocess_text(text) for text in self.test_data["Text"]]
        )

        # Hold out 10% of the training vectors for validation during training.
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            self.train_vectors, self.train_data["Class"], test_size=0.1, random_state=42
        )

    def train_model(self, epochs=30, batch_size=16):
        self.model.fit(
            self.X_train,
            self.y_train,
            validation_data=(self.X_val, self.y_val),
            epochs=epochs,
            batch_size=batch_size
        )

    def predict_and_save(self, data, output_path):
        # Threshold the sigmoid output at 0.5 and write one label per line.
        predictions = (self.model.predict(data) > 0.5).astype(int)
        pd.DataFrame(predictions).to_csv(output_path, index=False, header=False)

    def run(self):
        self.load_data("train/train.tsv", "dev-0/in.tsv", "test-A/in.tsv")
        self.prepare_datasets()
        self.train_model()
        self.predict_and_save(self.test_vectors, "test-A/out.tsv")
        self.predict_and_save(self.dev_vectors, "dev-0/out.tsv")

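# Note (assumption, not part of the original script): KeyedVectors.load() only
# reads files saved with gensim's own .save(). If "fasttext_100_3_polish.bin" is
# a native Facebook fastText binary instead, the vectors could be loaded with
# gensim's load_facebook_vectors, as sketched below, and assigned to
# TextClassifier.word2vec in place of the KeyedVectors.load() call.
def load_fasttext_binary(path="fasttext_100_3_polish.bin"):
    # load_facebook_vectors returns FastTextKeyedVectors, which supports the same
    # `token in kv` / `kv[token]` lookups used in preprocess_text, plus subword
    # vectors for out-of-vocabulary tokens.
    from gensim.models.fasttext import load_facebook_vectors
    return load_facebook_vectors(path)
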
if __name__ == "__main__":
    classifier = TextClassifier()
    classifier.run()