# Word2Vec/main.py (83 lines, 2.9 KiB, Python)
# Timestamp shown in the repository view: 2024-05-20 07:11:19 +02:00
import pandas as pd
from gensim.models import KeyedVectors
import gensim
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.optimizers import Adam
class TextClassifier:
    """Binary text classifier over averaged word embeddings.

    Each text is tokenized, the embedding vectors of its tokens are averaged
    into one ``vector_size``-long vector, and that vector is fed to a small
    dense Keras network ending in a single sigmoid unit.
    """

    def __init__(self, vector_size=100, model_path="fasttext_100_3_polish.bin"):
        """Load the embeddings and build the (untrained) network.

        Args:
            vector_size: Length of the averaged text vector and of the
                network input. It is expected to match the dimensionality
                of the loaded embeddings; this is not validated here.
            model_path: File handed to ``KeyedVectors.load``.
        """
        self.vector_size = vector_size
        self.word2vec = KeyedVectors.load(model_path)
        self.model = self._build_model()

    def _build_model(self):
        """Return the compiled network: Dense(64) -> Dense(32) -> Dense(1)."""
        model = tf.keras.Sequential([
            tf.keras.layers.Input(shape=(self.vector_size,)),
            tf.keras.layers.Dense(64, activation="relu"),
            tf.keras.layers.Dense(32, activation="relu"),
            tf.keras.layers.Dense(1, activation="sigmoid"),
        ])
        model.compile(
            # Taken from tf.keras so the optimizer comes from the same Keras
            # implementation as the model it is attached to.
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
            loss="binary_crossentropy",
            metrics=["accuracy"],
        )
        return model

    def load_data(self, train_path, dev_path, test_path):
        """Read the three tab-separated, header-less input files.

        The training file is read with the columns ``Class`` and ``Text``;
        the dev and test files are read with the single column ``Text``.
        The frames are stored as ``train_data``, ``dev_data`` and
        ``test_data``.
        """
        # NOTE(review): pandas' default quoting treats '"' as a quote
        # character; if the texts can contain unbalanced quotes, rows may be
        # merged. Consider quoting=csv.QUOTE_NONE - confirm against the data.
        self.train_data = pd.read_csv(train_path, sep="\t", names=["Class", "Text"])
        self.dev_data = pd.read_csv(dev_path, sep="\t", names=["Text"])
        self.test_data = pd.read_csv(test_path, sep="\t", names=["Text"])

    def _embeddings(self):
        """Return the object that maps a token to its vector.

        A full gensim model exposes its vectors as ``.wv``, whereas a bare
        ``KeyedVectors`` is itself the lookup table; accept either.
        """
        return getattr(self.word2vec, "wv", self.word2vec)

    def _mean_vector(self, tokens):
        """Average the embeddings of the known *tokens*.

        Tokens missing from the embeddings are skipped. When no token is
        known the result is the all-zero vector.
        """
        embeddings = self._embeddings()
        total = np.zeros(self.vector_size)
        hits = 0
        for token in tokens:
            if token in embeddings:
                total += embeddings[token]
                hits += 1
        if hits:
            total /= hits
        return total

    def preprocess_text(self, text):
        """Turn one text into its averaged embedding vector.

        Args:
            text: The raw text. ``None``/NaN (what pandas produces for an
                empty cell) and the empty string give the zero vector;
                any other non-string value is converted with ``str``.

        Returns:
            A 1-D NumPy array of length ``vector_size``.
        """
        if not isinstance(text, str):
            # The tokenizer expects a string, so normalise missing cells and
            # stray non-string values before handing them over.
            missing = text is None or (isinstance(text, float) and np.isnan(text))
            text = "" if missing else str(text)
        if not text:
            return np.zeros(self.vector_size)
        return self._mean_vector(gensim.utils.simple_preprocess(text))

    def _vectorize(self, texts):
        """Return a 2-D array holding one averaged embedding per text."""
        rows = [self.preprocess_text(text) for text in texts]
        # The reshape only matters for empty input, where it keeps the
        # array two-dimensional: (0, vector_size) instead of (0,).
        return np.array(rows).reshape(-1, self.vector_size)

    def prepare_datasets(self):
        """Vectorize all three data sets and split off a validation set.

        Requires ``load_data`` to have been called. Sets ``train_vectors``,
        ``dev_vectors`` and ``test_vectors``, then holds out 10% of the
        training vectors/labels (fixed ``random_state=42``) as
        ``X_val``/``y_val``; the remainder is ``X_train``/``y_train``.
        """
        self.train_vectors = self._vectorize(self.train_data["Text"])
        self.dev_vectors = self._vectorize(self.dev_data["Text"])
        self.test_vectors = self._vectorize(self.test_data["Text"])
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            self.train_vectors, self.train_data["Class"], test_size=0.1, random_state=42
        )

    def train_model(self, epochs=30, batch_size=16):
        """Fit the network on the training split, validating on the held-out split.

        Args:
            epochs: Number of passes over the training split.
            batch_size: Number of samples per gradient update.
        """
        self.model.fit(
            self.X_train,
            self.y_train,
            validation_data=(self.X_val, self.y_val),
            epochs=epochs,
            batch_size=batch_size,
        )

    def predict_and_save(self, data, output_path):
        """Predict labels for *data* and write them to *output_path*.

        Model outputs strictly greater than 0.5 become ``1``, everything
        else ``0``. The labels are written one per row, without index or
        header.
        """
        predictions = (self.model.predict(data) > 0.5).astype(int)
        pd.DataFrame(predictions).to_csv(output_path, index=False, header=False)

    def run(self):
        """Run the whole pipeline: load, vectorize, train, predict, save.

        Reads ``train/train.tsv``, ``dev-0/in.tsv`` and ``test-A/in.tsv``
        and writes ``test-A/out.tsv`` and ``dev-0/out.tsv``.
        """
        self.load_data("train/train.tsv", "dev-0/in.tsv", "test-A/in.tsv")
        self.prepare_datasets()
        self.train_model()
        self.predict_and_save(self.test_vectors, "test-A/out.tsv")
        self.predict_and_save(self.dev_vectors, "dev-0/out.tsv")
if __name__ == "__main__":
    # Script entry point: build a classifier with its default settings and
    # execute the complete load / train / predict pipeline.
    TextClassifier().run()