105 lines
3.4 KiB
105 lines
3.4 KiB
import re
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from keras.layers import Dense, Dropout
from keras.models import Sequential
def load_word2vec_model(path):
    """Load word vectors from *path* via ``KeyedVectors.load``.

    Args:
        path: Location of the saved vectors file.

    Returns:
        Whatever ``KeyedVectors.load`` returns for that file.
    """
    keyed_vectors = KeyedVectors.load(path)
    return keyed_vectors
def load_data(file_path, data_type):
    """Read one of the tab-separated dataset files.

    Args:
        file_path: Path of the UTF-8 encoded file to read.
        data_type: ``'labels'`` for a file with one integer per line,
            ``'train'`` for ``label<TAB>text`` lines, or ``'dev'`` / ``'test'``
            for lines that hold only a text.

    Returns:
        * ``'labels'``: a NumPy array of ints, one per line.
        * ``'train'``: a DataFrame with columns ``label`` and ``text``.
        * anything else: a DataFrame with the single column ``text``.

    Raises:
        ValueError: If a label cannot be parsed as an integer.
    """
    if data_type == 'labels':
        with open(file_path, 'r', encoding='utf-8') as f:
            return np.array([int(line.strip()) for line in f])

    texts = []
    labels = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if data_type == 'train' and len(parts) == 2:
                label, text = parts
                # Parsed as int, the same way the 'labels' branch does it.
                labels.append(int(label))
                texts.append(text)
            elif data_type in ['dev', 'test'] and len(parts) == 1:
                texts.append(parts[0])
            # Lines with any other number of fields are skipped.
            # NOTE(review): a skipped dev/test line would shift the rows
            # relative to a separately loaded labels file - confirm the
            # input files never contain extra tabs.

    if data_type == 'train':
        return pd.DataFrame({'label': labels, 'text': texts})
    return pd.DataFrame({'text': texts})
def remove_stop_words(text):
    """Normalise *text* for word-vector lookup.

    Despite its name, this function does not filter stop words. It
    lower-cases the text, drops digits and punctuation, and collapses
    whitespace.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string, with words separated by single spaces and no
        leading or trailing whitespace.
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace last: deleting digits and punctuation can leave
    # runs of blanks behind (e.g. "a , b" -> "a  b").
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
def preprocess_data(train_path, dev_path, dev_labels_path, test_path):
    """Load the train/dev/test files and clean their ``text`` columns.

    Args:
        train_path: File passed to ``load_data`` as ``'train'``.
        dev_path: File passed to ``load_data`` as ``'dev'``.
        dev_labels_path: File passed to ``load_data`` as ``'labels'``.
        test_path: File passed to ``load_data`` as ``'test'``.

    Returns:
        A tuple ``(train_data, dev_data, dev_labels, test_data)``. The three
        data frames have ``remove_stop_words`` applied to their ``text``
        column; ``dev_labels`` is returned as loaded.
    """
    def _load_and_clean(path, kind):
        # Load one split and normalise its text column in place.
        frame = load_data(path, kind)
        frame['text'] = frame['text'].apply(remove_stop_words)
        return frame

    # Files are read in the same order as before: train, dev, labels, test.
    train_frame = _load_and_clean(train_path, 'train')
    dev_frame = _load_and_clean(dev_path, 'dev')
    dev_targets = load_data(dev_labels_path, 'labels')
    test_frame = _load_and_clean(test_path, 'test')
    return train_frame, dev_frame, dev_targets, test_frame
def text_to_vector(text, model):
    """Average the vectors of the words in *text* that *model* knows.

    Args:
        text: String that is split on whitespace into words.
        model: Object supporting ``word in model``, ``model[word]`` and a
            ``vector_size`` attribute.

    Returns:
        The element-wise mean of the found word vectors, or a zero vector of
        length ``model.vector_size`` when no word of *text* is in *model*.
    """
    known_vectors = []
    for token in text.split():
        if token in model:
            known_vectors.append(model[token])

    if len(known_vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)
def vectorize_data(data, model):
    """Turn every entry of ``data['text']`` into a vector.

    Args:
        data: Mapping-like object whose ``'text'`` entry is iterable
            (a DataFrame in this file).
        model: Word-vector model forwarded to ``text_to_vector``.

    Returns:
        A NumPy array built from one ``text_to_vector`` result per text.
    """
    rows = []
    for document in data['text']:
        rows.append(text_to_vector(document, model))
    return np.array(rows)
def build_model(input_dim):
    """Create and compile the feed-forward binary classifier.

    The network is Dense(512, relu) -> Dense(128, relu) -> Dense(1, sigmoid),
    compiled with binary cross-entropy loss, the Adam optimizer and an
    accuracy metric.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        The compiled ``Sequential`` model.
    """
    layer_stack = [
        Dense(512, input_dim=input_dim, activation='relu'),
        Dense(128, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network
def predict_and_save(model, X, output_path):
    """Predict on *X*, threshold at 0.5 and write the result as TSV.

    Args:
        model: Object with a ``predict(X)`` method returning scores.
        X: Input passed unchanged to ``model.predict``.
        output_path: Destination file; written tab-separated without header
            or index.

    Scores strictly greater than 0.5 are written as 1, all others as 0.
    """
    scores = model.predict(X)
    hard_labels = (scores > 0.5).astype(int)
    frame = pd.DataFrame(hard_labels)
    frame.to_csv(output_path, sep='\t', header=False, index=False)
def main():
    """Train the classifier and write predictions for dev-0 and test-A.

    Steps: load the word vectors, load and clean the data files, average
    word vectors per text, fit the network for 20 epochs (batch size 32)
    with the dev set as validation data, then save thresholded predictions
    to ``dev-0/out.tsv`` and ``test-A/out.tsv``.
    """
    embeddings = load_word2vec_model("word2vec_100_3_polish.bin")

    train_frame, dev_frame, dev_targets, test_frame = preprocess_data(
        'train/train.tsv',
        'dev-0/in.tsv',
        'dev-0/expected.tsv',
        'test-A/in.tsv',
    )

    train_features = vectorize_data(train_frame, embeddings)
    train_targets = np.array(train_frame['label'])
    dev_features = vectorize_data(dev_frame, embeddings)
    test_features = vectorize_data(test_frame, embeddings)

    # The input width is taken from the feature matrix itself.
    classifier = build_model(train_features.shape[1])
    classifier.fit(
        train_features,
        train_targets,
        epochs=20,
        batch_size=32,
        validation_data=(dev_features, dev_targets),
    )

    # Dev predictions are written first, then test, as before.
    for features, destination in (
        (dev_features, 'dev-0/out.tsv'),
        (test_features, 'test-A/out.tsv'),
    ):
        predict_and_save(classifier, features, destination)
if __name__ == "__main__":