"""Train a dense binary text classifier on averaged word2vec embeddings."""

import re

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from keras.layers import Dense, Dropout
from keras.models import Sequential

# Compiled once at import time; remove_stop_words runs for every document.
_DIGITS_RE = re.compile(r'\d+')
_NON_WORD_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')

_DATA_TYPES = ('train', 'dev', 'test', 'labels')


def load_word2vec_model(path):
    """Load word vectors from *path* with ``KeyedVectors.load``."""
    return KeyedVectors.load(path)


def load_data(file_path, data_type):
    """Read one dataset file.

    Args:
        file_path: Path to a UTF-8 encoded text file.
        data_type: One of ``'train'`` (``label<TAB>text`` per line),
            ``'dev'`` / ``'test'`` (one text per line) or ``'labels'``
            (one integer per line).

    Returns:
        For ``'labels'`` a NumPy array of ints; for ``'train'`` a DataFrame
        with ``label`` and ``text`` columns; for ``'dev'`` / ``'test'`` a
        DataFrame with a single ``text`` column.

    Raises:
        ValueError: If *data_type* is not one of the supported values, or a
            label field cannot be parsed as an integer.
    """
    if data_type not in _DATA_TYPES:
        raise ValueError('unknown data_type: {!r}'.format(data_type))

    if data_type == 'labels':
        with open(file_path, 'r', encoding='utf-8') as f:
            return np.array([int(line.strip()) for line in f])

    texts = []
    labels = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if data_type == 'train':
                # Split on the first tab only, so a text that itself
                # contains tabs is kept instead of being dropped.
                parts = line.split('\t', 1)
                if len(parts) != 2:
                    # No label/text separator: nothing usable to train on.
                    continue
                labels.append(int(parts[0]))
                texts.append(parts[1])
            else:
                # Keep every dev/test line, even one containing tabs.
                # Dropping rows here would misalign the features with the
                # label file and shorten the prediction output.
                texts.append(line)

    if data_type == 'train':
        return pd.DataFrame({'label': labels, 'text': texts})
    return pd.DataFrame({'text': texts})


def remove_stop_words(text):
    """Normalize *text* for embedding lookup.

    Despite the name, no stop-word list is applied: the text is lowercased,
    digit runs and punctuation are deleted, and whitespace runs are
    collapsed to a single space.
    """
    text = text.lower()
    text = _DIGITS_RE.sub('', text)
    text = _NON_WORD_RE.sub('', text)
    # Collapse whitespace last so that gaps left by removed digits or
    # punctuation do not survive as repeated spaces.
    return _WHITESPACE_RE.sub(' ', text)


def preprocess_data(train_path, dev_path, dev_labels_path, test_path):
    """Load all splits and normalize their ``text`` columns.

    Returns:
        A tuple ``(train_data, dev_data, dev_labels, test_data)`` where the
        three ``*_data`` items are DataFrames and ``dev_labels`` is the
        array returned by ``load_data(..., 'labels')``.
    """
    train_data = load_data(train_path, 'train')
    train_data['text'] = train_data['text'].apply(remove_stop_words)

    dev_data = load_data(dev_path, 'dev')
    dev_data['text'] = dev_data['text'].apply(remove_stop_words)
    dev_labels = load_data(dev_labels_path, 'labels')

    test_data = load_data(test_path, 'test')
    test_data['text'] = test_data['text'].apply(remove_stop_words)

    return train_data, dev_data, dev_labels, test_data


def text_to_vector(text, model):
    """Return the mean embedding of the whitespace-separated words in *text*.

    Words not present in *model* are ignored. If no word is known, a zero
    vector of length ``model.vector_size`` is returned instead.
    """
    word_vecs = [model[word] for word in text.split() if word in model]
    if word_vecs:
        return np.mean(word_vecs, axis=0)
    return np.zeros(model.vector_size)


def vectorize_data(data, model):
    """Stack the ``text_to_vector`` result for every entry of ``data['text']``."""
    return np.array([text_to_vector(text, model) for text in data['text']])


def build_model(input_dim):
    """Build and compile the classifier network.

    Two ReLU hidden layers (512 and 128 units), each followed by dropout
    of 0.5, feed a single sigmoid output unit. The model is compiled with
    binary cross-entropy loss, the Adam optimizer and an accuracy metric.
    """
    model = Sequential()
    model.add(Dense(512, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model


def predict_and_save(model, X, output_path):
    """Predict on *X* and write the thresholded results to *output_path*.

    Outputs strictly greater than 0.5 become 1, everything else 0. The
    file is written tab-separated, without header or index.
    """
    predictions = model.predict(X)
    predictions = (predictions > 0.5).astype(int)
    pd.DataFrame(predictions).to_csv(output_path, sep='\t', header=False,
                                     index=False)


def main():
    """Train on the train split, validate on dev, and write predictions."""
    w2v_model_path = "word2vec_100_3_polish.bin"
    train_path = 'train/train.tsv'
    dev_path = 'dev-0/in.tsv'
    dev_labels_path = 'dev-0/expected.tsv'
    test_path = 'test-A/in.tsv'

    w2v_model = load_word2vec_model(w2v_model_path)
    train_data, dev_data, dev_labels, test_data = preprocess_data(
        train_path, dev_path, dev_labels_path, test_path)

    X_train = vectorize_data(train_data, w2v_model)
    y_train = np.array(train_data['label'])
    X_dev = vectorize_data(dev_data, w2v_model)
    X_test = vectorize_data(test_data, w2v_model)

    model = build_model(X_train.shape[1])
    model.fit(X_train, y_train, epochs=20, batch_size=32,
              validation_data=(X_dev, dev_labels))

    predict_and_save(model, X_dev, 'dev-0/out.tsv')
    predict_and_save(model, X_test, 'test-A/out.tsv')


if __name__ == "__main__":
    main()