solution
This commit is contained in:
parent
7db435dfc6
commit
8fcefef17a
104
main.py
104
main.py
@ -0,0 +1,104 @@
|
||||
import re
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from gensim.models import KeyedVectors
|
||||
from keras.layers import Dense, Dropout
|
||||
from keras.models import Sequential
|
||||
|
||||
|
||||
def load_word2vec_model(path):
    """Load word vectors stored at ``path``.

    Args:
        path: Location handed unchanged to ``KeyedVectors.load``.

    Returns:
        Whatever ``KeyedVectors.load`` returns for that path.
    """
    vectors = KeyedVectors.load(path)
    return vectors
|
||||
|
||||
|
||||
def load_data(file_path, data_type):
    """Read one dataset file and return its contents.

    Args:
        file_path: Path to a UTF-8 encoded text file.
        data_type: ``'labels'`` (one integer per line), ``'train'``
            (``label<TAB>text`` per line) or ``'dev'`` / ``'test'``
            (one text per line).

    Returns:
        For ``'labels'``: a NumPy array of ints, one per line.
        For ``'train'``: a DataFrame with ``label`` and ``text`` columns.
        Otherwise: a DataFrame with a single ``text`` column, which stays
        empty when ``data_type`` is none of the values listed above.

    Raises:
        ValueError: If a label field cannot be parsed as an integer.
    """
    if data_type == 'labels':
        with open(file_path, 'r', encoding='utf-8') as f:
            return np.array([int(line.strip()) for line in f])

    is_train = data_type == 'train'
    is_unlabelled = data_type in ('dev', 'test')
    texts = []
    labels = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            record = line.strip()
            if is_train:
                # Split on the FIRST tab only, so a text that itself
                # contains tabs is kept instead of being dropped.
                label, separator, text = record.partition('\t')
                if separator:
                    labels.append(int(label))
                    texts.append(text)
            elif is_unlabelled:
                # Keep every line, tabs included: the rows must stay aligned
                # one-to-one with the expected-labels file and with the
                # prediction file written for this input.
                texts.append(record)

    if is_train:
        return pd.DataFrame({'label': labels, 'text': texts})
    return pd.DataFrame({'text': texts})
|
||||
|
||||
|
||||
def remove_stop_words(text):
    """Normalise ``text`` before word-vector lookup.

    Despite the name, no stop-word list is applied. The text is lower-cased,
    runs of digits and every character that is neither a word character nor
    whitespace are deleted, and each run of whitespace is collapsed to a
    single space. Leading/trailing whitespace is not stripped.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace last: deleting digits or punctuation that sat
    # between two spaces would otherwise leave a double space behind.
    return re.sub(r'\s+', ' ', text)
|
||||
|
||||
|
||||
def preprocess_data(train_path, dev_path, dev_labels_path, test_path):
    """Load the train/dev/test splits and clean their ``text`` columns.

    Each text split is read with ``load_data`` and its ``text`` column is
    passed through ``remove_stop_words``; the dev labels are read as-is.

    Returns:
        A tuple ``(train_data, dev_data, dev_labels, test_data)``.
    """
    def _load_and_clean(path, kind):
        # Read one split and normalise its text column in place.
        frame = load_data(path, kind)
        frame['text'] = frame['text'].apply(remove_stop_words)
        return frame

    train_data = _load_and_clean(train_path, 'train')
    dev_data = _load_and_clean(dev_path, 'dev')
    dev_labels = load_data(dev_labels_path, 'labels')
    test_data = _load_and_clean(test_path, 'test')
    return train_data, dev_data, dev_labels, test_data
|
||||
|
||||
|
||||
def text_to_vector(text, model):
    """Average the vectors of the words in ``text`` that ``model`` knows.

    Args:
        text: String that is split on whitespace into words.
        model: Object supporting ``word in model``, ``model[word]`` and a
            ``vector_size`` attribute.

    Returns:
        The element-wise mean of the found word vectors, or a zero vector of
        length ``model.vector_size`` when no word of ``text`` is in ``model``.
    """
    known_vectors = []
    for token in text.split():
        if token in model:
            known_vectors.append(model[token])

    # No known word at all: fall back to an all-zero vector.
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)
|
||||
|
||||
|
||||
def vectorize_data(data, model):
    """Turn every entry of ``data['text']`` into a vector and stack them.

    Returns:
        ``np.array`` built from one ``text_to_vector`` result per text, in
        the order the texts appear in ``data['text']``.
    """
    rows = []
    for entry in data['text']:
        rows.append(text_to_vector(entry, model))
    return np.array(rows)
|
||||
|
||||
|
||||
def build_model(input_dim):
    """Build and compile the feed-forward classifier.

    The network is Dense(512, relu) -> Dropout(0.5) -> Dense(128, relu) ->
    Dropout(0.5) -> Dense(1, sigmoid), compiled with binary cross-entropy
    loss, the Adam optimizer and accuracy as the reported metric.

    Args:
        input_dim: Number of input features expected by the first layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    layers = [
        Dense(512, input_dim=input_dim, activation='relu'),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network
|
||||
|
||||
|
||||
def predict_and_save(model, X, output_path):
    """Predict on ``X``, threshold at 0.5 and write the result to disk.

    Args:
        model: Object whose ``predict(X)`` result supports ``> 0.5`` and
            ``.astype(int)`` on the comparison outcome.
        X: Input passed unchanged to ``model.predict``.
        output_path: Destination file; written tab-separated with neither a
            header row nor an index column.
    """
    scores = model.predict(X)
    # Strictly greater than 0.5 maps to 1, everything else to 0.
    hard_labels = (scores > 0.5).astype(int)
    frame = pd.DataFrame(hard_labels)
    frame.to_csv(output_path, sep='\t', header=False, index=False)
|
||||
|
||||
|
||||
def main():
    """Run the whole pipeline: load, vectorize, train, predict, save.

    Reads the word vectors and the train/dev/test files from fixed relative
    paths, fits the classifier for 20 epochs using the dev split as
    validation data, and writes predictions to ``dev-0/out.tsv`` and
    ``test-A/out.tsv``.
    """
    w2v_model = load_word2vec_model("word2vec_100_3_polish.bin")
    train_data, dev_data, dev_labels, test_data = preprocess_data(
        'train/train.tsv',
        'dev-0/in.tsv',
        'dev-0/expected.tsv',
        'test-A/in.tsv',
    )

    # Vectorize the three splits in the order train, dev, test.
    X_train, X_dev, X_test = (
        vectorize_data(split, w2v_model)
        for split in (train_data, dev_data, test_data)
    )
    y_train = np.array(train_data['label'])

    model = build_model(X_train.shape[1])
    model.fit(
        X_train,
        y_train,
        epochs=20,
        batch_size=32,
        validation_data=(X_dev, dev_labels),
    )

    outputs = ((X_dev, 'dev-0/out.tsv'), (X_test, 'test-A/out.tsv'))
    for features, destination in outputs:
        predict_and_save(model, features, destination)
|
||||
|
||||
|
||||
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue
Block a user