diff --git a/main.py b/main.py
new file mode 100644
index 0000000..2688374
--- /dev/null
+++ b/main.py
@@ -0,0 +1,82 @@
+import numpy as np
+import pandas as pd
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Dense, Embedding, Flatten
+from tensorflow.keras.optimizers import Adam
+import gensim.downloader as api
+
+# Define the file paths
+train_file_path = 'mnt/train/train.tsv'
+dev_file_path = 'mnt/dev-0/in.tsv'
+test_file_path = 'mnt/test-A/in.tsv'
+
+# Load data with error handling for problematic lines
+def load_tsv(file_path):
+    try:
+        df = pd.read_csv(file_path, sep='\t', header=None, names=['text', 'label'], on_bad_lines='skip')
+        return df
+    except pd.errors.ParserError as e:
+        print(f"Error parsing {file_path}: {e}")
+        # Fall back to a manual parse, keeping only lines with exactly two tab-separated fields
+        with open(file_path, 'r', encoding='utf-8') as file:
+            lines = file.readlines()
+        data = [line.strip().split('\t') for line in lines if len(line.strip().split('\t')) == 2]
+        return pd.DataFrame(data, columns=['text', 'label'])
+
+# Load the data
+train_df = load_tsv(train_file_path)
+dev_df = pd.read_csv(dev_file_path, sep='\t', header=None, names=['text'])
+test_df = pd.read_csv(test_file_path, sep='\t', header=None, names=['text'])
+
+# Load pre-trained word2vec model from Google News
+word2vec_model = api.load('word2vec-google-news-300')
+
+# Tokenize and pad sequences
+tokenizer = Tokenizer()
+tokenizer.fit_on_texts(train_df['text'])
+vocab_size = len(tokenizer.word_index) + 1
+
+max_length = max(train_df['text'].apply(lambda x: len(x.split())))
+
+X_train = tokenizer.texts_to_sequences(train_df['text'])
+X_train = pad_sequences(X_train, maxlen=max_length, padding='post')
+
+X_dev = tokenizer.texts_to_sequences(dev_df['text'])
+X_dev = pad_sequences(X_dev, maxlen=max_length, padding='post')
+
+X_test = tokenizer.texts_to_sequences(test_df['text'])
+X_test = pad_sequences(X_test, maxlen=max_length, padding='post')
+
+y_train = train_df['label'].values
+
+# Create the embedding matrix: row i holds the word2vec vector for tokenizer index i
+embedding_matrix = np.zeros((vocab_size, word2vec_model.vector_size))
+for word, index in tokenizer.word_index.items():
+    if word in word2vec_model:
+        embedding_matrix[index] = word2vec_model[word]
+
+# Define the model with a frozen pre-trained embedding layer
+model = Sequential([
+    Embedding(vocab_size, word2vec_model.vector_size, weights=[embedding_matrix], input_length=max_length, trainable=False),
+    Flatten(),
+    Dense(10, activation='relu'),
+    Dense(1, activation='sigmoid')
+])
+
+model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
+
+# Train the model
+model.fit(X_train, y_train, epochs=10, verbose=2, validation_split=0.2)
+
+# Predict binary labels (model.predict returns shape (n, 1); ravel to 1-D)
+dev_predictions = (model.predict(X_dev) > 0.5).astype(int).ravel()
+test_predictions = (model.predict(X_test) > 0.5).astype(int).ravel()
+
+# Save predictions next to the corresponding in.tsv files
+dev_df['prediction'] = dev_predictions
+test_df['prediction'] = test_predictions
+
+dev_df[['prediction']].to_csv('mnt/dev-0/out.tsv', sep='\t', index=False, header=False)
+test_df[['prediction']].to_csv('mnt/test-A/out.tsv', sep='\t', index=False, header=False)
diff --git a/mnt b/mnt
new file mode 160000
index 0000000..9cb2fb2
--- /dev/null
+++ b/mnt
@@ -0,0 +1 @@
+Subproject commit 9cb2fb26126561611a5539564fac6b5dbcbb0ca2