import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.optimizers import Adam
import gensim.downloader as api

# Define the file paths (adjust to match the dataset layout)
train_file_path = 'mnt/train/train.tsv'
dev_file_path = 'mnt/dev-0/in.tsv'
test_file_path = 'mnt/test-A/in.tsv'

# Load data with error handling for problematic lines
def load_tsv(file_path):
    try:
        df = pd.read_csv(file_path, sep='\t', header=None, names=['text', 'label'], on_bad_lines='skip')
        return df
    except pd.errors.ParserError as e:
        print(f"Error parsing {file_path}: {e}")
        # Fall back to reading the file manually, keeping only well-formed two-column rows
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
        data = [line.strip().split('\t') for line in lines if len(line.strip().split('\t')) == 2]
        return pd.DataFrame(data, columns=['text', 'label'])

# Load the data
train_df = load_tsv(train_file_path)
dev_df = pd.read_csv(dev_file_path, sep='\t', header=None, names=['text'])
test_df = pd.read_csv(test_file_path, sep='\t', header=None, names=['text'])

# Load the pre-trained word2vec model trained on Google News
word2vec_model = api.load('word2vec-google-news-300')

# Tokenize and pad sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['text'])
vocab_size = len(tokenizer.word_index) + 1
max_length = max(train_df['text'].apply(lambda x: len(x.split())))

X_train = tokenizer.texts_to_sequences(train_df['text'])
X_train = pad_sequences(X_train, maxlen=max_length, padding='post')
X_dev = tokenizer.texts_to_sequences(dev_df['text'])
X_dev = pad_sequences(X_dev, maxlen=max_length, padding='post')
X_test = tokenizer.texts_to_sequences(test_df['text'])
X_test = pad_sequences(X_test, maxlen=max_length, padding='post')

# Labels may be read as strings (e.g. after the manual fallback), so cast to int
y_train = train_df['label'].astype(int).values

# Create the embedding matrix from the pre-trained word2vec vectors
embedding_matrix = np.zeros((vocab_size, word2vec_model.vector_size))
for word, index in tokenizer.word_index.items():
    if word in word2vec_model:
        embedding_matrix[index] = word2vec_model[word]

# Define the model: frozen pre-trained embeddings followed by a small dense classifier
model = Sequential([
    Embedding(vocab_size, word2vec_model.vector_size, weights=[embedding_matrix],
              input_length=max_length, trainable=False),
    Flatten(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, verbose=2, validation_split=0.2)

# Predictions (ravel to 1-D so they can be assigned as DataFrame columns)
dev_predictions = (model.predict(X_dev) > 0.5).astype(int).ravel()
test_predictions = (model.predict(X_test) > 0.5).astype(int).ravel()

# Save predictions
dev_df['prediction'] = dev_predictions
test_df['prediction'] = test_predictions
dev_df[['prediction']].to_csv('/mnt/data/dev-0/out.tsv', sep='\t', index=False, header=False)
test_df[['prediction']].to_csv('/mnt/data/test-A/out.tsv', sep='\t', index=False, header=False)