"""Train a BiLSTM sequence tagger on tab-separated token/label data.

The script reads training, dev and test files, builds word and tag
vocabularies, trains a bidirectional LSTM and writes one ``word<TAB>tag``
line per token for the dev and test sets.
"""

import lzma
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import (
    Bidirectional,
    Dense,
    Dropout,
    Embedding,
    LSTM,
    TimeDistributed,
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


# Load data
def load_data(filepath):
    """Return the entire text content of *filepath*.

    Paths ending in ``.xz`` are opened through ``lzma``; every other path is
    opened as a regular text file. Both are decoded as UTF-8 so the result
    does not depend on the platform's default encoding.
    """
    opener = lzma.open if filepath.endswith('.xz') else open
    with opener(filepath, 'rt', encoding='utf-8') as file:
        return file.read()


# Preprocess data
def preprocess(data, labels=None):
    """Split *data* into sentences of words and their parallel label lists.

    *data* is read line by line. A blank line ends the current sentence; a
    line that, once stripped, consists of exactly two tab-separated fields
    contributes ``(word, label)``. Every other line is skipped.

    Args:
        data: Text with one ``word<TAB>label`` pair per line.
        labels: Accepted for backward compatibility; its content is not
            consulted. Labels are always taken from the second column of
            *data*. Previously labels were only collected when this argument
            was truthy, which left the training labels empty.
            NOTE(review): if the dev/test inputs are single-column with the
            labels in a separate file, pairing by line would have to be
            added here -- confirm the data layout.

    Returns:
        ``(sentences, labels_list)``: two lists of equal length, where
        ``labels_list[k][j]`` is the label of ``sentences[k][j]``.
    """
    sentences = []
    labels_list = []
    sentence = []
    label_sentence = []

    for line in data.split('\n'):
        stripped = line.strip()
        if not stripped:
            # Blank line: close the sentence collected so far, if any.
            if sentence:
                sentences.append(sentence)
                labels_list.append(label_sentence)
                sentence = []
                label_sentence = []
            continue

        parts = stripped.split('\t')
        if len(parts) != 2:
            continue  # Skip lines that don't have the expected format

        word, label = parts
        sentence.append(word)
        label_sentence.append(label)

    # Input that does not end with a blank line still has a pending sentence.
    if sentence:
        sentences.append(sentence)
        labels_list.append(label_sentence)

    return sentences, labels_list


# Extract unique tags from labels
def extract_tags(labels_list):
    """Return the set of distinct labels found in *labels_list*."""
    tags = set()
    for label_seq in labels_list:
        tags.update(label_seq)
    return tags


# Prepare data for model
def prepare_data(sentences, labels, word2idx, tag2idx, max_len):
    """Convert sentences and label sequences into padded model inputs.

    Args:
        sentences: List of word lists.
        labels: List of label lists, one per sentence (may be empty lists).
        word2idx: Word -> index mapping; must contain the key ``"UNK"``,
            which is used for words missing from the mapping.
        tag2idx: Label -> index mapping.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        ``(X, y)`` where ``X`` is the padded array of word indices and ``y``
        is a list with one one-hot encoded array per sequence.

    Raises:
        KeyError: If a label is not present in *tag2idx*.
    """
    unk = word2idx["UNK"]
    X = [[word2idx.get(word, unk) for word in sentence]
         for sentence in sentences]
    # Truncate at the end (not the Keras default 'pre') so position j of the
    # padded sequence always corresponds to token j of the sentence; the
    # prediction writer pairs tokens and predictions from position 0.
    X = pad_sequences(X, maxlen=max_len, padding='post', truncating='post')

    y = [[tag2idx[label] for label in label_seq] for label_seq in labels]
    # NOTE(review): padding uses the value 0, which is also a regular tag
    # index when tag2idx starts counting at 0.
    y = pad_sequences(y, maxlen=max_len, padding='post', truncating='post')
    y = [to_categorical(i, num_classes=len(tag2idx)) for i in y]
    return X, y


# Define model
def define_bilstm_model(vocab_size, tag_size, max_len, embedding_dim=50):
    """Build and compile the embedding -> BiLSTM -> per-token softmax model.

    Args:
        vocab_size: Number of rows of the embedding table.
        tag_size: Number of output classes per token.
        max_len: Not used by the model definition; kept in the signature
            for existing callers.
        embedding_dim: Size of each word embedding vector.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
    model.add(Bidirectional(LSTM(units=100, return_sequences=True,
                                 recurrent_dropout=0.1)))
    model.add(TimeDistributed(Dense(tag_size, activation='softmax')))
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model


# Write predictions
def write_predictions(filepath, sentences, predictions, idx2tag):
    """Write ``word<TAB>tag`` lines to *filepath*, one blank line per sentence.

    The tag of each token is the ``argmax`` of its prediction vector looked
    up in *idx2tag*. Tokens and predictions are paired with ``zip``, so
    tokens beyond the number of predictions of a sentence are not written.
    """
    with open(filepath, 'w', encoding='utf-8') as file:
        for sentence, prediction in zip(sentences, predictions):
            for word, pred in zip(sentence, prediction):
                file.write(f"{word}\t{idx2tag[np.argmax(pred)]}\n")
            file.write("\n")


# Evaluate with GEval
def evaluate_with_geval():
    """Run the ``geval`` command line tool on the dev-set predictions."""
    os.system('geval -t config.txt -r mnt/dev-0/expected.tsv -p mnt/dev-0/out.tsv')


# Paths
train_path = 'mnt/train/train.tsv'
dev_in_path = 'mnt/dev-0/in.tsv'
dev_expected_path = 'mnt/dev-0/expected.tsv'
test_in_path = 'mnt/test-A/in.tsv'
dev_out_path = 'mnt/dev-0/out.tsv'
test_out_path = 'mnt/test-A/out.tsv'

# Load data
train_data = load_data(train_path)
dev_input_data = load_data(dev_in_path)
dev_expected_data = load_data(dev_expected_path)
test_input_data = load_data(test_in_path)

# Preprocess data
train_sentences, train_labels = preprocess(train_data)
dev_sentences, dev_labels = preprocess(dev_input_data, dev_expected_data)
test_sentences, _ = preprocess(test_input_data)

# Debugging: Print the lengths of the sentences and labels
print(f"Number of training sentences: {len(train_sentences)}")
print(f"Number of training labels: {len(train_labels)}")
print(f"Number of dev sentences: {len(dev_sentences)}")
print(f"Number of dev labels: {len(dev_labels)}")

# Extract all unique tags from training and dev labels
all_tags = extract_tags(train_labels) | extract_tags(dev_labels)

# Create word and tag indices.
# Sets of strings have no stable iteration order across interpreter runs, so
# the vocabularies are sorted to keep the index assignment reproducible.
all_words = {word for sentence in train_sentences for word in sentence}
word2idx = {word: idx + 2 for idx, word in enumerate(sorted(all_words))}
word2idx["UNK"] = 1
word2idx["PAD"] = 0
tag2idx = {tag: idx for idx, tag in enumerate(sorted(all_tags))}
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

print("Tag to Index Mapping:", tag2idx)  # Debugging

# Prepare data for model
max_len = 100  # Example max length, should be based on actual data
X_train, y_train = prepare_data(train_sentences, train_labels, word2idx, tag2idx, max_len)
X_dev, y_dev = prepare_data(dev_sentences, dev_labels, word2idx, tag2idx, max_len)
# The test set has no labels; one empty label list per sentence yields an
# all-zero placeholder target that is discarded.
X_test, _ = prepare_data(test_sentences, [[]] * len(test_sentences), word2idx, tag2idx, max_len)

# Debugging: Print the shapes of the prepared data
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {len(y_train)}")
print(f"Shape of X_dev: {X_dev.shape}")
print(f"Shape of y_dev: {len(y_dev)}")

# Define model
vocab_size = len(word2idx)
tag_size = len(tag2idx)
model = define_bilstm_model(vocab_size, tag_size, max_len)

# Train model
model.fit(X_train, np.array(y_train),
          validation_data=(X_dev, np.array(y_dev)),
          epochs=3, batch_size=32, verbose=1)

# Predict
dev_predictions = model.predict(X_dev)
test_predictions = model.predict(X_test)

# Write predictions
write_predictions(dev_out_path, dev_sentences, dev_predictions, idx2tag)
write_predictions(test_out_path, test_sentences, test_predictions, idx2tag)