# %%
import os
import lzma

folders = ["./challenging-america-word-gap-prediction/dev-0",
           "./challenging-america-word-gap-prediction/test-A",
           "./challenging-america-word-gap-prediction/train"]

# Decompress every .tsv.xz file, writing the plain .tsv next to it
for folder in folders:
    for file in os.listdir(folder):
        if file.endswith(".tsv.xz"):
            file_path = os.path.join(folder, file)
            output_path = os.path.splitext(file_path)[0]  # Remove the .xz extension
            with lzma.open(file_path, "rb") as compressed_file:
                with open(output_path, "wb") as output_file:
                    output_file.write(compressed_file.read())

# %%
import nltk
nltk.download('punkt')

# %%
import pandas as pd

# Load the data
in_df = pd.read_csv('./challenging-america-word-gap-prediction/train/in.tsv', sep='\t', header=None, on_bad_lines='warn')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None, on_bad_lines='warn')

# Print out the first few rows of each DataFrame
print("in_df:")
print(in_df.head())
print("\nexpected_df:")
print(expected_df.head())

# Print out more information about each DataFrame
print("\nin_df info:")
print(in_df.info())
print("\nexpected_df info:")
print(expected_df.info())

# %%
# Drop unnecessary columns
columns_to_drop = [0, 1, 2, 3, 4, 5]  # Column indices to drop
in_df.drop(columns_to_drop, axis=1, inplace=True)

# Rename remaining columns for clarity
in_df.columns = ['text_1', 'text_2']

# %%
in_df['text_2'].fillna('', inplace=True)

# %%
from tqdm import tqdm

# Define functions to replace literal '\n' and '\t' sequences with spaces
def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

def replace_tabulation(text):
    if isinstance(text, str):
        return text.replace('\\t', ' ')
    return text

# Apply the functions to the 'text_1' and 'text_2' columns and show progress
tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_tabulation)

tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_tabulation)

# %%
print(in_df.head())

# %%
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm

# If not already done, download the NLTK English stopwords and the Punkt tokenizer models
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    words = word_tokenize(text)
    # Remove stopwords and stem the words
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    # Join the words back into a single string
    text = ' '.join(words)
    return text

# Apply the preprocessing to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)

# %%
print(in_df.head())

# %%
# Save 'in_df' DataFrame to a .tsv file
in_df.to_csv('preprocessed_text1_text2.tsv', sep='\t', index=False)
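# %%
# Optional sanity check (not part of the original pipeline): eyeball what preprocess_text
# does to a short made-up sentence — lowercasing, punctuation stripping, stopword removal
# and Porter stemming — before committing to running it over the whole training set.
sample_sentence = "The Quick Brown Fox, it was said, jumped over the lazy dogs."
print(preprocess_text(sample_sentence))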
# %%
import re

def handle_numbers_and_special_chars(text):
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove special characters
    text = re.sub(r'\W+', ' ', text)
    return text

# Apply the function to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(handle_numbers_and_special_chars)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(handle_numbers_and_special_chars)

# %%
from spellchecker import SpellChecker

spell = SpellChecker()

def correct_spelling(text):
    # Tokenize the text
    words = text.split()
    # Correct spelling, dropping words the checker cannot resolve
    corrected_words = [spell.correction(word) if spell.correction(word) is not None else '' for word in words]
    # Join the words back into a single string
    text = ' '.join(corrected_words)
    return text

# Apply the spelling correction to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Spelling Correction 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(correct_spelling)

tqdm.pandas(desc="Spelling Correction 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(correct_spelling)

# %%
# Define a function to concatenate 'text_1' and 'text_2'
def concatenate_texts(row):
    return str(row['text_1']) + ' ' + str(row['text_2'])

# Apply the function to each row and show progress
tqdm.pandas(desc="Concatenating 'text_1' and 'text_2'")
in_df['text'] = in_df.progress_apply(concatenate_texts, axis=1)

# Now you can drop 'text_1' and 'text_2' columns if you want
in_df.drop(['text_1', 'text_2'], axis=1, inplace=True)

# %%
from tqdm import tqdm
tqdm.pandas()

# Load the preprocessed data
in_df = pd.read_csv('preprocessed_text1_text2.tsv', sep='\t')

# Load the expected words
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None, on_bad_lines='warn')
expected_df.columns = ['expected_word']

# Add the expected words to in_df
in_df = pd.concat([in_df, expected_df], axis=1)

# Define a function to concatenate 'text_1', 'expected_word' and 'text_2'
def concatenate_texts(row):
    return str(row['text_1']) + ' ' + str(row['expected_word']) + ' ' + str(row['text_2'])

# Apply the function to each row and show progress
in_df['unmasked_text'] = in_df.progress_apply(concatenate_texts, axis=1)

# %%
in_df['unmasked_text'].to_csv('preprocessed_text_join_unmask.tsv', sep='\t', index=False)

# %%
# Save the 'unmasked_text' column to a plain text file for KenLM training
in_df['unmasked_text'].to_csv('training_data.txt', index=False, header=False)

# %%
!lmplz -o 5 --discount_fallback < training_data.txt > language_model.arpa

# %%
!build_binary language_model.arpa language_model.binary

# %%
# Alternative model: order 6, memory capped at 80% of RAM, low-count higher-order n-grams pruned.
# Note that this overwrites language_model.arpa; language_model.binary above was built from the
# order-5 model, so rerun build_binary if you want to use this one.
!lmplz -o 6 --discount_fallback -S 80% --prune 0 0 1 1 1 1 < training_data.txt > language_model.arpa

# %%
!wc -l ./challenging-america-word-gap-prediction/dev-0/in.tsv
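# %%
# Optional sanity check (not in the original notebook): peek at the header of the ARPA file
# produced by lmplz. The \data\ section lists how many n-grams of each order were kept,
# which is a quick way to confirm that training and pruning behaved as intended.
!head -n 15 language_model.arpa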
# %%
import pandas as pd
import csv

# Load the data
try:
    in_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/in.tsv', sep='\t', header=None, on_bad_lines='error')
except Exception as e:
    print(e)

expected_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/expected.tsv', sep='\t', header=None, on_bad_lines='warn', quoting=csv.QUOTE_NONE)

print(in_df.shape[0])

# Drop unnecessary columns
columns_to_drop = [0, 1, 2, 3, 4, 5]  # Column indices to drop
in_df.drop(columns_to_drop, axis=1, inplace=True)

# Rename remaining columns for clarity
in_df.columns = ['text_1', 'text_2']

in_df['text_1'].fillna('placeholder', inplace=True)
in_df['text_2'].fillna('placeholder', inplace=True)

# %%
import pandas as pd
import csv

# Placeholder line
placeholder_line = ['placeholder'] * 8  # Adjust the number of fields as needed

# Read the file line by line
with open('./challenging-america-word-gap-prediction/dev-0/in.tsv', 'r') as f:
    lines = f.readlines()

# Split each line into fields and replace problematic lines with the placeholder line
lines = [line.strip().split('\t') if len(line.strip().split('\t')) == 8 else placeholder_line for line in lines]

# Convert the list of lines into a DataFrame
in_df = pd.DataFrame(lines)

# Print the number of rows in the DataFrame
print(in_df.shape[0])

# Drop unnecessary columns
columns_to_drop = [0, 1, 2, 3, 4, 5]  # Column indices to drop
in_df.drop(columns_to_drop, axis=1, inplace=True)

# Rename remaining columns for clarity
in_df.columns = ['text_1', 'text_2']

in_df['text_1'].fillna('placeholder', inplace=True)
in_df['text_2'].fillna('placeholder', inplace=True)

# %%
print(in_df.shape[0])

# %%
from tqdm import tqdm

# Define functions to replace literal '\n' and '\t' sequences with spaces
def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

def replace_tabulation(text):
    if isinstance(text, str):
        return text.replace('\\t', ' ')
    return text

# Apply the functions to the 'text_1' and 'text_2' columns and show progress
tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_tabulation)

tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_tabulation)

# %%
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm

# If not already done, download the NLTK English stopwords and the Punkt tokenizer models
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    words = word_tokenize(text)
    # Remove stopwords and stem the words
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    # Join the words back into a single string
    text = ' '.join(words)
    return text

# Apply the preprocessing to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)

# %%
import re

def handle_numbers_and_special_chars(text):
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove special characters
    text = re.sub(r'\W+', ' ', text)
    return text

# Apply the function to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(handle_numbers_and_special_chars)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(handle_numbers_and_special_chars)

# %%
in_df.to_csv('preprocessed_dev_text1_text2.tsv', sep='\t', index=False)

# %%
import kenlm

# Load the language model
model = kenlm.Model('language_model.binary')
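# %%
# Optional sanity check (not in the original notebook): KenLM's score() returns the total
# log10 probability of a whole sentence (BOS/EOS are added by default), and full_scores()
# yields per-word (log10 prob, matched n-gram length, is_oov) tuples. The example string
# below is made up purely for illustration; remember the model was trained on stemmed,
# stopword-free text, so raw English input will contain OOV tokens.
print(model.order)  # n-gram order of the loaded binary model
example = 'presid said state'
print(model.score(example, bos=True, eos=True))
for logprob, ngram_len, oov in model.full_scores(example):
    print(logprob, ngram_len, oov)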
# %%
import numpy as np

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

def predict_missing_word(model, context_1, context_2):
    # Define the vocabulary as the words seen in the two contexts
    vocabulary = set(' '.join([context_1, context_2]).split())

    # Initialize a dictionary to store the words and their scores
    word_scores = {}

    # Iterate over the vocabulary
    for word in vocabulary:
        try:
            # Generate the sentence
            sentence = f"{context_1} {word} {context_2}"
            # Score the sentence and store it in the dictionary
            word_scores[word] = model.score(sentence)
        except Exception as e:
            print(f"Error processing word '{word}': {e}")
            continue

    # If no word was found, return None for all values
    if not word_scores:
        return None, None, None

    # Convert the scores to probabilities using the softmax function
    word_probs = {word: max(0.001, prob) for word, prob in zip(word_scores.keys(), softmax(list(word_scores.values())))}

    # Find the word with the highest probability
    best_word, best_prob = max(word_probs.items(), key=lambda x: x[1])

    # Calculate the sum of probabilities for the other words
    other_probs_sum = sum(prob for word, prob in word_probs.items() if word != best_word)

    return best_word, best_prob, other_probs_sum
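# %%
# Quick usage sketch (not part of the original flow): call predict_missing_word on one
# made-up left/right context pair to see the shape of its output — the best candidate
# word, its softmax "probability", and the summed probability of the other candidates.
# The contexts below are invented and written to resemble the preprocessed (stemmed) text.
left_ctx = 'presid lincoln gave speech'
right_ctx = 'peopl gather town squar'
print(predict_missing_word(model, left_ctx, right_ctx))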
# %%
# Initialize a counter for the correct predictions
correct_predictions = 0

# Open the output file
with open('out.tsv', 'w') as f:
    # Iterate over the rows of the input DataFrame and the expected DataFrame
    for i, ((_, input_row), expected_word) in enumerate(zip(in_df.iterrows(), expected_df[0])):
        try:
            # Get the context
            context_1 = input_row['text_1']
            context_2 = input_row['text_2']

            # Predict the missing word and get the probabilities
            predicted_word, prob, other_probs_sum = predict_missing_word(model, context_1, context_2)

            # If any of the values are None, use placeholder values
            if predicted_word is None:
                predicted_word = 'placeholder'
            if prob is None:
                prob = 0.001
            if other_probs_sum is None:
                other_probs_sum = 0.001

            # Write the output to the file
            f.write(f"{predicted_word}:{prob:.4f} :{other_probs_sum:.4f}\n")

            # Check if the prediction is correct
            if predicted_word == expected_word:
                correct_predictions += 1

            # Log progress every 1000 iterations
            if i % 1000 == 0:
                print(f"Processed {i} rows. Current accuracy: {correct_predictions / (i+1)}")
        except Exception as e:
            print(f"Error processing row {i}: {e}")

# Calculate the accuracy
accuracy = correct_predictions / len(in_df)
print(f"The accuracy of the model is {accuracy}")

# %%
import pandas as pd
import csv

# Placeholder line
placeholder_line = ['placeholder'] * 8  # Adjust the number of fields as needed

# Read the file line by line
with open('./challenging-america-word-gap-prediction/test-A/in.tsv', 'r') as f:
    lines = f.readlines()

# Split each line into fields and replace problematic lines with the placeholder line
lines = [line.strip().split('\t') if len(line.strip().split('\t')) == 8 else placeholder_line for line in lines]

# Convert the list of lines into a DataFrame
in_df = pd.DataFrame(lines)

# Print the number of rows in the DataFrame
print(in_df.shape[0])

# Drop unnecessary columns
columns_to_drop = [0, 1, 2, 3, 4, 5]  # Column indices to drop
in_df.drop(columns_to_drop, axis=1, inplace=True)

# Rename remaining columns for clarity
in_df.columns = ['text_1', 'text_2']

in_df['text_1'].fillna('placeholder', inplace=True)
in_df['text_2'].fillna('placeholder', inplace=True)

# %%
from tqdm import tqdm

# Define functions to replace literal '\n' and '\t' sequences with spaces
def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

def replace_tabulation(text):
    if isinstance(text, str):
        return text.replace('\\t', ' ')
    return text

# Apply the functions to the 'text_1' and 'text_2' columns and show progress
tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_tabulation)

tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_tabulation)

# %%
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm

# If not already done, download the NLTK English stopwords and the Punkt tokenizer models
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    words = word_tokenize(text)
    # Remove stopwords and stem the words
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    # Join the words back into a single string
    text = ' '.join(words)
    return text

# Apply the preprocessing to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)

# %%
import re

def handle_numbers_and_special_chars(text):
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove special characters
    text = re.sub(r'\W+', ' ', text)
    return text

# Apply the function to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(handle_numbers_and_special_chars)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(handle_numbers_and_special_chars)

# %%
in_df.to_csv('preprocessed_test_text1_text2.tsv', sep='\t', index=False)
# %%
# Generate predictions for test-A. Unlike dev-0, this split has no expected.tsv, so only the
# predictions are written (note: this reuses the file name 'out.tsv' and overwrites the dev-0
# output written above).
with open('out.tsv', 'w') as f:
    # Iterate over the rows of the input DataFrame
    for i, (_, input_row) in enumerate(in_df.iterrows()):
        try:
            # Get the context
            context_1 = input_row['text_1']
            context_2 = input_row['text_2']

            # Predict the missing word and get the probabilities
            predicted_word, prob, other_probs_sum = predict_missing_word(model, context_1, context_2)

            # If any of the values are None, use placeholder values
            if predicted_word is None:
                predicted_word = 'placeholder'
            if prob is None:
                prob = 0.001
            if other_probs_sum is None:
                other_probs_sum = 0.001

            # Write the output to the file
            f.write(f"{predicted_word}:{prob:.4f} :{other_probs_sum:.4f}\n")

            # Log progress every 1000 iterations
            if i % 1000 == 0:
                print(f"Processed {i} rows.")
        except Exception as e:
            print(f"Error processing row {i}: {e}")
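# %%
# Optional sanity check (not in the original notebook): the challenge format expects exactly
# one prediction line per input line, so comparing the two counts catches rows that were
# silently skipped by the except branch above.
!wc -l out.tsv ./challenging-america-word-gap-prediction/test-A/in.tsv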