# 06_kenlm/run1.py

# %%
import os
import lzma
folders = ["./challenging-america-word-gap-prediction/dev-0",
           "./challenging-america-word-gap-prediction/test-A",
           "./challenging-america-word-gap-prediction/train"]

for folder in folders:
    for file in os.listdir(folder):
        if file.endswith(".tsv.xz"):
            file_path = os.path.join(folder, file)
            output_path = os.path.splitext(file_path)[0]  # Remove the .xz extension
            with lzma.open(file_path, "rb") as compressed_file:
                with open(output_path, "wb") as output_file:
                    output_file.write(compressed_file.read())
# %%
import nltk
nltk.download('punkt')
# %%
import pandas as pd
# Load the data
in_df = pd.read_csv('./challenging-america-word-gap-prediction/train/in.tsv', sep='\t', header=None, on_bad_lines='warn')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None, on_bad_lines='warn')
# Print out the first few rows of each DataFrame
print("in_df:")
print(in_df.head())
print("\nexpected_df:")
print(expected_df.head())
print("\nhate_speech_info_df:")
# Print out more information about each DataFrame
print("\nin_df info:")
print(in_df.info())
print("\nexpected_df info:")
print(expected_df.info())
# %%
# Drop unnecessary columns
columns_to_drop = [0, 1, 2, 3, 4, 5] # Column indices to drop
in_df.drop(columns_to_drop, axis=1, inplace=True)
# Rename remaining columns for clarity
in_df.columns = ['text_1', 'text_2']
# %%
in_df['text_2'].fillna('', inplace=True)
# %%
from tqdm import tqdm
# Replace the literal '\n' and '\t' escape sequences in the raw text with spaces
def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

def replace_tabulation(text):
    if isinstance(text, str):
        return text.replace('\\t', ' ')
    return text
# Apply the function to 'text_1' and 'text_2' columns and show progress
tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)
tqdm.pandas(desc="Replacing '\\t' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_tabulation)
tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)
tqdm.pandas(desc="Replacing '\\t' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_tabulation)
# %%
print(in_df.head())
# %%
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm
# If not already done, download the NLTK English stopwords and the Punkt Tokenizer Models
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    words = word_tokenize(text)
    # Remove stopwords and stem the words
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    # Join the words back into a single string
    text = ' '.join(words)
    return text
# Apply the preprocessing to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)
tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)
# %%
print(in_df.head())
# %%
# Save 'in_df' DataFrame to a .tsv file
in_df.to_csv('preprocessed_text1_text2.tsv', sep='\t', index=False)
# %%
import re
def handle_numbers_and_special_chars(text):
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove special characters
    text = re.sub(r'\W+', ' ', text)
    return text
# Apply the function to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(handle_numbers_and_special_chars)
tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(handle_numbers_and_special_chars)
# %%
from spellchecker import SpellChecker
spell = SpellChecker()
def correct_spelling(text):
    # Tokenize the text
    words = text.split()
    # Correct spelling (call correction() only once per word; it may return None)
    corrected_words = [spell.correction(word) or '' for word in words]
    # Join the words back into a single string
    text = ' '.join(corrected_words)
    return text
# Apply the spelling correction to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Spelling Correction 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(correct_spelling)
tqdm.pandas(desc="Spelling Correction 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(correct_spelling)
# %%
# Define a function to concatenate 'text_1' and 'text_2'
def concatenate_texts(row):
    return str(row['text_1']) + ' <MASK> ' + str(row['text_2'])
# Apply the function to each row and show progress
tqdm.pandas(desc="Concatenating 'text_1' and 'text_2'")
in_df['text'] = in_df.progress_apply(concatenate_texts, axis=1)
# Now you can drop 'text_1' and 'text_2' columns if you want
in_df.drop(['text_1', 'text_2'], axis=1, inplace=True)
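# Note: the masked 'text' column built above is not used later in this script;
# the next cell reloads the preprocessed columns from disk and builds the
# unmasked training text instead.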
# %%
from tqdm import tqdm
tqdm.pandas()
# Load the preprocessed data
in_df = pd.read_csv('preprocessed_text1_text2.tsv', sep='\t')
# Load the expected words
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None, on_bad_lines='warn')
expected_df.columns = ['expected_word']
# Add the expected words to in_df
in_df = pd.concat([in_df, expected_df], axis=1)
# Define a function to concatenate 'text_1' and 'expected_word' and 'text_2'
def concatenate_texts(row):
    return str(row['text_1']) + ' ' + str(row['expected_word']) + ' ' + str(row['text_2'])
# Apply the function to each row and show progress
in_df['unmasked_text'] = in_df.progress_apply(concatenate_texts, axis=1)
# %%
in_df['unmasked_text'].to_csv('preprocessed_text_join_unmask.tsv', sep='\t', index=False)
# %%
# Save the 'text' column to a text file
in_df['unmasked_text'].to_csv('training_data.txt', index=False, header=False)
# %%
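# Train a 5-gram KenLM model on the unmasked training text. --discount_fallback
# lets lmplz fall back to default discounts when the modified Kneser-Ney
# discounts cannot be estimated from the count-of-count statistics.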
!lmplz -o 5 --discount_fallback < training_data.txt > language_model.arpa
# %%
!build_binary language_model.arpa language_model.binary
# %%
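# A larger 6-gram variant: -S 80% limits lmplz to roughly 80% of available
# memory, and --prune 0 0 1 1 1 1 drops n-grams of order 3 and higher that
# occur only once.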
!lmplz -o 6 --discount_fallback -S 80% --prune 0 0 1 1 1 1 < training_data.txt > language_model.arpa
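# %%
# Note: the 6-gram run above overwrites language_model.arpa, but
# language_model.binary still holds the 5-gram model built earlier. If the
# 6-gram model is the one meant to be queried below, rebuild the binary:
!build_binary language_model.arpa language_model.binary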
# %%
!wc -l ./challenging-america-word-gap-prediction/dev-0/in.tsv
# %%
import pandas as pd
import csv
# Load the data
try:
    in_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/in.tsv', sep='\t', header=None, on_bad_lines='error')
except Exception as e:
    print(e)
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/expected.tsv', sep='\t', header=None, on_bad_lines='warn', quoting=csv.QUOTE_NONE)
print(in_df.shape[0])
# Drop unnecessary columns
columns_to_drop = [0, 1, 2, 3, 4, 5] # Column indices to drop
in_df.drop(columns_to_drop, axis=1, inplace=True)
# Rename remaining columns for clarity
in_df.columns = ['text_1', 'text_2']
in_df['text_1'].fillna('placeholder', inplace=True)
in_df['text_2'].fillna('placeholder', inplace=True)
# %%
import pandas as pd
import csv
# Placeholder line
placeholder_line = ['placeholder'] * 8 # Adjust the number of fields as needed
# Read the file line by line
with open('./challenging-america-word-gap-prediction/dev-0/in.tsv', 'r') as f:
    lines = f.readlines()
# Split each line into fields and replace problematic lines with the placeholder line
lines = [line.strip().split('\t') if len(line.strip().split('\t')) == 8 else placeholder_line for line in lines]
# Convert the list of lines into a DataFrame
in_df = pd.DataFrame(lines)
# Print the number of rows in the DataFrame
print(in_df.shape[0])
# Drop unnecessary columns
columns_to_drop = [0, 1, 2, 3, 4, 5] # Column indices to drop
in_df.drop(columns_to_drop, axis=1, inplace=True)
# Rename remaining columns for clarity
in_df.columns = ['text_1', 'text_2']
in_df['text_1'].fillna('placeholder', inplace=True)
in_df['text_2'].fillna('placeholder', inplace=True)
# %%
print(in_df.shape[0])
# %%
from tqdm import tqdm
# Replace the literal '\n' and '\t' escape sequences in the raw text with spaces
def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

def replace_tabulation(text):
    if isinstance(text, str):
        return text.replace('\\t', ' ')
    return text
# Apply the function to 'text_1' and 'text_2' columns and show progress
tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)
tqdm.pandas(desc="Replacing '\\t' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_tabulation)
tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)
tqdm.pandas(desc="Replacing '\\t' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_tabulation)
# %%
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm
# If not already done, download the NLTK English stopwords and the Punkt Tokenizer Models
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    words = word_tokenize(text)
    # Remove stopwords and stem the words
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    # Join the words back into a single string
    text = ' '.join(words)
    return text
# Apply the preprocessing to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)
tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)
# %%
import re
def handle_numbers_and_special_chars(text):
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove special characters
    text = re.sub(r'\W+', ' ', text)
    return text
# Apply the function to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(handle_numbers_and_special_chars)
tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(handle_numbers_and_special_chars)
# %%
in_df.to_csv('preprocessed_dev_text1_text2.tsv', sep='\t', index=False)
# %%
import kenlm
# Load the language model
model = kenlm.Model('language_model.binary')
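# %%
# Optional sanity check: kenlm's Model.score returns the log10 probability of
# a whole sentence, so this just confirms the binary model loads and can be
# queried. The example string is arbitrary.
print(model.score('united states of america', bos=True, eos=True))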
# %%
import numpy as np
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

def predict_missing_word(model, context_1, context_2):
    # Define the vocabulary
    vocabulary = set(' '.join([context_1, context_2]).split())
    # Initialize a dictionary to store the words and their scores
    word_scores = {}
    # Iterate over the vocabulary
    for word in vocabulary:
        try:
            # Generate the sentence
            sentence = f"{context_1} {word} {context_2}"
            # Score the sentence and store it in the dictionary
            word_scores[word] = model.score(sentence)
        except Exception as e:
            print(f"Error processing word '{word}': {e}")
            continue
    # If no word was found, return None for all values
    if not word_scores:
        return None, None, None
    # Convert the scores to probabilities using the softmax function
    word_probs = {word: max(0.001, prob) for word, prob in zip(word_scores.keys(), softmax(list(word_scores.values())))}
    # Find the word with the highest probability
    best_word, best_prob = max(word_probs.items(), key=lambda x: x[1])
    # Calculate the sum of probabilities for the other words
    other_probs_sum = sum(prob for word, prob in word_probs.items() if word != best_word)
    return best_word, best_prob, other_probs_sum
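# %%
# Minimal usage sketch of predict_missing_word (the two contexts below are
# made-up examples). Note that the candidate vocabulary is only the words that
# appear in the contexts themselves, so this is a rough heuristic rather than a
# search over the full model vocabulary.
example_left = 'presid lincoln deliv'
example_right = 'gettysburg last week'
print(predict_missing_word(model, example_left, example_right))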
# %%
# Initialize a counter for the correct predictions
correct_predictions = 0
# Open the output file
with open('out.tsv', 'w') as f:
    # Iterate over the rows of the input DataFrame and the expected DataFrame
    for i, ((_, input_row), expected_word) in enumerate(zip(in_df.iterrows(), expected_df[0])):
        try:
            # Get the context
            context_1 = input_row['text_1']
            context_2 = input_row['text_2']
            # Predict the missing word and get the probabilities
            predicted_word, prob, other_probs_sum = predict_missing_word(model, context_1, context_2)
            # If any of the values are None, use placeholder values
            if predicted_word is None:
                predicted_word = 'placeholder'
            if prob is None:
                prob = 0.001
            if other_probs_sum is None:
                other_probs_sum = 0.001
            # Write the output to the file
            f.write(f"{predicted_word}:{prob:.4f} :{other_probs_sum:.4f}\n")
            # Check if the prediction is correct
            if predicted_word == expected_word:
                correct_predictions += 1
            # Log progress every 1000 iterations
            if i % 1000 == 0:
                print(f"Processed {i} rows. Current accuracy: {correct_predictions / (i + 1)}")
        except Exception as e:
            print(f"Error processing row {i}: {e}")
# Calculate the accuracy
accuracy = correct_predictions / len(in_df)
print(f"The accuracy of the model is {accuracy}")
# %%
import pandas as pd
import csv
# Placeholder line
placeholder_line = ['placeholder'] * 8 # Adjust the number of fields as needed
# Read the file line by line
with open('./challenging-america-word-gap-prediction/test-A/in.tsv', 'r') as f:
    lines = f.readlines()
# Split each line into fields and replace problematic lines with the placeholder line
lines = [line.strip().split('\t') if len(line.strip().split('\t')) == 8 else placeholder_line for line in lines]
# Convert the list of lines into a DataFrame
in_df = pd.DataFrame(lines)
# Print the number of rows in the DataFrame
print(in_df.shape[0])
# Drop unnecessary columns
columns_to_drop = [0, 1, 2, 3, 4, 5] # Column indices to drop
in_df.drop(columns_to_drop, axis=1, inplace=True)
# Rename remaining columns for clarity
in_df.columns = ['text_1', 'text_2']
in_df['text_1'].fillna('placeholder', inplace=True)
in_df['text_2'].fillna('placeholder', inplace=True)
# %%
from tqdm import tqdm
# Replace the literal '\n' and '\t' escape sequences in the raw text with spaces
def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

def replace_tabulation(text):
    if isinstance(text, str):
        return text.replace('\\t', ' ')
    return text
# Apply the function to 'text_1' and 'text_2' columns and show progress
tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)
tqdm.pandas(desc="Replacing '\\t' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_tabulation)
tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)
tqdm.pandas(desc="Replacing '\\t' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_tabulation)
# %%
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm
# If not already done, download the NLTK English stopwords and the Punkt Tokenizer Models
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize the text
    words = word_tokenize(text)
    # Remove stopwords and stem the words
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    # Join the words back into a single string
    text = ' '.join(words)
    return text
# Apply the preprocessing to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)
tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)
# %%
import re
def handle_numbers_and_special_chars(text):
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove special characters
    text = re.sub(r'\W+', ' ', text)
    return text
# Apply the function to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(handle_numbers_and_special_chars)
tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(handle_numbers_and_special_chars)
# %%
in_df.to_csv('preprocessed_test_text1_text2.tsv', sep='\t', index=False)
# %%
# test-A has no expected.tsv, so no accuracy can be computed here; predictions
# are written into the test-A directory so the dev-0 predictions in out.tsv are
# not overwritten (the path is an assumption, matching the challenge layout).
with open('./challenging-america-word-gap-prediction/test-A/out.tsv', 'w') as f:
    # Iterate over the rows of the test DataFrame
    for i, (_, input_row) in enumerate(in_df.iterrows()):
        try:
            # Get the context
            context_1 = input_row['text_1']
            context_2 = input_row['text_2']
            # Predict the missing word and get the probabilities
            predicted_word, prob, other_probs_sum = predict_missing_word(model, context_1, context_2)
            # If any of the values are None, use placeholder values
            if predicted_word is None:
                predicted_word = 'placeholder'
            if prob is None:
                prob = 0.001
            if other_probs_sum is None:
                other_probs_sum = 0.001
            # Write the output to the file
            f.write(f"{predicted_word}:{prob:.4f} :{other_probs_sum:.4f}\n")
            # Log progress every 1000 iterations
            if i % 1000 == 0:
                print(f"Processed {i} rows.")
        except Exception as e:
            print(f"Error processing row {i}: {e}")