commit b3b8a20768
parent be14714af7
06_kenlm.ipynb (new file, 1383 lines): diff suppressed because it is too large
dev-0/expected.tsv (new file, 10519 lines): diff suppressed because it is too large
dev-0/hate-speech-info.tsv (new file, 10519 lines): diff suppressed because it is too large
dev-0/in.tsv (new file, 10519 lines): diff suppressed because it is too large
dev-0/in.tsv.xz (new binary file): binary file not shown
dev-0/out.tsv (new file, 10519 lines): diff suppressed because it is too large
in-header.tsv (new file, 1 line)
@@ -0,0 +1 @@
FileId	Year	LeftContext	RightContext
out-header.tsv (new file, 1 line)
@@ -0,0 +1 @@
Word
run1.py (new file, 632 lines)
@@ -0,0 +1,632 @@
# %%
import os
import lzma

folders = ["./challenging-america-word-gap-prediction/dev-0",
           "./challenging-america-word-gap-prediction/test-A",
           "./challenging-america-word-gap-prediction/train"]

for folder in folders:
    for file in os.listdir(folder):
        if file.endswith(".tsv.xz"):
            file_path = os.path.join(folder, file)
            output_path = os.path.splitext(file_path)[0]  # Remove the .xz extension
            with lzma.open(file_path, "rb") as compressed_file:
                with open(output_path, "wb") as output_file:
                    output_file.write(compressed_file.read())
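# %%
# Illustrative alternative (not part of the original script): the loop above
# reads each compressed file fully into memory; for the larger train split a
# streamed copy keeps memory usage flat. A minimal sketch using only the
# standard library:
import shutil

def decompress_xz(src_path, dst_path):
    # Stream-decompress src_path (.xz) into dst_path in fixed-size chunks.
    with lzma.open(src_path, "rb") as src, open(dst_path, "wb") as dst:
        shutil.copyfileobj(src, dst)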
# %%
import nltk
nltk.download('punkt')

# %%
import pandas as pd

# Load the data
in_df = pd.read_csv('./challenging-america-word-gap-prediction/train/in.tsv', sep='\t', header=None, on_bad_lines='warn')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None, on_bad_lines='warn')

# Print out the first few rows of each DataFrame
print("in_df:")
print(in_df.head())
print("\nexpected_df:")
print(expected_df.head())

# Print out more information about each DataFrame
print("\nin_df info:")
print(in_df.info())
print("\nexpected_df info:")
print(expected_df.info())

# %%
# Drop unnecessary columns
columns_to_drop = [0, 1, 2, 3, 4, 5]  # Column indices to drop
in_df.drop(columns_to_drop, axis=1, inplace=True)

# Rename remaining columns for clarity
in_df.columns = ['text_1', 'text_2']

# %%
in_df['text_2'].fillna('', inplace=True)
# %%
from tqdm import tqdm

# Define functions to replace literal '\n' and '\t' sequences with ' '
def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

def replace_tabulation(text):
    if isinstance(text, str):
        return text.replace('\\t', ' ')
    return text

# Apply the functions to 'text_1' and 'text_2' columns and show progress
tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_tabulation)

tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_tabulation)
# %%
print(in_df.head())

# %%
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm

# If not already done, download the NLTK English stopwords and the Punkt Tokenizer Models
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize the text
    words = word_tokenize(text)

    # Remove stopwords and stem the words
    words = [stemmer.stem(word) for word in words if word not in stop_words]

    # Join the words back into a single string
    text = ' '.join(words)

    return text

# Apply the preprocessing to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)
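# %%
# Quick illustrative check of preprocess_text (not part of the original
# pipeline). The exact output depends on the NLTK stopword list and the
# Porter stemmer, but for a sample sentence it should look roughly like this:
print(preprocess_text("The mills were running all day."))  # roughly: "mill run day"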
# %%
print(in_df.head())

# %%
# Save 'in_df' DataFrame to a .tsv file
in_df.to_csv('preprocessed_text1_text2.tsv', sep='\t', index=False)

# %%
import re

def handle_numbers_and_special_chars(text):
    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove special characters
    text = re.sub(r'\W+', ' ', text)

    return text

# Apply the function to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(handle_numbers_and_special_chars)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(handle_numbers_and_special_chars)
# %%
from spellchecker import SpellChecker

spell = SpellChecker()

def correct_spelling(text):
    # Tokenize the text
    words = text.split()

    # Correct spelling
    corrected_words = [spell.correction(word) if spell.correction(word) is not None else '' for word in words]

    # Join the words back into a single string
    text = ' '.join(corrected_words)

    return text

# Apply the spelling correction to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Spelling Correction 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(correct_spelling)

tqdm.pandas(desc="Spelling Correction 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(correct_spelling)
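# %%
# Optional speed-up (illustrative, not part of the original script):
# correct_spelling above calls spell.correction() twice per token, and that
# lookup is slow on a large corpus. Restricting correction to tokens the
# dictionary does not recognize avoids most of those calls. This assumes
# pyspellchecker's unknown() API and the already-lowercased text produced by
# the preprocessing steps above.
def correct_spelling_fast(text):
    words = text.split()
    # Only tokens flagged as unknown need a correction lookup.
    unknown_words = spell.unknown(words)
    corrected = [
        (spell.correction(word) or '') if word in unknown_words else word
        for word in words
    ]
    return ' '.join(corrected)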
# %%
# Define a function to concatenate 'text_1' and 'text_2'
def concatenate_texts(row):
    return str(row['text_1']) + ' <MASK> ' + str(row['text_2'])

# Apply the function to each row and show progress
tqdm.pandas(desc="Concatenating 'text_1' and 'text_2'")
in_df['text'] = in_df.progress_apply(concatenate_texts, axis=1)

# Now you can drop 'text_1' and 'text_2' columns if you want
in_df.drop(['text_1', 'text_2'], axis=1, inplace=True)

# %%
from tqdm import tqdm
tqdm.pandas()

# Load the preprocessed data
in_df = pd.read_csv('preprocessed_text1_text2.tsv', sep='\t')

# Load the expected words
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None, on_bad_lines='warn')
expected_df.columns = ['expected_word']

# Add the expected words to in_df
in_df = pd.concat([in_df, expected_df], axis=1)

# Define a function to concatenate 'text_1', 'expected_word' and 'text_2'
def concatenate_texts(row):
    return str(row['text_1']) + ' ' + str(row['expected_word']) + ' ' + str(row['text_2'])

# Apply the function to each row and show progress
in_df['unmasked_text'] = in_df.progress_apply(concatenate_texts, axis=1)

# %%
in_df['unmasked_text'].to_csv('preprocessed_text_join_unmask.tsv', sep='\t', index=False)

# %%
# Save the 'unmasked_text' column to a plain text file
in_df['unmasked_text'].to_csv('training_data.txt', index=False, header=False)
# %%
!lmplz -o 5 --discount_fallback < training_data.txt > language_model.arpa

# %%
!build_binary language_model.arpa language_model.binary

# %%
!lmplz -o 6 --discount_fallback -S 80% --prune 0 0 1 1 1 1 < training_data.txt > language_model.arpa
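# %%
# Context for the commands above: lmplz estimates a Kneser-Ney-smoothed
# n-gram model (order 5 or 6 here) and writes it in the text ARPA format;
# --discount_fallback keeps estimation from aborting on sparse counts, and
# build_binary converts the ARPA file into KenLM's binary format for much
# faster loading. A minimal scoring sketch (illustrative; it assumes the
# kenlm Python module used later in this script is installed):
import kenlm

lm = kenlm.Model('language_model.binary')
# score() returns a log10 probability of the whole string; higher is better.
print(lm.score('the quick brown fox'))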
# %%
!wc -l ./challenging-america-word-gap-prediction/dev-0/in.tsv

# %%
import pandas as pd
import csv

# Load the data
try:
    in_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/in.tsv', sep='\t', header=None, on_bad_lines='error')
except Exception as e:
    print(e)
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/expected.tsv', sep='\t', header=None, on_bad_lines='warn', quoting=csv.QUOTE_NONE)

print(in_df.shape[0])

# Drop unnecessary columns
columns_to_drop = [0, 1, 2, 3, 4, 5]  # Column indices to drop
in_df.drop(columns_to_drop, axis=1, inplace=True)

# Rename remaining columns for clarity
in_df.columns = ['text_1', 'text_2']

in_df['text_1'].fillna('placeholder', inplace=True)
in_df['text_2'].fillna('placeholder', inplace=True)
# %%
import pandas as pd
import csv

# Placeholder line
placeholder_line = ['placeholder'] * 8  # Adjust the number of fields as needed

# Read the file line by line
with open('./challenging-america-word-gap-prediction/dev-0/in.tsv', 'r') as f:
    lines = f.readlines()

# Split each line into fields and replace problematic lines with the placeholder line
lines = [line.strip().split('\t') if len(line.strip().split('\t')) == 8 else placeholder_line for line in lines]

# Convert the list of lines into a DataFrame
in_df = pd.DataFrame(lines)

# Print the number of rows in the DataFrame
print(in_df.shape[0])

# Drop unnecessary columns
columns_to_drop = [0, 1, 2, 3, 4, 5]  # Column indices to drop
in_df.drop(columns_to_drop, axis=1, inplace=True)

# Rename remaining columns for clarity
in_df.columns = ['text_1', 'text_2']

in_df['text_1'].fillna('placeholder', inplace=True)
in_df['text_2'].fillna('placeholder', inplace=True)

# %%
print(in_df.shape[0])
# %%
from tqdm import tqdm

# Define functions to replace literal '\n' and '\t' sequences with ' '
def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

def replace_tabulation(text):
    if isinstance(text, str):
        return text.replace('\\t', ' ')
    return text

# Apply the functions to 'text_1' and 'text_2' columns and show progress
tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_tabulation)

tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_tabulation)
# %%
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm

# If not already done, download the NLTK English stopwords and the Punkt Tokenizer Models
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize the text
    words = word_tokenize(text)

    # Remove stopwords and stem the words
    words = [stemmer.stem(word) for word in words if word not in stop_words]

    # Join the words back into a single string
    text = ' '.join(words)

    return text

# Apply the preprocessing to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)

# %%
import re

def handle_numbers_and_special_chars(text):
    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove special characters
    text = re.sub(r'\W+', ' ', text)

    return text

# Apply the function to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(handle_numbers_and_special_chars)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(handle_numbers_and_special_chars)

# %%
in_df.to_csv('preprocessed_dev_text1_text2.tsv', sep='\t', index=False)
# %%
import kenlm

# Load the language model
model = kenlm.Model('language_model.binary')

# %%
import numpy as np

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

def predict_missing_word(model, context_1, context_2):
    # Define the vocabulary
    vocabulary = set(' '.join([context_1, context_2]).split())

    # Initialize a dictionary to store the words and their scores
    word_scores = {}

    # Iterate over the vocabulary
    for word in vocabulary:
        try:
            # Generate the sentence
            sentence = f"{context_1} {word} {context_2}"

            # Score the sentence and store it in the dictionary
            word_scores[word] = model.score(sentence)

        except Exception as e:
            print(f"Error processing word '{word}': {e}")
            continue

    # If no word was found, return None for all values
    if not word_scores:
        return None, None, None

    # Convert the scores to probabilities using the softmax function
    word_probs = {word: max(0.001, prob) for word, prob in zip(word_scores.keys(), softmax(list(word_scores.values())))}

    # Find the word with the highest probability
    best_word, best_prob = max(word_probs.items(), key=lambda x: x[1])

    # Calculate the sum of probabilities for the other words
    other_probs_sum = sum(prob for word, prob in word_probs.items() if word != best_word)

    return best_word, best_prob, other_probs_sum
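# %%
# Illustrative call (not part of the original script): predict_missing_word
# scores every word drawn from the two contexts in the gap position and
# returns the best candidate, its softmax probability, and the probability
# mass assigned to the remaining candidates.
word, prob, rest = predict_missing_word(model, "he sat on the", "and read the paper")
print(word, prob, rest)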
# %%
# Initialize a counter for the correct predictions
correct_predictions = 0

# Open the output file
with open('out.tsv', 'w') as f:
    # Iterate over the rows of the input DataFrame and the expected DataFrame
    for i, ((_, input_row), expected_word) in enumerate(zip(in_df.iterrows(), expected_df[0])):
        try:
            # Get the context
            context_1 = input_row['text_1']
            context_2 = input_row['text_2']

            # Predict the missing word and get the probabilities
            predicted_word, prob, other_probs_sum = predict_missing_word(model, context_1, context_2)

            # If any of the values are None, use placeholder values
            if predicted_word is None:
                predicted_word = 'placeholder'
            if prob is None:
                prob = 0.001
            if other_probs_sum is None:
                other_probs_sum = 0.001

            # Write the output to the file
            f.write(f"{predicted_word}:{prob:.4f} :{other_probs_sum:.4f}\n")

            # Check if the prediction is correct
            if predicted_word == expected_word:
                correct_predictions += 1

            # Log progress every 1000 iterations
            if i % 1000 == 0:
                print(f"Processed {i} rows. Current accuracy: {correct_predictions / (i+1)}")

        except Exception as e:
            print(f"Error processing row {i}: {e}")

# Calculate the accuracy
accuracy = correct_predictions / len(in_df)

print(f"The accuracy of the model is {accuracy}")
# %%
import pandas as pd
import csv

# Placeholder line
placeholder_line = ['placeholder'] * 8  # Adjust the number of fields as needed

# Read the file line by line
with open('./challenging-america-word-gap-prediction/test-A/in.tsv', 'r') as f:
    lines = f.readlines()

# Split each line into fields and replace problematic lines with the placeholder line
lines = [line.strip().split('\t') if len(line.strip().split('\t')) == 8 else placeholder_line for line in lines]

# Convert the list of lines into a DataFrame
in_df = pd.DataFrame(lines)

# Print the number of rows in the DataFrame
print(in_df.shape[0])

# Drop unnecessary columns
columns_to_drop = [0, 1, 2, 3, 4, 5]  # Column indices to drop
in_df.drop(columns_to_drop, axis=1, inplace=True)

# Rename remaining columns for clarity
in_df.columns = ['text_1', 'text_2']

in_df['text_1'].fillna('placeholder', inplace=True)
in_df['text_2'].fillna('placeholder', inplace=True)
# %%
from tqdm import tqdm

# Define functions to replace literal '\n' and '\t' sequences with ' '
def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

def replace_tabulation(text):
    if isinstance(text, str):
        return text.replace('\\t', ' ')
    return text

# Apply the functions to 'text_1' and 'text_2' columns and show progress
tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_tabulation)

tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_tabulation)
# %%
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm

# If not already done, download the NLTK English stopwords and the Punkt Tokenizer Models
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize the text
    words = word_tokenize(text)

    # Remove stopwords and stem the words
    words = [stemmer.stem(word) for word in words if word not in stop_words]

    # Join the words back into a single string
    text = ' '.join(words)

    return text

# Apply the preprocessing to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)

# %%
import re

def handle_numbers_and_special_chars(text):
    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove special characters
    text = re.sub(r'\W+', ' ', text)

    return text

# Apply the function to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(handle_numbers_and_special_chars)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(handle_numbers_and_special_chars)

# %%
in_df.to_csv('preprocessed_test_text1_text2.tsv', sep='\t', index=False)
# %%
# Initialize a counter for the correct predictions
correct_predictions = 0

# Open the output file
with open('out.tsv', 'w') as f:
    # Iterate over the rows of the input DataFrame and the expected DataFrame
    for i, ((_, input_row), expected_word) in enumerate(zip(in_df.iterrows(), expected_df[0])):
        try:
            # Get the context
            context_1 = input_row['text_1']
            context_2 = input_row['text_2']

            # Predict the missing word and get the probabilities
            predicted_word, prob, other_probs_sum = predict_missing_word(model, context_1, context_2)

            # If any of the values are None, use placeholder values
            if predicted_word is None:
                predicted_word = 'placeholder'
            if prob is None:
                prob = 0.001
            if other_probs_sum is None:
                other_probs_sum = 0.001

            # Write the output to the file
            f.write(f"{predicted_word}:{prob:.4f} :{other_probs_sum:.4f}\n")

            # Check if the prediction is correct
            if predicted_word == expected_word:
                correct_predictions += 1

            # Log progress every 1000 iterations
            if i % 1000 == 0:
                print(f"Processed {i} rows. Current accuracy: {correct_predictions / (i+1)}")

        except Exception as e:
            print(f"Error processing row {i}: {e}")

# Calculate the accuracy
accuracy = correct_predictions / len(in_df)

print(f"The accuracy of the model is {accuracy}")
test-A/hate-speech-info.tsv (new file, 7414 lines): diff suppressed because it is too large
test-A/in.tsv (new file, 7414 lines): diff suppressed because it is too large
test-A/in.tsv.xz (new binary file): binary file not shown
test-A/out.tsv (new file, 7414 lines): diff suppressed because it is too large