# In[1]:

import os
import lzma

folders = ["./challenging-america-word-gap-prediction/dev-0",
           "./challenging-america-word-gap-prediction/test-A",
           "./challenging-america-word-gap-prediction/train"]

# Decompress every .tsv.xz file in place, dropping the .xz extension.
for folder in folders:
    for file in os.listdir(folder):
        if file.endswith(".tsv.xz"):
            file_path = os.path.join(folder, file)
            output_path = os.path.splitext(file_path)[0]  # Remove the .xz extension
            with lzma.open(file_path, "rb") as compressed_file:
                with open(output_path, "wb") as output_file:
                    output_file.write(compressed_file.read())


# In[5]:

import nltk
nltk.download('punkt')


# In[ ]:

import pandas as pd

in_df = pd.read_csv('./challenging-america-word-gap-prediction/train/in.tsv',
                    sep='\t', header=None, on_bad_lines='warn')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv',
                          sep='\t', header=None, on_bad_lines='warn')

print("in_df:")
print(in_df.head())
print("\nexpected_df:")
print(expected_df.head())
print("\nin_df info:")
print(in_df.info())
print("\nexpected_df info:")
print(expected_df.info())


# In[2]:

# Keep only the two text columns (left and right context of the gap).
columns_to_drop = [0, 1, 2, 3, 4, 5]
in_df.drop(columns_to_drop, axis=1, inplace=True)
in_df.columns = ['text_1', 'text_2']


# In[3]:

print(in_df.head())


# In[4]:

in_df['text_1'] = in_df['text_1'].fillna('')
in_df['text_2'] = in_df['text_2'].fillna('')


# In[5]:

from tqdm import tqdm

def replace_newline(text):
    # The raw corpus encodes line breaks as the literal two characters '\n'.
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)


# In[6]:

print(in_df.head())


# In[ ]:

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm

nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    # Remove stopwords and stem the words
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)


# In[8]:

in_df.to_csv('preprocessed_text1_text2.tsv', sep='\t', index=False)


# In[9]:

# Mark the gap with an explicit token. The original used the empty string as
# the marker, which can never appear in the output of str.split(), so the
# downstream mask lookups silently matched nothing.
MASK_TOKEN = '<mask>'

def concatenate_texts(row):
    return str(row['text_1']) + ' ' + MASK_TOKEN + ' ' + str(row['text_2'])

tqdm.pandas(desc="Concatenating 'text_1' and 'text_2'")
in_df['text'] = in_df.progress_apply(concatenate_texts, axis=1)
in_df.drop(['text_1', 'text_2'], axis=1, inplace=True)


# In[31]:

from tqdm import tqdm
tqdm.pandas()

in_df = pd.read_csv('preprocessed_text1_text2.tsv', sep='\t')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv',
                          sep='\t', header=None, on_bad_lines='warn')
expected_df.columns = ['expected_word']
in_df = pd.concat([in_df, expected_df], axis=1)

def concatenate_texts(row):
    # The unmasked variant: the expected word is placed in the gap.
    return str(row['text_1']) + ' ' + str(row['expected_word']) + ' ' + str(row['text_2'])

# Apply the function to each row and show progress
in_df['unmasked_text'] = in_df.progress_apply(concatenate_texts, axis=1)


# In[35]:

in_df['unmasked_text'].to_csv('preprocessed_text_join_unmask.tsv', sep='\t', index=False)
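# In[ ]:

# Quick sanity check of the preprocessing chain (an added example, not part of
# the original notebook; the sample sentence is made up). It shows the
# combined effect of lowercasing, punctuation stripping, stopword removal,
# and Porter stemming.
sample = "The cats were running quickly through the old houses!"
print(preprocess_text(sample))
# -> 'cat run quickli old hous'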
# In[10]:

in_df.to_csv('preprocessed_text_join_mask.tsv', sep='\t', index=False)


# In[5]:

import pandas as pd

in_df = pd.read_csv('preprocessed_text_join_mask.tsv', sep='\t')
print(in_df.head())


# In[6]:

print(in_df.head())


# In[1]:

from abc import ABC, abstractmethod
from collections import Counter

import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sklearn.model_selection import train_test_split


class Model(ABC):
    def __init__(self, UNK_token='<unk>', smoothing_parameter=0.5):
        # A visible placeholder is used for padding instead of the original
        # empty string, which does not survive str.split() round-trips.
        self.UNK_token = UNK_token
        self.smoothing_parameter = smoothing_parameter

    @abstractmethod
    def train(self, corpus):
        pass

    @abstractmethod
    def predict(self, text):
        pass


class InterpolatedModel(Model):
    def train(self, corpus, smoothing_parameter=0.5, vocab=15000):
        self.smoothing_parameter = smoothing_parameter
        if corpus.empty:
            raise ValueError("The corpus is empty.")
        corpus = list(set(corpus))  # drop duplicate sentences

        tokens = []
        for i, sentence in enumerate(corpus):
            sentence_tokens = word_tokenize(sentence)
            # Pad sentences that are less than three words long
            while len(sentence_tokens) < 3:
                sentence_tokens.append(self.UNK_token)
            tokens.append(sentence_tokens)
            if i % 1000 == 0:
                print(f'Tokenizing sentence {i} of {len(corpus)}')

        self.unigram_counts = Counter()
        self.bigram_counts = Counter()
        self.trigram_counts = Counter()
        for i, sentence in enumerate(tokens):
            self.unigram_counts.update(sentence)
            self.bigram_counts.update(ngrams(sentence, 2, pad_left=True, pad_right=True))
            self.trigram_counts.update(ngrams(sentence, 3, pad_left=True, pad_right=True))
            if i % 1000 == 0:
                print(f'Counting ngrams in sentence {i} of {len(tokens)}')

        # Keep only the `vocab` most frequent unigrams.
        self.unigram_counts = Counter(dict(self.unigram_counts.most_common(vocab)))
        self.vocab_size = len(self.unigram_counts)
        self.total_unigrams = sum(self.unigram_counts.values())

    def _unigram_prob(self, word):
        # Relative frequency. The original divided the count by the vocabulary
        # size, which is not a probability.
        return self.unigram_counts.get(word, 0) / self.total_unigrams

    def _interpolate(self, unigram_prob, bigram_prob, trigram_prob):
        # Linear interpolation with the original weighting (sp, 1 - 2*sp, 1 - sp),
        # normalised so the weights sum to 1 (the raw weights sum to 2 - 2*sp).
        # smoothing_parameter must stay <= 0.5 so the bigram weight is non-negative.
        sp = self.smoothing_parameter
        w_uni, w_bi, w_tri = sp, 1 - 2 * sp, 1 - sp
        total = w_uni + w_bi + w_tri
        return (w_uni * unigram_prob + w_bi * bigram_prob + w_tri * trigram_prob) / total

    def probability(self, text, word):
        tokens = word_tokenize(text)
        if len(tokens) < 2:
            # Fall back to the unigram model if the context is a single word
            return self._unigram_prob(word)
        # Use the bigram/trigram mixture if the context is at least two words
        previous_bigram = tuple(tokens[-2:])
        unigram_prob = self._unigram_prob(word)
        bigram_prob = (self.bigram_counts.get((previous_bigram[1], word), 0)
                       / self.unigram_counts.get(previous_bigram[1], 1))
        trigram_prob = (self.trigram_counts.get((*previous_bigram, word), 0)
                        / self.bigram_counts.get(previous_bigram, 1))
        return self._interpolate(unigram_prob, bigram_prob, trigram_prob)

    def perplexity(self, sentence):
        tokens = word_tokenize(sentence)
        probabilities = [self.probability(' '.join(tokens[:i]), tokens[i])
                         for i in range(1, len(tokens))]
        log_probabilities = [np.log2(p) if p > 0 else float('-inf') for p in probabilities]
        average_log_probability = np.mean(log_probabilities)
        return np.power(2, -average_log_probability)

    def predict(self, text):
        tokens = word_tokenize(text)
        if len(tokens) < 1:
            return None
        if len(tokens) < 2:
            # Use the unigram model if the context is a single word
            probabilities = {word: self._unigram_prob(word) for word in self.unigram_counts}
        else:
            # Score every vocabulary word against the last two context tokens
            previous_bigram = tuple(tokens[-2:])
            probabilities = {}
            for word in self.unigram_counts:
                unigram_prob = self._unigram_prob(word)
                bigram_prob = (self.bigram_counts.get((previous_bigram[1], word), 0)
                               / self.unigram_counts.get(previous_bigram[1], 1))
                trigram_prob = (self.trigram_counts.get((*previous_bigram, word), 0)
                                / self.bigram_counts.get(previous_bigram, 1))
                probabilities[word] = self._interpolate(unigram_prob, bigram_prob, trigram_prob)
        return max(probabilities, key=probabilities.get)
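# In[ ]:

# Toy smoke test for InterpolatedModel (an added example, not part of the
# original notebook; the corpus is made up). Given the context 'the cat',
# the trigram ('the', 'cat', 'sat') dominates, so 'sat' should be predicted.
toy_corpus = pd.Series([
    'the cat sat on the mat',
    'the cat sat on the chair',
    'the dog slept on the mat',
])
toy_model = InterpolatedModel()
toy_model.train(toy_corpus, smoothing_parameter=0.4, vocab=100)
print(toy_model.predict('the cat'))         # expected: 'sat'
print(toy_model.perplexity('the cat sat'))  # lower is better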
# In[32]:

in_df = pd.read_csv('preprocessed_text_join_unmask.tsv', sep='\t')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv',
                          sep='\t', header=None, on_bad_lines='warn')

df = pd.concat([in_df, expected_df], axis=1)
df.columns = ['text', 'expected_word']

# Train on a quarter of the data to keep memory and runtime manageable.
quarter = len(df) // 4
df = df.iloc[:quarter]

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

model = InterpolatedModel()
model.train(train_df['text'], smoothing_parameter=0.4, vocab=60000)


# In[33]:

import pickle

with open('trained_model_quarter_sp04_vocab60k.pkl', 'wb') as file:
    pickle.dump(model, file)


# In[ ]:

import pickle

with open('trained_model_quarter_sp04_vocab60k.pkl', 'rb') as file:
    model = pickle.load(file)

# Smoothing = 0.4


# In[14]:

import pandas as pd

df = pd.read_csv('preprocessed_text_join_mask.tsv', sep='\t')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv',
                          sep='\t', header=None)
expected_df.columns = ['expected_word']

# Spot-check the first 100 training rows, printing only the correct predictions.
for index, row in df.head(100).iterrows():
    sentence = row['text']
    expected_word = expected_df.loc[index, 'expected_word']
    words = sentence.split()
    if MASK_TOKEN in words:
        mask_index = words.index(MASK_TOKEN)
        # Use the two words immediately before the gap as the context.
        start_index = max(0, mask_index - 2)
        context = ' '.join(words[start_index:mask_index])
        predicted_word = model.predict(context)
        if predicted_word == expected_word:
            print(f'Index: {index} Predicted Word: {predicted_word}, Expected Word: {expected_word}')


# In[21]:

import csv

import pandas as pd

# Load the data
in_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/in.tsv',
                    sep='\t', header=None, on_bad_lines='warn')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/expected.tsv',
                          sep='\t', header=None, on_bad_lines='warn', quoting=csv.QUOTE_NONE)

columns_to_drop = [0, 1, 2, 3, 4, 5]
in_df.drop(columns_to_drop, axis=1, inplace=True)
in_df.columns = ['text_1', 'text_2']
in_df['text_2'] = in_df['text_2'].fillna('')


# In[22]:

from tqdm import tqdm

def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)


# In[23]:

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm

nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)
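# In[ ]:

# The evaluation loops in this notebook only print the hits. This helper is an
# addition (not in the original): it reports exact-match accuracy over the
# first `n` rows of a masked dataframe. `mask_token` defaults to the marker
# introduced in the concatenation step above.
def gap_accuracy(masked_df, gold_df, model, n=100, mask_token=MASK_TOKEN):
    hits, total = 0, 0
    for index, row in masked_df.head(n).iterrows():
        words = str(row['text']).split()
        if mask_token not in words:
            continue
        mask_index = words.index(mask_token)
        context = ' '.join(words[max(0, mask_index - 2):mask_index])
        total += 1
        if model.predict(context) == gold_df.loc[index, 'expected_word']:
            hits += 1
    return hits / total if total else 0.0

train_mask_df = pd.read_csv('preprocessed_text_join_mask.tsv', sep='\t')
train_gold_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv',
                            sep='\t', header=None)
train_gold_df.columns = ['expected_word']
print(f'train sample accuracy: {gap_accuracy(train_mask_df, train_gold_df, model):.3f}')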
# In[24]:

in_df.to_csv('preprocessed_dev_text1_text2.tsv', sep='\t', index=False)


# In[25]:

def concatenate_texts(row):
    # Same masked join as for the training data.
    return str(row['text_1']) + ' ' + MASK_TOKEN + ' ' + str(row['text_2'])

tqdm.pandas(desc="Concatenating 'text_1' and 'text_2'")
in_df['text'] = in_df.progress_apply(concatenate_texts, axis=1)
in_df.drop(['text_1', 'text_2'], axis=1, inplace=True)


# In[26]:

in_df.to_csv('preprocessed_dev_text_join_mask.tsv', sep='\t', index=False)


# In[34]:

import csv

df = pd.read_csv('preprocessed_dev_text_join_mask.tsv', sep='\t')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/expected.tsv',
                          sep='\t', header=None, quoting=csv.QUOTE_NONE)
expected_df.columns = ['expected_word']

# Spot-check the first 100 dev rows, printing only the correct predictions.
for index, row in df.head(100).iterrows():
    sentence = row['text']
    expected_word = expected_df.loc[index, 'expected_word']
    words = sentence.split()
    if MASK_TOKEN in words:
        mask_index = words.index(MASK_TOKEN)
        start_index = max(0, mask_index - 2)
        context = ' '.join(words[start_index:mask_index])
        predicted_word = model.predict(context)
        if predicted_word == expected_word:
            print(f'Index: {index} Predicted Word: {predicted_word}, Expected Word: {expected_word}')


# In[36]:

import csv

df = pd.read_csv('preprocessed_dev_text_join_mask.tsv', sep='\t')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/expected.tsv',
                          sep='\t', header=None, quoting=csv.QUOTE_NONE)
expected_df.columns = ['expected_word']

df['Word'] = None
for index, row in df.iterrows():
    sentence = row['text']
    words = sentence.split()
    if MASK_TOKEN in words:
        expected_word = expected_df.loc[index, 'expected_word']
        mask_index = words.index(MASK_TOKEN)
        start_index = max(0, mask_index - 2)
        context = ' '.join(words[start_index:mask_index])
        predicted_word = model.predict(context)
        df.loc[index, 'Word'] = predicted_word
        if predicted_word == expected_word:
            print(f'Index: {index} Predicted Word: {predicted_word}, Expected Word: {expected_word}')

# out.tsv must be header-less and line-aligned with in.tsv.
df['Word'].to_csv('./challenging-america-word-gap-prediction/dev-0/out.tsv',
                  sep='\t', index=False, header=False)


# In[37]:

import pandas as pd

in_df = pd.read_csv('./challenging-america-word-gap-prediction/test-A/in.tsv',
                    sep='\t', header=None, on_bad_lines='warn')

columns_to_drop = [0, 1, 2, 3, 4, 5]
in_df.drop(columns_to_drop, axis=1, inplace=True)
in_df.columns = ['text_1', 'text_2']
in_df['text_2'] = in_df['text_2'].fillna('')


# In[38]:

from tqdm import tqdm

def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)


# In[39]:

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm

nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)


# In[40]:

in_df.to_csv('preprocessed_test_text1_text2.tsv', sep='\t', index=False)
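# In[ ]:

# Added sanity check (not in the original): score the dev predictions written
# above against the gold labels, counting exact line-by-line matches.
# skip_blank_lines=False keeps empty prediction lines so the rows stay aligned.
import csv
import pandas as pd

pred_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/out.tsv',
                      sep='\t', header=None, quoting=csv.QUOTE_NONE,
                      skip_blank_lines=False)
gold_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/expected.tsv',
                      sep='\t', header=None, quoting=csv.QUOTE_NONE,
                      skip_blank_lines=False)
matches = (pred_df[0].astype(str) == gold_df[0].astype(str)).sum()
print(f'dev-0 exact-match accuracy: {matches / len(gold_df):.4f}')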
# In[41]:

def concatenate_texts(row):
    return str(row['text_1']) + ' ' + MASK_TOKEN + ' ' + str(row['text_2'])

tqdm.pandas(desc="Concatenating 'text_1' and 'text_2'")
in_df['text'] = in_df.progress_apply(concatenate_texts, axis=1)
in_df.drop(['text_1', 'text_2'], axis=1, inplace=True)


# In[42]:

in_df.to_csv('preprocessed_test_text_join_mask.tsv', sep='\t', index=False)


# In[43]:

# Read the *test* file here; the original read the dev file by mistake, so the
# test-A predictions were actually computed on dev-0 data.
df = pd.read_csv('preprocessed_test_text_join_mask.tsv', sep='\t')

df['Word'] = None
for index, row in df.iterrows():
    sentence = row['text']
    words = sentence.split()
    if MASK_TOKEN in words:
        mask_index = words.index(MASK_TOKEN)
        start_index = max(0, mask_index - 2)
        context = ' '.join(words[start_index:mask_index])
        predicted_word = model.predict(context)
        df.loc[index, 'Word'] = predicted_word

df['Word'].to_csv('./challenging-america-word-gap-prediction/test-A/out.tsv',
                  sep='\t', index=False, header=False)
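# In[ ]:

# Added final check (assumption: the challenge evaluator expects out.tsv to be
# header-less and line-aligned with in.tsv, so the line counts must match).
for split in ['dev-0', 'test-A']:
    base = f'./challenging-america-word-gap-prediction/{split}'
    with open(f'{base}/in.tsv', encoding='utf-8') as f:
        n_in = sum(1 for _ in f)
    with open(f'{base}/out.tsv', encoding='utf-8') as f:
        n_out = sum(1 for _ in f)
    print(f'{split}: in={n_in} out={n_out} ' + ('OK' if n_in == n_out else 'MISMATCH'))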