eugene 2024-04-23 01:05:48 +00:00
parent 2a9545c47d
commit 47d69670a7
3 changed files with 21435 additions and 0 deletions

10403
dev-0/out.tsv Normal file

File diff suppressed because it is too large

629
run.py Normal file

@@ -0,0 +1,629 @@
# In[1]:
import os
import lzma

folders = ["./challenging-america-word-gap-prediction/dev-0",
           "./challenging-america-word-gap-prediction/test-A",
           "./challenging-america-word-gap-prediction/train"]

# Decompress every .tsv.xz archive in place, next to its source file
for folder in folders:
    for file in os.listdir(folder):
        if file.endswith(".tsv.xz"):
            file_path = os.path.join(folder, file)
            output_path = os.path.splitext(file_path)[0]  # Remove the .xz extension
            with lzma.open(file_path, "rb") as compressed_file:
                with open(output_path, "wb") as output_file:
                    output_file.write(compressed_file.read())
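# In[ ]:
# Optional sanity check (a minimal sketch, not part of the original pipeline):
# list the decompressed .tsv files in each split folder.
for folder in folders:
    print(folder, sorted(f for f in os.listdir(folder) if f.endswith('.tsv')))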
# In[5]:
import nltk
nltk.download('punkt')
# In[ ]:
import pandas as pd
in_df = pd.read_csv('./challenging-america-word-gap-prediction/train/in.tsv', sep='\t', header=None, on_bad_lines='warn')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None, on_bad_lines='warn')

print("in_df:")
print(in_df.head())
print("\nexpected_df:")
print(expected_df.head())

print("\nin_df info:")
in_df.info()
print("\nexpected_df info:")
expected_df.info()
# In[2]:
# Drop the metadata columns; keep only the two text fragments
columns_to_drop = [0, 1, 2, 3, 4, 5]
in_df.drop(columns_to_drop, axis=1, inplace=True)
in_df.columns = ['text_1', 'text_2']
# In[3]:
print(in_df.head())
# In[4]:
in_df['text_1'] = in_df['text_1'].fillna('')
in_df['text_2'] = in_df['text_2'].fillna('')
# In[5]:
from tqdm import tqdm
# Replace literal '\n' escape sequences in the raw text with spaces
def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text
tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)
tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)
# In[6]:
print(in_df.head())
# In[ ]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
def preprocess_text(text):
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    # Remove stopwords and stem the words
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)
tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)
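# In[ ]:
# Quick sanity check of preprocess_text (a minimal sketch; the sample
# sentence and the expected output in the comment are illustrative).
sample = "The quick brown foxes were running near the riverbank!"
print(preprocess_text(sample))
# e.g. 'quick brown fox run near riverbank' (lowercased, punctuation and
# stopwords removed, remaining words stemmed)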
# In[8]:
in_df.to_csv('preprocessed_text1_text2.tsv', sep='\t', index=False)
# In[9]:
def concatenate_texts(row):
    return str(row['text_1']) + ' <MASK> ' + str(row['text_2'])
tqdm.pandas(desc="Concatenating 'text_1' and 'text_2'")
in_df['text'] = in_df.progress_apply(concatenate_texts, axis=1)
in_df.drop(['text_1', 'text_2'], axis=1, inplace=True)
# In[31]:
from tqdm import tqdm
tqdm.pandas()
in_df = pd.read_csv('preprocessed_text1_text2.tsv', sep='\t')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None, on_bad_lines='warn')
expected_df.columns = ['expected_word']
in_df = pd.concat([in_df, expected_df], axis=1)
def concatenate_texts(row):
    return str(row['text_1']) + ' ' + str(row['expected_word']) + ' ' + str(row['text_2'])

# Apply the function to each row and show progress
in_df['unmasked_text'] = in_df.progress_apply(concatenate_texts, axis=1)
# In[35]:
in_df['unmasked_text'].to_csv('preprocessed_text_join_unmask.tsv', sep='\t', index=False)
# In[10]:
in_df.to_csv('preprocessed_text_join_mask.tsv', sep='\t', index=False)
# In[5]:
import pandas as pd
in_df = pd.read_csv('preprocessed_text_join_mask.tsv', sep='\t')
print(in_df.head())
# In[6]:
print(in_df.head())
# In[1]:
from abc import ABC, abstractmethod
from collections import Counter
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
class Model(ABC):
    def __init__(self, UNK_token='<MASK>', smoothing_parameter=0.5):
        self.UNK_token = UNK_token
        self.smoothing_parameter = smoothing_parameter

    @abstractmethod
    def train(self, corpus):
        pass

    @abstractmethod
    def predict(self, text):
        pass
class InterpolatedModel(Model):
    def train(self, corpus, smoothing_parameter=0.5, vocab=15000):
        self.smoothing_parameter = smoothing_parameter
        if corpus.empty:
            raise ValueError("The corpus is empty.")
        corpus = list(set(corpus))
        tokens = []
        for i, sentence in enumerate(corpus):
            sentence_tokens = word_tokenize(sentence)
            # Add padding to sentences that are less than three words long
            while len(sentence_tokens) < 3:
                sentence_tokens.append(self.UNK_token)
            tokens.append(sentence_tokens)
            if i % 1000 == 0:
                print(f'Tokenizing sentence {i} of {len(corpus)}')
        self.unigram_counts = Counter()
        self.bigram_counts = Counter()
        self.trigram_counts = Counter()
        for i, sentence in enumerate(tokens):
            self.unigram_counts.update(sentence)
            self.bigram_counts.update(ngrams(sentence, 2, pad_left=True, pad_right=True))
            self.trigram_counts.update(ngrams(sentence, 3, pad_left=True, pad_right=True))
            if i % 1000 == 0:
                print(f'Counting ngrams in sentence {i} of {len(tokens)}')
        # Keep only the `vocab` most frequent words as the model vocabulary
        self.unigram_counts = Counter(dict(self.unigram_counts.most_common(vocab)))
        self.vocab_size = len(self.unigram_counts)
        # Total token count, needed to turn unigram counts into probabilities
        self.total_unigrams = sum(self.unigram_counts.values())
    def probability(self, text, word):
        tokens = word_tokenize(text)
        if len(tokens) < 2:
            # Fall back to the unigram model if the context is shorter than two words
            return self.unigram_counts.get(word, 0) / self.total_unigrams
        # Use a bigram/trigram model if the context is at least two words
        previous_bigram = tuple(tokens[-2:])
        unigram_prob = self.unigram_counts.get(word, 0) / self.total_unigrams
        bigram_prob = self.bigram_counts.get((previous_bigram[1], word), 0) / self.unigram_counts.get(previous_bigram[1], 1)
        trigram_prob = self.trigram_counts.get((*previous_bigram, word), 0) / self.bigram_counts.get(previous_bigram, 1)
        # The raw interpolation weights s, 1 - 2s and 1 - s sum to 2 - 2s,
        # not 1, so normalize them to keep the result a proper probability.
        s = self.smoothing_parameter
        return (s * unigram_prob + (1 - 2 * s) * bigram_prob + (1 - s) * trigram_prob) / (2 - 2 * s)
    def perplexity(self, sentence):
        # Perplexity = 2 ** (-average log2-probability of each next word)
        tokens = word_tokenize(sentence)
        probabilities = [self.probability(' '.join(tokens[:i]), tokens[i]) for i in range(1, len(tokens))]
        log_probabilities = [np.log2(p) if p > 0 else float('-inf') for p in probabilities]
        average_log_probability = np.mean(log_probabilities)
        return np.power(2, -average_log_probability)
    def predict(self, text):
        tokens = word_tokenize(text)
        if len(tokens) < 1:
            return None
        if len(tokens) < 2:
            # Use a unigram model if the context is only one word
            probabilities = {word: count / self.total_unigrams for word, count in self.unigram_counts.items()}
        else:
            # Use a bigram/trigram model if the context is at least two words
            previous_bigram = tuple(tokens[-2:])
            s = self.smoothing_parameter
            probabilities = {}
            for word in self.unigram_counts:
                unigram_prob = self.unigram_counts.get(word, 0) / self.total_unigrams
                bigram_prob = self.bigram_counts.get((previous_bigram[1], word), 0) / self.unigram_counts.get(previous_bigram[1], 1)
                trigram_prob = self.trigram_counts.get((*previous_bigram, word), 0) / self.bigram_counts.get(previous_bigram, 1)
                # Same normalized interpolation as in probability()
                probabilities[word] = (s * unigram_prob + (1 - 2 * s) * bigram_prob + (1 - s) * trigram_prob) / (2 - 2 * s)
        return max(probabilities, key=probabilities.get)
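# In[ ]:
# Minimal usage sketch of InterpolatedModel on a toy corpus (illustrative
# only: the corpus, parameter values, and printed results are not from the
# dataset or the runs below).
toy_corpus = pd.Series([
    'the cat sat on the mat',
    'the dog sat on the rug',
    'the cat chased the dog',
])
toy_model = InterpolatedModel()
toy_model.train(toy_corpus, smoothing_parameter=0.5, vocab=100)
print(toy_model.predict('the cat'))                    # most probable next word
print(toy_model.perplexity('the cat sat on the mat'))  # lower is better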
# In[32]:
in_df = pd.read_csv('preprocessed_text_join_unmask.tsv', sep='\t')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None, on_bad_lines='warn')
df = pd.concat([in_df, expected_df], axis=1)
df.columns = ['text', 'expected_word']
# Use only the first quarter of the rows for training
quarter = len(df) // 4
df = df.iloc[:quarter]
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
model = InterpolatedModel()
model.train(train_df['text'], smoothing_parameter=0.4, vocab=60000)
# In[33]:
import pickle
with open('trained_model_quarter_sp04_vocab60k.pkl', 'wb') as file:
    pickle.dump(model, file)
# In[ ]:
import pickle
with open('trained_model_quarter_sp04_vocab60k.pkl', 'rb') as file:
    model = pickle.load(file)
# Smoothing = 0.4
# In[14]:
import pandas as pd
df = pd.read_csv('preprocessed_text_join_mask.tsv', sep='\t')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None)
expected_df.columns = ['expected_word']
# Spot-check the model on the first 100 training rows
for index, row in df.head(100).iterrows():
    sentence = row['text']
    expected_word = expected_df.loc[index, 'expected_word']
    words = sentence.split()
    if '<MASK>' in words:
        mask_index = words.index('<MASK>')
        start_index = max(0, mask_index - 2)
        # Use the two words immediately before the mask as context
        context = ' '.join(words[start_index:mask_index])
        predicted_word = model.predict(context)
        if predicted_word == expected_word:
            print(f'Index: {index} Predicted Word: {predicted_word}, Expected Word: {expected_word}')
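# In[ ]:
# Hedged follow-up sketch: the loop above only prints hits, so also compute
# an overall accuracy over the same first 100 rows (`hits`/`total` are
# illustrative helper names, not from the original script).
hits, total = 0, 0
for index, row in df.head(100).iterrows():
    words = str(row['text']).split()
    if '<MASK>' not in words:
        continue
    mask_index = words.index('<MASK>')
    context = ' '.join(words[max(0, mask_index - 2):mask_index])
    if model.predict(context) == expected_df.loc[index, 'expected_word']:
        hits += 1
    total += 1
print(f'Accuracy on first 100 rows: {hits / max(total, 1):.2%}')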
# In[21]:
import pandas as pd
import csv
# Load the data
in_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/in.tsv', sep='\t', header=None, on_bad_lines='warn')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/expected.tsv', sep='\t', header=None, on_bad_lines='warn', quoting=csv.QUOTE_NONE)
columns_to_drop = [0, 1, 2, 3, 4, 5]
in_df.drop(columns_to_drop, axis=1, inplace=True)
in_df.columns = ['text_1', 'text_2']
in_df['text_1'] = in_df['text_1'].fillna('')
in_df['text_2'] = in_df['text_2'].fillna('')
# In[22]:
from tqdm import tqdm
def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text
tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)
tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)
# In[23]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)
tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)
# In[24]:
in_df.to_csv('preprocessed_dev_text1_text2.tsv', sep='\t', index=False)
# In[25]:
def concatenate_texts(row):
    return str(row['text_1']) + ' <MASK> ' + str(row['text_2'])
tqdm.pandas(desc="Concatenating 'text_1' and 'text_2'")
in_df['text'] = in_df.progress_apply(concatenate_texts, axis=1)
in_df.drop(['text_1', 'text_2'], axis=1, inplace=True)
# In[26]:
in_df.to_csv('preprocessed_dev_text_join_mask.tsv', sep='\t', index=False)
# In[34]:
import csv
df = pd.read_csv('preprocessed_dev_text_join_mask.tsv', sep='\t')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/expected.tsv', sep='\t', header=None, quoting=csv.QUOTE_NONE)
expected_df.columns = ['expected_word']
for index, row in df.head(100).iterrows():
    sentence = row['text']
    expected_word = expected_df.loc[index, 'expected_word']
    words = sentence.split()
    if '<MASK>' in words:
        mask_index = words.index('<MASK>')
        start_index = max(0, mask_index - 2)
        context = ' '.join(words[start_index:mask_index])
        predicted_word = model.predict(context)
        if predicted_word == expected_word:
            print(f'Index: {index} Predicted Word: {predicted_word}, Expected Word: {expected_word}')
# In[36]:
import csv
df = pd.read_csv('preprocessed_dev_text_join_mask.tsv', sep='\t')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/expected.tsv', sep='\t', header=None, quoting=csv.QUOTE_NONE)
expected_df.columns = ['expected_word']
df['Word'] = None
for index, row in df.iterrows():
    sentence = row['text']
    words = sentence.split()
    if '<MASK>' in words:
        expected_word = expected_df.loc[index, 'expected_word']
        mask_index = words.index('<MASK>')
        start_index = max(0, mask_index - 2)
        context = ' '.join(words[start_index:mask_index])
        predicted_word = model.predict(context)
        df.loc[index, 'Word'] = predicted_word
        if predicted_word == expected_word:
            print(f'Index: {index} Predicted Word: {predicted_word}, Expected Word: {expected_word}')
df['Word'].to_csv('./challenging-america-word-gap-prediction/dev-0/out.tsv', sep='\t', index=False)
# In[37]:
import pandas as pd
in_df = pd.read_csv('./challenging-america-word-gap-prediction/test-A/in.tsv', sep='\t', header=None, on_bad_lines='warn')
columns_to_drop = [0, 1, 2, 3, 4, 5]
in_df.drop(columns_to_drop, axis=1, inplace=True)
in_df.columns = ['text_1', 'text_2']
in_df['text_1'] = in_df['text_1'].fillna('')
in_df['text_2'] = in_df['text_2'].fillna('')
# In[38]:
from tqdm import tqdm
def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text
tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)
tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)
# In[39]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)
tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)
# In[40]:
in_df.to_csv('preprocessed_test_text1_text2.tsv', sep='\t', index=False)
# In[41]:
def concatenate_texts(row):
    return str(row['text_1']) + ' <MASK> ' + str(row['text_2'])
tqdm.pandas(desc="Concatenating 'text_1' and 'text_2'")
in_df['text'] = in_df.progress_apply(concatenate_texts, axis=1)
in_df.drop(['text_1', 'text_2'], axis=1, inplace=True)
# In[42]:
in_df.to_csv('preprocessed_test_text_join_mask.tsv', sep='\t', index=False)
# In[43]:
# Read the test split written in the previous cell (not the dev file)
df = pd.read_csv('preprocessed_test_text_join_mask.tsv', sep='\t')
df['Word'] = None
for index, row in df.iterrows():
    sentence = row['text']
    words = sentence.split()
    if '<MASK>' in words:
        mask_index = words.index('<MASK>')
        start_index = max(0, mask_index - 2)
        context = ' '.join(words[start_index:mask_index])
        predicted_word = model.predict(context)
        df.loc[index, 'Word'] = predicted_word
df['Word'].to_csv('./challenging-america-word-gap-prediction/test-A/out.tsv', sep='\t', index=False)

10403
test-A/out.tsv Normal file

File diff suppressed because it is too large