This commit is contained in:
parent 2a9545c47d
commit 47d69670a7

10403  dev-0/out.tsv  Normal file
File diff suppressed because it is too large

629  run.py  Normal file
@@ -0,0 +1,629 @@
# In[1]:

import os
import lzma

# Decompress the .tsv.xz files in each split so pandas can read them directly.
folders = ["./challenging-america-word-gap-prediction/dev-0",
           "./challenging-america-word-gap-prediction/test-A",
           "./challenging-america-word-gap-prediction/train"]

for folder in folders:
    for file in os.listdir(folder):
        if file.endswith(".tsv.xz"):
            file_path = os.path.join(folder, file)
            output_path = os.path.splitext(file_path)[0]  # Remove the .xz extension
            with lzma.open(file_path, "rb") as compressed_file:
                with open(output_path, "wb") as output_file:
                    output_file.write(compressed_file.read())

# In[5]:

import nltk
nltk.download('punkt')

# In[ ]:

import pandas as pd

in_df = pd.read_csv('./challenging-america-word-gap-prediction/train/in.tsv', sep='\t', header=None, on_bad_lines='warn')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None, on_bad_lines='warn')

print("in_df:")
print(in_df.head())
print("\nexpected_df:")
print(expected_df.head())

print("\nin_df info:")
print(in_df.info())
print("\nexpected_df info:")
print(expected_df.info())

# In[2]:

# Keep only the two text columns: the context before and after the gap.
columns_to_drop = [0, 1, 2, 3, 4, 5]
in_df.drop(columns_to_drop, axis=1, inplace=True)

in_df.columns = ['text_1', 'text_2']


# In[3]:

print(in_df.head())


# In[4]:

in_df['text_1'].fillna('', inplace=True)
in_df['text_2'].fillna('', inplace=True)

# In[5]:

from tqdm import tqdm

def replace_newline(text):
    # The corpus stores line breaks as the literal two-character sequence '\n',
    # so replace that escape sequence with a space.
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)


# In[6]:

print(in_df.head())

# In[ ]:

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm

nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    words = word_tokenize(text)

    # Remove stopwords and stem the words
    words = [stemmer.stem(word) for word in words if word not in stop_words]

    text = ' '.join(words)

    return text

tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)

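# A quick check of the preprocessing pipeline on a toy sentence (an
# illustrative input, not a line from the corpus): lowercasing, punctuation
# removal, stopword removal, and Porter stemming should leave only the
# stemmed content words.
print(preprocess_text("The Quick, Brown Fox Jumped over the lazy dogs!"))
# -> 'quick brown fox jump lazi dog'
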
# In[8]:

in_df.to_csv('preprocessed_text1_text2.tsv', sep='\t', index=False)

# In[9]:

def concatenate_texts(row):
    return str(row['text_1']) + ' <MASK> ' + str(row['text_2'])

tqdm.pandas(desc="Concatenating 'text_1' and 'text_2'")
in_df['text'] = in_df.progress_apply(concatenate_texts, axis=1)

in_df.drop(['text_1', 'text_2'], axis=1, inplace=True)

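# Sanity check of the masking step above on a toy row (illustrative strings,
# not taken from the corpus): the gap marker should land between the two
# context columns.
demo = pd.DataFrame({'text_1': ['quick brown'], 'text_2': ['jump lazi dog']})
demo['text'] = demo.apply(concatenate_texts, axis=1)
print(demo['text'].iloc[0])  # -> 'quick brown <MASK> jump lazi dog'
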
# In[31]:

from tqdm import tqdm
tqdm.pandas()

in_df = pd.read_csv('preprocessed_text1_text2.tsv', sep='\t')

expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None, on_bad_lines='warn')
expected_df.columns = ['expected_word']

in_df = pd.concat([in_df, expected_df], axis=1)

def concatenate_texts(row):
    return str(row['text_1']) + ' ' + str(row['expected_word']) + ' ' + str(row['text_2'])

# Apply the function to each row and show progress
in_df['unmasked_text'] = in_df.progress_apply(concatenate_texts, axis=1)


# In[35]:

in_df['unmasked_text'].to_csv('preprocessed_text_join_unmask.tsv', sep='\t', index=False)


# In[10]:

in_df.to_csv('preprocessed_text_join_mask.tsv', sep='\t', index=False)


# In[5]:

import pandas as pd

in_df = pd.read_csv('preprocessed_text_join_mask.tsv', sep='\t')

print(in_df.head())


# In[6]:

print(in_df.head())

# In[1]:

from abc import ABC, abstractmethod
from collections import Counter
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

class Model(ABC):

    def __init__(self, UNK_token='<MASK>', smoothing_parameter=0.5):
        self.UNK_token = UNK_token
        self.smoothing_parameter = smoothing_parameter

    @abstractmethod
    def train(self, corpus):
        pass

    @abstractmethod
    def predict(self, text):
        pass

class InterpolatedModel(Model):

    def train(self, corpus, smoothing_parameter=0.5, vocab=15000):
        self.smoothing_parameter = smoothing_parameter
        if corpus.empty:
            raise ValueError("The corpus is empty.")

        # Deduplicate sentences before counting.
        corpus = list(set(corpus))

        tokens = []
        for i, sentence in enumerate(corpus):
            sentence_tokens = word_tokenize(sentence)
            # Add padding to sentences that are less than three words long
            while len(sentence_tokens) < 3:
                sentence_tokens.append(self.UNK_token)
            tokens.append(sentence_tokens)
            if i % 1000 == 0:
                print(f'Tokenizing sentence {i} of {len(corpus)}')

        self.unigram_counts = Counter()
        self.bigram_counts = Counter()
        self.trigram_counts = Counter()
        for i, sentence in enumerate(tokens):
            self.unigram_counts.update(sentence)
            self.bigram_counts.update(list(ngrams(sentence, 2, pad_left=True, pad_right=True)))
            self.trigram_counts.update(list(ngrams(sentence, 3, pad_left=True, pad_right=True)))
            if i % 1000 == 0:
                print(f'Counting ngrams in sentence {i} of {len(tokens)}')

        # Restrict the vocabulary to the most frequent words.
        self.unigram_counts = Counter(dict(self.unigram_counts.most_common(vocab)))

        self.vocab_size = len(self.unigram_counts)

    def probability(self, text, word):
        tokens = word_tokenize(text)
        if len(tokens) < 2:
            # Use a unigram model if the input text is only one word.
            # Note: unigram scores divide by the vocabulary size rather than
            # the total token count, so they are counts up to a constant
            # factor; rankings are unaffected.
            return self.unigram_counts[word] / self.vocab_size if word in self.unigram_counts else 0
        else:
            # Use a bigram/trigram model if the input text is at least two words
            previous_bigram = tuple(tokens[-2:])
            unigram_prob = self.unigram_counts[word] / self.vocab_size if word in self.unigram_counts else 0
            bigram_prob = self.bigram_counts.get((previous_bigram[1], word), 0) / self.unigram_counts.get(previous_bigram[1], 1)
            trigram_prob = self.trigram_counts.get((*previous_bigram, word), 0) / self.bigram_counts.get(previous_bigram, 1)
            # With weights (sp, 1 - 2*sp, 1 - sp) the mixture sums to 2 - 2*sp,
            # so it is a proper probability only when sp == 0.5; for other
            # values the score is unnormalized but still usable for ranking.
            return self.smoothing_parameter * unigram_prob + (1 - 2 * self.smoothing_parameter) * bigram_prob + (1 - self.smoothing_parameter) * trigram_prob

    def perplexity(self, sentence):
        tokens = word_tokenize(sentence)

        probabilities = [self.probability(' '.join(tokens[:i]), tokens[i]) for i in range(1, len(tokens))]

        log_probabilities = [np.log2(p) if p > 0 else float('-inf') for p in probabilities]

        average_log_probability = np.mean(log_probabilities)

        return np.power(2, -average_log_probability)

    def predict(self, text):
        tokens = word_tokenize(text)
        if len(tokens) < 1:
            return None
        elif len(tokens) < 2:
            # Use a unigram model if the input text is only one word
            probabilities = {word: self.unigram_counts[word] / self.vocab_size if word in self.unigram_counts else 0 for word in self.unigram_counts.keys()}
        else:
            # Use a bigram/trigram model if the input text is at least two words
            previous_bigram = tuple(tokens[-2:])
            probabilities = {}
            for word in self.unigram_counts.keys():
                unigram_prob = self.unigram_counts[word] / self.vocab_size if word in self.unigram_counts else 0
                bigram_prob = self.bigram_counts.get((previous_bigram[1], word), 0) / self.unigram_counts.get(previous_bigram[1], 1)
                trigram_prob = self.trigram_counts.get((*previous_bigram, word), 0) / self.bigram_counts.get(previous_bigram, 1)
                probabilities[word] = self.smoothing_parameter * unigram_prob + (1 - 2 * self.smoothing_parameter) * bigram_prob + (1 - self.smoothing_parameter) * trigram_prob
        return max(probabilities, key=probabilities.get)

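# A small smoke test of InterpolatedModel on a toy corpus, so the class can be
# exercised without loading the full training data. The sentences are
# illustrative, not from the corpus; with smoothing_parameter=0.5 the
# interpolation weights are (0.5, 0.0, 0.5), i.e. half unigram, half trigram.
toy_model = InterpolatedModel()
toy_model.train(pd.Series(['the cat sat on the mat',
                           'the dog sat on the rug',
                           'the cat ate the fish']), smoothing_parameter=0.5, vocab=100)
print(toy_model.probability('the cat', 'sat'))  # nonzero: (the, cat, sat) was seen
print(toy_model.predict('the cat'))
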
# In[32]:

in_df = pd.read_csv('preprocessed_text_join_unmask.tsv', sep='\t')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None, on_bad_lines='warn')

df = pd.concat([in_df, expected_df], axis=1)
df.columns = ['text', 'expected_word']

# Train on a quarter of the data to keep memory and run time manageable.
quarter = len(df) // 4
df = df.iloc[:quarter]

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

model = InterpolatedModel()
model.train(train_df['text'], smoothing_parameter=0.4, vocab=60000)


# In[33]:

import pickle

with open('trained_model_quarter_sp04_vocab60k.pkl', 'wb') as file:
    pickle.dump(model, file)


# In[ ]:

import pickle

with open('trained_model_quarter_sp04_vocab60k.pkl', 'rb') as file:
    model = pickle.load(file)

# Smoothing = 0.4

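# A sketch of how the smoothing setting could be compared: mean perplexity of a
# few candidate values on a small held-out sample. This is an assumption about
# the tuning procedure, not part of the original run; it expects 'test_df' from
# the train_test_split above to still be in scope, and unseen n-grams make the
# perplexity infinite, so treat the numbers as a rough relative signal only.
for sp in [0.3, 0.4, 0.5]:
    model.smoothing_parameter = sp
    sample = test_df['text'].head(50)  # small sample to keep this cheap
    mean_pp = np.mean([model.perplexity(s) for s in sample])
    print(f'smoothing={sp}: mean perplexity over {len(sample)} sentences = {mean_pp}')
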
# In[14]:

import pandas as pd

df = pd.read_csv('preprocessed_text_join_mask.tsv', sep='\t')

expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None)
expected_df.columns = ['expected_word']

for index, row in df.head(100).iterrows():
    sentence = row['text']
    expected_word = expected_df.loc[index, 'expected_word']
    words = sentence.split()
    if '<MASK>' in words:
        mask_index = words.index('<MASK>')
        # Use the two words immediately before the gap as the trigram context.
        start_index = max(0, mask_index - 2)
        context = ' '.join(words[start_index:mask_index])
        predicted_word = model.predict(context)
        if predicted_word == expected_word:
            print(f'Index: {index} Predicted Word: {predicted_word}, Expected Word: {expected_word}')

# In[21]:

import pandas as pd
import csv

# Load the data
in_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/in.tsv', sep='\t', header=None, on_bad_lines='warn')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/expected.tsv', sep='\t', header=None, on_bad_lines='warn', quoting=csv.QUOTE_NONE)

columns_to_drop = [0, 1, 2, 3, 4, 5]
in_df.drop(columns_to_drop, axis=1, inplace=True)

in_df.columns = ['text_1', 'text_2']
# Fill both columns, as in the train preprocessing, so preprocess_text never sees NaN.
in_df['text_1'].fillna('', inplace=True)
in_df['text_2'].fillna('', inplace=True)


# In[22]:

from tqdm import tqdm

def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)


# In[23]:

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm

nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    text = ' '.join(words)
    return text

tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)


# In[24]:

in_df.to_csv('preprocessed_dev_text1_text2.tsv', sep='\t', index=False)


# In[25]:

def concatenate_texts(row):
    return str(row['text_1']) + ' <MASK> ' + str(row['text_2'])

tqdm.pandas(desc="Concatenating 'text_1' and 'text_2'")
in_df['text'] = in_df.progress_apply(concatenate_texts, axis=1)

in_df.drop(['text_1', 'text_2'], axis=1, inplace=True)


# In[26]:

in_df.to_csv('preprocessed_dev_text_join_mask.tsv', sep='\t', index=False)

# In[34]:

import csv
df = pd.read_csv('preprocessed_dev_text_join_mask.tsv', sep='\t')

expected_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/expected.tsv', sep='\t', header=None, quoting=csv.QUOTE_NONE)
expected_df.columns = ['expected_word']

for index, row in df.head(100).iterrows():
    sentence = row['text']
    expected_word = expected_df.loc[index, 'expected_word']
    words = sentence.split()
    if '<MASK>' in words:
        mask_index = words.index('<MASK>')
        start_index = max(0, mask_index - 2)
        context = ' '.join(words[start_index:mask_index])
        predicted_word = model.predict(context)
        if predicted_word == expected_word:
            print(f'Index: {index} Predicted Word: {predicted_word}, Expected Word: {expected_word}')


# In[36]:

import csv
df = pd.read_csv('preprocessed_dev_text_join_mask.tsv', sep='\t')

expected_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/expected.tsv', sep='\t', header=None, quoting=csv.QUOTE_NONE)
expected_df.columns = ['expected_word']

df['Word'] = None

for index, row in df.iterrows():
    sentence = row['text']
    words = sentence.split()
    if '<MASK>' in words:
        expected_word = expected_df.loc[index, 'expected_word']
        mask_index = words.index('<MASK>')
        start_index = max(0, mask_index - 2)
        context = ' '.join(words[start_index:mask_index])
        predicted_word = model.predict(context)
        df.loc[index, 'Word'] = predicted_word
        if predicted_word == expected_word:
            print(f'Index: {index} Predicted Word: {predicted_word}, Expected Word: {expected_word}')

df['Word'].to_csv('./challenging-america-word-gap-prediction/dev-0/out.tsv', sep='\t', index=False)

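# The loop above only prints the matching rows; a one-line summary makes the
# dev-0 result easier to read. A minimal sketch, assuming 'df' and
# 'expected_df' are row-aligned as in the loop above.
n = min(len(df), len(expected_df))
dev_accuracy = (df['Word'].iloc[:n] == expected_df['expected_word'].iloc[:n]).mean()
print(f'dev-0 exact-match accuracy: {dev_accuracy:.4f}')
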
# In[37]:

import pandas as pd

in_df = pd.read_csv('./challenging-america-word-gap-prediction/test-A/in.tsv', sep='\t', header=None, on_bad_lines='warn')

columns_to_drop = [0, 1, 2, 3, 4, 5]
in_df.drop(columns_to_drop, axis=1, inplace=True)

in_df.columns = ['text_1', 'text_2']
# Fill both columns, as in the train preprocessing, so preprocess_text never sees NaN.
in_df['text_1'].fillna('', inplace=True)
in_df['text_2'].fillna('', inplace=True)


# In[38]:

from tqdm import tqdm

def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)


# In[39]:

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm

nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    text = ' '.join(words)

    return text

tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)


# In[40]:

in_df.to_csv('preprocessed_test_text1_text2.tsv', sep='\t', index=False)


# In[41]:

def concatenate_texts(row):
    return str(row['text_1']) + ' <MASK> ' + str(row['text_2'])

tqdm.pandas(desc="Concatenating 'text_1' and 'text_2'")
in_df['text'] = in_df.progress_apply(concatenate_texts, axis=1)

in_df.drop(['text_1', 'text_2'], axis=1, inplace=True)


# In[42]:

in_df.to_csv('preprocessed_test_text_join_mask.tsv', sep='\t', index=False)

# In[43]:

# Predict for the test split and write test-A/out.tsv.
df = pd.read_csv('preprocessed_test_text_join_mask.tsv', sep='\t')

df['Word'] = None

for index, row in df.iterrows():
    sentence = row['text']
    words = sentence.split()
    if '<MASK>' in words:
        mask_index = words.index('<MASK>')
        start_index = max(0, mask_index - 2)
        context = ' '.join(words[start_index:mask_index])
        predicted_word = model.predict(context)
        df.loc[index, 'Word'] = predicted_word

df['Word'].to_csv('./challenging-america-word-gap-prediction/test-A/out.tsv', sep='\t', index=False)
10403  test-A/out.tsv  Normal file
File diff suppressed because it is too large