eugene 2024-04-23 01:05:48 +00:00
parent 2a9545c47d
commit 47d69670a7
3 changed files with 21435 additions and 0 deletions

10403
dev-0/out.tsv Normal file

File diff suppressed because it is too large

629
run.py Normal file

@@ -0,0 +1,629 @@
# In[1]:
import os
import lzma

folders = ["./challenging-america-word-gap-prediction/dev-0",
           "./challenging-america-word-gap-prediction/test-A",
           "./challenging-america-word-gap-prediction/train"]

# Decompress every .tsv.xz archive in place, next to its source file
for folder in folders:
    for file in os.listdir(folder):
        if file.endswith(".tsv.xz"):
            file_path = os.path.join(folder, file)
            output_path = os.path.splitext(file_path)[0]  # Remove the .xz extension
            with lzma.open(file_path, "rb") as compressed_file:
                with open(output_path, "wb") as output_file:
                    output_file.write(compressed_file.read())
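# In[ ]:
# Optional sanity check (a minimal sketch, not part of the original pipeline):
# list the decompressed .tsv files in each split folder.
for folder in folders:
    print(folder, sorted(f for f in os.listdir(folder) if f.endswith('.tsv')))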
# In[5]:
import nltk
nltk.download('punkt')
# In[ ]:
import pandas as pd
in_df = pd.read_csv('./challenging-america-word-gap-prediction/train/in.tsv', sep='\t', header=None, on_bad_lines='warn')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None, on_bad_lines='warn')

print("in_df:")
print(in_df.head())
print("\nexpected_df:")
print(expected_df.head())

print("\nin_df info:")
in_df.info()
print("\nexpected_df info:")
expected_df.info()
# In[2]:
# Drop the metadata columns; keep only the two text fragments
columns_to_drop = [0, 1, 2, 3, 4, 5]
in_df.drop(columns_to_drop, axis=1, inplace=True)
in_df.columns = ['text_1', 'text_2']
# In[3]:
print(in_df.head())
# In[4]:
in_df['text_1'] = in_df['text_1'].fillna('')
in_df['text_2'] = in_df['text_2'].fillna('')
# In[5]:
from tqdm import tqdm
# Replace literal '\n' escape sequences in the raw text with spaces
def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text
tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)
tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)
# In[6]:
print(in_df.head())
# In[ ]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
def preprocess_text(text):
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    # Remove stopwords and stem the words
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)
tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)
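# In[ ]:
# Quick sanity check of preprocess_text (a minimal sketch; the sample
# sentence and the expected output in the comment are illustrative).
sample = "The quick brown foxes were running near the riverbank!"
print(preprocess_text(sample))
# e.g. 'quick brown fox run near riverbank' (lowercased, punctuation and
# stopwords removed, remaining words stemmed)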
# In[8]:
in_df.to_csv('preprocessed_text1_text2.tsv', sep='\t', index=False)
# In[9]:
def concatenate_texts(row):
    return str(row['text_1']) + ' <MASK> ' + str(row['text_2'])
tqdm.pandas(desc="Concatenating 'text_1' and 'text_2'")
in_df['text'] = in_df.progress_apply(concatenate_texts, axis=1)
in_df.drop(['text_1', 'text_2'], axis=1, inplace=True)
# In[31]:
from tqdm import tqdm
tqdm.pandas()
in_df = pd.read_csv('preprocessed_text1_text2.tsv', sep='\t')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None, on_bad_lines='warn')
expected_df.columns = ['expected_word']
in_df = pd.concat([in_df, expected_df], axis=1)
def concatenate_texts(row):
    return str(row['text_1']) + ' ' + str(row['expected_word']) + ' ' + str(row['text_2'])

# Apply the function to each row and show progress
in_df['unmasked_text'] = in_df.progress_apply(concatenate_texts, axis=1)
# In[35]:
in_df['unmasked_text'].to_csv('preprocessed_text_join_unmask.tsv', sep='\t', index=False)
# In[10]:
in_df.to_csv('preprocessed_text_join_mask.tsv', sep='\t', index=False)
# In[5]:
import pandas as pd
in_df = pd.read_csv('preprocessed_text_join_mask.tsv', sep='\t')
print(in_df.head())
# In[6]:
print(in_df.head())
# In[1]:
from abc import ABC, abstractmethod
from collections import Counter
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
class Model(ABC):
    def __init__(self, UNK_token='<MASK>', smoothing_parameter=0.5):
        self.UNK_token = UNK_token
        self.smoothing_parameter = smoothing_parameter

    @abstractmethod
    def train(self, corpus):
        pass

    @abstractmethod
    def predict(self, text):
        pass
class InterpolatedModel(Model):
    def train(self, corpus, smoothing_parameter=0.5, vocab=15000):
        self.smoothing_parameter = smoothing_parameter
        if corpus.empty:
            raise ValueError("The corpus is empty.")
        corpus = list(set(corpus))
        tokens = []
        for i, sentence in enumerate(corpus):
            sentence_tokens = word_tokenize(sentence)
            # Add padding to sentences that are less than three words long
            while len(sentence_tokens) < 3:
                sentence_tokens.append(self.UNK_token)
            tokens.append(sentence_tokens)
            if i % 1000 == 0:
                print(f'Tokenizing sentence {i} of {len(corpus)}')
        self.unigram_counts = Counter()
        self.bigram_counts = Counter()
        self.trigram_counts = Counter()
        for i, sentence in enumerate(tokens):
            self.unigram_counts.update(sentence)
            self.bigram_counts.update(ngrams(sentence, 2, pad_left=True, pad_right=True))
            self.trigram_counts.update(ngrams(sentence, 3, pad_left=True, pad_right=True))
            if i % 1000 == 0:
                print(f'Counting ngrams in sentence {i} of {len(tokens)}')
        # Keep only the `vocab` most frequent words as the model vocabulary
        self.unigram_counts = Counter(dict(self.unigram_counts.most_common(vocab)))
        self.vocab_size = len(self.unigram_counts)
        # Total token count, needed to turn unigram counts into probabilities
        self.total_unigrams = sum(self.unigram_counts.values())
    def probability(self, text, word):
        tokens = word_tokenize(text)
        if len(tokens) < 2:
            # Fall back to the unigram model if the context is shorter than two words
            return self.unigram_counts.get(word, 0) / self.total_unigrams
        # Use a bigram/trigram model if the context is at least two words
        previous_bigram = tuple(tokens[-2:])
        unigram_prob = self.unigram_counts.get(word, 0) / self.total_unigrams
        bigram_prob = self.bigram_counts.get((previous_bigram[1], word), 0) / self.unigram_counts.get(previous_bigram[1], 1)
        trigram_prob = self.trigram_counts.get((*previous_bigram, word), 0) / self.bigram_counts.get(previous_bigram, 1)
        # The raw interpolation weights s, 1 - 2s and 1 - s sum to 2 - 2s,
        # not 1, so normalize them to keep the result a proper probability.
        s = self.smoothing_parameter
        return (s * unigram_prob + (1 - 2 * s) * bigram_prob + (1 - s) * trigram_prob) / (2 - 2 * s)
    def perplexity(self, sentence):
        # Perplexity = 2 ** (-average log2-probability of each next word)
        tokens = word_tokenize(sentence)
        probabilities = [self.probability(' '.join(tokens[:i]), tokens[i]) for i in range(1, len(tokens))]
        log_probabilities = [np.log2(p) if p > 0 else float('-inf') for p in probabilities]
        average_log_probability = np.mean(log_probabilities)
        return np.power(2, -average_log_probability)
    def predict(self, text):
        tokens = word_tokenize(text)
        if len(tokens) < 1:
            return None
        if len(tokens) < 2:
            # Use a unigram model if the context is only one word
            probabilities = {word: count / self.total_unigrams for word, count in self.unigram_counts.items()}
        else:
            # Use a bigram/trigram model if the context is at least two words
            previous_bigram = tuple(tokens[-2:])
            s = self.smoothing_parameter
            probabilities = {}
            for word in self.unigram_counts:
                unigram_prob = self.unigram_counts.get(word, 0) / self.total_unigrams
                bigram_prob = self.bigram_counts.get((previous_bigram[1], word), 0) / self.unigram_counts.get(previous_bigram[1], 1)
                trigram_prob = self.trigram_counts.get((*previous_bigram, word), 0) / self.bigram_counts.get(previous_bigram, 1)
                # Same normalized interpolation as in probability()
                probabilities[word] = (s * unigram_prob + (1 - 2 * s) * bigram_prob + (1 - s) * trigram_prob) / (2 - 2 * s)
        return max(probabilities, key=probabilities.get)
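# In[ ]:
# Minimal usage sketch of InterpolatedModel on a toy corpus (illustrative
# only: the corpus, parameter values, and printed results are not from the
# dataset or the runs below).
toy_corpus = pd.Series([
    'the cat sat on the mat',
    'the dog sat on the rug',
    'the cat chased the dog',
])
toy_model = InterpolatedModel()
toy_model.train(toy_corpus, smoothing_parameter=0.5, vocab=100)
print(toy_model.predict('the cat'))                    # most probable next word
print(toy_model.perplexity('the cat sat on the mat'))  # lower is better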
# In[32]:
in_df = pd.read_csv('preprocessed_text_join_unmask.tsv', sep='\t')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None, on_bad_lines='warn')
df = pd.concat([in_df, expected_df], axis=1)
df.columns = ['text', 'expected_word']
# Use only the first quarter of the rows for training
quarter = len(df) // 4
df = df.iloc[:quarter]
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
model = InterpolatedModel()
model.train(train_df['text'], smoothing_parameter=0.4, vocab=60000)
# In[33]:
import pickle
with open('trained_model_quarter_sp04_vocab60k.pkl', 'wb') as file:
    pickle.dump(model, file)
# In[ ]:
import pickle
with open('trained_model_quarter_sp04_vocab60k.pkl', 'rb') as file:
    model = pickle.load(file)
# Smoothing = 0.4
# In[14]:
import pandas as pd
df = pd.read_csv('preprocessed_text_join_mask.tsv', sep='\t')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None)
expected_df.columns = ['expected_word']
# Spot-check the model on the first 100 training rows
for index, row in df.head(100).iterrows():
    sentence = row['text']
    expected_word = expected_df.loc[index, 'expected_word']
    words = sentence.split()
    if '<MASK>' in words:
        mask_index = words.index('<MASK>')
        start_index = max(0, mask_index - 2)
        # Use the two words immediately before the mask as context
        context = ' '.join(words[start_index:mask_index])
        predicted_word = model.predict(context)
        if predicted_word == expected_word:
            print(f'Index: {index} Predicted Word: {predicted_word}, Expected Word: {expected_word}')
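# In[ ]:
# Hedged follow-up sketch: the loop above only prints hits, so also compute
# an overall accuracy over the same first 100 rows (`hits`/`total` are
# illustrative helper names, not from the original script).
hits, total = 0, 0
for index, row in df.head(100).iterrows():
    words = str(row['text']).split()
    if '<MASK>' not in words:
        continue
    mask_index = words.index('<MASK>')
    context = ' '.join(words[max(0, mask_index - 2):mask_index])
    if model.predict(context) == expected_df.loc[index, 'expected_word']:
        hits += 1
    total += 1
print(f'Accuracy on first 100 rows: {hits / max(total, 1):.2%}')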
# In[21]:
import pandas as pd
import csv
# Load the data
in_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/in.tsv', sep='\t', header=None, on_bad_lines='warn')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/expected.tsv', sep='\t', header=None, on_bad_lines='warn', quoting=csv.QUOTE_NONE)
columns_to_drop = [0, 1, 2, 3, 4, 5]
in_df.drop(columns_to_drop, axis=1, inplace=True)
in_df.columns = ['text_1', 'text_2']
in_df['text_1'] = in_df['text_1'].fillna('')
in_df['text_2'] = in_df['text_2'].fillna('')
# In[22]:
from tqdm import tqdm
def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text
tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)
tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)
# In[23]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)
tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)
# In[24]:
in_df.to_csv('preprocessed_dev_text1_text2.tsv', sep='\t', index=False)
# In[25]:
def concatenate_texts(row):
    return str(row['text_1']) + ' <MASK> ' + str(row['text_2'])
tqdm.pandas(desc="Concatenating 'text_1' and 'text_2'")
in_df['text'] = in_df.progress_apply(concatenate_texts, axis=1)
in_df.drop(['text_1', 'text_2'], axis=1, inplace=True)
# In[26]:
in_df.to_csv('preprocessed_dev_text_join_mask.tsv', sep='\t', index=False)
# In[34]:
import csv
df = pd.read_csv('preprocessed_dev_text_join_mask.tsv', sep='\t')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/expected.tsv', sep='\t', header=None, quoting=csv.QUOTE_NONE)
expected_df.columns = ['expected_word']
for index, row in df.head(100).iterrows():
    sentence = row['text']
    expected_word = expected_df.loc[index, 'expected_word']
    words = sentence.split()
    if '<MASK>' in words:
        mask_index = words.index('<MASK>')
        start_index = max(0, mask_index - 2)
        context = ' '.join(words[start_index:mask_index])
        predicted_word = model.predict(context)
        if predicted_word == expected_word:
            print(f'Index: {index} Predicted Word: {predicted_word}, Expected Word: {expected_word}')
# In[36]:
import csv
df = pd.read_csv('preprocessed_dev_text_join_mask.tsv', sep='\t')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/expected.tsv', sep='\t', header=None, quoting=csv.QUOTE_NONE)
expected_df.columns = ['expected_word']
df['Word'] = None
for index, row in df.iterrows():
    sentence = row['text']
    words = sentence.split()
    if '<MASK>' in words:
        expected_word = expected_df.loc[index, 'expected_word']
        mask_index = words.index('<MASK>')
        start_index = max(0, mask_index - 2)
        context = ' '.join(words[start_index:mask_index])
        predicted_word = model.predict(context)
        df.loc[index, 'Word'] = predicted_word
        if predicted_word == expected_word:
            print(f'Index: {index} Predicted Word: {predicted_word}, Expected Word: {expected_word}')
df['Word'].to_csv('./challenging-america-word-gap-prediction/dev-0/out.tsv', sep='\t', index=False)
# In[37]:
import pandas as pd
in_df = pd.read_csv('./challenging-america-word-gap-prediction/test-A/in.tsv', sep='\t', header=None, on_bad_lines='warn')
columns_to_drop = [0, 1, 2, 3, 4, 5]
in_df.drop(columns_to_drop, axis=1, inplace=True)
in_df.columns = ['text_1', 'text_2']
in_df['text_1'] = in_df['text_1'].fillna('')
in_df['text_2'] = in_df['text_2'].fillna('')
# In[38]:
from tqdm import tqdm
def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text
tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)
tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)
# In[39]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)
tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)
# In[40]:
in_df.to_csv('preprocessed_test_text1_text2.tsv', sep='\t', index=False)
# In[41]:
def concatenate_texts(row):
    return str(row['text_1']) + ' <MASK> ' + str(row['text_2'])
tqdm.pandas(desc="Concatenating 'text_1' and 'text_2'")
in_df['text'] = in_df.progress_apply(concatenate_texts, axis=1)
in_df.drop(['text_1', 'text_2'], axis=1, inplace=True)
# In[42]:
in_df.to_csv('preprocessed_test_text_join_mask.tsv', sep='\t', index=False)
# In[43]:
# Read the test split written in the previous cell (not the dev file)
df = pd.read_csv('preprocessed_test_text_join_mask.tsv', sep='\t')
df['Word'] = None
for index, row in df.iterrows():
    sentence = row['text']
    words = sentence.split()
    if '<MASK>' in words:
        mask_index = words.index('<MASK>')
        start_index = max(0, mask_index - 2)
        context = ' '.join(words[start_index:mask_index])
        predicted_word = model.predict(context)
        df.loc[index, 'Word'] = predicted_word
df['Word'].to_csv('./challenging-america-word-gap-prediction/test-A/out.tsv', sep='\t', index=False)

10403
test-A/out.tsv Normal file

File diff suppressed because it is too large