This commit is contained in:
parent 2a9545c47d
commit 47d69670a7

10403  dev-0/out.tsv  Normal file
File diff suppressed because it is too large

629  run.py  Normal file
@@ -0,0 +1,629 @@
# In[1]:

import os
import lzma

# Decompress the .tsv.xz files in each split so pandas can read them directly.
folders = ["./challenging-america-word-gap-prediction/dev-0",
           "./challenging-america-word-gap-prediction/test-A",
           "./challenging-america-word-gap-prediction/train"]

for folder in folders:
    for file in os.listdir(folder):
        if file.endswith(".tsv.xz"):
            file_path = os.path.join(folder, file)
            output_path = os.path.splitext(file_path)[0]  # Remove the .xz extension
            with lzma.open(file_path, "rb") as compressed_file:
                with open(output_path, "wb") as output_file:
                    output_file.write(compressed_file.read())

# In[5]:

import nltk
nltk.download('punkt')

# In[ ]:

import pandas as pd

in_df = pd.read_csv('./challenging-america-word-gap-prediction/train/in.tsv', sep='\t', header=None, on_bad_lines='warn')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None, on_bad_lines='warn')

print("in_df:")
print(in_df.head())
print("\nexpected_df:")
print(expected_df.head())

print("\nin_df info:")
print(in_df.info())
print("\nexpected_df info:")
print(expected_df.info())

# In[2]:

# Keep only the two text columns: the context before and after the gap.
columns_to_drop = [0, 1, 2, 3, 4, 5]
in_df.drop(columns_to_drop, axis=1, inplace=True)

in_df.columns = ['text_1', 'text_2']


# In[3]:

print(in_df.head())


# In[4]:

in_df['text_1'].fillna('', inplace=True)
in_df['text_2'].fillna('', inplace=True)

# In[5]:

from tqdm import tqdm

def replace_newline(text):
    # The corpus stores line breaks as the literal two-character sequence '\n',
    # so replace that escape sequence with a space.
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)


# In[6]:

print(in_df.head())

# In[ ]:

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm

nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    words = word_tokenize(text)

    # Remove stopwords and stem the words
    words = [stemmer.stem(word) for word in words if word not in stop_words]

    text = ' '.join(words)

    return text

tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)

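# A quick check of the preprocessing pipeline on a toy sentence (an
# illustrative input, not a line from the corpus): lowercasing, punctuation
# removal, stopword removal, and Porter stemming should leave only the
# stemmed content words.
print(preprocess_text("The Quick, Brown Fox Jumped over the lazy dogs!"))
# -> 'quick brown fox jump lazi dog'
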
# In[8]:

in_df.to_csv('preprocessed_text1_text2.tsv', sep='\t', index=False)

# In[9]:

def concatenate_texts(row):
    return str(row['text_1']) + ' <MASK> ' + str(row['text_2'])

tqdm.pandas(desc="Concatenating 'text_1' and 'text_2'")
in_df['text'] = in_df.progress_apply(concatenate_texts, axis=1)

in_df.drop(['text_1', 'text_2'], axis=1, inplace=True)

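# Sanity check of the masking step above on a toy row (illustrative strings,
# not taken from the corpus): the gap marker should land between the two
# context columns.
demo = pd.DataFrame({'text_1': ['quick brown'], 'text_2': ['jump lazi dog']})
demo['text'] = demo.apply(concatenate_texts, axis=1)
print(demo['text'].iloc[0])  # -> 'quick brown <MASK> jump lazi dog'
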
# In[31]:

from tqdm import tqdm
tqdm.pandas()

in_df = pd.read_csv('preprocessed_text1_text2.tsv', sep='\t')

expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None, on_bad_lines='warn')
expected_df.columns = ['expected_word']

in_df = pd.concat([in_df, expected_df], axis=1)

def concatenate_texts(row):
    return str(row['text_1']) + ' ' + str(row['expected_word']) + ' ' + str(row['text_2'])

# Apply the function to each row and show progress
in_df['unmasked_text'] = in_df.progress_apply(concatenate_texts, axis=1)


# In[35]:

in_df['unmasked_text'].to_csv('preprocessed_text_join_unmask.tsv', sep='\t', index=False)


# In[10]:

in_df.to_csv('preprocessed_text_join_mask.tsv', sep='\t', index=False)


# In[5]:

import pandas as pd

in_df = pd.read_csv('preprocessed_text_join_mask.tsv', sep='\t')

print(in_df.head())


# In[6]:

print(in_df.head())

# In[1]:

from abc import ABC, abstractmethod
from collections import Counter
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

class Model(ABC):

    def __init__(self, UNK_token='<MASK>', smoothing_parameter=0.5):
        self.UNK_token = UNK_token
        self.smoothing_parameter = smoothing_parameter

    @abstractmethod
    def train(self, corpus):
        pass

    @abstractmethod
    def predict(self, text):
        pass

class InterpolatedModel(Model):

    def train(self, corpus, smoothing_parameter=0.5, vocab=15000):
        self.smoothing_parameter = smoothing_parameter
        if corpus.empty:
            raise ValueError("The corpus is empty.")

        # Deduplicate sentences before counting.
        corpus = list(set(corpus))

        tokens = []
        for i, sentence in enumerate(corpus):
            sentence_tokens = word_tokenize(sentence)
            # Add padding to sentences that are less than three words long
            while len(sentence_tokens) < 3:
                sentence_tokens.append(self.UNK_token)
            tokens.append(sentence_tokens)
            if i % 1000 == 0:
                print(f'Tokenizing sentence {i} of {len(corpus)}')

        self.unigram_counts = Counter()
        self.bigram_counts = Counter()
        self.trigram_counts = Counter()
        for i, sentence in enumerate(tokens):
            self.unigram_counts.update(sentence)
            self.bigram_counts.update(list(ngrams(sentence, 2, pad_left=True, pad_right=True)))
            self.trigram_counts.update(list(ngrams(sentence, 3, pad_left=True, pad_right=True)))
            if i % 1000 == 0:
                print(f'Counting ngrams in sentence {i} of {len(tokens)}')

        # Restrict the vocabulary to the most frequent words.
        self.unigram_counts = Counter(dict(self.unigram_counts.most_common(vocab)))

        self.vocab_size = len(self.unigram_counts)

    def probability(self, text, word):
        tokens = word_tokenize(text)
        if len(tokens) < 2:
            # Use a unigram model if the input text is only one word.
            # Note: unigram scores divide by the vocabulary size rather than
            # the total token count, so they are counts up to a constant
            # factor; rankings are unaffected.
            return self.unigram_counts[word] / self.vocab_size if word in self.unigram_counts else 0
        else:
            # Use a bigram/trigram model if the input text is at least two words
            previous_bigram = tuple(tokens[-2:])
            unigram_prob = self.unigram_counts[word] / self.vocab_size if word in self.unigram_counts else 0
            bigram_prob = self.bigram_counts.get((previous_bigram[1], word), 0) / self.unigram_counts.get(previous_bigram[1], 1)
            trigram_prob = self.trigram_counts.get((*previous_bigram, word), 0) / self.bigram_counts.get(previous_bigram, 1)
            # With weights (sp, 1 - 2*sp, 1 - sp) the mixture sums to 2 - 2*sp,
            # so it is a proper probability only when sp == 0.5; for other
            # values the score is unnormalized but still usable for ranking.
            return self.smoothing_parameter * unigram_prob + (1 - 2 * self.smoothing_parameter) * bigram_prob + (1 - self.smoothing_parameter) * trigram_prob

    def perplexity(self, sentence):
        tokens = word_tokenize(sentence)

        probabilities = [self.probability(' '.join(tokens[:i]), tokens[i]) for i in range(1, len(tokens))]

        log_probabilities = [np.log2(p) if p > 0 else float('-inf') for p in probabilities]

        average_log_probability = np.mean(log_probabilities)

        return np.power(2, -average_log_probability)

    def predict(self, text):
        tokens = word_tokenize(text)
        if len(tokens) < 1:
            return None
        elif len(tokens) < 2:
            # Use a unigram model if the input text is only one word
            probabilities = {word: self.unigram_counts[word] / self.vocab_size if word in self.unigram_counts else 0 for word in self.unigram_counts.keys()}
        else:
            # Use a bigram/trigram model if the input text is at least two words
            previous_bigram = tuple(tokens[-2:])
            probabilities = {}
            for word in self.unigram_counts.keys():
                unigram_prob = self.unigram_counts[word] / self.vocab_size if word in self.unigram_counts else 0
                bigram_prob = self.bigram_counts.get((previous_bigram[1], word), 0) / self.unigram_counts.get(previous_bigram[1], 1)
                trigram_prob = self.trigram_counts.get((*previous_bigram, word), 0) / self.bigram_counts.get(previous_bigram, 1)
                probabilities[word] = self.smoothing_parameter * unigram_prob + (1 - 2 * self.smoothing_parameter) * bigram_prob + (1 - self.smoothing_parameter) * trigram_prob
        return max(probabilities, key=probabilities.get)

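# A small smoke test of InterpolatedModel on a toy corpus, so the class can be
# exercised without loading the full training data. The sentences are
# illustrative, not from the corpus; with smoothing_parameter=0.5 the
# interpolation weights are (0.5, 0.0, 0.5), i.e. half unigram, half trigram.
toy_model = InterpolatedModel()
toy_model.train(pd.Series(['the cat sat on the mat',
                           'the dog sat on the rug',
                           'the cat ate the fish']), smoothing_parameter=0.5, vocab=100)
print(toy_model.probability('the cat', 'sat'))  # nonzero: (the, cat, sat) was seen
print(toy_model.predict('the cat'))
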
# In[32]:

in_df = pd.read_csv('preprocessed_text_join_unmask.tsv', sep='\t')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None, on_bad_lines='warn')

df = pd.concat([in_df, expected_df], axis=1)
df.columns = ['text', 'expected_word']

# Train on a quarter of the data to keep memory and run time manageable.
quarter = len(df) // 4
df = df.iloc[:quarter]

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

model = InterpolatedModel()
model.train(train_df['text'], smoothing_parameter=0.4, vocab=60000)


# In[33]:

import pickle

with open('trained_model_quarter_sp04_vocab60k.pkl', 'wb') as file:
    pickle.dump(model, file)


# In[ ]:

import pickle

with open('trained_model_quarter_sp04_vocab60k.pkl', 'rb') as file:
    model = pickle.load(file)

# Smoothing = 0.4

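# A sketch of how the smoothing setting could be compared: mean perplexity of a
# few candidate values on a small held-out sample. This is an assumption about
# the tuning procedure, not part of the original run; it expects 'test_df' from
# the train_test_split above to still be in scope, and unseen n-grams make the
# perplexity infinite, so treat the numbers as a rough relative signal only.
for sp in [0.3, 0.4, 0.5]:
    model.smoothing_parameter = sp
    sample = test_df['text'].head(50)  # small sample to keep this cheap
    mean_pp = np.mean([model.perplexity(s) for s in sample])
    print(f'smoothing={sp}: mean perplexity over {len(sample)} sentences = {mean_pp}')
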
# In[14]:

import pandas as pd

df = pd.read_csv('preprocessed_text_join_mask.tsv', sep='\t')

expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None)
expected_df.columns = ['expected_word']

for index, row in df.head(100).iterrows():
    sentence = row['text']
    expected_word = expected_df.loc[index, 'expected_word']
    words = sentence.split()
    if '<MASK>' in words:
        mask_index = words.index('<MASK>')
        # Use the two words immediately before the gap as the trigram context.
        start_index = max(0, mask_index - 2)
        context = ' '.join(words[start_index:mask_index])
        predicted_word = model.predict(context)
        if predicted_word == expected_word:
            print(f'Index: {index} Predicted Word: {predicted_word}, Expected Word: {expected_word}')

# In[21]:

import pandas as pd
import csv

# Load the data
in_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/in.tsv', sep='\t', header=None, on_bad_lines='warn')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/expected.tsv', sep='\t', header=None, on_bad_lines='warn', quoting=csv.QUOTE_NONE)

columns_to_drop = [0, 1, 2, 3, 4, 5]
in_df.drop(columns_to_drop, axis=1, inplace=True)

in_df.columns = ['text_1', 'text_2']
# Fill both columns, as in the train preprocessing, so preprocess_text never sees NaN.
in_df['text_1'].fillna('', inplace=True)
in_df['text_2'].fillna('', inplace=True)


# In[22]:

from tqdm import tqdm

def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)


# In[23]:

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm

nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    text = ' '.join(words)
    return text

tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)


# In[24]:

in_df.to_csv('preprocessed_dev_text1_text2.tsv', sep='\t', index=False)


# In[25]:

def concatenate_texts(row):
    return str(row['text_1']) + ' <MASK> ' + str(row['text_2'])

tqdm.pandas(desc="Concatenating 'text_1' and 'text_2'")
in_df['text'] = in_df.progress_apply(concatenate_texts, axis=1)

in_df.drop(['text_1', 'text_2'], axis=1, inplace=True)


# In[26]:

in_df.to_csv('preprocessed_dev_text_join_mask.tsv', sep='\t', index=False)

# In[34]:

import csv
df = pd.read_csv('preprocessed_dev_text_join_mask.tsv', sep='\t')

expected_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/expected.tsv', sep='\t', header=None, quoting=csv.QUOTE_NONE)
expected_df.columns = ['expected_word']

for index, row in df.head(100).iterrows():
    sentence = row['text']
    expected_word = expected_df.loc[index, 'expected_word']
    words = sentence.split()
    if '<MASK>' in words:
        mask_index = words.index('<MASK>')
        start_index = max(0, mask_index - 2)
        context = ' '.join(words[start_index:mask_index])
        predicted_word = model.predict(context)
        if predicted_word == expected_word:
            print(f'Index: {index} Predicted Word: {predicted_word}, Expected Word: {expected_word}')


# In[36]:

import csv
df = pd.read_csv('preprocessed_dev_text_join_mask.tsv', sep='\t')

expected_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/expected.tsv', sep='\t', header=None, quoting=csv.QUOTE_NONE)
expected_df.columns = ['expected_word']

df['Word'] = None

for index, row in df.iterrows():
    sentence = row['text']
    words = sentence.split()
    if '<MASK>' in words:
        expected_word = expected_df.loc[index, 'expected_word']
        mask_index = words.index('<MASK>')
        start_index = max(0, mask_index - 2)
        context = ' '.join(words[start_index:mask_index])
        predicted_word = model.predict(context)
        df.loc[index, 'Word'] = predicted_word
        if predicted_word == expected_word:
            print(f'Index: {index} Predicted Word: {predicted_word}, Expected Word: {expected_word}')

df['Word'].to_csv('./challenging-america-word-gap-prediction/dev-0/out.tsv', sep='\t', index=False)

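# The loop above only prints the matching rows; a one-line summary makes the
# dev-0 result easier to read. A minimal sketch, assuming 'df' and
# 'expected_df' are row-aligned as in the loop above.
n = min(len(df), len(expected_df))
dev_accuracy = (df['Word'].iloc[:n] == expected_df['expected_word'].iloc[:n]).mean()
print(f'dev-0 exact-match accuracy: {dev_accuracy:.4f}')
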
# In[37]:

import pandas as pd

in_df = pd.read_csv('./challenging-america-word-gap-prediction/test-A/in.tsv', sep='\t', header=None, on_bad_lines='warn')

columns_to_drop = [0, 1, 2, 3, 4, 5]
in_df.drop(columns_to_drop, axis=1, inplace=True)

in_df.columns = ['text_1', 'text_2']
# Fill both columns, as in the train preprocessing, so preprocess_text never sees NaN.
in_df['text_1'].fillna('', inplace=True)
in_df['text_2'].fillna('', inplace=True)


# In[38]:

from tqdm import tqdm

def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)


# In[39]:

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm

nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    text = ' '.join(words)

    return text

tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)


# In[40]:

in_df.to_csv('preprocessed_test_text1_text2.tsv', sep='\t', index=False)


# In[41]:

def concatenate_texts(row):
    return str(row['text_1']) + ' <MASK> ' + str(row['text_2'])

tqdm.pandas(desc="Concatenating 'text_1' and 'text_2'")
in_df['text'] = in_df.progress_apply(concatenate_texts, axis=1)

in_df.drop(['text_1', 'text_2'], axis=1, inplace=True)


# In[42]:

in_df.to_csv('preprocessed_test_text_join_mask.tsv', sep='\t', index=False)

# In[43]:

# Predict for the test split and write test-A/out.tsv.
df = pd.read_csv('preprocessed_test_text_join_mask.tsv', sep='\t')

df['Word'] = None

for index, row in df.iterrows():
    sentence = row['text']
    words = sentence.split()
    if '<MASK>' in words:
        mask_index = words.index('<MASK>')
        start_index = max(0, mask_index - 2)
        context = ' '.join(words[start_index:mask_index])
        predicted_word = model.predict(context)
        df.loc[index, 'Word'] = predicted_word

df['Word'].to_csv('./challenging-america-word-gap-prediction/test-A/out.tsv', sep='\t', index=False)
10403  test-A/out.tsv  Normal file
File diff suppressed because it is too large