commit b3b8a20768
parent be14714af7
1383  06_kenlm.ipynb  Normal file
File diff suppressed because it is too large
10519  dev-0/expected.tsv  Normal file
File diff suppressed because it is too large
10519  dev-0/hate-speech-info.tsv  Normal file
File diff suppressed because it is too large
10519  dev-0/in.tsv  Normal file
File diff suppressed because it is too large
BIN  dev-0/in.tsv.xz  Normal file
Binary file not shown.
10519  dev-0/out.tsv  Normal file
File diff suppressed because it is too large
1  in-header.tsv  Normal file
@@ -0,0 +1 @@
FileId Year LeftContext RightContext
1  out-header.tsv  Normal file
@@ -0,0 +1 @@
Word
632  run1.py  Normal file
@@ -0,0 +1,632 @@
# %%
import os
import lzma

folders = ["./challenging-america-word-gap-prediction/dev-0",
           "./challenging-america-word-gap-prediction/test-A",
           "./challenging-america-word-gap-prediction/train"]

for folder in folders:
    for file in os.listdir(folder):
        if file.endswith(".tsv.xz"):
            file_path = os.path.join(folder, file)
            output_path = os.path.splitext(file_path)[0]  # Remove the .xz extension
            with lzma.open(file_path, "rb") as compressed_file:
                with open(output_path, "wb") as output_file:
                    output_file.write(compressed_file.read())
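
# Note: compressed_file.read() loads each archive fully into memory; for very
# large .xz files a streaming copy keeps memory flat. A minimal alternative
# sketch (same paths as above):
#
#     import shutil
#     with lzma.open(file_path, "rb") as src, open(output_path, "wb") as dst:
#         shutil.copyfileobj(src, dst)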

# %%
import nltk
nltk.download('punkt')

# %%
import pandas as pd

# Load the data
in_df = pd.read_csv('./challenging-america-word-gap-prediction/train/in.tsv', sep='\t', header=None, on_bad_lines='warn')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None, on_bad_lines='warn')

# Print out the first few rows of each DataFrame
print("in_df:")
print(in_df.head())
print("\nexpected_df:")
print(expected_df.head())

# Print out more information about each DataFrame
print("\nin_df info:")
print(in_df.info())
print("\nexpected_df info:")
print(expected_df.info())

# %%
# Drop unnecessary columns
columns_to_drop = [0, 1, 2, 3, 4, 5]  # Column indices to drop
in_df.drop(columns_to_drop, axis=1, inplace=True)

# Rename remaining columns for clarity
in_df.columns = ['text_1', 'text_2']

# %%
in_df['text_2'].fillna('', inplace=True)

# %%
from tqdm import tqdm

# Define functions to replace the literal escape sequences '\n' and '\t' with spaces
def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

def replace_tabulation(text):
    if isinstance(text, str):
        return text.replace('\\t', ' ')
    return text

# Apply the functions to the 'text_1' and 'text_2' columns and show progress
tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_tabulation)

tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_tabulation)

# %%
print(in_df.head())

# %%
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm

# If not already done, download the NLTK English stopwords and the Punkt tokenizer models
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize the text
    words = word_tokenize(text)

    # Remove stopwords and stem the words
    words = [stemmer.stem(word) for word in words if word not in stop_words]

    # Join the words back into a single string
    text = ' '.join(words)

    return text

# Apply the preprocessing to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)

# %%
print(in_df.head())

# %%
# Save 'in_df' DataFrame to a .tsv file
in_df.to_csv('preprocessed_text1_text2.tsv', sep='\t', index=False)

# %%
import re

def handle_numbers_and_special_chars(text):
    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove special characters
    text = re.sub(r'\W+', ' ', text)

    return text

# Apply the function to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(handle_numbers_and_special_chars)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(handle_numbers_and_special_chars)

# %%
from spellchecker import SpellChecker

spell = SpellChecker()

def correct_spelling(text):
    # Tokenize the text
    words = text.split()

    # Correct spelling; spell.correction() returns None for words it cannot fix,
    # so fall back to an empty string (and call it only once per word)
    corrected_words = [spell.correction(word) or '' for word in words]

    # Join the words back into a single string
    return ' '.join(corrected_words)

# Apply the spelling correction to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Spelling Correction 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(correct_spelling)

tqdm.pandas(desc="Spelling Correction 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(correct_spelling)
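
# Note: spell.correction() is expensive on corpora of this size; memoizing
# repeated words (a hypothetical optimization, not part of the original run)
# can cut the cost substantially, e.g.:
#
#     from functools import lru_cache
#     cached_correction = lru_cache(maxsize=None)(spell.correction)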

# %%
# Define a function to concatenate 'text_1' and 'text_2' around a '<MASK>' token
def concatenate_texts(row):
    return str(row['text_1']) + ' <MASK> ' + str(row['text_2'])

# Apply the function to each row and show progress
tqdm.pandas(desc="Concatenating 'text_1' and 'text_2'")
in_df['text'] = in_df.progress_apply(concatenate_texts, axis=1)

# 'text_1' and 'text_2' columns can now be dropped
in_df.drop(['text_1', 'text_2'], axis=1, inplace=True)

# %%
from tqdm import tqdm
tqdm.pandas()

# Load the preprocessed data
in_df = pd.read_csv('preprocessed_text1_text2.tsv', sep='\t')

# Load the expected words
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None, on_bad_lines='warn')
expected_df.columns = ['expected_word']

# Add the expected words to in_df
in_df = pd.concat([in_df, expected_df], axis=1)

# Define a function to concatenate 'text_1', 'expected_word' and 'text_2'
def concatenate_texts(row):
    return str(row['text_1']) + ' ' + str(row['expected_word']) + ' ' + str(row['text_2'])

# Apply the function to each row and show progress
in_df['unmasked_text'] = in_df.progress_apply(concatenate_texts, axis=1)

# %%
in_df['unmasked_text'].to_csv('preprocessed_text_join_unmask.tsv', sep='\t', index=False)

# %%
# Save the 'unmasked_text' column to a plain text file for language model training
in_df['unmasked_text'].to_csv('training_data.txt', index=False, header=False)
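# lmplz consumes plain text: one training example per line, tokens separated by
# whitespace, which is why the unmasked rows are written without header or index.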

# %%
!lmplz -o 5 --discount_fallback < training_data.txt > language_model.arpa
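# `lmplz` is KenLM's ARPA-format trainer: `-o 5` sets the n-gram order to 5, and
# `--discount_fallback` substitutes fixed discounts when the Kneser-Ney discount
# estimation fails (common on small or highly repetitive corpora).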

# %%
!build_binary language_model.arpa language_model.binary
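# `build_binary` converts the plain-text ARPA file into KenLM's binary format,
# which loads much faster and can be queried without re-parsing the model.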

# %%
!lmplz -o 6 --discount_fallback -S 80% --prune 0 0 1 1 1 1 < training_data.txt > language_model.arpa
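# A larger 6-gram model: `-S 80%` caps lmplz's memory use at 80% of RAM, and
# `--prune 0 0 1 1 1 1` drops n-grams of order 3 and above that occur only once
# (one threshold per order; unigrams and bigrams are kept in full).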

# %%
!wc -l ./challenging-america-word-gap-prediction/dev-0/in.tsv

# %%
import pandas as pd
import csv

# Load the data
try:
    in_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/in.tsv', sep='\t', header=None, on_bad_lines='error')
except Exception as e:
    print(e)

expected_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/expected.tsv', sep='\t', header=None, on_bad_lines='warn', quoting=csv.QUOTE_NONE)

print(in_df.shape[0])

# Drop unnecessary columns
columns_to_drop = [0, 1, 2, 3, 4, 5]  # Column indices to drop
in_df.drop(columns_to_drop, axis=1, inplace=True)

# Rename remaining columns for clarity
in_df.columns = ['text_1', 'text_2']

in_df['text_1'].fillna('placeholder', inplace=True)
in_df['text_2'].fillna('placeholder', inplace=True)

# %%
import pandas as pd
import csv

# Placeholder line for malformed rows
placeholder_line = ['placeholder'] * 8  # Adjust the number of fields as needed

# Read the file line by line
with open('./challenging-america-word-gap-prediction/dev-0/in.tsv', 'r') as f:
    lines = f.readlines()

# Split each line into fields and replace problematic lines with the placeholder line
lines = [line.strip().split('\t') if len(line.strip().split('\t')) == 8 else placeholder_line for line in lines]

# Convert the list of lines into a DataFrame
in_df = pd.DataFrame(lines)

# Print the number of rows in the DataFrame
print(in_df.shape[0])

# Drop unnecessary columns
columns_to_drop = [0, 1, 2, 3, 4, 5]  # Column indices to drop
in_df.drop(columns_to_drop, axis=1, inplace=True)

# Rename remaining columns for clarity
in_df.columns = ['text_1', 'text_2']

in_df['text_1'].fillna('placeholder', inplace=True)
in_df['text_2'].fillna('placeholder', inplace=True)

# %%
print(in_df.shape[0])

# %%
from tqdm import tqdm

# Define functions to replace the literal escape sequences '\n' and '\t' with spaces
def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

def replace_tabulation(text):
    if isinstance(text, str):
        return text.replace('\\t', ' ')
    return text

# Apply the functions to the 'text_1' and 'text_2' columns and show progress
tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_tabulation)

tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_tabulation)

# %%
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm

# If not already done, download the NLTK English stopwords and the Punkt tokenizer models
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize the text
    words = word_tokenize(text)

    # Remove stopwords and stem the words
    words = [stemmer.stem(word) for word in words if word not in stop_words]

    # Join the words back into a single string
    text = ' '.join(words)

    return text

# Apply the preprocessing to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)

# %%
import re

def handle_numbers_and_special_chars(text):
    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove special characters
    text = re.sub(r'\W+', ' ', text)

    return text

# Apply the function to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(handle_numbers_and_special_chars)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(handle_numbers_and_special_chars)

# %%
in_df.to_csv('preprocessed_dev_text1_text2.tsv', sep='\t', index=False)

# %%
import kenlm

# Load the language model
model = kenlm.Model('language_model.binary')
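
# model.score(text) returns the total log10 probability of the text, by default
# wrapped in <s> ... </s>. A quick sanity check:
#
#     print(model.score('the quick brown fox'))  # a negative log10 probability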

# %%
import numpy as np

def softmax(x):
    # Shift by the max for numerical stability before exponentiating
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

def predict_missing_word(model, context_1, context_2):
    # Define the candidate vocabulary: every word that appears in either context
    vocabulary = set(' '.join([context_1, context_2]).split())

    # Initialize a dictionary to store the words and their scores
    word_scores = {}

    # Iterate over the vocabulary
    for word in vocabulary:
        try:
            # Generate the candidate sentence
            sentence = f"{context_1} {word} {context_2}"

            # Score the sentence and store it in the dictionary
            word_scores[word] = model.score(sentence)

        except Exception as e:
            print(f"Error processing word '{word}': {e}")
            continue

    # If no word could be scored, return None for all values
    if not word_scores:
        return None, None, None

    # Convert the scores to probabilities using the softmax function,
    # flooring each probability at 0.001
    word_probs = {word: max(0.001, prob) for word, prob in zip(word_scores.keys(), softmax(list(word_scores.values())))}

    # Find the word with the highest probability
    best_word, best_prob = max(word_probs.items(), key=lambda x: x[1])

    # Calculate the sum of probabilities for the other words
    other_probs_sum = sum(prob for word, prob in word_probs.items() if word != best_word)

    return best_word, best_prob, other_probs_sum
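
# A minimal usage sketch (hypothetical contexts, shown only to illustrate the
# return values; not actual output):
#
#     word, p, rest = predict_missing_word(model, 'presid unit state', 'deliv speech')
#     print(word, p, rest)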

# %%
# Initialize a counter for the correct predictions
correct_predictions = 0

# Open the output file
with open('out.tsv', 'w') as f:
    # Iterate over the rows of the input DataFrame and the expected DataFrame
    for i, ((_, input_row), expected_word) in enumerate(zip(in_df.iterrows(), expected_df[0])):
        try:
            # Get the context
            context_1 = input_row['text_1']
            context_2 = input_row['text_2']

            # Predict the missing word and get the probabilities
            predicted_word, prob, other_probs_sum = predict_missing_word(model, context_1, context_2)

            # If any of the values are None, use placeholder values
            if predicted_word is None:
                predicted_word = 'placeholder'
            if prob is None:
                prob = 0.001
            if other_probs_sum is None:
                other_probs_sum = 0.001

            # Write 'word:prob' plus a catch-all ':prob' covering the remaining mass
            f.write(f"{predicted_word}:{prob:.4f} :{other_probs_sum:.4f}\n")

            # Check if the prediction is correct
            if predicted_word == expected_word:
                correct_predictions += 1

            # Log progress every 1000 iterations
            if i % 1000 == 0:
                print(f"Processed {i} rows. Current accuracy: {correct_predictions / (i + 1)}")

        except Exception as e:
            print(f"Error processing row {i}: {e}")

# Calculate the accuracy
accuracy = correct_predictions / len(in_df)

print(f"The accuracy of the model is {accuracy}")

# %%
import pandas as pd
import csv

# Placeholder line for malformed rows
placeholder_line = ['placeholder'] * 8  # Adjust the number of fields as needed

# Read the file line by line
with open('./challenging-america-word-gap-prediction/test-A/in.tsv', 'r') as f:
    lines = f.readlines()

# Split each line into fields and replace problematic lines with the placeholder line
lines = [line.strip().split('\t') if len(line.strip().split('\t')) == 8 else placeholder_line for line in lines]

# Convert the list of lines into a DataFrame
in_df = pd.DataFrame(lines)

# Print the number of rows in the DataFrame
print(in_df.shape[0])

# Drop unnecessary columns
columns_to_drop = [0, 1, 2, 3, 4, 5]  # Column indices to drop
in_df.drop(columns_to_drop, axis=1, inplace=True)

# Rename remaining columns for clarity
in_df.columns = ['text_1', 'text_2']

in_df['text_1'].fillna('placeholder', inplace=True)
in_df['text_2'].fillna('placeholder', inplace=True)

# %%
from tqdm import tqdm

# Define functions to replace the literal escape sequences '\n' and '\t' with spaces
def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

def replace_tabulation(text):
    if isinstance(text, str):
        return text.replace('\\t', ' ')
    return text

# Apply the functions to the 'text_1' and 'text_2' columns and show progress
tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_tabulation)

tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_tabulation)

# %%
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm

# If not already done, download the NLTK English stopwords and the Punkt tokenizer models
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize the text
    words = word_tokenize(text)

    # Remove stopwords and stem the words
    words = [stemmer.stem(word) for word in words if word not in stop_words]

    # Join the words back into a single string
    text = ' '.join(words)

    return text

# Apply the preprocessing to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)

# %%
import re

def handle_numbers_and_special_chars(text):
    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove special characters
    text = re.sub(r'\W+', ' ', text)

    return text

# Apply the function to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(handle_numbers_and_special_chars)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(handle_numbers_and_special_chars)

# %%
in_df.to_csv('preprocessed_test_text1_text2.tsv', sep='\t', index=False)

# %%
# Initialize a counter for the correct predictions
correct_predictions = 0

# Open the output file
with open('out.tsv', 'w') as f:
    # Iterate over the rows of the input DataFrame and the expected DataFrame
    # (note: expected_df still holds the dev-0 answers at this point, so the
    # accuracy printed below is not meaningful for test-A; only out.tsv matters)
    for i, ((_, input_row), expected_word) in enumerate(zip(in_df.iterrows(), expected_df[0])):
        try:
            # Get the context
            context_1 = input_row['text_1']
            context_2 = input_row['text_2']

            # Predict the missing word and get the probabilities
            predicted_word, prob, other_probs_sum = predict_missing_word(model, context_1, context_2)

            # If any of the values are None, use placeholder values
            if predicted_word is None:
                predicted_word = 'placeholder'
            if prob is None:
                prob = 0.001
            if other_probs_sum is None:
                other_probs_sum = 0.001

            # Write 'word:prob' plus a catch-all ':prob' covering the remaining mass
            f.write(f"{predicted_word}:{prob:.4f} :{other_probs_sum:.4f}\n")

            # Check if the prediction is correct
            if predicted_word == expected_word:
                correct_predictions += 1

            # Log progress every 1000 iterations
            if i % 1000 == 0:
                print(f"Processed {i} rows. Current accuracy: {correct_predictions / (i + 1)}")

        except Exception as e:
            print(f"Error processing row {i}: {e}")

# Calculate the accuracy
accuracy = correct_predictions / len(in_df)

print(f"The accuracy of the model is {accuracy}")
7414  test-A/hate-speech-info.tsv  Normal file
File diff suppressed because it is too large
7414  test-A/in.tsv  Normal file
File diff suppressed because it is too large
BIN  test-A/in.tsv.xz  Normal file
Binary file not shown.
7414  test-A/out.tsv  Normal file
File diff suppressed because it is too large