06_kenlm/06_kenlm.ipynb

import os
import lzma

folders = ["./challenging-america-word-gap-prediction/dev-0",
           "./challenging-america-word-gap-prediction/test-A",
           "./challenging-america-word-gap-prediction/train"]

for folder in folders:
    for file in os.listdir(folder):
        if file.endswith(".tsv.xz"):
            file_path = os.path.join(folder, file)
            output_path = os.path.splitext(file_path)[0]  # Remove the .xz extension
            with lzma.open(file_path, "rb") as compressed_file:
                with open(output_path, "wb") as output_file:
                    output_file.write(compressed_file.read())
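A quick optional check that the archives were actually decompressed (a sketch using only the folders list defined above):

for folder in folders:
    print(folder, [f for f in os.listdir(folder) if f.endswith(".tsv")])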
import nltk
nltk.download('punkt')
[nltk_data] Downloading package punkt to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
True
import pandas as pd

# Load the data
in_df = pd.read_csv('./challenging-america-word-gap-prediction/train/in.tsv', sep='\t', header=None, on_bad_lines='warn')
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None, on_bad_lines='warn')

# Print out the first few rows of each DataFrame
print("in_df:")
print(in_df.head())
print("\nexpected_df:")
print(expected_df.head())
print("\nhate_speech_info_df:")

# Print out more information about each DataFrame
print("\nin_df info:")
print(in_df.info())
print("\nexpected_df info:")
print(expected_df.info())
Skipping line 30538: expected 8 fields, saw 9
Skipping line 37185: expected 8 fields, saw 9
Skipping line 40930: expected 8 fields, saw 9
Skipping line 44499: expected 8 fields, saw 9
Skipping line 46409: expected 8 fields, saw 9
Skipping line 52642: expected 8 fields, saw 9
Skipping line 53046: expected 8 fields, saw 9

Skipping line 69658: expected 8 fields, saw 9
Skipping line 71325: expected 8 fields, saw 9
Skipping line 72955: expected 8 fields, saw 9
Skipping line 80528: expected 8 fields, saw 9
Skipping line 96979: expected 8 fields, saw 9
Skipping line 121731: expected 8 fields, saw 9
Skipping line 126630: expected 8 fields, saw 9

Skipping line 132289: expected 8 fields, saw 9
Skipping line 140251: expected 8 fields, saw 9
Skipping line 142374: expected 8 fields, saw 9
Skipping line 149592: expected 8 fields, saw 9
Skipping line 150041: expected 8 fields, saw 9
Skipping line 151624: expected 8 fields, saw 9
Skipping line 158163: expected 8 fields, saw 9
Skipping line 159665: expected 8 fields, saw 9
Skipping line 171749: expected 8 fields, saw 9
Skipping line 174845: expected 8 fields, saw 9
Skipping line 177638: expected 8 fields, saw 9
Skipping line 178778: expected 8 fields, saw 9
Skipping line 188823: expected 8 fields, saw 9
Skipping line 191398: expected 8 fields, saw 9

Skipping line 196865: expected 8 fields, saw 9
Skipping line 203572: expected 8 fields, saw 9
Skipping line 207802: expected 8 fields, saw 9
Skipping line 214509: expected 8 fields, saw 9
Skipping line 214633: expected 8 fields, saw 9
Skipping line 217906: expected 8 fields, saw 9
Skipping line 220906: expected 8 fields, saw 9
Skipping line 238000: expected 8 fields, saw 9
Skipping line 257754: expected 8 fields, saw 9
Skipping line 259366: expected 8 fields, saw 9
Skipping line 261826: expected 8 fields, saw 9

Skipping line 272727: expected 8 fields, saw 9
Skipping line 280527: expected 8 fields, saw 9
Skipping line 282454: expected 8 fields, saw 9
Skipping line 285910: expected 8 fields, saw 9
Skipping line 289865: expected 8 fields, saw 9
Skipping line 292892: expected 8 fields, saw 9
Skipping line 292984: expected 8 fields, saw 9
Skipping line 293058: expected 8 fields, saw 9
Skipping line 302716: expected 8 fields, saw 9
Skipping line 303370: expected 8 fields, saw 9
Skipping line 314194: expected 8 fields, saw 9
Skipping line 321975: expected 8 fields, saw 9
Skipping line 324999: expected 8 fields, saw 9

Skipping line 331978: expected 8 fields, saw 9
Skipping line 345426: expected 8 fields, saw 9
Skipping line 345951: expected 8 fields, saw 9
Skipping line 355430: expected 8 fields, saw 9
Skipping line 358744: expected 8 fields, saw 9
Skipping line 361491: expected 8 fields, saw 9
Skipping line 370443: expected 8 fields, saw 9
Skipping line 388057: expected 8 fields, saw 9
Skipping line 391061: expected 8 fields, saw 9

Skipping line 395391: expected 8 fields, saw 9
Skipping line 404270: expected 8 fields, saw 9
Skipping line 407896: expected 8 fields, saw 9
Skipping line 409881: expected 8 fields, saw 9
Skipping line 421230: expected 8 fields, saw 9
Skipping line 425850: expected 8 fields, saw 9
Skipping line 427269: expected 8 fields, saw 9

in_df:
                                  0                           1        2  \
0  4e04702da929c78c52baf09c1851d3ff                          ST  ChronAm   
1  b374dadd940510271d9675d3e8caf9d8   DAILY ARIZONA SILVER BELT  ChronAm   
2  adb666c426bdc10fd949cb824da6c0d0   THE SAVANNAH MORNING NEWS  ChronAm   
3  bc2c9aa0b77d724311e3c2e12fc61c92  CHARLES CITY INTELLIGENCER  ChronAm   
4  0f612b991a39c712f0d745835b8b2f0d                EVENING STAR  ChronAm   

             3          4           5  \
0  1919.604110  30.475470  -90.100911   
1  1909.097260  33.399478 -110.870950   
2  1900.913699  32.080926  -81.091177   
3  1864.974044  43.066361  -92.672411   
4  1878.478082  38.894955  -77.036646   

                                                   6  \
0  came fiom the last place to this\nplace, and t...   
1  MB. BOOT'S POLITICAL OBEED\nAttempt to imagine...   
2  Thera were in 1771 only aeventy-nine\n*ub*erlb...   
3  whenever any prize property shall!*' condemn- ...   
4  SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T...   

                                                   7  
0  said\nit's all squash. The best I could get\ni...  
1  \ninto a proper perspective with those\nminor ...  
2                                                NaN  
3  the ceitihcate of'\noperate to prevent tfie ma...  
4  \nTerms of sale: One-tblrd, togethor with the ...  

expected_df:
         0
0      lie
1  himself
2       of
3     ably
4        j

in_df info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 428517 entries, 0 to 428516
Data columns (total 8 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   0       428517 non-null  object 
 1   1       428517 non-null  object 
 2   2       428517 non-null  object 
 3   3       428517 non-null  float64
 4   4       428517 non-null  float64
 5   5       428517 non-null  float64
 6   6       428517 non-null  object 
 7   7       425735 non-null  object 
dtypes: float64(3), object(5)
memory usage: 26.2+ MB
None

expected_df info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279623 entries, 0 to 279622
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   0       279619 non-null  object
dtypes: object(1)
memory usage: 2.1+ MB
None
# Drop unnecessary columns
columns_to_drop = [0, 1, 2, 3, 4, 5]  # Column indices to drop
in_df.drop(columns_to_drop, axis=1, inplace=True)

# Rename remaining columns for clarity
in_df.columns = ['text_1', 'text_2']
in_df['text_2'].fillna('', inplace=True)
from tqdm import tqdm

# Define a function to replace '\n' with ' '
def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

def replace_tabulation(text):
    if isinstance(text, str):
        return text.replace('\\t', ' ')
    return text

# Apply the function to 'text_1' and 'text_2' columns and show progress
tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_tabulation)

tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_tabulation)
Replacing '\n' in 'text_1': 100%|██████████| 428517/428517 [00:02<00:00, 166646.43it/s]
Replacing '\t' in 'text_1': 100%|██████████| 428517/428517 [00:01<00:00, 422489.36it/s]
Replacing '\n' in 'text_2': 100%|██████████| 428517/428517 [00:02<00:00, 149443.26it/s]
Replacing '\t' in 'text_2': 100%|██████████| 428517/428517 [00:01<00:00, 417969.18it/s]
print(in_df.head())
                                              text_1  \
0  came fiom the last place to this place, and th...   
1  MB. BOOT'S POLITICAL OBEED Attempt to imagine ...   
2  Thera were in 1771 only aeventy-nine *ub*erlbe...   
3  whenever any prize property shall!*' condemn- ...   
4  SA LKOFVALUABLE UNIMPBOV&D RE\\\\L JSIATF. ON TH...   

                                              text_2  
0  said it's all squash. The best I could get in ...  
1   into a proper perspective with those minor se...  
2                                                     
3  the ceitihcate of' operate to prevent tfie mak...  
4   Terms of sale: One-tblrd, togethor with the e...  
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm


# If not already done, download the NLTK English stopwords and the Punkt Tokenizer Models
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize the text
    words = word_tokenize(text)
    
    # Remove stopwords and stem the words
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    
    # Join the words back into a single string
    text = ' '.join(words)
    
    return text

# Apply the preprocessing to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)


tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)
[nltk_data] Downloading package punkt to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Processing 'text_1': 100%|██████████| 428517/428517 [16:38<00:00, 429.15it/s]
Processing 'text_2': 100%|██████████| 428517/428517 [14:55<00:00, 478.46it/s]
print(in_df.head())
                                              text_1  \
0  came fiom last place place place first road ev...   
1  mb boot polit obe attempt imagin piatt make ad...   
2  thera 1771 aeventynin uberlb lo lloyd nearli 1...   
3  whenev prize properti shall condemn appeal dis...   
4  sa lkofvalu unimpbovd rel jsiatf north bideof ...   

                                              text_2  
0  said squash best could get hotel soup sandwich...  
1  proper perspect minor senatori duti tho fill i...  
2                                                     
3  ceitihc oper prevent tfie make execut district...  
4  term sale onetblrd togethor ex¬ pens sale cash...  
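As a quick illustration (the sentence is made up, not taken from the corpus), preprocess_text lowercases, strips punctuation, drops stopwords and Porter-stems what is left:

sample = "The Soldiers were marching quickly toward the river."
print(preprocess_text(sample))  # prints roughly: 'soldier march quickli toward river'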
# Save 'in_df' DataFrame to a .tsv file
in_df.to_csv('preprocessed_text1_text2.tsv', sep='\t', index=False)
import re

def handle_numbers_and_special_chars(text):
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove special characters
    text = re.sub(r'\W+', ' ', text)
    
    return text

# Apply the function to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(handle_numbers_and_special_chars)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(handle_numbers_and_special_chars)
Processing 'text_1': 100%|██████████| 428517/428517 [00:21<00:00, 19801.75it/s]
Processing 'text_2': 100%|██████████| 428517/428517 [00:25<00:00, 16938.99it/s]
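For reference, the effect of handle_numbers_and_special_chars on a made-up string:

print(handle_numbers_and_special_chars("abc123 def!!"))  # -> 'abc def ' (digits removed, '!!' collapsed to a space)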
from spellchecker import SpellChecker

spell = SpellChecker()

def correct_spelling(text):
    # Tokenize the text
    words = text.split()
    
    # Correct spelling; correction() returns None for unknown words, so fall back to ''
    corrected_words = [spell.correction(word) or '' for word in words]
    
    # Join the words back into a single string
    text = ' '.join(corrected_words)
    
    return text

# Apply the spelling correction to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Spelling Correction 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(correct_spelling)

tqdm.pandas(desc="Spelling Correction 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(correct_spelling)
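A note on runtime: spell.correction() is called for every token of every document, which is very slow at this scale. A minimal speed-up sketch, on the assumption that the same tokens repeat many times, is to memoise lookups (cached_correction and correct_spelling_cached are hypothetical helpers, not part of the original pipeline):

from functools import lru_cache

@lru_cache(maxsize=None)
def cached_correction(word):
    # Look each distinct token up only once; fall back to '' when pyspellchecker returns None
    corrected = spell.correction(word)
    return corrected if corrected is not None else ''

def correct_spelling_cached(text):
    return ' '.join(cached_correction(word) for word in text.split())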
# Define a function to concatenate 'text_1' and 'text_2'
def concatenate_texts(row):
    return str(row['text_1']) + ' <MASK> ' + str(row['text_2'])

# Apply the function to each row and show progress
tqdm.pandas(desc="Concatenating 'text_1' and 'text_2'")
in_df['text'] = in_df.progress_apply(concatenate_texts, axis=1)

# Now you can drop 'text_1' and 'text_2' columns if you want
in_df.drop(['text_1', 'text_2'], axis=1, inplace=True)
from tqdm import tqdm
tqdm.pandas()

# Load the preprocessed data
in_df = pd.read_csv('preprocessed_text1_text2.tsv', sep='\t')

# Load the expected words
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/train/expected.tsv', sep='\t', header=None, on_bad_lines='warn')
expected_df.columns = ['expected_word']

# Add the expected words to in_df
in_df = pd.concat([in_df, expected_df], axis=1)
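# Caveat: malformed rows were skipped when reading in.tsv (on_bad_lines='warn') while expected.tsv
# was read in full with default quoting, so the row counts differ and the gap words are not
# guaranteed to line up exactly with their contexts after this concat.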

# Define a function to concatenate 'text_1' and 'expected_word' and 'text_2'
def concatenate_texts(row):
    return str(row['text_1']) + ' ' + str(row['expected_word']) + ' ' + str(row['text_2'])

# Apply the function to each row and show progress
in_df['unmasked_text'] = in_df.progress_apply(concatenate_texts, axis=1)
100%|██████████| 428517/428517 [00:05<00:00, 83867.84it/s]
in_df['unmasked_text'].to_csv('preprocessed_text_join_unmask.tsv', sep='\t', index=False)
# Save the 'text' column to a text file
in_df['unmasked_text'].to_csv('training_data.txt', index=False, header=False)
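A quick look at the file KenLM will train on (optional sanity check; output omitted here):

!head -n 2 training_data.txt
!wc -l training_data.txt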
!lmplz -o 5 --discount_fallback < training_data.txt > language_model.arpa
=== 1/5 Counting and sorting n-grams ===
Reading /teamspace/studios/this_studio/training_data.txt
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Unigram tokens 76946114 types 3214378
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:38572536 2:1266848384 3:2375340800 4:3800545280 5:5542461952
Statistics:
1 3214378 D1=0.828129 D2=1.02195 D3+=1.19945
2 28265496 D1=0.815336 D2=1.00509 D3+=1.20745
3 64425776 D1=0.933716 D2=1.31704 D3+=1.47374
4 71021141 D1=0.980912 D2=1.49465 D3+=1.62427
5 72056555 D1=0.968608 D2=1.34446 D3+=1.42151
Memory estimate for binary LM:
type      MB
probing 5063 assuming -p 1.5
probing 6012 assuming -r models -p 1.5
trie    2711 without quantization
trie    1596 assuming -q 8 -b 8 quantization 
trie    2329 assuming -a 22 array pointer compression
trie    1214 assuming -a 22 -q 8 -b 8 array pointer compression and quantization
=== 3/5 Calculating and sorting initial probabilities ===
Chain sizes: 1:38572536 2:452247936 3:1288515520 4:1704507384 5:2017583540
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
####################################################################################################
=== 4/5 Calculating and writing order-interpolated probabilities ===
Chain sizes: 1:38572536 2:452247936 3:1288515520 4:1704507384 5:2017583540
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
####################################################################################################
=== 5/5 Writing ARPA model ===
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Name:lmplz	VmPeak:12890788 kB	VmRSS:4496 kB	RSSMax:5436904 kB	user:274.028	sys:89.3375	CPU:363.366	real:343.888
!build_binary language_model.arpa language_model.binary
Reading language_model.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
SUCCESS
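A minimal sanity check that the binary model loads and scores (a sketch, assuming the kenlm Python bindings are installed; scores are log10 probabilities):

import kenlm

lm = kenlm.Model('language_model.binary')
print(lm.order)                                 # expected to report 5 for this model
print(lm.score('said squash best could get'))   # log10 probability of a sample preprocessed string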
!lmplz -o 6 --discount_fallback -S 80% --prune 0 0 1 1 1 1 < training_data.txt > language_model.arpa
=== 1/5 Counting and sorting n-grams ===
Reading /teamspace/studios/this_studio/training_data.txt
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Unigram tokens 76946114 types 3214378
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:38572536 2:799089024 3:1498291968 4:2397267200 5:3496014592 6:4794534400
Statistics:
1 3214378 D1=0.828129 D2=1.02195 D3+=1.19945
2 28265496 D1=0.815336 D2=1.00509 D3+=1.20745
3 4497710/64425776 D1=0.933716 D2=1.31704 D3+=1.47374
4 2178876/71021141 D1=0.980912 D2=1.49465 D3+=1.62427
5 1699108/72056555 D1=0.988779 D2=1.56326 D3+=1.72651
6 1460655/72365687 D1=0.972772 D2=1.34844 D3+=1.44176
Memory estimate for binary LM:
type      MB
probing  943 assuming -p 1.5
probing 1165 assuming -r models -p 1.5
trie     553 without quantization
trie     343 assuming -q 8 -b 8 quantization 
trie     474 assuming -a 22 array pointer compression
trie     265 assuming -a 22 -q 8 -b 8 array pointer compression and quantization
=== 3/5 Calculating and sorting initial probabilities ===
Chain sizes: 1:38572536 2:452247936 3:89954200 4:52293024 5:47575024 6:46740960
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
**##################################################################################################
=== 4/5 Calculating and writing order-interpolated probabilities ===
Chain sizes: 1:38572536 2:452247936 3:89954200 4:52293024 5:47575024 6:46740960
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
####################################################################################################
=== 5/5 Writing ARPA model ===
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Name:lmplz	VmPeak:12907180 kB	VmRSS:5872 kB	RSSMax:7679904 kB	user:168.818	sys:56.6427	CPU:225.461	real:147.307
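Note that the querying code below loads language_model.binary, which was built from the earlier 5-gram ARPA file; if this pruned 6-gram model is the one intended for querying, the binary would need to be rebuilt first:

!build_binary language_model.arpa language_model.binary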
!wc -l ./challenging-america-word-gap-prediction/dev-0/in.tsv
10519 ./challenging-america-word-gap-prediction/dev-0/in.tsv
import pandas as pd
import csv

# Load the data
try:
    in_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/in.tsv', sep='\t', header=None, on_bad_lines='error')
except Exception as e:
    print(e)
expected_df = pd.read_csv('./challenging-america-word-gap-prediction/dev-0/expected.tsv', sep='\t', header=None, on_bad_lines='warn', quoting=csv.QUOTE_NONE)

print(in_df.shape[0])

# Drop unnecessary columns
columns_to_drop = [0, 1, 2, 3, 4, 5]  # Column indices to drop
in_df.drop(columns_to_drop, axis=1, inplace=True)

# Rename remaining columns for clarity
in_df.columns = ['text_1', 'text_2']

in_df['text_1'].fillna('placeholder', inplace=True)
in_df['text_2'].fillna('placeholder', inplace=True)
import pandas as pd
import csv

# Placeholder line
placeholder_line = ['placeholder'] * 8  # Adjust the number of fields as needed

# Read the file line by line
with open('./challenging-america-word-gap-prediction/dev-0/in.tsv', 'r') as f:
    lines = f.readlines()

# Split each line into fields and replace problematic lines with the placeholder line
lines = [line.strip().split('\t') if len(line.strip().split('\t')) == 8 else placeholder_line for line in lines]

# Convert the list of lines into a DataFrame
in_df = pd.DataFrame(lines)

# Print the number of rows in the DataFrame
print(in_df.shape[0])

# Drop unnecessary columns
columns_to_drop = [0, 1, 2, 3, 4, 5]  # Column indices to drop
in_df.drop(columns_to_drop, axis=1, inplace=True)

# Rename remaining columns for clarity
in_df.columns = ['text_1', 'text_2']

in_df['text_1'].fillna('placeholder', inplace=True)
in_df['text_2'].fillna('placeholder', inplace=True)
10519
print(in_df.shape[0])
10519
from tqdm import tqdm

# Define a function to replace '\n' with ' '
def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

def replace_tabulation(text):
    if isinstance(text, str):
        return text.replace('\\t', ' ')
    return text

# Apply the function to 'text_1' and 'text_2' columns and show progress
tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_tabulation)

tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_tabulation)
Replacing '\n' in 'text_1': 100%|██████████| 10519/10519 [00:00<00:00, 223066.53it/s]
Replacing '\t' in 'text_1': 100%|██████████| 10519/10519 [00:00<00:00, 535825.65it/s]
Replacing '\n' in 'text_2': 100%|██████████| 10519/10519 [00:00<00:00, 216324.84it/s]
Replacing '\t' in 'text_2': 100%|██████████| 10519/10519 [00:00<00:00, 534630.94it/s]
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm


# If not already done, download the NLTK English stopwords and the Punkt Tokenizer Models
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize the text
    words = word_tokenize(text)
    
    # Remove stopwords and stem the words
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    
    # Join the words back into a single string
    text = ' '.join(words)
    
    return text

# Apply the preprocessing to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)


tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)
[nltk_data] Downloading package punkt to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Processing 'text_1': 100%|██████████| 10519/10519 [00:23<00:00, 440.48it/s]
Processing 'text_2': 100%|██████████| 10519/10519 [00:29<00:00, 358.63it/s]
import re

def handle_numbers_and_special_chars(text):
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove special characters
    text = re.sub(r'\W+', ' ', text)
    
    return text

# Apply the function to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(handle_numbers_and_special_chars)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(handle_numbers_and_special_chars)
Processing 'text_1': 100%|██████████| 10519/10519 [00:00<00:00, 21823.01it/s]
Processing 'text_2': 100%|██████████| 10519/10519 [00:00<00:00, 21693.77it/s]
in_df.to_csv('preprocessed_dev_text1_text2.tsv', sep='\t', index=False)
import kenlm

# Load the language model
model = kenlm.Model('language_model.binary')
import numpy as np

def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

def predict_missing_word(model, context_1, context_2):
    # Define the vocabulary
    vocabulary = set(' '.join([context_1, context_2]).split())

    # Initialize a dictionary to store the words and their scores
    word_scores = {}

    # Iterate over the vocabulary
    for word in vocabulary:
        try:
            # Generate the sentence
            sentence = f"{context_1} {word} {context_2}"
            
            # Score the sentence and store it in the dictionary
            word_scores[word] = model.score(sentence)

        except Exception as e:
            print(f"Error processing word '{word}': {e}")
            continue

    # If no word was found, return None for all values
    if not word_scores:
        return None, None, None

    # Convert the scores to probabilities using the softmax function
    word_probs = {word: max(0.001, prob) for word, prob in zip(word_scores.keys(), softmax(list(word_scores.values())))}

    # Find the word with the highest probability
    best_word, best_prob = max(word_probs.items(), key=lambda x: x[1])

    # Calculate the sum of probabilities for the other words
    other_probs_sum = sum(prob for word, prob in word_probs.items() if word != best_word)

    return best_word, best_prob, other_probs_sum
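Example call on a pair of made-up, already-preprocessed contexts; note that the candidate vocabulary is only the words appearing in the two contexts themselves:

word, prob, rest = predict_missing_word(model, 'presid said would', 'sign bill next week')
print(word, prob, rest)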
# Initialize a counter for the correct predictions
correct_predictions = 0

# Open the output file
with open('out.tsv', 'w') as f:
    # Iterate over the rows of the input DataFrame and the expected DataFrame
    for i, ((_, input_row), expected_word) in enumerate(zip(in_df.iterrows(), expected_df[0])):
        try:
            # Get the context
            context_1 = input_row['text_1']
            context_2 = input_row['text_2']

            # Predict the missing word and get the probabilities
            predicted_word, prob, other_probs_sum = predict_missing_word(model, context_1, context_2)

            # If any of the values are None, use placeholder values
            if predicted_word is None:
                predicted_word = 'placeholder'
            if prob is None:
                prob = 0.001
            if other_probs_sum is None:
                other_probs_sum = 0.001

            # Write the output to the file
            f.write(f"{predicted_word}:{prob:.4f} :{other_probs_sum:.4f}\n")

            # Check if the prediction is correct
            if predicted_word == expected_word:
                correct_predictions += 1

            # Log progress every 1000 iterations
            if i % 1000 == 0:
                print(f"Processed {i} rows. Current accuracy: {correct_predictions / (i+1)}")

        except Exception as e:
            print(f"Error processing row {i}: {e}")

# Calculate the accuracy
accuracy = correct_predictions / len(in_df)

print(f"The accuracy of the model is {accuracy}")
Processed 0 rows. Current accuracy: 0.0
Processed 1000 rows. Current accuracy: 0.016983016983016984
Processed 2000 rows. Current accuracy: 0.026486756621689155
Processed 3000 rows. Current accuracy: 0.02599133622125958
Processed 4000 rows. Current accuracy: 0.024493876530867282
Processed 5000 rows. Current accuracy: 0.02559488102379524
Processed 6000 rows. Current accuracy: 0.024329278453591067
Processed 7000 rows. Current accuracy: 0.023853735180688472
Processed 8000 rows. Current accuracy: 0.023122109736282963
Processed 9000 rows. Current accuracy: 0.022886345961559827
Processed 10000 rows. Current accuracy: 0.0225977402259774
The accuracy of the model is 0.02243559273695218
import pandas as pd
import csv

# Placeholder line
placeholder_line = ['placeholder'] * 8  # Adjust the number of fields as needed

# Read the file line by line
with open('./challenging-america-word-gap-prediction/test-A/in.tsv', 'r') as f:
    lines = f.readlines()

# Split each line into fields and replace problematic lines with the placeholder line
lines = [line.strip().split('\t') if len(line.strip().split('\t')) == 8 else placeholder_line for line in lines]

# Convert the list of lines into a DataFrame
in_df = pd.DataFrame(lines)

# Print the number of rows in the DataFrame
print(in_df.shape[0])

# Drop unnecessary columns
columns_to_drop = [0, 1, 2, 3, 4, 5]  # Column indices to drop
in_df.drop(columns_to_drop, axis=1, inplace=True)

# Rename remaining columns for clarity
in_df.columns = ['text_1', 'text_2']

in_df['text_1'].fillna('placeholder', inplace=True)
in_df['text_2'].fillna('placeholder', inplace=True)
7414
from tqdm import tqdm

# Define a function to replace '\n' with ' '
def replace_newline(text):
    if isinstance(text, str):
        return text.replace('\\n', ' ')
    return text

def replace_tabulation(text):
    if isinstance(text, str):
        return text.replace('\\t', ' ')
    return text

# Apply the function to 'text_1' and 'text_2' columns and show progress
tqdm.pandas(desc="Replacing '\\n' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(replace_tabulation)

tqdm.pandas(desc="Replacing '\\n' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_newline)

tqdm.pandas(desc="Replacing '\\t' in 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(replace_tabulation)
Replacing '\n' in 'text_1': 100%|██████████| 7414/7414 [00:00<00:00, 216746.15it/s]
Replacing '\t' in 'text_1': 100%|██████████| 7414/7414 [00:00<00:00, 545247.75it/s]
Replacing '\n' in 'text_2': 100%|██████████| 7414/7414 [00:00<00:00, 223832.27it/s]
Replacing '\t' in 'text_2': 100%|██████████| 7414/7414 [00:00<00:00, 569784.70it/s]
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
from tqdm import tqdm


# If not already done, download the NLTK English stopwords and the Punkt Tokenizer Models
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize the text
    words = word_tokenize(text)
    
    # Remove stopwords and stem the words
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    
    # Join the words back into a single string
    text = ' '.join(words)
    
    return text

# Apply the preprocessing to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(preprocess_text)


tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(preprocess_text)
[nltk_data] Downloading package punkt to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Processing 'text_1': 100%|██████████| 7414/7414 [00:20<00:00, 365.75it/s]
Processing 'text_2': 100%|██████████| 7414/7414 [00:15<00:00, 478.59it/s]
import re

def handle_numbers_and_special_chars(text):
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove special characters
    text = re.sub(r'\W+', ' ', text)
    
    return text

# Apply the function to the 'text_1' and 'text_2' columns
tqdm.pandas(desc="Processing 'text_1'")
in_df['text_1'] = in_df['text_1'].progress_apply(handle_numbers_and_special_chars)

tqdm.pandas(desc="Processing 'text_2'")
in_df['text_2'] = in_df['text_2'].progress_apply(handle_numbers_and_special_chars)
Processing 'text_1': 100%|██████████| 7414/7414 [00:00<00:00, 21928.33it/s]
Processing 'text_2': 100%|██████████| 7414/7414 [00:00<00:00, 20930.30it/s]
in_df.to_csv('preprocessed_test_text1_text2.tsv', sep='\t', index=False)
# Initialize a counter for the correct predictions
correct_predictions = 0
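# Note: expected_df was last loaded from dev-0/expected.tsv above, so the 'accuracy' printed
# below is not a meaningful score for the test-A split.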

# Open the output file
with open('out.tsv', 'w') as f:
    # Iterate over the rows of the input DataFrame and the expected DataFrame
    for i, ((_, input_row), expected_word) in enumerate(zip(in_df.iterrows(), expected_df[0])):
        try:
            # Get the context
            context_1 = input_row['text_1']
            context_2 = input_row['text_2']

            # Predict the missing word and get the probabilities
            predicted_word, prob, other_probs_sum = predict_missing_word(model, context_1, context_2)

            # If any of the values are None, use placeholder values
            if predicted_word is None:
                predicted_word = 'placeholder'
            if prob is None:
                prob = 0.001
            if other_probs_sum is None:
                other_probs_sum = 0.001

            # Write the output to the file
            f.write(f"{predicted_word}:{prob:.4f} :{other_probs_sum:.4f}\n")

            # Check if the prediction is correct
            if predicted_word == expected_word:
                correct_predictions += 1

            # Log progress every 1000 iterations
            if i % 1000 == 0:
                print(f"Processed {i} rows. Current accuracy: {correct_predictions / (i+1)}")

        except Exception as e:
            print(f"Error processing row {i}: {e}")

# Calculate the accuracy
accuracy = correct_predictions / len(in_df)

print(f"The accuracy of the model is {accuracy}")
Processed 0 rows. Current accuracy: 0.0
Processed 1000 rows. Current accuracy: 0.000999000999000999
Processed 2000 rows. Current accuracy: 0.0004997501249375312
Processed 3000 rows. Current accuracy: 0.0003332222592469177
Processed 4000 rows. Current accuracy: 0.0004998750312421895
Processed 5000 rows. Current accuracy: 0.0005998800239952009
Processed 6000 rows. Current accuracy: 0.0006665555740709882
Processed 7000 rows. Current accuracy: 0.0005713469504356521
The accuracy of the model is 0.0005395198273536552