challenging-america-word-ga.../trigram.ipynb
2023-05-10 00:37:23 +02:00

60 KiB
Raw Blame History

# Colab-only: mount Google Drive so the challenge repo on MyDrive is reachable.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
cd drive/MyDrive
/content/drive/MyDrive
cd challenging-america-word-gap-prediction/
/content/drive/MyDrive/challenging-america-word-gap-prediction
import pandas as pd
# Training inputs: xz-compressed TSV; columns 6 and 7 hold the text before
# and after the word gap (columns 0-5 are newspaper metadata).
data =  pd.read_csv("train/in.tsv.xz", sep="\t", on_bad_lines='skip', header=None, encoding="utf-8")

# One expected (gap) word per training row, aligned with `data` by position.
exp_words =  pd.read_csv("train/expected.tsv", sep="\t", on_bad_lines='skip', header=None, encoding="utf-8")
data[:10]
0 1 2 3 4 5 6 7
0 4e04702da929c78c52baf09c1851d3ff ST ChronAm 1919.604110 30.475470 -90.100911 came fiom the last place to this\nplace, and t... said\nit's all squash. The best I could get\ni...
1 b374dadd940510271d9675d3e8caf9d8 DAILY ARIZONA SILVER BELT ChronAm 1909.097260 33.399478 -110.870950 MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... \ninto a proper perspective with those\nminor ...
2 adb666c426bdc10fd949cb824da6c0d0 THE SAVANNAH MORNING NEWS ChronAm 1900.913699 32.080926 -81.091177 Thera were in 1771 only aeventy-nine\n*ub*erlb... NaN
3 bc2c9aa0b77d724311e3c2e12fc61c92 CHARLES CITY INTELLIGENCER ChronAm 1864.974044 43.066361 -92.672411 whenever any prize property shall!*' condemn- ... the ceitihcate of'\noperate to prevent tfie ma...
4 0f612b991a39c712f0d745835b8b2f0d EVENING STAR ChronAm 1878.478082 38.894955 -77.036646 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... \nTerms of sale: One-tblrd, togethor with the ...
5 4c13fb3d2e6eef35fa28e7bae7868d60 EDGEFIELD ADVERTISER ChronAm 1913.346575 33.789577 -81.929558 God includes all. and would we not\ngrieve if ... lot of spiritual\nwaifs all about us. children...
6 a452eadfc3f4a475147728c5f4005429 DAILY LOS ANGELES HERALD ChronAm 1883.801370 34.054935 -118.244476 The said action is brought to obtain a decree ... then to obtain an execution against said Vie\n...
7 b970ee32372d81f1fd59ab6196e797c9 THE FINDLAY JEFFERSONIAN ChronAm 1874.828767 41.041387 -83.650398 party" is a useless exhortation to intel-\nlig... with all tjie hatred that\nsurvives the war; a...
8 d130f899a50db2792c546cc978dc930c BUTLER CITIZEN ChronAm 1883.793151 40.861021 -79.895225 has led me to accept, everything I read\nwith ... that the earth has mo-\ntion. Aday ortwo agoIt...
9 80e56928e09b93529d206708ac905b63 FERGUS COUNTY ARGUS ChronAm 1892.821038 47.062473 -109.428238 The wool circulars alluded to are\nthose which... accuracy, as\nthey were furnished by him as ch...
train_data = data[[6, 7]]
train_data[:10]
6 7
0 came fiom the last place to this\nplace, and t... said\nit's all squash. The best I could get\ni...
1 MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... \ninto a proper perspective with those\nminor ...
2 Thera were in 1771 only aeventy-nine\n*ub*erlb... NaN
3 whenever any prize property shall!*' condemn- ... the ceitihcate of'\noperate to prevent tfie ma...
4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... \nTerms of sale: One-tblrd, togethor with the ...
5 God includes all. and would we not\ngrieve if ... lot of spiritual\nwaifs all about us. children...
6 The said action is brought to obtain a decree ... then to obtain an execution against said Vie\n...
7 party" is a useless exhortation to intel-\nlig... with all tjie hatred that\nsurvives the war; a...
8 has led me to accept, everything I read\nwith ... that the earth has mo-\ntion. Aday ortwo agoIt...
9 The wool circulars alluded to are\nthose which... accuracy, as\nthey were furnished by him as ch...
# Attach the expected word column (positional alignment with the inputs)
# and give the three columns readable names.
train_data= pd.concat([train_data, exp_words], axis=1)
train_data.rename(columns={6: 'First Part', 7: 'Second Part', 0:'Expected word'}, inplace=True)
train_data[:10]
First Part Second Part Expected word
0 came fiom the last place to this\nplace, and t... said\nit's all squash. The best I could get\ni... lie
1 MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... \ninto a proper perspective with those\nminor ... himself
2 Thera were in 1771 only aeventy-nine\n*ub*erlb... NaN of
3 whenever any prize property shall!*' condemn- ... the ceitihcate of'\noperate to prevent tfie ma... ably
4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... \nTerms of sale: One-tblrd, togethor with the ... j
5 God includes all. and would we not\ngrieve if ... lot of spiritual\nwaifs all about us. children... he
6 The said action is brought to obtain a decree ... then to obtain an execution against said Vie\n... graph
7 party" is a useless exhortation to intel-\nlig... with all tjie hatred that\nsurvives the war; a... 011
8 has led me to accept, everything I read\nwith ... that the earth has mo-\ntion. Aday ortwo agoIt... separately.
9 The wool circulars alluded to are\nthose which... accuracy, as\nthey were furnished by him as ch... a
train_data['Concatenated'] = train_data['First Part'] + train_data['Expected word'] + train_data['Second Part']
train_data[:3]
First Part Second Part Expected word Concatenated
0 came fiom the last place to this\nplace, and t... said\nit's all squash. The best I could get\ni... lie came fiom the last place to this\nplace, and t...
1 MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... \ninto a proper perspective with those\nminor ... himself MB. BOOT'S POLITICAL OBEED\nAttempt to imagine...
2 Thera were in 1771 only aeventy-nine\n*ub*erlb... NaN of NaN
import regex as re
# NOTE(review): the result of DataFrame.replace is NOT assigned, so this line
# only displays a newline-stripped copy in the notebook -- train_data itself
# is unchanged (newlines are later handled inside strip()).  If removal was
# intended, assign the result, ideally replacing '\n' with a space so words
# on adjacent lines do not fuse together.
train_data.replace('\n', '', regex=True)
First Part Second Part Expected word Concatenated
0 came fiom the last place to this\nplace, and t... said\nit's all squash. The best I could get\ni... lie came fiom the last place to this\nplace, and t...
1 MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... \ninto a proper perspective with those\nminor ... himself MB. BOOT'S POLITICAL OBEED\nAttempt to imagine...
2 Thera were in 1771 only aeventy-nine\n*ub*erlb... NaN of NaN
3 whenever any prize property shall!*' condemn- ... the ceitihcate of'\noperate to prevent tfie ma... ably whenever any prize property shall!*' condemn- ...
4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... \nTerms of sale: One-tblrd, togethor with the ... j SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T...
... ... ... ... ...
428512 Sam Clendenin bad a fancy for Ui«\nscience of ... \nSam was arrested.\nThe case excited a great ... NaN NaN
428513 Wita.htt halting the party ware dilven to the ... through the alnp the »Uitors laapeeeed tia.»\n... NaN NaN
428514 It was the last thing that either of\nthem exp... Agua Negra across the line.\nIt was a grim pla... NaN NaN
428515 settlement with the department.\nIt is also sh... \na note of Wood, Dialogue fc Co., for\nc27,im... NaN NaN
428516 Flour quotations—low extras at 1 R0®2 50;\ncit... 3214c;do White at 3614c: Mixed Western at\n331... NaN NaN

428517 rows × 4 columns

import nltk
# Fetch the Punkt tokenizer models used by nltk.word_tokenize below.
nltk.download('punkt')
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
True
from collections import Counter, defaultdict
# Nested counter keyed by (left_word, right_word) -> {middle_word: count}.
# NOTE(review): model2 is only referenced by the dead exploratory cell below;
# the real model used for prediction is `model`, built by train().
model2 = defaultdict(lambda: defaultdict(lambda: 0))
# Dead exploratory cell, neutralized by wrapping it in a string literal.
# It is never executed and is kept for reference only; safe to delete.
'''

for _, x in train_data[:1].iterrows():
        words = nltk.word_tokenize(x['Concatenated'])
        print(nltk.trigrams(words, pad_left=True, pad_right=True))
        for word_1, word_2, word_3 in nltk.trigrams(words, pad_left=True, pad_right=True):
          print('word1: ', word_1)
          print('word2: ', word_2)
          print('word3: ', word_3)
          if word_1 and word_2 and word_3:
            model2[(word_1, word_3)][word_2] += 1
            print(model2)

'''
"\n\nfor _, x in train_data[:1].iterrows():\n        words = nltk.word_tokenize(x['Concatenated'])\n        print(nltk.trigrams(words, pad_left=True, pad_right=True))\n        for word_1, word_2, word_3 in nltk.trigrams(words, pad_left=True, pad_right=True):\n          print('word1: ', word_1)\n          print('word2: ', word_2)\n          print('word3: ', word_3)\n          if word_1 and word_2 and word_3:\n            model2[(word_1, word_3)][word_2] += 1\n            print(model2)\n\n"
# Debug loop: model2 is still empty at this point (the cell that filled it
# was stringified above), so this prints nothing.
for i, ws in enumerate(model2):
  print('i ', i)
  print('ws ', ws)
def strip(text):
    """Lower-case and clean one OCR text fragment for tokenization.

    Repairs a handful of known OCR/line-break artifacts, drops possessive
    's, hyphens and punctuation, and collapses digits, brackets and control
    characters into single spaces.  Requires the third-party `regex` module
    (imported as `re` at file level) for the \\p{P} property class.
    """
    txt = str(text).lower().strip()
    # BUG FIX: the original line was txt.replace("", "'").  Replacing the
    # *empty* string in Python inserts an apostrophe between every single
    # character ("abc" -> "'a'b'c'"), mangling all input.  The intended
    # source substring was presumably a curly right single quote that was
    # lost in a copy/paste -- normalize it to a plain apostrophe instead.
    txt = txt.replace("\u2019", "'")
    txt = txt.replace(" this\\\\nplace", "this place")
    txt = txt.replace("'we\\\\nwere", "we were")
    txt = txt.replace("'ever\\\\nwas", "ever was")
    txt = txt.replace("'making\\\\nsuch", "making such")
    txt = txt.replace("'boot\\\\nto", "boot to")
    txt = txt.replace("'elsewhere\\\\nfrom", "elsewhere from")
    # BUG FIX: these two patterns were capitalized in the original, but txt
    # is already lower-cased above, so they could never match.
    txt = txt.replace("united\\\\nstates", "united states")
    txt = txt.replace("unit-\\\\ned", "united")
    txt = txt.replace("neigh-\\\\nbors", "neighbours")
    txt = txt.replace("aver-\\\\nage", "average")
    txt = txt.replace("people\\\\ndown", "people down")
    # Drop possessive 's, hyphens, hyphen+escaped-newline, and punctuation.
    txt = re.compile(r"'s|[\-]|\-\\\\n|\p{P}").sub("", txt)
    # Collapse brackets, special characters, whitespace controls and digits.
    txt = re.compile(r"[{}\[\]\&%^$*#\(\)@\t\n0123456789]+").sub(" ", txt)
    return txt
# (left_word, right_word) -> {middle_word: count}; filled and normalized
# in place by train(), then queried by predict_words().
model = defaultdict(lambda: defaultdict(lambda: 0))
#cleaned = []
def train(data, m):
  """Fit a smoothed "middle word" trigram model in place.

  For every trigram (w1, w2, w3) in the cleaned, tokenized 'Concatenated'
  column, counts w2 under the context key (w1, w3), then converts counts
  to add-k smoothed probabilities with k = 0.25.

  Args:
    data: DataFrame with a 'Concatenated' text column.
    m:    defaultdict(lambda: defaultdict(lambda: 0)) to fill.
  Returns:
    m, mutated in place, mapping (w1, w3) -> {w2: probability}.
  """
  for _, row in data.iterrows():
    words = nltk.word_tokenize(strip(row['Concatenated']))
    for word_1, word_2, word_3 in nltk.trigrams(words, pad_left=True, pad_right=True):
      # Skip padding trigrams at the sequence edges (pad value is None).
      if word_1 and word_2 and word_3:
        m[(word_1, word_3)][word_2] += 1
  for ws in m:
    count = sum(m[ws].values())
    # BUG FIX: the original divided by count + 0.25 + len(word_2), i.e. the
    # length of the word *string*, so scores depended on word length and did
    # not form a distribution.  Add-k smoothing divides by count + k * V,
    # where V is the number of distinct candidate middle words.
    vocab = len(m[ws])
    for word_2 in m[ws]:
      m[ws][word_2] = (m[ws][word_2] + 0.25) / float(count + 0.25 * vocab)
  return m
train(train_data[:100000], model)
def base_prob():
  """Return the fallback prediction line used when the model has no evidence.

  The format is the challenge's expected "word:prob ... :rest" string, with
  a few high-frequency English words and the remaining mass unassigned.
  """
  defaults = ('the:0.02', 'a:0.013', 'to:0.01', 'be:0.01', 'and:0.01', ':0.937')
  return ' '.join(defaults)
def predict_words(before, after):
    """Format the top-5 candidate middle words for a (before, after) context.

    Args:
      before: token immediately preceding the gap.
      after:  token immediately following the gap.
    Returns:
      A "w1:p1 w2:p2 ... :rest" string; base_prob() when the context was
      never observed during training.
    """
    # BUG FIX: the original indexed model[before, after] directly.  On a
    # defaultdict that silently *inserts* an empty entry for every unseen
    # context queried at prediction time, bloating the model.  Use a
    # non-mutating lookup instead.
    candidates = model.get((before, after))
    if not candidates:
        return base_prob()
    prediction = dict(Counter(candidates).most_common(5))
    result = ''
    prob = 0.0
    for key, value in prediction.items():
        prob += value
        result += f'{key}:{value} '
    if prob == 0.0:
        return base_prob()
    # Leave at least 1% of the mass for all unlisted words.
    result += f':{max(1 - prob, 0.01)}'
    return result
from csv import QUOTE_NONE
def predict_file(file):
    """Predict the gap word for every row of <file>/in.tsv.xz.

    Writes one prediction line per input row to <file>/out.tsv, falling
    back to base_prob() when either context is too short (< 3 tokens).
    """
    frame = pd.read_csv(f'{file}/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=QUOTE_NONE)
    with open(f'{file}/out.tsv', 'w', encoding='utf-8') as out:
        for _, row in frame.iterrows():
            left_tokens = nltk.word_tokenize(strip(str(row[6])))
            right_tokens = nltk.word_tokenize(strip(str(row[7])))
            if len(left_tokens) < 3 or len(right_tokens) < 3:
                line = base_prob()
            else:
                line = predict_words(left_tokens[-1], right_tokens[0])
            out.write(line + '\n')

# Generate out.tsv for both evaluation splits.
predict_file('dev-0')

predict_file('test-A')