60 KiB
60 KiB
# Mount Google Drive into the Colab runtime so the dataset stored there
# (challenging-america-word-gap-prediction/) is reachable as a local path.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
cd drive/MyDrive
/content/drive/MyDrive
cd challenging-america-word-gap-prediction/
/content/drive/MyDrive/challenging-america-word-gap-prediction
import pandas as pd
# Training inputs: compressed TSV without a header row; columns 6 and 7 hold
# the text before/after the gap (see the preview below).
data = pd.read_csv("train/in.tsv.xz", sep="\t", on_bad_lines='skip', header=None, encoding="utf-8")
# Expected middle word for each row, one per line.
# NOTE(review): on_bad_lines='skip' can drop rows from either file
# independently, which would misalign rows when the two frames are
# concatenated later — verify len(data) == len(exp_words).
exp_words = pd.read_csv("train/expected.tsv", sep="\t", on_bad_lines='skip', header=None, encoding="utf-8")
# Notebook display of the first 10 rows (no effect when run as a script).
data[:10]
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | |
---|---|---|---|---|---|---|---|---|
0 | 4e04702da929c78c52baf09c1851d3ff | ST | ChronAm | 1919.604110 | 30.475470 | -90.100911 | came fiom the last place to this\nplace, and t... | said\nit's all squash. The best I could get\ni... |
1 | b374dadd940510271d9675d3e8caf9d8 | DAILY ARIZONA SILVER BELT | ChronAm | 1909.097260 | 33.399478 | -110.870950 | MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... | \ninto a proper perspective with those\nminor ... |
2 | adb666c426bdc10fd949cb824da6c0d0 | THE SAVANNAH MORNING NEWS | ChronAm | 1900.913699 | 32.080926 | -81.091177 | Thera were in 1771 only aeventy-nine\n*ub*erlb... | NaN |
3 | bc2c9aa0b77d724311e3c2e12fc61c92 | CHARLES CITY INTELLIGENCER | ChronAm | 1864.974044 | 43.066361 | -92.672411 | whenever any prize property shall!*' condemn- ... | the ceitihcate of'\noperate to prevent tfie ma... |
4 | 0f612b991a39c712f0d745835b8b2f0d | EVENING STAR | ChronAm | 1878.478082 | 38.894955 | -77.036646 | SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... | \nTerms of sale: One-tblrd, togethor with the ... |
5 | 4c13fb3d2e6eef35fa28e7bae7868d60 | EDGEFIELD ADVERTISER | ChronAm | 1913.346575 | 33.789577 | -81.929558 | God includes all. and would we not\ngrieve if ... | lot of spiritual\nwaifs all about us. children... |
6 | a452eadfc3f4a475147728c5f4005429 | DAILY LOS ANGELES HERALD | ChronAm | 1883.801370 | 34.054935 | -118.244476 | The said action is brought to obtain a decree ... | then to obtain an execution against said Vie\n... |
7 | b970ee32372d81f1fd59ab6196e797c9 | THE FINDLAY JEFFERSONIAN | ChronAm | 1874.828767 | 41.041387 | -83.650398 | party" is a useless exhortation to intel-\nlig... | with all tjie hatred that\nsurvives the war; a... |
8 | d130f899a50db2792c546cc978dc930c | BUTLER CITIZEN | ChronAm | 1883.793151 | 40.861021 | -79.895225 | has led me to accept, everything I read\nwith ... | that the earth has mo-\ntion. Aday ortwo agoIt... |
9 | 80e56928e09b93529d206708ac905b63 | FERGUS COUNTY ARGUS | ChronAm | 1892.821038 | 47.062473 | -109.428238 | The wool circulars alluded to are\nthose which... | accuracy, as\nthey were furnished by him as ch... |
# Keep only the two text columns: 6 = text before the gap, 7 = text after.
# FIX: .copy() detaches the subset from `data`, so later in-place edits
# (rename, adding the 'Concatenated' column) operate on an independent frame
# instead of a view — avoids pandas' chained-assignment (SettingWithCopy)
# warning and any silent lost updates.
train_data = data[[6, 7]].copy()
# Notebook preview of the subset.
train_data[:10]
6 | 7 | |
---|---|---|
0 | came fiom the last place to this\nplace, and t... | said\nit's all squash. The best I could get\ni... |
1 | MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... | \ninto a proper perspective with those\nminor ... |
2 | Thera were in 1771 only aeventy-nine\n*ub*erlb... | NaN |
3 | whenever any prize property shall!*' condemn- ... | the ceitihcate of'\noperate to prevent tfie ma... |
4 | SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... | \nTerms of sale: One-tblrd, togethor with the ... |
5 | God includes all. and would we not\ngrieve if ... | lot of spiritual\nwaifs all about us. children... |
6 | The said action is brought to obtain a decree ... | then to obtain an execution against said Vie\n... |
7 | party" is a useless exhortation to intel-\nlig... | with all tjie hatred that\nsurvives the war; a... |
8 | has led me to accept, everything I read\nwith ... | that the earth has mo-\ntion. Aday ortwo agoIt... |
9 | The wool circulars alluded to are\nthose which... | accuracy, as\nthey were furnished by him as ch... |
# Attach the expected middle word to each (prefix, suffix) pair, then give
# the columns readable names. exp_words has a single column labelled 0, so
# after the concat the columns are [6, 7, 0].
merged = pd.concat([train_data, exp_words], axis=1)
train_data = merged.rename(columns={6: 'First Part', 7: 'Second Part', 0: 'Expected word'})
# Notebook preview of the merged frame.
train_data[:10]
First Part | Second Part | Expected word | |
---|---|---|---|
0 | came fiom the last place to this\nplace, and t... | said\nit's all squash. The best I could get\ni... | lie |
1 | MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... | \ninto a proper perspective with those\nminor ... | himself |
2 | Thera were in 1771 only aeventy-nine\n*ub*erlb... | NaN | of |
3 | whenever any prize property shall!*' condemn- ... | the ceitihcate of'\noperate to prevent tfie ma... | ably |
4 | SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... | \nTerms of sale: One-tblrd, togethor with the ... | j |
5 | God includes all. and would we not\ngrieve if ... | lot of spiritual\nwaifs all about us. children... | he |
6 | The said action is brought to obtain a decree ... | then to obtain an execution against said Vie\n... | graph |
7 | party" is a useless exhortation to intel-\nlig... | with all tjie hatred that\nsurvives the war; a... | 011 |
8 | has led me to accept, everything I read\nwith ... | that the earth has mo-\ntion. Aday ortwo agoIt... | separately. |
9 | The wool circulars alluded to are\nthose which... | accuracy, as\nthey were furnished by him as ch... | a |
# Rebuild the full passage by splicing the expected word between the two text
# halves. String addition propagates NaN: any row with a missing part yields
# NaN in 'Concatenated' (see row 2 in the preview below).
# NOTE(review): no separating spaces are inserted — this assumes each half
# already carries its boundary whitespace; if not, the gap word fuses with
# its neighbours during tokenisation. Confirm against the raw data.
train_data['Concatenated'] = train_data['First Part'] + train_data['Expected word'] + train_data['Second Part']
train_data[:3]
First Part | Second Part | Expected word | Concatenated | |
---|---|---|---|---|
0 | came fiom the last place to this\nplace, and t... | said\nit's all squash. The best I could get\ni... | lie | came fiom the last place to this\nplace, and t... |
1 | MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... | \ninto a proper perspective with those\nminor ... | himself | MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... |
2 | Thera were in 1771 only aeventy-nine\n*ub*erlb... | NaN | of | NaN |
import regex as re
# Remove embedded newlines from every text column.
# BUG FIX: DataFrame.replace returns a NEW frame — the original call discarded
# the result, so the cleanup silently never took effect. Assign it back.
train_data = train_data.replace('\n', '', regex=True)
First Part | Second Part | Expected word | Concatenated | |
---|---|---|---|---|
0 | came fiom the last place to this\nplace, and t... | said\nit's all squash. The best I could get\ni... | lie | came fiom the last place to this\nplace, and t... |
1 | MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... | \ninto a proper perspective with those\nminor ... | himself | MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... |
2 | Thera were in 1771 only aeventy-nine\n*ub*erlb... | NaN | of | NaN |
3 | whenever any prize property shall!*' condemn- ... | the ceitihcate of'\noperate to prevent tfie ma... | ably | whenever any prize property shall!*' condemn- ... |
4 | SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... | \nTerms of sale: One-tblrd, togethor with the ... | j | SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... |
... | ... | ... | ... | ... |
428512 | Sam Clendenin bad a fancy for Ui«\nscience of ... | \nSam was arrested.\nThe case excited a great ... | NaN | NaN |
428513 | Wita.htt halting the party ware dilven to the ... | through the alnp the »Uitors laapeeeed tia.»\n... | NaN | NaN |
428514 | It was the last thing that either of\nthem exp... | Agua Negra across the line.\nIt was a grim pla... | NaN | NaN |
428515 | settlement with the department.\nIt is also sh... | \na note of Wood, Dialogue fc Co., for\nc27,im... | NaN | NaN |
428516 | Flour quotations—low extras at 1 R0®2 50;\ncit... | 3214c;do White at 3614c: Mixed Western at\n331... | NaN | NaN |
428517 rows × 4 columns
import nltk
# Fetch the Punkt tokenizer models required by nltk.word_tokenize below.
nltk.download('punkt')
[nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Unzipping tokenizers/punkt.zip.
True
from collections import Counter, defaultdict
# Scratch nested counter: model2[(word_before, word_after)][middle_word] -> count.
# Never populated — the draft loop below is disabled (kept as a string literal).
model2 = defaultdict(lambda: defaultdict(lambda: 0))
'''
for _, x in train_data[:1].iterrows():
words = nltk.word_tokenize(x['Concatenated'])
print(nltk.trigrams(words, pad_left=True, pad_right=True))
for word_1, word_2, word_3 in nltk.trigrams(words, pad_left=True, pad_right=True):
print('word1: ', word_1)
print('word2: ', word_2)
print('word3: ', word_3)
if word_1 and word_2 and word_3:
model2[(word_1, word_3)][word_2] += 1
print(model2)
'''
"\n\nfor _, x in train_data[:1].iterrows():\n words = nltk.word_tokenize(x['Concatenated'])\n print(nltk.trigrams(words, pad_left=True, pad_right=True))\n for word_1, word_2, word_3 in nltk.trigrams(words, pad_left=True, pad_right=True):\n print('word1: ', word_1)\n print('word2: ', word_2)\n print('word3: ', word_3)\n if word_1 and word_2 and word_3:\n model2[(word_1, word_3)][word_2] += 1\n print(model2)\n\n"
# Sanity-check loop: model2 is still empty at this point, so nothing prints.
for i, ws in enumerate(model2):
    print('i ', i)
    print('ws ', ws)
def strip(text):
    """Normalise a raw passage for tokenisation.

    Lower-cases the input, repairs a handful of known OCR/line-wrap
    artefacts, removes possessive 's / hyphens / Unicode punctuation, and
    collapses brackets, special characters and digits to single spaces.

    Args:
        text: any value; coerced with str() so NaN (float) inputs survive.

    Returns:
        The cleaned, lower-cased string.
    """
    txt = str(text).lower().strip()
    txt = txt.replace("’", "'")
    # Known OCR / line-wrap artefacts. The text is already lower-cased above,
    # so the targets must be lower-case too.
    # BUG FIX: the original capitalised "United\\nStates" / "Unit-\\ned"
    # patterns could never match lower-cased text (dead code); lower-cased
    # here so the repairs actually apply.
    txt = txt.replace(" this\\\\nplace", "this place")
    txt = txt.replace("'we\\\\nwere", "we were")
    txt = txt.replace("'ever\\\\nwas", "ever was")
    txt = txt.replace("'making\\\\nsuch", "making such")
    txt = txt.replace("'boot\\\\nto", "boot to")
    txt = txt.replace("'elsewhere\\\\nfrom", "elsewhere from")
    txt = txt.replace("united\\\\nstates", "united states")
    txt = txt.replace("unit-\\\\ned", "united")
    txt = txt.replace("neigh-\\\\nbors", "neighbours")
    txt = txt.replace("aver-\\\\nage", "average")
    txt = txt.replace("people\\\\ndown", "people down")
    # \p{P} (Unicode punctuation) requires the third-party `regex` module,
    # which this file imports as `re`.
    txt = re.compile(r"'s|[\-]|\-\\\\n|\p{P}").sub("", txt)
    txt = re.compile(r"[{}\[\]\&%^$*#\(\)@\t\n0123456789]+").sub(" ", txt)
    return txt
# Trigram gap model: model[(word_before, word_after)][middle_word] holds raw
# counts until train() normalises them into probabilities.
model = defaultdict(lambda: defaultdict(lambda: 0))
#cleaned = []
def train(data, m):
    """Count (before, after) -> middle-word trigrams and smooth to probabilities.

    Tokenises each row's 'Concatenated' passage, counts every padded trigram
    (w1, w2, w3) as "middle word w2 given context (w1, w3)", then converts
    counts to add-alpha (Lidstone) smoothed probabilities.

    Args:
        data: DataFrame with a 'Concatenated' text column.
        m: nested defaultdict mapping (word_before, word_after) -> {middle: count}.

    Returns:
        m, with counts replaced in place by probabilities.
    """
    alpha = 0.25
    for _, row in data.iterrows():
        words = nltk.word_tokenize(strip(row['Concatenated']))
        for w1, w2, w3 in nltk.trigrams(words, pad_left=True, pad_right=True):
            # Padding yields None at the edges; skip incomplete triples.
            if w1 and w2 and w3:
                m[(w1, w3)][w2] += 1
    # Add-alpha smoothing: P(w2 | w1, w3) = (count + alpha) / (total + alpha * V)
    # where V is the number of distinct middle words seen for this context.
    # BUG FIX: the original divided by total + 0.25 + len(word_2) — the
    # *character length* of the word — instead of alpha * V.
    for context in m:
        total = sum(m[context].values())
        denom = float(total + alpha * len(m[context]))
        for w2 in m[context]:
            m[context][w2] = (m[context][w2] + alpha) / denom
    return m
# Fit the model on the first 100k rows only (the full set is ~428k rows;
# presumably truncated to fit Colab time/memory limits — TODO confirm).
train(train_data[:100000], model)
def base_prob():
    """Fallback distribution for unseen contexts.

    A few high-frequency English words with small probabilities, plus the
    remaining mass assigned to the empty ("anything else") token, in the
    challenge's 'word:prob' output format.
    """
    defaults = [
        ('the', '0.02'),
        ('a', '0.013'),
        ('to', '0.01'),
        ('be', '0.01'),
        ('and', '0.01'),
        ('', '0.937'),
    ]
    return ' '.join(f'{word}:{p}' for word, p in defaults)
def predict_words(before, after):
    """Predict the gap word for the context (before, after).

    Returns up to the five most probable middle words as 'word:prob ' pairs
    followed by ':remaining_mass'; falls back to base_prob() when the model
    has no mass for this context.
    """
    top = Counter(dict(model[before, after])).most_common(5)
    mass = 0.0
    pieces = ''
    for word, p in top:
        mass += p
        pieces += f'{word}:{p} '
    if mass == 0.0:
        # Unseen context (or all-zero counts): use the fixed fallback.
        return base_prob()
    return pieces + f':{max(1 - mass, 0.01)}'
from csv import QUOTE_NONE
def predict_file(file):
    """Write one gap prediction per row of <file>/in.tsv.xz to <file>/out.tsv.

    Columns 6 and 7 of the input hold the text before/after the gap; the
    prediction uses the last word of the left context and the first word of
    the right context. Rows with fewer than three tokens on either side get
    the fixed fallback distribution.
    """
    rows = pd.read_csv(f'{file}/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=QUOTE_NONE)
    with open(f'{file}/out.tsv', 'w', encoding='utf-8') as file_out:
        for _, row in rows.iterrows():
            left = nltk.word_tokenize(strip(str(row[6])))
            right = nltk.word_tokenize(strip(str(row[7])))
            if len(left) < 3 or len(right) < 3:
                file_out.write(base_prob() + '\n')
            else:
                file_out.write(predict_words(left[-1], right[0]) + '\n')
# Generate out.tsv predictions for both evaluation splits.
predict_file('dev-0')
predict_file('test-A')