This commit is contained in:
alesad7 2022-04-03 23:42:16 +02:00
parent b6f858fa1e
commit dbeb347d77
14 changed files with 460408 additions and 460368 deletions

8
.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

4
.idea/misc.xml Normal file
View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/challenging-america-word-gap-prediction.iml" filepath="$PROJECT_DIR$/.idea/challenging-america-word-gap-prediction.iml" />
</modules>
</component>
</project>

6
.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1 +1 @@
FileId Year LeftContext RightContext FileId Year LeftContext RightContext

1 FileId Year LeftContext RightContext

View File

@ -1 +1 @@
Word Word

1 Word

View File

@ -1,9 +1,9 @@
Challenging America word-gap prediction Challenging America word-gap prediction
=================================== ===================================
Guess a word in a gap. Guess a word in a gap.
Evaluation metric Evaluation metric
----------------- -----------------
LikelihoodHashed is the metric LikelihoodHashed is the metric

160
run.py
View File

@ -1,80 +1,80 @@
from nltk import trigrams, word_tokenize from nltk import trigrams, word_tokenize
import pandas as pd import pandas as pd
import csv import csv
import regex as re import regex as re
from collections import Counter, defaultdict from collections import Counter, defaultdict
train_set = pd.read_csv( train_set = pd.read_csv(
'train/in.tsv.xz', 'train/in.tsv.xz',
sep='\t', sep='\t',
on_bad_lines='skip', on_bad_lines='skip',
header=None, header=None,
uoting=csv.QUOTE_NONE, quoting=csv.QUOTE_NONE,
nrows=50000) nrows=50000)
train_labels = pd.read_csv( train_labels = pd.read_csv(
'train/expected.tsv', 'train/expected.tsv',
sep='\t', sep='\t',
on_bad_lines='skip', on_bad_lines='skip',
header=None, header=None,
quoting=csv.QUOTE_NONE, quoting=csv.QUOTE_NONE,
nrows=50000) nrows=50000)
def data_preprocessing(text): def data_preprocessing(text):
return re.sub(r'\p{P}', '', text.lower().replace('-\\n', '').replace('\\n', ' ')) return re.sub(r'\p{P}', '', text.lower().replace('-\\n', '').replace('\\n', ' '))
def predict(before, after): def predict(before, after):
prediction = dict(Counter(dict(trigram[before, after])).most_common(5)) prediction = dict(Counter(dict(trigram[before, after])).most_common(5))
result = '' result = ''
prob = 0.0 prob = 0.0
for key, value in prediction.items(): for key, value in prediction.items():
prob += value prob += value
result += f'{key}:{value} ' result += f'{key}:{value} '
if prob == 0.0: if prob == 0.0:
return 'to:0.015 be:0.015 the:0.015 not:0.01 and:0.02 a:0.02 :0.9' return 'to:0.015 be:0.015 the:0.015 not:0.01 and:0.02 a:0.02 :0.9'
result += f':{max(1 - prob, 0.01)}' result += f':{max(1 - prob, 0.01)}'
return result return result
def make_prediction(file): def make_prediction(file):
data = pd.read_csv(f'{file}/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE) data = pd.read_csv(f'{file}/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
with open(f'{file}/out.tsv', 'w', encoding='utf-8') as file_out: with open(f'{file}/out.tsv', 'w', encoding='utf-8') as file_out:
for _, row in data.iterrows(): for _, row in data.iterrows():
before, after = word_tokenize(data_preprocessing(str(row[6]))), word_tokenize(data_preprocessing(str(row[7]))) before, after = word_tokenize(data_preprocessing(str(row[6]))), word_tokenize(data_preprocessing(str(row[7])))
if len(before) < 3 or len(after) < 3: if len(before) < 3 or len(after) < 3:
prediction = 'to:0.015 be:0.015 the:0.015 not:0.01 and:0.02 a:0.02 :0.9' prediction = 'to:0.015 be:0.015 the:0.015 not:0.01 and:0.02 a:0.02 :0.9'
else: else:
prediction = predict(before[-1], after[0]) prediction = predict(before[-1], after[0])
file_out.write(prediction + '\n') file_out.write(prediction + '\n')
train_set = train_set[[6, 7]] train_set = train_set[[6, 7]]
train_set = pd.concat([train_set, train_labels], axis=1) train_set = pd.concat([train_set, train_labels], axis=1)
train_set['line'] = train_set[6] + train_set[0] + train_set[7] train_set['line'] = train_set[6] + train_set[0] + train_set[7]
trigram = defaultdict(lambda: defaultdict(lambda: 0)) trigram = defaultdict(lambda: defaultdict(lambda: 0))
rows = train_set.iterrows() rows = train_set.iterrows()
rows_len = len(train_set) rows_len = len(train_set)
for index, (_, row) in enumerate(rows): for index, (_, row) in enumerate(rows):
text = data_preprocessing(str(row['line'])) text = data_preprocessing(str(row['line']))
words = word_tokenize(text) words = word_tokenize(text)
for word_1, word_2, word_3 in trigrams(words, pad_right=True, pad_left=True): for word_1, word_2, word_3 in trigrams(words, pad_right=True, pad_left=True):
if word_1 and word_2 and word_3: if word_1 and word_2 and word_3:
trigram[(word_1, word_3)][word_2] += 1 trigram[(word_1, word_3)][word_2] += 1
model_len = len(trigram) model_len = len(trigram)
for index, words_1_3 in enumerate(trigram): for index, words_1_3 in enumerate(trigram):
count = sum(trigram[words_1_3].values()) count = sum(trigram[words_1_3].values())
for word_2 in trigram[words_1_3]: for word_2 in trigram[words_1_3]:
trigram[words_1_3][word_2] += 0.25 trigram[words_1_3][word_2] += 0.25
trigram[words_1_3][word_2] /= float(count + 0.25 + len(word_2)) trigram[words_1_3][word_2] /= float(count + 0.25 + len(word_2))
make_prediction('test-A') make_prediction('test-A')
make_prediction('dev-0') make_prediction('dev-0')

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff