This commit is contained in:
ZarebaMichal 2022-04-03 19:36:26 +02:00
parent 206774da84
commit 68537ae8d2
3 changed files with 13486 additions and 10379 deletions

File diff suppressed because it is too large. [Load Diff]

46
run.py
View File

@ -13,12 +13,6 @@ DEFAULT_PREDICTION = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
def preprocess_text(text):
# normalize text
text = (
unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode(
'utf-8', 'ignore'))
# replace html chars with ' '
text = re.sub('<.*?>', ' ', text)
# remove punctuation
text = text.translate(str.maketrans(' ', ' ', string.punctuation))
# only alphabets and numerics
@ -56,18 +50,6 @@ def predict_probs(word1, word2):
return str_prediction
def prepare_output(file_path, data=None):
    """Write one next-word prediction line per row of *data* to *file_path*.

    Parameters
    ----------
    file_path : str
        Path of the output TSV file; one prediction string per input row.
    data : pandas.DataFrame, optional
        Rows to predict for; the raw text is read from integer column 7.
        Defaults to the module-level ``test_data`` for backward
        compatibility — the original body always iterated ``test_data``
        even when called with "dev-0/out.tsv", so the dev file received
        test-set predictions. Pass the matching frame explicitly.
    """
    if data is None:
        data = test_data  # preserve the original (buggy) default
    with open(file_path, 'w') as file:
        for index, row in data.iterrows():
            text = preprocess_text(str(row[7]))
            words = word_tokenize(text)
            # The bigram model needs the two leading tokens; otherwise
            # fall back to the global default distribution.
            # NOTE(review): threshold is 4 here but 3 in the later
            # top-level loops — confirm which cutoff is intended.
            if len(words) < 4:
                prediction = DEFAULT_PREDICTION
            else:
                prediction = predict_probs(words[0], words[1])
            file.write(prediction + '\n')
def train_model(training_data):
for index, row in training_data.iterrows():
text = preprocess_text(str(row["final"]))
@ -90,15 +72,16 @@ data = pd.read_csv(
warn_bad_lines=False,
header=None,
quoting=csv.QUOTE_NONE,
nrows=200000,
nrows=100000,
)
# Expected labels for the training rows; keep nrows in sync with the
# nrows used when reading the corresponding training inputs.
train_labels = pd.read_csv(
    "train/expected.tsv",
    sep="\t",
    # NOTE(review): error_bad_lines is deprecated in pandas >= 1.3;
    # on_bad_lines='skip' is the modern spelling — confirm pandas version.
    error_bad_lines=False,
    header=None,
    quoting=csv.QUOTE_NONE,
    # Duplicate nrows keyword removed: the source carried both
    # nrows=200000 and nrows=100000, which is a SyntaxError (repeated
    # keyword argument); the newer value 100000 is kept.
    nrows=100000,
)
# Keep only integer columns 6 and 7 of the raw frame (the file is read
# with header=None, so columns are addressed by position).
train_data = data[[6, 7]]
@ -113,5 +96,24 @@ test_data = pd.read_csv('test-A/in.tsv.xz', sep='\t', error_bad_lines=False, war
# Train the bigram model on the selected columns, then emit predictions.
train_model(train_data)
# NOTE(review): prepare_output iterates the module-level test_data
# regardless of the path it is given, so the dev-0 file written here
# actually contains test-set predictions — confirm against prepare_output.
prepare_output("dev-0/out.tsv")
prepare_output("test-A/out.tsv")
# Write dev-0 predictions: one "word:prob ..." line per dev row.
with open("dev-0/out.tsv", "w") as file:
    for _, row in dev_data.iterrows():
        text = preprocess_text(str(row[7]))
        words = word_tokenize(text)
        if len(words) < 3:
            # Too few tokens for the bigram model — reuse the module-level
            # DEFAULT_PREDICTION constant (identical string) instead of
            # duplicating the literal here.
            prediction = DEFAULT_PREDICTION
        else:
            prediction = predict_probs(words[0], words[1])
        file.write(prediction + "\n")
# Emit predictions for the test-A split, one line per row of test_data.
with open("test-A/out.tsv", "w") as out:
    for _, record in test_data.iterrows():
        tokens = word_tokenize(preprocess_text(str(record[7])))
        # Fall back to the uniform-ish default distribution when the
        # row is too short for the bigram predictor.
        prediction = (
            "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
            if len(tokens) < 3
            else predict_probs(tokens[0], tokens[1])
        )
        out.write(prediction + "\n")

File diff suppressed because it is too large. [Load Diff]