This commit is contained in:
ZarebaMichal 2022-04-03 19:43:11 +02:00
parent 68537ae8d2
commit 290a1f802c
3 changed files with 16190 additions and 16197 deletions

File diff suppressed because it is too large Load Diff

17
run.py
View File

@@ -13,16 +13,9 @@ DEFAULT_PREDICTION = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
def preprocess_text(text): def preprocess_text(text):
# remove punctuation text = text.lower().replace("-\\n", "").replace("\\n", " ")
text = text.translate(str.maketrans(' ', ' ', string.punctuation)) text = re.sub(r"\p{P}", "", text)
# only alphabets and numerics
text = re.sub('[^a-zA-Z]', ' ', text)
# replace newline with space
text = re.sub("\n", " ", text)
# lower case
text = text.lower()
# split and join the words
text = ' '.join(text.split())
return text return text
@@ -102,7 +95,7 @@ with open("dev-0/out.tsv", "w") as file:
text = preprocess_text(str(row[7])) text = preprocess_text(str(row[7]))
words = word_tokenize(text) words = word_tokenize(text)
if len(words) < 3: if len(words) < 3:
prediction = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1" prediction = DEFAULT_PREDICTION
else: else:
prediction = predict_probs(words[0], words[1]) prediction = predict_probs(words[0], words[1])
file.write(prediction + "\n") file.write(prediction + "\n")
@@ -112,7 +105,7 @@ with open("test-A/out.tsv", "w") as file:
text = preprocess_text(str(row[7])) text = preprocess_text(str(row[7]))
words = word_tokenize(text) words = word_tokenize(text)
if len(words) < 3: if len(words) < 3:
prediction = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1" prediction = DEFAULT_PREDICTION
else: else:
prediction = predict_probs(words[0], words[1]) prediction = predict_probs(words[0], words[1])
file.write(prediction + "\n") file.write(prediction + "\n")

File diff suppressed because it is too large Load Diff