#!/usr/bin/python3
import re, sys, nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from string import punctuation
from nltk.corpus import wordnet as wn
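
# NOTE: the script assumes the NLTK 'stopwords' and 'wordnet' corpora are already
# installed; if they are not, something like the following should fetch them:
#   python3 -c "import nltk; nltk.download('stopwords'); nltk.download('wordnet')"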


def clear_post(post):
    # Light normalisation of a raw post: drop escaped newlines, @mentions,
    # angle brackets, digits and most punctuation, then collapse whitespace.
    post = post.replace('\\n', ' ')
    post = re.sub(r'[\.\,\/\~]+', ' ', post)
    post = re.sub(r'(<|>|\@[a-zA-Z0-9]+)', '', post)
    post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\–\”\!\=\^]+', '', post)
    post = re.sub(r'( \- |\-\-+)', ' ', post)
    post = re.sub(r' +', ' ', post)
    post = post.rstrip(' ')
    return post
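
# Rough illustration on a made-up post (hypothetical input, not from the data):
#   clear_post("Hello @user, check this... <link> -- really 100%!")
#   -> "Hello check this link really"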


def create_dict(in_file, expected_file):
    posts = {}
    tt = TweetTokenizer(preserve_case=False)
    stoplist = set(stopwords.words('english') + list(punctuation))
    counter = 0
    with open(in_file) as in_f, open(expected_file) as exp_f:
        for line, exp in zip(in_f, exp_f):
            line = line.rstrip('\n').split("\t")[0]
            exp = exp.rstrip("\n")
            #line = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', line)
            #line = line.replace("|", "")
            #line = line.replace(":", "")
            #post = line
            post = clear_post(line)

            # Tokenize with the tweet-aware tokenizer and drop stopwords/punctuation.
            tokenized_line = [token for token in tt.tokenize(post) if token not in stoplist]
            bigrams = nltk.bigrams(tokenized_line)

            # Collect WordNet lemma names for every distinct token.
            lemas = {}
            for word in set(tokenized_line):
                res = wn.synsets(word)
                if not res:
                    continue
                word_lem = []
                for w in res:
                    le_na = w.lemma_names()
                    word_lem.append(" ".join(le_na))
                lemas[word] = " ".join(word_lem)

            # "d" marks an unlabelled row; otherwise map label 0 to -1
            # (Vowpal Wabbit's logistic loss expects labels in {-1, 1}).
            if exp == "d":
                ex_val = ""
            else:
                ex_val = int(exp)
                if ex_val == 0:
                    ex_val = -1
            counter += 1
            big_merged = [i[0] + "_" + i[1] for i in list(bigrams)]
            posts[counter] = [ex_val, " ".join(tokenized_line), " ".join(big_merged), lemas]
            # TODO: bigrams could already be created at this point
    return posts
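
# A single entry of the returned dict looks roughly like this (hypothetical values):
#   posts[1] = [-1, "good movie", "good_movie", {"movie": "movie film picture ..."}]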


def create_file(posts):
    # Print each post as one Vowpal Wabbit input line with three namespaces:
    # unigrams, bigrams and WordNet lemmas.
    for c, v in posts.items():
        exp = v[0]
        post = v[1]
        big = v[2]
        lem = list(v[3].values())
        lem = " ".join(lem)
        #print(exp, "|", post)
        print(exp, "|words", post, "|bigrams", big, "|lemmas", lem)
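
# Example of an emitted line (hypothetical content; an unlabelled "d" row simply
# starts with the bar-separated namespaces and no label):
#   1 |words good movie |bigrams good_movie |lemmas movie film picture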


def main():
    if len(sys.argv) != 3:
        print("Syntax is ./prepare_data.py in_file expected_file")
        return

    in_file = str(sys.argv[1])
    expected_file = str(sys.argv[2])
    posts = create_dict(in_file, expected_file)

    create_file(posts)


main()
# --passes 18 -b 16 --random_seed 123456789 --link logistic --loss_function logistic -k --cache_file vw-meta-cashe
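
# A plausible end-to-end use of this script with Vowpal Wabbit, based on the flags
# above (file and model names are assumptions, not part of this repository):
#
#   ./prepare_data.py train/in.tsv train/expected.tsv > train.vw
#   vw -d train.vw -f model.vw -k --cache_file vw-meta-cashe \
#      --passes 18 -b 16 --random_seed 123456789 \
#      --loss_function logistic --link logistic
#   ./prepare_data.py dev-0/in.tsv dev-0/expected.tsv > dev.vw
#   vw -d dev.vw -i model.vw -t -p dev-predictions.txt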