First vowpal wabbit

This commit is contained in:
s426135 2020-04-27 11:40:28 +02:00
parent 6f5772b7fd
commit 23905680f2
3 changed files with 10463 additions and 10398 deletions

File diff suppressed because it is too large Load Diff

65
prepare_data.py Executable file
View File

@ -0,0 +1,65 @@
#!/usr/bin/python3
import re, sys, nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from string import punctuation
def clear_post(post):
    """Normalize a raw post: drop escape sequences, punctuation and runs of spaces."""
    # Literal "\n" escape sequences (two characters) become plain spaces.
    cleaned = post.replace('\\n', ' ')
    # Apply each substitution in order; order matters because later patterns
    # operate on the output of earlier ones.
    substitutions = (
        (r'[\.\,\/\~]+', ' '),
        (r'(&lt|&gt|\@[a-zA-Z0-9]+)', ''),
        (r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\\\!\=\^]+', ''),
        (r'( \- |\-\-+)', ' '),
        (r' +', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    # Only trailing spaces are stripped; a leading space may remain.
    return cleaned.rstrip(' ')
def create_dict(in_file, expected_file):
    """Read raw posts and their expected labels and return a dict of
    {counter: [label, unigram_string, bigram_string]} ready to be printed
    in Vowpal Wabbit input format.

    in_file: path to tab-separated posts (only the first column is used).
    expected_file: path with one label per line ("d" or an integer).
    """
    posts = {}
    # Lower-casing tokenizer tuned for social-media text.
    tt = TweetTokenizer(preserve_case = False)
    # English stop words plus all ASCII punctuation are dropped from tokens.
    stoplist = set(stopwords.words('english') + list(punctuation))
    counter = 0
    with open(in_file) as in_f, open(expected_file) as exp_f:
        for line, exp in zip(in_f, exp_f):
            line = line.rstrip('\n').split("\t")[0]
            exp = exp.rstrip("\n")
            # Collapse any URL (optionally parenthesized) into one placeholder token.
            line = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', line)
            # "|" and ":" are special characters in the VW input format.
            line = line.replace("|", "")
            line = line.replace(":", "")
            post = line
            #post = clear_post(line)
            tokenized_line = [token for token in tt.tokenize(post) if token not in stoplist]
            bigrams = nltk.bigrams(tokenized_line)
            if exp == "d":
                # "d" marks an unlabeled example: VW accepts an empty label.
                ex_val = ""
            else:
                ex_val = int(exp)
            # VW logistic loss expects labels in {-1, 1}, so remap 0 -> -1.
            if ex_val == 0:
                ex_val = -1
            # NOTE(review): indentation was reconstructed from a diff view that
            # stripped leading whitespace — counter is assumed to increment once
            # per input line so every post gets a unique key; confirm against
            # the original file.
            counter+=1
            # Join token pairs with "_" so each bigram is a single VW feature.
            big_merged = [i[0] +"_" + i[1] for i in list(bigrams)]
            posts[counter] = [ex_val, " ".join(tokenized_line), " ".join(big_merged)]
            # TODO
            # Bigrams could already be created here
    return posts
def create_file(posts):
    """Print every post to stdout in Vowpal Wabbit format: "<label> | <tokens>"."""
    for _, (label, words, bigram_words) in posts.items():
        print(label, "|", words)
        # Alternative format with separate namespaces:
        #print(label, "|words", words, "|bigrams", bigram_words)
def main():
    """CLI entry point: read posts and labels, print them in VW format.

    Usage: ./prepare_data.py in_file expected_file
    """
    if len(sys.argv) != 3:
        # Fixed typo in the user-facing usage message ("Synthax" -> "Syntax").
        print("Syntax is ./prepare_data.py in_file expected_file")
        return
    # argv entries are already strings; the str() conversions were redundant.
    in_file = sys.argv[1]
    expected_file = sys.argv[2]
    posts = create_dict(in_file, expected_file)
    create_file(posts)
# Guard the entry point so importing this module (e.g. for testing or reuse
# of clear_post/create_dict) does not immediately run the CLI.
if __name__ == "__main__":
    main()
# --passes 18 -b 16 --random_seed 123456789 --link logistic --loss_function logistic -k cashe_file vw-meta-cashe

File diff suppressed because it is too large Load Diff