paranormal-or-skeptic/prepare_data.py
2020-04-27 11:53:58 +02:00

66 lines
2.3 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/python3
import re, sys, nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from string import punctuation
def clear_post(post):
    """Strip punctuation, digits, mentions and surplus spaces from a post.

    The substitutions are applied in a fixed order; each one operates on
    the output of the previous one.

    Args:
        post: Raw post text.

    Returns:
        The cleaned text. Trailing spaces are removed; a leading space,
        if one remains after the substitutions, is kept.
    """
    substitutions = (
        # Runs of '.', ',', '/' and '~' turn into a single space.
        (r'[\.\,\/\~]+', ' '),
        # Drop '&lt' / '&gt' entity prefixes and @mentions.
        (r'(&lt|&gt|\@[a-zA-Z0-9]+)', ''),
        # Delete quotes, brackets, digits and assorted symbols outright.
        (r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\\\!\=\^]+', ''),
        # A spaced hyphen, or a run of two or more hyphens, becomes a space.
        (r'( \- |\-\-+)', ' '),
        # Collapse repeated spaces left behind by the steps above.
        (r' +', ' '),
    )
    # This targets the two-character sequence backslash + 'n', not a newline.
    cleaned = post.replace('\\n', ' ')
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.rstrip(' ')
def create_dict(in_file, expected_file):
    """Read posts and their labels and build tokenized records.

    The two files are read in lockstep, one record per line; reading
    stops at the end of the shorter file. For every post, URLs are
    replaced by the placeholder token ``internetlink``, ``|`` and ``:``
    are removed, and the text is tokenized (lower-cased) with English
    stop words and punctuation tokens filtered out.

    Args:
        in_file: Path to a tab-separated text file; only the first column
            of each line is used as the post text.
        expected_file: Path to a text file with one label per line:
            either an integer literal or ``"d"`` for "no label".

    Returns:
        A dict mapping the 1-based line number to a three-item list
        ``[label, words, bigrams]`` where ``label`` is an int (0 is
        remapped to -1) or ``""`` when the label was ``"d"``, ``words``
        is the space-joined tokens, and ``bigrams`` is the space-joined
        adjacent token pairs, each written as ``first_second``.

    Raises:
        ValueError: If a label is neither ``"d"`` nor an integer literal.
    """
    # Compiled once; the pattern is loop-invariant.
    url_pattern = re.compile(
        r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)'
    )
    # '|' delimits the sections of the records printed by create_file;
    # ':' is presumably reserved by the downstream consumer as well --
    # TODO confirm against the tool that reads the output.
    reserved_chars = str.maketrans("", "", "|:")
    tokenizer = TweetTokenizer(preserve_case=False)
    stoplist = set(stopwords.words('english') + list(punctuation))

    posts = {}
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(in_file, encoding="utf-8") as in_f, \
            open(expected_file, encoding="utf-8") as exp_f:
        for counter, (line, exp) in enumerate(zip(in_f, exp_f), start=1):
            text = line.rstrip('\n').split("\t")[0]
            label_text = exp.rstrip("\n")

            text = url_pattern.sub(' internetlink ', text)
            text = text.translate(reserved_chars)

            tokens = [
                token for token in tokenizer.tokenize(text)
                if token not in stoplist
            ]
            bigrams = ["_".join(pair) for pair in nltk.bigrams(tokens)]

            if label_text == "d":
                label = ""
            else:
                label = int(label_text)
                if label == 0:
                    # Labels are emitted as -1 / non-zero, never 0.
                    label = -1

            posts[counter] = [label, " ".join(tokens), " ".join(bigrams)]
    return posts
def create_file(posts):
    """Print every record to standard output, one line per post.

    Each line has the form ``<label> |words <tokens> |bigrams <pairs>``,
    with the parts separated by single spaces.

    Args:
        posts: Mapping whose values are indexable records holding the
            label, the joined tokens and the joined bigrams at positions
            0, 1 and 2 (the shape built by create_dict).
    """
    for record in posts.values():
        label, words, bigrams = record[0], record[1], record[2]
        print(label, "|words", words, "|bigrams", bigrams)
def main():
    """Command-line entry point: convert two input files to stdout records.

    Expects exactly two command-line arguments: the path of the posts
    file and the path of the expected-labels file.

    Returns:
        0 after the records have been printed, or 1 when the number of
        command-line arguments is wrong (a usage message is written to
        standard error in that case).
    """
    if len(sys.argv) != 3:
        # Usage goes to stderr so it never mixes with records on stdout.
        print("Syntax is ./prepare_data.py in_file expected_file",
              file=sys.stderr)
        return 1
    in_file, expected_file = sys.argv[1:]
    create_file(create_dict(in_file, expected_file))
    return 0
if __name__ == "__main__":
    # Run only when executed as a script, so importing this module has no
    # side effects. main()'s return value becomes the process exit status
    # (None is treated as success by sys.exit).
    sys.exit(main())
# --passes 18 -b 16 --random_seed 123456789 --link logistic --loss_function logistic -k cashe_file vw-meta-cashe