paranormal-or-skeptic/prepare_data.py

#!/usr/bin/python3
# Requires the NLTK 'stopwords' and 'wordnet' corpora (install via nltk.download()).
import re, sys, nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from string import punctuation
from nltk.corpus import wordnet as wn

def clear_post(post):
    # Strip escaped newlines, markup remnants, @mentions, punctuation and digits,
    # then collapse the remaining whitespace.
    post = post.replace('\\n', ' ')
    post = re.sub(r'[\.\,\/\~]+', ' ', post)
    post = re.sub(r'(&lt|&gt|\@[a-zA-Z0-9]+)', '', post)
    post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\\\!\=\^]+', '', post)
    post = re.sub(r'( \- |\-\-+)', ' ', post)
    post = re.sub(r' +', ' ', post)
    post = post.rstrip(' ')
    return post
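
# A rough illustration of what clear_post produces (the input text is made up,
# not taken from the corpus):
#   clear_post("Saw a UFO!!! Check www.example.com, @user said it was real...")
#   -> 'Saw a UFO Check www example com said it was real'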

def create_dict(in_file, expected_file):
    posts = {}
    tt = TweetTokenizer(preserve_case=False)
    stoplist = set(stopwords.words('english') + list(punctuation))
    counter = 0
    with open(in_file) as in_f, open(expected_file) as exp_f:
        for line, exp in zip(in_f, exp_f):
            line = line.rstrip('\n').split("\t")[0]
            exp = exp.rstrip("\n")
            # Earlier experiments, kept commented out:
            #line = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', line)
            #line = line.replace("|", "")
            #line = line.replace(":", "")
            #post = line
            post = clear_post(line)
            # Tokenize with the tweet tokenizer and drop stopwords and punctuation.
            tokenized_line = [token for token in tt.tokenize(post) if token not in stoplist]
            bigrams = nltk.bigrams(tokenized_line)
            # Collect WordNet lemma names for every distinct token that has synsets.
            lemas = {}
            for word in set(tokenized_line):
                res = wn.synsets(word)
                if not res:
                    continue
                word_lem = []
                for w in res:
                    le_na = w.lemma_names()
                    word_lem.append(" ".join(le_na))
                lemas[word] = " ".join(word_lem)
            # "d" marks an unlabelled post; otherwise map label 0 to -1 so the
            # labels fit Vowpal Wabbit's logistic loss convention.
            if exp == "d":
                ex_val = ""
            else:
                ex_val = int(exp)
                if ex_val == 0:
                    ex_val = -1
            counter += 1
            big_merged = [i[0] + "_" + i[1] for i in list(bigrams)]
            posts[counter] = [ex_val, " ".join(tokenized_line), " ".join(big_merged), lemas]
    # TODO: bigrams could already be created here.
    return posts
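
# A sketch of one entry in the returned dict (all values are illustrative):
#   posts[1] == [-1, "saw ghost attic", "saw_ghost ghost_attic",
#                {"saw": "...", "ghost": "...", "attic": "..."}]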

def create_file(posts):
    # Print each post as one Vowpal Wabbit input line with three namespaces:
    # |words, |bigrams and |lemmas, preceded by the label (empty for unlabelled posts).
    for c, v in posts.items():
        exp = v[0]
        post = v[1]
        big = v[2]
        lem = list(v[3].values())
        lem = " ".join(lem)
        #print(exp, "|", post)
        print(exp, "|words", post, "|bigrams", big, "|lemmas", lem)
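
# A hedged sketch of one emitted line (tokens are made up, not from the data):
#   1 |words saw ghost attic |bigrams saw_ghost ghost_attic |lemmas ghost shade spook ...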

def main():
    if len(sys.argv) != 3:
        print("Syntax is ./prepare_data.py in_file expected_file")
        return
    in_file = str(sys.argv[1])
    expected_file = str(sys.argv[2])
    posts = create_dict(in_file, expected_file)
    create_file(posts)


if __name__ == "__main__":
    main()

# Vowpal Wabbit options noted for the generated data:
# --passes 18 -b 16 --random_seed 123456789 --link logistic --loss_function logistic -k cashe_file vw-meta-cashe
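
# Example usage (a sketch; the in.tsv/expected.tsv paths and the exact training
# command are assumptions based on the options noted above, not part of this repo;
# the standard VW spelling of the cache option is --cache_file):
#   ./prepare_data.py train/in.tsv train/expected.tsv > train.vw
#   vw train.vw --passes 18 -b 16 --random_seed 123456789 \
#      --link logistic --loss_function logistic -k --cache_file vw-meta-cache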