#!/usr/bin/python3
# Prepares labelled text data as Vowpal Wabbit input lines with three
# namespaces: unigrams (|words), bigrams (|bigrams) and WordNet lemmas (|lemmas).

import re
import sys

import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from string import punctuation


def clear_post(post):
    # Strip noise that would confuse the tokenizer or clash with the
    # Vowpal Wabbit format (":" and "|" are VW metacharacters).
    post = post.replace('\\n', ' ')  # literal "\n" sequences in the raw data
    post = re.sub(r'[\.\,\/\~]+', ' ', post)
    post = re.sub(r'(<|>|\@[a-zA-Z0-9]+)', '', post)  # drop @mentions and angle brackets
    post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\–\”\!\=\^]+', '', post)
    post = re.sub(r'( \- |\-\-+)', ' ', post)
    post = re.sub(r' +', ' ', post)  # collapse runs of spaces
    return post.rstrip(' ')


def create_dict(in_file, expected_file):
    posts = {}
    tt = TweetTokenizer(preserve_case=False)
    stoplist = set(stopwords.words('english') + list(punctuation))
    counter = 0
    with open(in_file) as in_f, open(expected_file) as exp_f:
        for line, exp in zip(in_f, exp_f):
            line = line.rstrip('\n').split("\t")[0]
            exp = exp.rstrip("\n")
            # Alternative preprocessing kept for reference:
            # line = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', line)
            post = clear_post(line)
            tokenized_line = [token for token in tt.tokenize(post) if token not in stoplist]
            bigrams = nltk.bigrams(tokenized_line)
            # For every token WordNet knows, collect the lemma names of all its synsets.
            lemas = {}
            for word in set(tokenized_line):
                res = wn.synsets(word)
                if not res:
                    continue
                word_lem = []
                for w in res:
                    word_lem.append(" ".join(w.lemma_names()))
                lemas[word] = " ".join(word_lem)
            if exp == "d":
                ex_val = ""  # unlabelled example: leave the VW label empty
            else:
                # VW's logistic loss expects labels in {-1, 1}, so map 0 to -1.
                ex_val = int(exp)
                if ex_val == 0:
                    ex_val = -1
            counter += 1
            big_merged = [i[0] + "_" + i[1] for i in bigrams]
            posts[counter] = [ex_val, " ".join(tokenized_line), " ".join(big_merged), lemas]
            # TODO: the bigrams could already be built here
    return posts


def create_file(posts):
    # Emit one VW-format line per post.
    for c, v in posts.items():
        exp, post, big = v[0], v[1], v[2]
        lem = " ".join(v[3].values())
        print(exp, "|words", post, "|bigrams", big, "|lemmas", lem)


def main():
    if len(sys.argv) != 3:
        print("Syntax is ./prepare_data.py in_file expected_file")
        return
    in_file = sys.argv[1]
    expected_file = sys.argv[2]
    posts = create_dict(in_file, expected_file)
    create_file(posts)


if __name__ == "__main__":
    main()

# vw options used for training:
# --passes 18 -b 16 --random_seed 123456789 --link logistic --loss_function logistic -k --cache_file vw-meta-cache
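
# A minimal end-to-end sketch of how this script feeds Vowpal Wabbit; the
# .tsv/.vw file names and the -f/-i/-t/-p flags below are illustrative
# assumptions, not recorded in this script:
#
#   ./prepare_data.py train/in.tsv train/expected.tsv > train.vw
#   vw train.vw --passes 18 -b 16 --random_seed 123456789 \
#       --link logistic --loss_function logistic -k --cache_file vw-meta-cache -f model.vw
#   ./prepare_data.py test/in.tsv test/expected.tsv > test.vw
#   vw -t -i model.vw test.vw -p predictions.txt
#
# Each generated line has the shape:
#   <label> |words tok1 tok2 ... |bigrams tok1_tok2 ... |lemmas lemma names ...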