#vw --binary vowpal.txt --passes 20 -c -k -f model.model [-b 24 --ngram 2 | --loss_function hinge -b 24 --nn 10 -inpass]
#vw --binary -t -i model.model -p pred.pred test.txt
"""Turn folders of positive/negative text files into one shuffled example file.

Every ``*.txt`` file found (recursively) under the configured folders becomes
one output line of the form ``+1 | lemma lemma ...`` or ``-1 | lemma ...``.
Files that look German are skipped.  The result is written to ``outFilename``.

NOTE(review): the ``label | features`` layout is presumably meant for the
``vw`` command lines quoted above -- the file names there differ, so confirm.
"""
import glob
import random
import re
import sys
from typing import Any, Callable, Iterable, List, Tuple

# Matches every character that is neither a word character nor whitespace.
sanitizator = re.compile(r'[^\w\s]')

foldersPositive = ["dev/pos"]
foldersNegative = ["dev/neg"]
outFilename = "dev.txt"

_SPACY_MODEL = "en_core_web_sm"

# Space-delimited German function words used as a cheap language heuristic.
_GERMAN_MARKERS = (" ist ", " sind ", " zur ", " und ")


def is_german(text: str) -> bool:
    """Return True if *text* contains any space-delimited German marker word."""
    return any(marker in text for marker in _GERMAN_MARKERS)


def sanitize(text: str) -> str:
    """Strip punctuation, turn newlines into spaces and lowercase *text*."""
    stripped = sanitizator.sub("", text)
    return stripped.replace("\n", " ").lower()


def label_folders(
    positive: Iterable[str], negative: Iterable[str]
) -> List[Tuple[str, str]]:
    """Pair every folder with its label sign (``"+"`` or ``"-"``).

    Every folder in each list is paired explicitly.  ``zip("+", folders)``
    would stop after the first folder, because ``zip`` ends at its shortest
    input and the sign is a one-character string.
    """
    return [("+", folder) for folder in positive] + [
        ("-", folder) for folder in negative
    ]


def vw_example(value: str, document: Iterable[Any]) -> str:
    """Format one example line from a label sign and a tokenised document.

    *document* is any iterable of tokens exposing ``lemma_`` and ``is_stop``;
    tokens flagged as stop words are left out.
    """
    lemmas = " ".join(token.lemma_ for token in document if not token.is_stop)
    # TODO: eventually, put additional features here.
    return f"{value}1 | {lemmas}"


def collect_examples(
    nlp: Callable[[str], Iterable[Any]],
    labelled: Iterable[Tuple[str, str]],
) -> List[str]:
    """Build one example line per non-German ``*.txt`` file under each folder.

    *nlp* is called with the sanitized file content and must return an
    iterable of tokens (see :func:`vw_example`).
    """
    examples = []
    for value, folder in labelled:
        for text_file in glob.glob(f'{folder}/**/*.txt', recursive=True):
            print(text_file)
            # Decode explicitly so the result does not depend on the locale.
            # Undecodable bytes become U+FFFD, which sanitize() strips.
            with open(text_file, "r", encoding="utf-8", errors="replace") as handle:
                content = handle.read()
            if is_german(content):
                continue
            examples.append(vw_example(value, nlp(sanitize(content))))
    return examples


def main() -> None:
    """Load the spaCy model, build all examples, shuffle them and write the file."""
    # Imported here so the helpers above stay importable without spaCy installed.
    import spacy

    nlp = spacy.load(_SPACY_MODEL)
    lines = collect_examples(nlp, label_folders(foldersPositive, foldersNegative))
    random.shuffle(lines)
    with open(outFilename, "w", encoding="utf-8") as out:
        out.write("\n".join(lines))


if __name__ == "__main__":
    main()