# Train a model:
#   vw --binary vowpal.txt --passes 20 -c -k -f model.model [-b 24 --ngram 2 | --loss_function hinge -b 24 --nn 10 --inpass]
# Predict with a trained model:
#   vw --binary -t -i model.model -p pred.pred test.txt
import glob
import random
import re
import sys

import spacy
# Root folders searched recursively for *.txt documents, one list per class.
foldersPositive = ["dev/pos"]
foldersNegative = ["dev/neg"]

# Matches any single character that is neither a word character nor
# whitespace; used to strip punctuation and other symbols from the text.
sanitizator = re.compile(r'[^\w\s]')

# spaCy pipeline that supplies the tokens, lemmas and stop-word flags.
nlp = spacy.load("en_core_web_sm")


# Common German function words; a whole-word hit marks the text as German.
_GERMAN_MARKERS = re.compile(r"\b(?:ist|sind|zur|und)\b")


def is_german(text):
    """Return True when *text* contains a common German function word.

    The check looks for "ist", "sind", "zur" or "und" as whole words.
    Matching on word boundaries (instead of requiring a literal space on
    both sides) also catches a marker that starts or ends the text, sits
    next to punctuation, or is followed by a line break.  The match is
    case-sensitive, so capitalised forms are not treated as markers.

    Args:
        text: Raw document text.

    Returns:
        bool: True if at least one marker word occurs in *text*.
    """
    return _GERMAN_MARKERS.search(text) is not None


def sanitize(text):
    """Strip symbols from *text*, flatten it to one line and lowercase it.

    Every character matched by the module-level ``sanitizator`` pattern is
    removed, each newline becomes a single space, and the result is
    lowercased.
    """
    without_symbols = sanitizator.sub("", text)
    single_line = " ".join(without_symbols.split("\n"))
    return single_line.lower()


# Build the labelled data set: one output line per document, formatted as
# "<sign>1 | <lemmas>", where <sign> is "+" for the positive folders and "-"
# for the negative ones.
lines = []
outFilename = "dev.txt"

# Pair every folder with its class sign.  zip("+", folders) would stop after
# the first folder, because the string "+" has only one character.
labelledFolders = [("+", folder) for folder in foldersPositive]
labelledFolders += [("-", folder) for folder in foldersNegative]

for value, folder in labelledFolders:
    for textFile in glob.glob(f'{folder}/**/*.txt', recursive=True):
        print(textFile)
        # Explicit encoding so decoding does not depend on the platform locale.
        with open(textFile, "r", encoding="utf-8") as textFileHandle:
            content = textFileHandle.read()
        # Documents flagged as German are left out of the data set.
        if is_german(content):
            continue
        document = nlp(sanitize(content))
        # Features are the lemmas of all tokens that are not stop words.
        lemmas = " ".join(token.lemma_ for token in document if not token.is_stop)
        # eventually, put additional features here
        lines.append(f"{value}1 | {lemmas}")

# A single shuffle already yields a uniformly random order.
random.shuffle(lines)

with open(outFilename, "w", encoding="utf-8") as f:
    f.write("\n".join(lines))