przemowoAnalizator/vowpalize.py

40 lines
1.5 KiB
Python

# Training (alternative option sets in brackets):
#   vw --binary vowpal.txt --passes 20 -c -k -f model.model [-b 24 --ngram 2 | --loss_function hinge -b 24 --nn 10 --inpass]
# Prediction on a held-out file:
#   vw --binary -t -i model.model -p pred.pred test.txt
# Best configuration found so far:
#   vw --binary vowpal.txt --passes 20 -c -k -f model.model --loss_function hinge -b 24 --nn 10 [--inpass]
import sys,re, spacy, glob, random
# Matches every character that is neither a word character nor whitespace,
# i.e. punctuation and symbols. Underscores count as word characters and
# therefore survive this pattern.
sanitizator = re.compile(r"[^\w\s]")

# Folders searched recursively for *.txt files of the positive class.
foldersPositive = [
    "piosenki",
    "war",
    "trockizm",
]

# Folders searched recursively for *.txt files of the negative class.
foldersNegative = [
    "negativeSongs",
]

# English spaCy pipeline, loaded once; supplies lemmas and stop-word flags.
nlp = spacy.load("en_core_web_sm")
def is_german(text):
    """Return True when *text* contains a common German function word.

    The check is a plain, case-sensitive substring test for the words
    "ist", "sind", "zur" and "und", each surrounded by single spaces.
    """
    german_markers = (" ist ", " sind ", " zur ", " und ")
    return any(marker in text for marker in german_markers)
def sanitize_from_git(text):
    """Cut *text* at the first run of four underscores.

    Everything from the first "____" to the end of the string is removed,
    including any following lines. Text without that marker is returned
    unchanged.

    NOTE(review): the marker is presumably a footer/separator found in the
    downloaded corpus files - confirm against the data.
    """
    # The flags must be passed by keyword: the fourth positional parameter
    # of re.sub is `count`, so passing them positionally silently limited the
    # number of substitutions and left DOTALL/MULTILINE switched off.
    return re.sub(r"____.*$", "", text, flags=re.MULTILINE | re.DOTALL)
def sanitize(text):
    """Normalise raw document text before it is handed to the NLP pipeline.

    Steps, in order: remove every character matched by ``sanitizator``,
    turn newlines into spaces, lowercase the result, and finally pass it
    through ``sanitize_from_git``.
    """
    without_symbols = sanitizator.sub("", text)
    single_line = without_symbols.replace("\n", " ")
    lowered = single_line.lower()
    return sanitize_from_git(lowered)
# Build the Vowpal Wabbit training file: one example per input document,
# formatted as "<sign>1 | <space-separated lemmas>".
lines = []
outFilename = "vowpal.txt"

# Pair the class sign with EVERY folder of that class. zip("+", folders)
# stops after the first folder because the string "+" has length one, which
# silently dropped all positive folders but the first.
labelledFolders = [("+", folder) for folder in foldersPositive] + [
    ("-", folder) for folder in foldersNegative
]

for value, folder in labelledFolders:
    files = glob.glob(f"{folder}/**/*.txt", recursive=True)
    for textFile in files:
        print(textFile)
        # Keep the file open only for the read itself.
        with open(textFile, "r") as textFileHandle:
            content = textFileHandle.read()
        # German documents are skipped; the check runs on the raw text.
        if is_german(content):
            continue
        document = nlp(sanitize(content))
        # Features are the lemmas of all non-stop-word tokens.
        lemmas = " ".join([token.lemma_ for token in document if not token.is_stop])
        # Eventually, put additional features here.
        lines.append(f"{value}1 | {lemmas}")

# A single shuffle already yields a uniformly random order; shuffling twice
# adds nothing.
random.shuffle(lines)

# The file is only written, never read back, so plain "w" is sufficient.
with open(outFilename, "w") as f:
    f.write("\n".join(lines))