40 lines
1.5 KiB
Python
40 lines
1.5 KiB
Python
|
#vw --binary vowpal.txt --passes 20 -c -k -f model.model [-b 24 --ngram 2 | --loss_function hinge -b 24 --nn 10 --inpass]
|
||
|
#vw --binary -t -i model.model -p pred.pred test.txt
|
||
|
|
||
|
|
||
|
#this is the best one! vw --binary vowpal.txt --passes 20 -c -k -f model.model --loss_function hinge -b 24 --nn 10 [--inpass]
|
||
|
import sys,re, spacy, glob, random
|
||
|
# Matches every character that is neither a word character nor whitespace;
# sanitize() uses it to strip such characters from the raw text.
sanitizator = re.compile(r"[^\w\s]")

# Folders searched recursively for *.txt files. Texts found under
# foldersPositive are emitted with a "+1" label, those under
# foldersNegative with a "-1" label.
foldersPositive = ["piosenki", "war", "trockizm"]
foldersNegative = ["negativeSongs"]

# spaCy pipeline that supplies the lemmas and stop-word flags for each text.
nlp = spacy.load("en_core_web_sm")
|
||
|
|
||
|
def is_german(text):
    """Return True when *text* contains one of a few common German words.

    The check is a plain, case-sensitive substring test: each marker word
    has to appear surrounded by single spaces.
    """
    german_markers = (" ist ", " sind ", " zur ", " und ")
    return any(marker in text for marker in german_markers)
|
||
|
def sanitize_from_git(text):
    """Remove everything from the first run of four underscores to the end.

    Args:
        text: The text to clean; it may span several lines.

    Returns:
        *text* truncated just before the first "____" marker, or *text*
        unchanged when no marker is present.
    """
    # NOTE(review): the "____" marker is presumably a footer/signature
    # separator in the source files -- confirm against the data.
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub() is `count`, so passing them positionally silently applied no
    # flags at all and the cut never crossed a newline.
    return re.sub(r"____.*$", "", text, flags=re.MULTILINE | re.DOTALL)
|
||
|
|
||
|
def sanitize(text):
    """Normalise raw text before it is handed to the NLP pipeline.

    Steps, in order: drop every character matched by ``sanitizator``,
    replace newlines with spaces, lower-case the result and finally pass it
    through ``sanitize_from_git``.
    """
    without_symbols = sanitizator.sub("", text)
    single_line = without_symbols.replace("\n", " ")
    return sanitize_from_git(single_line.lower())
|
||
|
# Build the Vowpal Wabbit training file: one "<label> | <lemmas>" line per
# input text, written to outFilename in random order.
lines = []
outFilename = "vowpal.txt"

# Pair every folder with its label sign. The previous zip("+", folders)
# stopped after the first folder because the string "+" has length 1, so
# all remaining positive folders were silently skipped.
labelledFolders = [("+", folder) for folder in foldersPositive]
labelledFolders += [("-", folder) for folder in foldersNegative]

for value, folder in labelledFolders:
    for textFile in glob.glob(f'{folder}/**/*.txt', recursive=True):
        print(textFile)
        # NOTE(review): files are read with the platform default encoding;
        # confirm the corpus encoding before pinning one here.
        with open(textFile, "r") as textFileHandle:
            content = textFileHandle.read()

        # Skip German texts.
        if is_german(content):
            continue

        document = nlp(sanitize(content))
        # Keep the lemma of every token that is not a stop word.
        lemmas = " ".join(token.lemma_ for token in document if not token.is_stop)
        # eventually, put additional features here

        lines.append(f"{value}1 | {lemmas}")

# A single shuffle already yields a uniformly random order.
random.shuffle(lines)

with open(outFilename, "w") as f:
    f.write("\n".join(lines))
|