PJN/dzielenie.py
Bamkoff 30f1a744ba s
2019-12-15 14:34:33 +01:00

54 lines
1.2 KiB
Python

import os
import re
import subprocess

import nltk
# Count the input lines in Python rather than shelling out to `wc -l`:
# no dependency on a POSIX shell, no unclosed pipe, and the count agrees
# with what iterating over the file yields (an unterminated last line is
# counted too).
with open("reviews", "r") as counting_handle:
    number_lines = sum(1 for _ in counting_handle)

# Split thresholds expressed as line numbers: lines up to `pierwszy` (80 %)
# are written to the learning file, lines up to `drugi` (90 %) to the
# development file, and the remainder to the testing file.
pierwszy = 0.8 * number_lines
drugi = 0.9 * number_lines

dane = open("reviews", "r")
linia = 1  # 1-based number of the input line currently being processed

# Output files are opened in append mode, so running the script again adds
# to whatever these files already contain.
learning_s = open("learning", "a")
development_s = open("development", "a")
testing_s = open("testing", "a")
def magic(text):
    """Turn *text* into a token-count feature string.

    The text is tokenized with ``nltk.word_tokenize`` and the occurrences
    of each distinct token are counted. The result starts with ``"|"`` and
    contains one ``" <token>:.<count>"`` entry per distinct token, in the
    dictionary's iteration order.

    Returns ``"|"`` alone when tokenization yields no tokens.
    """
    tokeny = nltk.word_tokenize(text)
    slownik = {}
    for token in tokeny:
        # Increment the running count. The previous `slownik[n] =+1`
        # assigned +1 every time, so no count ever exceeded 1.
        slownik[token] = slownik.get(token, 0) + 1
    # NOTE(review): the count is written directly after a literal '.', so a
    # count of 2 becomes ":.2" and a count of 10 becomes ":.10" -- confirm
    # this is the intended value encoding.
    return "|" + "".join(
        " " + token + ":." + str(count) for token, count in slownik.items()
    )
# A usable line starts with a run of digits, optionally followed by spaces
# or dashes, followed by text whose first character is not a digit.
# Group 1 is the leading number, group 3 the remaining text.
LINE_PATTERN = re.compile(r"^([0-9]+)( *|-*)([^0-9].*)")

# ':', '.', '|' and the newline are replaced by spaces before matching;
# ':', '.' and '|' are the delimiters magic() emits in the output lines.
SEPARATORS = str.maketrans({":": " ", ".": " ", "|": " ", "\n": " "})

try:
    for n in dane:
        reg = LINE_PATTERN.search(n.translate(SEPARATORS).lower())
        if reg is not None:
            example = reg.group(1) + magic(reg.group(3)) + '\n'
            if linia <= pierwszy:
                learning_s.write(example)
            elif linia <= drugi:
                development_s.write(example)
            else:
                testing_s.write(example)
        # NOTE(review): the original indentation was lost; the counter is
        # advanced once per input line (matching or not) because the
        # thresholds are fractions of the total line count -- confirm.
        linia += 1
finally:
    # Close every handle even if tokenizing or writing fails part-way.
    learning_s.close()
    dane.close()
    development_s.close()
    testing_s.close()

# Train only after the learning file has been closed (and thus flushed).
# subprocess.run waits for vw to finish and, with check=True, raises
# CalledProcessError on a non-zero exit status; the previous os.popen call
# neither waited for the process nor looked at its result.
subprocess.run(["vw", "learning", "-f", "model_ai"], check=True)