54 lines
1.2 KiB
Python
54 lines
1.2 KiB
Python
import os
|
|
import nltk
|
|
import re
|
|
|
|
# Count the input lines in-process rather than shelling out to `wc -l`: no
# external tool is required and no pipe is left open. A final line without a
# trailing newline is counted too, which matches how the file is iterated.
with open("reviews", "r") as counted:
    number_lines = sum(1 for _ in counted)

# Split boundaries expressed as (fractional) line numbers: lines up to
# `pierwszy` (80%) go to the learning set, lines up to `drugi` (90%) go to the
# development set, and the rest go to the testing set.
pierwszy = 0.8 * float(number_lines)
drugi = 0.9 * float(number_lines)

# Input stream consumed by the split loop further down.
dane = open("reviews", "r")

# 1-based number of the input line currently being processed.
linia = 1

# NOTE(review): the outputs are opened in append mode, so re-running the
# script adds to existing files instead of replacing them -- confirm intended.
learning_s = open("learning", "a")
development_s = open("development", "a")
testing_s = open("testing", "a")
|
|
|
|
|
|
def magic(text, tokenize=None):
    """Turn *text* into a ``|``-prefixed list of ``token:.count`` features.

    The text is tokenized and every distinct token is emitted once, in the
    dictionary's iteration order (order of first appearance on Python 3.7+),
    followed by ``:.`` and the number of times the token occurs.

    Args:
        text: The string to tokenize.
        tokenize: Optional callable mapping a string to an iterable of
            tokens. Defaults to ``nltk.word_tokenize``.

    Returns:
        A string starting with ``"|"`` followed by one ``" token:.count"``
        entry per distinct token (just ``"|"`` when there are no tokens).
    """
    if tokenize is None:
        tokenize = nltk.word_tokenize

    counts = {}
    for token in tokenize(text):
        # Accumulate occurrences; assigning +1 on a repeat (``=+ 1``) would
        # leave every repeated token stuck at a count of 1.
        counts[token] = counts.get(token, 0) + 1

    # NOTE(review): the ":." separator makes a count of 2 read as ".2" (and a
    # count of 10 as ".10") to a consumer that parses the value as a number --
    # confirm this scaling is intended rather than a plain ":" separator.
    return "|" + "".join(
        " " + token + ":." + str(count) for token, count in counts.items()
    )
|
|
|
|
|
|
# A usable line starts with a numeric label, optionally followed by spaces or
# dashes, and then text whose first character is not a digit.
pattern = re.compile(r"^([0-9]+)( *|-*)([^0-9].*)")

try:
    for raw_line in dane:
        # ":", ".", "|" and the newline double as separators in the lines
        # written below, so they are blanked out of the review text first.
        cleaned = raw_line
        for separator in (":", ".", "|", "\n"):
            cleaned = cleaned.replace(separator, " ")
        reg = pattern.search(cleaned.lower())

        if reg is not None:
            # Route the example by the position of its line in the input.
            if linia <= pierwszy:
                target = learning_s
            elif linia <= drugi:
                target = development_s
            else:
                target = testing_s
            # Output line: the numeric label followed by the feature string.
            target.write(reg.group(1) + magic(reg.group(3)) + '\n')

        # NOTE(review): assumes the counter advances for every input line,
        # matched or not, because the thresholds are fractions of the total
        # line count -- confirm against the intended split.
        linia += 1
finally:
    # Close every handle even if reading, tokenizing or writing fails.
    learning_s.close()
    dane.close()
    development_s.close()
    testing_s.close()

# Train only after the learning file has been closed (and therefore flushed).
# os.system waits for the command to finish, so the script does not exit
# while training is still running with an unread output pipe.
os.system("vw learning -f model_ai")