paranormal-or-skeptic3/tokenizator.py

19 lines
589 B
Python
Raw Permalink Normal View History

2020-12-15 16:40:10 +01:00
import nltk
#nltk.download()
from nltk.corpus import stopwords
# Single-character clean-up applied before tokenizing: separators become a
# space, apostrophes and asterisks are removed outright.
_CHAR_TABLE = str.maketrans({
    '/': ' ',
    '.': ' ',
    '_': ' ',
    '-': ' ',
    "'": None,
    '*': None,
})

# Punctuation tokens to discard. '``' and "''" are included because
# nltk.word_tokenize can emit them for double-quote characters.
_PUNCTUATION = (
    '.', '?', ';', ':', ',', '(', ')', '"', "'", '`', '``', "''", '/', '//',
    '!', '+', '-', '*', '{', '}', '[', ']', '>', '<', '&', '~',
)

# Lazily built set of tokens to discard; see _drop_tokens().
_drop_tokens_cache = None


def _drop_tokens():
    """Return the set of tokens to discard, building it on first use."""
    global _drop_tokens_cache
    if _drop_tokens_cache is None:
        # Built once: loading the stop-word corpus on every call is wasteful,
        # and a frozenset gives O(1) membership tests per token.
        # 'gt' covers a bare remnant of the '&gt;' entity (e.g. '&gt' without
        # the trailing semicolon, where '&' is dropped as punctuation).
        _drop_tokens_cache = (
            frozenset(_PUNCTUATION)
            | frozenset(stopwords.words('english'))
            | frozenset(['gt'])
        )
    return _drop_tokens_cache


def tokenize(d):
    """Split a document into lower-cased tokens without punctuation or stop words.

    The text is cleaned first: '/', '.', '_' and '-' become spaces,
    apostrophes and asterisks are removed, a literal backslash followed by
    'n' (an escaped newline in the raw text) becomes a space, and the HTML
    entity '&gt;' becomes a space. The result is tokenized with
    nltk.word_tokenize, each token is lower-cased, and punctuation tokens,
    English stop words and the bare token 'gt' are discarded.

    Filtering happens after lower-casing, so capitalized stop words such as
    'The' are removed as well (and so is an upper-case 'GT' token). Only the
    '&gt;' entity and stand-alone 'gt' tokens are removed; the letters 'gt'
    inside a word (e.g. 'length') are kept intact.

    Args:
        d: The document text.

    Returns:
        A list of lower-cased token strings, in document order.

    Raises:
        LookupError: Raised by NLTK if the tokenizer models or the stop-word
            corpus have not been downloaded.
    """
    cleaned = d.translate(_CHAR_TABLE)
    # "\\n" is the two-character sequence backslash + 'n', not a newline.
    cleaned = cleaned.replace("\\n", ' ').replace('&gt;', ' ')

    drop = _drop_tokens()
    lowered = (token.lower() for token in nltk.word_tokenize(cleaned))
    return [token for token in lowered if token not in drop]