paranormal-or-skeptic3/tokenizator.py
Michal Maciaszek 9f31d8cc24 solution
2020-12-15 16:40:10 +01:00

19 lines
589 B
Python

import re

import nltk
#nltk.download()
from nltk.corpus import stopwords
# Punctuation tokens that are discarded after tokenizing.
_PUNCTUATION_TOKENS = frozenset([
    '.', '?', ';', ':', ',', '(', ')', '"', "'", '`', '``', "''", '/', '//',
    '!', '+', '-', '*', '{', '}', '[', ']', '>', '<', '&', '~',
])

# Single-character clean-up applied before tokenizing: separators become a
# space, apostrophes and asterisks are removed outright.
_CHAR_TABLE = str.maketrans({
    '/': ' ', '.': ' ', '_': ' ', '-': ' ',
    "'": None, '*': None,
})

# "gt" is removed only as a whole word (e.g. the middle of "&gt;"), so words
# that merely contain it, such as "length", are left intact.
_GT_WORD = re.compile(r'\bgt\b')

# Lazily built set of tokens to discard; see _tokens_to_drop().
_drop_tokens = None


def _tokens_to_drop():
    """Return punctuation plus English stop words, loading the list once."""
    global _drop_tokens
    if _drop_tokens is None:
        # Loaded on first use rather than at import time so that importing
        # this module does not require the NLTK corpus to be present.
        _drop_tokens = _PUNCTUATION_TOKENS | frozenset(stopwords.words('english'))
    return _drop_tokens


def tokenize(d):
    """Split a document into lowercase tokens without punctuation or stop words.

    The text is first cleaned: ``/``, ``.``, ``_`` and ``-`` become spaces,
    apostrophes and ``*`` are removed, the literal two-character sequence
    backslash + ``n`` becomes a space, and the standalone word ``gt`` is
    removed. The result is tokenized with ``nltk.word_tokenize``.

    Args:
        d: The document text (a string).

    Returns:
        A list of lowercase tokens, excluding punctuation tokens and NLTK's
        English stop words. Stop words are matched case-insensitively.

    Raises:
        LookupError: Raised by NLTK when the tokenizer model or the stop-word
            corpus has not been downloaded (see ``nltk.download``).
    """
    cleaned = d.translate(_CHAR_TABLE).replace("\\n", ' ')
    cleaned = _GT_WORD.sub('', cleaned)

    drop = _tokens_to_drop()
    # Lowercase before filtering: the stop-word list is lowercase, so
    # filtering first would let capitalized stop words ("The", "I") through.
    lowered = (token.lower() for token in nltk.word_tokenize(cleaned))
    return [token for token in lowered if token not in drop]