19 lines
589 B
Python
19 lines
589 B
Python
import nltk
|
|
#nltk.download()
|
|
from nltk.corpus import stopwords
|
|
|
|
|
|
def tokenize(d):
    """Split text into lowercase word tokens without punctuation or stopwords.

    The text is first cleaned with a fixed, ordered sequence of substring
    replacements, then split with ``nltk.word_tokenize``. Every token is
    lowercased, and tokens that are a listed punctuation mark or an NLTK
    English stopword are discarded.

    Args:
        d: The input text as a ``str``.

    Returns:
        A list of lowercase token strings in their original order.
    """
    punctuation = [
        '.', '?', ';', ':', ',', '(', ')', '"', "'", '`', '``', "''",
        '/', '//', '!', '+', '-', '*', '{', '}', '[', ']', '>', '<',
        "&", '~',
    ]

    # Applied strictly in this order: an earlier deletion can join characters
    # into a sequence that a later rule then matches.
    replacements = (
        ('/', ' '),
        ('\'', ''),
        ('*', ''),
        # Matches a literal backslash followed by "n", not a newline character.
        ("\\n", ' '),
        # NOTE(review): this also strips "gt" inside ordinary words
        # (e.g. "length" -> "lenh"); presumably aimed at "&gt;" entity
        # residue -- confirm before narrowing it.
        ('gt', ''),
        ('.', ' '),
        ('_', ' '),
        ('-', ' '),
    )
    for old, new in replacements:
        d = d.replace(old, new)

    tokens = nltk.word_tokenize(d)

    # A set gives O(1) membership tests instead of scanning a list per token.
    unwanted = set(punctuation)
    unwanted.update(stopwords.words('english'))

    # Lowercase BEFORE filtering: the stopword list is lowercase, so testing
    # the raw token would let capitalized stopwords such as "The" through.
    lowered = (token.lower() for token in tokens)
    return [token for token in lowered if token not in unwanted]
|