19 lines
589 B
Python
19 lines
589 B
Python
|
import nltk
|
||
|
# nltk.download()  # one-time setup: uncomment to fetch the NLTK tokenizer models and stopwords corpus
|
||
|
from nltk.corpus import stopwords
|
||
|
|
||
|
|
||
|
def tokenize(d):
    """Split the text ``d`` into lowercase tokens, dropping punctuation and stopwords.

    The text is first cleaned with a fixed sequence of substring
    replacements, then tokenized with ``nltk.word_tokenize``. Each token is
    lowercased, and tokens that are punctuation marks (see ``chars``) or
    NLTK English stopwords are discarded.

    Args:
        d: The input string to tokenize.

    Returns:
        A list of lowercase token strings, in their original order.
    """
    # Punctuation tokens that are discarded after tokenization.
    chars = ['.', '?', ';', ':', ',', '(', ')', '"', "'", '`', '``', "''", '/', '//', '!', '+', '-', '*',
             '{', '}', '[', ']', '>', '<', "&", '~']

    # Applied strictly in this order: an earlier replacement can create a
    # match for a later one (e.g. removing '*' from "g*t" yields "gt").
    replacements = (
        ('/', ' '),
        ('\'', ''),
        ('*', ''),
        ("\\n", ' '),  # literal backslash followed by 'n', not a newline character
        # NOTE(review): this removes "gt" everywhere, including inside words
        # ("length" -> "lenh"); presumably aimed at "&gt;" residue -- confirm.
        ('gt', ''),
        ('.', ' '),
        ('_', ' '),
        ('-', ' '),
    )
    for old, new in replacements:
        d = d.replace(old, new)

    tokens = nltk.word_tokenize(d)

    # A set gives O(1) membership tests instead of scanning a list per token.
    deletethis = set(chars) | set(stopwords.words('english'))

    # Lowercase BEFORE filtering: the stopword list is lowercase, so checking
    # the original-case token would let "The" / "And" slip through.
    lowered = (x.lower() for x in tokens)
    return [x for x in lowered if x not in deletethis]
|