import nltk
# nltk.download()  # uncomment on first run to fetch 'punkt' and 'stopwords' data
from nltk.corpus import stopwords


def tokenize(d):
    """Tokenize document text into lowercase tokens, dropping punctuation
    and English stopwords.

    Parameters
    ----------
    d : str
        Raw document text (may contain literal "\\n" escapes and markup
        residue from scraping).

    Returns
    -------
    list[str]
        Lowercased tokens with punctuation tokens and stopwords removed.
    """
    # Punctuation / markup fragments to discard after tokenization
    # ('``' and "''" are NLTK's quote tokens).
    chars = ['.', '?', ';', ':', ',', '(', ')', '"', "'", '`', '``', "''",
             '/', '//', '!', '+', '-', '*', '{', '}', '[', ']', '>', '<',
             "&", '~']

    # Pre-clean: split on slashes/dots/underscores/hyphens, drop quotes and
    # asterisks, collapse literal "\n" escape sequences to spaces.
    # NOTE(review): .replace('gt', '') deletes the substring "gt" from EVERY
    # word (e.g. "night" -> "nih"); presumably it was meant to scrub "&gt;"
    # residue — behavior kept as-is, confirm intent before changing.
    d = (d.replace('/', ' ').replace('\'', '').replace('*', '')
          .replace("\\n", ' ').replace('gt', '').replace('.', ' ')
          .replace('_', ' ').replace('-', ' '))
    # print(d)

    tokens = nltk.word_tokenize(d)

    # Set union gives O(1) membership tests; the original scanned a
    # ~190-element list for every token.
    deletethis = set(chars) | set(stopwords.words('english'))

    # Lowercase BEFORE filtering: stopwords.words('english') is all
    # lowercase, so the original's filter-then-lowercase order let
    # capitalized stopwords ("The", "And", ...) through.
    lowered = (t.lower() for t in tokens)
    return [t for t in lowered if t not in deletethis]