#!/usr/bin/python3
"""Tokenize raw documents into lowercased, stopword-free word lists."""

import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Requires the NLTK tokenizer models and English stopword list, e.g.:
#   nltk.download('punkt'); nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
printable = set(string.printable)


def tokenize(d):
    # Collapse every URL into a single placeholder token so links are
    # treated as one feature instead of being split into fragments.
    d = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        'thereisasimplelinkinside', d, flags=re.MULTILINE)
    # Turn literal "\n" escape sequences into spaces.
    d = re.sub(r'\\n', ' ', d)
    # Replace common punctuation/markup characters with spaces.
    d = re.sub(r'[\*\'"/~_=-]', ' ', d)
    # Drop non-printable (e.g. non-ASCII) characters.
    d = ''.join(filter(lambda x: x in printable, d))
    tokenized = word_tokenize(d)
    lower = [w.lower() for w in tokenized]
    words = [w for w in lower if w not in stop_words]
    return words
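
# --- Usage sketch (hypothetical sample input, not part of the pipeline) ---
# Illustrates the cleaning steps: the URL collapses to the placeholder
# token, the literal "\n" and punctuation become spaces, and stopwords
# such as "it" are dropped. Exact output may vary slightly with the
# NLTK version's tokenizer and stopword list.
if __name__ == '__main__':
    sample = "Check https://example.com/page?id=1 -- it's *great*!\\n"
    print(tokenize(sample))
    # Expected output (roughly):
    # ['check', 'thereisasimplelinkinside', 'great', '!']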