import re

# Patterns compiled once at import time; each call to tokenize() reuses them.
_WHITESPACE_RE = re.compile(r'(\s+|\\n)')          # whitespace runs, or a literal backslash-n in the text
_URL_RE = re.compile(r'(https?:|www)\S+(\s+|$)')   # http/https/www URL up to the next whitespace or end
_NUM_RE = re.compile(r'\d+')                       # any run of digits


def tokenize(d):
    """Take a document string and return a list of tokens.

    Normalization steps, in order:
      1. Collapse whitespace runs (and literal ``\\n`` sequences) to a single space.
      2. Lowercase the text.
      3. Replace URLs with the special token ``URL``.
      4. Pad sentence punctuation (. , ? !) with a leading space so each
         mark becomes its own token.
      5. Replace digit runs with the special token ``NUM``.

    Parameters:
        d: the raw document text.

    Returns:
        A list of token strings; empty for empty or whitespace-only input.
    """
    d = _WHITESPACE_RE.sub(' ', d)
    # Lowercase BEFORE inserting the URL placeholder so the special token
    # stays uppercase 'URL' (consistent with 'NUM'), and so uppercase URLs
    # ('WWW.…', 'HTTP:…') are matched by the lowercase-only pattern.
    d = d.lower()
    d = _URL_RE.sub(' URL ', d)
    d = d.replace(".", " .").replace(",", " ,").replace("?", " ?").replace("!", " !")
    d = _NUM_RE.sub('NUM', d)
    # str.split() with no argument drops the empty strings that
    # re.split(r'\s+') produced for leading/trailing whitespace
    # (e.g. the trailing space left by the ' URL ' substitution).
    return d.split()