paranormal-or-skeptic/mytokenize.py

13 lines
334 B
Python

import re
"""
Takes a document and returns a list of tokens.
"""
def tokenize(d):
    """Split a document into a list of normalized tokens.

    The normalization steps run in this order:

    1. Every run of whitespace, and every literal backslash-plus-``n``
       character pair (an escaped newline left in the raw text), becomes a
       single space.
    2. Anything starting with ``http:``, ``https:`` or ``www`` up to the next
       whitespace is replaced by a URL placeholder.
    3. The text is lower-cased. This also lower-cases the placeholder from
       step 2, so URLs come out as the token ``url``.
    4. The punctuation marks ``.``, ``,``, ``?`` and ``!`` are detached from
       the preceding text so they become tokens of their own.
    5. Every run of digits is replaced by the token ``NUM`` (inserted after
       lower-casing, so it stays upper-case).

    Args:
        d: The document text as a ``str``.

    Returns:
        A list of non-empty token strings. Empty or whitespace-only input
        yields an empty list.
    """
    # Collapse real whitespace and escaped "\n" sequences to single spaces.
    d = re.sub(r'(\s+|\\n)', ' ', d)
    # Replace URLs before punctuation handling so their dots are not split.
    d = re.sub(r'(https?:|www)\S+(\s+|$)', ' URL ', d)
    d = d.lower()
    # Put a space in front of sentence punctuation so it splits off below.
    for mark in ".,?!":
        d = d.replace(mark, " " + mark)
    d = re.sub(r'\d+', 'NUM', d)
    # str.split() without arguments drops the empty strings that
    # re.split(r'\s+', ...) would emit for leading/trailing whitespace
    # (e.g. the padding around a URL placeholder at the start of the text).
    return d.split()