13 lines
334 B
Python
13 lines
334 B
Python
import re
|
|
|
|
"""
|
|
Takes a document and returns a list of tokens.
|
|
"""
|
|
def tokenize(d):
    """Split a document into a list of normalised tokens.

    The following steps are applied in order:

    1. Every run of whitespace, and every literal backslash-followed-by-n
       pair of characters, becomes a single space.
    2. Anything starting with ``http:``, ``https:`` or ``www`` (up to the
       next whitespace) is replaced by a ``URL`` placeholder.
    3. The text is lowercased, so the placeholder ends up as ``url``.
    4. ``.``, ``,``, ``?`` and ``!`` are detached from the preceding text
       so that each becomes its own token.
    5. Every run of digits is replaced by ``NUM`` (done after lowercasing,
       so this placeholder stays uppercase).

    Args:
        d: The document text.

    Returns:
        A list of non-empty token strings; an empty or whitespace-only
        document yields an empty list.
    """
    text = re.sub(r'(\s+|\\n)', ' ', d)
    # URLs are replaced before punctuation is split so that the dots
    # inside them are not turned into separate tokens.
    text = re.sub(r'(https?:|www)\S+(\s+|$)', ' URL ', text)
    text = re.sub(r'([.,?!])', r' \1', text.lower())
    text = re.sub(r'\d+', 'NUM', text)
    # str.split() drops the empty strings that splitting on r'\s+' would
    # produce for leading/trailing whitespace (e.g. after a trailing URL).
    return text.split()
|