# (removed scraped page metadata — "22 lines", "652 B", "Python" — not part of the source)
#!/usr/bin/python3
|
|
|
|
from nltk.tokenize import word_tokenize
|
|
from nltk.corpus import stopwords
|
|
import nltk
|
|
import re
|
|
import string
|
|
|
|
|
|
# English stop-word list from the NLTK corpus, materialized as a set for
# O(1) membership tests in tokenize(). Requires the 'stopwords' corpus to
# have been downloaded (nltk.download('stopwords')).
stop_words = set(stopwords.words('english'))

# Set of ASCII-printable characters (string.printable); tokenize() drops
# every character outside this set.
printable = set(string.printable)
def tokenize(d):
    """Normalize raw text ``d`` and return its lowercased, stop-word-free tokens.

    Pipeline: URLs are collapsed to a single placeholder token, literal
    backslash-n escape sequences and selected punctuation become spaces,
    non-ASCII-printable characters are stripped, then the text is
    word-tokenized, lowercased, and filtered against the English stop-word set.
    """
    # Collapse every URL into one sentinel token so links count as a single word.
    d = re.sub(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        'thereisasimplelinkinside',
        d,
        flags=re.MULTILINE,
    )
    # Literal "\n" escape sequences (backslash + n, not real newlines) -> space.
    d = re.sub(r'\\n', ' ', d)
    # Punctuation that should not glue words together -> space.
    d = re.sub(r'\*|\'|\"|\/|~|_|=|-', ' ', d)
    # Keep only ASCII-printable characters.
    d = ''.join(ch for ch in d if ch in printable)
    # Tokenize, lowercase, and drop English stop words in one pass.
    return [tok for tok in (w.lower() for w in word_tokenize(d))
            if tok not in stop_words]