# Sebastian Raschka 2014-2020
# mlxtend Machine Learning Library Extensions
#
# Functions for tokenizing text data.
# Author: Sebastian Raschka
#
# License: BSD 3 clause

import re

# Matches anything that looks like an HTML/XML tag so markup can be dropped
# before tokenizing.
_HTML_TAG_RE = re.compile(r'<[^>]*>')

# Emoticon = eyes (':', ';' or '='), an optional '-' nose, and a mouth
# (')', '(', 'D' or 'P').
_EMOTICON_RE = re.compile(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)')

# One or more consecutive non-word characters; used to split text into words.
_NON_WORD_RE = re.compile(r'[\W]+')


def tokenizer_words_and_emoticons(text):
    """Convert text to lowercase words and emoticons.

    HTML-like tags are removed first. The remaining text is lowercased and
    split into words on runs of non-word characters; the emoticons found in
    the (tag-free, original-case) text are appended after the words.

    Parameters
    ----------
    text : str
        The input text.

    Returns
    -------
    tokens : list of str
        All lowercase words in order of appearance, followed by all
        emoticons in order of appearance.

    Notes
    -----
    Letters that are part of an emoticon (e.g. the ``D`` in ``:D``) are
    word characters and therefore also show up as a lowercase word token:
    ``'Hi :D'`` yields ``['hi', 'd', ':D']``.

    Examples
    -----------
    >>> tokenizer_words_and_emoticons('This :) is :( a test :-)!')
    ['this', 'is', 'a', 'test', ':)', ':(', ':-)']

    For more usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/text/tokenizer_words_and_emoticons/

    """
    stripped = _HTML_TAG_RE.sub('', text)
    # Emoticons must be collected before lowercasing/punctuation removal,
    # otherwise ':D' would become ':d' and ':)' would vanish entirely.
    emoticons = _EMOTICON_RE.findall(stripped)
    words = _NON_WORD_RE.sub(' ', stripped.lower()).split()
    # Concatenate the lists rather than the strings: gluing the word string
    # directly to the emoticon string fused the last word with the first
    # emoticon whenever the text ended in a word character
    # (e.g. 'great :) movie' -> 'movie:)').
    return words + emoticons


def tokenizer_emoticons(text):
    """Return emoticons from text.

    HTML-like tags are removed before searching, so emoticon-like character
    sequences inside tags are not reported.

    Parameters
    ----------
    text : str
        The input text.

    Returns
    -------
    emoticons : list of str
        The emoticons found in `text`, in order of appearance
        (empty if there are none).

    Examples
    -----------
    >>> tokenizer_emoticons('This :) is :( a test :-)!')
    [':)', ':(', ':-)']

    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/text/tokenizer_emoticons/

    """
    stripped = _HTML_TAG_RE.sub('', text)
    return _EMOTICON_RE.findall(stripped)