# Sebastian Raschka 2014-2020
# mlxtend Machine Learning Library Extensions
#
# Functions for tokenizing text data.
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause

import re


def tokenizer_words_and_emoticons(text):
    """Convert text to lowercase words and emoticons.

    HTML/XML-style tags are stripped first. The remaining text is
    lowercased and split into word tokens on runs of non-word
    characters; the emoticons found in the text (original case kept)
    are appended after the words.

    Parameters
    -----------
    text : str
        The input string to tokenize.

    Returns
    -----------
    tokens : list
        All lowercase word tokens in order of appearance, followed by
        all emoticons in order of appearance.

    Notes
    -----------
    The letter of an emoticon such as ':D' or ':P' is a word character,
    so it additionally shows up as a lowercase word token ('d', 'p').

    Examples
    -----------
    >>> tokenizer_words_and_emoticons('</a>This :) is :( a test :-)!')
    ['this', 'is', 'a', 'test', ':)', ':(', ':-)']
    >>> tokenizer_words_and_emoticons('hello :) world')
    ['hello', 'world', ':)']

    For more usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/text/tokenizer_words_and_emoticons/

    """
    # Drop anything that looks like a markup tag, e.g. '</a>'.
    text = re.sub(r'<[^>]*>', '', text)
    # Collect emoticons before lowercasing so ':D' / ':P' are still matched.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Tokenize the words on their own and only then append the emoticons.
    # Concatenating the two strings directly would glue the last word to
    # the first emoticon whenever the text ends in a word character
    # (e.g. 'hello :) world' -> 'world:)').
    words = re.sub(r'[\W]+', ' ', text.lower()).split()
    return words + emoticons


def tokenizer_emoticons(text):
    """Extract the emoticons contained in a string.

    Markup tags of the form '<...>' are removed before searching, so
    emoticon-like sequences inside a tag are not reported.

    Parameters
    -----------
    text : str
        The input string to search.

    Returns
    -----------
    emoticons : list
        The matched emoticons in order of appearance. An emoticon is
        an eye character (':', ';' or '='), an optional '-' nose and a
        mouth character (')', '(', 'D' or 'P').

    Examples
    -----------
    >>> tokenizer_emoticons('</a>This :) is :( a test :-)!')
    [':)', ':(', ':-)']

    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/text/tokenizer_emoticons/

    """
    tag_pattern = r'<[^>]*>'
    emoticon_pattern = r'(?::|;|=)(?:-)?(?:\)|\(|D|P)'
    without_tags = re.sub(tag_pattern, '', text)
    return re.findall(emoticon_pattern, without_tags)