# hadoop_zaliczenie/mr/python/mapper.py

#!/usr/bin/env python
import sys
import re

# Tokenizer for raw input lines: maximal runs of word characters.
WORD_RE = re.compile(r'\b\w+\b')
# Every character that is not an ASCII letter or digit is dropped from a token.
NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9]')


def iter_words(line):
    """Yield the normalized words found in one line of text.

    Each token matched by ``WORD_RE`` has every character other than ASCII
    letters and digits removed and is then lowercased.  Tokens that end up
    empty after that cleanup (for example ``_`` or a word consisting only
    of non-ASCII letters) are skipped, so an empty key is never produced.
    """
    for token in WORD_RE.findall(line):
        word = NON_ALNUM_RE.sub('', token).lower()
        # An empty word would be emitted as the record "\t1", i.e. a count
        # for an empty key; drop it instead.
        if word:
            yield word


def main(stream=None):
    """Write one ``word<TAB>1`` record to STDOUT for every word in *stream*.

    *stream* is any iterable of text lines; it defaults to ``sys.stdin``.
    What is written here will be the input for the Reduce step, i.e. the
    input for reducer.py.
    """
    if stream is None:
        stream = sys.stdin
    for line in stream:
        for word in iter_words(line):
            print('%s\t%s' % (word, 1))


if __name__ == '__main__':
    main()