#!/usr/bin/env python
"""Word-count mapper: the map step of a map/reduce word count.

Reads text lines from standard input and writes one ``word<TAB>1`` record
per word to standard output.  The output is meant to be consumed by the
Reduce step (``reducer.py``).
"""

import re
import sys

# Compiled once at import time so the per-line loop does not repeat the
# pattern lookup for every line/token.
_TOKEN_RE = re.compile(r'\b\w+\b')
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9]')


def main(stdin=None, stdout=None):
    """Emit a ``word<TAB>1`` line for every word found on ``stdin``.

    Each input line is stripped of surrounding whitespace and split into
    ``\\w+`` tokens.  Every token then has all characters other than ASCII
    letters and digits removed and is lower-cased before being written.

    Tokens that become empty after cleaning (for example ``___``, or a
    token made only of non-ASCII word characters) are skipped, so no
    record with an empty key is ever produced.

    Args:
        stdin: Iterable of text lines to read.  Defaults to ``sys.stdin``.
        stdout: Object with a ``write`` method that receives the records.
            Defaults to ``sys.stdout``.
    """
    # Resolve the defaults at call time so a redirected sys.stdin/sys.stdout
    # is honoured.
    source = sys.stdin if stdin is None else stdin
    sink = sys.stdout if stdout is None else stdout

    for line in source:
        for token in _TOKEN_RE.findall(line.strip()):
            word = _NON_ALNUM_RE.sub('', token).lower()
            if not word:
                # \w also matches "_" (and non-ASCII word characters), all
                # of which the substitution above removes; an empty key
                # would only pollute the reducer input.
                continue
            sink.write('%s\t%s\n' % (word, 1))


if __name__ == '__main__':
    main()