19 lines
671 B
Python
19 lines
671 B
Python
#!/usr/bin/env python
|
|
import sys
|
|
import re
|
|
|
|
# input comes from STDIN (standard input)
|
|
# Runs of word characters (letters, digits, underscore); compiled once
# rather than looked up again for every input line.
_TOKEN_RE = re.compile(r'\b\w+\b')
# Everything except ASCII letters and digits is stripped from each token.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9]')


def main(stdin=None, stdout=None):
    """Emit one ``<word>\\t1`` record per word read from the input.

    Each input line is split into ``\\w+`` tokens; every token then has its
    non-alphanumeric (ASCII) characters removed and is lower-cased before
    being written as ``word<TAB>1`` on its own line. What is written here
    is the input for the Reduce step, i.e. the input for reducer.py.

    Tokens that become empty after normalization (for example ``___``,
    which ``\\w+`` matches but which contains no ASCII letter or digit) are
    skipped, so no record with an empty key is ever emitted.

    Args:
        stdin: iterable of text lines; defaults to ``sys.stdin``.
        stdout: object with a ``write`` method; defaults to ``sys.stdout``.
    """
    if stdin is None:
        stdin = sys.stdin
    if stdout is None:
        stdout = sys.stdout

    for line in stdin:
        # No explicit strip() needed: surrounding whitespace can never be
        # part of a \w+ match.
        for token in _TOKEN_RE.findall(line):
            word = _NON_ALNUM_RE.sub('', token).lower()
            if not word:
                # Token consisted only of characters that were stripped.
                continue
            # %-formatting plus write() keeps this runnable on whichever
            # interpreter `#!/usr/bin/env python` resolves to.
            stdout.write('%s\t%s\n' % (word, 1))


if __name__ == '__main__':
    main()
|