hadoop_zaliczenie/mr/python/mapper.py

19 lines
671 B
Python

#!/usr/bin/env python
import sys
import re
# Tokens are runs of word characters (letters, digits, underscore) between
# word boundaries.
WORD_RE = re.compile(r'\b\w+\b')
# Everything except ASCII letters and digits is stripped from each token, so
# underscores and non-ASCII characters are dropped.
NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9]')


def map_line(line):
    """Return the normalised words found in one line of input.

    Each token matched by ``WORD_RE`` has its non-ASCII-alphanumeric
    characters removed and is lower-cased. Tokens that become empty after
    this clean-up (for example ``___`` or a word made only of non-ASCII
    letters) are skipped, so that no empty key is ever emitted.

    :param line: one line of raw input text.
    :returns: list of cleaned, lower-cased words in order of appearance.
    """
    words = []
    # remove leading and trailing whitespace before tokenising
    for token in WORD_RE.findall(line.strip()):
        word = NON_ALNUM_RE.sub('', token).lower()
        if word:
            words.append(word)
    return words


def main(stream=None):
    """Read lines from ``stream`` and print one ``word<TAB>1`` pair per word.

    :param stream: iterable of text lines; defaults to ``sys.stdin``.
    """
    if stream is None:
        # input comes from STDIN (standard input)
        stream = sys.stdin
    for line in stream:
        for word in map_line(line):
            # write the results to STDOUT (standard output);
            # what we output here will be the input for the
            # Reduce step, i.e. the input for reducer.py
            print('%s\t%s' % (word, 1))


if __name__ == '__main__':
    main()