hadoop_sorted/mr/python/mapper.py

19 lines
675 B
Python
Raw Normal View History

2024-04-05 17:14:07 +02:00
#!/usr/bin/env python
import sys
import re
# Word-count mapper: reads text lines and emits one "<word>\t1" record per
# word. The output is consumed by the Reduce step (reducer.py).

# A token is a maximal run of word characters. Whitespace is deliberately NOT
# part of the token: otherwise neighbouring words are captured together and
# fused into one key once the separators are stripped out.
_TOKEN_RE = re.compile(r'\w+')
# Everything outside ASCII letters and digits is dropped from a token.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9]')


def extract_words(line):
    """Return the normalized words found in *line*, in order of appearance.

    Each token (run of word characters) has every character other than
    ASCII letters and digits removed and is then lower-cased. Tokens that
    become empty after this normalization (for example ``___``) are skipped
    so that no record with an empty key is produced.
    """
    words = []
    for token in _TOKEN_RE.findall(line):
        word = _NON_ALNUM_RE.sub('', token).lower()
        if word:
            words.append(word)
    return words


def main(stdin=None, stdout=None):
    """Emit a ``<word>\\t1`` line for every word read from *stdin*.

    stdin  -- iterable of text lines; defaults to ``sys.stdin``.
    stdout -- object with a ``write`` method; defaults to ``sys.stdout``.
    """
    # Resolve the defaults at call time so redirected streams are honoured.
    if stdin is None:
        stdin = sys.stdin
    if stdout is None:
        stdout = sys.stdout
    for line in stdin:
        for word in extract_words(line):
            # What is written here is the input for the Reduce step,
            # i.e. the input for reducer.py.
            stdout.write('%s\t%s\n' % (word, 1))


if __name__ == '__main__':
    main()