hadoop_zaliczenie/mr/python/mapper.py

19 lines
671 B
Python
Raw Normal View History

2024-03-27 13:49:30 +01:00
#!/usr/bin/env python
import sys
import re
# Compiled once at import time so the per-line loop does not repeat the
# pattern lookup for every input line / token.
_WORD_RE = re.compile(r'\b\w+\b')
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9]')


def main(lines=None, out=None):
    """Word-count mapper: emit one ``<word>\\t1`` record per word read.

    Each input line is stripped of surrounding whitespace and split into
    tokens with the ``\\b\\w+\\b`` pattern.  Every token is normalised by
    removing all characters outside ``a-z``, ``A-Z`` and ``0-9`` and
    converting it to lowercase.  The resulting records are meant to be the
    input of the reduce step (reducer.py).

    Args:
        lines: iterable of text lines to process; defaults to ``sys.stdin``.
        out: object with a ``write`` method receiving the records;
            defaults to ``sys.stdout``.
    """
    if lines is None:
        # input comes from STDIN (standard input)
        lines = sys.stdin
    if out is None:
        # results go to STDOUT (standard output)
        out = sys.stdout

    for line in lines:
        # remove leading and trailing whitespace, then split into words
        for token in _WORD_RE.findall(line.strip()):
            # drop non-alphanumeric characters and convert to lowercase
            word = _NON_ALNUM_RE.sub('', token).lower()
            if not word:
                # A token such as "___" matches \w+ but is emptied by the
                # normalisation above; skipping it avoids emitting a record
                # with an empty key ("\t1").
                continue
            # tab-delimited "<word>\t1" record, one per line
            out.write('%s\t%s\n' % (word, 1))


if __name__ == '__main__':
    main()