hadoop_sorted/mr/python/mapper.py

#!/usr/bin/env python
import sys
import re

# A token is a maximal run of word characters. Whitespace is deliberately NOT
# part of the token: the previous pattern r'\b[\w\s]+\b' also matched spaces,
# so "hello world" was captured as one token and, once the spaces were
# stripped, emitted as the single bogus key "helloworld".
_TOKEN_RE = re.compile(r'\w+')
# Everything except ASCII letters and digits is removed from each token.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9]')


def extract_words(line):
    """Return the normalized words contained in ``line``.

    Each run of word characters is treated as one token. Characters other
    than ASCII letters and digits (e.g. underscores) are removed from the
    token and the result is lowercased. Tokens that become empty after
    normalization are dropped so that no empty key is ever emitted.

    Args:
        line: One line of input text; surrounding whitespace is irrelevant
            because whitespace never becomes part of a token.

    Returns:
        A list of lowercase alphanumeric words, in order of appearance.
    """
    words = []
    for token in _TOKEN_RE.findall(line):
        word = _NON_ALNUM_RE.sub('', token).lower()
        if word:
            words.append(word)
    return words


def main(stream=None):
    """Read lines from ``stream`` and print ``word<TAB>1`` for every word.

    Args:
        stream: Iterable of text lines; defaults to ``sys.stdin``.

    What is written to STDOUT here is the input for the Reduce step,
    i.e. the input for reducer.py.
    """
    if stream is None:
        stream = sys.stdin
    for line in stream:
        for word in extract_words(line):
            print('%s\t%s' % (word, 1))


if __name__ == '__main__':
    main()
first 2024-04-05 17:14:07 +02:00			`#!/usr/bin/env python`
			`import sys`
			`import re`

			`# input comes from STDIN (standard input)`
			`for line in sys.stdin:`
			`# remove leading and trailing whitespace`
			`line = line.strip()`
			`# split the line into words`
update 2024-04-05 17:33:16 +02:00			`words = re.findall(r'\b[\w\s]+\b', line) # using regex to find words`
first 2024-04-05 17:14:07 +02:00			`# increase counters`
			`for word in words:`
			`# apply regex to remove non-alphanumeric characters and convert to lowercase`
			`word = re.sub(r'[^a-zA-Z0-9]', '', word).lower()`
			`# write the results to STDOUT (standard output);`
			`# what we output here will be the input for the`
			`# Reduce step, i.e. the input for reducer.py`
			`print('%s\t%s' % (word, 1))`