2024-04-05 15:34:31 +02:00
|
|
|
#!/usr/bin/env python
|
2024-04-05 15:15:25 +02:00
|
|
|
from operator import itemgetter
|
|
|
|
import sys
|
|
|
|
|
2024-04-05 15:34:31 +02:00
|
|
|
def reduce_counts(lines):
    """Sum the counts of consecutive lines that share the same word.

    Each input line is expected to look like ``"<word>\\t<count>"``.
    Lines are stripped of surrounding whitespace first; lines that have no
    tab separator or whose count is not an integer are skipped.

    The grouping only merges *adjacent* equal words, so the input must
    already be sorted by word (Hadoop sorts map output by key before it is
    passed to the reducer).

    Args:
        lines: iterable of text lines (e.g. ``sys.stdin``).

    Yields:
        ``(word, total)`` tuples, one per run of equal words, in input order.
    """
    current_word = None
    current_count = 0

    for line in lines:
        # remove leading and trailing whitespace
        line = line.strip()

        # parse the input we got from mapper.py; partition() never raises,
        # so blank or tab-less lines can be discarded instead of crashing
        word, separator, count = line.partition('\t')
        if not separator:
            continue

        # convert count (currently a string) to int
        try:
            count = int(count)
        except ValueError:
            # count was not a number, so silently ignore/discard this line
            continue

        if current_word == word:
            current_count += count
        else:
            # a new word starts: emit the finished group, if there is one
            if current_word is not None:
                yield current_word, current_count
            current_word = word
            current_count = count

    # do not forget to output the last word if needed!  Checking against
    # None (rather than the last parsed word) keeps the final group even
    # when the trailing input lines were malformed, and emits nothing at
    # all for empty input.
    if current_word is not None:
        yield current_word, current_count


def main(stream=None):
    """Read mapper output from *stream* and write word totals to STDOUT.

    Args:
        stream: iterable of input lines; defaults to ``sys.stdin``.
    """
    if stream is None:
        # input comes from STDIN
        stream = sys.stdin

    for word, total in reduce_counts(stream):
        # write result to STDOUT
        print('%s\t%s' % (word, total))


if __name__ == '__main__':
    main()
|
2024-04-05 15:15:25 +02:00
|
|
|
|