diff --git a/mr/python/reducer.py b/mr/python/reducer.py
index 1fe2f28..44c36fd 100644
--- a/mr/python/reducer.py
+++ b/mr/python/reducer.py
@@ -1,8 +1,10 @@
+#!/usr/bin/env python
 from operator import itemgetter
 import sys
 
-# Lista do przechowywania wyników
-results = []
+current_word = None
+current_count = 0
+word = None
 
 # input comes from STDIN
 for line in sys.stdin:
@@ -20,12 +22,20 @@ for line in sys.stdin:
         # ignore/discard this line
         continue
 
-    # Dodaj słowo i jego długość do listy wyników
-    results.append((word, len(word), count))
+    # this IF-switch only works because Hadoop sorts map output
+    # by key (here: word) before it is passed to the reducer
+    if current_word == word:
+        current_count += count
+    else:
+        if current_word:
+            # write result to STDOUT
+            print('%s\t%s' % (current_word, current_count))
+        current_count = count
+        current_word = word
 
-# Posortuj wyniki po długości słowa
-results.sort(key=lambda x: x[1])
+# Flush the last group. Check current_word itself instead of comparing it
+# with word: on empty input both are still None, which printed "None\t0";
+# presumably word can also come from a discarded last line (confirm).
+if current_word is not None:
+    print('%s\t%s' % (current_word, current_count))
 
-# Wypisz posortowane wyniki
-for result in results:
-    print('%s\t%s' % (result[0], result[2]))