"""Streaming reducer: re-emit ``word<TAB>count`` records ordered by word length.

Reads the tab-separated ``word<TAB>count`` lines produced by mapper.py from
STDIN, drops malformed lines, and prints the surviving records to STDOUT
sorted by the length of the word (shortest first).

NOTE(review): unlike the previous version of this reducer, counts are NOT
summed per word; every valid input line yields exactly one output line.
Confirm this is intended.
"""
from operator import itemgetter
import sys


def _parse_record(line):
    """Return ``(word, count)`` for one mapper line, or ``None`` if malformed.

    A line is malformed when, after stripping surrounding whitespace, it has
    no tab separator or its count field is not an integer.
    """
    fields = line.strip().split('\t', 1)
    if len(fields) != 2:
        # Blank line or missing separator: unpacking here used to raise an
        # uncaught ValueError and abort the whole reducer.
        return None

    word, raw_count = fields
    try:
        return word, int(raw_count)
    except ValueError:
        # count was not a number, so silently ignore/discard this line
        return None


def reduce_lines(lines):
    """Turn mapper output lines into output lines sorted by word length.

    Args:
        lines: iterable of strings, each expected to be ``word<TAB>count``.

    Returns:
        A list of ``'word\\tcount'`` strings (no trailing newline), ordered
        by ascending word length. Records whose words have equal length keep
        their input order because ``list.sort`` is stable.
    """
    # Each entry is (word, word length, count); the length is the sort key.
    records = []
    for line in lines:
        parsed = _parse_record(line)
        if parsed is None:
            continue
        word, count = parsed
        records.append((word, len(word), count))

    # Sort the results by word length.
    records.sort(key=itemgetter(1))

    return ['%s\t%s' % (word, count) for word, _length, count in records]


def main():
    """Read mapper output from STDIN and print the sorted records to STDOUT."""
    for output_line in reduce_lines(sys.stdin):
        print(output_line)


if __name__ == '__main__':
    main()