From 9e58d37a8cb2aadca5b8cf41c34aa657e2c58521 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Skurzy=C5=84ski?=
Date: Fri, 5 Apr 2024 15:34:31 +0200
Subject: [PATCH] update

---
Reviewer notes (text in this section is ignored by `git am`):

 * Purpose: replace the collect-everything-then-sort-by-word-length reducer
   with a streaming reducer that sums the counts of consecutive identical
   words and prints one "word<TAB>total" record per word.
 * The two flush conditions now test `current_word is not None`:
     - the end-of-input flush used to compare `current_word == word`, which
       is also true when no line was read at all (both still None) and then
       printed a bogus "None<TAB>0" record; it could also be false when the
       last input line was discarded by the `continue` above, which would
       drop the final group;
     - the in-loop flush used a truthiness test, which skipped an
       empty-string key.
 * Both changes are one-for-one line replacements, so the hunk line counts
   and the diffstat below are unchanged. The post-image blob id on the
   `index` line was not recomputed for this revision.
 * NOTE(review): the leading whitespace of context and removed lines was
   reconstructed from the Python block structure - confirm against the
   target file before applying.

 mr/python/reducer.py | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/mr/python/reducer.py b/mr/python/reducer.py
index 1fe2f28..44c36fd 100644
--- a/mr/python/reducer.py
+++ b/mr/python/reducer.py
@@ -1,8 +1,10 @@
+#!/usr/bin/env python
 from operator import itemgetter
 import sys
 
-# Lista do przechowywania wyników
-results = []
+current_word = None
+current_count = 0
+word = None
 
 # input comes from STDIN
 for line in sys.stdin:
@@ -20,12 +22,18 @@ for line in sys.stdin:
         # ignore/discard this line
         continue
 
-    # Dodaj słowo i jego długość do listy wyników
-    results.append((word, len(word), count))
+    # this IF-switch only works because Hadoop sorts map output
+    # by key (here: word) before it is passed to the reducer
+    if current_word == word:
+        current_count += count
+    else:
+        if current_word is not None:
+            # write the finished group to STDOUT
+            print('%s\t%s' % (current_word, current_count))
+        current_count = count
+        current_word = word
 
-# Posortuj wyniki po długości słowa
-results.sort(key=lambda x: x[1])
+# flush the final group; skipped when no valid line was read
+if current_word is not None:
+    print('%s\t%s' % (current_word, current_count))
 
-# Wypisz posortowane wyniki
-for result in results:
-    print('%s\t%s' % (result[0], result[2]))