From 0aa3d7941ac6b462fa28ed42c53277d83eadcee9 Mon Sep 17 00:00:00 2001
From: s1201683
Date: Fri, 5 Apr 2024 15:42:49 +0200
Subject: [PATCH] revert d881302e4ec1eb9716489de1f45c3b8ff213b177 revert update

---
Reviewer notes (this section is not part of the commit message):
  * The restored reducer is no longer a byte-for-byte revert. It skips
    input lines that have no tab separator instead of crashing, prints
    nothing for empty input (previously "None<TAB>0"), and no longer
    loses the last word when the final input line has a bad count.
  * The post-image blob id on the "index" line below comes from the
    original patch and does not match the revised content.

 mr/python/reducer.py | 74 +++++++++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 31 deletions(-)

diff --git a/mr/python/reducer.py b/mr/python/reducer.py
index 1fe2f28..e1ce1be 100644
--- a/mr/python/reducer.py
+++ b/mr/python/reducer.py
@@ -1,31 +1,43 @@
-from operator import itemgetter
-import sys
-
-# Lista do przechowywania wyników
-results = []
-
-# input comes from STDIN
-for line in sys.stdin:
-    # remove leading and trailing whitespace
-    line = line.strip()
-
-    # parse the input we got from mapper.py
-    word, count = line.split('\t', 1)
-
-    # convert count (currently a string) to int
-    try:
-        count = int(count)
-    except ValueError:
-        # count was not a number, so silently
-        # ignore/discard this line
-        continue
-
-    # Dodaj słowo i jego długość do listy wyników
-    results.append((word, len(word), count))
-
-# Posortuj wyniki po długości słowa
-results.sort(key=lambda x: x[1])
-
-# Wypisz posortowane wyniki
-for result in results:
-    print('%s\t%s' % (result[0], result[2]))
+#!/usr/bin/env python
+"""Reducer: sum the tab-separated counts of each run of equal words on STDIN."""
+from operator import itemgetter
+import sys
+
+current_word = None
+current_count = 0
+
+# input comes from STDIN
+for line in sys.stdin:
+    # remove leading and trailing whitespace
+    line = line.strip()
+
+    # parse the input we got from mapper.py; a line without a tab
+    # separator (e.g. a blank line) is malformed, so skip it
+    word, sep, count = line.partition('\t')
+    if not sep:
+        continue
+
+    # convert count (currently a string) to int
+    try:
+        count = int(count)
+    except ValueError:
+        # count was not a number, so silently
+        # ignore/discard this line
+        continue
+
+    # this IF-switch only works because Hadoop sorts map output
+    # by key (here: word) before it is passed to the reducer
+    if current_word == word:
+        current_count += count
+    else:
+        if current_word is not None:
+            # write result to STDOUT
+            print('%s\t%s' % (current_word, current_count))
+        current_count = count
+        current_word = word
+
+# output the last word, if any line was accepted; testing for None
+# (not comparing with the last parsed word) keeps empty input from
+# printing "None\t0" and a trailing bad line from dropping the group
+if current_word is not None:
+    print('%s\t%s' % (current_word, current_count))