revert d881302e4e

revert update
2024-04-05 15:42:49 +02:00 · 2024-04-05 15:42:49 +02:00 · 0aa3d7941a
parent 20cbaa1fa7
commit 0aa3d7941a
1 changed file with 38 additions and 31 deletions
--- a/mr/python/reducer.py
+++ b/mr/python/reducer.py
@@ -1,31 +1,38 @@
-from operator import itemgetter
+#!/usr/bin/env python
-import sys
+from operator import itemgetter
-
+import sys
-# Lista do przechowywania wyników
+
-results = []
+current_word = None
-
+current_count = 0
-# input comes from STDIN
+word = None
-for line in sys.stdin:
+
-    # remove leading and trailing whitespace
+# input comes from STDIN
-    line = line.strip()
+for line in sys.stdin:
-
+    # remove leading and trailing whitespace
-    # parse the input we got from mapper.py
+    line = line.strip()
-    word, count = line.split('\t', 1)
+
-
+    # parse the input we got from mapper.py
-    # convert count (currently a string) to int
+    word, count = line.split('\t', 1)
-    try:
+
-        count = int(count)
+    # convert count (currently a string) to int
-    except ValueError:
+    try:
-        # count was not a number, so silently
+        count = int(count)
-        # ignore/discard this line
+    except ValueError:
-        continue
+        # count was not a number, so silently
-
+        # ignore/discard this line
-    # Dodaj słowo i jego długość do listy wyników
+        continue
-    results.append((word, len(word), count))
+
-
+    # this IF-switch only works because Hadoop sorts map output
-# Posortuj wyniki po długości słowa
+    # by key (here: word) before it is passed to the reducer
-results.sort(key=lambda x: x[1])
+    if current_word == word:
-
+        current_count += count
-# Wypisz posortowane wyniki
+    else:
-for result in results:
+        if current_word:
-    print('%s\t%s' % (result[0], result[2]))
+            # write result to STDOUT
+            print('%s\t%s' % (current_word, current_count))
+        current_count = count
+        current_word = word
+
+# do not forget to output the last word if needed!
+if current_word == word:
+    print('%s\t%s' % (current_word, current_count))