diff --git a/.DS_Store b/.DS_Store index 202195e..0d6c775 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/lab/LAB_01.md b/lab/LAB_01.md index a79f2f3..0ffde06 100644 --- a/lab/LAB_01.md +++ b/lab/LAB_01.md @@ -2,7 +2,7 @@ Do wykonania ćwiczeń należy skopiować repozytorium: ```shell -git clone https://git.wmi.amu.edu.pl/bigdata/apache_hadoop +git clone https://git.wmi.amu.edu.pl/s1201683/hadoop_zaliczenie ``` Celem ćwiczenia jest zaprezentowanie aplikacji w oparciu o algorytm MapReduce z wykorzystaniem: @@ -18,7 +18,7 @@ WordCount jest „odpowiednikiem Hello World” w świecie Big Data. Ćwiczenie Aby wykonać ćwiczenia, należy skopiować folder _books_ do systemu HDFS: ``` hdfs dfs -mkdir tmp -hdfs dfs -copyFromLocal ~/apache_hadoop/mr/books tmp/books +hdfs dfs -copyFromLocal ~/hadoop_zaliczenie/mr/books tmp/books ``` ## 1.WordCount – Hadoop Streaming Hadoop streaming umożliwia użytkownikom wykorzystanie mappera i reducera napisanego w dowolnym języku programowania. Jedynym wymaganiem jest obecność interpretera na każdym z węzłów. diff --git a/mr/.DS_Store b/mr/.DS_Store index 0f95520..5117ba3 100644 Binary files a/mr/.DS_Store and b/mr/.DS_Store differ diff --git a/mr/python/.DS_Store b/mr/python/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/mr/python/.DS_Store differ diff --git a/mr/python/mapper.py b/mr/python/mapper.py index 77d44f4..8668ac8 100644 --- a/mr/python/mapper.py +++ b/mr/python/mapper.py @@ -2,13 +2,17 @@ import sys import re - +# input comes from STDIN (standard input) for line in sys.stdin: - + # remove leading and trailing whitespace line = line.strip() - words = re.findall(r'\b\w+\b', line) - + # split the line into words + words = re.findall(r'\b\w+\b', line) # using regex to find words + # increase counters for word in words: - + # apply regex to remove non-alphanumeric characters and convert to lowercase word = re.sub(r'[^a-zA-Z0-9]', '', word).lower() + # write the results to STDOUT (standard output); + # what we output here will be the input for the + # Reduce step, i.e. the input for reducer.py print('%s\t%s' % (word, 1)) diff --git a/mr/python/reducer.py b/mr/python/reducer.py index 6bf6ae3..e1ce1be 100644 --- a/mr/python/reducer.py +++ b/mr/python/reducer.py @@ -1,4 +1,4 @@ - +#!/usr/bin/env python from operator import itemgetter import sys