Class materials

Jakub Kasprzak 2021-04-24 16:05:48 +02:00
parent 7d53b118d7
commit 08662c4f24
8 changed files with 0 additions and 358 deletions


@@ -1,17 +0,0 @@
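// Glue Data Catalog database for the raw zone of the data lake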
resource "aws_glue_catalog_database" "datalake_db_raw_zone" {
name = "datalake_${var.environment}_${var.account_number}_${var.student_initials}_${var.student_index_no}"
}
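
// Crawler that scans the raw-zone stock data in S3 and registers tables with the "crawler_" prefix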
resource "aws_glue_crawler" "glue_crawler_raw_zone" {
database_name = aws_glue_catalog_database.datalake_db_raw_zone.name
name = "gc-raw-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
role = aws_iam_role.glue_crawler_role.arn
table_prefix = "crawler_"
s3_target {
path = "s3://${aws_s3_bucket.main_dl_bucket.bucket}/raw-zone/stockdata/"
}
tags = merge(local.common_tags)
}


@@ -1,171 +0,0 @@
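// Role assumed by the Kinesis Data Firehose delivery stream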
resource "aws_iam_role" "firehose_stream_role" {
name = "firehose-role-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Action": "sts:AssumeRole",
"Principal": {
"Service": "firehose.amazonaws.com"
},
"Effect": "Allow",
"Sid": ""
}
]
}
EOF
}
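
// Allows Firehose to read from Kinesis, write objects to the data lake bucket and send logs to CloudWatch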
resource "aws_iam_role_policy" "firehose_stream_policy" {
name = "firehose-stream-policy-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
role = aws_iam_role.firehose_stream_role.id
policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": "kinesis:*",
"Resource": "*"
},
{
"Effect": "Allow",
"Action": [
"s3:AbortMultipartUpload",
"s3:GetBucketLocation",
"s3:GetObject",
"s3:ListBucket",
"s3:ListBucketMultipartUploads",
"s3:PutObject"
],
"Resource": [
"${aws_s3_bucket.main_dl_bucket.arn}",
"${aws_s3_bucket.main_dl_bucket.arn}/*"
]
},
{
"Sid": "",
"Effect": "Allow",
"Action": [
"logs:PutLogEvents"
],
"Resource": [
"arn:aws:logs:${var.region}:${var.account_number}:log-group:/aws/kinesisfirehose/*"
]
}
]
}
EOF
}
// Role & policies for Glue Crawler
resource "aws_iam_role" "glue_crawler_role" {
name = "crawler-role-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "glue.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
EOF
}
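
// AWS-managed policy with the baseline permissions for the Glue service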
data "aws_iam_policy" "glue_service_policy" {
arn = "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole"
}
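
// Grants the crawler read/write access to objects in the data lake bucket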
resource "aws_iam_role_policy" "glue_crawler_user_bucket_policy" {
name = "user-bucket-policy-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
role = aws_iam_role.glue_crawler_role.id
policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"s3:GetObject",
"s3:PutObject"
],
"Resource": [
"${aws_s3_bucket.main_dl_bucket.arn}*"
]
}
]
}
EOF
}
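
// Attach the managed Glue service policy to the crawler role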
resource "aws_iam_policy_attachment" "crawler_attach_managed_policy" {
name = "crawler-managed-service-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
roles = [
aws_iam_role.glue_crawler_role.name]
policy_arn = data.aws_iam_policy.glue_service_policy.arn
}
// Role and policies for Lambda
resource "aws_iam_role" "lambda_basic_role" {
name = "lambda-basic-role-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
tags = merge(local.common_tags)
assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Action": "sts:AssumeRole",
"Principal": {
"Service": "lambda.amazonaws.com"
},
"Effect": "Allow",
"Sid": ""
}
]
}
EOF
}
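
// CloudWatch Logs permissions plus full access to the data lake bucket for the ETL Lambda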
resource "aws_iam_role_policy" "lambda_basic_policy" {
name = "lambda-basic-policy-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
role = aws_iam_role.lambda_basic_role.id
policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Action": [
"logs:CreateLogGroup",
"logs:CreateLogStream",
"logs:PutLogEvents"
],
"Effect": "Allow",
"Resource": "*"
},
{
"Effect": "Allow",
"Action": "s3:*",
"Resource": [
"${aws_s3_bucket.main_dl_bucket.arn}",
"${aws_s3_bucket.main_dl_bucket.arn}/*"]
}
]
}
EOF
}


@@ -1,14 +0,0 @@
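// Kinesis Data Stream for incoming crypto stock transactions (single shard, shard-level metrics enabled)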
resource "aws_kinesis_stream" "cryptostock_stream" {
name = "cryptostock-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
shard_count = 1
enforce_consumer_deletion = true
shard_level_metrics = [
"IncomingBytes",
"OutgoingBytes",
"IncomingRecords",
"OutgoingRecords"
]
tags = merge(local.common_tags)
}


@@ -1,18 +0,0 @@
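// Firehose delivery stream that drains the Kinesis stream into the raw zone of the data lake, partitioned by ingestion timestamp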
resource "aws_kinesis_firehose_delivery_stream" "stock_delivery_stream" {
name = "firehose-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
destination = "extended_s3"
kinesis_source_configuration {
kinesis_stream_arn = aws_kinesis_stream.cryptostock_stream.arn
role_arn = aws_iam_role.firehose_stream_role.arn
}
extended_s3_configuration {
role_arn = aws_iam_role.firehose_stream_role.arn
bucket_arn = aws_s3_bucket.main_dl_bucket.arn
buffer_size = 1
buffer_interval = 60
prefix = "raw-zone/stockdata/year=!{timestamp:yyyy}/month=!{timestamp:MM}/day=!{timestamp:dd}/hour=!{timestamp:HH}/"
error_output_prefix = "raw-zone/stockdata_errors/!{firehose:error-output-type}/year=!{timestamp:yyyy}/month=!{timestamp:MM}/day=!{timestamp:dd}/hour=!{timestamp:HH}/"
}
}


@@ -1,50 +0,0 @@
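// Lambda layer packaging AWS Data Wrangler 2.7.0 for Python 3.8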
resource "aws_lambda_layer_version" "aws_wrangler" {
filename = "../lambda/awswrangler-layer-2.7.0-py3.8.zip"
layer_name = "aws_wrangler_${var.environment}_${var.account_number}_${var.student_initials}_${var.student_index_no}"
source_code_hash = filebase64sha256("../lambda/awswrangler-layer-2.7.0-py3.8.zip")
compatible_runtimes = ["python3.8"]
}
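
// Post-processing ETL Lambda (Python 3.8, 512 MB, 5-minute timeout) built from lambda_definition.zip and using the AWS Data Wrangler layer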
resource "aws_lambda_function" "etl_post_processing" {
function_name = "etl-post-processing-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
filename = "../lambda/lambda_definition.zip"
handler = "lambda_definition.etl_function"
runtime = "python3.8"
role = aws_iam_role.lambda_basic_role.arn
timeout = 300
memory_size = 512
source_code_hash = filebase64sha256("../lambda/lambda_definition.zip")
layers = [aws_lambda_layer_version.aws_wrangler.arn]
}
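
// Allow S3 to invoke the ETL Lambda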
resource "aws_lambda_permission" "allow_bucket" {
statement_id = "AllowExecutionFromS3Bucket"
action = "lambda:InvokeFunction"
function_name = aws_lambda_function.etl_post_processing.arn
principal = "s3.amazonaws.com"
source_arn = aws_s3_bucket.main_dl_bucket.arn
}
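
// Invoke the Lambda for every object created under the raw-zone/ prefix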
resource "aws_s3_bucket_notification" "trigger_etl_lambda" {
bucket = aws_s3_bucket.main_dl_bucket.id
lambda_function {
lambda_function_arn = aws_lambda_function.etl_post_processing.arn
events = ["s3:ObjectCreated:*"]
filter_prefix = "raw-zone/"
}
depends_on = [aws_lambda_permission.allow_bucket]
}


@@ -1,7 +0,0 @@
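// Common tags shared by the resources in this stack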
locals {
common_tags = {
Purpose = "UAM Cloud Data Processing"
Environment = "DEV"
Owner = var.student_full_name
}
}


@@ -1,75 +0,0 @@
-- lab 3.3
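-- For each hourly bucket, symbol and transaction type, return the transaction with the highest dollar amount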
WITH CTE AS
(
SELECT date_format(from_unixtime(transaction_ts),'%Y-%m-%dT%H') as HourlyBucket,
RANK() OVER(PARTITION BY date_format(from_unixtime(transaction_ts),'%Y-%m-%dT%H'), symbol ,type ORDER BY dollar_amount DESC) as rnk, *
FROM "datalake_dev_100603781557_jk_12345"."crawler_stockdata"
)
select *
from CTE
where rnk=1
order by 1, 4, 8
-- LAB 4.2
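-- External table over the Parquet files in the processed zone; MSCK REPAIR TABLE registers the existing partitions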
CREATE EXTERNAL TABLE processed_stockdata(
transaction_date timestamp,
price double,
amount double,
dollar_amount double,
type string,
trans_id bigint)
PARTITIONED BY (
symbol string,
year int,
month int,
day int,
hour int
)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
's3://datalake-dev-100603781557-jk-12345/processed-zone/stockdata/';

MSCK REPAIR TABLE processed_stockdata;
-- LAB 5
-- .----------. .----------. .----------.
-- | SOURCE | | INSERT | | DESTIN. |
-- Source-->| STREAM |-->| & SELECT |-->| STREAM |-->Destination
-- | | | (PUMP) | | |
-- '----------' '----------' '----------'
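-- Emit BTC_USD transactions whose dollar amount exceeds 4x the average over the last 30 seconds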
CREATE OR REPLACE STREAM "DESTINATION_SQL_STREAM"
("symbol" VARCHAR(10), "type" VARCHAR(10), "trans_id" BIGINT,
"dollar_amount" DOUBLE, "AvgLast30seconds" DOUBLE, "CntLast30seconds" INT,
"SumLast30rows" DOUBLE, "CntLast30rows" INT, "max_tran_id" BIGINT );
CREATE OR REPLACE PUMP "STREAM_PUMP" AS INSERT INTO "DESTINATION_SQL_STREAM"
SELECT STREAM "symbol", "type", "trans_id", "dollar_amount", "AvgLast30seconds", "CntLast30seconds"
, "SumLast30rows", "CntLast30rows", "max_tran_id"
FROM (
SELECT STREAM "symbol", "type", "trans_id", "dollar_amount",
AVG("dollar_amount") OVER LAST_30_SECS AS "AvgLast30seconds",
COUNT(*) OVER LAST_30_SECS AS "CntLast30seconds",
SUM("dollar_amount") OVER LAST_30_ROWS AS "SumLast30rows",
COUNT(*) OVER LAST_30_ROWS AS "CntLast30rows",
MAX("trans_id") OVER LAST_30_ROWS AS "max_tran_id"
FROM "SOURCE_SQL_STREAM_001"
WHERE "symbol" = 'BTC_USD'
WINDOW
LAST_30_SECS AS (PARTITION BY "symbol", "type" RANGE INTERVAL '30' SECOND PRECEDING),
LAST_30_ROWS AS (PARTITION BY "symbol", "type" ROWS 30 PRECEDING)
)
WHERE "dollar_amount" > 4 * ("AvgLast30seconds");


@@ -1,6 +0,0 @@
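// Main data lake bucket; force_destroy lets terraform destroy remove it even when it still contains objects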
resource "aws_s3_bucket" "main_dl_bucket" {
bucket = "datalake-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
force_destroy = true
tags = merge(local.common_tags)
}