Materiały na zajęcia
This commit is contained in:
parent
7d53b118d7
commit
08662c4f24
@ -1,17 +0,0 @@
|
||||
// Glue Data Catalog database for the data lake. Its name is assembled from
// the environment, account number and student identifiers.
resource "aws_glue_catalog_database" "datalake_db_raw_zone" {
  name = join("_", [
    "datalake",
    var.environment,
    var.account_number,
    var.student_initials,
    var.student_index_no,
  ])
}
|
||||
|
||||
|
||||
// Glue crawler that scans the raw-zone stock data prefix of the data lake
// bucket and writes the tables it creates (prefixed "crawler_") into the
// raw-zone catalog database.
resource "aws_glue_crawler" "glue_crawler_raw_zone" {
  name = join("-", [
    "gc-raw",
    var.environment,
    var.account_number,
    var.student_initials,
    var.student_index_no,
  ])

  role          = aws_iam_role.glue_crawler_role.arn
  database_name = aws_glue_catalog_database.datalake_db_raw_zone.name
  table_prefix  = "crawler_"

  s3_target {
    path = "s3://${aws_s3_bucket.main_dl_bucket.bucket}/raw-zone/stockdata/"
  }

  // Only the shared tags for now; the empty map is the place for
  // resource-specific additions.
  tags = merge(local.common_tags, {})
}
|
@ -1,171 +0,0 @@
|
||||
// IAM role that the Kinesis Data Firehose service is allowed to assume.
resource "aws_iam_role" "firehose_stream_role" {
  name = "firehose-role-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"

  // jsonencode() keeps the trust policy as structured data, so malformed
  // JSON cannot be introduced by a stray comma or quote in a heredoc.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid    = ""
        Effect = "Allow"
        Action = "sts:AssumeRole"
        Principal = {
          Service = "firehose.amazonaws.com"
        }
      }
    ]
  })
}
|
||||
|
||||
// Inline policy for the Firehose role: read from the source Kinesis stream,
// write to the data lake bucket, and emit Firehose log events.
resource "aws_iam_role_policy" "firehose_stream_policy" {
  name = "firehose-stream-policy-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
  role = aws_iam_role.firehose_stream_role.id

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        // Previously "kinesis:*" on "*". Firehose only reads from its source
        // stream, so grant just the read actions on that one stream.
        Effect = "Allow"
        Action = [
          "kinesis:DescribeStream",
          "kinesis:GetShardIterator",
          "kinesis:GetRecords",
          "kinesis:ListShards",
        ]
        Resource = aws_kinesis_stream.cryptostock_stream.arn
      },
      {
        Effect = "Allow"
        Action = [
          "s3:AbortMultipartUpload",
          "s3:GetBucketLocation",
          "s3:GetObject",
          "s3:ListBucket",
          "s3:ListBucketMultipartUploads",
          "s3:PutObject",
        ]
        // Bucket ARN for the bucket-level actions, "/*" for the object-level ones.
        Resource = [
          aws_s3_bucket.main_dl_bucket.arn,
          "${aws_s3_bucket.main_dl_bucket.arn}/*",
        ]
      },
      {
        Sid    = ""
        Effect = "Allow"
        Action = [
          "logs:PutLogEvents",
        ]
        Resource = [
          "arn:aws:logs:${var.region}:${var.account_number}:log-group:/aws/kinesisfirehose/*",
        ]
      },
    ]
  })
}
|
||||
|
||||
|
||||
// Role & policies for Glue Crawler
|
||||
// IAM role that the AWS Glue service is allowed to assume; used by the
// raw-zone crawler.
resource "aws_iam_role" "glue_crawler_role" {
  name = "crawler-role-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"

  // jsonencode() keeps the trust policy as structured data instead of a
  // hand-written JSON heredoc.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Action = "sts:AssumeRole"
        Principal = {
          Service = "glue.amazonaws.com"
        }
      }
    ]
  })
}
|
||||
|
||||
|
||||
// Looks up the AWS-managed AWSGlueServiceRole policy by its ARN so it can be
// attached to the crawler role.
data "aws_iam_policy" "glue_service_policy" {
  arn = "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole"
}
|
||||
|
||||
// Inline policy letting the crawler role read and write objects in the data
// lake bucket.
resource "aws_iam_role_policy" "glue_crawler_user_bucket_policy" {
  name = "user-bucket-policy-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
  role = aws_iam_role.glue_crawler_role.id

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Action = [
          "s3:GetObject",
          "s3:PutObject",
        ]
        // The "/" matters: "<bucket-arn>*" would also match objects in any
        // other bucket whose name merely starts with this bucket's name.
        Resource = [
          "${aws_s3_bucket.main_dl_bucket.arn}/*",
        ]
      }
    ]
  })
}
|
||||
|
||||
// Attaches the AWS-managed Glue service policy to the crawler role.
//
// aws_iam_role_policy_attachment is used instead of aws_iam_policy_attachment
// because the latter manages the policy's attachments exclusively: applying
// it detaches the policy from every other role, user and group in the
// account that is not listed in this resource.
resource "aws_iam_role_policy_attachment" "crawler_attach_managed_policy" {
  role       = aws_iam_role.glue_crawler_role.name
  policy_arn = data.aws_iam_policy.glue_service_policy.arn
}
|
||||
|
||||
|
||||
// Role and policies for Lambda
|
||||
// IAM role that the AWS Lambda service is allowed to assume; used as the
// execution role of the ETL function.
resource "aws_iam_role" "lambda_basic_role" {
  name = "lambda-basic-role-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
  tags = merge(local.common_tags, {})

  // jsonencode() keeps the trust policy as structured data instead of a
  // hand-written JSON heredoc.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid    = ""
        Effect = "Allow"
        Action = "sts:AssumeRole"
        Principal = {
          Service = "lambda.amazonaws.com"
        }
      }
    ]
  })
}
|
||||
|
||||
// Inline policy for the Lambda role: CloudWatch Logs access plus full S3
// access limited to the data lake bucket and its objects.
resource "aws_iam_role_policy" "lambda_basic_policy" {
  name = "lambda-basic-policy-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
  role = aws_iam_role.lambda_basic_role.id

  // jsonencode() replaces the hand-written JSON heredoc; the statements are
  // unchanged.
  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Action = [
          "logs:CreateLogGroup",
          "logs:CreateLogStream",
          "logs:PutLogEvents",
        ]
        Resource = "*"
      },
      {
        Effect = "Allow"
        Action = "s3:*"
        // Bucket ARN for bucket-level actions, "/*" for object-level actions.
        Resource = [
          aws_s3_bucket.main_dl_bucket.arn,
          "${aws_s3_bucket.main_dl_bucket.arn}/*",
        ]
      },
    ]
  })
}
|
||||
|
@ -1,14 +0,0 @@
|
||||
// Single-shard Kinesis data stream; it is the source of the Firehose
// delivery stream.
resource "aws_kinesis_stream" "cryptostock_stream" {
  name = join("-", [
    "cryptostock",
    var.environment,
    var.account_number,
    var.student_initials,
    var.student_index_no,
  ])

  shard_count               = 1
  enforce_consumer_deletion = true

  // Per-shard metrics for traffic in both directions, by bytes and by records.
  shard_level_metrics = [
    "IncomingBytes",
    "OutgoingBytes",
    "IncomingRecords",
    "OutgoingRecords",
  ]

  tags = merge(local.common_tags, {})
}
|
@ -1,18 +0,0 @@
|
||||
// Firehose delivery stream that reads from the Kinesis stream and writes
// into the data lake bucket under raw-zone/stockdata/, partitioned by
// year/month/day/hour of the Firehose timestamp.
resource "aws_kinesis_firehose_delivery_stream" "stock_delivery_stream" {
  name = join("-", [
    "firehose",
    var.environment,
    var.account_number,
    var.student_initials,
    var.student_index_no,
  ])

  destination = "extended_s3"

  kinesis_source_configuration {
    role_arn           = aws_iam_role.firehose_stream_role.arn
    kinesis_stream_arn = aws_kinesis_stream.cryptostock_stream.arn
  }

  extended_s3_configuration {
    bucket_arn = aws_s3_bucket.main_dl_bucket.arn
    role_arn   = aws_iam_role.firehose_stream_role.arn

    buffer_size     = 1
    buffer_interval = 60

    // "!{...}" placeholders are expanded by Firehose, not by Terraform.
    prefix              = "raw-zone/stockdata/year=!{timestamp:yyyy}/month=!{timestamp:MM}/day=!{timestamp:dd}/hour=!{timestamp:HH}/"
    error_output_prefix = "raw-zone/stockdata_errors/!{firehose:error-output-type}/year=!{timestamp:yyyy}/month=!{timestamp:MM}/day=!{timestamp:dd}/hour=!{timestamp:HH}/"
  }
}
|
@ -1,50 +0,0 @@
|
||||
// Lambda layer built from the local awswrangler zip, for the python3.8
// runtime.
resource "aws_lambda_layer_version" "aws_wrangler" {
  layer_name = join("_", [
    "aws_wrangler",
    var.environment,
    var.account_number,
    var.student_initials,
    var.student_index_no,
  ])

  compatible_runtimes = ["python3.8"]

  // The hash is computed from the same archive that is uploaded.
  filename         = "../lambda/awswrangler-layer-2.7.0-py3.8.zip"
  source_code_hash = filebase64sha256("../lambda/awswrangler-layer-2.7.0-py3.8.zip")
}
|
||||
|
||||
|
||||
// ETL Lambda function packaged from the local lambda_definition.zip and run
// with the awswrangler layer attached.
resource "aws_lambda_function" "etl_post_processing" {
  function_name = join("-", [
    "etl-post-processing",
    var.environment,
    var.account_number,
    var.student_initials,
    var.student_index_no,
  ])

  role    = aws_iam_role.lambda_basic_role.arn
  runtime = "python3.8"
  handler = "lambda_definition.etl_function"
  layers  = [aws_lambda_layer_version.aws_wrangler.arn]

  // The hash is computed from the same archive that is uploaded.
  filename         = "../lambda/lambda_definition.zip"
  source_code_hash = filebase64sha256("../lambda/lambda_definition.zip")

  timeout     = 300
  memory_size = 512
}
|
||||
|
||||
|
||||
// Lets the S3 service invoke the ETL function for events coming from the
// data lake bucket.
resource "aws_lambda_permission" "allow_bucket" {
  statement_id  = "AllowExecutionFromS3Bucket"
  principal     = "s3.amazonaws.com"
  action        = "lambda:InvokeFunction"
  source_arn    = aws_s3_bucket.main_dl_bucket.arn
  function_name = aws_lambda_function.etl_post_processing.arn
}
|
||||
|
||||
|
||||
// Invokes the ETL function whenever an object is created under raw-zone/ in
// the data lake bucket.
resource "aws_s3_bucket_notification" "trigger_etl_lambda" {
  bucket = aws_s3_bucket.main_dl_bucket.id

  // The invoke permission is declared as an explicit dependency so it is
  // created before this notification.
  depends_on = [aws_lambda_permission.allow_bucket]

  lambda_function {
    lambda_function_arn = aws_lambda_function.etl_post_processing.arn
    filter_prefix       = "raw-zone/"
    events              = ["s3:ObjectCreated:*"]
  }
}
|
||||
|
||||
|
||||
|
@ -1,7 +0,0 @@
|
||||
// Tags shared by the resources that reference local.common_tags.
locals {
  common_tags = {
    Owner       = var.student_full_name
    Environment = "DEV"
    Purpose     = "UAM Cloud Data Processing"
  }
}
|
@ -1,75 +0,0 @@
|
||||
-- Lab 3.3
-- For every hour, symbol and transaction type, return the transaction(s)
-- with the highest dollar_amount.
-- RANK() gives 1 to every row tied for the top amount, so one group can
-- produce more than one row.
WITH ranked_transactions AS (
    SELECT
        date_format(from_unixtime(transaction_ts), '%Y-%m-%dT%H') AS HourlyBucket,
        RANK() OVER (
            PARTITION BY
                date_format(from_unixtime(transaction_ts), '%Y-%m-%dT%H'),
                symbol,
                type
            ORDER BY dollar_amount DESC
        ) AS rnk,
        *
    FROM "datalake_dev_100603781557_jk_12345"."crawler_stockdata"
)
SELECT *
FROM ranked_transactions
WHERE rnk = 1
-- NOTE(review): position 1 is HourlyBucket; positions 4 and 8 index into the
-- columns expanded from "*", so they depend on the table's column order.
-- Confirm the schema and replace them with column names.
ORDER BY 1, 4, 8;
|
||||
|
||||
-- Lab 4.2
-- Register the Parquet files under processed-zone/stockdata/ as an external
-- table partitioned by symbol/year/month/day/hour, then load the partitions
-- that already exist at that location.
-- Partition columns use "int": Athena DDL expects int, while "integer" is
-- the spelling used in DML queries.
CREATE EXTERNAL TABLE IF NOT EXISTS processed_stockdata (
    transaction_date timestamp,
    price            double,
    amount           double,
    dollar_amount    double,
    type             string,
    trans_id         bigint
)
PARTITIONED BY (
    symbol string,
    year   int,
    month  int,
    day    int,
    hour   int
)
ROW FORMAT SERDE
    'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
    'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
    'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
    's3://datalake-dev-100603781557-jk-12345/processed-zone/stockdata/';

MSCK REPAIR TABLE processed_stockdata;
|
||||
|
||||
-- LAB 5
|
||||
|
||||
-- .----------. .----------. .----------.
|
||||
-- | SOURCE | | INSERT | | DESTIN. |
|
||||
-- Source-->| STREAM |-->| & SELECT |-->| STREAM |-->Destination
|
||||
-- | | | (PUMP) | | |
|
||||
-- '----------' '----------' '----------'
|
||||
|
||||
|
||||
-- Output stream: one row per flagged transaction, together with the window
-- statistics calculated by the pump that inserts into it.
CREATE OR REPLACE STREAM "DESTINATION_SQL_STREAM" (
    "symbol"           VARCHAR(10),
    "type"             VARCHAR(10),
    "trans_id"         BIGINT,
    "dollar_amount"    DOUBLE,
    "AvgLast30seconds" DOUBLE,
    "CntLast30seconds" INT,
    "SumLast30rows"    DOUBLE,
    "CntLast30rows"    INT,
    "max_tran_id"      BIGINT
);
|
||||
|
||||
-- Pump: forwards BTC_USD transactions whose dollar_amount is more than four
-- times the 30-second average for the same symbol and type.
CREATE OR REPLACE PUMP "STREAM_PUMP" AS
INSERT INTO "DESTINATION_SQL_STREAM"
SELECT STREAM
    "symbol",
    "type",
    "trans_id",
    "dollar_amount",
    "AvgLast30seconds",
    "CntLast30seconds",
    "SumLast30rows",
    "CntLast30rows",
    "max_tran_id"
FROM (
    -- Inner query: attach time-based and row-based window aggregates to
    -- every BTC_USD record.
    SELECT STREAM
        "symbol",
        "type",
        "trans_id",
        "dollar_amount",
        AVG("dollar_amount") OVER W_30_SECS AS "AvgLast30seconds",
        COUNT(*)             OVER W_30_SECS AS "CntLast30seconds",
        SUM("dollar_amount") OVER W_30_ROWS AS "SumLast30rows",
        COUNT(*)             OVER W_30_ROWS AS "CntLast30rows",
        MAX("trans_id")      OVER W_30_ROWS AS "max_tran_id"
    FROM "SOURCE_SQL_STREAM_001"
    WHERE "symbol" = 'BTC_USD'
    WINDOW
        W_30_SECS AS (
            PARTITION BY "symbol", "type"
            RANGE INTERVAL '30' SECOND PRECEDING
        ),
        W_30_ROWS AS (
            PARTITION BY "symbol", "type"
            ROWS 30 PRECEDING
        )
)
WHERE "dollar_amount" > 4 * ("AvgLast30seconds");
|
@ -1,6 +0,0 @@
|
||||
// Main data lake bucket. Its name is assembled from the environment, account
// number and student identifiers.
resource "aws_s3_bucket" "main_dl_bucket" {
  bucket = join("-", [
    "datalake",
    var.environment,
    var.account_number,
    var.student_initials,
    var.student_index_no,
  ])

  // Allows the bucket to be destroyed even when it still contains objects.
  force_destroy = true

  tags = merge(local.common_tags, {})
}
|
Loading…
Reference in New Issue
Block a user