Materiały na zajęcia
This commit is contained in:
parent
7d53b118d7
commit
08662c4f24
@ -1,17 +0,0 @@
|
|||||||
// Glue Data Catalog database for the data lake; the crawler below writes its tables here.
resource "aws_glue_catalog_database" "datalake_db_raw_zone" {
  // Underscore-separated: "datalake", environment, account number, student initials, index number.
  name = join("_", [
    "datalake",
    var.environment,
    var.account_number,
    var.student_initials,
    var.student_index_no,
  ])
}
|
|
||||||
|
|
||||||
|
|
||||||
// Glue crawler that scans the raw-zone stock data prefix of the data lake bucket
// and registers what it finds in the catalog database defined in this file.
resource "aws_glue_crawler" "glue_crawler_raw_zone" {
  name = join("-", [
    "gc-raw",
    var.environment,
    var.account_number,
    var.student_initials,
    var.student_index_no,
  ])
  role          = aws_iam_role.glue_crawler_role.arn
  database_name = aws_glue_catalog_database.datalake_db_raw_zone.name

  // Prefix applied to the names of the tables this crawler creates.
  table_prefix = "crawler_"

  s3_target {
    path = "s3://${aws_s3_bucket.main_dl_bucket.bucket}/raw-zone/stockdata/"
  }

  // No resource-specific tags are added on top of the shared ones.
  tags = local.common_tags
}
|
|
@ -1,171 +0,0 @@
|
|||||||
// IAM role used by the Firehose delivery stream (both for reading the source
// Kinesis stream and for writing to S3 — see the delivery stream resource).
resource "aws_iam_role" "firehose_stream_role" {
  name = join("-", [
    "firehose-role",
    var.environment,
    var.account_number,
    var.student_initials,
    var.student_index_no,
  ])

  // Trust policy: only the Firehose service principal may assume this role.
  assume_role_policy = <<EOF
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Action": "sts:AssumeRole",
      "Principal": {
        "Service": "firehose.amazonaws.com"
      },
      "Effect": "Allow",
      "Sid": ""
    }
  ]
}
EOF
}
|
|
||||||
|
|
||||||
// Inline permissions for the Firehose role.
//
// Three statements:
//   1. read access to the source Kinesis stream,
//   2. object/bucket access to the data lake bucket Firehose delivers into,
//   3. permission to write Firehose log events to CloudWatch Logs.
resource "aws_iam_role_policy" "firehose_stream_policy" {
  name = "firehose-stream-policy-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
  role = aws_iam_role.firehose_stream_role.id

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        // Previously "kinesis:*" on "*": that let the role read, modify or
        // delete ANY Kinesis stream in the account. Firehose only needs to
        // consume from its source stream, so grant the read actions on that
        // single stream.
        Effect = "Allow"
        Action = [
          "kinesis:DescribeStream",
          "kinesis:GetShardIterator",
          "kinesis:GetRecords",
          "kinesis:ListShards",
        ]
        Resource = aws_kinesis_stream.cryptostock_stream.arn
      },
      {
        Effect = "Allow"
        Action = [
          "s3:AbortMultipartUpload",
          "s3:GetBucketLocation",
          "s3:GetObject",
          "s3:ListBucket",
          "s3:ListBucketMultipartUploads",
          "s3:PutObject",
        ]
        // Bucket ARN for the bucket-level actions, "/*" for the object-level ones.
        Resource = [
          aws_s3_bucket.main_dl_bucket.arn,
          "${aws_s3_bucket.main_dl_bucket.arn}/*",
        ]
      },
      {
        Sid    = ""
        Effect = "Allow"
        Action = [
          "logs:PutLogEvents",
        ]
        Resource = [
          "arn:aws:logs:${var.region}:${var.account_number}:log-group:/aws/kinesisfirehose/*",
        ]
      },
    ]
  })
}
|
|
||||||
|
|
||||||
|
|
||||||
// Role & policies for Glue Crawler
//
// IAM role passed to the Glue crawler; its permissions come from the inline
// bucket policy and the managed-policy attachment declared further down.
resource "aws_iam_role" "glue_crawler_role" {
  name = join("-", [
    "crawler-role",
    var.environment,
    var.account_number,
    var.student_initials,
    var.student_index_no,
  ])

  // Trust policy: only the Glue service principal may assume this role.
  assume_role_policy = <<EOF
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Service": "glue.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
  ]
}
EOF
}
|
|
||||||
|
|
||||||
|
|
||||||
// Lookup of the AWS-managed AWSGlueServiceRole policy, so that its ARN can be
// referenced by the crawler's policy attachment.
data "aws_iam_policy" "glue_service_policy" {
  arn = "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole"
}
|
|
||||||
|
|
||||||
// Inline policy letting the Glue crawler role read and write objects in the
// data lake bucket.
resource "aws_iam_role_policy" "glue_crawler_user_bucket_policy" {
  name = "user-bucket-policy-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
  role = aws_iam_role.glue_crawler_role.id

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Action = [
          "s3:GetObject",
          "s3:PutObject",
        ]
        // Was "<bucket-arn>*" (no slash), which also matches objects in any
        // other bucket whose name merely STARTS with this bucket's name.
        // "<bucket-arn>/*" restricts the grant to objects of this bucket only.
        Resource = [
          "${aws_s3_bucket.main_dl_bucket.arn}/*",
        ]
      },
    ]
  })
}
|
|
||||||
|
|
||||||
// Attaches the AWS-managed Glue service policy to the crawler role.
//
// NOTE(review): per the AWS provider documentation, aws_iam_policy_attachment
// manages the attachment of the policy exclusively (account-wide); consider
// aws_iam_role_policy_attachment if other roles may use the same policy —
// confirm before changing, as it alters the resource address.
resource "aws_iam_policy_attachment" "crawler_attach_managed_policy" {
  name = join("-", [
    "crawler-managed-service",
    var.environment,
    var.account_number,
    var.student_initials,
    var.student_index_no,
  ])
  policy_arn = data.aws_iam_policy.glue_service_policy.arn
  roles      = [aws_iam_role.glue_crawler_role.name]
}
|
|
||||||
|
|
||||||
|
|
||||||
// Role and policies for Lambda
//
// Execution role of the ETL post-processing Lambda function.
resource "aws_iam_role" "lambda_basic_role" {
  name = join("-", [
    "lambda-basic-role",
    var.environment,
    var.account_number,
    var.student_initials,
    var.student_index_no,
  ])

  // Trust policy: only the Lambda service principal may assume this role.
  assume_role_policy = <<EOF
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Action": "sts:AssumeRole",
      "Principal": {
        "Service": "lambda.amazonaws.com"
      },
      "Effect": "Allow",
      "Sid": ""
    }
  ]
}
EOF

  // Only the shared tags; nothing role-specific is merged in.
  tags = local.common_tags
}
|
|
||||||
|
|
||||||
// Inline permissions for the Lambda execution role:
//   - create log groups/streams and put log events (any resource),
//   - every S3 action on the data lake bucket and its objects.
resource "aws_iam_role_policy" "lambda_basic_policy" {
  role = aws_iam_role.lambda_basic_role.id
  name = join("-", [
    "lambda-basic-policy",
    var.environment,
    var.account_number,
    var.student_initials,
    var.student_index_no,
  ])

  policy = <<EOF
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Action": [
        "logs:CreateLogGroup",
        "logs:CreateLogStream",
        "logs:PutLogEvents"
      ],
      "Effect": "Allow",
      "Resource": "*"
    },
    {
      "Effect": "Allow",
      "Action": "s3:*",
      "Resource": [
        "${aws_s3_bucket.main_dl_bucket.arn}",
        "${aws_s3_bucket.main_dl_bucket.arn}/*"
      ]
    }
  ]
}
EOF
}
|
|
||||||
|
|
@ -1,14 +0,0 @@
|
|||||||
// Kinesis data stream that is the source of the Firehose delivery stream
// declared in this file.
resource "aws_kinesis_stream" "cryptostock_stream" {
  name = join("-", [
    "cryptostock",
    var.environment,
    var.account_number,
    var.student_initials,
    var.student_index_no,
  ])

  shard_count               = 1
  enforce_consumer_deletion = true

  // Per-shard CloudWatch metrics: bytes and records, in both directions.
  shard_level_metrics = [
    "IncomingBytes",
    "OutgoingBytes",
    "IncomingRecords",
    "OutgoingRecords",
  ]

  tags = local.common_tags
}
|
|
@ -1,18 +0,0 @@
|
|||||||
// Firehose delivery stream: reads from the cryptostock Kinesis stream and
// writes the records into the raw zone of the data lake bucket.
resource "aws_kinesis_firehose_delivery_stream" "stock_delivery_stream" {
  name        = "firehose-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
  destination = "extended_s3"

  // Source: the Kinesis stream, read with the Firehose role.
  kinesis_source_configuration {
    kinesis_stream_arn = aws_kinesis_stream.cryptostock_stream.arn
    role_arn           = aws_iam_role.firehose_stream_role.arn
  }

  // Destination: the data lake bucket, written with the same role.
  extended_s3_configuration {
    role_arn        = aws_iam_role.firehose_stream_role.arn
    bucket_arn      = aws_s3_bucket.main_dl_bucket.arn
    buffer_size     = 1
    buffer_interval = 60

    // "!{...}" placeholders are passed through literally by Terraform and
    // expanded by Firehose; they produce Hive-style year=/month=/day=/hour=
    // key prefixes.
    prefix = "raw-zone/stockdata/year=!{timestamp:yyyy}/month=!{timestamp:MM}/day=!{timestamp:dd}/hour=!{timestamp:HH}/"

    // Same value as before, written as one literal instead of two string
    // literals wrapped in needless "${ ... }" interpolations.
    error_output_prefix = "raw-zone/stockdata_errors/!{firehose:error-output-type}/year=!{timestamp:yyyy}/month=!{timestamp:MM}/day=!{timestamp:dd}/hour=!{timestamp:HH}/"
  }
}
|
|
@ -1,50 +0,0 @@
|
|||||||
// Lambda layer built from the AWS Data Wrangler archive shipped next to the
// Terraform code; attached to the ETL function below.
resource "aws_lambda_layer_version" "aws_wrangler" {
  filename   = "../lambda/awswrangler-layer-2.7.0-py3.8.zip"
  layer_name = "aws_wrangler_${var.environment}_${var.account_number}_${var.student_initials}_${var.student_index_no}"

  // Hash of the archive so a changed zip produces a new layer version.
  // Called directly rather than through an interpolation-only "${...}" string,
  // matching how the Lambda function's source_code_hash is written.
  source_code_hash = filebase64sha256("../lambda/awswrangler-layer-2.7.0-py3.8.zip")

  compatible_runtimes = ["python3.8"]
}
|
|
||||||
|
|
||||||
|
|
||||||
// ETL post-processing Lambda function; invoked by the S3 bucket notification
// declared in this file.
resource "aws_lambda_function" "etl_post_processing" {
  function_name = "etl-post-processing-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
  role          = aws_iam_role.lambda_basic_role.arn

  // Deployment package and entry point (<module>.<function> inside the zip).
  filename         = "../lambda/lambda_definition.zip"
  source_code_hash = filebase64sha256("../lambda/lambda_definition.zip")
  handler          = "lambda_definition.etl_function"
  runtime          = "python3.8"

  timeout     = 300
  memory_size = 512

  // Direct reference instead of an interpolation-only "${...}" string.
  layers = [aws_lambda_layer_version.aws_wrangler.arn]
}
|
|
||||||
|
|
||||||
|
|
||||||
// Resource-based permission that lets S3 invoke the ETL function, limited to
// events originating from the data lake bucket.
resource "aws_lambda_permission" "allow_bucket" {
  statement_id = "AllowExecutionFromS3Bucket"

  // Who may do what ...
  principal = "s3.amazonaws.com"
  action    = "lambda:InvokeFunction"

  // ... on which function, and from which source.
  function_name = aws_lambda_function.etl_post_processing.arn
  source_arn    = aws_s3_bucket.main_dl_bucket.arn
}
|
|
||||||
|
|
||||||
|
|
||||||
// Bucket notification: every object created under "raw-zone/" triggers the
// ETL post-processing Lambda.
resource "aws_s3_bucket_notification" "trigger_etl_lambda" {
  // The invoke permission is listed explicitly so it is created before the
  // notification is configured.
  depends_on = [aws_lambda_permission.allow_bucket]

  bucket = aws_s3_bucket.main_dl_bucket.id

  lambda_function {
    lambda_function_arn = aws_lambda_function.etl_post_processing.arn
    filter_prefix       = "raw-zone/"
    events              = ["s3:ObjectCreated:*"]
  }
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1,7 +0,0 @@
|
|||||||
// Tags shared by the taggable resources in this configuration.
locals {
  common_tags = {
    Purpose = "UAM Cloud Data Processing"

    // Derived from var.environment (as every resource name already is)
    // instead of the hard-coded literal "DEV". For the lab value "dev" this
    // yields the same tag, "DEV".
    // NOTE(review): assumes var.environment is a short code such as "dev" — confirm.
    Environment = upper(var.environment)

    Owner = var.student_full_name
  }
}
|
|
@ -1,75 +0,0 @@
|
|||||||
-- lab 3.3
--
-- For every (hour, symbol, type) group, return the transaction(s) with the
-- highest dollar_amount. RANK() gives tied rows the same rank, so ties for
-- first place are all returned.
WITH ranked_transactions AS (
    SELECT
        -- Hour bucket rendered as 'YYYY-MM-DDTHH' from the epoch timestamp.
        date_format(from_unixtime(transaction_ts), '%Y-%m-%dT%H') AS HourlyBucket,
        RANK() OVER (
            PARTITION BY
                date_format(from_unixtime(transaction_ts), '%Y-%m-%dT%H'),
                symbol,
                type
            ORDER BY dollar_amount DESC
        ) AS rnk,
        *
    FROM "datalake_dev_100603781557_jk_12345"."crawler_stockdata"
)
SELECT *
FROM ranked_transactions
WHERE rnk = 1
-- Position 1 is HourlyBucket; positions 4 and 8 are the 2nd and 6th columns of
-- the crawled table (after HourlyBucket and rnk).
-- NOTE(review): replace the positions with column names once the crawled
-- table's column order is confirmed; they silently change if the schema does.
ORDER BY 1, 4, 8;
|
|
||||||
|
|
||||||
-- LAB 4.2
--
-- External table over the Parquet files in the processed zone of the data
-- lake. IF NOT EXISTS makes the script safe to re-run.
CREATE EXTERNAL TABLE IF NOT EXISTS processed_stockdata (
    transaction_date timestamp,
    price            double,
    amount           double,
    dollar_amount    double,
    type             string,
    trans_id         bigint
)
-- Partition columns; Athena DDL uses the type name "int"
-- ("integer" is the spelling for DML queries).
PARTITIONED BY (
    symbol string,
    year   int,
    month  int,
    day    int,
    hour   int
)
ROW FORMAT SERDE
    'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
    'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
    'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
    's3://datalake-dev-100603781557-jk-12345/processed-zone/stockdata/';

-- Register the partitions that already exist under the table location.
MSCK REPAIR TABLE processed_stockdata;
|
|
||||||
|
|
||||||
-- LAB 5
|
|
||||||
|
|
||||||
-- .----------. .----------. .----------.
|
|
||||||
-- | SOURCE | | INSERT | | DESTIN. |
|
|
||||||
-- Source-->| STREAM |-->| & SELECT |-->| STREAM |-->Destination
|
|
||||||
-- | | | (PUMP) | | |
|
|
||||||
-- '----------' '----------' '----------'
|
|
||||||
|
|
||||||
|
|
||||||
-- In-application output stream. Each row carries the transaction's own fields
-- plus the windowed aggregates computed by the pump that feeds it.
CREATE OR REPLACE STREAM "DESTINATION_SQL_STREAM" (
    "symbol"           VARCHAR(10),
    "type"             VARCHAR(10),
    "trans_id"         BIGINT,
    "dollar_amount"    DOUBLE,
    "AvgLast30seconds" DOUBLE,
    "CntLast30seconds" INT,
    "SumLast30rows"    DOUBLE,
    "CntLast30rows"    INT,
    "max_tran_id"      BIGINT
);
|
|
||||||
|
|
||||||
-- Pump that continuously copies rows from the source stream into the
-- destination stream. Only BTC_USD transactions are considered, and only those
-- whose dollar_amount exceeds four times the 30-second average for the same
-- (symbol, type) are emitted.
CREATE OR REPLACE PUMP "STREAM_PUMP" AS
INSERT INTO "DESTINATION_SQL_STREAM"
SELECT STREAM
    "symbol",
    "type",
    "trans_id",
    "dollar_amount",
    "AvgLast30seconds",
    "CntLast30seconds",
    "SumLast30rows",
    "CntLast30rows",
    "max_tran_id"
FROM (
    -- Inner query: attach the time-based and row-based window aggregates
    -- to every BTC_USD transaction.
    SELECT STREAM
        "symbol",
        "type",
        "trans_id",
        "dollar_amount",
        AVG("dollar_amount") OVER LAST_30_SECS AS "AvgLast30seconds",
        COUNT(*)             OVER LAST_30_SECS AS "CntLast30seconds",
        SUM("dollar_amount") OVER LAST_30_ROWS AS "SumLast30rows",
        COUNT(*)             OVER LAST_30_ROWS AS "CntLast30rows",
        MAX("trans_id")      OVER LAST_30_ROWS AS "max_tran_id"
    FROM "SOURCE_SQL_STREAM_001"
    WHERE "symbol" = 'BTC_USD'
    WINDOW
        -- Sliding 30-second window per (symbol, type).
        LAST_30_SECS AS (
            PARTITION BY "symbol", "type"
            RANGE INTERVAL '30' SECOND PRECEDING
        ),
        -- Row-based window per (symbol, type).
        -- NOTE(review): "ROWS 30 PRECEDING" presumably spans the current row
        -- plus the 30 before it (31 rows) — confirm against the name.
        LAST_30_ROWS AS (
            PARTITION BY "symbol", "type"
            ROWS 30 PRECEDING
        )
)
WHERE "dollar_amount" > 4 * ("AvgLast30seconds");
|
|
@ -1,6 +0,0 @@
|
|||||||
// Main data lake bucket; the Firehose delivery stream, Glue crawler and ETL
// Lambda in this configuration all reference it.
resource "aws_s3_bucket" "main_dl_bucket" {
  bucket = join("-", [
    "datalake",
    var.environment,
    var.account_number,
    var.student_initials,
    var.student_index_no,
  ])

  // Allows the bucket to be destroyed even when it still contains objects.
  force_destroy = true

  tags = local.common_tags
}
|
|
Loading…
Reference in New Issue
Block a user