Init with first version

Jakub Kasprzak 2021-04-24 15:54:14 +02:00
commit 7d53b118d7
42 changed files with 18897 additions and 0 deletions

.gitignore vendored Normal file (+34)

@@ -0,0 +1,34 @@
.git
.idea
.vscode
*.pytest_cache*
.coverage
.env
*.iml
tests/reports*
reports
local_utils*
### Vim ###
# Swap
[._]*.s[a-v][a-z]
[._]*.sw[a-p]
[._]s[a-rt-v][a-z]
[._]ss[a-gi-z]
[._]sw[a-p]
### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride
### JupyterNotebooks ###
.ipynb_checkpoints
*/.ipynb_checkpoints/*
__pycache__
### Terraform
.terraform.lock.hcl
.terraform

README.md Normal file (+13)

@@ -0,0 +1,13 @@
# Data processing in the public cloud
## Introduction to Data Lakes
### Project structure ###
* jupyter - notebooks with the exercises
* labs - lab scripts - test data, a generator and basic starter Terraform files
* pdf - the presentation and exercise materials
* testing-stack - Docker Compose with the Kafka and Kafka Connect definitions
### Prerequisites ###
Install the environment following the instructions in:
`./pdf/LABS Setup - Przetwarzanie Danych w chmurze publicznej.pdf`

jupyter/UAM_1_avro.ipynb Normal file (+633)

@@ -0,0 +1,633 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1 Format danych AVRO\n",
"\n",
"Ćwiczenie ma na celu zademonstrowanie schematów danych AVRO, typów złożonych (mapy, listy, struktury zagnieżdżone) oraz wstęp do Glue / Athena\n",
"\n",
"## Przebieg ćwiczenia\n",
"* skonfiguruj środowisko uruchomieniowe Python (sugerowana Anaconda z Python 3)\n",
"* zainstaluj wszystkie wymagane biblioteki\n",
"\n",
"<code>\n",
"% conda create -n myenv python=3.8\n",
"% conda activate uam-datalake\n",
"% pip install -r ./datalake-uam/jupyter/requirements.txt\n",
"</code>\n",
"\n",
"* zaloguj się do konsoli AWS i stwórz Bucket testowy oraz bazę dancyh w Glue. Uzupełnij poniższy skrypt o te dane \n",
"* wygeneruj dane testowe w wybranym schemacie AVRO\n",
"* zapisz dane do plików na S3 w folderach s3:/<twoj-bukcet-name>/EventName/namespace=xxx/year=YYYY/month=MM/day=DD/version=VVV\n",
"* zarejestruj tabele w Glue z wykorzystaniem BOTO3 / crawler (poprzez konsole AWS GUI - przeglądarkę)\n",
"* skonfiguruj domyślną WorkGroup w Athena (PRIMARY) - konieczne wskazanie miejsce docelowego dla danych z zapytań (S3 location Athena) https://docs.aws.amazon.com/athena/latest/ug/getting-started.html\n",
"* sprawdź definicję tabeli i upewnij się że są zarejestrowane partycje (użyj polecenia MSCK REPAIR w Athena (LOAD PARTITIONS)\n",
"* sprawdź ile danych jest w tabeli (select count(*) from table) - Data Scanned in bytes\n",
"* odpytaj tabele z wykorzystaniem predykatu day=1 (partition elimination) - zweryfikuj ilość danych przeskanowanych (do porównania z ćwiczeniem 2 - parquet)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import boto3\n",
"\n",
"REGION = \"us-east-1\"\n",
"\n",
"\n",
"session_kwargs = {\n",
"\n",
" \"aws_access_key_id\":\"\",\n",
" \"aws_secret_access_key\":\"\",\n",
" \"aws_session_token\":\"\",\n",
" \"region_name\": REGION\n",
"}\n",
" \n",
"session = boto3.Session(**session_kwargs)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"from faker import Faker\n",
"from botocore.exceptions import ClientError\n",
"from avro.datafile import DataFileReader, DataFileWriter\n",
"from avro.io import DatumReader, DatumWriter\n",
"import time \n",
"import io\n",
"import datetime\n",
"from avro.schema import Parse\n",
"\n",
"\n",
"fake = Faker()\n",
"fake.seed_instance(4321)\n",
" \n",
"S3_BUCKET = \"datalake-dev-920628590621-us-east-1\"\n",
"\n",
"TEST_DB = 'datalake_dev_jk'\n",
"TEST_TABLE_NAME = 'avro_uam_test'\n",
"EVENT_NAME = \"UamTestEvent\"\n",
"\n",
"\n",
"s3_client = session.client(\"s3\")\n",
"glue_client = session.client(\"glue\")\n",
"\n",
"\n",
"def tear_down_test_db(database=TEST_DB):\n",
" db_names = [x[\"Name\"] for x in glue_client.get_databases()[\"DatabaseList\"] ]\n",
" if database in db_names:\n",
" glue_client.delete_database(Name=database)\n",
" print(\"{} deleted\".format(database))\n",
"\n",
" response_create_db = glue_client.create_database(DatabaseInput={'Name': database }) \n",
" print(\"%s db recreated\" % database)\n",
"\n",
"def tear_down_test_table(database=TEST_DB, table_name=TEST_TABLE_NAME):\n",
" tbl_list = [x[\"Name\"] for x in glue_client.get_tables(DatabaseName=database)[\"TableList\"]]\n",
" if table_name in tbl_list:\n",
" glue_client.delete_table(DatabaseName=database,Name=table_name)\n",
" print(\"test table {} deleted\".format(table_name))\n",
" else:\n",
" print(\"tbl %s not found\" % table_name)\n",
" \n",
"def tear_down_s3(bucket=S3_BUCKET,prefix=EVENT_NAME):\n",
" s3 = boto3.resource('s3',**session_kwargs)\n",
" bucket = s3.Bucket(bucket)\n",
" bucket.objects.filter(Prefix=prefix).delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. AVRO schema"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"UamTestEvent com.uam.datalake.v1 1.0.2\n"
]
},
{
"data": {
"text/plain": [
"{'mox-meta': {'version': '1.0.2', 'type': 'ENTITY_SNAPSHOT'},\n",
" 'namespace': 'com.uam.datalake.v1',\n",
" 'type': 'record',\n",
" 'name': 'UamTestEvent',\n",
" 'fields': [{'name': 'customerId',\n",
" 'type': {'type': 'string', 'avro.java.string': 'String'}},\n",
" {'name': 'isActive',\n",
" 'type': 'boolean',\n",
" 'doc': 'a boolean flag if the Customer is active'},\n",
" {'name': 'age', 'type': 'int'},\n",
" {'name': 'balance', 'type': 'float'},\n",
" {'name': 'accountBalance_logical_dec',\n",
" 'type': {'type': 'bytes',\n",
" 'logicalType': 'decimal',\n",
" 'precision': 20,\n",
" 'scale': 4}},\n",
" {'name': 'array_of_strings',\n",
" 'type': ['null',\n",
" {'type': 'array',\n",
" 'items': {'type': 'string', 'avro.java.string': 'String'}}],\n",
" 'default': None},\n",
" {'name': 'paymentDetails',\n",
" 'type': ['null',\n",
" {'type': 'record',\n",
" 'name': 'PaymentDetails',\n",
" 'fields': [{'name': 'counterPartyName',\n",
" 'type': ['null', {'type': 'string', 'avro.java.string': 'String'}],\n",
" 'default': None},\n",
" {'name': 'groupingId',\n",
" 'type': ['null', {'type': 'string', 'avro.java.string': 'String'}],\n",
" 'default': None},\n",
" {'name': 'payeeId',\n",
" 'type': ['null', {'type': 'string', 'avro.java.string': 'String'}],\n",
" 'default': None},\n",
" {'name': 'message',\n",
" 'type': ['null', {'type': 'string', 'avro.java.string': 'String'}],\n",
" 'default': None},\n",
" {'name': 'type',\n",
" 'type': {'type': 'enum',\n",
" 'name': 'PaymentType',\n",
" 'symbols': ['UNKNOWN', 'ONE', 'TWO']}},\n",
" {'name': 'otherAccountId',\n",
" 'type': ['null', {'type': 'string', 'avro.java.string': 'String'}],\n",
" 'default': None}]}],\n",
" 'default': None},\n",
" {'name': 'parameters',\n",
" 'type': ['null',\n",
" {'type': 'map',\n",
" 'avro.java.string': 'String',\n",
" 'values': {'type': 'string', 'avro.java.string': 'String'}}],\n",
" 'default': None}]}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# test Avro Schema with all important cases\n",
"\n",
"1\n",
"\n",
"schema_string = \"\"\"\n",
"{\n",
" \"mox-meta\":{\n",
" \"version\":\"1.0.2\",\n",
" \"type\":\"ENTITY_SNAPSHOT\"\n",
" },\n",
" \"namespace\":\"com.uam.datalake.v1\",\n",
" \"type\":\"record\",\n",
" \"name\":\"\",\n",
" \"fields\":[\n",
" {\n",
" \"name\":\"customerId\",\n",
" \"type\":{\n",
" \"type\":\"string\",\n",
" \"avro.java.string\":\"String\"\n",
" }\n",
" },\n",
" {\n",
" \"name\":\"isActive\",\n",
" \"type\":\"boolean\",\n",
" \"doc\":\"a boolean flag if the Customer is active\"\n",
" },\n",
" {\n",
" \"name\":\"age\",\n",
" \"type\":\"int\"\n",
" },\n",
" {\n",
" \"name\":\"balance\",\n",
" \"type\":\"float\"\n",
" },\n",
" {\n",
" \"name\":\"accountBalance_logical_dec\",\n",
" \"type\":{\n",
" \"type\":\"bytes\",\n",
" \"logicalType\":\"decimal\",\n",
" \"precision\":20,\n",
" \"scale\":4\n",
" }\n",
" },\n",
" {\n",
" \"name\":\"array_of_strings\",\n",
" \"type\":[\n",
" \"null\",\n",
" {\n",
" \"type\":\"array\",\n",
" \"items\":{\n",
" \"type\":\"string\",\n",
" \"avro.java.string\":\"String\"\n",
" }\n",
" }\n",
" ],\n",
" \"default\":null\n",
" },\n",
" {\n",
" \"name\":\"paymentDetails\",\n",
" \"type\":[\n",
" \"null\",\n",
" {\n",
" \"type\":\"record\",\n",
" \"name\":\"PaymentDetails\",\n",
" \"fields\":[\n",
" {\n",
" \"name\":\"counterPartyName\",\n",
" \"type\":[\n",
" \"null\",\n",
" {\n",
" \"type\":\"string\",\n",
" \"avro.java.string\":\"String\"\n",
" }\n",
" ],\n",
" \"default\":null\n",
" },\n",
" {\n",
" \"name\":\"groupingId\",\n",
" \"type\":[\n",
" \"null\",\n",
" {\n",
" \"type\":\"string\",\n",
" \"avro.java.string\":\"String\"\n",
" }\n",
" ],\n",
" \"default\":null\n",
" },\n",
" {\n",
" \"name\":\"payeeId\",\n",
" \"type\":[\n",
" \"null\",\n",
" {\n",
" \"type\":\"string\",\n",
" \"avro.java.string\":\"String\"\n",
" }\n",
" ],\n",
" \"default\":null\n",
" },\n",
" {\n",
" \"name\":\"message\",\n",
" \"type\":[\n",
" \"null\",\n",
" {\n",
" \"type\":\"string\",\n",
" \"avro.java.string\":\"String\"\n",
" }\n",
" ],\n",
" \"default\":null\n",
" },\n",
" {\n",
" \"name\":\"type\",\n",
" \"type\":{\n",
" \"type\":\"enum\",\n",
" \"name\":\"PaymentType\",\n",
" \"symbols\":[\n",
" \"UNKNOWN\",\n",
" \"ONE\",\n",
" \"TWO\" \n",
" ]\n",
" }\n",
" },\n",
" {\n",
" \"name\":\"otherAccountId\",\n",
" \"type\":[\n",
" \"null\",\n",
" {\n",
" \"type\":\"string\",\n",
" \"avro.java.string\":\"String\"\n",
" }\n",
" ],\n",
" \"default\":null\n",
" }\n",
" ]\n",
" }\n",
" ],\n",
" \"default\":null\n",
" },\n",
" {\n",
" \"name\":\"parameters\",\n",
" \"type\":[\n",
" \"null\",\n",
" {\n",
" \"type\":\"map\",\n",
" \"avro.java.string\":\"String\",\n",
" \"values\":{\n",
" \"type\":\"string\",\n",
" \"avro.java.string\":\"String\"\n",
" }\n",
" }\n",
" ],\n",
" \"default\":null\n",
" }\n",
" ]\n",
"}\n",
"\"\"\"\n",
"\n",
"schema = json.loads(schema_string)\n",
"schema[\"name\"] = EVENT_NAME\n",
"\n",
"RECORD_NAME = schema[\"name\"]\n",
"NAMESPACE = schema[\"namespace\"]\n",
"VERSION = schema[\"mox-meta\"][\"version\"]\n",
"\n",
"print('%s %s %s' %(RECORD_NAME,NAMESPACE,VERSION))\n",
"schema"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Generating test data"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# generate some avro data (buffer file) based on the above schema\n",
"\n",
"avro_schema = Parse(json.dumps(schema))\n",
"buf = io.BytesIO()\n",
"writer = DataFileWriter(buf, DatumWriter(), avro_schema)\n",
"\n",
"for x in range(0, 10000):\n",
"\n",
"\n",
" customer_id = fake.uuid4()\n",
" amount = fake.pydecimal(left_digits=8, right_digits=4)\n",
" amount_int = int(str(amount).replace('.', ''))\n",
"\n",
" strings_arrray = [fake.first_name() for x in range(0, fake.random.randint(1, 5))]\n",
"\n",
" paymentDetails = {'counterPartyName': customer_id,\n",
" 'groupingId': str(fake.uuid4()), 'payeeId': None, 'message': None,\n",
" 'type': 'ONE', 'otherAccountId': str(fake.uuid4())}\n",
" \n",
" randint = fake.random.randint(20, 70)\n",
" \n",
"\n",
" array_of_structs = [{\"field1\": \"one\"}, {\"field1\": \"two\"}]\n",
" customer = {\n",
" \"customerId\": customer_id,\n",
" \"isActive\": fake.random.choice([True, False]),\n",
" \"age\": randint,\n",
" \"balance\": fake.random.random() * 123,\n",
" \"accountBalance_logical_dec\": amount_int.to_bytes(amount_int.bit_length() // 8 + 1, byteorder='big',\n",
" signed=True),\n",
" \"array_of_strings\": strings_arrray,\n",
" \"paymentDetails\": paymentDetails,\n",
"\n",
" \"parameters\": {\"key1\": \"value1\", \"key2\": \"value2\"}\n",
"\n",
" }\n",
" writer.append(customer)\n",
"\n",
"\n",
"writer.flush()\n",
"raw_bytes = buf.getvalue()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"uploaded UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=1/version=1.0.2/CustData_d6f44c22-5c87-4e14-956b-ad7d985226d0.avro\n",
"uploaded UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=2/version=1.0.2/CustData_cf0edf74-e76e-458f-a1d2-e092275a719c.avro\n",
"uploaded UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=3/version=1.0.2/CustData_f9ed4ad7-ed1c-4431-b9ab-317387cdb5af.avro\n",
"uploaded UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=4/version=1.0.2/CustData_d89b1c7f-66e3-4522-a603-1630abbf24fa.avro\n",
"uploaded UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=5/version=1.0.2/CustData_92483d0c-5254-4825-9bbb-59545fc6e4dd.avro\n",
"uploaded UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=6/version=1.0.2/CustData_3473649b-97c5-4597-965b-672a11cdad73.avro\n",
"uploaded UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=7/version=1.0.2/CustData_ecc999f8-eda7-4782-b844-17809980f34c.avro\n",
"uploaded UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=8/version=1.0.2/CustData_b95fd45c-8f0d-4612-8f8b-131437895013.avro\n",
"uploaded UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=9/version=1.0.2/CustData_5a5ef6ba-1576-4450-82a0-3f6f9a10a7c8.avro\n",
"uploaded UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=10/version=1.0.2/CustData_4e5dc9b8-6eb7-4fa4-93e0-61faf874b698.avro\n"
]
}
],
"source": [
"tear_down_s3()\n",
"\n",
"for i in range(1,11):\n",
"\n",
" target_key_name = '{record_name}/namespace={ns}/year=2020/month=2/day={day}/version={ver}/CustData_{rand}.avro'.format(\n",
" record_name=RECORD_NAME,ns=NAMESPACE, day=i,ver=VERSION, rand=fake.uuid4())\n",
" try:\n",
" response = s3_client.put_object(Body=raw_bytes, Bucket=S3_BUCKET, Key=target_key_name)\n",
" print(\"uploaded %s\" % target_key_name)\n",
" except ClientError as e:\n",
" logging.error(e)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Avro reading"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=10/version=1.0.2/CustData_4e5dc9b8-6eb7-4fa4-93e0-61faf874b698.avro\n",
"{'customerId': 'cc733c92-6853-45f6-8e49-bec741188ebb', 'isActive': True, 'age': 58, 'balance': 49.34309768676758, 'accountBalance_logical_dec': b'7L\\xbc\\xff\\xf3', 'array_of_strings': ['Rebecca'], 'paymentDetails': {'counterPartyName': 'cc733c92-6853-45f6-8e49-bec741188ebb', 'groupingId': '9626bf79-2f97-4c0c-9aae-de080adab7df', 'payeeId': None, 'message': None, 'type': 'ONE', 'otherAccountId': '69261bc2-4a71-4de7-bc8b-1beb0d9320ac'}, 'parameters': {'key1': 'value1', 'key2': 'value2'}}\n"
]
}
],
"source": [
"print(target_key_name)\n",
"\n",
"obj = s3_client.get_object(Bucket=S3_BUCKET, Key=target_key_name)\n",
"record_raw = obj['Body'].read()\n",
"\n",
"\n",
"reader = DataFileReader(io.BytesIO(record_raw), DatumReader())\n",
"for line in reader:\n",
" print(line)\n",
" break\n",
"\n",
"avro_schema = reader.meta[\"avro.schema\"]\n",
"reader.close()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"b'{\"type\": \"record\", \"mox-meta\": {\"version\": \"1.0.2\", \"type\": \"ENTITY_SNAPSHOT\"}, \"name\": \"UamTestEvent\", \"namespace\": \"com.uam.datalake.v1\", \"fields\": [{\"type\": {\"type\": \"string\", \"avro.java.string\": \"String\"}, \"name\": \"customerId\"}, {\"type\": \"boolean\", \"name\": \"isActive\", \"doc\": \"a boolean flag if the Customer is active\"}, {\"type\": \"int\", \"name\": \"age\"}, {\"type\": \"float\", \"name\": \"balance\"}, {\"type\": {\"type\": \"bytes\", \"logicalType\": \"decimal\", \"precision\": 20, \"scale\": 4}, \"name\": \"accountBalance_logical_dec\"}, {\"type\": [\"null\", {\"type\": \"array\", \"items\": {\"type\": \"string\", \"avro.java.string\": \"String\"}}], \"name\": \"array_of_strings\", \"default\": null}, {\"type\": [\"null\", {\"type\": \"record\", \"name\": \"PaymentDetails\", \"namespace\": \"com.uam.datalake.v1\", \"fields\": [{\"type\": [\"null\", {\"type\": \"string\", \"avro.java.string\": \"String\"}], \"name\": \"counterPartyName\", \"default\": null}, {\"type\": [\"null\", {\"type\": \"string\", \"avro.java.string\": \"String\"}], \"name\": \"groupingId\", \"default\": null}, {\"type\": [\"null\", {\"type\": \"string\", \"avro.java.string\": \"String\"}], \"name\": \"payeeId\", \"default\": null}, {\"type\": [\"null\", {\"type\": \"string\", \"avro.java.string\": \"String\"}], \"name\": \"message\", \"default\": null}, {\"type\": {\"type\": \"enum\", \"name\": \"PaymentType\", \"namespace\": \"com.uam.datalake.v1\", \"symbols\": [\"UNKNOWN\", \"ONE\", \"TWO\"]}, \"name\": \"type\"}, {\"type\": [\"null\", {\"type\": \"string\", \"avro.java.string\": \"String\"}], \"name\": \"otherAccountId\", \"default\": null}]}], \"name\": \"paymentDetails\", \"default\": null}, {\"type\": [\"null\", {\"type\": \"map\", \"avro.java.string\": \"String\", \"values\": {\"type\": \"string\", \"avro.java.string\": \"String\"}}], \"name\": \"parameters\", \"default\": null}]}'"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"avro_schema"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tbl avro_uam_test not found\n"
]
},
{
"data": {
"text/plain": [
"{'ResponseMetadata': {'RequestId': '1d5a05a1-0253-436c-a85a-d21f334d51ae',\n",
" 'HTTPStatusCode': 200,\n",
" 'HTTPHeaders': {'date': 'Sat, 24 Apr 2021 11:20:04 GMT',\n",
" 'content-type': 'application/x-amz-json-1.1',\n",
" 'content-length': '2',\n",
" 'connection': 'keep-alive',\n",
" 'x-amzn-requestid': '1d5a05a1-0253-436c-a85a-d21f334d51ae'},\n",
" 'RetryAttempts': 0}}"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#register glue table with avro SCHEMA\n",
"tear_down_test_table() # create or replace\n",
"\n",
"glue_client.create_table(\n",
" DatabaseName=TEST_DB,\n",
" TableInput={\n",
" \"Name\" : TEST_TABLE_NAME,\n",
" 'Owner': 'owner',\n",
" 'StorageDescriptor': {\n",
" 'Columns': [\n",
" {'Name': 'customerId', 'Type': 'string'},\n",
" {'Name': 'isActive', 'Type': 'boolean'},\n",
" {'Name': 'age', 'Type': 'int'}, \n",
" {'Name': 'balance', 'Type': 'float'},\n",
" {'Name': 'accountBalance_logical_dec', 'Type': 'decimal(20,4)'},\n",
" {'Name': 'array_of_strings', 'Type': 'array<string>'},\n",
" {'Name': 'paymentdetails',\n",
" 'Type': 'struct<counterpartyname:string,groupingid:string,payeeid:string,message:string,type:string,otheraccountid:string>'},\n",
" {'Name': 'parameters', 'Type': 'map<string,string>'}\n",
" ],\n",
" 'Location': 's3://{}/{}/namespace={}/'.format(S3_BUCKET,RECORD_NAME,NAMESPACE),\n",
" 'InputFormat': 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat',\n",
" 'OutputFormat': 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat',\n",
" 'Compressed': False,\n",
" 'NumberOfBuckets': -1,\n",
" 'SerdeInfo': {\n",
" 'SerializationLibrary': 'org.apache.hadoop.hive.serde2.avro.AvroSerDe',\n",
" 'Parameters': \n",
" {\n",
" 'avro.schema.literal': json.dumps(schema),\n",
" 'serialization.format': '1'\n",
" }\n",
" },\n",
" 'BucketColumns': [],\n",
" 'SortColumns': [],\n",
" },\n",
" 'PartitionKeys': [\n",
" {'Name': 'year','Type': 'int'},\n",
" {'Name': 'month','Type': 'int'},\n",
" {'Name': 'day','Type': 'int'},\n",
" {'Name': 'version','Type': 'string'}\n",
" ],\n",
" 'TableType': 'EXTERNAL_TABLE', \n",
" 'Parameters': {\n",
" \n",
" 'avro.schema.literal': json.dumps(schema),\n",
" 'classification': 'avro',\n",
" 'compressionType': 'none',\n",
" \n",
" }\n",
" }\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### now you can find the table in Glue Data Catalogue and query with Athena (remember about partitions)\n",
"\n",
"```\n",
"SELECT * , paymentdetails.groupingid , \"$path\"\n",
"FROM \"avro_uam_test\" as a \n",
"CROSS JOIN UNNEST(array_of_strings) as t(names)\n",
"where customerid = '0d6f913f-9364-4898-875e-d07311d1e300' and day = 1\n",
"```"
]
},
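{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of running the same queries from boto3 instead of the Athena console; it assumes the default `primary` workgroup already has an S3 query result location configured, and the exact queries and polling loop are illustrative:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"\n",
"athena_client = session.client(\"athena\")\n",
"\n",
"# register the year/month/day/version partitions first, then run a count\n",
"# restricted to a single partition (partition elimination)\n",
"queries = [\n",
"    \"MSCK REPAIR TABLE {}.{}\".format(TEST_DB, TEST_TABLE_NAME),\n",
"    \"SELECT count(*) FROM {}.{} WHERE day = 1\".format(TEST_DB, TEST_TABLE_NAME)\n",
"]\n",
"\n",
"for query in queries:\n",
"    qid = athena_client.start_query_execution(QueryString=query, WorkGroup=\"primary\")[\"QueryExecutionId\"]\n",
"    state = \"RUNNING\"\n",
"    while state in (\"QUEUED\", \"RUNNING\"):  # poll until the query finishes\n",
"        time.sleep(1)\n",
"        state = athena_client.get_query_execution(QueryExecutionId=qid)[\"QueryExecution\"][\"Status\"][\"State\"]\n",
"    stats = athena_client.get_query_execution(QueryExecutionId=qid)[\"QueryExecution\"][\"Statistics\"]\n",
"    print(query, \"->\", state, \"| data scanned in bytes:\", stats.get(\"DataScannedInBytes\"))"
]
},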
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

jupyter/UAM_2_parquet.ipynb Normal file (+606)

File diff suppressed because one or more lines are too long

@@ -0,0 +1,41 @@
### Runbook: Kafka stack + Kafka Connect
1. Start the stack
```
docker-compose -f docker-compose.yml up
```
2. Check which topics exist on the Kafka broker
```
./kafka-topics.sh --bootstrap-server localhost:9092 --list
```
3. Create a new topic: json.test.topic
```
./kafka-topics.sh --bootstrap-server localhost:9092 --topic json.test.topic --create --partitions 3 --replication-factor 1
```
4. Connect a console producer to Kafka on the json.test.topic topic
```
./kafka-console-producer.sh --broker-list localhost:9092 --topic json.test.topic
```
5. Connect a console consumer to Kafka on json.test.topic
```
./kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic json.test.topic --from-beginning --property print.timestamp=true --property print.key=true
```
6. Deploy the connectors.
First the simple connector - send the UAM_3_sink_connector_simple.json definition to http://localhost:8083/connectors/ (POST method), e.g. from Postman or with the Python sketch below.
Then write a few messages with the producer and wait until they are flushed to Minio.
7. Tail the Minio logs and check the containers:
`docker-compose logs minio -f`
`docker-compose ps`
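
As an alternative to Postman, a minimal Python sketch for deploying the connector (it assumes the `requests` package is installed and that the connector JSON file is in the current directory):
```
import json

import requests

# load the connector definition and POST it to the Kafka Connect REST API
with open("UAM_3_sink_connector_simple.json") as f:
    connector = json.load(f)

resp = requests.post("http://localhost:8083/connectors/", json=connector)
print(resp.status_code, resp.text)
```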

@@ -0,0 +1,39 @@
{
"name": "sink-s3-bytes",
"config": {
"timestamp.extractor": "Record",
"locale": "US",
"timezone": "UTC",
"connector.class": "io.confluent.connect.s3.S3SinkConnector",
"tasks.max": "3",
"s3.region": "ap-southeast-1",
"s3.bucket.name": "dev-data-raw",
"s3.acl.canned": "bucket-owner-full-control",
"s3.part.size": "5242880",
"flush.size": "10",
"rotate.interval.ms": "3600000",
"rotate.schedule.interval.ms": "3000",
"storage.class": "io.confluent.connect.s3.storage.S3Storage",
"format.class": "io.confluent.connect.s3.format.json.JsonFormat",
"value.converter": "org.apache.kafka.connect.converters.ByteArrayConverter",
"value.converter.schemas.enable": "false",
"key.converter": "org.apache.kafka.connect.converters.ByteArrayConverter",
"key.converter.schemas.enable": "false",
"partitioner.class": "io.confluent.connect.storage.partitioner.TimeBasedPartitioner",
"path.format": "'year'=YYYY/'month'=M/'day'=d/'hour'=H",
"partition.duration.ms": "3600000",
"schema.generator.class": "io.confluent.connect.storage.hive.schema.DefaultSchemaGenerator",
"schema.compatibility": "NONE",
"name": "sink-s3-bytes",
"topics.regex": ".*",
"topics.dir": "sink-s3-bytes",
"transforms": "MakeMap, InsertMetadata",
"transforms.MakeMap.type": "org.apache.kafka.connect.transforms.HoistField$Value",
"transforms.MakeMap.field": "msg_payload",
"transforms.InsertMetadata.type": "org.apache.kafka.connect.transforms.InsertField$Value",
"transforms.InsertMetadata.partition.field": "msg_partition",
"transforms.InsertMetadata.offset.field": "msg_offset",
"transforms.InsertMetadata.timestamp.field": "msg_ts",
"store.url": "http://minio:9000"
}
}

@@ -0,0 +1,27 @@
{
"name": "sink-s3-json",
"config": {
"timestamp.extractor": "Record",
"locale": "US",
"timezone": "UTC",
"connector.class": "io.confluent.connect.s3.S3SinkConnector",
"tasks.max": "1",
"s3.region": "ap-southeast-1",
"s3.bucket.name": "dev-data-raw",
"s3.part.size": "5242880",
"flush.size": "10",
"rotate.interval.ms": "3600000",
"rotate.schedule.interval.ms": "3000",
"storage.class": "io.confluent.connect.s3.storage.S3Storage",
"format.class": "io.confluent.connect.s3.format.json.JsonFormat",
"partitioner.class": "io.confluent.connect.storage.partitioner.TimeBasedPartitioner",
"path.format": "'year'=YYYY/'month'=M/'day'=d",
"partition.duration.ms": "3600000",
"schema.generator.class": "io.confluent.connect.storage.hive.schema.DefaultSchemaGenerator",
"schema.compatibility": "NONE",
"name": "sink-s3-json",
"topics.regex": "json.*",
"store.url": "http://minio:9000",
"topics.dir": "sink-s3-json"
}
}

jupyter/UAM_4_Kinesis.ipynb Normal file (+412)

@@ -0,0 +1,412 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 4 Kinesis Data Streams & Firehose\n",
"\n",
"Przebieg ćwiczenia\n",
"* Stwórz Data Stream z wykorzystaniem boto3 / AWS console (GUI)\n",
"* wygeneruj testowe dane do streama\n",
"* odczytaj dane ze streama (ShardIterator)\n",
"* Stwórz Kinesis Firehose Stream i podepnij pod niego utworzony wcześniej Data Stream jako source. Skonfiguruj buffor size = 1Mb buffor time = 60s\n",
"* wygeneruj 10000 wiadomości i sprawdź czy dane ładowane są do S3\n",
"\n",
"## Pamiętaj aby po skończonych ćwiczeniach usunąć wszystkie obiekty\n",
"### Uwaga !!! poniższy skrypt tworzy obiekty w regionie HongKong ! Na końcu skryptu jest funkcja tear_down_all() która usuwa testowy bucket, bazę Glue i Kinesis Data Streams czyli wszystkie obiekty które były stworzone w kodzie."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import boto3\n",
"\n",
"REGION = \"ap-east-1\"\n",
"\n",
"session_kwargs = {\n",
"\n",
" \"aws_access_key_id\":\"\",\n",
" \"aws_secret_access_key\":\"\",\n",
" \"aws_session_token\":\"\",\n",
" \"region_name\": REGION\n",
"}\n",
"\n",
"session = boto3.Session(**session_kwargs)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"kinesis_client = session.client(\"kinesis\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'ResponseMetadata': {'RequestId': 'dd209b3a-57d1-b862-8a48-bc194546845a',\n",
" 'HTTPStatusCode': 200,\n",
" 'HTTPHeaders': {'x-amzn-requestid': 'dd209b3a-57d1-b862-8a48-bc194546845a',\n",
" 'x-amz-id-2': 'k78aY4x6wCDEXo6kL76yEG64tV2ct9TQxM76Bfy345CJgSaVdfDJsjlr1jzNnpRxVk2qc+G9L42xZbmYrx/mivAFMWoek7Si',\n",
" 'date': 'Sat, 20 Jun 2020 15:01:55 GMT',\n",
" 'content-type': 'application/x-amz-json-1.1',\n",
" 'content-length': '0'},\n",
" 'RetryAttempts': 0}}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"STREAM_NAME = 'uam-test'\n",
"\n",
"kinesis_client.create_stream(\n",
" StreamName=STREAM_NAME,\n",
" ShardCount=1\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['uam-test']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kinesis_client.list_streams()[\"StreamNames\"]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'ShardId': 'shardId-000000000000', 'SequenceNumber': '49608139280302835973846909376978574930250627257427034114', 'ResponseMetadata': {'RequestId': 'c4858035-fc96-5621-93ed-a71b8bbdb95f', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'c4858035-fc96-5621-93ed-a71b8bbdb95f', 'x-amz-id-2': 'Z8hEKLIaCne7YPLWuDMTiFcAt7s1HJvfENE/2Oj7ARnuzjOsVNj+QyHpuIbM0tMucp7YZxLq2M65Xy/bgyyYeI2T5eJviuYA', 'date': 'Sat, 20 Jun 2020 15:02:06 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '110'}, 'RetryAttempts': 0}}\n"
]
}
],
"source": [
"response = kinesis_client.put_record(\n",
" StreamName=STREAM_NAME,\n",
" Data=b'{\"col1\" : \"this is my test json data\"}',\n",
" PartitionKey='1' \n",
")\n",
"print(response)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'kinesis_client' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-3-f9ad17c57d0b>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mkinesis_client\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdescribe_stream\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mStreamName\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mSTREAM_NAME\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mNameError\u001b[0m: name 'kinesis_client' is not defined"
]
}
],
"source": [
"response = kinesis_client.describe_stream(StreamName=STREAM_NAME) \n",
"print(response)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'response' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-4-bfebdd8c4299>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[0mshard_ids\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[0mstream_name\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[1;32mif\u001b[0m \u001b[0mresponse\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;34m'StreamDescription'\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4\u001b[0m \u001b[0mstream_name\u001b[0m\u001b[1;33m=\u001b[0m \u001b[0mresponse\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'StreamDescription'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'StreamName'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mNameError\u001b[0m: name 'response' is not defined"
]
}
],
"source": [
"shard_ids = []\n",
"stream_name = None \n",
"if response and 'StreamDescription' in response:\n",
" stream_name= response['StreamDescription']['StreamName'] \n",
" \n",
" for shard_id in response['StreamDescription']['Shards']:\n",
" shard_id = shard_id['ShardId']\n",
" shard_iterator = kinesis_client.get_shard_iterator(StreamName=stream_name, ShardId = shard_id, ShardIteratorType=\"TRIM_HORIZON\")\n",
" shard_ids.append({'shard_id' : shard_id ,'shard_iterator' : shard_iterator['ShardIterator'] })\n",
" \n",
"shard_ids"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'AAAAAAAAAAHfJPwIwOEqHzQIjn90snM/nPs4zZARsJlXPyGlUTbvU+T5cdGvXzb54qetks+heTq/ttfFlehkcLGr27CpkPNDn2A9NHYc1w+3VjLIBmNKTLJlHnCjjFCwgqksrs1mUQVli12hZjy6wZXhGualZUI//H2BxRwKqH/Pf2Zk9S6KSbeJFDm0boV2COPqB3wZ21axe8lWXJVJAfjMgPacIU6K'"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sh = shard_iterator[\"ShardIterator\"]\n",
"sh"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"tries = 0\n",
"limit = 100\n",
"result = []\n",
"while tries < 10:\n",
" tries += 1\n",
" response_get_rec = kinesis_client.get_records(ShardIterator = sh , Limit = limit)\n",
" shard_iterator = response_get_rec['NextShardIterator']\n",
" break\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'Records': [{'SequenceNumber': '49608139280302835973846909376978574930250627257427034114',\n",
" 'ApproximateArrivalTimestamp': datetime.datetime(2020, 6, 20, 17, 2, 6, 976000, tzinfo=tzlocal()),\n",
" 'Data': b'{\"col1\" : \"this is my test json data\"}',\n",
" 'PartitionKey': '1'}],\n",
" 'NextShardIterator': 'AAAAAAAAAAEgmwgBEgaauGNF/YzN5S+FcuWOWZZMledH4BR7CLiGD9iYYL4z+eLK7NaTQTHTAlSFEYm6N6vtjdFcTl8ibGJGKnuQthZiMgCfolA1FAAoWmLHvI0slHvZx1oWLfdApD8robDWj3zX/2d4zOzj1P9xz/+Xo8/YFdCXd0ENfUNxI7MhzZUGamw09rXa8Y0sDunFpkLy7msr5vjURGjr+xrf',\n",
" 'MillisBehindLatest': 0,\n",
" 'ResponseMetadata': {'RequestId': 'e580a084-b06e-ca24-b2e8-87b6c745255a',\n",
" 'HTTPStatusCode': 200,\n",
" 'HTTPHeaders': {'x-amzn-requestid': 'e580a084-b06e-ca24-b2e8-87b6c745255a',\n",
" 'x-amz-id-2': '/zy9vbX4wzEuqd7TZ959MQcL0OzB9kPQ3TSfrtEIIpI1IvubKb7OgnxYhxiWIZpvPWbtzXO5x0r+eO6+A8wR+bn0v8khnlpL',\n",
" 'date': 'Sat, 20 Jun 2020 15:02:10 GMT',\n",
" 'content-type': 'application/x-amz-json-1.1',\n",
" 'content-length': '489'},\n",
" 'RetryAttempts': 0}}"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"response_get_rec"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['49608122819498384072835111163875039160761192199284064258']"
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[x[\"SequenceNumber\"] for x in response_get_rec[\"Records\"] ]"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'SequenceNumber': '49608122819498384072835111163875039160761192199284064258',\n",
" 'ApproximateArrivalTimestamp': datetime.datetime(2020, 6, 20, 4, 13, 11, 39000, tzinfo=tzlocal()),\n",
" 'Data': b'{\"col1\" : \"this is my test json data\"}',\n",
" 'PartitionKey': '1'}]"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"response_get_rec[\"Records\"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Utworz recznie Kinesis Firehose dla tego Stream'a dopiero pozniej wygeneruj dane testowe ponizsza petla"
]
},
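{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of creating the Firehose delivery stream from boto3 instead of the console; the IAM role ARN and bucket ARN below are placeholders that must be replaced with real values:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"firehose_client = session.client(\"firehose\")\n",
"\n",
"stream_arn = kinesis_client.describe_stream(StreamName=STREAM_NAME)[\"StreamDescription\"][\"StreamARN\"]\n",
"\n",
"# buffering hints match the exercise: flush every 1 MB or every 60 seconds\n",
"firehose_client.create_delivery_stream(\n",
"    DeliveryStreamName=\"uam-test-fh\",\n",
"    DeliveryStreamType=\"KinesisStreamAsSource\",\n",
"    KinesisStreamSourceConfiguration={\n",
"        \"KinesisStreamARN\": stream_arn,\n",
"        \"RoleARN\": \"arn:aws:iam::<account-id>:role/<firehose-role>\"  # placeholder\n",
"    },\n",
"    ExtendedS3DestinationConfiguration={\n",
"        \"BucketARN\": \"arn:aws:s3:::<your-bucket-name>\",  # placeholder\n",
"        \"RoleARN\": \"arn:aws:iam::<account-id>:role/<firehose-role>\",  # placeholder\n",
"        \"BufferingHints\": {\"SizeInMBs\": 1, \"IntervalInSeconds\": 60}\n",
"    }\n",
")"
]
},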
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-13-0e9e1b83eba0>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m10000\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m response = kinesis_client.put_record(\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mStreamName\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mSTREAM_NAME\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mData\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34mb'{\"col1\" : \"this is json data\"}'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mPartitionKey\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'1'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/opt/anaconda3/envs/uam-d/lib/python3.8/site-packages/botocore/client.py\u001b[0m in \u001b[0;36m_api_call\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 314\u001b[0m \"%s() only accepts keyword arguments.\" % py_operation_name)\n\u001b[1;32m 315\u001b[0m \u001b[0;31m# The \"self\" in this scope is referring to the BaseClient.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 316\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_api_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moperation_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 317\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 318\u001b[0m \u001b[0m_api_call\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpy_operation_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/opt/anaconda3/envs/uam-d/lib/python3.8/site-packages/botocore/client.py\u001b[0m in \u001b[0;36m_make_api_call\u001b[0;34m(self, operation_name, api_params)\u001b[0m\n\u001b[1;32m 619\u001b[0m \u001b[0mhttp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparsed_response\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mevent_response\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 620\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 621\u001b[0;31m http, parsed_response = self._make_request(\n\u001b[0m\u001b[1;32m 622\u001b[0m operation_model, request_dict, request_context)\n\u001b[1;32m 623\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/opt/anaconda3/envs/uam-d/lib/python3.8/site-packages/botocore/client.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, operation_model, request_dict, request_context)\u001b[0m\n\u001b[1;32m 639\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_make_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moperation_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest_context\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 640\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 641\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_endpoint\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmake_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moperation_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest_dict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 642\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 643\u001b[0m self.meta.events.emit(\n",
"\u001b[0;32m~/opt/anaconda3/envs/uam-d/lib/python3.8/site-packages/botocore/endpoint.py\u001b[0m in \u001b[0;36mmake_request\u001b[0;34m(self, operation_model, request_dict)\u001b[0m\n\u001b[1;32m 100\u001b[0m logger.debug(\"Making request for %s with params: %s\",\n\u001b[1;32m 101\u001b[0m operation_model, request_dict)\n\u001b[0;32m--> 102\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moperation_model\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 103\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcreate_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moperation_model\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/opt/anaconda3/envs/uam-d/lib/python3.8/site-packages/botocore/endpoint.py\u001b[0m in \u001b[0;36m_send_request\u001b[0;34m(self, request_dict, operation_model)\u001b[0m\n\u001b[1;32m 132\u001b[0m \u001b[0mrequest\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest_dict\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moperation_model\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[0mcontext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrequest_dict\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'context'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 134\u001b[0;31m success_response, exception = self._get_response(\n\u001b[0m\u001b[1;32m 135\u001b[0m request, operation_model, context)\n\u001b[1;32m 136\u001b[0m while self._needs_retry(attempts, operation_model, request_dict,\n",
"\u001b[0;32m~/opt/anaconda3/envs/uam-d/lib/python3.8/site-packages/botocore/endpoint.py\u001b[0m in \u001b[0;36m_get_response\u001b[0;34m(self, request, operation_model, context)\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[0;31m# If an exception occurs then the success_response is None.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 165\u001b[0m \u001b[0;31m# If no exception occurs then exception is None.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 166\u001b[0;31m success_response, exception = self._do_get_response(\n\u001b[0m\u001b[1;32m 167\u001b[0m request, operation_model)\n\u001b[1;32m 168\u001b[0m kwargs_to_emit = {\n",
"\u001b[0;32m~/opt/anaconda3/envs/uam-d/lib/python3.8/site-packages/botocore/endpoint.py\u001b[0m in \u001b[0;36m_do_get_response\u001b[0;34m(self, request, operation_model)\u001b[0m\n\u001b[1;32m 198\u001b[0m \u001b[0mhttp_response\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfirst_non_none_response\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponses\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 199\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhttp_response\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 200\u001b[0;31m \u001b[0mhttp_response\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 201\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mHTTPClientError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 202\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/opt/anaconda3/envs/uam-d/lib/python3.8/site-packages/botocore/endpoint.py\u001b[0m in \u001b[0;36m_send\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 267\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 268\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_send\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 269\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhttp_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 270\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 271\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/opt/anaconda3/envs/uam-d/lib/python3.8/site-packages/botocore/httpsession.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0mrequest_target\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_request_target\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mproxy_url\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 254\u001b[0;31m urllib_response = conn.urlopen(\n\u001b[0m\u001b[1;32m 255\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 256\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest_target\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/opt/anaconda3/envs/uam-d/lib/python3.8/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[1;32m 668\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 669\u001b[0m \u001b[0;31m# Make the request on the httplib connection object.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 670\u001b[0;31m httplib_response = self._make_request(\n\u001b[0m\u001b[1;32m 671\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 672\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/opt/anaconda3/envs/uam-d/lib/python3.8/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 424\u001b[0m \u001b[0;31m# Python 3 (including for exceptions like SystemExit).\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 425\u001b[0m \u001b[0;31m# Otherwise it looks like a bug in the code.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 426\u001b[0;31m \u001b[0msix\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraise_from\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 427\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mSocketTimeout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mBaseSSLError\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mSocketError\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 428\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_raise_timeout\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0merr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout_value\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mread_timeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/opt/anaconda3/envs/uam-d/lib/python3.8/site-packages/urllib3/packages/six.py\u001b[0m in \u001b[0;36mraise_from\u001b[0;34m(value, from_value)\u001b[0m\n",
"\u001b[0;32m~/opt/anaconda3/envs/uam-d/lib/python3.8/site-packages/urllib3/connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[0;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[1;32m 419\u001b[0m \u001b[0;31m# Python 3\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 420\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 421\u001b[0;31m \u001b[0mhttplib_response\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgetresponse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 422\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 423\u001b[0m \u001b[0;31m# Remove the TypeError from the exception chain in\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/opt/anaconda3/envs/uam-d/lib/python3.8/http/client.py\u001b[0m in \u001b[0;36mgetresponse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1330\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1331\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1332\u001b[0;31m \u001b[0mresponse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbegin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1333\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1334\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/opt/anaconda3/envs/uam-d/lib/python3.8/http/client.py\u001b[0m in \u001b[0;36mbegin\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 301\u001b[0m \u001b[0;31m# read until we get a non-100 response\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 302\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 303\u001b[0;31m \u001b[0mversion\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstatus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreason\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 304\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mstatus\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mCONTINUE\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 305\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/opt/anaconda3/envs/uam-d/lib/python3.8/http/client.py\u001b[0m in \u001b[0;36m_read_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 262\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 263\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_read_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 264\u001b[0;31m \u001b[0mline\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadline\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_MAXLINE\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"iso-8859-1\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 265\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0m_MAXLINE\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 266\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mLineTooLong\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"status line\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/opt/anaconda3/envs/uam-d/lib/python3.8/socket.py\u001b[0m in \u001b[0;36mreadinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 667\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 668\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 669\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 670\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 671\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_timeout_occurred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/opt/anaconda3/envs/uam-d/lib/python3.8/ssl.py\u001b[0m in \u001b[0;36mrecv_into\u001b[0;34m(self, buffer, nbytes, flags)\u001b[0m\n\u001b[1;32m 1239\u001b[0m \u001b[0;34m\"non-zero flags not allowed in calls to recv_into() on %s\"\u001b[0m \u001b[0;34m%\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1240\u001b[0m self.__class__)\n\u001b[0;32m-> 1241\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnbytes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuffer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1242\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1243\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnbytes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/opt/anaconda3/envs/uam-d/lib/python3.8/ssl.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, len, buffer)\u001b[0m\n\u001b[1;32m 1097\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1098\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mbuffer\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1099\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sslobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuffer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1100\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1101\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sslobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"for i in range(0,10000):\n",
" response = kinesis_client.put_record(\n",
" StreamName=STREAM_NAME,\n",
" Data=b'{\"col1\" : \"this is json data\"}',\n",
" PartitionKey='1' \n",
" )\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"BUCKET_NAME = 'datalake-uam'\n",
"GLUE_DB = 'uam'\n",
"KINESIS_STREAM = 'uam-test'\n",
"KINESIS_FIREHOSE = 'uam-test-fh'\n",
"\n",
"s3_client = session.client(\"s3\")\n",
"glue_client = session.client(\"glue\")\n",
"\n",
"def tear_down_all():\n",
" \n",
" s3 = boto3.resource('s3',**session_kwargs)\n",
" bucket = s3.Bucket(BUCKET_NAME)\n",
" bucket.objects.delete()\n",
" \n",
" s3_client.delete_bucket(\n",
" Bucket = BUCKET_NAME\n",
" )\n",
" \n",
" glue_client.delete_database(\n",
" Name=GLUE_DB\n",
" )\n",
" \n",
" kinesis_client.delete_stream(StreamName=STREAM_NAME)\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"tear_down_all()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -0,0 +1,262 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "alternate-pantyhose",
"metadata": {},
"source": [
"# Lab1 Czytanie Kinesis Data Streams \n",
"\n",
"Przebieg ćwiczenia\n",
"* Stwórz Data Stream \n",
"* wygeneruj testowe dane do streama\n",
"* odczytaj dane ze streama (ShardIterator)\n",
"* zwróć uwagę na iteracje po shardach i iteratorach (per shard)\n",
"* porównaj przeczytane dane z danymi wygenerowanymi (czytamy dwie iteracje - pierwsze 10 rekordów TRIM_HORIZON)\n",
"* sprawdź jakie inne opcje ustawienia punktu w shardzie są"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "gross-series",
"metadata": {},
"outputs": [],
"source": [
"import boto3\n",
"from pprint import pprint\n",
"\n",
"kinesis_client = boto3.client('kinesis')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "excited-latex",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['cryptostock-dev-100603781557-jk-12345']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"kinesis_client.list_streams()[\"StreamNames\"]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "excellent-address",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'cryptostock-dev-100603781557-jk-12345'"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"STREAM_NAME = kinesis_client.list_streams()[\"StreamNames\"][0]\n",
"STREAM_NAME"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "attended-combat",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[{'HashKeyRange': {'EndingHashKey': '340282366920938463463374607431768211455',\n",
" 'StartingHashKey': '0'},\n",
" 'SequenceNumberRange': {'StartingSequenceNumber': '49617445977150094507622122574044516561004852020651229186'},\n",
" 'ShardId': 'shardId-000000000000'}]\n"
]
}
],
"source": [
"response = kinesis_client.describe_stream(StreamName=STREAM_NAME) \n",
"pprint(response[\"StreamDescription\"][\"Shards\"])"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "together-finance",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'shard_id': 'shardId-000000000000',\n",
" 'shard_iterator': 'AAAAAAAAAAEfLA4f5f+lMhjNfHXIXsKxQeP3dg79sVKKRiT+843gRXwSQsYRXeMIS4KwdRUjPdChkE2ZZGYSG3DeghHZi41DXOE0pNSdFHnqkePkBVIX2cN/9rbedZTgX/WXfNaL+sMUfdbYV6f9iQEtTtRAYN3bXfk5jUwIBvcgB1mQDRzdT1Or150vbf3LSlLtC7XlkK7HNZoGM1t577jseZTyvJ4+yeBOV73DQnSFnL/EPQvVdm+lidZtaNe39NMak4bXx5AWmhwblwLPmXg/l2PMDx7Z'}]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"shard_ids = []\n",
"stream_name = None \n",
"if response and 'StreamDescription' in response:\n",
" stream_name= response['StreamDescription']['StreamName'] \n",
" \n",
" # reading all shards (getting shard iterators)\n",
" for shard_id in response['StreamDescription']['Shards']:\n",
" shard_id = shard_id['ShardId'] \n",
" shard_iterator = kinesis_client.get_shard_iterator(StreamName=stream_name, ShardId = shard_id, ShardIteratorType=\"TRIM_HORIZON\")\n",
" \n",
" si = shard_iterator[\"ShardIterator\"]\n",
" shard_ids.append({'shard_id' : shard_id ,'shard_iterator' : si })\n",
" \n",
"shard_ids"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "vital-bridges",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[{'ApproximateArrivalTimestamp': datetime.datetime(2021, 4, 18, 14, 18, 25, 191000, tzinfo=tzlocal()),\n",
" 'Data': b'{\"transaction_ts\": 1601510403, \"symbol\": \"ETH_USD\", \"price\": 360'\n",
" b'.03, \"amount\": 0.646, \"dollar_amount\": 232.57938, \"type\": \"buy\",'\n",
" b' \"trans_id\": 124289044}\\n',\n",
" 'PartitionKey': 'ETH_USD',\n",
" 'SequenceNumber': '49617445977150094507622122578777461144796130256147709954'},\n",
" {'ApproximateArrivalTimestamp': datetime.datetime(2021, 4, 18, 14, 18, 25, 310000, tzinfo=tzlocal()),\n",
" 'Data': b'{\"transaction_ts\": 1601510403, \"symbol\": \"BTC_USD\", \"price\": 107'\n",
" b'80.83, \"amount\": 0.035, \"dollar_amount\": 377.32905, \"type\": \"buy'\n",
" b'\", \"trans_id\": 124289043}\\n',\n",
" 'PartitionKey': 'BTC_USD',\n",
" 'SequenceNumber': '49617445977150094507622122578778670070615744885322416130'},\n",
" {'ApproximateArrivalTimestamp': datetime.datetime(2021, 4, 18, 14, 18, 25, 428000, tzinfo=tzlocal()),\n",
" 'Data': b'{\"transaction_ts\": 1601510404, \"symbol\": \"ETH_USD\", \"price\": 360'\n",
" b'.12, \"amount\": 0.523, \"dollar_amount\": 188.34276, \"type\": \"buy\",'\n",
" b' \"trans_id\": 124289045}\\n',\n",
" 'PartitionKey': 'ETH_USD',\n",
" 'SequenceNumber': '49617445977150094507622122578779878996435359514497122306'},\n",
" {'ApproximateArrivalTimestamp': datetime.datetime(2021, 4, 18, 14, 18, 25, 545000, tzinfo=tzlocal()),\n",
" 'Data': b'{\"transaction_ts\": 1601510405, \"symbol\": \"BTC_USD\", \"price\": 107'\n",
" b'84.42, \"amount\": 0.25635676, \"dollar_amount\": 2764.65897, \"type\"'\n",
" b': \"buy\", \"trans_id\": 124289050}\\n',\n",
" 'PartitionKey': 'BTC_USD',\n",
" 'SequenceNumber': '49617445977150094507622122578781087922254974143671828482'},\n",
" {'ApproximateArrivalTimestamp': datetime.datetime(2021, 4, 18, 14, 18, 25, 663000, tzinfo=tzlocal()),\n",
" 'Data': b'{\"transaction_ts\": 1601510407, \"symbol\": \"BTC_USD\", \"price\": 107'\n",
" b'84.42, \"amount\": 0.23877038, \"dollar_amount\": 2575.000061, \"type'\n",
" b'\": \"buy\", \"trans_id\": 124289051}\\n',\n",
" 'PartitionKey': 'BTC_USD',\n",
" 'SequenceNumber': '49617445977150094507622122578782296848074588772846534658'}]\n"
]
}
],
"source": [
"limit = 5\n",
"response_get_rec = kinesis_client.get_records(ShardIterator = si , Limit = limit)\n",
"next_shard_iterator = response_get_rec['NextShardIterator']\n",
"pprint(response_get_rec[\"Records\"])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "alone-martial",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"AAAAAAAAAAGUetfZYhJAUbRmLdnxgHF2gXHQ+Yt8063YzfurEZ+Vdauri9LJ13JLrPqLIrBxeHRJ1GEBctxNJ4jYeB4Um/JNu4+2L5Jfa1Apl9s9y6f/5UMZlqIAFGvUPmW53Gj6MyauM9r7EWNBUBZCvrFQkHvC9fQwNYP3eyYm1xp4K9fcjBX90qUdnGmFU69bq+3BF5I7PXgPHitcwzJev6PqPLVny2SmhSHtnRF/Rogj00Xv+DtKo1/SBdVid3tyQ0e9tm4XrgttPfPIhNsJB3j57lpM\n",
"[{'ApproximateArrivalTimestamp': datetime.datetime(2021, 4, 18, 14, 18, 25, 812000, tzinfo=tzlocal()),\n",
" 'Data': b'{\"transaction_ts\": 1601510409, \"symbol\": \"BTC_USD\", \"price\": 107'\n",
" b'84.42, \"amount\": 1.01303547, \"dollar_amount\": 10924.99998, \"type'\n",
" b'\": \"buy\", \"trans_id\": 124289054}\\n',\n",
" 'PartitionKey': 'BTC_USD',\n",
" 'SequenceNumber': '49617445977150094507622122578783505773894203402021240834'},\n",
" {'ApproximateArrivalTimestamp': datetime.datetime(2021, 4, 18, 14, 18, 25, 930000, tzinfo=tzlocal()),\n",
" 'Data': b'{\"transaction_ts\": 1601510410, \"symbol\": \"BTC_USD\", \"price\": 107'\n",
" b'84.42, \"amount\": 0.26135077, \"dollar_amount\": 2818.516471, \"type'\n",
" b'\": \"buy\", \"trans_id\": 124289055}\\n',\n",
" 'PartitionKey': 'BTC_USD',\n",
" 'SequenceNumber': '49617445977150094507622122578784714699713818031195947010'},\n",
" {'ApproximateArrivalTimestamp': datetime.datetime(2021, 4, 18, 14, 18, 26, 48000, tzinfo=tzlocal()),\n",
" 'Data': b'{\"transaction_ts\": 1601510413, \"symbol\": \"ETH_USD\", \"price\": 360'\n",
" b'.39, \"amount\": 5.55416701, \"dollar_amount\": 2001.666249, \"type\":'\n",
" b' \"buy\", \"trans_id\": 124289059}\\n',\n",
" 'PartitionKey': 'ETH_USD',\n",
" 'SequenceNumber': '49617445977150094507622122578785923625533432660370653186'},\n",
" {'ApproximateArrivalTimestamp': datetime.datetime(2021, 4, 18, 14, 18, 26, 165000, tzinfo=tzlocal()),\n",
" 'Data': b'{\"transaction_ts\": 1601510414, \"symbol\": \"ETH_USD\", \"price\": 360'\n",
" b'.6, \"amount\": 13.855, \"dollar_amount\": 4996.113, \"type\": \"buy\", '\n",
" b'\"trans_id\": 124289071}\\n',\n",
" 'PartitionKey': 'ETH_USD',\n",
" 'SequenceNumber': '49617445977150094507622122578787132551353047358264836098'},\n",
" {'ApproximateArrivalTimestamp': datetime.datetime(2021, 4, 18, 14, 18, 26, 282000, tzinfo=tzlocal()),\n",
" 'Data': b'{\"transaction_ts\": 1601510415, \"symbol\": \"ETH_USD\", \"price\": 360'\n",
" b'.24, \"amount\": 6.32869733, \"dollar_amount\": 2279.849926, \"type\":'\n",
" b' \"sell\", \"trans_id\": 124289072}\\n',\n",
" 'PartitionKey': 'ETH_USD',\n",
" 'SequenceNumber': '49617445977150094507622122578788341477172661987439542274'}]\n"
]
}
],
"source": [
"print(next_shard_iterator)\n",
"response_get_rec = kinesis_client.get_records(ShardIterator = next_shard_iterator , Limit = limit)\n",
"pprint(response_get_rec[\"Records\"])"
]
},
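  {
   "cell_type": "markdown",
   "id": "added-iterator-note",
   "metadata": {},
   "source": [
    "A hedged sketch of the remaining `ShardIteratorType` options: `LATEST` (only records arriving from now on), `AT_TIMESTAMP` (requires a `Timestamp`), and `AT_SEQUENCE_NUMBER` / `AFTER_SEQUENCE_NUMBER` (require a `StartingSequenceNumber`). The timestamp below is an example value."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "added-iterator-options",
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime\n",
    "\n",
    "# LATEST: start right after the most recent record in the shard\n",
    "si_latest = kinesis_client.get_shard_iterator(\n",
    "    StreamName=stream_name, ShardId=shard_id,\n",
    "    ShardIteratorType=\"LATEST\")[\"ShardIterator\"]\n",
    "\n",
    "# AT_TIMESTAMP: start from records written at or after a point in time\n",
    "si_ts = kinesis_client.get_shard_iterator(\n",
    "    StreamName=stream_name, ShardId=shard_id,\n",
    "    ShardIteratorType=\"AT_TIMESTAMP\",\n",
    "    Timestamp=datetime.datetime(2021, 4, 18, 14, 0))[\"ShardIterator\"]"
   ]
  },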
{
"cell_type": "code",
"execution_count": null,
"id": "analyzed-applicant",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

5
jupyter/requirements.txt Normal file
View File

@ -0,0 +1,5 @@
boto3==1.17.33
faker==6.6.2
avro-python3==1.9.1
pandavro==1.6.0
pyarrow==0.17.1

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,143 @@
#!/usr/bin/env python3
import configparser
import argparse
import csv
import time
import logging
import sys
import json
import os
import boto3
logging.basicConfig(
level=logging.INFO,
format='[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)
DEFAULT_DATA_FILE = 'crypto_trades_20201001.csv'
class KinesisProducer:
"""
Kinesis Producer
"""
    def __init__(self, speed_per_sec):
        self.client = boto3.client('kinesis')
        self.speed_per_sec = speed_per_sec  # kept for reference; actual throttling happens in produce_data
        self.max_retry_attempt = 5
def produce(self, event, key, data_stream):
"""
A simple wrapper for put record
:param event:
:param key:
:param data_stream:
:return:
"""
# adding a new line at the end to produce JSON lines
# (otherwise we would need to pre-process those records in Firehose
# invoking a Lambda to add those new lines).Every message is a dumped json with \n
tran_id = event["trans_id"]
payload = (json.dumps(event)+'\n').encode('utf-8')
        attempt = 1
        last_err = None
        while attempt <= self.max_retry_attempt:
            try:
                response = self.client.put_record(
                    StreamName=data_stream,
                    Data=payload,
                    PartitionKey=key
                )
                logger.info('Msg with trans_id={} sent to shard {} seq no {}'.format(tran_id, response["ShardId"], response["SequenceNumber"]))
                return response
            except Exception as e:
                logger.warning('Exception has occurred {}, retrying...'.format(e))
                last_err = e
                attempt += 1
                time.sleep(attempt)
        logger.error('Max attempts reached, re-raising the last error')
        # a bare `raise` here would fail outside the except block, so re-raise explicitly
        raise last_err
def prepare_event(event):
"""
    Events read from CSV have no dtypes; convert them to proper types (int / float etc.)
:param event:
:return:
"""
msg_key = event["symbol"]
msg_formatted = {
"transaction_ts": int(event["transaction_ts"]),
"symbol": event["symbol"],
"price": float(event["price"]),
"amount": float(event["amount"]),
"dollar_amount": float(event["dollar_amount"]),
"type": event["type"],
"trans_id": int(event["trans_id"]),
}
return msg_formatted, msg_key
def produce_data(kinesis_data_stream, messages_per_sec, input_file):
"""
Main method for producing
:param kinesis_data_stream: param from cmdline name of KDS
:param messages_per_sec: param from cmdline max speed per sec 1/mps
:return:
"""
kp = KinesisProducer(speed_per_sec=messages_per_sec)
with open(input_file) as csv_file:
reader = csv.DictReader(csv_file, delimiter=',')
all_rows = list(reader)
current_time = int(all_rows[0]["transaction_ts"])
replay_cnt = 1
while True:
logger.info("start replaying for the {} time".format(replay_cnt))
for row in all_rows:
new_event_time = int(row["transaction_ts"])
time_delta = new_event_time - current_time
current_time = new_event_time
if time_delta > 0 and messages_per_sec > 0:
time.sleep(time_delta / messages_per_sec)
event, key = prepare_event(row)
kp.produce(event, key, kinesis_data_stream)
replay_cnt += 1
if __name__ == "__main__":
logger.info('Starting Simple Kinesis Producer (replaying stock data)')
parser = argparse.ArgumentParser()
parser.add_argument('-k', '--kinesis_ds', dest='kinesis_ds', required=True)
parser.add_argument('-i', '--input_file', dest='input_file', required=False)
parser.add_argument('-s', '--messages_per_sec', dest='mps', type=int, default=-1, required=False)
args, unknown = parser.parse_known_args()
config = configparser.ConfigParser()
kinesis_data_stream = args.kinesis_ds
messages_per_sec = int(args.mps)
if args.input_file:
input_file = args.input_file
else:
main_path = os.path.abspath(os.path.dirname(__file__))
input_file = os.path.join(main_path, DEFAULT_DATA_FILE)
produce_data(kinesis_data_stream, messages_per_sec, input_file)
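A minimal sketch of driving the producer class directly from Python. The module filename is not shown in this diff, so the import below is an assumption, and the stream name is only an example following the repo's naming convention:

```python
# assumes the script above is saved as kinesis_producer.py
from kinesis_producer import KinesisProducer, prepare_event

row = {"transaction_ts": "1601510403", "symbol": "ETH_USD", "price": "360.03",
       "amount": "0.646", "dollar_amount": "232.57938", "type": "buy",
       "trans_id": "124289044"}  # one CSV row, all values still strings

producer = KinesisProducer(speed_per_sec=1)
event, key = prepare_event(row)  # casts the strings to int / float
producer.produce(event, key, "cryptostock-dev-920628590621-jk-12345")
```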

14
labs/labs_1.md Normal file
View File

@ -0,0 +1,14 @@
Create a new folder ./cdp-terraform in your code repository
and copy the starter files into it.
1. Create an S3 bucket
- check the resource definition: https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/s3_bucket
2. Create a Kinesis Data Stream. Use the following configuration parameters
- name: <env = dev>_<student_initials>_<index_no>, e.g. dev_jk_12345
- number of shards: 1
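
After `terraform apply`, a quick boto3 sanity check of both resources (a sketch; the names are examples following the conventions above - adjust to your own values):

```python
import boto3

BUCKET = "datalake-dev-920628590621-jk-12345"  # example name
STREAM = "dev_jk_12345"                        # example name

print(BUCKET in [b["Name"] for b in boto3.client("s3").list_buckets()["Buckets"]])
print(STREAM in boto3.client("kinesis").list_streams()["StreamNames"])
```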

67
labs/labs_preparation.md Normal file
View File

@ -0,0 +1,67 @@
## Laboratory
### Data processing in the public cloud
---
1. Prerequisites - environments (recommended: **PyCharm + Anaconda**)
* PyCharm - https://www.jetbrains.com/pycharm/download/
* Anaconda - https://www.anaconda.com/products/individual#Downloads
- a new Python 3.8 environment
(Windows users: use the Anaconda Prompt;
Linux / macOS: bash / zsh etc.)
```
conda create -n uam_cloud_dp python=3.8
conda activate uam_cloud_dp
```
* Terraform (version 0.14 at minimum)
- download the Terraform build appropriate for your OS
from https://www.terraform.io/downloads.html
- install it following https://learn.hashicorp.com/tutorials/terraform/install-cli?in=terraform/aws-get-started
- verify the installation by typing in cmdline / bash (TF version 0.14+)
```
$ terraform --version
Terraform v0.14.8
```
* Environment setup
- activate your conda env
```
conda activate uam_cloud_dp
```
- install the required Python packages
```
pip install -r <path to this repo>/labs/requirements.txt
```
- check that awscli is installed correctly
```
$ aws --version
aws-cli/1.19.33 Python/3.8.8 Windows/10 botocore/1.20.33
```
* AWS account configuration
- log in to AWS Educate - https://www.awseducate.com/signin/SiteLogin
- AWS Account -> Starter Account
- Account Details - copy the temporary credentials (Access / Secret and Token)
- if this is the first time you are configuring awscli on this machine, run the following (the Access and Secret values do not matter yet - we will edit them in a moment)
```bash
$ aws configure
AWS Access Key ID [None]: a
AWS Secret Access Key [None]: b
Default region name [None]: us-east-1
Default output format [None]:
```
- paste the copied credentials into the ~/.aws/credentials file (a quick way to confirm they work is sketched below)
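
A short check (sketch) that the pasted credentials actually work:

```python
import boto3

# prints your temporary AWS Educate account id; fails fast if the
# credentials in ~/.aws/credentials are missing or expired
print(boto3.client("sts").get_caller_identity()["Account"])
```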

Binary file not shown.

View File

@ -0,0 +1,54 @@
import urllib.parse
import awswrangler as wr
import pandas as pd
def etl_function(event, context):
processed_zone_prefix = "processed-zone"
record = event["Records"][0]
bucket = record["s3"]["bucket"]["name"]
key = urllib.parse.unquote(record["s3"]["object"]["key"])
event_prefix = key.split('/')[1]
full_src_path = 's3://{bucket}/{key}'.format(bucket=bucket, key=key)
print(f'Processing key = {full_src_path}')
df = wr.s3.read_json(path=full_src_path, lines=True)
filename = key.split('/')[-1][-36:]
dest_prefix = f"s3://{bucket}/{processed_zone_prefix}/{event_prefix}"
df['transaction_date'] = pd.to_datetime(df['transaction_ts'], unit='s')
df['year'] = df['transaction_date'].dt.year
df['month'] = df['transaction_date'].dt.month
df['day'] = df['transaction_date'].dt.day
df['hour'] = df['transaction_date'].dt.hour
cols_to_return = ["transaction_date", "price", "amount", "dollar_amount", "type", "trans_id"]
new_keys = []
for [symbol, year, month, day, hour], data in df.groupby(['symbol', 'year', 'month', 'day', 'hour']):
partitions = f"symbol={symbol}/year={year}/month={month}/day={day}/hour={hour}"
full_key_name = '/'.join([dest_prefix, partitions, filename + '.parquet'])
print(f'Saving a new key = {full_key_name}')
new_keys.append(full_key_name)
wr.s3.to_parquet(
df=data[cols_to_return],
path=full_key_name,
compression='snappy'
)
return {
'key': key,
'statusCode': 200,
'new_keys': new_keys
}
if __name__ == "__main__":
event = ""
context = ""
response = etl_function(event, context)
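For a local test, the function can be fed a hand-built S3 notification event. A sketch: the bucket and key below are hypothetical, and awswrangler, valid AWS credentials, and an existing object at that key are required:

```python
fake_event = {
    "Records": [{
        "s3": {
            "bucket": {"name": "datalake-dev-920628590621-jk-12345"},
            "object": {"key": "raw-zone/stockdata/year=2021/month=04/day=18/hour=14/sample-object"}
        }
    }]
}
# with etl_function imported from the module above
print(etl_function(fake_event, None))
```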

Binary file not shown.

4
labs/requirements.txt Normal file
View File

@ -0,0 +1,4 @@
awscli==1.19.33
boto3==1.17.33
configparser==5.0.2
awswrangler==2.7.0

7
labs/terraform/main.tf Normal file
View File

@ -0,0 +1,7 @@
locals {
common_tags = {
Purpose = "UAM Cloud Data Processing"
Environment = "DEV"
Owner = var.student_full_name
}
}

View File

@ -0,0 +1,13 @@
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 3.27"
}
}
}
provider "aws" {
profile = "default"
region = var.region
}

View File

@ -0,0 +1,17 @@
resource "aws_glue_catalog_database" "datalake_db_raw_zone" {
name = "datalake_${var.environment}_${var.account_number}_${var.student_initials}_${var.student_index_no}"
}
resource "aws_glue_crawler" "glue_crawler_raw_zone" {
database_name = aws_glue_catalog_database.datalake_db_raw_zone.name
name = "gc-raw-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
role = aws_iam_role.glue_crawler_role.arn
table_prefix = "crawler_"
s3_target {
path = "s3://${aws_s3_bucket.main_dl_bucket.bucket}/raw-zone/stockdata/"
}
tags = merge(local.common_tags, )
}
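Once applied, the crawler can be kicked off ad hoc with boto3; a sketch, where the name follows the same pattern as the resource above (adjust the suffix to your own values):

```python
import boto3

glue = boto3.client("glue")
# name follows the pattern from aws_glue_crawler above
glue.start_crawler(Name="gc-raw-dev-920628590621-jk-12345")
print(glue.get_crawler(Name="gc-raw-dev-920628590621-jk-12345")["Crawler"]["State"])
```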

View File

@ -0,0 +1,171 @@
resource "aws_iam_role" "firehose_stream_role" {
name = "firehose-role-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Action": "sts:AssumeRole",
"Principal": {
"Service": "firehose.amazonaws.com"
},
"Effect": "Allow",
"Sid": ""
}
]
}
EOF
}
resource "aws_iam_role_policy" "firehose_stream_policy" {
name = "firehose-stream-policy-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
role = aws_iam_role.firehose_stream_role.id
policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": "kinesis:*",
"Resource": "*"
},
{
"Effect": "Allow",
"Action": [
"s3:AbortMultipartUpload",
"s3:GetBucketLocation",
"s3:GetObject",
"s3:ListBucket",
"s3:ListBucketMultipartUploads",
"s3:PutObject"
],
"Resource": [
"${aws_s3_bucket.main_dl_bucket.arn}",
"${aws_s3_bucket.main_dl_bucket.arn}/*"
]
},
{
"Sid": "",
"Effect": "Allow",
"Action": [
"logs:PutLogEvents"
],
"Resource": [
"arn:aws:logs:${var.region}:${var.account_number}:log-group:/aws/kinesisfirehose/*"
]
}
]
}
EOF
}
// Role & policies for Glue Crawler
resource "aws_iam_role" "glue_crawler_role" {
name = "crawler-role-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "glue.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
EOF
}
data "aws_iam_policy" "glue_service_policy" {
arn = "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole"
}
resource "aws_iam_role_policy" "glue_crawler_user_bucket_policy" {
name = "user-bucket-policy-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
role = aws_iam_role.glue_crawler_role.id
policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"s3:GetObject",
"s3:PutObject"
],
"Resource": [
"${aws_s3_bucket.main_dl_bucket.arn}*"
]
}
]
}
EOF
}
resource "aws_iam_policy_attachment" "crawler_attach_managed_policy" {
name = "crawler-managed-service-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
roles = [
aws_iam_role.glue_crawler_role.name]
policy_arn = data.aws_iam_policy.glue_service_policy.arn
}
// Role and policies for Lambda
resource "aws_iam_role" "lambda_basic_role" {
name = "lambda-basic-role-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
tags = merge(local.common_tags, )
assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Action": "sts:AssumeRole",
"Principal": {
"Service": "lambda.amazonaws.com"
},
"Effect": "Allow",
"Sid": ""
}
]
}
EOF
}
resource "aws_iam_role_policy" "lambda_basic_policy" {
name = "lambda-basic-policy-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
role = aws_iam_role.lambda_basic_role.id
policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Action": [
"logs:CreateLogGroup",
"logs:CreateLogStream",
"logs:PutLogEvents"
],
"Effect": "Allow",
"Resource": "*"
},
{
"Effect": "Allow",
"Action": "s3:*",
"Resource": [
"${aws_s3_bucket.main_dl_bucket.arn}",
"${aws_s3_bucket.main_dl_bucket.arn}/*"]
}
]
}
EOF
}

View File

@ -0,0 +1,14 @@
resource "aws_kinesis_stream" "cryptostock_stream" {
name = "cryptostock-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
shard_count = 1
enforce_consumer_deletion = true
shard_level_metrics = [
"IncomingBytes",
"OutgoingBytes",
"IncomingRecords",
"OutgoingRecords"
]
tags = merge(local.common_tags, )
}
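The shard_level_metrics list turns on enhanced (per-shard) CloudWatch monitoring. A sketch of pulling a related metric with boto3; the stream name is an example, and the shard-level variants enabled above are published with an extra ShardId dimension:

```python
import boto3
from datetime import datetime, timedelta

cw = boto3.client("cloudwatch")
resp = cw.get_metric_statistics(
    Namespace="AWS/Kinesis",
    MetricName="IncomingRecords",
    Dimensions=[{"Name": "StreamName",
                 "Value": "cryptostock-dev-920628590621-jk-12345"}],
    StartTime=datetime.utcnow() - timedelta(hours=1),
    EndTime=datetime.utcnow(),
    Period=300,
    Statistics=["Sum"],
)
print(resp["Datapoints"])
```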

View File

@ -0,0 +1,18 @@
resource "aws_kinesis_firehose_delivery_stream" "stock_delivery_stream" {
name = "firehose-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
destination = "extended_s3"
kinesis_source_configuration {
kinesis_stream_arn = aws_kinesis_stream.cryptostock_stream.arn
role_arn = aws_iam_role.firehose_stream_role.arn
}
extended_s3_configuration {
role_arn = aws_iam_role.firehose_stream_role.arn
bucket_arn = aws_s3_bucket.main_dl_bucket.arn
buffer_size = 1
buffer_interval = 60
prefix = "raw-zone/stockdata/year=!{timestamp:yyyy}/month=!{timestamp:MM}/day=!{timestamp:dd}/hour=!{timestamp:HH}/"
    error_output_prefix = "raw-zone/stockdata_errors/!{firehose:error-output-type}/year=!{timestamp:yyyy}/month=!{timestamp:MM}/day=!{timestamp:dd}/hour=!{timestamp:HH}/"
}
}
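With buffer_size = 1 (MB) and buffer_interval = 60 (s), Firehose flushes whichever limit is hit first, so objects should appear within about a minute of traffic. A quick listing sketch (the bucket name is an example):

```python
import boto3

s3 = boto3.client("s3")
resp = s3.list_objects_v2(
    Bucket="datalake-dev-920628590621-jk-12345",  # example name
    Prefix="raw-zone/stockdata/")
print([obj["Key"] for obj in resp.get("Contents", [])])
```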

View File

@ -0,0 +1,50 @@
resource "aws_lambda_layer_version" "aws_wrangler" {
filename = "../lambda/awswrangler-layer-2.7.0-py3.8.zip"
layer_name = "aws_wrangler_${var.environment}_${var.account_number}_${var.student_initials}_${var.student_index_no}"
source_code_hash = "${filebase64sha256("../lambda/awswrangler-layer-2.7.0-py3.8.zip")}"
compatible_runtimes = [
"python3.8"]
}
resource "aws_lambda_function" "etl_post_processing" {
function_name = "etl-post-processing-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
filename = "../lambda/lambda_definition.zip"
handler = "lambda_definition.etl_function"
runtime = "python3.8"
role = aws_iam_role.lambda_basic_role.arn
timeout = 300
memory_size = 512
source_code_hash = filebase64sha256("../lambda/lambda_definition.zip")
layers = [
"${aws_lambda_layer_version.aws_wrangler.arn}"]
}
resource "aws_lambda_permission" "allow_bucket" {
statement_id = "AllowExecutionFromS3Bucket"
action = "lambda:InvokeFunction"
function_name = aws_lambda_function.etl_post_processing.arn
principal = "s3.amazonaws.com"
source_arn = aws_s3_bucket.main_dl_bucket.arn
}
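# Note: filter_prefix below scopes the trigger to raw-zone/, so the Lambda's
# own writes to processed-zone/ cannot re-invoke it recursively.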
resource "aws_s3_bucket_notification" "trigger_etl_lambda" {
bucket = aws_s3_bucket.main_dl_bucket.id
lambda_function {
lambda_function_arn = aws_lambda_function.etl_post_processing.arn
events = [
"s3:ObjectCreated:*"]
filter_prefix = "raw-zone/"
}
depends_on = [
aws_lambda_permission.allow_bucket]
}

View File

@ -0,0 +1,7 @@
locals {
common_tags = {
Purpose = "UAM Cloud Data Processing"
Environment = "DEV"
Owner = var.student_full_name
}
}

View File

@ -0,0 +1,75 @@
-- lab 3.3
WITH CTE AS
(
SELECT date_format(from_unixtime(transaction_ts),'%Y-%m-%dT%H') as HourlyBucket,
RANK() OVER(PARTITION BY date_format(from_unixtime(transaction_ts),'%Y-%m-%dT%H'), symbol ,type ORDER BY dollar_amount DESC) as rnk, *
FROM "datalake_dev_100603781557_jk_12345"."crawler_stockdata"
)
select *
from CTE
where rnk=1
order by 1, 4, 8;
-- LAB 4.2
CREATE EXTERNAL TABLE processed_stockdata(
transaction_date timestamp,
price double,
amount double,
dollar_amount double,
type string,
trans_id bigint)
PARTITIONED BY (
symbol string,
year integer,
month integer,
day integer,
hour integer
)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
  's3://datalake-dev-100603781557-jk-12345/processed-zone/stockdata/';

MSCK REPAIR TABLE processed_stockdata;
-- LAB 5
-- .----------. .----------. .----------.
-- | SOURCE | | INSERT | | DESTIN. |
-- Source-->| STREAM |-->| & SELECT |-->| STREAM |-->Destination
-- | | | (PUMP) | | |
-- '----------' '----------' '----------'
CREATE OR REPLACE STREAM "DESTINATION_SQL_STREAM"
("symbol" VARCHAR(10), "type" VARCHAR(10), "trans_id" BIGINT,
"dollar_amount" DOUBLE, "AvgLast30seconds" DOUBLE, "CntLast30seconds" INT,
"SumLast30rows" DOUBLE, "CntLast30rows" INT, "max_tran_id" BIGINT );
CREATE OR REPLACE PUMP "STREAM_PUMP" AS INSERT INTO "DESTINATION_SQL_STREAM"
SELECT STREAM "symbol", "type", "trans_id", "dollar_amount", "AvgLast30seconds", "CntLast30seconds"
, "SumLast30rows", "CntLast30rows", "max_tran_id"
FROM (
SELECT STREAM "symbol", "type", "trans_id", "dollar_amount",
AVG("dollar_amount") OVER LAST_30_SECS AS "AvgLast30seconds",
COUNT(*) OVER LAST_30_SECS AS "CntLast30seconds",
SUM("dollar_amount") OVER LAST_30_ROWS AS "SumLast30rows",
COUNT(*) OVER LAST_30_ROWS AS "CntLast30rows",
MAX("trans_id") OVER LAST_30_ROWS AS "max_tran_id"
FROM "SOURCE_SQL_STREAM_001"
WHERE "symbol" = 'BTC_USD'
WINDOW
LAST_30_SECS AS (PARTITION BY "symbol", "type" RANGE INTERVAL '30' SECOND PRECEDING),
LAST_30_ROWS AS (PARTITION BY "symbol", "type" ROWS 30 PRECEDING)
)
WHERE "dollar_amount" > 4 * ("AvgLast30seconds");

View File

@ -0,0 +1,6 @@
resource "aws_s3_bucket" "main_dl_bucket" {
bucket = "datalake-${var.environment}-${var.account_number}-${var.student_initials}-${var.student_index_no}"
force_destroy = true
tags = merge(local.common_tags, )
}

View File

@ -0,0 +1,7 @@
locals {
common_tags = {
Purpose = "UAM Cloud Data Processing"
Environment = "DEV"
Owner = var.student_full_name
}
}

View File

@ -0,0 +1,13 @@
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 3.27"
}
}
}
provider "aws" {
profile = "default"
region = var.region
}

View File

@ -0,0 +1,4 @@
account_number=920628590621
student_initials="jk"
student_full_name="Jakub Kasprzak"
student_index_no = "12345"

View File

@ -0,0 +1,31 @@
variable "account_number" {
description = "Account number"
type = number
}
variable "region" {
description = "Region name - must be NVirginia us-east-1"
type = string
default = "us-east-1"
}
variable "environment" {
description = "Environment name"
type = string
default = "dev"
}
variable "student_initials" {
description = "letters of first and last names"
type = string
}
variable "student_full_name" {
description = "Student's full name"
type = string
}
variable "student_index_no" {
description = "Index no"
type = string
}

View File

@ -0,0 +1,4 @@
account_number=920628590621
student_initials="jk"
student_full_name="Jakub Kasprzak"
student_index_no = "12345"

View File

@ -0,0 +1,31 @@
variable "account_number" {
description = "Account number"
type = number
}
variable "region" {
description = "Region name - must be NVirginia us-east-1"
type = string
default = "us-east-1"
}
variable "environment" {
description = "Environment name"
type = string
default = "dev"
}
variable "student_initials" {
description = "letters of first and last names"
type = string
}
variable "student_full_name" {
description = "Student's full name"
type = string
}
variable "student_index_no" {
description = "Index no"
type = string
}

Binary file not shown.

1
test.test Normal file
View File

@ -0,0 +1 @@
fadsa

View File

@ -0,0 +1,100 @@
version: '3.7'
services:
zookeeper:
image: confluentinc/cp-zookeeper:5.2.2
ports:
- 2181:2181
hostname: zookeeper
environment:
ZOOKEEPER_CLIENT_PORT: 2181
networks:
- uam
kafka:
image: confluentinc/cp-kafka:5.2.2
ports:
- 9092:9092
- 9094:9094
hostname: kafka
environment:
KAFKA_ADVERTISED_LISTENERS: LISTENER_DOCKER_INTERNAL://kafka:9094,LISTENER_DOCKER_EXTERNAL://${DOCKER_HOST_IP:-127.0.0.1}:9092
KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: LISTENER_DOCKER_INTERNAL:PLAINTEXT,LISTENER_DOCKER_EXTERNAL:PLAINTEXT
KAFKA_INTER_BROKER_LISTENER_NAME: LISTENER_DOCKER_INTERNAL
KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
KAFKA_BROKER_ID: 1
KAFKA_LOG4J_LOGGERS: kafka.controller=INFO,kafka.producer.async.DefaultEventHandler=INFO,state.change.logger=INFO
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
KAFKA_LOG_RETENTION_HOURS: 168000
depends_on:
- zookeeper
networks:
- uam
minio:
image: minio/minio
ports:
- 9000:9000
hostname: minio
environment:
MINIO_ACCESS_KEY: minio
MINIO_SECRET_KEY: minio123
MINIO_HTTP_TRACE: /dev/stdout
command: server /data
networks:
- uam
default-buckets:
image: minio/mc
entrypoint: >
/bin/sh -c "
sleep 20;
/usr/bin/mc config host add minio http://minio:9000 minio minio123;
/usr/bin/mc mb minio/dev-data-raw;
/usr/bin/mc policy public minio/dev-data-raw;"
depends_on:
- minio
networks:
- uam
kafka-connect:
image: confluentinc/cp-kafka-connect:5.2.2
hostname: kafka-connect
ports:
- 8083:8083
- 3030:3030
environment:
CONNECT_REST_ADVERTISED_HOST_NAME: kafka-connect
CONNECT_REST_PORT: 8083
CONNECT_BOOTSTRAP_SERVERS: kafka:9094
CONNECT_GROUP_ID: primary-etl
CONNECT_CONFIG_STORAGE_TOPIC: __primary-etl-configs
CONNECT_CONFIG_STORAGE_REPLICATION_FACTOR: 1
CONNECT_OFFSET_STORAGE_TOPIC: __primary-etl-offsets
CONNECT_OFFSET_STORAGE_REPLICATION_FACTOR: 1
CONNECT_STATUS_STORAGE_TOPIC: __primary-etl-status
CONNECT_STATUS_STORAGE_REPLICATION_FACTOR: 1
CONNECT_KEY_CONVERTER: org.apache.kafka.connect.json.JsonConverter
CONNECT_KEY_CONVERTER_SCHEMAS_ENABLE: "false"
CONNECT_VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter
CONNECT_VALUE_CONVERTER_SCHEMAS_ENABLE: "false"
CONNECT_INTERNAL_KEY_CONVERTER: org.apache.kafka.connect.json.JsonConverter
CONNECT_INTERNAL_KEY_CONVERTER_SCHEMAS_ENABLE: "false"
CONNECT_INTERNAL_VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter
CONNECT_INTERNAL_VALUE_CONVERTER_SCHEMAS_ENABLE: "false"
CONNECT_LOG4J_ROOT_LOGLEVEL: INFO
CONNECT_LOG4J_LOGGERS: org.apache.kafka.connect.runtime.rest=WARN,org.reflections=ERROR
CONNECT_PLUGIN_PATH: /usr/share/java,/etc/kafka-connect/jars
AWS_ACCESS_KEY_ID: minio
AWS_SECRET_ACCESS_KEY: minio123
depends_on:
- kafka
- default-buckets
networks:
- uam
networks:
uam:
name: uam
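
A minimal smoke test for the local stack - a sketch that assumes `kafka-python` is installed, the compose stack is up, and the broker is reachable on localhost:9092 (the topic name is an example):

```python
import json
from kafka import KafkaProducer

# send one JSON message to a test topic on the local broker
producer = KafkaProducer(
    bootstrap_servers="localhost:9092",
    value_serializer=lambda v: json.dumps(v).encode("utf-8"))
producer.send("uam-test-topic", {"col1": "this is json data"})
producer.flush()
```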

0
tests/__init__.py Normal file
View File

View File

@ -0,0 +1,69 @@
from unittest import main, TestCase
import boto3
import configparser as cp
import warnings
import os
class TestInfraKinesis(TestCase):
def setUp(self):
self.kinesis_client = boto3.client('kinesis')
# read configuration
# making path universal for running tests from the module / outside
cwd = os.getcwd()
extra_dot = '.' if cwd.endswith('tests') else ''
config_path = extra_dot + "./labs/terraform/terraform.tfvars"
with open(config_path, 'r') as f:
config_string = '[main]\n' + f.read()
config = cp.ConfigParser()
config.read_string(config_string)
self.config = {param: (
config["main"][param].replace('"', '') if isinstance(config["main"][param], str) else config["main"][param]
) for param in config["main"]}
warnings.filterwarnings("ignore", category=ResourceWarning, message="unclosed.*<ssl.SSLSocket.*>")
kinesis_stream_name_pattern = "cryptostock-dev-{account_number}-{student_initials}-{student_index_no}"
self.kinesis_stream_name = kinesis_stream_name_pattern.format(account_number=self.config["account_number"],
student_initials=self.config["student_initials"],
student_index_no=self.config["student_index_no"])
def test_kinesis_data_stream_exists(self):
kinesis_streams = self.kinesis_client.list_streams()["StreamNames"]
find_stream = next((stream for stream in kinesis_streams if stream == self.kinesis_stream_name), None)
self.assertEqual(find_stream, self.kinesis_stream_name)
def test_kinesis_data_stream_config(self):
expected_no_of_shards = 1
stream_config = self.kinesis_client.describe_stream(StreamName=self.kinesis_stream_name)
# check no of shards
no_of_shards = len(stream_config["StreamDescription"]["Shards"])
self.assertEqual(no_of_shards, expected_no_of_shards)
def test_kinesis_data_stream_tags(self):
tags = self.kinesis_client.list_tags_for_stream(StreamName=self.kinesis_stream_name)["Tags"]
find_env_tag = next((tag["Value"] for tag in tags if tag.get("Key") == "Environment"), None)
self.assertEqual(find_env_tag.upper(), "DEV")
def test_kinesis_data_stream_monitoring(self):
expected_monitors = ['IncomingBytes', 'OutgoingRecords', 'IncomingRecords', 'OutgoingBytes']
stream_monitoring = \
self.kinesis_client.describe_stream(StreamName=self.kinesis_stream_name)["StreamDescription"][
"EnhancedMonitoring"]
current_monitors = next((x["ShardLevelMetrics"] for x in stream_monitoring if x.get("ShardLevelMetrics")), None)
self.assertTrue(set(current_monitors).issuperset(set(expected_monitors)))
if __name__ == '__main__':
main()

View File

@ -0,0 +1,73 @@
from unittest import main, TestCase
import boto3
import configparser as cp
import warnings
import os
class TestInfraKinesisFH(TestCase):
def setUp(self):
self.firehose_client = boto3.client('firehose')
self.kinesis_client = boto3.client('kinesis')
# read configuration
# making path universal for running tests from the module / outside
cwd = os.getcwd()
extra_dot = '.' if cwd.endswith('tests') else ''
config_path = extra_dot + "./labs/terraform/terraform.tfvars"
with open(config_path, 'r') as f:
config_string = '[main]\n' + f.read()
config = cp.ConfigParser()
config.read_string(config_string)
self.config = {param: (
config["main"][param].replace('"', '') if isinstance(config["main"][param], str) else config["main"][param]
) for param in config["main"]}
warnings.filterwarnings("ignore", category=ResourceWarning, message="unclosed.*<ssl.SSLSocket.*>")
firehose_stream_name_pattern = "firehose-dev-{account_number}-{student_initials}-{student_index_no}"
self.firehose_stream_name = firehose_stream_name_pattern.format(account_number=self.config["account_number"],
student_initials=self.config[
"student_initials"],
student_index_no=self.config[
"student_index_no"])
kinesis_stream_name_pattern = "cryptostock-dev-{account_number}-{student_initials}-{student_index_no}"
self.kinesis_stream_name = kinesis_stream_name_pattern.format(account_number=self.config["account_number"],
student_initials=self.config["student_initials"],
student_index_no=self.config["student_index_no"])
def test_kinesis_firehose_exists(self):
deliver_streams = self.firehose_client.list_delivery_streams()["DeliveryStreamNames"]
find_stream = next((stream for stream in deliver_streams if stream == self.firehose_stream_name), None)
self.assertEqual(find_stream, self.firehose_stream_name)
def test_kinesis_firehose_source(self):
config = self.firehose_client.describe_delivery_stream(DeliveryStreamName=self.firehose_stream_name)
source = config["DeliveryStreamDescription"]["Source"]["KinesisStreamSourceDescription"]["KinesisStreamARN"]
kinesis_stream_config = self.kinesis_client.describe_stream(StreamName=self.kinesis_stream_name)
kds_arn = kinesis_stream_config["StreamDescription"]["StreamARN"]
self.assertEqual(kds_arn, source)
def test_kinesis_firehose_dest_config(self):
config = self.firehose_client.describe_delivery_stream(DeliveryStreamName=self.firehose_stream_name)
destination = config["DeliveryStreamDescription"]["Destinations"][0]
expected_pref = 'raw-zone/stockdata/year=!{timestamp:yyyy}/month=!{timestamp:MM}/day=!{timestamp:dd}/hour=!{timestamp:HH}/'
expected_err_pref = 'raw-zone/stockdata_errors/!{firehose:error-output-type}/year=!{timestamp:yyyy}/month=!{timestamp:MM}/day=!{timestamp:dd}/hour=!{timestamp:HH}/'
dest_prefix = destination.get("S3DestinationDescription", None).get("Prefix", None)
dest_errors_prefix = destination.get("S3DestinationDescription", None).get("ErrorOutputPrefix", None)
self.assertEqual(dest_prefix, expected_pref)
self.assertEqual(expected_err_pref, dest_errors_prefix)
if __name__ == '__main__':
main()

View File

@ -0,0 +1,52 @@
from unittest import main, TestCase
import boto3
import configparser as cp
import warnings
import os
class TestInfraS3(TestCase):
def setUp(self):
self.s3_client = boto3.client('s3')
# read configuration
# making path universal for running tests from the module / outside
cwd = os.getcwd()
extra_dot = '.' if cwd.endswith('tests') else ''
config_path = extra_dot + "./labs/terraform/terraform.tfvars"
with open(config_path, 'r') as f:
config_string = '[main]\n' + f.read()
config = cp.ConfigParser()
config.read_string(config_string)
self.config = {param: (
config["main"][param].replace('"', '') if isinstance(config["main"][param], str) else config["main"][param]
) for param in config["main"]}
warnings.filterwarnings("ignore", category=ResourceWarning, message="unclosed.*<ssl.SSLSocket.*>")
bucket_name_pattern = "datalake-dev-{account_number}-{student_initials}-{student_index_no}"
self.bucket_name = bucket_name_pattern.format(account_number=self.config["account_number"],
student_initials=self.config["student_initials"],
student_index_no=self.config["student_index_no"])
def test_main_s3_bucket_exists(self):
s3_buckets = self.s3_client.list_buckets()["Buckets"]
find_bucket = next((bucket["Name"] for bucket in s3_buckets if bucket["Name"] == self.bucket_name), None)
self.assertEqual(find_bucket, self.bucket_name)
    def test_main_s3_bucket_config(self):
tags = self.s3_client.get_bucket_tagging(
Bucket=self.bucket_name
)["TagSet"]
find_env_tag = next((tag["Value"] for tag in tags if tag.get("Key") == "Environment"), None)
self.assertEqual(find_env_tag.upper(), "DEV")
if __name__ == '__main__':
main()
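
To run the whole suite from the repository root - a sketch that assumes the Terraform stack has been applied and default AWS credentials are configured:

```python
# equivalent to: python -m unittest discover -s tests
import unittest

suite = unittest.defaultTestLoader.discover("tests")
unittest.TextTestRunner(verbosity=2).run(suite)
```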