634 lines
25 KiB
Plaintext
634 lines
25 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# 1 Format danych AVRO\n",
|
|
"\n",
|
|
"Ćwiczenie ma na celu zademonstrowanie schematów danych AVRO, typów złożonych (mapy, listy, struktury zagnieżdżone) oraz wstęp do Glue / Athena\n",
|
|
"\n",
|
|
"## Przebieg ćwiczenia\n",
|
|
"* skonfiguruj środowisko uruchomieniowe Python (sugerowana Anaconda z Python 3)\n",
|
|
"* zainstaluj wszystkie wymagane biblioteki\n",
|
|
"\n",
|
|
"<code>\n",
|
"% conda create -n uam-datalake python=3.8\n",
|
|
"% conda activate uam-datalake\n",
|
|
"% pip install -r ./datalake-uam/jupyter/requirements.txt\n",
|
|
"</code>\n",
|
|
"\n",
|
"* zaloguj się do konsoli AWS i stwórz Bucket testowy oraz bazę danych w Glue. Uzupełnij poniższy skrypt o te dane\n",
|
|
"* wygeneruj dane testowe w wybranym schemacie AVRO\n",
|
"* zapisz dane do plików na S3 w folderach s3://<twoj-bucket-name>/EventName/namespace=xxx/year=YYYY/month=MM/day=DD/version=VVV\n",
|
|
"* zarejestruj tabele w Glue z wykorzystaniem BOTO3 / crawler (poprzez konsole AWS GUI - przeglądarkę)\n",
|
"* skonfiguruj domyślną WorkGroup w Athena (PRIMARY) - konieczne jest wskazanie miejsca docelowego dla danych z zapytań (S3 location Athena) https://docs.aws.amazon.com/athena/latest/ug/getting-started.html\n",
|
"* sprawdź definicję tabeli i upewnij się, że partycje są zarejestrowane (użyj polecenia MSCK REPAIR TABLE w Athena - LOAD PARTITIONS)\n",
|
|
"* sprawdź ile danych jest w tabeli (select count(*) from table) - Data Scanned in bytes\n",
|
|
"* odpytaj tabele z wykorzystaniem predykatu day=1 (partition elimination) - zweryfikuj ilość danych przeskanowanych (do porównania z ćwiczeniem 2 - parquet)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"import os\n",
"\n",
"import boto3\n",
"\n",
"REGION = \"us-east-1\"\n",
"\n",
"# Credentials are read from the environment so that secrets are never saved in the notebook.\n",
"# A missing variable yields None, which lets boto3 fall back to its default credential chain.\n",
"session_kwargs = {\n",
"    \"aws_access_key_id\": os.environ.get(\"AWS_ACCESS_KEY_ID\"),\n",
"    \"aws_secret_access_key\": os.environ.get(\"AWS_SECRET_ACCESS_KEY\"),\n",
"    \"aws_session_token\": os.environ.get(\"AWS_SESSION_TOKEN\"),\n",
"    \"region_name\": REGION,\n",
"}\n",
"\n",
"# Shared session used to build the AWS clients in the following cells.\n",
"session = boto3.Session(**session_kwargs)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"import datetime\n",
"import io\n",
"import json\n",
"import logging\n",
"import time\n",
"\n",
"from avro.datafile import DataFileReader, DataFileWriter\n",
"from avro.io import DatumReader, DatumWriter\n",
"from avro.schema import Parse\n",
"from botocore.exceptions import ClientError\n",
"from faker import Faker\n",
|
|
"\n",
|
|
"\n",
|
"# Exercise configuration: point these at your own bucket and Glue database.\n",
"S3_BUCKET = \"datalake-dev-920628590621-us-east-1\"\n",
"TEST_DB = \"datalake_dev_jk\"\n",
"TEST_TABLE_NAME = \"avro_uam_test\"\n",
"EVENT_NAME = \"UamTestEvent\"\n",
"\n",
"# Faker instance with a fixed seed for the generated test data.\n",
"fake = Faker()\n",
"fake.seed_instance(4321)\n",
"\n",
"# AWS service clients created from the shared boto3 session.\n",
"glue_client = session.client(\"glue\")\n",
"s3_client = session.client(\"s3\")\n",
|
|
"\n",
|
|
"\n",
|
"def tear_down_test_db(database=TEST_DB):\n",
"    \"\"\"Drop the Glue database if it exists, then create it again.\n",
"\n",
"    Args:\n",
"        database: name of the Glue database to recreate.\n",
"    \"\"\"\n",
"    # get_databases is paginated; walk every page so an existing database is\n",
"    # not missed (create_database below would then fail on the duplicate name).\n",
"    paginator = glue_client.get_paginator(\"get_databases\")\n",
"    db_names = {\n",
"        db[\"Name\"]\n",
"        for page in paginator.paginate()\n",
"        for db in page[\"DatabaseList\"]\n",
"    }\n",
"    if database in db_names:\n",
"        glue_client.delete_database(Name=database)\n",
"        print(\"{} deleted\".format(database))\n",
"\n",
"    glue_client.create_database(DatabaseInput={\"Name\": database})\n",
"    print(\"%s db recreated\" % database)\n",
|
|
"\n",
|
"def tear_down_test_table(database=TEST_DB, table_name=TEST_TABLE_NAME):\n",
"    \"\"\"Delete a Glue table if it exists; otherwise print that it was not found.\n",
"\n",
"    Args:\n",
"        database: Glue database that should contain the table.\n",
"        table_name: name of the table to delete.\n",
"    \"\"\"\n",
"    # get_tables is paginated; collect names from every page so the table is\n",
"    # found even when the database holds more tables than fit in one response.\n",
"    paginator = glue_client.get_paginator(\"get_tables\")\n",
"    table_names = {\n",
"        tbl[\"Name\"]\n",
"        for page in paginator.paginate(DatabaseName=database)\n",
"        for tbl in page[\"TableList\"]\n",
"    }\n",
"    if table_name in table_names:\n",
"        glue_client.delete_table(DatabaseName=database, Name=table_name)\n",
"        print(\"test table {} deleted\".format(table_name))\n",
"    else:\n",
"        print(\"tbl %s not found\" % table_name)\n",
|
|
" \n",
|
"def tear_down_s3(bucket=S3_BUCKET, prefix=EVENT_NAME):\n",
"    \"\"\"Delete every object in the bucket whose key starts with the given prefix.\n",
"\n",
"    Args:\n",
"        bucket: name of the S3 bucket to clean.\n",
"        prefix: key prefix selecting the objects to delete.\n",
"    \"\"\"\n",
"    # Reuse the shared session (same credentials/region as the other clients)\n",
"    # and keep the `bucket` argument bound to the bucket name.\n",
"    s3_bucket = session.resource(\"s3\").Bucket(bucket)\n",
"    s3_bucket.objects.filter(Prefix=prefix).delete()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 1. AVRO schema"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"UamTestEvent com.uam.datalake.v1 1.0.2\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"{'mox-meta': {'version': '1.0.2', 'type': 'ENTITY_SNAPSHOT'},\n",
|
|
" 'namespace': 'com.uam.datalake.v1',\n",
|
|
" 'type': 'record',\n",
|
|
" 'name': 'UamTestEvent',\n",
|
|
" 'fields': [{'name': 'customerId',\n",
|
|
" 'type': {'type': 'string', 'avro.java.string': 'String'}},\n",
|
|
" {'name': 'isActive',\n",
|
|
" 'type': 'boolean',\n",
|
|
" 'doc': 'a boolean flag if the Customer is active'},\n",
|
|
" {'name': 'age', 'type': 'int'},\n",
|
|
" {'name': 'balance', 'type': 'float'},\n",
|
|
" {'name': 'accountBalance_logical_dec',\n",
|
|
" 'type': {'type': 'bytes',\n",
|
|
" 'logicalType': 'decimal',\n",
|
|
" 'precision': 20,\n",
|
|
" 'scale': 4}},\n",
|
|
" {'name': 'array_of_strings',\n",
|
|
" 'type': ['null',\n",
|
|
" {'type': 'array',\n",
|
|
" 'items': {'type': 'string', 'avro.java.string': 'String'}}],\n",
|
|
" 'default': None},\n",
|
|
" {'name': 'paymentDetails',\n",
|
|
" 'type': ['null',\n",
|
|
" {'type': 'record',\n",
|
|
" 'name': 'PaymentDetails',\n",
|
|
" 'fields': [{'name': 'counterPartyName',\n",
|
|
" 'type': ['null', {'type': 'string', 'avro.java.string': 'String'}],\n",
|
|
" 'default': None},\n",
|
|
" {'name': 'groupingId',\n",
|
|
" 'type': ['null', {'type': 'string', 'avro.java.string': 'String'}],\n",
|
|
" 'default': None},\n",
|
|
" {'name': 'payeeId',\n",
|
|
" 'type': ['null', {'type': 'string', 'avro.java.string': 'String'}],\n",
|
|
" 'default': None},\n",
|
|
" {'name': 'message',\n",
|
|
" 'type': ['null', {'type': 'string', 'avro.java.string': 'String'}],\n",
|
|
" 'default': None},\n",
|
|
" {'name': 'type',\n",
|
|
" 'type': {'type': 'enum',\n",
|
|
" 'name': 'PaymentType',\n",
|
|
" 'symbols': ['UNKNOWN', 'ONE', 'TWO']}},\n",
|
|
" {'name': 'otherAccountId',\n",
|
|
" 'type': ['null', {'type': 'string', 'avro.java.string': 'String'}],\n",
|
|
" 'default': None}]}],\n",
|
|
" 'default': None},\n",
|
|
" {'name': 'parameters',\n",
|
|
" 'type': ['null',\n",
|
|
" {'type': 'map',\n",
|
|
" 'avro.java.string': 'String',\n",
|
|
" 'values': {'type': 'string', 'avro.java.string': 'String'}}],\n",
|
|
" 'default': None}]}"
|
|
]
|
|
},
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
"# Test Avro schema covering the important cases: primitives, a decimal logical type,\n",
"# an array, a nested record with an enum, and a map.\n",
|
|
"\n",
|
|
"schema_string = \"\"\"\n",
|
|
"{\n",
|
|
" \"mox-meta\":{\n",
|
|
" \"version\":\"1.0.2\",\n",
|
|
" \"type\":\"ENTITY_SNAPSHOT\"\n",
|
|
" },\n",
|
|
" \"namespace\":\"com.uam.datalake.v1\",\n",
|
|
" \"type\":\"record\",\n",
|
|
" \"name\":\"\",\n",
|
|
" \"fields\":[\n",
|
|
" {\n",
|
|
" \"name\":\"customerId\",\n",
|
|
" \"type\":{\n",
|
|
" \"type\":\"string\",\n",
|
|
" \"avro.java.string\":\"String\"\n",
|
|
" }\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\":\"isActive\",\n",
|
|
" \"type\":\"boolean\",\n",
|
|
" \"doc\":\"a boolean flag if the Customer is active\"\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\":\"age\",\n",
|
|
" \"type\":\"int\"\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\":\"balance\",\n",
|
|
" \"type\":\"float\"\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\":\"accountBalance_logical_dec\",\n",
|
|
" \"type\":{\n",
|
|
" \"type\":\"bytes\",\n",
|
|
" \"logicalType\":\"decimal\",\n",
|
|
" \"precision\":20,\n",
|
|
" \"scale\":4\n",
|
|
" }\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\":\"array_of_strings\",\n",
|
|
" \"type\":[\n",
|
|
" \"null\",\n",
|
|
" {\n",
|
|
" \"type\":\"array\",\n",
|
|
" \"items\":{\n",
|
|
" \"type\":\"string\",\n",
|
|
" \"avro.java.string\":\"String\"\n",
|
|
" }\n",
|
|
" }\n",
|
|
" ],\n",
|
|
" \"default\":null\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\":\"paymentDetails\",\n",
|
|
" \"type\":[\n",
|
|
" \"null\",\n",
|
|
" {\n",
|
|
" \"type\":\"record\",\n",
|
|
" \"name\":\"PaymentDetails\",\n",
|
|
" \"fields\":[\n",
|
|
" {\n",
|
|
" \"name\":\"counterPartyName\",\n",
|
|
" \"type\":[\n",
|
|
" \"null\",\n",
|
|
" {\n",
|
|
" \"type\":\"string\",\n",
|
|
" \"avro.java.string\":\"String\"\n",
|
|
" }\n",
|
|
" ],\n",
|
|
" \"default\":null\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\":\"groupingId\",\n",
|
|
" \"type\":[\n",
|
|
" \"null\",\n",
|
|
" {\n",
|
|
" \"type\":\"string\",\n",
|
|
" \"avro.java.string\":\"String\"\n",
|
|
" }\n",
|
|
" ],\n",
|
|
" \"default\":null\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\":\"payeeId\",\n",
|
|
" \"type\":[\n",
|
|
" \"null\",\n",
|
|
" {\n",
|
|
" \"type\":\"string\",\n",
|
|
" \"avro.java.string\":\"String\"\n",
|
|
" }\n",
|
|
" ],\n",
|
|
" \"default\":null\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\":\"message\",\n",
|
|
" \"type\":[\n",
|
|
" \"null\",\n",
|
|
" {\n",
|
|
" \"type\":\"string\",\n",
|
|
" \"avro.java.string\":\"String\"\n",
|
|
" }\n",
|
|
" ],\n",
|
|
" \"default\":null\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\":\"type\",\n",
|
|
" \"type\":{\n",
|
|
" \"type\":\"enum\",\n",
|
|
" \"name\":\"PaymentType\",\n",
|
|
" \"symbols\":[\n",
|
|
" \"UNKNOWN\",\n",
|
|
" \"ONE\",\n",
|
|
" \"TWO\" \n",
|
|
" ]\n",
|
|
" }\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\":\"otherAccountId\",\n",
|
|
" \"type\":[\n",
|
|
" \"null\",\n",
|
|
" {\n",
|
|
" \"type\":\"string\",\n",
|
|
" \"avro.java.string\":\"String\"\n",
|
|
" }\n",
|
|
" ],\n",
|
|
" \"default\":null\n",
|
|
" }\n",
|
|
" ]\n",
|
|
" }\n",
|
|
" ],\n",
|
|
" \"default\":null\n",
|
|
" },\n",
|
|
" {\n",
|
|
" \"name\":\"parameters\",\n",
|
|
" \"type\":[\n",
|
|
" \"null\",\n",
|
|
" {\n",
|
|
" \"type\":\"map\",\n",
|
|
" \"avro.java.string\":\"String\",\n",
|
|
" \"values\":{\n",
|
|
" \"type\":\"string\",\n",
|
|
" \"avro.java.string\":\"String\"\n",
|
|
" }\n",
|
|
" }\n",
|
|
" ],\n",
|
|
" \"default\":null\n",
|
|
" }\n",
|
|
" ]\n",
|
|
"}\n",
|
|
"\"\"\"\n",
|
|
"\n",
|
"# Parse the template and stamp it with the event name configured for this exercise.\n",
"schema = json.loads(schema_string)\n",
"schema[\"name\"] = EVENT_NAME\n",
"\n",
"# Identifiers reused below when building S3 keys and the Glue table location.\n",
"RECORD_NAME, NAMESPACE, VERSION = (\n",
"    schema[\"name\"],\n",
"    schema[\"namespace\"],\n",
"    schema[\"mox-meta\"][\"version\"],\n",
")\n",
"\n",
"print('%s %s %s' % (RECORD_NAME, NAMESPACE, VERSION))\n",
"schema"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 2. Generating test data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Generate Avro test data (an in-memory container file) based on the schema above.\n",
"\n",
"RECORD_COUNT = 10000\n",
"# Must match the \"scale\" of accountBalance_logical_dec in the schema.\n",
"DECIMAL_SCALE = 4\n",
"\n",
"avro_schema = Parse(json.dumps(schema))\n",
"buf = io.BytesIO()\n",
"writer = DataFileWriter(buf, DatumWriter(), avro_schema)\n",
"\n",
"for _ in range(RECORD_COUNT):\n",
"    customer_id = fake.uuid4()\n",
"    amount = fake.pydecimal(left_digits=8, right_digits=4)\n",
"    # The decimal logical type stores the unscaled integer. Shift by the schema scale\n",
"    # instead of stripping '.' from str(amount), which yields the wrong magnitude\n",
"    # whenever the Decimal carries fewer than DECIMAL_SCALE fractional digits.\n",
"    amount_int = int(amount.scaleb(DECIMAL_SCALE))\n",
"\n",
"    first_names = [fake.first_name() for _ in range(fake.random.randint(1, 5))]\n",
"\n",
"    payment_details = {\n",
"        'counterPartyName': customer_id,\n",
"        'groupingId': str(fake.uuid4()),\n",
"        'payeeId': None,\n",
"        'message': None,\n",
"        'type': 'ONE',\n",
"        'otherAccountId': str(fake.uuid4()),\n",
"    }\n",
"\n",
"    age = fake.random.randint(20, 70)\n",
"\n",
"    customer = {\n",
"        \"customerId\": customer_id,\n",
"        \"isActive\": fake.random.choice([True, False]),\n",
"        \"age\": age,\n",
"        \"balance\": fake.random.random() * 123,\n",
"        # big-endian two's-complement bytes of the unscaled value\n",
"        \"accountBalance_logical_dec\": amount_int.to_bytes(\n",
"            amount_int.bit_length() // 8 + 1, byteorder='big', signed=True),\n",
"        \"array_of_strings\": first_names,\n",
"        \"paymentDetails\": payment_details,\n",
"        \"parameters\": {\"key1\": \"value1\", \"key2\": \"value2\"},\n",
"    }\n",
"    writer.append(customer)\n",
"\n",
"# Flush (rather than close) the writer so that buf can still be read below.\n",
"writer.flush()\n",
"raw_bytes = buf.getvalue()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"uploaded UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=1/version=1.0.2/CustData_d6f44c22-5c87-4e14-956b-ad7d985226d0.avro\n",
|
|
"uploaded UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=2/version=1.0.2/CustData_cf0edf74-e76e-458f-a1d2-e092275a719c.avro\n",
|
|
"uploaded UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=3/version=1.0.2/CustData_f9ed4ad7-ed1c-4431-b9ab-317387cdb5af.avro\n",
|
|
"uploaded UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=4/version=1.0.2/CustData_d89b1c7f-66e3-4522-a603-1630abbf24fa.avro\n",
|
|
"uploaded UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=5/version=1.0.2/CustData_92483d0c-5254-4825-9bbb-59545fc6e4dd.avro\n",
|
|
"uploaded UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=6/version=1.0.2/CustData_3473649b-97c5-4597-965b-672a11cdad73.avro\n",
|
|
"uploaded UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=7/version=1.0.2/CustData_ecc999f8-eda7-4782-b844-17809980f34c.avro\n",
|
|
"uploaded UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=8/version=1.0.2/CustData_b95fd45c-8f0d-4612-8f8b-131437895013.avro\n",
|
|
"uploaded UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=9/version=1.0.2/CustData_5a5ef6ba-1576-4450-82a0-3f6f9a10a7c8.avro\n",
|
|
"uploaded UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=10/version=1.0.2/CustData_4e5dc9b8-6eb7-4fa4-93e0-61faf874b698.avro\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"# Start from a clean prefix, then upload the same Avro payload once per day partition.\n",
"tear_down_s3()\n",
"\n",
"for day in range(1, 11):\n",
"    target_key_name = (\n",
"        '{record_name}/namespace={ns}/year=2020/month=2/day={day}/version={ver}/CustData_{rand}.avro'.format(\n",
"            record_name=RECORD_NAME, ns=NAMESPACE, day=day, ver=VERSION, rand=fake.uuid4())\n",
"    )\n",
"    try:\n",
"        s3_client.put_object(Body=raw_bytes, Bucket=S3_BUCKET, Key=target_key_name)\n",
"        print(\"uploaded %s\" % target_key_name)\n",
"    except ClientError as e:\n",
"        # Best effort: report the failed upload and continue with the next day.\n",
"        logging.error(e)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 3. Avro reading"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=10/version=1.0.2/CustData_4e5dc9b8-6eb7-4fa4-93e0-61faf874b698.avro\n",
|
|
"{'customerId': 'cc733c92-6853-45f6-8e49-bec741188ebb', 'isActive': True, 'age': 58, 'balance': 49.34309768676758, 'accountBalance_logical_dec': b'7L\\xbc\\xff\\xf3', 'array_of_strings': ['Rebecca'], 'paymentDetails': {'counterPartyName': 'cc733c92-6853-45f6-8e49-bec741188ebb', 'groupingId': '9626bf79-2f97-4c0c-9aae-de080adab7df', 'payeeId': None, 'message': None, 'type': 'ONE', 'otherAccountId': '69261bc2-4a71-4de7-bc8b-1beb0d9320ac'}, 'parameters': {'key1': 'value1', 'key2': 'value2'}}\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"# Read back the object at target_key_name (left over from the upload loop) and show its first record.\n",
"print(target_key_name)\n",
"\n",
"obj = s3_client.get_object(Bucket=S3_BUCKET, Key=target_key_name)\n",
"record_raw = obj['Body'].read()\n",
"\n",
"reader = DataFileReader(io.BytesIO(record_raw), DatumReader())\n",
"try:\n",
"    for record in reader:\n",
"        print(record)\n",
"        break\n",
"\n",
"    # Schema stored in the container file's metadata.\n",
"    # NOTE: this rebinds avro_schema (the parsed schema from the generation cell) to that raw value.\n",
"    avro_schema = reader.meta[\"avro.schema\"]\n",
"finally:\n",
"    # Close the reader even if decoding or the metadata lookup fails.\n",
"    reader.close()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"b'{\"type\": \"record\", \"mox-meta\": {\"version\": \"1.0.2\", \"type\": \"ENTITY_SNAPSHOT\"}, \"name\": \"UamTestEvent\", \"namespace\": \"com.uam.datalake.v1\", \"fields\": [{\"type\": {\"type\": \"string\", \"avro.java.string\": \"String\"}, \"name\": \"customerId\"}, {\"type\": \"boolean\", \"name\": \"isActive\", \"doc\": \"a boolean flag if the Customer is active\"}, {\"type\": \"int\", \"name\": \"age\"}, {\"type\": \"float\", \"name\": \"balance\"}, {\"type\": {\"type\": \"bytes\", \"logicalType\": \"decimal\", \"precision\": 20, \"scale\": 4}, \"name\": \"accountBalance_logical_dec\"}, {\"type\": [\"null\", {\"type\": \"array\", \"items\": {\"type\": \"string\", \"avro.java.string\": \"String\"}}], \"name\": \"array_of_strings\", \"default\": null}, {\"type\": [\"null\", {\"type\": \"record\", \"name\": \"PaymentDetails\", \"namespace\": \"com.uam.datalake.v1\", \"fields\": [{\"type\": [\"null\", {\"type\": \"string\", \"avro.java.string\": \"String\"}], \"name\": \"counterPartyName\", \"default\": null}, {\"type\": [\"null\", {\"type\": \"string\", \"avro.java.string\": \"String\"}], \"name\": \"groupingId\", \"default\": null}, {\"type\": [\"null\", {\"type\": \"string\", \"avro.java.string\": \"String\"}], \"name\": \"payeeId\", \"default\": null}, {\"type\": [\"null\", {\"type\": \"string\", \"avro.java.string\": \"String\"}], \"name\": \"message\", \"default\": null}, {\"type\": {\"type\": \"enum\", \"name\": \"PaymentType\", \"namespace\": \"com.uam.datalake.v1\", \"symbols\": [\"UNKNOWN\", \"ONE\", \"TWO\"]}, \"name\": \"type\"}, {\"type\": [\"null\", {\"type\": \"string\", \"avro.java.string\": \"String\"}], \"name\": \"otherAccountId\", \"default\": null}]}], \"name\": \"paymentDetails\", \"default\": null}, {\"type\": [\"null\", {\"type\": \"map\", \"avro.java.string\": \"String\", \"values\": {\"type\": \"string\", \"avro.java.string\": \"String\"}}], \"name\": \"parameters\", \"default\": null}]}'"
|
|
]
|
|
},
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"avro_schema"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"tbl avro_uam_test not found\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"{'ResponseMetadata': {'RequestId': '1d5a05a1-0253-436c-a85a-d21f334d51ae',\n",
|
|
" 'HTTPStatusCode': 200,\n",
|
|
" 'HTTPHeaders': {'date': 'Sat, 24 Apr 2021 11:20:04 GMT',\n",
|
|
" 'content-type': 'application/x-amz-json-1.1',\n",
|
|
" 'content-length': '2',\n",
|
|
" 'connection': 'keep-alive',\n",
|
|
" 'x-amzn-requestid': '1d5a05a1-0253-436c-a85a-d21f334d51ae'},\n",
|
|
" 'RetryAttempts': 0}}"
|
|
]
|
|
},
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
"# Register the Glue table described by the Avro schema (drops any previous definition first).\n",
"tear_down_test_table()\n",
"\n",
"# The same schema JSON is attached to both the SerDe and the table parameters.\n",
"schema_literal = json.dumps(schema)\n",
"\n",
"table_columns = [\n",
"    {'Name': 'customerId', 'Type': 'string'},\n",
"    {'Name': 'isActive', 'Type': 'boolean'},\n",
"    {'Name': 'age', 'Type': 'int'},\n",
"    {'Name': 'balance', 'Type': 'float'},\n",
"    {'Name': 'accountBalance_logical_dec', 'Type': 'decimal(20,4)'},\n",
"    {'Name': 'array_of_strings', 'Type': 'array<string>'},\n",
"    {'Name': 'paymentdetails',\n",
"     'Type': 'struct<counterpartyname:string,groupingid:string,payeeid:string,message:string,type:string,otheraccountid:string>'},\n",
"    {'Name': 'parameters', 'Type': 'map<string,string>'},\n",
"]\n",
"\n",
"# Partition columns mirror the key=value folders used in the S3 object keys.\n",
"partition_keys = [\n",
"    {'Name': 'year', 'Type': 'int'},\n",
"    {'Name': 'month', 'Type': 'int'},\n",
"    {'Name': 'day', 'Type': 'int'},\n",
"    {'Name': 'version', 'Type': 'string'},\n",
"]\n",
"\n",
"storage_descriptor = {\n",
"    'Columns': table_columns,\n",
"    'Location': 's3://{}/{}/namespace={}/'.format(S3_BUCKET, RECORD_NAME, NAMESPACE),\n",
"    'InputFormat': 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat',\n",
"    'OutputFormat': 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat',\n",
"    'Compressed': False,\n",
"    'NumberOfBuckets': -1,\n",
"    'SerdeInfo': {\n",
"        'SerializationLibrary': 'org.apache.hadoop.hive.serde2.avro.AvroSerDe',\n",
"        'Parameters': {\n",
"            'avro.schema.literal': schema_literal,\n",
"            'serialization.format': '1',\n",
"        },\n",
"    },\n",
"    'BucketColumns': [],\n",
"    'SortColumns': [],\n",
"}\n",
"\n",
"glue_client.create_table(\n",
"    DatabaseName=TEST_DB,\n",
"    TableInput={\n",
"        'Name': TEST_TABLE_NAME,\n",
"        'Owner': 'owner',\n",
"        'StorageDescriptor': storage_descriptor,\n",
"        'PartitionKeys': partition_keys,\n",
"        'TableType': 'EXTERNAL_TABLE',\n",
"        'Parameters': {\n",
"            'avro.schema.literal': schema_literal,\n",
"            'classification': 'avro',\n",
"            'compressionType': 'none',\n",
"        },\n",
"    },\n",
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
"### Now you can find the table in the Glue Data Catalog and query it with Athena (remember about partitions)\n",
|
|
"\n",
|
|
"```\n",
|
|
"SELECT * , paymentdetails.groupingid , \"$path\"\n",
|
|
"FROM \"avro_uam_test\" as a \n",
|
|
"CROSS JOIN UNNEST(array_of_strings) as t(names)\n",
|
|
"where customerid = '0d6f913f-9364-4898-875e-d07311d1e300' and day = 1\n",
|
|
"```"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.8"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|