chmura/jupyter/UAM_2_parquet.ipynb

607 lines
3.2 MiB
Plaintext
Raw Normal View History

2021-04-24 15:54:14 +02:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2 Format danych Parquet\n",
"\n",
"* wykonaj wszystkie kroki z ćwiczenia 1\n",
"* przekonwertuj pliki AVRO do Parquet z wykorzystaniem bibliotek pandas i pandavro\n",
"* zapisz pliki na S3 w osobnej ścieżce\n",
"* zarejestruj tabelę GLUE z wykorzystaniem boto3 / crawlera\n",
"* odśwież partycje (MSCK REPAIR w Athena/Load partitions na tabeli)\n",
"* sprawdź efektywność zapytań agregujących - np AVG(Age) - porównaj wydajność (ilosć przeskanowanych danych = koszt & czas)\n",
"\n",
"Przykłady kwerened do testów :\n",
"\n",
"SELECT AVG(age) FROM \"uam\".\"avro_uam_test\" where day = '1'\n",
"\n",
"SELECT AVG(age) FROM \"uam\".\"parquet_uam_test\" where day = '1'\n",
"\n",
"## Pamiętaj aby po skończonych ćwiczeniach usunąć wszystkie obiekty\n",
"### Uwaga !!! poniższy skrypt tworzy obiekty w regionie HongKong !\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import boto3\n",
"import io \n",
"import json\n",
" \n",
"REGION = \"us-east-1\"\n",
"\n",
"\n",
"session_kwargs = {\n",
"\n",
" \"aws_access_key_id\":\"\",\n",
" \"aws_secret_access_key\":\"\",\n",
" \"aws_session_token\":\"\",\n",
" \"region_name\": REGION\n",
"}\n",
"session = boto3.Session(**session_kwargs)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
" \n",
"S3_BUCKET = \"datalake-dev-920628590621-us-east-1\"\n",
"\n",
"TEST_DB = 'datalake_dev_jk'\n",
"TEST_TABLE_NAME = 'parquet_uam_test'\n",
"EVENT_NAME = \"UamTestEvent\"\n",
"\n",
"PARQUET_PREFIX = \"Parquet_UamTestEvent\"\n",
"NAMESPACE = \"com.uam.datalake.v1\"\n",
"\n",
"def tear_down_test_table(database=TEST_DB, table_name=TEST_TABLE_NAME):\n",
" tbl_list = [x[\"Name\"] for x in glue_client.get_tables(DatabaseName=database)[\"TableList\"]]\n",
" if table_name in tbl_list:\n",
" glue_client.delete_table(DatabaseName=database,Name=table_name)\n",
" print(\"test table {} deleted\".format(table_name))\n",
" else:\n",
" print(\"tbl %s not found\" % table_name)\n",
" \n",
"def tear_down_s3(bucket=S3_BUCKET,prefix=PARQUET_PREFIX):\n",
" s3 = boto3.resource('s3',**session_kwargs)\n",
" bucket = s3.Bucket(bucket)\n",
" bucket.objects.filter(Prefix=prefix).delete()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=1/version=1.0.2/CustData_d6f44c22-5c87-4e14-956b-ad7d985226d0.avro',\n",
" 'UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=10/version=1.0.2/CustData_4e5dc9b8-6eb7-4fa4-93e0-61faf874b698.avro',\n",
" 'UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=2/version=1.0.2/CustData_cf0edf74-e76e-458f-a1d2-e092275a719c.avro',\n",
" 'UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=3/version=1.0.2/CustData_f9ed4ad7-ed1c-4431-b9ab-317387cdb5af.avro',\n",
" 'UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=4/version=1.0.2/CustData_d89b1c7f-66e3-4522-a603-1630abbf24fa.avro',\n",
" 'UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=5/version=1.0.2/CustData_92483d0c-5254-4825-9bbb-59545fc6e4dd.avro',\n",
" 'UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=6/version=1.0.2/CustData_3473649b-97c5-4597-965b-672a11cdad73.avro',\n",
" 'UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=7/version=1.0.2/CustData_ecc999f8-eda7-4782-b844-17809980f34c.avro',\n",
" 'UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=8/version=1.0.2/CustData_b95fd45c-8f0d-4612-8f8b-131437895013.avro',\n",
" 'UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=9/version=1.0.2/CustData_5a5ef6ba-1576-4450-82a0-3f6f9a10a7c8.avro']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s3_client = session.client(\"s3\")\n",
"glue_client = session.client(\"glue\")\n",
"\n",
"avro_files = [keys[\"Key\"] for keys in s3_client.list_objects(Bucket=S3_BUCKET, Prefix=EVENT_NAME)[\"Contents\"]]\n",
"avro_files"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import tempfile\n",
"import os\n",
"import pandavro as pdx\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"obj = s3_client.get_object(Bucket=S3_BUCKET, Key=avro_files[0])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"record_raw = obj['Body'].read()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"b'Obj\\x01\\x04\\x14avro.codec\\x08null\\x16avro.schema\\x92\\x1a{\"type\": \"record\", \"mox-meta\": {\"version\": \"1.0.2\", \"type\": \"ENTITY_SNAPSHOT\"}, \"name\": \"UamTestEvent\", \"namespace\": \"com.uam.datalake.v1\", \"fields\": [{\"type\": {\"type\": \"string\", \"avro.java.string\": \"String\"}, \"name\": \"customerId\"}, {\"type\": \"boolean\", \"name\": \"isActive\", \"doc\": \"a boolean flag if the Customer is active\"}, {\"type\": \"int\", \"name\": \"age\"}, {\"type\": \"float\", \"name\": \"balance\"}, {\"type\": {\"type\": \"bytes\", \"logicalType\": \"decimal\", \"precision\": 20, \"scale\": 4}, \"name\": \"accountBalance_logical_dec\"}, {\"type\": [\"null\", {\"type\": \"array\", \"items\": {\"type\": \"string\", \"avro.java.string\": \"String\"}}], \"name\": \"array_of_strings\", \"default\": null}, {\"type\": [\"null\", {\"type\": \"record\", \"name\": \"PaymentDetails\", \"namespace\": \"com.uam.datalake.v1\", \"fields\": [{\"type\": [\"null\", {\"type\": \"string\", \"avro.java.string\": \"String\"}], \"name\": \"counterPartyName\", \"default\": null}, {\"type\": [\"null\", {\"type\": \"string\", \"avro.java.string\": \"String\"}], \"name\": \"groupingId\", \"default\": null}, {\"type\": [\"null\", {\"type\": \"string\", \"avro.java.string\": \"String\"}], \"name\": \"payeeId\", \"default\": null}, {\"type\": [\"null\", {\"type\": \"string\", \"avro.java.string\": \"String\"}], \"name\": \"message\", \"default\": null}, {\"type\": {\"type\": \"enum\", \"name\": \"PaymentType\", \"namespace\": \"com.uam.datalake.v1\", \"symbols\": [\"UNKNOWN\", \"ONE\", \"TWO\"]}, \"name\": \"type\"}, {\"type\": [\"null\", {\"type\": \"string\", \"avro.java.string\": \"String\"}], \"name\": \"otherAccountId\", \"default\": null}]}], \"name\": \"paymentDetails\", \"default\": null}, {\"type\": [\"null\", {\"type\": \"map\", \"avro.java.string\": \"String\", \"values\": {\"type\": \"string\", \"avro.java.string\": \"String\"}}], \"name\": \"parameters\", \"default\": null}]}\\x00\\x13\\xc6\\xe5<(\\xf5\\xf2\\x14,\"zIF\\x8c\\xe0<\\x94\\x01\\xf2\\xfb\\x01Hcc733c92-6853-45f6-8e49-bec741188ebb\\x01tU_EB\\n7L\\xbc\\xff\\xf3\\x02\\x02\\x0eRebecca\\x00\\x02\\x02Hcc733c92-6853-45f6-8e49-bec741188ebb\\x02H9626bf79-2f97-4c0c-9aae-de080adab7df\\x00\\x00\\x02\\x02H69261bc2-4a71-4de7-bc8b-1beb0d9320ac\\x02\\x04\\x08key1\\x0cvalue1\\x08key2\\x0cvalue2\\x00H7217d7d2-6f24-4bf5-942d-3e4cf15982c1\\x01x\\x93\\x19\\x8aB\\n\\x01h\"\\xa0\\xa7\\x02\\x02\\x0cAmanda\\x00\\x02\\x02H7217d7d2-6f24-4bf5-942d-3e4cf15982c1\\x02Hdae78e5b-bfe0-43f8-bbf8-38407019f6a5\\x00\\x00\\x02\\x02H96b3cbbe-fc2d-4e29-8344-6578a5e24f10\\x02\\x04\\x08key1\\x0cvalue1\\x08key2\\x0cvalue2\\x00Hd9117f52-3839-4641-a470-7de16f437d8b\\x00@\\xcbL\\xf0A\\n\\xc3>\\xde\\xf9J\\x02\\x04\\nNancy\\x0eMichael\\x00\\x02\\x02Hd9117f52-3839-4641-a470-7de16f437d8b\\x02Hc7815ad2-1180-4b64-9b8c-d60663eb8ba5\\x00\\x00\\x02\\x02H31fb5bd1-9e65-461e-81bf-bb1e55ebea32\\x02\\x04\\x08key1\\x0cvalue1\\x08key2\\x0cvalue2\\x00H6527fe9d-4eb8-4d82-bb8b-01321086a9ed\\x01.\\x9d\\xf6\\xb9B\\x0c\\xff)\\xaf\\x9e\\xaf\\xed\\x02\\x08\\x0cMeghan\\nKevin\\x0eTimothy\\x10Angelica\\x00\\x02\\x02H6527fe9d-4eb8-4d82-bb8b-01321086a9ed\\x02Hffcecb26-d2cc-442f-973c-83f2c1150137\\x00\\x00\\x02\\x02H0b315ed3-4f05-4a54-a084-050fdcd434ac\\x02\\x04\\x08key1\\x0cvalue1\\x08key2\\x0cvalue2\\x00H0d6f913f-9364-4898-875e-d07311d1e300\\x00r\\x0fpOB\\x0c\\xffT8\\t\\xd31\\x02\\x06\\x0eAllison\\nJames\\x10Danielle\\x00\\x02\\x02H0d6f913f-9364-4898-875e-d07311d1e300\\x02Hcd8d0edf-4711-474b-8a2b-c3f5b51906d5\\x00\\x00\\x02\\x02Hd5e111f0-ab7b-4997-955d-a3aa10d44ccd\\x02\\x04\\x08key1\\x0cvalue1\\x08key2\\x0cvalue2\\x00Heac4e114-cb83-4542-955b-b3a0d83ad014\\x01z\\x01 \\xb9A\\n\\x05U\\xfb\\x9a\\xb5\\x02\\x06\\x10Nicholas\\x0cJoshua\\x0cAmanda\\x00\\x02\\x02Heac4e114-cb83-4542-955b-b3a0d83ad014\\x02Hd5de4604-609c-4362-bbf8-615a90c831e2\\x00\\x00\\x02\\x02Ha7ca2fbd-5443-4ac9-8cf0-c9d5e5a943c3\\x02\\x04\\x08key1\\x0cvalue1\\x08key2\\x0cvalue2\\x00H3708c492-8607-45d2-932d-b9fafdf2b380\\x00\\x88\\x01\\xa9\\x11 B\\n[$\\xb0!\\x91
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"record_raw"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'type': 'record', 'mox-meta': {'version': '1.0.2', 'type': 'ENTITY_SNAPSHOT'}, 'name': 'UamTestEvent', 'namespace': 'com.uam.datalake.v1', 'fields': [{'type': {'type': 'string', 'avro.java.string': 'String'}, 'name': 'customerId'}, {'type': 'boolean', 'name': 'isActive', 'doc': 'a boolean flag if the Customer is active'}, {'type': 'int', 'name': 'age'}, {'type': 'float', 'name': 'balance'}, {'type': {'type': 'bytes', 'logicalType': 'decimal', 'precision': 20, 'scale': 4}, 'name': 'accountBalance_logical_dec'}, {'type': ['null', {'type': 'array', 'items': {'type': 'string', 'avro.java.string': 'String'}}], 'name': 'array_of_strings', 'default': None}, {'type': ['null', {'type': 'record', 'name': 'PaymentDetails', 'namespace': 'com.uam.datalake.v1', 'fields': [{'type': ['null', {'type': 'string', 'avro.java.string': 'String'}], 'name': 'counterPartyName', 'default': None}, {'type': ['null', {'type': 'string', 'avro.java.string': 'String'}], 'name': 'groupingId', 'default': None}, {'type': ['null', {'type': 'string', 'avro.java.string': 'String'}], 'name': 'payeeId', 'default': None}, {'type': ['null', {'type': 'string', 'avro.java.string': 'String'}], 'name': 'message', 'default': None}, {'type': {'type': 'enum', 'name': 'PaymentType', 'namespace': 'com.uam.datalake.v1', 'symbols': ['UNKNOWN', 'ONE', 'TWO']}, 'name': 'type'}, {'type': ['null', {'type': 'string', 'avro.java.string': 'String'}], 'name': 'otherAccountId', 'default': None}]}], 'name': 'paymentDetails', 'default': None}, {'type': ['null', {'type': 'map', 'avro.java.string': 'String', 'values': {'type': 'string', 'avro.java.string': 'String'}}], 'name': 'parameters', 'default': None}]}\n"
]
}
],
"source": [
"from avro.datafile import DataFileReader\n",
"from avro.io import DatumReader\n",
"\n",
"reader = DataFileReader(io.BytesIO(record_raw), DatumReader())\n",
"avro_schema = json.loads(reader.meta[\"avro.schema\"])\n",
"reader.close()\n",
"\n",
"print(avro_schema)\n",
"\n",
"RECORD_NAME = avro_schema[\"name\"]\n",
"NAMESPACE = avro_schema[\"namespace\"]\n",
"VERSION = avro_schema[\"mox-meta\"][\"version\"]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"f = io.BytesIO(record_raw)\n",
"f.seek(0)\n",
"\n",
"df = pdx.read_avro(f)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(10000, 8)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customerId</th>\n",
" <th>isActive</th>\n",
" <th>age</th>\n",
" <th>balance</th>\n",
" <th>accountBalance_logical_dec</th>\n",
" <th>array_of_strings</th>\n",
" <th>paymentDetails</th>\n",
" <th>parameters</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>cc733c92-6853-45f6-8e49-bec741188ebb</td>\n",
" <td>True</td>\n",
" <td>58</td>\n",
" <td>49.343098</td>\n",
" <td>23751065.5987</td>\n",
" <td>[Rebecca]</td>\n",
" <td>{'counterPartyName': 'cc733c92-6853-45f6-8e49-...</td>\n",
" <td>{'key1': 'value1', 'key2': 'value2'}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>7217d7d2-6f24-4bf5-942d-3e4cf15982c1</td>\n",
" <td>True</td>\n",
" <td>60</td>\n",
" <td>69.049950</td>\n",
" <td>604206.7111</td>\n",
" <td>[Amanda]</td>\n",
" <td>{'counterPartyName': '7217d7d2-6f24-4bf5-942d-...</td>\n",
" <td>{'key1': 'value1', 'key2': 'value2'}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>d9117f52-3839-4641-a470-7de16f437d8b</td>\n",
" <td>False</td>\n",
" <td>32</td>\n",
" <td>30.037497</td>\n",
" <td>-26093820.4854</td>\n",
" <td>[Nancy, Michael]</td>\n",
" <td>{'counterPartyName': 'd9117f52-3839-4641-a470-...</td>\n",
" <td>{'key1': 'value1', 'key2': 'value2'}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>6527fe9d-4eb8-4d82-bb8b-01321086a9ed</td>\n",
" <td>True</td>\n",
" <td>23</td>\n",
" <td>92.981667</td>\n",
" <td>-92047155.6115</td>\n",
" <td>[Meghan, Kevin, Timothy, Angelica]</td>\n",
" <td>{'counterPartyName': '6527fe9d-4eb8-4d82-bb8b-...</td>\n",
" <td>{'key1': 'value1', 'key2': 'value2'}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0d6f913f-9364-4898-875e-d07311d1e300</td>\n",
" <td>False</td>\n",
" <td>57</td>\n",
" <td>51.859432</td>\n",
" <td>-73779420.6927</td>\n",
" <td>[Allison, James, Danielle]</td>\n",
" <td>{'counterPartyName': '0d6f913f-9364-4898-875e-...</td>\n",
" <td>{'key1': 'value1', 'key2': 'value2'}</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" customerId isActive age balance \\\n",
"0 cc733c92-6853-45f6-8e49-bec741188ebb True 58 49.343098 \n",
"1 7217d7d2-6f24-4bf5-942d-3e4cf15982c1 True 60 69.049950 \n",
"2 d9117f52-3839-4641-a470-7de16f437d8b False 32 30.037497 \n",
"3 6527fe9d-4eb8-4d82-bb8b-01321086a9ed True 23 92.981667 \n",
"4 0d6f913f-9364-4898-875e-d07311d1e300 False 57 51.859432 \n",
"\n",
" accountBalance_logical_dec array_of_strings \\\n",
"0 23751065.5987 [Rebecca] \n",
"1 604206.7111 [Amanda] \n",
"2 -26093820.4854 [Nancy, Michael] \n",
"3 -92047155.6115 [Meghan, Kevin, Timothy, Angelica] \n",
"4 -73779420.6927 [Allison, James, Danielle] \n",
"\n",
" paymentDetails \\\n",
"0 {'counterPartyName': 'cc733c92-6853-45f6-8e49-... \n",
"1 {'counterPartyName': '7217d7d2-6f24-4bf5-942d-... \n",
"2 {'counterPartyName': 'd9117f52-3839-4641-a470-... \n",
"3 {'counterPartyName': '6527fe9d-4eb8-4d82-bb8b-... \n",
"4 {'counterPartyName': '0d6f913f-9364-4898-875e-... \n",
"\n",
" parameters \n",
"0 {'key1': 'value1', 'key2': 'value2'} \n",
"1 {'key1': 'value1', 'key2': 'value2'} \n",
"2 {'key1': 'value1', 'key2': 'value2'} \n",
"3 {'key1': 'value1', 'key2': 'value2'} \n",
"4 {'key1': 'value1', 'key2': 'value2'} "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(df.shape)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"df.to_parquet(\"local_file.parquet\", compression='gzip')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Parquet_UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=1/version=1.0.2/CustData_d6f44c22-5c87-4e14-956b-ad7d985226d0.parquet uploaded\n",
"Parquet_UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=10/version=1.0.2/CustData_4e5dc9b8-6eb7-4fa4-93e0-61faf874b698.parquet uploaded\n",
"Parquet_UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=2/version=1.0.2/CustData_cf0edf74-e76e-458f-a1d2-e092275a719c.parquet uploaded\n",
"Parquet_UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=3/version=1.0.2/CustData_f9ed4ad7-ed1c-4431-b9ab-317387cdb5af.parquet uploaded\n",
"Parquet_UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=4/version=1.0.2/CustData_d89b1c7f-66e3-4522-a603-1630abbf24fa.parquet uploaded\n",
"Parquet_UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=5/version=1.0.2/CustData_92483d0c-5254-4825-9bbb-59545fc6e4dd.parquet uploaded\n",
"Parquet_UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=6/version=1.0.2/CustData_3473649b-97c5-4597-965b-672a11cdad73.parquet uploaded\n",
"Parquet_UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=7/version=1.0.2/CustData_ecc999f8-eda7-4782-b844-17809980f34c.parquet uploaded\n",
"Parquet_UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=8/version=1.0.2/CustData_b95fd45c-8f0d-4612-8f8b-131437895013.parquet uploaded\n",
"Parquet_UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=9/version=1.0.2/CustData_5a5ef6ba-1576-4450-82a0-3f6f9a10a7c8.parquet uploaded\n"
]
}
],
"source": [
"def save_df(df, full_path, output_file_name):\n",
" \n",
" parquet_key_name = '/'.join([full_path, output_file_name]) \n",
" with tempfile.TemporaryDirectory() as tmpdirname:\n",
" local_file = os.path.join(tmpdirname, output_file_name)\n",
" \n",
" df.to_parquet(local_file, compression='gzip')\n",
" \n",
" try:\n",
" s3_client.upload_file(local_file, S3_BUCKET, parquet_key_name)\n",
" print(\"{} uploaded\".format(parquet_key_name))\n",
" \n",
" except ClientError as e:\n",
" print(e) \n",
" \n",
" \n",
"\n",
"for every_avro in avro_files: \n",
" \n",
" obj = s3_client.get_object(Bucket=S3_BUCKET, Key=every_avro)\n",
" record_raw = obj['Body'].read()\n",
"\n",
" f = io.BytesIO(record_raw)\n",
" f.seek(0)\n",
" \n",
" df = pdx.read_avro(f)\n",
" \n",
"\n",
" full_path = '/'.join(every_avro.split(\"/\")[:-1]).replace('avro','parquet').replace(\"UamTestEvent\",PARQUET_PREFIX)\n",
" output_file_name = every_avro.split(\"/\")[-1].replace('avro','parquet')\n",
" \n",
" save_df(df, full_path, output_file_name)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"test table parquet_uam_test deleted\n"
]
},
{
"data": {
"text/plain": [
"{'ResponseMetadata': {'RequestId': 'aa815163-ae98-4e66-8bae-a7c376daead6',\n",
" 'HTTPStatusCode': 200,\n",
" 'HTTPHeaders': {'date': 'Sat, 24 Apr 2021 11:59:42 GMT',\n",
" 'content-type': 'application/x-amz-json-1.1',\n",
" 'content-length': '2',\n",
" 'connection': 'keep-alive',\n",
" 'x-amzn-requestid': 'aa815163-ae98-4e66-8bae-a7c376daead6'},\n",
" 'RetryAttempts': 0}}"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def tear_down_test_table(database=TEST_DB, table_name=TEST_TABLE_NAME):\n",
" tbl_list = [x[\"Name\"] for x in glue_client.get_tables(DatabaseName=database)[\"TableList\"]]\n",
" if table_name in tbl_list:\n",
" glue_client.delete_table(DatabaseName=database,Name=table_name)\n",
" print(\"test table {} deleted\".format(table_name))\n",
" else:\n",
" print(\"tbl %s not found\" % table_name)\n",
" \n",
"tear_down_test_table() # create or replace\n",
"\n",
"glue_client.create_table(\n",
" DatabaseName=TEST_DB,\n",
" TableInput={\n",
" \"Name\" : TEST_TABLE_NAME,\n",
" \n",
" 'Owner': 'owner',\n",
" 'PartitionKeys': [\n",
" {'Name': 'year', 'Type': 'int'},\n",
" {'Name': 'month', 'Type': 'int'},\n",
" {'Name': 'day', 'Type': 'int'},\n",
" {'Name': 'version', 'Type': 'string'}],\n",
" 'Retention': 0,\n",
" 'StorageDescriptor': {'BucketColumns': [],\n",
" 'Columns': [{'Name': 'customerid', 'Type': 'string'},\n",
" {'Name': 'isactive', 'Type': 'boolean'},\n",
" {'Name': 'age', 'Type': 'bigint'},\n",
" {'Name': 'balance', 'Type': 'double'},\n",
" {'Name': 'accountbalance_logical_dec',\n",
" 'Type': 'decimal(12,4)'},\n",
" {'Name': 'array_of_strings',\n",
" 'Type': 'array<string>'},\n",
" {'Name': 'paymentdetails',\n",
" 'Type': 'struct<counterPartyName:string,groupingId:string,message:int,otherAccountId:string,payeeId:int,type:string>'},\n",
" {'Name': 'parameters',\n",
" 'Type': 'struct<key1:string,key2:string>'}],\n",
" 'Compressed': True,\n",
" 'InputFormat': 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat',\n",
" 'Location': 's3://{}/{}/namespace={}/'.format(S3_BUCKET,PARQUET_PREFIX,NAMESPACE),\n",
" 'NumberOfBuckets': -1,\n",
" 'OutputFormat': 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat',\n",
"\n",
" 'SerdeInfo': {'Parameters': {'serialization.format': '1'},\n",
" 'SerializationLibrary': 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'},\n",
" 'SortColumns': [],\n",
" 'StoredAsSubDirectories': False},\n",
" 'TableType': 'EXTERNAL_TABLE'\n",
" }\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"SELECT customerid , \"$path\"\n",
"FROM \"parquet_uam_test\" as a \n",
"where customerid = '0d6f913f-9364-4898-875e-d07311d1e300' and day = 1\n",
"\n",
"\n",
"SELECT customerid , \"$path\"\n",
"FROM \"avro_uam_test\" as a \n",
"where customerid = '0d6f913f-9364-4898-875e-d07311d1e300' and day = 1"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"loc = f's3://{S3_BUCKET}/{PARQUET_PREFIX}/namespace=com.uam.datalake.v1/year=2020/month=2/day=1/version=1.0.2/'\n",
"\n",
"response = glue_client.create_partition(\n",
" \n",
" DatabaseName=TEST_DB,\n",
" TableName=TEST_TABLE_NAME,\n",
" PartitionInput={\n",
" 'Values': [\n",
" '2020','2','1', '1.0.2'\n",
" ], \n",
" 'StorageDescriptor': {\n",
" 'Location': loc,\n",
" 'InputFormat': 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat',\n",
" 'OutputFormat': 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat',\n",
" 'Compressed': False,\n",
" 'NumberOfBuckets': -1,\n",
" 'SerdeInfo': {'SerializationLibrary': 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'},\n",
" 'BucketColumns': [],\n",
" \n",
" }\n",
" \n",
" \n",
" }\n",
")"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"# alternative for glue table & partition registration\n",
"\n",
"CREATE EXTERNAL TABLE parquet_uam(\n",
" `customerid` string, \n",
" `isactive` boolean, \n",
" `age` bigint, \n",
" `balance` double, \n",
" `accountbalance_logical_dec` decimal(12,4), \n",
" `array_of_strings` array<string>, \n",
" `paymentdetails` struct<counterPartyName:string,groupingId:string,message:int,otherAccountId:string,payeeId:int,type:string>, \n",
" `parameters` struct<key1:string,key2:string>)\n",
"PARTITIONED BY ( \n",
" `year` int, \n",
" `month` int, \n",
" `day` int, \n",
" `version` string)\n",
"ROW FORMAT SERDE \n",
" 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' \n",
"STORED AS INPUTFORMAT \n",
" 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' \n",
"OUTPUTFORMAT \n",
" 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'\n",
"LOCATION\n",
" 's3://datalake-dev-920628590621-us-east-1/Parquet_UamTestEvent/namespace=com.uam.datalake.v1/'"
]
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"ALTER TABLE parquet_uam ADD\n",
" PARTITION (year='2020', month='2', day='1', version='1.0.2') \n",
" LOCATION 's3://datalake-dev-920628590621-us-east-1/Parquet_UamTestEvent/namespace=com.uam.datalake.v1/year=2020/month=2/day=1/version=1.0.2/'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}