cwiczenia/broadcast.py

46 lines
1.4 KiB
Python

from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast
import logging
import datetime
import time
# Build (or reuse) the Spark session for this application; Hive support is
# enabled so that spark.sql() can query Hive tables.
spark = (
    SparkSession.builder
    .appName("broadcast")
    .enableHiveSupport()
    .getOrCreate()
)

# Log through the JVM-side log4j (reached via the session's py4j gateway)
# rather than Python's logging module. The logger is named after this module.
log4jLogger = spark._jvm.org.apache.log4j
logger = log4jLogger.LogManager.getLogger(__name__)
def _timestamp():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')


def _count_rtv_agd(joined):
    """Return the number of rows in *joined* whose category_level2 is "RTV i AGD"."""
    return joined.where(joined.category_level2 == "RTV i AGD").count()


# Run the same offers/categories join twice -- first as a plain join, then with
# an explicit broadcast() hint on the categories side -- and print the
# wall-clock start/stop timestamps of each run so they can be compared.
# NOTE: timestamps have one-second resolution.
try:
    logger.info("SPARKAPP START")

    # Run 1: plain join. The start timestamp is taken before the DataFrames
    # are defined, so it also covers the two spark.sql() calls.
    start1 = _timestamp()
    cat = spark.sql("select * from hrucinska.uam_categories")
    offers = spark.sql("select * from hrucinska.uam_offers")
    join_condition = cat.category_id == offers.category_leaf
    res = offers.join(cat, join_condition)
    print(_count_rtv_agd(res))
    stop1 = _timestamp()

    # Run 2: identical join, with the categories DataFrame marked for broadcast.
    start2 = _timestamp()
    res = offers.join(broadcast(cat), join_condition)
    print(_count_rtv_agd(res))
    stop2 = _timestamp()

    # Single-argument print() behaves the same on Python 2 and Python 3;
    # joining with " " reproduces the separator the print statement inserted.
    for label, moment in (
        ("czas rozpoczecia obliczen 1: ", start1),
        ("czas zakonczenia obliczen1: ", stop1),
        ("czas rozpoczecia obliczen 2: ", start2),
        ("czas zakonczenia obliczen 2: ", stop2),
    ):
        print(" ".join((label, moment)))

    # NOTE(review): presumably keeps the application alive so the Spark UI
    # can be inspected after the runs -- confirm before changing.
    time.sleep(180)
except Exception as inst:
    # Failures are reported at ERROR level (message text unchanged); the
    # exception is deliberately not re-raised, matching the original flow.
    logger.error("SPARKAPP ERROR {0}".format(inst))
finally:
    logger.info("SPARKAPP STOP")