15 - EMR
[1]:
import getpass
import time

import awswrangler as wr
import boto3
Enter your bucket name:
[2]:
import getpass
bucket = getpass.getpass()
··········································
Enter your Subnet ID:
[8]:
subnet = getpass.getpass()
························
Creating EMR Cluster
[9]:
cluster_id = wr.emr.create_cluster(subnet)
Uploading our PySpark script to Amazon S3
[10]:
script = """
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("docker-awswrangler").getOrCreate()
sc = spark.sparkContext
print("Spark Initialized")
"""
_ = boto3.client("s3").put_object(Body=script, Bucket=bucket, Key="test.py")
Submit PySpark step
[11]:
step_id = wr.emr.submit_step(cluster_id, command=f"spark-submit s3://{bucket}/test.py")
Wait Step
[12]:
while wr.emr.get_step_state(cluster_id, step_id) != "COMPLETED":
pass
Terminate Cluster
[13]:
wr.emr.terminate_cluster(cluster_id)