Spark cheatsheet
Import PySpark
import pyspark
Setup SparkSession
# Build a SparkSession against a local master ("local[*]") with Hive
# support enabled; getOrCreate() hands back an existing session if one
# is already running instead of creating a second one.
spark = (
    pyspark.sql.SparkSession.builder
    .master("local[*]")
    .enableHiveSupport()
    .getOrCreate()
)
Read data
# Load "mydata.json" into a Spark DataFrame.
# NOTE: per the PySpark docs, the reader expects one JSON object per line
# by default; a single multi-line JSON document needs multiLine=True.
json_sdf = spark.read.json("mydata.json")
Convert Spark DataFrame to Pandas DataFrame
# Convert the Spark DataFrame into a pandas DataFrame.
# NOTE: per the PySpark docs, toPandas() loads all rows into the driver's
# memory, so it should only be used when the result is expected to be small.
json_pdf = json_sdf.toPandas()
Convert PySpark row to dictionary
# Return the Row's fields as a plain dict; recursive=True also converts
# nested Rows into dicts.
# `row` is presumably a pyspark.sql.Row obtained elsewhere (e.g. from
# collect()) - it is not defined in this snippet.
row.asDict(recursive=True)
Join two dataframes
import pyspark.sql.functions as F
# Left-join df_01 with df_02 on their `id` columns. The aliases let each
# side's `id` be referenced unambiguously once both exist in the result.
df = (
    df_01.alias('dfone')
    .join(
        df_02.alias('dftwo'),
        on=[F.col('dfone.id') == F.col('dftwo.id')],
        how='left',
    )
    # Dropping by the bare name 'id' would remove BOTH id columns and lose
    # the join key; drop only the right-hand duplicate so the left `id`
    # stays available to later selects.
    .drop(F.col('dftwo.id'))
)
Select fields from dataframe
# Project the four listed columns and print the resulting rows.
df.select(['id', 'name', 'country', 'amount']).show()
Expand JSON
# Cast `_json_col` to a string, parse it with from_json() using
# `json_schema` (defined elsewhere), store the result in a new `json`
# column and print the rows.
json_text = F.col('_json_col').cast('string')
df.withColumn('json', F.from_json(json_text, json_schema)).show()
Casting a datatype
from pyspark.sql.types import IntegerType
# Cast the `count` column to an integer type.
# withColumn() returns a new DataFrame rather than modifying `dataframe`
# in place, so the result must be rebound or the cast is silently discarded.
dataframe = dataframe.withColumn("count", F.col("count").cast(IntegerType()))