Creating a pandas DataFrame from Azure Table Storage

import pandas as pd
from azure.cosmosdb.table.tableservice import TableService

CONNECTION_STRING = "DUMMYSTRING"
SOURCE_TABLE = "DUMMYTABLE"

def set_table_service():
    """ Build and return a TableService bound to CONNECTION_STRING """
    service = TableService(connection_string=CONNECTION_STRING)
    return service

def get_dataframe_from_table_storage_table(table_service, filter_query):
    """ Load the records matching filter_query into a pandas dataframe """
    # The helper is a generator; pandas consumes it while building the frame
    records = get_data_from_table_storage_table(table_service, filter_query)
    return pd.DataFrame(records)

def get_data_from_table_storage_table(table_service, filter_query):
    """ Lazily yield the records of SOURCE_TABLE that match a filter

    Args:
        table_service: object exposing
            ``query_entities(table_name, filter=...)``.
        filter_query: filter expression, passed through unchanged as the
            ``filter`` keyword argument.

    Yields:
        Each record produced by ``table_service.query_entities``, in the
        order that call produces them.
    """
    # Delegate to the query iterator directly rather than re-yielding each
    # item by hand. Because this is a generator function, the query call
    # itself does not run until the caller starts iterating.
    yield from table_service.query_entities(SOURCE_TABLE, filter=filter_query)

# Create the service client, then load one partition into a dataframe
ts = set_table_service()
fq = "PartitionKey eq '12345'"
df = get_dataframe_from_table_storage_table(ts, fq)

Create a Spark DataFrame column with lag

Create a lagged column in a PySpark dataframe:

from pyspark.sql.functions import monotonically_increasing_id, lag
from pyspark.sql.window import Window

# Window ordered by the helper 'id' column that is added below
w = Window.orderBy("id")
# Column expression: the 'value' of the preceding row within that window
value_lag = lag('value').over(w)
# Attach the ordering id first, then the lagged values as 'prev_value'
df = (
    df.withColumn('id', monotonically_increasing_id())
      .withColumn('prev_value', value_lag)
)