Pandas cheatsheet

This is my Pandas cheatsheet.

Note that I import pandas the 'standard' way:

import pandas as pd

Convert with dataframes

Create dataframe from a dictionary

character_df = pd.DataFrame.from_dict(characters)
characters = character_df.to_dict(orient='records')

Convert CSV to dataframe

character_df = pd.DataFrame.from_csv("characters.csv", sep='\t', encoding='utf-8')
character_df.to_csv('characters.csv', sep='\t', encoding='utf-8')

Convert dataframe to JSON

character_df = pd.DataFrame.from_json('characters.json')
character_df.to_json('characters.json', orient='records')

Convert dataframe to pickle

character_df = pd.read_pickle('characters.pandas')
character_df.to_pickle('characters.pandas')

Convert database query to dataframe

db = create_engine('postgresql://%s:%s@%s:%d/characters' % (POSTGRES_USER, POSTGRES_PASS, POSTGRES_HOST, POSTGRES_PORT))
character_df = pd.read_sql_query('SELECT * FROM "character_collection"', con=db)

Cleaning dataframes

Replace in column

character_df['name'] = character_df['name'].str.replace('-', ' ')

Regex replace in whole dataframe

character_df.replace(r'-',r' ', regex=True, inplace=True)

Regex extract in column

character_df['introduction_year'] = character_df['date_of_introduction'].str.extract('(\d{4})-..-..', expand=True)

Remove all Not-a-Numbers

character_df = character_df.replace({'NaN': None}, regex=True)

Rename a column

character_df.rename(columns={'name': 'character_name'}, inplace=True)

Or replace characters:

character_df.columns = character_df.columns.str.replace('.', '_')

Drop a column

character_df = character_df.drop('origin', axis=1)

Drop a row

Drop all rows where the name is NaN.

character_df.dropna(subset=['name'], inplace=True)

Delete a column

del character_df['special_diet']

Convert to integer

character_df['introduction_year'] = character_df['introduction_year'].fillna(-1).astype('int64')

Convert to category

character_df['superpower'] = character_df['superpower'].astype('category')

Convert string to list

# Convert a string with surrounding brackets and quotes to a list
def convert_string_to_list(column):
    """ Convert unicode string to list """
    return column.str.strip('{}').astype(str).apply(lambda x: x.split(',')[0].strip("\"") if len(x) > 0 else "")
character_df['superpowers'] = convert_string_to_list(character_df['superpowers'])

Create column from index

character_df.index.names = ['Name']
character_df = character_df.reset_index()

Extend dictionary cell to columns

df = pd.concat([df.drop(['meta'], axis=1), df['meta'].apply(pd.Series)], axis=1)

Find data

Describe the data

character_df['age'].describe()

Unique values

characters = character_df['character_name'].unique()

Field contains

character_df[character_df['name'].str.contains("Koopa").fillna(False)]

Count by

character_df.groupby(['superpowers']).count()

Loop through data

for element in character_df.index:
    superpower = character_df.iloc[element]['superpower']
    if not pd.isnull(superpower):
       print 'Super!'

Substract

Substract two consecutive cells

df['difference'] = df['amount'] - df['amount'].shift(+1)

Add a maximum column for a groupby

df['group_maximum'] = df.groupby(['category'])['score'].transform(max)

Get maximum 10

df.groupby(['category'])['viewers'].sum().nlargest(10)

Create category based on values

def set_category(row):
    if row['score'] < float(row['maximum']/3):
        return 'beginner'
    elif row['score'] >= float(row['maximum']/3*2):
        return 'expert' 
    else:
        return 'intermediate'

df['category'] = df.apply(set_category, axis=1)

Apply lambda function

df['inverse_number'] = df['number'].apply(lambda x: x**(-1))

Sort values

df.sort_values('name', ascending=False)

Normalize a JSON column

pd.io.json.json_normalize(df['json_col'])

Select data

df[df.name.notnull()]

or

df.query('name.notnull()',
         engine='python')

Expand cell with list to rows

df['list_cells']\
    .apply(pd.Series)\
    .stack()\
    .reset_index(level=1,
                 drop=True)\
    .to_frame('list_cell')

Find and drop empty columns

empty_cols = [col for col in df.columns if df[col].isnull().all()]
df.drop(empty_cols,
        axis=1,
        inplace=True)

Merge two dataframe

combined_data_df = first_df.merge(second_df,
                                  left_on='left_id',
                                  right_on='right_id',
                                  how='left')

Calculate difference between two consecutive rows

df['diff'] = df['amount']\
    .diff()\
    .fillna(0)

Filter each column larger than threshold

THRESHOLD = 100
df[df.gt(THRESHOLD).all(axis=1)].sort_values('total',
                                             ascending=False)