Pandas cheatsheet
This is my Pandas cheatsheet.
Note that I import pandas the 'standard' way:
import pandas as pd
Convert with dataframes
Create dataframe from a dictionary
character_df = pd.DataFrame.from_dict(characters)
characters = character_df.to_dict(orient='records')
Convert CSV to dataframe
character_df = pd.DataFrame.from_csv("characters.csv", sep='\t', encoding='utf-8')
character_df.to_csv('characters.csv', sep='\t', encoding='utf-8')
Convert dataframe to JSON
character_df = pd.DataFrame.from_json('characters.json')
character_df.to_json('characters.json', orient='records')
Convert dataframe to pickle
character_df = pd.read_pickle('characters.pandas')
character_df.to_pickle('characters.pandas')
Convert database query to dataframe
db = create_engine('postgresql://%s:%s@%s:%d/characters' % (POSTGRES_USER, POSTGRES_PASS, POSTGRES_HOST, POSTGRES_PORT))
character_df = pd.read_sql_query('SELECT * FROM "character_collection"', con=db)
Cleaning dataframes
Replace in column
character_df['name'] = character_df['name'].str.replace('-', ' ')
Regex replace in whole dataframe
character_df.replace(r'-',r' ', regex=True, inplace=True)
Regex extract in column
character_df['introduction_year'] = character_df['date_of_introduction'].str.extract('(\d{4})-..-..', expand=True)
Remove all Not-a-Numbers
character_df = character_df.replace({'NaN': None}, regex=True)
Rename a column
character_df.rename(columns={'name': 'character_name'}, inplace=True)
Or replace characters:
character_df.columns = character_df.columns.str.replace('.', '_')
Drop a column
character_df = character_df.drop('origin', axis=1)
Drop a row
Drop all rows where the name is NaN.
character_df.dropna(subset=['name'], inplace=True)
Delete a column
del character_df['special_diet']
Convert to integer
character_df['introduction_year'] = character_df['introduction_year'].fillna(-1).astype('int64')
Convert to category
character_df['superpower'] = character_df['superpower'].astype('category')
Convert string to list
# Convert a string with surrounding brackets and quotes to a list
def convert_string_to_list(column):
""" Convert unicode string to list """
return column.str.strip('{}').astype(str).apply(lambda x: x.split(',')[0].strip("\"") if len(x) > 0 else "")
character_df['superpowers'] = convert_string_to_list(character_df['superpowers'])
Create column from index
character_df.index.names = ['Name']
character_df = character_df.reset_index()
Extend dictionary cell to columns
df = pd.concat([df.drop(['meta'], axis=1), df['meta'].apply(pd.Series)], axis=1)
Find data
Describe the data
character_df['age'].describe()
Unique values
characters = character_df['character_name'].unique()
Field contains
character_df[character_df['name'].str.contains("Koopa").fillna(False)]
Count by
character_df.groupby(['superpowers']).count()
Loop through data
for element in character_df.index:
superpower = character_df.iloc[element]['superpower']
if not pd.isnull(superpower):
print 'Super!'
Substract
Substract two consecutive cells
df['difference'] = df['amount'] - df['amount'].shift(+1)
Add a maximum column for a groupby
df['group_maximum'] = df.groupby(['category'])['score'].transform(max)
Get maximum 10
df.groupby(['category'])['viewers'].sum().nlargest(10)
Create category based on values
def set_category(row):
if row['score'] < float(row['maximum']/3):
return 'beginner'
elif row['score'] >= float(row['maximum']/3*2):
return 'expert'
else:
return 'intermediate'
df['category'] = df.apply(set_category, axis=1)
Apply lambda function
df['inverse_number'] = df['number'].apply(lambda x: x**(-1))
Sort values
df.sort_values('name', ascending=False)
Normalize a JSON column
pd.io.json.json_normalize(df['json_col'])
Select data
df[df.name.notnull()]
or
df.query('name.notnull()',
engine='python')
Expand cell with list to rows
df['list_cells']\
.apply(pd.Series)\
.stack()\
.reset_index(level=1,
drop=True)\
.to_frame('list_cell')
Find and drop empty columns
empty_cols = [col for col in df.columns if df[col].isnull().all()]
df.drop(empty_cols,
axis=1,
inplace=True)
Merge two dataframe
combined_data_df = first_df.merge(second_df,
left_on='left_id',
right_on='right_id',
how='left')
Calculate difference between two consecutive rows
df['diff'] = df['amount']\
.diff()\
.fillna(0)
Filter each column larger than threshold
THRESHOLD = 100
df[df.gt(THRESHOLD).all(axis=1)].sort_values('total',
ascending=False)