C3-5: Matplotlib

L5: Matplotlib and Seaborn Part 1


Bar Charts

\
Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

%matplotlib inline

\
Read in CSV file

# Load pokemon.csv into a DataFrame and take a first look at it.
df = pd.read_csv('pokemon.csv')
print(df.shape)  # (number of rows, number of columns)
# Preview the first 10 rows; the frame is bound to `df` (there is no
# `pokemon` variable, so the original call raised NameError).
df.head(10)

\
Draw bar chart

# Bar chart: one bar per level of 'cat_var', bar height = number of rows.
sb.countplot(data = df, x = 'cat_var');

\
Change bar color to blue

# Take the first entry of seaborn's current palette and paint every bar
# with it, instead of giving each level its own color.
base_color = sb.color_palette()[0]
sb.countplot(x='cat_var', color=base_color, data=df)

\
Change bar order (nominal-type data)

# value_counts() lists levels from most to least frequent, so its index
# can be used directly as the bar order.
cat_order = df['cat_var'].value_counts().index
sb.countplot(x='cat_var', order=cat_order, color=base_color, data=df)

\
Change bar order (ordinal-type data)

# Declare the intended level order once, then convert the column to an
# ordered categorical so plots follow that order.
level_order = ['Alpha', 'Beta', 'Gamma', 'Delta']
ordered_cat = pd.api.types.CategoricalDtype(categories=level_order, ordered=True)
df['cat_var'] = df['cat_var'].astype(ordered_cat)

\
Horizontal bar chart

# Passing the variable as `y` instead of `x` draws the bars horizontally.
sb.countplot(data = df, y = 'cat_var', color = base_color)

\
Change rotation of x-ticks via matplotlib's xticks function

# Draw the bar chart, then rotate the x tick labels so that long category
# names do not overlap. (The arguments are comma-separated; the original
# had a '.' after 'cat_var', which is a syntax error.)
sb.countplot(data = df, x = 'cat_var', color = base_color)
plt.xticks(rotation = 90)  # rotation is given in degrees

Absolute vs. relative frequency

\
Relative frequency: Calculate proportion

# Total number of rows, the size of the most frequent level, and the
# share of all rows that this level accounts for.
n_points = len(df)
max_count = df['cat_var'].value_counts().max()
max_prop = max_count / n_points

\
Relative frequency: Generate tick mark location and names

# Proportions 0, 0.05, 0.10, ... below max_prop, plus their two-decimal
# string labels for the axis.
tick_props = np.arange(0, max_prop, 0.05)
tick_names = [f'{v:0.2f}' for v in tick_props]

\
Create plot

# The bars are still drawn as counts; only the y axis is relabelled.
sb.countplot(x='cat_var', color=base_color, data=df)
# Place ticks at the counts that correspond to each proportion and show
# the proportion text there.
plt.yticks(tick_props * n_points, tick_names)
plt.ylabel('proportion')

\
Text annotations to label frequencies:
Create plot

# Base bar chart that the percentage annotations are drawn onto.
sb.countplot(data = df, x = 'cat_var', color = base_color)

\
Add annotations

# Quantities needed to label each bar with its share of all rows.
n_points = df.shape[0]
cat_counts = df['cat_var'].value_counts()
# plt.xticks has to be *called* to obtain the current tick locations and
# labels; unpacking the bare function object raises TypeError.
locs, labels = plt.xticks()

\
Loop through each pair of locations and labels

for loc, label in zip(locs, labels):
    # The tick label's text is the category name, which indexes cat_counts.
    count = cat_counts[label.get_text()]
    pct_string = f'{100 * count / n_points:0.1f}%'

    # Centre the white text 8 count-units below the top of the bar so it
    # sits inside the bar.
    plt.text(loc, count - 8, pct_string, color='w', ha='center')

Count missing data

\
Count missing data in each column

# isna() flags missing cells; summing the boolean frame gives the number
# of missing values per column (a Series indexed by column name).
na_counts = df.isna().sum()

\
Seaborn barplot: Depict a summary of one quantitative variable against levels of a second qualitative variable

# One bar per column, bar height = number of missing values in it.
base_color = sb.color_palette()[0]
# Pass x and y by keyword (current seaborn no longer accepts them
# positionally) and use the color defined above (`base3_color` was an
# undefined name).
sb.barplot(x = na_counts.index.values, y = na_counts, color = base_color)

Pie charts

\
Draw pie chart - data needs to be in a summarized form

# Summarise the column into per-level counts first; plt.pie draws one
# wedge per entry.
sorted_counts = df['cat_var'].value_counts()
plt.pie(
    sorted_counts,
    labels=sorted_counts.index,
    startangle=90,
    counterclock=False,
)

# Equal scaling on both axes keeps the pie circular instead of oval.
plt.axis('square')

\
Donut plot

# Donut plot: a pie chart whose wedges have a width smaller than the
# radius, leaving a hole in the middle.
sorted_counts = df['cat_var'].value_counts()
# The call is closed with ')' here; the original ended in '};' and left
# the parenthesis open.
plt.pie(sorted_counts, labels = sorted_counts.index, startangle = 90,
        counterclock = False, wedgeprops = {'width' : 0.4})
plt.axis('square')  # equal axis scaling keeps the ring circular

Histograms

\
Draw histogram (matplotlib)

# Histogram of the 'num_var' column using matplotlib's default binning.
plt.hist(data = df, x = 'num_var')

\
Set bin edges manually

# Explicit bin edges of width 1 from 0 up to the column maximum; "+ 1"
# extends the range so the maximum value falls inside the last bin.
bin_edges = np.arange(0, df['num_var'].max() + 1, 1)
# pyplot is imported as `plt` (`prt` was a typo and raised NameError).
plt.hist(data = df, x = 'num_var', bins = bin_edges)

\
Create subplot

# plt.subplot(rows, cols, index) selects a cell of a grid of axes.
# 1 row, 2 columns, subplot 1
plt.subplot(1, 2, 1)
# 1 row, 2 columns, subplot 2
plt.subplot(1, 2, 2)

\
distplot

# Seaborn distribution plot of 'num_var' with default settings.
# NOTE(review): distplot is deprecated in newer seaborn releases in favour
# of histplot/displot -- confirm against the installed version.
sb.distplot(df['num_var'])

\
distplot without curve, transparency turned off

# Histogram only: kde = False drops the density curve, and alpha = 1 makes
# the bars fully opaque. The data argument is the column df['num_var'];
# the original passed the list ['num_var'], i.e. a single string.
sb.distplot(df['num_var'], bins = bin_edges, kde = False, hist_kws = {'alpha' : 1})

Figures, Axes, and Subplots

Set up figures and axes explicitly in matplotlib

# Create the figure and its axes explicitly rather than relying on the
# implicit ones pyplot creates.
fig = plt.figure()
# Axes rectangle is [left, bottom, width, height] as fractions of the figure.
ax = fig.add_axes([.125, .125, .775, .755])
ax.hist(x='num_var', data=df)

\
Use figures and axes in seaborn

# Same explicit figure/axes setup; seaborn is told where to draw via `ax`.
fig = plt.figure()
ax = fig.add_axes([.125, .125, .775, .755])
base_color = sb.color_palette()[0]
sb.countplot(x='cat_var', color=base_color, ax=ax, data=df)

\
Subplots
Set figure size in inches (larger than normal)

# New figure, 10 inches wide by 5 inches tall.
plt.figure(figsize = [10, 5])

\
Create new axes on figure (1 row, 2 cols, subplot 1 and 2)

# Left cell, then right cell, of a 1-row x 2-column grid.
plt.subplot(1, 2, 1)
plt.subplot(1, 2, 2)

\
Retrieve current axes

# gca = "get current axes": the axes pyplot calls currently draw on.
ax = plt.gca()

\
Get a list of all axes in a figure

# All axes objects that belong to the figure `fig`.
axes = fig.get_axes()

\
Create subplots

# Add a subplot axes to the existing figure `fig`.
# NOTE(review): with no arguments this presumably creates a single
# full-figure subplot -- confirm for the matplotlib version in use.
fig.add_subplot()

\
Create various subplots

# Build a 3 x 4 grid of axes and write each subplot's 1-based index into
# the middle of it.
fig, axes = plt.subplots(3, 4)  # grid of 12 subplots
axes = axes.flatten()  # 3 x 4 array => 12-element vector
for i, grid_ax in enumerate(axes):
    plt.sca(grid_ax)  # make this the current axes for pyplot calls
    # Same indentation as the line above: both statements belong to the
    # loop body (the original over-indented this line).
    plt.text(0.5, 0.5, i+1)  # print subplot index no. to middle of axes

Choosing a plot for discrete data

Non-connected bins with rwidth (not suitable for continuous numeric data)

# Bin edges at the half-integers 1.5 ... 12.5, so each integer 2..12 gets
# a bin centred on it.
bin_edges = np.arange(1.5, 12.5+1, 1)
# rwidth = 0.7 draws each bar at 70% of its bin width, leaving gaps that
# signal discrete values.
plt.hist(die_rolls, bins = bin_edges, rwidth = 0.7)
# One tick per integer outcome (the closing parenthesis was missing).
plt.xticks(np.arange(2, 12+1, 1))

Descriptive statistics, outliers and axis limits

matplotlib xlim to change histogram's axis limits

# Histogram of 'skew_var' with unit-width bins, zoomed to the range 0-35.
plt.figure(figsize = [10, 5])
bin_edges = np.arange(0, 35+1, 1)
plt.hist(data = df, x = 'skew_var', bins = bin_edges)
# Lower limit is the number 0 (the original used the letter `o`, an
# undefined name).
plt.xlim(0, 35)

Scales and transformations

Using a log10 axis => Problem: Readability

# Transform the values themselves and histogram the result.
# NOTE(review): `data` is not defined in this part of the notes --
# presumably a numeric Series/array of positive values; confirm.
log_data = np.log10(data)
# Bins of width 0.1 in log10 units, starting at 0.8.
log_bin_edges = np.arange(0.8, log_data.max()+0.1, 0.1)
plt.hist(log_data, bins = log_bin_edges)
# The axis now shows log10 values, which is what hurts readability.
plt.xlabel('log(values)')

\
Scale transformations with matplotlib's xscale function => Problem: Bins are too large

# Keep the data as is and switch only the axis to a log scale. The bins
# are still linearly spaced (width 100), which is why they look too large.
bin_edges = np.arange(0, data.max()+100, 100)
plt.hist(data, bins = bin_edges)
plt.xscale('log')  # was misspelled `xsclae`, which raised AttributeError

\
Evenly spaced powers of 10 as scale

# Bin edges evenly spaced in log10 units (step 0.1) and converted back to
# data units, so bars have equal width on the log axis.
# The step is the decimal 0.1; the original `0,1` passed two arguments
# (step 0 and an extra one) to np.arange.
bin_edges = 10 ** np.arange(0.8, np.log10(data.max()) + 0.1, 0.1)
plt.hist(data, bins = bin_edges)
plt.xscale('log')
# Readable tick labels in data units instead of powers of ten.
tick_locs = [10, 30, 100, 300, 1000, 3000]
plt.xticks(tick_locs, tick_locs)

Extra: Kernel density estimation (KDE)

KDE on top of histogram

# With default settings distplot overlays a kernel density estimate on
# the histogram of 'num_var'.
sb.distplot(df['num_var'])

L6: Matplotlib and Seaborn Part 2

Scatterplots and correlation

matplotlib scatterplot: relationship between two numeric variables

# Scatter plot of two numeric columns; x and y are column names looked up
# in `data`. (`x = ...` is a keyword argument; the original `x >= ...` was
# a comparison expression and a syntax error in this position.)
plt.scatter(data = df, x = 'num_var1', y = 'num_var2')

\
Seaborn's regplot for scatterplot with regression function fitting:
Standard: Linear regression function and shaded confidence region for the regression estimate

# Scatter plot of num_var2 against num_var1 with seaborn's default
# regression fit drawn on top.
sb.regplot(data = df, x = 'num_var1', y = 'num_var2')

fit_reg = False => turn off regression line
\
Plot regression line => data needs to be adapted:

def log_trans(x, inverse = False):
    """Apply a base-10 log transform, or undo it.

    Parameters
    ----------
    x : scalar or array-like
        Value(s) to transform.
    inverse : bool, default False
        If False, return log10(x). If True, return 10 ** x, which maps
        transformed values back to the original scale.

    Returns
    -------
    NumPy scalar or array with the transformed value(s).
    """
    if not inverse:
        return np.log10(x)
    return np.power(10, x)

# Fit the regression on log10(num_var2). x and y are passed by keyword
# because current seaborn no longer accepts them positionally.
sb.regplot(x = df['num_var1'], y = df['num_var2'].apply(log_trans))
# Put ticks at the transformed positions but label them in original units.
tick_locs = [10, 20, 50, 100, 200, 500]
plt.yticks(log_trans(tick_locs), tick_locs)

Overplotting, transparency, and jitter

Adding transparency to scatterplot using matplotlib's alpha parameter (0 = fully transparent, 1 = fully opaque)

# alpha = 1/5: five overlapping points are needed to reach full opacity,
# so dense regions appear darker.
# Column name is 'disc_var1', as in the other snippets (a stray 'Ä' had
# been typed in front of it).
plt.scatter(data = df, x = 'disc_var1', y = 'disc_var2', alpha = 1/5)

\
Adding jitter with seaborn's regplot function (x_jitter and y_jitter)

# Scatter only (fit_reg = False) with random jitter of up to 0.2 added to
# both coordinates for display, plus transparency, to reveal overlapping
# points. Fixed: the keyword is `x_jitter` (not `x_fitter`) and the
# jitter amount is the number 0.2 (not `o.2`).
sb.regplot(data = df, x = 'disc_var1', y = 'disc_var2', fit_reg = False,
           x_jitter = 0.2, y_jitter = 0.2, scatter_kws = {'alpha' : 1/3})

Heat maps

Matplotlib's hist2d function

# Unit-width bin edges on each axis, then a 2-D histogram whose cell color
# encodes the number of points in the cell.
bins_x = np.arange(0.5, 10.5 + 1, 1)
bins_y = np.arange(-0.5, 10.5 + 1, 1)
plt.hist2d(x='disc_var1', y='disc_var2', bins=[bins_x, bins_y], data=df)
plt.colorbar();  # legend mapping colors to counts

Change color palette with the cmap parameter in hist2d
Using cmin to set minimum value for coloring a cell

bins_x = np.arange(0.5, 10.5+1, 1)
# Upper edge extended by one step, matching the previous hist2d snippet;
# without "+1" the top bin (9.5-10.5) would be missing.
bins_y = np.arange(-0.5, 10.5+1, 1)
# 'viridis_r' is the reversed viridis palette; cmin = 0.5 leaves cells
# with fewer than 0.5 points (i.e. empty cells) uncolored.
# The return value is kept in `h2d` so the counts can be read from h2d[0]
# when adding text annotations.
h2d = plt.hist2d(data = df, x = 'disc_var1', y = 'disc_var2',
                 bins = [bins_x, bins_y], cmap = 'viridis_r', cmin = 0.5)

Add text annotations with the count of points to each cell

# `h2d` is the value returned by plt.hist2d; its first element is the 2-D
# array of per-cell counts.
counts = h2d[0]

# Write each non-empty cell's count at the centre of the cell.
for i in range(counts.shape[0]):
    for j in range(counts.shape[1]):
        c = counts[i, j]
        # Cells without a positive count (including any NaN cells, which
        # compare False) get no label.
        if c > 0:
            # White text on the darkest cells (count above 7), black otherwise.
            text_color = 'white' if c > 7 else 'black'
            plt.text(bins_x[i]+0.5, bins_y[j]+0.5, int(c),
                     ha = 'center', va = 'center', color = text_color)

Violin plots

Seaborn's violinplot function

# One violin per level of 'cat_var', showing the distribution of 'num_var'.
sb.violinplot(data = df, x = 'cat_var', y = 'num_var')

\

\
Adapt to monocolor, remove miniature box plot inside violins (inner = None)

# Single color for all violins; inner=None removes the miniature box plot
# drawn inside each one.
base_color = sb.color_palette()[0]
sb.violinplot(x='cat_var', y='num_var', inner=None, color=base_color, data=df)

\
Horizontal rendering

# Swapping the roles of x and y (numeric on x, categorical on y) renders
# the violins horizontally.
sb.violinplot(data = df, x = 'num_var', y = 'cat_var', color = base_color, inner = None)

Box plots

\

\
Seaborn's boxplot function

# One box per level of 'cat_var' summarising 'num_var'.
sb.boxplot(x='cat_var', y='num_var', color=base_color, data=df)
# Reuse the y-axis limits of the left subplot, if there is one.
# NOTE(review): `ax1` is not defined in these notes -- presumably the axes
# of a plot drawn in a neighbouring subplot; confirm.
plt.ylim(ax1.get_ylim())

\
Horizontal boxplots

# Numeric variable on x and categorical on y gives horizontal boxes.
sb.boxplot(data = df, x = 'num_var', y = 'cat_var', color = base_color)

\
Violinplot: Plotting three middle quartiles with inner = 'quartile'

# inner = 'quartile' draws the quartile lines inside each violin instead
# of the miniature box plot.
sb.violinplot(data = df, x = 'cat_var', y = 'num_var', color = base_color, inner = 'quartile')

Clustered bar charts

\

\
Create clustered bar chart with seaborn

# Clustered bars: within each 'cat_var1' level, one bar per 'cat_var2'
# level, distinguished by color (hue).
sb.countplot(data = df, x = 'cat_var1', hue = 'cat_var2')

\
Move legend to x-axis

# Keep the axes returned by countplot so its legend can be customised.
ax = sb.countplot(x='cat_var1', hue='cat_var2', data=df)
# loc=8 places the legend at the lower centre; entries are laid out in 3
# columns on an opaque background.
ax.legend(title='cat_var2', loc=8, ncol=3, framealpha=1)

\
Heat maps: Summarization of counts into matrix before plotting
Series reset_index and DataFrame pivot

# Count rows per (cat_var1, cat_var2) pair, turn the counts into a regular
# column, then pivot into a matrix: rows = cat_var2, columns = cat_var1.
ct_counts = (
    df.groupby(['cat_var1', 'cat_var2'])
    .size()
    .reset_index(name='count')
    .pivot(index='cat_var2', columns='cat_var1', values='count')
)
sb.heatmap(ct_counts)

\

\
Adding annotations to the heatmap using fmt = 'd' for integer output

# annot = True writes each cell's value into the cell; fmt = 'd' formats
# it as an integer.
# NOTE(review): 'd' requires integer values -- presumably ct_counts has no
# missing category combinations; confirm.
sb.heatmap(ct_counts, annot = True, fmt = 'd')

Faceting

Seaborn's FacetGrid class

# One facet (column of the grid) per level of 'cat_var' ...
g = sb.FacetGrid(data = df, col = 'cat_var')
# ... and in each facet a histogram of that subset's 'num_var'.
g.map(plt.hist, "num_var")


\
Extra visualizations as keyword arguments to the map function

# Shared bin edges of width 1/3 starting at -3, so that every facet uses
# the same bins.
bin_edges = np.arange(-3, df['num_var'].max() + 1/3, 1/3)
g = sb.FacetGrid(col='cat_var', data=df)
# Extra keyword arguments to map() are forwarded to the plotting function.
g.map(plt.hist, "num_var", bins=bin_edges)

\
Many categorical levels

# Order the facets by descending group mean of 'num_var'.
# numeric_only = True keeps non-numeric columns out of the mean, which
# would otherwise raise on current pandas.
group_means = df.groupby(['many_cat_var']).mean(numeric_only = True)
group_order = group_means.sort_values(['num_var'], ascending = False).index

# col_wrap = 5 starts a new row of facets after every 5 levels.
# `height` (in inches) is the current name of the facet-size parameter;
# the old name `size` is no longer accepted by FacetGrid.
g = sb.FacetGrid(data = df, col = 'many_cat_var', col_wrap = 5, height = 2,
                 col_order = group_order)
g.map(plt.hist, 'num_var', bins = np.arange(5, 15+1, 1))
g.set_titles('{col_name}')  # title each facet with just the level name

Adaptation of univariate plots

Adapted bar charts
Seaborn's barplot function

# Adapted bar chart: bar height summarises 'num_var' within each level of
# 'cat_var' rather than counting rows.
base_color = sb.color_palette()[0]
sb.barplot(x='cat_var', y='num_var', color=base_color, data=df)


\
Seaborn's pointplot function

# Point plot of 'num_var' per level of 'cat_var'.
# NOTE(review): linestyles = "" presumably suppresses the line connecting
# the points -- confirm for the seaborn version in use.
sb.pointplot(data = df, x = 'cat_var', y = 'num_var', linestyles = "")
plt.ylabel('Avg. value of num_var')


\
Adapted histograms
Bar heights indicate value other than a count by using the "weights" parameter

# Bin edges of width 1/3 from 0 up to (at least) the column maximum.
bin_edges = np.arange(0, df['num_var'].max()+1/3, 1/3)

# count number of points in each bin
# right = False makes bins left-closed; labels = False returns the integer
# bin index of every row instead of an interval label.
bin_idxs = pd.cut(df['num_var'], bin_edges, right = False, include_lowest = True,
                  labels = False).astype(int)
pts_per_bin = df.groupby(bin_idxs).size()

# Each row's weight is its 'binary_out' value divided by the number of
# rows in its bin, so the weights within a bin add up to that bin's mean
# of 'binary_out'.
num_var_wts = df['binary_out'] / pts_per_bin[bin_idxs].values

# plot the data using the calculated weights
# With `weights`, each bar's height is the sum of the weights in the bin.
plt.hist(data = df, x = 'num_var', bins = bin_edges, weights = num_var_wts)
plt.xlabel('num_var')
plt.ylabel('mean(binary_out)')

Line plots

Matplotlib's errorbar function

# Line plot of 'num_var2' against 'num_var1'; no error values are passed
# here, so only the connected points are given.
plt.errorbar(data = df, x = 'num_var1', y = 'num_var2')


You'll only receive email when understanding publishes a new post

More from understanding