Linear Regression#

from datascience import *
from cs104 import *
import numpy as np
%matplotlib inline

Helper functions for visualizations#

The code in the next cell is used in the rest of the notebook to create visualizations. You do not need to read or understand all of its details.

import matplotlib.colors as colors
import matplotlib.pyplot as plots

def plot_line(a, b, x): 
    """
    Plots a line using the given slope and intercept over the
    range of x-values in the x array.
    """
    xlims = make_array(min(x), max(x))
    plot = Plot()
    plot.line(xlims, a * xlims + b, lw=4, clip_on=False)

def line_predictions(a, b, x): 
    """
    Computes the prediction  y_hat = a * x + b
    where a and b are the slope and intercept and x is the set of x-values.
    """
    return a * x + b

def visualize_scatter_with_line_and_residuals(table, x_label, y_label, a, b):
    """
    Draw a scatter plot, a line, and the line segments capturing the residuals 
    for some of the points.
    """
    y_hat = line_predictions(a, b, table.column(x_label))
    prediction_table = table.with_columns(y_label + '_hat', y_hat)
    plot = table.scatter(x_label, y_label, title='a = ' + str(round(a,3)) + '; b = ' + str(round(b,3)))
    xlims = make_array(min(table.column(x_label)), max(table.column(x_label)))
    plot.line(xlims, a * xlims + b, lw=4, color="C0")
    
    every10th = prediction_table.sort(x_label).take(np.arange(0, 
                                                           prediction_table.num_rows, 
                                                           prediction_table.num_rows // 10))
    for row in every10th.rows:
        x = row.item(x_label)
        y = row.item(y_label)
        plot.line([x, x], [y, a * x + b], color='r', lw=2)    
        plot.dot(x,y,color='C0',s=70)

    return plot

def mean_squared_error(table, x_label, y_label, a, b): 
    y_hat = line_predictions(a, b, table.column(x_label))
    residual = table.column(y_label) - y_hat
    return np.mean(residual**2)

def visualize_scatter_with_line_and_mse(table, x_label, y_label, a, b):
    plot = visualize_scatter_with_line_and_residuals(table, x_label, y_label, a, b)
    mse = mean_squared_error(table, x_label, y_label, a, b)
    plot.set_title('a = ' + str(round(a,3)) + 
                '; b = ' + str(round(b,3)) + 
                '\n mse = ' + str(round(mse,3)))

    
## MSE HEAT

def plot_regression_line_and_mse_heat(table, x_label, y_label, a, b, show_mse='2d', fig=None):
        """
        Left plot: the scatter plot with line y=ax+b
        Right plot: None, 2D heat map of MSE, or 3D surface plot of MSE 
        """

        a_space = np.linspace(-0.5, 0.5, 200)
        b_space = np.linspace(-35, -8, 200)
        a_space, b_space = np.meshgrid(a_space, b_space)
        broadcasted = np.broadcast(a_space, b_space)
        mses = np.empty(broadcasted.shape)
        mses.flat = [mean_squared_error(table, x_label, y_label, a_val, b_val) for (a_val, b_val) in broadcasted]
        
        if fig is None:
            fig = Figure(1,2)

        ax = fig.axes()

        #Plot the scatter plot and best fit line on the left
        with Plot(ax[0]):
            plot_scatter_with_line(table, x_label, y_label, a, b)

        if show_mse == '2d':
            with Plot(ax[1]):
                mse = mean_squared_error(table, x_label, y_label, a, b)
                ax[1].pcolormesh(a_space, b_space, mses, cmap='viridis', 
                                 norm=colors.PowerNorm(gamma=0.25,
                                                       vmin=mses.min(), 
                                                       vmax=mses.max()));
                ax[1].scatter(a,b,s=100,color='red');
                ax[1].set_xlabel('a')
                ax[1].set_ylabel('b')
                ax[1].set_title('Mean Squared Error: ' + str(np.round(mse, 3)))

        elif show_mse == "3d": 
            ax[1] = plots.subplot(1, 2, 2, projection='3d')
            with Plot(ax[1]):
                ax[1].plot_surface(a_space, b_space, mses,
                                cmap='viridis', 
                                antialiased=False, 
                                linewidth=0,
                                norm = colors.PowerNorm(gamma=0.25,vmin=mses.min(), vmax=mses.max()))

                ax[1].plot([a],[b],[mean_squared_error(table, x_label, y_label, a, b)], 'ro',zorder=3);
                ax[1].set_xlabel('a')
                ax[1].set_ylabel('b')
                ax[1].set_title('Mean Squared Error')      

def visualize_regression_line_and_mse_heat(table, x_label, y_label, show_mse=None):
    
    a_space = np.linspace(-0.5, 0.5, 200)
    b_space = np.linspace(-35, -8, 200)
    a_space, b_space = np.meshgrid(a_space, b_space)
    broadcasted = np.broadcast(a_space, b_space)
    mses = np.empty(broadcasted.shape)
    mses.flat = [mean_squared_error(table, x_label, y_label, a_val, b_val) for (a_val, b_val) in broadcasted]
    
    def visualize_regression_line_and_mse_heat_helper(a, b, show_mse=show_mse, fig=None):
        """
        Left plot: the scatter plot with line y=ax+b
        Right plot: None, 2D heat map of MSE, or 3D surface plot of MSE 
        """

        if fig is None:
            fig = Figure(1,2)

        ax = fig.axes()

        #Plot the scatter plot and best fit line on the left
        with Plot(ax[0]):
            plot_scatter_with_line(table, x_label, y_label, a, b)

        if show_mse == '2d':
            with Plot(ax[1]):
                mse = mean_squared_error(table, x_label, y_label, a, b)
                ax[1].pcolormesh(a_space, b_space, mses, cmap='viridis', 
                                 norm=colors.PowerNorm(gamma=0.25,
                                                       vmin=mses.min(), 
                                                       vmax=mses.max()));
                ax[1].scatter(a,b,s=100,color='red');
                ax[1].set_xlabel('a')
                ax[1].set_ylabel('b')
                ax[1].set_title('Mean Squared Error: ' + str(np.round(mse, 3)))

        elif show_mse == "3d": 
            ax[1] = plots.subplot(1, 2, 2, projection='3d')
            with Plot(ax[1]):
                ax[1].plot_surface(a_space, b_space, mses,
                                cmap='viridis', 
                                antialiased=False, 
                                linewidth=0,
                                norm = colors.PowerNorm(gamma=0.25,vmin=mses.min(), vmax=mses.max()))

                ax[1].plot([a],[b],[mean_squared_error(table, x_label, y_label, a, b)], 'ro',zorder=3);
                ax[1].set_xlabel('a')
                ax[1].set_ylabel('b')
                ax[1].set_title('Mean Squared Error')
            
    interact(visualize_regression_line_and_mse_heat_helper, 
                a = Slider(-0.5,0.5,0.01),
                b = Slider(-35,-8,0.1),
                fig = Fixed(None),
                show_mse=Choice([None, "2d", "3d"]))            

1. Lines on scatter plots#

# Load the data from Lecture 7 and clean it up
greenland_climate = Table().read_table('data/climate_upernavik.csv')
greenland_climate = greenland_climate.relabeled('Precipitation (millimeters)', 
                                                "Precip (mm)")
tidy_greenland = greenland_climate.where('Air temperature (C)', 
                                         are.not_equal_to(999.99))
tidy_greenland = tidy_greenland.where('Sea level pressure (mbar)', 
                                      are.not_equal_to(9999.9))
feb = tidy_greenland.where('Month', are.equal_to(2))
feb.show(3)
Year | Month | Air temperature (C) | Sea level pressure (mbar) | Precip (mm)
1875 | 2     | -19.7               | 1005.5                    | 63
1876 | 2     | -21.2               | 1012.3                    | 68
1877 | 2     | -26.5               | 1008.2                    | 26

... (102 rows omitted)

plot = feb.scatter('Year', 'Air temperature (C)', fit_line=True)
plot.set_title('February in Upernavik, Greenland')

Let’s change the x-axis to be Years since 1880 so that it’s easier to calculate the slope.

x = feb.column('Year') - 1880 
y = feb.column('Air temperature (C)')
x_y_table = Table().with_columns("x", x, "y", y)
plot = x_y_table.scatter("x", "y", fit_line=True)
plot.set_xlabel("Years since 1880")
plot.set_ylabel("Air temperature (C)");
plot.set_title('February in Upernavik, Greenland');
# Manually estimate a and b: the line looks like it passes through
# roughly (0, -24) and (120, -19), so rise = 5 and run = 120.
rise = -19 + 24
run = 120 
a = rise/run
b = -24 
print("a=", a, " , b=", b)
a= 0.041666666666666664  , b= -24

Two lines with different slopes and intercepts:

with Figure(1,2):
    plot_scatter_with_line(x_y_table, 'x', 'y', a, b)
    plot_scatter_with_line(x_y_table, 'x', 'y', 0.06, -20)
interact(plot_scatter_with_line, 
         table = Fixed(x_y_table),
         x_label = Fixed('x'),
         y_label = Fixed('y'),
         a = Slider(-1,1,0.01),
         b = Slider(-35,-8,0.1))

Here is an animation showing lines of different slope and intercept.
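As a static alternative, the next cell draws the scatter plot three times, side by side, with three different candidate lines; the particular slope and intercept values (other than the a and b computed above) are chosen purely for illustration.

# A static stand-in for the animation: three candidate lines with
# different slopes and intercepts (values chosen for illustration).
with Figure(1, 3):
    plot_scatter_with_line(x_y_table, 'x', 'y', 0.02, -26)
    plot_scatter_with_line(x_y_table, 'x', 'y', a, b)
    plot_scatter_with_line(x_y_table, 'x', 'y', 0.06, -20)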

2. Predict and evaluate#

def line_predictions(a, b, x): 
    """
    Computes the prediction  y_hat = a * x + b
    where a and b are the slope and intercept and x is the set of x-values.
    """
    return a * x + b
y_hat = line_predictions(a, b, x_y_table.column("x"))
prediction_table = x_y_table.with_columns("y_hat", y_hat)
prediction_table.take(np.arange(10, 15))
x | y     | y_hat
5 | -18.3 | -23.7917
6 | -25.2 | -23.75
7 | -28.8 | -23.7083
8 | -22.6 | -23.6667
9 | -18.9 | -23.625

A “residual” is y - y_hat: the true y minus the prediction made by our line, y_hat.
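In symbols, for the i-th point:

$$
\text{residual}_i = y_i - \hat{y}_i = y_i - (a\,x_i + b)
$$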

residuals = prediction_table.column("y") - prediction_table.column("y_hat")
prediction_table = prediction_table.with_columns('residual', residuals)
prediction_table.show(3)
x  | y     | y_hat    | residual
-5 | -19.7 | -24.2083 | 4.50833
-4 | -21.2 | -24.1667 | 2.96667
-3 | -26.5 | -24.125  | -2.375

... (102 rows omitted)

interact(visualize_scatter_with_line_and_residuals, 
        table = Fixed(x_y_table),
        x_label = Fixed('x'),
        y_label = Fixed('y'),                     
        a = Slider(-1,1,0.01),
        b = Slider(-35,-8,0.1))

The mean squared error (MSE) squares each residual and then takes the mean.

Since we want our predictions y_hat to be close to the observed values y, a smaller MSE is better.
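In symbols, for n points:

$$
\text{MSE} = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2 = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - (a\,x_i + b)\right)^2
$$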

np.mean(residuals**2)
22.57361111111111

Let’s write a function that computes the MSE for any table of data and any line characterized by slope a and intercept b.

def mean_squared_error(table, x_label, y_label, a, b): 
    y_hat = line_predictions(a, b, table.column(x_label))
    residual = table.column(y_label) - y_hat
    return np.mean(residual**2)
mean_squared_error(x_y_table, 'x', 'y', a, b)
22.57361111111111
mean_squared_error(x_y_table, 'x', 'y', 0.04, -24)
22.683558095238094
mean_squared_error(x_y_table, 'x', 'y', 0.02, -20)
27.799756190476195
interact(visualize_scatter_with_line_and_mse, 
        table = Fixed(x_y_table),
        x_label = Fixed('x'),
        y_label = Fixed('y'),                     
        a = Slider(-1,1,0.01),
        b = Slider(-35,-8,0.1))
with Figure(1,3):
    visualize_scatter_with_line_and_mse(x_y_table, 'x', 'y', a, b)
    visualize_scatter_with_line_and_mse(x_y_table, 'x', 'y', 0.04, -24)
    visualize_scatter_with_line_and_mse(x_y_table, 'x', 'y', 0.02, -20)

3. Fit the Best Line Manually#

visualize_regression_line_and_mse_heat(x_y_table, 'x', 'y')

Here is an animation showing how to manually fit a line using MSE.
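If the interactive sliders are not available, a minimal sketch of the same idea is to try a few hand-picked (a, b) pairs and compare their MSEs directly; the candidate values below are illustrative, not from the lecture.

# Manually compare a few candidate lines by their MSE.
for a_guess, b_guess in [(0.02, -26), (0.04, -24), (0.06, -20)]:
    mse = mean_squared_error(x_y_table, 'x', 'y', a_guess, b_guess)
    print('a =', a_guess, ', b =', b_guess, ', mse =', round(mse, 3))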

4. Fit the Best Line with Brute Force#

# Brute force loop to find a and b 
lowest_mse = np.inf
a_for_lowest = 0
b_for_lowest = 0

for a in np.arange(-0.5, 0.5, 0.001): 
    for b in np.arange(-35, -10, 1): 
        mse = mean_squared_error(x_y_table, 'x', 'y', a, b)
        if mse < lowest_mse: 
            lowest_mse = mse
            a_for_lowest = a
            b_for_lowest = b 
lowest_mse, a_for_lowest, b_for_lowest
(22.216645485714288, 0.036000000000000476, -23)
# Looks pretty good when we plot it! 
plot_regression_line_and_mse_heat(x_y_table, 'x', 'y', a_for_lowest, b_for_lowest)
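One design note on the search above: the step size for b is a full degree, so the result is only as precise as the grid. A minimal sketch of one possible refinement (the names best_mse, best_a, and best_b are new here, not from the lecture) is a second, finer search centered on the coarse result:

# Refine the coarse brute-force answer with a finer grid around it.
best_mse, best_a, best_b = lowest_mse, a_for_lowest, b_for_lowest

for a_fine in np.arange(best_a - 0.005, best_a + 0.005, 0.0005):
    for b_fine in np.arange(best_b - 1, best_b + 1, 0.05):
        mse = mean_squared_error(x_y_table, 'x', 'y', a_fine, b_fine)
        if mse < best_mse:
            best_mse = mse
            best_a = a_fine
            best_b = b_fine

best_mse, best_a, best_b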

5. Fit the Best Line with minimize()#

This is really an optimization problem: the MSE is a surface in 3D over all possible (a, b) pairs, and we are looking for its lowest point.

plot_regression_line_and_mse_heat(x_y_table, 
                                  'x', 'y', 
                                  a_for_lowest, b_for_lowest, 
                                  show_mse='3d')
visualize_regression_line_and_mse_heat(x_y_table, 'x', 'y', show_mse='3d')
def linear_regression(table, x_label, y_label):
    """
    Return an array containing the slope and intercept of the line best fitting 
    the table's data according to the mean square error loss function.
    """
    
    # A helper function that takes *only* the two variables we need to optimize.
    # This is necessary to use minimize below, because the function we want
    # to minimize cannot take any parameters beyond those it will solve for.
    def mse_for_a_b(a, b):
        return mean_squared_error(table, x_label, y_label, a, b)
    
    # We can use a built-in function minimize that uses gradient descent
    # to find the optimal a and b for our mean squared error function     
    return minimize(mse_for_a_b)

optimal = linear_regression(x_y_table, 'x', 'y')
a_optimal = optimal.item(0)
b_optimal = optimal.item(1)
plot_regression_line_and_mse_heat(x_y_table, 'x', 'y', a_optimal, b_optimal)
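As a sanity check (not part of the original notebook), the least-squares slope and intercept can also be computed in closed form from the correlation coefficient and the standard deviations; these values should closely match what minimize found.

# Closed-form least-squares fit for comparison:
#   slope:     a = r * sd_y / sd_x
#   intercept: b = mean_y - a * mean_x
x_vals = x_y_table.column('x')
y_vals = x_y_table.column('y')

r = np.corrcoef(x_vals, y_vals)[0, 1]
a_closed_form = r * np.std(y_vals) / np.std(x_vals)
b_closed_form = np.mean(y_vals) - a_closed_form * np.mean(x_vals)

a_closed_form, b_closed_form, a_optimal, b_optimal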