Introduction to Linear Regression With Python 13 Feb 2019

Using the Auto dataset

Reference: https://vincentarelbundock.github.io/Rdatasets/doc/ISLR/Auto.html

# Common imports
import numpy as np
import os

# Keep a same seed in different executions
np.random.seed(42)

# More common imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.linear_model

# Directory path of datasets
datapath = os.path.join("datasets", "islr", "")

# Load dataset into a dataframe
auto = pd.read_csv(datapath + "Auto.data.txt", delim_whitespace=True, na_values='?')
auto.head()

	mpg	cylinders	displacement	horsepower	weight	acceleration	year	origin	name
0	18.0	8	307.0	130.0	3504.0	12.0	70	1	chevrolet chevelle malibu
1	15.0	8	350.0	165.0	3693.0	11.5	70	1	buick skylark 320
2	18.0	8	318.0	150.0	3436.0	11.0	70	1	plymouth satellite
3	16.0	8	304.0	150.0	3433.0	12.0	70	1	amc rebel sst
4	17.0	8	302.0	140.0	3449.0	10.5	70	1	ford torino

# Query NaN values
auto[auto.isnull().any(axis=1)]

	mpg	cylinders	displacement	horsepower	weight	acceleration	year	origin	name
32	25.0	4	98.0	NaN	2046.0	19.0	71	1	ford pinto
126	21.0	6	200.0	NaN	2875.0	17.0	74	1	ford maverick
330	40.9	4	85.0	NaN	1835.0	17.3	80	2	renault lecar deluxe
336	23.6	4	140.0	NaN	2905.0	14.3	80	1	ford mustang cobra
354	34.5	4	100.0	NaN	2320.0	15.8	81	2	renault 18i

# Clean data
auto = auto.dropna()
# Query data types of all columns
auto.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight          float64
acceleration    float64
year              int64
origin            int64
name             object
dtype: object

# Select mpg as output(y) and horsepower as one feature(X)
# X and Y become matrix shape
X = np.c_[auto['horsepower']]
y = np.c_[auto['mpg']]
print(X.shape)
print(y.shape)

(392, 1)
(392, 1)

# Plot x vs y by selection two columns of the dataframe
auto.plot(kind='scatter', x="horsepower", y='mpg')

<matplotlib.axes._subplots.AxesSubplot at 0x2ae2bfb780>

png

# Define a linear regression model
model = sklearn.linear_model.LinearRegression()

# Fit our linear regression model
model.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

# Make a prediction for 150 horsepower
X_sample = np.array([150]).reshape(1,1) # 
print(model.predict(X_sample)) # 

[[16.25915102]]

# turn the car model name into index
auto.set_index("name", inplace = True)
auto.head(5)

	mpg	cylinders	displacement	horsepower	weight	acceleration	year	origin
name
chevrolet chevelle malibu	18.0	8	307.0	130.0	3504.0	12.0	70	1
buick skylark 320	15.0	8	350.0	165.0	3693.0	11.5	70	1
plymouth satellite	18.0	8	318.0	150.0	3436.0	11.0	70	1
amc rebel sst	16.0	8	304.0	150.0	3433.0	12.0	70	1
ford torino	17.0	8	302.0	140.0	3449.0	10.5	70	1

# Select our data of interest
data = auto[['horsepower','mpg']]
data.head(3)

	horsepower	mpg
name
chevrolet chevelle malibu	130.0	18.0
buick skylark 320	165.0	15.0
plymouth satellite	150.0	18.0

# Show three cases on our plot
data.plot(kind='scatter', x="horsepower", y='mpg', figsize=(10,6))
plt.axis([0, 250, 0, 50])
text_pos = { "vw pickup": (20, 35), "ford ranger": (95, 36), "bmw 2002": (130, 30)}
for carmodel, text_pos_i in text_pos.items():
    pos_x, pos_y = data.loc[carmodel]
    plt.annotate(carmodel, xy=(pos_x, pos_y), xytext=text_pos_i,
            arrowprops=dict(facecolor='black', width=0.5, shrink=0.1, headwidth=5))
    plt.plot(pos_x, pos_y, "ro")
plt.show()

png

# Plot two hypothesis
import numpy as np

data.plot(kind='scatter', x="horsepower", y='mpg', figsize=(10,6))
plt.axis([0, 250, 0, 50])
X=np.linspace(0, 250, 1000)
plt.plot(X, 45 - X/5, "r")
plt.text(10, 35, r"$\theta_0 = 45$", fontsize=14, color="r")
plt.text(10, 31, r"$\theta_1 = 0.2 \times 10^{-1}$", fontsize=14, color="r")

plt.plot(X, 20 + 0 * X, "b")
plt.text(200, 25, r"$\theta_0 = 20$", fontsize=14, color="b")
plt.text(200, 22, r"$\theta_1 = 0$", fontsize=14, color="b")
plt.show()

png

# Obtain one good linear hypothesis
from sklearn import linear_model
linearModel = linear_model.LinearRegression()
X = np.c_[data["horsepower"]]
y = np.c_[data["mpg"]]
linearModel.fit(X, y)
theta0, theta1 = linearModel.intercept_[0], linearModel.coef_[0][0]
theta0, theta1

(39.93586102117047, -0.15784473335365365)

# Plot our estimated good linear hypothesis
data.plot(kind='scatter', x="horsepower", y='mpg', figsize=(10,6))
plt.axis([0, 250, 0, 50])
X=np.linspace(0, 250, 1000)
plt.plot(X, theta0 + theta1*X, "b")
plt.text(5, 30, r"$\theta_0 = 39.93 $", fontsize=14, color="b")
plt.text(5, 27, r"$\theta_1 = 15.78 \times 10^{-2}$", fontsize=14, color="b")
plt.show()

png

x_test = np.array([150]).reshape(1,1)
y_predited = linearModel.predict(x_test)
y_predited

array([[16.25915102]])

# Draw prediction on our plot
data.plot(kind='scatter', x="horsepower", y='mpg', figsize=(10,6), s=1)
X=np.linspace(0, 250, 1000)
plt.plot(X, theta0 + theta1*X, "b")
plt.axis([0, 250, 0, 50])
plt.text(10, 30, r"$\theta_0 = 39.93 $", fontsize=14, color="b")
plt.text(10, 27, r"$\theta_1 = 15.78 \times 10^{-2}$", fontsize=14, color="b")
plt.plot([x_test[0,0], x_test[0,0]], [0, y_predited], "r--")
plt.text(150, 20, r"Prediction = 16.26", fontsize=14, color="b")
plt.plot(x_test, y_predited, "ro")
plt.show()

png

# Separate "nonrepresentative" data points
data_excluded = data[data.horsepower>200]
data_excluded

	horsepower	mpg
name
chevrolet impala	220.0	14.0
plymouth fury iii	215.0	14.0
pontiac catalina	225.0	14.0
buick estate wagon (sw)	225.0	14.0
ford f250	215.0	10.0
dodge d200	210.0	11.0
mercury marquis	208.0	11.0
chrysler new yorker brougham	215.0	13.0
buick electra 225 custom	225.0	12.0
pontiac grand prix	230.0	16.0

data_excluded[['horsepower', 'mpg']].values

array([[220.,  14.],
       [215.,  14.],
       [225.,  14.],
       [225.,  14.],
       [215.,  10.],
       [210.,  11.],
       [208.,  11.],
       [215.,  13.],
       [225.,  12.],
       [230.,  16.]])

# Define data_new without "nonrepresentative" data points
data_new = data[data.horsepower<=200]
data_new.head(5)

	horsepower	mpg
name
chevrolet chevelle malibu	130.0	18.0
buick skylark 320	165.0	15.0
plymouth satellite	150.0	18.0
amc rebel sst	150.0	16.0
ford torino	140.0	17.0

# Compare a full model to a model without "norepresentative" data points
data_new.plot(kind='scatter', x="horsepower", y='mpg', figsize=(10,6))
plt.axis([0, 250, 0, 50])
text_pos = { "pontiac catalina": (160, 34), "buick estate wagon (sw)": (140, 37), 
            "buick electra 225 custom": (160, 30), "pontiac grand prix": (180,40)}

for model, text_pos_i in text_pos.items():
    pos_x, pos_y = data_excluded.loc[model]
    plt.annotate(model, xy=(pos_x, pos_y), xytext=text_pos_i,
            arrowprops=dict(facecolor='black', width=0.5, shrink=0.1, headwidth=5))
#    plt.plot(pos_x, pos_y, "rs")


for xi,yi in data_excluded[['horsepower', 'mpg']].values:
    plt.plot(xi, yi, "rs")

X=np.linspace(0, 250, 1000)
X_new = np.c_[data_new["horsepower"]]
y_new = np.c_[data_new["mpg"]]
linearModel.fit(X_new, y_new)
theta0, theta1 = linearModel.intercept_[0], linearModel.coef_[0][0]
plt.plot(X, theta0 + theta1*X, "b:", label="Linear model on partial data")

lin_reg_full = linear_model.LinearRegression()
Xfull = np.c_[data["horsepower"]]
yfull = np.c_[data["mpg"]]
lin_reg_full.fit(Xfull, yfull)

theta0full, theta1full = lin_reg_full.intercept_[0], lin_reg_full.coef_[0][0]
X = np.linspace(0, 250, 1000)
plt.plot(X, theta0full + theta1full * X, "b", label="Linear model on full data")

plt.legend(loc="lower left")

plt.show()

png

# Fit a polynomial model
data.plot(kind='scatter', x="horsepower", y='mpg', figsize=(10,6))
plt.axis([0, 250, 0, 50])

from sklearn import preprocessing
from sklearn import pipeline

poly = preprocessing.PolynomialFeatures(degree=4, include_bias=False)
XFullPol = poly.fit_transform(Xfull)

lin_reg2 = linear_model.LinearRegression()
lin_reg2.fit(XFullPol, yfull)

XPol = poly.fit_transform(X[:,np.newaxis])
curve = lin_reg2.predict(XPol)
plt.plot(X, curve)
plt.show()

png

# Fit a polynomial model (Another way)
data.plot(kind='scatter', x="horsepower", y='mpg', figsize=(10,6))
plt.axis([0, 250, 0, 50])

from sklearn import preprocessing
from sklearn import pipeline

poly = preprocessing.PolynomialFeatures(degree=4, include_bias=False)
scaler = preprocessing.StandardScaler()
lin_reg2 = linear_model.LinearRegression()

pipeline_reg = pipeline.Pipeline([('poly', poly), ('scal', scaler), ('lin', lin_reg2)])
pipeline_reg.fit(Xfull, yfull)
curve = pipeline_reg.predict(X[:, np.newaxis])
plt.plot(X, curve)
plt.show()

png

# Add a regularized model and compare this one to the previous models
plt.figure(figsize=(10,6))

plt.xlabel("horsepower")
plt.ylabel('mpg')

plt.plot(list(data_new["horsepower"]), list(data_new["mpg"]), "bo")
plt.plot(list(data_excluded["horsepower"]), list(data_excluded["mpg"]), "rs")

X = np.linspace(0, 250, 1000)
plt.plot(X, theta0full + theta1full * X, "r--", label="Linear model on all data")
plt.plot(X, theta0 + theta1*X, "b:", label="Linear model on partial data")


ridge = linear_model.Ridge(alpha=10**5)
Xsample = np.c_[data_new["horsepower"]]
ysample = np.c_[data_new["mpg"]]
ridge.fit(Xsample, ysample)
theta0ridge, theta1ridge = ridge.intercept_[0], ridge.coef_[0][0]
plt.plot(X, theta0ridge + theta1ridge * X, "b", label="Regularized linear model on partial data")


plt.legend(loc="lower right")
plt.axis([0, 250, 0, 50])
plt.show()

png

References

Hands-On Machine Learning with Scikit-Learn and TensorFlow http://shop.oreilly.com/product/0636920052289.do