# Some common preliminaries
import numpy as np
import matplotlib.pyplot as plt


# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the installed scikit-learn version before running this cell.
from sklearn.datasets import load_boston # Get the dataset
data = load_boston() # Load the data


data.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])


data.feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')


print(data.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
        - LSTAT    % lower status of the population
        - MEDV     Median value of owner-occupied homes in $1000's

    :Missing Attribute Values: None

    :Creator: Harrison, D. and Rubinfeld, D.L.

This is a copy of UCI ML housing dataset.
https://archive.ics.uci.edu/ml/machine-learning-databases/housing/


This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980.   N.B. Various transformations are used in the table on
pages 244-261 of the latter.

The Boston house-price data has been used in many machine learning papers that address regression
problems.   
     
.. topic:: References

   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.


plt.hist((data.target - data.target.mean())/data.target.std(), 100, density=True);


# Linear regression is sensitive to outliers, so drop every sample whose
# target lies more than 2.5 standard deviations above the mean (z > 2.5).

# Upper cut-off for the target and the positions of the rows at or below it
max_val = data.target.mean() + 2.5 * data.target.std()
valid_targets = np.nonzero(data.target <= max_val)

# Keep only those rows, in features and target alike
keep_rows = valid_targets[0]
data.data = data.data[keep_rows]
data.target = data.target[keep_rows]


# Another good step is to normalize the columns.
# This will prevent headaches later on!
from sklearn.preprocessing import normalize
data_norm = normalize(data.data, norm='l2', axis=0, copy=True)

# Quick check of normalized cols
np.allclose(np.diagonal(data_norm.T @ data_norm),1)

True


# Relationship of LSTAT to Target

from scipy.stats import pearsonr
r,p = pearsonr(data_norm[:,-1], data.target)
plt.scatter(data_norm[:,-1], data.target)
plt.xlabel(data.feature_names[-1])
plt.ylabel('Price')
plt.title(f'r={r:0.3f}, p={p:0.2E}');


# Relationship of B to Target

r, p = pearsonr(data_norm[:,-2], data.target)
plt.scatter(data_norm[:,-2], data.target)
plt.xlabel(data.feature_names[-2])
plt.ylabel('Price')
plt.title(f'r={r:0.3f}, p={p:0.2E}');


from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn.utils import resample


# Fit a linear model and get the MSE
reg = LinearRegression().fit(data_norm, data.target)
mse_raw = mse(reg.predict(data_norm), data.target)
print(f'MSE={mse_raw:0.3f}')

MSE=12.588


# Measure fairness: compare how well separate models fit the lower-priced and
# the higher-priced half of the houses, relative to the fit on all of the data.

# Let's combine the X and y arrays to enable easier indexing

d = np.hstack((data_norm, data.target[:,np.newaxis])) # keep shape tuple same

# Step 1:
# Fit the full model (we've done this before but just bear with me)
reg_full = LinearRegression().fit(d[:,:-1], d[:,-1])
s_full = mse(reg_full.predict(d[:,:-1]), d[:,-1])

# Step 2:
# Split the rows at the median of the target (last column):
# below the median -> low, at or above the median -> high
d_l_idx = np.where(d[:, -1] < np.median(d[:,-1]))
d_h_idx = np.where(d[:, -1] >= np.median(d[:,-1]))

d_l = d[d_l_idx]
d_h = d[d_h_idx]

# Step 3:
# Perform separate regressions for low vs high value houses
reg_l = LinearRegression().fit(d_l[:,:-1], d_l[:,-1])
s_l = mse(reg_l.predict(d_l[:,:-1]), d_l[:,-1])
reg_h = LinearRegression().fit(d_h[:,:-1], d_h[:,-1])
s_h = mse(reg_h.predict(d_h[:,:-1]), d_h[:,-1])

# Step 4:
# Calculate the bias: 100 * (low MSE - high MSE) / full MSE
score_diff = 100*(s_l - s_h)/s_full

print(f'Bias: {score_diff:0.3f}')

Bias: -49.313


# Combine the data and targets for easier sampling
data_full = np.hstack((data_norm, data.target[:, np.newaxis]))

# One bias score per bootstrap replicate. The scores are collected in a list
# because np.append copies the whole array on every call.
bias_scores = []

# Run bootstrap
for i in range(1000):
  d = resample(data_full)

  # Obtain MSE for the current bootstrapped dataset
  reg_full = LinearRegression().fit(d[:,:-1], d[:,-1])
  s_full = mse(reg_full.predict(d[:,:-1]), d[:,-1])

  # Split the rows at the median of the resampled target:
  # below the median -> low, at or above the median -> high
  target_median = np.median(d[:,-1])
  d_l_idx = np.where(d[:, -1] < target_median)
  d_h_idx = np.where(d[:, -1] >= target_median)

  d_l = d[d_l_idx]
  d_h = d[d_h_idx]

  # Perform separate regressions for low vs high value houses
  reg_l = LinearRegression().fit(d_l[:,:-1], d_l[:,-1])
  s_l = mse(reg_l.predict(d_l[:,:-1]), d_l[:,-1])
  reg_h = LinearRegression().fit(d_h[:,:-1], d_h[:,-1])
  s_h = mse(reg_h.predict(d_h[:,:-1]), d_h[:,-1])

  # Bias of this replicate: 100 * (low MSE - high MSE) / full MSE
  bias_scores.append(100*(s_l - s_h)/s_full)

# array of accumulated bias scores
score_diff = np.array(bias_scores)


# Plot the results
fig, (ax_fit, ax_bias) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: predictions of the model fitted on the full data set (`reg`).
# `reg_full` is not used here because after the bootstrap loop it holds the
# model of the last resampled data set only.
full_preds = reg.predict(data_norm)
ax_fit.scatter(full_preds, data.target)
ax_fit.set_title(f'MSE full fit = {mse(full_preds, data.target):0.3f}')

# Right panel: distribution of the bootstrapped bias scores
ax_bias.hist(score_diff, density=True, alpha=0.3)
ax_bias.set_xlabel("bias")
ax_bias.set_ylabel("freq")
ax_bias.set_title(f'bias mean={score_diff.mean():0.3f}, std={score_diff.std():0.3f}');


# One Class to Rule Them All!

class linreg(LinearRegression):
  """
  LinearRegression that fits itself on construction and can bootstrap a bias score.

  The bias score of a data set is 100 * (MSE_low - MSE_high) / MSE_full, where
  MSE_full is the training MSE of a model fitted on all rows, and MSE_low /
  MSE_high are the training MSEs of separate models fitted on the rows whose
  target is below / at-or-above the median target.
  """

  def __init__(self, X, y, *args, **kwargs):
    """
    Wrapper over the scikit-learn LinearRegression.
    Init calls fit() and also calculates MSE (mean squared error).
    Enables creation of multiple dataset/model objects.

    Parameters
    ----------
    X : 2-D array of features, one row per sample.
    y : 1-D array of targets, one entry per row of X.
    *args : accepted for backward compatibility and ignored.
    **kwargs : optional settings
        debug (bool, default False): print the MSE and the mean bias.
        nreps (int, default 1000): number of bootstrap replicates.
        random_state (None, int or np.random.RandomState, default None):
            seed for the bootstrap resampling; None keeps the global
            NumPy random state, i.e. runs are not reproducible.
    """
    # Initialize the parent class
    super().__init__()

    # Set params. The *value* of `debug` is honoured, so debug=False really
    # switches the output off (previously the mere presence of the key
    # enabled it).
    self.debug = bool(kwargs.get("debug", False)) # useful for debugging
    self.nreps = kwargs.get("nreps", 1000)
    self.random_state = kwargs.get("random_state")

    # Filled in by bootstrap_bias(); None means "not computed yet"
    self.score_diff = None

    # Fit the data
    self.fit(X, y)
    self.preds = self.predict(X)
    self.X = X
    self.y = y

    # Calculate MSE
    self.mse = self.mean_squared_error(X, y)

    #Always good to have some housekeeping
    if self.debug: print(f'mse: {self.mse:0.3f}')

  def mean_squared_error(self, X, y):
    "Calculate the mean squared error between prediction and observation"
    return np.sum((self.predict(X) - y)**2)/len(y)

  @staticmethod
  def _fit_mse(d):
    "Fit a fresh LinearRegression on d (target in the last column) and return its training MSE"
    features, target = d[:, :-1], d[:, -1]
    return mse(target, LinearRegression().fit(features, target).predict(features))

  def bootstrap_bias(self, *args, **kwargs):
    """
    Run a bootstrap over the data to get a distribution of bias scores.

    Draws `self.nreps` resamples of the rows of (X, y) and computes the bias
    score of each one. The scores are stored in `self.score_diff` and
    returned as a 1-D array of length `self.nreps`. Positional and keyword
    arguments are accepted for backward compatibility and ignored.
    """
    # Combine the data and targets for easier sampling
    data = np.column_stack((self.X, self.y))

    # An integer seed is turned into one generator for the whole run;
    # re-seeding on every replicate would draw the same sample each time.
    rng = self.random_state
    if isinstance(rng, (int, np.integer)):
      rng = np.random.RandomState(rng)

    # Collected in a list: np.append would copy the array on every replicate
    scores = []

    # Run bootstrap
    for _ in range(self.nreps):
      d = resample(data, random_state=rng)

      # Obtain MSE for the current bootstrapped dataset
      s_full = self._fit_mse(d)

      # Divide the data into lower and higher prices based on target value
      target = d[:, -1]
      target_median = np.median(target)

      # Perform separate regressions for low vs high value houses
      s_l = self._fit_mse(d[target < target_median])
      s_h = self._fit_mse(d[target >= target_median])

      scores.append(100*(s_l - s_h)/s_full)

    score_diff = np.array(scores)
    self.score_diff = score_diff

    if self.debug: print(f'bias: {score_diff.mean():0.3f}')

    return score_diff

  def plot(self):
    "Plot the fit and bias scores (runs the bootstrap first if it has not been run yet)"
    if self.score_diff is None:
      self.bootstrap_bias()

    fig, (ax_fit, ax_bias) = plt.subplots(1, 2, figsize=(10, 5))

    # Left panel: predictions of the full fit against the observed targets
    ax_fit.scatter(self.preds, self.y)
    ax_fit.set_title(f'MSE full fit = {self.mse:0.3f}')

    # Right panel: distribution of the bootstrapped bias scores
    ax_bias.hist(self.score_diff, density=True, alpha=0.3)
    ax_bias.set_xlabel("bias")
    ax_bias.set_ylabel("freq")
    ax_bias.set_title(f'bias mean={self.score_diff.mean():0.3f}, std={self.score_diff.std():0.3f}')


# Let's see this in action using the raw features and time it.

%%time
raw_reg = linreg(data_norm, data.target, nreps=1000, debug=True)
raw_bias = raw_reg.bootstrap_bias()
raw_reg.plot()

mse: 12.588
bias: -45.037
CPU times: user 3.07 s, sys: 13.5 ms, total: 3.08 s
Wall time: 3.08 s


# Create politically correct features

# Column positions of every feature except the two unwanted ones
pol_corr_cols = [i for i, name in enumerate(data.feature_names) if name not in ('B', 'LSTAT')]

# Keep only those columns of the normalized data
pol_corr_data = data_norm[:, pol_corr_cols]

# Fit the model, bootstrap the bias and plot both
pol_corr_reg = linreg(pol_corr_data, data.target, debug=True)
pol_corr_bias = pol_corr_reg.bootstrap_bias()
pol_corr_reg.plot()

mse: 15.419
bias: -28.762


plt.figure()
plt.hist(raw_bias, alpha=0.3, density=True, label='Raw Bias');
plt.hist(pol_corr_bias, alpha=0.3, density=True, label='Dropped Columns');
plt.axvline(raw_bias.mean(), c='b')
plt.axvline(pol_corr_bias.mean(), c='r')
plt.legend();


# Fit clean data to 'LSTAT'
reg_pc = LinearRegression().fit(pol_corr_data, data_norm[:,-1])
print(reg_pc.score(pol_corr_data, data_norm[:,-1]))
             
# Fit clean data to 'B'
reg_pc = LinearRegression().fit(pol_corr_data, data_norm[:,-2])
print(reg_pc.score(pol_corr_data, data_norm[:,-2]))

0.6709661975363947
0.24306629879425456


# Get the orthonormal basis from the SVD
U, _, _ = np.linalg.svd(pol_corr_data, full_matrices=False)

# Test if the columns are orthogonal to the unwanted columns
U.T @ data_norm[:,(-1,-2)]

array([[-0.92092404, -0.91490534],
       [ 0.15816429, -0.25946357],
       [ 0.00450424,  0.02901175],
       [ 0.07869565,  0.14152012],
       [ 0.02250988,  0.13558084],
       [ 0.08881227, -0.00821001],
       [ 0.00677911,  0.01736526],
       [-0.10568813,  0.05822104],
       [-0.04101328,  0.00384456],
       [-0.0192633 ,  0.02302542],
       [ 0.12959063,  0.01288281]])


# Run a linreg just to see if the bias changes

svd_reg = linreg(U, data.target, debug=True)
svd_reg.bootstrap_bias()
svd_reg.plot()

mse: 15.419
bias: -28.455


# Create a matrix to store the values of the clean columns
orth_data = np.ones((data_norm.shape[0], data_norm.shape[1]-2))

# Loop over each column and remove the projections of the unwanted cols
for i in range(orth_data.shape[1]):

  # Remove projection of 'B' from each column and re-normalize
  orth_data[:,i] = data_norm[:,i] - data_norm[:,-2]*np.dot(data_norm[:,i], data_norm[:,-2])
  orth_data[:,i] = orth_data[:,i]/np.sqrt(np.dot(orth_data[:,i], orth_data[:,i]))

  # Remove projection of 'LSTAT' from each column and re-normalize.
  # The projection is subtracted along 'LSTAT' itself (column -1); the
  # earlier version subtracted it along column 1 by mistake.
  # NOTE(review): this second subtraction adds a multiple of 'LSTAT' back
  # into the column, so the result is orthogonal to 'LSTAT' but stays
  # orthogonal to 'B' only if 'B' and 'LSTAT' are orthogonal to each other.
  # Outputs recorded for this cell before the fix no longer apply.
  orth_data[:,i] = orth_data[:,i] - (np.dot(orth_data[:,i], data_norm[:,-1])*data_norm[:,-1])
  orth_data[:,i] = orth_data[:,i]/np.sqrt(np.dot(orth_data[:,i], orth_data[:,i]))

orth_data = normalize(orth_data, norm='l2', axis=0)


# Let's be sure the columns are orthonormal
np.allclose(np.diagonal(orth_data.T @ orth_data),1)

True


# And then let's ensure they are also orthogonal to the unwanted columns
orth_data.T @ data_norm[:,(-1,-2)]

array([[ 0.23840696, -0.13797649],
       [-0.13066592,  0.07562206],
       [ 0.28720424, -0.1662176 ],
       [ 0.01376148, -0.00796437],
       [ 0.30507012, -0.17655736],
       [ 0.20102537, -0.11634213],
       [ 0.2934696 , -0.16984364],
       [-0.10581986,  0.06124256],
       [ 0.27025785, -0.15640999],
       [ 0.29945689, -0.17330874],
       [ 0.28894943, -0.16722762]])


# Could it be that the unwanted columns aren't normalized?
np.allclose(np.diagonal(data_norm[:,(-1,-2)].T @ data_norm[:,(-1,-2)]),1)

True


# Component of 'LSTAT' that is orthogonal to 'B', scaled to unit length;
# let's call this C (step 2 in illustration). It does not depend on the
# column being cleaned, so it is computed once instead of once per column.
proj_b_lstat = data_norm[:,-1] - (np.dot(data_norm[:,-2], data_norm[:,-1])*data_norm[:,-2])
proj_b_lstat = proj_b_lstat/np.sqrt(np.dot(proj_b_lstat, proj_b_lstat))

orth_data = np.ones((data_norm.shape[0], data_norm.shape[1]-2))
for i in range(orth_data.shape[1]):

  # Remove the projection of 'B' from the cols and re-normalize
  orth_data[:,i] = data_norm[:,i] - data_norm[:,-2]*np.dot(data_norm[:,i], data_norm[:,-2])
  orth_data[:,i] = orth_data[:,i]/np.sqrt(np.dot(orth_data[:,i], orth_data[:,i]))

  # Remove the projection of C from the cols and re-normalize (step 3 in illustration)
  orth_data[:,i] = orth_data[:,i] - proj_b_lstat*np.dot(orth_data[:,i], proj_b_lstat)
  orth_data[:,i] = orth_data[:,i]/np.sqrt(np.dot(orth_data[:,i], orth_data[:,i]))

orth_data = normalize(orth_data, norm='l2', axis=0)


# test for orthonormal cols
np.allclose(np.diagonal(orth_data.T @ orth_data),1)

True


# test for orthogonality to unwanted cols
np.allclose(orth_data.T @ data_norm[:,(-1,-2)],0)

True


ortho_reg = linreg(orth_data, data.target, debug=True)
ortho_bias = ortho_reg.bootstrap_bias()
ortho_reg.plot()

mse: 44.409
bias: -2.697


# A comparison of the three different datasets
plt.figure()
plt.hist(raw_bias, 10, alpha=0.3, density=True, label="raw");
plt.axvline(raw_bias.mean(), c='b')
plt.hist(pol_corr_bias, 10, alpha=0.3, density=True, label='drop cols');
plt.axvline(pol_corr_bias.mean(), c='r')
plt.hist(ortho_bias, 10, alpha=0.3, density=True, label="Gram-Schmidt");
plt.axvline(ortho_bias.mean(), c='g')

plt.xlabel('Bias');
plt.ylabel('Freq');
plt.legend()

<matplotlib.legend.Legend at 0x7f1542ab3d10>


# Weight given to the biased (raw) model; the de-biased model gets 1 - weight
bias_weights = np.arange(0,1.01,0.01)

# Everything below is the same for every weight, so compute it only once
ortho_preds = ortho_reg.predict(orth_data)
raw_preds = raw_reg.predict(data_norm)

# Low / high halves of the houses, split at the median target
target_median = np.median(data.target)
idx_l = data.target<target_median
idx_h = data.target>=target_median

# Collected in lists: np.append would copy the array on every iteration
mse_values = []
bias_values = []

for i in bias_weights:
  # Take weighted average of models
  weighted_preds = ortho_preds*(1-i) + raw_preds*i

  # Compute MSE for weighted average
  mse_weighted = np.sum((weighted_preds - data.target)**2)/len(data.target)
  mse_values.append(mse_weighted)

  # Calculate the bias (a quick and dirty hack)
  s_l = mse(weighted_preds[idx_l], data.target[idx_l])
  s_h = mse(weighted_preds[idx_h], data.target[idx_h])

  bias_values.append(100*(s_l - s_h)/mse_weighted)
  # NOTE: This is not the way we calculated bias. Earlier we fitted separate models
  # So the values aren't exactly identical. This is a rough estimate for bias.

mse_s = np.array(mse_values)
bias_s = np.array(bias_values)


plt.plot(bias_weights, mse_s, label="MSE")
plt.plot(bias_weights, bias_s, label="Bias proxy")
plt.grid()
plt.xlabel('Weight of biased predictions')
plt.ylabel('MSE / QDBias') # QD: quick and dirty
plt.legend()

<matplotlib.legend.Legend at 0x7f1542b50450>

De-biasing predictors: a tutorial¶

What this tutorial is¶

What this tutorial is not¶

Background¶

The devil is in the details¶

Linear Regression on the raw data¶

Measuring bias¶

Bootstrapping for statistics¶

Idea 1: Being politically correct (bias avoidance)¶

SVD to the rescue?¶

The Gram-Schmidt process¶