# You are expected to use this support code.
# You may want to write:
#     from ct_support_code import *
# at the top of your answers

# You will need NumPy and SciPy:
import numpy as np
from scipy.optimize import minimize
from scipy.linalg import cho_factor, cho_solve


def params_unwrap(param_vec, shapes, sizes):
    """Helper routine for minimize_list"""
    args = []
    pos = 0
    for i in range(len(shapes)):
        sz = sizes[i]
        args.append(param_vec[pos:pos+sz].reshape(shapes[i]))
        pos += sz
    return args


def params_wrap(param_list):
    """Helper routine for minimize_list"""
    param_list = [np.array(x) for x in param_list]
    shapes = [x.shape for x in param_list]
    sizes = [x.size for x in param_list]
    param_vec = np.zeros(sum(sizes))
    pos = 0
    for param in param_list:
        sz = param.size
        param_vec[pos:pos+sz] = param.ravel()
        pos += sz
    unwrap = lambda pvec: params_unwrap(pvec, shapes, sizes)
    return param_vec, unwrap


def minimize_list(cost, init_list, args):
    """Optimize a list of arrays (wrapper of scipy.optimize.minimize)

    The input function "cost" should take a list of parameters,
    followed by any extra arguments:
        cost(init_list, *args)
    should return the cost of the initial condition, and a list in the
    same format as init_list giving gradients of the cost wrt the
    parameters.

    The options to the optimizer have been hard-coded. You may wish to
    change disp to True to get more diagnostics. You may want to
    decrease maxiter while debugging. However, please report all
    results in Q2-5 using maxiter=500.
    """
    opt = {'maxiter': 500, 'disp': False}
    init, unwrap = params_wrap(init_list)
    def wrap_cost(vec, *args):
        E, params_bar = cost(unwrap(vec), *args)
        vec_bar, _ = params_wrap(params_bar)
        return E, vec_bar
    res = minimize(wrap_cost, init, args, 'L-BFGS-B', jac=True, options=opt)
    return unwrap(res.x)


def linreg_cost(params, X, yy, alpha):
    """Regularized least squares cost function and gradients

    Can be optimized with minimize_list -- see fit_linreg_gradopt for a
    demonstration.

    Inputs:
        params: tuple (ww, bb): weights ww (D,), bias bb scalar
             X: N,D design matrix of input features
            yy: N, real-valued targets
         alpha: regularization constant

    Outputs: (E, [ww_bar, bb_bar]), cost and gradients
    """
    # Unpack parameters from list
    ww, bb = params

    # forward computation of error
    ff = np.dot(X, ww) + bb
    res = ff - yy
    E = np.dot(res, res) + alpha*np.dot(ww, ww)

    # reverse computation of gradients
    ff_bar = 2*res
    bb_bar = np.sum(ff_bar)
    ww_bar = np.dot(X.T, ff_bar) + 2*alpha*ww

    return E, [ww_bar, bb_bar]


def fit_linreg_gradopt(X, yy, alpha):
    """
    fit a regularized linear regression model with gradient opt

        ww, bb = fit_linreg_gradopt(X, yy, alpha)

    Find weights and bias by using a gradient-based optimizer
    (minimize_list) to improve the regularized least squares cost:

        np.sum(((np.dot(X,ww) + bb) - yy)**2) + alpha*np.dot(ww,ww)

    Inputs:
            X  N,D design matrix of input features
           yy  N,  real-valued targets
        alpha      scalar regularization constant

    Outputs:
           ww  D,  fitted weights
           bb      scalar fitted bias
    """
    D = X.shape[1]
    args = (X, yy, alpha)
    init = (np.zeros(D), np.array(0))
    ww, bb = minimize_list(linreg_cost, init, args)
    return ww, bb
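
# A minimal usage sketch of fit_linreg_gradopt: the helper name
# _demo_fit_linreg and the synthetic data below are invented for
# illustration only. It fits the regularized least squares cost with the
# gradient-based optimizer above, then compares against the closed-form
# solution of the same cost (weights regularized, bias not).
def _demo_fit_linreg():
    rng = np.random.default_rng(0)
    N, D = 100, 3
    X = rng.standard_normal((N, D))
    yy = np.dot(X, np.array([0.5, -2.0, 1.0])) + 0.3 + 0.1*rng.standard_normal(N)
    alpha = 10.0

    # Gradient-based fit using minimize_list via fit_linreg_gradopt
    ww, bb = fit_linreg_gradopt(X, yy, alpha)

    # Closed-form fit: augment X with a column of ones for the bias and
    # regularize only the weights, matching the cost in linreg_cost.
    X_aug = np.hstack([X, np.ones((N, 1))])
    reg = alpha*np.eye(D + 1)
    reg[-1, -1] = 0  # leave the bias unregularized
    w_closed = np.linalg.solve(np.dot(X_aug.T, X_aug) + reg,
                               np.dot(X_aug.T, yy))
    print('max weight difference:', np.max(np.abs(ww - w_closed[:-1])))
    print('bias difference:', np.abs(bb - w_closed[-1]))
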
def logreg_cost(params, X, yy, alpha):
    """Regularized logistic regression cost function and gradients

    Can be optimized with minimize_list -- see fit_linreg_gradopt for a
    demonstration of fitting a similar function.

    Inputs:
        params: tuple (ww, bb): weights ww (D,), bias bb scalar
             X: N,D design matrix of input features
            yy: N, binary targets (0/1 or -1/+1)
         alpha: regularization constant

    Outputs: (E, [ww_bar, bb_bar]), cost and gradients
    """
    # Unpack parameters from list
    ww, bb = params

    # Force targets to be +/- 1
    yy = 2*(yy==1) - 1

    # forward computation of error
    aa = yy*(np.dot(X, ww) + bb)
    sigma = 1/(1 + np.exp(-aa))
    E = -np.sum(np.log(sigma)) + alpha*np.dot(ww, ww)

    # reverse computation of gradients
    aa_bar = sigma - 1
    bb_bar = np.dot(aa_bar, yy)
    ww_bar = np.dot(X.T, yy*aa_bar) + 2*alpha*ww

    return E, (ww_bar, bb_bar)


def nn_cost(params, X, yy=None, alpha=None):
    """NN_COST simple neural network cost function and gradients, or predictions

        E, params_bar = nn_cost([ww, bb, V, bk], X, yy, alpha)
                 pred = nn_cost([ww, bb, V, bk], X)

    Cost function E can be minimized with minimize_list

    Inputs:
        params  (ww, bb, V, bk), where:
                --------------------------------
                    ww  K,  hidden-output weights
                    bb      scalar output bias
                     V  K,D hidden-input weights
                    bk  K,  hidden biases
                --------------------------------
             X  N,D input design matrix
            yy  N,  regression targets
         alpha      scalar regularization for weights

    Outputs:
                 E  sum of squares error
        params_bar  gradients wrt params, same format as params
    OR
              pred  N, predictions if only params and X are given as inputs
    """
    # Unpack parameters from list
    ww, bb, V, bk = params

    # Forwards computation of cost
    A = np.dot(X, V.T) + bk[None,:]  # N,K
    P = 1 / (1 + np.exp(-A))  # N,K
    F = np.dot(P, ww) + bb  # N,
    if yy is None:
        # user wants prediction rather than training signal:
        return F
    res = F - yy  # N,
    E = np.dot(res, res) + alpha*(np.sum(V*V) + np.dot(ww,ww))  # scalar

    # Reverse computation of gradients
    F_bar = 2*res  # N,
    ww_bar = np.dot(P.T, F_bar) + 2*alpha*ww  # K,
    bb_bar = np.sum(F_bar)  # scalar
    P_bar = np.dot(F_bar[:,None], ww[None,:])  # N,K
    A_bar = P_bar * P * (1 - P)  # N,K
    V_bar = np.dot(A_bar.T, X) + 2*alpha*V  # K,D
    bk_bar = np.sum(A_bar, 0)

    return E, (ww_bar, bb_bar, V_bar, bk_bar)


def rbf_fn(X1, X2):
    """Helper routine for gp_post_par"""
    # exp(-||x1 - x2||^2) for all pairs of rows, via the expanded square
    return np.exp((np.dot(X1, (2*X2.T)) - np.sum(X1*X1, 1)[:, None])
                  - np.sum(X2*X2, 1)[None, :])


def gauss_kernel_fn(X1, X2, ell, sigma_f):
    """Helper routine for gp_post_par"""
    # sigma_f^2 * exp(-||x1 - x2||^2 / (2*ell^2))
    return sigma_f**2 * rbf_fn(X1/(np.sqrt(2)*ell), X2/(np.sqrt(2)*ell))


def gp_post_par(X_rest, X_obs, yy, sigma_y=0.05, ell=5.0, sigma_f=0.1):
    """GP_POST_PAR means and covariances of a posterior Gaussian process

        rest_cond_mu, rest_cond_cov = gp_post_par(X_rest, X_obs, yy)
        rest_cond_mu, rest_cond_cov = gp_post_par(X_rest, X_obs, yy, sigma_y, ell, sigma_f)

    Calculate the means and covariances at all test locations of the
    posterior Gaussian process conditioned on the observations yy at
    observed locations X_obs.

    Inputs:
           X_rest  GP test locations
            X_obs  locations of observations
               yy  observed values
          sigma_y  observation noise standard deviation
              ell  kernel function length scale
          sigma_f  kernel function standard deviation

    Outputs:
         rest_cond_mu  mean at each location in X_rest
        rest_cond_cov  covariance matrix between function values at all
                       test locations
    """
    X_rest = X_rest[:, None]
    X_obs = X_obs[:, None]
    K_rest = gauss_kernel_fn(X_rest, X_rest, ell, sigma_f)
    K_rest_obs = gauss_kernel_fn(X_rest, X_obs, ell, sigma_f)
    K_obs = gauss_kernel_fn(X_obs, X_obs, ell, sigma_f)
    M = K_obs + sigma_y**2 * np.eye(yy.size)
    M_cho, M_low = cho_factor(M)
    rest_cond_mu = np.dot(K_rest_obs, cho_solve((M_cho, M_low), yy))
    rest_cond_cov = K_rest - np.dot(K_rest_obs, cho_solve((M_cho, M_low), K_rest_obs.T))
    return rest_cond_mu, rest_cond_cov
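
# A minimal usage sketch of gp_post_par: the helper name _demo_gp_post_par
# and the observations below are invented for illustration only. It
# conditions the default GP prior on a few noisy 1-D observations and
# returns the posterior mean with an approximate +/- 2 standard deviation
# band at a grid of test locations.
def _demo_gp_post_par():
    X_obs = np.array([2.0, 6.0, 9.0, 13.0])
    yy = np.array([0.10, 0.20, -0.10, 0.05])
    X_rest = np.arange(0.0, 20.0, 0.5)

    mu, cov = gp_post_par(X_rest, X_obs, yy)
    std = np.sqrt(np.diag(cov))

    # Posterior mean and roughly-95% credible band at each test location:
    lower, upper = mu - 2*std, mu + 2*std
    return X_rest, mu, lower, upper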