# This file provides a small self-contained demonstration of a Gaussian process
# model. The model really is just a Gaussian distribution! A few lines of
# standard linear algebra give the conditional distributions you need to perform
# inference. You will probably need to invest some effort in understanding this
# demonstration, and working out the correspondence to the maths in the notes.
# No matter what I provide, you'll need to do some hard work. But once you've
# reproduced the ideas for yourself you should find that nothing that
# complicated is going on.
#
# Alternatively, gp_altdemo.py is a simpler demo, in that it requires less
# linear algebra, but I've used a non-standard presentation to achieve that
# trick: it will still require thought!
#
# One messy detail that we do encounter is that large covariance matrices
# constructed from the Gaussian kernel are poorly-conditioned. This script
# applies a standard hack to work around that.
#
# The class notes link to several polished and featureful Gaussian process
# toolboxes. These hide the maths from you behind user-friendly interfaces,
# and come with more extensive demonstrations.
#
# Iain Murray, November 2016

import numpy as np
import matplotlib.pyplot as plt


## The kernel function
######################################################################

# This demo uses a Gaussian kernel function,
#     k(x1,x2) = sigma_f**2 * np.exp(-0.5 * np.sum(((x1-x2)/ell)**2))
# We create kernel functions that take NxD and MxD design matrices, to give NxM
# kernel values. rbf_fn is a fixed unit-bandwidth RBF kernel, and
# gauss_kernel_fn rescales that using free parameters ell (1xD or 1x1) and
# sigma_f (1x1).
rbf_fn = lambda X1, X2: \
    np.exp((np.dot(X1, (2*X2.T)) - np.sum(X1*X1, 1)[:, None]) - np.sum(X2*X2, 1)[None, :])
gauss_kernel_fn = lambda X1, X2, ell, sigma_f: \
    sigma_f**2 * rbf_fn(X1/(np.sqrt(2)*ell), X2/(np.sqrt(2)*ell))

# Pick some particular parameters for this demo:
k_fn = lambda X1, X2: gauss_kernel_fn(X1, X2, 3.0, 10.0)
# You could also try:
# k_fn = lambda X1, X2: gauss_kernel_fn(X1, X2, 0.5, 1.0)
# k_fn = lambda X1, X2: gauss_kernel_fn(X1, X2, 0.5, 1.0) + gauss_kernel_fn(X1, X2, 3.0, 10.0)
# ...

# gauss_kernel_fn is equivalent to the following slower function:
def gauss_kernel_naive(X1, X2, ell, sigma_f):
    N, M = X1.shape[0], X2.shape[0]
    K = np.zeros((N, M))
    for n in range(N):
        for m in range(M):
            K[n, m] = sigma_f**2 * np.exp(
                -0.5*np.sum(((X1[n, :] - X2[m, :])/ell)**2))
    return K


## Sampling from the prior
######################################################################

# Pick the input locations that we want to see the function at.
X_grid = np.arange(0, 10, 0.02)[:, None]

# Compute covariance of function values for those points. The Gaussian kernel
# gives positive definite matrices in theory. In practice, we need to add a tiny
# amount of noise to the process (a "jitter" or "nugget" term) if we need a
# matrix that is positive definite given finite numerical precision.
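# Optional sanity checks (a small sketch added for illustration; not part of the
# original demo). First, confirm that the vectorized gauss_kernel_fn agrees with
# gauss_kernel_naive on a few grid points. Then try a Cholesky factorization of
# the un-jittered kernel matrix: it typically raises LinAlgError, which is why
# the 1e-9 jitter is added just below.
assert np.allclose(gauss_kernel_fn(X_grid[:5], X_grid[:8], 3.0, 10.0),
                   gauss_kernel_naive(X_grid[:5], X_grid[:8], 3.0, 10.0))
try:
    np.linalg.cholesky(k_fn(X_grid, X_grid))
    print('Cholesky happened to succeed without jitter on this machine.')
except np.linalg.LinAlgError:
    print('Cholesky failed without jitter: the kernel matrix is numerically singular.')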
N_grid = X_grid.shape[0]
K_grid = k_fn(X_grid, X_grid) + 1e-9*np.eye(N_grid)

# To sample from a Gaussian with covariance K = L*L', we just multiply L by a
# vector of standard normals:
L_grid = np.linalg.cholesky(K_grid)
plt.figure(1)
plt.clf()
for ii in range(3):
    f_grid = np.dot(L_grid, np.random.randn(N_grid))
    plt.plot(X_grid, f_grid, '-')
plt.legend(['prior sample 1', 'prior sample 2', 'prior sample 3'])
plt.xlabel('x')
plt.ylabel('f')


## Sampling from the prior in two stages
######################################################################

# We don't have to sample "the whole" function, or a dense grid. Pick four of
# the original grid points, sample a setting of just those, and plot them as
# crosses:
idx = np.round(N_grid*np.array([0.2, 0.4, 0.6, 0.8])).astype(int)
X_locs1 = X_grid[idx, :]
N_locs1 = idx.size
K_locs1 = k_fn(X_locs1, X_locs1)
L_locs1 = np.linalg.cholesky(K_locs1)
plt.figure(2)
plt.clf()
noise_var = 0.0 # add no noise to the samples, we want to look at function values
#noise_var = 1.0 # add some noise to the samples, we want to simulate data
f_locs1 = np.dot(L_locs1, np.random.randn(N_locs1)) + \
    np.sqrt(noise_var)*np.random.randn(N_locs1)
plt.plot(X_locs1, f_locs1, 'x', markersize=20, markeredgewidth=2)

# We could later choose to sample any of the other function values, conditioned
# on the ones we already picked. We compute all the relevant covariances, and
# apply standard formulae to get the mean and covariance of the Gaussian over
# the remaining function values, conditioned on the ones we've already picked.
X_rest = np.delete(X_grid, idx, 0)
K_rest = k_fn(X_rest, X_rest)
K_rest_locs1 = k_fn(X_rest, X_locs1)
M = K_locs1 + noise_var*np.eye(N_locs1)
# We could compute cholesky(M) once and use it to solve both of the linear
# systems in the next two lines:
rest_cond_mu = np.dot(K_rest_locs1, np.linalg.solve(M, f_locs1))
rest_cond_cov = K_rest - np.dot(K_rest_locs1, np.linalg.solve(M, K_rest_locs1.T))

# Then we sample 3 different completions of the f_locs1 function values that we
# sampled above. Each completion by itself is just illustrating a sample of the
# function from the prior. The different completions illustrate different
# samples from the posterior at test locations, given a set of observed function
# values.
N_rest = X_rest.shape[0]
L_rest = np.linalg.cholesky(rest_cond_cov + 1e-9*np.eye(N_rest))
for ii in range(3):
    rest_sample = np.dot(L_rest, np.random.randn(N_rest)) + rest_cond_mu
    plt.plot(X_rest, rest_sample, '-')

# If you change noise_var from 0.0 to 1.0 above, you can simulate first looking
# at noisy values of the function. The conditional distribution is then the
# posterior over some function values given some noisy observations.

# If you'd rather plot the mean prediction and error bars than samples, you can
# do that too. The thick black solid line shows the mean, and the dashed thick
# black lines show +/- 2 standard deviations -- at any particular location, we
# have ~95% belief that the function will lie in this range.
plt.plot(X_rest, rest_cond_mu, '-k', linewidth=2)
rest_cond_std = np.sqrt(np.diag(rest_cond_cov))
plt.plot(X_rest, rest_cond_mu + 2*rest_cond_std, '--k', linewidth=2)
plt.plot(X_rest, rest_cond_mu - 2*rest_cond_std, '--k', linewidth=2)

# The mean and error bars can be computed more cheaply if you don't want to know
# the whole posterior. See the notes.
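# A sketch of that cheaper computation (an addition to the original demo, using
# only quantities already defined above). We factorize M once and compute only
# the marginal variances, never forming the full N_rest x N_rest posterior
# covariance. In a real implementation we would also compute only the diagonal
# of K_rest, rather than the whole matrix.
L_M = np.linalg.cholesky(M)                      # M = L_M L_M'
alpha = np.linalg.solve(L_M.T, np.linalg.solve(L_M, f_locs1))
cheap_mu = np.dot(K_rest_locs1, alpha)           # equals rest_cond_mu
A = np.linalg.solve(L_M, K_rest_locs1.T)         # solves L_M A = K_rest_locs1'
cheap_var = np.diag(K_rest) - np.sum(A*A, 0)     # marginal posterior variances
assert np.allclose(cheap_mu, rest_cond_mu)
assert np.allclose(cheap_var, rest_cond_std**2)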
plt.legend(['Initial Samples', 'completion 1', 'completion 2', 'completion 3',
        'mean completion', 'credible band'])
plt.xlabel('x')
plt.ylabel('f')

plt.show()

# Final remarks:
# --------------
# - None of the points have to be on a grid.
# - The initial training points X_locs1 could be anywhere.
# - The remaining test points X_rest could be anywhere.
# - If the points aren't sampled densely and in order, we wouldn't
#   join them with straight lines. We'd just plot individual points.
# - The inputs can be in D dimensions. The kernel function is the
#   only part of the code that looks at the original input features.
#   None of the rest of the code cares where the scalar function
#   values are located in input space.
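# For example (a small added sketch, not part of the original demo): the same
# kernel code accepts D-dimensional inputs. Here D=2, with a hypothetical
# per-dimension lengthscale, giving one kernel value per pair of input points.
X_a = np.random.randn(5, 2)          # 5 hypothetical inputs in 2-D
X_b = np.random.randn(3, 2)          # 3 hypothetical inputs in 2-D
ell_2d = np.array([[3.0, 0.5]])      # 1xD lengthscales, one per input dimension
K_ab = gauss_kernel_fn(X_a, X_b, ell_2d, 10.0)
assert K_ab.shape == (5, 3)
assert np.allclose(K_ab, gauss_kernel_naive(X_a, X_b, ell_2d, 10.0))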