#!/usr/bin/env python

"""
This is a demo (not production code!) of black-box stochastic variational
inference applied to logistic regression.

The main black-box stochastic variational "engine" is svi_grad.

In principle it could be applied to any model with a differentiable negative
log-likelihood function, although for large models you may want to make the
posterior approximation diagonal, and you may want to build a more structured
prior than one shared variance for all of the weights.

This demonstration is a quick line-by-line port of the Matlab/Octave code in
svi_minimal.m. The Matlab/Octave tar-ball has some more code with some checks,
which I haven't ported.

Iain Murray, November 2016, November 2017
"""


import numpy as np
import matplotlib.pyplot as plt


def svi_grad(mm, LL, Lsigma_w, neg_log_like_grad):
    """
    cost function and gradients for black-box stochastic variational inference

    Inputs:
                   mm D,  mean of variational posterior
                   LL D,D lower-triangular Cholesky factor of the
                          variational posterior covariance, with its
                          diagonal log-transformed
             Lsigma_w     scalar: log of prior standard deviation over weights
    neg_log_like_grad fn  -ve log-likelihood of the model and its gradients
                          wrt the weights. This could be an unbiased estimate
                          based on a mini-batch; we only get unbiased
                          estimates of the cost and gradients anyway.

    Outputs:
                    J     scalar: estimate of variational cost function = -ELBO
               mm_bar D,  with derivatives wrt mm, ...
               LL_bar D,D ...LL, ...
         Lsigma_w_bar     ...and Lsigma_w
    """

    # Unpack Cholesky factor of posterior covariance and prior variance
    # from their unconstrained forms.
    D = mm.size
    L = np.tril(LL)
    diag = np.diag_indices_from(L)
    L[diag] = np.exp(LL[diag])
    sigma2_w = np.exp(2*Lsigma_w)

    # The estimate of the variational cost function
    J1 = -0.5*D - np.sum(np.diag(LL)) # - D/2*log(2*pi)
    tmp = (np.sum(L*L) + np.dot(mm, mm)) / sigma2_w
    J2 = tmp/2.0 + D*Lsigma_w         # + D/2*log(2*pi)
    nu = np.random.randn(D)
    ww = mm + np.dot(L, nu) # reparameterized sample of the weights from q
    J3, ww_bar = neg_log_like_grad(ww)
    J = J1 + J2 + J3

    # The derivatives
    mm_bar = mm/sigma2_w + ww_bar
    L_bar = L/sigma2_w + np.tril(np.dot(ww_bar[:,None], nu[None,:]))
    LL_bar = L_bar
    LL_bar[diag] = (L_bar[diag] * L[diag]) - 1
    Lsigma_w_bar = D - tmp

    return J, mm_bar, LL_bar, Lsigma_w_bar
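

# Not part of the original line-by-line port (whose separate checking code
# wasn't ported): a crude central-difference gradient check one could run by
# hand on any function returning (value, gradient) of a flat vector, such as
# a closure over logreg_negLlike below. The name fd_check_grad and the eps
# default are just illustrative choices. To check the stochastic svi_grad you
# would first have to fix its internal random draw (e.g. by seeding numpy
# before each call).
def fd_check_grad(fn, x0, eps=1e-5):
    """Return (analytic_grad, finite_diff_grad) of fn at x0 for comparison."""
    x0 = np.asarray(x0, dtype=float)
    _, grad = fn(x0)
    grad_fd = np.zeros_like(x0)
    for ii in range(x0.size):
        x_plus = x0.copy()
        x_minus = x0.copy()
        x_plus[ii] += eps
        x_minus[ii] -= eps
        grad_fd[ii] = (fn(x_plus)[0] - fn(x_minus)[0]) / (2*eps)
    return grad, grad_fd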


def logreg_negLlike(ww, X, yy):
    """
    Negative log-likelihood and gradients of logistic regression

        negLlike, ww_bar = logreg_negLlike(ww, X, yy)

    There's no separate bias term. So X needs augmenting with a
    constant column to include a bias.

    Inputs:
             ww D,
              X N,D
             yy N,

    Outputs:
       negLlike scalar
         ww_bar D,
    """

    # Force targets to be +/- 1
    yy = 2*(yy==1) - 1

    # forward computation of error
    sigma = 1.0/(1.0 + np.exp(-yy*np.dot(X, ww)))
    negLlike = -np.sum(np.log(sigma))

    # reverse computation of gradients
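    # Since d/da[-log(sigmoid(a))] = sigmoid(a) - 1 and a = yy*(X @ ww),
    # each row contributes (sigma_n - 1)*yy_n*x_n to the gradient wrt ww.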
    ww_bar = np.dot(X.T, yy*(sigma - 1))

    return negLlike, ww_bar
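

# A minimal sketch (not in the original demo) of the mini-batch option that
# svi_grad's docstring mentions: an unbiased estimate of the full-data
# negative log-likelihood and its gradient from a random subset of rows,
# rescaled by N/batch_size. The name minibatch_negLlike and the batch_size
# default are illustrative; it could replace the full-batch closure in the
# demo below, e.g. neg_log_like_grad = lambda w: minibatch_negLlike(w, X, yy).
def minibatch_negLlike(ww, X, yy, batch_size=10):
    N = X.shape[0]
    idx = np.random.choice(N, size=min(batch_size, N), replace=False)
    negLlike, ww_bar = logreg_negLlike(ww, X[idx], yy[idx])
    # Rescale so the expectation over random subsets matches the full-data sum.
    scale = N / float(len(idx))
    return scale*negLlike, scale*ww_bar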


if __name__ == "__main__":
    # Generate a synthetic dataset and set up its likelihood function
    np.random.seed(0)
    D = 3
    ww = 5*np.random.randn(D)
    N = 20
    X = np.hstack((np.random.randn(N, D-1), np.ones((N,1))))
    yy = np.random.rand(N) < (1.0/(1.0 + np.exp(-np.dot(X,ww))))
    neg_log_like_grad = lambda w: logreg_negLlike(w, X, yy)

    # If you rerun the fitting with different seeds you'll see we don't
    # get quite the same answer each time. I'd need to set the learning rate
    # schedule better, and/or run for longer, to get better convergence.
    np.random.seed(2)

    # Simple stochastic steepest descent with decreasing learning rate.
    # Here each update includes the whole dataset because the dataset is
    # tiny. However, we still have stochastic updates, as each update
    # uses a different random weight drawn from the current variational
    # approximation to the posterior.
    # Initialize the prior width broader than the one used to generate the
    # data. The prior width is also learned, so fitting can correct this.
    Lsigma_w = np.log(10)
    mm = np.zeros(D)
    LL = np.zeros((D, D))
    diag = np.diag_indices_from(LL)
    LL[diag] = Lsigma_w
    eta0 = 0.1
    tau = 0.1
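    # This step-size decay, eta = eta0/(1 + tau*eta0*ii), shrinks roughly like
    # 1/ii, so it satisfies the usual Robbins-Monro conditions (the steps sum
    # to infinity while their squares have a finite sum).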
    for ii in range(10000):
        eta = eta0 / (1 + tau*eta0*ii)
        J, mm_bar, LL_bar, Lsigma_w_bar = svi_grad(mm, LL, Lsigma_w, neg_log_like_grad)
        mm = mm - eta*mm_bar
        LL = LL - eta*LL_bar
        Lsigma_w = Lsigma_w - eta*Lsigma_w_bar
        # hack to stop the prior width taking extreme values early in training:
        Lsigma_w = min(100.0, Lsigma_w)

    # Extract covariance of the variational posterior from its
    # unconstrained parameterization.
    L = np.tril(LL)  # copy, as in svi_grad, rather than modifying LL in place
    L[diag] = np.exp(LL[diag])
    V = np.dot(L, L.T)

    # Plot data:
    plt.clf()
    plt.plot(X[yy==1, 0], X[yy==1, 1], 'bx')
    plt.plot(X[yy==0, 0], X[yy==0, 1], 'ro')
    plt.legend(('y=1', 'y=0'))

    # Overlay contour plot of approximate predictive distribution:
    x_grid = np.arange(-3, 3, 0.05)
    X1, X2 = np.meshgrid(x_grid, x_grid)
    NG = X1.size
    X_test = np.hstack((X1.reshape(NG,1), X2.reshape(NG,1), np.ones((NG,1))))
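    # Average the logistic sigmoid over the Gaussian posterior on the
    # activation a = w'x, which under q has mean mm'x and variance x'Vx,
    # using the standard probit-style approximation (MacKay's kappa):
    #     p(y=1|x, D) ~= sigmoid(kappa * mm'x),  kappa = 1/sqrt(1 + pi*x'Vx/8).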
    kappa = 1.0 / np.sqrt(1 + (np.pi/8)*np.sum(np.dot(X_test,V)*X_test, 1))
    p_test = 1.0 / (1+np.exp(-np.dot(X_test,mm)*kappa))
    P = np.reshape(p_test, X1.shape)
    CS = plt.contour(X1, X2, P, [0.1,0.25,0.5,0.75,0.9])
    plt.clabel(CS)
    plt.xlabel('x_1')
    plt.ylabel('x_2')
    plt.title('Contours of p(y=1|x,D)')
    plt.show()

