# -*- coding: utf-8 -*-
# vim:fileencoding=utf-8
#
# Copyright (c) 2017-2018 Stefan Bender
#
# This module is part of sciapy.
# sciapy is free software: you can redistribute it or modify
# it under the terms of the GNU General Public License as published
# by the Free Software Foundation, version 2.
# See accompanying LICENSE file or http://www.gnu.org/licenses/gpl-2.0.html.
"""SCIAMACHY data regression MCMC sampler

Markov Chain Monte Carlo (MCMC) routines to sample
the posterior probability of SCIAMACHY data regression fits.
Uses the :class:`emcee.EnsembleSampler` [#]_ to do the real work.

.. [#] https://emcee.readthedocs.io
"""

import logging
from multiprocessing import Pool

import numpy as np
# DifferentialEvolutionSolver lives in a private scipy module; it is used
# only for its latin hypercube initialization of the walker positions.
from scipy.optimize._differentialevolution import DifferentialEvolutionSolver
from scipy.special import logsumexp

import celerite
import george
import emcee

try:
	from tqdm import tqdm
	have_tqdm = True
except ImportError:
	have_tqdm = False


__all__ = ["mcmc_sample_model"]


def _lpost(p, model, y=None, beta=1.):
	"""(Tempered) log posterior probability for `emcee`

	Returns ``beta * (log_likelihood + log_prior)`` for the parameter
	vector `p`, together with the log likelihood as an `emcee` "blob".
	"""
	model.set_parameter_vector(p)
	lprior = model.log_prior()
	if not np.isfinite(lprior):
		# parameters outside the prior support
		return (-np.inf, [np.nan])
	log_likelihood = model.log_likelihood(y, quiet=True)
	return (beta * (log_likelihood + lprior), [log_likelihood])


def _sample_mcmc(sampler, nsamples, p0, rst0,
		show_progress, progress_mod, debug=False):
	"""Run `nsamples` iterations of `sampler`, optionally logging progress

	Returns the last sample result (position, log probability,
	random state, and blobs) from the `emcee` sampler.
	"""
	smpl = sampler.sample(p0, rstate0=rst0, iterations=nsamples)
	if have_tqdm and show_progress:
		smpl = tqdm(smpl, total=nsamples, disable=None)
	for i, result in enumerate(smpl):
		if show_progress and (i + 1) % progress_mod == 0:
			if not have_tqdm and not debug:
				logging.info("%5.1f%%", 100 * (float(i + 1) / nsamples))
			if debug:
				_pos, _logp, _, _ = result
				logging.debug("%5.1f%% lnpmax: %s, p(lnpmax): %s",
					100 * (float(i + 1) / nsamples),
					np.max(_logp), _pos[np.argmax(_logp)])
	return result


def mcmc_sample_model(model, y, beta=1.,
		nwalkers=100, nburnin=200, nprod=800,
		nthreads=1, optimized=False, bounds=None,
		return_logpost=False,
		show_progress=False, progress_mod=10):
	"""Markov Chain Monte Carlo sampling interface

	MCMC sampling interface to sample posterior probabilities
	using the :class:`emcee.EnsembleSampler` [#]_.

	.. [#] https://emcee.readthedocs.io

	Arguments
	---------
	model : celerite.GP, george.GP, or sciapy.regress.RegressionModel instance
		The model to draw posterior samples from. It should provide either
		`log_likelihood()` and `log_prior()` functions or be directly callable
		via `__call__()`.
	y : (N,) array_like
		The data to condition the probabilities on.
	beta : float, optional
		Tempering factor for the probability, default: 1.
	nwalkers : int, optional
		The number of MCMC walkers (default: 100). If this number is smaller
		than 4 times the number of parameters, it is multiplied by the number
		of parameters. Otherwise it specifies the number of walkers directly.
	nburnin : int, optional
		The number of burn-in samples to draw, default: 200.
	nprod : int, optional
		The number of production samples to draw, default: 800.
	nthreads : int, optional
		The number of threads to use with a `multiprocessing.Pool`,
		used as `pool` for `emcee.EnsembleSampler`. Default: 1.
	optimized : bool, optional
		Indicate whether the actual (starting) position was determined with an
		optimization algorithm beforehand. If `False` (the default), a
		pre-burn-in run optimizes the starting position. Sampling continues
		from there with the normal burn-in and production runs. In that case,
		latin hypercube sampling is used to distribute the walker starting
		positions equally in parameter space.
	bounds : iterable, optional
		The parameter bounds as a list of (min, max) entries.
		Default: None
	return_logpost : bool, optional
		Indicate whether or not to return the sampled log probabilities as well.
		Default: False
	show_progress : bool, optional
		Print the percentage of samples every `progress_mod` samples.
		Default: False
	progress_mod : int, optional
		Interval in samples to print the percentage of samples.
		Default: 10

	Returns
	-------
	samples or (samples, logpost) : array_like or tuple
		(nwalkers * nprod, ndim) array of the sampled parameters from the
		production run if return_logpost is `False`.
		A tuple of an (nwalkers * nprod, ndim) array (the same as above) and
		an (nwalkers * nprod,) array with the second entry containing the
		log posterior probabilities if return_logpost is `True`.
	"""
	v = model.get_parameter_vector()
	ndim = len(v)
	if nwalkers < 4 * ndim:
		nwalkers *= ndim
	logging.info("MCMC parameters: %s walkers, %s burn-in samples, "
			"%s production samples using %s threads.",
			nwalkers, nburnin, nprod, nthreads)

	if isinstance(model, celerite.GP) or isinstance(model, george.GP):
		mod_func = _lpost
		mod_args = (model, y, beta)
	else:
		mod_func = model
		mod_args = (beta,)

	# Initialize the walkers.
	if not optimized:
		# scipy.optimize's DifferentialEvolutionSolver uses
		# latin hypercube sampling as starting positions.
		# We just use their initialization to avoid duplicating code.
		if bounds is None:
			bounds = model.get_parameter_bounds()
		de_solver = DifferentialEvolutionSolver(_lpost,
				bounds=bounds,
				popsize=nwalkers // ndim)
		# The initial population should reflect latin hypercube sampling
		p0 = de_solver.population
		# fill up to full size in case the number of walkers is not a
		# multiple of the number of parameters
		missing = nwalkers - p0.shape[0]
		p0 = np.vstack([p0] +
				[v + 1e-2 * np.random.randn(ndim) for _ in range(missing)])
	else:
		p0 = np.array([v + 1e-2 * np.random.randn(ndim)
				for _ in range(nwalkers)])

	# set up the sampling pool
	if nthreads > 1:
		pool = Pool(processes=nthreads)
	else:
		pool = None
	sampler = emcee.EnsembleSampler(nwalkers, ndim, mod_func,
			args=mod_args, pool=pool)

	rst0 = np.random.get_state()

	if not optimized:
		logging.info("Running MCMC fit (%s samples)", nburnin)
		p0, lnp0, rst0, _ = _sample_mcmc(sampler, nburnin, p0, rst0,
				show_progress, progress_mod, debug=True)
		logging.info("MCMC fit finished.")
		p = p0[np.argmax(lnp0)]
		logging.info("Fit max logpost: %s, params: %s, exp(params): %s",
				np.max(lnp0), p, np.exp(p))
		model.set_parameter_vector(p)
		logging.debug("params: %s", model.get_parameter_dict())
		logging.debug("log_likelihood: %s", model.log_likelihood(y))
		p0 = [p + 1e-4 * np.random.randn(ndim) for _ in range(nwalkers)]
		sampler.reset()

	logging.info("Running burn-in (%s samples)", nburnin)
	p0, lnp0, rst0, _ = _sample_mcmc(sampler, nburnin, p0, rst0,
			show_progress, progress_mod)
	logging.info("Burn-in finished.")

	p = p0[np.argmax(lnp0)]
	logging.info("burn-in max logpost: %s, params: %s, exp(params): %s",
			np.max(lnp0), p, np.exp(p))
	model.set_parameter_vector(p)
	logging.debug("params: %s", model.get_parameter_dict())
	logging.debug("log_likelihood: %s", model.log_likelihood(y))
	sampler.reset()

	logging.info("Running production chain (%s samples)", nprod)
	_sample_mcmc(sampler, nprod, p0, rst0, show_progress, progress_mod)
	logging.info("Production run finished.")

	samples = sampler.flatchain
	lnp = sampler.flatlnprobability
	# first column in the blobs are the log likelihoods
	lnlh = np.array(sampler.blobs)[..., 0].ravel().astype(float)
	post_expect_loglh = np.nanmean(np.array(lnlh))

	logging.info("total samples: %s", samples.shape)

	samplmean = np.mean(samples, axis=0)
	logging.info("mean: %s, exp(mean): %s, sqrt(exp(mean)): %s",
			samplmean, np.exp(samplmean), np.sqrt(np.exp(samplmean)))

	samplmedian = np.median(samples, axis=0)
	logging.info("median: %s, exp(median): %s, sqrt(exp(median)): %s",
			samplmedian, np.exp(samplmedian), np.sqrt(np.exp(samplmedian)))

	logging.info("max logpost: %s, params: %s, exp(params): %s",
			np.max(lnp), samples[np.argmax(lnp)],
			np.exp(samples[np.argmax(lnp)]))

	logging.info("AIC: %s", 2 * ndim - 2 * np.max(lnp))
	logging.info("BIC: %s", np.log(len(y)) * ndim - 2 * np.max(lnp))
	logging.info("poor man's evidence 1 sum: %s, mean: %s",
			np.sum(np.exp(lnp)), np.mean(np.exp(lnp)))
	logging.info("poor man's evidence 2 max: %s, std: %s",
			np.max(np.exp(lnp)), np.std(np.exp(lnp)))
	logging.info("poor man's evidence 3: %s",
			np.max(np.exp(lnp)) / np.std(np.exp(lnp)))
	logging.info("poor man's evidence 4 sum: %s",
			logsumexp(lnp, b=1. / lnp.shape[0], axis=0))

	# mode
	model.set_parameter_vector(samples[np.argmax(lnp)])
	log_lh = model.log_likelihood(y)
	# Use the likelihood instead of the posterior
	# https://doi.org/10.3847/1538-3881/aa9332
	logging.info("BIC lh: %s", np.log(len(y)) * ndim - 2 * log_lh)
	# DIC
	sample_deviance = -2 * np.max(lnp)
	deviance_at_sample = -2 * (model.log_prior() + log_lh)
	pd = sample_deviance - deviance_at_sample
	dic = 2 * sample_deviance - deviance_at_sample
	logging.info("max logpost log_lh: %s, AIC: %s, DIC: %s, pd: %s",
			model.log_likelihood(y), 2 * ndim - 2 * log_lh, dic, pd)

	# mean
	model.set_parameter_vector(samplmean)
	log_lh = model.log_likelihood(y)
	log_lh_mean = log_lh
	# DIC
	sample_deviance = -2 * np.nanmean(lnp)
	deviance_at_sample = -2 * (model.log_prior() + log_lh)
	pd = sample_deviance - deviance_at_sample
	dic = 2 * sample_deviance - deviance_at_sample
	logging.info("mean log_lh: %s, AIC: %s, DIC: %s, pd: %s",
			model.log_likelihood(y), 2 * ndim - 2 * log_lh, dic, pd)

	# median
	model.set_parameter_vector(samplmedian)
	log_lh = model.log_likelihood(y)
	# DIC
	sample_deviance = -2 * np.nanmedian(lnp)
	deviance_at_sample = -2 * (model.log_prior() + log_lh)
	dic = 2 * sample_deviance - deviance_at_sample
	pd = sample_deviance - deviance_at_sample
	logging.info("median log_lh: %s, AIC: %s, DIC: %s, pd: %s",
			model.log_likelihood(y), 2 * ndim - 2 * log_lh, dic, pd)

	# (4)--(6) in Ando2011 doi:10.1080/01966324.2011.10737798
	pd_ando = 2 * (log_lh_mean - post_expect_loglh)
	ic5 = -2 * post_expect_loglh + 2 * pd_ando
	ic6 = -2 * post_expect_loglh + 2 * ndim
	logging.info("Ando2011: pd: %s, IC(5): %s, IC(6): %s",
			pd_ando, ic5, ic6)

	if return_logpost:
		return samples, lnp
	return samples
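

# A minimal usage sketch with made-up data and settings: it conditions a
# `celerite.GP` with a Matern-3/2 kernel on synthetic noisy data and draws
# posterior samples via `mcmc_sample_model()`. The kernel choice, parameter
# bounds, and chain lengths are arbitrary assumptions for illustration, and
# the sketch assumes the emcee 2.x sampling interface used in this module.
if __name__ == "__main__":
	from celerite import terms

	np.random.seed(42)
	# synthetic data: a noisy sine curve
	t = np.sort(np.random.uniform(0., 10., 50))
	yerr = 0.1 * np.ones_like(t)
	y = np.sin(t) + yerr * np.random.randn(t.shape[0])

	# two hyperparameters: log_sigma and log_rho
	kernel = terms.Matern32Term(log_sigma=0., log_rho=0.)
	gp = celerite.GP(kernel, mean=0.)
	gp.compute(t, yerr)

	# Explicit finite bounds are needed for the latin hypercube
	# initialization of the walker starting positions.
	samples, lnp = mcmc_sample_model(gp, y,
			nwalkers=16, nburnin=50, nprod=100,
			bounds=[(-5., 5.), (-5., 5.)],
			return_logpost=True)
	print("sampled parameter medians:", np.median(samples, axis=0))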