Calibrating GPs using ABC

[1]:
import os
# Ignore my broken HDF5 install...
os.putenv("HDF5_DISABLE_VERSION_CHECK", '1')
[2]:
import pandas as pd
import cis
import iris

from utils import get_aeronet_data, get_bc_ppe_data

from esem.utils import validation_plot, plot_parameter_space, get_random_params, ensemble_collocate
from esem import gp_model
from esem.abc_sampler import ABCSampler, constrain

import iris.quickplot as qplt
import matplotlib.pyplot as plt
%matplotlib inline

C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\h5py\__init__.py:40: UserWarning: h5py is running against HDF5 1.10.6 when it was built against 1.10.5, this may cause problems
  '{0}.{1}.{2}'.format(*version.hdf5_built_version_tuple)

Read in the parameters and observables

[3]:
aaod = get_aeronet_data()
print(aaod)
Ungridded data: Absorption_AOD440nm / (1)
     Shape = (10098,)

     Total number of points = 10098
     Number of non-masked points = 10098
     Long name = Absorption_AOD440nm
     Standard name = None
     Units = 1
     Missing value = -999.0
     Range = (5.1e-05, 0.47236)
     History =
     Coordinates:
       longitude
          Long name =
          Standard name = longitude
          Units = degrees_east
          Missing value = None
          Range = (-155.576755, 141.3407)
          History =
       latitude
          Long name =
          Standard name = latitude
          Units = degrees_north
          Missing value = None
          Range = (-35.495807, 79.990278)
          History =
       time
          Long name =
          Standard name = time
          Units = days since 1600-01-01 00:00:00
          Missing value = None
          Range = (cftime.DatetimeGregorian(2017, 1, 1, 12, 0, 0, 0), cftime.DatetimeGregorian(2018, 1, 2, 12, 0, 0, 0))
          History =

[4]:
# Read in the PPE parameters, AAOD and DRE
ppe_params, ppe_aaod, ppe_dre = get_bc_ppe_data(dre=True)
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\iris\__init__.py:249: IrisDeprecation: setting the 'Future' property 'netcdf_promote' is deprecated and will be removed in a future release. Please remove code that sets this property.
  warn_deprecated(msg.format(name))
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\iris\__init__.py:249: IrisDeprecation: setting the 'Future' property 'netcdf_promote' is deprecated and will be removed in a future release. Please remove code that sets this property.
  warn_deprecated(msg.format(name))
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\iris\__init__.py:249: IrisDeprecation: setting the 'Future' property 'netcdf_promote' is deprecated and will be removed in a future release. Please remove code that sets this property.
  warn_deprecated(msg.format(name))
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\iris\__init__.py:249: IrisDeprecation: setting the 'Future' property 'netcdf_promote' is deprecated and will be removed in a future release. Please remove code that sets this property.
  warn_deprecated(msg.format(name))
[5]:
# Take the annual mean of the DRE
ppe_dre, = ppe_dre.collapsed('time', iris.analysis.MEAN)
WARNING:root:Creating guessed bounds as none exist in file
WARNING:root:Creating guessed bounds as none exist in file
WARNING:root:Creating guessed bounds as none exist in file
WARNING:root:Creating guessed bounds as none exist in file

Collocate the model on to the observations

[6]:
col_ppe_aaod = ensemble_collocate(ppe_aaod, aaod)
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\ma\core.py:3225: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  dout = self.data[indx]
[7]:
n_test = 8

X_test, X_train = ppe_params[:n_test], ppe_params[n_test:]
Y_test, Y_train = col_ppe_aaod[:n_test], col_ppe_aaod[n_test:]
[8]:
Y_train
[8]:
Absorption Optical Thickness - Total 550Nm (1) job obs
Shape 31 10098
Dimension coordinates
job x -
obs - x
Auxiliary coordinates
latitude - x
longitude - x
time - x

Setup and run the models

Explore different model choices

[9]:
from esem.utils import leave_one_out, prediction_within_ci
from scipy import stats
import numpy as np

from esem.data_processors import Log
res_l = leave_one_out(X_train, Y_train, model='GaussianProcess', data_processors=[Log(constant=0.1)], kernel=['Linear', 'Exponential', 'Bias'])
r2_values_l = [stats.linregress(x.data.compressed(), y.data[:, ~x.data.mask].flatten())[2]**2 for x,y,_ in res_l]
ci95_values_l = [prediction_within_ci(x.data.flatten(), y.data.flatten(), v.data.flatten())[2].sum()/x.data.count() for x,y,v in res_l]
print("Mean R^2: {:.2f}".format(np.asarray(r2_values_l).mean()))
print("Mean proportion within 95% CI: {:.2f}".format(np.asarray(ci95_values_l).mean()))

res = leave_one_out(X_train, Y_train, model='GaussianProcess', kernel=['Linear', 'Bias'])
r2_values = [stats.linregress(x.data.flatten(), y.data.flatten())[2]**2 for x,y,v in res]
ci95_values = [prediction_within_ci(x.data.flatten(), y.data.flatten(), v.data.flatten())[2].sum()/x.data.count() for x,y,v in res]
print("Mean R^2: {:.2f}".format(np.asarray(r2_values).mean()))
print("Mean proportion within 95% CI: {:.2f}".format(np.asarray(ci95_values).mean()))

# Note that while the Log pre-processing leads to slightly better R^2, the model is under-confident and
#  has too large uncertainties which would adversley effec our implausibility metric.
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
c:\users\duncan\pycharmprojects\gcem\esem\data_processors.py:67: RuntimeWarning: invalid value encountered in log
  return np.log(data + self.constant)
Mean R^2: 1.00
Mean proportion within 95% CI: 1.00
Mean R^2: 0.99
Mean proportion within 95% CI: 0.94

Build final model

[10]:
model = gp_model(X_train, Y_train, kernel=['Linear', 'Bias'])
[11]:
model.train()
[12]:
m, v = model.predict(X_test.values)
[13]:
validation_plot(Y_test.data.flatten(), m.data.flatten(), v.data.flatten(),
               minx=0, maxx=0.1, miny=0., maxy=0.1)
Proportion of 'Bad' estimates : 5.52%
C:\Users\duncan\miniconda3\envs\gcem_dev\lib\site-packages\numpy\core\_asarray.py:83: UserWarning: Warning: converting a masked element to nan.
  return array(a, dtype, copy=False, order=order)
../_images/examples_Calibrating_GPs_using_ABC_18_2.png

Sample and constrain the models

Emulating 1e6 sample points directly would require 673 Gb of memory so we can either run 1e6 samples for each point, or run the constraint everywhere, but in batches. Here we do the latter, optioanlly on the GPU, using the ‘naive’ algorithm for calculating the running mean and variance of the various properties.

The rejection sampling happens in a similar manner so that only as much memory as is used for one batch is ever used.

[14]:
# In this case
sample_points = pd.DataFrame(data=get_random_params(3, int(1e6)), columns=X_train.columns)
[15]:
# Note that smoothing the parameter distribution can be slow for large numbers of points
plot_parameter_space(sample_points, fig_size=(3,6), smooth=False)
../_images/examples_Calibrating_GPs_using_ABC_21_0.png
[16]:
# Setup the sampler to compare against our AeroNet data
sampler = ABCSampler(model, aaod, obs_uncertainty=0.5, repres_uncertainty=0.5)
[17]:
# Calculate the implausibilty for each sample against each observation - note this can be very large so we only sample a fraction!
implaus = sampler.get_implausibility(sample_points[::100], batch_size=1000)

# The implausibility distributions for different observations can be very different.
_ = plt.hist(implaus.data[:, 1400])
_ = plt.hist(implaus.data[:, 14])
plt.gca().set(xlabel='Implausibility')
100%|##########| 10000/10000 [00:05<00:00, 1859.15sample/s]
[17]:
[Text(0.5,0,'Implausibility')]
../_images/examples_Calibrating_GPs_using_ABC_23_2.png
[18]:
# Find the valid samples in our full 1million samples by comparing against a given tolerance and threshold
valid_samples = sampler.batch_constrain(sample_points, batch_size=1000, tolerance=.1)
print("Remaining points: {}".format(valid_samples.sum()))
100%|##########| 1000000/1000000 [05:23<00:00, 2972.65sample/s]Remaining points: 729607
[19]:
# Plot the reduced parameter distribution
constrained_sample = sample_points[valid_samples]
plot_parameter_space(constrained_sample, fig_size=(3,6))
../_images/examples_Calibrating_GPs_using_ABC_25_0.png
[20]:
# We can also easily plot the joint distributions
#  Only plot every one in 100 points as scatter plots with large numbers of points are slow...
import matplotlib

# Mimic Seaborn scaling without requiring the whole package
scale = 1.5
matplotlib.rcParams['font.size'] = 12 * scale
matplotlib.rcParams['axes.labelsize'] = 12 * scale
matplotlib.rcParams['axes.titlesize'] = 12 * scale
matplotlib.rcParams['xtick.labelsize'] = 11 * scale
matplotlib.rcParams['ytick.labelsize'] = 11 * scale
matplotlib.rcParams['lines.linewidth'] = 1.5 * scale
matplotlib.rcParams['lines.markersize'] = 6 * scale
#

m, _ = model.predict(constrained_sample[::100].values)
Zs = m.data
# Plot the emulated AAOD value (averaged over observation locations) for each point
grr = pd.plotting.scatter_matrix(constrained_sample[::100], c=Zs.mean(axis=1), figsize=(12, 10), marker='o',
                                 hist_kwds={'bins': 20,}, s=20, alpha=.8, vmin=1e-3, vmax=1e-2, range_padding=0.,
                                 density_kwds={'range': [[0., 1.], [0., 1.]], 'colormap':'viridis'},
                                 )

# Matplotlib dragons...
grr[0][0].set_yticklabels([0.2, 0.4, 0.6, 0.8], fontsize=12 * scale)
for i in range(2):
    grr[i+1][0].set_yticklabels([0.0, 0.2, 0.4, 0.6, 0.8], fontsize=12 * scale)
for i in range(3):
    grr[2][i].set_xticks([0.0, 0.2, 0.4, 0.6, 0.8])
    grr[2][i].set_xticklabels([0.0, 0.2, 0.4, 0.6, 0.8], fontsize=12 * scale)

plt.colorbar(grr[0][1].collections[0], ax=grr, use_gridspec=True, label='AAOD (1)')

plt.savefig('BCPPE_constrained_params_paper.png', transparent=True)
../_images/examples_Calibrating_GPs_using_ABC_26_0.png

Explore the uncertainty in Direct Radiative Effect of Aerosol in constrianed sample-space

[21]:
dre_test, dre_train = ppe_dre[:n_test], ppe_dre[n_test:]

ari_model = gp_model(X_train, dre_train, name="ARI", kernel=['Linear', 'Bias'])
ari_model.train()
[22]:
# Calculate the mean and std-dev DRE over each set of sample points

unconstrained_mean_ari, unconstrained_sd_ari = ari_model.batch_stats(sample_points, batch_size=1000)
constrained_mean_ari, constrained_sd_ari = ari_model.batch_stats(constrained_sample, batch_size=1000)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-22-1ce0d8f67b9a> in <module>
      1 # Calculate the mean and std-dev DRE over each set of sample points
      2
----> 3 unconstrained_mean_ari, unconstrained_sd_ari = ari_model.batch_stats(sample_points, batch_size=1000)
      4 constrained_mean_ari, constrained_sd_ari = ari_model.batch_stats(constrained_sample, batch_size=1000)

c:\users\duncan\pycharmprojects\gcem\esem\emulator.py in batch_stats(self, sample_points, batch_size)
    112             # TODO: Make sample points optional and just sample from a uniform distribution if not provided
    113             mean, sd = _tf_stats(self, tf.constant(sample_points),
--> 114                                  tf.constant(batch_size, dtype=tf.int64),
    115                                  pbar=tf_tqdm(batch_size=batch_size,
    116                                               total=sample_points.shape[0]))

~\miniconda3\envs\gcem_dev\lib\site-packages\tensorflow\python\eager\def_function.py in __call__(self, *args, **kwds)
    826     tracing_count = self.experimental_get_tracing_count()
    827     with trace.Trace(self._name) as tm:
--> 828       result = self._call(*args, **kwds)
    829       compiler = "xla" if self._experimental_compile else "nonXla"
    830       new_tracing_count = self.experimental_get_tracing_count()

~\miniconda3\envs\gcem_dev\lib\site-packages\tensorflow\python\eager\def_function.py in _call(self, *args, **kwds)
    869       # This is the first call of __call__, so we have to initialize.
    870       initializers = []
--> 871       self._initialize(args, kwds, add_initializers_to=initializers)
    872     finally:
    873       # At this point we know that the initialization is complete (or less

~\miniconda3\envs\gcem_dev\lib\site-packages\tensorflow\python\eager\def_function.py in _initialize(self, args, kwds, add_initializers_to)
    724     self._concrete_stateful_fn = (
    725         self._stateful_fn._get_concrete_function_internal_garbage_collected(  # pylint: disable=protected-access
--> 726             *args, **kwds))
    727
    728     def invalid_creator_scope(*unused_args, **unused_kwds):

~\miniconda3\envs\gcem_dev\lib\site-packages\tensorflow\python\eager\function.py in _get_concrete_function_internal_garbage_collected(self, *args, **kwargs)
   2967       args, kwargs = None, None
   2968     with self._lock:
-> 2969       graph_function, _ = self._maybe_define_function(args, kwargs)
   2970     return graph_function
   2971

~\miniconda3\envs\gcem_dev\lib\site-packages\tensorflow\python\eager\function.py in _maybe_define_function(self, args, kwargs)
   3359
   3360           self._function_cache.missed.add(call_context_key)
-> 3361           graph_function = self._create_graph_function(args, kwargs)
   3362           self._function_cache.primary[cache_key] = graph_function
   3363

~\miniconda3\envs\gcem_dev\lib\site-packages\tensorflow\python\eager\function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
   3204             arg_names=arg_names,
   3205             override_flat_arg_shapes=override_flat_arg_shapes,
-> 3206             capture_by_value=self._capture_by_value),
   3207         self._function_attributes,
   3208         function_spec=self.function_spec,

~\miniconda3\envs\gcem_dev\lib\site-packages\tensorflow\python\framework\func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
    988         _, original_func = tf_decorator.unwrap(python_func)
    989
--> 990       func_outputs = python_func(*func_args, **func_kwargs)
    991
    992       # invariant: `func_outputs` contains only Tensors, CompositeTensors,

~\miniconda3\envs\gcem_dev\lib\site-packages\tensorflow\python\eager\def_function.py in wrapped_fn(*args, **kwds)
    632             xla_context.Exit()
    633         else:
--> 634           out = weak_wrapped_fn().__wrapped__(*args, **kwds)
    635         return out
    636

~\miniconda3\envs\gcem_dev\lib\site-packages\tensorflow\python\framework\func_graph.py in wrapper(*args, **kwargs)
    975           except Exception as e:  # pylint:disable=broad-except
    976             if hasattr(e, "ag_error_metadata"):
--> 977               raise e.ag_error_metadata.to_exception(e)
    978             else:
    979               raise

TypeError: in user code:


    TypeError: tf__batch_stats() takes from 2 to 3 positional arguments but 4 were given

[23]:
# The original (unconstrained DRE)
qplt.pcolormesh(unconstrained_sd_ari, vmin=0., vmax=1)
plt.gca().coastlines()
[23]:
<cartopy.mpl.feature_artist.FeatureArtist at 0x2114ad520c8>
../_images/examples_Calibrating_GPs_using_ABC_30_1.png
[24]:
# The constrained DRE
qplt.pcolormesh(constrained_sd_ari, vmin=0., vmax=1)
plt.gca().coastlines()
[24]:
<cartopy.mpl.feature_artist.FeatureArtist at 0x2114a4e35c8>
../_images/examples_Calibrating_GPs_using_ABC_31_1.png
[25]:
# The change in spread after the constraint is applied
qplt.pcolormesh((constrained_sd_ari-unconstrained_sd_ari), cmap='RdBu_r', vmin=-5e-1, vmax=5e-1)
plt.gca().coastlines()
[25]:
<cartopy.mpl.feature_artist.FeatureArtist at 0x21132beb148>
../_images/examples_Calibrating_GPs_using_ABC_32_1.png
[ ]: