# Source code for sciapy.regress.load_data

# -*- coding: utf-8 -*-
# vim:fileencoding=utf-8
#
# Copyright (c) 2017-2018 Stefan Bender
#
# This module is part of sciapy.
# sciapy is free software: you can redistribute it or modify
# it under the terms of the GNU General Public License as published
# by the Free Software Foundation, version 2.
# See accompanying LICENSE file or http://www.gnu.org/licenses/gpl-2.0.html.
"""SCIAMACHY data interface

Data loading and selection routines for SCIAMACHY data regression fits.
Includes reading the solar and geomagnetic index files used as proxies.
"""

import logging

import numpy as np
import pandas as pd
import xarray as xr

from astropy.time import Time

__all__ = ["load_solar_gm_table", "load_scia_dzm"]

# numpy version components, used below to decide whether
# `np.genfromtxt` supports the `encoding` keyword (added in numpy 1.14.0).
# `maxsplit=2` keeps the 3-way unpacking stable for versions with extra
# dot-separated components such as "1.22.0.dev0+..." (everything after
# the second dot lands in the "patch" field).
_NPV_MAJOR, _NPV_MINOR, _NPV_PATCH = np.__version__.split(".", 2)

# Hemispheric "summer" date ranges (NH: ~March--October, SH: ~September--April)
# used by `load_scia_dzm()` to restrict the data selection via
# `season="summerNH"` or `season="summerSH"`.
# NOTE(review): the exact start/end dates vary slightly from year to year;
# presumably they track the available measurement periods -- confirm against
# the data set documentation.
_seasons = {
	"summerNH": [
		slice("2002-03-20", "2002-09-25"),
		slice("2003-03-20", "2003-10-13"),
		slice("2004-03-20", "2004-10-20"),
		slice("2005-03-20", "2005-10-20"),
		slice("2006-03-20", "2006-10-20"),
		slice("2007-03-21", "2007-10-20"),
		slice("2008-03-20", "2008-10-20"),
		slice("2009-03-20", "2009-10-20"),
		slice("2010-03-20", "2010-10-20"),
		slice("2011-03-20", "2011-08-31"),
		slice("2012-03-20", "2012-04-07"),
	],
	"summerSH": [
		slice("2002-09-13", "2003-03-26"),
		slice("2003-09-13", "2004-04-01"),
		slice("2004-09-14", "2005-04-02"),
		slice("2005-09-13", "2006-04-02"),
		slice("2006-09-13", "2007-04-02"),
		slice("2007-09-13", "2008-04-01"),
		slice("2008-09-14", "2009-04-02"),
		slice("2009-09-03", "2010-04-02"),
		slice("2010-09-13", "2011-04-02"),
		slice("2011-09-13", "2012-04-01"),
	]
}

# Periods of strong solar proton events (SPEs), one `pd.date_range` per
# event.  Data falling inside these ranges are dropped by
# `load_scia_dzm()` when called with `SPEs=True`.
_SPEs = [pd.date_range("2002-04-20", "2002-05-01"),
		pd.date_range("2002-05-21", "2002-06-02"),
		pd.date_range("2002-07-15", "2002-07-27"),
		pd.date_range("2002-08-23", "2002-09-03"),
		pd.date_range("2002-09-06", "2002-09-17"),
		pd.date_range("2002-11-08", "2002-11-20"),
		pd.date_range("2003-05-27", "2003-06-07"),
		pd.date_range("2003-10-25", "2003-11-15"),
		pd.date_range("2004-07-24", "2004-08-05"),
		pd.date_range("2004-11-06", "2004-11-18"),
		pd.date_range("2005-01-15", "2005-01-27"),
		pd.date_range("2005-05-13", "2005-05-25"),
		pd.date_range("2005-07-13", "2005-08-05"),
		pd.date_range("2005-08-21", "2005-09-02"),
		pd.date_range("2005-09-07", "2005-09-21"),
		pd.date_range("2006-12-05", "2006-12-23"),
		pd.date_range("2012-01-22", "2012-02-07"),
		pd.date_range("2012-03-06", "2012-03-27")]


def load_dailymeanAE(
		filename="data/indices/AE_Kyoto_1980-2018_daily2_shift12h.dat",
		name="AE", tfmt="jyear"):
	"""Load the packaged daily mean AE (auroral electrojet) index table

	Resolves `filename` relative to the installed "sciapy" package and
	delegates the parsing to `load_solar_gm_table()`.

	Parameters
	----------
	filename: str, optional
		Path of the index file, relative to the "sciapy" package.
	name: str, optional
		Name for the proxy value column (default: "AE").
	tfmt: str, optional
		The astropy.time "Time Format" for the returned times
		(default: "jyear").

	Returns
	-------
	(times, values)
		As returned by `load_solar_gm_table()`.
	"""
	from pkg_resources import resource_filename
	path = resource_filename("sciapy", filename)
	return load_solar_gm_table(
			path, cols=[0, 1], names=["time", name], tfmt=tfmt)


def load_dailymeanLya(filename="data/indices/lisird_lya3_1980-2021.dat",
		name="Lya", tfmt="jyear"):
	"""Load the packaged daily mean solar Lyman-alpha index table

	Resolves `filename` relative to the installed "sciapy" package and
	delegates the parsing to `load_solar_gm_table()`.

	Parameters
	----------
	filename: str, optional
		Path of the index file, relative to the "sciapy" package.
	name: str, optional
		Name for the proxy value column (default: "Lya").
	tfmt: str, optional
		The astropy.time "Time Format" for the returned times
		(default: "jyear").

	Returns
	-------
	(times, values)
		As returned by `load_solar_gm_table()`.
	"""
	from pkg_resources import resource_filename
	path = resource_filename("sciapy", filename)
	return load_solar_gm_table(
			path, cols=[0, 1], names=["time", name], tfmt=tfmt)


def load_solar_gm_table(filename, cols, names, sep="\t", tfmt="jyear"):
	"""Load proxy tables from ascii files

	This function wraps :func:`numpy.genfromtxt()` [#]_ with pre-defined
	settings to match the file format.
	It explicitly returns the times as UTC decimal years or julian epochs.

	.. [#] https://docs.scipy.org/doc/numpy/reference/generated/numpy.genfromtxt.html

	Parameters
	----------
	filename: str
		The file to read
	cols: sequence
		The columns in the file to use, passed to `numpy.genfromtxt`'s
		`usecols` keyword. Should be at least of length 2, indicating
		time values in the first column, e.g., `cols=[0, 1]`.
	names: sequence
		The column names, passed as `names` to `numpy.genfromtxt()`.
		Should be at least of length 2, naming the proxy values in the
		second column, e.g., `names=["time", "proxy"]`.
	sep: str, optional
		The column separator character, passed as `delimiter`.
		Default: `\t`
	tfmt: str, optional
		The astropy.time "Time Format" for the time units,
		for example, "jyear", "decimalyear", "jd", "mjd", etc.
		See: http://docs.astropy.org/en/stable/time/index.html#time-format
		Default: "jyear"

	Returns
	-------
	times: array_like
		The proxy times according to the `tfmt` keyword (UTC)
		as :class:`numpy.ndarray` from the first column of "cols".
	values: tuple of array_like
		The proxy values combined into a structured array
		(:class:`numpy.recarray`) with fields set
		according to "names[1:]" and "cols[1:]" passed above.

	Raises
	------
	ValueError
		If `cols` or `names` has fewer than two entries.

	See Also
	--------
	:class:`astropy.time.Time`
	:class:`numpy.recarray`
	"""
	# Both sequences need a time entry plus at least one proxy entry.
	# (Fixed: the original raised only when *both* sequences were too
	# short, contradicting the error message below.)
	if len(cols) < 2 or len(names) < 2:
		raise ValueError(
			"Both `cols` and `names` should be at least of length 2.")
	encoding = {}
	# set the encoding for numpy >= 1.14.0
	# (tuple comparison so that numpy 2.x also passes; the previous
	# `major >= 1 and minor >= 14` check failed for e.g. 2.0)
	if (int(_NPV_MAJOR), int(_NPV_MINOR)) >= (1, 14):
		encoding = dict(encoding=None)
	tab = np.genfromtxt(filename,
			delimiter=sep,
			dtype=None,
			names=names,
			usecols=cols,
			**encoding)
	# First named column holds the (UTC) times, converted to the
	# requested astropy time format; the rest are the proxy values.
	times = Time(tab[names[0]], scale="utc")
	ts = getattr(times, tfmt)
	return ts, tab[names[1:]]
def _greedy_select(ds, size, varname="NO_DENS_std", scale=1.):
	"""Greedily subsample a dataset by largest uncertainty

	Iteratively picks the `size` entries of `ds[varname]` that maximize
	the entropy term log(1 + var * sigma2) and returns the dataset
	restricted to the selected times.

	Parameters
	----------
	ds: xarray.Dataset
		The dataset to subsample; must have a "time" dimension and
		contain the variable `varname`.
	size: int
		The number of points to select.
	varname: str, optional
		Name of the variable whose scaled squares act as the variances
		(default: "NO_DENS_std").
	scale: float, optional
		Scale factor applied to `ds[varname]` before squaring
		(default: 1.).

	Returns
	-------
	xarray.Dataset
		`ds` restricted to the `size` selected times.
	"""
	logging.info("Greedy subsampling to size: %s", size)
	var = np.ma.masked_array((ds[varname] * scale)**2, mask=[False])
	sigma2_i = scale**2
	idxs = np.arange(len(var))
	for _ in range(size):
		# Pick the point contributing the most entropy; masking it
		# excludes it from the following iterations.
		max_entr_idx = np.ma.argmax(np.log(1. + var * sigma2_i))
		sigma2_i += 1. / var[max_entr_idx]
		var.mask[max_entr_idx] = True
	# NOTE(review): the original also tracked a minimum-entropy index in
	# a parallel masked array (`var_p`, `sigma2_ip`), but that result was
	# never used for the returned selection; the dead bookkeeping was
	# removed without changing the selected subset.
	return ds.isel(time=idxs[var.mask])


def _greedy_idxs_post(x, xerr, size):
	"""Greedily select indices by largest uncertainty

	Same greedy maximum-entropy criterion as `_greedy_select()`, but
	operating on plain arrays and returning the selected indices.

	Parameters
	----------
	x: array_like
		The data values (not used by the selection itself, kept for
		interface symmetry with the caller).
	xerr: array_like
		The data uncertainties; their squares act as the variances.
	size: int
		The number of indices to select.

	Returns
	-------
	numpy.ndarray
		The `size` selected indices.
	"""
	logging.info("Greedy subsampling to size: %s", size)
	var = np.ma.masked_array(xerr**2, mask=[False])
	sigma2_i = 1.
	idxs = np.arange(len(var))
	for _ in range(size):
		max_entr_idx = np.ma.argmax(np.log(1. + var * sigma2_i))
		sigma2_i += 1. / var[max_entr_idx]
		var.mask[max_entr_idx] = True
	# NOTE(review): dead minimum-entropy bookkeeping removed here as
	# well; the returned indices are unchanged.
	return idxs[var.mask]
def load_scia_dzm(filename, alt, lat, tfmt="jyear",
		scale=1, subsample_factor=1, subsample_method="greedy",
		akd_threshold=0.002, cnt_threshold=0,
		center=False, season=None, SPEs=False):
	"""Load SCIAMACHY daily zonal mean data

	Interface function for SCIAMACHY daily zonal mean data files version 6.x.
	Uses :mod:`xarray` [#]_ to load and select the data. Possible selections
	are by hemispheric summer (NH summer ~ SH winter and SH summer ~ NH winter)
	and exclusion of strong solar proton events (SPE).

	.. [#] https://xarray.pydata.org

	Parameters
	----------
	filename: str
		The input filename
	alt: float
		The altitude
	lat: float
		The latitude
	tfmt: string, optional
		The astropy.time "Time Format" for the time units,
		for example, "jyear", "decimalyear", "jd", "mjd", etc.
		See: http://docs.astropy.org/en/stable/time/index.html#time-format
		Default: "jyear"
	scale: float, optional
		Scale factor of the data (default: 1)
	subsample_factor: int, optional
		Factor to subsample the data by (see `subsample_method`)
		(default: 1 (no subsampling))
	subsample_method: "equal", "greedy", or "random", optional
		Method for subsampling the data (see `subsample_factor`).
		"equal" for equally spaced subsampling,
		"greedy" for selecting the data based on their uncertainty,
		and "random" for uniform random subsampling.
		(default: "greedy")
	akd_threshold: float, optional
		Minimum averaging-kernel diagonal value for a data point to be
		kept (default: 0.002).
	cnt_threshold: int, optional
		Minimum number of measurements (counts) for a data point to be
		kept (default: 0).
	center: bool, optional
		Center the data by subtracting the global mean.
		(default: False)
	season: "summerNH", "summerSH", or `None`, optional
		Select the named season or `None` for all data (default: None)
	SPEs: bool, optional
		Set to `True` to exclude strong SPE events (default: False)

	Returns
	-------
	(times, dens, errs, szas): tuple of (N,) array_like
		The measurement times according to the `tfmt` keyword,
		the number densities, their uncertainties,
		and the mean solar zenith angles.
	"""
	logging.info("Opening dataset: '%s'", filename)
	NO_ds = xr.open_mfdataset(filename, decode_times=False,
			chunks={"time": 400, "latitude": 9, "altitude": 11})
	logging.info("done.")
	# Decode time coordinate for selection
	NO_ds["time"] = xr.conventions.decode_cf(NO_ds[["time"]]).time

	NO_mean = 0.
	if center:
		NO_mean = NO_ds.NO_DENS.mean().values
		logging.info("Centering with global mean: %s", NO_mean)
	NO_tds = NO_ds.sel(latitude=lat, altitude=alt)

	# Exclude SPEs first if requested
	if SPEs:
		logging.info("Removing SPEs.")
		for spe in _SPEs:
			NO_tds = NO_tds.drop(
					NO_tds.sel(time=slice(spe[0], spe[-1])).time.values,
					dim="time")

	# Filter by season
	if season in _seasons:
		logging.info("Restricting to season: %s", season)
		NO_tds = xr.concat([NO_tds.sel(time=s) for s in _seasons[season]],
				dim="time")
		NO_tds.load()
	else:
		logging.info("No season selected or unknown season, "
				"using all available data.")

	# Support both names used for the measurement counts variable
	try:
		NO_counts = NO_tds.NO_DENS_cnt
	except AttributeError:
		NO_counts = NO_tds.counts

	# Select only useful data
	NO_tds = NO_tds.where(
			np.isfinite(NO_tds.NO_DENS) &
			(NO_tds.NO_DENS_std != 0) &
			(NO_tds.NO_AKDIAG > akd_threshold) &
			(NO_counts > cnt_threshold) &
			(NO_tds.NO_MASK == 0), drop=True)

	no_dens = scale * NO_tds.NO_DENS
	if center:
		no_dens -= scale * NO_mean
	# Standard error of the mean; xarray aligns `NO_counts` (taken before
	# the `where` above) with the filtered data by coordinates.
	no_errs = scale * NO_tds.NO_DENS_std / np.sqrt(NO_counts)
	logging.debug("no_dens.shape (ntime,): %s", no_dens.shape)

	no_sza = NO_tds.mean_SZA

	# Convert to astropy.Time for Julian epoch or decimal year
	if NO_tds.time.size > 0:
		no_t = Time(pd.to_datetime(NO_tds.time.values, utc=True).to_pydatetime(),
				format="datetime", scale="utc")
		no_ys = getattr(no_t, tfmt)
	else:
		no_ys = np.empty_like(NO_tds.time.values, dtype=np.float64)

	if subsample_factor > 1:
		new_data_size = no_dens.shape[0] // subsample_factor
		if subsample_method == "random":
			# random subsampling
			_idxs = np.random.choice(no_dens.shape[0],
					new_data_size, replace=False)
		elif subsample_method == "equal":
			# equally spaced subsampling
			_idxs = slice(0, no_dens.shape[0], subsample_factor)
		else:
			# "greedy" subsampling (default fall-back)
			_idxs = _greedy_idxs_post(no_dens, no_errs, new_data_size)
		return (no_ys[_idxs],
				no_dens.values[_idxs],
				no_errs.values[_idxs],
				no_sza.values[_idxs])

	return no_ys, no_dens.values, no_errs.values, no_sza.values