# Source code for sciapy.regress.load_data

# -*- coding: utf-8 -*-
# vim:fileencoding=utf-8
#
# Copyright (c) 2017-2018 Stefan Bender
#
# This module is part of sciapy.
# sciapy is free software: you can redistribute it or modify
# it under the terms of the GNU General Public License as published
# by the Free Software Foundation, version 2.
# See accompanying LICENSE file or http://www.gnu.org/licenses/gpl-2.0.html.
"""SCIAMACHY data interface

Data loading and selection routines for SCIAMACHY data regression fits.
Includes reading the solar and geomagnetic index files used as proxies.
"""

import logging

import numpy as np
import pandas as pd
import xarray as xr

from astropy.time import Time

__all__ = ["load_solar_gm_table", "load_scia_dzm"]

# numpy version components, used below to decide whether
# `np.genfromtxt` supports the `encoding` keyword (added in numpy 1.14.0).
# `maxsplit=2` keeps the 3-way unpacking stable for versions with extra
# dot-separated components such as "1.22.0.dev0+..." (everything after
# the second dot lands in the "patch" field).
_NPV_MAJOR, _NPV_MINOR, _NPV_PATCH = np.__version__.split(".", 2)

# Hemispheric "summer" date ranges (NH: ~March--October, SH: ~September--April)
# used by `load_scia_dzm()` to restrict the data selection via
# `season="summerNH"` or `season="summerSH"`.
# NOTE(review): the exact start/end dates vary slightly from year to year;
# presumably they track the available measurement periods -- confirm against
# the data set documentation.
_seasons = {
	"summerNH": [
		slice("2002-03-20", "2002-09-25"),
		slice("2003-03-20", "2003-10-13"),
		slice("2004-03-20", "2004-10-20"),
		slice("2005-03-20", "2005-10-20"),
		slice("2006-03-20", "2006-10-20"),
		slice("2007-03-21", "2007-10-20"),
		slice("2008-03-20", "2008-10-20"),
		slice("2009-03-20", "2009-10-20"),
		slice("2010-03-20", "2010-10-20"),
		slice("2011-03-20", "2011-08-31"),
		slice("2012-03-20", "2012-04-07"),
	],
	"summerSH": [
		slice("2002-09-13", "2003-03-26"),
		slice("2003-09-13", "2004-04-01"),
		slice("2004-09-14", "2005-04-02"),
		slice("2005-09-13", "2006-04-02"),
		slice("2006-09-13", "2007-04-02"),
		slice("2007-09-13", "2008-04-01"),
		slice("2008-09-14", "2009-04-02"),
		slice("2009-09-03", "2010-04-02"),
		slice("2010-09-13", "2011-04-02"),
		slice("2011-09-13", "2012-04-01"),
	]
}

# Periods of strong solar proton events (SPEs), one `pd.date_range` per
# event.  Data falling inside these ranges are dropped by
# `load_scia_dzm()` when called with `SPEs=True`.
_SPEs = [pd.date_range("2002-04-20", "2002-05-01"),
		pd.date_range("2002-05-21", "2002-06-02"),
		pd.date_range("2002-07-15", "2002-07-27"),
		pd.date_range("2002-08-23", "2002-09-03"),
		pd.date_range("2002-09-06", "2002-09-17"),
		pd.date_range("2002-11-08", "2002-11-20"),
		pd.date_range("2003-05-27", "2003-06-07"),
		pd.date_range("2003-10-25", "2003-11-15"),
		pd.date_range("2004-07-24", "2004-08-05"),
		pd.date_range("2004-11-06", "2004-11-18"),
		pd.date_range("2005-01-15", "2005-01-27"),
		pd.date_range("2005-05-13", "2005-05-25"),
		pd.date_range("2005-07-13", "2005-08-05"),
		pd.date_range("2005-08-21", "2005-09-02"),
		pd.date_range("2005-09-07", "2005-09-21"),
		pd.date_range("2006-12-05", "2006-12-23"),
		pd.date_range("2012-01-22", "2012-02-07"),
		pd.date_range("2012-03-06", "2012-03-27")]


def load_dailymeanAE(
		filename="data/indices/AE_Kyoto_1980-2018_daily2_shift12h.dat",
		name="AE", tfmt="jyear"):
	"""Load the packaged daily mean AE (auroral electrojet) index table

	Resolves `filename` relative to the installed "sciapy" package and
	delegates the parsing to `load_solar_gm_table()`.

	Parameters
	----------
	filename: str, optional
		Path of the index file, relative to the "sciapy" package.
	name: str, optional
		Name for the proxy value column (default: "AE").
	tfmt: str, optional
		The astropy.time "Time Format" for the returned times
		(default: "jyear").

	Returns
	-------
	(times, values)
		As returned by `load_solar_gm_table()`.
	"""
	from pkg_resources import resource_filename
	path = resource_filename("sciapy", filename)
	return load_solar_gm_table(
			path, cols=[0, 1], names=["time", name], tfmt=tfmt)


def load_dailymeanLya(filename="data/indices/lisird_lya3_1980-2021.dat",
		name="Lya", tfmt="jyear"):
	"""Load the packaged daily mean solar Lyman-alpha index table

	Resolves `filename` relative to the installed "sciapy" package and
	delegates the parsing to `load_solar_gm_table()`.

	Parameters
	----------
	filename: str, optional
		Path of the index file, relative to the "sciapy" package.
	name: str, optional
		Name for the proxy value column (default: "Lya").
	tfmt: str, optional
		The astropy.time "Time Format" for the returned times
		(default: "jyear").

	Returns
	-------
	(times, values)
		As returned by `load_solar_gm_table()`.
	"""
	from pkg_resources import resource_filename
	path = resource_filename("sciapy", filename)
	return load_solar_gm_table(
			path, cols=[0, 1], names=["time", name], tfmt=tfmt)


def load_solar_gm_table(filename, cols, names, sep="\t", tfmt="jyear"):
	"""Load proxy tables from ascii files

	This function wraps :func:`numpy.genfromtxt()` [#]_ with pre-defined
	settings to match the file format.
	It explicitly returns the times as UTC decimal years or julian epochs.

	.. [#] https://docs.scipy.org/doc/numpy/reference/generated/numpy.genfromtxt.html

	Parameters
	----------
	filename: str
		The file to read
	cols: sequence
		The columns in the file to use, passed to `numpy.genfromtxt`'s
		`usecols` keyword. Should be at least of length 2, indicating
		time values in the first column, e.g., `cols=[0, 1]`.
	names: sequence
		The column names, passed as `names` to `numpy.genfromtxt()`.
		Should be at least of length 2, naming the proxy values in the
		second column, e.g., `names=["time", "proxy"]`.
	sep: str, optional
		The column separator character, passed as `delimiter`.
		Default: `\t`
	tfmt: str, optional
		The astropy.time "Time Format" for the time units,
		for example, "jyear", "decimalyear", "jd", "mjd", etc.
		See: http://docs.astropy.org/en/stable/time/index.html#time-format
		Default: "jyear"

	Returns
	-------
	times: array_like
		The proxy times according to the `tfmt` keyword (UTC)
		as :class:`numpy.ndarray` from the first column of "cols".
	values: tuple of array_like
		The proxy values combined into a structured array
		(:class:`numpy.recarray`) with fields set
		according to "names[1:]" and "cols[1:]" passed above.

	Raises
	------
	ValueError
		If `cols` or `names` has fewer than two entries.

	See Also
	--------
	:class:`astropy.time.Time`
	:class:`numpy.recarray`
	"""
	# Both sequences need a time entry plus at least one proxy entry.
	# (Fixed: the original raised only when *both* sequences were too
	# short, contradicting the error message below.)
	if len(cols) < 2 or len(names) < 2:
		raise ValueError(
			"Both `cols` and `names` should be at least of length 2.")
	encoding = {}
	# set the encoding for numpy >= 1.14.0
	# (tuple comparison so that numpy 2.x also passes; the previous
	# `major >= 1 and minor >= 14` check failed for e.g. 2.0)
	if (int(_NPV_MAJOR), int(_NPV_MINOR)) >= (1, 14):
		encoding = dict(encoding=None)
	tab = np.genfromtxt(filename,
			delimiter=sep,
			dtype=None,
			names=names,
			usecols=cols,
			**encoding)
	# First named column holds the (UTC) times, converted to the
	# requested astropy time format; the rest are the proxy values.
	times = Time(tab[names[0]], scale="utc")
	ts = getattr(times, tfmt)
	return ts, tab[names[1:]]
def _greedy_select(ds, size, varname="NO_DENS_std", scale=1.):
	"""Greedily subsample a dataset by largest uncertainty

	Iteratively picks the `size` entries of `ds[varname]` that maximize
	the entropy term log(1 + var * sigma2) and returns the dataset
	restricted to the selected times.

	Parameters
	----------
	ds: xarray.Dataset
		The dataset to subsample; must have a "time" dimension and
		contain the variable `varname`.
	size: int
		The number of points to select.
	varname: str, optional
		Name of the variable whose scaled squares act as the variances
		(default: "NO_DENS_std").
	scale: float, optional
		Scale factor applied to `ds[varname]` before squaring
		(default: 1.).

	Returns
	-------
	xarray.Dataset
		`ds` restricted to the `size` selected times.
	"""
	logging.info("Greedy subsampling to size: %s", size)
	var = np.ma.masked_array((ds[varname] * scale)**2, mask=[False])
	sigma2_i = scale**2
	idxs = np.arange(len(var))
	for _ in range(size):
		# Pick the point contributing the most entropy; masking it
		# excludes it from the following iterations.
		max_entr_idx = np.ma.argmax(np.log(1. + var * sigma2_i))
		sigma2_i += 1. / var[max_entr_idx]
		var.mask[max_entr_idx] = True
	# NOTE(review): the original also tracked a minimum-entropy index in
	# a parallel masked array (`var_p`, `sigma2_ip`), but that result was
	# never used for the returned selection; the dead bookkeeping was
	# removed without changing the selected subset.
	return ds.isel(time=idxs[var.mask])


def _greedy_idxs_post(x, xerr, size):
	"""Greedily select indices by largest uncertainty

	Same greedy maximum-entropy criterion as `_greedy_select()`, but
	operating on plain arrays and returning the selected indices.

	Parameters
	----------
	x: array_like
		The data values (not used by the selection itself, kept for
		interface symmetry with the caller).
	xerr: array_like
		The data uncertainties; their squares act as the variances.
	size: int
		The number of indices to select.

	Returns
	-------
	numpy.ndarray
		The `size` selected indices.
	"""
	logging.info("Greedy subsampling to size: %s", size)
	var = np.ma.masked_array(xerr**2, mask=[False])
	sigma2_i = 1.
	idxs = np.arange(len(var))
	for _ in range(size):
		max_entr_idx = np.ma.argmax(np.log(1. + var * sigma2_i))
		sigma2_i += 1. / var[max_entr_idx]
		var.mask[max_entr_idx] = True
	# NOTE(review): dead minimum-entropy bookkeeping removed here as
	# well; the returned indices are unchanged.
	return idxs[var.mask]
def load_scia_dzm(filename, alt, lat, tfmt="jyear",
		scale=1, subsample_factor=1, subsample_method="greedy",
		akd_threshold=0.002, cnt_threshold=0,
		center=False, season=None, SPEs=False):
	"""Load SCIAMACHY daily zonal mean data

	Interface function for SCIAMACHY daily zonal mean data files version 6.x.
	Uses :mod:`xarray` [#]_ to load and select the data. Possible selections
	are by hemispheric summer (NH summer ~ SH winter and SH summer ~ NH winter)
	and exclusion of strong solar proton events (SPE).

	.. [#] https://xarray.pydata.org

	Parameters
	----------
	filename: str
		The input filename
	alt: float
		The altitude
	lat: float
		The latitude
	tfmt: string, optional
		The astropy.time "Time Format" for the time units,
		for example, "jyear", "decimalyear", "jd", "mjd", etc.
		See: http://docs.astropy.org/en/stable/time/index.html#time-format
		Default: "jyear"
	scale: float, optional
		Scale factor of the data (default: 1)
	subsample_factor: int, optional
		Factor to subsample the data by (see `subsample_method`)
		(default: 1 (no subsampling))
	subsample_method: "equal", "greedy", or "random", optional
		Method for subsampling the data (see `subsample_factor`).
		"equal" for equally spaced subsampling,
		"greedy" for selecting the data based on their uncertainty,
		and "random" for uniform random subsampling.
		(default: "greedy")
	akd_threshold: float, optional
		Minimum averaging-kernel diagonal value for a data point to be
		kept (default: 0.002).
	cnt_threshold: int, optional
		Minimum number of measurements (counts) for a data point to be
		kept (default: 0).
	center: bool, optional
		Center the data by subtracting the global mean.
		(default: False)
	season: "summerNH", "summerSH", or `None`, optional
		Select the named season or `None` for all data (default: None)
	SPEs: bool, optional
		Set to `True` to exclude strong SPE events (default: False)

	Returns
	-------
	(times, dens, errs, szas): tuple of (N,) array_like
		The measurement times according to the `tfmt` keyword,
		the number densities, their uncertainties,
		and the mean solar zenith angles.
	"""
	logging.info("Opening dataset: '%s'", filename)
	NO_ds = xr.open_mfdataset(filename, decode_times=False,
			chunks={"time": 400, "latitude": 9, "altitude": 11})
	logging.info("done.")
	# Decode time coordinate for selection
	NO_ds["time"] = xr.conventions.decode_cf(NO_ds[["time"]]).time

	NO_mean = 0.
	if center:
		NO_mean = NO_ds.NO_DENS.mean().values
		logging.info("Centering with global mean: %s", NO_mean)
	NO_tds = NO_ds.sel(latitude=lat, altitude=alt)

	# Exclude SPEs first if requested
	if SPEs:
		logging.info("Removing SPEs.")
		for spe in _SPEs:
			NO_tds = NO_tds.drop(
					NO_tds.sel(time=slice(spe[0], spe[-1])).time.values,
					dim="time")

	# Filter by season
	if season in _seasons:
		logging.info("Restricting to season: %s", season)
		NO_tds = xr.concat([NO_tds.sel(time=s) for s in _seasons[season]],
				dim="time")
		NO_tds.load()
	else:
		logging.info("No season selected or unknown season, "
				"using all available data.")

	# Support both names used for the measurement counts variable
	try:
		NO_counts = NO_tds.NO_DENS_cnt
	except AttributeError:
		NO_counts = NO_tds.counts

	# Select only useful data
	NO_tds = NO_tds.where(
			np.isfinite(NO_tds.NO_DENS) &
			(NO_tds.NO_DENS_std != 0) &
			(NO_tds.NO_AKDIAG > akd_threshold) &
			(NO_counts > cnt_threshold) &
			(NO_tds.NO_MASK == 0), drop=True)

	no_dens = scale * NO_tds.NO_DENS
	if center:
		no_dens -= scale * NO_mean
	# Standard error of the mean; xarray aligns `NO_counts` (taken before
	# the `where` above) with the filtered data by coordinates.
	no_errs = scale * NO_tds.NO_DENS_std / np.sqrt(NO_counts)
	logging.debug("no_dens.shape (ntime,): %s", no_dens.shape)

	no_sza = NO_tds.mean_SZA

	# Convert to astropy.Time for Julian epoch or decimal year
	if NO_tds.time.size > 0:
		no_t = Time(pd.to_datetime(NO_tds.time.values, utc=True).to_pydatetime(),
				format="datetime", scale="utc")
		no_ys = getattr(no_t, tfmt)
	else:
		no_ys = np.empty_like(NO_tds.time.values, dtype=np.float64)

	if subsample_factor > 1:
		new_data_size = no_dens.shape[0] // subsample_factor
		if subsample_method == "random":
			# random subsampling
			_idxs = np.random.choice(no_dens.shape[0],
					new_data_size, replace=False)
		elif subsample_method == "equal":
			# equally spaced subsampling
			_idxs = slice(0, no_dens.shape[0], subsample_factor)
		else:
			# "greedy" subsampling (default fall-back)
			_idxs = _greedy_idxs_post(no_dens, no_errs, new_data_size)
		return (no_ys[_idxs],
				no_dens.values[_idxs],
				no_errs.values[_idxs],
				no_sza.values[_idxs])

	return no_ys, no_dens.values, no_errs.values, no_sza.values