From 453158e0f74ee8964a451c02609ea2ad1a02a86e Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Tue, 29 Sep 2020 13:17:46 +1000 Subject: [PATCH 01/11] Add more tests --- src/scmdata/time.py | 3 ++ tests/unit/test_time.py | 97 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 tests/unit/test_time.py diff --git a/src/scmdata/time.py b/src/scmdata/time.py index f2b2cf24..a00dbe01 100644 --- a/src/scmdata/time.py +++ b/src/scmdata/time.py @@ -67,6 +67,9 @@ def _format_datetime(dts: np.ndarray) -> np.ndarray: ValueError If one of the values in :obj:`dts` cannot be converted to :class:`np.datetime64` """ + + dts = np.asarray(dts) + if len(dts) <= 0: # pylint: disable=len-as-condition return np.array([], dtype=_TARGET_DTYPE) diff --git a/tests/unit/test_time.py b/tests/unit/test_time.py new file mode 100644 index 00000000..1edb7a16 --- /dev/null +++ b/tests/unit/test_time.py @@ -0,0 +1,97 @@ +import pytest +from scmdata.time import _format_datetime, decode_datetimes_to_index +import datetime as dt +import numpy as np + +from xarray import CFTimeIndex +from pandas import DatetimeIndex +import cftime + +input_type = pytest.mark.parametrize( + "input_type", + [ + "int-year", + "decimal-year", + "str-year", + "str-year-month-day", + "numpy", + "datetime", + "cftime", + ], +) + + +def convert_input(dates_as_int, input_type): + if input_type == "int-year": + return [int(d) for d in dates_as_int] + elif input_type == "decimal-year": + return [float(d) for d in dates_as_int] + elif input_type == "str-year": + return [str(d) for d in dates_as_int] + elif input_type == "str-year-month-day": + return [str(d) + "-01-01" for d in dates_as_int] + elif input_type == "numpy": + return [str(d) + "-01-01" for d in dates_as_int] + elif input_type == "datetime": + try: + return [dt.datetime(d, 1, 1) for d in dates_as_int] + except ValueError: + pytest.skip("datetime out of range") + elif input_type == "cftime": + return [cftime.datetime(d, 1, 1) 
for d in dates_as_int] + + +@input_type +def test_format(input_type): + dates = [2000, 2010, 2020] + + inp_dates = convert_input(dates, input_type) + res = _format_datetime(inp_dates) + exp = np.asarray(["2000-01-01", "2010-01-01", "2020-01-01"]).astype("datetime64[s]") + + np.testing.assert_array_equal(res, exp) + + +@input_type +def test_format_wide_range(input_type): + dates = [-100, 0, 1000, 5000] + + inp_dates = convert_input(dates, input_type) + res = _format_datetime(inp_dates) + exp = np.asarray(["-100-01-01", "0-01-01", "1000-01-01", "5000-01-01"]).astype( + "datetime64[s]" + ) + + np.testing.assert_array_equal(res, exp) + + +def test_format_weird(): + inp = np.asarray(["-1000"]) + res = _format_datetime(inp) + exp = np.asarray(["-1000-01-01"]).astype("datetime64[s]") + + np.testing.assert_array_equal(res, exp) + + +def test_to_cftime_index(): + inp = np.asarray(["-1000-01-01", "1000-01-01", "2000-01-01", "2000-01-01"]).astype( + "datetime64[s]" + ) + + res = decode_datetimes_to_index( + ["-1000-01-01", "1000-01-01", "2000-01-01", "2000-01-01"] + ) + + exp = CFTimeIndex(inp) + + assert all(idx.year == [-1000, 1000, 2000, 3000]) + + +def test_to_pd_index(): + inp = np.asarray(["-1000-01-01", "1000-01-01", "2000-01-01", "2000-01-01"]).astype( + "datetime64[s]" + ) + + idx = CFTimeIndex(inp) + + assert all(idx.year == [-1000, 1000, 2000, 3000]) From f631e0df256a69a3bd1f76edd493094eac62bb10 Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Tue, 29 Sep 2020 14:08:37 +1000 Subject: [PATCH 02/11] Tweak time handling --- src/scmdata/run.py | 20 ++++++++------ src/scmdata/time.py | 60 ++++++++++++++++++++++++++++++++++++----- tests/unit/test_time.py | 45 ++++++++++++++++--------------- 3 files changed, 88 insertions(+), 37 deletions(-) diff --git a/src/scmdata/run.py b/src/scmdata/run.py index 9dc1d3f3..3e085d91 100644 --- a/src/scmdata/run.py +++ b/src/scmdata/run.py @@ -37,7 +37,12 @@ from .ops import inject_ops_methods from .plotting import 
inject_plotting_methods from .pyam_compat import IamDataFrame, LongDatetimeIamDataFrame -from .time import _TARGET_DTYPE, TimePoints, TimeseriesConverter +from .time import ( + _TARGET_DTYPE, + TimePoints, + TimeseriesConverter, + decode_datetimes_to_index, +) from .units import UnitConverter _logger = getLogger(__name__) @@ -258,12 +263,12 @@ def _from_ts( if not isinstance(df, pd.DataFrame): df = pd.DataFrame(df) if index is not None: - if isinstance(index, np.ndarray): - df.index = TimePoints(index).to_index() - elif isinstance(index, TimePoints): - df.index = index.to_index() - else: + if isinstance(index, (np.ndarray, list)): + df.index = decode_datetimes_to_index(index) + elif isinstance(index, pd.Index): df.index = index + else: + raise ValueError("Could not determine type of index") # format columns to lower-case and check that all required columns exist if not set(required_cols).issubset(columns.keys()): @@ -2061,8 +2066,7 @@ def run_append( to_join_metas.append(run_to_join_meta) ret._df = pd.concat([ret._df] + to_join_dfs, axis="columns").sort_index() - ret._time_points = TimePoints(ret._df.index.values) - ret._df.index = ret._time_points.to_index() + ret._df.index = decode_datetimes_to_index(ret._df.index.values) ret._meta = pd.MultiIndex.from_frame( pd.concat([ret._meta.to_frame()] + to_join_metas).astype("category") ) diff --git a/src/scmdata/time.py b/src/scmdata/time.py index a00dbe01..4d120c2c 100644 --- a/src/scmdata/time.py +++ b/src/scmdata/time.py @@ -9,10 +9,12 @@ import cftime import numpy as np import pandas as pd -from dateutil import parser +from pandas.errors import OutOfBoundsDatetime +from xarray import CFTimeIndex _TARGET_TYPE = np.int64 _TARGET_DTYPE = "datetime64[s]" +_STANDARD_CALENDARS = {"standard", "gregorian", "proleptic_gregorian"} class InsufficientDataError(Exception): @@ -38,7 +40,7 @@ def _float_year_to_datetime(inp: float) -> np.datetime64: _ufunc_float_year_to_datetime = np.frompyfunc(_float_year_to_datetime, 1, 1) 
-_ufunc_str_to_datetime = np.frompyfunc(parser.parse, 1, 1) +_ufunc_str_to_datetime = np.frompyfunc(np.datetime64, 1, 1) def _parse_datetime(inp: np.ndarray) -> np.ndarray: @@ -48,18 +50,20 @@ def _parse_datetime(inp: np.ndarray) -> np.ndarray: return _ufunc_str_to_datetime(inp) -def _format_datetime(dts: np.ndarray) -> np.ndarray: +def _format_datetime(dts) -> np.ndarray: """ - Convert an array to an array of :class:`np.datetime64`. + Convert a list of times to numpy datetimes + + This truncates the datetimes to have second resolution Parameters ---------- - dts + dts : np.array or list Input to attempt to convert Returns ------- - :class:`np.ndarray` of :class:`np.datetime64` + :class:`np.ndarray` with dtype :class:`np.datetime64[s]` Converted array Raises @@ -87,6 +91,48 @@ def _format_datetime(dts: np.ndarray) -> np.ndarray: return np.asarray(dts, dtype=_TARGET_DTYPE) +def _to_cftimes(np_dates, calendar): + return cftime.num2date( + np_dates.astype(int), "seconds since 1970-01-01", calendar=calendar + ) + + +def decode_datetimes_to_index(dates, calendar=None, use_cftime=None): + """ + Decodes a list of dates to an index + + Uses xarray.CFTimeIndex() + Parameters + ---------- + dates + calendar + use_cftime + + Returns + ------- + + """ + dates = np.asarray(dates) + dates = _format_datetime(dates) + + if calendar is None: + calendar = "standard" + + if use_cftime is None: + try: + index = pd.DatetimeIndex(dates) + except (KeyError, OutOfBoundsDatetime, OverflowError): + index = CFTimeIndex(_to_cftimes(dates, calendar)) + elif use_cftime: + # Force coercion to cftimes + index = CFTimeIndex(_to_cftimes(dates, calendar)) + else: + index = pd.DatetimeIndex(dates) + + index.name = "time" + return index + + class TimePoints: """ Handles time points by wrapping :class:`np.ndarray` of :class:`np.datetime64`.. 
@@ -126,7 +172,7 @@ def to_index(self) -> pd.Index: :class:`pd.Index` of :class:`np.dtype` :class:`object` with name ``"time"`` made from the time points represented as :class:`datetime.datetime`. """ - return pd.Index(self._values.astype(object), dtype=object, name="time") + return CFTimeIndex(self.as_cftime(), name="time") def as_cftime(self) -> list: """ diff --git a/tests/unit/test_time.py b/tests/unit/test_time.py index 1edb7a16..e03d3836 100644 --- a/tests/unit/test_time.py +++ b/tests/unit/test_time.py @@ -5,6 +5,7 @@ from xarray import CFTimeIndex from pandas import DatetimeIndex +import pandas.testing as pdt import cftime input_type = pytest.mark.parametrize( @@ -65,33 +66,33 @@ def test_format_wide_range(input_type): np.testing.assert_array_equal(res, exp) -def test_format_weird(): - inp = np.asarray(["-1000"]) - res = _format_datetime(inp) - exp = np.asarray(["-1000-01-01"]).astype("datetime64[s]") - - np.testing.assert_array_equal(res, exp) - +@pytest.mark.parametrize("use_cftime", [True, None]) +@input_type +def test_to_cftime_index(input_type, use_cftime): + years = [-1000, 1000, 2000, 3000] + inp_dates = convert_input(years, input_type) -def test_to_cftime_index(): - inp = np.asarray(["-1000-01-01", "1000-01-01", "2000-01-01", "2000-01-01"]).astype( - "datetime64[s]" - ) + res = decode_datetimes_to_index(inp_dates, use_cftime=use_cftime) - res = decode_datetimes_to_index( - ["-1000-01-01", "1000-01-01", "2000-01-01", "2000-01-01"] - ) + cftime_dts = [cftime.datetime(y, 1, 1) for y in years] + exp = CFTimeIndex(cftime_dts, name="time") - exp = CFTimeIndex(inp) + assert isinstance(res, CFTimeIndex) + assert all(res.year == years) + pdt.assert_index_equal(res, exp) - assert all(idx.year == [-1000, 1000, 2000, 3000]) +@pytest.mark.parametrize("use_cftime", [False, None]) +@input_type +def test_to_pd_index(input_type, use_cftime): + years = [2000, 2050, 2100] + inp_dates = convert_input(years, input_type) -def test_to_pd_index(): - inp = 
np.asarray(["-1000-01-01", "1000-01-01", "2000-01-01", "2000-01-01"]).astype( - "datetime64[s]" - ) + res = decode_datetimes_to_index(inp_dates, use_cftime=use_cftime) - idx = CFTimeIndex(inp) + exp = DatetimeIndex([str(y) for y in years], name="time") - assert all(idx.year == [-1000, 1000, 2000, 3000]) + # Pandas datetimes are coerced to ns + assert res.values.dtype == "datetime64[ns]" + assert all(res.year == years) + pdt.assert_index_equal(res, exp) From 113fd160c4e9e8c0ac5628ffccc4792faa1de4cb Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Tue, 29 Sep 2020 15:04:58 +1000 Subject: [PATCH 03/11] Add docs --- src/scmdata/time.py | 113 ++++++++++++++++++++++++++++++++++++---- tests/unit/test_time.py | 31 ++++++++++- 2 files changed, 134 insertions(+), 10 deletions(-) diff --git a/src/scmdata/time.py b/src/scmdata/time.py index 4d120c2c..d3b68fba 100644 --- a/src/scmdata/time.py +++ b/src/scmdata/time.py @@ -14,7 +14,19 @@ _TARGET_TYPE = np.int64 _TARGET_DTYPE = "datetime64[s]" -_STANDARD_CALENDARS = {"standard", "gregorian", "proleptic_gregorian"} +STANDARD_CALENDARS = {"standard", "gregorian", "proleptic_gregorian"} +""" +Over the span of a ``datetime64[ns]`` these calendars are all equivalent +""" + +_CFTIME_CALENDARS = { + "standard": cftime.datetime, + "360day": cftime.Datetime360Day, + "gregorian": cftime.DatetimeGregorian, + "proleptic_gregorian": cftime.DatetimeProlepticGregorian, + "noleap": cftime.DatetimeNoLeap, + "julian": cftime.DatetimeJulian, +} class InsufficientDataError(Exception): @@ -27,6 +39,10 @@ class InsufficientDataError(Exception): def _float_year_to_datetime(inp: float) -> np.datetime64: year = int(inp) + + if year < 0: + raise OutOfBoundsDatetime("Cannot connect negative decimal year") + fractional_part = inp - year return np.datetime64( # pylint: disable=too-many-function-args year - 1970, "Y" @@ -39,8 +55,31 @@ def _float_year_to_datetime(inp: float) -> np.datetime64: ) +def _str_to_cftime(inp: str, calendar: str): + cls = 
_CFTIME_CALENDARS[calendar] + + negative_year = False + if inp.startswith("-"): + negative_year = True + inp = inp[1:] + + assert len(inp) == 19 + + y = int(inp[:4]) + if negative_year: + y = -y + mon = int(inp[5:7]) + d = int(inp[8:10]) + h = int(inp[11:13]) + m = int(inp[14:16]) + s = int(inp[17:]) + + return cls(y, mon, d, h, m, s) + + _ufunc_float_year_to_datetime = np.frompyfunc(_float_year_to_datetime, 1, 1) _ufunc_str_to_datetime = np.frompyfunc(np.datetime64, 1, 1) +_ufunc_str_to_cftime = np.frompyfunc(_str_to_cftime, 2, 1) def _parse_datetime(inp: np.ndarray) -> np.ndarray: @@ -92,25 +131,76 @@ def _format_datetime(dts) -> np.ndarray: def _to_cftimes(np_dates, calendar): - return cftime.num2date( - np_dates.astype(int), "seconds since 1970-01-01", calendar=calendar - ) + # This would be faster, but results in calendar issues + # return cftime.num2date( + # np_dates.astype(int), "seconds since 1970-01-01", calendar=calendar + # ) + + if calendar not in _CFTIME_CALENDARS: + raise ValueError("Unknown calendar: {}".format(calendar)) + + return _ufunc_str_to_cftime(np.datetime_as_string(np_dates), calendar) def decode_datetimes_to_index(dates, calendar=None, use_cftime=None): """ Decodes a list of dates to an index - Uses xarray.CFTimeIndex() + Uses a :class:`pandas.DatetimeIndex` where possible. When a non-standard calendar is + used, or for dates before year 1678 or after year 2262, dates are converted + to :mod:`cftime` datetimes and a :class:`xarray.CFTimeIndex` is used. + + A wide range of date formats is supported. The following are all equivalent: + + * str ("2000" or "2000-01-01") + * int (2000) + * decimal years (2000.0) + * python datetimes (``datetime.datetime(2000, 1, 1)``) + * cftime datetimes (``cftime.datetime(2000, 1, 1)``) + * numpy datetimes (``np.datetime64("2000-01-01", "Y")``) + Parameters ---------- dates - calendar - use_cftime + Dates to be converted + + calendar: str + Describes the calendar used in the time calculations. 
Accepts all the values + currently defined in the [CF metadata convention](http://cfconventions.org) + that are implemented in [cftime](https://unidata.github.io/cftime) + + Valid calendars: ``'standard', 'gregorian', 'proleptic_gregorian', 'noleap', '360_day', 'julian'``. + Default is ``'standard'``, which is a mixed Julian/Gregorian calendar. + + If a calendar other than ``'standard', 'gregorian'`` or ``'proleptic_gregorian'`` + is selected, then dates will be converted to ``cftime`` datetimes + + use_cftime: bool + If None (default), then try to determine the appropriate time index to use. + Attempts to use a :class:`pandas.DatetimeIndex`, but falls back to + :class:`xarray.CFTimeIndex` if the conversion fails. + + If True, dates are explicitly converted to ``cftime`` datetimes and a + :class:`xarray.CFTimeIndex` is returned. + + If False, a :class:`pandas.DatetimeIndex` will always be returned (if + possible). In this case a :class:`pandas.errors.OutOfBoundsDatetime` + is raised if a date falls before year 1678 or after year 2262. 
Returns ------- + :class:`pandas.DatetimeIndex` or :class:`xarray.CFTimeIndex` + The return type depends on the value of calendar and the dates provided + + Raises + ------ + :class:`pandas.errors.OutOfBoundsDatetime` + ``use_cftime == False`` and date before year 1678 or after year 2262 is + provided + + ValueError + ``use_cftime == False`` and a non-standard calendar is requested """ dates = np.asarray(dates) dates = _format_datetime(dates) @@ -120,13 +210,18 @@ def decode_datetimes_to_index(dates, calendar=None, use_cftime=None): if use_cftime is None: try: + if calendar not in STANDARD_CALENDARS: + raise ValueError( + "Cannot use pandas indexes with a non-standard calendar" + ) index = pd.DatetimeIndex(dates) - except (KeyError, OutOfBoundsDatetime, OverflowError): + except (OutOfBoundsDatetime, ValueError): index = CFTimeIndex(_to_cftimes(dates, calendar)) elif use_cftime: - # Force coercion to cftimes index = CFTimeIndex(_to_cftimes(dates, calendar)) else: + if calendar not in STANDARD_CALENDARS: + raise ValueError("Cannot use pandas indexes with a non-standard calendar") index = pd.DatetimeIndex(dates) index.name = "time" diff --git a/tests/unit/test_time.py b/tests/unit/test_time.py index e03d3836..b73f9c72 100644 --- a/tests/unit/test_time.py +++ b/tests/unit/test_time.py @@ -1,10 +1,11 @@ import pytest -from scmdata.time import _format_datetime, decode_datetimes_to_index +from scmdata.time import _format_datetime, decode_datetimes_to_index, _CFTIME_CALENDARS import datetime as dt import numpy as np from xarray import CFTimeIndex from pandas import DatetimeIndex +from pandas.errors import OutOfBoundsDatetime import pandas.testing as pdt import cftime @@ -26,6 +27,8 @@ def convert_input(dates_as_int, input_type): if input_type == "int-year": return [int(d) for d in dates_as_int] elif input_type == "decimal-year": + if min(dates_as_int) < 0: + pytest.skip("datetime out of range") return [float(d) for d in dates_as_int] elif input_type == "str-year": return 
[str(d) for d in dates_as_int] @@ -82,6 +85,23 @@ def test_to_cftime_index(input_type, use_cftime): pdt.assert_index_equal(res, exp) +@pytest.mark.parametrize("calendar", _CFTIME_CALENDARS.keys()) +@input_type +def test_to_cftime_index(input_type, calendar): + years = [-1000, 1000, 2000, 3000] + inp_dates = convert_input(years, input_type) + + res = decode_datetimes_to_index(inp_dates, calendar=calendar, use_cftime=True) + + cls = _CFTIME_CALENDARS[calendar] + cftime_dts = [cls(y, 1, 1) for y in years] + exp = CFTimeIndex(cftime_dts, name="time") + + assert isinstance(res, CFTimeIndex) + assert all(res.year == years) + pdt.assert_index_equal(res, exp) + + @pytest.mark.parametrize("use_cftime", [False, None]) @input_type def test_to_pd_index(input_type, use_cftime): @@ -96,3 +116,12 @@ def test_to_pd_index(input_type, use_cftime): assert res.values.dtype == "datetime64[ns]" assert all(res.year == years) pdt.assert_index_equal(res, exp) + + +@input_type +def test_to_pd_index_with_overflow(input_type): + years = [1500, 2050, 2100] + inp_dates = convert_input(years, input_type) + + with pytest.raises(OutOfBoundsDatetime): + decode_datetimes_to_index(inp_dates, use_cftime=False) From c8ebbe0bb8b8567b92dd5b92446d37469be4a032 Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Tue, 29 Sep 2020 15:19:45 +1000 Subject: [PATCH 04/11] More tests --- src/scmdata/time.py | 2 +- tests/unit/test_time.py | 51 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 47 insertions(+), 6 deletions(-) diff --git a/src/scmdata/time.py b/src/scmdata/time.py index d3b68fba..928a6f5d 100644 --- a/src/scmdata/time.py +++ b/src/scmdata/time.py @@ -21,7 +21,7 @@ _CFTIME_CALENDARS = { "standard": cftime.datetime, - "360day": cftime.Datetime360Day, + "360_day": cftime.Datetime360Day, "gregorian": cftime.DatetimeGregorian, "proleptic_gregorian": cftime.DatetimeProlepticGregorian, "noleap": cftime.DatetimeNoLeap, diff --git a/tests/unit/test_time.py b/tests/unit/test_time.py index 
b73f9c72..83730a3e 100644 --- a/tests/unit/test_time.py +++ b/tests/unit/test_time.py @@ -7,6 +7,7 @@ from pandas import DatetimeIndex from pandas.errors import OutOfBoundsDatetime import pandas.testing as pdt +import pandas as pd import cftime input_type = pytest.mark.parametrize( @@ -16,7 +17,9 @@ "decimal-year", "str-year", "str-year-month-day", - "numpy", + "numpy-ns", + "numpy-s", + "numpy-y", "datetime", "cftime", ], @@ -34,8 +37,26 @@ def convert_input(dates_as_int, input_type): return [str(d) for d in dates_as_int] elif input_type == "str-year-month-day": return [str(d) + "-01-01" for d in dates_as_int] - elif input_type == "numpy": - return [str(d) + "-01-01" for d in dates_as_int] + elif input_type == "numpy-ns": + if min(dates_as_int) <= 1678 or max(dates_as_int) >= 2262: + pytest.skip("datetime out of range") + return np.asarray([str(d) + "-01-01" for d in dates_as_int]).astype( + "datetime64[ns]" + ) + elif input_type == "numpy-s": + return np.asarray([str(d) + "-01-01" for d in dates_as_int]).astype( + "datetime64[s]" + ) + elif input_type == "numpy-y": + return np.asarray([str(d) + "-01-01" for d in dates_as_int]).astype( + "datetime64[Y]" + ) + elif input_type == "pandas": + try: + return [pd.Timestamp(dt.datetime(d, 1, 1)) for d in dates_as_int] + except OutOfBoundsDatetime: + pytest.skip("datetime out of range") + return [np.datestr(d) + "-01-01" for d in dates_as_int] elif input_type == "datetime": try: return [dt.datetime(d, 1, 1) for d in dates_as_int] @@ -86,12 +107,13 @@ def test_to_cftime_index(input_type, use_cftime): @pytest.mark.parametrize("calendar", _CFTIME_CALENDARS.keys()) +@pytest.mark.parametrize("use_cftime", [True, None]) @input_type -def test_to_cftime_index(input_type, calendar): +def test_decode_index_with_calendar(input_type, calendar, use_cftime): years = [-1000, 1000, 2000, 3000] inp_dates = convert_input(years, input_type) - res = decode_datetimes_to_index(inp_dates, calendar=calendar, use_cftime=True) + res = 
decode_datetimes_to_index(inp_dates, calendar=calendar, use_cftime=use_cftime) cls = _CFTIME_CALENDARS[calendar] cftime_dts = [cls(y, 1, 1) for y in years] @@ -102,6 +124,25 @@ def test_to_cftime_index(input_type, calendar): pdt.assert_index_equal(res, exp) +def test_decode_index_with_invalid_calendar(): + years = [-1000, 1000, 2000, 3000] + + with pytest.raises(ValueError, match="Unknown calendar: not-a-cal"): + decode_datetimes_to_index(years, calendar="not-a-cal") + + +def test_decode_index_with_nonstandard_calendar(): + years = [-1000, 1000, 2000, 3000] + + res = decode_datetimes_to_index(years, calendar="360_day") + assert isinstance(res, CFTimeIndex) + + with pytest.raises( + ValueError, match="Cannot use pandas indexes with a non-standard calendar" + ): + decode_datetimes_to_index(years, calendar="360_day", use_cftime=False) + + @pytest.mark.parametrize("use_cftime", [False, None]) @input_type def test_to_pd_index(input_type, use_cftime): From ec1c657b240a92cc18c2e0e13a253f52a1f1c160 Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Tue, 29 Sep 2020 18:03:26 +1000 Subject: [PATCH 05/11] Fix a bunch of tests --- src/scmdata/run.py | 101 ++++++++++++++++------------------------- tests/conftest.py | 10 ++++ tests/unit/test_run.py | 4 +- 3 files changed, 49 insertions(+), 66 deletions(-) diff --git a/src/scmdata/run.py b/src/scmdata/run.py index 3e085d91..856fe00e 100644 --- a/src/scmdata/run.py +++ b/src/scmdata/run.py @@ -20,6 +20,8 @@ import pint from dateutil import parser from xarray.core.ops import inject_binary_ops +from xarray import CFTimeIndex +import cftime from .errors import MissingRequiredColumnError, NonUniqueMetadataError from .filters import ( @@ -39,7 +41,6 @@ from .pyam_compat import IamDataFrame, LongDatetimeIamDataFrame from .time import ( _TARGET_DTYPE, - TimePoints, TimeseriesConverter, decode_datetimes_to_index, ) @@ -207,7 +208,7 @@ def _format_wide_data(df, required_cols): time_cols, extra_cols = False, [] for i in cols: # if in wide 
format, check if columns are years (int) or datetime - if isinstance(i, dt.datetime): + if isinstance(i, (dt.datetime, cftime.datetime)): time_cols = True else: try: @@ -240,10 +241,7 @@ def _format_wide_data(df, required_cols): def _from_ts( - df: Any, - required_cols: Tuple[str], - index: Any = None, - **columns: Union[str, bool, float, int, List], + df: Any, required_cols: Tuple[str], **columns: Union[str, bool, float, int, List], ) -> Tuple[pd.DataFrame, pd.DataFrame]: """ Prepare data to initialize :class:`ScmRun` from wide timeseries. @@ -262,21 +260,12 @@ def _from_ts( """ if not isinstance(df, pd.DataFrame): df = pd.DataFrame(df) - if index is not None: - if isinstance(index, (np.ndarray, list)): - df.index = decode_datetimes_to_index(index) - elif isinstance(index, pd.Index): - df.index = index - else: - raise ValueError("Could not determine type of index") # format columns to lower-case and check that all required columns exist if not set(required_cols).issubset(columns.keys()): missing = list(set(required_cols) - set(columns.keys())) raise MissingRequiredColumnError(missing) - df.index.name = "time" - num_ts = len(df.columns) for c_name, col in columns.items(): col_list = ( @@ -428,7 +417,6 @@ def __init__( if isinstance(data, ScmRun): self._df = data._df.copy() if copy_data else data._df self._meta = data._meta - self._time_points = TimePoints(data.time_points.values) if metadata is None: metadata = data.metadata.copy() else: @@ -456,9 +444,7 @@ def _init_timeseries( raise ValueError("`index` argument is required") if columns is not None: - (_df, _meta) = _from_ts( - data, index=index, required_cols=self.required_cols, **columns - ) + (_df, _meta) = _from_ts(data, required_cols=self.required_cols, **columns) elif isinstance(data, (pd.DataFrame, pd.Series)): (_df, _meta) = _format_data(data, self.required_cols) elif (IamDataFrame is not None) and isinstance(data, IamDataFrame): @@ -477,11 +463,11 @@ def _init_timeseries( (_df, _meta) = _read_file(data, 
required_cols=self.required_cols, **kwargs) - self._time_points = TimePoints(_df.index.values) - _df = _df.astype(float) - self._df = _df - self._df.index = self._time_points.to_index() + if index is not None: + self._df.index = decode_datetimes_to_index(index) + else: + self._df.index = decode_datetimes_to_index(self._df.index.values) self._meta = pd.MultiIndex.from_frame(_meta.astype("category")) def copy(self): @@ -519,9 +505,9 @@ def __getitem__(self, key: Any) -> Any: [key] if isinstance(key, str) or not isinstance(key, Iterable) else key ) if key == "time": - return pd.Series(self._time_points.to_index(), dtype="object") + return pd.Series(self.times, dtype="object") # converted to datetimes if key == "year": - return pd.Series(self._time_points.years()) + return pd.Series(self.times.year) if set(_key_check).issubset(self.meta_attributes): try: return self._meta_column(key).astype( @@ -605,8 +591,7 @@ def __setitem__( """ meta = np.atleast_1d(value) if key == "time": - self._time_points = TimePoints(meta) - self._df.index = self._time_points.to_index() + self._df.index = decode_datetimes_to_index(meta) else: if len(meta) == 1: new_meta = self._meta.to_frame() @@ -633,12 +618,12 @@ def _indent(s): meta_str = _indent(self.meta.__repr__()) time_str = [ - "Start: {}".format(self.time_points.values[0]), - "End: {}".format(self.time_points.values[-1]), + "Start: {}".format(self.times.values[0]), + "End: {}".format(self.times.values[-1]), ] time_str = _indent("\n".join(time_str)) return "\nTime:\n{}\nMeta:\n{}".format( - len(self), len(self.time_points), time_str, meta_str + len(self), len(self.times), time_str, meta_str ) @staticmethod @@ -749,15 +734,8 @@ def meta_attributes(self): return sorted(list(self._meta.names)) @property - def time_points(self): - """ - Time points of the data - - Returns - ------- - :obj:`scmdata.time.TimePoints` - """ - return self._time_points + def times(self): + return self._df.index def timeseries( self, meta=None, 
check_duplicated=True, time_axis=None, drop_all_nan_times=False @@ -814,13 +792,11 @@ def timeseries( raise NonUniqueMetadataError(_meta) if time_axis is None: - columns = self._time_points.to_index() + columns = self.times elif time_axis == "year": - columns = self._time_points.years() + columns = self.times.year elif time_axis == "year-month": - columns = ( - self._time_points.years() + (self._time_points.months() - 0.5) / 12 - ) + columns = self.times.year + (self.times.month - 0.5) / 12 elif time_axis == "days since 1970-01-01": def calc_days(x): @@ -828,7 +804,7 @@ def calc_days(x): return (x - ref).astype("timedelta64[D]") - columns = calc_days(self._time_points.values).astype(int) + columns = calc_days(self.times.values).astype(int) elif time_axis == "seconds since 1970-01-01": @@ -837,7 +813,7 @@ def calc_seconds(x): return x - ref - columns = calc_seconds(self._time_points.values).astype(int) + columns = calc_seconds(self.times.values).astype(int) else: raise NotImplementedError("time_axis = '{}'".format(time_axis)) @@ -1072,8 +1048,8 @@ def filter( _keep_rows = _keep_rows * False ret._df = ret._df.loc[_keep_times, _keep_rows] + ret._df.index = self.times[_keep_times] ret._meta = ret._meta[_keep_rows] - ret["time"] = self.time_points.values[_keep_times] if log_if_empty and ret.empty: _logger.warning("Filtered ScmRun is empty!") @@ -1109,7 +1085,7 @@ def _apply_filters( # pylint: disable=missing-return-doc Filtering cannot be performed on requested column """ regexp = filters.pop("regexp", False) - keep_ts = np.array([True] * len(self.time_points)) + keep_ts = np.array([True] * len(self.times)) keep_meta = np.array([True] * len(self)) # filter by columns and list of values @@ -1140,19 +1116,19 @@ def _apply_filters( # pylint: disable=missing-return-doc # else do nothing as level handled in variable filtering elif col == "year": - keep_ts &= years_match(self._time_points.years(), values) + keep_ts &= years_match(self.times.year, values) elif col == "month": 
- keep_ts &= month_match(self._time_points.months(), values) + keep_ts &= month_match(self.times.month, values) elif col == "day": keep_ts &= self._day_match(values) elif col == "hour": - keep_ts &= hour_match(self._time_points.hours(), values) + keep_ts &= hour_match(self.times.hour, values) elif col == "time": - keep_ts &= datetime_match(self._time_points.values, values) + keep_ts &= datetime_match(self.times.values, values) else: raise ValueError("filter by `{}` not supported".format(col)) @@ -1168,9 +1144,9 @@ def _day_match(self, values): wday = False if wday: - days = self._time_points.weekdays() + days = self.times.weekdays() else: # ints or list of ints - days = self._time_points.days() + days = self.times.days() return day_match(days, values) @@ -1280,11 +1256,11 @@ def interpolate( res = self.copy() - target_times = TimePoints(target_times) + target_index = decode_datetimes_to_index(target_times) timeseries_converter = TimeseriesConverter( - self.time_points.values, - target_times.values, + self.times.values, + target_times, interpolation_type=interpolation_type, extrapolation_type=extrapolation_type, ) @@ -1295,10 +1271,7 @@ def interpolate( target_data[:, i] = timeseries_converter.convert_from( res._df.iloc[:, i].values ) - res._df = pd.DataFrame( - target_data, columns=res._df.columns, index=target_times.to_index() - ) - res._time_points = target_times + res._df = pd.DataFrame(target_data, columns=res._df.columns, index=target_index) return res @@ -2035,6 +2008,8 @@ def run_append( to_join_metas = [] overlapping_times = False + any_cftimes = isinstance(ret.times, CFTimeIndex) + ind = range(ret._df.shape[1]) ret._df.columns = ind ret._meta.index = ind @@ -2057,7 +2032,7 @@ def run_append( # check for overlap idx_to_check = run_to_join_df.index if not overlapping_times and ( - idx_to_check.isin(ret._df.index).any() + idx_to_check.isin(ret.times.values.astype("datetime64[s]")).any() or any([idx_to_check.isin(df.index).any() for df in to_join_dfs]) ): 
overlapping_times = True @@ -2066,7 +2041,7 @@ def run_append( to_join_metas.append(run_to_join_meta) ret._df = pd.concat([ret._df] + to_join_dfs, axis="columns").sort_index() - ret._df.index = decode_datetimes_to_index(ret._df.index.values) + ret._df.index = decode_datetimes_to_index(ret.times.values) ret._meta = pd.MultiIndex.from_frame( pd.concat([ret._meta.to_frame()] + to_join_metas).astype("category") ) diff --git a/tests/conftest.py b/tests/conftest.py index 89ff6632..f3ca4fbd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -543,6 +543,16 @@ def rcp26(): interpolation_type="linear", extrapolation_type=None, ), + dict( + source_start_time=np.datetime64("1000-01-06"), + source_period_length=np.timedelta64(3, "D"), + target_start_time=np.datetime64("1000-01-07"), + target_period_length=np.timedelta64(4, "D"), + source_values=possible_source_values[0], + target_values=[2.33333333, 3.6666667], + interpolation_type="linear", + extrapolation_type=None, + ), ] test_combinations = [] diff --git a/tests/unit/test_run.py b/tests/unit/test_run.py index 30f4d46a..cb57ad3e 100644 --- a/tests/unit/test_run.py +++ b/tests/unit/test_run.py @@ -1446,9 +1446,7 @@ def test_append_duplicates_order_doesnt_matter(scm_run): obs = res.filter(scenario="a_scenario2").timeseries().squeeze() exp = [2.0, 7.0, 7.0, 2.0, 7.0, 5.0] - npt.assert_array_equal( - res._time_points.years(), [2005, 2010, 2015, 2020, 2030, 2040] - ) + npt.assert_array_equal(res["year"], [2005, 2010, 2015, 2020, 2030, 2040]) npt.assert_almost_equal(obs, exp) From f4ac4f9f553909c57fd0f7bcaeedb84d27465c71 Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Tue, 29 Sep 2020 18:25:10 +1000 Subject: [PATCH 06/11] typo --- src/scmdata/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scmdata/run.py b/src/scmdata/run.py index 856fe00e..93d8298d 100644 --- a/src/scmdata/run.py +++ b/src/scmdata/run.py @@ -463,7 +463,7 @@ def _init_timeseries( (_df, _meta) = _read_file(data, 
required_cols=self.required_cols, **kwargs) - _df = _df.astype(float) + self._df = _df.astype(float) if index is not None: self._df.index = decode_datetimes_to_index(index) else: From b9fdbe1e2e1d84deeafb7780cfded3b056a160ec Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Wed, 30 Sep 2020 11:47:18 +1000 Subject: [PATCH 07/11] add example magicc append --- tests/integration/test_scmrun_speed.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_scmrun_speed.py b/tests/integration/test_scmrun_speed.py index 70c377ea..b1cfe513 100644 --- a/tests/integration/test_scmrun_speed.py +++ b/tests/integration/test_scmrun_speed.py @@ -6,9 +6,7 @@ import scmdata -@pytest.fixture(params=[10, 10 ** 2, 10 ** 3, 10 ** 3.5, 10 ** 4, 10 ** 4.5]) -def big_scmrun(request): - length = int(request.param) +def make_example_run(length): t_steps = 750 variables = [ "Surface Air Temperature Change", @@ -45,6 +43,12 @@ def big_scmrun(request): ) +@pytest.fixture(params=[10, 10 ** 2, 10 ** 3, 10 ** 3.5, 10 ** 4, 10 ** 4.5]) +def big_scmrun(request): + length = int(request.param) + return make_example_run(length) + + def test_recreate_from_timeseries(benchmark, big_scmrun): def recreate(): return scmdata.ScmRun(big_scmrun.timeseries()) @@ -169,3 +173,18 @@ def append(): assert res.shape[0] == big_scmrun.shape[0] * n_to_append assert res.shape[1] == big_scmrun.shape[1] + + +def test_append_example_magicc_ensemble(benchmark): + n_to_append = 600 + + to_append = [] + for i in range(n_to_append): + tmp = make_example_run(20 * 5) # 20 variable 5 scenarios + tmp["run_number"] = i + to_append.append(tmp) + + def append(): + return scmdata.run_append(to_append) + + benchmark.pedantic(append, iterations=1, rounds=5) From 54d4a9c4702c32387ce4d024a42e997cc36f457f Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Wed, 7 Oct 2020 15:26:09 +1100 Subject: [PATCH 08/11] Use cf compliant times --- src/scmdata/netcdf.py | 49 
++++++++++++++++++++++++++++++++----------- src/scmdata/run.py | 6 ++---- 2 files changed, 39 insertions(+), 16 deletions(-) diff --git a/src/scmdata/netcdf.py b/src/scmdata/netcdf.py index 12edd964..30766d65 100644 --- a/src/scmdata/netcdf.py +++ b/src/scmdata/netcdf.py @@ -14,6 +14,7 @@ from collections import defaultdict from datetime import datetime from logging import getLogger +from xarray.coding.times import encode_cf_datetime, decode_cf_datetime import numpy as np @@ -78,22 +79,46 @@ def _get_nc_type(np_type): return {"datatype": str, "fill_value": None} -def _write_nc(ds, df, dimensions, extras): +def _create_time_variable(ds, run): + """ + Create a CF-compliant time variable + + Note that the CF dictates the use of units, rather than unit which we use else where + """ + ds.createDimension("time", run.shape[1]) + ds.createVariable( + "time", "i8", "time", + ) + + num, units, calendar = encode_cf_datetime(run.times) + ds.variables["time"][:] = num + ds.variables["time"].setncatts({"calendar": calendar, "units": units}) + + +def _read_time_variable(time_var): + # If times use the f8 datatype, convert to datetime64[s] + if time_var.dtype == np.dtype("f8"): + return time_var[:].astype("datetime64[s]") + else: + # Use CF-compliant time handling + attrs = time_var.ncattrs() + units = time_var.units if "units" in attrs else None + calendar = time_var.calendar if "calendar" in attrs else None + + return decode_cf_datetime(time_var[:], units, calendar) + + +def _write_nc(ds, run, dimensions, extras): """ Low level function to write the dimensions, variables and metadata to disk """ all_dims = list(dimensions) + ["time"] - # Create the dimensions - ds.createDimension("time", len(df.time_points)) - ds.createVariable( - "time", "f8", "time", - ) - ds.variables["time"][:] = df.time_points.values + _create_time_variable(ds, run) dims = {} for d in dimensions: - vals = sorted(df.meta[d].unique()) + vals = sorted(run.meta[d].unique()) if not all([isinstance(v, str) for v 
in vals]) and np.isnan(vals).any(): raise AssertionError("nan in dimension: `{}`".format(d)) @@ -104,11 +129,11 @@ def _write_nc(ds, df, dimensions, extras): ds.variables[d][i] = v dims[d] = np.asarray(vals) - var_shape = [len(dims[d]) for d in dimensions] + [len(df.time_points)] + var_shape = [len(dims[d]) for d in dimensions] + [run.shape[1]] # Write any extra variables for e in extras: - metadata = df.meta[[e, *dimensions]].drop_duplicates() + metadata = run.meta[[e, *dimensions]].drop_duplicates() if metadata[dimensions].duplicated().any(): raise ValueError( @@ -132,7 +157,7 @@ def _write_nc(ds, df, dimensions, extras): ds.variables[e][:] = data_to_write - for var_df in df.groupby("variable"): + for var_df in run.groupby("variable"): v = var_df.get_unique_meta("variable", True) meta = var_df.meta.copy().drop("variable", axis=1) @@ -173,7 +198,7 @@ def _write_nc(ds, df, dimensions, extras): def _read_nc(cls, ds): dims = {d: ds.variables[d][:] for d in ds.dimensions} - dims["time"] = dims["time"].astype("datetime64[s]") + dims["time"] = _read_time_variable(ds.variables["time"]) data = [] columns = defaultdict(list) diff --git a/src/scmdata/run.py b/src/scmdata/run.py index 93d8298d..7d289fdb 100644 --- a/src/scmdata/run.py +++ b/src/scmdata/run.py @@ -1880,12 +1880,10 @@ def reduce(self, func, dim=None, axis=None, **kwargs): data = func(input_data, **kwargs) if getattr(data, "shape", ()) == self.shape: - return type(self)( - data, index=self.time_points, columns=self.meta.to_dict("list") - ) + return type(self)(data, index=self.times, columns=self.meta.to_dict("list")) else: removed_axes = range(2) if axis is None else np.atleast_1d(axis) % 2 - index = self.time_points + index = self.times meta = self.meta.to_dict("list") if 0 in removed_axes and len(meta): # Reduced the timeseries From 7342070f20e47bd0f7d9105f4579c6be33ca8d0a Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Wed, 7 Oct 2020 16:53:39 +1100 Subject: [PATCH 09/11] More checks --- src/scmdata/ops.py 
| 4 +--- src/scmdata/run.py | 39 +++++++++++++++++++++++-------- tests/conftest.py | 5 ++++ tests/unit/test_netcdf.py | 8 ++++++- tests/unit/test_run.py | 48 +++++++++++++++++++++++++++------------ 5 files changed, 76 insertions(+), 28 deletions(-) diff --git a/src/scmdata/ops.py b/src/scmdata/ops.py index cd89765e..27a9064e 100644 --- a/src/scmdata/ops.py +++ b/src/scmdata/ops.py @@ -825,9 +825,7 @@ def linear_regression_scmrun(self): def _calculate_linear_regression(in_scmrun): time_unit = "s" - times_numpy = in_scmrun.time_points.values.astype( - "datetime64[{}]".format(time_unit) - ) + times_numpy = in_scmrun.times.to_numpy().astype("datetime64[{}]".format(time_unit)) times_in_s = times_numpy.astype("int") ts = in_scmrun.timeseries() diff --git a/src/scmdata/run.py b/src/scmdata/run.py index 7d289fdb..3fa0bfd8 100644 --- a/src/scmdata/run.py +++ b/src/scmdata/run.py @@ -43,6 +43,7 @@ _TARGET_DTYPE, TimeseriesConverter, decode_datetimes_to_index, + TimePoints, ) from .units import UnitConverter @@ -464,12 +465,16 @@ def _init_timeseries( (_df, _meta) = _read_file(data, required_cols=self.required_cols, **kwargs) self._df = _df.astype(float) - if index is not None: - self._df.index = decode_datetimes_to_index(index) - else: - self._df.index = decode_datetimes_to_index(self._df.index.values) + + self._set_time_index(index if index is not None else self._df.index) self._meta = pd.MultiIndex.from_frame(_meta.astype("category")) + def _set_time_index(self, index): + if not isinstance(index, (pd.DatetimeIndex, CFTimeIndex)): + index = decode_datetimes_to_index(index) + self._df.index = index + self.time_points = TimePoints(self.times.to_numpy().astype("datetime64[s]")) + def copy(self): """ Return a :func:`copy.deepcopy` of self. 
@@ -591,7 +596,7 @@ def __setitem__( """ meta = np.atleast_1d(value) if key == "time": - self._df.index = decode_datetimes_to_index(meta) + self._set_time_index(meta) else: if len(meta) == 1: new_meta = self._meta.to_frame() @@ -804,16 +809,20 @@ def calc_days(x): return (x - ref).astype("timedelta64[D]") - columns = calc_days(self.times.values).astype(int) + columns = calc_days(self.times.to_numpy().astype("datetime64[s]")).astype( + int + ) elif time_axis == "seconds since 1970-01-01": def calc_seconds(x): ref = np.array(["1970-01-01"], dtype=_TARGET_DTYPE)[0] - return x - ref + return (x - ref).astype("timedelta64[s]") - columns = calc_seconds(self.times.values).astype(int) + columns = calc_seconds( + self.times.to_numpy().astype("datetime64[s]") + ).astype(int) else: raise NotImplementedError("time_axis = '{}'".format(time_axis)) @@ -1048,7 +1057,7 @@ def filter( _keep_rows = _keep_rows * False ret._df = ret._df.loc[_keep_times, _keep_rows] - ret._df.index = self.times[_keep_times] + ret._set_time_index(self.times[_keep_times]) ret._meta = ret._meta[_keep_rows] if log_if_empty and ret.empty: @@ -2016,6 +2025,8 @@ def run_append( for run in runs[1:]: run_to_join_df = run._df + any_cftimes = any_cftimes or isinstance(run.times, CFTimeIndex) + max_idx = min_idx + run_to_join_df.shape[1] ind = range(min_idx, max_idx) min_idx = max_idx @@ -2038,8 +2049,16 @@ def run_append( to_join_dfs.append(run_to_join_df) to_join_metas.append(run_to_join_meta) + if any_cftimes: + # If any cftimes are present cast everything to cftime + ret._df.index = decode_datetimes_to_index(ret._df.index, use_cftime=True) + for df in to_join_dfs: + df.index = decode_datetimes_to_index(df.index, use_cftime=True) + ret._df = pd.concat([ret._df] + to_join_dfs, axis="columns").sort_index() - ret._df.index = decode_datetimes_to_index(ret.times.values) + ret._set_time_index( + decode_datetimes_to_index(ret.times.values, use_cftime=any_cftimes) + ) ret._meta = pd.MultiIndex.from_frame( 
pd.concat([ret._meta.to_frame()] + to_join_metas).astype("category") ) diff --git a/tests/conftest.py b/tests/conftest.py index f3ca4fbd..fb28f204 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -256,6 +256,11 @@ def scm_run(): yield ScmRun(TEST_DF.copy()) +@pytest.fixture(scope="function") +def long_scm_run(): + yield ScmRun(TEST_DF_LONG_TIMES.copy()) + + @pytest.fixture(scope="function") def base_scm_run(): yield BaseScmRun( diff --git a/tests/unit/test_netcdf.py b/tests/unit/test_netcdf.py index 9df5e58a..eebe01c3 100644 --- a/tests/unit/test_netcdf.py +++ b/tests/unit/test_netcdf.py @@ -24,7 +24,7 @@ def test_run_to_nc(scm_run): ds = nc.Dataset(out_fname) - assert ds.dimensions["time"].size == len(scm_run.time_points) + assert ds.dimensions["time"].size == scm_run.shape[1] assert ds.dimensions["scenario"].size == 2 assert ds.variables["scenario"][0] == "a_scenario" @@ -403,3 +403,9 @@ def test_error_run_to_nc_required_cols_in_extras_duplicated(): msg = "metadata for model is not unique for requested dimensions" with pytest.raises(ValueError, match=msg): start.to_nc(out_fname, extras=("model",)) + + +def test_read_legacy_datetimes_nc(scm_run, test_data_path): + old_datetimes_run = ScmRun.from_nc(join(test_data_path, "legacy_datetimes.nc")) + + assert_scmdf_almost_equal(old_datetimes_run, scm_run, check_ts_names=False) diff --git a/tests/unit/test_run.py b/tests/unit/test_run.py index cb57ad3e..5ffdb4e8 100644 --- a/tests/unit/test_run.py +++ b/tests/unit/test_run.py @@ -17,6 +17,7 @@ from scmdata.errors import MissingRequiredColumnError, NonUniqueMetadataError from scmdata.run import BaseScmRun, ScmRun, run_append from scmdata.testing import _check_pandas_less_110, assert_scmdf_almost_equal +from scmdata.time import decode_datetimes_to_index def test_init_df_year_converted_to_datetime(test_pd_df): @@ -986,12 +987,6 @@ def test_append_long_times( "unit": "unit_1", }, ) - if try_start_1_from_df_with_datetime_index: - scmrun_1_ts = scmrun_1.timeseries() - 
try: - scmrun_1_ts.columns = pd.DatetimeIndex(scmrun_1_ts.columns.values) - except pd.errors.OutOfBoundsDatetime: - pytest.skip("pandas datetime error") scmrun_2 = ScmRun( data=np.arange(len(time_2)), @@ -1004,18 +999,29 @@ def test_append_long_times( "unit": "unit_2", }, ) - if try_start_2_from_df_with_datetime_index: - scmrun_2_ts = scmrun_2.timeseries() - try: - scmrun_2_ts.columns = pd.DatetimeIndex(scmrun_2_ts.columns.values) - scmrun_2 = ScmRun(scmrun_2_ts) - except pd.errors.OutOfBoundsDatetime: - pytest.skip("pandas datetime error") + def _ensure_dateindex_type(run, ensure_datetime_index): + index = run.times + + if ensure_datetime_index and not isinstance(index, pd.DatetimeIndex): + try: + scmrun_1._df.index = index.to_datetimeindex() + except ValueError: + pytest.skip("pandas datetime error") + elif not ensure_datetime_index and isinstance(index, pd.DatetimeIndex): + run._df.index = decode_datetimes_to_index(index, use_cftime=True) + + _ensure_dateindex_type(scmrun_1, try_start_1_from_df_with_datetime_index) + _ensure_dateindex_type(scmrun_2, try_start_2_from_df_with_datetime_index) res = scmrun_1.append(scmrun_2) - assert not isinstance(res._df.index, pd.DatetimeIndex) + if isinstance(scmrun_1.times, pd.DatetimeIndex) and isinstance( + scmrun_2.times, pd.DatetimeIndex + ): + assert isinstance(res.times, pd.DatetimeIndex) + else: + assert not isinstance(res.times, pd.DatetimeIndex) exp_years = set(time_1).union(set(time_2)) assert set(res["year"]) == exp_years @@ -2583,6 +2589,13 @@ def test_timeseries_time_axis(scm_run, time_axis, mod_func): assert (res.columns == (scm_run["time"].apply(mod_func))).all() +@time_axis_checks +def test_long_data_time_axis_long_run(long_scm_run, time_axis, mod_func): + res = long_scm_run.timeseries(time_axis=time_axis) + + assert (res.columns == (long_scm_run["time"].apply(mod_func))).all() + + @time_axis_checks def test_long_data_time_axis(scm_run, time_axis, mod_func): res = scm_run.long_data(time_axis=time_axis) @@ 
-2590,6 +2603,13 @@ def test_long_data_time_axis(scm_run, time_axis, mod_func): assert (res["time"] == (scm_run.long_data()["time"].apply(mod_func))).all() +@time_axis_checks +def test_long_data_time_axis_long_run(long_scm_run, time_axis, mod_func): + res = long_scm_run.long_data(time_axis=time_axis) + + assert (res["time"] == (long_scm_run.long_data()["time"].apply(mod_func))).all() + + @time_axis_checks @patch("scmdata.plotting.sns.lineplot") @patch.object(ScmRun, "long_data") From 4a4fea8833541b8f45f28c2e0c89b6798e87aa8b Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Wed, 7 Oct 2020 17:33:23 +1100 Subject: [PATCH 10/11] Fallback to parser --- src/scmdata/run.py | 15 +++++++-------- src/scmdata/time.py | 7 ++++++- tests/test_data/legacy_datetimes.nc | Bin 0 -> 15336 bytes 3 files changed, 13 insertions(+), 9 deletions(-) create mode 100644 tests/test_data/legacy_datetimes.nc diff --git a/src/scmdata/run.py b/src/scmdata/run.py index 3fa0bfd8..03321c5f 100644 --- a/src/scmdata/run.py +++ b/src/scmdata/run.py @@ -1057,7 +1057,7 @@ def filter( _keep_rows = _keep_rows * False ret._df = ret._df.loc[_keep_times, _keep_rows] - ret._set_time_index(self.times[_keep_times]) + ret._set_time_index(ret.times) ret._meta = ret._meta[_keep_rows] if log_if_empty and ret.empty: @@ -1123,21 +1123,20 @@ def _apply_filters( # pylint: disable=missing-return-doc separator=self.data_hierarchy_separator, ) # else do nothing as level handled in variable filtering - elif col == "year": - keep_ts &= years_match(self.times.year, values) + keep_ts &= years_match(self.time_points.years(), values) elif col == "month": - keep_ts &= month_match(self.times.month, values) + keep_ts &= month_match(self.time_points.months(), values) elif col == "day": keep_ts &= self._day_match(values) elif col == "hour": - keep_ts &= hour_match(self.times.hour, values) + keep_ts &= hour_match(self.time_points.hours(), values) elif col == "time": - keep_ts &= datetime_match(self.times.values, values) + keep_ts &= 
datetime_match(self.time_points.values, values) else: raise ValueError("filter by `{}` not supported".format(col)) @@ -1153,9 +1152,9 @@ def _day_match(self, values): wday = False if wday: - days = self.times.weekdays() + days = self.time_points.weekdays() else: # ints or list of ints - days = self.times.days() + days = self.time_points.days() return day_match(days, values) diff --git a/src/scmdata/time.py b/src/scmdata/time.py index 928a6f5d..a7403888 100644 --- a/src/scmdata/time.py +++ b/src/scmdata/time.py @@ -11,6 +11,7 @@ import pandas as pd from pandas.errors import OutOfBoundsDatetime from xarray import CFTimeIndex +from dateutil import parser _TARGET_TYPE = np.int64 _TARGET_DTYPE = "datetime64[s]" @@ -79,6 +80,7 @@ def _str_to_cftime(inp: str, calendar: str): _ufunc_float_year_to_datetime = np.frompyfunc(_float_year_to_datetime, 1, 1) _ufunc_str_to_datetime = np.frompyfunc(np.datetime64, 1, 1) +_ufunc_str_to_datetime_parser = np.frompyfunc(parser.parse, 1, 1) _ufunc_str_to_cftime = np.frompyfunc(_str_to_cftime, 2, 1) @@ -86,7 +88,10 @@ def _parse_datetime(inp: np.ndarray) -> np.ndarray: try: return _ufunc_float_year_to_datetime(inp.astype(float)) except (TypeError, ValueError): - return _ufunc_str_to_datetime(inp) + try: + return _ufunc_str_to_datetime(inp) + except ValueError: + return _ufunc_str_to_datetime_parser(inp) def _format_datetime(dts) -> np.ndarray: diff --git a/tests/test_data/legacy_datetimes.nc b/tests/test_data/legacy_datetimes.nc new file mode 100644 index 0000000000000000000000000000000000000000..9658108c058632fab3f2f7eb3da102ac41c06a5d GIT binary patch literal 15336 zcmeHN-*3}Z6hGg#>k9kAh7KIR<`@zrKrG{g1)dE+`p6_@ z9MVX5d$@CRNBD>v?RX*MbgWn8@=9Bn2g%i4D1dertntN~(o zahUf!qxXDBjRFL0l4n`!n&}HYMAUB`(<;?+*t2{MzC9eMgKAWTw<9#uLKq3N_+ZcI zDDu^N>mAsiNR18*rP8DEp1}mz8yKNFH7nJ#KS5e(Rvmr>ky1AAQa?bv^W2@Gc>g}J z$6wPP-tljG$J0DrRG!^k-$iOj$PY1dWWXAyLtSki^%}S@-(s*Vf)|<^|Gbqu7 zc=1tdQ}SXYF^AfBG>cw<)6Gf%l1yZAd#|V`q|&MQuveUPy;9blLP6SlJAOp1J}kSU 
zTMEjKU9s>J;m$2bGZS6AI>H^1aMz;N-S5!~uTFo}gXB``%dd%^T)*jZPhWg!P-&tA zhE23GCmR9a-zb2Chtw@NUbe^*^cJW!FsT@isvh~#$Qe3C=3}d8g8Snp{lW{~rwv+E zLtg9>dC?>GNWbXOYM`l6o(wez7z7Lg1_6VBLBJqj5HJWB1PlTO0fWGUkHGwSBb^k% zBnLOY${Vf3tP-UO=)+-o263xW_bO~T%M_;(-RXpcdoEw>Hds6k;5vfu8g7Y z^avT|_xayg#}<+vaviB+Bh|Z5&FUR2;aL^m>n?nDrtHG~yp@MWM1uDbRI6D5fn({g zm(4rg@pPi#dPj~Y*RuHU*U$f0)^%w!_qMYRu!o7apc(6bEpzq1c(LeZvIX2EmFecG z1L~rzEG<+kRayzb2GomMli6JEIVU%*3!C%3DOyEJy`n`ZjRD*~VUKY!u@T+&7ATij z2M0zE0IWnAa?j~y5b&Z+Y!gaa#iyjwT_sQR&OCeTJHkuK!%6K(w&55xuA@$FWA0lf z0PoAgy-odN*DsV$!;AVh!t2^-yPpoOg(p&-hU`>2@0J`YBQb`$2}6Q!>)Qa+$S_n?2G4 zASYZjz5C5<&w-Idf=-^4>V^JOG%!&mAg4t~fO_A&uK6|y7z7Lg1_6VBLBJqj5V-#l zAfLFdiJVfO-)FIxOS{6T<38|-$$bu@@q8gQU(C2Uw7uGWcG6iVZ)4z6kD^PJr#+s- zkg?>ZE2l_MMB8_GCR6gk{L`$}@fU=Uyc;m%R#q_``KK yYenjE4PS7nrmvYuy&4k38w3mj1_6VBLBJqj5O|;wsPK`iBlQK2f>SexyZ-=q7eCJc literal 0 HcmV?d00001 From aa1f45a279fe38483c14b52b1088ef2de5169f9c Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Wed, 7 Oct 2020 17:37:09 +1100 Subject: [PATCH 11/11] isort --- src/scmdata/netcdf.py | 2 +- src/scmdata/run.py | 6 +++--- src/scmdata/time.py | 2 +- tests/unit/test_time.py | 15 ++++++++------- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/scmdata/netcdf.py b/src/scmdata/netcdf.py index 30766d65..1f4d7aa9 100644 --- a/src/scmdata/netcdf.py +++ b/src/scmdata/netcdf.py @@ -14,9 +14,9 @@ from collections import defaultdict from datetime import datetime from logging import getLogger -from xarray.coding.times import encode_cf_datetime, decode_cf_datetime import numpy as np +from xarray.coding.times import decode_cf_datetime, encode_cf_datetime from . 
import __version__ diff --git a/src/scmdata/run.py b/src/scmdata/run.py index 03321c5f..cb533993 100644 --- a/src/scmdata/run.py +++ b/src/scmdata/run.py @@ -13,15 +13,15 @@ from logging import getLogger from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union +import cftime import numpy as np import numpy.testing as npt import openscm_units.unit_registry as ur import pandas as pd import pint from dateutil import parser -from xarray.core.ops import inject_binary_ops from xarray import CFTimeIndex -import cftime +from xarray.core.ops import inject_binary_ops from .errors import MissingRequiredColumnError, NonUniqueMetadataError from .filters import ( @@ -41,9 +41,9 @@ from .pyam_compat import IamDataFrame, LongDatetimeIamDataFrame from .time import ( _TARGET_DTYPE, + TimePoints, TimeseriesConverter, decode_datetimes_to_index, - TimePoints, ) from .units import UnitConverter diff --git a/src/scmdata/time.py b/src/scmdata/time.py index a7403888..70de37e5 100644 --- a/src/scmdata/time.py +++ b/src/scmdata/time.py @@ -9,9 +9,9 @@ import cftime import numpy as np import pandas as pd +from dateutil import parser from pandas.errors import OutOfBoundsDatetime from xarray import CFTimeIndex -from dateutil import parser _TARGET_TYPE = np.int64 _TARGET_DTYPE = "datetime64[s]" diff --git a/tests/unit/test_time.py b/tests/unit/test_time.py index 83730a3e..e68b6518 100644 --- a/tests/unit/test_time.py +++ b/tests/unit/test_time.py @@ -1,14 +1,15 @@ -import pytest -from scmdata.time import _format_datetime, decode_datetimes_to_index, _CFTIME_CALENDARS import datetime as dt -import numpy as np -from xarray import CFTimeIndex +import cftime +import numpy as np +import pandas as pd +import pandas.testing as pdt +import pytest from pandas import DatetimeIndex from pandas.errors import OutOfBoundsDatetime -import pandas.testing as pdt -import pandas as pd -import cftime +from xarray import CFTimeIndex + +from scmdata.time import _CFTIME_CALENDARS, 
_format_datetime, decode_datetimes_to_index input_type = pytest.mark.parametrize( "input_type",