# Source code for mizani.labels

"""
Scales have guides and these are what help users make sense of
the data mapped onto the scale. Common examples of guides include
the x-axis, the y-axis, the keyed legend and a colorbar legend.
The guides have demarcations (breaks), some of which must be labelled.

The `label_*` functions below create functions that convert data
values as understood by a specific scale and return string
representations of those values. Manipulating the string
representation of a value helps improve readability of the guide.
"""

from __future__ import annotations

import re
import typing
from bisect import bisect_right
from dataclasses import dataclass
from zoneinfo import ZoneInfo

import numpy as np

from .breaks import timedelta_helper
from .utils import (
    match,
    precision,
    round_any,
    same_log10_order_of_magnitude,
)

if typing.TYPE_CHECKING:
    from datetime import datetime, tzinfo
    from typing import Literal, Optional, Sequence

    from mizani.typing import (
        BytesSymbol,
        DurationUnit,
        FloatArrayLike,
        NDArrayTimedelta,
        TupleInt2,
    )

# Public names exported by a star-import of this module.
__all__ = [
    "label_comma",
    "label_custom",
    "label_currency",
    "label_dollar",
    "label_percent",
    "label_scientific",
    "label_date",
    "label_number",
    "label_log",
    "label_timedelta",
    "label_pvalue",
    "label_ordinal",
    "label_bytes",
]

# Module-level UTC time zone object.
UTC = ZoneInfo("UTC")



@dataclass
class label_number:
    """
    Labelling numbers

    Parameters
    ----------
    accuracy : float, optional
        Number to round to, e.g. ``0.01`` to show two decimal
        places. If neither `accuracy` nor `precision` is given,
        a value is computed from the (scaled) data every time the
        labeller is called.
    precision : int, optional
        Number of digits after the decimal point. It is converted
        to an accuracy of ``10**-precision``. Specify at most one
        of `precision` and `accuracy`.
    scale : float
        Multiplicative factor applied to the values before they
        are formatted.
    prefix : str
        What to put before the value.
    suffix : str
        What to put after the value.
    big_mark : str
        The thousands separator. This is usually
        a comma or a dot.
    decimal_mark : str
        What to use to separate the decimals digits.
    fill : str
        Character used for padding when `width` is given.
    style_negative : "-" | "hyphen" | "parens"
        How negative values are marked; with an ASCII minus, with
        the unicode minus sign or by enclosing them in parentheses.
    style_positive : "" | "+" | " "
        What to put in front of the non-negative values.
    align : "<" | ">" | "=" | "^"
        Alignment of the label within `width`.
    width : int, optional
        Minimum width of the labels. If None, the labels are
        not padded.

    Raises
    ------
    ValueError
        If both `precision` and `accuracy` are specified.

    Examples
    --------
    >>> label_number()([.654, .8963, .1])
    ['0.65', '0.90', '0.10']
    >>> label_number(accuracy=0.0001)([.654, .8963, .1])
    ['0.6540', '0.8963', '0.1000']
    >>> label_number(precision=4)([.654, .8963, .1])
    ['0.6540', '0.8963', '0.1000']
    >>> label_number(prefix="$")([5, 24, -42])
    ['$5', '$24', '-$42']
    >>> label_number(suffix="s")([5, 24, -42])
    ['5s', '24s', '-42s']
    >>> label_number(big_mark="_")([1e3, 1e4, 1e5, 1e6])
    ['1_000', '10_000', '100_000', '1_000_000']
    >>> label_number(width=3)([1, 10, 100, 1000])
    ['  1', ' 10', '100', '1000']
    >>> label_number(align="^", width=5)([1, 10, 100, 1000])
    ['  1  ', ' 10  ', ' 100 ', '1000 ']
    >>> label_number(style_positive=" ")([5, 24, -42])
    [' 5', ' 24', '-42']
    >>> label_number(style_positive="+")([5, 24, -42])
    ['+5', '+24', '-42']
    >>> label_number(prefix="$", style_negative="parens")([5, 24, -42])
    ['$5', '$24', '($42)']
    """

    accuracy: Optional[float] = None
    precision: Optional[int] = None
    scale: float = 1
    prefix: str = ""
    suffix: str = ""
    big_mark: str = ""
    decimal_mark: str = "."
    fill: str = ""
    style_negative: Literal["-", "hyphen", "parens"] = "-"
    style_positive: Literal["", "+", " "] = ""
    align: Literal["<", ">", "=", "^"] = ">"
    width: Optional[int] = None

    def __post_init__(self):
        # precision is only a convenient way of specifying the accuracy
        if self.precision is not None:
            if self.accuracy is not None:
                raise ValueError("Specify only one of precision or accuracy")
            self.accuracy = 10**-self.precision

    def __call__(self, x: FloatArrayLike) -> Sequence[str]:
        """
        Format a sequence of inputs

        Parameters
        ----------
        x : array
            Input

        Returns
        -------
        out : list
            List of strings.
        """
        # Construct formatting according to
        # https://docs.python.org/3/library/string.html#format-string-syntax
        # Specifically using the Format Specification Mini-Language

        # python format only accepts ",", "_" to separate the thousands
        # if we have a non-standard value, we use "," & replace it after
        valid_big_mark = self.big_mark in ("", ",", "_")
        sep = self.big_mark if valid_big_mark else ","

        # The marks are substituted in a single pass over the bare
        # number. Doing it one mark after the other (or on the whole
        # label) would re-replace a big_mark of "." when the
        # decimal_mark is changed and would also mangle any "," or "."
        # that is part of the prefix or the suffix.
        marks = {}
        if not valid_big_mark:
            marks[","] = self.big_mark
        if self.decimal_mark != ".":
            marks["."] = self.decimal_mark
        table = str.maketrans(marks)

        wrap = f"{self.prefix}{{num}}{self.suffix}".format

        x = np.asarray(x)
        x_scaled = x * self.scale

        if self.accuracy is None:
            accuracy = precision(x_scaled)
        else:
            accuracy = self.accuracy

        # The rounded (unscaled) values are only used to decide the
        # sign of a label; the digits come from the scaled values.
        x = round_any(x, accuracy / self.scale)
        digits = -np.floor(np.log10(accuracy)).astype(int)
        digits = np.minimum(np.maximum(digits, 0), 20)

        res = [
            wrap(num=f"{abs(n):{sep}.{digits}f}".translate(table))
            for n in x_scaled
        ]

        pos_fmt = f"{self.style_positive}{{s}}".format

        if self.style_negative == "-":
            neg_fmt = "-{s}".format
        elif self.style_negative == "hyphen":
            neg_fmt = "\u2212{s}".format
        else:
            neg_fmt = "({s})".format

        res = [
            neg_fmt(s=s) if num < 0 else pos_fmt(s=s) for num, s in zip(x, res)
        ]

        if self.width is not None:
            pad = f"{{s:{self.fill}{self.align}{self.width}}}".format
            res = [pad(s=s) for s in res]

        return res





@dataclass
class label_custom:
    """
    Creating a custom labelling function

    Parameters
    ----------
    fmt : str, optional
        Format string. Default is the generic new style
        format braces, ``{}``.
    style : 'new' | 'old'
        Whether to use new style or old style formatting.
        New style uses the :meth:`str.format` while old
        style uses ``%``. The format string must be written
        accordingly.

    Examples
    --------
    >>> label = label_custom('{:.2f} USD')
    >>> label([3.987, 2, 42.42])
    ['3.99 USD', '2.00 USD', '42.42 USD']
    >>> label_custom('%d items', style='old')([1, 2])
    ['1 items', '2 items']
    """

    fmt: str = "{}"
    style: Literal["old", "new"] = "new"

    def __call__(self, x: FloatArrayLike) -> Sequence[str]:
        """
        Format a sequence of inputs

        Parameters
        ----------
        x : array
            Input

        Returns
        -------
        out : list
            List of strings.

        Raises
        ------
        ValueError
            If `style` is neither 'new' nor 'old'.
        """
        if self.style == "new":
            return [self.fmt.format(val) for val in x]
        elif self.style == "old":
            return [self.fmt % val for val in x]
        else:
            raise ValueError("style should be either 'new' or 'old'")




# formatting functions

@dataclass
class label_currency(label_number):
    """
    Labelling currencies

    Accepts the same parameters as :class:`label_number`. The
    differences are the default `prefix` and that, when neither
    `precision` nor `accuracy` is given, a precision of 2 is used.

    Parameters
    ----------
    prefix : str
        What to put before the value.

    Examples
    --------
    >>> x = [1.232, 99.2334, 4.6, 9, 4500]
    >>> label_currency()(x)
    ['$1.23', '$99.23', '$4.60', '$9.00', '$4500.00']
    >>> label_currency(prefix='C$', precision=0, big_mark=',')(x)
    ['C$1', 'C$99', 'C$5', 'C$9', 'C$4,500']
    """

    prefix: str = "$"

    def __post_init__(self):
        # Default to two decimal places, unless the user has
        # asked for something specific
        if self.precision is None and self.accuracy is None:
            self.precision = 2
        super().__post_init__()



# label_dollar is another name for the label_currency class
label_dollar = label_currency
# Ready-made labeller instance created with the default settings
dollar = label_dollar()



@dataclass
class label_comma(label_currency):
    """
    Labels of numbers with commas as separators

    Accepts the same parameters as :class:`label_number`, with
    defaults of no prefix, zero decimal places and a comma as
    the thousands separator. Since `precision` has a default
    that is not None, using `accuracy` requires that
    ``precision=None`` is passed as well.

    Parameters
    ----------
    precision : int
        Number of digits after the decimal point.

    Examples
    --------
    >>> label_comma()([1000, 2, 33000, 400])
    ['1,000', '2', '33,000', '400']
    """

    prefix: str = ""
    precision: int = 0
    big_mark: str = ","




@dataclass
class label_percent(label_number):
    """
    Labelling percentages

    Multiply by one hundred and display percent sign

    Accepts the same parameters as :class:`label_number`; only
    the defaults of `scale` (100) and `suffix` ("%") differ.

    Examples
    --------
    >>> label = label_percent()
    >>> label([.45, 9.515, .01])
    ['45%', '952%', '1%']
    >>> label([.654, .8963, .1])
    ['65%', '90%', '10%']
    """

    scale: float = 100
    suffix: str = "%"



# Ready-made labeller instance created with the default settings
percent = label_percent()



@dataclass
class label_scientific:
    """
    Scientific number labels

    Parameters
    ----------
    digits : int
        Number of digits after the decimal point of the mantissa.
        Trailing zeros that are common to all the labels are
        removed, so fewer digits may be shown.

    Examples
    --------
    >>> x = [.12, .23, .34, 45]
    >>> label_scientific()(x)
    ['1.2e-01', '2.3e-01', '3.4e-01', '4.5e+01']
    >>> label_scientific()([1, 10, 100])
    ['1e+00', '1e+01', '1e+02']

    Notes
    -----
    Be careful when using many digits (15+ on a 64
    bit computer). Consider the `machine epsilon`_.

    .. _machine epsilon: https://en.wikipedia.org/wiki/Machine_epsilon
    """

    digits: int = 3

    def __post_init__(self):
        tpl = f"{{:.{self.digits}e}}"
        self._label = label_custom(tpl)
        # One or more zeros right before the exponent marker
        self.trailling_zeros_pattern = re.compile(r"(0+)e")

    def __call__(self, x: FloatArrayLike) -> Sequence[str]:
        """
        Format a sequence of inputs

        Parameters
        ----------
        x : array
            Input

        Returns
        -------
        out : list
            List of strings.
        """
        if len(x) == 0:
            return []

        def count_zeros(s: str) -> int:
            """
            Number of trailing zeros in the fractional part of s
            """
            mantissa, e, _ = s.partition("e")
            # Without a decimal point the only digit before the "e"
            # is the integer part and it must not be stripped
            if not e or "." not in mantissa:
                return 0
            found = self.trailling_zeros_pattern.search(s)
            return len(found.group(1)) if found else 0

        # format and then remove superfluous zeros
        labels = self._label(x)
        n = min(count_zeros(val) for val in labels)
        if n:
            zeros_e = "0" * n + "e"
            labels = [val.replace(zeros_e, "e", 1) for val in labels]
            # When all the fractional digits have been removed,
            # the decimal point is left dangling e.g. "1.e+00"
            labels = [val.replace(".e", "e", 1) for val in labels]
        return labels




# Ready-made labeller instance created with the default settings
scientific = label_scientific()



@dataclass
class label_log:
    """
    Log number labels

    Parameters
    ----------
    base : float
        Base of the logarithm. Default is 10.
    exponent_limits : tuple
        limits (int, int) where if the any of the powers of the
        numbers falls outside, then the labels will be in
        exponent form. This only applies for base 10.
    mathtex : bool
        If True, return the labels in mathtex format as understood
        by Matplotlib.

    Examples
    --------
    >>> label_log()([0.001, 0.1, 100])
    ['0.001', '0.1', '100']

    >>> label_log()([0.0001, 0.1, 10000])
    ['1e-4', '1e-1', '1e4']

    >>> label_log(mathtex=True)([0.0001, 0.1, 10000])
    ['$10^{-4}$', '$10^{-1}$', '$10^{4}$']
    """

    base: float = 10
    exponent_limits: TupleInt2 = (-4, 4)
    mathtex: bool = False

    def _tidyup_labels(self, labels: Sequence[str]) -> Sequence[str]:
        """
        Make all labels uniform in format

        Remove redundant zeros for labels in exponential format.

        Parameters
        ----------
        labels : list-like
            Labels to be tidied.

        Returns
        -------
        out : list-like
            Labels
        """

        def remove_zeroes(s: str) -> str:
            """
            Remove unnecessary zeros for float string s
            """
            tup = s.split("e")
            if len(tup) == 2:
                mantissa = tup[0].rstrip("0").rstrip(".")
                exponent = int(tup[1])
                s = f"{mantissa}e{exponent}" if exponent else mantissa
            return s

        def as_exp(s: str) -> str:
            """
            Float string s as in exponential format
            """
            return s if "e" in s else "{:1.0e}".format(float(s))

        def as_mathtex(s: str) -> str:
            """
            Mathtex for matplotlib

            A mantissa other than 1 is kept and is joined to the
            power with a multiplication sign, so that labels which
            only differ by the mantissa remain distinct.
            """
            mantissa, _, exp = s.partition("e")
            # remove_zeroes drops a zero exponent together with the "e"
            power = f"{self.base}^{{{exp or '0'}}}"
            if mantissa == "1":
                return f"${power}$"
            return rf"${mantissa}\times{power}$"

        # If any are in exponential format, make all of
        # them exponential
        has_e = ["e" in x for x in labels]
        if any(has_e) and not all(has_e):
            labels = [as_exp(x) for x in labels]

        labels = [remove_zeroes(x) for x in labels]

        if self.mathtex and any("e" in x for x in labels):
            labels = [as_mathtex(x) for x in labels]

        return labels

    def __call__(self, x: FloatArrayLike) -> Sequence[str]:
        """
        Format a sequence of inputs

        Parameters
        ----------
        x : array
            Input

        Returns
        -------
        out : list
            List of strings.
        """
        if len(x) == 0:
            return []

        # Decide on using exponents
        if self.base == 10:
            xmin = int(np.floor(np.log10(np.min(x))))
            xmax = int(np.ceil(np.log10(np.max(x))))
            emin, emax = self.exponent_limits
            all_multiples = np.all([np.log10(num).is_integer() for num in x])
            beyond_threshold = xmin <= emin or emax <= xmax
            use_exponents = (
                same_log10_order_of_magnitude(x) or all_multiples
            ) and beyond_threshold
            fmt = "{:1.0e}" if use_exponents else "{:g}"
            labels = [fmt.format(num) for num in x]
            return self._tidyup_labels(labels)
        else:

            def _exp(num, base):
                """
                Exponent of num in the given base

                An int if it is (close to) a whole number, else
                a float rounded to 3 decimal places.
                """
                e = np.log(num) / np.log(base)
                e_round = np.round(e)
                e = int(e_round) if np.isclose(e, e_round) else np.round(e, 3)
                return e

            base_txt = f"{self.base}"
            if self.base == np.e:
                base_txt = "e"

            # The exponent placeholder is filled in for each value
            if self.mathtex:
                fmt_parts = (f"${base_txt}^", "{{{e}}}$")
            else:
                fmt_parts = (f"{base_txt}^", "{e}")

            fmt = "".join(fmt_parts)
            exps = [_exp(num, self.base) for num in x]
            labels = [fmt.format(e=e) for e in exps]
            return labels





@dataclass
class label_date:
    """
    Datetime labels

    Parameters
    ----------
    fmt : str
        Format string. See
        :ref:`strftime <strftime-strptime-behavior>`.
    tz : datetime.tzinfo | str, optional
        Time zone to which the dates are converted before they
        are formatted. A string is used as the key with which to
        create a :class:`zoneinfo.ZoneInfo`. If None, there is no
        conversion and every date is formatted as it is.

    Examples
    --------
    >>> from datetime import datetime
    >>> from zoneinfo import ZoneInfo
    >>> x = [datetime(x, 1, 1) for x in [2010, 2014, 2018, 2022]]
    >>> label_date()(x)
    ['2010-01-01', '2014-01-01', '2018-01-01', '2022-01-01']
    >>> label_date('%Y')(x)
    ['2010', '2014', '2018', '2022']

    Can format time

    >>> x = [datetime(2017, 12, 1, 16, 5, 7)]
    >>> label_date("%Y-%m-%d %H:%M:%S")(x)
    ['2017-12-01 16:05:07']

    Time zones are respected

    >>> UTC = ZoneInfo('UTC')
    >>> UG = ZoneInfo('Africa/Kampala')
    >>> x = [datetime(2010, 1, 1, i) for i in [8, 15]]
    >>> x_tz = [datetime(2010, 1, 1, i, tzinfo=UG) for i in [8, 15]]
    >>> label_date('%Y-%m-%d %H:%M')(x)
    ['2010-01-01 08:00', '2010-01-01 15:00']
    >>> label_date('%Y-%m-%d %H:%M')(x_tz)
    ['2010-01-01 08:00', '2010-01-01 15:00']

    Format with a specific time zone

    >>> label_date('%Y-%m-%d %H:%M', tz=UTC)(x_tz)
    ['2010-01-01 05:00', '2010-01-01 12:00']
    >>> label_date('%Y-%m-%d %H:%M', tz='EST')(x_tz)
    ['2010-01-01 00:00', '2010-01-01 07:00']
    """

    fmt: str = "%Y-%m-%d"
    tz: Optional[tzinfo | str] = None

    def __post_init__(self):
        # Accept the name of a time zone in place of the object
        if isinstance(self.tz, str):
            self.tz = ZoneInfo(self.tz)

    def __call__(self, x: Sequence[datetime]) -> Sequence[str]:
        """
        Format a sequence of inputs

        Parameters
        ----------
        x : array
            Input

        Returns
        -------
        out : list
            List of strings.
        """
        if self.tz is not None:
            x = [d.astimezone(self.tz) for d in x]
        return [d.strftime(self.fmt) for d in x]





@dataclass
class label_timedelta:
    """
    Timedelta labels

    Parameters
    ----------
    units : str, optional
        The units in which the values will be expressed.
        If None, they are decided automatically. Otherwise,
        the value should be one of::

            'ns'    # nanoseconds
            'us'    # microseconds
            'ms'    # milliseconds
            's'     # seconds
            'min'   # minute
            'h'     # hour
            'day'   # day
            'week'  # week
            'month' # month
            'year'  # year

    show_units : bool
        Whether to append the units symbol to the values.
    zero_has_units : bool
        If True a value of zero is labelled with the units
        like any other value. If False, it is left bare.
    usetex : bool
        If True, the microseconds identifier string is
        rendered with greek letter *mu*. Default is False.
    space : bool
        If True add a space between the value and the units
    use_plurals : bool
        If True, when the value is not 1 and the units are
        one of `day`, `week`, `month` and `year`, the plural form
        of the unit is used e.g. `2 weeks`.

    Examples
    --------
    >>> from datetime import timedelta
    >>> x = [timedelta(days=31*i) for i in range(5)]
    >>> label_timedelta()(x)
    ['0 months', '1 month', '2 months', '3 months', '4 months']
    >>> label_timedelta(use_plurals=False)(x)
    ['0 month', '1 month', '2 month', '3 month', '4 month']
    >>> label_timedelta(units='day')(x)
    ['0 days', '31 days', '62 days', '93 days', '124 days']
    >>> label_timedelta(units='day', zero_has_units=False)(x)
    ['0', '31 days', '62 days', '93 days', '124 days']
    >>> label_timedelta(units='day', show_units=False)(x)
    ['0', '31', '62', '93', '124']
    """

    units: Optional[DurationUnit] = None
    show_units: bool = True
    zero_has_units: bool = True
    usetex: bool = False
    space: bool = True
    use_plurals: bool = True

    def __call__(self, x: NDArrayTimedelta) -> Sequence[str]:
        """
        Format a sequence of inputs

        Parameters
        ----------
        x : array
            Input

        Returns
        -------
        out : list
            List of strings.
        """
        if len(x) == 0:
            return []

        # Numeric values and the units in which they are expressed
        values, units = timedelta_helper.format_info(x, self.units)
        labels = list(label_number()(values))

        if self.show_units:
            if self.usetex and units == "us":
                units = r"$\mu s$"

            # Only the units that are spelt out get a plural form
            if self.use_plurals and units in ("day", "week", "month", "year"):
                units_plural = f"{units}s"
            else:
                units_plural = units

            if self.space:
                units = f" {units}"
                units_plural = f" {units_plural}"

            for i, (num, label) in enumerate(zip(values, labels)):
                if num == 0 and not self.zero_has_units:
                    continue
                elif num == 1:
                    labels[i] = f"{label}{units}"
                else:
                    labels[i] = f"{label}{units_plural}"

        return labels





@dataclass
class label_pvalue:
    """
    p-values labelling

    Parameters
    ----------
    accuracy : float
        Number to round to. Values that are below it after
        rounding are labelled as being less than the accuracy.
    add_p : bool
        Whether to prepend "p=" or "p<" to the output

    Examples
    --------
    >>> x = [.90, .15, .015, .009, 0.0005]
    >>> label_pvalue()(x)
    ['0.9', '0.15', '0.015', '0.009', '<0.001']
    >>> label_pvalue(0.1)(x)
    ['0.9', '0.1', '<0.1', '<0.1', '<0.1']
    >>> label_pvalue(0.1, True)(x)
    ['p=0.9', 'p=0.1', 'p<0.1', 'p<0.1', 'p<0.1']
    """

    accuracy: float = 0.001
    add_p: bool = False

    def __call__(self, x: FloatArrayLike) -> Sequence[str]:
        """
        Format a sequence of inputs

        Parameters
        ----------
        x : array
            Input

        Returns
        -------
        out : list
            List of strings.
        """
        x = round_any(x, self.accuracy)
        below = [num < self.accuracy for num in x]

        if self.add_p:
            eq_fmt = "p={:g}".format
            below_label = f"p<{self.accuracy:g}"
        else:
            eq_fmt = "{:g}".format
            below_label = f"<{self.accuracy:g}"

        labels = [below_label if b else eq_fmt(i) for i, b in zip(x, below)]
        return labels




def ordinal(n: float, prefix="", suffix="", big_mark=""):
    """
    Ordinal form of a number e.g. 1st, 2nd, 3rd, 4th

    Parameters
    ----------
    n : float
        Number. It is truncated to an integer.
    prefix : str
        What to put before the value.
    suffix : str
        What to put after the ordinal.
    big_mark : str
        The thousands separator. If empty, the digits are
        not grouped.

    Returns
    -------
    out : str
        The ordinal label.
    """
    # General Case: 0th, 1st, 2nd, 3rd, 4th, 5th, 6th, 7th, 8th, 9th
    # Special Case: 10th, 11th, 12th, 13th
    n = int(n)
    # The ending depends on the last digits irrespective of the sign;
    # the modulo of a negative number would pick the wrong digits
    magnitude = abs(n)
    if 11 <= (magnitude % 100) <= 13:
        _suffix = "th"
    else:
        idx = min(magnitude % 10, 4)
        _suffix = ("th", "st", "nd", "rd", "th")[idx]

    if big_mark:
        # python format only groups with "," (or "_"), so group with
        # a comma and then swap in the requested mark
        s = f"{n:,}"
        if big_mark != ",":
            s = s.replace(",", big_mark)
    else:
        s = f"{n}"

    return f"{prefix}{s}{_suffix}{suffix}"



@dataclass
class label_ordinal:
    """
    Ordinal number labelling

    Parameters
    ----------
    prefix : str
        What to put before the value.
    suffix : str
        What to put after the value.
    big_mark : str
        The thousands separator. This is usually
        a comma or a dot.

    Examples
    --------
    >>> label_ordinal()(range(8))
    ['0th', '1st', '2nd', '3rd', '4th', '5th', '6th', '7th']
    >>> label_ordinal(suffix=' Number')(range(11, 15))
    ['11th Number', '12th Number', '13th Number', '14th Number']
    """

    prefix: str = ""
    suffix: str = ""
    big_mark: str = ""

    def __call__(self, x: FloatArrayLike) -> Sequence[str]:
        """
        Format a sequence of inputs

        Parameters
        ----------
        x : array
            Input

        Returns
        -------
        out : list
            List of strings.
        """
        labels = [
            ordinal(num, self.prefix, self.suffix, self.big_mark) for num in x
        ]
        return labels





@dataclass
class label_bytes:
    """
    Labelling byte numbers

    Parameters
    ----------
    symbol : str
        Valid symbols are "B", "kB", "MB", "GB", "TB", "PB", "EB",
        "ZB", and "YB" for SI units, and the "iB" variants for
        binary units. Default is "auto" where the symbol to be
        used is determined separately for each value of `x`.
    units : "binary" | "si"
        Which unit base to use, 1024 for "binary" or 1000 for "si".
    fmt : str, optional
        Format string for the value. The symbol is appended to the
        formatted value. Default is ``"{:.0f} "``, i.e. no decimal
        places and a space between the value and the symbol.

    Raises
    ------
    ValueError
        If `symbol` is not "auto" or one of the symbols of the
        chosen `units`.

    Examples
    --------
    >>> x = [1000, 1000000, 4e5]
    >>> label_bytes()(x)
    ['1000 B', '977 KiB', '391 KiB']
    >>> label_bytes(units='si')(x)
    ['1 kB', '1 MB', '400 kB']
    """

    symbol: Literal["auto"] | BytesSymbol = "auto"
    units: Literal["binary", "si"] = "binary"
    fmt: str = "{:.0f} "

    def __post_init__(self):
        if self.units == "si":
            self.base = 1000
            self._all_symbols = (
                "B",
                "kB",
                "MB",
                "GB",
                "TB",
                "PB",
                "EB",
                "ZB",
                "YB",
            )
        else:
            self.base = 1024
            self._all_symbols = (
                "B",
                "KiB",
                "MiB",
                "GiB",
                "TiB",
                "PiB",
                "EiB",
                "ZiB",
                "YiB",
            )

        # possible exponents of base: eg 1000^1, 1000^2, 1000^3, ...
        exponents = np.arange(1, len(self._all_symbols) + 1, dtype=float)
        self._powers = self.base**exponents
        self._validate_symbol(self.symbol, ("auto",) + self._all_symbols)

    def __call__(self, x: FloatArrayLike) -> Sequence[str]:
        """
        Format a sequence of inputs

        Parameters
        ----------
        x : array
            Input

        Returns
        -------
        out : list
            List of strings.
        """
        _all_symbols = self._all_symbols
        symbol = self.symbol
        if symbol == "auto":
            # There are as many powers as there are symbols, so for
            # very large values the position can be one past the last
            # symbol. Such values are expressed in the largest unit.
            largest = len(_all_symbols) - 1
            power = [
                min(bisect_right(self._powers, val), largest) for val in x
            ]
            symbols = [_all_symbols[p] for p in power]
        else:
            # The position of a symbol is the exponent of its unit
            power = _all_symbols.index(symbol)
            symbols = [symbol] * len(x)

        x = np.asarray(x)
        power = np.asarray(power, dtype=float)
        values = x / self.base**power
        fmt = (self.fmt + "{}").format
        labels = [fmt(v, s) for v, s in zip(values, symbols)]
        return labels

    def _validate_symbol(self, symbol: str, allowed_symbols: Sequence[str]):
        """
        Raise a ValueError if symbol is not among the allowed symbols
        """
        if symbol not in allowed_symbols:
            raise ValueError(
                "Symbol must be one of {}".format(allowed_symbols)
            )



# Deprecated
# The former *_format names, kept as aliases of the label_* classes
comma_format = label_comma
custom_format = label_custom
currency_format = label_currency
dollar_format = label_dollar
percent_format = label_percent
scientific_format = label_scientific
date_format = label_date
number_format = label_number
log_format = label_log
timedelta_format = label_timedelta
pvalue_format = label_pvalue
ordinal_format = label_ordinal
number_bytes_format = label_bytes