# Source code for mizani.labels

"""
Scales have guides and these are what help users make sense of
the data mapped onto the scale. Common examples of guides include
the x-axis, the y-axis, the keyed legend and a colorbar legend.
The guides have demarcations (breaks), some of which must be labelled.

The `label_*` functions below create functions that convert data
values as understood by a specific scale and return string
representations of those values. Manipulating the string
representation of a value helps improve readability of the guide.
"""

from __future__ import annotations

import re
import typing
from bisect import bisect_right
from dataclasses import dataclass
from zoneinfo import ZoneInfo

import numpy as np

from .breaks import timedelta_helper
from .utils import (
    match,
    precision,
    round_any,
    same_log10_order_of_magnitude,
)

if typing.TYPE_CHECKING:
    from datetime import datetime, tzinfo
    from typing import Literal, Optional, Sequence

    from mizani.typing import (
        BytesSymbol,
        DurationUnit,
        FloatArrayLike,
        NDArrayTimedelta,
        TupleInt2,
    )

# Names exported by ``from mizani.labels import *``
__all__ = [
    "label_comma",
    "label_custom",
    "label_currency",
    "label_dollar",
    "label_percent",
    "label_scientific",
    "label_date",
    "label_number",
    "label_log",
    "label_timedelta",
    "label_pvalue",
    "label_ordinal",
    "label_bytes",
]

# Time zone object for UTC
UTC = ZoneInfo("UTC")


@dataclass
class label_number:
    """
    Labelling numbers

    Parameters
    ----------
    accuracy : float, optional
        Number to round to, e.g. ``0.01`` shows two digits after the
        decimal point. If both `accuracy` and `precision` are ``None``,
        it is computed from the scaled values on every call.
    precision : int, optional
        Number of digits after the decimal point. It is converted to
        ``accuracy = 10 ** -precision``, so only one of `precision`
        and `accuracy` may be specified.
    scale : float
        Factor by which the values are multiplied before they are
        formatted.
    prefix : str
        What to put before the value.
    suffix : str
        What to put after the value.
    big_mark : str
        The thousands separator. This is usually a comma or a dot.
    decimal_mark : str
        What to use to separate the decimals digits.
    fill : str
        Character used to pad the label up to `width`.
    style_negative : "-" | "hyphen" | "parens"
        How negative numbers are marked. ``"-"`` prepends the ASCII
        hyphen-minus, ``"hyphen"`` prepends the unicode character
        U+2212 and any other value wraps the label in parentheses.
    style_positive : "" | "+" | " "
        What to put in front of numbers that are not negative.
    align : "<" | ">" | "=" | "^"
        Alignment of the label within `width`.
    width : int, optional
        Minimum width of the label. If ``None`` no padding is done.

    Raises
    ------
    ValueError
        If both `precision` and `accuracy` are given.

    Examples
    --------
    >>> label_number()([.654, .8963, .1])
    ['0.65', '0.90', '0.10']
    >>> label_number(accuracy=0.0001)([.654, .8963, .1])
    ['0.6540', '0.8963', '0.1000']
    >>> label_number(precision=4)([.654, .8963, .1])
    ['0.6540', '0.8963', '0.1000']
    >>> label_number(prefix="$")([5, 24, -42])
    ['$5', '$24', '-$42']
    >>> label_number(suffix="s")([5, 24, -42])
    ['5s', '24s', '-42s']
    >>> label_number(big_mark="_")([1e3, 1e4, 1e5, 1e6])
    ['1_000', '10_000', '100_000', '1_000_000']
    >>> label_number(precision=2, big_mark=".", decimal_mark=",")([1234.5])
    ['1.234,50']
    >>> label_number(width=3)([1, 10, 100, 1000])
    ['  1', ' 10', '100', '1000']
    >>> label_number(align="^", width=5)([1, 10, 100, 1000])
    ['  1  ', ' 10  ', ' 100 ', '1000 ']
    >>> label_number(style_positive=" ")([5, 24, -42])
    [' 5', ' 24', '-42']
    >>> label_number(style_positive="+")([5, 24, -42])
    ['+5', '+24', '-42']
    >>> label_number(prefix="$", style_negative="parens")([5, 24, -42])
    ['$5', '$24', '($42)']
    """

    accuracy: Optional[float] = None
    precision: Optional[int] = None
    scale: float = 1
    prefix: str = ""
    suffix: str = ""
    big_mark: str = ""
    decimal_mark: str = "."
    fill: str = ""
    style_negative: Literal["-", "hyphen", "parens"] = "-"
    style_positive: Literal["", "+", " "] = ""
    align: Literal["<", ">", "=", "^"] = ">"
    width: Optional[int] = None

    def __post_init__(self):
        if self.precision is not None:
            if self.accuracy is not None:
                raise ValueError("Specify only one of precision or accuracy")
            self.accuracy = 10**-self.precision

    def __call__(self, x: FloatArrayLike) -> Sequence[str]:
        """
        Format a sequence of inputs

        Parameters
        ----------
        x : array
            Input

        Returns
        -------
        out : list
            List of strings.
        """
        # Construct formatting according to
        # https://docs.python.org/3/library/string.html#format-string-syntax
        # Specifically using the Format Specification Mini-Language

        # python format only accepts ",", "_" to separate the thousands
        # if we have a non-standard value, we use "," & swap it afterwards
        valid_big_mark = self.big_mark in ("", ",", "_")
        sep = self.big_mark if valid_big_mark else ","
        num_fmt = f"{{num:{sep}.{{precision}}f}}".format

        x = np.asarray(x)
        x_scaled = x * self.scale

        if self.accuracy is None:
            accuracy = precision(x_scaled)
        else:
            accuracy = self.accuracy

        # The rounded (unscaled) values only decide the sign below, so
        # that a value which rounds to zero is not labelled as negative.
        x = round_any(x, accuracy / self.scale)
        digits = -np.floor(np.log10(accuracy)).astype(int)
        digits = np.minimum(np.maximum(digits, 0), 20)

        res = [num_fmt(num=abs(n), precision=digits) for n in x_scaled]

        # Swap the marks in a single pass. Chained str.replace calls
        # would rewrite the freshly inserted big mark when it is "."
        # and the decimal mark is ",".
        swaps = {}
        if not valid_big_mark:
            swaps[","] = self.big_mark
        if self.decimal_mark != ".":
            swaps["."] = self.decimal_mark
        if swaps:
            table = str.maketrans(swaps)
            res = [s.translate(table) for s in res]

        # The prefix and suffix are attached after the marks have been
        # swapped so that their own "," and "." are left untouched.
        res = [f"{self.prefix}{s}{self.suffix}" for s in res]

        pos_fmt = f"{self.style_positive}{{s}}".format

        if self.style_negative == "-":
            neg_fmt = "-{s}".format
        elif self.style_negative == "hyphen":
            neg_fmt = "\u2212{s}".format
        else:
            neg_fmt = "({s})".format

        res = [
            neg_fmt(s=s) if num < 0 else pos_fmt(s=s)
            for num, s in zip(x, res)
        ]

        if self.width is not None:
            pad_fmt = f"{{s:{self.fill}{self.align}{self.width}}}".format
            res = [pad_fmt(s=s) for s in res]

        return res
@dataclass
class label_custom:
    """
    Labels created from a user supplied format string

    Parameters
    ----------
    fmt : str, optional
        Format string applied to every value. The default is the
        generic new style format braces, ``{}``.
    style : 'new' | 'old'
        Formatting flavour that `fmt` is written in. ``'new'`` means
        the string is used with :meth:`str.format`, ``'old'`` means it
        is used with the ``%`` operator.

    Examples
    --------
    >>> label = label_custom('{:.2f} USD')
    >>> label([3.987, 2, 42.42])
    ['3.99 USD', '2.00 USD', '42.42 USD']
    """

    fmt: str = "{}"
    style: Literal["old", "new"] = "new"

    def __call__(self, x: FloatArrayLike) -> Sequence[str]:
        """
        Format a sequence of inputs

        Parameters
        ----------
        x : array
            Input

        Returns
        -------
        out : list
            List of strings.

        Raises
        ------
        ValueError
            If `style` is neither ``'new'`` nor ``'old'``.
        """
        flavour = self.style
        if flavour not in ("new", "old"):
            raise ValueError("style should be either 'new' or 'old'")

        template = self.fmt
        if flavour == "old":
            return [template % val for val in x]
        return [template.format(val) for val in x]
# formatting functions
@dataclass
class label_currency(label_number):
    """
    Labelling currencies

    This is :class:`label_number` with a currency symbol as the
    prefix and, unless `precision` or `accuracy` is given, two digits
    after the decimal point.

    Parameters
    ----------
    prefix : str
        What to put before the value.

    Examples
    --------
    >>> x = [1.232, 99.2334, 4.6, 9, 4500]
    >>> label_currency()(x)
    ['$1.23', '$99.23', '$4.60', '$9.00', '$4500.00']
    >>> label_currency(prefix='C$', precision=0, big_mark=',')(x)
    ['C$1', 'C$99', 'C$5', 'C$9', 'C$4,500']
    """

    prefix: str = "$"

    def __post_init__(self):
        # Fall back to two decimals only when the caller has not
        # stated how the values should be rounded.
        rounding_unspecified = self.precision is None and self.accuracy is None
        if rounding_unspecified:
            self.precision = 2
        super().__post_init__()
# ``label_dollar`` is a second name for ``label_currency``
label_dollar = label_currency
# Ready-made labeller created with the default arguments
dollar = label_dollar()
@dataclass
class label_comma(label_currency):
    """
    Labels of numbers with commas as separators

    Parameters
    ----------
    precision : int, optional
        Number of digits after the decimal point. If neither
        `precision` nor `accuracy` is given, zero digits are shown.

    Examples
    --------
    >>> label_comma()([1000, 2, 33000, 400])
    ['1,000', '2', '33,000', '400']
    """

    prefix: str = ""
    precision: Optional[int] = None
    big_mark: str = ","

    def __post_init__(self):
        # The default of zero decimals is applied here and not as the
        # field default. A field default of 0 counts as an explicitly
        # given precision, which made every label_comma(accuracy=...)
        # fail with "Specify only one of precision or accuracy".
        if self.precision is None and self.accuracy is None:
            self.precision = 0
        super().__post_init__()
@dataclass
class label_percent(label_number):
    """
    Labelling percentages

    This is :class:`label_number` where the default `scale` is 100
    and the default `suffix` is the percent sign.

    Examples
    --------
    >>> label = label_percent()
    >>> label([.45, 9.515, .01])
    ['45%', '952%', '1%']
    >>> label([.654, .8963, .1])
    ['65%', '90%', '10%']
    """

    suffix: str = "%"
    scale: float = 100
# Ready-made labeller created with the default arguments
percent = label_percent()
@dataclass
class label_scientific:
    """
    Scientific number labels

    Parameters
    ----------
    digits : int
        Number of digits after the decimal point with which the
        values are first formatted. Trailing zeros of the mantissa
        that all the labels have in common are then removed.

    Examples
    --------
    >>> x = [.12, .23, .34, 45]
    >>> label_scientific()(x)
    ['1.2e-01', '2.3e-01', '3.4e-01', '4.5e+01']

    Notes
    -----
    Be careful when using many digits (15+ on a 64 bit computer).
    Consider the `machine epsilon`_.

    .. _machine epsilon: https://en.wikipedia.org/wiki/Machine_epsilon
    """

    digits: int = 3

    def __post_init__(self):
        # Matches the run of zeros that sits right before the exponent
        self.trailling_zeros_pattern = re.compile(r"(0+)e")
        self._label = label_custom(f"{{:.{self.digits}e}}")

    def __call__(self, x: FloatArrayLike) -> Sequence[str]:
        """
        Format a sequence of inputs

        Parameters
        ----------
        x : array
            Input

        Returns
        -------
        out : list
            List of strings.
        """
        if len(x) == 0:
            return []

        pattern = self.trailling_zeros_pattern

        def trailing_zeros(s):
            found = pattern.search(s)
            return len(found.group(1)) if found else 0

        labels = self._label(x)

        # Only the zeros that every label has can be dropped without
        # the labels ending up with different numbers of digits.
        n = min(trailing_zeros(val) for val in labels)
        if not n:
            return labels

        surplus = "0" * n + "e"
        return [val.replace(surplus, "e") for val in labels]
# Ready-made labeller created with the default arguments
scientific = label_scientific()
@dataclass
class label_log:
    """
    Log number labels

    Parameters
    ----------
    base : float
        Base of the logarithm. Default is 10.
    exponent_limits : tuple
        limits (int, int) where if the any of the powers of the
        numbers falls outside, then the labels will be in
        exponent form. This only applies for base 10.
    mathtex : bool
        If True, return the labels in mathtex format as understood
        by Matplotlib. For base 10, a mantissa other than 1 is kept
        and shown as a factor in front of the power.

    Examples
    --------
    >>> label_log()([0.001, 0.1, 100])
    ['0.001', '0.1', '100']

    >>> label_log()([0.0001, 0.1, 10000])
    ['1e-4', '1e-1', '1e4']

    >>> label_log(mathtex=True)([0.0001, 0.1, 10000])
    ['$10^{-4}$', '$10^{-1}$', '$10^{4}$']
    """

    base: float = 10
    exponent_limits: TupleInt2 = (-4, 4)
    mathtex: bool = False

    def _tidyup_labels(self, labels: Sequence[str]) -> Sequence[str]:
        """
        Make all labels uniform in format

        Remove redundant zeros for labels in exponential format.
        If `mathtex` is True and any label is in exponential format,
        all the labels are converted to mathtex.

        Parameters
        ----------
        labels : list-like
            Labels to be tidied.

        Returns
        -------
        out : list-like
            Labels
        """

        def remove_zeroes(s: str) -> str:
            """
            Remove unnecessary zeros for float string s
            """
            tup = s.split("e")
            if len(tup) == 2:
                mantissa = tup[0].rstrip("0").rstrip(".")
                exponent = int(tup[1])
                s = f"{mantissa}e{exponent}" if exponent else mantissa
            return s

        def as_exp(s: str) -> str:
            """
            Float string s as in exponential format
            """
            return s if "e" in s else "{:1.0e}".format(float(s))

        def as_mathtex(s: str) -> str:
            """
            Mathtex for matplotlib
            """
            mantissa, sep, exp = s.partition("e")
            if not sep:
                # remove_zeroes drops an exponent of zero, so a bare
                # "1" stands for base^0. Any other bare value is shown
                # as it is, instead of failing an assertion.
                if mantissa == "1":
                    return f"${self.base}^{{0}}$"
                return f"${mantissa}$"

            if mantissa == "1":
                return f"${self.base}^{{{exp}}}$"
            # Keep the mantissa, otherwise e.g. 2e5 and 5e5 would both
            # be labelled as 10^5
            return f"${mantissa}\\times{self.base}^{{{exp}}}$"

        # If any are in exponential format, make all of
        # them exponential
        has_e = ["e" in x for x in labels]
        if not all(has_e) and sum(has_e):
            labels = [as_exp(x) for x in labels]

        labels = [remove_zeroes(x) for x in labels]

        has_e = ["e" in x for x in labels]
        if self.mathtex and any(has_e):
            labels = [as_mathtex(x) for x in labels]

        return labels

    def __call__(self, x: FloatArrayLike) -> Sequence[str]:
        """
        Format a sequence of inputs

        Parameters
        ----------
        x : array
            Input

        Returns
        -------
        out : list
            List of strings.
        """
        if len(x) == 0:
            return []

        # Decide on using exponents
        if self.base == 10:
            xmin = int(np.floor(np.log10(np.min(x))))
            xmax = int(np.ceil(np.log10(np.max(x))))
            emin, emax = self.exponent_limits
            all_multiples = np.all([np.log10(num).is_integer() for num in x])
            beyond_threshold = xmin <= emin or emax <= xmax
            use_exponents = (
                same_log10_order_of_magnitude(x) or all_multiples
            ) and beyond_threshold
            fmt = "{:1.0e}" if use_exponents else "{:g}"
            labels = [fmt.format(num) for num in x]
            return self._tidyup_labels(labels)
        else:

            def _exp(num, base):
                # Whole number exponents are shown as integers, the
                # rest are rounded to 3 decimal places
                e = np.log(num) / np.log(base)
                e_round = np.round(e)
                e = int(e_round) if np.isclose(e, e_round) else np.round(e, 3)
                return e

            base_txt = f"{self.base}"
            if self.base == np.e:
                base_txt = "e"

            if self.mathtex:
                fmt_parts = (f"${base_txt}^", "{{{e}}}$")
            else:
                fmt_parts = (f"{base_txt}^", "{e}")

            fmt = "".join(fmt_parts)
            exps = [_exp(num, self.base) for num in x]
            labels = [fmt.format(e=e) for e in exps]
            return labels
@dataclass
class label_date:
    """
    Datetime labels

    Parameters
    ----------
    fmt : str
        Format string. See
        :ref:`strftime <strftime-strptime-behavior>`.
    tz : datetime.tzinfo | str, optional
        Time zone to which the dates are converted before they are
        formatted. A string is turned into a time zone with
        :class:`zoneinfo.ZoneInfo`. If ``None``, the dates are
        formatted as they are, without any conversion.

    Examples
    --------
    >>> from datetime import datetime
    >>> from zoneinfo import ZoneInfo
    >>> x = [datetime(x, 1, 1) for x in [2010, 2014, 2018, 2022]]
    >>> label_date()(x)
    ['2010-01-01', '2014-01-01', '2018-01-01', '2022-01-01']
    >>> label_date('%Y')(x)
    ['2010', '2014', '2018', '2022']

    Can format time

    >>> x = [datetime(2017, 12, 1, 16, 5, 7)]
    >>> label_date("%Y-%m-%d %H:%M:%S")(x)
    ['2017-12-01 16:05:07']

    Time zones are respected

    >>> UTC = ZoneInfo('UTC')
    >>> UG = ZoneInfo('Africa/Kampala')
    >>> x = [datetime(2010, 1, 1, i) for i in [8, 15]]
    >>> x_tz = [datetime(2010, 1, 1, i, tzinfo=UG) for i in [8, 15]]
    >>> label_date('%Y-%m-%d %H:%M')(x)
    ['2010-01-01 08:00', '2010-01-01 15:00']
    >>> label_date('%Y-%m-%d %H:%M')(x_tz)
    ['2010-01-01 08:00', '2010-01-01 15:00']

    Format with a specific time zone

    >>> label_date('%Y-%m-%d %H:%M', tz=UTC)(x_tz)
    ['2010-01-01 05:00', '2010-01-01 12:00']
    >>> label_date('%Y-%m-%d %H:%M', tz='EST')(x_tz)
    ['2010-01-01 00:00', '2010-01-01 07:00']
    """

    fmt: str = "%Y-%m-%d"
    tz: Optional[tzinfo] = None

    def __post_init__(self):
        zone = self.tz
        if isinstance(zone, str):
            # Time zones given by name are looked up once, here
            self.tz = ZoneInfo(zone)

    def __call__(self, x: Sequence[datetime]) -> Sequence[str]:
        """
        Format a sequence of inputs

        Parameters
        ----------
        x : array
            Input

        Returns
        -------
        out : list
            List of strings.
        """
        zone = self.tz
        dates = x if zone is None else [d.astimezone(zone) for d in x]
        template = self.fmt
        return [d.strftime(template) for d in dates]
@dataclass
class label_timedelta:
    """
    Timedelta labels

    Parameters
    ----------
    units : str, optional
        The units in which the breaks will be computed.
        If None, they are decided automatically. Otherwise,
        the value should be one of::

            'ns'    # nanoseconds
            'us'    # microseconds
            'ms'    # milliseconds
            's'     # seconds
            'min'   # minute
            'h'     # hour
            'day'   # day
            'week'  # week
            'month' # month
            'year'  # year

    show_units : bool
        Whether to append the units symbol to the values.
    zero_has_units : bool
        If False, a value of zero is shown without the units.
    usetex : bool
        If True, the microseconds identifier string is rendered
        with the greek letter *mu*. Default is False.
    space : bool
        If True add a space between the value and the units
    use_plurals : bool
        If True, when the value is not 1 and the units are one of
        `day`, `week`, `month` and `year`, the plural form of the
        unit is used e.g. `2 weeks`.

    Examples
    --------
    >>> from datetime import timedelta
    >>> x = [timedelta(days=31*i) for i in range(5)]
    >>> label_timedelta()(x)
    ['0 months', '1 month', '2 months', '3 months', '4 months']
    >>> label_timedelta(use_plurals=False)(x)
    ['0 month', '1 month', '2 month', '3 month', '4 month']
    >>> label_timedelta(units='day')(x)
    ['0 days', '31 days', '62 days', '93 days', '124 days']
    >>> label_timedelta(units='day', zero_has_units=False)(x)
    ['0', '31 days', '62 days', '93 days', '124 days']
    >>> label_timedelta(units='day', show_units=False)(x)
    ['0', '31', '62', '93', '124']
    """

    units: Optional[DurationUnit] = None
    show_units: bool = True
    zero_has_units: bool = True
    usetex: bool = False
    space: bool = True
    use_plurals: bool = True

    def __call__(self, x: NDArrayTimedelta) -> Sequence[str]:
        """
        Format a sequence of inputs

        Parameters
        ----------
        x : array
            Input

        Returns
        -------
        out : list
            List of strings.
        """
        if len(x) == 0:
            return []

        values, units = timedelta_helper.format_info(x, self.units)
        labels = list(label_number()(values))

        if not self.show_units:
            return labels

        if self.usetex and units == "us":
            units = r"$\mu s$"

        pluralise = self.use_plurals and units in (
            "day",
            "week",
            "month",
            "year",
        )
        units_plural = f"{units}s" if pluralise else units

        gap = " " if self.space else ""
        singular = f"{gap}{units}"
        plural = f"{gap}{units_plural}"

        def attach(num, label):
            # A zero may be left bare; only a value of one is singular
            if num == 0 and not self.zero_has_units:
                return label
            return f"{label}{singular if num == 1 else plural}"

        return [attach(num, label) for num, label in zip(values, labels)]
@dataclass
class label_pvalue:
    """
    p-values labelling

    Parameters
    ----------
    accuracy : float
        Number to round to. Values that, after rounding, are less
        than this number are labelled as being below it.
    add_p : bool
        Whether to prepend "p=" or "p<" to the output

    Examples
    --------
    >>> x = [.90, .15, .015, .009, 0.0005]
    >>> label_pvalue()(x)
    ['0.9', '0.15', '0.015', '0.009', '<0.001']
    >>> label_pvalue(0.1)(x)
    ['0.9', '0.1', '<0.1', '<0.1', '<0.1']
    >>> label_pvalue(0.1, True)(x)
    ['p=0.9', 'p=0.1', 'p<0.1', 'p<0.1', 'p<0.1']
    """

    accuracy: float = 0.001
    # A flag; it was previously annotated as float
    add_p: bool = False

    def __call__(self, x: FloatArrayLike) -> Sequence[str]:
        """
        Format a sequence of inputs

        Parameters
        ----------
        x : array
            Input

        Returns
        -------
        out : list
            List of strings.
        """
        x = round_any(x, self.accuracy)
        below = [num < self.accuracy for num in x]

        if self.add_p:
            eq_fmt = "p={:g}".format
            below_label = f"p<{self.accuracy:g}"
        else:
            eq_fmt = "{:g}".format
            below_label = f"<{self.accuracy:g}"

        # Every value below the accuracy gets the same label
        labels = [below_label if b else eq_fmt(i) for i, b in zip(x, below)]
        return labels
def ordinal(n: float, prefix="", suffix="", big_mark=""):
    """
    Return the ordinal form of a number, e.g. ``1st`` or ``12th``

    Parameters
    ----------
    n : float
        Number to convert. It is truncated to an integer.
    prefix : str
        What to put before the value.
    suffix : str
        What to put after the ordinal.
    big_mark : str
        The thousands separator. If empty, the thousands are not
        separated.

    Returns
    -------
    out : str
        The ordinal.
    """
    n = int(n)

    # Special Case: 11th, 12th, 13th (also 111th, 112th, ...)
    # General Case: 0th, 1st, 2nd, 3rd, 4th, 5th, 6th, 7th, 8th, 9th
    if 11 <= n % 100 <= 13:
        ending = "th"
    else:
        ending = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")

    if not big_mark:
        number = f"{n}"
    else:
        # python can only separate with a comma (or an underscore),
        # any other mark is substituted afterwards
        number = f"{n:,}"
        if big_mark != ",":
            number = number.replace(",", big_mark)

    return f"{prefix}{number}{ending}{suffix}"
@dataclass
class label_ordinal:
    """
    Ordinal number labelling

    Parameters
    ----------
    prefix : str
        What to put before the value.
    suffix : str
        What to put after the value.
    big_mark : str
        The thousands separator. This is usually
        a comma or a dot.

    Examples
    --------
    >>> label_ordinal()(range(8))
    ['0th', '1st', '2nd', '3rd', '4th', '5th', '6th', '7th']
    >>> label_ordinal(suffix=' Number')(range(11, 15))
    ['11th Number', '12th Number', '13th Number', '14th Number']
    """

    prefix: str = ""
    suffix: str = ""
    big_mark: str = ""

    def __call__(self, x: FloatArrayLike) -> Sequence[str]:
        """
        Format a sequence of inputs

        Parameters
        ----------
        x : array
            Input

        Returns
        -------
        out : list
            List of strings.
        """
        prefix, suffix, big_mark = self.prefix, self.suffix, self.big_mark
        return [ordinal(num, prefix, suffix, big_mark) for num in x]
@dataclass
class label_bytes:
    """
    Labelling byte numbers

    Parameters
    ----------
    symbol : str
        Valid symbols are "B", "kB", "MB", "GB", "TB", "PB", "EB",
        "ZB", and "YB" for SI units, and the "iB" variants for
        binary units. Default is "auto" where the symbol to be
        used is determined separately for each value of `x`.
        Values too big for the largest symbol are labelled using
        the largest symbol.
    units : "binary" | "si"
        Which unit base to use, 1024 for "binary" or 1000 for "si".
    fmt : str, optional
        Format string for the value. Default is ``"{:.0f} "``, which
        includes the space that separates the value from the symbol.

    Raises
    ------
    ValueError
        If `symbol` is not "auto" or one of the symbols of the
        chosen `units`.

    Examples
    --------
    >>> x = [1000, 1000000, 4e5]
    >>> label_bytes()(x)
    ['1000 B', '977 KiB', '391 KiB']
    >>> label_bytes(units='si')(x)
    ['1 kB', '1 MB', '400 kB']
    """

    symbol: Literal["auto"] | BytesSymbol = "auto"
    units: Literal["binary", "si"] = "binary"
    fmt: str = "{:.0f} "

    def __post_init__(self):
        if self.units == "si":
            self.base = 1000
            self._all_symbols = (
                "B",
                "kB",
                "MB",
                "GB",
                "TB",
                "PB",
                "EB",
                "ZB",
                "YB",
            )
        else:
            self.base = 1024
            self._all_symbols = (
                "B",
                "KiB",
                "MiB",
                "GiB",
                "TiB",
                "PiB",
                "EiB",
                "ZiB",
                "YiB",
            )

        # Thresholds at which the next symbol takes over:
        # base^1, base^2, ... There is one threshold less than there
        # are symbols, so that the position found by bisect_right is
        # always a valid index into the symbols. With as many
        # thresholds as symbols, values of base^9 or more raised an
        # IndexError.
        exponents = np.arange(1, len(self._all_symbols), dtype=float)
        self._powers = self.base**exponents
        self._validate_symbol(self.symbol, ("auto",) + self._all_symbols)

    def __call__(self, x: FloatArrayLike) -> Sequence[str]:
        """
        Format a sequence of inputs

        Parameters
        ----------
        x : array
            Input

        Returns
        -------
        out : list
            List of strings.
        """
        _all_symbols = self._all_symbols
        symbol = self.symbol
        if symbol == "auto":
            # The number of thresholds that a value has reached is
            # both the power of the base to divide by and the index
            # of the symbol to use
            power = [bisect_right(self._powers, val) for val in x]
            symbols = [_all_symbols[p] for p in power]
        else:
            power = np.array(match([symbol], _all_symbols))
            symbols = [symbol] * len(x)

        x = np.asarray(x)
        power = np.asarray(power, dtype=float)
        values = x / self.base**power
        fmt = (self.fmt + "{}").format
        labels = [fmt(v, s) for v, s in zip(values, symbols)]
        return labels

    def _validate_symbol(self, symbol: str, allowed_symbols: Sequence[str]):
        """
        Raise a ValueError if symbol is not one of the allowed symbols
        """
        if symbol not in allowed_symbols:
            raise ValueError(
                "Symbol must be one of {}".format(allowed_symbols)
            )
# Deprecated
# The former ``*_format`` names, kept as aliases of the ``label_*`` classes
comma_format = label_comma
custom_format = label_custom
currency_format = label_currency
# This line used to be ``label_dollar = label_dollar``, an assignment
# that does nothing, which left ``label_dollar`` as the only labeller
# without a ``*_format`` alias.
dollar_format = label_dollar
percent_format = label_percent
scientific_format = label_scientific
date_format = label_date
number_format = label_number
log_format = label_log
timedelta_format = label_timedelta
pvalue_format = label_pvalue
ordinal_format = label_ordinal
number_bytes_format = label_bytes