Source code for aihwkit.inference.noise.hermes

# -*- coding: utf-8 -*-

# (C) Copyright 2020, 2021, 2022, 2023, 2024 IBM. All Rights Reserved.
#
# Licensed under the MIT license. See LICENSE file in the project root for details.

# pylint: disable=too-many-instance-attributes, too-many-arguments, too-many-branches

"""Phenomenological noise models for PCM devices for inference."""

from copy import deepcopy
from typing import List, Optional

from numpy import log as numpy_log
from numpy import sqrt
from torch import Tensor
from torch import abs as torch_abs
from torch import clamp, log, randn_like, zeros_like
from torch.autograd import no_grad

from aihwkit.inference.converter.base import BaseConductanceConverter
from aihwkit.inference.converter.conductance import SinglePairConductanceConverter
from aihwkit.inference.noise.base import BaseNoiseModel

_ZERO_CLIP = 1e-7



[docs]
class HermesNoiseModel(BaseNoiseModel):
    r"""A Noise model that was fitted and characterized on the PCM devices of the
    platform IBM HERMES Project Chip, see `Le Gallo et al. Nat. Electron. (2023)`_
    and `Khaddam-Aljameh et al. JSSC (2022)`_

    Expected weight noise at assumed time of inference with expected
    programming noise at 0.

    See also `Nandakumar et al. ICECS (2019)`_ for details about the
    statistical modelling methodology that was used.

    NOTE: The argument `num_devices` changes the programming method and the drift behavior of
    the model. When `num_devices` is 1, a conventional single device programming method is
    used. When `num_devices` is 2, the method from the work `Vasilopoulos et al. TED (2023)`_
    is employed (MSF), which is optimal and yields higher programming accuracy.
    For the drift characterization, though, when `num_devices` is 2 the model is applied as
    if the two devices host the same conductance and not as described in the aforementioned
    reference, due to it using a dynamic conductance mapping step which requires feedback from
    the chip in question. This simplification yields worse drift behavior in the current model
    than the one measured on-chip in the aforementioned work.

    Args:
        prog_coeff: Programming polynomial coeffs in
            :math:`\sum_i c_i \left(\frac{g_t}{g_\max}\right)^i`
        g_converter: instantiated class of the conductance converter
            (defaults to single pair)
        num_devices: The number of devices that are used to map a weight.
            Hermes supports either 1 or 2 devices per weight. When `num_devices`
            is 2, higher programming accuracy and less drift variability is measured.
            Defaults to 2.
        g_max: In :math:`\mu S`, the maximal conductance, ie the value
            the absolute max of the weights will be mapped to.
            When `num_devices = 2`, the maximum characterized conductance
            is 20.7 :math:`\mu S`. When `num_devices = 1` the maximum
            characterized value is 10.35 :math:`\mu S`. When `None` is passed (by default)
            the maximum conductance for the corresponding `num_devices` is selected.
        t_read: Parameter of the 1/f fit (in seconds).
        t_0: Parameter of the drift fit (first reading time). When `num_devices = 2`
            that time corresponds to 300s, while in the case that `num_devices = 1`
            it is 200s. If `None` is passed (by default) the time is selected according
            to the `num_devices` selection.

            Note:
                The ``t_inference`` is relative to this time `t0`
                e.g. t_inference counts from the completion of the programming
                of a device.
        prog_noise_scale: Scale for the programming noise.
        read_noise_scale: Scale for the read and accumulated noise.
        drift_scale: Scale for the drift coefficient.
        prog_coeff_g_max_reference: reference :math:`g_\max` value
            when fitting the coefficients, since the result of the
            polynomial fit is given in uS. Only used when ``prog_coeff``
            is given explicitly; if it is not given in that case, it
            will be set to ``g_max`` of the conductance converter.
            When the built-in coefficients are used (``prog_coeff`` is
            ``None``), the reference is always the characterized maximum
            (20.7 uS for two devices, 10.35 uS for one device).

    Raises:
        AssertionError: if ``num_devices`` is not 1 or 2, or if ``g_max``
            exceeds the characterized maximum for the chosen ``num_devices``.
        ValueError: if no ``g_max`` can be established from the converter.

    .. _`Nandakumar et al. ICECS (2019)`: https://ieeexplore.ieee.org/document/8964852
    .. _`Le Gallo et al. Nat. Electron. (2023)`: https://www.nature.com/articles/s41928-023-01010-1
    .. _`Khaddam-Aljameh et al. JSSC (2022)`: https://ieeexplore.ieee.org/document/9696185
    .. _`Vasilopoulos et al. TED (2023)`: https://ieeexplore.ieee.org/document/10281389

    """

    def __init__(
        self,
        prog_coeff: Optional[List[float]] = None,
        g_converter: Optional[BaseConductanceConverter] = None,
        num_devices: int = 2,
        g_max: Optional[float] = None,
        t_read: float = 512.0e-9,
        t_0: Optional[float] = None,
        prog_noise_scale: float = 1.0,
        read_noise_scale: float = 1.0,
        drift_scale: float = 1.0,
        prog_coeff_g_max_reference: Optional[float] = None,
    ):
        # The only valid options are 1 or 2 devices.
        # NOTE: asserts are kept (instead of raising ValueError) so that the
        # exception type seen by existing callers does not change.
        assert num_devices in [1, 2], "Hermes supports either 1 or 2 devices per weight"
        self.num_devices = num_devices

        # Figure out t0 depending on the num devices
        if t_0 is None:
            t_0 = 300.0 if self.num_devices == 2 else 200.0

        # Fix Gmax now: default to, and bound by, the characterized maximum
        if self.num_devices == 1:
            if g_max is not None:
                assert (
                    g_max <= 10.35
                ), "The maximum conductance characterized for single device unit cell is 10.35 uS"
            else:
                g_max = 10.35
        elif self.num_devices == 2:
            if g_max is not None:
                assert (
                    g_max <= 20.7
                ), "The maximum conductance characterized for two device unit cell is 20.7 uS"
            else:
                g_max = 20.7

        # A user supplied converter is copied; otherwise a single pair
        # converter with the selected g_max is created.
        g_converter = deepcopy(g_converter) or SinglePairConductanceConverter(g_max=g_max)
        super().__init__(g_converter)

        # The converter's own g_max (if it has one) takes precedence
        self.g_max = getattr(self.g_converter, "g_max", g_max)
        if self.g_max is None:
            raise ValueError("g_max cannot be established from g_converter")

        if prog_coeff is None:
            # standard g_max are defined in respect to 20.7 or 10.35 uS. Need to
            # adjust for that in case g_max is not equal to the characterized maximum value
            if self.num_devices == 2:
                self.prog_coeff = [0.16603222, 4.71806468, -8.48101252, 4.68961419]
                self.prog_coeff_g_max_reference = 20.7
            else:
                self.prog_coeff = [0.15781817, 2.32443916, -2.16310839, 0.68841818]
                self.prog_coeff_g_max_reference = 10.35
        else:
            self.prog_coeff = prog_coeff
            # Honor an explicitly given reference; previously it was dropped,
            # which left the attribute undefined for custom coefficients.
            if prog_coeff_g_max_reference is None:
                self.prog_coeff_g_max_reference = self.g_max
            else:
                self.prog_coeff_g_max_reference = prog_coeff_g_max_reference

        self.t_0 = t_0
        self.t_read = t_read
        self.prog_noise_scale = prog_noise_scale
        self.read_noise_scale = read_noise_scale
        self.drift_scale = drift_scale


[docs]
    @no_grad()
    def apply_programming_noise_to_conductance(self, g_target: Tensor) -> Tensor:
        """Apply programming noise to a target conductance Tensor.

        Programming noise with additive Gaussian noise with
        conductance dependency of the variance given by a 3-degree
        polynomial.
        """
        mat = 1
        sig_prog = self.prog_coeff[0]
        for coeff in self.prog_coeff[1:]:
            mat *= g_target / self.g_max
            sig_prog += mat * coeff

        sig_prog *= self.g_max / self.prog_coeff_g_max_reference  # type: ignore
        g_prog = g_target + self.prog_noise_scale * sig_prog * randn_like(g_target)
        g_prog.clamp_(min=0.0)  # no negative conductances allowed

        return g_prog



[docs]
    @no_grad()
    def generate_drift_coefficients(self, g_target: Tensor) -> Tensor:
        """Return drift coefficients ``nu`` based on PCM measurements.

        The mean and the standard deviation of ``nu`` are piecewise
        functions of ``|g_target| / g_max``: a clamped logarithmic fit below
        a threshold and a quadratic fit at or above it. The mean fit is the
        same for one and two devices; the standard deviation fit depends on
        ``num_devices``. One Gaussian sample is drawn per element.

        Args:
            g_target: target conductances.

        Returns:
            Tensor with the shape of ``g_target`` holding the absolute value
            of the sampled coefficients multiplied by ``drift_scale``.
        """
        # gt should be normalized wrt g_max; clipping keeps log() finite
        g_relative = clamp(torch_abs(g_target / self.g_max), min=_ZERO_CLIP)

        mu_drift, sig_drift = zeros_like(g_relative), zeros_like(g_relative)

        # Depending on the number of devices, different behavior is expected
        # for the standard deviation of the drift coefficient. The mean
        # behavior remains the same as it is considered that both devices
        # in the unit cell are programmed in the same state for simplicity.
        # The function for the mean and the standard deviation of
        # the ``nu`` factor are fitted with a branch function to match the
        # experimental data.
        #
        # Masks are applied as plain boolean indices. Wrapping a mask in a
        # list (``x[[mask]]``) relies on legacy sequence-as-tuple indexing.
        low_mean = g_relative < 0.0945
        high_mean = g_relative >= 0.0945
        g_rel_low_mean = g_relative[low_mean]
        g_rel_high_mean = g_relative[high_mean]
        mu_drift[low_mean] = (-0.0387 * log(g_rel_low_mean) - 0.0182).clamp(
            min=0.0720, max=0.13
        )
        mu_drift[high_mean] = (
            -0.0436 * g_rel_high_mean**2 - 0.0126 * g_rel_high_mean + 0.0736
        )

        if self.num_devices == 1:
            low_std = g_relative < 0.3039
            high_std = g_relative >= 0.3039
            g_rel_low_std = g_relative[low_std]
            g_rel_high_std = g_relative[high_std]
            sig_drift[low_std] = (-0.0120 * log(g_rel_low_std) - 0.0023).clamp(
                min=0.0124, max=0.04
            )
            sig_drift[high_std] = (
                -0.0165 * g_rel_high_std**2 + 0.0116 * g_rel_high_std + 0.0104
            )
        elif self.num_devices == 2:
            low_std = g_relative < 0.3055
            high_std = g_relative >= 0.3055
            g_rel_low_std = g_relative[low_std]
            g_rel_high_std = g_relative[high_std]
            sig_drift[low_std] = (-0.0117 * log(g_rel_low_std) - 0.0057).clamp(
                min=0.0091, max=0.04
            )
            sig_drift[high_std] = (
                -0.0118 * g_rel_high_std**2 + 0.0093 * g_rel_high_std + 0.0073
            )

        # abs() already guarantees non-negative coefficients
        nu_drift = torch_abs(mu_drift + sig_drift * randn_like(g_relative))

        return nu_drift * self.drift_scale



[docs]
    @no_grad()
    def apply_drift_noise_to_conductance(
        self, g_prog: Tensor, drift_noise_param: Tensor, t_inference: float
    ) -> Tensor:
        """Apply the noise and drift up to the assumed inference time
        point based on PCM measurements."""
        t = t_inference + self.t_0

        # drift
        if t > self.t_0:
            g_drift = g_prog * ((t / self.t_0) ** (-drift_noise_param))
        else:
            g_drift = g_prog

        # expected accumulated 1/f noise since start of programming at t=0
        if t > 0:
            g_relative = torch_abs(g_prog) / self.g_max
            q_s = zeros_like(g_prog)
            if self.num_devices == 1:
                g_rel_low, g_rel_high = (
                    g_relative[g_relative < 0.1591],
                    g_relative[[g_relative >= 0.1591]],
                )
                q_s[g_relative < 0.1591] = (-0.0078 * log(g_rel_low) + 0.0038).clamp(
                    min=0.0179, max=0.04
                )
                q_s[g_relative >= 0.1591] = (
                    0.0664 * g_rel_high**3 - 0.1352 * g_rel_high**2 + 0.0768 * g_rel_high + 0.0088
                )
            elif self.num_devices == 2:
                g_rel_low, g_rel_high = (
                    g_relative[g_relative < 0.16],
                    g_relative[[g_relative >= 0.16]],
                )
                q_s[g_relative < 0.16] = (-0.0117 * log(g_rel_low) - 0.0069).clamp(
                    min=0.015, max=0.04
                )
                q_s[g_relative >= 0.16] = (
                    0.0069 * g_rel_high**3 - 0.0280 * g_rel_high**2 + 0.0211 * g_rel_high + 0.0123
                )
            sig_noise = q_s * sqrt(numpy_log((t + self.t_read) / (2 * self.t_read)))
            g_final = g_drift + torch_abs(g_drift) * self.read_noise_scale * sig_noise * randn_like(
                g_prog
            )
        else:
            g_final = g_prog

        return g_final.clamp(min=0.0)