Source code for aihwkit.experiments.experiments.inferencing

# -*- coding: utf-8 -*-

# (C) Copyright 2020, 2021, 2022, 2023, 2024 IBM. All Rights Reserved.
#
# This code is licensed under the Apache License, Version 2.0. You may
# obtain a copy of this license in the LICENSE.txt file in the root directory
# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
#
# Any modifications or derivative works of this code must retain this
# copyright notice, and modified files need to carry a notice indicating
# that they have been altered from the originals.

"""Basic inferencing Experiment."""

# pylint: disable=too-many-locals

from typing import Any, Dict, Tuple, Type, Optional
from os import path, mkdir
from copy import deepcopy
from requests import get as requests_get
from numpy import ndarray, array, logspace, log10, zeros, concatenate


from torch import device as torch_device, max as torch_max, Tensor
from torch import load
from torch.nn import Module, CrossEntropyLoss
from torch.nn.modules.loss import _Loss
from torch.utils.data import DataLoader, Dataset, Subset
from torchvision.datasets import FashionMNIST, SVHN
from torchvision.transforms import Compose, Normalize, ToTensor

from aihwkit.experiments.experiments.base import Experiment, Signals
from aihwkit.nn.modules.base import AnalogLayerBase
from aihwkit.utils.legacy import convert_legacy_checkpoint


WEIGHT_TEMPLATE_URL = "https://github.com/IBM-AI-Hardware-Center/Composer/raw/main/"


def download(url: str, destination: str) -> None:
    """Helper for downloading a file from url."""
    response = requests_get(url, timeout=30.0)
    with open(destination, "wb") as file_:
        file_.write(response.content)
class BasicInferencing(Experiment):
    """Experiment for inferencing a neural network.

    ``Experiment`` that represents inferencing a neural network using a basic
    inferencing loop.

    This class contains:

    * the data needed for an experiment. The recommended way of setting this
      data is via the arguments of the constructor. Additionally, some of the
      items have getters that are used by the ``Workers`` that execute the
      experiments and by the inferencing loop.
    * the inferencing algorithm, with the main entry point being ``run()``.

    Note:
        When executing a ``BasicInferencing`` in the cloud, additional
        constraints are applied to the data. For example, the model is
        restricted to sequential layers of specific types; the dataset choices
        are limited, etc. Please check the ``CloudRunner`` documentation.
    """

    def __init__(
        self,
        dataset: Type[Dataset],
        model: Module,
        batch_size: int = 10,
        loss_function: type = CrossEntropyLoss,
        weight_template_id: str = "",
        inference_repeats: int = 2,
        inference_time: int = 86400,
        remap_weights: bool = True,
    ):
        """Create a new ``BasicInferencing``.

        Args:
            dataset: the dataset class to be used.
            model: the neural network to use for inferencing.
            batch_size: the batch size used for inferencing.
            loss_function: the loss function used in the neural network.
            weight_template_id: location or identifier of the checkpoint that
                holds the weights and biases of the trained neural network.
            inference_repeats: the number of times the inference is repeated.
            inference_time: the time span between programming the chip and
                performing the inference.
            remap_weights: whether to remap the analog weights after loading.
        """
        self.dataset = dataset
        self.model = model
        self.batch_size = batch_size
        self.loss_function = loss_function
        self.inference_repeats = inference_repeats
        self.inference_time = inference_time
        self.weight_template_id = weight_template_id
        self.remap_weights = remap_weights
        super().__init__()
    def get_dataset_arguments(self, dataset: type) -> Tuple[Dict, Dict]:
        """Return the dataset constructor arguments for specifying the subset."""
        if dataset in (SVHN,):
            return {"split": "train"}, {"split": "test"}
        return {"train": True}, {"train": False}
    def get_dataset_transform(self, dataset: type) -> Any:
        """Return the dataset transform."""
        # Normalize supported datasets.
        if dataset == FashionMNIST:
            # mean = Tensor([0.2860])
            # std_dev = Tensor([0.3205])
            # transform = Compose([ToTensor(), Normalize(mean, std_dev)])
            # Note: the normalization step is omitted to match the preprocessing
            # used when the pretrained weight file was generated.
            transform = Compose([ToTensor()])
        elif dataset == SVHN:
            mean = Tensor([0.4377, 0.4438, 0.4728])
            std_dev = Tensor([0.1980, 0.2010, 0.1970])
            transform = Compose([ToTensor(), Normalize(mean, std_dev)])
        else:
            transform = Compose([ToTensor()])

        return transform
    def get_data_loader(
        self,
        dataset: type,
        batch_size: int,
        max_elements: int = 0,
        dataset_root: str = "/tmp/datasets",
    ) -> DataLoader:
        """Return a `DataLoader` for the validation set of the selected dataset.

        Args:
            dataset: the dataset class to be used.
            batch_size: the batch size used for inferencing.
            max_elements: the maximum number of elements of the dataset to be
                used. If ``0``, the full dataset is used.
            dataset_root: the path to the folder where the files from the
                dataset are stored.

        Returns:
            The validation data loader.
        """
        # Create the validation set and its loader.
        _, test_args = self.get_dataset_arguments(dataset)
        transform = self.get_dataset_transform(dataset)

        validation_set = dataset(dataset_root, transform=transform, **test_args)
        if max_elements > 0:
            validation_set = Subset(validation_set, range(max_elements))

        validation_loader = DataLoader(validation_set, batch_size=batch_size, shuffle=False)

        return validation_loader
    def get_model(self, weight_template_id: str, device: torch_device) -> Module:
        """Return a copy of the experiment model with the weights and biases loaded.

        Args:
            weight_template_id: location/index of the file that contains the
                state_dict for the model.
            device: the torch device used for the model.

        Returns:
            A copy of the model with the weights and biases loaded.
        """
        model = deepcopy(self.model)

        if weight_template_id != "":
            if weight_template_id[0:1] in (".", "/"):
                # The template is a local file.
                template_path = weight_template_id
            else:
                template_dir = "/tmp/weight_templates"
                if weight_template_id.startswith("http"):
                    template_url = weight_template_id
                    # Cache the download under the URL's file name.
                    template_path = template_dir + "/" + path.basename(weight_template_id)
                else:
                    template_path = template_dir + "/" + weight_template_id + ".pth"
                    template_url = WEIGHT_TEMPLATE_URL + weight_template_id + ".pth"

                # Download the checkpoint if it is not cached yet.
                if not path.exists(template_dir):
                    mkdir(template_dir)
                if not path.exists(template_path):
                    download(template_url, template_path)

            if path.exists(template_path):
                state_dict = load(template_path, map_location=device)
                state_dict, _ = convert_legacy_checkpoint(state_dict, model)
                model.load_state_dict(state_dict, load_rpu_config=False)
            else:
                print("Checkpoint file: ", template_path, " does not exist.")

        if self.remap_weights:
            model.remap_analog_weights()

        return model.to(device)
    def inferencing_step(
        self,
        validation_loader: DataLoader,
        model: Module,
        loss_function: _Loss,
        t_inference_list: list,
        device: torch_device,
    ) -> Tuple[ndarray, ndarray, ndarray]:
        """Run a single inference pass for each of the given inference times.

        Args:
            validation_loader: the data loader for the inferencing data.
            model: the neural network to be evaluated.
            loss_function: the loss function used for inferencing.
            t_inference_list: the list of inference times (in seconds).
            device: the torch device used for the model.

        Returns:
            Tuple of ndarrays with the inference accuracy, error and loss.
        """
        # Set the model to evaluation mode.
        model.eval()

        # Program the analog weights.
        model.program_analog_weights()

        n_inference = len(t_inference_list)
        infer_accuracy = zeros(n_inference)
        infer_error = zeros(n_inference)
        infer_loss = zeros(n_inference)

        # Simulate the inference pass at different times after programming.
        for idx, t_inference in enumerate(t_inference_list):
            # Apply drift to the analog weights for the given inference time.
            model.drift_analog_weights(t_inference)

            predicted_ok = 0
            total_images = 0
            total_loss = 0

            # Go through the images in the validation dataset.
            for images, labels in validation_loader:
                # Move the images and labels to the device memory.
                images = images.to(device)
                labels = labels.to(device)

                # Predict the labels for the images using the model.
                predict = model(images)

                # Calculate the loss and accumulate it into total_loss.
                loss = loss_function(predict, labels)
                n_images = images.size(0)
                total_images += n_images
                total_loss += loss.item() * n_images

                _, predicted = torch_max(predict.data, 1)
                predicted_ok += (predicted == labels).sum().item()

            # Store the metrics for this inference time.
            accuracy_post = predicted_ok / total_images * 100.0
            infer_accuracy[idx] = accuracy_post
            infer_error[idx] = 100.0 - accuracy_post
            infer_loss[idx] = total_loss / total_images

        return infer_accuracy, infer_error, infer_loss
    def inference(
        self,
        validation_loader: DataLoader,
        model: Module,
        loss_function: _Loss,
        inference_repeats: int,
        inference_time: int,
        device: torch_device,
        n_inference_times: int = 10,
    ) -> Dict:
        """Run the inferencing loop.

        Args:
            validation_loader: the data loader for the validation data.
            model: the neural network to be evaluated.
            loss_function: the loss function used for inferencing.
            inference_repeats: the number of times to repeat the process of
                programming and drifting.
            inference_time: the time span between programming the chip and
                performing the inference.
            device: the torch device used for the model.
            n_inference_times: the number of (log-spaced) inference times.

        Returns:
            A dictionary with the metrics averaged over the repeats.
        """
        # Move the model to the device if needed.
        if device:
            model = model.to(device)

        # Create the t_inference_list from inference_time: a leading 0 followed
        # by n_inference_times - 1 log-spaced values between 1 and inference_time.
        t_inference_list = [0.0] + logspace(
            0, log10(float(inference_time)), n_inference_times - 1
        ).tolist()

        repeat_results = {}
        accuracy_array = array([], "float")
        error_array = array([], "float")
        loss_array = array([], "float")

        for repeat in range(inference_repeats):
            self._call_hook(Signals.INFERENCE_REPEAT_START, repeat)

            infer_accuracy, infer_error, infer_loss = self.inferencing_step(
                validation_loader, model, loss_function, t_inference_list, device
            )

            # Collect the metrics of this repeat.
            accuracy_array = concatenate([accuracy_array, infer_accuracy])  # type: ignore
            error_array = concatenate([error_array, infer_error])  # type: ignore
            loss_array = concatenate([loss_array, infer_loss])  # type: ignore

            # Call the metric hook with the averaged information so that the
            # partial result is written to standard output.
            shape = (repeat + 1, n_inference_times)
            repeat_results = self._call_hook(
                Signals.INFERENCE_REPEAT_END,
                array(t_inference_list),
                accuracy_array.reshape(shape).mean(axis=0),
                accuracy_array.reshape(shape).std(axis=0),
                error_array.reshape(shape).mean(axis=0),
                loss_array.reshape(shape).mean(axis=0),
                self.inference_repeats,
            )

        return deepcopy(repeat_results)
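    # For the default arguments (inference_time=86400 seconds, i.e. one day, and
    # n_inference_times=10), the generated list is approximately, in seconds and
    # rounded: [0, 1, 4.1, 17.1, 71, 294, 1217, 5039, 20867, 86400], that is, a
    # leading 0 followed by nine log-spaced inference times.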
    def _print_rpu_fields(self, model: Module) -> None:
        """Print the Inference RPU Config fields."""
        print("\n>>> inferenceworker.py: STARTING _print_rpu_fields() ")

        for name, module in model.named_modules():
            if not isinstance(module, AnalogLayerBase):
                continue

            print(f"RPUConfig of module {name}:")
            tile = next(module.analog_tiles())
            print(tile.rpu_config)
            print(tile.tile)
            print("-------------")

        print("\n>>> inferenceworker.py: ENDING _print_rpu_fields() ")
    def run(
        self,
        max_elements: int = 0,
        dataset_root: str = "/tmp/data",
        device: Optional[torch_device] = None,
    ) -> Dict:
        """Set up the internal model and run the inference.

        The results are returned and the internal model is updated.
        """
        # Build the objects needed for inferencing.

        # Get the validation data loader.
        validation_loader = self.get_data_loader(
            self.dataset, self.batch_size, max_elements=max_elements, dataset_root=dataset_root
        )

        # Load the weights and biases into the model.
        # Assumption: the model already includes the customer-specified InferenceRPUConfig.
        model = self.get_model(self.weight_template_id, device)
        self._print_rpu_fields(model)

        # Invoke the inference step.
        result = self.inference(
            validation_loader,
            model,
            self.loss_function(),
            self.inference_repeats,
            self.inference_time,
            device,
        )

        # Update the stored model with the one used for inference.
        self.model = model
        return result
    def __str__(self) -> str:
        """Return a string representation of a BasicInferencing experiment."""
        return (
            "{}(dataset={}, batch_size={}, loss_function={}, inference_repeats={}, "
            "inference_time={}, model={})".format(
                self.__class__.__name__,
                getattr(self.dataset, "__name__", self.dataset),
                self.batch_size,
                getattr(self.loss_function, "__name__", self.loss_function),
                self.inference_repeats,
                self.inference_time,
                self.model,
            )
        )
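

if __name__ == "__main__":
    # Minimal usage sketch (illustration only, not part of the experiment API):
    # it builds a small analog model with ``convert_to_analog`` and runs the
    # inference experiment on FashionMNIST. The network architecture and the
    # commented-out checkpoint path below are assumptions; the FashionMNIST test
    # files are expected to already exist under the default dataset_root
    # (/tmp/data).
    from torch.nn import Flatten, Linear, ReLU, Sequential

    from aihwkit.nn.conversion import convert_to_analog
    from aihwkit.simulator.configs import InferenceRPUConfig

    # Convert a plain torch network into its analog counterpart.
    analog_model = convert_to_analog(
        Sequential(Flatten(), Linear(784, 256), ReLU(), Linear(256, 10)),
        InferenceRPUConfig(),
    )

    experiment = BasicInferencing(
        dataset=FashionMNIST,
        model=analog_model,
        batch_size=50,
        inference_repeats=2,
        inference_time=86400,
        # weight_template_id="./fashion_mnist_fc.pth",  # hypothetical local checkpoint
    )

    # Evaluate on a subset of the validation set; metrics are averaged over the repeats.
    metrics = experiment.run(max_elements=1000, device=torch_device("cpu"))
    print(metrics)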