Skip to content

TensorRT Backend API

TensorRTBackend

aitune.torch.backend.tensorrt.TensorRTBackend

TensorRTBackend(config=None)

Bases: Backend, TensorRTRunner

TensorRT backend for model acceleration.

This class provides functionality to build and run TensorRT engines from PyTorch models. It handles the process of exporting models to ONNX and then converting them to TensorRT engines for optimized inference.

Initialize the TensorRT backend.

Parameters:

Source code in aitune/torch/backend/tensorrt/tensorrt_backend.py
def __init__(
    self,
    config: TensorRTBackendConfig | None = None,
):
    """Initialize the TensorRT backend.

    Args:
        config: Configuration for TensorRT backend
    """
    super().__init__()

    # System monitor used for tracking memory usage.
    self._system_monitor = SystemMonitor()

    self._config = config or TensorRTBackendConfig()

    # Generating profiles from samples needs more than one stored sample.
    if self._config.profiles == ProfileMode.SAMPLES_USED and global_config.max_num_samples_stored <= 1:
        raise ValueError(
            """aitune.torch.config.max_num_samples_stored is set to 1, change it to number of samples to use for profile generation.
            Example:
            from aitune.torch.config import config as global_config
            global_config.max_num_samples_stored = <number of samples>
            """
        )

    # Engine execution state, populated during build/activation.
    self._context = self._io_tensors = None
    self._output_names = self._input_names = None
    self._engine_info = self._cuda_stream = None
    self._start_time = self._end_time = self._outputs = None

    # Build artifacts.
    self._engine_path = self._trt_optimization_profiles_path = None
    self._output_object = self._graph_spec = None

    # Runtime helpers.
    self._output_allocator = None
    self._trt_runtime = None
    self._trt_optimization_profiles: list[Profile] = []

    # CUDA graph capture state (re-captured lazily, never serialized).
    self._cuda_graph = None
    self._last_input_shapes = None
    self._static_inputs = {}
    self._infer_cuda_graph = None

device property

device

Get the device of the backend.

Returns:

  • device

    The device the module is using.

is_active property

is_active

Returns True if the backend is active.

name property

name

Name of a backend.

activate

activate()

Activates backend.

After activating, the backend should be ready to do inference.

Source code in aitune/torch/backend/backend.py
@nvtx.annotate(domain="AITune", color="black")
def activate(self):
    """Bring the backend into the ACTIVE state so it can serve inference.

    Raises:
        RuntimeError: If the backend was never built, or is already deployed.
    """
    if self.state == BackendState.INIT:
        raise RuntimeError(f"Cannot activate backend {self.name}, backend should be built first")
    if self.state == BackendState.DEPLOYED:
        raise RuntimeError(f"Cannot activate backend {self.name}, backend is already deployed")

    if self.state in (BackendState.INACTIVE, BackendState.CHECKPOINT_LOADED):
        self._activate()
        self.state = BackendState.ACTIVE

build

build(module, graph_spec, data, device, cache_dir)

Build the model with the given arguments.

Building a backend should be idempotent i.e. do not cause side effects. A model is not necessarily pure functional and can have an internal state (like kv cache for LLMs). That is why build can call a sample of inputs at most once so that subsequent calls have exact same state as the first call for the given sample.

After building, the backend should be activated.

Source code in aitune/torch/backend/backend.py
def build(
    self,
    module: nn.Module,
    graph_spec: GraphSpec,
    data: list[Sample],
    device: torch.device,
    cache_dir: Path,
) -> "Backend":
    """Build the model with the given arguments.

    Building a backend should be idempotent i.e. do not cause side effects. A model is not necessarily pure
    functional and can have an internal state (like kv cache for LLMs). That is why build can call a sample of
    inputs at most once so that subsequent calls have exact same state as the first call for the given sample.

    After building, the backend should be activated.

    Args:
        module: The PyTorch module to build the backend for.
        graph_spec: Specification of the module's inputs/outputs.
        data: Samples available during the build.
        device: Target device; validated before the build starts.
        cache_dir: Directory where build artifacts may be cached.

    Returns:
        The built backend, ready for inference.

    Raises:
        RuntimeError: If build has already been called on this backend.
    """
    # Guard clause: build is a one-shot transition out of INIT.
    if self.state != BackendState.INIT:
        raise RuntimeError(f"Backend {self.name} build should be called only once")

    try:
        self._assert_device(device)
        self._set_device(device)
        ready_backend = self._build(module, graph_spec, data, cache_dir)
        self.state = BackendState.ACTIVE
        return ready_backend
    except Exception as e:
        self._logger.error("Failed to build backend(%s): %s", self.__class__.__name__, e, exc_info=True)
        # Bare raise re-raises the active exception without disturbing its traceback.
        raise

deactivate

deactivate()

Deactivates backend.

After deactivating, the backend cannot be used to do inference.

Source code in aitune/torch/backend/backend.py
def deactivate(self):
    """Deactivates backend.

    After deactivating, the backend cannot be used to do inference.

    Raises:
        RuntimeError: If the backend was never built, is already deployed,
            or was loaded from a checkpoint (such backends can only be deployed).
    """
    if self.state == BackendState.INIT:
        raise RuntimeError(f"Cannot deactivate backend {self.name}, backend should be built first")
    if self.state == BackendState.DEPLOYED:
        raise RuntimeError(f"Cannot deactivate backend {self.name}, backend is already deployed")
    if self.state == BackendState.CHECKPOINT_LOADED:
        # Message fix: this state means "loaded from a checkpoint", not "already deployed".
        raise RuntimeError(
            f"Cannot deactivate backend {self.name}, backend was loaded from a checkpoint and can only be deployed"
        )

    if self.state == BackendState.ACTIVE:
        self._deactivate()
        self._clean_memory()
        self.state = BackendState.INACTIVE

deploy

deploy(device)

Deploys the backend.

After deploying, the backend is ready to do inference. Backend cannot be deactivated anymore.

Parameters:

  • device (device | None) –

    The device to deploy the backend on.

Source code in aitune/torch/backend/backend.py
def deploy(self, device: torch.device | None):
    """Deploys the backend.

    After deploying, the backend is ready to do inference. Backend cannot be deactivated anymore.

    Args:
        device: The device to deploy the backend on.

    Raises:
        RuntimeError: If the backend was not loaded from a checkpoint first.
    """
    if self.state != BackendState.CHECKPOINT_LOADED:
        raise RuntimeError(f"Cannot deploy backend {self.name}, backend should be loaded from a checkpoint")

    # Bind the target device before running backend-specific deployment.
    self._set_device(device)
    self._deploy()
    self.state = BackendState.DEPLOYED

describe

describe()

Returns the description of the backend.

Source code in aitune/torch/backend/tensorrt/tensorrt_backend.py
def describe(self) -> str:
    """Return a human-readable description of this backend and its config."""
    return "{}({})".format(type(self).__name__, self._config.describe())

from_dict classmethod

from_dict(module, state_dict)

Creates a backend from a state_dict.

Source code in aitune/torch/backend/tensorrt/tensorrt_backend.py
@classmethod
def from_dict(cls, module: torch.nn.Module, state_dict: dict):
    """Creates a backend from a state_dict."""
    restored = cls()
    restored.state = BackendState.CHECKPOINT_LOADED
    restored._engine_path = state_dict[cls.STATE_ENGINE_PATH]
    restored._trt_optimization_profiles_path = state_dict.get(cls.STATE_TRT_OPTIMIZATION_PROFILES_PATH, [])
    restored._graph_spec = GraphSpec.from_dict(state_dict[cls.STATE_GRAPH_SPEC])
    restored._device = state_dict[cls.STATE_DEVICE]

    # Rebuild the config (including quantization settings) from its serialized form.
    restored._config = TensorRTBackendConfig.from_dict(state_dict[cls.STATE_CONFIG])

    restored._output_object = state_dict[cls.STATE_OUTPUT_OBJECT]

    # CUDA graphs cannot be serialized; the graph state stays None here,
    # which triggers a re-capture on first inference.
    if restored._config.use_cuda_graphs:
        logger.info("CUDA graphs were enabled in saved state, but will be re-captured on first inference")

    return restored

get_profiles

get_profiles(graph_spec, data)

Create profiles from samples or from graph_spec.

If self._config.profiles is a list, return the user provided profiles. If self._config.profiles is ProfileMode.SINGLE, create a single profile from the graph spec. If self._config.profiles is ProfileMode.SAMPLES_USED, create profiles from shapes seen in samples.

Parameters:

  • graph_spec (GraphSpec) –

    Input graph spec

  • data (list[Sample]) –

    List of samples

Returns: List of Polygraphy Profile objects

Source code in aitune/torch/backend/tensorrt/tensorrt_backend.py
def get_profiles(self, graph_spec: GraphSpec, data: list[Sample]) -> list[Profile]:
    """Resolve TensorRT optimization profiles for the engine build.

    Resolution order:
    - a user-supplied list in ``self._config.profiles`` is returned as-is;
    - ``ProfileMode.SINGLE`` derives one profile from the graph spec;
    - otherwise (``ProfileMode.SAMPLES_USED``) one profile is created per
      distinct input-shape combination observed in ``data``.

    Args:
        graph_spec: Input graph spec
        data: List of samples
    Returns:
        List of Polygraphy Profile objects
    """
    configured = self._config.profiles

    # Explicit user-supplied profiles take precedence.
    if isinstance(configured, list):
        return [entry.profile for entry in configured]

    if configured == ProfileMode.SINGLE:
        # Single profile derived from the graph spec.
        return self._get_profiles_from_shapes()

    # SAMPLES_USED: deduplicate profiles while preserving first-seen order.
    seen = OrderedDict()
    logger.info("Creating profiles from samples used for tuning")

    for index, (sample_args, sample_kwargs) in enumerate(data):
        candidate = TensorRTProfile()
        for locator, tensor_spec in graph_spec.input_spec.tensor_data:
            # Inputs named "args*" come from positional args, everything else from kwargs.
            source = sample_args if tensor_spec.name.startswith("args") else sample_kwargs
            shape = locator.get_value(source).shape
            candidate.add_input_shape(tensor_spec.name, shape, shape, shape)

        logger.debug("Created profile %d: %s", index, candidate)
        seen[candidate] = True

    return [entry.profile for entry in seen]

infer

infer(*args, **kwargs)

Run inference with the given arguments.

Parameters:

  • args (Any, default: () ) –

    Variable length argument list.

  • kwargs (Any, default: {} ) –

    Arbitrary keyword arguments.

Returns:

  • Any ( Any ) –

    The result of the inference.

Source code in aitune/torch/backend/backend.py
def infer(self, *args: Any, **kwargs: Any) -> Any:
    """Run inference with the given arguments.

    Args:
        args: Variable length argument list.
        kwargs: Arbitrary keyword arguments.

    Returns:
        Any: The result of the inference.

    Raises:
        RuntimeError: If the backend is neither active nor deployed.
    """
    if self.state not in (BackendState.ACTIVE, BackendState.DEPLOYED):
        raise RuntimeError(f"Cannot run inference, backend {self.name} should be activated first")

    return self._infer(*args, **kwargs)

key

key()

Returns the key of the backend.

Source code in aitune/torch/backend/tensorrt/tensorrt_backend.py
def key(self) -> str:
    """Return a unique key built from the class name and config key."""
    return "_".join((type(self).__name__, self._config.key()))

to_dict

to_dict()

Returns the state_dict of the backend.

Source code in aitune/torch/backend/tensorrt/tensorrt_backend.py
def to_dict(self):
    """Return the serializable checkpoint state of the backend as a dict."""
    # Built incrementally; insertion order matches the serialized layout.
    state = {self.STATE_TYPE: self.__class__.__name__}
    state[self.STATE_ENGINE_PATH] = self._engine_path
    state[self.STATE_OUTPUT_OBJECT] = self._output_object
    state[self.STATE_GRAPH_SPEC] = self._graph_spec.to_dict()
    state[self.STATE_DEVICE] = self._device
    state[self.STATE_QUANTIZATION_CONFIG] = self._config.quantization_config
    state[self.STATE_CONFIG] = self._config.to_dict()
    state[self.STATE_USE_CUDA_GRAPHS] = self._config.use_cuda_graphs
    state[self.STATE_TRT_OPTIMIZATION_PROFILES_PATH] = self._trt_optimization_profiles_path
    return state

TensorRTBackendConfig

aitune.torch.backend.tensorrt.TensorRTBackendConfig dataclass

TensorRTBackendConfig(use_dynamo=True, workspace_size=None, opset_version=None, optimization_level=None, compatibility_level=None, timing_cache=None, profiles=SINGLE, device='cuda', quantization_config=None, enable_tf32=True, use_cuda_graphs=False)

Bases: BackendConfig

Configuration for TensorRT backend.

Attributes:

  • use_dynamo (bool) –

    Whether to use torch.dynamo for export.

  • workspace_size (int | None) –

    The workspace size for the TensorRT engine.

  • opset_version (int | None) –

    The ONNX opset version to use for export.

  • optimization_level (int | None) –

    The optimization level for the TensorRT engine.

  • compatibility_level (int | None) –

    The compatibility level for the TensorRT engine.

  • timing_cache (Path | None) –

    The path to the timing cache for the TensorRT engine.

  • profiles (ProfileMode | list[TensorRTProfile]) –

    How TensorRT optimization profiles are generated. - SINGLE: auto-generate a single profile from the graph spec (default). - SAMPLES_USED: auto-generate multiple profiles from shapes of samples used for tuning. - list[TensorRTProfile]: use user-provided profiles directly.

  • device (str) –

    The device to use for the TensorRT engine.

  • quantization_config (ONNXAutoCastConfig | ONNXQuantizationConfig | TorchQuantizationConfig | None) –

    The quantization configuration for the TensorRT engine.

  • enable_tf32 (bool) –

    Whether to enable TF32 hardware acceleration.

  • use_cuda_graphs (bool) –

    Whether to use CUDA graphs for the TensorRT engine.

describe

describe()

Describe the backend configuration. Display only changed fields.

Source code in aitune/torch/backend/backend.py
def describe(self) -> str:
    """Describe the backend configuration. Display only changed fields."""
    # Compare against a default-constructed config to find overridden fields.
    baseline = type(self)()
    return ",".join(self._get_changed_fields(self, baseline))

from_dict classmethod

from_dict(state_dict)

Convert dict to TensorRTBackendConfig.

Source code in aitune/torch/backend/tensorrt/tensorrt_backend.py
@classmethod
def from_dict(cls, state_dict: dict):
    """Convert dict to TensorRTBackendConfig.

    Args:
        state_dict: Serialized configuration fields; the "profiles" entry,
            if present, is deserialized via :meth:`profiles_from_dict`.

    Returns:
        A TensorRTBackendConfig built from the given fields.
    """
    # Work on a shallow copy so the caller's dict is not mutated as a side effect.
    state_dict = dict(state_dict)
    if "profiles" in state_dict:
        state_dict["profiles"] = cls.profiles_from_dict(state_dict["profiles"])
    return cls(**state_dict)

key

key()

Returns the keys of the backend configuration.

Source code in aitune/torch/backend/backend.py
def key(self) -> str:
    """Returns the keys of the backend configuration."""
    # Hash a canonical JSON serialization of the config.
    serializable = self._to_json(self.to_dict())
    return hash_string(json.dumps(serializable))

profiles_from_dict classmethod

profiles_from_dict(data)

Convert dict to list of TensorRTProfile.

Source code in aitune/torch/backend/tensorrt/tensorrt_backend.py
@classmethod
def profiles_from_dict(cls, data: str | list[dict]) -> ProfileMode | list[TensorRTProfile]:
    """Convert dict to list of TensorRTProfile."""
    # A plain string names a ProfileMode; a list holds serialized profiles.
    if not isinstance(data, list):
        return ProfileMode(data)
    return [TensorRTProfile.from_dict(entry) for entry in data]

to_dict

to_dict()

Convert TensorRTBackendConfig to dictionary.

Source code in aitune/torch/backend/tensorrt/tensorrt_backend.py
def to_dict(self) -> dict:
    """Convert TensorRTBackendConfig to dictionary."""
    state = asdict(self)
    # User-provided TensorRTProfile objects are not dataclasses, so asdict
    # leaves them untouched; serialize them explicitly.
    if isinstance(self.profiles, list):
        state["profiles"] = [TensorRTProfile.profile_to_dict(wrapper.profile) for wrapper in state["profiles"]]
    return state

to_json

to_json(path)

Saves the backend configuration to a file.

Source code in aitune/torch/backend/backend.py
def to_json(self, path: Path):
    """Saves the backend configuration to a file."""
    serializable = self._to_json(self.to_dict())
    with open(path, "w") as f:
        json.dump(serializable, f, indent=2)

TensorRTProfile

aitune.torch.backend.tensorrt.TensorRTProfile

TensorRTProfile()

Class for representing a TensorRT optimization profile.

This class provides an interface for defining optimization profiles for TensorRT engines with dynamic shapes.

Initialize a TensorRT optimization profile.

Source code in aitune/torch/backend/tensorrt/tensorrt_profile.py
def __init__(self):
    """Initialize a TensorRT optimization profile.

    Wraps an empty Polygraphy Profile; shape bindings are added later
    via add_input_shape.
    """
    self._profile = Profile()

profile property

profile

Get the underlying Polygraphy Profile.

Returns:

  • Profile

    The Polygraphy Profile object

__eq__

__eq__(other)

Check if two TensorRTProfiles are equal.

Source code in aitune/torch/backend/tensorrt/tensorrt_profile.py
def __eq__(self, other: "TensorRTProfile") -> bool:
    """Check if two TensorRTProfiles are equal."""
    return hash(self) == hash(other)

__hash__

__hash__()

Hash the TensorRTProfile.

Source code in aitune/torch/backend/tensorrt/tensorrt_profile.py
def __hash__(self) -> int:
    """Hash the TensorRTProfile."""
    return hash(
        tuple(
            sorted(
                ((name, min_, opt_, max_) for name, (min_, opt_, max_) in self._profile.items()), key=lambda x: x[0]
            )
        )
    )

__repr__

__repr__()

Return the official string representation of the profile.

Returns:

  • str

    Official string representation

Source code in aitune/torch/backend/tensorrt/tensorrt_profile.py
def __repr__(self) -> str:
    """Return the official string representation of the profile.

    Returns:
        Official string representation
    """
    return repr(self._profile)

__str__

__str__()

Return string representation of the profile.

Returns:

  • str

    String representation

Source code in aitune/torch/backend/tensorrt/tensorrt_profile.py
def __str__(self) -> str:
    """Return string representation of the profile.

    Delegates to the wrapped Polygraphy Profile's string form.

    Returns:
        String representation
    """
    return str(self._profile)

add_input_shape

add_input_shape(name, min_shape, opt_shape, max_shape)

Add a shape binding to the profile.

Parameters:

  • name (str) –

    The name of the input tensor

  • min_shape (tuple[int, ...]) –

    The minimum shape the profile will support

  • opt_shape (tuple[int, ...]) –

    The shape for which TensorRT will tune the engine

  • max_shape (tuple[int, ...]) –

    The maximum shape the profile will support

Returns:

Source code in aitune/torch/backend/tensorrt/tensorrt_profile.py
def add_input_shape(
    self, name: str, min_shape: tuple[int, ...], opt_shape: tuple[int, ...], max_shape: tuple[int, ...]
) -> "TensorRTProfile":
    """Register a (min, opt, max) shape binding for one input tensor.

    Args:
        name: The name of the input tensor
        min_shape: The minimum shape the profile will support
        opt_shape: The shape for which TensorRT will tune the engine
        max_shape: The maximum shape the profile will support

    Returns:
        The profile object for chaining
    """
    self._profile.add(name=name, min=min_shape, opt=opt_shape, max=max_shape)
    logger.debug("Added profile for input '%s': min=%s, opt=%s, max=%s", name, min_shape, opt_shape, max_shape)
    return self

from_dict classmethod

from_dict(data)

Create TensorRTProfile from dictionary.

Source code in aitune/torch/backend/tensorrt/tensorrt_profile.py
@classmethod
def from_dict(cls, data: dict) -> "TensorRTProfile":
    """Build a TensorRTProfile from its dictionary form."""
    instance = cls()
    for name, (lo, opt, hi) in data.items():
        instance.add_input_shape(name, tuple(lo), tuple(opt), tuple(hi))
    return instance

profile_to_dict classmethod

profile_to_dict(profile)

Convert Polygraphy Profile to dictionary.

Source code in aitune/torch/backend/tensorrt/tensorrt_profile.py
@classmethod
def profile_to_dict(cls, profile: Profile) -> dict:
    """Convert Polygraphy Profile to dictionary."""
    # Each binding becomes a JSON-friendly [min, opt, max] list.
    return {name: [lo, opt, hi] for name, (lo, opt, hi) in profile.items()}

to_dict

to_dict()

Convert TensorRTProfile to dictionary.

Source code in aitune/torch/backend/tensorrt/tensorrt_profile.py
def to_dict(self) -> dict:
    """Convert this TensorRTProfile to a dictionary.

    Returns:
        Mapping of input name to [min, opt, max] shape lists.
    """
    return self.profile_to_dict(self._profile)

ProfileMode

aitune.torch.backend.tensorrt.ProfileMode

Bases: Enum

Mode how TRT optimization profiles will be generated for TensorRT engine.

Attributes:

  • SINGLE

    auto-generate single profile from graph spec, default mode.

  • SAMPLES_USED

    auto-generate multiple profiles from shapes of samples used for tuning.