push sheeet

2025-10-09 14:15:47 +02:00
commit 646b892680
49168 changed files with 5897842 additions and 0 deletions
--- a/pkgs/development/cuda-modules/packages/autoAddCudaCompatRunpath/auto-add-cuda-compat-runpath.sh
+++ b/pkgs/development/cuda-modules/packages/autoAddCudaCompatRunpath/auto-add-cuda-compat-runpath.sh
@@ -0,0 +1,27 @@
+# shellcheck shell=bash
+# Patch all dynamically linked ELF files to use the CUDA driver (libcuda.so)
+# from the cuda_compat package by prepending its directory to the RUNPATH.
+echo "Sourcing auto-add-cuda-compat-runpath-hook"
+
+# Prepend the cuda_compat library directory to the RUNPATH of the ELF file "$1".
+# Exits with status 1 when the argument is missing, repeated, or empty.
+addCudaCompatRunpath() {
+  local libPath origRpath
+  if [[ $# -eq 0 ]]; then
+    echo "addCudaCompatRunpath: no library path provided" >&2
+    exit 1
+  elif [[ $# -gt 1 ]]; then
+    echo "addCudaCompatRunpath: too many arguments" >&2
+    exit 1
+  elif [[ "$1" == "" ]]; then
+    echo "addCudaCompatRunpath: empty library path" >&2
+    exit 1
+  fi
+  libPath="$1"
+  origRpath="$(patchelf --print-rpath "$libPath")"
+  # Join with ":" only when a RUNPATH already exists; an unconditional colon
+  # leaves an empty entry that the loader may resolve against the current directory.
+  patchelf --set-rpath "@libcudaPath@${origRpath:+:$origRpath}" "$libPath"
+}
+
+postFixupHooks+=("autoFixElfFiles addCudaCompatRunpath")
--- a/pkgs/development/cuda-modules/packages/autoAddCudaCompatRunpath/package.nix
+++ b/pkgs/development/cuda-modules/packages/autoAddCudaCompatRunpath/package.nix
@@ -0,0 +1,29 @@
+# autoAddCudaCompatRunpath hook must be added AFTER `setupCudaHook`. Both
+# hooks prepend a path with `libcuda.so` to the `DT_RUNPATH` section of
+# patched elf files, but `cuda_compat` path must take precedence (otherwise,
+# it doesn't have any effect) and thus appear first. Meaning this hook must be
+# executed last.
+{
+  autoFixElfFiles,
+  cuda_compat,
+  makeSetupHook,
+}:
+let
+  # `cuda_compat` is `null` in releases that predate it, and it may carry
+  # `badPlatforms` for `!isJetsonBuild`; `or` supplies a fallback for both.
+  supportedPlatforms = cuda_compat.meta.platforms or [ ];
+  # Without an explicit `badPlatforms`, every supported platform counts as bad.
+  unsupportedPlatforms = cuda_compat.meta.badPlatforms or supportedPlatforms;
+in
+makeSetupHook {
+  name = "auto-add-cuda-compat-runpath-hook";
+  propagatedBuildInputs = [ autoFixElfFiles ];
+
+  # Value substituted for `libcudaPath` in the hook script.
+  substitutions.libcudaPath = "${cuda_compat}/compat";
+
+  meta = {
+    platforms = supportedPlatforms;
+    badPlatforms = unsupportedPlatforms;
+  };
+} ./auto-add-cuda-compat-runpath.sh
--- a/pkgs/development/cuda-modules/packages/backendStdenv.nix
+++ b/pkgs/development/cuda-modules/packages/backendStdenv.nix
@@ -0,0 +1,154 @@
+# This is what nvcc uses as a backend,
+# and it has to be an officially supported one (e.g. gcc14 for cuda12).
+#
+# It, however, propagates current stdenv's libstdc++ to avoid "GLIBCXX_* not found errors"
+# when linked with other C++ libraries.
+# E.g. for cudaPackages_12_9 we use gcc14 with gcc's libstdc++
+# Cf. https://github.com/NixOS/nixpkgs/pull/218265 for context
+{
+  config,
+  _cuda,
+  cudaMajorMinorVersion,
+  lib,
+  pkgs,
+  stdenv,
+  stdenvAdapters,
+}:
+let
+  inherit (builtins) toJSON;
+  inherit (_cuda.db) allSortedCudaCapabilities cudaCapabilityToInfo nvccCompatibilities;
+  inherit (_cuda.lib)
+    _cudaCapabilityIsDefault
+    _cudaCapabilityIsSupported
+    _evaluateAssertions
+    getRedistSystem
+    mkVersionedName
+    ;
+  inherit (lib) addErrorContext;
+  inherit (lib.customisation) extendDerivation;
+  inherit (lib.lists) filter intersectLists subtractLists;
+
+  # NOTE: By virtue of processing a sorted list (allSortedCudaCapabilities), our groups will be sorted.
+
+  # Keep the known capabilities (in their sorted order) whose info record
+  # satisfies `predicate`.
+  capabilitiesWhere =
+    predicate:
+    filter (capability: predicate cudaCapabilityToInfo.${capability}) allSortedCudaCapabilities;
+
+  architectureSpecificCudaCapabilities = capabilitiesWhere (info: info.isArchitectureSpecific);
+
+  familySpecificCudaCapabilities = capabilitiesWhere (info: info.isFamilySpecific);
+
+  jetsonCudaCapabilities = capabilitiesWhere (info: info.isJetson);
+
+  passthruExtra = {
+    # True when nvcc's host compiler is the same compiler the ambient stdenv uses.
+    nvccHostCCMatchesStdenvCC = stdenv.cc == backendStdenv.cc;
+
+    # Nix system string of the host platform.
+    hostNixSystem = stdenv.hostPlatform.system;
+
+    # Redistributable system for the host platform; depends on whether Jetson is targeted.
+    hostRedistSystem = getRedistSystem passthruExtra.hasJetsonCudaCapability stdenv.hostPlatform.system;
+
+    # Whether packages are built with forward compatibility (enabled unless configured otherwise).
+    # TODO(@connorbaker): If the requested CUDA capabilities are not supported by the current CUDA version,
+    # should we throw an evaluation warning and build with forward compatibility?
+    cudaForwardCompat = config.cudaForwardCompat or true;
+
+    # Every known capability that the current CUDA version supports.
+    supportedCudaCapabilities = filter (
+      capability: _cudaCapabilityIsSupported cudaMajorMinorVersion cudaCapabilityToInfo.${capability}
+    ) allSortedCudaCapabilities;
+
+    # The supported capabilities that are defaults for this CUDA version
+    # (baseline capabilities only).
+    defaultCudaCapabilities = filter (
+      capability: _cudaCapabilityIsDefault cudaMajorMinorVersion cudaCapabilityToInfo.${capability}
+    ) passthruExtra.supportedCudaCapabilities;
+
+    # Capabilities in effect: the configured list, or the defaults when that
+    # list is absent or empty.
+    cudaCapabilities =
+      if config.cudaCapabilities or [ ] == [ ] then
+        passthruExtra.defaultCudaCapabilities
+      else
+        config.cudaCapabilities;
+
+    # The capabilities in effect that are architecture-specific.
+    requestedArchitectureSpecificCudaCapabilities = intersectLists architectureSpecificCudaCapabilities passthruExtra.cudaCapabilities;
+
+    # True when at least one architecture-specific capability is in effect.
+    hasArchitectureSpecificCudaCapability =
+      passthruExtra.requestedArchitectureSpecificCudaCapabilities != [ ];
+
+    # The capabilities in effect that are family-specific.
+    requestedFamilySpecificCudaCapabilities = intersectLists familySpecificCudaCapabilities passthruExtra.cudaCapabilities;
+
+    # True when at least one family-specific capability is in effect.
+    hasFamilySpecificCudaCapability = passthruExtra.requestedFamilySpecificCudaCapabilities != [ ];
+
+    # The capabilities in effect that are Jetson capabilities.
+    requestedJetsonCudaCapabilities = intersectLists jetsonCudaCapabilities passthruExtra.cudaCapabilities;
+
+    # True when at least one Jetson capability is in effect.
+    hasJetsonCudaCapability = passthruExtra.requestedJetsonCudaCapabilities != [ ];
+  };
+
+  # Checks on the resolved capabilities, passed to `_evaluateAssertions`; each entry pairs a condition with a message.
+  assertions =
+    let
+      # Jetson devices cannot be targeted by the same binaries which target non-Jetson devices. While
+      # NVIDIA provides both `linux-aarch64` and `linux-sbsa` packages, which both target `aarch64`,
+      # they are built with different settings and cannot be mixed.
+      jetsonMessagePrefix = "Jetson CUDA capabilities (${toJSON passthruExtra.requestedJetsonCudaCapabilities})";
+
+      # Remove all known capabilities from the user's list to find unrecognized capabilities.
+      unrecognizedCudaCapabilities = subtractLists allSortedCudaCapabilities passthruExtra.cudaCapabilities;
+
+      # Remove all supported capabilities from the user's list to find unsupported capabilities.
+      unsupportedCudaCapabilities = subtractLists passthruExtra.supportedCudaCapabilities passthruExtra.cudaCapabilities;
+    in
+    [
+      {
+        message = "Unrecognized CUDA capabilities: ${toJSON unrecognizedCudaCapabilities}";
+        assertion = unrecognizedCudaCapabilities == [ ];
+      }
+      {
+        message = "Unsupported CUDA capabilities: ${toJSON unsupportedCudaCapabilities}";
+        assertion = unsupportedCudaCapabilities == [ ];
+      }
+      {
+        message =
+          "${jetsonMessagePrefix} require hostPlatform (currently ${passthruExtra.hostNixSystem}) to be aarch64-linux";
+        assertion = passthruExtra.hasJetsonCudaCapability -> passthruExtra.hostNixSystem == "aarch64-linux";
+      }
+      {
+        message =
+          let
+            # Everything requested that is not a Jetson capability. Architecture- and
+            # family-specific capabilities are deliberately kept: the assertion below
+            # rejects them as well, so hiding them could yield an empty, misleading list.
+            requestedNonJetsonCudaCapabilities =
+              subtractLists passthruExtra.requestedJetsonCudaCapabilities
+                passthruExtra.cudaCapabilities;
+          in
+          "${jetsonMessagePrefix} cannot be specified with non-Jetson capabilities "
+          + "(${toJSON requestedNonJetsonCudaCapabilities})";
+        assertion =
+          passthruExtra.hasJetsonCudaCapability
+          -> passthruExtra.requestedJetsonCudaCapabilities == passthruExtra.cudaCapabilities;
+      }
+    ];
+
+  # Result of evaluating `assertions`, wrapped in an error context that names this package set.
+  assertCondition =
+    addErrorContext "while evaluating ${mkVersionedName "cudaPackages" cudaMajorMinorVersion}.backendStdenv" (_evaluateAssertions assertions);
+
+  # GCC stdenv selected via `nvccCompatibilities` for this CUDA version, using libraries from `stdenv`.
+  backendStdenv = stdenvAdapters.useLibsFrom stdenv (
+    pkgs.${"gcc${nvccCompatibilities.${cudaMajorMinorVersion}.gcc.maxMajorVersion}Stdenv"});
+in
+# TODO: Consider testing whether we in fact use the newer libstdc++
+extendDerivation assertCondition passthruExtra backendStdenv
--- a/pkgs/development/cuda-modules/packages/cudnn-frontend/0001-cmake-float-out-common-python-bindings-option.patch
+++ b/pkgs/development/cuda-modules/packages/cudnn-frontend/0001-cmake-float-out-common-python-bindings-option.patch
@@ -0,0 +1,30 @@
+From eeef96e91bd3453160315bf4618b7b91ae7240ba Mon Sep 17 00:00:00 2001
+From: Connor Baker <ConnorBaker01@gmail.com>
+Date: Sat, 18 Jan 2025 20:48:11 +0000
+Subject: [PATCH 1/4] cmake: float out common python bindings option
+
+---
+ CMakeLists.txt | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 9739569..8944621 100644
+--- a/CMakeLists.txt
+++ b/CMakeLists.txt
+@@ -5,12 +5,11 @@ project(cudnn_frontend VERSION 1.9.0)
+ option(CUDNN_FRONTEND_SKIP_JSON_LIB "Defines whether FE should not include nlohmann/json.hpp." OFF)
+ option(CUDNN_FRONTEND_BUILD_SAMPLES "Defines if samples are built or not." ON)
+ option(CUDNN_FRONTEND_BUILD_TESTS "Defines if unittests are built or not." ON)
++option(CUDNN_FRONTEND_BUILD_PYTHON_BINDINGS "Defines if python bindings are built or not." OFF)
+ 
+ if(MSVC OR MSYS OR MINGW)
+-    option(CUDNN_FRONTEND_BUILD_PYTHON_BINDINGS "Defines if python bindings are built or not." OFF)
+     add_compile_options(/W4 /WX)
+ else()
+-    option(CUDNN_FRONTEND_BUILD_PYTHON_BINDINGS "Defines if python bindings are built or not." OFF)
+     add_compile_options(-Wall -Wextra -Wpedantic -Werror -Wno-error=attributes -Wno-attributes -Wno-error=unused-function -Wno-unused-function)
+ endif()
+ 
+-- 
+2.47.0
+
--- a/pkgs/development/cuda-modules/packages/cudnn-frontend/0002-cmake-add-config-so-headers-can-be-discovered-when-i.patch
+++ b/pkgs/development/cuda-modules/packages/cudnn-frontend/0002-cmake-add-config-so-headers-can-be-discovered-when-i.patch
@@ -0,0 +1,84 @@
+From da16ec51ea78f88f333ecf3df2a249fcc65ead24 Mon Sep 17 00:00:00 2001
+From: Connor Baker <ConnorBaker01@gmail.com>
+Date: Sat, 18 Jan 2025 22:01:03 +0000
+Subject: [PATCH 2/4] cmake: add config so headers can be discovered when
+ installed
+
+---
+ CMakeLists.txt                 | 39 +++++++++++++++++++++++++++++++---
+ cudnn_frontend-config.cmake.in |  3 +++
+ 2 files changed, 39 insertions(+), 3 deletions(-)
+ create mode 100644 cudnn_frontend-config.cmake.in
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 8944621..9b1bfba 100644
+--- a/CMakeLists.txt
+++ b/CMakeLists.txt
+@@ -1,4 +1,4 @@
+-cmake_minimum_required(VERSION 3.17)
++cmake_minimum_required(VERSION 3.23)
+ 
+ project(cudnn_frontend VERSION 1.9.0)
+ 
+@@ -15,6 +15,15 @@ endif()
+ 
+ add_library(cudnn_frontend INTERFACE)
+ 
++# Add header files to library
++file(GLOB_RECURSE CUDNN_FRONTEND_INCLUDE_FILES "include/*")
++target_sources(
++    cudnn_frontend PUBLIC FILE_SET HEADERS
++    BASE_DIRS "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
++    FILES "${CUDNN_FRONTEND_INCLUDE_FILES}"
++)
++unset(CUDNN_FRONTEND_INCLUDE_FILES)
++
+ target_compile_definitions(
+     cudnn_frontend INTERFACE
+     $<$<BOOL:${CUDNN_FRONTEND_SKIP_JSON_LIB}>:CUDNN_FRONTEND_SKIP_JSON_LIB>
+@@ -58,7 +67,31 @@ endif()
+ # * CMAKE_INSTALL_INCLUDEDIR
+ include(GNUInstallDirs)
+ 
++# See https://cmake.org/cmake/help/latest/module/CMakePackageConfigHelpers.html#example-generating-package-files
++include(CMakePackageConfigHelpers)
++
++# Install and export the header files
++install(
++    TARGETS cudnn_frontend
++    EXPORT cudnn_frontend_targets FILE_SET HEADERS
++)
++export(
++    EXPORT cudnn_frontend_targets
++    FILE "${CMAKE_CURRENT_BINARY_DIR}/cudnn_frontend/cudnn_frontend-targets.cmake"
++)
++install(
++    EXPORT cudnn_frontend_targets
++    FILE cudnn_frontend-targets.cmake
++    DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cudnn_frontend"
++)
++
++# Install the CMake configuration file for header discovery
++configure_package_config_file(
++    cudnn_frontend-config.cmake.in
++    "${CMAKE_CURRENT_BINARY_DIR}/cudnn_frontend-config.cmake"
++    INSTALL_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cudnn_frontend"
++)
+ install(
+-    DIRECTORY ${PROJECT_SOURCE_DIR}/include/
+-    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
++    FILES "${CMAKE_CURRENT_BINARY_DIR}/cudnn_frontend-config.cmake"
++    DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cudnn_frontend"
+ )
+diff --git a/cudnn_frontend-config.cmake.in b/cudnn_frontend-config.cmake.in
+new file mode 100644
+index 0000000..8b2d843
+--- /dev/null
+++ b/cudnn_frontend-config.cmake.in
+@@ -0,0 +1,3 @@
++@PACKAGE_INIT@
++
++include(${CMAKE_CURRENT_LIST_DIR}/cudnn_frontend-targets.cmake)
+-- 
+2.47.0
+
--- a/pkgs/development/cuda-modules/packages/cudnn-frontend/0003-cmake-install-samples-and-tests-when-built.patch
+++ b/pkgs/development/cuda-modules/packages/cudnn-frontend/0003-cmake-install-samples-and-tests-when-built.patch
@@ -0,0 +1,85 @@
+From 53d5aaaad09b479cd8c0e148c9428baa33204024 Mon Sep 17 00:00:00 2001
+From: Connor Baker <ConnorBaker01@gmail.com>
+Date: Sat, 18 Jan 2025 22:10:41 +0000
+Subject: [PATCH 3/4] cmake: install samples and tests when built
+
+---
+ CMakeLists.txt                        | 12 +++++++++++-
+ samples/cpp/CMakeLists.txt            |  2 ++
+ samples/legacy_samples/CMakeLists.txt |  2 ++
+ test/cpp/CMakeLists.txt               |  2 ++
+ 4 files changed, 17 insertions(+), 1 deletion(-)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 9b1bfba..f6af111 100644
+--- a/CMakeLists.txt
+++ b/CMakeLists.txt
+@@ -70,11 +70,21 @@ include(GNUInstallDirs)
+ # See https://cmake.org/cmake/help/latest/module/CMakePackageConfigHelpers.html#example-generating-package-files
+ include(CMakePackageConfigHelpers)
+ 
+-# Install and export the header files
++# Install the components
+ install(
+     TARGETS cudnn_frontend
+     EXPORT cudnn_frontend_targets FILE_SET HEADERS
+ )
++
++if (CUDNN_FRONTEND_BUILD_SAMPLES)
++    install(TARGETS legacy_samples samples RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
++endif()
++
++if (CUDNN_FRONTEND_BUILD_TESTS)
++    install(TARGETS tests RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
++endif()
++
++# Export the targets
+ export(
+     EXPORT cudnn_frontend_targets
+     FILE "${CMAKE_CURRENT_BINARY_DIR}/cudnn_frontend/cudnn_frontend-targets.cmake"
+diff --git a/samples/cpp/CMakeLists.txt b/samples/cpp/CMakeLists.txt
+index 9b8a5eb..01b09bb 100644
+--- a/samples/cpp/CMakeLists.txt
+++ b/samples/cpp/CMakeLists.txt
+@@ -69,8 +69,10 @@ target_link_libraries(
+     _cudnn_frontend_pch
+     CUDNN::cudnn
+ 
++    CUDA::cublasLt
+     CUDA::cudart
+     CUDA::cuda_driver # Needed as calls all CUDA calls will eventually move to driver
++    CUDA::nvrtc
+ )
+ 
+ # target cmake properties
+diff --git a/samples/legacy_samples/CMakeLists.txt b/samples/legacy_samples/CMakeLists.txt
+index 019f17c..3b56329 100644
+--- a/samples/legacy_samples/CMakeLists.txt
+++ b/samples/legacy_samples/CMakeLists.txt
+@@ -44,7 +44,9 @@ target_link_libraries(
+     _cudnn_frontend_pch
+     CUDNN::cudnn
+ 
++    CUDA::cublasLt
+     CUDA::cudart
++    CUDA::nvrtc
+ )
+ 
+ # target cmake properties
+diff --git a/test/cpp/CMakeLists.txt b/test/cpp/CMakeLists.txt
+index e244cd0..2750294 100644
+--- a/test/cpp/CMakeLists.txt
+++ b/test/cpp/CMakeLists.txt
+@@ -55,7 +55,9 @@ target_link_libraries(
+ 
+     CUDNN::cudnn
+ 
++    CUDA::cublasLt
+     CUDA::cudart
++    CUDA::nvrtc
+ )
+ 
+ # cuDNN dlopen's its libraries
+-- 
+2.47.0
+
--- a/pkgs/development/cuda-modules/packages/cudnn-frontend/0004-samples-fix-instances-of-maybe-uninitialized.patch
+++ b/pkgs/development/cuda-modules/packages/cudnn-frontend/0004-samples-fix-instances-of-maybe-uninitialized.patch
@@ -0,0 +1,591 @@
+From 4ce40a0c3de0e8a7065caf1cf59a90493e084682 Mon Sep 17 00:00:00 2001
+From: Connor Baker <ConnorBaker01@gmail.com>
+Date: Sat, 18 Jan 2025 22:22:21 +0000
+Subject: [PATCH 4/4] samples: fix instances of maybe-uninitialized
+
+---
+ samples/cpp/convolution/dgrads.cpp                 |  6 +++---
+ samples/cpp/convolution/fp8_fprop.cpp              |  2 +-
+ samples/cpp/convolution/fprop.cpp                  | 10 +++++-----
+ samples/cpp/convolution/int8_fprop.cpp             |  2 +-
+ samples/cpp/convolution/wgrads.cpp                 |  4 ++--
+ samples/cpp/matmul/fp8_matmul.cpp                  |  2 +-
+ samples/cpp/matmul/int8_matmul.cpp                 |  2 +-
+ samples/cpp/matmul/matmuls.cpp                     |  8 ++++----
+ samples/cpp/matmul/mixed_matmul.cpp                |  2 +-
+ samples/cpp/misc/pointwise.cpp                     |  6 +++---
+ samples/cpp/misc/resample.cpp                      |  6 +++---
+ samples/cpp/misc/serialization.cpp                 |  4 ++--
+ samples/cpp/misc/slice.cpp                         |  2 +-
+ samples/cpp/misc/sm_carveout.cpp                   |  2 +-
+ samples/cpp/norm/batchnorm.cpp                     |  8 ++++----
+ samples/cpp/norm/layernorm.cpp                     |  8 ++++----
+ samples/cpp/norm/rmsnorm.cpp                       |  6 +++---
+ samples/cpp/sdpa/fp16_bwd.cpp                      |  2 +-
+ samples/cpp/sdpa/fp16_bwd_with_flexible_graphs.cpp |  2 +-
+ samples/cpp/sdpa/fp16_cached.cpp                   |  2 +-
+ samples/cpp/sdpa/fp16_fwd.cpp                      |  2 +-
+ samples/cpp/sdpa/fp16_fwd_with_custom_dropout.cpp  |  2 +-
+ samples/cpp/sdpa/fp16_fwd_with_flexible_graphs.cpp |  2 +-
+ samples/cpp/sdpa/fp16_fwd_with_paged_caches.cpp    |  2 +-
+ samples/cpp/sdpa/fp8_bwd.cpp                       |  4 ++--
+ samples/cpp/sdpa/fp8_fwd.cpp                       |  2 +-
+ 26 files changed, 50 insertions(+), 50 deletions(-)
+
+diff --git a/samples/cpp/convolution/dgrads.cpp b/samples/cpp/convolution/dgrads.cpp
+index 589cb5f..f66abf4 100644
+--- a/samples/cpp/convolution/dgrads.cpp
+++ b/samples/cpp/convolution/dgrads.cpp
+@@ -65,7 +65,7 @@ TEST_CASE("Convolution Dgrad", "[dgrad][graph]") {
+     Surface<half> w_tensor(64 * 32 * 3 * 3, false);
+     Surface<half> dx_tensor(4 * 32 * 16 * 16, false);
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+ 
+     Surface<int8_t> workspace(workspace_size, false);
+@@ -122,7 +122,7 @@ TEST_CASE("Dgrad Drelu Graph", "[dgrad][graph]") {
+     Surface<half> x_tensor(4 * 32 * 16 * 16, false);
+     Surface<half> dx_tensor(4 * 32 * 16 * 16, false);
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+@@ -234,7 +234,7 @@ TEST_CASE("Dgrad Drelu DBNweight Graph", "[dgrad][graph]") {
+     Surface<float> eq_scale_x_tensor(1 * 32 * 1 * 1, false);
+     Surface<float> eq_bias_tensor(1 * 32 * 1 * 1, false);
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/convolution/fp8_fprop.cpp b/samples/cpp/convolution/fp8_fprop.cpp
+index dfcb7e2..8246ce4 100644
+--- a/samples/cpp/convolution/fp8_fprop.cpp
+++ b/samples/cpp/convolution/fp8_fprop.cpp
+@@ -116,7 +116,7 @@ TEST_CASE("Convolution fp8 precision", "[conv][graph]") {
+     Surface<float> Y_scale_gpu(1, false);
+     Surface<float> amax_gpu(1, false);
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph->get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/convolution/fprop.cpp b/samples/cpp/convolution/fprop.cpp
+index bc1aaf0..d61fa4e 100644
+--- a/samples/cpp/convolution/fprop.cpp
+++ b/samples/cpp/convolution/fprop.cpp
+@@ -80,7 +80,7 @@ TEST_CASE("Convolution fprop", "[conv][graph][caching]") {
+     std::unordered_map<int64_t, void *> variant_pack = {
+         {X->get_uid(), x_tensor.devPtr}, {W->get_uid(), w_tensor.devPtr}, {Y->get_uid(), y_tensor.devPtr}};
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph->get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+@@ -303,7 +303,7 @@ TEST_CASE("CSBR Graph", "[conv][graph][caching]") {
+     Surface<half> b_tensor(k, false);
+     Surface<half> y_tensor(n * k * h * w, false);  // Should be p, q.
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph->get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+@@ -550,7 +550,7 @@ TEST_CASE("SBRCS", "[conv][genstats][graph]") {
+         {SUM, sum_tensor.devPtr},
+         {SQ_SUM, sq_sum_tensor.devPtr}};
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph->get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+@@ -651,7 +651,7 @@ TEST_CASE("CBR Graph NCHW", "[conv][graph][caching]") {
+     Surface<half> y_tensor(n * k * h * w, false);  // Should be p, q.
+     Surface<half> z_tensor(n * k * h * w, false);  // Should be p, q.
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph->get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+@@ -734,7 +734,7 @@ TEST_CASE("Convolution fprop large", "[conv][graph][caching]") {
+     std::unordered_map<int64_t, void *> variant_pack = {
+         {X->get_uid(), x_tensor.devPtr}, {W->get_uid(), w_tensor.devPtr}, {Y->get_uid(), y_tensor.devPtr}};
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph->get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/convolution/int8_fprop.cpp b/samples/cpp/convolution/int8_fprop.cpp
+index 3d5ac2f..e9248f5 100644
+--- a/samples/cpp/convolution/int8_fprop.cpp
+++ b/samples/cpp/convolution/int8_fprop.cpp
+@@ -94,7 +94,7 @@ TEST_CASE("Conv with Int8 datatypes", "[conv][graph][caching]") {
+     std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*> variant_pack = {
+         {X, x_tensor.devPtr}, {W, w_tensor.devPtr}, {Y, y_tensor.devPtr}};
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph->get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/convolution/wgrads.cpp b/samples/cpp/convolution/wgrads.cpp
+index 2c58b26..26887dc 100644
+--- a/samples/cpp/convolution/wgrads.cpp
+++ b/samples/cpp/convolution/wgrads.cpp
+@@ -64,7 +64,7 @@ TEST_CASE("Convolution Wgrad", "[wgrad][graph][wgrad][Conv_wgrad]") {
+     Surface<half> dy_tensor(4 * 64 * 16 * 16, false);
+     Surface<half> dw_tensor(64 * 64 * 3 * 3, false);
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+@@ -137,7 +137,7 @@ TEST_CASE("scale-bias-relu-wgrad Graph", "[wgrad][graph][scale-bias-relu-wgrad][
+     Surface<half> dy_tensor(4 * 64 * 16 * 16, false);
+     Surface<half> dw_tensor(64 * 64 * 3 * 3, false);
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/matmul/fp8_matmul.cpp b/samples/cpp/matmul/fp8_matmul.cpp
+index c6470cd..f32c627 100644
+--- a/samples/cpp/matmul/fp8_matmul.cpp
+++ b/samples/cpp/matmul/fp8_matmul.cpp
+@@ -115,7 +115,7 @@ TEST_CASE("Matmul fp8 precision", "[matmul][graph]") {
+     REQUIRE(graph.build_plans(handle, fe::BuildPlanPolicy_t::HEURISTICS_CHOICE).is_good());
+ 
+     Surface<float> C_gpu(b * m * n, false);
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/matmul/int8_matmul.cpp b/samples/cpp/matmul/int8_matmul.cpp
+index cf4353a..cb3ce34 100644
+--- a/samples/cpp/matmul/int8_matmul.cpp
+++ b/samples/cpp/matmul/int8_matmul.cpp
+@@ -104,7 +104,7 @@ TEST_CASE("Int8 Matmul", "[matmul][graph]") {
+     // note this is a bf16 tensor, but half is used just for memory allocation
+     Surface<float> C_gpu(b * m * n, false);
+     Surface<float> Bias_gpu(b * m * n, false);
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/matmul/matmuls.cpp b/samples/cpp/matmul/matmuls.cpp
+index ed0f10b..5c95713 100644
+--- a/samples/cpp/matmul/matmuls.cpp
+++ b/samples/cpp/matmul/matmuls.cpp
+@@ -250,7 +250,7 @@ TEST_CASE("Matmul", "[matmul][graph]") {
+ 
+     // Run cudnn graph
+     Surface<float> C_gpu(b * m * n, false);
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+@@ -319,7 +319,7 @@ TEST_CASE("Abs + Matmul", "[matmul][graph]") {
+ 
+     // Run cudnn graph
+     Surface<float> C_gpu(b * m * n, false);
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+@@ -539,7 +539,7 @@ TEST_CASE("Matmul SBR Graph", "[matmul][graph]") {
+     auto [graph, A, B, bias, scale, O] = lookup_cache_or_build_graph(
+         handle, x_tensor.devPtr, w_tensor.devPtr, s_tensor.devPtr, b_tensor.devPtr, y_tensor.devPtr);
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph->get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+@@ -606,7 +606,7 @@ TEST_CASE("Matmul with restricted shared memory", "[matmul][graph]") {
+ 
+     // Run cudnn graph
+     Surface<float> C_gpu(b * m * n, false);
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/matmul/mixed_matmul.cpp b/samples/cpp/matmul/mixed_matmul.cpp
+index ab3e195..a2b05bd 100644
+--- a/samples/cpp/matmul/mixed_matmul.cpp
+++ b/samples/cpp/matmul/mixed_matmul.cpp
+@@ -96,7 +96,7 @@ TEST_CASE("Mixed Precision Matmul", "[matmul][graph]") {
+     //// Run cudnn graph
+     // note this is a bf16 tensor, but half is used just for memory allocation
+     Surface<half> C_gpu(b * m * n, false);
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/misc/pointwise.cpp b/samples/cpp/misc/pointwise.cpp
+index 8f8d699..e8f4cb1 100644
+--- a/samples/cpp/misc/pointwise.cpp
+++ b/samples/cpp/misc/pointwise.cpp
+@@ -51,7 +51,7 @@ TEST_CASE("Reduction", "[reduction]") {
+     Surface<float> C_gpu(n * n * n * n, false);
+     std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*> variant_pack = {{A, A_gpu.devPtr},
+                                                                                              {C, C_gpu.devPtr}};
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+@@ -88,7 +88,7 @@ TEST_CASE("Fused scalar", "[scalar][graph]") {
+ 
+     std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*> variant_pack = {{A, A_gpu.devPtr},
+                                                                                              {C, C_gpu.devPtr}};
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+@@ -148,7 +148,7 @@ TEST_CASE("Fused Amax Reduction and type conversion", "[reduction]") {
+ 
+     std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*> variant_pack = {
+         {A, A_gpu.devPtr}, {scale, scale_gpu.devPtr}, {amax, amax_gpu.devPtr}, {C, C_gpu.devPtr}};
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/misc/resample.cpp b/samples/cpp/misc/resample.cpp
+index 3f782e7..21998c3 100644
+--- a/samples/cpp/misc/resample.cpp
+++ b/samples/cpp/misc/resample.cpp
+@@ -69,7 +69,7 @@ TEST_CASE("Resample Max Pooling NHWC Inference", "[resample][pooling][max][graph
+     Surface<half> Y_gpu(N * H * W * C, false);
+     std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*> variant_pack = {{X, X_gpu.devPtr},
+                                                                                              {Y, Y_gpu.devPtr}};
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+@@ -132,7 +132,7 @@ TEST_CASE("Resample Max Pooling NHWC Training", "[resample][pooling][max][graph]
+     Surface<int8_t> Index_gpu(N * H * W * C / 8, false);
+     std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*> variant_pack = {
+         {X, X_gpu.devPtr}, {Y, Y_gpu.devPtr}, {Index, Index_gpu.devPtr}};
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+@@ -186,7 +186,7 @@ TEST_CASE("Resample Avg Pooling", "[resample][pooling][average][graph]") {
+     Surface<half> Y_gpu(N * H * W * C, false);
+     std::unordered_map<std::shared_ptr<fe::graph::Tensor_attributes>, void*> variant_pack = {{X, X_gpu.devPtr},
+                                                                                              {Y, Y_gpu.devPtr}};
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/misc/serialization.cpp b/samples/cpp/misc/serialization.cpp
+index a130406..278bad8 100644
+--- a/samples/cpp/misc/serialization.cpp
+++ b/samples/cpp/misc/serialization.cpp
+@@ -178,7 +178,7 @@ TEST_CASE("CSBR Graph with serialization", "[conv][graph][serialization]") {
+     Surface<half> b_device_memory(k, false);
+     Surface<half> y_device_memory(n * k * h * w, false);  // Should be p, q.
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph->get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+@@ -401,7 +401,7 @@ TEST_CASE("SDPA Graph with serialization", "[sdpa][graph][serialization]") {
+     Surface<int32_t> dropoutSeed(scaleSize, false, seed_value);
+     Surface<int32_t> dropoutOffset(scaleSize, false, (int32_t)1);
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph->get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/misc/slice.cpp b/samples/cpp/misc/slice.cpp
+index 087ba36..78962c6 100644
+--- a/samples/cpp/misc/slice.cpp
+++ b/samples/cpp/misc/slice.cpp
+@@ -80,7 +80,7 @@ TEST_CASE("Slice gemm", "[slice][gemm][graph][fusion]") {
+     Surface<half> C_gpu(B * M * N, false);
+     std::unordered_map<int64_t, void *> variant_pack = {
+         {a_uid, A_gpu.devPtr}, {b_uid, B_gpu.devPtr}, {c_uid, C_gpu.devPtr}};
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/misc/sm_carveout.cpp b/samples/cpp/misc/sm_carveout.cpp
+index d6818c0..b0e0651 100644
+--- a/samples/cpp/misc/sm_carveout.cpp
+++ b/samples/cpp/misc/sm_carveout.cpp
+@@ -121,7 +121,7 @@ TEST_CASE("SGBN with SM carveout", "[batchnorm][graph][sm_carveout]") {
+     Surface<float> Peer_stats_0_tensor(2 * 4 * c, false, true);
+     Surface<float> Peer_stats_1_tensor(2 * 4 * c, false);
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/norm/batchnorm.cpp b/samples/cpp/norm/batchnorm.cpp
+index 5949365..a91a9bd 100644
+--- a/samples/cpp/norm/batchnorm.cpp
+++ b/samples/cpp/norm/batchnorm.cpp
+@@ -96,7 +96,7 @@ TEST_CASE("BN Finalize Graph", "[batchnorm][graph]") {
+     Surface<float> eq_scale_tensor(32, false);
+     Surface<float> eq_bias_tensor(32, false);
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+@@ -226,7 +226,7 @@ TEST_CASE("SGBN Add Relu Graph", "[batchnorm][graph]") {
+     Surface<float> Peer_stats_0_tensor(2 * 4 * 32, false, true);
+     Surface<float> Peer_stats_1_tensor(2 * 4 * 32, false);
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+@@ -346,7 +346,7 @@ TEST_CASE("DBN Add Relu Graph", "[BN][graph][backward]") {
+     Surface<float> Peer_stats_0_tensor(2 * 4 * 32, false, true);
+     Surface<float> Peer_stats_1_tensor(2 * 4 * 32, false);
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+@@ -454,7 +454,7 @@ TEST_CASE("BN_inference DRelu DBN Graph", "[Batchnorm][graph][backward]") {
+     Surface<float> Dbias_tensor(32, false);
+     Surface<half> DX_tensor(4 * 32 * 16 * 16, false);
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/norm/layernorm.cpp b/samples/cpp/norm/layernorm.cpp
+index bac996f..7f69f34 100644
+--- a/samples/cpp/norm/layernorm.cpp
+++ b/samples/cpp/norm/layernorm.cpp
+@@ -133,7 +133,7 @@ layernorm_fwd_dynamic_shapes(bool train = true) {
+         Surface<float> Mean_tensor(max_stats_volume, false);
+         Surface<float> Var_tensor(max_stats_volume, false);
+ 
+-        int64_t workspace_size;
+        int64_t workspace_size = 0;
+         REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+         Surface<int8_t> workspace(workspace_size, false);
+ 
+@@ -232,7 +232,7 @@ TEST_CASE("LayerNorm Training", "[layernorm][graph]") {
+     Surface<float> Bias_tensor(hidden_size, false);
+     Surface<half> Y_tensor(batch_size * seq_length * hidden_size, false);
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+@@ -310,7 +310,7 @@ TEST_CASE("LayerNorm Inference", "[layernorm][graph]") {
+     Surface<float> Bias_tensor(hidden_size, false);
+     Surface<half> Y_tensor(batch_size * seq_length * hidden_size, false);
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+@@ -392,7 +392,7 @@ TEST_CASE("LayerNorm Backward", "[layernorm][graph]") {
+     Surface<float> Dbias_tensor(hidden_size, false);
+     Surface<half> DX_tensor(batch_size * seq_length * hidden_size, false);
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/norm/rmsnorm.cpp b/samples/cpp/norm/rmsnorm.cpp
+index 878086c..d5c919b 100644
+--- a/samples/cpp/norm/rmsnorm.cpp
+++ b/samples/cpp/norm/rmsnorm.cpp
+@@ -78,7 +78,7 @@ TEST_CASE("RmsNorm Training", "[rmsnorm][graph]") {
+     Surface<float> Scale_tensor(hidden_size, false);
+     Surface<float> Y_tensor(batch_size * seq_length * hidden_size, false);
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+@@ -150,7 +150,7 @@ TEST_CASE("RmsNorm Inference", "[rmsnorm][graph]") {
+     Surface<float> Bias_tensor(hidden_size, false);
+     Surface<float> Y_tensor(batch_size * seq_length * hidden_size, false);
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+@@ -227,7 +227,7 @@ TEST_CASE("RmsNorm Backward", "[rmsnorm][graph]") {
+     Surface<float> Dbias_tensor(hidden_size, false);
+     Surface<float> DX_tensor(batch_size * seq_length * hidden_size, false);
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/sdpa/fp16_bwd.cpp b/samples/cpp/sdpa/fp16_bwd.cpp
+index 749cbed..1145008 100644
+--- a/samples/cpp/sdpa/fp16_bwd.cpp
+++ b/samples/cpp/sdpa/fp16_bwd.cpp
+@@ -275,7 +275,7 @@ TEST_CASE("Toy sdpa backward", "[graph][sdpa][flash][backward]") {
+     }
+ 
+     // Allocate workspace
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph->get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/sdpa/fp16_bwd_with_flexible_graphs.cpp b/samples/cpp/sdpa/fp16_bwd_with_flexible_graphs.cpp
+index 62d6bb3..50205c3 100644
+--- a/samples/cpp/sdpa/fp16_bwd_with_flexible_graphs.cpp
+++ b/samples/cpp/sdpa/fp16_bwd_with_flexible_graphs.cpp
+@@ -195,7 +195,7 @@ TEST_CASE("Toy sdpa backward with flexible graph", "[graph][sdpa][flash][backwar
+                                                                                    {DV_UID, dV_tensor.devPtr}};
+ 
+     // Allocate workspace
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph->get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/sdpa/fp16_cached.cpp b/samples/cpp/sdpa/fp16_cached.cpp
+index d046271..4f0d3f8 100644
+--- a/samples/cpp/sdpa/fp16_cached.cpp
+++ b/samples/cpp/sdpa/fp16_cached.cpp
+@@ -146,7 +146,7 @@ TEST_CASE("Cached sdpa", "[graph][sdpa][flash]") {
+                     {O_UID, o_tensor.devPtr},
+                     {STATS_UID, stats_tensor.devPtr}};
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(fwd_graph2->get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> fwd_workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/sdpa/fp16_fwd.cpp b/samples/cpp/sdpa/fp16_fwd.cpp
+index b3acf5e..63697a1 100644
+--- a/samples/cpp/sdpa/fp16_fwd.cpp
+++ b/samples/cpp/sdpa/fp16_fwd.cpp
+@@ -210,7 +210,7 @@ TEST_CASE("Toy sdpa forward", "[graph][sdpa][flash][forward]") {
+         variant_pack[STATS_UID] = statsTensor.devPtr;
+     }
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph->get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/sdpa/fp16_fwd_with_custom_dropout.cpp b/samples/cpp/sdpa/fp16_fwd_with_custom_dropout.cpp
+index 36cfba4..0cb9d2f 100644
+--- a/samples/cpp/sdpa/fp16_fwd_with_custom_dropout.cpp
+++ b/samples/cpp/sdpa/fp16_fwd_with_custom_dropout.cpp
+@@ -178,7 +178,7 @@ TEST_CASE("Toy sdpa forward with dropout", "[graph][sdpa][flash][forward]") {
+         variant_pack[STATS_UID] = statsTensor.devPtr;
+     }
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph->get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/sdpa/fp16_fwd_with_flexible_graphs.cpp b/samples/cpp/sdpa/fp16_fwd_with_flexible_graphs.cpp
+index 810de63..7d81afe 100644
+--- a/samples/cpp/sdpa/fp16_fwd_with_flexible_graphs.cpp
+++ b/samples/cpp/sdpa/fp16_fwd_with_flexible_graphs.cpp
+@@ -186,7 +186,7 @@ TEST_CASE("Toy sdpa forward with flexible graph", "[graph][sdpa][flash][forward]
+         variant_pack[STATS_UID] = statsTensor.devPtr;
+     }
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph->get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/sdpa/fp16_fwd_with_paged_caches.cpp b/samples/cpp/sdpa/fp16_fwd_with_paged_caches.cpp
+index 18dd937..d195f6b 100644
+--- a/samples/cpp/sdpa/fp16_fwd_with_paged_caches.cpp
+++ b/samples/cpp/sdpa/fp16_fwd_with_paged_caches.cpp
+@@ -268,7 +268,7 @@ TEST_CASE("Toy sdpa forward with paged caches", "[graph][sdpa][flash][paged][for
+         variant_pack[STATS_UID] = statsTensor.devPtr;
+     }
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(graph->get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/sdpa/fp8_bwd.cpp b/samples/cpp/sdpa/fp8_bwd.cpp
+index 82e542b..296f2f9 100644
+--- a/samples/cpp/sdpa/fp8_bwd.cpp
+++ b/samples/cpp/sdpa/fp8_bwd.cpp
+@@ -214,7 +214,7 @@ TEST_CASE("sdpa_fp8_bprop", "[graph][sdpa][fp8][backward]") {
+         {Amax_dV, AMax_dV_Tensor.devPtr},
+         {Amax_dP, AMax_dP_Tensor.devPtr}};
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(mha_graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+@@ -385,7 +385,7 @@ TEST_CASE("sdpa_fp8_gqa_bprop", "[graph][sdpa][fp8][backward]") {
+         {amax_dV, amax_dV_gpu.devPtr},
+         {amax_dP, amax_dP_gpu.devPtr}};
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(mha_graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+diff --git a/samples/cpp/sdpa/fp8_fwd.cpp b/samples/cpp/sdpa/fp8_fwd.cpp
+index 6ede98d..23abc3f 100644
+--- a/samples/cpp/sdpa/fp8_fwd.cpp
+++ b/samples/cpp/sdpa/fp8_fwd.cpp
+@@ -146,7 +146,7 @@ TEST_CASE("sdpa_fp8_fprop", "[graph][sdpa][fp8][forward]") {
+         variant_pack[Stats] = stats_tensor.devPtr;
+     }
+ 
+-    int64_t workspace_size;
+    int64_t workspace_size = 0;
+     REQUIRE(mha_graph.get_workspace_size(workspace_size).is_good());
+     Surface<int8_t> workspace(workspace_size, false);
+ 
+-- 
+2.47.0
+
--- a/pkgs/development/cuda-modules/packages/cudnn-frontend/CMakeLists.txt
+++ b/pkgs/development/cuda-modules/packages/cudnn-frontend/CMakeLists.txt
@@ -0,0 +1,133 @@
+# Top-level build script for the cudnn_frontend header library.
+# target_sources(... FILE_SET ...) is used further down; presumably that is why
+# the minimum CMake version is 3.23.
+cmake_minimum_required(VERSION 3.23)
+
+# NOTE(review): the Nix expression shipped next to this file pins version 1.9.0;
+# confirm whether 1.8.0 here is intentional.
+project(cudnn_frontend VERSION 1.8.0)
+
+# User-facing switches. Samples and tests default to ON, the JSON opt-out and the
+# Python bindings default to OFF.
+option(CUDNN_FRONTEND_SKIP_JSON_LIB "Defines whether FE should not include nlohmann/json.hpp." OFF)
+option(CUDNN_FRONTEND_BUILD_SAMPLES "Defines if samples are built or not." ON)
+option(CUDNN_FRONTEND_BUILD_TESTS "Defines if unittests are built or not." ON)
+option(CUDNN_FRONTEND_BUILD_PYTHON_BINDINGS "Defines if python bindings are built or not." OFF)
+
+# Warnings are errors everywhere, except that `attributes` and `unused-function`
+# diagnostics are silenced on the non-MSVC branch.
+if(MSVC OR MSYS OR MINGW)
+    add_compile_options(/W4 /WX)
+else()
+    add_compile_options(-Wall -Wextra -Wpedantic -Werror -Wno-error=attributes -Wno-attributes -Wno-error=unused-function -Wno-unused-function)
+endif()
+
+# The library is an INTERFACE target: it carries usage requirements only and has
+# no compiled artifact of its own.
+add_library(cudnn_frontend INTERFACE)
+
+# Register everything under include/ as the target's HEADERS file set so that
+# install(TARGETS ... FILE_SET HEADERS) can install the headers later on.
+# GLOB_RECURSE is evaluated at configure time only.
+file(GLOB_RECURSE CUDNN_FRONTEND_INCLUDE_FILES "include/*")
+target_sources(
+    cudnn_frontend
+    PUBLIC
+        FILE_SET
+            HEADERS
+            BASE_DIRS
+                "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
+            FILES
+                "${CUDNN_FRONTEND_INCLUDE_FILES}"
+)
+# The glob result is not needed after this point.
+unset(CUDNN_FRONTEND_INCLUDE_FILES)
+
+# Propagate CUDNN_FRONTEND_SKIP_JSON_LIB to consumers as a preprocessor
+# definition only when the option of the same name is enabled.
+target_compile_definitions(cudnn_frontend INTERFACE $<$<BOOL:${CUDNN_FRONTEND_SKIP_JSON_LIB}>:CUDNN_FRONTEND_SKIP_JSON_LIB>)
+
+# GNUInstallDirs defines CMAKE_INSTALL_INCLUDEDIR. It has to be loaded before the
+# variable is expanded below; otherwise the INSTALL_INTERFACE expression is
+# configured with an empty path. Including the module again later is harmless.
+include(GNUInstallDirs)
+
+# In the build tree consumers see the in-source include/ directory; after
+# installation they see the include directory relative to the install prefix.
+target_include_directories(
+    cudnn_frontend
+    INTERFACE
+        "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
+        "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
+)
+
+# Locate the CUDA toolkit; configuration fails if it cannot be found.
+find_package(CUDAToolkit REQUIRED)
+
+# Expose the toolkit headers to every consumer of cudnn_frontend.
+target_include_directories(cudnn_frontend INTERFACE ${CUDAToolkit_INCLUDE_DIRS})
+
+# Consumers must compile with at least C++17.
+target_compile_features(cudnn_frontend INTERFACE cxx_std_17)
+
+# Helper INTERFACE target that precompiles cudnn_frontend.h for whichever
+# targets link against it.
+add_library(_cudnn_frontend_pch INTERFACE)
+target_precompile_headers(_cudnn_frontend_pch INTERFACE ${PROJECT_SOURCE_DIR}/include/cudnn_frontend.h)
+
+# Optional components. The `samples`, `legacy_samples` and `tests` targets are
+# presumably created by the respective subdirectories; this file only attaches
+# the additional CUDA libraries to them.
+if (CUDNN_FRONTEND_BUILD_SAMPLES)
+    add_subdirectory(samples)
+    target_link_libraries(
+        samples
+        PRIVATE
+            CUDA::cublasLt
+            CUDA::nvrtc
+    )
+    target_link_libraries(
+        legacy_samples
+        PRIVATE
+            CUDA::cublasLt
+            CUDA::nvrtc
+    )
+endif()
+
+# NOTE(review): unlike the sample targets, `tests` is linked with the plain
+# (keyword-less) signature. CMake does not allow mixing plain and keyword
+# signatures on one target, so this presumably mirrors test/CMakeLists.txt.
+if (CUDNN_FRONTEND_BUILD_TESTS)
+    add_subdirectory(test)
+    target_link_libraries(
+        tests
+        CUDA::cublasLt
+        CUDA::nvrtc
+    )
+endif()
+
+if (CUDNN_FRONTEND_BUILD_PYTHON_BINDINGS)
+    add_subdirectory(python)
+endif()
+
+# Introduce variables:
+# * CMAKE_INSTALL_LIBDIR
+# * CMAKE_INSTALL_BINDIR
+# * CMAKE_INSTALL_INCLUDEDIR
+include(GNUInstallDirs)
+
+# Install and export the header files
+# (the HEADERS file set registered on the cudnn_frontend target).
+install(
+    TARGETS
+        cudnn_frontend
+    EXPORT
+        cudnn_frontend_targets
+    FILE_SET HEADERS
+)
+
+# Sample and test executables are installed into the binary directory, but only
+# when the corresponding option caused them to be built.
+if (CUDNN_FRONTEND_BUILD_SAMPLES)
+    install(TARGETS legacy_samples samples RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+endif()
+
+if (CUDNN_FRONTEND_BUILD_TESTS)
+    install(TARGETS tests RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+endif()
+
+# See https://cmake.org/cmake/help/latest/module/CMakePackageConfigHelpers.html#example-generating-package-files
+include(CMakePackageConfigHelpers)
+
+# Write a targets file into the build tree so the export set can be used
+# without installing.
+export(
+    EXPORT
+        cudnn_frontend_targets
+    FILE
+        "${CMAKE_CURRENT_BINARY_DIR}/cudnn_frontend/cudnn_frontend-targets.cmake"
+)
+# Install the same export set next to the package config file.
+install(
+    EXPORT
+        cudnn_frontend_targets
+    FILE
+        cudnn_frontend-targets.cmake
+    DESTINATION
+        "${CMAKE_INSTALL_LIBDIR}/cmake/cudnn_frontend"
+)
+
+# Generate cudnn_frontend-config.cmake from its template and install it, so that
+# find_package(cudnn_frontend) works against the installed tree.
+configure_package_config_file(
+    cudnn_frontend-config.cmake.in
+    "${CMAKE_CURRENT_BINARY_DIR}/cudnn_frontend-config.cmake"
+    INSTALL_DESTINATION
+        "${CMAKE_INSTALL_LIBDIR}/cmake/cudnn_frontend"
+)
+install(
+    FILES
+        "${CMAKE_CURRENT_BINARY_DIR}/cudnn_frontend-config.cmake"
+    DESTINATION
+        "${CMAKE_INSTALL_LIBDIR}/cmake/cudnn_frontend"
+)
--- a/pkgs/development/cuda-modules/packages/cudnn-frontend/cudnn_frontend-config.cmake.in
+++ b/pkgs/development/cuda-modules/packages/cudnn-frontend/cudnn_frontend-config.cmake.in
@@ -0,0 +1,3 @@
+# Package configuration template, expanded by configure_package_config_file().
+@PACKAGE_INIT@
+
+# Pull in the exported targets that live next to this file. The path is quoted
+# so that an install prefix containing spaces or semicolons is not split into
+# several arguments.
+include("${CMAKE_CURRENT_LIST_DIR}/cudnn_frontend-targets.cmake")
--- a/pkgs/development/cuda-modules/packages/cudnn-frontend/package.nix
+++ b/pkgs/development/cuda-modules/packages/cudnn-frontend/package.nix
@@ -0,0 +1,132 @@
+# cudnn-frontend: header-only C++ wrapper around the cuDNN backend API.
+#
+# The CUDA dependencies default to `null` so that the expression still
+# evaluates when they are unavailable; in that case (`cudnn == null`) every
+# supported platform is listed in `meta.badPlatforms`.
+{
+  autoAddDriverRunpath,
+  catch2_3,
+  cmake,
+  fetchFromGitHub,
+  gitUpdater,
+  lib,
+  ninja,
+  nlohmann_json,
+  stdenv,
+  cuda_cccl ? null,
+  cuda_cudart ? null,
+  cuda_nvcc ? null,
+  cuda_nvrtc ? null,
+  cudnn ? null,
+  libcublas ? null,
+}:
+let
+  inherit (lib.lists) optionals;
+  inherit (lib.strings)
+    cmakeBool
+    cmakeFeature
+    optionalString
+    ;
+in
+
+# TODO(@connorbaker): This should be a hybrid C++/Python package.
+stdenv.mkDerivation (finalAttrs: {
+  pname = "cudnn-frontend";
+  version = "1.9.0";
+
+  src = fetchFromGitHub {
+    owner = "NVIDIA";
+    repo = "cudnn-frontend";
+    tag = "v${finalAttrs.version}";
+    hash = "sha256-Vc5jqB1XHcJEdKG0nxbWLewW2fDezRVwjUSzPDubSGE=";
+  };
+
+  patches = [
+    # https://github.com/NVIDIA/cudnn-frontend/pull/125
+    ./0001-cmake-float-out-common-python-bindings-option.patch
+    ./0002-cmake-add-config-so-headers-can-be-discovered-when-i.patch
+    ./0003-cmake-install-samples-and-tests-when-built.patch
+    ./0004-samples-fix-instances-of-maybe-uninitialized.patch
+  ];
+
+  # nlohmann_json should be the only vendored dependency.
+  # The vendored copy is deleted and the include is redirected to the nixpkgs
+  # package; `rmdir` fails if anything else turns out to be vendored, and
+  # `--replace-fail` fails if the include line ever changes upstream.
+  postPatch = ''
+    echo "patching source to use nlohmann_json from nixpkgs"
+    rm -rf include/cudnn_frontend/thirdparty/nlohmann
+    rmdir include/cudnn_frontend/thirdparty
+    substituteInPlace include/cudnn_frontend_utils.h \
+      --replace-fail \
+        '#include "cudnn_frontend/thirdparty/nlohmann/json.hpp"' \
+        '#include <nlohmann/json.hpp>'
+  '';
+
+  # TODO: As a header-only library, we should make sure we have an `include` directory or similar which is not a
+  # superset of the `out` (`bin`) or `dev` outputs (which is what the multiple-outputs setup hook does by default).
+  # The extra outputs only exist when the samples and tests are actually built.
+  outputs = [
+    "out"
+  ]
+  ++ optionals finalAttrs.doCheck [
+    "legacy_samples"
+    "samples"
+    "tests"
+  ];
+
+  nativeBuildInputs = [
+    autoAddDriverRunpath # Needed for samples because it links against CUDA::cuda_driver
+    cmake
+    cuda_nvcc
+    ninja
+  ];
+
+  buildInputs = [
+    cuda_cccl
+    cuda_cudart
+  ];
+
+  # Samples and tests are built exactly when checks are enabled; the Python
+  # bindings are never built by this expression.
+  cmakeFlags = [
+    (cmakeBool "FETCHCONTENT_FULLY_DISCONNECTED" true)
+    (cmakeFeature "FETCHCONTENT_TRY_FIND_PACKAGE_MODE" "ALWAYS")
+    (cmakeBool "CUDNN_FRONTEND_BUILD_SAMPLES" finalAttrs.doCheck)
+    (cmakeBool "CUDNN_FRONTEND_BUILD_TESTS" finalAttrs.doCheck)
+    (cmakeBool "CUDNN_FRONTEND_BUILD_PYTHON_BINDINGS" false)
+  ];
+
+  checkInputs = [
+    cudnn
+    cuda_nvrtc
+    catch2_3
+    libcublas
+  ];
+
+  enableParallelBuilding = true;
+
+  propagatedBuildInputs = [
+    nlohmann_json
+  ];
+
+  doCheck = true;
+
+  # Move each executable into its dedicated output and fail the build if
+  # anything is left behind in $out/bin.
+  postInstall = optionalString finalAttrs.doCheck ''
+    moveToOutput "bin/legacy_samples" "$legacy_samples"
+    moveToOutput "bin/samples" "$samples"
+    moveToOutput "bin/tests" "$tests"
+    if [[ -e "$out/bin" ]]
+    then
+      nixErrorLog "The bin directory in \$out should no longer exist."
+      exit 1
+    fi
+  '';
+
+  passthru.updateScript = gitUpdater {
+    inherit (finalAttrs) pname version;
+    rev-prefix = "v";
+  };
+
+  meta = {
+    # nixpkgs convention: descriptions do not start with an article.
+    description = "C++ wrapper for the cuDNN backend API";
+    homepage = "https://github.com/NVIDIA/cudnn-frontend";
+    license = lib.licenses.mit;
+    # Without cuDNN the package cannot be built on any platform.
+    badPlatforms = optionals (cudnn == null) finalAttrs.meta.platforms;
+    platforms = [
+      "aarch64-linux"
+      "x86_64-linux"
+    ];
+    maintainers = with lib.maintainers; [ connorbaker ];
+    teams = [ lib.teams.cuda ];
+  };
+})
--- a/pkgs/development/cuda-modules/packages/markForCudatoolkitRootHook/mark-for-cudatoolkit-root-hook.sh
+++ b/pkgs/development/cuda-modules/packages/markForCudatoolkitRootHook/mark-for-cudatoolkit-root-hook.sh
@@ -0,0 +1,25 @@
+# shellcheck shell=bash
+
+# Only continue when the hook is sourced with host offset -1 and target offset 0;
+# for any other offset pair the file returns immediately. `:?` makes the shell
+# abort with an error if either offset variable is unset or empty.
+(( ${hostOffset:?} == -1 && ${targetOffset:?} == 0)) || return 0
+
+echo "Sourcing mark-for-cudatoolkit-root-hook" >&2
+
+markForCUDAToolkit_ROOT() {
+    mkdir -p "${prefix:?}/nix-support"
+    local markerPath="$prefix/nix-support/include-in-cudatoolkit-root"
+
+    # Return early if the file already exists.
+    [[ -f "$markerPath" ]] && return 0
+
+    # Always create the file, even if it's empty, since setup-cuda-hook relies on its existence.
+    # However, only populate it if strictDeps is not set.
+    touch "$markerPath"
+
+    # Return early if strictDeps is set.
+    [[ -n "${strictDeps-}" ]] && return 0
+
+    # Populate the file with the package name and output.
+    echo "${pname:?}-${output:?}" > "$markerPath"
+}
+
+# Register the marker function in the fixupOutputHooks array.
+fixupOutputHooks+=(markForCUDAToolkit_ROOT)
--- a/pkgs/development/cuda-modules/packages/markForCudatoolkitRootHook/package.nix
+++ b/pkgs/development/cuda-modules/packages/markForCudatoolkitRootHook/package.nix
@@ -0,0 +1,4 @@
+# Internal hook, used by cudatoolkit and cuda redist packages
+# to accommodate automatic CUDAToolkit_ROOT construction
+{ makeSetupHook }:
+makeSetupHook { name = "mark-for-cudatoolkit-root-hook"; } ./mark-for-cudatoolkit-root-hook.sh
--- a/pkgs/development/cuda-modules/packages/nccl-tests.nix
+++ b/pkgs/development/cuda-modules/packages/nccl-tests.nix
@@ -0,0 +1,83 @@
+# NOTE: Though NCCL tests is called within the cudaPackages package set, we avoid passing in
+# the names of dependencies from that package set directly to avoid evaluation errors
+# in the case redistributable packages are not available.
+#
+# Builds the NVIDIA nccl-tests binaries against the NCCL and CUDA packages of
+# the given `cudaPackages` set; MPI support is opt-in through `mpiSupport`.
+{
+  config,
+  cudaPackages,
+  fetchFromGitHub,
+  gitUpdater,
+  lib,
+  mpi,
+  mpiSupport ? false,
+  which,
+}:
+let
+  inherit (cudaPackages)
+    backendStdenv
+    cuda_cccl
+    cuda_cudart
+    cuda_nvcc
+    nccl
+    ;
+in
+backendStdenv.mkDerivation (finalAttrs: {
+
+  pname = "nccl-tests";
+  version = "2.15.0";
+
+  src = fetchFromGitHub {
+    owner = "NVIDIA";
+    repo = "nccl-tests";
+    rev = "v${finalAttrs.version}";
+    hash = "sha256-OgffbW9Vx/sm1I1tpaPGdAhIpV4jbB4hJa9UcEAWkdE=";
+  };
+
+  postPatch = ''
+    # fix build failure with GCC14
+    substituteInPlace src/Makefile --replace-fail "-std=c++11" "-std=c++14"
+  '';
+
+  strictDeps = true;
+
+  nativeBuildInputs = [
+    which
+    cuda_nvcc
+  ];
+
+  buildInputs = [
+    nccl
+    cuda_nvcc # crt/host_config.h
+    cuda_cudart
+    cuda_cccl # <nv/target>
+  ]
+  ++ lib.optionals mpiSupport [ mpi ];
+
+  makeFlags = [
+    "NCCL_HOME=${nccl}"
+    "CUDA_HOME=${cuda_nvcc}"
+  ]
+  ++ lib.optionals mpiSupport [ "MPI=1" ];
+
+  enableParallelBuilding = true;
+
+  # A custom installPhase replaces the default one, so the pre/post hooks have
+  # to be run explicitly for overrides and setup hooks to keep working.
+  installPhase = ''
+    runHook preInstall
+
+    mkdir -p "$out/bin"
+    cp -r build/* "$out/bin/"
+
+    runHook postInstall
+  '';
+
+  passthru.updateScript = gitUpdater {
+    inherit (finalAttrs) pname version;
+    rev-prefix = "v";
+  };
+
+  meta = {
+    description = "Tests to check both the performance and the correctness of NVIDIA NCCL operations";
+    homepage = "https://github.com/NVIDIA/nccl-tests";
+    platforms = lib.platforms.linux;
+    license = lib.licenses.bsd3;
+    # Unusable without CUDA support, or when MPI is requested but not provided.
+    broken = !config.cudaSupport || (mpiSupport && mpi == null);
+    maintainers = with lib.maintainers; [ jmillerpdt ];
+    teams = [ lib.teams.cuda ];
+  };
+})
--- a/pkgs/development/cuda-modules/packages/nccl.nix
+++ b/pkgs/development/cuda-modules/packages/nccl.nix
@@ -0,0 +1,98 @@
+# NOTE: Though NCCL is called within the cudaPackages package set, we avoid passing in
+# the names of dependencies from that package set directly to avoid evaluation errors
+# in the case redistributable packages are not available.
+#
+# Builds NCCL from source with the CUDA packages of the given `cudaPackages`
+# set. The static archive is moved to the `dev` output.
+{
+  lib,
+  fetchFromGitHub,
+  python3,
+  which,
+  autoAddDriverRunpath,
+  cudaPackages,
+  # passthru.updateScript
+  gitUpdater,
+}:
+let
+  inherit (cudaPackages)
+    backendStdenv
+    cuda_cccl
+    cuda_cudart
+    cuda_nvcc
+    flags
+    ;
+  version = "2.27.6-1";
+  hash = "sha256-/BiLSZaBbVIqOfd8nQlgUJub0YR3SR4B93x2vZpkeiU=";
+in
+backendStdenv.mkDerivation (finalAttrs: {
+  pname = "nccl";
+  inherit version;
+
+  src = fetchFromGitHub {
+    owner = "NVIDIA";
+    repo = "nccl";
+    rev = "v${finalAttrs.version}";
+    inherit hash;
+  };
+
+  __structuredAttrs = true;
+  strictDeps = true;
+
+  outputs = [
+    "out"
+    "dev"
+  ];
+
+  nativeBuildInputs = [
+    which
+    autoAddDriverRunpath
+    python3
+    cuda_nvcc
+  ];
+
+  buildInputs = [
+    cuda_nvcc # crt/host_config.h
+    cuda_cudart
+    cuda_cccl
+  ];
+
+  env.NIX_CFLAGS_COMPILE = toString [ "-Wno-unused-function" ];
+
+  # The code generators are executed during the build and need a usable shebang.
+  postPatch = ''
+    patchShebangs ./src/device/generate.py
+    patchShebangs ./src/device/symmetric/generate.py
+  '';
+
+  # `$(out)` is expanded by make, not by Nix.
+  makeFlags = [
+    "PREFIX=$(out)"
+    "NVCC_GENCODE=${flags.gencodeString}"
+    "CUDA_HOME=${cuda_nvcc}"
+    "CUDA_LIB=${lib.getLib cuda_cudart}/lib"
+    "CUDA_INC=${lib.getDev cuda_cudart}/include"
+  ];
+
+  enableParallelBuilding = true;
+
+  postFixup = ''
+    moveToOutput lib/libnccl_static.a $dev
+  '';
+
+  passthru.updateScript = gitUpdater {
+    inherit (finalAttrs) pname version;
+    rev-prefix = "v";
+  };
+
+  meta = {
+    description = "Multi-GPU and multi-node collective communication primitives for NVIDIA GPUs";
+    homepage = "https://developer.nvidia.com/nccl";
+    license = lib.licenses.bsd3;
+    platforms = lib.platforms.linux;
+    # NCCL is not supported on Jetson, because it does not use NVLink or PCI-e for inter-GPU communication.
+    # https://forums.developer.nvidia.com/t/can-jetson-orin-support-nccl/232845/9
+    badPlatforms = lib.optionals flags.isJetsonBuild [ "aarch64-linux" ];
+    maintainers = with lib.maintainers; [
+      mdaiter
+      orivej
+    ];
+    teams = [ lib.teams.cuda ];
+  };
+})
--- a/pkgs/development/cuda-modules/packages/saxpy/package.nix
+++ b/pkgs/development/cuda-modules/packages/saxpy/package.nix
@@ -0,0 +1,63 @@
+# Minimal CUDA/CMake program used to exercise FindCUDAToolkit.cmake.
+# `saxpy` (the finished package itself) is taken as an argument so that
+# `passthru.gpuCheck` can derive a GPU-requiring variant from it.
+{
+  autoAddDriverRunpath,
+  cmake,
+  cudaPackages,
+  lib,
+  saxpy,
+}:
+let
+  inherit (cudaPackages)
+    backendStdenv
+    cuda_cccl
+    cuda_cudart
+    cuda_nvcc
+    flags
+    libcublas
+    ;
+  inherit (lib) getDev getLib getOutput;
+in
+backendStdenv.mkDerivation {
+  pname = "saxpy";
+  version = "unstable-2023-07-11";
+
+  src = ./src;
+
+  __structuredAttrs = true;
+  strictDeps = true;
+
+  nativeBuildInputs = [
+    cmake
+    autoAddDriverRunpath
+    cuda_nvcc
+  ];
+
+  # The individual libcublas outputs are listed explicitly.
+  buildInputs = [
+    (getDev libcublas)
+    (getLib libcublas)
+    (getOutput "static" libcublas)
+    cuda_cudart
+    cuda_cccl
+  ];
+
+  cmakeFlags = [
+    (lib.cmakeBool "CMAKE_VERBOSE_MAKEFILE" true)
+    (lib.cmakeFeature "CMAKE_CUDA_ARCHITECTURES" flags.cmakeCudaArchitecturesString)
+  ];
+
+  # Variant of this package that requires the "cuda" system feature and runs
+  # the installed binary after the install check phase.
+  passthru.gpuCheck = saxpy.overrideAttrs (_: {
+    requiredSystemFeatures = [ "cuda" ];
+    doInstallCheck = true;
+    postInstallCheck = ''
+      $out/bin/${saxpy.meta.mainProgram or (lib.getName saxpy)}
+    '';
+  });
+
+  meta = {
+    description = "Simple (Single-precision AX Plus Y) FindCUDAToolkit.cmake example for testing cross-compilation";
+    license = lib.licenses.mit;
+    teams = [ lib.teams.cuda ];
+    mainProgram = "saxpy";
+    platforms = lib.platforms.unix;
+  };
+}
--- a/pkgs/development/cuda-modules/packages/saxpy/src/CMakeLists.txt
+++ b/pkgs/development/cuda-modules/packages/saxpy/src/CMakeLists.txt
@@ -0,0 +1,12 @@
+# Build script for the single-file saxpy example.
+cmake_minimum_required(VERSION 3.25)
+project(saxpy LANGUAGES CXX CUDA)
+
+# Only the CUDA runtime and cuBLAS are requested from the toolkit.
+find_package(CUDAToolkit REQUIRED COMPONENTS cudart cublas)
+
+add_executable(saxpy saxpy.cu)
+# Links cuBLAS, the CUDA runtime and libm.
+target_link_libraries(saxpy PUBLIC CUDA::cublas CUDA::cudart m)
+target_compile_features(saxpy PRIVATE cxx_std_14)
+# The extra flag is only passed when compiling CUDA sources.
+# NOTE(review): the generator expression is unquoted and spans two lines;
+# confirm the embedded whitespace is handled as intended.
+target_compile_options(saxpy PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
+                                     --expt-relaxed-constexpr>)
+
+install(TARGETS saxpy)
--- a/pkgs/development/cuda-modules/packages/saxpy/src/saxpy.cu
+++ b/pkgs/development/cuda-modules/packages/saxpy/src/saxpy.cu
@@ -0,0 +1,68 @@
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <vector>
+
+#include <stdio.h>
+
+// Terminate the process with a diagnostic on stderr when a CUDA runtime call
+// did not return cudaSuccess. `context` is echoed verbatim in the message.
+static inline void check(cudaError_t err, const char *context) {
+  if (err == cudaSuccess) {
+    return;
+  }
+
+  fprintf(stderr, "CUDA error at %s: %s\n", context, cudaGetErrorString(err));
+  std::exit(EXIT_FAILURE);
+}
+
+// Wraps a CUDA call so that the failing expression itself becomes the context.
+#define CHECK(x) check(x, #x)
+
+// Kernel: y[i] = a * x[i] + y[i] for every i < n, one element per thread.
+// Threads whose global index is n or larger do nothing.
+__global__ void saxpy(int n, float a, float *x, float *y) {
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= n) {
+    return;
+  }
+  y[idx] = a * x[idx] + y[idx];
+}
+
+// Runs a 1024-element SAXPY (a = 2, x = 1, y = 2) on the GPU and compares the
+// result with the expected value 4. Progress is logged to unbuffered stderr.
+// Returns EXIT_SUCCESS only when every element matches; any CUDA error
+// terminates the process through CHECK.
+int main(void) {
+  setbuf(stderr, NULL);
+  fprintf(stderr, "Start\n");
+
+  int rtVersion, driverVersion;
+  CHECK(cudaRuntimeGetVersion(&rtVersion));
+  CHECK(cudaDriverGetVersion(&driverVersion));
+
+  fprintf(stderr, "Runtime version: %d\n", rtVersion);
+  fprintf(stderr, "Driver version: %d\n", driverVersion);
+
+  constexpr int N = 1 << 10;
+
+  std::vector<float> xHost(N), yHost(N);
+  for (int i = 0; i < N; i++) {
+    xHost[i] = 1.0f;
+    yHost[i] = 2.0f;
+  }
+
+  fprintf(stderr, "Host memory initialized, copying to the device\n");
+  fflush(stderr);
+
+  float *xDevice, *yDevice;
+  CHECK(cudaMalloc(&xDevice, N * sizeof(float)));
+  CHECK(cudaMalloc(&yDevice, N * sizeof(float)));
+
+  CHECK(cudaMemcpy(xDevice, xHost.data(), N * sizeof(float),
+                   cudaMemcpyHostToDevice));
+  CHECK(cudaMemcpy(yDevice, yHost.data(), N * sizeof(float),
+                   cudaMemcpyHostToDevice));
+  fprintf(stderr, "Scheduled a cudaMemcpy, calling the kernel\n");
+
+  // One thread per element, 256 threads per block, rounded up.
+  saxpy<<<(N + 255) / 256, 256>>>(N, 2.0f, xDevice, yDevice);
+  fprintf(stderr, "Scheduled a kernel call\n");
+  CHECK(cudaGetLastError());
+
+  CHECK(cudaMemcpy(yHost.data(), yDevice, N * sizeof(float),
+                   cudaMemcpyDeviceToHost));
+
+  // Expected result: 2 * 1 + 2 = 4 for every element. The std:: overloads are
+  // used so the float versions are selected regardless of which headers
+  // happen to declare global max/abs.
+  float maxError = 0.0f;
+  for (int i = 0; i < N; i++)
+    maxError = std::max(maxError, std::fabs(yHost[i] - 4.0f));
+  fprintf(stderr, "Max error: %f\n", maxError);
+
+  CHECK(cudaFree(xDevice));
+  CHECK(cudaFree(yDevice));
+
+  // Report a wrong result through the exit status so that callers running this
+  // binary as a check actually notice the failure.
+  if (maxError > 0.0f) {
+    fprintf(stderr, "Result mismatch\n");
+    return EXIT_FAILURE;
+  }
+  return EXIT_SUCCESS;
+}
--- a/pkgs/development/cuda-modules/packages/setupCudaHook/package.nix
+++ b/pkgs/development/cuda-modules/packages/setupCudaHook/package.nix
@@ -0,0 +1,14 @@
+# Currently propagated by cuda_nvcc or cudatoolkit, rather than used directly
+{ makeSetupHook, backendStdenv }:
+makeSetupHook {
+  name = "setup-cuda-hook";
+
+  substitutions.setupCudaHook = placeholder "out";
+
+  # Point NVCC at a compatible compiler
+  substitutions.ccRoot = "${backendStdenv.cc}";
+
+  # Required in addition to ccRoot as otherwise bin/gcc is looked up
+  # when building CMakeCUDACompilerId.cu
+  substitutions.ccFullPath = "${backendStdenv.cc}/bin/${backendStdenv.cc.targetPrefix}c++";
+} ./setup-cuda-hook.sh
--- a/pkgs/development/cuda-modules/packages/setupCudaHook/setup-cuda-hook.sh
+++ b/pkgs/development/cuda-modules/packages/setupCudaHook/setup-cuda-hook.sh
@@ -0,0 +1,128 @@
+# shellcheck shell=bash
+
+# Only run the hook from nativeBuildInputs
+if ! (( "$hostOffset" == -1 && "$targetOffset" == 0 )); then
+    return 0
+fi
+
+# Work out whether this is the first time the hook is being sourced; the
+# `cudaSetupHookOnce` marker is set further down on the first run.
+if [[ -n ${cudaSetupHookOnce-} ]]; then
+    guard=Skipping
+    reason=" because the hook has been propagated more than once"
+else
+    guard=Sourcing
+    reason=
+fi
+
+# Always log what is happening; include the offsets when NIX_DEBUG is enabled.
+if (( "${NIX_DEBUG:-0}" >= 1 )); then
+    echo "$guard hostOffset=$hostOffset targetOffset=$targetOffset setup-cuda-hook$reason" >&2
+else
+    echo "$guard setup-cuda-hook$reason" >&2
+fi
+
+# Nothing below may run twice.
+if [[ "$guard" != Sourcing ]]; then
+    return 0
+fi
+
+# Global state shared by the hook functions below:
+#   cudaHostPathsSeen: set of dependency paths carrying the CUDA marker file
+#   cudaOutputToPath:  marker-file output name -> dependency path
+declare -g cudaSetupHookOnce=1
+declare -Ag cudaHostPathsSeen=()
+declare -Ag cudaOutputToPath=()
+
+# Env hook, called with a dependency path as $1.
+# Registers the path in `cudaHostPathsSeen` when it contains the
+# `nix-support/include-in-cudatoolkit-root` marker file, and maps the output
+# name stored in that file to the path in `cudaOutputToPath`.
+extendcudaHostPathsSeen() {
+    if (( "${NIX_DEBUG:-0}" >= 1 )); then
+        echo "extendcudaHostPathsSeen $1" >&2
+    fi
+
+    local marker="$1/nix-support/include-in-cudatoolkit-root"
+
+    # No marker file, or path already registered: nothing to do.
+    if [[ ! -f "${marker}" ]]; then
+        return 0
+    fi
+    if [[ -v cudaHostPathsSeen[$1] ]]; then
+        return 0
+    fi
+
+    cudaHostPathsSeen["$1"]=1
+
+    # E.g. cuda_cudart-lib
+    local outputName
+    # Fail gracefully if the file is empty.
+    # One reason the file may be empty: the package was built with strictDeps set, but the current build does not have
+    # strictDeps set.
+    if ! read -r outputName < "$marker"; then
+        return 0
+    fi
+    if [[ -z "$outputName" ]]; then
+        return 0
+    fi
+
+    # Warn when two different paths claim the same output name; the last one wins.
+    local previous="${cudaOutputToPath[$outputName]-}"
+    if [[ -n "$previous" ]]; then
+        echo "extendcudaHostPathsSeen: warning: overwriting $outputName from $previous to $1" >&2
+    fi
+    cudaOutputToPath["$outputName"]="$1"
+}
+addEnvHooks "$targetOffset" extendcudaHostPathsSeen
+
+# preConfigure hook: build the semicolon-separated `CUDAToolkit_ROOT` and
+# `CUDAToolkit_INCLUDE_DIR` lists from every path recorded in
+# `cudaHostPathsSeen`, then hand the non-empty ones to CMake.
+setupCUDAToolkit_ROOT() {
+    if (( "${NIX_DEBUG:-0}" >= 1 )); then
+        echo "setupCUDAToolkit_ROOT: cudaHostPathsSeen=${!cudaHostPathsSeen[*]}" >&2
+    fi
+
+    for path in "${!cudaHostPathsSeen[@]}" ; do
+        addToSearchPathWithCustomDelimiter ";" CUDAToolkit_ROOT "$path"
+        # Only paths that actually ship headers contribute an include dir.
+        [[ -d "$path/include" ]] || continue
+        addToSearchPathWithCustomDelimiter ";" CUDAToolkit_INCLUDE_DIR "$path/include"
+    done
+
+    # Use array form so semicolon-separated lists are passed safely.
+    # The include dir flag is emitted first, then the root flag.
+    local cmakeVar
+    for cmakeVar in CUDAToolkit_INCLUDE_DIR CUDAToolkit_ROOT ; do
+        if [[ -n "${!cmakeVar-}" ]]; then
+            cmakeFlagsArray+=("-D${cmakeVar}=${!cmakeVar}")
+        fi
+    done
+}
+preConfigureHooks+=(setupCUDAToolkit_ROOT)
+
+setupCUDAToolkitCompilers() {
+    echo Executing setupCUDAToolkitCompilers >&2
+
+    if [[ -n "${dontSetupCUDAToolkitCompilers-}" ]] ; then
+        return 0
+    fi
+
+    # Point NVCC at a compatible compiler
+
+    # For CMake-based projects:
+    # https://cmake.org/cmake/help/latest/module/FindCUDA.html#input-variables
+    # https://cmake.org/cmake/help/latest/envvar/CUDAHOSTCXX.html
+    # https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_HOST_COMPILER.html
+
+    appendToVar cmakeFlags "-DCUDA_HOST_COMPILER=@ccFullPath@"
+    appendToVar cmakeFlags "-DCMAKE_CUDA_HOST_COMPILER=@ccFullPath@"
+
+    # For non-CMake projects:
+    # We prepend --compiler-bindir to nvcc flags.
+    # Downstream packages can override these, because NVCC
+    # uses the last --compiler-bindir it gets on the command line.
+    # FIXME: this results in "incompatible redefinition" warnings.
+    # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#compiler-bindir-directory-ccbin
+    if [ -z "${CUDAHOSTCXX-}" ]; then
+      export CUDAHOSTCXX="@ccFullPath@";
+    fi
+
+    appendToVar NVCC_PREPEND_FLAGS "--compiler-bindir=@ccRoot@/bin"
+
+    # NOTE: We set -Xfatbin=-compress-all, which reduces the size of the compiled
+    #   binaries. If binaries grow over 2GB, they will fail to link. This is a problem for us, as
+    #   the default set of CUDA capabilities we build can regularly cause this to occur (for
+    #   example, with Magma).
+    #
+    # @SomeoneSerge: original comment was made by @ConnorBaker in .../cudatoolkit/common.nix
+    if [[ -z "${dontCompressFatbin-}" ]]; then
+        appendToVar NVCC_PREPEND_FLAGS "-Xfatbin=-compress-all"
+    fi
+}
+preConfigureHooks+=(setupCUDAToolkitCompilers)
+
+propagateCudaLibraries() {
+    (( "${NIX_DEBUG:-0}" >= 1 )) && echo "propagateCudaLibraries: cudaPropagateToOutput=$cudaPropagateToOutput cudaHostPathsSeen=${!cudaHostPathsSeen[*]}" >&2
+
+    [[ -z "${cudaPropagateToOutput-}" ]] && return 0
+
+    mkdir -p "${!cudaPropagateToOutput}/nix-support"
+    # One'd expect this should be propagated-bulid-build-deps, but that doesn't seem to work
+    echo "@setupCudaHook@" >> "${!cudaPropagateToOutput}/nix-support/propagated-native-build-inputs"
+
+    local propagatedBuildInputs=( "${!cudaHostPathsSeen[@]}" )
+    for output in $(getAllOutputNames) ; do
+        if [[ ! "$output" = "$cudaPropagateToOutput" ]] ; then
+            appendToVar propagatedBuildInputs "${!output}"
+        fi
+        break
+    done
+
+    # One'd expect this should be propagated-host-host-deps, but that doesn't seem to work
+    printWords "${propagatedBuildInputs[@]}" >> "${!cudaPropagateToOutput}/nix-support/propagated-build-inputs"
+}
+postFixupHooks+=(propagateCudaLibraries)
--- a/pkgs/development/cuda-modules/packages/writeGpuTestPython.nix
+++ b/pkgs/development/cuda-modules/packages/writeGpuTestPython.nix
@@ -0,0 +1,77 @@
+# writeGpuTestPython: given options and then a Python script (`content`),
+# produce a `tester-${name}` derivation containing the script as an executable,
+# with a `gpuCheck` passthru derivation that runs it.
+{
+  lib,
+  runCommand,
+  python3Packages,
+  makeWrapper,
+  writableTmpDirAsHomeHook,
+}:
+{
+  # Added to `requiredSystemFeatures` of `gpuCheck`; `null` adds nothing.
+  feature ? "cuda",
+  # Suffix of the derivation names `tester-${name}` and `test-${name}`.
+  name ? if feature == null then "cpu" else feature,
+  libraries ? [ ], # [PythonPackage] | (PackageSet -> [PythonPackage])
+  # Extra attributes merged into the `gpuCheck` runCommand environment.
+  gpuCheckArgs ? { },
+  # All remaining attributes (everything in `args` except `gpuCheckArgs`,
+  # `libraries` and `name`) are forwarded to the `tester` runCommand.
+  ...
+}@args:
+
+let
+  inherit (builtins) isFunction all;
+  # Normalise `libraries` to the function form passed to `withPackages`.
+  librariesFun = if isFunction libraries then libraries else (_: libraries);
+in
+
+# Only a plain list can be validated here; a function is accepted as-is.
+assert lib.assertMsg (
+  isFunction libraries || all (python3Packages.hasPythonModule) libraries
+) "writeGpuTestPython was passed `libraries` from the wrong python release";
+
+# The script source; it is appended after the generated shebang line.
+content:
+
+let
+  # Python interpreter bundled with the requested libraries; used as the shebang of the tester.
+  interpreter = python3Packages.python.withPackages librariesFun;
+  # Derivation that installs `content` as an executable at `$out/bin/$name`.
+  # The builder script prepends a shebang pointing at `interpreter`, and wraps the
+  # program only when `makeWrapperArgs` is set and non-empty.
+  tester =
+    runCommand "tester-${name}"
+      (
+        # Forward caller-supplied attributes, minus the ones consumed by this function.
+        lib.removeAttrs args [
+          "gpuCheckArgs"
+          "libraries"
+          "name"
+        ]
+        // {
+          # `content` is passed as a file, so the script reads it from `$contentPath`.
+          inherit content;
+          nativeBuildInputs = args.nativeBuildInputs or [ ] ++ [ makeWrapper ];
+          passAsFile = args.passAsFile or [ ] ++ [ "content" ];
+        }
+      )
+      ''
+        mkdir -p "$out"/bin
+        cat << EOF >"$out/bin/$name"
+        #!${lib.getExe interpreter}
+        EOF
+        cat "$contentPath" >>"$out/bin/$name"
+        chmod +x "$out/bin/$name"
+
+        if [[ -n "''${makeWrapperArgs+''${makeWrapperArgs[@]}}" ]] ; then
+          wrapProgram "$out/bin/$name" ''${makeWrapperArgs[@]}
+        fi
+      '';
+  # `tester` with a `gpuCheck` passthru attached: a derivation that runs the
+  # tester program and succeeds (creates `$out`) only if the program exits cleanly.
+  # It refers to `tester'` itself so the check runs the final, overridden package.
+  tester' = tester.overrideAttrs (oldAttrs: {
+    passthru.gpuCheck =
+      runCommand "test-${name}"
+        (
+          gpuCheckArgs
+          // {
+            nativeBuildInputs = [
+              tester'
+              # Give the test a writable $HOME: this hook was accepted as an
+              # argument of the file but never wired into the check.
+              writableTmpDirAsHomeHook
+            ]
+            ++ gpuCheckArgs.nativeBuildInputs or [ ];
+
+            # Require the GPU system feature (if any) in addition to whatever the caller asked for.
+            requiredSystemFeatures =
+              lib.optionals (feature != null) [ feature ] ++ gpuCheckArgs.requiredSystemFeatures or [ ];
+          }
+        )
+        ''
+          set -e
+          ${tester.meta.mainProgram or (lib.getName tester')}
+          touch $out
+        '';
+  });
+in
+tester'