push sheeet

2025-10-09 14:15:47 +02:00
commit 646b892680
49168 changed files with 5897842 additions and 0 deletions
--- a/pkgs/development/rocm-modules/6/composable_kernel/avoid-extra-host-compile.patch
+++ b/pkgs/development/rocm-modules/6/composable_kernel/avoid-extra-host-compile.patch
@@ -0,0 +1,40 @@
+From 6ac72ec84269737626b1f5e43e64729f0922d182 Mon Sep 17 00:00:00 2001
+From: "Ding, Yi" <yi.ding@amd.com>
+Date: Wed, 9 Jul 2025 03:12:39 +0000
+Subject: [PATCH] Avoid compile kernel in host pass
+
+---
+ include/ck_tile/host/kernel_launch.hpp | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+diff --git a/include/ck_tile/host/kernel_launch.hpp b/include/ck_tile/host/kernel_launch.hpp
+index 9770e99738..f6ccb6968b 100644
+--- a/include/ck_tile/host/kernel_launch.hpp
+++ b/include/ck_tile/host/kernel_launch.hpp
+@@ -4,11 +4,12 @@
+ #pragma once
+ 
+ #include "ck_tile/core/config.hpp"
+-#include "ck_tile/host/stream_config.hpp"
+#include "ck_tile/core/utility/ignore.hpp"
+ #include "ck_tile/host/hip_check_error.hpp"
+#include "ck_tile/host/stream_config.hpp"
+ #include "ck_tile/host/timer.hpp"
+-#include <hip/hip_runtime.h>
+ #include <cstddef>
+#include <hip/hip_runtime.h>
+ 
+ namespace ck_tile {
+ 
+@@ -24,7 +25,11 @@ __launch_bounds__(MaxThreadPerBlock, MinBlockPerCu)
+ #endif
+     __global__ void kentry(Args... args)
+ {
+#if defined(__HIP_DEVICE_COMPILE__)
+     Kernel{}(args...);
+#else
+    (..., (ignore = args, 0));
+#endif
+ }
+ 
+ //
--- a/pkgs/development/rocm-modules/6/composable_kernel/base.nix
+++ b/pkgs/development/rocm-modules/6/composable_kernel/base.nix
@@ -0,0 +1,198 @@
+{
+  lib,
+  stdenv,
+  fetchFromGitHub,
+  rocmUpdateScript,
+  cmake,
+  rocm-cmake,
+  rocm-merged-llvm,
+  clr,
+  rocminfo,
+  python3,
+  hipify,
+  gitMinimal,
+  gtest,
+  zstd,
+  buildTests ? false,
+  buildExamples ? false,
+  gpuTargets ? (
+    clr.localGpuTargets or [
+      "gfx900"
+      "gfx906"
+      "gfx908"
+      "gfx90a"
+      "gfx942"
+      "gfx1030"
+      "gfx1100"
+      "gfx1101"
+      "gfx1102"
+      "gfx1200"
+      "gfx1201"
+    ]
+  ),
+}:
+
+# TODO: in 7.x CK is likely to gain support for
+# a) miopen kernel only build (MIOPEN_REQ_LIBS_ONLY)
+# b) header only build (useful for torch) https://github.com/ROCm/composable_kernel/issues/2030
+# that will likely allow us to get rid of this complicated split part build!
+stdenv.mkDerivation (finalAttrs: {
+  preBuild = ''
+    echo "This derivation isn't intended to be built directly and only exists to be overridden and built in chunks";
+    exit 1
+  '';
+
+  pname = "composable_kernel_base";
+  version = "6.4-unstable-2025-05-22";
+
+  outputs = [
+    "out"
+  ]
+  ++ lib.optionals buildTests [
+    "test"
+  ]
+  ++ lib.optionals buildExamples [
+    "example"
+  ];
+
+  src = fetchFromGitHub {
+    owner = "ROCm";
+    repo = "composable_kernel";
+    # Using a dev snapshot, trying to get MIOpen to work
+    rev = "bc2551ac3b27edc31f20863e3a873508fb73aad2";
+    hash = "sha256-bfmwbgR1ya+zkME3wOyaZX/e+1+ie0sSlugK/kozLsI=";
+  };
+
+  nativeBuildInputs = [
+    # Deliberately not using ninja
+    # because we're jankily composing build outputs from multiple drvs
+    # ninja won't believe they're up to date
+    gitMinimal
+    cmake
+    rocminfo
+    clr
+    hipify
+    zstd
+    python3
+  ];
+
+  buildInputs = [
+    rocm-cmake
+    clr
+    zstd
+  ];
+
+  strictDeps = true;
+  enableParallelBuilding = true;
+  env.ROCM_PATH = clr;
+  env.HIP_CLANG_PATH = "${rocm-merged-llvm}/bin";
+
+  cmakeFlags = [
+    "-DCMAKE_MODULE_PATH=${clr}/hip/cmake"
+    "-DCMAKE_BUILD_TYPE=Release"
+    "-DCMAKE_POLICY_DEFAULT_CMP0069=NEW"
+    # "-DDL_KERNELS=ON" # Not needed, slow to build
+    # CK_USE_CODEGEN Required for migraphx which uses device_gemm_multiple_d.hpp
+    # but migraphx requires an incompatible fork of CK and fails anyway
+    # "-DCK_USE_CODEGEN=ON"
+    # It might be worth skipping fp64 in future with this:
+    # "-DDTYPES=fp32;fp16;fp8;bf16;int8"
+    # Manually define CMAKE_INSTALL_<DIR>
+    # See: https://github.com/NixOS/nixpkgs/pull/197838
+    "-DCMAKE_INSTALL_BINDIR=bin"
+    "-DCMAKE_INSTALL_LIBDIR=lib"
+    "-DCMAKE_INSTALL_INCLUDEDIR=include"
+    "-DBUILD_DEV=OFF"
+    "-DBUILD_MHA_LIB=ON"
+    "-DROCM_PATH=${clr}"
+    "-DENABLE_CLANG_CPP_CHECKS=OFF"
+    "-DCMAKE_HIP_COMPILER_ROCM_ROOT=${clr}"
+
+    # FP8 can build for 908/90a but very slow build
+    # and produces unusably slow kernels that are huge
+    "-DCK_USE_FP8_ON_UNSUPPORTED_ARCH=OFF"
+  ]
+  ++ lib.optionals (gpuTargets != [ ]) [
+    # We intentionally set GPU_ARCHS and not AMD/GPU_TARGETS
+    # per readme this is required if archs are dissimilar
+    # In rocm-6.3.x not setting any arch flag worked
+    # but setting dissimilar arches always failed
+    "-DGPU_ARCHS=${lib.concatStringsSep ";" gpuTargets}"
+  ]
+  ++ lib.optionals buildTests [
+    "-DGOOGLETEST_DIR=${gtest.src}" # Custom linker names
+  ];
+
+  patches = [
+    # Significant build performance improvement
+    ./avoid-extra-host-compile.patch
+  ];
+
+  # No flags to build selectively it seems...
+  postPatch =
+    # Reduce configure time by preventing thousands of clang-tidy targets being added
+    # We will never call them
+    # Never build profiler
+    ''
+      substituteInPlace library/src/utility/CMakeLists.txt library/src/tensor_operation_instance/gpu/CMakeLists.txt \
+        --replace-fail clang_tidy_check '#clang_tidy_check'
+      substituteInPlace CMakeLists.txt \
+        --replace-fail "add_subdirectory(profiler)" ""
+      substituteInPlace cmake/EnableCompilerWarnings.cmake \
+        --replace-fail "-Werror" ""
+
+      # Apply equivalent change to https://github.com/ROCm/composable_kernel/pull/2564
+      # TODO: Remove after ROCm 7.1
+      find include/ck/tensor_operation/ -type f -name "*.hpp" -exec sed -i \
+        -e 's/!defined(__HIP_DEVICE_COMPILE__)/false/g' \
+        {} +
+    ''
+    # Optionally remove tests
+    + lib.optionalString (!buildTests) ''
+      substituteInPlace CMakeLists.txt \
+        --replace-fail "add_subdirectory(test)" ""
+      substituteInPlace codegen/CMakeLists.txt \
+        --replace-fail "include(ROCMTest)" ""
+    ''
+    # Optionally remove examples
+    + lib.optionalString (!buildExamples) ''
+      substituteInPlace CMakeLists.txt \
+        --replace-fail "add_subdirectory(example)" ""
+    '';
+
+  postInstall =
+    lib.optionalString buildTests ''
+      mkdir -p $test/bin
+      mv $out/bin/test_* $test/bin
+    ''
+    + lib.optionalString buildExamples ''
+      mkdir -p $example/bin
+      mv $out/bin/example_* $example/bin
+    '';
+
+  passthru = {
+    inherit gpuTargets;
+    updateScript = rocmUpdateScript {
+      name = finalAttrs.pname;
+      inherit (finalAttrs.src) owner;
+      inherit (finalAttrs.src) repo;
+    };
+    anyGfx9Target = lib.lists.any (lib.strings.hasPrefix "gfx9") gpuTargets;
+    anyMfmaTarget =
+      (lib.lists.intersectLists gpuTargets [
+        "gfx908"
+        "gfx90a"
+        "gfx942"
+        "gfx950"
+      ]) != [ ];
+  };
+
+  meta = with lib; {
+    description = "Performance portable programming model for machine learning tensor operators";
+    homepage = "https://github.com/ROCm/composable_kernel";
+    license = with licenses; [ mit ];
+    teams = [ teams.rocm ];
+    platforms = platforms.linux;
+    broken = true; # this base package shouldn't be built directly
+  };
+})
--- a/pkgs/development/rocm-modules/6/composable_kernel/ck4inductor.nix
+++ b/pkgs/development/rocm-modules/6/composable_kernel/ck4inductor.nix
@@ -0,0 +1,43 @@
+{
+  buildPythonPackage,
+  python,
+  composable_kernel,
+  lib,
+  setuptools,
+  setuptools-scm,
+  rocm-merged-llvm,
+}:
+buildPythonPackage {
+  pyproject = true;
+  pname = "ck4inductor";
+  build-system = [
+    setuptools
+    setuptools-scm
+  ];
+  version = "6.4.3";
+  inherit (composable_kernel) src;
+  pythonImportsCheck = [
+    "ck4inductor"
+    "ck4inductor.universal_gemm.gen_instances"
+    "ck4inductor.universal_gemm.gen_instances"
+    "ck4inductor.universal_gemm.op"
+  ];
+  propagatedBuildInputs = [
+    # At runtime will fail to compile anything with ck4inductor without this
+    # can't easily use in checks phase because most of the compiler machinery is in torch
+    rocm-merged-llvm
+  ];
+  checkPhase = ''
+    if [ ! -d "$out/${python.sitePackages}/ck4inductor" ]; then
+      echo "ck4inductor isn't at the expected location in $out/${python.sitePackages}/ck4inductor"
+      exit 1
+    fi
+  '';
+  meta = with lib; {
+    description = "Pytorch inductor backend which uses composable_kernel universal GEMM implementations";
+    homepage = "https://github.com/ROCm/composable_kernel";
+    license = with licenses; [ mit ];
+    teams = [ teams.rocm ];
+    platforms = platforms.linux;
+  };
+}
--- a/pkgs/development/rocm-modules/6/composable_kernel/default.nix
+++ b/pkgs/development/rocm-modules/6/composable_kernel/default.nix
@@ -0,0 +1,290 @@
+{
+  lib,
+  clr,
+  composable_kernel_base,
+}:
+
+let
+  parts = {
+    _mha = {
+      # mha takes ~3hrs on 64 cores on an EPYC milan system at ~2.5GHz
+      # big-parallel builders are one gen newer and clocked ~30% higher but only 24 cores
+      # Should be <10h timeout but might be cutting it close
+      # TODO: work out how to split this into smaller chunks instead of all 3k mha instances together
+      # mha_0,1,2, search ninja target file for the individual instances, split by the index?
+      # TODO: can we prune the generated instances down to only what in practice are used with popular models
+      # when using flash-attention + MHA kernels?
+      targets = [
+        "device_mha_instance"
+      ];
+      extraCmakeFlags = [ "-DHIP_CLANG_NUM_PARALLEL_JOBS=2" ];
+    };
+    gemm_multiply_multiply = {
+      targets = [
+        "device_gemm_multiply_multiply_instance"
+      ];
+      extraCmakeFlags = [ "-DHIP_CLANG_NUM_PARALLEL_JOBS=2" ];
+      onlyFor = [
+        "gfx942"
+        "gfx950"
+      ];
+    };
+    gemm_multiply_multiply_wp = {
+      targets = [
+        "device_gemm_multiply_multiply_wp_instance"
+      ];
+      extraCmakeFlags = [ "-DHIP_CLANG_NUM_PARALLEL_JOBS=2" ];
+      onlyFor = [
+        "gfx942"
+        "gfx950"
+      ];
+    };
+    grouped_conv_bwd = {
+      targets = [
+        "device_grouped_conv1d_bwd_weight_instance"
+        "device_grouped_conv2d_bwd_data_instance"
+        "device_grouped_conv2d_bwd_weight_instance"
+      ];
+    };
+    grouped_conv_fwd = {
+      targets = [
+        "device_grouped_conv1d_fwd_instance"
+        "device_grouped_conv2d_fwd_instance"
+        "device_grouped_conv2d_fwd_bias_relu_instance"
+        "device_grouped_conv2d_fwd_dynamic_op_instance"
+      ];
+    };
+    grouped_conv_bwd_3d1 = {
+      targets = [
+        "device_grouped_conv3d_bwd_data_instance"
+        "device_grouped_conv3d_bwd_data_bilinear_instance"
+        "device_grouped_conv3d_bwd_data_scale_instance"
+      ];
+    };
+    grouped_conv_bwd_3d2 = {
+      targets = [
+        "device_grouped_conv3d_bwd_weight_instance"
+        "device_grouped_conv3d_bwd_weight_bilinear_instance"
+        "device_grouped_conv3d_bwd_weight_scale_instance"
+      ];
+    };
+    grouped_conv_fwd_3d1 = {
+      targets = [
+        "device_grouped_conv3d_fwd_instance"
+        "device_grouped_conv3d_fwd_bias_relu_instance"
+        "device_grouped_conv3d_fwd_bilinear_instance"
+        "device_grouped_conv3d_fwd_convinvscale_instance"
+        "device_grouped_conv3d_fwd_convscale_instance"
+        "device_grouped_conv3d_fwd_convscale_add_instance"
+      ];
+    };
+    grouped_conv_fwd_3d2 = {
+      targets = [
+        "device_grouped_conv3d_fwd_convscale_relu_instance"
+        "device_grouped_conv3d_fwd_dynamic_op_instance"
+        "device_grouped_conv3d_fwd_scale_instance"
+        "device_grouped_conv3d_fwd_scaleadd_ab_instance"
+        "device_grouped_conv3d_fwd_scaleadd_scaleadd_relu_instance"
+      ];
+    };
+    batched_gemm1 = {
+      targets = [
+        "device_batched_gemm_instance"
+        "device_batched_gemm_b_scale_instance"
+        "device_batched_gemm_multi_d_instance"
+        "device_batched_gemm_add_relu_gemm_add_instance"
+        "device_batched_gemm_bias_permute_instance"
+        "device_batched_gemm_gemm_instance"
+        "device_batched_gemm_reduce_instance"
+        "device_batched_gemm_softmax_gemm_instance"
+      ];
+    };
+    batched_gemm2 = {
+      targets = [
+        "device_batched_gemm_softmax_gemm_permute_instance"
+        "device_grouped_gemm_instance"
+        "device_grouped_gemm_bias_instance"
+        "device_grouped_gemm_fastgelu_instance"
+        "device_grouped_gemm_fixed_nk_instance"
+        "device_grouped_gemm_fixed_nk_multi_abd_instance"
+        "device_grouped_gemm_tile_loop_instance"
+      ];
+    };
+    gemm_universal1 = {
+      targets = [
+        "device_gemm_universal_instance"
+        "device_gemm_universal_batched_instance"
+      ];
+    };
+    gemm_universal2 = {
+      targets = [
+        "device_gemm_universal_reduce_instance"
+        "device_gemm_universal_streamk_instance"
+      ];
+    };
+    gemm_other1 = {
+      targets = [
+        "device_gemm_instance"
+        "device_gemm_b_scale_instance"
+        "device_gemm_ab_scale_instance"
+        "device_gemm_add_instance"
+        "device_gemm_add_add_fastgelu_instance"
+        "device_gemm_add_fastgelu_instance"
+        "device_gemm_add_multiply_instance"
+        "device_gemm_add_relu_instance"
+      ];
+    };
+    gemm_other2 = {
+      targets = [
+        "device_gemm_add_relu_add_layernorm_instance"
+        "device_gemm_add_silu_instance"
+        "device_gemm_bias_add_reduce_instance"
+        "device_gemm_bilinear_instance"
+        "device_gemm_fastgelu_instance"
+        "device_gemm_multi_abd_instance"
+        "device_gemm_multiply_add_instance"
+        "device_gemm_reduce_instance"
+        "device_gemm_splitk_instance"
+        "device_gemm_streamk_instance"
+      ];
+    };
+    conv = {
+      targets = [
+        "device_conv1d_bwd_data_instance"
+        "device_conv2d_bwd_data_instance"
+        "device_conv2d_fwd_instance"
+        "device_conv2d_fwd_bias_relu_instance"
+        "device_conv2d_fwd_bias_relu_add_instance"
+        "device_conv3d_bwd_data_instance"
+      ];
+    };
+    pool = {
+      targets = [
+        "device_avg_pool2d_bwd_instance"
+        "device_avg_pool3d_bwd_instance"
+        "device_pool2d_fwd_instance"
+        "device_pool3d_fwd_instance"
+        "device_max_pool_bwd_instance"
+      ];
+    };
+    other1 = {
+      targets = [
+        "device_batchnorm_instance"
+        "device_contraction_bilinear_instance"
+        "device_contraction_scale_instance"
+        "device_elementwise_instance"
+        "device_elementwise_normalization_instance"
+      ];
+    };
+    other2 = {
+      targets = [
+        "device_column_to_image_instance"
+        "device_image_to_column_instance"
+        "device_permute_scale_instance"
+        "device_quantization_instance"
+        "device_reduce_instance"
+      ];
+    };
+    other3 = {
+      targets = [
+        "device_normalization_bwd_data_instance"
+        "device_normalization_bwd_gamma_beta_instance"
+        "device_normalization_fwd_instance"
+        "device_softmax_instance"
+        "device_transpose_instance"
+      ];
+    };
+  };
+  tensorOpBuilder =
+    {
+      part,
+      targets,
+      extraCmakeFlags ? [ ],
+      requiredSystemFeatures ? [ "big-parallel" ],
+      onlyFor ? [ ],
+    }:
+    let
+      supported =
+        onlyFor == [ ] || (lib.lists.intersectLists composable_kernel_base.gpuTargets onlyFor) != [ ];
+    in
+    if supported then
+      (composable_kernel_base.overrideAttrs (old: {
+        inherit requiredSystemFeatures;
+        pname = "composable_kernel${clr.gpuArchSuffix}-${part}";
+        makeTargets = targets;
+        preBuild = ''
+          echo "Building ${part}"
+          makeFlagsArray+=($makeTargets)
+          substituteInPlace $(find ./ -name "Makefile" -type f) \
+            --replace-fail '.NOTPARALLEL:' '.UNUSED_NOTPARALLEL:'
+        '';
+
+        # Compile parallelism adjusted based on available RAM
+        # Never uses less than NIX_BUILD_CORES/4, never uses more than NIX_BUILD_CORES
+        # CK uses an unusually high amount of memory per core in the build step
+        # Nix/nixpkgs doesn't really have any infra to tell it that this build is unusually memory hungry
+        # So, bodge. Otherwise you end up having to build all of ROCm with a low core limit when
+        # it's only this package that has trouble.
+        preConfigure = old.preConfigure or "" + ''
+          MEM_GB_TOTAL=$(awk '/MemTotal/ { printf "%d \n", $2/1024/1024 }' /proc/meminfo)
+          MEM_GB_AVAILABLE=$(awk '/MemAvailable/ { printf "%d \n", $2/1024/1024 }' /proc/meminfo)
+          APPX_GB=$((MEM_GB_AVAILABLE > MEM_GB_TOTAL ? MEM_GB_TOTAL : MEM_GB_AVAILABLE))
+          MAX_CORES=$((1 + APPX_GB/3))
+          MAX_CORES=$((MAX_CORES < NIX_BUILD_CORES/3 ? NIX_BUILD_CORES/3 : MAX_CORES))
+          export NIX_BUILD_CORES="$((NIX_BUILD_CORES > MAX_CORES ? MAX_CORES : NIX_BUILD_CORES))"
+          echo "Picked new core limit NIX_BUILD_CORES=$NIX_BUILD_CORES based on available mem: $APPX_GB GB"
+          cmakeFlagsArray+=(
+            "-DCK_PARALLEL_COMPILE_JOBS=$NIX_BUILD_CORES"
+          )
+        '';
+        cmakeFlags = old.cmakeFlags ++ extraCmakeFlags;
+        # Early exit after build phase with success, skips fixups etc
+        # Will get copied back into /build of the final CK
+        postBuild = ''
+          find . -name "*.o" -type f | while read -r file; do
+            mkdir -p "$out/$(dirname "$file")"
+            cp --reflink=auto "$file" "$out/$file"
+          done
+          exit 0
+        '';
+        meta = old.meta // {
+          broken = false;
+        };
+      }))
+    else
+      null;
+  composable_kernel_parts = builtins.mapAttrs (
+    part: targets: tensorOpBuilder (targets // { inherit part; })
+  ) parts;
+in
+
+composable_kernel_base.overrideAttrs (
+  finalAttrs: old: {
+    pname = "composable_kernel${clr.gpuArchSuffix}";
+    parts_dirs = builtins.filter (x: x != null) (builtins.attrValues composable_kernel_parts);
+    disallowedReferences = builtins.filter (x: x != null) (builtins.attrValues composable_kernel_parts);
+    preBuild = ''
+      for dir in $parts_dirs; do
+        find "$dir" -type f -name "*.o" | while read -r file; do
+          # Extract the relative path by removing the output directory prefix
+          rel_path="''${file#"$dir/"}"
+
+          # Create parent directory if it doesn't exist
+          mkdir -p "$(dirname "$rel_path")"
+
+          # Copy the file back to its original location, give it a future timestamp
+          # so make treats it as up to date
+          cp --reflink=auto --no-preserve=all "$file" "$rel_path"
+          touch -d "now +10 hours" "$rel_path"
+        done
+      done
+    '';
+    passthru = old.passthru // {
+      parts = composable_kernel_parts;
+    };
+    meta = old.meta // {
+      # Builds without any gfx9 fail
+      broken = !finalAttrs.passthru.anyGfx9Target;
+    };
+  }
+)