push sheeet
Some checks failed
Periodic Merges (6h) / master → staging-nixos (push) Failing after 12m50s
Periodic Merges (6h) / master → staging-next (push) Failing after 12m54s
Periodic Merges (24h) / merge-base(master,staging) → haskell-updates (push) Failing after 11m54s
Periodic Merges (6h) / staging-next → staging (push) Failing after 12m13s
Periodic Merges (24h) / staging-next-25.05 → staging-25.05 (push) Failing after 13m24s
Periodic Merges (24h) / release-25.05 → staging-next-25.05 (push) Failing after 14m28s

This commit is contained in:
Dark Steveneq
2025-10-09 14:15:47 +02:00
commit 646b892680
49168 changed files with 5897842 additions and 0 deletions

View File

@@ -0,0 +1,40 @@
From 6ac72ec84269737626b1f5e43e64729f0922d182 Mon Sep 17 00:00:00 2001
From: "Ding, Yi" <yi.ding@amd.com>
Date: Wed, 9 Jul 2025 03:12:39 +0000
Subject: [PATCH] Avoid compile kernel in host pass
---
include/ck_tile/host/kernel_launch.hpp | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/include/ck_tile/host/kernel_launch.hpp b/include/ck_tile/host/kernel_launch.hpp
index 9770e99738..f6ccb6968b 100644
--- a/include/ck_tile/host/kernel_launch.hpp
+++ b/include/ck_tile/host/kernel_launch.hpp
@@ -4,11 +4,12 @@
#pragma once
#include "ck_tile/core/config.hpp"
-#include "ck_tile/host/stream_config.hpp"
+#include "ck_tile/core/utility/ignore.hpp"
#include "ck_tile/host/hip_check_error.hpp"
+#include "ck_tile/host/stream_config.hpp"
#include "ck_tile/host/timer.hpp"
-#include <hip/hip_runtime.h>
#include <cstddef>
+#include <hip/hip_runtime.h>
namespace ck_tile {
@@ -24,7 +25,11 @@ __launch_bounds__(MaxThreadPerBlock, MinBlockPerCu)
#endif
__global__ void kentry(Args... args)
{
+#if defined(__HIP_DEVICE_COMPILE__)
Kernel{}(args...);
+#else
+ (..., (ignore = args, 0));
+#endif
}
//

View File

@@ -0,0 +1,198 @@
{
lib,
stdenv,
fetchFromGitHub,
rocmUpdateScript,
cmake,
rocm-cmake,
rocm-merged-llvm,
clr,
rocminfo,
python3,
hipify,
gitMinimal,
gtest,
zstd,
buildTests ? false,
buildExamples ? false,
gpuTargets ? (
clr.localGpuTargets or [
"gfx900"
"gfx906"
"gfx908"
"gfx90a"
"gfx942"
"gfx1030"
"gfx1100"
"gfx1101"
"gfx1102"
"gfx1200"
"gfx1201"
]
),
}:
# TODO: in 7.x CK is likely to gain support for
# a) miopen kernel only build (MIOPEN_REQ_LIBS_ONLY)
# b) header only build (useful for torch) https://github.com/ROCm/composable_kernel/issues/2030
# that will likely allow us to get rid of this complicated split part build!
stdenv.mkDerivation (finalAttrs: {
preBuild = ''
echo "This derivation isn't intended to be built directly and only exists to be overridden and built in chunks";
exit 1
'';
pname = "composable_kernel_base";
version = "6.4-unstable-2025-05-22";
outputs = [
"out"
]
++ lib.optionals buildTests [
"test"
]
++ lib.optionals buildExamples [
"example"
];
src = fetchFromGitHub {
owner = "ROCm";
repo = "composable_kernel";
# Using a dev snapshot, trying to get MIOpen to work
rev = "bc2551ac3b27edc31f20863e3a873508fb73aad2";
hash = "sha256-bfmwbgR1ya+zkME3wOyaZX/e+1+ie0sSlugK/kozLsI=";
};
nativeBuildInputs = [
# Deliberately not using ninja
# because we're jankily composing build outputs from multiple drvs
# ninja won't believe they're up to date
gitMinimal
cmake
rocminfo
clr
hipify
zstd
python3
];
buildInputs = [
rocm-cmake
clr
zstd
];
strictDeps = true;
enableParallelBuilding = true;
env.ROCM_PATH = clr;
env.HIP_CLANG_PATH = "${rocm-merged-llvm}/bin";
cmakeFlags = [
"-DCMAKE_MODULE_PATH=${clr}/hip/cmake"
"-DCMAKE_BUILD_TYPE=Release"
"-DCMAKE_POLICY_DEFAULT_CMP0069=NEW"
# "-DDL_KERNELS=ON" # Not needed, slow to build
# CK_USE_CODEGEN Required for migraphx which uses device_gemm_multiple_d.hpp
# but migraphx requires an incompatible fork of CK and fails anyway
# "-DCK_USE_CODEGEN=ON"
# It might be worth skipping fp64 in future with this:
# "-DDTYPES=fp32;fp16;fp8;bf16;int8"
# Manually define CMAKE_INSTALL_<DIR>
# See: https://github.com/NixOS/nixpkgs/pull/197838
"-DCMAKE_INSTALL_BINDIR=bin"
"-DCMAKE_INSTALL_LIBDIR=lib"
"-DCMAKE_INSTALL_INCLUDEDIR=include"
"-DBUILD_DEV=OFF"
"-DBUILD_MHA_LIB=ON"
"-DROCM_PATH=${clr}"
"-DENABLE_CLANG_CPP_CHECKS=OFF"
"-DCMAKE_HIP_COMPILER_ROCM_ROOT=${clr}"
# FP8 can build for 908/90a but very slow build
# and produces unusably slow kernels that are huge
"-DCK_USE_FP8_ON_UNSUPPORTED_ARCH=OFF"
]
++ lib.optionals (gpuTargets != [ ]) [
# We intentionally set GPU_ARCHS and not AMD/GPU_TARGETS
# per readme this is required if archs are dissimilar
# In rocm-6.3.x not setting any arch flag worked
# but setting dissimilar arches always failed
"-DGPU_ARCHS=${lib.concatStringsSep ";" gpuTargets}"
]
++ lib.optionals buildTests [
"-DGOOGLETEST_DIR=${gtest.src}" # Custom linker names
];
patches = [
# Significant build performance improvement
./avoid-extra-host-compile.patch
];
# No flags to build selectively it seems...
postPatch =
# Reduce configure time by preventing thousands of clang-tidy targets being added
# We will never call them
# Never build profiler
''
substituteInPlace library/src/utility/CMakeLists.txt library/src/tensor_operation_instance/gpu/CMakeLists.txt \
--replace-fail clang_tidy_check '#clang_tidy_check'
substituteInPlace CMakeLists.txt \
--replace-fail "add_subdirectory(profiler)" ""
substituteInPlace cmake/EnableCompilerWarnings.cmake \
--replace-fail "-Werror" ""
# Apply equivalent change to https://github.com/ROCm/composable_kernel/pull/2564
# TODO: Remove after ROCm 7.1
find include/ck/tensor_operation/ -type f -name "*.hpp" -exec sed -i \
-e 's/!defined(__HIP_DEVICE_COMPILE__)/false/g' \
{} +
''
# Optionally remove tests
+ lib.optionalString (!buildTests) ''
substituteInPlace CMakeLists.txt \
--replace-fail "add_subdirectory(test)" ""
substituteInPlace codegen/CMakeLists.txt \
--replace-fail "include(ROCMTest)" ""
''
# Optionally remove examples
+ lib.optionalString (!buildExamples) ''
substituteInPlace CMakeLists.txt \
--replace-fail "add_subdirectory(example)" ""
'';
postInstall =
lib.optionalString buildTests ''
mkdir -p $test/bin
mv $out/bin/test_* $test/bin
''
+ lib.optionalString buildExamples ''
mkdir -p $example/bin
mv $out/bin/example_* $example/bin
'';
passthru = {
inherit gpuTargets;
updateScript = rocmUpdateScript {
name = finalAttrs.pname;
inherit (finalAttrs.src) owner;
inherit (finalAttrs.src) repo;
};
anyGfx9Target = lib.lists.any (lib.strings.hasPrefix "gfx9") gpuTargets;
anyMfmaTarget =
(lib.lists.intersectLists gpuTargets [
"gfx908"
"gfx90a"
"gfx942"
"gfx950"
]) != [ ];
};
meta = with lib; {
description = "Performance portable programming model for machine learning tensor operators";
homepage = "https://github.com/ROCm/composable_kernel";
license = with licenses; [ mit ];
teams = [ teams.rocm ];
platforms = platforms.linux;
broken = true; # this base package shouldn't be built directly
};
})

View File

@@ -0,0 +1,43 @@
{
buildPythonPackage,
python,
composable_kernel,
lib,
setuptools,
setuptools-scm,
rocm-merged-llvm,
}:
buildPythonPackage {
pyproject = true;
pname = "ck4inductor";
build-system = [
setuptools
setuptools-scm
];
version = "6.4.3";
inherit (composable_kernel) src;
pythonImportsCheck = [
"ck4inductor"
"ck4inductor.universal_gemm.gen_instances"
"ck4inductor.universal_gemm.gen_instances"
"ck4inductor.universal_gemm.op"
];
propagatedBuildInputs = [
# At runtime will fail to compile anything with ck4inductor without this
# can't easily use in checks phase because most of the compiler machinery is in torch
rocm-merged-llvm
];
checkPhase = ''
if [ ! -d "$out/${python.sitePackages}/ck4inductor" ]; then
echo "ck4inductor isn't at the expected location in $out/${python.sitePackages}/ck4inductor"
exit 1
fi
'';
meta = with lib; {
description = "Pytorch inductor backend which uses composable_kernel universal GEMM implementations";
homepage = "https://github.com/ROCm/composable_kernel";
license = with licenses; [ mit ];
teams = [ teams.rocm ];
platforms = platforms.linux;
};
}

View File

@@ -0,0 +1,290 @@
{
lib,
clr,
composable_kernel_base,
}:
let
parts = {
_mha = {
# mha takes ~3hrs on 64 cores on an EPYC milan system at ~2.5GHz
# big-parallel builders are one gen newer and clocked ~30% higher but only 24 cores
# Should be <10h timeout but might be cutting it close
# TODO: work out how to split this into smaller chunks instead of all 3k mha instances together
# mha_0,1,2, search ninja target file for the individual instances, split by the index?
# TODO: can we prune the generated instances down to only what in practice are used with popular models
# when using flash-attention + MHA kernels?
targets = [
"device_mha_instance"
];
extraCmakeFlags = [ "-DHIP_CLANG_NUM_PARALLEL_JOBS=2" ];
};
gemm_multiply_multiply = {
targets = [
"device_gemm_multiply_multiply_instance"
];
extraCmakeFlags = [ "-DHIP_CLANG_NUM_PARALLEL_JOBS=2" ];
onlyFor = [
"gfx942"
"gfx950"
];
};
gemm_multiply_multiply_wp = {
targets = [
"device_gemm_multiply_multiply_wp_instance"
];
extraCmakeFlags = [ "-DHIP_CLANG_NUM_PARALLEL_JOBS=2" ];
onlyFor = [
"gfx942"
"gfx950"
];
};
grouped_conv_bwd = {
targets = [
"device_grouped_conv1d_bwd_weight_instance"
"device_grouped_conv2d_bwd_data_instance"
"device_grouped_conv2d_bwd_weight_instance"
];
};
grouped_conv_fwd = {
targets = [
"device_grouped_conv1d_fwd_instance"
"device_grouped_conv2d_fwd_instance"
"device_grouped_conv2d_fwd_bias_relu_instance"
"device_grouped_conv2d_fwd_dynamic_op_instance"
];
};
grouped_conv_bwd_3d1 = {
targets = [
"device_grouped_conv3d_bwd_data_instance"
"device_grouped_conv3d_bwd_data_bilinear_instance"
"device_grouped_conv3d_bwd_data_scale_instance"
];
};
grouped_conv_bwd_3d2 = {
targets = [
"device_grouped_conv3d_bwd_weight_instance"
"device_grouped_conv3d_bwd_weight_bilinear_instance"
"device_grouped_conv3d_bwd_weight_scale_instance"
];
};
grouped_conv_fwd_3d1 = {
targets = [
"device_grouped_conv3d_fwd_instance"
"device_grouped_conv3d_fwd_bias_relu_instance"
"device_grouped_conv3d_fwd_bilinear_instance"
"device_grouped_conv3d_fwd_convinvscale_instance"
"device_grouped_conv3d_fwd_convscale_instance"
"device_grouped_conv3d_fwd_convscale_add_instance"
];
};
grouped_conv_fwd_3d2 = {
targets = [
"device_grouped_conv3d_fwd_convscale_relu_instance"
"device_grouped_conv3d_fwd_dynamic_op_instance"
"device_grouped_conv3d_fwd_scale_instance"
"device_grouped_conv3d_fwd_scaleadd_ab_instance"
"device_grouped_conv3d_fwd_scaleadd_scaleadd_relu_instance"
];
};
batched_gemm1 = {
targets = [
"device_batched_gemm_instance"
"device_batched_gemm_b_scale_instance"
"device_batched_gemm_multi_d_instance"
"device_batched_gemm_add_relu_gemm_add_instance"
"device_batched_gemm_bias_permute_instance"
"device_batched_gemm_gemm_instance"
"device_batched_gemm_reduce_instance"
"device_batched_gemm_softmax_gemm_instance"
];
};
batched_gemm2 = {
targets = [
"device_batched_gemm_softmax_gemm_permute_instance"
"device_grouped_gemm_instance"
"device_grouped_gemm_bias_instance"
"device_grouped_gemm_fastgelu_instance"
"device_grouped_gemm_fixed_nk_instance"
"device_grouped_gemm_fixed_nk_multi_abd_instance"
"device_grouped_gemm_tile_loop_instance"
];
};
gemm_universal1 = {
targets = [
"device_gemm_universal_instance"
"device_gemm_universal_batched_instance"
];
};
gemm_universal2 = {
targets = [
"device_gemm_universal_reduce_instance"
"device_gemm_universal_streamk_instance"
];
};
gemm_other1 = {
targets = [
"device_gemm_instance"
"device_gemm_b_scale_instance"
"device_gemm_ab_scale_instance"
"device_gemm_add_instance"
"device_gemm_add_add_fastgelu_instance"
"device_gemm_add_fastgelu_instance"
"device_gemm_add_multiply_instance"
"device_gemm_add_relu_instance"
];
};
gemm_other2 = {
targets = [
"device_gemm_add_relu_add_layernorm_instance"
"device_gemm_add_silu_instance"
"device_gemm_bias_add_reduce_instance"
"device_gemm_bilinear_instance"
"device_gemm_fastgelu_instance"
"device_gemm_multi_abd_instance"
"device_gemm_multiply_add_instance"
"device_gemm_reduce_instance"
"device_gemm_splitk_instance"
"device_gemm_streamk_instance"
];
};
conv = {
targets = [
"device_conv1d_bwd_data_instance"
"device_conv2d_bwd_data_instance"
"device_conv2d_fwd_instance"
"device_conv2d_fwd_bias_relu_instance"
"device_conv2d_fwd_bias_relu_add_instance"
"device_conv3d_bwd_data_instance"
];
};
pool = {
targets = [
"device_avg_pool2d_bwd_instance"
"device_avg_pool3d_bwd_instance"
"device_pool2d_fwd_instance"
"device_pool3d_fwd_instance"
"device_max_pool_bwd_instance"
];
};
other1 = {
targets = [
"device_batchnorm_instance"
"device_contraction_bilinear_instance"
"device_contraction_scale_instance"
"device_elementwise_instance"
"device_elementwise_normalization_instance"
];
};
other2 = {
targets = [
"device_column_to_image_instance"
"device_image_to_column_instance"
"device_permute_scale_instance"
"device_quantization_instance"
"device_reduce_instance"
];
};
other3 = {
targets = [
"device_normalization_bwd_data_instance"
"device_normalization_bwd_gamma_beta_instance"
"device_normalization_fwd_instance"
"device_softmax_instance"
"device_transpose_instance"
];
};
};
tensorOpBuilder =
{
part,
targets,
extraCmakeFlags ? [ ],
requiredSystemFeatures ? [ "big-parallel" ],
onlyFor ? [ ],
}:
let
supported =
onlyFor == [ ] || (lib.lists.intersectLists composable_kernel_base.gpuTargets onlyFor) != [ ];
in
if supported then
(composable_kernel_base.overrideAttrs (old: {
inherit requiredSystemFeatures;
pname = "composable_kernel${clr.gpuArchSuffix}-${part}";
makeTargets = targets;
preBuild = ''
echo "Building ${part}"
makeFlagsArray+=($makeTargets)
substituteInPlace $(find ./ -name "Makefile" -type f) \
--replace-fail '.NOTPARALLEL:' '.UNUSED_NOTPARALLEL:'
'';
# Compile parallelism adjusted based on available RAM
# Never uses less than NIX_BUILD_CORES/4, never uses more than NIX_BUILD_CORES
# CK uses an unusually high amount of memory per core in the build step
# Nix/nixpkgs doesn't really have any infra to tell it that this build is unusually memory hungry
# So, bodge. Otherwise you end up having to build all of ROCm with a low core limit when
# it's only this package that has trouble.
preConfigure = old.preConfigure or "" + ''
MEM_GB_TOTAL=$(awk '/MemTotal/ { printf "%d \n", $2/1024/1024 }' /proc/meminfo)
MEM_GB_AVAILABLE=$(awk '/MemAvailable/ { printf "%d \n", $2/1024/1024 }' /proc/meminfo)
APPX_GB=$((MEM_GB_AVAILABLE > MEM_GB_TOTAL ? MEM_GB_TOTAL : MEM_GB_AVAILABLE))
MAX_CORES=$((1 + APPX_GB/3))
MAX_CORES=$((MAX_CORES < NIX_BUILD_CORES/3 ? NIX_BUILD_CORES/3 : MAX_CORES))
export NIX_BUILD_CORES="$((NIX_BUILD_CORES > MAX_CORES ? MAX_CORES : NIX_BUILD_CORES))"
echo "Picked new core limit NIX_BUILD_CORES=$NIX_BUILD_CORES based on available mem: $APPX_GB GB"
cmakeFlagsArray+=(
"-DCK_PARALLEL_COMPILE_JOBS=$NIX_BUILD_CORES"
)
'';
cmakeFlags = old.cmakeFlags ++ extraCmakeFlags;
# Early exit after build phase with success, skips fixups etc
# Will get copied back into /build of the final CK
postBuild = ''
find . -name "*.o" -type f | while read -r file; do
mkdir -p "$out/$(dirname "$file")"
cp --reflink=auto "$file" "$out/$file"
done
exit 0
'';
meta = old.meta // {
broken = false;
};
}))
else
null;
composable_kernel_parts = builtins.mapAttrs (
part: targets: tensorOpBuilder (targets // { inherit part; })
) parts;
in
composable_kernel_base.overrideAttrs (
finalAttrs: old: {
pname = "composable_kernel${clr.gpuArchSuffix}";
parts_dirs = builtins.filter (x: x != null) (builtins.attrValues composable_kernel_parts);
disallowedReferences = builtins.filter (x: x != null) (builtins.attrValues composable_kernel_parts);
preBuild = ''
for dir in $parts_dirs; do
find "$dir" -type f -name "*.o" | while read -r file; do
# Extract the relative path by removing the output directory prefix
rel_path="''${file#"$dir/"}"
# Create parent directory if it doesn't exist
mkdir -p "$(dirname "$rel_path")"
# Copy the file back to its original location, give it a future timestamp
# so make treats it as up to date
cp --reflink=auto --no-preserve=all "$file" "$rel_path"
touch -d "now +10 hours" "$rel_path"
done
done
'';
passthru = old.passthru // {
parts = composable_kernel_parts;
};
meta = old.meta // {
# Builds without any gfx9 fail
broken = !finalAttrs.passthru.anyGfx9Target;
};
}
)