push sheeet
Some checks failed
Periodic Merges (6h) / master → staging-nixos (push) Failing after 12m50s
Periodic Merges (6h) / master → staging-next (push) Failing after 12m54s
Periodic Merges (24h) / merge-base(master,staging) → haskell-updates (push) Failing after 11m54s
Periodic Merges (6h) / staging-next → staging (push) Failing after 12m13s
Periodic Merges (24h) / staging-next-25.05 → staging-25.05 (push) Failing after 13m24s
Periodic Merges (24h) / release-25.05 → staging-next-25.05 (push) Failing after 14m28s

This commit is contained in:
Dark Steveneq
2025-10-09 14:15:47 +02:00
commit 646b892680
49168 changed files with 5897842 additions and 0 deletions

View File

@@ -0,0 +1,161 @@
{
  lib,
  stdenv,
  fetchFromGitHub,
  rocmUpdateScript,
  cmake,
  rocm-cmake,
  rocm-smi,
  rocm-core,
  pkg-config,
  clr,
  mscclpp,
  perl,
  hipify,
  python3,
  gtest,
  chrpath,
  roctracer,
  rocprofiler,
  rocprofiler-register,
  autoPatchelfHook,
  buildTests ? false,
  gpuTargets ? (clr.localGpuTargets or [ ]),
  # Only referenced by passthru.tests, which re-instantiates this package.
  rccl,
}:
let
  # Both sanitizers are switched on together with the test build.
  useAsan = buildTests;
  useUbsan = buildTests;

  asanFlags = lib.optionalString useAsan "-fsanitize=address -shared-libsan ";

  # Either empty or a flag list that ends in a space: it is spliced directly
  # in front of "-fno-omit-frame-pointer" in compileFlags below.
  san = lib.optionalString (useAsan || useUbsan) (
    "-fno-gpu-sanitize -fsanitize=undefined " + asanFlags
  );

  # -O2 and -fno-strict-aliasing work around UB issues in RCCL
  # (reported upstream). The same flags are used for C and C++.
  compileFlags = "-I${clr}/include -I${roctracer}/include -O2 -fno-strict-aliasing ${san}-fno-omit-frame-pointer -momit-leaf-frame-pointer";

  # CMake-style list of the requested GPU architectures.
  targetList = lib.concatStringsSep ";" gpuTargets;
in
# Note: multi-node collective ops can neither be tested properly nor used.
# https://github.com/NixOS/nixpkgs/issues/366242 tracks kernel support:
# kfd_peerdirect support lives in the out-of-tree amdkfd in ROCm/ROCK-Kernel-Driver,
# and infiniband ib_peer_mem support isn't in the mainline kernel but is carried by some distros.
stdenv.mkDerivation (finalAttrs: {
  pname = "rccl${clr.gpuArchSuffix}";
  version = "6.4.3";

  # The test binaries get their own output when they are built.
  outputs = [ "out" ] ++ lib.optionals buildTests [ "test" ];

  src = fetchFromGitHub {
    owner = "ROCm";
    repo = "rccl";
    rev = "rocm-${finalAttrs.version}";
    hash = "sha256-XpD+UjgdbAoGYK5UvvTX3f8rny4tiEDH/vYoCdZhtjo=";
  };

  patches = [
    ./fix-mainline-support-and-ub.diff
    ./enable-mscclpp-on-all-gfx9.diff
    ./rccl-test-missing-iomanip.diff
    ./fix_hw_reg_hw_id_gt_gfx10.patch
  ];

  nativeBuildInputs = [
    cmake
    rocm-cmake
    clr
    perl
    hipify
    python3
    pkg-config
    autoPatchelfHook # ASAN doesn't add rpath without this
  ];

  buildInputs = [
    rocm-smi
    gtest
    roctracer
    rocprofiler
    rocprofiler-register
    mscclpp
  ]
  ++ lib.optionals buildTests [ chrpath ];

  cmakeFlags = [
    "-DHIP_CLANG_NUM_PARALLEL_JOBS=4"
    "-DCMAKE_BUILD_TYPE=Release"
    "-DROCM_PATH=${clr}"
    "-DHIP_COMPILER=${clr}/bin/amdclang++"
    "-DCMAKE_CXX_COMPILER=${clr}/bin/amdclang++"
    "-DROCM_PATCH_VERSION=${rocm-core.ROCM_LIBPATCH_VERSION}"
    "-DROCM_VERSION=${rocm-core.ROCM_LIBPATCH_VERSION}"
    "-DBUILD_BFD=OFF" # Can't get it to detect bfd.h
    "-DENABLE_MSCCL_KERNEL=ON"
    # FIXME: this still triggers a download because
    # if(NOT mscclpp_nccl_FOUND) is commented out upstream.
    "-DENABLE_MSCCLPP=OFF"
    #"-DMSCCLPP_ROOT=${mscclpp}"
    # CMAKE_INSTALL_<DIR> is defined by hand.
    # See: https://github.com/NixOS/nixpkgs/pull/197838
    "-DCMAKE_INSTALL_BINDIR=bin"
    "-DCMAKE_INSTALL_LIBDIR=lib"
    "-DCMAKE_INSTALL_INCLUDEDIR=include"
  ]
  ++ lib.optionals (gpuTargets != [ ]) [
    # Different AMD projects read different variable names, so set both.
    "-DAMDGPU_TARGETS=${targetList}"
    "-DGPU_TARGETS=${targetList}"
  ]
  ++ lib.optionals buildTests [ "-DBUILD_TESTS=ON" ];

  env = {
    CFLAGS = compileFlags;
    CXXFLAGS = compileFlags;
    LDFLAGS = san;
  };

  # The first --replace-fail pattern is the literal text ${HOST_OS_ID}. It is
  # produced by interpolating a regular string because the pattern is opened
  # by a shell single quote: following that quote with the usual ''${ escape
  # would form ''' which Nix reads as an escaped '' instead.
  postPatch = ''
    patchShebangs src tools
    substituteInPlace CMakeLists.txt \
    --replace-fail '${"\${HOST_OS_ID}"}' '"ubuntu"' \
    --replace-fail 'target_include_directories(rccl PRIVATE ''${ROCM_SMI_INCLUDE_DIR})' \
    'target_include_directories(rccl PRIVATE ''${ROCM_SMI_INCLUDE_DIRS})'
  '';

  # With ASAN, record the sanitizer runtime as a dependency of librccl.so;
  # with tests, move the installed binaries into the separate "test" output.
  postInstall =
    lib.optionalString useAsan ''
      patchelf --add-needed ${clr}/llvm/lib/linux/libclang_rt.asan-${stdenv.hostPlatform.parsed.cpu.name}.so $out/lib/librccl.so
    ''
    + lib.optionalString buildTests ''
      mkdir -p $test/bin
      mv $out/bin/* $test/bin
      rmdir $out/bin
    '';

  passthru = {
    updateScript = rocmUpdateScript {
      name = finalAttrs.pname;
      inherit (finalAttrs.src) owner repo;
    };

    # This builds the package with sanitizers plus the manual integration
    # test binaries; the resulting binaries must be run manually.
    tests.rccl = rccl.override {
      buildTests = true;
    };
  };

  meta = {
    description = "ROCm communication collectives library";
    homepage = "https://github.com/ROCm/rccl";
    license = [
      lib.licenses.bsd2
      lib.licenses.bsd3
    ];
    teams = [ lib.teams.rocm ];
    platforms = lib.platforms.linux;
  };
})

View File

@@ -0,0 +1,13 @@
diff --git a/src/init.cc b/src/init.cc
index 738f756..1b0e4fc 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -2049,7 +2049,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
if (mscclEnabled() && (comm->topo->mscclEnabled || mscclForceEnabled()) && mscclppCommCompatible(comm)) {
hipDeviceProp_t devProp;
CUDACHECK(hipGetDeviceProperties(&devProp, cudaDev));
- comm->mscclppCompatible = IsArchMatch(devProp.gcnArchName, "gfx94");
+ comm->mscclppCompatible = IsArchMatch(devProp.gcnArchName, "gfx9");
if (comm->mscclppCompatible) {
bool mapContainsId = (mscclpp_uniqueIdMap.count(job->commId) > 0);
auto& mscclppUniqueId = mscclpp_uniqueIdMap[job->commId];

View File

@@ -0,0 +1,172 @@
diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h
index 8c5f081..9922b79 100644
--- a/src/include/bootstrap.h
+++ b/src/include/bootstrap.h
@@ -10,11 +10,13 @@
#include "nccl.h"
#include "comm.h"
+// this is accessed through unaligned ptrs because ncclUniqueId is a typedef of char[128]
struct ncclBootstrapHandle {
uint64_t magic;
union ncclSocketAddress addr;
};
static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Bootstrap handle is too large to fit inside NCCL unique ID");
+static_assert(alignof(struct ncclBootstrapHandle) == alignof(ncclUniqueId), "Bootstrap handle must have same alignment as NCCL unique ID to avoid UB");
ncclResult_t bootstrapNetInit();
ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv);
diff --git a/src/misc/rocmwrap.cc b/src/misc/rocmwrap.cc
index b3063d5..464b80d 100644
--- a/src/misc/rocmwrap.cc
+++ b/src/misc/rocmwrap.cc
@@ -131,9 +131,12 @@ static void initOnceFunc() {
//format and store the kernel conf file location
snprintf(kernel_conf_file, sizeof(kernel_conf_file), "/boot/config-%s", utsname.release);
fp = fopen(kernel_conf_file, "r");
- if (fp == NULL) INFO(NCCL_INIT,"Could not open kernel conf file");
+ if (fp == NULL) {
+ INFO(NCCL_INIT,"Could not open kernel conf file, will assume CONFIG_DMABUF_MOVE_NOTIFY and CONFIG_PCI_P2PDMA are enabled");
+ }
//look for kernel_opt1 and kernel_opt2 in the conf file and check
- while (fgets(buf, sizeof(buf), fp) != NULL) {
+ // FIXME: This check is broken, CONFIG_DMABUF_MOVE_NOTIFY could be across a buf boundary.
+ while (fp && fgets(buf, sizeof(buf), fp) != NULL) {
if (strstr(buf, kernel_opt1) != NULL) {
found_opt1 = 1;
INFO(NCCL_INIT,"CONFIG_DMABUF_MOVE_NOTIFY=y in /boot/config-%s", utsname.release);
@@ -143,11 +146,12 @@ static void initOnceFunc() {
INFO(NCCL_INIT,"CONFIG_PCI_P2PDMA=y in /boot/config-%s", utsname.release);
}
}
- if (!found_opt1 || !found_opt2) {
+ if (fp && (!found_opt1 || !found_opt2)) {
dmaBufSupport = 0;
INFO(NCCL_INIT, "CONFIG_DMABUF_MOVE_NOTIFY and CONFIG_PCI_P2PDMA should be set for DMA_BUF in /boot/config-%s", utsname.release);
INFO(NCCL_INIT, "DMA_BUF_SUPPORT Failed due to OS kernel support");
}
+ if (fp) fclose(fp);
if(dmaBufSupport) INFO(NCCL_INIT, "DMA_BUF Support Enabled");
else goto error;
diff --git a/src/nccl.h.in b/src/nccl.h.in
index 1d127b0..6296073 100644
--- a/src/nccl.h.in
+++ b/src/nccl.h.in
@@ -39,7 +39,7 @@ typedef struct ncclComm* ncclComm_t;
#define NCCL_UNIQUE_ID_BYTES 128
/*! @brief Opaque unique id used to initialize communicators
@details The ncclUniqueId must be passed to all participating ranks */
-typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; /*!< Opaque array>*/} ncclUniqueId;
+typedef struct alignas(int64_t) { char internal[NCCL_UNIQUE_ID_BYTES]; /*!< Opaque array>*/} ncclUniqueId;
/*! @defgroup rccl_result_code Result Codes
@details The various result codes that RCCL API calls may return
diff --git a/src/proxy.cc b/src/proxy.cc
index 50e5437..51bb401 100644
--- a/src/proxy.cc
+++ b/src/proxy.cc
@@ -965,7 +965,11 @@ struct ncclProxyConnectionPool {
static ncclResult_t ncclProxyNewConnection(struct ncclProxyConnectionPool* pool, int* id) {
if (pool->offset == NCCL_PROXY_CONN_POOL_SIZE) {
- NCCLCHECK(ncclRealloc(&pool->pools, pool->banks, pool->banks+1));
+ if (pool->pools) {
+ NCCLCHECK(ncclRealloc(&pool->pools, pool->banks, pool->banks+1));
+ } else {
+ NCCLCHECK(ncclCalloc(&pool->pools, pool->banks+1));
+ }
NCCLCHECK(ncclCalloc(pool->pools+pool->banks, NCCL_PROXY_CONN_POOL_SIZE));
pool->banks++;
pool->offset = 0;
diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc
index 6d77784..49762d3 100644
--- a/src/transport/net_ib.cc
+++ b/src/transport/net_ib.cc
@@ -573,7 +573,7 @@ ncclResult_t ncclIbGdrSupport() {
// Requires support from NIC driver modules
// Use ONLY for debugging!
moduleLoaded = 1;
- INFO(NCCL_INIT, "RCCL_FORCE_ENABLE_GDRDMA = 1, so explicitly setting moduleLoaded = 1");
+ INFO(NCCL_INIT, "ncclIbGdrSupport: RCCL_FORCE_ENABLE_GDRDMA = 1, so explicitly setting moduleLoaded = 1");
}
if (moduleLoaded == -1) {
@@ -586,9 +586,9 @@ ncclResult_t ncclIbGdrSupport() {
int i = 0;
while (memory_peers_paths[i]) {
if (access(memory_peers_paths[i], F_OK) == 0) {
moduleLoaded = 1;
- INFO(NCCL_INIT,"Found %s", memory_peers_paths[i]);
+ INFO(NCCL_INIT,"ncclIbGdrSupport: Found %s", memory_peers_paths[i]);
break;
} else {
moduleLoaded = 0;
@@ -612,22 +613,23 @@ ncclResult_t ncclIbGdrSupport() {
if (moduleLoaded == 0) {
// Check for `ib_register_peer_memory_client` symbol in `/proc/kallsyms`
// if your system uses native OS ib_peer module
- char buf[256];
- FILE *fp = NULL;
- fp = fopen("/proc/kallsyms", "r");
+ FILE *fp = fopen("/proc/kallsyms", "r");
+ char *line = NULL;
+ size_t len = 0;
if (fp == NULL) {
- INFO(NCCL_INIT,"Could not open /proc/kallsyms");
+ INFO(NCCL_INIT,"ncclIbGdrSupport: Could not open /proc/kallsyms to check for ib_register_peer_memory_client");
} else {
- while (fgets(buf, sizeof(buf), fp) != NULL) {
- if (strstr(buf, "t ib_register_peer_memory_client") != NULL ||
- strstr(buf, "T ib_register_peer_memory_client") != NULL) {
+ while (getline(&line, &len, fp) > 0) {
+ if (line && strstr(line, "ib_register_peer_memory_client") != NULL) {
moduleLoaded = 1;
- INFO(NCCL_INIT,"Found ib_register_peer_memory_client in /proc/kallsyms");
+ INFO(NCCL_INIT,"ncclIbGdrSupport: Found ib_register_peer_memory_client in /proc/kallsyms");
break;
}
}
}
+ if (line) free(line);
+ if (fp) fclose(fp);
}
#else
// Check for the nv_peer_mem module being loaded
@@ -637,7 +639,7 @@ ncclResult_t ncclIbGdrSupport() {
#endif
}
if (moduleLoaded == 0) {
- INFO(NCCL_INIT,"GDRDMA not enabled. Could not find memory_peers directory or peer_memory symbol");
+ INFO(NCCL_INIT,"ncclIbGdrSupport: GDRDMA not enabled. Could not find memory_peers directory or peer_memory symbol");
return ncclSystemError;
}
return ncclSuccess;
diff --git a/tools/ib-test/include/nccl.h b/tools/ib-test/include/nccl.h
index 2c86c33..5801c61 100755
--- a/tools/ib-test/include/nccl.h
+++ b/tools/ib-test/include/nccl.h
@@ -31,7 +31,7 @@ extern "C" {
typedef struct ncclComm* ncclComm_t;
#define NCCL_UNIQUE_ID_BYTES 128
-typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
+typedef struct alignas(int64_t) { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
/* Error type */
typedef enum { ncclSuccess = 0,
diff --git a/tools/topo_expl/include/nccl.h b/tools/topo_expl/include/nccl.h
index 729561b..4e4bdd9 100644
--- a/tools/topo_expl/include/nccl.h
+++ b/tools/topo_expl/include/nccl.h
@@ -35,7 +35,7 @@ typedef struct ncclComm* ncclComm_t;
#define NCCL_COMM_NULL NULL
#define NCCL_UNIQUE_ID_BYTES 128
-typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
+typedef struct alignas(int64_t) { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
/*! @brief Error type */
typedef enum { ncclSuccess = 0,

View File

@@ -0,0 +1,33 @@
fix: __trace_hwreg should use HW_REG_HW_ID1 for all gfx10/11/12
diff --git a/src/device/common.h b/src/device/common.h
index c6c61021..742885f4 100644
--- a/src/device/common.h
+++ b/src/device/common.h
@@ -26,8 +26,9 @@
{ __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST); }
#endif
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1200__) || defined(__gfx1201__)
-#define __trace_hwreg()
+#if defined(__GFX10__) || defined(__GFX11__) || defined(__GFX12__)
+#define __trace_hwreg() \
+ asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID1)" : "=s" (collTrace->data_0));
#else
#define __trace_hwreg() \
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (collTrace->data_0));
diff --git a/tools/JitterBench/Common.hpp b/tools/JitterBench/Common.hpp
index bad12a1b..b59856a8 100644
--- a/tools/JitterBench/Common.hpp
+++ b/tools/JitterBench/Common.hpp
@@ -43,9 +43,9 @@ THE SOFTWARE.
#endif
// Macro for collecting HW_REG_HW_ID
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__NVCC__)
+#if defined(__GFX10__) || defined(__GFX11__) || defined(__GFX12__)
#define GetHwId(val) \
- val = 0
+ asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID1)" : "=s" (val));
#else
#define GetHwId(val) \
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (val));

View File

@@ -0,0 +1,10 @@
--- a/test/common/TestBed.cpp
+++ b/test/common/TestBed.cpp
@@ -4,6 +4,7 @@
* See LICENSE.txt for license information
************************************************************************/
#include <unistd.h>
+#include <iomanip>
#include "TestBed.hpp"
#include <rccl/rccl.h>