Files

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

110 lines
2.8 KiB
Nix
Raw Permalink Normal View History

2025-10-09 14:15:47 +02:00
# NOTE: At runtime, FlashInfer will fall back to PyTorchs JIT compilation if a
# requested kernel wasnt pre-compiled in AOT mode, and JIT compilation always
# requires the CUDA toolkit (via nvcc) to be available.
#
# This means that if you plan to use flashinfer, you will need to set the
# environment variable `CUDA_HOME` to `cudatoolkit`.
{
lib,
config,
buildPythonPackage,
fetchFromGitHub,
# build-system
setuptools,
# nativeBuildInputs
cmake,
ninja,
cudaPackages,
# dependencies
click,
einops,
numpy,
pynvml,
tabulate,
torch,
tqdm,
}:
buildPythonPackage rec {
pname = "flashinfer";
version = "0.3.1";
pyproject = true;
src = fetchFromGitHub {
owner = "flashinfer-ai";
repo = "flashinfer";
tag = "v${version}";
fetchSubmodules = true;
hash = "sha256-e9PfLfU0DdoLKlXiHylCbGd125c7Iw9y4NDIOAP0xHs=";
};
build-system = [ setuptools ];
nativeBuildInputs = [
cmake
ninja
(lib.getBin cudaPackages.cuda_nvcc)
];
dontUseCmakeConfigure = true;
buildInputs = with cudaPackages; [
cuda_cccl
cuda_cudart
libcublas
libcurand
];
# FlashInfer offers two installation modes:
#
# JIT mode: CUDA kernels are compiled at runtime using PyTorchs JIT, with
# compiled kernels cached for future use. JIT mode allows fast installation,
# as no CUDA kernels are pre-compiled, making it ideal for development and
# testing. JIT version is also available as a sdist in PyPI.
#
# AOT mode: Core CUDA kernels are pre-compiled and included in the library,
# reducing runtime compilation overhead. If a required kernel is not
# pre-compiled, it will be compiled at runtime using JIT. AOT mode is
# recommended for production environments.
#
# Here we use opt for the AOT version.
preConfigure = ''
export FLASHINFER_ENABLE_AOT=1
export TORCH_NVCC_FLAGS="--maxrregcount=64"
export MAX_JOBS="$NIX_BUILD_CORES"
'';
FLASHINFER_CUDA_ARCH_LIST = lib.concatStringsSep ";" torch.cudaCapabilities;
pythonRemoveDeps = [
"nvidia-cudnn-frontend"
];
dependencies = [
click
einops
numpy
pynvml
tabulate
torch
tqdm
];
meta = {
broken = !torch.cudaSupport || !config.cudaSupport;
homepage = "https://flashinfer.ai/";
description = "Library and kernel generator for Large Language Models";
longDescription = ''
FlashInfer is a library and kernel generator for Large Language Models
that provides high-performance implementation of LLM GPU kernels such as
FlashAttention, PageAttention and LoRA. FlashInfer focus on LLM serving
and inference, and delivers state-of-the-art performance across diverse
scenarios.
'';
license = lib.licenses.asl20;
maintainers = with lib.maintainers; [ breakds ];
};
}