110 lines
2.8 KiB
Nix
110 lines
2.8 KiB
Nix
|
|
# NOTE: At runtime, FlashInfer will fall back to PyTorch’s JIT compilation if a
# requested kernel wasn’t pre-compiled in AOT mode, and JIT compilation always
# requires the CUDA toolkit (via nvcc) to be available.
#
# This means that if you plan to use flashinfer, you will need to set the
# environment variable `CUDA_HOME` to point at `cudatoolkit`.
|
|||
|
|
{
  lib,
  config,
  buildPythonPackage,
  fetchFromGitHub,

  # Python build backend
  setuptools,

  # build-time tools and the CUDA package set
  cmake,
  ninja,
  cudaPackages,

  # Python runtime dependencies
  click,
  einops,
  numpy,
  pynvml,
  tabulate,
  torch,
  tqdm,
}:

let
  # Pull the individual CUDA components into scope once instead of opening
  # the whole `cudaPackages` set with `with` at each use site.
  inherit (cudaPackages)
    cuda_cccl
    cuda_cudart
    cuda_nvcc
    libcublas
    libcurand
    ;
in
buildPythonPackage rec {
  pname = "flashinfer";
  version = "0.3.1";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "flashinfer-ai";
    repo = "flashinfer";
    tag = "v${version}";
    # The upstream repository is fetched together with its git submodules.
    fetchSubmodules = true;
    hash = "sha256-e9PfLfU0DdoLKlXiHylCbGd125c7Iw9y4NDIOAP0xHs=";
  };

  build-system = [ setuptools ];

  nativeBuildInputs = [
    cmake
    ninja
    # Only the executables (nvcc) are needed at build time.
    (lib.getBin cuda_nvcc)
  ];

  # cmake is listed purely as a tool; its configure hook is switched off so
  # that the pyproject build stays in charge of the build.
  dontUseCmakeConfigure = true;

  buildInputs = [
    cuda_cccl
    cuda_cudart
    libcublas
    libcurand
  ];

  # FlashInfer can be installed in one of two modes:
  #
  # JIT mode: CUDA kernels are compiled at runtime through PyTorch’s JIT and
  # cached for later use. Nothing is pre-compiled, so installation is fast,
  # which suits development and testing. The JIT flavour is also published
  # as an sdist on PyPI.
  #
  # AOT mode: the core CUDA kernels are pre-compiled and shipped inside the
  # library, cutting down on runtime compilation. Kernels that were not
  # pre-compiled are still built at runtime via JIT. This is the mode
  # recommended for production.
  #
  # Here we opt for the AOT version.
  preConfigure = ''
    export FLASHINFER_ENABLE_AOT=1
    export TORCH_NVCC_FLAGS="--maxrregcount=64"
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';

  # Build kernels for exactly the CUDA capabilities torch was built for.
  # NOTE(review): the list is joined with ";" — upstream documentation appears
  # to show a space-separated FLASHINFER_CUDA_ARCH_LIST; confirm the separator
  # is accepted when more than one capability is configured.
  FLASHINFER_CUDA_ARCH_LIST = lib.concatStringsSep ";" torch.cudaCapabilities;

  # Strip this requirement from the declared Python dependencies.
  # NOTE(review): presumably because it is not needed/available here — confirm.
  pythonRemoveDeps = [ "nvidia-cudnn-frontend" ];

  dependencies = [
    click
    einops
    numpy
    pynvml
    tabulate
    torch
    tqdm
  ];

  meta = {
    description = "Library and kernel generator for Large Language Models";
    longDescription = ''
      FlashInfer is a library and kernel generator for Large Language Models
      that provides high-performance implementation of LLM GPU kernels such as
      FlashAttention, PageAttention and LoRA. FlashInfer focus on LLM serving
      and inference, and delivers state-of-the-art performance across diverse
      scenarios.
    '';
    homepage = "https://flashinfer.ai/";
    license = lib.licenses.asl20;
    maintainers = [ lib.maintainers.breakds ];
    # Marked broken unless CUDA is enabled both in torch and in the nixpkgs
    # configuration.
    broken = !(torch.cudaSupport && config.cudaSupport);
  };
}