push sheeet

2025-10-09 14:15:47 +02:00
commit 646b892680
49168 changed files with 5897842 additions and 0 deletions
--- a/pkgs/applications/networking/cluster/k3s/docs/examples/EXTERNAL_CONTAINERD.md
+++ b/pkgs/applications/networking/cluster/k3s/docs/examples/EXTERNAL_CONTAINERD.md
@@ -0,0 +1,40 @@
+# Using an external Containerd
+
+K3s ships with its own containerd binary; however, it is sometimes necessary to use an external
+containerd. This can be done with a few lines of configuration.
+
+## Configure Containerd
+
+```nix
+{
+  virtualisation.containerd = {
+    enable = true;
+    settings.plugins."io.containerd.grpc.v1.cri".cni = {
+      bin_dir = "/var/lib/rancher/k3s/data/current/bin";
+      conf_dir = "/var/lib/rancher/k3s/agent/etc/cni/net.d";
+    };
+    # Optionally, configure containerd to use the k3s pause image
+    settings.plugins."io.containerd.grpc.v1.cri" = {
+      sandbox_image = "docker.io/rancher/mirrored-pause:3.6";
+    };
+  };
+}
+```
+
+## Configure k3s
+
+```nix
+{
+  services.k3s = {
+    enable = true;
+    extraFlags = [ "--container-runtime-endpoint unix:///run/containerd/containerd.sock" ];
+  };
+}
+```
+
+## Importing Container Images
+
+K3s provides the `services.k3s.images` option to import container images at startup. This option
+does **not** work with an external containerd, but you can import the images via
+`ctr -n=k8s.io image import /var/lib/rancher/k3s/agent/images/*`. Note that you need to set the
+`k8s.io` namespace to make the images available to the cluster.
--- a/pkgs/applications/networking/cluster/k3s/docs/examples/NVIDIA.md
+++ b/pkgs/applications/networking/cluster/k3s/docs/examples/NVIDIA.md
@@ -0,0 +1,256 @@
+# Nvidia GPU Support
+
+> Note: this article assumes `services.k3s.enable = true;` is already set
+
+## Enable the Nvidia driver
+
+```nix
+hardware.nvidia = {
+  open = true;
+  package = config.boot.kernelPackages.nvidiaPackages.stable; # change to match your kernel
+  nvidiaSettings = true;
+};
+
+# Hack for getting the nvidia driver recognized
+services.xserver = {
+  enable = false;
+  videoDrivers = [ "nvidia" ];
+};
+
+nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [
+  "nvidia-x11"
+  "nvidia-settings"
+];
+```
+
+Also, enable the Nvidia container toolkit:
+
+```nix
+hardware.nvidia-container-toolkit.enable = true;
+hardware.nvidia-container-toolkit.mount-nvidia-executables = true;
+
+environment.systemPackages = with pkgs; [
+  nvidia-container-toolkit
+];
+```
+
+Rebuild your NixOS configuration.
+
+### Verify that the GPU is accessible
+
+Use the following command to ensure the GPU is accessible:
+
+```
+nvidia-smi
+```
+
+If there is an error in the output, a reboot may be required for the driver to be assigned to the GPU.
+
+Additionally, `lspci -k` can be used to ensure the driver has been assigned to the GPU:
+
+```
+# lspci -k | grep -i nvidia
+
+01:00.0 VGA compatible controller: NVIDIA Corporation TU106 [GeForce RTX 2060 Rev. A] (rev a1)
+  Kernel driver in use: nvidia
+  Kernel modules: nvidiafb, nouveau, nvidia_drm, nvidia
+```
+
+## Configure k3s
+
+You now need to create a new file at `/var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl` with the following content:
+
+```
+{{ template "base" . }}
+
+[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
+  privileged_without_host_devices = false
+  runtime_engine = ""
+  runtime_root = ""
+  runtime_type = "io.containerd.runc.v2"
+```
+
+Now apply the following runtime class to the k3s cluster:
+
+```yaml
+apiVersion: node.k8s.io/v1
+handler: nvidia
+kind: RuntimeClass
+metadata:
+  labels:
+    app.kubernetes.io/component: gpu-operator
+  name: nvidia
+```
+
+Restart k3s:
+
+```
+systemctl restart k3s.service
+```
+
+Ensure that the Nvidia runtime is detected by k3s:
+
+```
+grep nvidia /var/lib/rancher/k3s/agent/etc/containerd/config.toml
+```
+
+Apply the DaemonSet from the [generic-cdi-plugin README](https://github.com/OlfillasOdikno/generic-cdi-plugin):
+
+```yaml
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: generic-cdi-plugin
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: generic-cdi-plugin-daemonset
+  namespace: generic-cdi-plugin
+spec:
+  selector:
+    matchLabels:
+      name: generic-cdi-plugin
+  template:
+    metadata:
+      labels:
+        name: generic-cdi-plugin
+        app.kubernetes.io/component: generic-cdi-plugin
+        app.kubernetes.io/name: generic-cdi-plugin
+    spec:
+      containers:
+      - image: ghcr.io/olfillasodikno/generic-cdi-plugin:main
+        name: generic-cdi-plugin
+        command:
+          - /generic-cdi-plugin
+          - /var/run/cdi/nvidia-container-toolkit.json
+        imagePullPolicy: Always
+        securityContext:
+          privileged: true
+        tty: true
+        volumeMounts:
+        - name: kubelet
+          mountPath: /var/lib/kubelet
+        - name: nvidia-container-toolkit
+          mountPath: /var/run/cdi/nvidia-container-toolkit.json
+      volumes:
+      - name: kubelet
+        hostPath:
+          path: /var/lib/kubelet
+      - name: nvidia-container-toolkit
+        hostPath:
+          path: /var/run/cdi/nvidia-container-toolkit.json
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: "nixos-nvidia-cdi"
+                operator: In
+                values:
+                - "enabled"
+```
+
+Apply the following node label (replace `#CHANGEME` with your node name):
+
+```
+kind: Node
+apiVersion: v1
+metadata:
+  name: #CHANGEME
+  labels:
+    nixos-nvidia-cdi: enabled
+```
+
+Now, GPU-enabled pods can be run with this configuration:
+
+```yaml
+spec:
+  runtimeClassName: nvidia
+  containers:
+  - resources:
+      requests:
+        nvidia.com/gpu-all: "1"
+      limits:
+        nvidia.com/gpu-all: "1"
+```
+
+### Test pod
+
+This is a complete pod configuration for reference/testing:
+
+```yaml
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpu-test
+  namespace: default
+spec:
+  runtimeClassName: nvidia # <- THIS FOR GPU
+  containers:
+  - name: gpu-test
+    image: nvidia/cuda:12.6.3-base-ubuntu22.04
+    command: [ "/bin/bash", "-c", "--" ]
+    args: [ "while true; do sleep 30; done;" ]
+    env:
+      - name: NVIDIA_VISIBLE_DEVICES
+        value: all
+      - name: NVIDIA_DRIVER_CAPABILITIES
+        value: all
+    resources: # <- THIS FOR GPU
+      requests:
+        nvidia.com/gpu-all: "1"
+      limits:
+        nvidia.com/gpu-all: "1"
+```
+
+Once the pod is running, use the following command to test that the GPU was detected:
+
+```
+kubectl exec -n default -it pod/gpu-test -- nvidia-smi
+```
+
+If successful, the output will look like the following:
+
+```
+Thu Sep 25 04:17:42 2025
++-----------------------------------------------------------------------------------------+
+| NVIDIA-SMI 580.82.09              Driver Version: 580.82.09      CUDA Version: 13.0     |
++-----------------------------------------+------------------------+----------------------+
+| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
+|                                         |                        |               MIG M. |
+|=========================================+========================+======================|
+|   0  NVIDIA GeForce RTX 2060        Off |   00000000:01:00.0  On |                  N/A |
+|  0%   36C    P8             10W /  190W |     104MiB /   6144MiB |      0%      Default |
+|                                         |                        |                  N/A |
++-----------------------------------------+------------------------+----------------------+
+
++-----------------------------------------------------------------------------------------+
+| Processes:                                                                              |
+|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
+|        ID   ID                                                               Usage      |
+|=========================================================================================|
+|  No running processes found                                                             |
++-----------------------------------------------------------------------------------------+
+```
--- a/pkgs/applications/networking/cluster/k3s/docs/examples/STORAGE.md
+++ b/pkgs/applications/networking/cluster/k3s/docs/examples/STORAGE.md
@@ -0,0 +1,108 @@
+# Storage Examples
+
+The following are some NixOS-specific considerations for particular storage mechanisms with Kubernetes/k3s.
+
+## Longhorn
+
+NixOS configuration required for Longhorn:
+
+```
+environment.systemPackages = [ pkgs.nfs-utils ];
+services.openiscsi = {
+  enable = true;
+  name = "${config.networking.hostName}-initiatorhost";
+};
+```
+
+The Longhorn containers have trouble with the NixOS `PATH`. The solution is to override the `PATH` environment variable, for example:
+
+```
+PATH: /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/run/wrappers/bin:/nix/var/nix/profiles/default/bin:/run/current-system/sw/bin
+```
+
+**Kyverno Policy for Fixing Longhorn Container for NixOS**
+
+```yaml
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: longhorn-nixos-path
+  namespace: longhorn-system
+data:
+  PATH: /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/run/wrappers/bin:/nix/var/nix/profiles/default/bin:/run/current-system/sw/bin
+---
+apiVersion: kyverno.io/v1
+kind: ClusterPolicy
+metadata:
+  name: longhorn-add-nixos-path
+  annotations:
+    policies.kyverno.io/title: Add Environment Variables from ConfigMap
+    policies.kyverno.io/subject: Pod
+    policies.kyverno.io/category: Other
+    policies.kyverno.io/description: >-
+      Longhorn invokes executables on the host system, and needs
+      to be aware of the host systems PATH. This modifies all
+      deployments such that the PATH is explicitly set to support
+      NixOS based systems.
+spec:
+  rules:
+    - name: add-env-vars
+      match:
+        resources:
+          kinds:
+            - Pod
+          namespaces:
+            - longhorn-system
+      mutate:
+        patchStrategicMerge:
+          spec:
+            initContainers:
+              - (name): "*"
+                envFrom:
+                  - configMapRef:
+                      name: longhorn-nixos-path
+            containers:
+              - (name): "*"
+                envFrom:
+                  - configMapRef:
+                      name: longhorn-nixos-path
+---
+```
+
+## NFS
+
+NixOS configuration required for NFS:
+
+```
+boot.supportedFilesystems = [ "nfs" ];
+services.rpcbind.enable = true;
+```
+
+## Rook/Ceph
+
+In order to support Rook/Ceph, the following NixOS kernelModule configuration is required:
+
+```
+  boot.kernelModules = [ "rbd" ];
+```
+
+## ZFS ContainerD Support
+
+The [ZFS snapshotter](https://github.com/containerd/zfs) can be enabled for k3s' embedded containerd, though it requires mounting a dataset at a specific path used by k3s: `/var/lib/rancher/k3s/agent/containerd/io.containerd.snapshotter.v1.zfs`
+
+For example:
+
+```bash
+$ zfs create -o mountpoint=/var/lib/rancher/k3s/agent/containerd/io.containerd.snapshotter.v1.zfs <zpool name>/containerd
+```
+
+You can now configure k3s to use zfs by passing the `--snapshotter` flag.
+
+```nix
+services.k3s = {
+  # ... your other k3s options ...
+  extraFlags = [
+    "--snapshotter=zfs"
+  ];
+};
+```