# ci/eval/compare/cmp-stats.py

import argparse
import json
import numpy as np
import os
import pandas as pd

from dataclasses import asdict, dataclass
from pathlib import Path
from scipy.stats import ttest_rel
from tabulate import tabulate
from typing import Final


def flatten_data(json_data: dict) -> dict:
    """
    Turns one (possibly nested) stats JSON object into a flat metric mapping.

    Top-level numbers are kept under their own key. A nested object such as

    "gc":{"cycles":13,"heapSize":5404549120,"totalBytes":9545876464}

    is expanded one level deep into dotted keys:

    "gc.cycles": 13
    "gc.heapSize": 5404549120
    ...

    The top-level `cpuTime` entry is skipped because the same figure is
    reported as `time.cpu`.

    See https://github.com/NixOS/nix/blob/187520ce88c47e2859064704f9320a2d6c97e56e/src/libexpr/eval.cc#L2846
    for where Nix produces this data.

    Args:
        json_data (dict): Parsed stats JSON.
    Returns:
        dict: Metric name -> numeric value.
    Raises:
        AssertionError: If a value is neither a number nor a dict of numbers.
    """
    flattened = {}
    for name, entry in json_data.items():
        # `cpuTime` is a duplicate of `time.cpu`; only the latter is kept.
        if name == "cpuTime":
            continue

        if isinstance(entry, dict):
            # Nested group: emit one dotted key per member.
            for member, number in entry.items():
                assert isinstance(number, (int, float)), number
                flattened[f"{name}.{member}"] = number
        elif isinstance(entry, (int, float)):
            flattened[name] = entry
        else:
            # Anything else is unexpected; this assertion always fails here.
            assert isinstance(entry, (float, int, dict)), (
                f"Value `{entry}` has unexpected type"
            )

    return flattened


def load_all_metrics(path: Path) -> dict:
    """
    Loads all stats JSON files in the specified file or directory and extracts metrics.
    These stats JSON files are created by Nix when the `NIX_SHOW_STATS` environment variable is set.

    If the provided path is a directory, it must have the structure $path/$system/$stats,
    where $path is the provided path, $system is some system from `lib.systems.doubles.*`,
    and $stats is a stats JSON file.

    If the provided path is a file, it is a stats JSON file.

    Args:
        path (Path): Directory containing JSON files or a stats JSON file.

    Returns:
        dict: Maps each stats file to its flattened metrics. For a directory the
            key is `<system>/<file name>`; for a single file it is the file name.

    Raises:
        AssertionError: If an entry directly below a directory `path` is not
            itself a directory.
    """
    # Single-file mode: the file itself is the only entry.
    if not path.is_dir():
        with path.open() as f:
            return {path.name: flatten_data(json.load(f))}

    metrics = {}
    for system_dir in path.iterdir():
        assert system_dir.is_dir(), (
            f"Expected `{system_dir}` to be a per-system directory"
        )

        for chunk_output in system_dir.iterdir():
            with chunk_output.open() as f:
                data = json.load(f)

            # Plain `{...}` interpolation; a shell-style `${...}` would leave a
            # literal `$` in the key.
            metrics[f"{system_dir.name}/{chunk_output.name}"] = flatten_data(data)

    return metrics


def metric_table_name(name: str, explain: bool) -> str:
    """
    Formats a metric name for use in a table cell.

    When `explain` is set, a Markdown footnote reference keyed by the metric
    name itself is appended; otherwise the name is returned unchanged.
    """
    if not explain:
        return name
    return f"{name}[^{name}]"


# Markdown footnote definitions appended to the report when explanations are
# requested. Each footnote label is a metric name, matching the `[^name]`
# references produced by `metric_table_name`.
METRIC_EXPLANATION_FOOTNOTE: Final[str] = """

[^time.cpu]: Number of seconds of CPU time accounted by the OS to the Nix evaluator process. On UNIX systems, this comes from [`getrusage(RUSAGE_SELF)`](https://man7.org/linux/man-pages/man2/getrusage.2.html).
[^time.gc]: Number of seconds of CPU time accounted by the Boehm garbage collector to performing GC.
[^time.gcFraction]: What fraction of the total CPU time is accounted towards performing GC.
[^gc.cycles]: Number of times garbage collection has been performed.
[^gc.heapSize]: Size in bytes of the garbage collector heap.
[^gc.totalBytes]: Size in bytes of all allocations in the garbage collector.
[^envs.bytes]: Size in bytes of all `Env` objects allocated by the Nix evaluator. These are almost exclusively created by [`nix-env`](https://nix.dev/manual/nix/stable/command-ref/nix-env.html).
[^list.bytes]: Size in bytes of all [lists](https://nix.dev/manual/nix/stable/language/syntax.html#list-literal) allocated by the Nix evaluator.
[^sets.bytes]: Size in bytes of all [attrsets](https://nix.dev/manual/nix/stable/language/syntax.html#attrs-literal) allocated by the Nix evaluator.
[^symbols.bytes]: Size in bytes of all items in the Nix evaluator symbol table.
[^values.bytes]: Size in bytes of all values allocated by the Nix evaluator.
[^envs.number]: The count of all `Env` objects allocated.
[^nrAvoided]: The number of thunks avoided being created.
[^nrExprs]: The number of expression objects ever created.
[^nrFunctionCalls]: The number of function calls ever made.
[^nrLookups]: The number of lookups into an attrset ever made.
[^nrOpUpdateValuesCopied]: The number of attrset values copied in the process of merging attrsets.
[^nrOpUpdates]: The number of attrsets merge operations (`//`) performed.
[^nrPrimOpCalls]: The number of function calls to primops (Nix builtins) ever made.
[^nrThunks]: The number of [thunks](https://nix.dev/manual/nix/latest/language/evaluation.html#laziness) ever made. A thunk is a delayed computation, represented by an expression reference and a closure.
[^sets.number]: The number of attrsets ever made.
[^symbols.number]: The number of symbols ever added to the symbol table.
[^values.number]: The number of values ever made.
[^envs.elements]: The number of values contained within an `Env` object.
[^list.concats]: The number of list concatenation operations (`++`) performed.
[^list.elements]: The number of values contained within a list.
[^sets.elements]: The number of values contained within an attrset.
[^sizes.Attr]: Size in bytes of the `Attr` type.
[^sizes.Bindings]: Size in bytes of the `Bindings` type.
[^sizes.Env]: Size in bytes of the `Env` type.
[^sizes.Value]: Size in bytes of the `Value` type.
"""


@dataclass(frozen=True)
class PairwiseTestResults:
    """
    Holds the outcome of a before/after comparison and renders it as Markdown.

    Both tables carry a `metric` column; the `equivalent` table additionally
    carries a `value` column, while every column after `metric` in the
    `updated` table is treated as numeric when rendering.
    """

    # Rows for metrics that differ between the two runs.
    updated: pd.DataFrame
    # Rows for metrics that are the same in both runs.
    equivalent: pd.DataFrame

    @staticmethod
    def tabulate(table, headers) -> str:
        """Renders rows and headers as a GitHub-style Markdown table."""
        options = {"tablefmt": "github", "floatfmt": ".4f", "missingval": "-"}
        return tabulate(table, headers, **options)

    def updated_to_markdown(self, explain: bool) -> str:
        """
        Renders the `updated` table; it must not be empty.

        Cells that are NaN or numerically zero are shown as missing.
        """
        assert not self.updated.empty

        def blank_out(cell):
            # NaN (e.g. absent p_value/t_stat) and "no change" both render as missing.
            if np.isnan(cell) or np.allclose(cell, 0):
                return None
            return cell

        column_titles = [str(column) for column in self.updated.columns]

        rows = []
        for _, record in self.updated.iterrows():
            # The metric acts as its own footnote name
            label = metric_table_name(record["metric"], explain)
            rows.append([label, *(blank_out(cell) for cell in record[1:])])

        return self.tabulate(table=rows, headers=column_titles)

    def equivalent_to_markdown(self, explain: bool) -> str:
        """Renders the `equivalent` table (metric and value); it must not be empty."""
        assert not self.equivalent.empty

        column_titles = [str(column) for column in self.equivalent.columns]

        rows = []
        for _, record in self.equivalent.iterrows():
            # The metric acts as its own footnote name
            label = metric_table_name(record["metric"], explain)
            rows.append([label, record["value"]])

        return self.tabulate(table=rows, headers=column_titles)

    def to_markdown(self, explain: bool) -> str:
        """
        Builds the full Markdown report.

        Non-empty tables are emitted under their own heading, unchanged values
        first, separated by a blank line. When `explain` is set, the metric
        footnotes are appended at the end.
        """
        sections = []

        if not self.equivalent.empty:
            sections.append(
                "## Unchanged values\n\n" + self.equivalent_to_markdown(explain)
            )

        if not self.updated.empty:
            sections.append(
                "## Updated values\n\n" + self.updated_to_markdown(explain)
            )

        report = "\n\n".join(sections)
        if explain:
            report += METRIC_EXPLANATION_FOOTNOTE

        return report


@dataclass(frozen=True)
class Equivalent:
    """
    A metric whose values did not change between the two runs.

    Created by `perform_pairwise_tests` when all paired differences for the
    metric are numerically zero.
    """

    # Flattened metric name, e.g. `gc.cycles`.
    metric: str
    # The metric's value; `perform_pairwise_tests` stores the first "before" value.
    value: float


@dataclass(frozen=True)
class Comparison:
    """
    Summary of a metric whose values differ between the two runs.

    Created by `perform_pairwise_tests`; all means are taken over the files
    that contain the metric in both runs.
    """

    # Flattened metric name, e.g. `gc.cycles`.
    metric: str
    # Mean of the metric's values in the "before" run.
    mean_before: float
    # Mean of the metric's values in the "after" run.
    mean_after: float
    # Mean of the per-file differences (after - before).
    mean_diff: float
    # Mean of the per-file percentage changes, relative to the "before" value.
    mean_pct_change: float


@dataclass(frozen=True)
class ComparisonWithPValue(Comparison):
    """
    A `Comparison` extended with the result of a paired t-test.

    `perform_pairwise_tests` uses this variant only when more than one paired
    sample is available for the metric.
    """

    # p-value returned by `ttest_rel(after, before)`.
    p_value: float
    # t statistic returned by `ttest_rel(after, before)`.
    t_stat: float


def metric_sort_key(name: str) -> tuple[int, str]:
    """
    Returns a sort key that groups related metrics together in the report.

    The key is `(group, name)`, so metrics are ordered by group first and
    alphabetically within a group. Groups, in order of precedence:

    1. the timing metrics `time.cpu`, `time.gc` and `time.gcFraction`
    2. names starting with `gc`
    3. names ending in `bytes` or `Bytes`
    4. names starting with `nr` or ending in `number`
    5. everything else

    Args:
        name (str): Flattened metric name.
    Returns:
        tuple[int, str]: The group rank followed by the metric name.
    """
    if name in ("time.cpu", "time.gc", "time.gcFraction"):
        group = 1
    elif name.startswith("gc"):
        # Checked before the byte-size rule, so `gc.totalBytes` stays with the GC metrics.
        group = 2
    elif name.endswith(("bytes", "Bytes")):
        group = 3
    elif name.startswith("nr") or name.endswith("number"):
        group = 4
    else:
        group = 5

    return (group, name)


def perform_pairwise_tests(
    before_metrics: dict, after_metrics: dict
) -> PairwiseTestResults:
    """
    Compares every metric of the "before" run against the "after" run.

    Only files present in both inputs are compared, and for each metric only
    the files that report it in both runs contribute a pair of values.
    Metrics whose paired differences are all numerically zero are reported as
    equivalent; the rest are summarised by their means, and additionally by a
    paired t-test when more than one pair is available.

    Args:
        before_metrics (dict): File name -> flattened metrics of the baseline.
        after_metrics (dict): File name -> flattened metrics of the comparison.
    Returns:
        PairwiseTestResults: The updated and equivalent metrics as data frames.
    """
    shared_files = sorted(before_metrics.keys() & after_metrics.keys())

    # Candidate metrics are every name seen anywhere in the baseline.
    metric_names = set()
    for file_metrics in before_metrics.values():
        metric_names.update(file_metrics)

    changed = []
    unchanged = []

    for metric in sorted(metric_names, key=metric_sort_key):
        # Files in which this metric can actually be paired up.
        paired_files = [
            fname
            for fname in shared_files
            if metric in before_metrics[fname] and metric in after_metrics[fname]
        ]
        if not paired_files:
            continue

        before_vals = [before_metrics[fname][metric] for fname in paired_files]
        after_vals = [after_metrics[fname][metric] for fname in paired_files]

        before_arr = np.array(before_vals)
        after_arr = np.array(after_vals)
        delta = after_arr - before_arr

        # No difference anywhere: report the metric once with its baseline value.
        if np.allclose(delta, 0):
            unchanged.append(Equivalent(metric=metric, value=before_vals[0]))
            continue

        relative = 100 * delta / before_arr
        summary = {
            "metric": metric,
            "mean_before": np.mean(before_arr),
            "mean_after": np.mean(after_arr),
            "mean_diff": np.mean(delta),
            "mean_pct_change": np.mean(relative),
        }

        # A paired t-test needs more than one pair of observations.
        if len(paired_files) > 1:
            t_stat, p_val = ttest_rel(after_arr, before_arr)
            changed.append(
                ComparisonWithPValue(**summary, p_value=p_val, t_stat=t_stat)
            )
        else:
            changed.append(Comparison(**summary))

    return PairwiseTestResults(
        updated=pd.DataFrame(map(asdict, changed)),
        equivalent=pd.DataFrame(map(asdict, unchanged)),
    )


def main():
    """
    Command-line entry point.

    Parses the `before` and `after` paths plus the optional `--explain` flag,
    loads the metrics from both, compares them, and prints the resulting
    Markdown report to standard output.
    """
    cli = argparse.ArgumentParser(
        description="Performance comparison of Nix evaluation statistics"
    )
    cli.add_argument(
        "--explain", action="store_true", help="Explain the evaluation statistics"
    )
    cli.add_argument(
        "before", help="File or directory containing baseline (data before)"
    )
    cli.add_argument(
        "after", help="File or directory containing comparison (data after)"
    )
    args = cli.parse_args()

    baseline = load_all_metrics(Path(args.before))
    candidate = load_all_metrics(Path(args.after))

    report = perform_pairwise_tests(baseline, candidate)
    print(report.to_markdown(explain=args.explain))


# Run the comparison only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
push sheeet 2025-10-09 14:15:47 +02:00			`import argparse`
			`import json`
			`import numpy as np`
			`import os`
			`import pandas as pd`

			`from dataclasses import asdict, dataclass`
			`from pathlib import Path`
			`from scipy.stats import ttest_rel`
			`from tabulate import tabulate`
			`from typing import Final`


			`def flatten_data(json_data: dict) -> dict:`
			`"""`
			`Extracts and flattens metrics from JSON data.`
			`This is needed because the JSON data can be nested.`
			`For example, the JSON data entry might look like this:`

			`"gc":{"cycles":13,"heapSize":5404549120,"totalBytes":9545876464}`

			`Flattened:`

			`"gc.cycles": 13`
			`"gc.heapSize": 5404549120`
			`...`

			`See https://github.com/NixOS/nix/blob/187520ce88c47e2859064704f9320a2d6c97e56e/src/libexpr/eval.cc#L2846`
			`for the ultimate source of this data.`

			`Args:`
			`json_data (dict): JSON data containing metrics.`
			`Returns:`
			`dict: Flattened metrics with keys as metric names.`
			`"""`
			`flat_metrics = {}`
			`for key, value in json_data.items():`
			# This key is duplicated as `time.cpu`; we keep that copy.
			`if key == "cpuTime":`
			`continue`

			`if isinstance(value, (int, float)):`
			`flat_metrics[key] = value`
			`elif isinstance(value, dict):`
			`for subkey, subvalue in value.items():`
			`assert isinstance(subvalue, (int, float)), subvalue`
			`flat_metrics[f"{key}.{subkey}"] = subvalue`
			`else:`
			`assert isinstance(value, (float, int, dict)), (`
			f"Value `{value}` has unexpected type"
			`)`

			`return flat_metrics`


			`def load_all_metrics(path: Path) -> dict:`
			`"""`
			`Loads all stats JSON files in the specified file or directory and extracts metrics.`
			These stats JSON files are created by Nix when the `NIX_SHOW_STATS` environment variable is set.

			`If the provided path is a directory, it must have the structure $path/$system/$stats,`
			where $path is the provided path, $system is some system from `lib.systems.doubles.*`,
			`and $stats is a stats JSON file.`

			`If the provided path is a file, it is a stats JSON file.`

			`Args:`
			`path (Path): Directory containing JSON files or a stats JSON file.`

			`Returns:`
			`dict: Dictionary with filenames as keys and extracted metrics as values.`
			`"""`
			`metrics = {}`
			`if path.is_dir():`
			`for system_dir in path.iterdir():`
			`assert system_dir.is_dir()`

			`for chunk_output in system_dir.iterdir():`
			`with chunk_output.open() as f:`
			`data = json.load(f)`

			`metrics[f"{system_dir.name}/${chunk_output.name}"] = flatten_data(data)`
			`else:`
			`with path.open() as f:`
			`metrics[path.name] = flatten_data(json.load(f))`

			`return metrics`


			`def metric_table_name(name: str, explain: bool) -> str:`
			`"""`
			`Returns the name of the metric, plus a footnote to explain it if needed.`
			`"""`
			`return f"{name}[^{name}]" if explain else name`


			`METRIC_EXPLANATION_FOOTNOTE: Final[str] = """`

			[^time.cpu]: Number of seconds of CPU time accounted by the OS to the Nix evaluator process. On UNIX systems, this comes from [`getrusage(RUSAGE_SELF)`](https://man7.org/linux/man-pages/man2/getrusage.2.html).
			`[^time.gc]: Number of seconds of CPU time accounted by the Boehm garbage collector to performing GC.`
			`[^time.gcFraction]: What fraction of the total CPU time is accounted towards performing GC.`
			`[^gc.cycles]: Number of times garbage collection has been performed.`
			`[^gc.heapSize]: Size in bytes of the garbage collector heap.`
			`[^gc.totalBytes]: Size in bytes of all allocations in the garbage collector.`
			[^envs.bytes]: Size in bytes of all `Env` objects allocated by the Nix evaluator. These are almost exclusively created by [`nix-env`](https://nix.dev/manual/nix/stable/command-ref/nix-env.html).
			`[^list.bytes]: Size in bytes of all [lists](https://nix.dev/manual/nix/stable/language/syntax.html#list-literal) allocated by the Nix evaluator.`
			`[^sets.bytes]: Size in bytes of all [attrsets](https://nix.dev/manual/nix/stable/language/syntax.html#list-literal) allocated by the Nix evaluator.`
			`[^symbols.bytes]: Size in bytes of all items in the Nix evaluator symbol table.`
			`[^values.bytes]: Size in bytes of all values allocated by the Nix evaluator.`
			[^envs.number]: The count of all `Env` objects allocated.
			`[^nrAvoided]: The number of thunks avoided being created.`
			`[^nrExprs]: The number of expression objects ever created.`
			`[^nrFunctionCalls]: The number of function calls ever made.`
			`[^nrLookups]: The number of lookups into an attrset ever made.`
			`[^nrOpUpdateValuesCopied]: The number of attrset values copied in the process of merging attrsets.`
			[^nrOpUpdates]: The number of attrsets merge operations (`//`) performed.
			`[^nrPrimOpCalls]: The number of function calls to primops (Nix builtins) ever made.`
			`[^nrThunks]: The number of [thunks](https://nix.dev/manual/nix/latest/language/evaluation.html#laziness) ever made. A thunk is a delayed computation, represented by an expression reference and a closure.`
			`[^sets.number]: The number of attrsets ever made.`
			`[^symbols.number]: The number of symbols ever added to the symbol table.`
			`[^values.number]: The number of values ever made.`
			[^envs.elements]: The number of values contained within an `Env` object.
			[^list.concats]: The number of list concatenation operations (`++`) performed.
			`[^list.elements]: The number of values contained within a list.`
			`[^sets.elements]: The number of values contained within an attrset.`
			[^sizes.Attr]: Size in bytes of the `Attr` type.
			[^sizes.Bindings]: Size in bytes of the `Bindings` type.
			[^sizes.Env]: Size in bytes of the `Env` type.
			[^sizes.Value]: Size in bytes of the `Value` type.
			`"""`


			`@dataclass(frozen=True)`
			`class PairwiseTestResults:`
			`updated: pd.DataFrame`
			`equivalent: pd.DataFrame`

			`@staticmethod`
			`def tabulate(table, headers) -> str:`
			`return tabulate(`
			`table, headers, tablefmt="github", floatfmt=".4f", missingval="-"`
			`)`

			`def updated_to_markdown(self, explain: bool) -> str:`
			`assert not self.updated.empty`
			`# Header (get column names and format them)`
			`return self.tabulate(`
			`headers=[str(column) for column in self.updated.columns],`
			`table=[`
			`[`
			`# The metric acts as its own footnote name`
			`metric_table_name(row["metric"], explain),`
			`# Check for no change and NaN in p_value/t_stat`
			`*[`
			`None if np.isnan(val) or np.allclose(val, 0) else val`
			`for val in row[1:]`
			`],`
			`]`
			`for _, row in self.updated.iterrows()`
			`],`
			`)`

			`def equivalent_to_markdown(self, explain: bool) -> str:`
			`assert not self.equivalent.empty`
			`return self.tabulate(`
			`headers=[str(column) for column in self.equivalent.columns],`
			`table=[`
			`[`
			`# The metric acts as its own footnote name`
			`metric_table_name(row["metric"], explain),`
			`row["value"],`
			`]`
			`for _, row in self.equivalent.iterrows()`
			`],`
			`)`

			`def to_markdown(self, explain: bool) -> str:`
			`result = ""`

			`if not self.equivalent.empty:`
			`result += "## Unchanged values\n\n"`
			`result += self.equivalent_to_markdown(explain)`

			`if not self.updated.empty:`
			`result += ("\n\n" if result else "") + "## Updated values\n\n"`
			`result += self.updated_to_markdown(explain)`

			`if explain:`
			`result += METRIC_EXPLANATION_FOOTNOTE`

			`return result`


			`@dataclass(frozen=True)`
			`class Equivalent:`
			`metric: str`
			`value: float`


			`@dataclass(frozen=True)`
			`class Comparison:`
			`metric: str`
			`mean_before: float`
			`mean_after: float`
			`mean_diff: float`
			`mean_pct_change: float`


			`@dataclass(frozen=True)`
			`class ComparisonWithPValue(Comparison):`
			`p_value: float`
			`t_stat: float`


			`def metric_sort_key(name: str) -> str:`
			`if name in ("time.cpu", "time.gc", "time.gcFraction"):`
			`return (1, name)`
			`elif name.startswith("gc"):`
			`return (2, name)`
			`elif name.endswith(("bytes", "Bytes")):`
			`return (3, name)`
			`elif name.startswith("nr") or name.endswith("number"):`
			`return (4, name)`
			`else:`
			`return (5, name)`


			`def perform_pairwise_tests(`
			`before_metrics: dict, after_metrics: dict`
			`) -> PairwiseTestResults:`
			`common_files = sorted(set(before_metrics) & set(after_metrics))`
			`all_keys = sorted(`
			`{`
			`metric_keys`
			`for file_metrics in before_metrics.values()`
			`for metric_keys in file_metrics.keys()`
			`},`
			`key=metric_sort_key,`
			`)`

			`updated = []`
			`equivalent = []`

			`for key in all_keys:`
			`before_vals = []`
			`after_vals = []`

			`for fname in common_files:`
			`if key in before_metrics[fname] and key in after_metrics[fname]:`
			`before_vals.append(before_metrics[fname][key])`
			`after_vals.append(after_metrics[fname][key])`

			`if len(before_vals) == 0:`
			`continue`

			`before_arr = np.array(before_vals)`
			`after_arr = np.array(after_vals)`

			`diff = after_arr - before_arr`

			`# If there's no difference, add it all to the equivalent output.`
			`if np.allclose(diff, 0):`
			`equivalent.append(Equivalent(metric=key, value=before_vals[0]))`
			`else:`
			`pct_change = 100 * diff / before_arr`

			`result = Comparison(`
			`metric=key,`
			`mean_before=np.mean(before_arr),`
			`mean_after=np.mean(after_arr),`
			`mean_diff=np.mean(diff),`
			`mean_pct_change=np.mean(pct_change),`
			`)`

			`# If there are enough values to perform a t-test, do so.`
			`if len(before_vals) > 1:`
			`t_stat, p_val = ttest_rel(after_arr, before_arr)`
			`result = ComparisonWithPValue(`
			`**asdict(result), p_value=p_val, t_stat=t_stat`
			`)`

			`updated.append(result)`

			`return PairwiseTestResults(`
			`updated=pd.DataFrame(map(asdict, updated)),`
			`equivalent=pd.DataFrame(map(asdict, equivalent)),`
			`)`


			`def main():`
			`parser = argparse.ArgumentParser(`
			`description="Performance comparison of Nix evaluation statistics"`
			`)`
			`parser.add_argument(`
			`"--explain", action="store_true", help="Explain the evaluation statistics"`
			`)`
			`parser.add_argument(`
			`"before", help="File or directory containing baseline (data before)"`
			`)`
			`parser.add_argument(`
			`"after", help="File or directory containing comparison (data after)"`
			`)`

			`options = parser.parse_args()`

			`before_stats = Path(options.before)`
			`after_stats = Path(options.after)`

			`before_metrics = load_all_metrics(before_stats)`
			`after_metrics = load_all_metrics(after_stats)`
			`pairwise_test_results = perform_pairwise_tests(before_metrics, after_metrics)`
			`markdown_table = pairwise_test_results.to_markdown(explain=options.explain)`
			`print(markdown_table)`


			`if __name__ == "__main__":`
			`main()`