I am done

2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions


@@ -0,0 +1,222 @@
# mypy: allow-untyped-defs
import collections
import os
import shutil
import subprocess
try:
# no type stub for conda command line interface
import conda.cli.python_api # type: ignore[import]
from conda.cli.python_api import Commands as conda_commands
except ImportError:
# blas_compare.py will fail to import these when it's inside a conda env,
# but that's fine as it only wants the constants.
pass
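# Scratch root under which one conda sub-environment is created per BLAS
# configuration; it is wiped and recreated on every run.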
WORKING_ROOT = "/tmp/pytorch_blas_compare_environments"
MKL_2020_3 = "mkl_2020_3"
MKL_2020_0 = "mkl_2020_0"
OPEN_BLAS = "open_blas"
EIGEN = "eigen"
GENERIC_ENV_VARS = ("USE_CUDA=0", "USE_ROCM=0")
BASE_PKG_DEPS = (
"cmake",
"hypothesis",
"ninja",
"numpy",
"pyyaml",
"setuptools",
"typing_extensions",
)
SubEnvSpec = collections.namedtuple(
"SubEnvSpec", (
"generic_installs",
"special_installs",
"environment_variables",
# Validate install.
"expected_blas_symbols",
"expected_mkl_version",
))
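# One spec per BLAS backend under test. `special_installs` pins packages from a
# specific conda channel (e.g. MKL from the `intel` channel), while
# `expected_blas_symbols` / `expected_mkl_version` are checked after the build
# to verify that the intended BLAS was actually linked.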
SUB_ENVS = {
MKL_2020_3: SubEnvSpec(
generic_installs=(),
special_installs=("intel", ("mkl=2020.3", "mkl-include=2020.3")),
environment_variables=("BLAS=MKL",) + GENERIC_ENV_VARS,
expected_blas_symbols=("mkl_blas_sgemm",),
expected_mkl_version="2020.0.3",
),
MKL_2020_0: SubEnvSpec(
generic_installs=(),
special_installs=("intel", ("mkl=2020.0", "mkl-include=2020.0")),
environment_variables=("BLAS=MKL",) + GENERIC_ENV_VARS,
expected_blas_symbols=("mkl_blas_sgemm",),
expected_mkl_version="2020.0.0",
),
OPEN_BLAS: SubEnvSpec(
generic_installs=("openblas",),
special_installs=(),
environment_variables=("BLAS=OpenBLAS",) + GENERIC_ENV_VARS,
expected_blas_symbols=("exec_blas",),
expected_mkl_version=None,
),
# EIGEN: SubEnvSpec(
# generic_installs=(),
# special_installs=(),
# environment_variables=("BLAS=Eigen",) + GENERIC_ENV_VARS,
# expected_blas_symbols=(),
# ),
}
def conda_run(*args):
"""Convenience method."""
stdout, stderr, retcode = conda.cli.python_api.run_command(*args)
if retcode:
raise OSError(f"conda error: {str(args)} retcode: {retcode}\n{stderr}")
return stdout
def main():
if os.path.exists(WORKING_ROOT):
print("Cleaning: removing old working root.")
shutil.rmtree(WORKING_ROOT)
os.makedirs(WORKING_ROOT)
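# Locate the PyTorch checkout that contains this script; every sub-environment
# builds PyTorch from this source tree.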
git_root = subprocess.check_output(
"git rev-parse --show-toplevel",
shell=True,
cwd=os.path.dirname(os.path.realpath(__file__))
).decode("utf-8").strip()
for env_name, env_spec in SUB_ENVS.items():
env_path = os.path.join(WORKING_ROOT, env_name)
print(f"Creating env: {env_name}: ({env_path})")
conda_run(
conda_commands.CREATE,
"--no-default-packages",
"--prefix", env_path,
"python=3",
)
print("Testing that env can be activated:")
base_source = subprocess.run(
f"source activate {env_path}",
shell=True,
capture_output=True,
check=False,
)
if base_source.returncode:
raise OSError(
"Failed to source base environment:\n"
f" stdout: {base_source.stdout.decode('utf-8')}\n"
f" stderr: {base_source.stderr.decode('utf-8')}"
)
print("Installing packages:")
conda_run(
conda_commands.INSTALL,
"--prefix", env_path,
*(BASE_PKG_DEPS + env_spec.generic_installs)
)
if env_spec.special_installs:
channel, channel_deps = env_spec.special_installs
print(f"Installing packages from channel: {channel}")
conda_run(
conda_commands.INSTALL,
"--prefix", env_path,
"-c", channel, *channel_deps
)
if env_spec.environment_variables:
print("Setting environment variables.")
# This does not appear to be possible using the python API.
env_set = subprocess.run(
f"source activate {env_path} && "
f"conda env config vars set {' '.join(env_spec.environment_variables)}",
shell=True,
capture_output=True,
check=False,
)
if env_set.returncode:
raise OSError(
"Failed to set environment variables:\n"
f" stdout: {env_set.stdout.decode('utf-8')}\n"
f" stderr: {env_set.stderr.decode('utf-8')}"
)
# Check that they were actually set correctly.
actual_env_vars = subprocess.run(
f"source activate {env_path} && env",
shell=True,
capture_output=True,
check=True,
).stdout.decode("utf-8").strip().splitlines()
for e in env_spec.environment_variables:
assert e in actual_env_vars, f"{e} not in envs"
print(f"Building PyTorch for env: `{env_name}`")
# The `--cmake` flag forces CMake to re-run during each build so it picks
# up the new build config settings for this environment.
build_run = subprocess.run(
f"source activate {env_path} && "
f"cd {git_root} && "
"python setup.py install --cmake",
shell=True,
capture_output=True,
check=True,
)
print("Checking configuration:")
check_run = subprocess.run(
# Shameless abuse of `python -c ...`
f"source activate {env_path} && "
'python -c "'
"import torch;"
"from torch.utils.benchmark import Timer;"
"print(torch.__config__.show());"
"setup = 'x=torch.ones((128, 128));y=torch.ones((128, 128))';"
"counts = Timer('torch.mm(x, y)', setup).collect_callgrind(collect_baseline=False);"
"stats = counts.as_standardized().stats(inclusive=True);"
"print(stats.filter(lambda l: 'blas' in l.lower()))\"",
shell=True,
capture_output=True,
check=False,
)
if check_run.returncode:
raise OSError(
"Failed to set environment variables:\n"
f" stdout: {check_run.stdout.decode('utf-8')}\n"
f" stderr: {check_run.stderr.decode('utf-8')}"
)
check_run_stdout = check_run.stdout.decode('utf-8')
print(check_run_stdout)
for e in env_spec.environment_variables:
if "BLAS" in e:
assert e in check_run_stdout, f"PyTorch build did not respect `BLAS=...`: {e}"
for s in env_spec.expected_blas_symbols:
assert s in check_run_stdout, f"Expected BLAS symbol `{s}` not found in check output"
if env_spec.expected_mkl_version is not None:
assert f"- Intel(R) Math Kernel Library Version {env_spec.expected_mkl_version}" in check_run_stdout
print(f"Build complete: {env_name}")
if __name__ == "__main__":
main()


@@ -0,0 +1,99 @@
# mypy: allow-untyped-defs
"""Example of Timer and Compare APIs:
$ python -m examples.compare
"""
import pickle
import sys
import time
import torch
import torch.utils.benchmark as benchmark_utils
class FauxTorch:
"""Emulate different versions of pytorch.
In normal circumstances this would be done with multiple processes
writing serialized measurements, but this simplifies that model to
make the example clearer.
"""
def __init__(self, real_torch, extra_ns_per_element):
self._real_torch = real_torch
self._extra_ns_per_element = extra_ns_per_element
def extra_overhead(self, result):
# time.sleep has a ~65 us overhead, so only fake a
# per-element overhead if numel is large enough.
numel = int(result.numel())
if numel > 5000:
time.sleep(numel * self._extra_ns_per_element * 1e-9)
return result
def add(self, *args, **kwargs):
return self.extra_overhead(self._real_torch.add(*args, **kwargs))
def mul(self, *args, **kwargs):
return self.extra_overhead(self._real_torch.mul(*args, **kwargs))
def cat(self, *args, **kwargs):
return self.extra_overhead(self._real_torch.cat(*args, **kwargs))
def matmul(self, *args, **kwargs):
return self.extra_overhead(self._real_torch.matmul(*args, **kwargs))
def main():
tasks = [
("add", "add", "torch.add(x, y)"),
("add", "add (extra +0)", "torch.add(x, y + zero)"),
]
serialized_results = []
repeats = 2
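# Build one Timer per (branch, task, size, num_threads) combination. The
# "master" branch uses real torch; the other branches wrap it in FauxTorch to
# inject a synthetic per-element slowdown.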
timers = [
benchmark_utils.Timer(
stmt=stmt,
globals={
"torch": torch if branch == "master" else FauxTorch(torch, overhead_ns),
"x": torch.ones((size, 4)),
"y": torch.ones((1, 4)),
"zero": torch.zeros(()),
},
label=label,
sub_label=sub_label,
description=f"size: {size}",
env=branch,
num_threads=num_threads,
)
for branch, overhead_ns in [("master", None), ("my_branch", 1), ("severe_regression", 5)]
for label, sub_label, stmt in tasks
for size in [1, 10, 100, 1000, 10000, 50000]
for num_threads in [1, 4]
]
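# Run every timer `repeats` times and pickle each Measurement, mimicking
# separate benchmark runs that persist their results before comparison.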
for i, timer in enumerate(timers * repeats):
serialized_results.append(pickle.dumps(
timer.blocked_autorange(min_run_time=0.05)
))
print(f"\r{i + 1} / {len(timers) * repeats}", end="")
sys.stdout.flush()
print()
comparison = benchmark_utils.Compare([
pickle.loads(i) for i in serialized_results
])
print("== Unformatted " + "=" * 80 + "\n" + "/" * 95 + "\n")
comparison.print()
print("== Formatted " + "=" * 80 + "\n" + "/" * 93 + "\n")
comparison.trim_significant_figures()
comparison.colorize()
comparison.print()
if __name__ == "__main__":
main()


@@ -0,0 +1,86 @@
# mypy: allow-untyped-defs
"""Example of the Timer and Fuzzer APIs:
$ python -m examples.fuzzer
"""
import sys
import torch.utils.benchmark as benchmark_utils
def main():
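# Fuzz inputs for `x + y`: three log-uniform size parameters (k0-k2), a random
# dimensionality (2 with p=0.6, 3 with p=0.4), and tensors that are contiguous
# with probability 0.75, constrained to 64k-128k elements.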
add_fuzzer = benchmark_utils.Fuzzer(
parameters=[
[
benchmark_utils.FuzzedParameter(
name=f"k{i}",
minval=16,
maxval=16 * 1024,
distribution="loguniform",
) for i in range(3)
],
benchmark_utils.FuzzedParameter(
name="d",
distribution={2: 0.6, 3: 0.4},
),
],
tensors=[
[
benchmark_utils.FuzzedTensor(
name=name,
size=("k0", "k1", "k2"),
dim_parameter="d",
probability_contiguous=0.75,
min_elements=64 * 1024,
max_elements=128 * 1024,
) for name in ("x", "y")
],
],
seed=0,
)
n = 250
measurements = []
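# Each draw from the fuzzer yields random tensors plus the metadata (shape,
# memory order, contiguity) used to label the measurement below.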
for i, (tensors, tensor_properties, _) in enumerate(add_fuzzer.take(n=n)):
x, x_order = tensors["x"], str(tensor_properties["x"]["order"])
y, y_order = tensors["y"], str(tensor_properties["y"]["order"])
shape = ", ".join(tuple(f'{i:>4}' for i in x.shape))
description = "".join([
f"{x.numel():>7} | {shape:<16} | ",
f"{'contiguous' if x.is_contiguous() else x_order:<12} | ",
f"{'contiguous' if y.is_contiguous() else y_order:<12} | ",
])
timer = benchmark_utils.Timer(
stmt="x + y",
globals=tensors,
description=description,
)
measurements.append(timer.blocked_autorange(min_run_time=0.1))
measurements[-1].metadata = {"numel": x.numel()}
print(f"\r{i + 1} / {n}", end="")
sys.stdout.flush()
print()
# More string munging to make pretty output.
print(f"Average attempts per valid config: {1. / (1. - add_fuzzer.rejection_rate):.1f}")
def time_fn(m):
return m.median / m.metadata["numel"]
measurements.sort(key=time_fn)
template = f"{{:>6}}{' ' * 19}Size Shape{' ' * 13}X order Y order\n{'-' * 80}"
print(template.format("Best:"))
for m in measurements[:15]:
print(f"{time_fn(m) * 1e9:>4.1f} ns / element {m.description}")
print("\n" + template.format("Worst:"))
for m in measurements[-15:]:
print(f"{time_fn(m) * 1e9:>4.1f} ns / element {m.description}")
if __name__ == "__main__":
main()


@@ -0,0 +1,105 @@
# mypy: allow-untyped-defs
"""Example use of Timer and op fuzzers to measure kernel performance.
$ python -m examples.op_benchmark
"""
import numpy as np
import torch
from torch.utils.benchmark import Timer
from torch.utils.benchmark.op_fuzzers.binary import BinaryOpFuzzer
from torch.utils.benchmark.op_fuzzers.unary import UnaryOpFuzzer
import operator
_MEASURE_TIME = 1.0
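# Minimum measurement time (seconds) passed to Timer.blocked_autorange for each config.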
def assert_dicts_equal(dict_0, dict_1):
"""Builtin dict comparison will not compare numpy arrays.
e.g.
x = {"a": np.ones((2, 1))}
y = {"a": np.ones((2, 1))}
x == y  # Raises ValueError
"""
assert set(dict_0.keys()) == set(dict_1.keys())
assert all(np.all(v == dict_1[k]) for k, v in dict_0.items() if k != "dtype")
def run(n, stmt, fuzzer_cls):
float_iter = fuzzer_cls(seed=0, dtype=torch.float32).take(n)
int_iter = fuzzer_cls(seed=0, dtype=torch.int32).take(n)
raw_results = []
for i, (float_values, int_values) in enumerate(zip(float_iter, int_iter)):
float_tensors, float_tensor_params, float_params = float_values
int_tensors, int_tensor_params, int_params = int_values
# This benchmark assumes that the two fuzzers generate identically
# sized and strided Tensors, since the same seed is used.
assert_dicts_equal(float_params, int_params)
assert_dicts_equal(float_tensor_params["x"], int_tensor_params["x"])
float_measurement, int_measurement = (
Timer(
stmt,
globals=tensors,
).blocked_autorange(min_run_time=_MEASURE_TIME)
for tensors in (float_tensors, int_tensors)
)
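# Summarize each fuzzed tensor: shape (written as powers of two where exact),
# memory order if permuted, and step sizes if any dimension is strided.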
descriptions = []
for name in float_tensors:
shape_str = "(" + ", ".join([
f"2 ** {int(np.log2(i))}"
if 2 ** int(np.log2(i)) == i and i > 1
else str(i)
for i in float_tensors[name].shape
]) + ")"
order = float_tensor_params[name]["order"]
order_str = ("" if all(order == np.arange(len(order))) else str(tuple(order)))
steps = float_tensor_params[name]["steps"]
steps_str = str(steps) if sum(steps) > len(steps) else ""
descriptions.append((name, shape_str, order_str, steps_str))
raw_results.append((float_measurement, int_measurement, descriptions))
print(f"\r{i + 1} / {n}", end="")
print()
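# Convert medians to microseconds and compute the relative float/int gap
# (|diff| / mean); results are sorted by this gap so the least and most
# dtype-sensitive configs are printed below.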
parsed_results, name_len, shape_len, order_len, steps_len = [], 0, 0, 0, 0
for float_measurement, int_measurement, descriptions in raw_results:
t_float = float_measurement.median * 1e6
t_int = int_measurement.median * 1e6
rel_diff = abs(t_float - t_int) / (t_float + t_int) * 2
parsed_results.append((t_float, t_int, rel_diff, descriptions))
for name, shape, order, steps in descriptions:
name_len = max(name_len, len(name))
shape_len = max(shape_len, len(shape))
order_len = max(order_len, len(order))
steps_len = max(steps_len, len(steps))
parsed_results.sort(key=operator.itemgetter(2))
print(f"stmt: {stmt}")
print(f" diff faster{'':>17}{' ' * name_len} ", end="")
print(f"{'shape'.ljust(shape_len)}{'':>16}{'order'.ljust(order_len)}", end="")
print(f" steps\n{'-' * 100}")
for results, spacer in [(parsed_results[:10], "..."), (parsed_results[-10:], "")]:
for t_float, t_int, rel_diff, descriptions in results:
time_str = [f"{rel_diff * 100:>4.1f}% {'int' if t_int < t_float else 'float':<20}"]
time_str.extend(["".ljust(len(time_str[0])) for _ in descriptions[:-1]])
for t_str, (name, shape, order, steps) in zip(time_str, descriptions):
name = f"{name}:".ljust(name_len + 1)
shape = shape.ljust(shape_len + 10)
order = order.ljust(order_len)
print(f"{t_str} {name} {shape}| {order} | {steps}")
print(spacer)
def main():
run(n=100, stmt="torch.median(x, dim=0)", fuzzer_cls=UnaryOpFuzzer)
run(n=100, stmt="torch.square(x)", fuzzer_cls=UnaryOpFuzzer)
run(n=100, stmt="x + y", fuzzer_cls=BinaryOpFuzzer)
if __name__ == "__main__":
main()


@@ -0,0 +1,26 @@
# mypy: allow-untyped-defs
"""Trivial use of Timer API:
$ python -m examples.simple_timeit
"""
import torch
import torch.utils.benchmark as benchmark_utils
def main():
timer = benchmark_utils.Timer(
stmt="x + y",
globals={"x": torch.ones((4, 8)), "y": torch.ones((1, 8))},
label="Broadcasting add (4x8)",
)
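# timeit(10000) runs the statement a fixed number of times (mirroring the
# stdlib timeit API), while blocked_autorange() chooses the number of runs
# itself and keeps measuring until enough runtime has been collected.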
for i in range(3):
print(f"Run: {i}\n{'-' * 40}")
print(f"timeit:\n{timer.timeit(10000)}\n")
print(f"autorange:\n{timer.blocked_autorange()}\n\n")
if __name__ == "__main__":
main()


@@ -0,0 +1,114 @@
# mypy: allow-untyped-defs
"""Microbenchmarks for the torch.fft module"""
from argparse import ArgumentParser
from collections import namedtuple
from collections.abc import Iterable
import torch
import torch.fft
from torch.utils import benchmark
from torch.utils.benchmark.op_fuzzers.spectral import SpectralOpFuzzer
def _dim_options(ndim):
if ndim == 1:
return [None]
elif ndim == 2:
return [0, 1, None]
elif ndim == 3:
return [0, 1, 2, (0, 1), (0, 2), None]
raise ValueError(f"Expected ndim in range 1-3, got {ndim}")
def run_benchmark(name: str, function: object, dtype: torch.dtype, seed: int, device: str, samples: int,
probability_regular: float):
cuda = device == 'cuda'
spectral_fuzzer = SpectralOpFuzzer(seed=seed, dtype=dtype, cuda=cuda,
probability_regular=probability_regular)
results = []
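# The fuzzer yields `samples` random spectral-op inputs; each one is timed over
# every valid `dim` option and thread count (CUDA runs use a single thread count).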
for tensors, tensor_params, params in spectral_fuzzer.take(samples):
shape = [params['k0'], params['k1'], params['k2']][:params['ndim']]
str_shape = ' x '.join([f"{s:<4}" for s in shape])
sub_label = f"{str_shape} {'' if tensor_params['x']['is_contiguous'] else '(discontiguous)'}"
for dim in _dim_options(params['ndim']):
for nthreads in (1, 4, 16) if not cuda else (1,):
measurement = benchmark.Timer(
stmt='func(x, dim=dim)',
globals={'func': function, 'x': tensors['x'], 'dim': dim},
label=f"{name}_{device}",
sub_label=sub_label,
description=f"dim={dim}",
num_threads=nthreads,
).blocked_autorange(min_run_time=1)
measurement.metadata = {
'name': name,
'device': device,
'dim': dim,
'shape': shape,
}
measurement.metadata.update(tensor_params['x'])
results.append(measurement)
return results
Benchmark = namedtuple('Benchmark', ['name', 'function', 'dtype'])
BENCHMARKS = [
Benchmark('fft_real', torch.fft.fftn, torch.float32),
Benchmark('fft_complex', torch.fft.fftn, torch.complex64),
Benchmark('ifft', torch.fft.ifftn, torch.complex64),
Benchmark('rfft', torch.fft.rfftn, torch.float32),
Benchmark('irfft', torch.fft.irfftn, torch.complex64),
]
BENCHMARK_MAP = {b.name: b for b in BENCHMARKS}
BENCHMARK_NAMES = [b.name for b in BENCHMARKS]
DEVICE_NAMES = ['cpu', 'cuda']
def _output_csv(file, results):
file.write('benchmark,device,num_threads,numel,shape,contiguous,dim,mean (us),median (us),iqr (us)\n')
for measurement in results:
metadata = measurement.metadata
device, dim, shape, name, numel, contiguous = (
metadata['device'], metadata['dim'], metadata['shape'],
metadata['name'], metadata['numel'], metadata['is_contiguous'])
if isinstance(dim, Iterable):
dim_str = '-'.join(str(d) for d in dim)
else:
dim_str = str(dim)
shape_str = 'x'.join(str(s) for s in shape)
print(name, device, measurement.task_spec.num_threads, numel, shape_str, contiguous, dim_str, # type: ignore[possibly-undefined]
measurement.mean * 1e6, measurement.median * 1e6, measurement.iqr * 1e6,
sep=',', file=file)
if __name__ == '__main__':
parser = ArgumentParser(description=__doc__)
parser.add_argument('--device', type=str, choices=DEVICE_NAMES, nargs='+', default=DEVICE_NAMES)
parser.add_argument('--bench', type=str, choices=BENCHMARK_NAMES, nargs='+', default=BENCHMARK_NAMES)
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--samples', type=int, default=10)
parser.add_argument('--probability-regular', '--probability_regular', type=float, default=1.0)
parser.add_argument('-o', '--output', type=str)
args = parser.parse_args()
num_benchmarks = len(args.device) * len(args.bench)
i = 0
results = []
for device in args.device:
for bench in (BENCHMARK_MAP[b] for b in args.bench):
results += run_benchmark(
name=bench.name, function=bench.function, dtype=bench.dtype,
seed=args.seed, device=device, samples=args.samples,
probability_regular=args.probability_regular)
i += 1
print(f'Completed {bench.name} benchmark on {device} ({i} of {num_benchmarks})')
if args.output is not None:
with open(args.output, 'w') as f:
_output_csv(f, results)
compare = benchmark.Compare(results)
compare.trim_significant_figures()
compare.colorize()
compare.print()