I am done

2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions

View File

@@ -0,0 +1,6 @@
from torch.utils.benchmark.utils.common import * # noqa: F403
from torch.utils.benchmark.utils.timer import * # noqa: F403
from torch.utils.benchmark.utils.compare import * # noqa: F403
from torch.utils.benchmark.utils.fuzzer import * # noqa: F403
from torch.utils.benchmark.utils.valgrind_wrapper.timer_interface import * # noqa: F403
from torch.utils.benchmark.utils.sparse_fuzzer import * # noqa: F403

View File

@@ -0,0 +1,222 @@
# mypy: allow-untyped-defs
import collections
import os
import shutil
import subprocess
try:
# no type stub for conda command line interface
import conda.cli.python_api # type: ignore[import]
from conda.cli.python_api import Commands as conda_commands
except ImportError:
# blas_compare.py will fail to import these when it's inside a conda env,
# but that's fine as it only wants the constants.
pass
WORKING_ROOT = "/tmp/pytorch_blas_compare_environments"
MKL_2020_3 = "mkl_2020_3"
MKL_2020_0 = "mkl_2020_0"
OPEN_BLAS = "open_blas"
EIGEN = "eigen"
GENERIC_ENV_VARS = ("USE_CUDA=0", "USE_ROCM=0")
BASE_PKG_DEPS = (
"cmake",
"hypothesis",
"ninja",
"numpy",
"pyyaml",
"setuptools",
"typing_extensions",
)
SubEnvSpec = collections.namedtuple(
"SubEnvSpec", (
"generic_installs",
"special_installs",
"environment_variables",
# Validate install.
"expected_blas_symbols",
"expected_mkl_version",
))
SUB_ENVS = {
MKL_2020_3: SubEnvSpec(
generic_installs=(),
special_installs=("intel", ("mkl=2020.3", "mkl-include=2020.3")),
environment_variables=("BLAS=MKL",) + GENERIC_ENV_VARS,
expected_blas_symbols=("mkl_blas_sgemm",),
expected_mkl_version="2020.0.3",
),
MKL_2020_0: SubEnvSpec(
generic_installs=(),
special_installs=("intel", ("mkl=2020.0", "mkl-include=2020.0")),
environment_variables=("BLAS=MKL",) + GENERIC_ENV_VARS,
expected_blas_symbols=("mkl_blas_sgemm",),
expected_mkl_version="2020.0.0",
),
OPEN_BLAS: SubEnvSpec(
generic_installs=("openblas",),
special_installs=(),
environment_variables=("BLAS=OpenBLAS",) + GENERIC_ENV_VARS,
expected_blas_symbols=("exec_blas",),
expected_mkl_version=None,
),
# EIGEN: SubEnvSpec(
# generic_installs=(),
# special_installs=(),
# environment_variables=("BLAS=Eigen",) + GENERIC_ENV_VARS,
# expected_blas_symbols=(),
# ),
}
def conda_run(*args):
"""Convenience method."""
stdout, stderr, retcode = conda.cli.python_api.run_command(*args)
if retcode:
raise OSError(f"conda error: {str(args)} retcode: {retcode}\n{stderr}")
return stdout
def main():
if os.path.exists(WORKING_ROOT):
print("Cleaning: removing old working root.")
shutil.rmtree(WORKING_ROOT)
os.makedirs(WORKING_ROOT)
git_root = subprocess.check_output(
"git rev-parse --show-toplevel",
shell=True,
cwd=os.path.dirname(os.path.realpath(__file__))
).decode("utf-8").strip()
for env_name, env_spec in SUB_ENVS.items():
env_path = os.path.join(WORKING_ROOT, env_name)
print(f"Creating env: {env_name}: ({env_path})")
conda_run(
conda_commands.CREATE,
"--no-default-packages",
"--prefix", env_path,
"python=3",
)
print("Testing that env can be activated:")
base_source = subprocess.run(
f"source activate {env_path}",
shell=True,
capture_output=True,
check=False,
)
if base_source.returncode:
raise OSError(
"Failed to source base environment:\n"
f" stdout: {base_source.stdout.decode('utf-8')}\n"
f" stderr: {base_source.stderr.decode('utf-8')}"
)
print("Installing packages:")
conda_run(
conda_commands.INSTALL,
"--prefix", env_path,
*(BASE_PKG_DEPS + env_spec.generic_installs)
)
if env_spec.special_installs:
channel, channel_deps = env_spec.special_installs
print(f"Installing packages from channel: {channel}")
conda_run(
conda_commands.INSTALL,
"--prefix", env_path,
"-c", channel, *channel_deps
)
if env_spec.environment_variables:
print("Setting environment variables.")
# This does not appear to be possible using the python API.
env_set = subprocess.run(
f"source activate {env_path} && "
f"conda env config vars set {' '.join(env_spec.environment_variables)}",
shell=True,
capture_output=True,
check=False,
)
if env_set.returncode:
raise OSError(
"Failed to set environment variables:\n"
f" stdout: {env_set.stdout.decode('utf-8')}\n"
f" stderr: {env_set.stderr.decode('utf-8')}"
)
# Check that they were actually set correctly.
actual_env_vars = subprocess.run(
f"source activate {env_path} && env",
shell=True,
capture_output=True,
check=True,
).stdout.decode("utf-8").strip().splitlines()
for e in env_spec.environment_variables:
assert e in actual_env_vars, f"{e} not in envs"
print(f"Building PyTorch for env: `{env_name}`")
# We have to re-run during each build to pick up the new
# build config settings.
build_run = subprocess.run(
f"source activate {env_path} && "
f"cd {git_root} && "
"python setup.py install --cmake",
shell=True,
capture_output=True,
check=True,
)
print("Checking configuration:")
check_run = subprocess.run(
# Shameless abuse of `python -c ...`
f"source activate {env_path} && "
'python -c "'
"import torch;"
"from torch.utils.benchmark import Timer;"
"print(torch.__config__.show());"
"setup = 'x=torch.ones((128, 128));y=torch.ones((128, 128))';"
"counts = Timer('torch.mm(x, y)', setup).collect_callgrind(collect_baseline=False);"
"stats = counts.as_standardized().stats(inclusive=True);"
"print(stats.filter(lambda l: 'blas' in l.lower()))\"",
shell=True,
capture_output=True,
check=False,
)
if check_run.returncode:
raise OSError(
"Failed to set environment variables:\n"
f" stdout: {check_run.stdout.decode('utf-8')}\n"
f" stderr: {check_run.stderr.decode('utf-8')}"
)
check_run_stdout = check_run.stdout.decode('utf-8')
print(check_run_stdout)
for e in env_spec.environment_variables:
if "BLAS" in e:
assert e in check_run_stdout, f"PyTorch build did not respect `BLAS=...`: {e}"
for s in env_spec.expected_blas_symbols:
assert s in check_run_stdout
if env_spec.expected_mkl_version is not None:
assert f"- Intel(R) Math Kernel Library Version {env_spec.expected_mkl_version}" in check_run_stdout
print(f"Build complete: {env_name}")
if __name__ == "__main__":
main()
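
For readability, the configuration check that the script above squeezes into a single `python -c` invocation is sketched below as a standalone snippet. It is only a sketch: it assumes it is executed inside one of the freshly built sub-environments (and, for the Callgrind portion, that Valgrind is available), and it mirrors the statements embedded above.

import torch
from torch.utils.benchmark import Timer

# Confirm which BLAS backend the build picked up.
print(torch.__config__.show())

# Count instructions for a small matmul and keep only BLAS-related entries.
setup = "x = torch.ones((128, 128)); y = torch.ones((128, 128))"
counts = Timer("torch.mm(x, y)", setup).collect_callgrind(collect_baseline=False)
stats = counts.as_standardized().stats(inclusive=True)
print(stats.filter(lambda l: "blas" in l.lower()))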

View File

@@ -0,0 +1,99 @@
# mypy: allow-untyped-defs
"""Example of Timer and Compare APIs:
$ python -m examples.compare
"""
import pickle
import sys
import time
import torch
import torch.utils.benchmark as benchmark_utils
class FauxTorch:
"""Emulate different versions of pytorch.
In normal circumstances this would be done with multiple processes
writing serialized measurements, but this simplifies that model to
make the example clearer.
"""
def __init__(self, real_torch, extra_ns_per_element):
self._real_torch = real_torch
self._extra_ns_per_element = extra_ns_per_element
def extra_overhead(self, result):
# time.sleep has a ~65 us overhead, so only fake a
# per-element overhead if numel is large enough.
numel = int(result.numel())
if numel > 5000:
time.sleep(numel * self._extra_ns_per_element * 1e-9)
return result
def add(self, *args, **kwargs):
return self.extra_overhead(self._real_torch.add(*args, **kwargs))
def mul(self, *args, **kwargs):
return self.extra_overhead(self._real_torch.mul(*args, **kwargs))
def cat(self, *args, **kwargs):
return self.extra_overhead(self._real_torch.cat(*args, **kwargs))
def matmul(self, *args, **kwargs):
return self.extra_overhead(self._real_torch.matmul(*args, **kwargs))
def main():
tasks = [
("add", "add", "torch.add(x, y)"),
("add", "add (extra +0)", "torch.add(x, y + zero)"),
]
serialized_results = []
repeats = 2
timers = [
benchmark_utils.Timer(
stmt=stmt,
globals={
"torch": torch if branch == "master" else FauxTorch(torch, overhead_ns),
"x": torch.ones((size, 4)),
"y": torch.ones((1, 4)),
"zero": torch.zeros(()),
},
label=label,
sub_label=sub_label,
description=f"size: {size}",
env=branch,
num_threads=num_threads,
)
for branch, overhead_ns in [("master", None), ("my_branch", 1), ("severe_regression", 5)]
for label, sub_label, stmt in tasks
for size in [1, 10, 100, 1000, 10000, 50000]
for num_threads in [1, 4]
]
for i, timer in enumerate(timers * repeats):
serialized_results.append(pickle.dumps(
timer.blocked_autorange(min_run_time=0.05)
))
print(f"\r{i + 1} / {len(timers) * repeats}", end="")
sys.stdout.flush()
print()
comparison = benchmark_utils.Compare([
pickle.loads(i) for i in serialized_results
])
print("== Unformatted " + "=" * 80 + "\n" + "/" * 95 + "\n")
comparison.print()
print("== Formatted " + "=" * 80 + "\n" + "/" * 93 + "\n")
comparison.trim_significant_figures()
comparison.colorize()
comparison.print()
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,86 @@
# mypy: allow-untyped-defs
"""Example of the Timer and Fuzzer APIs:
$ python -m examples.fuzzer
"""
import sys
import torch.utils.benchmark as benchmark_utils
def main():
add_fuzzer = benchmark_utils.Fuzzer(
parameters=[
[
benchmark_utils.FuzzedParameter(
name=f"k{i}",
minval=16,
maxval=16 * 1024,
distribution="loguniform",
) for i in range(3)
],
benchmark_utils.FuzzedParameter(
name="d",
distribution={2: 0.6, 3: 0.4},
),
],
tensors=[
[
benchmark_utils.FuzzedTensor(
name=name,
size=("k0", "k1", "k2"),
dim_parameter="d",
probability_contiguous=0.75,
min_elements=64 * 1024,
max_elements=128 * 1024,
) for name in ("x", "y")
],
],
seed=0,
)
n = 250
measurements = []
for i, (tensors, tensor_properties, _) in enumerate(add_fuzzer.take(n=n)):
x, x_order = tensors["x"], str(tensor_properties["x"]["order"])
y, y_order = tensors["y"], str(tensor_properties["y"]["order"])
shape = ", ".join(tuple(f'{i:>4}' for i in x.shape))
description = "".join([
f"{x.numel():>7} | {shape:<16} | ",
f"{'contiguous' if x.is_contiguous() else x_order:<12} | ",
f"{'contiguous' if y.is_contiguous() else y_order:<12} | ",
])
timer = benchmark_utils.Timer(
stmt="x + y",
globals=tensors,
description=description,
)
measurements.append(timer.blocked_autorange(min_run_time=0.1))
measurements[-1].metadata = {"numel": x.numel()}
print(f"\r{i + 1} / {n}", end="")
sys.stdout.flush()
print()
# More string munging to make pretty output.
print(f"Average attempts per valid config: {1. / (1. - add_fuzzer.rejection_rate):.1f}")
def time_fn(m):
return m.median / m.metadata["numel"]
measurements.sort(key=time_fn)
template = f"{{:>6}}{' ' * 19}Size Shape{' ' * 13}X order Y order\n{'-' * 80}"
print(template.format("Best:"))
for m in measurements[:15]:
print(f"{time_fn(m) * 1e9:>4.1f} ns / element {m.description}")
print("\n" + template.format("Worst:"))
for m in measurements[-15:]:
print(f"{time_fn(m) * 1e9:>4.1f} ns / element {m.description}")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,105 @@
# mypy: allow-untyped-defs
"""Example use of Timer and op fuzzers to measure kernel performance.
$ python -m examples.op_benchmark
"""
import numpy as np
import torch
from torch.utils.benchmark import Timer
from torch.utils.benchmark.op_fuzzers.binary import BinaryOpFuzzer
from torch.utils.benchmark.op_fuzzers.unary import UnaryOpFuzzer
import operator
_MEASURE_TIME = 1.0
def assert_dicts_equal(dict_0, dict_1):
"""Builtin dict comparison will not compare numpy arrays.
e.g.
x = {"a": np.ones((2, 1))}
x == {"a": np.ones((2, 1))} # Raises ValueError
"""
assert set(dict_0.keys()) == set(dict_1.keys())
assert all(np.all(v == dict_1[k]) for k, v in dict_0.items() if k != "dtype")
def run(n, stmt, fuzzer_cls):
float_iter = fuzzer_cls(seed=0, dtype=torch.float32).take(n)
int_iter = fuzzer_cls(seed=0, dtype=torch.int32).take(n)
raw_results = []
for i, (float_values, int_values) in enumerate(zip(float_iter, int_iter)):
float_tensors, float_tensor_params, float_params = float_values
int_tensors, int_tensor_params, int_params = int_values
# This benchmark assumes that the two fuzzers generate identically
# sized and strided Tensors, since the same seed is used.
assert_dicts_equal(float_params, int_params)
assert_dicts_equal(float_tensor_params["x"], int_tensor_params["x"])
float_measurement, int_measurement = (
Timer(
stmt,
globals=tensors,
).blocked_autorange(min_run_time=_MEASURE_TIME)
for tensors in (float_tensors, int_tensors)
)
descriptions = []
for name in float_tensors:
shape_str = "(" + ", ".join([
f"2 ** {int(np.log2(i))}"
if 2 ** int(np.log2(i)) == i and i > 1
else str(i)
for i in float_tensors[name].shape
]) + ")"
order = float_tensor_params[name]["order"]
order_str = ("" if all(order == np.arange(len(order))) else str(tuple(order)))
steps = float_tensor_params[name]["steps"]
steps_str = str(steps) if sum(steps) > len(steps) else ""
descriptions.append((name, shape_str, order_str, steps_str))
raw_results.append((float_measurement, int_measurement, descriptions))
print(f"\r{i + 1} / {n}", end="")
print()
parsed_results, name_len, shape_len, order_len, steps_len = [], 0, 0, 0, 0
for float_measurement, int_measurement, descriptions in raw_results:
t_float = float_measurement.median * 1e6
t_int = int_measurement.median * 1e6
rel_diff = abs(t_float - t_int) / (t_float + t_int) * 2
parsed_results.append((t_float, t_int, rel_diff, descriptions))
for name, shape, order, steps in descriptions:
name_len = max(name_len, len(name))
shape_len = max(shape_len, len(shape))
order_len = max(order_len, len(order))
steps_len = max(steps_len, len(steps))
parsed_results.sort(key=operator.itemgetter(2))
print(f"stmt: {stmt}")
print(f" diff faster{'':>17}{' ' * name_len} ", end="")
print(f"{'shape'.ljust(shape_len)}{'':>16}{'order'.ljust(order_len)}", end="")
print(f" steps\n{'-' * 100}")
for results, spacer in [(parsed_results[:10], "..."), (parsed_results[-10:], "")]:
for t_float, t_int, rel_diff, descriptions in results:
time_str = [f"{rel_diff * 100:>4.1f}% {'int' if t_int < t_float else 'float':<20}"]
time_str.extend(["".ljust(len(time_str[0])) for _ in descriptions[:-1]])
for t_str, (name, shape, order, steps) in zip(time_str, descriptions):
name = f"{name}:".ljust(name_len + 1)
shape = shape.ljust(shape_len + 10)
order = order.ljust(order_len)
print(f"{t_str} {name} {shape}| {order} | {steps}")
print(spacer)
def main():
run(n=100, stmt="torch.median(x, dim=0)", fuzzer_cls=UnaryOpFuzzer)
run(n=100, stmt="torch.square(x)", fuzzer_cls=UnaryOpFuzzer)
run(n=100, stmt="x + y", fuzzer_cls=BinaryOpFuzzer)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,26 @@
# mypy: allow-untyped-defs
"""Trivial use of Timer API:
$ python -m examples.simple_timeit
"""
import torch
import torch.utils.benchmark as benchmark_utils
def main():
timer = benchmark_utils.Timer(
stmt="x + y",
globals={"x": torch.ones((4, 8)), "y": torch.ones((1, 8))},
label="Broadcasting add (4x8)",
)
for i in range(3):
print(f"Run: {i}\n{'-' * 40}")
print(f"timeit:\n{timer.timeit(10000)}\n")
print(f"autorange:\n{timer.blocked_autorange()}\n\n")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,114 @@
# mypy: allow-untyped-defs
"""Microbenchmarks for the torch.fft module"""
from argparse import ArgumentParser
from collections import namedtuple
from collections.abc import Iterable
import torch
import torch.fft
from torch.utils import benchmark
from torch.utils.benchmark.op_fuzzers.spectral import SpectralOpFuzzer
def _dim_options(ndim):
if ndim == 1:
return [None]
elif ndim == 2:
return [0, 1, None]
elif ndim == 3:
return [0, 1, 2, (0, 1), (0, 2), None]
raise ValueError(f"Expected ndim in range 1-3, got {ndim}")
def run_benchmark(name: str, function: object, dtype: torch.dtype, seed: int, device: str, samples: int,
probability_regular: float):
cuda = device == 'cuda'
spectral_fuzzer = SpectralOpFuzzer(seed=seed, dtype=dtype, cuda=cuda,
probability_regular=probability_regular)
results = []
for tensors, tensor_params, params in spectral_fuzzer.take(samples):
shape = [params['k0'], params['k1'], params['k2']][:params['ndim']]
str_shape = ' x '.join([f"{s:<4}" for s in shape])
sub_label = f"{str_shape} {'' if tensor_params['x']['is_contiguous'] else '(discontiguous)'}"
for dim in _dim_options(params['ndim']):
for nthreads in (1, 4, 16) if not cuda else (1,):
measurement = benchmark.Timer(
stmt='func(x, dim=dim)',
globals={'func': function, 'x': tensors['x'], 'dim': dim},
label=f"{name}_{device}",
sub_label=sub_label,
description=f"dim={dim}",
num_threads=nthreads,
).blocked_autorange(min_run_time=1)
measurement.metadata = {
'name': name,
'device': device,
'dim': dim,
'shape': shape,
}
measurement.metadata.update(tensor_params['x'])
results.append(measurement)
return results
Benchmark = namedtuple('Benchmark', ['name', 'function', 'dtype'])
BENCHMARKS = [
Benchmark('fft_real', torch.fft.fftn, torch.float32),
Benchmark('fft_complex', torch.fft.fftn, torch.complex64),
Benchmark('ifft', torch.fft.ifftn, torch.complex64),
Benchmark('rfft', torch.fft.rfftn, torch.float32),
Benchmark('irfft', torch.fft.irfftn, torch.complex64),
]
BENCHMARK_MAP = {b.name: b for b in BENCHMARKS}
BENCHMARK_NAMES = [b.name for b in BENCHMARKS]
DEVICE_NAMES = ['cpu', 'cuda']
def _output_csv(file, results):
file.write('benchmark,device,num_threads,numel,shape,contiguous,dim,mean (us),median (us),iqr (us)\n')
for measurement in results:
metadata = measurement.metadata
device, dim, shape, name, numel, contiguous = (
metadata['device'], metadata['dim'], metadata['shape'],
metadata['name'], metadata['numel'], metadata['is_contiguous'])
if isinstance(dim, Iterable):
dim_str = '-'.join(str(d) for d in dim)
else:
dim_str = str(dim)
shape_str = 'x'.join(str(s) for s in shape)
print(name, device, measurement.task_spec.num_threads, numel, shape_str, contiguous, dim_str, # type: ignore[possibly-undefined]
measurement.mean * 1e6, measurement.median * 1e6, measurement.iqr * 1e6,
sep=',', file=file)
if __name__ == '__main__':
parser = ArgumentParser(description=__doc__)
parser.add_argument('--device', type=str, choices=DEVICE_NAMES, nargs='+', default=DEVICE_NAMES)
parser.add_argument('--bench', type=str, choices=BENCHMARK_NAMES, nargs='+', default=BENCHMARK_NAMES)
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--samples', type=int, default=10)
parser.add_argument('--probability-regular', '--probability_regular', type=float, default=1.0)
parser.add_argument('-o', '--output', type=str)
args = parser.parse_args()
num_benchmarks = len(args.device) * len(args.bench)
i = 0
results = []
for device in args.device:
for bench in (BENCHMARK_MAP[b] for b in args.bench):
results += run_benchmark(
name=bench.name, function=bench.function, dtype=bench.dtype,
seed=args.seed, device=device, samples=args.samples,
probability_regular=args.probability_regular)
i += 1
print(f'Completed {bench.name} benchmark on {device} ({i} of {num_benchmarks})')
if args.output is not None:
with open(args.output, 'w') as f:
_output_csv(f, results)
compare = benchmark.Compare(results)
compare.trim_significant_figures()
compare.colorize()
compare.print()

View File

@@ -0,0 +1,107 @@
# mypy: allow-untyped-defs
import numpy as np
import torch
from torch.utils.benchmark import Fuzzer, FuzzedParameter, ParameterAlias, FuzzedTensor
_MIN_DIM_SIZE = 16
_MAX_DIM_SIZE = 16 * 1024 ** 2
_POW_TWO_SIZES = tuple(2 ** i for i in range(
int(np.log2(_MIN_DIM_SIZE)),
int(np.log2(_MAX_DIM_SIZE)) + 1,
))
class BinaryOpFuzzer(Fuzzer):
def __init__(self, seed, dtype=torch.float32, cuda=False):
super().__init__(
parameters=[
# Dimensionality of x and y. (e.g. 1D, 2D, or 3D.)
FuzzedParameter("dim", distribution={1: 0.3, 2: 0.4, 3: 0.3}, strict=True),
# Shapes for `x` and `y`.
# It is important to test all shapes, however
# powers of two are especially important and therefore
# warrant special attention. This is done by generating
# both a value drawn from all integers between the min and
# max allowed values, and another from only the powers of two
# (both distributions are loguniform) and then randomly
# selecting between the two.
# Moreover, `y` will occasionally have singleton
# dimensions in order to test broadcasting.
[
FuzzedParameter(
name=f"k_any_{i}",
minval=_MIN_DIM_SIZE,
maxval=_MAX_DIM_SIZE,
distribution="loguniform",
) for i in range(3)
],
[
FuzzedParameter(
name=f"k_pow2_{i}",
distribution={size: 1. / len(_POW_TWO_SIZES) for size in _POW_TWO_SIZES}
) for i in range(3)
],
[
FuzzedParameter(
name=f"k{i}",
distribution={
ParameterAlias(f"k_any_{i}"): 0.8,
ParameterAlias(f"k_pow2_{i}"): 0.2,
},
strict=True,
) for i in range(3)
],
[
FuzzedParameter(
name=f"y_k{i}",
distribution={
ParameterAlias(f"k{i}"): 0.8,
1: 0.2,
},
strict=True,
) for i in range(3)
],
# Steps for `x` and `y`. (Benchmarks strided memory access.)
[
FuzzedParameter(
name=f"{name}_step_{i}",
distribution={1: 0.8, 2: 0.06, 4: 0.06, 8: 0.04, 16: 0.04},
)
for i in range(3)
for name in ("x", "y")
],
# Repeatable entropy for downstream applications.
FuzzedParameter(name="random_value", minval=0, maxval=2 ** 32 - 1, distribution="uniform"),
],
tensors=[
FuzzedTensor(
name="x",
size=("k0", "k1", "k2"),
steps=("x_step_0", "x_step_1", "x_step_2"),
probability_contiguous=0.75,
min_elements=4 * 1024,
max_elements=32 * 1024 ** 2,
max_allocation_bytes=2 * 1024**3, # 2 GB
dim_parameter="dim",
dtype=dtype,
cuda=cuda,
),
FuzzedTensor(
name="y",
size=("y_k0", "y_k1", "y_k2"),
steps=("x_step_0", "x_step_1", "x_step_2"),
probability_contiguous=0.75,
max_allocation_bytes=2 * 1024**3, # 2 GB
dim_parameter="dim",
dtype=dtype,
cuda=cuda,
),
],
seed=seed,
)
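
As a quick illustration of how this fuzzer is consumed (the same pattern as the op_benchmark example earlier in this commit), a short sketch; the import path matches the one used there and is otherwise an assumption about where this file lives.

import torch
from torch.utils.benchmark.op_fuzzers.binary import BinaryOpFuzzer

# Draw a few fuzzed (x, y) pairs and inspect the sampled shapes.
fuzzer = BinaryOpFuzzer(seed=0, dtype=torch.float32)
for tensors, tensor_params, params in fuzzer.take(3):
    x, y = tensors["x"], tensors["y"]
    # `y` may carry singleton dimensions to exercise broadcasting.
    print(tuple(x.shape), tuple(y.shape), params["dim"])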

View File

@@ -0,0 +1,107 @@
# mypy: allow-untyped-defs
import numpy as np
import torch
from torch.utils.benchmark import Fuzzer, FuzzedParameter, ParameterAlias, FuzzedSparseTensor
_MIN_DIM_SIZE = 16
_MAX_DIM_SIZE = 16 * 1024 ** 2
_POW_TWO_SIZES = tuple(2 ** i for i in range(
int(np.log2(_MIN_DIM_SIZE)),
int(np.log2(_MAX_DIM_SIZE)) + 1,
))
class BinaryOpSparseFuzzer(Fuzzer):
def __init__(self, seed, dtype=torch.float32, cuda=False):
super().__init__(
parameters=[
# Dimensionality of x and y. (e.g. 1D, 2D, or 3D.)
FuzzedParameter("dim_parameter", distribution={1: 0.3, 2: 0.4, 3: 0.3}, strict=True),
FuzzedParameter(
name="sparse_dim",
distribution={1: 0.4, 2: 0.4, 3: 0.2},
strict=True
),
# Shapes for `x` and `y`.
# It is important to test all shapes, however
# powers of two are especially important and therefore
# warrant special attention. This is done by generating
# both a value drawn from all integers between the min and
# max allowed values, and another from only the powers of two
# (both distributions are loguniform) and then randomly
# selecting between the two.
# Moreover, `y` will occasionally have singleton
# dimensions in order to test broadcasting.
[
FuzzedParameter(
name=f"k_any_{i}",
minval=_MIN_DIM_SIZE,
maxval=_MAX_DIM_SIZE,
distribution="loguniform",
) for i in range(3)
],
[
FuzzedParameter(
name=f"k_pow2_{i}",
distribution={size: 1. / len(_POW_TWO_SIZES) for size in _POW_TWO_SIZES}
) for i in range(3)
],
[
FuzzedParameter(
name=f"k{i}",
distribution={
ParameterAlias(f"k_any_{i}"): 0.8,
ParameterAlias(f"k_pow2_{i}"): 0.2,
},
strict=True,
) for i in range(3)
],
[
FuzzedParameter(
name=f"y_k{i}",
distribution={
ParameterAlias(f"k{i}"): 1.0},
strict=True,
) for i in range(3)
],
FuzzedParameter(
name="density",
distribution={0.1: 0.4, 0.05: 0.3, 0.01: 0.3},
),
FuzzedParameter(
name="coalesced",
distribution={True: 0.5, False: 0.5},
),
# Repeatable entropy for downstream applications.
FuzzedParameter(name="random_value", minval=0, maxval=2 ** 32 - 1, distribution="uniform"),
],
tensors=[
FuzzedSparseTensor(
name="x",
size=("k0", "k1", "k2"),
dim_parameter="dim_parameter",
sparse_dim="sparse_dim",
density="density",
coalesced="coalesced",
min_elements=4 * 1024,
max_elements=32 * 1024 ** 2,
dtype=dtype,
cuda=cuda,
),
FuzzedSparseTensor(
name="y",
size=("y_k0", "y_k1", "y_k2"),
dim_parameter="dim_parameter",
sparse_dim="sparse_dim",
density="density",
coalesced="coalesced",
min_elements=4 * 1024,
max_elements=32 * 1024 ** 2,
dtype=dtype,
cuda=cuda,
),
],
seed=seed,
)

View File

@@ -0,0 +1,83 @@
# mypy: allow-untyped-defs
import numpy as np
import torch
from torch.utils.benchmark import Fuzzer, FuzzedParameter, ParameterAlias, FuzzedSparseTensor
_MIN_DIM_SIZE = 16
_MAX_DIM_SIZE = 16 * 1024 ** 2
_POW_TWO_SIZES = tuple(2 ** i for i in range(
int(np.log2(_MIN_DIM_SIZE)),
int(np.log2(_MAX_DIM_SIZE)) + 1,
))
class UnaryOpSparseFuzzer(Fuzzer):
def __init__(self, seed, dtype=torch.float32, cuda=False):
super().__init__(
parameters=[
# Sparse dim parameter of x. (e.g. 1D, 2D, or 3D.)
FuzzedParameter("dim_parameter", distribution={1: 0.3, 2: 0.4, 3: 0.3}, strict=True),
FuzzedParameter(
name="sparse_dim",
distribution={1: 0.4, 2: 0.4, 3: 0.2},
strict=True
),
# Shapes for `x`.
# It is important to test all shapes, however
# powers of two are especially important and therefore
# warrant special attention. This is done by generating
# both a value drawn from all integers between the min and
# max allowed values, and another from only the powers of two
# (both distributions are loguniform) and then randomly
# selecting between the two.
[
FuzzedParameter(
name=f"k_any_{i}",
minval=_MIN_DIM_SIZE,
maxval=_MAX_DIM_SIZE,
distribution="loguniform",
) for i in range(3)
],
[
FuzzedParameter(
name=f"k_pow2_{i}",
distribution={size: 1. / len(_POW_TWO_SIZES) for size in _POW_TWO_SIZES}
) for i in range(3)
],
[
FuzzedParameter(
name=f"k{i}",
distribution={
ParameterAlias(f"k_any_{i}"): 0.8,
ParameterAlias(f"k_pow2_{i}"): 0.2,
},
strict=True,
) for i in range(3)
],
FuzzedParameter(
name="density",
distribution={0.1: 0.4, 0.05: 0.3, 0.01: 0.3},
),
FuzzedParameter(
name="coalesced",
distribution={True: 0.5, False: 0.5},
),
FuzzedParameter(name="random_value", minval=0, maxval=2 ** 32 - 1, distribution="uniform"),
],
tensors=[
FuzzedSparseTensor(
name="x",
size=("k0", "k1", "k2"),
dim_parameter="dim_parameter",
sparse_dim="sparse_dim",
min_elements=4 * 1024,
max_elements=32 * 1024 ** 2,
density="density",
coalesced="coalesced",
dtype=dtype,
cuda=cuda,
),
],
seed=seed,
)

View File

@@ -0,0 +1,94 @@
# mypy: allow-untyped-defs
import math
import torch
from torch.utils import benchmark
from torch.utils.benchmark import FuzzedParameter, FuzzedTensor, ParameterAlias
__all__ = ['SpectralOpFuzzer']
MIN_DIM_SIZE = 16
MAX_DIM_SIZE = 16 * 1024
def power_range(upper_bound, base):
return (base ** i for i in range(int(math.log(upper_bound, base)) + 1))
# List of regular numbers from MIN_DIM_SIZE to MAX_DIM_SIZE
# These numbers factorize into multiples of prime factors 2, 3, and 5 only
# and are usually the fastest in FFT implementations.
REGULAR_SIZES = []
for i in power_range(MAX_DIM_SIZE, 2):
for j in power_range(MAX_DIM_SIZE // i, 3):
ij = i * j
for k in power_range(MAX_DIM_SIZE // ij, 5):
ijk = ij * k
if ijk > MIN_DIM_SIZE:
REGULAR_SIZES.append(ijk)
REGULAR_SIZES.sort()
class SpectralOpFuzzer(benchmark.Fuzzer):
def __init__(self, *, seed: int, dtype=torch.float64,
cuda: bool = False, probability_regular: float = 1.0):
super().__init__(
parameters=[
# Dimensionality of x. (e.g. 1D, 2D, or 3D.)
FuzzedParameter("ndim", distribution={1: 0.3, 2: 0.4, 3: 0.3}, strict=True),
# Shapes for `x`.
# It is important to test all shapes, however
# regular sizes are especially important to the FFT and therefore
# warrant special attention. This is done by generating
# both a value drawn from all integers between the min and
# max allowed values, and another from only the regular numbers
# (both distributions are loguniform) and then randomly
# selecting between the two.
[
FuzzedParameter(
name=f"k_any_{i}",
minval=MIN_DIM_SIZE,
maxval=MAX_DIM_SIZE,
distribution="loguniform",
) for i in range(3)
],
[
FuzzedParameter(
name=f"k_regular_{i}",
distribution={size: 1. / len(REGULAR_SIZES) for size in REGULAR_SIZES}
) for i in range(3)
],
[
FuzzedParameter(
name=f"k{i}",
distribution={
ParameterAlias(f"k_regular_{i}"): probability_regular,
ParameterAlias(f"k_any_{i}"): 1 - probability_regular,
},
strict=True,
) for i in range(3)
],
# Steps for `x`. (Benchmarks strided memory access.)
[
FuzzedParameter(
name=f"step_{i}",
distribution={1: 0.8, 2: 0.06, 4: 0.06, 8: 0.04, 16: 0.04},
) for i in range(3)
],
],
tensors=[
FuzzedTensor(
name="x",
size=("k0", "k1", "k2"),
steps=("step_0", "step_1", "step_2"),
probability_contiguous=0.75,
min_elements=4 * 1024,
max_elements=32 * 1024 ** 2,
max_allocation_bytes=2 * 1024**3, # 2 GB
dim_parameter="ndim",
dtype=dtype,
cuda=cuda,
),
],
seed=seed,
)
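
To make the "regular number" comment concrete: a regular (5-smooth) size factors into powers of 2, 3, and 5 only. A small verification sketch, assuming the file above is importable as torch.utils.benchmark.op_fuzzers.spectral (the path used by the spectral benchmark earlier in this commit):

from torch.utils.benchmark.op_fuzzers.spectral import REGULAR_SIZES

def is_regular(n: int) -> bool:
    # Strip out factors of 2, 3, and 5; a regular number reduces to 1.
    for p in (2, 3, 5):
        while n % p == 0:
            n //= p
    return n == 1

assert all(is_regular(n) for n in REGULAR_SIZES)
print(REGULAR_SIZES[:8])  # the smallest regular sizes above MIN_DIM_SIZE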

View File

@@ -0,0 +1,82 @@
# mypy: allow-untyped-defs
import numpy as np
import torch
from torch.utils.benchmark import Fuzzer, FuzzedParameter, ParameterAlias, FuzzedTensor
_MIN_DIM_SIZE = 16
_MAX_DIM_SIZE = 16 * 1024 ** 2
_POW_TWO_SIZES = tuple(2 ** i for i in range(
int(np.log2(_MIN_DIM_SIZE)),
int(np.log2(_MAX_DIM_SIZE)) + 1,
))
class UnaryOpFuzzer(Fuzzer):
def __init__(self, seed, dtype=torch.float32, cuda=False):
super().__init__(
parameters=[
# Dimensionality of x. (e.g. 1D, 2D, or 3D.)
FuzzedParameter("dim", distribution={1: 0.3, 2: 0.4, 3: 0.3}, strict=True),
# Shapes for `x`.
# It is important to test all shapes, however
# powers of two are especially important and therefore
# warrant special attention. This is done by generating
# both a value drawn from all integers between the min and
# max allowed values, and another from only the powers of two
# (both distributions are loguniform) and then randomly
# selecting between the two.
[
FuzzedParameter(
name=f"k_any_{i}",
minval=_MIN_DIM_SIZE,
maxval=_MAX_DIM_SIZE,
distribution="loguniform",
) for i in range(3)
],
[
FuzzedParameter(
name=f"k_pow2_{i}",
distribution={size: 1. / len(_POW_TWO_SIZES) for size in _POW_TWO_SIZES}
) for i in range(3)
],
[
FuzzedParameter(
name=f"k{i}",
distribution={
ParameterAlias(f"k_any_{i}"): 0.8,
ParameterAlias(f"k_pow2_{i}"): 0.2,
},
strict=True,
) for i in range(3)
],
# Steps for `x`. (Benchmarks strided memory access.)
[
FuzzedParameter(
name=f"x_step_{i}",
distribution={1: 0.8, 2: 0.06, 4: 0.06, 8: 0.04, 16: 0.04},
) for i in range(3)
],
# Repeatable entropy for downstream applications.
FuzzedParameter(name="random_value", minval=0, maxval=2 ** 32 - 1, distribution="uniform"),
],
tensors=[
FuzzedTensor(
name="x",
size=("k0", "k1", "k2"),
steps=("x_step_0", "x_step_1", "x_step_2"),
probability_contiguous=0.75,
min_elements=4 * 1024,
max_elements=32 * 1024 ** 2,
max_allocation_bytes=2 * 1024**3, # 2 GB
dim_parameter="dim",
dtype=dtype,
cuda=cuda,
),
],
seed=seed,
)

View File

@@ -0,0 +1,40 @@
from typing import Any, Callable, Dict, Protocol, runtime_checkable
class TimerClass(Protocol):
"""This is the portion of the `timeit.Timer` API used by benchmark utils."""
def __init__(
self,
stmt: str,
setup: str,
timer: Callable[[], float],
globals: Dict[str, Any],
**kwargs: Any,
) -> None:
...
def timeit(self, number: int) -> float:
...
@runtime_checkable
class TimeitModuleType(Protocol):
"""Modules generated from `timeit_template.cpp`."""
def timeit(self, number: int) -> float:
...
class CallgrindModuleType(Protocol):
"""Replicates the valgrind endpoints in `torch._C`.
These bindings are used to collect Callgrind profiles on earlier versions
of PyTorch and will eventually be removed.
"""
__file__: str
__name__: str
def _valgrind_supported_platform(self) -> bool:
...
def _valgrind_toggle(self) -> None:
...
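
For context, `timeit.Timer` from the standard library is the reference object satisfying the TimerClass protocol above; a minimal sketch of that surface:

import time
import timeit

# timeit.Timer accepts stmt/setup/timer/globals and exposes `timeit(number)`,
# which is exactly the portion of the API described by TimerClass.
t = timeit.Timer(stmt="x * 2", setup="pass", timer=time.perf_counter, globals={"x": 3})
print(t.timeit(number=1000))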

View File

@@ -0,0 +1,355 @@
"""Base shared classes and utilities."""
import collections
import contextlib
import dataclasses
import os
import shutil
import tempfile
import textwrap
import time
from typing import cast, Any, DefaultDict, Dict, Iterable, Iterator, List, Optional, Tuple
import uuid
import torch
__all__ = ["TaskSpec", "Measurement", "select_unit", "unit_to_english", "trim_sigfig", "ordered_unique", "set_torch_threads"]
_MAX_SIGNIFICANT_FIGURES = 4
_MIN_CONFIDENCE_INTERVAL = 25e-9 # 25 ns
# Measurement will include a warning if the distribution is suspect. All
# runs are expected to have some variation; these parameters set the
# thresholds.
_IQR_WARN_THRESHOLD = 0.1
_IQR_GROSS_WARN_THRESHOLD = 0.25
@dataclasses.dataclass(init=True, repr=False, eq=True, frozen=True)
class TaskSpec:
"""Container for information used to define a Timer. (except globals)"""
stmt: str
setup: str
global_setup: str = ""
label: Optional[str] = None
sub_label: Optional[str] = None
description: Optional[str] = None
env: Optional[str] = None
num_threads: int = 1
@property
def title(self) -> str:
"""Best effort attempt at a string label for the measurement."""
if self.label is not None:
return self.label + (f": {self.sub_label}" if self.sub_label else "")
elif "\n" not in self.stmt:
return self.stmt + (f": {self.sub_label}" if self.sub_label else "")
return (
f"stmt:{f' ({self.sub_label})' if self.sub_label else ''}\n"
f"{textwrap.indent(self.stmt, ' ')}"
)
def setup_str(self) -> str:
return (
"" if (self.setup == "pass" or not self.setup)
else f"setup:\n{textwrap.indent(self.setup, ' ')}" if "\n" in self.setup
else f"setup: {self.setup}"
)
def summarize(self) -> str:
"""Build TaskSpec portion of repr string for other containers."""
sections = [
self.title,
self.description or "",
self.setup_str(),
]
return "\n".join([f"{i}\n" if "\n" in i else i for i in sections if i])
_TASKSPEC_FIELDS = tuple(i.name for i in dataclasses.fields(TaskSpec))
@dataclasses.dataclass(init=True, repr=False)
class Measurement:
"""The result of a Timer measurement.
This class stores one or more measurements of a given statement. It is
serializable and provides several convenience methods
(including a detailed __repr__) for downstream consumers.
"""
number_per_run: int
raw_times: List[float]
task_spec: TaskSpec
metadata: Optional[Dict[Any, Any]] = None # Reserved for user payloads.
def __post_init__(self) -> None:
self._sorted_times: Tuple[float, ...] = ()
self._warnings: Tuple[str, ...] = ()
self._median: float = -1.0
self._mean: float = -1.0
self._p25: float = -1.0
self._p75: float = -1.0
def __getattr__(self, name: str) -> Any:
# Forward TaskSpec fields for convenience.
if name in _TASKSPEC_FIELDS:
return getattr(self.task_spec, name)
return super().__getattribute__(name)
# =========================================================================
# == Convenience methods for statistics ===================================
# =========================================================================
#
# These methods use raw time divided by number_per_run; this is an
# extrapolation and hides the fact that different number_per_run will
# result in different amortization of overheads; however, if Timer has
# selected an appropriate number_per_run then this is a non-issue, and
# forcing users to handle that division would result in a poor experience.
@property
def times(self) -> List[float]:
return [t / self.number_per_run for t in self.raw_times]
@property
def median(self) -> float:
self._lazy_init()
return self._median
@property
def mean(self) -> float:
self._lazy_init()
return self._mean
@property
def iqr(self) -> float:
self._lazy_init()
return self._p75 - self._p25
@property
def significant_figures(self) -> int:
"""Approximate significant figure estimate.
This property is intended to give a convenient way to estimate the
precision of a measurement. It only uses the interquartile region to
estimate statistics to try to mitigate skew from the tails, and
uses a static z value of 1.645 since it is not expected to be used
for small values of `n`, so z can approximate `t`.
The significant figure estimation is used in conjunction with the
`trim_sigfig` method to provide a more human-interpretable data
summary. __repr__ does not use this method; it simply displays raw
values. Significant figure estimation is intended for `Compare`.
"""
self._lazy_init()
n_total = len(self._sorted_times)
lower_bound = int(n_total // 4)
upper_bound = int(torch.tensor(3 * n_total / 4).ceil())
interquartile_points: Tuple[float, ...] = self._sorted_times[lower_bound:upper_bound]
std = torch.tensor(interquartile_points).std(unbiased=False).item()
sqrt_n = torch.tensor(len(interquartile_points)).sqrt().item()
# Rough estimates. These are by no means statistically rigorous.
confidence_interval = max(1.645 * std / sqrt_n, _MIN_CONFIDENCE_INTERVAL)
relative_ci = torch.tensor(self._median / confidence_interval).log10().item()
num_significant_figures = int(torch.tensor(relative_ci).floor())
return min(max(num_significant_figures, 1), _MAX_SIGNIFICANT_FIGURES)
@property
def has_warnings(self) -> bool:
self._lazy_init()
return bool(self._warnings)
def _lazy_init(self) -> None:
if self.raw_times and not self._sorted_times:
self._sorted_times = tuple(sorted(self.times))
_sorted_times = torch.tensor(self._sorted_times, dtype=torch.float64)
self._median = _sorted_times.quantile(.5).item()
self._mean = _sorted_times.mean().item()
self._p25 = _sorted_times.quantile(.25).item()
self._p75 = _sorted_times.quantile(.75).item()
def add_warning(msg: str) -> None:
rel_iqr = self.iqr / self.median * 100
self._warnings += (
f" WARNING: Interquartile range is {rel_iqr:.1f}% "
f"of the median measurement.\n {msg}",
)
if not self.meets_confidence(_IQR_GROSS_WARN_THRESHOLD):
add_warning("This suggests significant environmental influence.")
elif not self.meets_confidence(_IQR_WARN_THRESHOLD):
add_warning("This could indicate system fluctuation.")
def meets_confidence(self, threshold: float = _IQR_WARN_THRESHOLD) -> bool:
return self.iqr / self.median < threshold
@property
def title(self) -> str:
return self.task_spec.title
@property
def env(self) -> str:
return (
"Unspecified env" if self.taskspec.env is None
else cast(str, self.taskspec.env)
)
@property
def as_row_name(self) -> str:
return self.sub_label or self.stmt or "[Unknown]"
def __repr__(self) -> str:
"""
Example repr:
<utils.common.Measurement object at 0x7f395b6ac110>
Broadcasting add (4x8)
Median: 5.73 us
IQR: 2.25 us (4.01 to 6.26)
372 measurements, 100 runs per measurement, 1 thread
WARNING: Interquartile range is 39.4% of the median measurement.
This suggests significant environmental influence.
"""
self._lazy_init()
skip_line, newline = "MEASUREMENT_REPR_SKIP_LINE", "\n"
n = len(self._sorted_times)
time_unit, time_scale = select_unit(self._median)
iqr_filter = '' if n >= 4 else skip_line
repr_str = f"""
{super().__repr__()}
{self.task_spec.summarize()}
{'Median: ' if n > 1 else ''}{self._median / time_scale:.2f} {time_unit}
{iqr_filter}IQR: {self.iqr / time_scale:.2f} {time_unit} ({self._p25 / time_scale:.2f} to {self._p75 / time_scale:.2f})
{n} measurement{'s' if n > 1 else ''}, {self.number_per_run} runs {'per measurement,' if n > 1 else ','} {self.num_threads} thread{'s' if self.num_threads > 1 else ''}
{newline.join(self._warnings)}""".strip() # noqa: B950
return "\n".join(l for l in repr_str.splitlines(keepends=False) if skip_line not in l)
@staticmethod
def merge(measurements: Iterable["Measurement"]) -> List["Measurement"]:
"""Convenience method for merging replicates.
Merge will extrapolate times to `number_per_run=1` and will not
transfer any metadata. (Since it might differ between replicates)
"""
grouped_measurements: DefaultDict[TaskSpec, List[Measurement]] = collections.defaultdict(list)
for m in measurements:
grouped_measurements[m.task_spec].append(m)
def merge_group(task_spec: TaskSpec, group: List["Measurement"]) -> "Measurement":
times: List[float] = []
for m in group:
# Different measurements could have different `number_per_run`,
# so we call `.times` which normalizes the results.
times.extend(m.times)
return Measurement(
number_per_run=1,
raw_times=times,
task_spec=task_spec,
metadata=None,
)
return [merge_group(t, g) for t, g in grouped_measurements.items()]
def select_unit(t: float) -> Tuple[str, float]:
"""Determine how to scale times for O(1) magnitude.
This utility is used to format numbers for human consumption.
"""
time_unit = {-3: "ns", -2: "us", -1: "ms"}.get(int(torch.tensor(t).log10().item() // 3), "s")
time_scale = {"ns": 1e-9, "us": 1e-6, "ms": 1e-3, "s": 1}[time_unit]
return time_unit, time_scale
def unit_to_english(u: str) -> str:
return {
"ns": "nanosecond",
"us": "microsecond",
"ms": "millisecond",
"s": "second",
}[u]
def trim_sigfig(x: float, n: int) -> float:
"""Trim `x` to `n` significant figures. (e.g. 3.14159, 2 -> 3.10000)"""
assert n == int(n)
magnitude = int(torch.tensor(x).abs().log10().ceil().item())
scale = 10 ** (magnitude - n)
return float(torch.tensor(x / scale).round() * scale)
def ordered_unique(elements: Iterable[Any]) -> List[Any]:
return list(collections.OrderedDict(dict.fromkeys(elements)).keys())
@contextlib.contextmanager
def set_torch_threads(n: int) -> Iterator[None]:
prior_num_threads = torch.get_num_threads()
try:
torch.set_num_threads(n)
yield
finally:
torch.set_num_threads(prior_num_threads)
def _make_temp_dir(prefix: Optional[str] = None, gc_dev_shm: bool = False) -> str:
"""Create a temporary directory. The caller is responsible for cleanup.
This function is conceptually similar to `tempfile.mkdtemp`, but with
the key additional feature that it will use shared memory if the
`BENCHMARK_USE_DEV_SHM` environment variable is set. This is an
implementation detail, but an important one for cases where many Callgrind
measurements are collected at once. (Such as when collecting
microbenchmarks.)
This is an internal utility, and is exported solely so that microbenchmarks
can reuse the util.
"""
use_dev_shm: bool = (os.getenv("BENCHMARK_USE_DEV_SHM") or "").lower() in ("1", "true")
if use_dev_shm:
root = "/dev/shm/pytorch_benchmark_utils"
assert os.name == "posix", f"tmpfs (/dev/shm) is POSIX only, current platform is {os.name}"
assert os.path.exists("/dev/shm"), "This system does not appear to support tmpfs (/dev/shm)."
os.makedirs(root, exist_ok=True)
# Because we're working in shared memory, it is more important than
# usual to clean up ALL intermediate files. However we don't want every
# worker to walk over all outstanding directories, so instead we only
# check when we are sure that it won't lead to contention.
if gc_dev_shm:
for i in os.listdir(root):
owner_file = os.path.join(root, i, "owner.pid")
if not os.path.exists(owner_file):
continue
with open(owner_file) as f:
owner_pid = int(f.read())
if owner_pid == os.getpid():
continue
try:
# https://stackoverflow.com/questions/568271/how-to-check-if-there-exists-a-process-with-a-given-pid-in-python
os.kill(owner_pid, 0)
except OSError:
print(f"Detected that {os.path.join(root, i)} was orphaned in shared memory. Cleaning up.")
shutil.rmtree(os.path.join(root, i))
else:
root = tempfile.gettempdir()
# We include the time so names sort by creation time, and add a UUID
# to ensure we don't collide.
name = f"{prefix or tempfile.gettempprefix()}__{int(time.time())}__{uuid.uuid4()}"
path = os.path.join(root, name)
os.makedirs(path, exist_ok=False)
if use_dev_shm:
with open(os.path.join(path, "owner.pid"), "w") as f:
f.write(str(os.getpid()))
return path
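
As a small illustration of the scaling helpers defined above (values chosen purely for illustration; the import path follows the package __init__ at the top of this commit):

from torch.utils.benchmark.utils.common import select_unit, trim_sigfig, unit_to_english

# A median of ~5.73 microseconds maps to the "us" unit with a 1e-6 scale.
time_unit, time_scale = select_unit(5.73e-6)
print(time_unit, time_scale, unit_to_english(time_unit))  # us 1e-06 microsecond

# Keep roughly two significant figures for display.
print(trim_sigfig(3.14159, 2))  # ~3.1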

View File

@@ -0,0 +1,348 @@
# mypy: allow-untyped-defs
"""Display class to aggregate and print the results of many measurements."""
import collections
import enum
import itertools as it
from typing import DefaultDict, List, Optional, Tuple
from torch.utils.benchmark.utils import common
from torch import tensor as _tensor
import operator
__all__ = ["Colorize", "Compare"]
BEST = "\033[92m"
GOOD = "\033[34m"
BAD = "\033[2m\033[91m"
VERY_BAD = "\033[31m"
BOLD = "\033[1m"
TERMINATE = "\033[0m"
class Colorize(enum.Enum):
NONE = "none"
COLUMNWISE = "columnwise"
ROWWISE = "rowwise"
# Classes to separate internal bookkeeping from what is rendered.
class _Column:
def __init__(
self,
grouped_results: List[Tuple[Optional[common.Measurement], ...]],
time_scale: float,
time_unit: str,
trim_significant_figures: bool,
highlight_warnings: bool,
):
self._grouped_results = grouped_results
self._flat_results = list(it.chain(*grouped_results))
self._time_scale = time_scale
self._time_unit = time_unit
self._trim_significant_figures = trim_significant_figures
self._highlight_warnings = (
highlight_warnings
and any(r.has_warnings for r in self._flat_results if r)
)
leading_digits = [
int(_tensor(r.median / self._time_scale).log10().ceil()) if r else None
for r in self._flat_results
]
unit_digits = max(d for d in leading_digits if d is not None)
decimal_digits = min(
max(m.significant_figures - digits, 0)
for digits, m in zip(leading_digits, self._flat_results)
if (m is not None) and (digits is not None)
) if self._trim_significant_figures else 1
length = unit_digits + decimal_digits + (1 if decimal_digits else 0)
self._template = f"{{:>{length}.{decimal_digits}f}}{{:>{7 if self._highlight_warnings else 0}}}"
def get_results_for(self, group):
return self._grouped_results[group]
def num_to_str(self, value: Optional[float], estimated_sigfigs: int, spread: Optional[float]):
if value is None:
return " " * len(self.num_to_str(1, estimated_sigfigs, None))
if self._trim_significant_figures:
value = common.trim_sigfig(value, estimated_sigfigs)
return self._template.format(
value,
f" (! {spread * 100:.0f}%)" if self._highlight_warnings and spread is not None else "")
def optional_min(seq):
l = list(seq)
return None if len(l) == 0 else min(l)
class _Row:
def __init__(self, results, row_group, render_env, env_str_len,
row_name_str_len, time_scale, colorize, num_threads=None):
super().__init__()
self._results = results
self._row_group = row_group
self._render_env = render_env
self._env_str_len = env_str_len
self._row_name_str_len = row_name_str_len
self._time_scale = time_scale
self._colorize = colorize
self._columns: Tuple[_Column, ...] = ()
self._num_threads = num_threads
def register_columns(self, columns: Tuple[_Column, ...]):
self._columns = columns
def as_column_strings(self):
concrete_results = [r for r in self._results if r is not None]
env = f"({concrete_results[0].env})" if self._render_env else ""
env = env.ljust(self._env_str_len + 4)
output = [" " + env + concrete_results[0].as_row_name]
for m, col in zip(self._results, self._columns or ()):
if m is None:
output.append(col.num_to_str(None, 1, None))
else:
output.append(col.num_to_str(
m.median / self._time_scale,
m.significant_figures,
m.iqr / m.median if m.has_warnings else None
))
return output
@staticmethod
def color_segment(segment, value, best_value):
if value <= best_value * 1.01 or value <= best_value + 100e-9:
return BEST + BOLD + segment + TERMINATE * 2
if value <= best_value * 1.1:
return GOOD + BOLD + segment + TERMINATE * 2
if value >= best_value * 5:
return VERY_BAD + BOLD + segment + TERMINATE * 2
if value >= best_value * 2:
return BAD + segment + TERMINATE * 2
return segment
def row_separator(self, overall_width):
return (
[f"{self._num_threads} threads: ".ljust(overall_width, "-")]
if self._num_threads is not None else []
)
def finalize_column_strings(self, column_strings, col_widths):
best_values = [-1 for _ in column_strings]
if self._colorize == Colorize.ROWWISE:
row_min = min(r.median for r in self._results if r is not None)
best_values = [row_min for _ in column_strings]
elif self._colorize == Colorize.COLUMNWISE:
best_values = [
optional_min(r.median for r in column.get_results_for(self._row_group) if r is not None)
for column in (self._columns or ())
]
row_contents = [column_strings[0].ljust(col_widths[0])]
for col_str, width, result, best_value in zip(column_strings[1:], col_widths[1:], self._results, best_values):
col_str = col_str.center(width)
if self._colorize != Colorize.NONE and result is not None and best_value is not None:
col_str = self.color_segment(col_str, result.median, best_value)
row_contents.append(col_str)
return row_contents
class Table:
def __init__(
self,
results: List[common.Measurement],
colorize: Colorize,
trim_significant_figures: bool,
highlight_warnings: bool
):
assert len({r.label for r in results}) == 1
self.results = results
self._colorize = colorize
self._trim_significant_figures = trim_significant_figures
self._highlight_warnings = highlight_warnings
self.label = results[0].label
self.time_unit, self.time_scale = common.select_unit(
min(r.median for r in results)
)
self.row_keys = common.ordered_unique([self.row_fn(i) for i in results])
self.row_keys.sort(key=operator.itemgetter(slice(2))) # preserve stmt order
self.column_keys = common.ordered_unique([self.col_fn(i) for i in results])
self.rows, self.columns = self.populate_rows_and_columns()
@staticmethod
def row_fn(m: common.Measurement) -> Tuple[int, Optional[str], str]:
return m.num_threads, m.env, m.as_row_name
@staticmethod
def col_fn(m: common.Measurement) -> Optional[str]:
return m.description
def populate_rows_and_columns(self) -> Tuple[Tuple[_Row, ...], Tuple[_Column, ...]]:
rows: List[_Row] = []
columns: List[_Column] = []
ordered_results: List[List[Optional[common.Measurement]]] = [
[None for _ in self.column_keys]
for _ in self.row_keys
]
row_position = {key: i for i, key in enumerate(self.row_keys)}
col_position = {key: i for i, key in enumerate(self.column_keys)}
for r in self.results:
i = row_position[self.row_fn(r)]
j = col_position[self.col_fn(r)]
ordered_results[i][j] = r
unique_envs = {r.env for r in self.results}
render_env = len(unique_envs) > 1
env_str_len = max(len(i) for i in unique_envs) if render_env else 0
row_name_str_len = max(len(r.as_row_name) for r in self.results)
prior_num_threads = -1
prior_env = ""
row_group = -1
rows_by_group: List[List[List[Optional[common.Measurement]]]] = []
for (num_threads, env, _), row in zip(self.row_keys, ordered_results):
thread_transition = (num_threads != prior_num_threads)
if thread_transition:
prior_num_threads = num_threads
prior_env = ""
row_group += 1
rows_by_group.append([])
rows.append(
_Row(
results=row,
row_group=row_group,
render_env=(render_env and env != prior_env),
env_str_len=env_str_len,
row_name_str_len=row_name_str_len,
time_scale=self.time_scale,
colorize=self._colorize,
num_threads=num_threads if thread_transition else None,
)
)
rows_by_group[-1].append(row)
prior_env = env
for i in range(len(self.column_keys)):
grouped_results = [tuple(row[i] for row in g) for g in rows_by_group]
column = _Column(
grouped_results=grouped_results,
time_scale=self.time_scale,
time_unit=self.time_unit,
trim_significant_figures=self._trim_significant_figures,
highlight_warnings=self._highlight_warnings,)
columns.append(column)
rows_tuple, columns_tuple = tuple(rows), tuple(columns)
for ri in rows_tuple:
ri.register_columns(columns_tuple)
return rows_tuple, columns_tuple
def render(self) -> str:
string_rows = [[""] + self.column_keys]
for r in self.rows:
string_rows.append(r.as_column_strings())
num_cols = max(len(i) for i in string_rows)
for sr in string_rows:
sr.extend(["" for _ in range(num_cols - len(sr))])
col_widths = [max(len(j) for j in i) for i in zip(*string_rows)]
finalized_columns = [" | ".join(i.center(w) for i, w in zip(string_rows[0], col_widths))]
overall_width = len(finalized_columns[0])
for string_row, row in zip(string_rows[1:], self.rows):
finalized_columns.extend(row.row_separator(overall_width))
finalized_columns.append(" | ".join(row.finalize_column_strings(string_row, col_widths)))
newline = "\n"
has_warnings = self._highlight_warnings and any(ri.has_warnings for ri in self.results)
return f"""
[{(' ' + (self.label or '') + ' ').center(overall_width - 2, '-')}]
{newline.join(finalized_columns)}
Times are in {common.unit_to_english(self.time_unit)}s ({self.time_unit}).
{'(! XX%) Measurement has high variance, where XX is the IQR / median * 100.' + newline if has_warnings else ""}"""[1:]
class Compare:
"""Helper class for displaying the results of many measurements in a
formatted table.
The table format is based on the information fields provided in
:class:`torch.utils.benchmark.Timer` (`description`, `label`, `sub_label`,
`num_threads`, etc).
The table can be directly printed using :meth:`print` or cast as a `str`.
For a full tutorial on how to use this class, see:
https://pytorch.org/tutorials/recipes/recipes/benchmark.html
Args:
results: List of Measurements to display.
"""
def __init__(self, results: List[common.Measurement]):
self._results: List[common.Measurement] = []
self.extend_results(results)
self._trim_significant_figures = False
self._colorize = Colorize.NONE
self._highlight_warnings = False
def __str__(self):
return "\n".join(self._render())
def extend_results(self, results):
"""Append results to already stored ones.
All added results must be instances of ``Measurement``.
"""
for r in results:
if not isinstance(r, common.Measurement):
raise ValueError(
"Expected an instance of `Measurement`, " f"got {type(r)} instead."
)
self._results.extend(results)
def trim_significant_figures(self):
"""Enables trimming of significant figures when building the formatted table."""
self._trim_significant_figures = True
def colorize(self, rowwise=False):
"""Colorize formatted table.
Colorize columnwise by default.
"""
self._colorize = Colorize.ROWWISE if rowwise else Colorize.COLUMNWISE
def highlight_warnings(self):
"""Enables warning highlighting when building formatted table."""
self._highlight_warnings = True
def print(self):
"""Print formatted table"""
print(str(self))
def _render(self):
results = common.Measurement.merge(self._results)
grouped_results = self._group_by_label(results)
output = []
for group in grouped_results.values():
output.append(self._layout(group))
return output
def _group_by_label(self, results: List[common.Measurement]):
grouped_results: DefaultDict[str, List[common.Measurement]] = collections.defaultdict(list)
for r in results:
grouped_results[r.label].append(r)
return grouped_results
def _layout(self, results: List[common.Measurement]):
table = Table(
results,
self._colorize,
self._trim_significant_figures,
self._highlight_warnings
)
return table.render()
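
A minimal end-to-end sketch of the workflow the Compare docstring describes (Timer produces Measurements, Compare lays them out in a table); it mirrors the examples earlier in this commit:

import torch
import torch.utils.benchmark as benchmark_utils

results = []
for num_threads in (1, 4):
    for size in (64, 256):
        timer = benchmark_utils.Timer(
            stmt="torch.mm(x, y)",
            globals={"x": torch.ones((size, size)), "y": torch.ones((size, size))},
            label="mm",
            sub_label=f"{size} x {size}",
            description=f"{num_threads} thread(s)",
            num_threads=num_threads,
        )
        results.append(timer.blocked_autorange(min_run_time=0.05))

compare = benchmark_utils.Compare(results)
compare.trim_significant_figures()
compare.colorize()
compare.print()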

View File

@@ -0,0 +1,191 @@
# mypy: allow-untyped-defs
from typing import Any, Callable, cast, List, Optional, Union
import torch
import torch._dynamo
from torch._dynamo.testing import CompileCounterWithBackend
from torch.utils.benchmark import Timer
__all__ = ["bench_all", "benchmark_compile"]
_warned_tensor_cores = False
_default_float_32_precision = torch.get_float32_matmul_precision()
try:
from tabulate import tabulate
HAS_TABULATE = True
except ModuleNotFoundError:
HAS_TABULATE = False
tabulate = None # type: ignore[assignment]
print("tabulate is not installed, please pip install tabulate to use this utility")
if HAS_TABULATE:
def _enable_tensor_cores():
global _warned_tensor_cores
if torch.cuda.is_available():
if torch.backends.cuda.matmul.allow_tf32 is False and torch.cuda.get_device_capability() >= (8, 0):
torch.set_float32_matmul_precision("high")
if not _warned_tensor_cores:
print("Your GPU supports tensor cores")
print("we will enable it automatically by setting `torch.set_float32_matmul_precision('high')`")
_warned_tensor_cores = True
def _disable_tensor_cores():
torch.set_float32_matmul_precision(_default_float_32_precision)
def bench_loop(
model: Union[torch.nn.Module, Callable],
sample_input: Union[torch.Tensor, Any],
num_iters: int = 5,
optimizer: Optional[torch.optim.Optimizer] = None,
loss_fn: Optional[Callable] = None,
):
# Define the statement and setup for the benchmark
if optimizer and loss_fn:
# Training mode
stmt = """
output = model(sample_input)
loss = loss_fn(output) if loss_fn else output.sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()
"""
else:
# Inference mode
stmt = "model(sample_input)"
# Create the Timer object
timer = Timer(
stmt=stmt,
globals={"model": model, "sample_input": sample_input, "optimizer": optimizer, "loss_fn": loss_fn},
)
result = timer.timeit(number=num_iters)
# Get the average time per iteration in milliseconds
avg_time = result.mean * 1000
return round(avg_time, 2)
def benchmark_compile(
model: Union[torch.nn.Module, Callable],
sample_input: Union[torch.Tensor, Any],
num_iters: int = 5,
backend: Optional[str] = None,
mode: Optional[str] = "default",
optimizer: Optional[torch.optim.Optimizer] = None,
loss_fn : Union[torch.nn.Module, Callable, None] = None,
):
"""
Use this utility to benchmark torch.compile
"""
if backend:
try:
torch._dynamo.reset()
compile_counter_with_backend = CompileCounterWithBackend(backend)
opt_model = torch.compile(model, backend=compile_counter_with_backend, mode=mode)
# Compilation only happens after the first inference
compilation_time = bench_loop(opt_model, sample_input, 1, optimizer, loss_fn)
running_time = bench_loop(opt_model, sample_input, num_iters, optimizer, loss_fn)
if compile_counter_with_backend.frame_count == 0:
raise RuntimeError("No compilation occurred during benchmarking.")
if compile_counter_with_backend.frame_count > 1:
raise RuntimeError("Recompilation occurred during benchmarking.")
except Exception as e:
print(e)
print(f"Failed to compile {backend} with mode {mode}")
return None, None
else:
opt_model = model
compilation_time = None
running_time = bench_loop(opt_model, sample_input, num_iters, optimizer, loss_fn)
compilation_time = round(compilation_time, 2) if compilation_time else None
running_time = round(running_time, 2) if running_time else None
return compilation_time, running_time
def bench_all(
model : Union[torch.nn.Module, Callable],
sample_input: Union[torch.Tensor, Any],
num_iters : int = 5,
optimizer: Optional[torch.optim.Optimizer] = None,
loss_fn : Union[torch.nn.Module, Callable, None] = None,
):
"""
This is a simple utility that can be used to benchmark torch.compile
        In particular it ensures that your GPU is set up to use tensor cores if it supports them
        It also tries out all the main backends and prints a table of results so you can easily compare them all
        Many of the backends have their own optional dependencies so please pip install them separately
You will get one table for inference and another for training
If you'd like to leverage this utility for training make sure to pass in a torch.optim.Optimizer
The important warnings are
Your GPU supports tensor cores
we will enable it automatically by setting `torch.set_float32_matmul_precision('high')`
If a compilation fails for any reason including the dependency not being included
then we will print Failed to compile {backend} with mode {mode}
"""
field_names = ["Train/Inference", "Backend", "Mode", "Compilation Time", "Average Running Time"]
table = []
eager_time = None
torch._dynamo.reset()
        _, eager_time = benchmark_compile(model, sample_input, num_iters, None, None, optimizer, loss_fn)
table.append(
[("Training" if optimizer else "Inference"), "Eager", "-", "-", f"{eager_time} ms"]
)
for backend in torch._dynamo.list_backends():
if backend == "inductor":
mode_options = cast(List[Optional[str]], list(torch._inductor.list_mode_options().keys())) + [None]
for mode in mode_options:
if mode == "default":
continue
torch._dynamo.reset()
try:
if torch.cuda.is_available():
_enable_tensor_cores()
compilation_time, running_time = benchmark_compile(
model, sample_input, num_iters, backend, mode, optimizer, loss_fn)
finally:
if torch.cuda.is_available():
_disable_tensor_cores()
table.append([
("Training" if optimizer else "Inference"),
backend if backend else "-",
mode if mode is not None else "-",
f"{compilation_time} ms " if compilation_time else "-",
f"{running_time} ms " if running_time else "-",
])
else:
torch._dynamo.reset()
compilation_time, running_time = benchmark_compile(
model, sample_input, num_iters, backend, None, optimizer, loss_fn)
if running_time is not None:
table.append([
("Training" if optimizer else "Inference"),
backend, "-",
f"{compilation_time} ms " or "-",
f"{running_time} ms ",
])
return tabulate(table, headers=field_names, tablefmt="github")
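# A minimal usage sketch (added for illustration; not part of the original file). It assumes
# `tabulate` is installed; the model, input shape, and iteration count are arbitrary examples.
if __name__ == "__main__" and HAS_TABULATE:
    example_model = torch.nn.Linear(64, 64)
    example_input = torch.randn(16, 64)
    # Prints an inference table; pass an optimizer (and optionally a loss_fn) to benchmark training.
    print(bench_all(example_model, example_input, num_iters=5))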

View File

@ -0,0 +1,172 @@
"""JIT C++ strings into executables."""
import atexit
import os
import re
import shutil
import textwrap
import threading
from typing import Any, List, Optional
import torch
from torch.utils.benchmark.utils._stubs import CallgrindModuleType, TimeitModuleType
from torch.utils.benchmark.utils.common import _make_temp_dir
from torch.utils import cpp_extension
LOCK = threading.Lock()
SOURCE_ROOT = os.path.split(os.path.abspath(__file__))[0]
# We calculate uuid once at import time so that separate processes will have
# separate build roots, but threads will share the same build root.
# `cpp_extension` uses build root as part of the cache key, so per-invocation
# uuid's (e.g. different build root per _compile_template call) would lead to
# a 0% cache hit rate and spurious recompilation. Consider the following:
# ```
# setup = "auto x = torch::ones({1024, 1024});"
# stmt = "torch::mm(x, x);"
# for num_threads in [1, 2, 4, 8]:
# print(Timer(stmt, setup, num_threads=num_threads, language="c++").blocked_autorange())
# ```
# `setup` and `stmt` do not change, so we can reuse the executable from the
# first pass through the loop.
_BUILD_ROOT: Optional[str] = None
def _get_build_root() -> str:
global _BUILD_ROOT
if _BUILD_ROOT is None:
_BUILD_ROOT = _make_temp_dir(prefix="benchmark_utils_jit_build")
atexit.register(shutil.rmtree, _BUILD_ROOT)
return _BUILD_ROOT
# BACK_TESTING_NOTE:
# There are two workflows where this code could be used. One is the obvious
# case where someone simply builds or installs PyTorch and uses Timer.
# The other is that the entire `torch/utils/benchmark` folder from a CURRENT
# PyTorch checkout is copy-pasted into a much OLDER version of the PyTorch
# source code. This is what we refer to here as "back testing". The rationale
# is that we might want to use current tooling to study some aspect of an
# earlier version of PyTorch. (e.g. a regression.)
#
# The problem is that Timer relies on several aspects of core PyTorch, namely
# some binding functions for Valgrind symbols in `torch._C` and the
# `torch.__config__._cxx_flags()` method. If we were to naively copy code
# around this wouldn't work as the symbols of interest aren't present in
# earlier versions of PyTorch. In order to work around this, we must add back
# testing shims. These shims will never activate during normal use, but will
# allow Timer to function outside of the "correct" version of PyTorch by
# emulating functionality that was added later.
#
# These shims are temporary, and as Timer becomes more integrated with
# PyTorch the cost and complexity of such shims will increase. Once back
# testing is no longer required (which is to say we have done enough historic
# analysis and the shims no longer justify their maintenance and code
# complexity costs) back testing paths will be removed.
CXX_FLAGS: Optional[List[str]]
if hasattr(torch.__config__, "_cxx_flags"):
try:
CXX_FLAGS = torch.__config__._cxx_flags().strip().split()
if CXX_FLAGS is not None and "-g" not in CXX_FLAGS:
CXX_FLAGS.append("-g")
# remove "-W" flags to allow build benchmarks
# with a relaxed constraint of compiler versions
if CXX_FLAGS is not None:
CXX_FLAGS = list(filter(lambda x: not x.startswith("-W"), CXX_FLAGS))
except RuntimeError:
# We are in FBCode.
CXX_FLAGS = None
else:
# FIXME: Remove when back testing is no longer required.
CXX_FLAGS = ["-O2", "-fPIC", "-g"]
EXTRA_INCLUDE_PATHS: List[str] = [os.path.join(SOURCE_ROOT, "valgrind_wrapper")]
CONDA_PREFIX = os.getenv("CONDA_PREFIX")
if CONDA_PREFIX is not None:
# Load will automatically search /usr/include, but not conda include.
EXTRA_INCLUDE_PATHS.append(os.path.join(CONDA_PREFIX, "include"))
COMPAT_CALLGRIND_BINDINGS: Optional[CallgrindModuleType] = None
def get_compat_bindings() -> CallgrindModuleType:
with LOCK:
global COMPAT_CALLGRIND_BINDINGS
if COMPAT_CALLGRIND_BINDINGS is None:
COMPAT_CALLGRIND_BINDINGS = cpp_extension.load(
name="callgrind_bindings",
sources=[os.path.join(
SOURCE_ROOT,
"valgrind_wrapper",
"compat_bindings.cpp"
)],
extra_cflags=CXX_FLAGS,
extra_include_paths=EXTRA_INCLUDE_PATHS,
)
return COMPAT_CALLGRIND_BINDINGS
def _compile_template(
*,
stmt: str,
setup: str,
global_setup: str,
src: str,
is_standalone: bool
) -> Any:
for before, after, indentation in (
("// GLOBAL_SETUP_TEMPLATE_LOCATION", global_setup, 0),
("// SETUP_TEMPLATE_LOCATION", setup, 4),
("// STMT_TEMPLATE_LOCATION", stmt, 8)
):
# C++ doesn't care about indentation so this code isn't load
# bearing the way it is with Python, but this makes the source
# look nicer if a human has to look at it.
src = re.sub(
before,
textwrap.indent(after, " " * indentation)[indentation:],
src
)
# We want to isolate different Timers. However `cpp_extension` will
# cache builds which will significantly reduce the cost of repeated
# invocations.
with LOCK:
name = f"timer_cpp_{abs(hash(src))}"
build_dir = os.path.join(_get_build_root(), name)
os.makedirs(build_dir, exist_ok=True)
src_path = os.path.join(build_dir, "timer_src.cpp")
with open(src_path, "w") as f:
f.write(src)
# `cpp_extension` has its own locking scheme, so we don't need our lock.
return cpp_extension.load(
name=name,
sources=[src_path],
build_directory=build_dir,
extra_cflags=CXX_FLAGS,
extra_include_paths=EXTRA_INCLUDE_PATHS,
is_python_module=not is_standalone,
is_standalone=is_standalone,
)
def compile_timeit_template(*, stmt: str, setup: str, global_setup: str) -> TimeitModuleType:
template_path: str = os.path.join(SOURCE_ROOT, "timeit_template.cpp")
with open(template_path) as f:
src: str = f.read()
module = _compile_template(stmt=stmt, setup=setup, global_setup=global_setup, src=src, is_standalone=False)
assert isinstance(module, TimeitModuleType)
return module
def compile_callgrind_template(*, stmt: str, setup: str, global_setup: str) -> str:
template_path: str = os.path.join(SOURCE_ROOT, "valgrind_wrapper", "timer_callgrind_template.cpp")
with open(template_path) as f:
src: str = f.read()
target = _compile_template(stmt=stmt, setup=setup, global_setup=global_setup, src=src, is_standalone=True)
assert isinstance(target, str)
return target
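# A minimal usage sketch (added for illustration; not part of the original file). It assumes a
# working C++ toolchain and the PyTorch headers are available, and it really builds an extension,
# so the first call may take a while; the stmt/setup strings below are arbitrary examples.
if __name__ == "__main__":
    example_module = compile_timeit_template(
        stmt="torch::mm(x, x);",
        setup="auto x = torch::ones({128, 128});",
        global_setup="",
    )
    print(f"100 iterations of 128x128 mm took {example_module.timeit(100):.6f} s")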

View File

@ -0,0 +1,462 @@
# mypy: allow-untyped-defs
import functools
import itertools as it
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import torch
__all__ = [
"Fuzzer",
"FuzzedParameter", "ParameterAlias",
"FuzzedTensor",
]
_DISTRIBUTIONS = (
"loguniform",
"uniform",
)
class FuzzedParameter:
"""Specification for a parameter to be generated during fuzzing."""
def __init__(
self,
name: str,
minval: Optional[Union[int, float]] = None,
maxval: Optional[Union[int, float]] = None,
distribution: Optional[Union[str, Dict[Any, float]]] = None,
strict: bool = False,
):
"""
Args:
name:
A string name with which to identify the parameter.
FuzzedTensors can reference this string in their
specifications.
minval:
The lower bound for the generated value. See the description
of `distribution` for type behavior.
maxval:
The upper bound for the generated value. Type behavior is
identical to `minval`.
distribution:
Specifies the distribution from which this parameter should
be drawn. There are three possibilities:
- "loguniform"
Samples between `minval` and `maxval` (inclusive) such
that the probabilities are uniform in log space. As a
concrete example, if minval=1 and maxval=100, a sample
is as likely to fall in [1, 10) as it is [10, 100].
- "uniform"
Samples are chosen with uniform probability between
`minval` and `maxval` (inclusive). If either `minval`
or `maxval` is a float then the distribution is the
continuous uniform distribution; otherwise samples
are constrained to the integers.
- dict:
If a dict is passed, the keys are taken to be choices
for the variables and the values are interpreted as
probabilities. (And must sum to one.)
If a dict is passed, `minval` and `maxval` must not be set.
Otherwise, they must be set.
strict:
If a parameter is strict, it will not be included in the
iterative resampling process which Fuzzer uses to find a
valid parameter configuration. This allows an author to
prevent skew from resampling for a given parameter (for
instance, a low size limit could inadvertently bias towards
Tensors with fewer dimensions) at the cost of more iterations
when generating parameters.
"""
self._name = name
self._minval = minval
self._maxval = maxval
self._distribution = self._check_distribution(distribution)
self.strict = strict
@property
def name(self):
return self._name
def sample(self, state):
if self._distribution == "loguniform":
return self._loguniform(state)
if self._distribution == "uniform":
return self._uniform(state)
if isinstance(self._distribution, dict):
return self._custom_distribution(state)
def _check_distribution(self, distribution):
if not isinstance(distribution, dict):
assert distribution in _DISTRIBUTIONS
else:
assert not any(i < 0 for i in distribution.values()), "Probabilities cannot be negative"
assert abs(sum(distribution.values()) - 1) <= 1e-5, "Distribution is not normalized"
assert self._minval is None
assert self._maxval is None
return distribution
def _loguniform(self, state):
import numpy as np
output = int(2 ** state.uniform(
low=np.log2(self._minval) if self._minval is not None else None,
high=np.log2(self._maxval) if self._maxval is not None else None,
))
if self._minval is not None and output < self._minval:
return self._minval
if self._maxval is not None and output > self._maxval:
return self._maxval
return output
def _uniform(self, state):
if isinstance(self._minval, int) and isinstance(self._maxval, int):
return int(state.randint(low=self._minval, high=self._maxval + 1))
return state.uniform(low=self._minval, high=self._maxval)
def _custom_distribution(self, state):
import numpy as np
# If we directly pass the keys to `choice`, numpy will convert
# them to numpy dtypes.
index = state.choice(
np.arange(len(self._distribution)),
p=tuple(self._distribution.values()))
return list(self._distribution.keys())[index]
class ParameterAlias:
"""Indicates that a parameter should alias the value of another parameter.
When used in conjunction with a custom distribution, this allows fuzzed
tensors to represent a broader range of behaviors. For example, the
following sometimes produces Tensors which broadcast:
Fuzzer(
parameters=[
FuzzedParameter("x_len", 4, 1024, distribution="uniform"),
# `y` will either be size one, or match the size of `x`.
FuzzedParameter("y_len", distribution={
0.5: 1,
0.5: ParameterAlias("x_len")
}),
],
tensors=[
FuzzedTensor("x", size=("x_len",)),
FuzzedTensor("y", size=("y_len",)),
],
)
    Chains of aliases are allowed, but may not contain cycles.
"""
def __init__(self, alias_to):
self.alias_to = alias_to
def __repr__(self):
return f"ParameterAlias[alias_to: {self.alias_to}]"
def dtype_size(dtype):
if dtype == torch.bool:
return 1
if dtype.is_floating_point or dtype.is_complex:
return int(torch.finfo(dtype).bits / 8)
return int(torch.iinfo(dtype).bits / 8)
def prod(values, base=1):
"""np.prod can overflow, so for sizes the product should be done in Python.
Even though np.prod type promotes to int64, it can still overflow in which
case the negative value will pass the size check and OOM when attempting to
actually allocate the Tensor.
"""
return functools.reduce(lambda x, y: int(x) * int(y), values, base)
class FuzzedTensor:
def __init__(
self,
name: str,
size: Tuple[Union[str, int], ...],
steps: Optional[Tuple[Union[str, int], ...]] = None,
probability_contiguous: float = 0.5,
min_elements: Optional[int] = None,
max_elements: Optional[int] = None,
max_allocation_bytes: Optional[int] = None,
dim_parameter: Optional[str] = None,
roll_parameter: Optional[str] = None,
dtype=torch.float32,
cuda=False,
tensor_constructor: Optional[Callable] = None
):
"""
Args:
name:
A string identifier for the generated Tensor.
size:
A tuple of integers or strings specifying the size of the generated
                Tensor. String values will be replaced with a concrete int during the
generation process, while ints are simply passed as literals.
steps:
An optional tuple with the same length as `size`. This indicates
that a larger Tensor should be allocated, and then sliced to
produce the generated Tensor. For instance, if size is (4, 8)
and steps is (1, 4), then a tensor `t` of size (4, 32) will be
created and then `t[:, ::4]` will be used. (Allowing one to test
Tensors with strided memory.)
probability_contiguous:
A number between zero and one representing the chance that the
generated Tensor has a contiguous memory layout. This is achieved by
randomly permuting the shape of a Tensor, calling `.contiguous()`,
and then permuting back. This is applied before `steps`, which can
also cause a Tensor to be non-contiguous.
min_elements:
                The minimum number of elements that this Tensor must have for a
set of parameters to be valid. (Otherwise they are resampled.)
max_elements:
Like `min_elements`, but setting an upper bound.
max_allocation_bytes:
Like `max_elements`, but for the size of Tensor that must be
allocated prior to slicing for `steps` (if applicable). For
example, a FloatTensor with size (1024, 1024) and steps (4, 4)
would have 1M elements, but would require a 64 MB allocation.
dim_parameter:
The length of `size` and `steps` will be truncated to this value.
This allows Tensors of varying dimensions to be generated by the
Fuzzer.
dtype:
The PyTorch dtype of the generated Tensor.
cuda:
Whether to place the Tensor on a GPU.
tensor_constructor:
Callable which will be used instead of the default Tensor
construction method. This allows the author to enforce properties
of the Tensor (e.g. it can only have certain values). The dtype and
concrete shape of the Tensor to be created will be passed, and
concrete values of all parameters will be passed as kwargs. Note
that transformations to the result (permuting, slicing) will be
performed by the Fuzzer; the tensor_constructor is only responsible
for creating an appropriately sized Tensor.
"""
self._name = name
self._size = size
self._steps = steps
self._probability_contiguous = probability_contiguous
self._min_elements = min_elements
self._max_elements = max_elements
self._max_allocation_bytes = max_allocation_bytes
self._dim_parameter = dim_parameter
self._dtype = dtype
self._cuda = cuda
self._tensor_constructor = tensor_constructor
@property
def name(self):
return self._name
@staticmethod
def default_tensor_constructor(size, dtype, **kwargs):
if dtype.is_floating_point or dtype.is_complex:
return torch.rand(size=size, dtype=dtype, device="cpu")
else:
return torch.randint(1, 127, size=size, dtype=dtype, device="cpu")
def _make_tensor(self, params, state):
import numpy as np
size, steps, allocation_size = self._get_size_and_steps(params)
constructor = (
self._tensor_constructor or
self.default_tensor_constructor
)
raw_tensor = constructor(size=allocation_size, dtype=self._dtype, **params)
if self._cuda:
raw_tensor = raw_tensor.cuda()
# Randomly permute the Tensor and call `.contiguous()` to force re-ordering
# of the memory, and then permute it back to the original shape.
dim = len(size)
order = np.arange(dim)
if state.rand() > self._probability_contiguous:
while dim > 1 and np.all(order == np.arange(dim)):
order = state.permutation(raw_tensor.dim())
raw_tensor = raw_tensor.permute(tuple(order)).contiguous()
raw_tensor = raw_tensor.permute(tuple(np.argsort(order)))
slices = [slice(0, size * step, step) for size, step in zip(size, steps)]
tensor = raw_tensor[slices]
properties = {
"numel": int(tensor.numel()),
"order": order,
"steps": steps,
"is_contiguous": tensor.is_contiguous(),
"dtype": str(self._dtype),
}
return tensor, properties
def _get_size_and_steps(self, params):
dim = (
params[self._dim_parameter]
if self._dim_parameter is not None
else len(self._size)
)
def resolve(values, dim):
"""Resolve values into concrete integers."""
values = tuple(params.get(i, i) for i in values)
if len(values) > dim:
values = values[:dim]
if len(values) < dim:
values = values + tuple(1 for _ in range(dim - len(values)))
return values
size = resolve(self._size, dim)
steps = resolve(self._steps or (), dim)
allocation_size = tuple(size_i * step_i for size_i, step_i in zip(size, steps))
return size, steps, allocation_size
def satisfies_constraints(self, params):
size, _, allocation_size = self._get_size_and_steps(params)
# Product is computed in Python to avoid integer overflow.
num_elements = prod(size)
assert num_elements >= 0
allocation_bytes = prod(allocation_size, base=dtype_size(self._dtype))
def nullable_greater(left, right):
if left is None or right is None:
return False
return left > right
return not any((
nullable_greater(num_elements, self._max_elements),
nullable_greater(self._min_elements, num_elements),
nullable_greater(allocation_bytes, self._max_allocation_bytes),
))
class Fuzzer:
def __init__(
self,
parameters: List[Union[FuzzedParameter, List[FuzzedParameter]]],
tensors: List[Union[FuzzedTensor, List[FuzzedTensor]]],
constraints: Optional[List[Callable]] = None,
seed: Optional[int] = None
):
"""
Args:
parameters:
List of FuzzedParameters which provide specifications
for generated parameters. Iterable elements will be
unpacked, though arbitrary nested structures will not.
tensors:
List of FuzzedTensors which define the Tensors which
will be created each step based on the parameters for
that step. Iterable elements will be unpacked, though
arbitrary nested structures will not.
constraints:
List of callables. They will be called with params
as kwargs, and if any of them return False the current
set of parameters will be rejected.
seed:
Seed for the RandomState used by the Fuzzer. This will
also be used to set the PyTorch random seed so that random
ops will create reproducible Tensors.
"""
import numpy as np
if seed is None:
seed = np.random.RandomState().randint(0, 2 ** 32 - 1, dtype=np.int64)
self._seed = seed
self._parameters = Fuzzer._unpack(parameters, FuzzedParameter)
self._tensors = Fuzzer._unpack(tensors, FuzzedTensor)
self._constraints = constraints or ()
p_names = {p.name for p in self._parameters}
t_names = {t.name for t in self._tensors}
name_overlap = p_names.intersection(t_names)
if name_overlap:
raise ValueError(f"Duplicate names in parameters and tensors: {name_overlap}")
self._rejections = 0
self._total_generated = 0
@staticmethod
def _unpack(values, cls):
return tuple(it.chain(
*[[i] if isinstance(i, cls) else i for i in values]
))
def take(self, n):
import numpy as np
state = np.random.RandomState(self._seed)
torch.manual_seed(state.randint(low=0, high=2 ** 63, dtype=np.int64))
for _ in range(n):
params = self._generate(state)
tensors = {}
tensor_properties = {}
for t in self._tensors:
tensor, properties = t._make_tensor(params, state)
tensors[t.name] = tensor
tensor_properties[t.name] = properties
yield tensors, tensor_properties, params
@property
def rejection_rate(self):
if not self._total_generated:
return 0.
return self._rejections / self._total_generated
def _generate(self, state):
strict_params: Dict[str, Union[float, int, ParameterAlias]] = {}
for _ in range(1000):
candidate_params: Dict[str, Union[float, int, ParameterAlias]] = {}
for p in self._parameters:
if p.strict:
if p.name in strict_params:
candidate_params[p.name] = strict_params[p.name]
else:
candidate_params[p.name] = p.sample(state)
strict_params[p.name] = candidate_params[p.name]
else:
candidate_params[p.name] = p.sample(state)
candidate_params = self._resolve_aliases(candidate_params)
self._total_generated += 1
if not all(f(candidate_params) for f in self._constraints):
self._rejections += 1
continue
if not all(t.satisfies_constraints(candidate_params) for t in self._tensors):
self._rejections += 1
continue
return candidate_params
raise ValueError("Failed to generate a set of valid parameters.")
@staticmethod
def _resolve_aliases(params):
params = dict(params)
alias_count = sum(isinstance(v, ParameterAlias) for v in params.values())
keys = list(params.keys())
while alias_count:
for k in keys:
v = params[k]
if isinstance(v, ParameterAlias):
params[k] = params[v.alias_to]
alias_count_new = sum(isinstance(v, ParameterAlias) for v in params.values())
if alias_count == alias_count_new:
raise ValueError(f"ParameterAlias cycle detected\n{params}")
alias_count = alias_count_new
return params
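# A minimal usage sketch (added for illustration; not part of the original file). The parameter
# names, bounds, and seed below are arbitrary examples chosen for this sketch.
if __name__ == "__main__":
    example_fuzzer = Fuzzer(
        parameters=[
            FuzzedParameter("n", minval=4, maxval=1024, distribution="loguniform"),
            FuzzedParameter("m", minval=4, maxval=1024, distribution="loguniform"),
        ],
        tensors=[
            FuzzedTensor("x", size=("n", "m"), probability_contiguous=0.75, max_elements=2 ** 18),
        ],
        seed=0,
    )
    for example_tensors, example_properties, example_params in example_fuzzer.take(3):
        print(example_params["n"], example_params["m"],
              example_properties["x"]["is_contiguous"], tuple(example_tensors["x"].shape))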

View File

@ -0,0 +1,121 @@
# mypy: allow-untyped-defs
from typing import Optional, Tuple, Union
from numbers import Number
import torch
from torch.utils.benchmark import FuzzedTensor
import math
class FuzzedSparseTensor(FuzzedTensor):
def __init__(
self,
name: str,
size: Tuple[Union[str, int], ...],
min_elements: Optional[int] = None,
max_elements: Optional[int] = None,
dim_parameter: Optional[str] = None,
sparse_dim: Optional[str] = None,
nnz: Optional[str] = None,
density: Optional[str] = None,
coalesced: Optional[str] = None,
dtype=torch.float32,
cuda=False
):
"""
Args:
name:
A string identifier for the generated Tensor.
size:
A tuple of integers or strings specifying the size of the generated
                Tensor. String values will be replaced with a concrete int during the
generation process, while ints are simply passed as literals.
min_elements:
                The minimum number of elements that this Tensor must have for a
set of parameters to be valid. (Otherwise they are resampled.)
max_elements:
Like `min_elements`, but setting an upper bound.
dim_parameter:
The length of `size` will be truncated to this value.
This allows Tensors of varying dimensions to be generated by the
Fuzzer.
sparse_dim:
The number of sparse dimensions in a sparse tensor.
density:
This value allows tensors of varying sparsities to be generated by the Fuzzer.
coalesced:
The sparse tensor format permits uncoalesced sparse tensors,
where there may be duplicate coordinates in the indices.
dtype:
The PyTorch dtype of the generated Tensor.
cuda:
Whether to place the Tensor on a GPU.
"""
super().__init__(name=name, size=size, min_elements=min_elements,
max_elements=max_elements, dim_parameter=dim_parameter, dtype=dtype, cuda=cuda)
self._density = density
self._coalesced = coalesced
self._sparse_dim = sparse_dim
@staticmethod
def sparse_tensor_constructor(size, dtype, sparse_dim, nnz, is_coalesced):
"""sparse_tensor_constructor creates a sparse tensor with coo format.
        Note that when `is_coalesced` is False the number of stored values is doubled, but the
        indices still describe the same number of non-zeros `nnz`; i.e., this is virtually the
        same tensor with the same sparsity pattern. Moreover, most sparse operations will call
        the coalesce() method, and what we want here is a sparse tensor with the same `nnz`
        whether or not it is coalesced. On the other hand, when `is_coalesced` is True the number
        of elements is reduced by the coalescing process by an unclear amount; however, the
        probability of generating duplicate indices is low in most cases. This decision was made
        deliberately to keep the construction cost as low as possible.
"""
if isinstance(size, Number):
size = [size] * sparse_dim
assert all(size[d] > 0 for d in range(sparse_dim)) or nnz == 0, 'invalid arguments'
v_size = [nnz] + list(size[sparse_dim:])
if dtype.is_floating_point:
v = torch.rand(size=v_size, dtype=dtype, device="cpu")
else:
v = torch.randint(1, 127, size=v_size, dtype=dtype, device="cpu")
i = torch.rand(sparse_dim, nnz, device="cpu")
i.mul_(torch.tensor(size[:sparse_dim]).unsqueeze(1).to(i))
i = i.to(torch.long)
if not is_coalesced:
v = torch.cat([v, torch.randn_like(v)], 0)
i = torch.cat([i, i], 1)
x = torch.sparse_coo_tensor(i, v, torch.Size(size))
if is_coalesced:
x = x.coalesce()
return x
def _make_tensor(self, params, state):
size, _, _ = self._get_size_and_steps(params)
density = params['density']
nnz = math.ceil(sum(size) * density)
assert nnz <= sum(size)
is_coalesced = params['coalesced']
sparse_dim = params['sparse_dim'] if self._sparse_dim else len(size)
sparse_dim = min(sparse_dim, len(size))
tensor = self.sparse_tensor_constructor(size, self._dtype, sparse_dim, nnz, is_coalesced)
if self._cuda:
tensor = tensor.cuda()
sparse_dim = tensor.sparse_dim()
dense_dim = tensor.dense_dim()
is_hybrid = len(size[sparse_dim:]) > 0
properties = {
"numel": int(tensor.numel()),
"shape": tensor.size(),
"is_coalesced": tensor.is_coalesced(),
"density": density,
"sparsity": 1.0 - density,
"sparse_dim": sparse_dim,
"dense_dim": dense_dim,
"is_hybrid": is_hybrid,
"dtype": str(self._dtype),
}
return tensor, properties
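# A minimal usage sketch (added for illustration; not part of the original file). The parameter
# names, choices, and seed below are arbitrary examples; note that `density` and `coalesced`
# must name FuzzedParameters because _make_tensor reads them from the sampled params.
if __name__ == "__main__":
    from torch.utils.benchmark import Fuzzer, FuzzedParameter
    example_fuzzer = Fuzzer(
        parameters=[
            FuzzedParameter("n", minval=16, maxval=4096, distribution="loguniform"),
            FuzzedParameter("density", distribution={0.1: 0.4, 0.5: 0.4, 0.9: 0.2}),
            FuzzedParameter("coalesced", distribution={True: 0.5, False: 0.5}),
        ],
        tensors=[FuzzedSparseTensor("x", size=("n", "n"), density="density", coalesced="coalesced")],
        seed=0,
    )
    for example_tensors, example_properties, _ in example_fuzzer.take(3):
        print(example_tensors["x"].shape, example_properties["x"]["is_coalesced"],
              example_properties["x"]["sparsity"])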

View File

@ -0,0 +1,43 @@
/* C++ template for Timer.timeit
This template will be consumed by `cpp_jit.py`, and will replace:
`GLOBAL_SETUP_TEMPLATE_LOCATION`,
`SETUP_TEMPLATE_LOCATION`
and
`STMT_TEMPLATE_LOCATION`
sections with user provided statements.
*/
#include <chrono>
#include <c10/util/irange.h>
#include <torch/csrc/utils/pybind.h>
#include <pybind11/pybind11.h>
#include <torch/extension.h>
// Global setup. (e.g. #includes)
// GLOBAL_SETUP_TEMPLATE_LOCATION
double timeit(int n) {
pybind11::gil_scoped_release no_gil;
// Setup
// SETUP_TEMPLATE_LOCATION
{
// Warmup
// STMT_TEMPLATE_LOCATION
}
// Main loop
auto start_time = std::chrono::high_resolution_clock::now();
for (const auto loop_idx : c10::irange(n)) {
(void)loop_idx;
// STMT_TEMPLATE_LOCATION
}
auto end_time = std::chrono::high_resolution_clock::now();
return std::chrono::duration<double>(end_time - start_time).count();
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("timeit", &timeit);
}

View File

@ -0,0 +1,537 @@
"""Timer class based on the timeit.Timer class, but torch aware."""
import enum
import timeit
import textwrap
from typing import overload, Any, Callable, Dict, List, NoReturn, Optional, Tuple, Type, Union
import torch
from torch.utils.benchmark.utils import common, cpp_jit
from torch.utils.benchmark.utils._stubs import TimerClass, TimeitModuleType
from torch.utils.benchmark.utils.valgrind_wrapper import timer_interface as valgrind_timer_interface
__all__ = ["Timer", "timer", "Language"]
if torch.backends.cuda.is_built() and torch.cuda.is_available(): # type: ignore[no-untyped-call]
def timer() -> float:
torch.cuda.synchronize()
return timeit.default_timer()
elif torch._C._get_privateuse1_backend_name() != "privateuseone":
privateuse1_device_handler = getattr(torch, torch._C._get_privateuse1_backend_name(), None) \
if torch._C._get_privateuse1_backend_name() != "cpu" else None
def timer() -> float:
if privateuse1_device_handler:
privateuse1_device_handler.synchronize()
return timeit.default_timer()
else:
timer = timeit.default_timer
class Language(enum.Enum):
PYTHON = 0
CPP = 1
class CPPTimer:
def __init__(
self,
stmt: str,
setup: str,
global_setup: str,
timer: Callable[[], float],
globals: Dict[str, Any],
) -> None:
if timer is not timeit.default_timer:
raise NotImplementedError(
"PyTorch was built with CUDA and a GPU is present; however "
"Timer does not yet support GPU measurements. If your "
"code is CPU only, pass `timer=timeit.default_timer` to the "
"Timer's constructor to indicate this. (Note that this will "
"produce incorrect results if the GPU is in fact used, as "
"Timer will not synchronize CUDA.)"
)
if globals:
raise ValueError("C++ timing does not support globals.")
self._stmt: str = textwrap.dedent(stmt)
self._setup: str = textwrap.dedent(setup)
self._global_setup: str = textwrap.dedent(global_setup)
self._timeit_module: Optional[TimeitModuleType] = None
def timeit(self, number: int) -> float:
if self._timeit_module is None:
self._timeit_module = cpp_jit.compile_timeit_template(
stmt=self._stmt,
setup=self._setup,
global_setup=self._global_setup,
)
return self._timeit_module.timeit(number)
class Timer:
"""Helper class for measuring execution time of PyTorch statements.
For a full tutorial on how to use this class, see:
https://pytorch.org/tutorials/recipes/recipes/benchmark.html
The PyTorch Timer is based on `timeit.Timer` (and in fact uses
`timeit.Timer` internally), but with several key differences:
1) Runtime aware:
Timer will perform warmups (important as some elements of PyTorch are
lazily initialized), set threadpool size so that comparisons are
apples-to-apples, and synchronize asynchronous CUDA functions when
necessary.
2) Focus on replicates:
When measuring code, and particularly complex kernels / models,
run-to-run variation is a significant confounding factor. It is
expected that all measurements should include replicates to quantify
noise and allow median computation, which is more robust than mean.
To that effect, this class deviates from the `timeit` API by
conceptually merging `timeit.Timer.repeat` and `timeit.Timer.autorange`.
(Exact algorithms are discussed in method docstrings.) The `timeit`
method is replicated for cases where an adaptive strategy is not
desired.
3) Optional metadata:
When defining a Timer, one can optionally specify `label`, `sub_label`,
`description`, and `env`. (Defined later) These fields are included in
the representation of result object and by the `Compare` class to group
and display results for comparison.
4) Instruction counts
In addition to wall times, Timer can run a statement under Callgrind
and report instructions executed.
Directly analogous to `timeit.Timer` constructor arguments:
`stmt`, `setup`, `timer`, `globals`
PyTorch Timer specific constructor arguments:
`label`, `sub_label`, `description`, `env`, `num_threads`
Args:
stmt: Code snippet to be run in a loop and timed.
setup: Optional setup code. Used to define variables used in `stmt`
global_setup: (C++ only)
Code which is placed at the top level of the file for things like
`#include` statements.
timer:
Callable which returns the current time. If PyTorch was built
without CUDA or there is no GPU present, this defaults to
`timeit.default_timer`; otherwise it will synchronize CUDA before
measuring the time.
globals:
A dict which defines the global variables when `stmt` is being
executed. This is the other method for providing variables which
`stmt` needs.
label:
String which summarizes `stmt`. For instance, if `stmt` is
"torch.nn.functional.relu(torch.add(x, 1, out=out))"
one might set label to "ReLU(x + 1)" to improve readability.
sub_label:
Provide supplemental information to disambiguate measurements
with identical stmt or label. For instance, in our example
above sub_label might be "float" or "int", so that it is easy
to differentiate:
"ReLU(x + 1): (float)"
"ReLU(x + 1): (int)"
when printing Measurements or summarizing using `Compare`.
description:
String to distinguish measurements with identical label and
sub_label. The principal use of `description` is to signal to
`Compare` the columns of data. For instance one might set it
based on the input size to create a table of the form: ::
| n=1 | n=4 | ...
------------- ...
ReLU(x + 1): (float) | ... | ... | ...
ReLU(x + 1): (int) | ... | ... | ...
using `Compare`. It is also included when printing a Measurement.
env:
This tag indicates that otherwise identical tasks were run in
different environments, and are therefore not equivalent, for
instance when A/B testing a change to a kernel. `Compare` will
treat Measurements with different `env` specification as distinct
when merging replicate runs.
num_threads:
The size of the PyTorch threadpool when executing `stmt`. Single
threaded performance is important as both a key inference workload
and a good indicator of intrinsic algorithmic efficiency, so the
default is set to one. This is in contrast to the default PyTorch
threadpool size which tries to utilize all cores.
"""
_timer_cls: Type[TimerClass] = timeit.Timer
def __init__(
self,
stmt: str = "pass",
setup: str = "pass",
global_setup: str = "",
timer: Callable[[], float] = timer,
globals: Optional[Dict[str, Any]] = None,
label: Optional[str] = None,
sub_label: Optional[str] = None,
description: Optional[str] = None,
env: Optional[str] = None,
num_threads: int = 1,
language: Union[Language, str] = Language.PYTHON,
):
if not isinstance(stmt, str):
raise ValueError("Currently only a `str` stmt is supported.")
# We copy `globals` to prevent mutations from leaking.
# (For instance, `eval` adds the `__builtins__` key)
self._globals = dict(globals or {})
timer_kwargs = {}
if language in (Language.PYTHON, "py", "python"):
# Include `torch` if not specified as a convenience feature.
self._globals.setdefault("torch", torch)
self._language: Language = Language.PYTHON
if global_setup:
raise ValueError(
f"global_setup is C++ only, got `{global_setup}`. Most "
"likely this code can simply be moved to `setup`."
)
elif language in (Language.CPP, "cpp", "c++"):
assert self._timer_cls is timeit.Timer, "_timer_cls has already been swapped."
self._timer_cls = CPPTimer
setup = ("" if setup == "pass" else setup)
self._language = Language.CPP
timer_kwargs["global_setup"] = global_setup
else:
raise ValueError(f"Invalid language `{language}`.")
# Convenience adjustment so that multi-line code snippets defined in
# functions do not IndentationError (Python) or look odd (C++). The
# leading newline removal is for the initial newline that appears when
# defining block strings. For instance:
# textwrap.dedent("""
# print("This is a stmt")
# """)
# produces '\nprint("This is a stmt")\n'.
#
# Stripping this down to 'print("This is a stmt")' doesn't change
# what gets executed, but it makes __repr__'s nicer.
stmt = textwrap.dedent(stmt)
stmt = (stmt[1:] if stmt and stmt[0] == "\n" else stmt).rstrip()
setup = textwrap.dedent(setup)
setup = (setup[1:] if setup and setup[0] == "\n" else setup).rstrip()
self._timer = self._timer_cls(
stmt=stmt,
setup=setup,
timer=timer,
globals=valgrind_timer_interface.CopyIfCallgrind.unwrap_all(self._globals),
**timer_kwargs,
)
self._task_spec = common.TaskSpec(
stmt=stmt,
setup=setup,
global_setup=global_setup,
label=label,
sub_label=sub_label,
description=description,
env=env,
num_threads=num_threads,
)
def _timeit(self, number: int) -> float:
# Even calling a timer in C++ takes ~50 ns, so no real operation should
# take less than 1 ns. (And this prevents divide by zero errors.)
return max(self._timer.timeit(number), 1e-9)
def timeit(self, number: int = 1000000) -> common.Measurement:
"""Mirrors the semantics of timeit.Timer.timeit().
Execute the main statement (`stmt`) `number` times.
https://docs.python.org/3/library/timeit.html#timeit.Timer.timeit
"""
with common.set_torch_threads(self._task_spec.num_threads):
# Warmup
self._timeit(number=max(int(number // 100), 2))
return common.Measurement(
number_per_run=number,
raw_times=[self._timeit(number=number)],
task_spec=self._task_spec
)
def repeat(self, repeat: int = -1, number: int = -1) -> None:
raise NotImplementedError("See `Timer.blocked_autorange.`")
def autorange(self, callback: Optional[Callable[[int, float], NoReturn]] = None) -> None:
raise NotImplementedError("See `Timer.blocked_autorange.`")
def _threaded_measurement_loop(
self,
number: int,
time_hook: Callable[[], float],
stop_hook: Callable[[List[float]], bool],
min_run_time: float,
max_run_time: Optional[float] = None,
callback: Optional[Callable[[int, float], NoReturn]] = None
) -> List[float]:
total_time = 0.0
can_stop = False
times: List[float] = []
with common.set_torch_threads(self._task_spec.num_threads):
while (total_time < min_run_time) or (not can_stop):
time_spent = time_hook()
times.append(time_spent)
total_time += time_spent
if callback:
callback(number, time_spent)
can_stop = stop_hook(times)
if max_run_time and total_time > max_run_time:
break
return times
def _estimate_block_size(self, min_run_time: float) -> int:
with common.set_torch_threads(self._task_spec.num_threads):
# Estimate the block size needed for measurement to be negligible
# compared to the inner loop. This also serves as a warmup.
overhead = torch.tensor([self._timeit(0) for _ in range(5)]).median().item()
number = 1
while True:
time_taken = self._timeit(number)
relative_overhead = overhead / time_taken
if relative_overhead <= 1e-4 and time_taken >= min_run_time / 1000:
break
if time_taken > min_run_time:
break
# Avoid overflow in C++ pybind11 interface
if number * 10 > 2147483647:
break
number *= 10
return number
def blocked_autorange(
self,
callback: Optional[Callable[[int, float], NoReturn]] = None,
min_run_time: float = 0.2,
) -> common.Measurement:
"""Measure many replicates while keeping timer overhead to a minimum.
At a high level, blocked_autorange executes the following pseudo-code::
`setup`
total_time = 0
while total_time < min_run_time
start = timer()
for _ in range(block_size):
`stmt`
total_time += (timer() - start)
Note the variable `block_size` in the inner loop. The choice of block
size is important to measurement quality, and must balance two
competing objectives:
1) A small block size results in more replicates and generally
better statistics.
2) A large block size better amortizes the cost of `timer`
invocation, and results in a less biased measurement. This is
important because CUDA synchronization time is non-trivial
(order single to low double digit microseconds) and would
otherwise bias the measurement.
blocked_autorange sets block_size by running a warmup period,
increasing block size until timer overhead is less than 0.1% of
the overall computation. This value is then used for the main
measurement loop.
Returns:
A `Measurement` object that contains measured runtimes and
repetition counts, and can be used to compute statistics.
(mean, median, etc.)
"""
number = self._estimate_block_size(min_run_time)
def time_hook() -> float:
return self._timeit(number)
def stop_hook(times: List[float]) -> bool:
return True
times = self._threaded_measurement_loop(
number, time_hook, stop_hook,
min_run_time=min_run_time,
callback=callback)
return common.Measurement(
number_per_run=number,
raw_times=times,
task_spec=self._task_spec
)
def adaptive_autorange(
self,
threshold: float = 0.1,
*,
min_run_time: float = 0.01,
max_run_time: float = 10.0,
callback: Optional[Callable[[int, float], NoReturn]] = None,
) -> common.Measurement:
"""Similar to `blocked_autorange` but also checks for variablility in measurements
and repeats until iqr/median is smaller than `threshold` or `max_run_time` is reached.
At a high level, adaptive_autorange executes the following pseudo-code::
`setup`
times = []
while times.sum < max_run_time
start = timer()
for _ in range(block_size):
`stmt`
times.append(timer() - start)
                enough_data = len(times) > 3 and times.sum > min_run_time
                small_iqr = times.iqr / times.median < threshold
if enough_data and small_iqr:
break
Args:
threshold: value of iqr/median threshold for stopping
min_run_time: total runtime needed before checking `threshold`
max_run_time: total runtime for all measurements regardless of `threshold`
Returns:
A `Measurement` object that contains measured runtimes and
repetition counts, and can be used to compute statistics.
(mean, median, etc.)
"""
number = self._estimate_block_size(min_run_time=0.05)
def time_hook() -> float:
return self._timeit(number)
def stop_hook(times: List[float]) -> bool:
if len(times) > 3:
return common.Measurement(
number_per_run=number,
raw_times=times,
task_spec=self._task_spec
).meets_confidence(threshold=threshold)
return False
times = self._threaded_measurement_loop(
number, time_hook, stop_hook, min_run_time, max_run_time, callback=callback)
return common.Measurement(
number_per_run=number,
raw_times=times,
task_spec=self._task_spec
)
@overload
def collect_callgrind(
self,
number: int,
*,
repeats: None,
collect_baseline: bool,
retain_out_file: bool,
) -> valgrind_timer_interface.CallgrindStats:
...
@overload
def collect_callgrind(
self,
number: int,
*,
repeats: int,
collect_baseline: bool,
retain_out_file: bool,
) -> Tuple[valgrind_timer_interface.CallgrindStats, ...]:
...
def collect_callgrind(
self,
number: int = 100,
*,
repeats: Optional[int] = None,
collect_baseline: bool = True,
retain_out_file: bool = False,
) -> Any:
"""Collect instruction counts using Callgrind.
Unlike wall times, instruction counts are deterministic
(modulo non-determinism in the program itself and small amounts of
jitter from the Python interpreter.) This makes them ideal for detailed
performance analysis. This method runs `stmt` in a separate process
so that Valgrind can instrument the program. Performance is severely
degraded due to the instrumentation, however this is ameliorated by
the fact that a small number of iterations is generally sufficient to
obtain good measurements.
        In order to use this method `valgrind`, `callgrind_control`, and
`callgrind_annotate` must be installed.
Because there is a process boundary between the caller (this process)
and the `stmt` execution, `globals` cannot contain arbitrary in-memory
data structures. (Unlike timing methods) Instead, globals are
restricted to builtins, `nn.Modules`'s, and TorchScripted functions/modules
to reduce the surprise factor from serialization and subsequent
deserialization. The `GlobalsBridge` class provides more detail on this
subject. Take particular care with nn.Modules: they rely on pickle and
you may need to add an import to `setup` for them to transfer properly.
By default, a profile for an empty statement will be collected and
cached to indicate how many instructions are from the Python loop which
drives `stmt`.
Returns:
A `CallgrindStats` object which provides instruction counts and
some basic facilities for analyzing and manipulating results.
"""
if not isinstance(self._task_spec.stmt, str):
raise ValueError("`collect_callgrind` currently only supports string `stmt`")
if repeats is not None and repeats < 1:
raise ValueError("If specified, `repeats` must be >= 1")
# Check that the statement is valid. It doesn't guarantee success, but it's much
# simpler and quicker to raise an exception for a faulty `stmt` or `setup` in
# the parent process rather than the valgrind subprocess.
self._timeit(1)
is_python = (self._language == Language.PYTHON)
assert is_python or not self._globals
result = valgrind_timer_interface.wrapper_singleton().collect_callgrind(
task_spec=self._task_spec,
globals=self._globals,
number=number,
repeats=repeats or 1,
collect_baseline=collect_baseline and is_python,
is_python=is_python,
retain_out_file=retain_out_file,
)
return (result[0] if repeats is None else result)
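# A minimal usage sketch (added for illustration; not part of the original file). The statement,
# shape, and labels below are arbitrary examples; instruction counting via `collect_callgrind`
# additionally requires valgrind to be installed.
if __name__ == "__main__":
    example_timer = Timer(
        stmt="torch.mm(x, x)",
        setup="x = torch.ones((256, 256))",
        label="mm",
        description="256x256",
        num_threads=1,
    )
    print(example_timer.timeit(100))           # fixed number of runs, like timeit.Timer
    print(example_timer.blocked_autorange())   # adaptive block size, many replicates
    print(example_timer.adaptive_autorange())  # repeats until IQR/median is below the threshold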

View File

@ -0,0 +1,129 @@
/*
----------------------------------------------------------------
Notice that the following BSD-style license applies to this one
file (callgrind.h) only. The rest of Valgrind is licensed under the
terms of the GNU General Public License, version 2, unless
otherwise indicated. See the COPYING file in the source
distribution for details.
----------------------------------------------------------------
This file is part of callgrind, a valgrind tool for cache simulation
and call tree tracing.
Copyright (C) 2003-2017 Josef Weidendorfer. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. The origin of this software must not be misrepresented; you must
not claim that you wrote the original software. If you use this
software in a product, an acknowledgment in the product
documentation would be appreciated but is not required.
3. Altered source versions must be plainly marked as such, and must
not be misrepresented as being the original software.
4. The name of the author may not be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
----------------------------------------------------------------
Notice that the above BSD-style license applies to this one file
(callgrind.h) only. The entire rest of Valgrind is licensed under
the terms of the GNU General Public License, version 2. See the
COPYING file in the source distribution for details.
----------------------------------------------------------------
*/
#ifndef __CALLGRIND_H
#define __CALLGRIND_H
#include "valgrind.h"
/* !! ABIWARNING !! ABIWARNING !! ABIWARNING !! ABIWARNING !!
This enum comprises an ABI exported by Valgrind to programs
which use client requests. DO NOT CHANGE THE ORDER OF THESE
ENTRIES, NOR DELETE ANY -- add new ones at the end.
The identification ('C','T') for Callgrind has historical
reasons: it was called "Calltree" before. Besides, ('C','G') would
clash with cachegrind.
*/
typedef
enum {
VG_USERREQ__DUMP_STATS = VG_USERREQ_TOOL_BASE('C','T'),
VG_USERREQ__ZERO_STATS,
VG_USERREQ__TOGGLE_COLLECT,
VG_USERREQ__DUMP_STATS_AT,
VG_USERREQ__START_INSTRUMENTATION,
VG_USERREQ__STOP_INSTRUMENTATION
} Vg_CallgrindClientRequest;
/* Dump current state of cost centers, and zero them afterwards */
#define CALLGRIND_DUMP_STATS \
VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DUMP_STATS, \
0, 0, 0, 0, 0)
/* Dump current state of cost centers, and zero them afterwards.
The argument is appended to a string stating the reason which triggered
the dump. This string is written as a description field into the
profile data dump. */
#define CALLGRIND_DUMP_STATS_AT(pos_str) \
VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__DUMP_STATS_AT, \
pos_str, 0, 0, 0, 0)
/* Zero cost centers */
#define CALLGRIND_ZERO_STATS \
VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__ZERO_STATS, \
0, 0, 0, 0, 0)
/* Toggles collection state.
The collection state specifies whether the happening of events
should be noted or if they are to be ignored. Events are noted
by increment of counters in a cost center */
#define CALLGRIND_TOGGLE_COLLECT \
VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__TOGGLE_COLLECT, \
0, 0, 0, 0, 0)
/* Start full callgrind instrumentation if not already switched on.
When cache simulation is done, it will flush the simulated cache;
this will lead to an artificial cache warmup phase afterwards with
cache misses which would not have happened in reality. */
#define CALLGRIND_START_INSTRUMENTATION \
VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__START_INSTRUMENTATION, \
0, 0, 0, 0, 0)
/* Stop full callgrind instrumentation if not already switched off.
This flushes Valgrinds translation cache, and does no additional
instrumentation afterwards, which effectivly will run at the same
speed as the "none" tool (ie. at minimal slowdown).
Use this to bypass Callgrind aggregation for uninteresting code parts.
To start Callgrind in this mode to ignore the setup phase, use
the option "--instr-atstart=no". */
#define CALLGRIND_STOP_INSTRUMENTATION \
VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__STOP_INSTRUMENTATION, \
0, 0, 0, 0, 0)
#endif /* __CALLGRIND_H */

View File

@ -0,0 +1,35 @@
/* Used to collect profiles of old versions of PyTorch. */
#include <callgrind.h>
#include <pybind11/pybind11.h>
bool _valgrind_supported_platform() {
#if defined(NVALGRIND)
return false;
#else
return true;
#endif
}
void _valgrind_toggle() {
#if defined(NVALGRIND)
TORCH_CHECK(false, "Valgrind is not supported.");
#else
CALLGRIND_TOGGLE_COLLECT;
#endif
}
void _valgrind_toggle_and_dump_stats() {
#if defined(NVALGRIND)
TORCH_CHECK(false, "Valgrind is not supported.");
#else
// NB: See note in Module.cpp
CALLGRIND_TOGGLE_COLLECT;
CALLGRIND_DUMP_STATS;
#endif
}
PYBIND11_MODULE(callgrind_bindings, m) {
m.def("_valgrind_supported_platform", &_valgrind_supported_platform);
m.def("_valgrind_toggle", &_valgrind_toggle);
m.def("_valgrind_toggle_and_dump_stats", &_valgrind_dump_stats);
}

View File

@ -0,0 +1,68 @@
/* C++ template for Timer.collect_callgrind
This template will be consumed by `cpp_jit.py`, and will replace:
`GLOBAL_SETUP_TEMPLATE_LOCATION`,
`SETUP_TEMPLATE_LOCATION`
and
`STMT_TEMPLATE_LOCATION`
sections with user provided statements.
*/
#include <c10/util/irange.h>
#include <callgrind.h>
#include <torch/torch.h>
#include <string>
// Global setup. (e.g. #includes)
// GLOBAL_SETUP_TEMPLATE_LOCATION
#if defined(NVALGRIND)
static_assert(false);
#endif
int main(int argc, char* argv[]) {
// This file should only be called inside of `Timer`, so we can adopt a
// very simple and rigid argument parsing scheme.
TORCH_CHECK(argc == 9);
TORCH_CHECK(std::string(argv[1]) == "--number");
auto number = std::stoi(argv[2]);
TORCH_CHECK(
std::string(argv[3]) == "--number-warmup" ||
std::string(argv[3]) == "--number_warmup");
auto number_warmup = std::stoi(argv[4]);
TORCH_CHECK(std::string(argv[5]) == "--repeats");
auto repeats = std::stoi(argv[6]);
TORCH_CHECK(
std::string(argv[7]) == "--number-threads" ||
std::string(argv[7]) == "--number_threads");
auto number_threads = std::stoi(argv[8]);
torch::set_num_threads(number_threads);
// Setup
// SETUP_TEMPLATE_LOCATION
// Warmup
for (const auto i : c10::irange(number_warmup)) {
(void)i;
// STMT_TEMPLATE_LOCATION
}
// Main loop
for (const auto repeat : c10::irange(repeats)) {
(void)repeat;
CALLGRIND_TOGGLE_COLLECT;
for (const auto i : c10::irange(number)) {
(void)i;
// STMT_TEMPLATE_LOCATION
}
// NB: See note in Module.cpp
CALLGRIND_TOGGLE_COLLECT;
CALLGRIND_DUMP_STATS;
}
}

View File

@ -0,0 +1,907 @@
"""Intermediate layer between `Timer` and `valgrind`."""
import collections
import enum
import dataclasses
import itertools as it
import os
import pickle
import re
import shutil
import subprocess
import sys
import textwrap
from typing import (
cast, Any, Callable, DefaultDict, Dict, Iterator, List, NamedTuple,
Optional, Tuple, Union, TYPE_CHECKING)
import torch
from torch.utils.benchmark.utils import common, cpp_jit
from torch.utils.benchmark.utils._stubs import CallgrindModuleType
import operator
__all__ = ["FunctionCount", "FunctionCounts", "CallgrindStats", "CopyIfCallgrind"]
if TYPE_CHECKING:
CompletedProcessType = subprocess.CompletedProcess[str]
else:
CompletedProcessType = subprocess.CompletedProcess
class FunctionCount(NamedTuple):
# TODO(#105471): Rename the count field
count: int # type: ignore[assignment]
function: str
@dataclasses.dataclass(repr=False, eq=False, frozen=True)
class FunctionCounts:
"""Container for manipulating Callgrind results.
It supports:
1) Addition and subtraction to combine or diff results.
2) Tuple-like indexing.
3) A `denoise` function which strips CPython calls which are known to
be non-deterministic and quite noisy.
4) Two higher order methods (`filter` and `transform`) for custom
manipulation.
"""
_data: Tuple[FunctionCount, ...]
inclusive: bool
truncate_rows: bool = True
# For normal use, torch._tensor_str.PRINT_OPTS.linewidth determines
# the print settings. This is simply to allow hermetic unit tests.
_linewidth: Optional[int] = None
def __iter__(self) -> Iterator[FunctionCount]:
yield from self._data
def __len__(self) -> int:
return len(self._data)
def __getitem__(self, item: Any) -> Union[FunctionCount, "FunctionCounts"]:
data: Union[FunctionCount, Tuple[FunctionCount, ...]] = self._data[item]
return (
FunctionCounts(cast(Tuple[FunctionCount, ...], data), self.inclusive, truncate_rows=False)
if isinstance(data, tuple) else data
)
def __repr__(self) -> str:
count_len = 0
for c, _ in self:
# Account for sign in string length.
count_len = max(count_len, len(str(c)) + int(c < 0))
lines = []
linewidth = self._linewidth or torch._tensor_str.PRINT_OPTS.linewidth
fn_str_len = max(linewidth - count_len - 4, 40)
for c, fn in self:
if len(fn) > fn_str_len:
left_len = int((fn_str_len - 5) // 2)
fn = fn[:left_len] + " ... " + fn[-(fn_str_len - left_len - 5):]
lines.append(f" {c:>{count_len}} {fn}")
if self.truncate_rows and len(lines) > 18:
lines = lines[:9] + ["...".rjust(count_len + 2)] + lines[-9:]
if not self.inclusive:
lines.extend(["", f"Total: {self.sum()}"])
return "\n".join([super().__repr__()] + lines)
def __add__(
self,
other: "FunctionCounts",
) -> "FunctionCounts":
return self._merge(other, lambda c: c)
def __sub__(
self,
other: "FunctionCounts",
) -> "FunctionCounts":
return self._merge(other, operator.neg)
def __mul__(self, other: Union[int, float]) -> "FunctionCounts":
return self._from_dict({
fn: int(c * other) for c, fn in self._data
}, self.inclusive)
def transform(self, map_fn: Callable[[str], str]) -> "FunctionCounts":
"""Apply `map_fn` to all of the function names.
This can be used to regularize function names (e.g. stripping irrelevant
parts of the file path), coalesce entries by mapping multiple functions
to the same name (in which case the counts are added together), etc.
"""
counts: DefaultDict[str, int] = collections.defaultdict(int)
for c, fn in self._data:
counts[map_fn(fn)] += c
return self._from_dict(counts, self.inclusive)
def filter(self, filter_fn: Callable[[str], bool]) -> "FunctionCounts":
"""Keep only the elements where `filter_fn` applied to function name returns True."""
return FunctionCounts(tuple(i for i in self if filter_fn(i.function)), self.inclusive)
def sum(self) -> int:
return sum(c for c, _ in self)
def denoise(self) -> "FunctionCounts":
"""Remove known noisy instructions.
Several instructions in the CPython interpreter are rather noisy. These
instructions involve unicode to dictionary lookups which Python uses to
map variable names. FunctionCounts is generally a content agnostic
container, however this is sufficiently important for obtaining
reliable results to warrant an exception."""
return self.filter(lambda fn: "dictobject.c:lookdict_unicode" not in fn)
def _merge(
self,
second: "FunctionCounts",
merge_fn: Callable[[int], int]
) -> "FunctionCounts":
assert self.inclusive == second.inclusive, "Cannot merge inclusive and exclusive counts."
counts: DefaultDict[str, int] = collections.defaultdict(int)
for c, fn in self:
counts[fn] += c
for c, fn in second:
counts[fn] += merge_fn(c)
return self._from_dict(counts, self.inclusive)
@staticmethod
def _from_dict(counts: Dict[str, int], inclusive: bool) -> "FunctionCounts":
flat_counts = (FunctionCount(c, fn) for fn, c in counts.items() if c)
return FunctionCounts(tuple(sorted(flat_counts, reverse=True)), inclusive)
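# Illustrative helper (added for this sketch; not part of the original file): one common way to
# combine the FunctionCounts operations above when comparing two CallgrindStats results. The
# function name and the substring filter are arbitrary examples.
def _example_denoised_delta(before: "CallgrindStats", after: "CallgrindStats") -> FunctionCounts:
    # Standardize paths so equivalent call sites cancel, diff the runs, then drop noisy CPython
    # dict lookups and keep only PyTorch-related symbols.
    delta = after.as_standardized().delta(before.as_standardized())
    return delta.denoise().filter(lambda fn: "torch" in fn or "aten" in fn)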
@dataclasses.dataclass(repr=False, eq=False, frozen=True)
class CallgrindStats:
"""Top level container for Callgrind results collected by Timer.
Manipulation is generally done using the FunctionCounts class, which is
obtained by calling `CallgrindStats.stats(...)`. Several convenience
methods are provided as well; the most significant is
`CallgrindStats.as_standardized()`.
"""
task_spec: common.TaskSpec
number_per_run: int
built_with_debug_symbols: bool
baseline_inclusive_stats: FunctionCounts
baseline_exclusive_stats: FunctionCounts
stmt_inclusive_stats: FunctionCounts
stmt_exclusive_stats: FunctionCounts
stmt_callgrind_out: Optional[str]
def __repr__(self) -> str:
newline = "\n" # `\` cannot appear in fstring code section.
base_stats = self.baseline_exclusive_stats
output = f"""
{super().__repr__()}
{self.task_spec.summarize()}
{'':>25}All{'':>10}Noisy symbols removed
Instructions: {self.counts(denoise=False):>12}{'':>15}{self.counts(denoise=True):>12}
Baseline: {base_stats.sum():>12}{'':>15}{base_stats.denoise().sum():>12}
{self.number_per_run} runs per measurement, {self.task_spec.num_threads} thread{'s' if self.task_spec.num_threads > 1 else ''}
""".strip()
if not self.built_with_debug_symbols:
output += textwrap.dedent("""
Warning: PyTorch was not built with debug symbols.
Source information may be limited. Rebuild with
REL_WITH_DEB_INFO=1 for more detailed results.""")
return output
def stats(self, inclusive: bool = False) -> FunctionCounts:
"""Returns detailed function counts.
Conceptually, the FunctionCounts returned can be thought of as a tuple
of (count, path_and_function_name) tuples.
`inclusive` matches the semantics of callgrind. If True, the counts
include instructions executed by children. `inclusive=True` is useful
for identifying hot spots in code; `inclusive=False` is useful for
reducing noise when diffing counts from two different runs. (See
CallgrindStats.delta(...) for more details)
"""
return self.stmt_inclusive_stats if inclusive else self.stmt_exclusive_stats
def counts(self, *, denoise: bool = False) -> int:
"""Returns the total number of instructions executed.
See `FunctionCounts.denoise()` for an explanation of the `denoise` arg.
"""
stats = self.stmt_exclusive_stats
return (stats.denoise() if denoise else stats).sum()
# FIXME: Once 3.7 is the minimum version, type annotate `other` per PEP 563
def delta(
self,
other: "CallgrindStats",
inclusive: bool = False,
) -> FunctionCounts:
"""Diff two sets of counts.
        One common reason to collect instruction counts is to determine the
        effect that a particular change will have on the number of instructions
        needed to perform some unit of work. If a change increases that number, the
        next logical question is "why". This generally involves looking at what part
        of the code increased in instruction count. This function automates that
        process so that one can easily diff counts on both an inclusive and
        exclusive basis.
        """
return self.stats(inclusive=inclusive) - other.stats(inclusive=inclusive)
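    # Sketch (the two stats objects are hypothetical): an exclusive delta keeps
    # shared interpreter frames from drowning out the change being measured,
    # while an inclusive delta is better for locating the responsible call site.
    #
    #   increase = stats_new.delta(stats_old)                    # exclusive
    #   by_callsite = stats_new.delta(stats_old, inclusive=True)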
def as_standardized(self) -> "CallgrindStats":
"""Strip library names and some prefixes from function strings.
        When comparing two different sets of instruction counts, one stumbling
        block can be path prefixes. Callgrind includes the full filepath
        when reporting a function (as it should). However, this can cause
        issues when diffing profiles. If a key component such as Python
        or PyTorch was built in separate locations in the two profiles, this
        can result in something resembling::
23234231 /tmp/first_build_dir/thing.c:foo(...)
9823794 /tmp/first_build_dir/thing.c:bar(...)
...
53453 .../aten/src/Aten/...:function_that_actually_changed(...)
...
-9823794 /tmp/second_build_dir/thing.c:bar(...)
-23234231 /tmp/second_build_dir/thing.c:foo(...)
Stripping prefixes can ameliorate this issue by regularizing the
strings and causing better cancellation of equivalent call sites
when diffing.
"""
def strip(stats: FunctionCounts) -> FunctionCounts:
transforms = (
# PyTorch may have been built in different locations.
(r"^.+build/\.\./", "build/../"),
(r"^.+/" + re.escape("build/aten/"), "build/aten/"),
# "Python" and "Objects" come from CPython.
(r"^.+/" + re.escape("Python/"), "Python/"),
(r"^.+/" + re.escape("Objects/"), "Objects/"),
# Strip library name. e.g. `libtorch.so`
(r"\s\[.+\]$", ""),
)
for before, after in transforms:
stats = stats.transform(lambda fn: re.sub(before, after, fn))
return stats
return CallgrindStats(
task_spec=self.task_spec,
number_per_run=self.number_per_run,
built_with_debug_symbols=self.built_with_debug_symbols,
baseline_inclusive_stats=strip(self.baseline_inclusive_stats),
baseline_exclusive_stats=strip(self.baseline_exclusive_stats),
stmt_inclusive_stats=strip(self.stmt_inclusive_stats),
stmt_exclusive_stats=strip(self.stmt_exclusive_stats),
# `as_standardized` will change symbol names, so the contents will
# no longer map directly to `callgrind.out`
stmt_callgrind_out=None,
)
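# A workflow sketch combining the pieces above (stats_a / stats_b are
# hypothetical results collected from two builds): standardize both profiles
# first so that build-path differences cancel, then diff and denoise.
#
#   diff = (stats_a.as_standardized()
#           .delta(stats_b.as_standardized())
#           .denoise())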
class Serialization(enum.Enum):
PICKLE = 0
TORCH = 1
TORCH_JIT = 2
_GLOBALS_ALLOWED_TYPES: Dict[Serialization, Tuple[Any, ...]] = {
Serialization.PICKLE: (str, bytes, bool, int, float, complex),
Serialization.TORCH_JIT: (torch.jit.ScriptFunction, torch.jit.ScriptModule),
Serialization.TORCH: (torch.nn.Module,),
}
class CopyIfCallgrind:
"""Signal that a global may be replaced with a deserialized copy.
See `GlobalsBridge` for why this matters.
"""
def __init__(self, value: Any, *, setup: Optional[str] = None):
for method, supported_types in _GLOBALS_ALLOWED_TYPES.items():
if any(isinstance(value, t) for t in supported_types):
self._value: Any = value
self._setup: Optional[str] = setup
self._serialization: Serialization = method
break
else:
supported_str = "\n".join([
getattr(t, "__name__", repr(t))
                for t in it.chain(*_GLOBALS_ALLOWED_TYPES.values())])
raise ValueError(
f"Unsupported type: {type(value)}\n"
f"`collect_callgrind` restricts globals to the following types:\n"
f"{textwrap.indent(supported_str, ' ')}"
)
@property
def value(self) -> Any:
return self._value
@property
def setup(self) -> Optional[str]:
return self._setup
@property
def serialization(self) -> Serialization:
return self._serialization
@staticmethod
def unwrap_all(globals: Dict[str, Any]) -> Dict[str, Any]:
return {
k: (v.value if isinstance(v, CopyIfCallgrind) else v)
for k, v in globals.items()
}
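# Hedged usage sketch (the `model` object and the Timer import are assumptions,
# not part of this module): globals passed to `Timer.collect_callgrind` must be
# wrapped explicitly so that serialization into the Valgrind subprocess is
# opt-in rather than implicit.
#
#   from torch.utils.benchmark import Timer
#   timer = Timer(
#       "y = model(x)",
#       setup="x = torch.ones((4, 4))",
#       globals={"model": CopyIfCallgrind(model)},
#   )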
class GlobalsBridge:
"""Handle the transfer of (certain) globals when collecting Callgrind statistics.
Key takeaway: Any globals passed must be wrapped in `CopyIfCallgrind` to
work with `Timer.collect_callgrind`.
Consider the following code snippet:
```
import pickle
import timeit
class Counter:
value = 0
def __call__(self):
self.value += 1
counter = Counter()
timeit.Timer("counter()", globals={"counter": counter}).timeit(10)
print(counter.value) # 10
timeit.Timer(
"counter()",
globals={"counter": pickle.loads(pickle.dumps(counter))}
).timeit(20)
print(counter.value) # Still 10
```
In the first case, `stmt` is executed using the objects in `globals`;
however, the addition of serialization and deserialization changes the
semantics and may meaningfully change behavior.
This is a practical consideration when collecting Callgrind statistics.
    Unlike `exec`-based execution (which `timeit` uses under the hood), which
    can share in-memory data structures with the caller, Callgrind collection
    requires an entirely new process in order to run under Valgrind. This means
that any data structures used for statement execution will have to be
serialized and deserialized in the subprocess.
In order to avoid surprising semantics from (user invisible) process
boundaries, what can be passed through `globals` is severely restricted
for `Timer.collect_callgrind`. It is expected that most setup should be
achievable (albeit perhaps less ergonomically) by passing a `setup`
string.
    There are, however, exceptions. One such class is TorchScripted functions.
    Because they require a concrete file with source code, it is not possible
    to define them using a `setup` string. Another group is torch.nn.Modules,
whose construction can be complex and prohibitively cumbersome to coerce
into a `setup` string. Finally, most builtin types are sufficiently well
behaved and sufficiently common to warrant allowing as well. (e.g.
`globals={"n": 1}` is very convenient.)
Fortunately, all have well defined serialization semantics. This class
is responsible for enabling the Valgrind subprocess to use elements in
`globals` so long as they are an allowed type.
Caveats:
The user is required to acknowledge this serialization by wrapping
elements in `globals` with `CopyIfCallgrind`.
While ScriptFunction and ScriptModule are expected to save and load
quite robustly, it is up to the user to ensure that an nn.Module can
un-pickle successfully.
`torch.Tensor` and `np.ndarray` are deliberately excluded. The
serialization/deserialization process perturbs the representation of a
tensor in ways that could result in incorrect measurements. For example,
if a tensor lives in pinned CPU memory, this fact would not be preserved
by a dump, and that will in turn change the performance of certain CUDA
operations.
"""
def __init__(self, globals: Dict[str, Any], data_dir: str) -> None:
self._globals: Dict[str, CopyIfCallgrind] = {}
self._data_dir = data_dir
if not os.path.exists(data_dir):
os.mkdir(data_dir)
if globals.get("torch", torch) is not torch:
raise ValueError("`collect_callgrind` does not support mocking out `torch`.")
for name, value in globals.items():
if name in ("torch", "__builtins__"):
# Torch will be imported by the collection script, and
# __builtins__ is added by Timer.
continue
if not isinstance(value, CopyIfCallgrind):
raise ValueError(
"`collect_callgrind` requires that globals be wrapped in "
"`CopyIfCallgrind` so that serialization is explicit."
)
self._globals[name] = value
def construct(self) -> str:
load_lines = []
for name, wrapped_value in self._globals.items():
if wrapped_value.setup is not None:
load_lines.append(textwrap.dedent(wrapped_value.setup))
if wrapped_value.serialization == Serialization.PICKLE:
path = os.path.join(self._data_dir, f"{name}.pkl")
load_lines.append(
f"with open({repr(path)}, 'rb') as f:\n {name} = pickle.load(f)")
with open(path, "wb") as f:
pickle.dump(wrapped_value.value, f)
elif wrapped_value.serialization == Serialization.TORCH:
path = os.path.join(self._data_dir, f"{name}.pt")
load_lines.append(f"{name} = torch.load({repr(path)})")
torch.save(wrapped_value.value, path)
elif wrapped_value.serialization == Serialization.TORCH_JIT:
path = os.path.join(self._data_dir, f"{name}.pt")
load_lines.append(f"{name} = torch.jit.load({repr(path)})")
with open(path, "wb") as f:
torch.jit.save(wrapped_value.value, f) # type: ignore[no-untyped-call]
else:
raise NotImplementedError(
f"Unknown serialization method: {wrapped_value.serialization}")
return "\n".join(load_lines)
class _ValgrindWrapper:
def __init__(self) -> None:
self._bindings_module: Optional[CallgrindModuleType] = None
valgrind_symbols = (
"_valgrind_supported_platform",
"_valgrind_toggle",
"_valgrind_toggle_and_dump_stats",
)
if all(hasattr(torch._C, symbol) for symbol in valgrind_symbols):
self._supported_platform: bool = torch._C._valgrind_supported_platform()
else:
print("Callgrind bindings are not present in `torch._C`. JIT-ing bindings.")
self._bindings_module = cpp_jit.get_compat_bindings()
assert all(hasattr(self._bindings_module, symbol) for symbol in valgrind_symbols)
self._supported_platform = self._bindings_module._valgrind_supported_platform()
self._commands_available: Dict[str, bool] = {}
if self._supported_platform:
# Only bother checking on supported platforms.
for cmd in ("valgrind", "callgrind_control", "callgrind_annotate"):
self._commands_available[cmd] = not subprocess.run(
["which", cmd],
capture_output=True,
check=False,
).returncode
self._build_type: Optional[str] = None
build_search = re.search("BUILD_TYPE=(.+),", torch.__config__.show()) # type: ignore[no-untyped-call]
if build_search is not None:
self._build_type = build_search.groups()[0].split(",")[0]
def _validate(self) -> None:
if not self._supported_platform:
raise OSError("Valgrind is not supported on this platform.")
missing_cmds = [cmd for cmd, available in self._commands_available.items() if not available]
if missing_cmds:
raise OSError("Missing: " + ", ".join(missing_cmds))
def collect_callgrind(
self,
task_spec: common.TaskSpec,
globals: Dict[str, Any],
*,
number: int,
repeats: int,
collect_baseline: bool,
is_python: bool,
retain_out_file: bool,
) -> Tuple[CallgrindStats, ...]:
"""Collect stats, and attach a reference run which can be used to filter interpreter overhead."""
self._validate()
assert is_python or not collect_baseline
*task_stats, baseline_stats = self._invoke(
task_spec=task_spec,
globals=globals,
number=number,
repeats=repeats,
collect_baseline=collect_baseline,
is_python=is_python,
retain_out_file=retain_out_file,
)
assert len(task_stats) == repeats
return tuple(
CallgrindStats(
task_spec=task_spec,
number_per_run=number,
built_with_debug_symbols=self._build_type == "RelWithDebInfo",
baseline_inclusive_stats=baseline_stats[0],
baseline_exclusive_stats=baseline_stats[1],
stmt_inclusive_stats=stmt_inclusive_stats,
stmt_exclusive_stats=stmt_exclusive_stats,
stmt_callgrind_out=out_contents,
)
for stmt_inclusive_stats, stmt_exclusive_stats, out_contents in task_stats
)
def _invoke(
self,
*,
task_spec: common.TaskSpec,
globals: Dict[str, Any],
number: int,
repeats: int,
collect_baseline: bool,
is_python: bool,
retain_out_file: bool,
) -> Tuple[Tuple[FunctionCounts, FunctionCounts, Optional[str]], ...]:
"""Core invocation method for Callgrind collection.
Valgrind operates by effectively replacing the CPU with an emulated
version which allows it to instrument any code at the cost of severe
performance degradation. This has the practical effect that in order
to collect Callgrind statistics, a new process has to be created
running under `valgrind`. The steps for this process are:
1) Create a scratch directory.
2) Codegen a run script. (_ValgrindWrapper._construct_script)
Inside the run script:
* Validate that Python and torch match the parent process
* Validate that it is indeed running under valgrind
* Execute `setup` and warm up `stmt`
* Begin collecting stats
* Run the `stmt` loop
* Stop collecting stats
3) Parse the run results.
4) Cleanup the scratch directory.
"""
working_dir = common._make_temp_dir(prefix="callgrind")
data_dir = os.path.join(working_dir, "data")
script_file = os.path.join(working_dir, "timer_callgrind.py")
callgrind_out = os.path.join(working_dir, "callgrind.out")
error_log = os.path.join(working_dir, "error.txt")
stat_log = os.path.join(working_dir, "callgrind_stat.txt")
stdout_stderr_log = os.path.join(working_dir, "stdout_stderr.log")
def run(args: List[str], **kwargs: Any) -> Tuple[CompletedProcessType, str]:
# https://thraxil.org/users/anders/posts/2008/03/13/Subprocess-Hanging-PIPE-is-your-enemy/
f_stdout_stderr = open(stdout_stderr_log, "wb")
try:
invocation = subprocess.run(
args,
stdout=f_stdout_stderr,
stderr=subprocess.STDOUT,
**kwargs,
)
with open(stdout_stderr_log) as f:
return invocation, f.read()
finally:
f_stdout_stderr.close()
try:
if is_python:
if self._bindings_module is not None:
shutil.copy(
self._bindings_module.__file__,
os.path.join(working_dir, os.path.split(self._bindings_module.__file__)[1])
)
script_file = os.path.join(working_dir, "timer_callgrind.py")
with open(script_file, "w") as f:
f.write(self._construct_script(
task_spec,
globals=GlobalsBridge(globals, data_dir),
number=number,
repeats=repeats,
collect_baseline=collect_baseline,
error_log=error_log,
stat_log=stat_log,
bindings=self._bindings_module))
run_loop_cmd = ["python", script_file]
else:
assert not collect_baseline
run_loop_exec = cpp_jit.compile_callgrind_template(
stmt=task_spec.stmt,
setup=task_spec.setup,
global_setup=task_spec.global_setup,
)
run_loop_cmd = [
run_loop_exec,
"--number", str(number),
"--number-warmup", str(min(number, 10)),
"--repeats", str(repeats),
"--number-threads", str(task_spec.num_threads),
]
valgrind_invocation, valgrind_invocation_output = run([
"valgrind",
"--tool=callgrind",
f"--callgrind-out-file={callgrind_out}",
"--dump-line=yes",
"--dump-instr=yes",
"--instr-atstart=yes",
"--collect-atstart=no",
] + run_loop_cmd)
if valgrind_invocation.returncode:
error_report = ""
if os.path.exists(error_log):
with open(error_log) as f:
error_report = f.read()
if not error_report:
error_report = "Unknown error.\n" + valgrind_invocation_output
raise OSError(f"Failed to collect callgrind profile:\n{error_report}")
def parse_output(fpath: str, inclusive: bool) -> FunctionCounts:
annotate_invocation, annotate_invocation_output = run([
"callgrind_annotate",
f"--inclusive={'yes' if inclusive else 'no'}",
"--threshold=100",
"--show-percs=no",
fpath
], check=True)
total_pattern = re.compile(r"^([0-9,]+)\s+PROGRAM TOTALS")
begin_pattern = re.compile(r"Ir\s+file:function")
function_pattern = re.compile(r"^\s*([0-9,]+)\s+(.+:.+)$")
class ScanState(enum.Enum):
SCANNING_FOR_TOTAL = 0
SCANNING_FOR_START = 1
PARSING = 2
scan_state = ScanState.SCANNING_FOR_TOTAL
fn_counts = []
for l in annotate_invocation_output.splitlines(keepends=False):
if scan_state == ScanState.SCANNING_FOR_TOTAL:
total_match = total_pattern.match(l)
if total_match:
program_totals = int(total_match.groups()[0].replace(",", ""))
scan_state = ScanState.SCANNING_FOR_START
elif scan_state == ScanState.SCANNING_FOR_START:
if begin_pattern.match(l):
scan_state = ScanState.PARSING
else:
assert scan_state == ScanState.PARSING
fn_match = function_pattern.match(l)
if fn_match:
ir_str, file_function = fn_match.groups()
ir = int(ir_str.replace(",", ""))
if ir == program_totals: # type: ignore[possibly-undefined]
# Callgrind includes some top level red herring symbols when
# a program dumps multiple profiles.
continue
fn_counts.append(FunctionCount(ir, file_function))
elif re.match(r"-+", l):
# Ignore heading separator lines.
continue
else:
break
assert scan_state == ScanState.PARSING, f"Failed to parse {fpath}"
return FunctionCounts(tuple(sorted(fn_counts, reverse=True)), inclusive=inclusive)
def read_results(i: int) -> Tuple[FunctionCounts, FunctionCounts, Optional[str]]:
if i == repeats and not collect_baseline:
# Null baseline.
return (
FunctionCounts((), inclusive=True),
FunctionCounts((), inclusive=False),
None,
)
fpath = f"{callgrind_out}.{i + 1}" # Callgrind one-indexes files.
callgrind_out_contents: Optional[str] = None
if retain_out_file:
with open(fpath) as f:
callgrind_out_contents = f.read()
return (
parse_output(fpath, inclusive=True),
parse_output(fpath, inclusive=False),
callgrind_out_contents
)
return tuple(read_results(i) for i in range(repeats + 1))
finally:
shutil.rmtree(working_dir)
@staticmethod
def _construct_script(
task_spec: common.TaskSpec,
globals: GlobalsBridge,
*,
number: int,
repeats: int,
collect_baseline: bool,
error_log: str,
stat_log: str,
bindings: Optional[CallgrindModuleType],
) -> str:
def block_stmt(stmt: str, indent: int = 0) -> str:
"""Partially unroll benchmark loop.
The naive template looks something like:
"for _ in range({number}): {stmt}"
            However, a loop in Python is surprisingly expensive, and significantly
increases the number of background Python instructions. So instead we
partially unroll the loops, with a block size of 100 chosen to keep
the instruction overhead from `range` low while also not ballooning
the size of the generated file.
"""
block_size = 100
loop_count = number // block_size
if loop_count == 1:
# There is no point in having `for _ in range(1): ...` rather
                # than just `...`, and this lets us shave off a few background
# instructions.
loop_count = 0
remainder = number - block_size * loop_count
blocked_stmt = ""
if loop_count:
unrolled_stmts = textwrap.indent("\n".join([stmt] * block_size), " " * 4)
blocked_stmt += f"for _ in range({loop_count}):\n{unrolled_stmts}\n"
if remainder:
blocked_stmt += "\n".join([stmt] * remainder)
return textwrap.indent(blocked_stmt, " " * indent)
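        # Illustrative only: with number=250 and the block size of 100 above,
        # `block_stmt` generates
        #     for _ in range(2):
        #         <stmt repeated 100 times>
        #     <stmt repeated 50 times>
        # i.e. two blocks of 100 plus a remainder of 50 unrolled statements.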
pass_baseline = (
"callgrind_bindings._valgrind_toggle()\n"
f"{block_stmt('pass')}\n"
"callgrind_bindings._valgrind_toggle_and_dump_stats()"
)
return textwrap.dedent(r"""
import gc
import os
import pickle
import subprocess
import sys
import time
# Mitigate https://github.com/pytorch/pytorch/issues/37377
# which can sometimes cause the subprocess call to fail.
import numpy as np
import torch
torch.set_num_threads({num_threads})
{bindings_import}
PID = os.getpid()
def log_failure(msg):
with open({error_log_repr}, "wt") as f:
f.write(msg)
sys.exit(1)
def check_result(completed_process):
if completed_process.returncode:
log_failure(f"Command failed: {{' '.join(completed_process.args)}}")
return completed_process
# =============================================================================
# == Check that subprocess matches parent =====================================
# =============================================================================
if os.path.realpath(sys.executable) != "{parent_interpreter}":
log_failure(
"Interpreter mismatch:\n"
f" {{os.path.realpath(sys.executable)}}\n vs.\n {parent_interpreter}"
)
if torch.__file__ != "{torch_file}":
log_failure(
"PyTorch does not match expected file:\n"
f" {{torch.__file__}}\n vs.\n {torch_file}"
)
# =============================================================================
# == User specified setup =====================================================
# =============================================================================
# Load serialized globals
{load_globals}
# User setup str
{setup}
for _ in range({warmup_number}):
{indented_stmt}
# =============================================================================
# == Callgrind management =====================================================
# =============================================================================
with open("{stat_log}", "wb") as stat_file:
# If many instances of callgrind are running at once, the output of
# `callgrind_control` may exceed 16kb which would cause `subprocess.PIPE`
# to deadlock. So instead we use a file.
callgrind_stat = check_result(subprocess.run(
["callgrind_control", "--stat"],
stdout=stat_file,
stderr=subprocess.STDOUT,
))
with open("{stat_log}", "rt") as stat_file:
stat_lines = stat_file.read().splitlines()
if f"PID {{PID}}: python {{__file__}}" not in stat_lines:
log_failure("Process does not appear to be running callgrind.")
gc.collect()
time.sleep(0.01)
# =============================================================================
# == User code block ==========================================================
# =============================================================================
for _ in range({repeats}):
callgrind_bindings._valgrind_toggle()
{blocked_stmt}
callgrind_bindings._valgrind_toggle_and_dump_stats()
gc.collect()
{baseline}
""").strip().format(
indented_stmt=textwrap.indent(task_spec.stmt, " " * 4),
blocked_stmt=block_stmt(task_spec.stmt, indent=4),
baseline=(pass_baseline if collect_baseline else ""),
number=number,
repeats=repeats,
load_globals=globals.construct(),
setup=task_spec.setup,
warmup_number=min(number, 10),
num_threads=task_spec.num_threads,
error_log_repr=repr(error_log),
stat_log=stat_log,
parent_interpreter=os.path.realpath(sys.executable),
torch_file=torch.__file__,
bindings_import=(
"import torch._C as callgrind_bindings" if bindings is None
else f"import {bindings.__name__} as callgrind_bindings"),
)
CALLGRIND_SINGLETON: Optional[_ValgrindWrapper] = None
def wrapper_singleton() -> _ValgrindWrapper:
global CALLGRIND_SINGLETON
if CALLGRIND_SINGLETON is None:
CALLGRIND_SINGLETON = _ValgrindWrapper()
return CALLGRIND_SINGLETON
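if __name__ == "__main__":
    # A hedged, self-contained sketch, not part of the library API: the public
    # entry point is `torch.utils.benchmark.Timer.collect_callgrind`, which
    # routes through `wrapper_singleton()` above. Running it requires valgrind
    # and callgrind_annotate to be installed; the workload below is arbitrary.
    from torch.utils.benchmark import Timer

    example_stats = Timer(
        stmt="torch.ones((8, 8)) + torch.ones((8, 8))",
    ).collect_callgrind(number=10)
    print(example_stats.as_standardized().stats(inclusive=False))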

File diff suppressed because it is too large