I am done

2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions


@@ -0,0 +1,222 @@
# mypy: allow-untyped-defs
import collections
import os
import shutil
import subprocess
try:
# no type stub for conda command line interface
import conda.cli.python_api # type: ignore[import]
from conda.cli.python_api import Commands as conda_commands
except ImportError:
# blas_compare.py will fail to import these when it's inside a conda env,
# but that's fine as it only wants the constants.
pass
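# Scratch root under which one conda sub-environment is created per BLAS
# configuration; it is wiped and recreated on every run.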
WORKING_ROOT = "/tmp/pytorch_blas_compare_environments"
MKL_2020_3 = "mkl_2020_3"
MKL_2020_0 = "mkl_2020_0"
OPEN_BLAS = "open_blas"
EIGEN = "eigen"
GENERIC_ENV_VARS = ("USE_CUDA=0", "USE_ROCM=0")
BASE_PKG_DEPS = (
"cmake",
"hypothesis",
"ninja",
"numpy",
"pyyaml",
"setuptools",
"typing_extensions",
)
SubEnvSpec = collections.namedtuple(
"SubEnvSpec", (
"generic_installs",
"special_installs",
"environment_variables",
# Validate install.
"expected_blas_symbols",
"expected_mkl_version",
))
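# One spec per BLAS backend under test. `special_installs` pins packages from a
# specific conda channel (e.g. MKL from the `intel` channel), while
# `expected_blas_symbols` / `expected_mkl_version` are checked after the build
# to verify that the intended BLAS was actually linked.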
SUB_ENVS = {
MKL_2020_3: SubEnvSpec(
generic_installs=(),
special_installs=("intel", ("mkl=2020.3", "mkl-include=2020.3")),
environment_variables=("BLAS=MKL",) + GENERIC_ENV_VARS,
expected_blas_symbols=("mkl_blas_sgemm",),
expected_mkl_version="2020.0.3",
),
MKL_2020_0: SubEnvSpec(
generic_installs=(),
special_installs=("intel", ("mkl=2020.0", "mkl-include=2020.0")),
environment_variables=("BLAS=MKL",) + GENERIC_ENV_VARS,
expected_blas_symbols=("mkl_blas_sgemm",),
expected_mkl_version="2020.0.0",
),
OPEN_BLAS: SubEnvSpec(
generic_installs=("openblas",),
special_installs=(),
environment_variables=("BLAS=OpenBLAS",) + GENERIC_ENV_VARS,
expected_blas_symbols=("exec_blas",),
expected_mkl_version=None,
),
# EIGEN: SubEnvSpec(
# generic_installs=(),
# special_installs=(),
# environment_variables=("BLAS=Eigen",) + GENERIC_ENV_VARS,
# expected_blas_symbols=(),
# ),
}
def conda_run(*args):
"""Convenience method."""
stdout, stderr, retcode = conda.cli.python_api.run_command(*args)
if retcode:
raise OSError(f"conda error: {str(args)} retcode: {retcode}\n{stderr}")
return stdout
def main():
if os.path.exists(WORKING_ROOT):
print("Cleaning: removing old working root.")
shutil.rmtree(WORKING_ROOT)
os.makedirs(WORKING_ROOT)
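# Locate the PyTorch checkout that contains this script; every sub-environment
# builds PyTorch from this source tree.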
git_root = subprocess.check_output(
"git rev-parse --show-toplevel",
shell=True,
cwd=os.path.dirname(os.path.realpath(__file__))
).decode("utf-8").strip()
for env_name, env_spec in SUB_ENVS.items():
env_path = os.path.join(WORKING_ROOT, env_name)
print(f"Creating env: {env_name}: ({env_path})")
conda_run(
conda_commands.CREATE,
"--no-default-packages",
"--prefix", env_path,
"python=3",
)
print("Testing that env can be activated:")
base_source = subprocess.run(
f"source activate {env_path}",
shell=True,
capture_output=True,
check=False,
)
if base_source.returncode:
raise OSError(
"Failed to source base environment:\n"
f" stdout: {base_source.stdout.decode('utf-8')}\n"
f" stderr: {base_source.stderr.decode('utf-8')}"
)
print("Installing packages:")
conda_run(
conda_commands.INSTALL,
"--prefix", env_path,
*(BASE_PKG_DEPS + env_spec.generic_installs)
)
if env_spec.special_installs:
channel, channel_deps = env_spec.special_installs
print(f"Installing packages from channel: {channel}")
conda_run(
conda_commands.INSTALL,
"--prefix", env_path,
"-c", channel, *channel_deps
)
if env_spec.environment_variables:
print("Setting environment variables.")
# This does not appear to be possible using the python API.
env_set = subprocess.run(
f"source activate {env_path} && "
f"conda env config vars set {' '.join(env_spec.environment_variables)}",
shell=True,
capture_output=True,
check=False,
)
if env_set.returncode:
raise OSError(
"Failed to set environment variables:\n"
f" stdout: {env_set.stdout.decode('utf-8')}\n"
f" stderr: {env_set.stderr.decode('utf-8')}"
)
# Check that they were actually set correctly.
actual_env_vars = subprocess.run(
f"source activate {env_path} && env",
shell=True,
capture_output=True,
check=True,
).stdout.decode("utf-8").strip().splitlines()
for e in env_spec.environment_variables:
assert e in actual_env_vars, f"{e} not in envs"
print(f"Building PyTorch for env: `{env_name}`")
# The `--cmake` flag forces CMake to re-run during each build so it picks
# up the new build config settings for this environment.
build_run = subprocess.run(
f"source activate {env_path} && "
f"cd {git_root} && "
"python setup.py install --cmake",
shell=True,
capture_output=True,
check=True,
)
print("Checking configuration:")
check_run = subprocess.run(
# Shameless abuse of `python -c ...`
f"source activate {env_path} && "
'python -c "'
"import torch;"
"from torch.utils.benchmark import Timer;"
"print(torch.__config__.show());"
"setup = 'x=torch.ones((128, 128));y=torch.ones((128, 128))';"
"counts = Timer('torch.mm(x, y)', setup).collect_callgrind(collect_baseline=False);"
"stats = counts.as_standardized().stats(inclusive=True);"
"print(stats.filter(lambda l: 'blas' in l.lower()))\"",
shell=True,
capture_output=True,
check=False,
)
if check_run.returncode:
raise OSError(
"Failed to set environment variables:\n"
f" stdout: {check_run.stdout.decode('utf-8')}\n"
f" stderr: {check_run.stderr.decode('utf-8')}"
)
check_run_stdout = check_run.stdout.decode('utf-8')
print(check_run_stdout)
for e in env_spec.environment_variables:
if "BLAS" in e:
assert e in check_run_stdout, f"PyTorch build did not respect `BLAS=...`: {e}"
for s in env_spec.expected_blas_symbols:
assert s in check_run_stdout, f"Expected BLAS symbol `{s}` not found in check output"
if env_spec.expected_mkl_version is not None:
assert f"- Intel(R) Math Kernel Library Version {env_spec.expected_mkl_version}" in check_run_stdout
print(f"Build complete: {env_name}")
if __name__ == "__main__":
main()


@@ -0,0 +1,99 @@
# mypy: allow-untyped-defs
"""Example of Timer and Compare APIs:
$ python -m examples.compare
"""
import pickle
import sys
import time
import torch
import torch.utils.benchmark as benchmark_utils
class FauxTorch:
"""Emulate different versions of pytorch.
In normal circumstances this would be done with multiple processes
writing serialized measurements, but this simplifies that model to
make the example clearer.
"""
def __init__(self, real_torch, extra_ns_per_element):
self._real_torch = real_torch
self._extra_ns_per_element = extra_ns_per_element
def extra_overhead(self, result):
# time.sleep has a ~65 us overhead, so only fake a
# per-element overhead if numel is large enough.
numel = int(result.numel())
if numel > 5000:
time.sleep(numel * self._extra_ns_per_element * 1e-9)
return result
def add(self, *args, **kwargs):
return self.extra_overhead(self._real_torch.add(*args, **kwargs))
def mul(self, *args, **kwargs):
return self.extra_overhead(self._real_torch.mul(*args, **kwargs))
def cat(self, *args, **kwargs):
return self.extra_overhead(self._real_torch.cat(*args, **kwargs))
def matmul(self, *args, **kwargs):
return self.extra_overhead(self._real_torch.matmul(*args, **kwargs))
def main():
tasks = [
("add", "add", "torch.add(x, y)"),
("add", "add (extra +0)", "torch.add(x, y + zero)"),
]
serialized_results = []
repeats = 2
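# Build one Timer per (branch, task, size, num_threads) combination. The
# "master" branch uses real torch; the other branches wrap it in FauxTorch to
# inject a synthetic per-element slowdown.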
timers = [
benchmark_utils.Timer(
stmt=stmt,
globals={
"torch": torch if branch == "master" else FauxTorch(torch, overhead_ns),
"x": torch.ones((size, 4)),
"y": torch.ones((1, 4)),
"zero": torch.zeros(()),
},
label=label,
sub_label=sub_label,
description=f"size: {size}",
env=branch,
num_threads=num_threads,
)
for branch, overhead_ns in [("master", None), ("my_branch", 1), ("severe_regression", 5)]
for label, sub_label, stmt in tasks
for size in [1, 10, 100, 1000, 10000, 50000]
for num_threads in [1, 4]
]
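# Run every timer `repeats` times and pickle each Measurement, mimicking
# separate benchmark runs that persist their results before comparison.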
for i, timer in enumerate(timers * repeats):
serialized_results.append(pickle.dumps(
timer.blocked_autorange(min_run_time=0.05)
))
print(f"\r{i + 1} / {len(timers) * repeats}", end="")
sys.stdout.flush()
print()
comparison = benchmark_utils.Compare([
pickle.loads(i) for i in serialized_results
])
print("== Unformatted " + "=" * 80 + "\n" + "/" * 95 + "\n")
comparison.print()
print("== Formatted " + "=" * 80 + "\n" + "/" * 93 + "\n")
comparison.trim_significant_figures()
comparison.colorize()
comparison.print()
if __name__ == "__main__":
main()


@@ -0,0 +1,86 @@
# mypy: allow-untyped-defs
"""Example of the Timer and Fuzzer APIs:
$ python -m examples.fuzzer
"""
import sys
import torch.utils.benchmark as benchmark_utils
def main():
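# Fuzz inputs for `x + y`: three log-uniform size parameters (k0-k2), a random
# dimensionality (2 with p=0.6, 3 with p=0.4), and tensors that are contiguous
# with probability 0.75, constrained to 64k-128k elements.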
add_fuzzer = benchmark_utils.Fuzzer(
parameters=[
[
benchmark_utils.FuzzedParameter(
name=f"k{i}",
minval=16,
maxval=16 * 1024,
distribution="loguniform",
) for i in range(3)
],
benchmark_utils.FuzzedParameter(
name="d",
distribution={2: 0.6, 3: 0.4},
),
],
tensors=[
[
benchmark_utils.FuzzedTensor(
name=name,
size=("k0", "k1", "k2"),
dim_parameter="d",
probability_contiguous=0.75,
min_elements=64 * 1024,
max_elements=128 * 1024,
) for name in ("x", "y")
],
],
seed=0,
)
n = 250
measurements = []
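# Each draw from the fuzzer yields random tensors plus the metadata (shape,
# memory order, contiguity) used to label the measurement below.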
for i, (tensors, tensor_properties, _) in enumerate(add_fuzzer.take(n=n)):
x, x_order = tensors["x"], str(tensor_properties["x"]["order"])
y, y_order = tensors["y"], str(tensor_properties["y"]["order"])
shape = ", ".join(tuple(f'{i:>4}' for i in x.shape))
description = "".join([
f"{x.numel():>7} | {shape:<16} | ",
f"{'contiguous' if x.is_contiguous() else x_order:<12} | ",
f"{'contiguous' if y.is_contiguous() else y_order:<12} | ",
])
timer = benchmark_utils.Timer(
stmt="x + y",
globals=tensors,
description=description,
)
measurements.append(timer.blocked_autorange(min_run_time=0.1))
measurements[-1].metadata = {"numel": x.numel()}
print(f"\r{i + 1} / {n}", end="")
sys.stdout.flush()
print()
# More string munging to make pretty output.
print(f"Average attempts per valid config: {1. / (1. - add_fuzzer.rejection_rate):.1f}")
def time_fn(m):
return m.median / m.metadata["numel"]
measurements.sort(key=time_fn)
template = f"{{:>6}}{' ' * 19}Size Shape{' ' * 13}X order Y order\n{'-' * 80}"
print(template.format("Best:"))
for m in measurements[:15]:
print(f"{time_fn(m) * 1e9:>4.1f} ns / element {m.description}")
print("\n" + template.format("Worst:"))
for m in measurements[-15:]:
print(f"{time_fn(m) * 1e9:>4.1f} ns / element {m.description}")
if __name__ == "__main__":
main()


@@ -0,0 +1,105 @@
# mypy: allow-untyped-defs
"""Example use of Timer and op fuzzers to measure kernel performance.
$ python -m examples.op_benchmark
"""
import numpy as np
import torch
from torch.utils.benchmark import Timer
from torch.utils.benchmark.op_fuzzers.binary import BinaryOpFuzzer
from torch.utils.benchmark.op_fuzzers.unary import UnaryOpFuzzer
import operator
_MEASURE_TIME = 1.0
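# Minimum measurement time (seconds) passed to Timer.blocked_autorange for each config.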
def assert_dicts_equal(dict_0, dict_1):
"""Builtin dict comparison will not compare numpy arrays.
e.g.
x = {"a": np.ones((2, 1))}
y = {"a": np.ones((2, 1))}
x == y  # Raises ValueError
"""
assert set(dict_0.keys()) == set(dict_1.keys())
assert all(np.all(v == dict_1[k]) for k, v in dict_0.items() if k != "dtype")
def run(n, stmt, fuzzer_cls):
float_iter = fuzzer_cls(seed=0, dtype=torch.float32).take(n)
int_iter = fuzzer_cls(seed=0, dtype=torch.int32).take(n)
raw_results = []
for i, (float_values, int_values) in enumerate(zip(float_iter, int_iter)):
float_tensors, float_tensor_params, float_params = float_values
int_tensors, int_tensor_params, int_params = int_values
# This benchmark assumes that the two fuzzers generate identically
# sized and strided Tensors, since the same seed is used.
assert_dicts_equal(float_params, int_params)
assert_dicts_equal(float_tensor_params["x"], int_tensor_params["x"])
float_measurement, int_measurement = (
Timer(
stmt,
globals=tensors,
).blocked_autorange(min_run_time=_MEASURE_TIME)
for tensors in (float_tensors, int_tensors)
)
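# Summarize each fuzzed tensor: shape (written as powers of two where exact),
# memory order if permuted, and step sizes if any dimension is strided.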
descriptions = []
for name in float_tensors:
shape_str = "(" + ", ".join([
f"2 ** {int(np.log2(i))}"
if 2 ** int(np.log2(i)) == i and i > 1
else str(i)
for i in float_tensors[name].shape
]) + ")"
order = float_tensor_params[name]["order"]
order_str = ("" if all(order == np.arange(len(order))) else str(tuple(order)))
steps = float_tensor_params[name]["steps"]
steps_str = str(steps) if sum(steps) > len(steps) else ""
descriptions.append((name, shape_str, order_str, steps_str))
raw_results.append((float_measurement, int_measurement, descriptions))
print(f"\r{i + 1} / {n}", end="")
print()
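# Convert medians to microseconds and compute the relative float/int gap
# (|diff| / mean); results are sorted by this gap so the least and most
# dtype-sensitive configs are printed below.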
parsed_results, name_len, shape_len, order_len, steps_len = [], 0, 0, 0, 0
for float_measurement, int_measurement, descriptions in raw_results:
t_float = float_measurement.median * 1e6
t_int = int_measurement.median * 1e6
rel_diff = abs(t_float - t_int) / (t_float + t_int) * 2
parsed_results.append((t_float, t_int, rel_diff, descriptions))
for name, shape, order, steps in descriptions:
name_len = max(name_len, len(name))
shape_len = max(shape_len, len(shape))
order_len = max(order_len, len(order))
steps_len = max(steps_len, len(steps))
parsed_results.sort(key=operator.itemgetter(2))
print(f"stmt: {stmt}")
print(f" diff faster{'':>17}{' ' * name_len} ", end="")
print(f"{'shape'.ljust(shape_len)}{'':>16}{'order'.ljust(order_len)}", end="")
print(f" steps\n{'-' * 100}")
for results, spacer in [(parsed_results[:10], "..."), (parsed_results[-10:], "")]:
for t_float, t_int, rel_diff, descriptions in results:
time_str = [f"{rel_diff * 100:>4.1f}% {'int' if t_int < t_float else 'float':<20}"]
time_str.extend(["".ljust(len(time_str[0])) for _ in descriptions[:-1]])
for t_str, (name, shape, order, steps) in zip(time_str, descriptions):
name = f"{name}:".ljust(name_len + 1)
shape = shape.ljust(shape_len + 10)
order = order.ljust(order_len)
print(f"{t_str} {name} {shape}| {order} | {steps}")
print(spacer)
def main():
run(n=100, stmt="torch.median(x, dim=0)", fuzzer_cls=UnaryOpFuzzer)
run(n=100, stmt="torch.square(x)", fuzzer_cls=UnaryOpFuzzer)
run(n=100, stmt="x + y", fuzzer_cls=BinaryOpFuzzer)
if __name__ == "__main__":
main()


@@ -0,0 +1,26 @@
# mypy: allow-untyped-defs
"""Trivial use of Timer API:
$ python -m examples.simple_timeit
"""
import torch
import torch.utils.benchmark as benchmark_utils
def main():
timer = benchmark_utils.Timer(
stmt="x + y",
globals={"x": torch.ones((4, 8)), "y": torch.ones((1, 8))},
label="Broadcasting add (4x8)",
)
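# timeit(10000) runs the statement a fixed number of times (mirroring the
# stdlib timeit API), while blocked_autorange() chooses the number of runs
# itself and keeps measuring until enough runtime has been collected.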
for i in range(3):
print(f"Run: {i}\n{'-' * 40}")
print(f"timeit:\n{timer.timeit(10000)}\n")
print(f"autorange:\n{timer.blocked_autorange()}\n\n")
if __name__ == "__main__":
main()


@@ -0,0 +1,114 @@
# mypy: allow-untyped-defs
"""Microbenchmarks for the torch.fft module"""
from argparse import ArgumentParser
from collections import namedtuple
from collections.abc import Iterable
import torch
import torch.fft
from torch.utils import benchmark
from torch.utils.benchmark.op_fuzzers.spectral import SpectralOpFuzzer
def _dim_options(ndim):
if ndim == 1:
return [None]
elif ndim == 2:
return [0, 1, None]
elif ndim == 3:
return [0, 1, 2, (0, 1), (0, 2), None]
raise ValueError(f"Expected ndim in range 1-3, got {ndim}")
def run_benchmark(name: str, function: object, dtype: torch.dtype, seed: int, device: str, samples: int,
probability_regular: float):
cuda = device == 'cuda'
spectral_fuzzer = SpectralOpFuzzer(seed=seed, dtype=dtype, cuda=cuda,
probability_regular=probability_regular)
results = []
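# The fuzzer yields `samples` random spectral-op inputs; each one is timed over
# every valid `dim` option and thread count (CUDA runs use a single thread count).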
for tensors, tensor_params, params in spectral_fuzzer.take(samples):
shape = [params['k0'], params['k1'], params['k2']][:params['ndim']]
str_shape = ' x '.join([f"{s:<4}" for s in shape])
sub_label = f"{str_shape} {'' if tensor_params['x']['is_contiguous'] else '(discontiguous)'}"
for dim in _dim_options(params['ndim']):
for nthreads in (1, 4, 16) if not cuda else (1,):
measurement = benchmark.Timer(
stmt='func(x, dim=dim)',
globals={'func': function, 'x': tensors['x'], 'dim': dim},
label=f"{name}_{device}",
sub_label=sub_label,
description=f"dim={dim}",
num_threads=nthreads,
).blocked_autorange(min_run_time=1)
measurement.metadata = {
'name': name,
'device': device,
'dim': dim,
'shape': shape,
}
measurement.metadata.update(tensor_params['x'])
results.append(measurement)
return results
Benchmark = namedtuple('Benchmark', ['name', 'function', 'dtype'])
BENCHMARKS = [
Benchmark('fft_real', torch.fft.fftn, torch.float32),
Benchmark('fft_complex', torch.fft.fftn, torch.complex64),
Benchmark('ifft', torch.fft.ifftn, torch.complex64),
Benchmark('rfft', torch.fft.rfftn, torch.float32),
Benchmark('irfft', torch.fft.irfftn, torch.complex64),
]
BENCHMARK_MAP = {b.name: b for b in BENCHMARKS}
BENCHMARK_NAMES = [b.name for b in BENCHMARKS]
DEVICE_NAMES = ['cpu', 'cuda']
def _output_csv(file, results):
file.write('benchmark,device,num_threads,numel,shape,contiguous,dim,mean (us),median (us),iqr (us)\n')
for measurement in results:
metadata = measurement.metadata
device, dim, shape, name, numel, contiguous = (
metadata['device'], metadata['dim'], metadata['shape'],
metadata['name'], metadata['numel'], metadata['is_contiguous'])
if isinstance(dim, Iterable):
dim_str = '-'.join(str(d) for d in dim)
else:
dim_str = str(dim)
shape_str = 'x'.join(str(s) for s in shape)
print(name, device, measurement.task_spec.num_threads, numel, shape_str, contiguous, dim_str, # type: ignore[possibly-undefined]
measurement.mean * 1e6, measurement.median * 1e6, measurement.iqr * 1e6,
sep=',', file=file)
if __name__ == '__main__':
parser = ArgumentParser(description=__doc__)
parser.add_argument('--device', type=str, choices=DEVICE_NAMES, nargs='+', default=DEVICE_NAMES)
parser.add_argument('--bench', type=str, choices=BENCHMARK_NAMES, nargs='+', default=BENCHMARK_NAMES)
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--samples', type=int, default=10)
parser.add_argument('--probability-regular', '--probability_regular', type=float, default=1.0)
parser.add_argument('-o', '--output', type=str)
args = parser.parse_args()
num_benchmarks = len(args.device) * len(args.bench)
i = 0
results = []
for device in args.device:
for bench in (BENCHMARK_MAP[b] for b in args.bench):
results += run_benchmark(
name=bench.name, function=bench.function, dtype=bench.dtype,
seed=args.seed, device=device, samples=args.samples,
probability_regular=args.probability_regular)
i += 1
print(f'Completed {bench.name} benchmark on {device} ({i} of {num_benchmarks})')
if args.output is not None:
with open(args.output, 'w') as f:
_output_csv(f, results)
compare = benchmark.Compare(results)
compare.trim_significant_figures()
compare.colorize()
compare.print()