I am done

This commit is contained in:
2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions

View File

@ -0,0 +1,50 @@
# mypy: allow-untyped-defs
r"""
PyTorch Profiler is a tool that allows the collection of performance metrics during training and inference.
Profiler's context manager API can be used to better understand what model operators are the most expensive,
examine their input shapes and stack traces, study device kernel activity and visualize the execution trace.
.. note::
An earlier version of the API in :mod:`torch.autograd` module is considered legacy and will be deprecated.
"""
import os
from torch._C._autograd import _supported_activities, DeviceType, kineto_available
from torch._C._profiler import _ExperimentalConfig, ProfilerActivity, RecordScope
from torch.autograd.profiler import KinetoStepTracker, record_function
from torch.optim.optimizer import register_optimizer_step_post_hook
from .profiler import (
_KinetoProfile,
ExecutionTraceObserver,
profile,
ProfilerAction,
schedule,
supported_activities,
tensorboard_trace_handler,
)
__all__ = [
"profile",
"schedule",
"supported_activities",
"tensorboard_trace_handler",
"ProfilerAction",
"ProfilerActivity",
"kineto_available",
"DeviceType",
"record_function",
"ExecutionTraceObserver",
]
from . import itt
def _optimizer_post_hook(optimizer, args, kwargs):
    """Optimizer step post-hook that increments the "Optimizer" step counter.

    The three parameters are accepted so the function can be registered as a
    post-hook; none of them is read.
    """
    KinetoStepTracker.increment_step("Optimizer")


# The hook is installed only when KINETO_USE_DAEMON is set to a non-empty value.
if os.environ.get("KINETO_USE_DAEMON"):
    _ = register_optimizer_step_post_hook(_optimizer_post_hook)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,663 @@
# mypy: allow-untyped-defs
import json
import math
import os
import re
from typing import Dict, List, Optional, Set
import torch
import torch.utils.benchmark as benchmark
from torch._C._profiler import (
_EventType,
_ExtraFields_PyCall,
_ExtraFields_PyCCall,
_ExtraFields_TorchOp,
_ProfilerEvent,
)
from torch.profiler import profile
from torch.profiler._utils import index_of_first_match, traverse_bfs, traverse_dfs
class Pattern:
    """
    Base class for all patterns, subclass this class and implement match()
    to define custom patterns.

    In subclass, define description and skip property.
    """

    def __init__(self, prof: profile, should_benchmark: bool = False):
        """
        Capture the profiler's event tree and index its root events by thread id.

        Args:
            prof: profiler whose ``prof.profiler.kineto_results`` is populated.
            should_benchmark: when True, ``summary`` uses ``benchmark_summary``
                if the subclass defines a ``benchmark`` method.
        """
        self.prof = prof
        self.should_benchmark = should_benchmark
        self.name = "Please specify a name for pattern"
        self.description = "Please specify a description for pattern"
        self.url = ""
        assert prof.profiler is not None and prof.profiler.kineto_results is not None
        self.event_tree = prof.profiler.kineto_results.experimental_event_tree()
        # Root events grouped by the thread id they started on; used by
        # siblings_of() for events that have no parent.
        self.tid_root: Dict[int, List[_ProfilerEvent]] = {}
        for event in self.event_tree:
            self.tid_root.setdefault(event.start_tid, []).append(event)

    @property
    def skip(self):
        """Whether matching should be skipped entirely (never, for the base class)."""
        return False

    def report(self, event: _ProfilerEvent):
        """Return the description followed by the source location of ``event``."""
        msg = (
            f"{self.description}\n[Source Code Location] {source_code_location(event)}"
        )
        return msg

    def eventTreeTraversal(self):
        """
        Traverse the event tree and yield all events.
        Override this method in subclass to customize the traversal.
        """
        yield from traverse_dfs(self.event_tree)

    def summary(self, events: List[_ProfilerEvent]):
        """Return a one-line summary for the matched ``events``."""
        default_summary = f"{self.name}: {len(events)} events matched."
        if self.should_benchmark:
            # If benchmark summary is not empty, use it.
            return (
                self.benchmark_summary(events)
                if hasattr(self, "benchmark")  # type: ignore[attr-defined]
                else default_summary
            )
        return default_summary

    def benchmark_summary(self, events: List[_ProfilerEvent]):
        """
        Summarize the estimated speedup using the subclass's ``benchmark``.

        ``benchmark(events)`` must return a mapping from ``input_shapes(event)``
        to a time factor; each event's duration is scaled by its factor to
        estimate the time after the suggested change.
        """

        def format_time(time_ns: float):
            unit_lst = ["ns", "us", "ms"]
            for unit in unit_lst:
                if time_ns < 1000:
                    return f"{time_ns:.2f} {unit}"
                # True division: floor division would drop the fraction that
                # the two-decimal format is meant to display.
                time_ns /= 1000
            return f"{time_ns:.2f} s"

        assert hasattr(self, "benchmark"), "Please implement benchmark()"
        shapes_factor_map = self.benchmark(events)  # type: ignore[attr-defined]
        original_time = sum(event.duration_time_ns for event in events)
        new_time = sum(
            shapes_factor_map[input_shapes(event)] * event.duration_time_ns
            for event in events
        )
        # Guard the ratio: an estimated time of zero would otherwise raise
        # ZeroDivisionError.
        speedup = round(original_time / new_time, 2) if new_time else float("inf")
        return (
            f"{self.name}: {len(events)} events matched. "
            f"Total Estimated Speedup: {format_time(original_time - new_time)} ({speedup}X)"
        )

    def match(self, event: _ProfilerEvent):
        """
        Return True if the event matches the pattern.
        This method should be overridden in subclass.
        """
        raise NotImplementedError

    def matched_events(self):
        """Return every traversed event for which ``match`` is true (none if ``skip``)."""
        if self.skip:
            return []
        return [event for event in self.eventTreeTraversal() if self.match(event)]

    def root_of(self, event: _ProfilerEvent):
        """Follow ``parent`` links from ``event`` and return the topmost event."""
        while event.parent:
            event = event.parent
        return event

    def siblings_of(self, event: _ProfilerEvent):
        """Return ``(earlier_siblings, later_siblings)`` of ``event``."""
        if event.parent:
            children = event.parent.children
        else:
            # Root events have no parent; their siblings are the other roots
            # recorded for the same thread id.
            children = self.tid_root[event.start_tid]
        index = children.index(event)
        return children[:index], children[index + 1 :]

    def next_of(self, event: _ProfilerEvent):
        """Return the sibling right after ``event``, or None."""
        _, next_events = self.siblings_of(event)
        return next_events[0] if next_events else None

    def prev_of(self, event: _ProfilerEvent):
        """Return the sibling right before ``event``, or None."""
        prev_events, _ = self.siblings_of(event)
        return prev_events[-1] if prev_events else None

    def go_up_until(self, event: _ProfilerEvent, predicate):
        """
        Return ``event`` or its nearest ancestor satisfying ``predicate``.

        Returns None when ``event`` is falsy or when no event on the path to
        the root satisfies the predicate. The root is no longer returned
        unconditionally.
        """
        while event:
            if predicate(event):
                return event
            event = event.parent
        return None
# Patterns
class NamePattern(Pattern):
    """Pattern matching every event whose name is hit by the regex ``name``."""

    def __init__(self, prof: profile, name: str, should_benchmark: bool = False):
        super().__init__(prof, should_benchmark)
        self.name = name
        self.description = f"Matched Name Event: {name}"

    def match(self, event: _ProfilerEvent):
        """Return True when ``re.search`` finds ``self.name`` in the event name."""
        found = re.search(self.name, event.name)
        return found is not None
class ExtraCUDACopyPattern(Pattern):
    """
    Detects a constant tensor that is created on CPU and then immediately
    moved to GPU.

    example: torch.zeros((100, 100)).to("cuda")

    Pattern:
    built-in method                 |built-in method
        ...                         |    aten::to
            aten::fill_/aten::zero_ |        aten::_to_copy

    Algorithm:
    Starting from an aten::to node, check that its last-child chain is
    aten::_to_copy -> aten::copy_ with matching dtypes on the first two
    arguments. Then move to the previous sibling of the aten::to node's parent
    and walk down its last children looking for one of the init ops.
    Any failed step means no match.
    """

    def __init__(self, prof: profile, should_benchmark: bool = False):
        super().__init__(prof, should_benchmark)
        self.name = "Extra CUDA Copy Pattern"
        self.description = "Filled a CPU tensor and immediately moved it to GPU. Please initialize it on GPU."
        self.url = "https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#create-tensors-directly-on-the-target-device"
        self.init_ops = {
            "aten::fill_",
            "aten::zero_",
            "aten::normal_",
            "aten::uniform_",
        }

    @property
    def skip(self):
        """Skip unless the profile recorded both stacks and shapes."""
        return not (self.prof.with_stack and self.prof.record_shapes)

    def match(self, event):
        # TODO: We should also check tensor identities
        if event.name != "aten::to":
            return False
        to_event = event

        # Descend aten::to -> aten::_to_copy -> aten::copy_, always taking the
        # last child.
        node = event
        for expected_name in ("aten::_to_copy", "aten::copy_"):
            if not node.children:
                return False
            node = node.children[-1]
            if node.name != expected_name:
                return False

        # aten::copy_ should have the first 2 args dtype the same
        dtypes = input_dtypes(node)
        if len(dtypes) < 2:
            return False
        if dtypes[0] is None or dtypes[0] != dtypes[1]:
            return False

        # Up one level from aten::to, then over to the previous sibling.
        enclosing = to_event.parent
        if enclosing is None:
            return False
        cursor = self.prev_of(enclosing)
        if cursor is None:
            return False

        # Walk down the last children; aten::zero_ is a special optimization
        # case where fill_ is not called, so any init op on the way counts.
        while cursor.children:
            cursor = cursor.children[-1]
            if cursor.name in self.init_ops:
                return True
        return cursor.name in self.init_ops
        # TODO: Check if tensor is reused

    def benchmark(self, events: List[_ProfilerEvent]):
        """Time CPU-create-then-move against direct GPU creation for each input shape."""
        shapes_factor_map = {input_shapes(event): 0.0 for event in events}
        for shape in shapes_factor_map:
            size = shape[0]
            timer_globals = {"size": size}
            to_timer = benchmark.Timer(
                stmt='torch.ones(size).to("cuda")', globals=timer_globals
            )
            de_timer = benchmark.Timer(
                stmt='torch.ones(size, device="cuda")', globals={"size": size}
            )
            to_time = to_timer.timeit(10).mean
            de_time = de_timer.timeit(10).mean
            shapes_factor_map[shape] = de_time / to_time
        return shapes_factor_map
class ForLoopIndexingPattern(Pattern):
    """
    Detects a for loop that indexes a tensor element by element where the
    work could be vectorized.

    example:
    tensor = torch.empty((100, 100))
    for i in range(100):
        tensor[i] = i

    Pattern:
    aten::select | ... | aten::select | ... (Repeat)

    Algorithm:
    Starting at an aten::select node, take the ops up to the next
    aten::select among its later siblings as one "iteration", then count how
    many times that same op-name sequence repeats back to back.
    Events already counted as part of a repetition are remembered so the
    same loop is not reported again.
    """

    def __init__(self, prof: profile, should_benchmark: bool = False):
        super().__init__(prof, should_benchmark)
        self.name = "For Loop Indexing Pattern"
        self.description = "For loop indexing detected. Vectorization recommended."
        self.visited: Set[int] = set()

    def eventTreeTraversal(self):
        """
        We need to use BFS traversal order to avoid duplicate match.
        """
        yield from traverse_bfs(self.event_tree)

    def match(self, event: _ProfilerEvent):
        if event.name != "aten::select" or event.id in self.visited:
            return False

        _, following = self.siblings_of(event)
        if len(following) <= 1:
            return False

        # Custom event list matching: same length and same op names pairwise.
        def same_ops(list1, list2):
            if len(list1) != len(list2):
                return False
            return all(op1.name == op2.name for op1, op2 in zip(list1, list2))

        # Record the ops between two aten::select
        next_select_idx = index_of_first_match(
            following, lambda e: e.name == "aten::select"
        )
        if next_select_idx is None:
            return False
        indexing_ops = [event] + following[:next_select_idx]
        stride = len(indexing_ops)
        following = following[stride - 1 :]

        repeat_count = 1
        for offset in range(0, len(following), stride):
            if not same_ops(indexing_ops, following[offset : offset + stride]):
                break
            repeat_count += 1
            self.visited.add(following[offset].id)
        return repeat_count >= 10
class FP32MatMulPattern(Pattern):
    """
    Flags aten::mm ops that ran with ``allow_tf32_cublas`` disabled.

    The pattern is skipped on HIP builds, when any entry of
    ``torch.cuda.get_arch_list()`` is below compute capability 80, or when
    shapes were not recorded.
    """

    def __init__(self, prof: profile, should_benchmark: bool = False):
        super().__init__(prof, should_benchmark)
        self.name = "FP32 MatMul Pattern"
        self.description = (
            "You are currently using GPU that supports TF32. "
            "Please enable TF32 by setting 'torch.backends.cuda.matmul.allow_tf32 = True'"
        )
        self.url = "https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"

    @staticmethod
    def _compute_capability(arch: str) -> int:
        """
        Extract the numeric capability from an arch string.

        Handles entries such as "sm_80", "compute_80" and suffixed variants
        like "sm_90a"; returns 0 when no digits are present.
        """
        digits = re.search(r"\d+", arch)
        return int(digits.group()) if digits else 0

    @property
    def skip(self):
        if torch.version.hip is not None:
            has_tf32 = False
        else:
            # Anything less than sm_80 is not Ampere which doesn't support TF32.
            # Parse the digits rather than slicing a fixed "sm_" prefix so that
            # other entry formats do not raise ValueError.
            has_tf32 = all(
                self._compute_capability(arch) >= 80
                for arch in torch.cuda.get_arch_list()
            )
        return has_tf32 is False or super().skip or not self.prof.record_shapes

    def match(self, event: _ProfilerEvent):
        # If we saw this pattern once, we don't need to match it again
        if event.tag != _EventType.TorchOp:
            return False
        assert isinstance(event.extra_fields, _ExtraFields_TorchOp)
        if event.name == "aten::mm":
            if event.extra_fields.allow_tf32_cublas is False:
                return True
        return False

    def report(self, event: _ProfilerEvent):
        """Return only the description; no source location is appended."""
        return self.description

    def benchmark(self, events: List[_ProfilerEvent]):
        """
        Time FP32 against TF32 ``torch.mm`` for each distinct input shape.

        Returns a mapping from input shapes to ``tf32_time / fp32_time``.
        The process-wide ``allow_tf32`` flag is toggled while timing and is
        restored to its previous value afterwards, even on error.
        """
        shapes_factor_map = {input_shapes(event): 0.0 for event in events}
        previous_allow_tf32 = torch.backends.cuda.matmul.allow_tf32
        try:
            for shape in shapes_factor_map:
                matrixA = torch.randn(shape[0], device="cuda", dtype=torch.float32)
                matrixB = torch.randn(shape[1], device="cuda", dtype=torch.float32)
                fp32_timer = benchmark.Timer(
                    stmt="torch.mm(matrixA, matrixB)",
                    globals={"matrixA": matrixA, "matrixB": matrixB},
                )
                tf32_timer = benchmark.Timer(
                    stmt="torch.mm(matrixA, matrixB)",
                    setup="torch.backends.cuda.matmul.allow_tf32 = True",
                    globals={"matrixA": matrixA, "matrixB": matrixB},
                )
                # Force FP32 for the baseline; the TF32 timer's setup statement
                # enables TF32 itself.
                torch.backends.cuda.matmul.allow_tf32 = False
                fp32_time = fp32_timer.timeit(10).mean
                tf32_time = tf32_timer.timeit(10).mean
                shapes_factor_map[shape] = tf32_time / fp32_time
        finally:
            torch.backends.cuda.matmul.allow_tf32 = previous_allow_tf32
        return shapes_factor_map
class OptimizerSingleTensorPattern(Pattern):
    """
    Detects use of the single-tensor implementation of an optimizer.

    example:
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    Passing foreach=True enables the multi-tensor optimizer, which can be
    faster when the kernels are relatively small.

    Pattern:
    XXXXX: _single_tensor_<OPTIMIZER_NAME>

    Algorithm:
    String match on the end of the event name.
    """

    def __init__(self, prof: profile, should_benchmark: bool = False):
        super().__init__(prof, should_benchmark)
        self.name = "Optimizer Single Tensor Pattern"
        self.optimizers_with_foreach = ["adam", "sgd", "adamw"]
        self.description = (
            "Deteced optimizer running with single tensor implementation. "
            "Please enable multi tensor implementation by passing 'foreach=True' into optimizer."
        )
        self.url = ""

    def match(self, event: _ProfilerEvent):
        """Return True if the event name ends with ``_single_tensor_<optimizer>``."""
        return any(
            event.name.endswith(f"_single_tensor_{optimizer}")
            for optimizer in self.optimizers_with_foreach
        )
class SynchronizedDataLoaderPattern(Pattern):
    """
    Detects a DataLoader that is used with num_workers=0.

    example:
    torch.utils.data.DataLoader(dataset, batch_size=batch_size)
    Add num_workers=N to the arguments. N depends on system configuration.

    Pattern:
    dataloader.py(...): __iter__
        dataloader.py(...): _get_iterator
            NOT dataloader.py(...): check_worker_number_rationality

    Algorithm:
    If check_worker_number_rationality is not the first call under
    _get_iterator inside the dataloader's __iter__, the dataloader is not
    asynchronous.
    """

    def __init__(self, prof: profile, should_benchmark: bool = False):
        super().__init__(prof, should_benchmark)
        self.name = "Synchronized DataLoader Pattern"
        self.description = (
            "Detected DataLoader running with synchronized implementation. "
            "Please enable asynchronous dataloading by setting num_workers > 0 when initializing DataLoader."
        )
        self.url = (
            "https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html"
            "#enable-async-data-loading-and-augmentation"
        )

    def match(self, event: _ProfilerEvent):
        dataloader_prefix = os.path.join("torch", "utils", "data", "dataloader.py")

        def is_dataloader_function(name: str, function_name: str):
            return name.startswith(dataloader_prefix) and name.endswith(function_name)

        # TODO: fixme! Due to lifetime issues of the function name, this field might
        # actually point to an already freed string when the event is a PyCall.
        # Just silently skip this to unblock testing.
        try:
            event.name
        except UnicodeDecodeError:
            return False

        # Follow first children: __iter__ -> _get_iterator -> <first call>.
        node = event
        for function_name in ("__iter__", "_get_iterator"):
            if not is_dataloader_function(node.name, function_name):
                return False
            if not node.children:
                return False
            node = node.children[0]
        return not is_dataloader_function(node.name, "check_worker_number_rationality")
        # TODO: We should also check if the loader is bottleneck.
class GradNotSetToNonePattern(Pattern):
    """
    Detects zero_grad calls that zero gradients instead of setting them to None.

    example:
    optimizer.zero_grad()
    Setting set_to_none=True can give a speedup.

    Pattern:
    XXXXX: _zero_grad
        NOT aten::zeros
            aten::zero_

    aten::zero_ is called on each parameter in the model.
    An aten::zero_ whose parent is aten::zeros is not counted.

    Algorithm:
    String match
    """

    def __init__(self, prof: profile, should_benchmark: bool = False):
        super().__init__(prof, should_benchmark)
        self.name = "Gradient Set To Zero Instead of None Pattern"
        self.description = (
            "Detected gradient set to zero instead of None. "
            "Please add 'set_to_none=True' when calling zero_grad()."
        )
        self.url = (
            "https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html"
            "#disable-gradient-calculation-for-validation-or-inference"
        )

    def match(self, event: _ProfilerEvent):
        if not (event.name.endswith(": zero_grad") and event.children):
            return False
        # TODO: We should also check if the optimizer's numerical behavior will change.
        return any(
            descendant.name == "aten::zero_"
            and descendant.parent.name != "aten::zeros"
            for descendant in traverse_dfs(event.children)
        )
class Conv2dBiasFollowedByBatchNorm2dPattern(Pattern):
    """
    Detects a Conv2d with bias enabled that is directly followed by BatchNorm2d.
    Bias doesn't do anything when followed by batchnorm.

    Pattern:
    nn.Module: Conv2d            | nn.Module: BatchNorm2d
        ...
            aten::conv2d AND dtype of third argument is not null
    The third argument is the bias

    Algorithm:
    String match
    """

    def __init__(self, prof: profile, should_benchmark: bool = False):
        super().__init__(prof, should_benchmark)
        self.name = "Enabling Bias in Conv2d Followed By BatchNorm Pattern"
        self.description = "Detected bias enabled in Conv2d that is followed by BatchNorm2d. Please set 'bias=False' in Conv2d."
        self.url = (
            "https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html"
            "#disable-bias-for-convolutions-directly-followed-by-a-batch-norm"
        )

    @property
    def skip(self):
        """Skip when shapes were explicitly not recorded, else defer to the base class."""
        if self.prof.record_shapes is False:
            return True
        return super().skip

    def match(self, event: _ProfilerEvent):
        if event.name != "aten::conv2d":
            return False
        dtypes = input_dtypes(event)
        if len(dtypes) < 3 or dtypes[2] is None:
            return False
        # This means bias=True
        conv_module = self.go_up_until(
            event, lambda e: e.name.startswith("nn.Module: Conv2d")
        )
        if not conv_module:
            return False
        following = self.next_of(conv_module)
        if not following:
            return False
        return following.name.startswith("nn.Module: BatchNorm2d")
class MatMulDimInFP16Pattern(Pattern):
    """
    Flags half/bfloat16 matmuls (aten::mm, aten::bmm, aten::addmm) where some
    input has one of its last two dimensions not divisible by 8.
    """

    def __init__(self, prof: profile, should_benchmark: bool = False):
        super().__init__(prof, should_benchmark)
        self.name = "Matrix Multiplication Dimension Not Aligned Pattern"
        self.description = "Detected matmul with dimension not aligned. Please use matmul with aligned dimension."
        self.url = "https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#use-mixed-precision-and-amp"

    @property
    def skip(self):
        """Skip unless the profile recorded both stacks and shapes."""
        return not (self.prof.with_stack and self.prof.record_shapes)

    def match(self, event: _ProfilerEvent):
        def multiple_of(shapes, multiple):
            # Only the trailing two dimensions of every input are checked.
            return all(dim % multiple == 0 for shape in shapes for dim in shape[-2:])

        if event.name not in ("aten::mm", "aten::bmm", "aten::addmm"):
            return False
        dtypes = input_dtypes(event)
        if not dtypes:
            return False
        low_precision = dtypes[0] in (torch.bfloat16, torch.half)
        return low_precision and not multiple_of(input_shapes(event), 8)

    def benchmark(self, events: List[_ProfilerEvent]):
        """Time ``torch.mm`` on the recorded shapes against shapes rounded up to multiples of 8."""

        def closest_multiple(shapes, multiple):
            return [multiple * math.ceil(shape / multiple) for shape in shapes]

        shapes_factor_map = {input_shapes(event): 0.0 for event in events}
        for shape in shapes_factor_map:
            # Timer on the shapes exactly as recorded.
            matrixA = torch.randn(shape[0], device="cuda", dtype=torch.float16)
            matrixB = torch.randn(shape[1], device="cuda", dtype=torch.float16)
            not_aligned_dim_timer = benchmark.Timer(
                stmt="torch.mm(matrixA, matrixB)",
                globals={"matrixA": matrixA, "matrixB": matrixB},
            )
            # Timer on the shapes rounded up to the next multiple of 8.
            aligned_a = closest_multiple(shape[0], 8)
            aligned_b = closest_multiple(shape[1], 8)
            matrixA = torch.randn(aligned_a, device="cuda", dtype=torch.float16)
            matrixB = torch.randn(aligned_b, device="cuda", dtype=torch.float16)
            aligned_dim_timer = benchmark.Timer(
                stmt="torch.mm(matrixA, matrixB)",
                globals={"matrixA": matrixA, "matrixB": matrixB},
            )
            not_aligned_dim_time = not_aligned_dim_timer.timeit(10).mean
            aligned_dim_time = aligned_dim_timer.timeit(10).mean
            shapes_factor_map[shape] = aligned_dim_time / not_aligned_dim_time
        return shapes_factor_map
def source_code_location(event: Optional[_ProfilerEvent]):
    """
    Return "file:line" of the nearest Python caller outside the torch package.

    Walks from ``event`` up through its parents. For PyCall / PyCCall events
    the caller's file name is inspected; the first one that does not start
    with "torch" + os.sep wins. Falls back to a fixed message when none is found.
    """
    node = event
    while node:
        if node.tag == _EventType.PyCall or node.tag == _EventType.PyCCall:
            assert isinstance(
                node.extra_fields, (_ExtraFields_PyCall, _ExtraFields_PyCCall)
            )
            caller = node.extra_fields.caller
            if not caller.file_name.startswith("torch" + os.sep):
                return f"{caller.file_name}:{caller.line_number}"
        node = node.parent
    return "No source code location found"
def input_shapes(event: _ProfilerEvent):
    """Return a tuple with one size tuple per op input (empty tuple for inputs lacking ``sizes``)."""
    assert isinstance(event.extra_fields, _ExtraFields_TorchOp)
    shapes = []
    for op_input in event.extra_fields.inputs:
        shapes.append(tuple(getattr(op_input, "sizes", ())))
    return tuple(shapes)
def input_dtypes(event: _ProfilerEvent):
    """Return a tuple with one dtype per op input (None for inputs lacking ``dtype``)."""
    assert isinstance(event.extra_fields, _ExtraFields_TorchOp)
    dtypes = []
    for op_input in event.extra_fields.inputs:
        dtypes.append(getattr(op_input, "dtype", None))
    return tuple(dtypes)
def report_all_anti_patterns(
    prof,
    should_benchmark: bool = False,
    print_enable: bool = True,
    json_report_dir: Optional[str] = None,
):
    """
    Run every enabled anti-pattern over ``prof`` and report the matches.

    Args:
        prof: profiler object handed to each pattern's constructor.
        should_benchmark: forwarded to each pattern; enables benchmark summaries.
        print_enable: when True, print the assembled text report.
        json_report_dir: when given, write/merge ``torchtidy_report.json`` in
            that directory, keyed by source file.

    Each distinct report message is emitted once. A JSON entry is recorded only
    when the event's source location has the form ``<file>:<line>``.
    """
    report_dict: Dict = {}
    anti_patterns = [
        ExtraCUDACopyPattern(prof, should_benchmark),
        # ForLoopIndexingPattern(prof, should_benchmark),
        FP32MatMulPattern(prof, should_benchmark),
        OptimizerSingleTensorPattern(prof, should_benchmark),
        SynchronizedDataLoaderPattern(prof, should_benchmark),
        GradNotSetToNonePattern(prof, should_benchmark),
        Conv2dBiasFollowedByBatchNorm2dPattern(prof, should_benchmark),
        MatMulDimInFP16Pattern(prof, should_benchmark),
    ]
    reported = set()
    summaries = []
    message_list = [f"{'-'*40}TorchTidy Report{'-'*40}"]
    message_list.append("Matched Events:")

    for anti_pattern in anti_patterns:
        matched_events = anti_pattern.matched_events()
        if not matched_events:
            continue
        summaries.append(anti_pattern.summary(matched_events))
        for event in matched_events:
            report_msg = anti_pattern.report(event)
            if report_msg in reported:
                continue
            message_list.append(report_msg)
            reported.add(report_msg)
            # Split on the LAST colon so file names that contain colons
            # (e.g. a drive letter) stay intact, and tolerate the
            # "No source code location found" fallback, which has no colon.
            src_location, sep, line_no = source_code_location(event).rpartition(":")
            if not sep or not line_no.isdigit():
                continue
            report_dict.setdefault(src_location, []).append(
                {
                    "line_number": int(line_no),
                    "name": anti_pattern.name,
                    "url": anti_pattern.url,
                    "message": anti_pattern.description,
                }
            )

    if json_report_dir is not None:
        json_report_path = os.path.join(json_report_dir, "torchtidy_report.json")
        if os.path.exists(json_report_path):
            # Merge into an existing report; entries for the same source file
            # are replaced by the new ones.
            with open(json_report_path) as f:
                exisiting_report = json.load(f)
                exisiting_report.update(report_dict)
                report_dict = exisiting_report
        with open(json_report_path, "w") as f:
            json.dump(report_dict, f, indent=4)

    message_list.append("Summary:")
    message_list += summaries
    message_list.append(f"{'-'*40}TorchTidy Report{'-'*40}")
    if print_enable:
        print("\n".join(message_list))

View File

@ -0,0 +1,385 @@
# mypy: allow-untyped-defs
import functools
import operator
import re
from collections import deque
from dataclasses import dataclass
from typing import Dict, List, TYPE_CHECKING
from torch.autograd.profiler import profile
from torch.profiler import DeviceType
if TYPE_CHECKING:
from torch.autograd import _KinetoEvent
def _traverse(tree, next_fn, children_fn=lambda x: x.children, reverse: bool = False):
    """
    Generic worklist traversal over a forest of events.

    Args:
        tree: iterable of root events.
        next_fn: takes the pending deque and removes/returns the next event.
        children_fn: returns the children of an event.
        reverse: when True, roots and children are queued in reversed order.

    Yields each event as it is taken from the worklist.
    """
    pending = deque(reversed(tree) if reverse else tree)
    while pending:
        node = next_fn(pending)
        yield node
        kids = children_fn(node)
        pending.extend(reversed(kids) if reverse else kids)


# Depth-first: take from the right end; reversed queuing keeps the first child first.
traverse_dfs = functools.partial(_traverse, next_fn=lambda x: x.pop(), reverse=True)
# Breadth-first: take from the left end, queue in natural order.
traverse_bfs = functools.partial(
    _traverse, next_fn=lambda x: x.popleft(), reverse=False
)
@dataclass
class EventMetrics:
    """Per-event timing figures, all in nanoseconds except ``queue_depth``."""

    duration_time_ns: int = 0
    self_time_ns: int = 0
    idle_time_ns: int = 0
    queue_depth: int = 0

    @property
    def fraction_idle_time(self):
        """Idle time divided by duration; 0.0 when the duration is zero."""
        return (
            self.idle_time_ns / self.duration_time_ns
            if self.duration_time_ns != 0
            else 0.0
        )
@dataclass
class Interval:
    """A ``[start, end]`` time span with an optional queue depth attached."""

    start: int
    end: int
    queue_depth: int = 0


class EventKey:
    """Hashable wrapper that identifies a profiler event by its ``id``."""

    def __init__(self, event):
        self.event = event

    def __hash__(self):
        return hash(self.event.id)

    def __eq__(self, other):
        # Comparing against a non-EventKey object yields "not equal" instead
        # of raising AttributeError.
        if not isinstance(other, EventKey):
            return NotImplemented
        return self.event.id == other.event.id

    def __repr__(self):
        return f"{self.event.name}"

    def intervals_overlap(self, intervals: List[Interval]):
        """
        Return how much of the event's time span is covered by ``intervals``.

        Overlapping intervals are counted once. The given Interval objects
        are not modified.

        Args:
            intervals: spans to intersect with
                ``[event.start_time_ns, event.end_time_ns]``; any order.

        Returns:
            Total covered time, in the same unit as the event timestamps.
        """
        overlap_time = 0
        # Right edge of everything already accounted for; later intervals are
        # clipped to start no earlier than this so shared time is not
        # double counted.
        covered_until = None
        for interval in sorted(intervals, key=lambda x: x.start):
            start = interval.start
            if covered_until is not None and covered_until > start:
                start = covered_until
            if covered_until is None or interval.end > covered_until:
                covered_until = interval.end

            overlap_start = max(self.event.start_time_ns, start)
            overlap_end = min(self.event.end_time_ns, interval.end)
            if overlap_start < overlap_end:
                overlap_time += overlap_end - overlap_start
        return overlap_time
class BasicEvaluation:
    """
    Derives per-event metrics (self time, queue depth, idle time) from a
    profile and ranks events that look worth optimizing.

    All metrics are computed eagerly in ``__init__`` and stored in
    ``self.metrics`` keyed by ``EventKey``.
    """

    def __init__(self, prof: profile):
        self.profile = prof
        self.metrics: Dict[EventKey, EventMetrics] = {}
        # Populates self.metrics; must run before the sorted views below.
        self.compute_self_time()
        # Event keys / events ordered by start time.
        self.event_keys = sorted(
            (e for e in self.metrics.keys()), key=lambda x: x.event.start_time_ns
        )
        self.events = [e.event for e in self.event_keys]
        # Filled in by compute_queue_depth().
        self.cuda_events: List[_KinetoEvent] = []
        self.queue_depth_list = self.compute_queue_depth()
        self.compute_idle_time()

    def compute_self_time(self):
        """
        Computes event's self time(total time - time in child ops).
        """
        assert self.profile.kineto_results is not None
        stack = deque(self.profile.kineto_results.experimental_event_tree())

        # standard iterating dfs
        while stack:
            curr_event = stack.pop()
            self_time = curr_event.duration_time_ns
            for child_event in curr_event.children:
                self_time -= child_event.duration_time_ns
                stack.append(child_event)
            # Each event id is expected to appear exactly once in the tree.
            assert (
                EventKey(curr_event) not in self.metrics
            ), f"Duplicate id: {curr_event.id}, {curr_event.name}"
            self.metrics[EventKey(curr_event)] = EventMetrics(self_time_ns=self_time)
            self.metrics[
                EventKey(curr_event)
            ].duration_time_ns = curr_event.duration_time_ns

    def compute_queue_depth(self):
        """
        Computes queue_depth at each event. This will calculate the queue depth data for
        All the events in the tree.
        This will return a list of Interval of queue depth data of cuda launch and kernels.
        """
        assert self.profile.kineto_results is not None
        cuda_event_list = self.profile.kineto_results.events()

        def is_cuda_launch_kernel(e):
            # TODO: find a better way to identify cudaLaunchKernel
            return e.name == "cudaLaunchKernel"

        def is_cuda_kernel(e):
            # TODO: find a better way to identify CUDA Kernel
            return e.device_type() == DeviceType.CUDA and "mem" not in e.name.lower()

        cuda_launch_events = sorted(
            (e for e in cuda_event_list if is_cuda_launch_kernel(e)),
            key=lambda x: x.start_ns(),
        )
        cuda_kernel_events = sorted(
            (e for e in cuda_event_list if is_cuda_kernel(e)),
            key=lambda x: x.start_ns(),
        )

        self.cuda_events = sorted(
            cuda_launch_events + cuda_kernel_events, key=lambda x: x.start_ns()
        )

        # Map each launch event to the index (in cuda_kernel_events) of the
        # kernel sharing its linked correlation id, or None if none is found.
        # The search resumes from the last mapped kernel.
        kernel_mapping: Dict[_KinetoEvent, int] = {}
        last_mapped_kernel = 0
        for cuda_launch_event in cuda_launch_events:
            index = index_of_first_match(
                cuda_kernel_events,
                lambda x: x.linked_correlation_id()
                == cuda_launch_event.linked_correlation_id(),
                start=last_mapped_kernel,
            )
            kernel_mapping[cuda_launch_event] = index
            last_mapped_kernel = index if index is not None else last_mapped_kernel

        current_kernel_index = 0
        spawned_kernel_index = -1

        all_events = cuda_launch_events + cuda_kernel_events + self.events

        # Sort key: start time in nanoseconds, whichever accessor the event
        # type offers.
        def new_old_event_comparator(event):
            if hasattr(event, "start_us"):
                return event.start_us() * 1000
            if hasattr(event, "start_ns"):
                return event.start_ns()
            if hasattr(event, "start_time_ns"):
                return event.start_time_ns
            raise Exception("Unknown Event Type")  # noqa: TRY002

        queue_depth_list: List[Interval] = []
        all_events.sort(key=new_old_event_comparator)
        for event in all_events:
            # Find latest cuda kernel event
            if hasattr(event, "start_us"):
                start_time = event.start_us() * 1000
                end_time = (event.start_us() + event.duration_us()) * 1000
                # Find current spawned cuda kernel event
                if event in kernel_mapping and kernel_mapping[event] is not None:
                    spawned_kernel_index = kernel_mapping[event]
            # NOTE(review): this is a separate `if`, not `elif`, so an event
            # exposing both start_us and start_ns goes through both branches.
            if hasattr(event, "start_ns"):
                start_time = event.start_ns()
                end_time = event.start_ns() + event.duration_ns()
                # Find current spawned cuda kernel event
                if event in kernel_mapping and kernel_mapping[event] is not None:
                    spawned_kernel_index = kernel_mapping[event]
            elif hasattr(event, "start_time_ns"):
                start_time = event.start_time_ns  # type: ignore[attr-defined]
                end_time = event.end_time_ns  # type: ignore[attr-defined]

            # Advance past every kernel that started at or before this event.
            while (
                current_kernel_index < len(cuda_kernel_events)
                and (cuda_kernel_events[current_kernel_index].start_ns())
                <= start_time  # type: ignore[possibly-undefined]
            ):
                current_kernel_index += 1
            # Kernels launched so far minus kernels already started, floored at 0.
            current_queue_depth = spawned_kernel_index - current_kernel_index + 1
            current_queue_depth = max(current_queue_depth, 0)

            # Launch/kernel events contribute an Interval; tree events get the
            # depth written into their metrics instead.
            if hasattr(event, "start_us") or hasattr(event, "start_ns"):
                queue_depth_list.append(
                    Interval(start_time, end_time, current_queue_depth)  # type: ignore[possibly-undefined]
                )
            elif hasattr(event, "start_time_ns"):
                self.metrics[EventKey(event)].queue_depth = current_queue_depth

        return queue_depth_list

    def compute_idle_time(self):
        """
        Computes idle time of the profile.
        """
        # Based on queue_depth_list, we can calculate idle time for all the events
        idle = False
        idle_start = 0
        idle_intervals: List[Interval] = []
        if self.queue_depth_list and self.events:
            # Time before the first and after the last queue-depth interval
            # counts as idle.
            idle_intervals += [
                Interval(self.events[0].start_time_ns, self.queue_depth_list[0].start),
                Interval(self.queue_depth_list[-1].end, self.events[-1].end_time_ns),
            ]

        # An idle span opens at the end of a depth-0 interval and closes at
        # the start of the next interval with depth > 0.
        for data_point in self.queue_depth_list:
            if data_point.queue_depth == 0 and not idle:
                idle_start = data_point.end
                idle = True
            if data_point.queue_depth > 0 and idle:
                idle_intervals.append(Interval(idle_start, data_point.start))
                idle = False

        event_list = [e.event for e in self.metrics.keys()]
        for event in event_list:
            self.metrics[EventKey(event)].idle_time_ns = EventKey(
                event
            ).intervals_overlap(idle_intervals)

    def rank_events(self, length):
        """
        Filter and Rank the events based on some heuristics:
        1) Events that are in the falling phase of the queue depth.
        2) Events that have a high idle_time, self_time difference.

        Parameters:
            length: The number of events to return.
        """

        # Find the interval when qd is falling to 0
        import torch

        # The list is walked in reverse order.
        queue_depth_list = list(reversed(self.queue_depth_list))
        qd_values = [e.queue_depth for e in queue_depth_list]

        bottom_threashold = 0
        top_threashold = 4
        decrease_interval = []
        i = 0
        while i < len(qd_values):
            if qd_values[i] > bottom_threashold:
                i += 1
                continue
            for j in range(i + 1, len(qd_values)):
                # Find next zero and if the max value between them exceeds
                # the threshold, then we have a falling interval
                next_minimum_idx = index_of_first_match(
                    qd_values, lambda x: x <= bottom_threashold, start=j
                )
                peak_idx = argmax(qd_values, start=j, end=next_minimum_idx)

                # if is a valid peak, we add to list and continue
                if peak_idx is not None and qd_values[peak_idx] >= top_threashold:
                    decrease_interval.append(
                        Interval(
                            queue_depth_list[peak_idx].start, queue_depth_list[i].start
                        )
                    )
                    i = next_minimum_idx if next_minimum_idx is not None else i
                    break
            i += 1
        # Filter out events that are not in the decrease interval
        event_list = [
            event
            for event in self.metrics.keys()
            if event.intervals_overlap(decrease_interval)
        ]
        if event_list:
            self_time = torch.tensor(
                [self.metrics[event].self_time_ns for event in event_list],
                dtype=torch.float32,
            )
            idle_time = torch.tensor(
                [self.metrics[event].fraction_idle_time for event in event_list],
                dtype=torch.float32,
            )
            # Standardize both signals (subtract mean, divide by std) before
            # combining them.
            normalized_gain = (idle_time - torch.mean(idle_time)) / torch.std(idle_time)
            normalized_self = (self_time - torch.mean(self_time)) / torch.std(self_time)
            heuristic_score_list = normalized_gain + 0.6 * normalized_self

            # Sort events by heuristic
            event_list = [
                event
                for _, event in sorted(
                    zip(heuristic_score_list, event_list),
                    key=operator.itemgetter(0),
                    reverse=True,
                )
            ]
            event_list = event_list[:length]
        return event_list

    def get_optimizable_events(self, length: int = 1, print_enable: bool = True):
        """
        Return the top ``length`` ranked events, printing a short report for
        each one when ``print_enable`` is True.
        """
        event_list = self.rank_events(length)
        if not print_enable:
            return event_list
        output = "Optimizable events:\n" if event_list else "No events to optimize\n"

        # The continuation lines of the f-string below start at column 0 on
        # purpose: any indentation would become part of the printed text.
        output += "\n".join(
            [
                f"""{'-'*80}
Event: {event}
Source code location: {source_code_location(event.event)}
Percentage idle time: {self.metrics[event].fraction_idle_time * 100:.2f}%
{'-'*80}"""
                for event in event_list
            ]
        )
        if print_enable:
            print(output)
        return event_list
def index_of_first_match(seq, predicate, start=0, end=None):
    """
    Return the first index in ``[start, end)`` whose element satisfies ``predicate``.

    ``end`` defaults to, and is capped at, ``len(seq)``. Returns None when no
    element matches.
    """
    stop = len(seq) if end is None or end >= len(seq) else end
    idx = start
    while idx < stop:
        if predicate(seq[idx]):
            return idx
        idx += 1
    return None
def argmax(seq, key=lambda x: x, start=0, end=None):
    """
    Return the index (relative to ``seq``) of the maximum element of
    ``seq[start:end]`` under ``key``, or None when that slice is empty.
    """
    window = seq[start:end]
    if len(window) == 0:
        return None
    best = max(window, key=key)
    return start + window.index(best)
def source_code_location(event):
    """
    Return the name of the nearest event (``event`` itself or an ancestor)
    whose name looks like ``<file>.py(<...>)``; otherwise a fixed fallback message.
    """
    node = event
    while node is not None:
        if re.search(r"\.py\(.*\)", node.name) is not None:
            return node.name
        node = node.parent
    return "No source code location found"
# Provide an OSS workaround for cudagraphs + CUPTI issue
# https://github.com/pytorch/pytorch/issues/75504
# TODO(dberard) - deprecate / remove workaround for CUDA >= 12, when
# we stop supporting older CUDA versions.
def _init_for_cuda_graphs():
    """Enter and immediately leave an empty ``torch.autograd.profiler.profile`` context."""
    import torch.autograd.profiler as autograd_profiler

    with autograd_profiler.profile():
        pass

View File

@ -0,0 +1,80 @@
# mypy: allow-untyped-defs
from contextlib import contextmanager
try:
    from torch._C import _itt
except ImportError:
    # ``torch._C`` does not provide ``_itt`` here: fall back to a stub that
    # exposes the same attribute names used by the wrappers in this module.

    class _ITTStub:
        """Stand-in for ``torch._C._itt`` used when that import fails."""

        @staticmethod
        def _fail(*args, **kwargs):
            """Raise ``RuntimeError``; every ITT primitive on the stub maps here."""
            raise RuntimeError(
                "ITT functions not installed. Are you sure you have a ITT build?"
            )

        @staticmethod
        def is_available():
            """Always report that ITT is unavailable."""
            return False

        # Using any ITT primitive through the stub raises RuntimeError.
        rangePush = _fail
        rangePop = _fail
        mark = _fail

    _itt = _ITTStub()  # type: ignore[assignment]
# Public names of this module.
__all__ = ["is_available", "range_push", "range_pop", "mark", "range"]
def is_available():
    """
    Report whether the ITT feature is available, as answered by ``_itt.is_available()``.
    """
    available = _itt.is_available()
    return available
def range_push(msg):
    """
    Push a range onto the stack of nested range spans.

    Arguments:
        msg (str): ASCII message to associate with range

    Returns the zero-based depth of the range that is started.
    """
    depth = _itt.rangePush(msg)
    return depth
def range_pop():
    """
    Pop a range off the stack of nested range spans.

    Returns the zero-based depth of the range that is ended.
    """
    depth = _itt.rangePop()
    return depth
def mark(msg):
    """
    Record an instantaneous event that occurred at some point.

    Arguments:
        msg (str): ASCII message to associate with the event.

    Returns whatever ``_itt.mark`` returns.
    """
    outcome = _itt.mark(msg)
    return outcome
@contextmanager
def range(msg, *args, **kwargs):
    """
    Context manager / decorator that opens an ITT range when its scope is
    entered and closes it when the scope is left, even on error.

    Any extra positional or keyword arguments are applied to ``msg`` through
    ``msg.format()`` before the range is pushed.

    Args:
        msg (str): message to associate with the range
    """
    label = msg.format(*args, **kwargs)
    range_push(label)
    try:
        yield
    finally:
        range_pop()

View File

@ -0,0 +1,935 @@
# mypy: allow-untyped-defs
import gzip
import json
import os
import shutil
import tempfile
from abc import ABC, abstractmethod
from enum import Enum
from functools import partial
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
from typing_extensions import Self
from warnings import warn
import torch
import torch.autograd.profiler as prof
from torch._C import _get_privateuse1_backend_name
from torch._C._profiler import (
_add_execution_trace_observer,
_disable_execution_trace_observer,
_enable_execution_trace_observer,
_ExperimentalConfig,
_remove_execution_trace_observer,
)
from torch.autograd import kineto_available, ProfilerActivity
from torch.profiler._memory_profiler import MemoryProfile, MemoryProfileTimeline
# Public names of this module.
__all__ = [
    "supported_activities",
    "ProfilerAction",
    "schedule",
    "tensorboard_trace_handler",
    "profile",
    "ExecutionTraceObserver",
]
# Step name under which ``profile`` initializes, increments and erases its
# counter in ``prof.KinetoStepTracker``.
PROFILER_STEP_NAME = "ProfilerStep"
def supported_activities():
    """
    Return the set of profiler tracing activities supported by this build.

    Note: profiler uses CUPTI library to trace on-device CUDA kernels.
    In case when CUDA is enabled but CUPTI is not available, passing
    ``ProfilerActivity.CUDA`` to profiler results in using the legacy CUDA
    profiling code (same as in the legacy ``torch.autograd.profiler``).
    This, in turn, results in including CUDA time in the profiler table output,
    but not in the JSON trace.
    """
    activities = torch.autograd._supported_activities()
    return activities
class _ITraceObserver(ABC):
    """Abstract interface for a trace observer.

    Concrete observers must implement three methods: ``start``, ``stop``
    and ``cleanup``.
    """

    @abstractmethod
    def start(self):
        """Start observing."""

    @abstractmethod
    def stop(self):
        """Stop observing."""

    @abstractmethod
    def cleanup(self):
        """Perform the observer's cleanup."""
class _KinetoProfile:
"""Low-level profiler wrap the autograd profile
Args:
activities (iterable): list of activity groups (CPU, CUDA) to use in profiling, supported values:
``torch.profiler.ProfilerActivity.CPU``, ``torch.profiler.ProfilerActivity.CUDA``,
``torch.profiler.ProfilerActivity.XPU``.
Default value: ProfilerActivity.CPU and (when available) ProfilerActivity.CUDA
or (when available) ProfilerActivity.XPU.
record_shapes (bool): save information about operator's input shapes.
profile_memory (bool): track tensor memory allocation/deallocation (see ``export_memory_timeline``
for more details).
with_stack (bool): record source information (file and line number) for the ops.
with_flops (bool): use formula to estimate the FLOPS of specific operators
(matrix multiplication and 2D convolution).
with_modules (bool): record module hierarchy (including function names)
corresponding to the callstack of the op. e.g. If module A's forward call's
module B's forward which contains an aten::add op,
then aten::add's module hierarchy is A.B
Note that this support exist, at the moment, only for TorchScript models
and not eager mode models.
experimental_config (_ExperimentalConfig) : A set of experimental options
used by profiler libraries like Kineto. Note, backward compatibility is not guaranteed.
execution_trace_observer (ExecutionTraceObserver) : A PyTorch Execution Trace Observer object.
`PyTorch Execution Traces <https://arxiv.org/pdf/2305.14516.pdf>`__ offer a graph based
representation of AI/ML workloads and enable replay benchmarks, simulators, and emulators.
When this argument is included the observer start() and stop() will be called for the
same time window as PyTorch profiler.
acc_events (bool): Enable the accumulation of FunctionEvents across multiple profiling cycles
.. note::
This API is experimental and subject to change in the future.
Enabling shape and stack tracing results in additional overhead.
When record_shapes=True is specified, profiler will temporarily hold references to the tensors;
that may further prevent certain optimizations that depend on the reference count and introduce
extra tensor copies.
"""
def __init__(
self,
*,
activities: Optional[Iterable[ProfilerActivity]] = None,
record_shapes: bool = False,
profile_memory: bool = False,
with_stack: bool = False,
with_flops: bool = False,
with_modules: bool = False,
experimental_config: Optional[_ExperimentalConfig] = None,
execution_trace_observer: Optional[_ITraceObserver] = None,
acc_events: bool = False,
):
self.activities = set(activities) if activities else supported_activities()
self.record_shapes = record_shapes
self.with_flops = with_flops
self.profile_memory = profile_memory
self.with_stack = with_stack
self.with_modules = with_modules
self.experimental_config = experimental_config
self.execution_trace_observer = execution_trace_observer
self.acc_events = acc_events
self.profiler: Optional[prof.profile] = None
self.mem_tl: Optional[MemoryProfileTimeline] = None
self.use_device = None
if ProfilerActivity.CUDA in self.activities:
self.use_device = "cuda"
elif ProfilerActivity.XPU in self.activities:
self.use_device = "xpu"
elif ProfilerActivity.MTIA in self.activities:
self.use_device = "mtia"
elif ProfilerActivity.PrivateUse1 in self.activities:
self.use_device = _get_privateuse1_backend_name()
# user-defined metadata to be amended to the trace
self.preset_metadata: Dict[str, str] = {}
def start(self):
self.prepare_trace()
self.start_trace()
def stop(self):
self.stop_trace()
def prepare_trace(self):
if (self.profiler is None) or (not self.acc_events):
self.profiler = prof.profile(
use_cpu=(ProfilerActivity.CPU in self.activities),
use_device=self.use_device,
record_shapes=self.record_shapes,
with_flops=self.with_flops,
profile_memory=self.profile_memory,
with_stack=self.with_stack,
with_modules=self.with_modules,
use_kineto=True,
experimental_config=self.experimental_config,
acc_events=self.acc_events,
)
self.profiler._prepare_trace()
def start_trace(self):
if self.execution_trace_observer:
self.execution_trace_observer.start()
assert self.profiler is not None
self.profiler._start_trace()
if self.profile_memory:
self.add_metadata_json("profile_memory", "1")
if self.with_stack:
self.add_metadata_json("with_stack", "1")
if self.record_shapes:
self.add_metadata_json("record_shapes", "1")
if self.with_modules:
self.add_metadata_json("with_modules", "1")
if self.with_flops:
self.add_metadata_json("with_flops", "1")
if kineto_available():
dist_info = self._get_distributed_info()
if dist_info:
self.add_metadata_json("distributedInfo", json.dumps(dist_info))
if hasattr(torch, "_inductor"):
import torch._inductor.config as inductor_config
if inductor_config.triton.cudagraphs:
os.environ["DISABLE_CUPTI_LAZY_REINIT"] = "1"
self.add_metadata_json("DISABLE_CUPTI_LAZY_REINIT", "1")
# FIXME: CUDA Graph does not work well with CUPTI teardown.
# 1) crashes on 1st lazy CUPTI re-init after teardown (CUDA 11)
# 2) crashes on 2nd non-lazy CUPTI re-init after teardown (CUDA 12)
# Workaround: turn off CUPTI teardown when using CUDA Graphs.
os.environ["TEARDOWN_CUPTI"] = "0"
# Insert the preset user metadata to the trace
for k, v in self.preset_metadata.items():
self.add_metadata_json(k, v)
def stop_trace(self):
if self.execution_trace_observer:
self.execution_trace_observer.stop()
assert self.profiler is not None
self.profiler.__exit__(None, None, None)
def export_chrome_trace(self, path: str):
"""
Exports the collected trace in Chrome JSON format. If kineto is enabled, only
last cycle in schedule is exported.
"""
assert self.profiler
if path.endswith(".gz"):
fp = tempfile.NamedTemporaryFile("w+t", suffix=".json", delete=False)
fp.close()
retvalue = self.profiler.export_chrome_trace(fp.name)
with open(fp.name) as fin:
with gzip.open(path, "wt") as fout:
fout.writelines(fin)
os.remove(fp.name)
return retvalue
else:
return self.profiler.export_chrome_trace(path)
def export_stacks(self, path: str, metric: str = "self_cpu_time_total"):
"""Save stack traces to a file
Args:
path (str): save stacks file to this location;
metric (str): metric to use: "self_cpu_time_total" or "self_cuda_time_total"
"""
assert self.profiler
return self.profiler.export_stacks(path, metric)
def toggle_collection_dynamic(
self, enable: bool, activities: Iterable[ProfilerActivity]
):
"""Toggle collection of activities on/off at any point of collection. Currently supports toggling Torch Ops
(CPU) and CUDA activity supported in Kineto
Args:
activities (iterable): list of activity groups to use in profiling, supported values:
``torch.profiler.ProfilerActivity.CPU``, ``torch.profiler.ProfilerActivity.CUDA``
Examples:
.. code-block:: python
with torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
]
) as p:
code_to_profile_0()
// turn off collection of all CUDA activity
p.toggle_collection_dynamic(False, [torch.profiler.ProfilerActivity.CUDA])
code_to_profile_1()
// turn on collection of all CUDA activity
p.toggle_collection_dynamic(True, [torch.profiler.ProfilerActivity.CUDA])
code_to_profile_2()
print(p.key_averages().table(
sort_by="self_cuda_time_total", row_limit=-1))
"""
if not self.profiler:
return
self.profiler.toggle_collection_dynamic(enable, activities)
def key_averages(
self, group_by_input_shape: bool = False, group_by_stack_n: int = 0
):
"""Averages events, grouping them by operator name and (optionally) input shapes and
stack.
.. note::
To use shape/stack functionality make sure to set record_shapes/with_stack
when creating profiler context manager.
"""
assert self.profiler
return self.profiler.key_averages(group_by_input_shape, group_by_stack_n)
def events(self):
"""
Returns the list of unaggregated profiler events,
to be used in the trace callback or after the profiling is finished
"""
assert self.profiler
return self.profiler.function_events
def add_metadata(self, key: str, value: str):
"""
Adds a user defined metadata with a string key and a string value
into the trace file
"""
wrapped_value = '"' + value.replace('"', '\\"') + '"'
torch.autograd._add_metadata_json(key, wrapped_value)
def add_metadata_json(self, key: str, value: str):
"""
Adds a user defined metadata with a string key and a valid json value
into the trace file
"""
torch.autograd._add_metadata_json(key, value)
def preset_metadata_json(self, key: str, value: str):
"""
Preset a user defined metadata when the profiler is not started
and added into the trace file later.
Metadata is in the format of a string key and a valid json value
"""
self.preset_metadata[key] = value
def _get_distributed_info(self):
import torch.distributed as dist
if not dist.is_available() or not dist.is_initialized():
return None
backend = dist.get_backend()
dist_info = {
"backend": backend,
"rank": dist.get_rank(),
"world_size": dist.get_world_size(),
"pg_count": dist.get_pg_count(),
"pg_config": dist.distributed_c10d._get_all_pg_configs(),
}
if backend == "nccl":
nccl_version = torch.cuda.nccl.version()
dist_info["nccl_version"] = ".".join(str(v) for v in nccl_version)
return dist_info
def _memory_profile(self) -> MemoryProfile:
required = ("record_shapes", "profile_memory", "with_stack")
missing = [f"{i}=True" for i in required if not getattr(self, i)]
if missing:
raise ValueError(f"{', '.join(missing)} required for memory profiling.")
assert self.profiler is not None and self.profiler.kineto_results is not None
return MemoryProfile(self.profiler.kineto_results)
def export_memory_timeline(self, path: str, device: Optional[str] = None) -> None:
"""Export memory event information from the profiler collected
tree for a given device, and export a timeline plot. There are 3
exportable files using ``export_memory_timeline``, each controlled by the
``path``'s suffix.
- For an HTML compatible plot, use the suffix ``.html``, and a memory timeline
plot will be embedded as a PNG file in the HTML file.
- For plot points consisting of ``[times, [sizes by category]]``, where
``times`` are timestamps and ``sizes`` are memory usage for each category.
The memory timeline plot will be saved a JSON (``.json``) or gzipped JSON
(``.json.gz``) depending on the suffix.
- For raw memory points, use the suffix ``.raw.json.gz``. Each raw memory
event will consist of ``(timestamp, action, numbytes, category)``, where
``action`` is one of ``[PREEXISTING, CREATE, INCREMENT_VERSION, DESTROY]``,
and ``category`` is one of the enums from
``torch.profiler._memory_profiler.Category``.
Output: Memory timeline written as gzipped JSON, JSON, or HTML.
"""
# Default to device 0, if unset. Fallback on cpu.
if device is None and self.use_device and self.use_device != "cuda":
device = self.use_device + ":0"
if device is None:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Construct the memory timeline plot data
self.mem_tl = MemoryProfileTimeline(self._memory_profile())
# Depending on the file suffix, save the data as json.gz or json.
# For html, we can embed the image into an HTML file.
if path.endswith(".html"):
self.mem_tl.export_memory_timeline_html(path, device)
elif path.endswith(".gz"):
fp = tempfile.NamedTemporaryFile("w+t", suffix=".json", delete=False)
fp.close()
if path.endswith("raw.json.gz"):
self.mem_tl.export_memory_timeline_raw(fp.name, device)
else:
self.mem_tl.export_memory_timeline(fp.name, device)
with open(fp.name) as fin:
with gzip.open(path, "wt") as fout:
fout.writelines(fin)
os.remove(fp.name)
else:
self.mem_tl.export_memory_timeline(path, device)
class ProfilerAction(Enum):
    """
    Profiler actions that can be taken at the specified intervals
    """

    # No profiling for this step (returned by ``schedule`` for skipped and wait steps).
    NONE = 0
    # Warmup step (returned by ``schedule`` for the ``warmup`` steps of a cycle).
    WARMUP = 1
    # Active recording step that is not the last step of its cycle.
    RECORD = 2
    # Last active step of a cycle, as returned by ``schedule``.
    RECORD_AND_SAVE = 3
def schedule(
    *, wait: int, warmup: int, active: int, repeat: int = 0, skip_first: int = 0
) -> Callable:
    """
    Build a callable suitable for the profiler's ``schedule`` argument.

    The returned function maps a step number to a ``ProfilerAction``: the
    first ``skip_first`` steps are skipped, then each cycle consists of
    ``wait`` idle steps, ``warmup`` warmup steps and ``active`` recording
    steps, the last of which is ``RECORD_AND_SAVE``. The cycle then starts
    again with the ``wait`` steps.

    ``repeat`` limits the number of cycles; zero means the cycles continue
    until profiling is finished.
    """

    def schedule_fn(step: int) -> ProfilerAction:
        assert step >= 0
        if step < skip_first:
            return ProfilerAction.NONE
        offset = step - skip_first
        cycle_len = wait + warmup + active
        # Past the last requested cycle nothing more is profiled.
        if repeat > 0 and offset / cycle_len >= repeat:
            return ProfilerAction.NONE
        position = offset % cycle_len
        if position < wait:
            return ProfilerAction.NONE
        if position < wait + warmup:
            return ProfilerAction.WARMUP
        if position < cycle_len - 1:
            return ProfilerAction.RECORD
        return ProfilerAction.RECORD_AND_SAVE

    arguments_valid = (
        wait >= 0 and warmup >= 0 and active > 0 and repeat >= 0 and skip_first >= 0
    )
    assert arguments_valid, "Invalid profiler schedule arguments"
    if warmup == 0:
        warn("Profiler won't be using warmup, this can skew profiler results")
    return schedule_fn
def _default_schedule_fn(_: int) -> ProfilerAction:
    """
    Default profiler behavior - immediately starts recording the events,
    keeps doing it on every profiler step.

    The step number argument is ignored; ``ProfilerAction.RECORD`` is
    returned for every step.
    """
    return ProfilerAction.RECORD
def tensorboard_trace_handler(
    dir_name: str, worker_name: Optional[str] = None, use_gzip: bool = False
):
    """
    Build a trace handler that writes Chrome traces into ``dir_name``, so that
    the directory can be handed directly to tensorboard as logdir.

    ``worker_name`` should be unique for each worker in distributed scenario,
    it will be set to '[hostname]_[pid]' by default. With ``use_gzip`` the
    file name gets a ``.gz`` suffix.
    """
    import os
    import socket
    import time

    def handler_fn(prof) -> None:
        nonlocal worker_name
        # Create the output directory on demand.
        if not os.path.isdir(dir_name):
            try:
                os.makedirs(dir_name, exist_ok=True)
            except Exception as e:
                raise RuntimeError("Can't create directory: " + dir_name) from e
        worker_name = worker_name or f"{socket.gethostname()}_{os.getpid()}"
        # Use nanosecond here to avoid naming clash when exporting the trace
        trace_name = f"{worker_name}.{time.time_ns()}.pt.trace.json"
        if use_gzip:
            trace_name += ".gz"
        prof.export_chrome_trace(os.path.join(dir_name, trace_name))

    return handler_fn
class profile(_KinetoProfile):
    """Profiler context manager.

    Args:
        activities (iterable): list of activity groups (CPU, CUDA) to use in profiling, supported values:
            ``torch.profiler.ProfilerActivity.CPU``, ``torch.profiler.ProfilerActivity.CUDA``,
            ``torch.profiler.ProfilerActivity.XPU``.
            Default value: ProfilerActivity.CPU and (when available) ProfilerActivity.CUDA
            or (when available) ProfilerActivity.XPU.
        schedule (Callable): callable that takes step (int) as a single parameter and returns
            ``ProfilerAction`` value that specifies the profiler action to perform at each step.
        on_trace_ready (Callable): callable that is called at each step when ``schedule``
            returns ``ProfilerAction.RECORD_AND_SAVE`` during the profiling.
        record_shapes (bool): save information about operator's input shapes.
        profile_memory (bool): track tensor memory allocation/deallocation.
        with_stack (bool): record source information (file and line number) for the ops.
        with_flops (bool): use formula to estimate the FLOPs (floating point operations) of specific operators
            (matrix multiplication and 2D convolution).
        with_modules (bool): record module hierarchy (including function names)
            corresponding to the callstack of the op. e.g. If module A's forward call's
            module B's forward which contains an aten::add op,
            then aten::add's module hierarchy is A.B
            Note that this support exist, at the moment, only for TorchScript models
            and not eager mode models.
        experimental_config (_ExperimentalConfig) : A set of experimental options
            used for Kineto library features. Note, backward compatibility is not guaranteed.
        execution_trace_observer (ExecutionTraceObserver) : A PyTorch Execution Trace Observer object.
            `PyTorch Execution Traces <https://arxiv.org/pdf/2305.14516.pdf>`__ offer a graph based
            representation of AI/ML workloads and enable replay benchmarks, simulators, and emulators.
            When this argument is included the observer start() and stop() will be called for the
            same time window as PyTorch profiler. See the examples section below for a code sample.
        acc_events (bool): Enable the accumulation of FunctionEvents across multiple profiling cycles
        use_cuda (bool):
            .. deprecated:: 1.8.1
                use ``activities`` instead.

    .. note::
        Use :func:`~torch.profiler.schedule` to generate the callable schedule.
        Non-default schedules are useful when profiling long training jobs
        and allow the user to obtain multiple traces at the different iterations
        of the training process.
        The default schedule simply records all the events continuously for the
        duration of the context manager.

    .. note::
        Use :func:`~torch.profiler.tensorboard_trace_handler` to generate result files for TensorBoard:

        ``on_trace_ready=torch.profiler.tensorboard_trace_handler(dir_name)``

        After profiling, result files can be found in the specified directory. Use the command:

        ``tensorboard --logdir dir_name``

        to see the results in TensorBoard.
        For more information, see
        `PyTorch Profiler TensorBoard Plugin <https://github.com/pytorch/kineto/tree/master/tb_plugin>`__

    .. note::
        Enabling shape and stack tracing results in additional overhead.
        When record_shapes=True is specified, profiler will temporarily hold references to the tensors;
        that may further prevent certain optimizations that depend on the reference count and introduce
        extra tensor copies.

    Examples:

    .. code-block:: python

        with torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA,
            ]
        ) as p:
            code_to_profile()
        print(p.key_averages().table(
            sort_by="self_cuda_time_total", row_limit=-1))

    Using the profiler's ``schedule``, ``on_trace_ready`` and ``step`` functions:

    .. code-block:: python

        # Non-default profiler schedule allows user to turn profiler on and off
        # on different iterations of the training loop;
        # trace_handler is called every time a new trace becomes available
        def trace_handler(prof):
            print(prof.key_averages().table(
                sort_by="self_cuda_time_total", row_limit=-1))
            # prof.export_chrome_trace("/tmp/test_trace_" + str(prof.step_num) + ".json")

        with torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA,
            ],

            # In this example with wait=1, warmup=1, active=2, repeat=1,
            # profiler will skip the first step/iteration,
            # start warming up on the second, record
            # the third and the fourth iterations,
            # after which the trace will become available
            # and on_trace_ready (when set) is called;
            # the cycle repeats starting with the next step

            schedule=torch.profiler.schedule(
                wait=1,
                warmup=1,
                active=2,
                repeat=1),
            on_trace_ready=trace_handler
            # on_trace_ready=torch.profiler.tensorboard_trace_handler('./log')
            # used when outputting for tensorboard
            ) as p:
                for iter in range(N):
                    code_iteration_to_profile(iter)
                    # send a signal to the profiler that the next iteration has started
                    p.step()

    The following sample shows how to set up an Execution Trace Observer (`execution_trace_observer`)

    .. code-block:: python

        with torch.profiler.profile(
            ...
            execution_trace_observer=(
                ExecutionTraceObserver().register_callback("./execution_trace.json")
            ),
        ) as p:
            for iter in range(N):
                code_iteration_to_profile(iter)
                p.step()

    You can also refer to test_execution_trace_with_kineto() in tests/profiler/test_profiler.py.
    Note: One can also pass any object satisfying the _ITraceObserver interface.
    """

    def __init__(
        self,
        *,
        activities: Optional[Iterable[ProfilerActivity]] = None,
        schedule: Optional[Callable[[int], ProfilerAction]] = None,
        on_trace_ready: Optional[Callable[..., Any]] = None,
        record_shapes: bool = False,
        profile_memory: bool = False,
        with_stack: bool = False,
        with_flops: bool = False,
        with_modules: bool = False,
        experimental_config: Optional[_ExperimentalConfig] = None,
        execution_trace_observer: Optional[_ITraceObserver] = None,
        acc_events: bool = False,
        # deprecated:
        use_cuda: Optional[bool] = None,
    ):
        activities_set = set(activities) if activities else supported_activities()
        if use_cuda is not None:
            warn(
                "`use_cuda` is deprecated, use `activities` argument instead",
                FutureWarning,
                stacklevel=2,
            )
            if use_cuda:
                activities_set.add(ProfilerActivity.CUDA)
            elif ProfilerActivity.CUDA in activities_set:
                activities_set.remove(ProfilerActivity.CUDA)
        assert len(activities_set) > 0, "No valid profiler activities found"

        # Pass the resolved set (not the raw ``activities`` argument) so that
        # the adjustment made for the deprecated ``use_cuda`` flag takes effect.
        super().__init__(
            activities=activities_set,
            record_shapes=record_shapes,
            profile_memory=profile_memory,
            with_stack=with_stack,
            with_flops=with_flops,
            with_modules=with_modules,
            experimental_config=experimental_config,
            execution_trace_observer=execution_trace_observer,
            acc_events=acc_events,
        )

        if schedule:
            self.schedule = schedule
            # add step markers into the trace and table view
            self.record_steps = True
        else:
            self.schedule = _default_schedule_fn
            self.record_steps = False
        self.on_trace_ready = on_trace_ready
        self.step_num = 0
        self.current_action = self.schedule(self.step_num)
        self.step_rec_fn: Optional[prof.record_function] = None

        self.action_map: Dict[
            Tuple[ProfilerAction, Optional[ProfilerAction]], List[Any]
        ] = {
            # key is (prev_action, current_action), value is action list corresponding to the state pair.
            (ProfilerAction.NONE, ProfilerAction.NONE): [],
            (ProfilerAction.NONE, ProfilerAction.WARMUP): [self.prepare_trace],
            (ProfilerAction.NONE, ProfilerAction.RECORD): [
                self.prepare_trace,
                self.start_trace,
            ],
            (ProfilerAction.NONE, ProfilerAction.RECORD_AND_SAVE): [
                self.prepare_trace,
                self.start_trace,
            ],
            (ProfilerAction.WARMUP, ProfilerAction.NONE): [
                partial(warn, "Incorrect schedule: WARMUP followed by NONE"),
                self.start_trace,
                self.stop_trace,
            ],
            (ProfilerAction.WARMUP, ProfilerAction.WARMUP): [],
            (ProfilerAction.WARMUP, ProfilerAction.RECORD): [self.start_trace],
            (ProfilerAction.WARMUP, ProfilerAction.RECORD_AND_SAVE): [self.start_trace],
            (ProfilerAction.RECORD, ProfilerAction.NONE): [
                partial(warn, "Incorrect schedule: RECORD followed by NONE"),
                self.stop_trace,
            ],
            (ProfilerAction.RECORD, ProfilerAction.WARMUP): [
                partial(warn, "Incorrect schedule: RECORD followed by WARMUP"),
                self.stop_trace,
            ],
            (ProfilerAction.RECORD, ProfilerAction.RECORD): [],
            (ProfilerAction.RECORD, ProfilerAction.RECORD_AND_SAVE): [],
            (ProfilerAction.RECORD_AND_SAVE, ProfilerAction.NONE): [
                self.stop_trace,
                self._trace_ready,
            ],
            (ProfilerAction.RECORD_AND_SAVE, ProfilerAction.WARMUP): [
                self.stop_trace,
                self._trace_ready,
                self.prepare_trace,
            ],
            (ProfilerAction.RECORD_AND_SAVE, ProfilerAction.RECORD): [
                self.stop_trace,
                self._trace_ready,
                self.prepare_trace,
                self.start_trace,
            ],
            (ProfilerAction.RECORD_AND_SAVE, ProfilerAction.RECORD_AND_SAVE): [
                self.stop_trace,
                self._trace_ready,
                self.prepare_trace,
                self.start_trace,
            ],
            # used for exit action
            (ProfilerAction.WARMUP, None): [self.start_trace, self.stop_trace],
            (ProfilerAction.RECORD, None): [self.stop_trace, self._trace_ready],
            (ProfilerAction.RECORD_AND_SAVE, None): [
                self.stop_trace,
                self._trace_ready,
            ],
        }
        # Start tracking increments to profiler step, this will be used
        # by Kineto
        prof.KinetoStepTracker.init_step_count(PROFILER_STEP_NAME)

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.stop()
        prof.KinetoStepTracker.erase_step_count(PROFILER_STEP_NAME)
        if self.execution_trace_observer:
            self.execution_trace_observer.cleanup()

    def start(self):
        """Run the actions for entering the current schedule state and open the first step marker."""
        self._transit_action(ProfilerAction.NONE, self.current_action)
        if self.record_steps:
            self.step_rec_fn = prof.record_function(
                "ProfilerStep#" + str(self.step_num)
            )
            self.step_rec_fn.__enter__()

    def stop(self):
        """Close the open step marker (if any) and run the exit actions for the current state."""
        if self.record_steps and self.step_rec_fn:
            self.step_rec_fn.__exit__(None, None, None)
        self._transit_action(self.current_action, None)

    def step(self):
        """
        Signals the profiler that the next profiling step has started.
        """
        if self.record_steps and self.step_rec_fn:
            self.step_rec_fn.__exit__(None, None, None)
        prev_action = self.current_action
        self.step_num += 1
        self.current_action = self.schedule(self.step_num)

        self._transit_action(prev_action, self.current_action)
        prof.KinetoStepTracker.increment_step(PROFILER_STEP_NAME)

        if self.record_steps:
            self.step_rec_fn = prof.record_function(
                "ProfilerStep#" + str(self.step_num)
            )
            self.step_rec_fn.__enter__()

    def _trace_ready(self):
        """Invoke the ``on_trace_ready`` callback, if one was given, with this profiler."""
        if self.on_trace_ready:
            self.on_trace_ready(self)

    def _transit_action(self, prev_action, current_action):
        """Run, in order, the actions registered for the given state transition."""
        action_list = self.action_map.get((prev_action, current_action))
        if action_list:
            for action in action_list:
                action()

    def _stats(self) -> Optional[prof._ProfilerStats]:
        """Return the underlying profiler's ``_stats``, or None if no profiler exists yet."""
        if self.profiler is None:
            return None
        return self.profiler._stats
class ExecutionTraceObserver(_ITraceObserver):
    """Execution Trace Observer

    Each process can have a single ExecutionTraceObserver instance. The observer
    can be added to record function callbacks via calling register_callback()
    explicitly. Without calling unregister_callback(), repeated calls to
    register_callback() will not add additional observers to record function
    callbacks. Once an ExecutionTraceObserver is created, the start() and stop()
    methods control when the event data is recorded.

    Deleting or calling unregister_callback() will remove the observer from the
    record function callbacks, finalize the output file, and will stop
    incurring any overheads.
    """

    def __init__(self) -> None:
        """
        Set up the initial state: not registered and not running.
        """
        self._registered = False
        self._execution_trace_running = False

    def __del__(self):
        """
        Unregister the callback on deletion so that outputs get finalized.
        """
        self.unregister_callback()

    def register_callback(self, output_file_path: str) -> Self:
        """
        Add the ET observer to the record function callbacks, writing its data
        to ``output_file_path``. Does nothing if already registered.
        Returns ``self`` so the call can be chained.
        """
        if self._registered:
            return self
        self._output_file_path = output_file_path
        self._registered = _add_execution_trace_observer(output_file_path)
        return self

    def unregister_callback(self):
        """
        Remove the ET observer from the record function callbacks.

        Before removal the observer is stopped and the generated kernel files
        are copied next to the output file; a failure of that copy only
        produces a warning.
        """
        if not self._registered:
            return

        def copy_generated_kernels():
            # Save the kernel paths for the generated kernels
            from torch._inductor.codecache import PyCodeCache

            kernel_files = []
            for module in PyCodeCache.cache.values():
                if getattr(module, "__file__", None) is not None:
                    kernel_files.append(module.__file__)

            # Resources go into "<output stem>_resources" beside the output file.
            work_dir, file_name = os.path.split(self._output_file_path)
            stem = os.path.splitext(file_name)[0]
            resource_dir = os.path.join(work_dir, stem + "_resources")
            if not os.path.exists(resource_dir):
                os.mkdir(resource_dir)

            for kernel_file in kernel_files:
                if kernel_file is None:
                    continue
                dst = os.path.join(resource_dir, os.path.basename(kernel_file))
                shutil.copyfile(kernel_file, dst)

        self.stop()
        try:
            copy_generated_kernels()
        except Exception as e:
            warn(f"Execution trace failed to save kernels: {e}")
        _remove_execution_trace_observer()
        self._registered = False

    @property
    def is_registered(self):
        """
        True if the execution trace observer is registered, otherwise False.
        """
        return self._registered

    def is_running(self):
        """
        Return True if the observer is running, otherwise False.
        """
        return self._execution_trace_running

    def start(self):
        """
        Start capturing. Has no effect unless registered and not already running.
        """
        if not self._registered or self._execution_trace_running:
            return
        _enable_execution_trace_observer()
        self._execution_trace_running = True
        self._record_pg_config()

    def stop(self):
        """
        Stop capturing. Has no effect when not running.
        """
        if not self._execution_trace_running:
            return
        _disable_execution_trace_observer()
        self._execution_trace_running = False

    def cleanup(self):
        """
        Unregister the callback so that outputs get finalized.
        """
        self.unregister_callback()

    def get_output_file_path(self) -> str:
        """
        Return the output file name.

        Raises ``RuntimeError`` when no callback has been registered.
        """
        if not self.is_registered:
            raise RuntimeError(
                "A callback to the ET profiler needs to be registered "
                "first before getting the output file path"
            )
        return self._output_file_path

    def _record_pg_config(self) -> None:
        # Records the PG config info to the trace as node:
        #  ## process_group:init ##
        distributed_ready = (
            self.is_registered
            and torch.distributed.is_available()
            and torch.distributed.is_initialized()
        )
        if not distributed_ready:
            return
        pg_config_info = torch.distributed.distributed_c10d._world.pg_config_info
        torch.autograd._record_function_with_args_enter(
            "## process_group:init ##", json.dumps(pg_config_info)
        )

View File

@ -0,0 +1,20 @@
import os
import site
import sys
import typing
import torch
def _prefix_regex() -> typing.List[str]:
    """Return known installation path prefixes, each terminated by ``os.sep``.

    The candidates are the site-packages directories, every ``sys.path``
    entry, the user base and user site directories, and the directory that
    contains the ``torch`` package. They are made absolute, de-duplicated
    and sorted in reverse order before the separator is appended.
    """
    torch_parent = os.path.dirname(os.path.dirname(torch.__file__))
    candidates = [
        *site.getsitepackages(),
        *sys.path,
        site.getuserbase(),
        site.getusersitepackages(),
        torch_parent,
    ]
    unique_abs = {os.path.abspath(candidate) for candidate in candidates}
    ordered = sorted(unique_abs, reverse=True)
    assert all(isinstance(prefix, str) for prefix in ordered)
    return [prefix + os.sep for prefix in ordered]