I am done

This commit is contained in:
2024-10-30 22:14:35 +01:00
parent 720dc28c09
commit 40e2a747cf
36901 changed files with 5011519 additions and 0 deletions

View File

@ -0,0 +1,164 @@
#!/usr/bin/env python3
# mypy: allow-untyped-defs
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""Metrics API.
**Overview**:
The metrics API in torchelastic is used to publish telemetry metrics.
It is designed to be used by torchelastic's internal modules to
publish metrics for the end user with the goal of increasing visibility
and helping with debugging. However you may use the same API in your
jobs to publish metrics to the same metrics ``sink``.
A ``metric`` can be thought of as timeseries data
and is uniquely identified by the string-valued tuple
``(metric_group, metric_name)``.
torchelastic makes no assumptions about what a ``metric_group`` is
and what relationship it has with ``metric_name``. It is totally up
to the user to use these two fields to uniquely identify a metric.
.. note:: The metric group ``torchelastic`` is reserved by torchelastic for
platform level metrics that it produces.
For instance torchelastic may output the latency (in milliseconds)
of a re-rendezvous operation from the agent as
``(torchelastic, agent.rendezvous.duration.ms)``
A sensible way to use metric groups is to map them to a stage or module
in your job. You may also encode certain high level properties
of the job, such as the region or stage (dev vs prod).
**Publish Metrics**:
Using torchelastic's metrics API is similar to using python's logging
framework. You first have to configure a metrics handler before
trying to add metric data.
The example below measures the latency for the ``calculate()`` function.
::
import time
import torch.distributed.elastic.metrics as metrics
# send all metrics other than those from "my_module" to /dev/null
metrics.configure(metrics.NullMetricHandler())
metrics.configure(metrics.ConsoleMetricHandler(), "my_module")
def my_method():
start = time.time()
calculate()
end = time.time()
metrics.put_metric("calculate_latency", int(end-start), "my_module")
You may also use the `torch.distributed.elastic.metrics.prof` decorator
to conveniently and succinctly profile functions.
::
# -- in module examples.foobar --
import torch.distributed.elastic.metrics as metrics
metrics.configure(metrics.ConsoleMetricHandler(), "foobar")
metrics.configure(metrics.ConsoleMetricHandler(), "Bar")
@metrics.prof
def foo():
pass
class Bar():
@metrics.prof
def baz():
pass
``@metrics.prof`` will publish the following metrics
::
<leaf_module or classname>.success - 1 if the function finished successfully
<leaf_module or classname>.failure - 1 if the function threw an exception
<leaf_module or classname>.duration.ms - function duration in milliseconds
**Configuring Metrics Handler**:
`torch.distributed.elastic.metrics.MetricHandler` is responsible for emitting
the added metric values to a particular destination. Metric groups can be
configured with different metric handlers.
By default torchelastic emits all metrics to ``/dev/null``.
With the following configuration, metrics from the
``torchelastic`` and ``my_app`` metric groups are printed to the
console.
::
import torch.distributed.elastic.metrics as metrics
metrics.configure(metrics.ConsoleMetricHandler(), group = "torchelastic")
metrics.configure(metrics.ConsoleMetricHandler(), group = "my_app")
**Writing a Custom Metric Handler**:
If you want your metrics to be emitted to a custom location, implement
the `torch.distributed.elastic.metrics.MetricHandler` interface
and configure your job to use your custom metric handler.
Below is a toy example that prints the metrics to ``stdout``
::
import torch.distributed.elastic.metrics as metrics
class StdoutMetricHandler(metrics.MetricHandler):
def emit(self, metric_data):
ts = metric_data.timestamp
group = metric_data.group_name
name = metric_data.name
value = metric_data.value
print(f"[{ts}][{group}]: {name}={value}")
metrics.configure(StdoutMetricHandler(), group="my_app")
Now all metrics in the group ``my_app`` will be printed to stdout as:
::
[1574213883.4182858][my_app]: my_metric=<value>
[1574213940.5237644][my_app]: my_metric=<value>
"""
from typing import Optional
from .api import ( # noqa: F401
configure,
ConsoleMetricHandler,
get_elapsed_time_ms,
getStream,
MetricData,
MetricHandler,
MetricsConfig,
NullMetricHandler,
prof,
profile,
publish_metric,
put_metric,
)
def initialize_metrics(cfg: Optional[MetricsConfig] = None):
    """Accept an optional ``MetricsConfig`` and do nothing with it.

    The body is intentionally empty, so the call always returns ``None``.
    NOTE(review): presumably a placeholder for environment-specific setup --
    confirm against callers.
    """
# Best-effort import: if a ``static_init`` module is importable, re-export
# everything it defines from this package; otherwise carry on without it.
try:
    from torch.distributed.elastic.metrics.static_init import *  # type: ignore[import] # noqa: F401 F403
except ModuleNotFoundError:
    # The module is optional. Note that this also swallows a
    # ModuleNotFoundError raised by a missing import inside ``static_init``.
    pass

View File

@ -0,0 +1,216 @@
#!/usr/bin/env python3
# mypy: allow-untyped-defs
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
import abc
import time
from collections import namedtuple
from functools import wraps
from typing import Dict, Optional
from typing_extensions import deprecated
# Names exported by ``from <this module> import *``.
__all__ = [
    "MetricsConfig",
    "MetricHandler",
    "ConsoleMetricHandler",
    "NullMetricHandler",
    "MetricStream",
    "configure",
    "getStream",
    "prof",
    "profile",
    "put_metric",
    "publish_metric",
    "get_elapsed_time_ms",
    "MetricData",
]
# Record describing one metric sample: when it was taken, the group and
# metric name that identify it, and the recorded value.
MetricData = namedtuple("MetricData", ["timestamp", "group_name", "name", "value"])
class MetricsConfig:
    """Container for metrics configuration parameters.

    Attributes:
        params: string-to-string mapping of settings; an empty dict when the
            caller supplied nothing.
    """

    __slots__ = ["params"]

    def __init__(self, params: Optional[Dict[str, str]] = None):
        # Build a fresh dict per instance rather than sharing a default.
        self.params = {} if params is None else params
class MetricHandler(abc.ABC):
    """Abstract interface for objects that emit metric data to a destination."""

    @abc.abstractmethod
    def emit(self, metric_data: MetricData):
        """Emit one ``MetricData`` record; concrete subclasses must override this."""
class ConsoleMetricHandler(MetricHandler):
    """Handler that prints every metric record via ``print``."""

    def emit(self, metric_data: MetricData):
        """Print ``metric_data`` as ``[timestamp][group_name]: name=value``."""
        line = f"[{metric_data.timestamp}][{metric_data.group_name}]: {metric_data.name}={metric_data.value}"
        print(line)
class NullMetricHandler(MetricHandler):
    """Handler that discards every metric record it is given."""

    def emit(self, metric_data: MetricData):
        """Ignore ``metric_data`` and return ``None``."""
class MetricStream:
    """Binds a metric group name to the handler that receives its values."""

    def __init__(self, group_name: str, handler: MetricHandler):
        self.handler = handler
        self.group_name = group_name

    def add_value(self, metric_name: str, metric_value: int):
        """Timestamp the value with ``time.time()`` and pass it to the handler."""
        sample = MetricData(time.time(), self.group_name, metric_name, metric_value)
        self.handler.emit(sample)
# Per-group handler registry; entries are added by ``configure(handler, group)``.
_metrics_map: Dict[str, MetricHandler] = {}
# Handler ``getStream`` falls back to for groups missing from ``_metrics_map``.
# Starts as a handler that drops everything; replaced by ``configure(handler)``.
_default_metrics_handler: MetricHandler = NullMetricHandler()
# pyre-fixme[9]: group has type `str`; used as `None`.
def configure(handler: MetricHandler, group: Optional[str] = None):
    """Register ``handler`` for a metric group, or as the default handler.

    Args:
        handler: handler that will receive the metrics.
        group: metric group to bind ``handler`` to. When ``None``, ``handler``
            becomes the fallback for every group without its own handler.
    """
    global _default_metrics_handler

    if group is not None:
        _metrics_map[group] = handler
        return

    # pyre-fixme[9]: _default_metrics_handler has type `NullMetricHandler`; used
    #  as `MetricHandler`.
    _default_metrics_handler = handler
def getStream(group: str):
    """Return a ``MetricStream`` for ``group``.

    The stream uses the handler registered for ``group`` if there is one,
    otherwise the module-wide default handler.
    """
    selected = _metrics_map.get(group, _default_metrics_handler)
    return MetricStream(group, selected)
def _get_metric_name(fn):
qualname = fn.__qualname__
split = qualname.split(".")
if len(split) == 1:
module = fn.__module__
if module:
return module.split(".")[-1] + "." + split[0]
else:
return split[0]
else:
return qualname
def prof(fn=None, group: str = "torchelastic"):
    r"""
    @prof decorator publishes duration.ms, success, failure metrics for the function that it decorates.

    The metric name defaults to the qualified name (``class_name.def_name``) of the function.
    If the function does not belong to a class, it uses the leaf module name instead.

    Exactly one of ``<name>.success`` / ``<name>.failure`` is published per call
    (``failure`` only for exceptions deriving from ``Exception``, which are
    re-raised), followed by ``<name>.duration.ms``.

    Args:
        fn: the function to decorate when used as a bare ``@metrics.prof``;
            ``None`` when the decorator is called with arguments.
        group: metric group the metrics are published to.

    Returns:
        The wrapped function when ``fn`` is given, otherwise a decorator.

    Usage

    ::

     @metrics.prof
     def x():
         pass


     @metrics.prof(group="agent")
     def y():
         pass
    """

    def wrap(f):
        @wraps(f)
        def wrapper(*args, **kwargs):
            key = _get_metric_name(f)
            # Start the clock before entering ``try`` so ``start`` is always
            # bound by the time ``finally`` runs.
            start = time.time()
            try:
                result = f(*args, **kwargs)
                put_metric(f"{key}.success", 1, group)
            except Exception:
                put_metric(f"{key}.failure", 1, group)
                raise
            finally:
                # Duration is published on both the success and failure paths.
                put_metric(f"{key}.duration.ms", get_elapsed_time_ms(start), group)
            return result

        return wrapper

    # Compare against None explicitly so a callable that happens to be falsy
    # is still wrapped rather than mistaken for "called with arguments".
    if fn is not None:
        return wrap(fn)
    return wrap
@deprecated("Deprecated, use `@prof` instead", category=FutureWarning)
def profile(group=None):
    """
    @profile decorator adds latency and success/failure metrics to any given function.

    Metrics are published through ``publish_metric`` under ``group`` as
    ``<func.__name__>.success`` or ``<func.__name__>.failure`` (the latter only
    for exceptions deriving from ``Exception``, which are re-raised), followed
    by ``<func.__name__>.duration.ms``.

    Args:
        group: metric group the metrics are published to.

    Returns:
        A decorator that wraps the target function.

    Usage

    ::

     @metrics.profile("my_metric_group")
     def some_function(<arguments>):
    """

    def wrap(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Start the clock before entering ``try`` so ``start_time`` is
            # always bound by the time ``finally`` runs.
            start_time = time.time()
            try:
                result = func(*args, **kwargs)
                publish_metric(group, f"{func.__name__}.success", 1)
            except Exception:
                publish_metric(group, f"{func.__name__}.failure", 1)
                raise
            finally:
                # Duration is published on both the success and failure paths.
                publish_metric(
                    group,
                    f"{func.__name__}.duration.ms",
                    get_elapsed_time_ms(start_time),
                )
            return result

        return wrapper

    return wrap
def put_metric(metric_name: str, metric_value: int, metric_group: str = "torchelastic"):
    """
    Publish a metric data point.

    The value is handed to the stream that ``getStream`` returns for
    ``metric_group``.

    Usage

    ::

     put_metric("metric_name", 1)
     put_metric("metric_name", 1, "metric_group_name")
    """
    stream = getStream(metric_group)
    stream.add_value(metric_name, metric_value)
@deprecated(
    "Deprecated, use `put_metric(metric_name, metric_value, metric_group)` instead",
    category=FutureWarning,
)
def publish_metric(metric_group: str, metric_name: str, metric_value: int):
    """Publish a metric data point (deprecated in favor of ``put_metric``).

    Takes the same values as ``put_metric`` but with the group first.

    Args:
        metric_group: group the metric is published under.
        metric_name: name of the metric.
        metric_value: value to record.
    """
    getStream(metric_group).add_value(metric_name, metric_value)
def get_elapsed_time_ms(start_time_in_seconds: float):
    """Return the milliseconds elapsed since ``start_time_in_seconds``, as an ``int``.

    The start time is compared against ``time.time()``; the result is
    truncated toward zero by ``int``.
    """
    elapsed_seconds = time.time() - start_time_in_seconds
    return int(elapsed_seconds * 1000)