506 lines
14 KiB
C++
506 lines
14 KiB
C++
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */
|
|
|
|
#pragma once
|
|
|
|
#include "AbstractConfig.h"
|
|
#include "ActivityType.h"
|
|
|
|
#include <assert.h>
|
|
#include <chrono>
|
|
#include <functional>
|
|
#include <set>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
namespace libkineto {
|
|
|
|
class Config : public AbstractConfig {
|
|
public:
|
|
Config();
|
|
Config& operator=(const Config&) = delete;
|
|
Config(Config&&) = delete;
|
|
Config& operator=(Config&&) = delete;
|
|
|
|
// Return a full copy including feature config object
|
|
std::unique_ptr<Config> clone() const {
|
|
auto cfg = std::unique_ptr<Config>(new Config(*this));
|
|
cloneFeaturesInto(*cfg);
|
|
return cfg;
|
|
}
|
|
|
|
bool handleOption(const std::string& name, std::string& val) override;
|
|
|
|
void setClientDefaults() override;
|
|
|
|
// Log events to this file
|
|
const std::string& eventLogFile() const {
|
|
return eventLogFile_;
|
|
}
|
|
|
|
bool activityProfilerEnabled() const {
|
|
return activityProfilerEnabled_ ||
|
|
activitiesOnDemandTimestamp_.time_since_epoch().count() > 0;
|
|
}
|
|
|
|
// Log activitiy trace to this file
|
|
const std::string& activitiesLogFile() const {
|
|
return activitiesLogFile_;
|
|
}
|
|
|
|
// Log activitiy trace to this url
|
|
const std::string& activitiesLogUrl() const {
|
|
return activitiesLogUrl_;
|
|
}
|
|
|
|
void setActivitiesLogUrl(const std::string& url) {
|
|
activitiesLogUrl_ = url;
|
|
}
|
|
|
|
bool activitiesLogToMemory() const {
|
|
return activitiesLogToMemory_;
|
|
}
|
|
|
|
bool eventProfilerEnabled() const {
|
|
return !eventNames_.empty() || !metricNames_.empty();
|
|
}
|
|
|
|
// Is profiling enabled for the given device?
|
|
bool eventProfilerEnabledForDevice(uint32_t dev) const {
|
|
return 0 != (eventProfilerDeviceMask_ & (1 << dev));
|
|
}
|
|
|
|
// Take a sample (read hardware counters) at this frequency.
|
|
// This controls how often counters are read - if all counters cannot
|
|
// be collected simultaneously then multiple samples are needed to
|
|
// collect all requested counters - see multiplex period.
|
|
std::chrono::milliseconds samplePeriod() const {
|
|
return samplePeriod_;
|
|
}
|
|
|
|
void setSamplePeriod(std::chrono::milliseconds period) {
|
|
samplePeriod_ = period;
|
|
}
|
|
|
|
// When all requested counters cannot be collected simultaneously,
|
|
// counters will be multiplexed at this frequency.
|
|
// Multiplexing can have a large performance impact if done frequently.
|
|
// To avoid a perf impact, keep this at 1s or above.
|
|
std::chrono::milliseconds multiplexPeriod() const {
|
|
return multiplexPeriod_;
|
|
}
|
|
|
|
void setMultiplexPeriod(std::chrono::milliseconds period) {
|
|
multiplexPeriod_ = period;
|
|
}
|
|
|
|
// Report counters at this frequency. Note that several samples can
|
|
// be reported each time, see samplesPerReport.
|
|
std::chrono::milliseconds reportPeriod() const {
|
|
return reportPeriod_;
|
|
}
|
|
|
|
void setReportPeriod(std::chrono::milliseconds msecs);
|
|
|
|
// Number of samples dispatched each report period.
|
|
// Must be in the range [1, report period / sample period].
|
|
// In other words, aggregation is supported but not interpolation.
|
|
int samplesPerReport() const {
|
|
return samplesPerReport_;
|
|
}
|
|
|
|
void setSamplesPerReport(int count) {
|
|
samplesPerReport_ = count;
|
|
}
|
|
|
|
// The names of events to collect
|
|
const std::set<std::string>& eventNames() const {
|
|
return eventNames_;
|
|
}
|
|
|
|
// Add additional events to be profiled
|
|
void addEvents(const std::set<std::string>& names) {
|
|
eventNames_.insert(names.begin(), names.end());
|
|
}
|
|
|
|
// The names of metrics to collect
|
|
const std::set<std::string>& metricNames() const {
|
|
return metricNames_;
|
|
}
|
|
|
|
// Add additional metrics to be profiled
|
|
void addMetrics(const std::set<std::string>& names) {
|
|
metricNames_.insert(names.begin(), names.end());
|
|
}
|
|
|
|
const std::vector<int>& percentiles() const {
|
|
return eventReportPercentiles_;
|
|
}
|
|
|
|
// Profile for this long, then revert to base config
|
|
std::chrono::seconds eventProfilerOnDemandDuration() const {
|
|
return eventProfilerOnDemandDuration_;
|
|
}
|
|
|
|
void setEventProfilerOnDemandDuration(std::chrono::seconds duration) {
|
|
eventProfilerOnDemandDuration_ = duration;
|
|
}
|
|
|
|
// Too many event profilers on a single system can overload the driver.
|
|
// At some point, latencies shoot through the roof and collection of samples
|
|
// becomes impossible. To avoid this situation we have a limit of profilers
|
|
// per GPU.
|
|
// NOTE: Communication with a daemon is needed for this feature.
|
|
// Library must be built with an active DaemonConfigLoader.
|
|
int maxEventProfilersPerGpu() const {
|
|
return eventProfilerMaxInstancesPerGpu_;
|
|
}
|
|
|
|
// On Cuda11 we've seen occasional hangs when reprogramming counters
|
|
// Monitor profiling threads and report when a thread is not responding
|
|
// for a given number of seconds.
|
|
// A period of 0 means disable.
|
|
std::chrono::seconds eventProfilerHeartbeatMonitorPeriod() const {
|
|
return eventProfilerHeartbeatMonitorPeriod_;
|
|
}
|
|
|
|
// The types of activities selected in the configuration file
|
|
const std::set<ActivityType>& selectedActivityTypes() const {
|
|
return selectedActivityTypes_;
|
|
}
|
|
|
|
void setSelectedActivityTypes(const std::set<ActivityType>& types) {
|
|
selectedActivityTypes_ = types;
|
|
}
|
|
|
|
bool isReportInputShapesEnabled() const {
|
|
return enableReportInputShapes_;
|
|
}
|
|
|
|
bool isProfileMemoryEnabled() const {
|
|
return enableProfileMemory_;
|
|
}
|
|
|
|
bool isWithStackEnabled() const {
|
|
return enableWithStack_;
|
|
}
|
|
|
|
bool isWithFlopsEnabled() const {
|
|
return enableWithFlops_;
|
|
}
|
|
|
|
bool isWithModulesEnabled() const {
|
|
return enableWithModules_;
|
|
}
|
|
|
|
// Trace for this long
|
|
std::chrono::milliseconds activitiesDuration() const {
|
|
return activitiesDuration_;
|
|
}
|
|
|
|
// Trace for this many iterations, determined by external API
|
|
int activitiesRunIterations() const {
|
|
return activitiesRunIterations_;
|
|
}
|
|
|
|
int activitiesMaxGpuBufferSize() const {
|
|
return activitiesMaxGpuBufferSize_;
|
|
}
|
|
|
|
std::chrono::seconds activitiesWarmupDuration() const {
|
|
return activitiesWarmupDuration_;
|
|
}
|
|
|
|
int activitiesWarmupIterations() const {
|
|
return activitiesWarmupIterations_;
|
|
}
|
|
|
|
// Show CUDA Synchronization Stream Wait Events
|
|
bool activitiesCudaSyncWaitEvents() const {
|
|
return activitiesCudaSyncWaitEvents_;
|
|
}
|
|
|
|
void setActivitiesCudaSyncWaitEvents(bool enable) {
|
|
activitiesCudaSyncWaitEvents_ = enable;
|
|
}
|
|
|
|
// Timestamp at which the profiling to start, requested by the user.
|
|
const std::chrono::time_point<std::chrono::system_clock> requestTimestamp()
|
|
const {
|
|
if (profileStartTime_.time_since_epoch().count()) {
|
|
return profileStartTime_;
|
|
}
|
|
// If no one requested timestamp, return 0.
|
|
if (requestTimestamp_.time_since_epoch().count() == 0) {
|
|
return requestTimestamp_;
|
|
}
|
|
|
|
// TODO(T94634890): Deprecate requestTimestamp
|
|
return requestTimestamp_ + maxRequestAge() + activitiesWarmupDuration();
|
|
}
|
|
|
|
bool hasProfileStartTime() const {
|
|
return requestTimestamp_.time_since_epoch().count() > 0 ||
|
|
profileStartTime_.time_since_epoch().count() > 0;
|
|
}
|
|
|
|
int profileStartIteration() const {
|
|
return profileStartIteration_;
|
|
}
|
|
|
|
bool hasProfileStartIteration() const {
|
|
return profileStartIteration_ >= 0 && activitiesRunIterations_ > 0;
|
|
}
|
|
|
|
void setProfileStartIteration(int iter) {
|
|
profileStartIteration_ = iter;
|
|
}
|
|
|
|
int profileStartIterationRoundUp() const {
|
|
return profileStartIterationRoundUp_;
|
|
}
|
|
|
|
// calculate the start iteration accounting for warmup
|
|
int startIterationIncludingWarmup() const {
|
|
if (!hasProfileStartIteration()) {
|
|
return -1;
|
|
}
|
|
return profileStartIteration_ - activitiesWarmupIterations_;
|
|
}
|
|
|
|
const std::chrono::seconds maxRequestAge() const;
|
|
|
|
// All VLOG* macros will log if the verbose log level is >=
|
|
// the verbosity specified for the verbose log message.
|
|
// Default value is -1, so messages with log level 0 will log by default.
|
|
int verboseLogLevel() const {
|
|
return verboseLogLevel_;
|
|
}
|
|
|
|
// Modules for which verbose logging is enabled.
|
|
// If empty, logging is enabled for all modules.
|
|
const std::vector<std::string>& verboseLogModules() const {
|
|
return verboseLogModules_;
|
|
}
|
|
|
|
bool sigUsr2Enabled() const {
|
|
return enableSigUsr2_;
|
|
}
|
|
|
|
bool ipcFabricEnabled() const {
|
|
return enableIpcFabric_;
|
|
}
|
|
|
|
std::chrono::seconds onDemandConfigUpdateIntervalSecs() const {
|
|
return onDemandConfigUpdateIntervalSecs_;
|
|
}
|
|
|
|
static std::chrono::milliseconds alignUp(
|
|
std::chrono::milliseconds duration,
|
|
std::chrono::milliseconds alignment) {
|
|
duration += alignment;
|
|
return duration - (duration % alignment);
|
|
}
|
|
|
|
std::chrono::time_point<std::chrono::system_clock>
|
|
eventProfilerOnDemandStartTime() const {
|
|
return eventProfilerOnDemandTimestamp_;
|
|
}
|
|
|
|
std::chrono::time_point<std::chrono::system_clock>
|
|
eventProfilerOnDemandEndTime() const {
|
|
return eventProfilerOnDemandTimestamp_ + eventProfilerOnDemandDuration_;
|
|
}
|
|
|
|
std::chrono::time_point<std::chrono::system_clock>
|
|
activityProfilerRequestReceivedTime() const {
|
|
return activitiesOnDemandTimestamp_;
|
|
}
|
|
|
|
static constexpr std::chrono::milliseconds kControllerIntervalMsecs{1000};
|
|
|
|
// Users may request and set trace id and group trace id.
|
|
const std::string& requestTraceID() const {
|
|
return requestTraceID_;
|
|
}
|
|
|
|
void setRequestTraceID(const std::string& tid) {
|
|
requestTraceID_ = tid;
|
|
}
|
|
|
|
const std::string& requestGroupTraceID() const {
|
|
return requestGroupTraceID_;
|
|
}
|
|
|
|
void setRequestGroupTraceID(const std::string& gtid) {
|
|
requestGroupTraceID_ = gtid;
|
|
}
|
|
|
|
size_t cuptiDeviceBufferSize() const {
|
|
return cuptiDeviceBufferSize_;
|
|
}
|
|
|
|
size_t cuptiDeviceBufferPoolLimit() const {
|
|
return cuptiDeviceBufferPoolLimit_;
|
|
}
|
|
|
|
void updateActivityProfilerRequestReceivedTime();
|
|
|
|
void printActivityProfilerConfig(std::ostream& s) const override;
|
|
void setActivityDependentConfig() override;
|
|
|
|
void validate(const std::chrono::time_point<std::chrono::system_clock>&
|
|
fallbackProfileStartTime) override;
|
|
|
|
static void addConfigFactory(
|
|
std::string name,
|
|
std::function<AbstractConfig*(Config&)> factory);
|
|
|
|
void print(std::ostream& s) const;
|
|
|
|
// Config relies on some state with global static lifetime. If other
|
|
// threads are using the config, it's possible that the global state
|
|
// is destroyed before the threads stop. By hanging onto this handle,
|
|
// correct destruction order can be ensured.
|
|
static std::shared_ptr<void> getStaticObjectsLifetimeHandle();
|
|
|
|
bool getTSCTimestampFlag() const{
|
|
return useTSCTimestamp_;
|
|
}
|
|
|
|
void setTSCTimestampFlag(bool flag) {
|
|
useTSCTimestamp_ = flag;
|
|
}
|
|
|
|
private:
|
|
explicit Config(const Config& other) = default;
|
|
|
|
AbstractConfig* cloneDerived(AbstractConfig& parent) const override {
|
|
// Clone from AbstractConfig not supported
|
|
assert(false);
|
|
return nullptr;
|
|
}
|
|
|
|
uint8_t createDeviceMask(const std::string& val);
|
|
|
|
// Adds valid activity types from the user defined string list in the
|
|
// configuration file
|
|
void setActivityTypes(const std::vector<std::string>& selected_activities);
|
|
|
|
// Sets the default activity types to be traced
|
|
void selectDefaultActivityTypes() {
|
|
// If the user has not specified an activity list, add all types
|
|
for (ActivityType t : defaultActivityTypes()) {
|
|
selectedActivityTypes_.insert(t);
|
|
}
|
|
}
|
|
|
|
int verboseLogLevel_;
|
|
std::vector<std::string> verboseLogModules_;
|
|
|
|
// Event profiler
|
|
// These settings are also supported in on-demand mode
|
|
std::chrono::milliseconds samplePeriod_;
|
|
std::chrono::milliseconds reportPeriod_;
|
|
int samplesPerReport_;
|
|
std::set<std::string> eventNames_;
|
|
std::set<std::string> metricNames_;
|
|
|
|
// On-demand duration
|
|
std::chrono::seconds eventProfilerOnDemandDuration_;
|
|
// Last on-demand request
|
|
std::chrono::time_point<std::chrono::system_clock>
|
|
eventProfilerOnDemandTimestamp_;
|
|
|
|
int eventProfilerMaxInstancesPerGpu_;
|
|
|
|
// Monitor whether event profiler threads are stuck
|
|
// at this frequency
|
|
std::chrono::seconds eventProfilerHeartbeatMonitorPeriod_;
|
|
|
|
// These settings can not be changed on-demand
|
|
std::string eventLogFile_;
|
|
std::vector<int> eventReportPercentiles_ = {5, 25, 50, 75, 95};
|
|
uint8_t eventProfilerDeviceMask_ = ~0;
|
|
std::chrono::milliseconds multiplexPeriod_;
|
|
|
|
// Activity profiler
|
|
bool activityProfilerEnabled_;
|
|
std::set<ActivityType> selectedActivityTypes_;
|
|
|
|
// The activity profiler settings are all on-demand
|
|
std::string activitiesLogFile_;
|
|
|
|
std::string activitiesLogUrl_;
|
|
|
|
// Log activities to memory buffer
|
|
bool activitiesLogToMemory_{false};
|
|
|
|
int activitiesMaxGpuBufferSize_;
|
|
std::chrono::seconds activitiesWarmupDuration_;
|
|
int activitiesWarmupIterations_;
|
|
bool activitiesCudaSyncWaitEvents_;
|
|
|
|
// Enable Profiler Config Options
|
|
// Temporarily disable shape collection until we re-roll out the feature for on-demand cases
|
|
bool enableReportInputShapes_{false};
|
|
bool enableProfileMemory_{false};
|
|
bool enableWithStack_{false};
|
|
bool enableWithFlops_{false};
|
|
bool enableWithModules_{false};
|
|
|
|
// Profile for specified iterations and duration
|
|
std::chrono::milliseconds activitiesDuration_;
|
|
int activitiesRunIterations_;
|
|
|
|
// Below are not used
|
|
// Use this net name for iteration count
|
|
std::string activitiesExternalAPIIterationsTarget_;
|
|
// Only profile nets that includes this in the name
|
|
std::vector<std::string> activitiesExternalAPIFilter_;
|
|
// Only profile nets with at least this many operators
|
|
int activitiesExternalAPINetSizeThreshold_;
|
|
// Only profile nets with at least this many GPU operators
|
|
int activitiesExternalAPIGpuOpCountThreshold_;
|
|
// Last activity profiler request
|
|
std::chrono::time_point<std::chrono::system_clock>
|
|
activitiesOnDemandTimestamp_;
|
|
|
|
// ActivityProfilers are triggered by either:
|
|
// Synchronized start timestamps
|
|
std::chrono::time_point<std::chrono::system_clock> profileStartTime_;
|
|
// Or start iterations.
|
|
int profileStartIteration_;
|
|
int profileStartIterationRoundUp_;
|
|
|
|
// DEPRECATED
|
|
std::chrono::time_point<std::chrono::system_clock> requestTimestamp_;
|
|
|
|
// Enable profiling via SIGUSR2
|
|
bool enableSigUsr2_;
|
|
|
|
// Enable IPC Fabric instead of thrift communication
|
|
bool enableIpcFabric_;
|
|
std::chrono::seconds onDemandConfigUpdateIntervalSecs_;
|
|
|
|
// Logger Metadata
|
|
std::string requestTraceID_;
|
|
std::string requestGroupTraceID_;
|
|
|
|
// CUPTI Device Buffer
|
|
size_t cuptiDeviceBufferSize_;
|
|
size_t cuptiDeviceBufferPoolLimit_;
|
|
|
|
// CUPTI Timestamp Format
|
|
bool useTSCTimestamp_{true};
|
|
};
|
|
|
|
// NOTE(review): judging by its name, this is the name of an environment
// variable related to daemon usage; its consumers are not visible in this
// header - confirm against the code that reads it.
constexpr char kUseDaemonEnvVar[] = "KINETO_USE_DAEMON";
|
|
|
|
} // namespace libkineto
|