Table of contents
Send time delay report function analysis
the code
// Recurrent runner that collects latency records published by other modules,
// accumulates them per message, and periodically publishes an aggregated
// latency report.
class LatencyMonitor : public RecurrentRunner {
public:
// Definition not shown in this excerpt.
LatencyMonitor();
// Consumes the latency records observed since the previous call and publishes
// a report once more than FLAGS_latency_report_interval has elapsed since the
// last flush (and at least one record has been collected).
void RunOnce(const double current_time) override;
// Definition not shown in this excerpt; presumably looks up channel_name in
// freq_map_, writes the value to *freq and returns whether it was found --
// TODO confirm against the implementation.
bool GetFrequency(const std::string& channel_name, double* freq);
private:
// Stores every record of `records` in track_map_ and refreshes the frequency
// entry of the reporting module in freq_map_.
void UpdateStat(
const std::shared_ptr<apollo::common::LatencyRecordMap>& records);
// Fills latency_report_, writes it to the reporting topic, then clears the
// report and track_map_ so the next interval starts empty.
void PublishLatencyReport();
// Derives per-module and end-to-end latency entries of latency_report_ from
// the records held in track_map_.
void AggregateLatency();
// Report message reused across publications; cleared after each write.
apollo::common::LatencyReport latency_report_;
// message_id -> ordered set of (begin_time, end_time, module_name) records
// collected for that message since the last published report.
std::unordered_map<uint64_t,
std::set<std::tuple<uint64_t, uint64_t, std::string>>>
track_map_;
// module_name -> most recently computed frequency (records per second of the
// last batch that module reported).
std::unordered_map<std::string, double> freq_map_;
// Value of current_time at which the last report interval was flushed.
double flush_time_ = 0.0;
};
// Feeds the latency records observed since the previous call into the
// statistics and, when the report interval has elapsed, publishes a report.
//
// current_time: timestamp of this run, compared against flush_time_ using
//               FLAGS_latency_report_interval.
void LatencyMonitor::RunOnce(const double current_time) {
  static auto record_reader =
      MonitorManager::Instance()->CreateReader<LatencyRecordMap>(
          FLAGS_latency_recording_topic);
  record_reader->SetHistoryDepth(FLAGS_latency_reader_capacity);
  record_reader->Observe();

  // Key of the message that headed the previous round. Reaching it again means
  // everything from there on was already consumed (presumably the reader
  // yields the newest message first -- confirm against the reader API).
  static std::string previous_head_key;

  std::string head_key;
  bool at_head = true;
  for (auto msg = record_reader->Begin(); msg != record_reader->End(); ++msg) {
    const std::string key =
        absl::StrCat((*msg)->module_name(), (*msg)->header().sequence_num());
    if (at_head) {
      head_key = key;
      at_head = false;
    }
    if (key == previous_head_key) {
      break;
    }
    UpdateStat(*msg);
  }
  previous_head_key = head_key;

  const bool report_due =
      current_time - flush_time_ > FLAGS_latency_report_interval;
  if (!report_due) {
    return;
  }

  // The interval restarts even when there is nothing to publish.
  flush_time_ = current_time;
  if (!track_map_.empty()) {
    PublishLatencyReport();
  }
}
Analysis
Before the analysis, recall that the latency measurements between module channels discussed previously are implemented through LatencyMonitor: its job is to collect the individual latency records and aggregate them into a report.
RunOnce function analysis
RunOnce subscribes to the topic named by FLAGS_latency_recording_topic; the message type is LatencyRecordMap:
// A single latency sample: the begin/end timestamps recorded for one message.
message LatencyRecord {
// Timestamp at which the measured span began (unit is not stated in this
// schema; presumably nanoseconds -- confirm against the producer).
optional uint64 begin_time = 1;
// Timestamp at which the measured span ended, in the same unit as begin_time.
optional uint64 end_time = 2;
// Identifier of the message being measured; LatencyMonitor groups records by
// this value.
optional uint64 message_id = 3;
};
// A batch of latency records reported by one module.
message LatencyRecordMap {
// Standard header; LatencyMonitor uses its sequence_num together with
// module_name to recognise batches it has already processed.
optional apollo.common.Header header = 1;
// Name identifying the reporter of this batch. LatencyMonitor compares it
// with FLAGS_pointcloud_topic, so it may be a channel name rather than a
// module name -- confirm against the producer.
optional string module_name = 2;
// The latency samples contained in this batch.
repeated LatencyRecord latency_records = 3;
};
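RunOnce then traverses the messages received on this subscription and calls UpdateStat on each one to update the statistics; the traversal stops as soon as it reaches the message that headed the previous round, so already-processed messages are not counted twice.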
UpdateStat function analysis
// Folds one batch of latency records into the monitor's state.
//
// Every record is stored in track_map_ under its message id. When the batch
// is non-empty and spans a positive time range, the module's entry in
// freq_map_ is replaced by (number of records) / (time span in seconds).
//
// records: the batch reported by a single module.
void LatencyMonitor::UpdateStat(
    const std::shared_ptr<LatencyRecordMap>& records) {
  const auto& module = records->module_name();
  const auto& entries = records->latency_records();

  for (const auto& entry : entries) {
    track_map_[entry.message_id()].emplace(entry.begin_time(),
                                           entry.end_time(), module);
  }

  if (entries.empty()) {
    return;
  }

  // The span runs from the first record's begin to the last record's end.
  const auto first_begin = entries.begin()->begin_time();
  const auto last_end = entries.rbegin()->end_time();
  if (last_end <= first_begin) {
    // No positive time span: keep the previously stored frequency.
    return;
  }

  const auto span_seconds =
      apollo::cyber::Time(last_end - first_begin).ToSecond();
  freq_map_[module] = entries.size() / span_seconds;
}
- Save the latency record (begin time, end time, module name) of each message to track_map_, keyed by message id
- Update the frequency of the reporting module in freq_map_
Send time delay report function analysis
// Publishes one latency report and resets the accumulated state.
//
// The report header is filled first, AggregateLatency() then populates the
// report from track_map_, and the result is written to the topic named by
// FLAGS_latency_reporting_topic. Afterwards both the report and track_map_
// are emptied so the next interval starts from scratch.
void LatencyMonitor::PublishLatencyReport() {
  static auto report_writer =
      MonitorManager::Instance()->CreateWriter<LatencyReport>(
          FLAGS_latency_reporting_topic);

  apollo::common::util::FillHeader("LatencyReport", &latency_report_);
  AggregateLatency();
  report_writer->Write(latency_report_);

  // Reset the collected records and every populated part of the report.
  track_map_.clear();
  latency_report_.clear_e2es_latency();
  latency_report_.clear_modules_latency();
  latency_report_.clear_header();
}
// Summarizes the records collected in track_map_ into latency_report_.
//
// Two groups of samples are built and handed to SetLatency (defined elsewhere;
// presumably it computes min/max/average/sample_size -- confirm):
//   * module latency: end_time - begin_time of every record, grouped by
//     module name;
//   * E2E latency: for each message, the time from the begin of its first
//     FLAGS_pointcloud_topic record to the begin of the first later record of
//     every other module, grouped by that module's name.
void LatencyMonitor::AggregateLatency() {
  static const std::string kE2EStartPoint = FLAGS_pointcloud_topic;

  std::unordered_map<std::string, std::vector<uint64_t>> modules_track;
  std::unordered_map<std::string, std::vector<uint64_t>> e2es_track;

  // Aggregate modules latencies
  for (const auto& message : track_map_) {
    for (const auto& record : message.second) {
      const uint64_t begin_time = std::get<0>(record);
      const uint64_t end_time = std::get<1>(record);
      if (end_time < begin_time) {
        // Malformed record: the unsigned subtraction would wrap around to a
        // huge bogus latency and distort the max/average, so skip it.
        continue;
      }
      modules_track[std::get<2>(record)].push_back(end_time - begin_time);
    }
  }

  // Aggregate E2E latencies
  // Modules that already contributed an E2E sample for the current message.
  std::unordered_set<std::string> reported_modules;
  for (const auto& message : track_map_) {
    // 0 doubles as the "start point not seen yet" marker.
    uint64_t e2e_begin_time = 0;
    reported_modules.clear();

    // The set is ordered by (begin_time, end_time, module_name), so records
    // are visited in non-decreasing begin_time and the subtraction below
    // cannot underflow once the start point has been found.
    for (const auto& record : message.second) {
      const uint64_t begin_time = std::get<0>(record);
      const std::string& module_name = std::get<2>(record);

      if (module_name == kE2EStartPoint) {
        // Only the first start-point record defines the E2E origin.
        if (e2e_begin_time == 0) {
          e2e_begin_time = begin_time;
        }
        continue;
      }
      if (e2e_begin_time == 0) {
        // Records preceding the start point are not part of the chain.
        continue;
      }
      // Only the first record of each module counts for this message.
      if (reported_modules.insert(module_name).second) {
        e2es_track[module_name].push_back(begin_time - e2e_begin_time);
      }
    }
  }

  // The results could be in the following format:
  // e2e latency:
  //   pointcloud -> perception: min(500), max(600), average(550),
  //                             sample_size(1500)
  //   pointcloud -> planning: min(800), max(1000), average(900),
  //                           sample_size(1500)
  //   pointcloud -> control: min(1200), max(1300), average(1250),
  //                          sample_size(1500)
  //   ...
  // modules latency:
  //   perception: min(5), max(50), average(30), sample_size(1000)
  //   prediction: min(500), max(5000), average(2000), sample_size(800)
  //   control: min(500), max(800), average(600), sample_size(800)
  //   ...
  auto* modules_latency = latency_report_.mutable_modules_latency();
  for (const auto& module : modules_track) {
    SetLatency(module.first, module.second, modules_latency);
  }

  auto* e2es_latency = latency_report_.mutable_e2es_latency();
  for (const auto& e2e : e2es_track) {
    SetLatency(absl::StrCat(kE2EStartPoint, " -> ", e2e.first), e2e.second,
               e2es_latency);
  }
}
It can be seen that there are mainly two parts of the time delay:
- Time Delay for All Modules
- E2E time delay
E2E here means end-to-end latency: the time from the point cloud data (the record whose module name equals FLAGS_pointcloud_topic) to each module in Apollo, measured in the code above up to the begin time of that module's record.
The logic of E2E Latency:
- For each message, take the begin time of its first point cloud record as the E2E starting point.
- Then go through the remaining records that are not point cloud records in order; for the first record of each module, the difference between its begin time and the E2E starting point is that module's E2E latency sample for this reporting cycle.
Remark
The latency monitoring relies on CyberRT communication, so the CyberRT channel communication mechanism itself must be reliable enough; otherwise the reported latencies will be wrong.