Apollo application and source code analysis: Monitor monitoring-hardware monitoring-CPU, disk, memory resource monitoring logic source code analysis

Table of contents

structural analysis

the code

analyze

Perform subject analysis

the code

analyze

SystemStatus & ComponentStatus

Check disk space analysis

the code

analyze

Check CPU usage

the code

analyze

Check memory usage

the code

analyze

The function to get the memory usage of the system is GetSystemMemoryUsage

The function to get the memory usage of the process is GetMemoryUsage

Check disk load

the code

analyze

Get disk load: GetSystemDiskload


structural analysis

the code

class ResourceMonitor : public RecurrentRunner {
 public:
  ResourceMonitor();
  void RunOnce(const double current_time) override;

 private:
  static void UpdateStatus(
      const apollo::dreamview::ResourceMonitorConfig& config,
      ComponentStatus* status);
  static void CheckDiskSpace(
      const apollo::dreamview::ResourceMonitorConfig& config,
      ComponentStatus* status);
  static void CheckCPUUsage(
      const apollo::dreamview::ResourceMonitorConfig& config,
      ComponentStatus* status);
  static void CheckMemoryUsage(
      const apollo::dreamview::ResourceMonitorConfig& config,
      ComponentStatus* status);
  static void CheckDiskLoads(
      const apollo::dreamview::ResourceMonitorConfig& config,
      ComponentStatus* status);
};

analyze

  1. Inherited RecurrentRunner, rewritten RunOnce, indicating that the main task is in RunOnce
  2. There are 4 static functions to check disk space, CPU usage, memory usage, disk load respectively.
  3. There is a function UpdateStatus that updates the status.

Perform subject analysis

the code

void ResourceMonitor::RunOnce(const double current_time) {
  auto manager = MonitorManager::Instance();
  const auto& mode = manager->GetHMIMode();
  auto* components = manager->GetStatus()->mutable_components();

  for (const auto& iter : mode.monitored_components()) {
    const std::string& name = iter.first;
    const auto& config = iter.second;
    if (config.has_resource()) {
      UpdateStatus(config.resource(),
                   components->at(name).mutable_resource_status());
    }
  }
}

analyze

  1. Create an instance of monitor manage;
  2. Get the HMI configuration according to this example, and get the component to be monitored configured by the HMI
  3. Traverse the components to be monitored and use updatestatus to update the status

Here you need to know some proto, otherwise the class analysis later will not understand.

SystemStatus & ComponentStatus

message ComponentStatus {
  enum Status {
    UNKNOWN = 0;
    OK = 1;
    WARN = 2;
    ERROR = 3;
    FATAL = 4;
  }
  optional Status status = 1 [default = UNKNOWN];
  optional string message = 2;
}

message Component {
  // A summary of all detailed status.
  optional ComponentStatus summary = 1;

  // Detailed status.
  optional ComponentStatus process_status = 2;
  optional ComponentStatus channel_status = 3;
  optional ComponentStatus resource_status = 4;
  optional ComponentStatus other_status = 5;
  optional ComponentStatus module_status = 6;
}

message SystemStatus {
  optional apollo.common.Header header = 1;

  map<string, ComponentStatus> hmi_modules = 7;
  map<string, Component> components = 8;

  // Some critical message for passengers. HMI should highlight it or even read
  // loudly.
  optional string passenger_msg = 4;

  // If we have this field, safety_mode should be triggered.
  // We'll check the system action and driver action continuously. If no proper
  // action was taken in a specified period of time (such as 10 seconds), EStop
  // will be sent to bring the vehicle into emergency full stop.
  optional double safety_mode_trigger_time = 5;
  optional bool require_emergency_stop = 6;

  // In simulation mode, the monitor will publish message with this field set,
  // so subscribers could identify it from the recorded messages.
  optional bool is_realtime_in_simulation = 9;

  // In some modes, other processes besides modules and monitored components may
  // need to be monitored
  map<string, ComponentStatus> other_components = 10;

  reserved 2, 3;
}

Check disk space analysis

the code

void ResourceMonitor::CheckDiskSpace(
    const apollo::dreamview::ResourceMonitorConfig& config,
    ComponentStatus* status) {
  // Monitor available disk space.
  for (const auto& disk_space : config.disk_spaces()) {
    for (const auto& path : cyber::common::Glob(disk_space.path())) {
      const auto space = boost::filesystem::space(path);
      const int available_gb = static_cast<int>(space.available >> 30);
      if (available_gb < disk_space.insufficient_space_error()) {
        const std::string err =
            absl::StrCat(path, " has insufficient space: ", available_gb,
                         "GB < ", disk_space.insufficient_space_error());
        SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
      } else if (available_gb < disk_space.insufficient_space_warning()) {
        const std::string err =
            absl::StrCat(path, " has insufficient space: ", available_gb,
                         "GB < ", disk_space.insufficient_space_warning());
        SummaryMonitor::EscalateStatus(ComponentStatus::WARN, err, status);
      }
    }
  }
}

analyze

First of all, you need to understand what the parameters are. The config parameter is obtained from the mode in manager->GetHMIMode().

mode is obtained from the function below.

HMIConfig HMIWorker::LoadConfig() {
  HMIConfig config;
  // Get available modes, maps and vehicles by listing data directory.
  *config.mutable_modes() =
      ListFilesAsDict(FLAGS_hmi_modes_config_path, ".pb.txt");
  ACHECK(!config.modes().empty())
      << "No modes config loaded from " << FLAGS_hmi_modes_config_path;

  *config.mutable_maps() = ListDirAsDict(FLAGS_maps_data_path);
  *config.mutable_vehicles() = ListDirAsDict(FLAGS_vehicles_config_path);
  AINFO << "Loaded HMI config: " << config.DebugString();
  return config;
}

FLAGS_hmi_modes_config_path can be searched, defined in hmi_work.cc

DEFINE_string(hmi_modes_config_path, "/apollo/modules/dreamview/conf/hmi_modes",
              "HMI modes config path.");

You can know that there are some suffixes under /apollo/modules/dreamview/conf/hmi_modes that are .pb.txt and define config.

Open the corresponding directory, there are the following files below

├── camera-Lidar_sensor_calibration.pb.txt
├── car_teleop.pb.txt
├── console_teleop.pb.txt
├── dev_kit_close_loop.pb.txt
├── dev_kit_debug.pb.txt
├── ipc1_mkz_close_loop.pb.txt
├── ipc1_mkz_standard_debug.pb.txt
├── ipc2_mkz_close_loop.pb.txt
├── ipc2_mkz_standard_debug.pb.txt
├── lidar-IMU_sensor_calibration.pb.txt
├── map_collection.pb.txt
├── mkz_64beam.pb.txt
├── mkz_close_loop.pb.txt
├── mkz_lgsvl.pb.txt
├── mkz_standard_debug.pb.txt
├── mkz_standard_debug_hesai.pb.txt
├── mkz_standard_debug_smart_recorder.pb.txt
├── navigation.pb.txt
├── rtk.pb.txt
└── vehicle_calibration.pb.txt

Then read the contents of the following file into the HMIMode structure

message HMIMode {
  map<string, CyberModule> cyber_modules = 1;
  map<string, Module> modules = 2;
  map<string, MonitoredComponent> monitored_components = 3;
  map<string, ProcessMonitorConfig> other_components = 4;
}

These files define the preset value for the monitoring target, such as: diskspace

monitored_components {
  key: "Recorder"
  value: {
    process {
      command_keywords: "cyber_recorder record"
    }
    resource {
      disk_spaces {
        # For logs.
        path: "/apollo/data"
        insufficient_space_warning: 8
        insufficient_space_error: 2
      }
      disk_spaces {
        # For records.
        path: "/media/apollo/internal_nvme"
        insufficient_space_warning: 128
        insufficient_space_error: 32
      }
    }
  }
}

The storage location of the file and the corresponding error pre-value will be defined.

Look at our function body again, we know

for (const auto& disk_space : config.disk_spaces()) {
    for (const auto& path : cyber::common::Glob(disk_space.path())) {
      const auto space = boost::filesystem::space(path);

This code is to traverse the disk location and the size of the space configured in each monitor config.


 if (available_gb < disk_space.insufficient_space_error()) {
        const std::string err =
            absl::StrCat(path, " has insufficient space: ", available_gb,
                         "GB < ", disk_space.insufficient_space_error());
        SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
      } else if (available_gb < disk_space.insufficient_space_warning()) {
        const std::string err =
            absl::StrCat(path, " has insufficient space: ", available_gb,
                         "GB < ", disk_space.insufficient_space_warning());
        SummaryMonitor::EscalateStatus(ComponentStatus::WARN, err, status);
      }

This section is to alarm through the preset value.

SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);

This paragraph is to use the current highest level of failure to represent the state of the disk.

SummaryMonitor is a part of software monitor and is introduced in software monitor.

Check CPU usage

the code

void ResourceMonitor::CheckCPUUsage(
    const apollo::dreamview::ResourceMonitorConfig& config,
    ComponentStatus* status) {
  for (const auto& cpu_usage : config.cpu_usages()) {
    const auto process_dag_path = cpu_usage.process_dag_path();
    float cpu_usage_value = 0.f;
    if (process_dag_path.empty()) {
      cpu_usage_value = GetSystemCPUUsage();
    } else {
      int pid = 0;
      if (GetPIDByCmdLine(process_dag_path, &pid)) {
        static std::unordered_map<std::string, uint64_t> prev_jiffies_map;
        if (prev_jiffies_map.find(process_dag_path) == prev_jiffies_map.end()) {
          prev_jiffies_map[process_dag_path] = 0;
        }
        cpu_usage_value = GetCPUUsage(pid, process_dag_path, &prev_jiffies_map);
      }
    }
    const auto high_cpu_warning = cpu_usage.high_cpu_usage_warning();
    const auto high_cpu_error = cpu_usage.high_cpu_usage_error();
    if (cpu_usage_value > high_cpu_error) {
      const std::string err = absl::StrCat(
          process_dag_path, " has high cpu usage: ", cpu_usage_value, "% > ",
          high_cpu_error, "%");
      SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
    } else if (cpu_usage_value > high_cpu_warning) {
      const std::string warn = absl::StrCat(
          process_dag_path, " has high cpu usage: ", cpu_usage_value, "% > ",
          high_cpu_warning, "%");
      SummaryMonitor::EscalateStatus(ComponentStatus::WARN, warn, status);
    }
  }
}

analyze

The core logic is to obtain the part occupied by the CPU, which is the same as the disk space analysis above.

Read the configuration, traverse the configuration, and then give the corresponding level of alarm according to the CPU usage

The specific acquisition method is GetSystemCPUUsage, which essentially reads the /proc/stat file and then obtains it by parsing the file.

float GetSystemCPUUsage() {
  const std::string system_cpu_stat_file = "/proc/stat";
  const int users = 1, system = 3, total = 7;
  constexpr static int kSystemCpuInfo = 0;
  static uint64_t prev_jiffies = 0, prev_work_jiffies = 0;
  const auto stat_lines =
      GetStatsLines(system_cpu_stat_file, kSystemCpuInfo + 1);
  if (stat_lines.size() <= kSystemCpuInfo) {
    AERROR << "failed to load contents from " << system_cpu_stat_file;
    return 0.f;
  }
  const std::vector<std::string> jiffies_stats =
      absl::StrSplit(stat_lines[kSystemCpuInfo], ' ', absl::SkipWhitespace());
  if (jiffies_stats.size() <= total) {
    AERROR << "failed to get system CPU info from " << system_cpu_stat_file;
    return 0.f;
  }
  uint64_t jiffies = 0, work_jiffies = 0;
  for (int cur_stat = users; cur_stat <= total; ++cur_stat) {
    const auto cur_stat_value = std::stoll(jiffies_stats[cur_stat]);
    jiffies += cur_stat_value;
    if (cur_stat <= system) {
      work_jiffies += cur_stat_value;
    }
  }
  const uint64_t tmp_prev_jiffies = prev_jiffies;
  const uint64_t tmp_prev_work_jiffies = prev_work_jiffies;
  prev_jiffies = jiffies;
  prev_work_jiffies = work_jiffies;
  if (tmp_prev_jiffies == 0) {
    return 0.f;
  }
  return 100.f * (static_cast<float>(work_jiffies - tmp_prev_work_jiffies) /
                  (jiffies - tmp_prev_jiffies));
}

Check memory usage

the code

void ResourceMonitor::CheckMemoryUsage(
    const apollo::dreamview::ResourceMonitorConfig& config,
    ComponentStatus* status) {
  for (const auto& memory_usage : config.memory_usages()) {
    const auto process_dag_path = memory_usage.process_dag_path();
    float memory_usage_value = 0.f;
    if (process_dag_path.empty()) {
      memory_usage_value = GetSystemMemoryUsage();
    } else {
      int pid = 0;
      if (GetPIDByCmdLine(process_dag_path, &pid)) {
        memory_usage_value = GetMemoryUsage(pid, process_dag_path);
      }
    }
    const auto high_memory_warning = memory_usage.high_memory_usage_warning();
    const auto high_memory_error = memory_usage.high_memory_usage_error();
    if (memory_usage_value > static_cast<float>(high_memory_error)) {
      const std::string err = absl::StrCat(
          process_dag_path, " has high memory usage: ", memory_usage_value,
          " > ", high_memory_error);
      SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
    } else if (memory_usage_value > static_cast<float>(high_memory_warning)) {
      const std::string warn = absl::StrCat(
          process_dag_path, " has high memory usage: ", memory_usage_value,
          " > ", high_memory_warning);
      SummaryMonitor::EscalateStatus(ComponentStatus::WARN, warn, status);
    }
  }
}

analyze

  1. read configuration file
  2. Traverse the configuration to determine whether the process_dag_path is empty, if so, read the memory usage of the entire system, if not, read the memory usage of the process
  3. According to the preset value to be set to determine whether an alarm is required

The function to get the memory usage of the system is GetSystemMemoryUsage

float GetSystemMemoryUsage() {
  const std::string system_mem_stat_file = "/proc/meminfo";
  const int mem_total = 0, mem_free = 1, buffers = 3, cached = 4,
            swap_total = 14, swap_free = 15, slab = 21;
  const auto stat_lines = GetStatsLines(system_mem_stat_file, slab + 1);
  if (stat_lines.size() <= slab) {
    AERROR << "failed to load contents from " << system_mem_stat_file;
    return 0.f;
  }
  const auto total_memory =
      GetSystemMemoryValueFromLine(stat_lines[mem_total]) +
      GetSystemMemoryValueFromLine(stat_lines[swap_total]);
  int64_t used_memory = total_memory;
  for (int cur_line = mem_free; cur_line <= slab; ++cur_line) {
    if (cur_line == mem_free || cur_line == buffers || cur_line == cached ||
        cur_line == swap_free || cur_line == slab) {
      used_memory -= GetSystemMemoryValueFromLine(stat_lines[cur_line]);
    }
  }
  return 100.f * (static_cast<float>(used_memory) / total_memory);
}

The function to get the memory usage of the process is GetMemoryUsage

float GetMemoryUsage(const int pid, const std::string& process_name) {
  const std::string memory_stat_file = absl::StrCat("/proc/", pid, "/statm");
  const uint32_t page_size_kb = (sysconf(_SC_PAGE_SIZE) >> 10);
  const int resident_idx = 1, gb_2_kb = (1 << 20);
  constexpr static int kMemoryInfo = 0;

  const auto stat_lines = GetStatsLines(memory_stat_file, kMemoryInfo + 1);
  if (stat_lines.size() <= kMemoryInfo) {
    AERROR << "failed to load contents from " << memory_stat_file;
    return 0.f;
  }
  const std::vector<std::string> stats =
      absl::StrSplit(stat_lines[kMemoryInfo], ' ', absl::SkipWhitespace());
  if (stats.size() <= resident_idx) {
    AERROR << "failed to get memory info for process " << process_name;
    return 0.f;
  }
  return static_cast<float>(std::stoll(stats[resident_idx]) * page_size_kb) /
         gb_2_kb;
}

They are obtained by reading the files under /proc under linux.

Check disk load

the code

void ResourceMonitor::CheckDiskLoads(
    const apollo::dreamview::ResourceMonitorConfig& config,
    ComponentStatus* status) {
  for (const auto& disk_load : config.disk_load_usages()) {
    const auto disk_load_value = GetSystemDiskload(disk_load.device_name());
    const auto high_disk_load_warning = disk_load.high_disk_load_warning();
    const auto high_disk_load_error = disk_load.high_disk_load_error();
    if (disk_load_value > static_cast<float>(high_disk_load_error)) {
      const std::string err = absl::StrCat(
          disk_load.device_name(), " has high disk load: ", disk_load_value,
          " > ", high_disk_load_error);
      SummaryMonitor::EscalateStatus(ComponentStatus::ERROR, err, status);
    } else if (disk_load_value > static_cast<float>(high_disk_load_warning)) {
      const std::string warn = absl::StrCat(
          disk_load.device_name(), " has high disk load: ", disk_load_value,
          " > ", high_disk_load_warning);
      SummaryMonitor::EscalateStatus(ComponentStatus::WARN, warn, status);
    }
  }
}

analyze

  1. Analyze the configuration file to find the disk load configuration
  2. Traverse the configuration, obtain the actual load of the disk, and judge the fault according to the set threshold

Get disk load: GetSystemDiskload

float GetSystemDiskload(const std::string& device_name) {
  const std::string disks_stat_file = "/proc/diskstats";
  const int device = 2, in_out_ms = 12;
  const int seconds_to_ms = 1000;
  constexpr static int kDiskInfo = 128;
  static uint64_t prev_disk_stats = 0;

  const auto stat_lines = GetStatsLines(disks_stat_file, kDiskInfo);
  uint64_t disk_stats = 0;
  for (const auto& line : stat_lines) {
    const std::vector<std::string> stats =
        absl::StrSplit(line, ' ', absl::SkipWhitespace());
    if (stats[device] == device_name) {
      disk_stats = std::stoll(stats[in_out_ms]);
      break;
    }
  }
  const uint64_t tmp_prev_disk_stats = prev_disk_stats;
  prev_disk_stats = disk_stats;
  if (tmp_prev_disk_stats == 0) {
    return 0.f;
  }
  return 100.f *
         (static_cast<float>(disk_stats - tmp_prev_disk_stats) /
          static_cast<float>(FLAGS_resource_monitor_interval * seconds_to_ms));
}

Guess you like

Origin blog.csdn.net/qq_32378713/article/details/128057369