Apollo 应用与源码分析:Monitor监控-软件监控-定位、camera、功能安全、数据记录监控

目录

定位监控

代码

分析

备注

camera监控

代码

分析

功能安全监控

代码

分析

CheckSafety函数分析

RunOnce 函数分析

记录功能监控

代码

分析

SmartRecorderStatus proto

状态的上报位置分析

监控信息汇总服务

代码

分析


定位监控

代码

// Recurrent runner that reports the health of the localization component.
// Its RunOnce() translates the latest LocalizationStatus message into the
// component's `other_status`.
class LocalizationMonitor : public RecurrentRunner {
 public:
  LocalizationMonitor();
  // Performs one monitoring pass. The implementation shown in this file does
  // not read `current_time`.
  void RunOnce(const double current_time) override;
};

void LocalizationMonitor::RunOnce(const double current_time) {
  auto manager = MonitorManager::Instance();
  auto* component = apollo::common::util::FindOrNull(
      *manager->GetStatus()->mutable_components(),
      FLAGS_localization_component_name);
  if (component == nullptr) {
    // localization is not monitored in current mode, skip.
    return;
  }

  static auto reader =
      manager->CreateReader<LocalizationStatus>(FLAGS_localization_msf_status);
  reader->Observe();
  const auto status = reader->GetLatestObserved();

  ComponentStatus* component_status = component->mutable_other_status();
  component_status->clear_status();
  if (status == nullptr) {
    SummaryMonitor::EscalateStatus(ComponentStatus::ERROR,
                                   "No LocalizationStatus received",
                                   component_status);
    return;
  }

  // Translate LocalizationStatus to ComponentStatus. Note that ERROR and FATAL
  // will trigger safety mode in current settings.
  switch (status->fusion_status()) {
    case MeasureState::OK:
      SummaryMonitor::EscalateStatus(ComponentStatus::OK, "", component_status);
      break;
    case MeasureState::WARNNING:
      SummaryMonitor::EscalateStatus(
          ComponentStatus::WARN,
          absl::StrCat("WARNNING: ", status->state_message()),
          component_status);
      break;
    case MeasureState::ERROR:
      SummaryMonitor::EscalateStatus(
          ComponentStatus::WARN,
          absl::StrCat("ERROR: ", status->state_message()), component_status);
      break;
    case MeasureState::CRITICAL_ERROR:
      SummaryMonitor::EscalateStatus(
          ComponentStatus::ERROR,
          absl::StrCat("CRITICAL_ERROR: ", status->state_message()),
          component_status);
      break;
    case MeasureState::FATAL_ERROR:
      SummaryMonitor::EscalateStatus(
          ComponentStatus::FATAL,
          absl::StrCat("FATAL_ERROR: ", status->state_message()),
          component_status);
      break;
    default:
      AFATAL << "Unknown fusion_status: " << status->fusion_status();
      break;
  }
}

分析

  1. 判断定位模块是不是属于被监控的模块
  2. 订阅localization_msf_status,如果读不到就报ERROR级别的故障
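  3. 直接读取topic 中的fusion_status,并映射为对应的组件状态:OK→OK,WARNNING→WARN,ERROR→WARN,CRITICAL_ERROR→ERROR,FATAL_ERROR→FATAL(注意MeasureState::ERROR 在这里只上报为WARN)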

备注

## Check MSF Localization Status

We provide a simple way to check lidar localization, GNSS localization and fusion localization status. There are four states {NOT_VALID, NOT_STABLE, OK, VALID} for localization status. You can simply use `rostopic echo /apollo/localization/msf_status` to check localization status. If fusion_status is VALID or OK, the output of msf localization is reliable.

上述是apollo MSF 定位状态的判断逻辑,上述故障都是由业务模块定位部分设置并发出的。

下面是modules/localization/rtk/rtk_localization.cc的状态检测部分(节选,这里只展示了开头对pos_type 缺失情况的处理):

void RTKLocalization::FillLocalizationStatusMsg(
    const drivers::gnss::InsStat &status,
    LocalizationStatus *localization_status) {
  apollo::common::Header *header = localization_status->mutable_header();
  double timestamp = apollo::cyber::Clock::NowInSeconds();
  header->set_timestamp_sec(timestamp);
  localization_status->set_measurement_time(status.header().timestamp_sec());

  if (!status.has_pos_type()) {
    localization_status->set_fusion_status(MeasureState::ERROR);
    localization_status->set_state_message(
        "Error: Current Localization Status Is Missing.");
    return;
  }

camera监控

代码

// Recurrent runner that reports the health of the camera component.
class CameraMonitor : public RecurrentRunner {
 public:
  CameraMonitor();
  // Performs one monitoring pass. The implementation shown in this file does
  // not read `current_time`.
  void RunOnce(const double current_time) override;

 private:
  // Clears `status` and re-escalates it from the messages currently available
  // on the monitored camera topics.
  static void UpdateStatus(ComponentStatus* status);
};
// Refreshes the camera component's `other_status` when the camera component
// is present in the current status map; otherwise does nothing.
// `current_time` is unused.
void CameraMonitor::RunOnce(const double current_time) {
  auto* components =
      MonitorManager::Instance()->GetStatus()->mutable_components();
  auto* camera = apollo::common::util::FindOrNull(*components,
                                                  FLAGS_camera_component_name);
  if (camera != nullptr) {
    UpdateStatus(camera->mutable_other_status());
  }
  // A null lookup means the camera is not monitored in the current mode.
}

分析

除了判断camera是否被配置为监控对象之外,核心逻辑都在UpdateStatus 函数中

// Rebuilds `status` from the camera topics in `camera_topic_set`.
//
// The frame id of the first topic that has both a reader and a message (and
// whose header carries a frame id) is remembered. Any further topic that also
// has a reader and a message escalates `status` to ERROR. After the scan an
// empty frame id escalates to ERROR, otherwise OK is reported together with
// the detected frame id.
void CameraMonitor::UpdateStatus(ComponentStatus* status) {
  status->clear_status();

  std::string detected_frame_id;
  for (const auto& camera_topic : camera_topic_set) {
    const auto& latest = CreateReaderAndLatestsMessage(camera_topic);
    if (latest.first == nullptr || latest.second == nullptr) {
      // No reader or no message for this topic: nothing to inspect.
      continue;
    }
    if (!detected_frame_id.empty()) {
      // A camera was already found on an earlier topic.
      SummaryMonitor::EscalateStatus(
          ComponentStatus::ERROR,
          absl::StrCat("Only one camera is permitted"), status);
      continue;
    }
    const auto& header = latest.second->header();
    if (header.has_frame_id()) {
      detected_frame_id = header.frame_id();
    }
  }

  if (detected_frame_id.empty()) {
    SummaryMonitor::EscalateStatus(
        ComponentStatus::ERROR, absl::StrCat("No camera is detected"), status);
    return;
  }
  SummaryMonitor::EscalateStatus(
      ComponentStatus::OK,
      absl::StrCat("Detected one camera: ", detected_frame_id), status);
}
static const auto camera_topic_set = std::set<std::string>{
    FLAGS_image_long_topic,         FLAGS_camera_image_long_topic,
    FLAGS_camera_image_short_topic, FLAGS_camera_front_6mm_topic,
    FLAGS_camera_front_6mm_2_topic, FLAGS_camera_front_12mm_topic,
    // Add more cameras here if you want to monitor.
};
  1. 依次遍历camera_topic_set 中的每个topic,获取各自最新的消息
  2. 从第一条有效消息的消息头中拿到frame id;如果之后还有其它camera topic 也收到了消息(即同时存在多路camera),就报ERROR
absl::StrCat("Only one camera is permitted"), status);

  3. 遍历结束后如果frame id 仍然是empty(没有任何camera topic 提供带frame id 的消息),就报ERROR

ComponentStatus::ERROR, absl::StrCat("No camera is detected"), status);

功能安全监控

代码

// Check if we need to switch to safe mode, and then
// 1. Notify driver to take action.
// 2. Trigger Guardian if no proper action was taken.
class FunctionalSafetyMonitor : public RecurrentRunner {
 public:
  FunctionalSafetyMonitor();
  // Performs one safety evaluation pass at `current_time`.
  // NOTE(review): unlike the sibling monitors in this file, this declaration
  // is not marked `override` — confirm whether that is intentional.
  void RunOnce(const double current_time);

 private:
  // Returns true when no safety action is needed: either the vehicle is not in
  // autonomous mode, or every safety-required module and component is safe.
  bool CheckSafety();
};
// Updates the safety-mode fields of the shared system status:
//   - CheckSafety() is true: clear the passenger message, the safety-mode
//     trigger time and the emergency-stop flag, then return.
//   - Emergency stop is already required: return without changes.
//   - First unsafe pass: set the passenger message, record `current_time` as
//     the trigger time, then return.
//   - Later unsafe passes: require an emergency stop once
//     trigger time + FLAGS_safety_mode_seconds_before_estop < current_time.
void FunctionalSafetyMonitor::RunOnce(const double current_time) {
  auto* system_status = MonitorManager::Instance()->GetStatus();
  // Everything looks good or has been handled properly.
  if (CheckSafety()) {
    system_status->clear_passenger_msg();
    system_status->clear_safety_mode_trigger_time();
    system_status->clear_require_emergency_stop();
    return;
  }
  if (system_status->require_emergency_stop()) {
    // EStop has already been triggered.
    return;
  }

  // Newly entered safety mode.
  system_status->set_passenger_msg("Error! Please disengage.");
  if (!system_status->has_safety_mode_trigger_time()) {
    // First unsafe pass: only start the countdown, never EStop immediately.
    system_status->set_safety_mode_trigger_time(current_time);
    return;
  }

  // Trigger EStop if no action was taken in time.
  if (system_status->safety_mode_trigger_time() +
          FLAGS_safety_mode_seconds_before_estop <
      current_time) {
    system_status->set_require_emergency_stop(true);
  }
}

分析

CheckSafety函数分析

// Returns true when no safety action is required.
//
// Outside autonomous mode the result is always true. In autonomous mode every
// HMI module and every monitored component that the current HMI mode marks as
// required_for_safety() must pass IsSafe(); the first failure returns false.
bool FunctionalSafetyMonitor::CheckSafety() {
  auto manager = MonitorManager::Instance();
  // We only check safety in self driving mode.
  if (!manager->IsInAutonomousMode()) {
    return true;
  }

  const auto& mode = manager->GetHMIMode();

  // HMI modules: the status is looked up only for safety-required entries.
  const auto& module_statuses = manager->GetStatus()->hmi_modules();
  for (const auto& entry : mode.modules()) {
    if (!entry.second.required_for_safety()) {
      continue;
    }
    const std::string& module_name = entry.first;
    if (!IsSafe(module_name, module_statuses.at(module_name))) {
      return false;
    }
  }

  // Monitored components: judged by their summary status.
  const auto& component_statuses = manager->GetStatus()->components();
  for (const auto& entry : mode.monitored_components()) {
    if (!entry.second.required_for_safety()) {
      continue;
    }
    const std::string& component_name = entry.first;
    if (!IsSafe(component_name,
                component_statuses.at(component_name).summary())) {
      return false;
    }
  }

  // Nothing required for safety reported a problem.
  return true;
}
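  1. 判断是不是在自动驾驶模式下,如果不是直接返回true(视为安全,不做检查)
  2. 依次检查当前HMI mode 中所有标记为required_for_safety 的HMI 模块和被监控组件(组件取summary 状态),通过IsSafe 判断其状态是不是ERROR或者FATAL
  3. 只要有一个是就是不安全(返回false),如果都不是就是安全(返回true)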

RunOnce 函数分析

  1. 检查现在的安全状态
  2. 如果安全就清空之前的状态信息并返回
  3. 如果不安全就判断是不是已经调用了安全处置措施:EStop,如果Estop已经在工作了就返回
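  4. 如果发现ESTOP还没有触发,先设置passenger_msg 提示接管;首次进入不安全状态时只记录safety_mode_trigger_time 并返回;之后每次检查距离该时间是否超过FLAGS_safety_mode_seconds_before_estop,如果没有超时就继续等待,超时则置require_emergency_stop 触发ESTOP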

记录功能监控

recorder monitor 是apollo 对于记录服务的监控,方法是通过订阅/apollo/data/recorder/status 这个topic 获取Recorder status。

代码

// Recurrent runner that reports the health of the smart recorder component.
// Its RunOnce() translates the latest SmartRecorderStatus message into the
// component's `other_status`.
class RecorderMonitor : public RecurrentRunner {
 public:
  RecorderMonitor();
  // Performs one monitoring pass. The implementation shown in this file does
  // not read `current_time`.
  void RunOnce(const double current_time) override;
};

void RecorderMonitor::RunOnce(const double current_time) {
  auto manager = MonitorManager::Instance();
  auto* component = apollo::common::util::FindOrNull(
      *manager->GetStatus()->mutable_components(),
      FLAGS_smart_recorder_component_name);
  if (component == nullptr) {
    // SmartRecorder is not monitored in current mode, skip.
    return;
  }

  static auto reader =
      manager->CreateReader<SmartRecorderStatus>(FLAGS_recorder_status_topic);
  reader->Observe();
  const auto status = reader->GetLatestObserved();

  ComponentStatus* component_status = component->mutable_other_status();
  component_status->clear_status();
  if (status == nullptr) {
    SummaryMonitor::EscalateStatus(ComponentStatus::ERROR,
                                   "No SmartRecorderStatus received",
                                   component_status);
    return;
  }

  // Translate SmartRecorderStatus to ComponentStatus. Note that ERROR and FATAL
  // will trigger safety mode in current settings.
  switch (status->recording_state()) {
    case RecordingState::RECORDING:
      SummaryMonitor::EscalateStatus(ComponentStatus::OK, "", component_status);
      break;
    case RecordingState::TERMINATING:
      SummaryMonitor::EscalateStatus(
          ComponentStatus::WARN,
          absl::StrCat("WARNNING: ", status->state_message()),
          component_status);
      break;
    case RecordingState::STOPPED:
      SummaryMonitor::EscalateStatus(
          ComponentStatus::OK,
          absl::StrCat("STOPPED: ", status->state_message()), component_status);
      break;
    default:
      AFATAL << "Unknown recording status: " << status->recording_state();
      break;
  }
}

分析

第一步依旧是判断recorder 是不是被配置的监控模块,如果不是直接返回。

然后就是直接判断status->recording_state():如果一直没有收到状态消息就报ERROR;RECORDING 和STOPPED 都上报OK(STOPPED 会附带state_message);如果是RecordingState::TERMINATING(终止)状态就报出一个WARNING 的故障

SmartRecorderStatus proto

// Recording state carried in SmartRecorderStatus.recording_state.
// RecorderMonitor::RunOnce reports RECORDING and STOPPED as OK and
// TERMINATING as WARN.
enum RecordingState {
  STOPPED = 0;
  RECORDING = 1;
  TERMINATING = 2;
}

// Status message read by RecorderMonitor.
message SmartRecorderStatus {
  optional apollo.common.Header header = 1;
  // Current state of the recorder.
  optional RecordingState recording_state = 2;
  // Free-form text; RecorderMonitor appends it to the reported message for
  // the TERMINATING and STOPPED states.
  optional string state_message = 3;
}

状态的上报位置分析

modules/data/tools/smart_recorder/realtime_record_processor.cc

我们可以在上述文件中找到recorder状态赋值情况,但是可惜apollo 中目前没有一个模块会主动填写RecordingState::TERMINATING(终止)状态。

监控信息汇总服务

代码

// A monitor which summarize other monitors' result and publish the whole status
// if it has changed.
class SummaryMonitor : public RecurrentRunner {
 public:
  SummaryMonitor();
  // Escalates every component's summary from its individual statuses and
  // publishes the system status when it changed or the publish interval
  // elapsed.
  void RunOnce(const double current_time) override;

  // Escalate the status to a higher priority new status:
  //    FATAL > ERROR > WARN > OK > UNKNOWN.
  static void EscalateStatus(const ComponentStatus::Status new_status,
                             const std::string& message,
                             ComponentStatus* current_status);

 private:
  // Hash of the serialized system status recorded at the last publish.
  size_t system_status_fp_ = 0;
  // `current_time` of the last publish.
  double last_broadcast_ = 0;
};


// Merges each component's process, module, channel, resource and other status
// into its summary, then publishes the whole system status when either its
// fingerprint differs from the last published one or more than
// FLAGS_system_status_publish_interval has passed since the last publish.
void SummaryMonitor::RunOnce(const double current_time) {
  auto manager = MonitorManager::Instance();
  auto* status = manager->GetStatus();
  // Escalate the summary status to the most severe one.
  for (auto& component : *status->mutable_components()) {
    auto* summary = component.second.mutable_summary();
    const auto& process_status = component.second.process_status();
    EscalateStatus(process_status.status(), process_status.message(), summary);
    const auto& module_status = component.second.module_status();
    EscalateStatus(module_status.status(), module_status.message(), summary);
    const auto& channel_status = component.second.channel_status();
    EscalateStatus(channel_status.status(), channel_status.message(), summary);
    const auto& resource_status = component.second.resource_status();
    EscalateStatus(resource_status.status(), resource_status.message(),
                   summary);
    const auto& other_status = component.second.other_status();
    EscalateStatus(other_status.status(), other_status.message(), summary);
  }

  // Get fingerprint of current status.
  // Don't use DebugString() which has known bug on Map field. The string
  // doesn't change though the value has changed.
  static std::hash<std::string> hash_fn;
  std::string proto_bytes;
  status->SerializeToString(&proto_bytes);
  const size_t new_fp = hash_fn(proto_bytes);

  if (system_status_fp_ != new_fp ||
      current_time - last_broadcast_ > FLAGS_system_status_publish_interval) {
    // Function-local static: the writer is created on the first publish.
    static auto writer =
        manager->CreateWriter<SystemStatus>(FLAGS_system_status_topic);

    apollo::common::util::FillHeader("SystemMonitor", status);
    writer->Write(*status);
    // The header is removed again after writing; presumably so that it does
    // not influence the fingerprint computed on the next pass.
    status->clear_header();
    system_status_fp_ = new_fp;
    last_broadcast_ = current_time;
  }
}

分析

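针对前面所有的monitor 上报的故障信息,进行一个整合(把每个组件的process、module、channel、resource、other 五类状态按严重程度汇总到summary),当整体状态发生变化或者距离上次发布超过FLAGS_system_status_publish_interval 时,发送到/apollo/monitor/system_status这个topic 上。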

猜你喜欢

转载自blog.csdn.net/qq_32378713/article/details/128112717