Framework-Watchdog

Problemhintergrund

Seit ich mich mit Stabilitätsthemen beschäftige, weiß ich, dass das Framework über einen Watchdog verfügt, der überwacht, ob der Hauptprozess hängen bleibt.
Hängt dieser 60 Sekunden lang, beendet der Watchdog den Prozess, in dem er selbst läuft – nämlich system_server –, und die obere Schicht (das Framework) wird neu gestartet.
Suchen Sie im Log einfach nach dem Schlüsseleintrag „WATCHDOG KILLING SYSTEM PROCESS“.

Wie stellt der Watchdog also fest, dass das System hängt? Ein kurzer Blick in den Watchdog-Code hilft, das zu verstehen.

Watchdog

Zur leichteren Analyse wurden dem Quellcode einige Log-Ausgaben hinzugefügt:

    /**
     * Main watchdog loop, annotated with extra log statements for analysis.
     *
     * <p>Each iteration schedules a check on every registered handler checker, waits for
     * half of the watchdog timeout (measured with {@code SystemClock.uptimeMillis()}), and
     * then evaluates the combined completion state of all checkers:
     * <ul>
     *   <li>{@code COMPLETED}: reset {@code waitedHalf} and start the next round;</li>
     *   <li>{@code WAITING}: start the next round;</li>
     *   <li>{@code WAITED_HALF}: the first time, log diagnostics via {@code logWatchog}
     *       and keep waiting; later occurrences just start the next round;</li>
     *   <li>anything else (overdue): log diagnostics, ask the activity controller (if any),
     *       and kill this process unless a debugger is/was attached or restart is not
     *       allowed.</li>
     * </ul>
     *
     * <p>The {@code ...} lines mark code the article elided from the original source.
     */
    private void run() {
    
    
        boolean waitedHalf = false;

        while (true) {
    
    
            List<HandlerChecker> blockedCheckers = Collections.emptyList();
            String subject = "";
            boolean allowRestart = true;
            int debuggerWasConnected = 0;
            boolean doWaitedHalfDump = false;
            // The value of mWatchdogTimeoutMillis might change while we are executing the loop.
            // We store the current value to use a consistent value for all handlers.
            final long watchdogTimeoutMillis = mWatchdogTimeoutMillis;
            final long checkIntervalMillis = watchdogTimeoutMillis / 2;
            // Values observed in the captured log:
            //   watchdogTimeoutMillis = 60000
            //   checkIntervalMillis   = 30000
            Slog.i(TAG, "watchdog run watchdogTimeoutMillis: " + watchdogTimeoutMillis+" checkIntervalMillis: "+checkIntervalMillis);
            final ArrayList<Integer> pids;
            ...
            synchronized (mLock) {
    
    
                long timeout = checkIntervalMillis;
                // timeout = 30000 in the captured log
                Slog.d(TAG, "watchdog run timeout 111 : " + timeout);
                // Make sure we (re)spin the checkers that have become idle within
                // this wait-and-check interval
                for (int i=0; i<mHandlerCheckers.size(); i++) {
    
    
                    HandlerCheckerAndTimeout hc = mHandlerCheckers.get(i);
                    // We pick the watchdog to apply every time we reschedule the checkers. The
                    // default timeout might have changed since the last run.
                    // scheduleCheckLocked records the start time of the check.
                    hc.checker().scheduleCheckLocked(hc.customTimeoutMillis()
                            .orElse(watchdogTimeoutMillis * Build.HW_TIMEOUT_MULTIPLIER));
                }

                if (debuggerWasConnected > 0) {
    
    
                    debuggerWasConnected--;
                }

                // NOTE: We use uptimeMillis() here because we do not want to increment the time we
                // wait while asleep. If the device is asleep then the thing that we are waiting
                // to timeout on is asleep as well and won't have a chance to run, causing a false
                // positive on when to kill things.
                long start = SystemClock.uptimeMillis();
                // start is the uptime in milliseconds, i.e. time counted since system boot.
                Slog.i(TAG, "watchdog run start: " + start);
                while (timeout > 0) {
    
    
                    if (Debug.isDebuggerConnected()) {
    
    
                        debuggerWasConnected = 2;
                    }
                    try {
    
    
                        // Wait for the remaining interval (30000 ms on the first pass in the
                        // captured log).
                        mLock.wait(timeout);
                        // Note: mHandlerCheckers and mMonitorChecker may have changed after waiting
                    } catch (InterruptedException e) {
    
    
                        Log.wtf(TAG, e);
                    }
                    if (Debug.isDebuggerConnected()) {
    
    
                        debuggerWasConnected = 2;
                    }
                    // Remaining time = check interval (30000) minus the time elapsed since start.
                    // Author's note: if the code itself took no time, this would approach 0
                    // after the 30000 ms wait above; because executing code always takes some
                    // time, the printed timeout is typically 0 or slightly negative (-1, -2 ms).
                    timeout = checkIntervalMillis - (SystemClock.uptimeMillis() - start);
                    Slog.d(TAG, "watchdog run timeout 222 : " + timeout);
                }
                // Fetch the combined checker state; the kill/restart decision below is
                // driven mainly by this value.
                final int waitState = evaluateCheckerCompletionLocked();
                Slog.d(TAG, "watchdog run waitState : " + waitState);
                // All checkers finished within the check interval: go straight to the next round.
                if (waitState == COMPLETED) {
    
    
                    // The monitors have returned; reset
                    waitedHalf = false;
                    continue;
                } else if (waitState == WAITING) {
    
    
                    // still waiting but within their configured intervals; back off and recheck
                    ...
                    continue;
                } else if (waitState == WAITED_HALF) {
    
    
                    // waitedHalf starts out false and is set to true on the first half-timeout,
                    // so this branch body runs on the first half-timeout only and is not
                    // repeated on the following round.
                    // Together with doWaitedHalfDump below this gives: first timeout -> only
                    // dump/log, second timeout -> restart.
                    if (!waitedHalf) {
    
    
                        Slog.i(TAG, "WAITED_HALF");
                        waitedHalf = true;
                        // We've waited half, but we'd need to do the stack trace dump w/o the lock.
                        blockedCheckers = getCheckersWithStateLocked(WAITED_HALF);
                        subject = describeCheckersLocked(blockedCheckers);
                        pids = new ArrayList<>(mInterestingJavaPids);
                        doWaitedHalfDump = true;
                    } else {
    
    
                        continue;
                    }
                } else {
    
    
                    // Reaching this branch means the process is about to be killed and the
                    // system restarted (subject to the controller/debugger/allowRestart
                    // checks further down).
                    // something is overdue!
                    blockedCheckers = getCheckersWithStateLocked(OVERDUE);
                    subject = describeCheckersLocked(blockedCheckers);
                    allowRestart = mAllowRestart;
                    pids = new ArrayList<>(mInterestingJavaPids);
                }
            } // END synchronized (mLock)

            // If we got here, that means that the system is most likely hung.
            //
            // First collect stack traces from all threads of the system process.
            //
            // Then, if we reached the full timeout, kill this process so that the system will
            // restart. If we reached half of the timeout, just log some information and continue.
            logWatchog(doWaitedHalfDump, subject, pids);
            // doWaitedHalfDump is set to true on the first (half) timeout so that the log above
            // is still written; only the second timeout goes on to the actual kill.
            if (doWaitedHalfDump) {
    
    
                // We have waited for only half of the timeout, we continue to wait for the duration
                // of the full timeout before killing the process.
                continue;
            }

            IActivityController controller;
            synchronized (mLock) {
    
    
                controller = mController;
            }
            if (controller != null) {
    
    
                Slog.i(TAG, "Reporting stuck state to activity controller");
                try {
    
    
                    Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
                    // 1 = keep waiting, -1 = kill system
                    int res = controller.systemNotResponding(subject);
                    if (res >= 0) {
    
    
                        Slog.i(TAG, "Activity controller requested to coninue to wait");
                        waitedHalf = false;
                        continue;
                    }
                } catch (RemoteException e) {
    
    
                    // Nothing is done with the exception here; execution falls through to
                    // the kill decision below.
                }
            }
            // Checks whether a debugger is attached so that the system process is not killed
            // while it is being debugged (NOTE(review): the original note presumably refers to
            // being stopped at a breakpoint); this path is not taken in normal operation.
            // Only kill the process if the debugger is not attached.
            if (Debug.isDebuggerConnected()) {
    
    
                debuggerWasConnected = 2;
            }
            if (debuggerWasConnected >= 2) {
    
    
                Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
            } else if (debuggerWasConnected > 0) {
    
    
                Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
            } else if (!allowRestart) {
    
    
                Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
            } else {
    
    
                // This is where the system process actually gets killed.
                Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
                WatchdogDiagnostics.diagnoseCheckers(blockedCheckers);
                Slog.w(TAG, "*** GOODBYE!");
                if (!Build.IS_USER && isCrashLoopFound()
                        && !WatchdogProperties.should_ignore_fatal_count().orElse(false)) {
    
    
                    breakCrashLoop();
                }
                Process.killProcess(Process.myPid());
                System.exit(10);
            }

            waitedHalf = false;
        }
    }

Bei der gesamten Methode sind zwei Punkte zu beachten: Wie wird festgestellt, ob das System hängt, und wie erhält waitState seinen Wert?

1、hc.checker().scheduleCheckLocked
/**
 * Starts a new check on this checker's handler unless one is unnecessary or already
 * running.
 *
 * <p>The given timeout is always stored in {@code mWaitMaxMillis}. If the previous check
 * has completed, monitors queued in {@code mMonitorQueue} are moved into {@code mMonitors}.
 * The check is then skipped (and marked completed) when the checker is paused, or when
 * there are no monitors and the target looper's queue is currently polling. Otherwise, if
 * no check is in flight, the start time is recorded and this object is posted to the
 * front of the handler's queue.
 *
 * @param handlerCheckerTimeoutMillis maximum wait time to apply to this checker, in ms
 */
public void scheduleCheckLocked(long handlerCheckerTimeoutMillis) {
    mWaitMaxMillis = handlerCheckerTimeoutMillis;

    if (mCompleted) {
        // No check is running on the handler right now, so pending monitors can be
        // merged into the active list without racing with run().
        mMonitors.addAll(mMonitorQueue);
        mMonitorQueue.clear();
    }

    // Skip case 1: no monitors to execute and the looper is polling, i.e. it is idle and
    // therefore not stuck, so posting a checker would only cost a context switch.
    // This shortcut only applies without monitors, since those would need to be executed.
    final boolean idleLooperWithoutMonitors =
            mMonitors.isEmpty() && mHandler.getLooper().getQueue().isPolling();
    // Skip case 2: the checker is paused; nothing is scheduled until it is resumed.
    if (idleLooperWithoutMonitors || mPauseCount > 0) {
        mCompleted = true;
        return;
    }

    if (mCompleted) {
        // Mark the check as in flight and remember when it started.
        mCompleted = false;
        mCurrentMonitor = null;
        mStartTimeMillis = SystemClock.uptimeMillis();
        // mWaitMaxMillis is 60000 in the captured log.
        Slog.i(TAG, "scheduleCheckLocked mWaitMaxMillis: " + mWaitMaxMillis);
        // mStartTimeMillis is an uptime value (milliseconds counted since boot).
        Slog.i(TAG, "scheduleCheckLocked mStartTimeMillis: " + mStartTimeMillis);
        // Post this checker to the handler so that the check itself runs there.
        mHandler.postAtFrontOfQueue(this);
    }
    // Otherwise a check is already in flight; there is nothing more to schedule.
}

HandlerChecker.run (wird über mHandler ausgeführt)

/**
 * Executes one check: calls {@code monitor()} on every monitor in {@code mMonitors} in
 * order, then marks the check as completed.
 *
 * <p>The monitor count is read once up front. New monitors are first added to
 * {@code mMonitorQueue} and only moved into {@code mMonitors} on a later schedule when
 * {@code mCompleted} is true, so {@code mMonitors} does not change while this method runs.
 */
@Override
public void run() {
    final int monitorCount = mMonitors.size();
    int index = 0;
    while (index < monitorCount) {
        // Publish the monitor that is about to run while holding the lock.
        synchronized (mLock) {
            mCurrentMonitor = mMonitors.get(index);
        }
        Slog.i(TAG, "HandlerChecker mCurrentMonitor: " + mCurrentMonitor);
        // Runs outside mLock; the check stays incomplete for as long as this call blocks.
        mCurrentMonitor.monitor();
        index++;
    }

    // Every monitor returned: flag the check as completed.
    synchronized (mLock) {
        mCompleted = true;
        mCurrentMonitor = null;
    }
}

Aufgezeichnetes Log
Aus dem Log geht hervor, dass Watchdog.run() und HandlerChecker.run() nicht im selben Thread laufen (Thread-ID 1198 bzw. 1191).

S022A1B  09-28 18:41:00.033  1132  1198 I Watchdog: watchdog run watchdogTimeoutMillis: 60000 checkIntervalMillis: 30000
S022A1D  09-28 18:41:00.034  1132  1198 D Watchdog: watchdog run timeout 111 : 30000
S022A1E  09-28 18:41:00.034  1132  1198 I Watchdog: scheduleCheckLocked mWaitMaxMillis: 60000
S022A1F  09-28 18:41:00.035  1132  1198 I Watchdog: scheduleCheckLocked mStartTimeMillis: 4496639
S022A20  09-28 18:41:00.036  1132  1198 I Watchdog: watchdog run start: 4496641
S022A21  09-28 18:41:00.036  1132  1191 I Watchdog: HandlerChecker mCurrentMonitor: com.android.server.Watchdog$BinderThreadMonitor@75334b9
S022A22  09-28 18:41:00.037  1132  1191 I Watchdog: HandlerChecker mCurrentMonitor: com.android.server.am.UnisocActivityManagerServiceImpl@7dfd5fd
S022A24  09-28 18:41:00.038  1132  1191 I Watchdog: HandlerChecker mCurrentMonitor: com.android.server.power.PowerManagerService@877361c
S022A25  09-28 18:41:00.038  1132  1191 I Watchdog: HandlerChecker mCurrentMonitor: com.android.server.wm.UnisocWindowManagerService@ede477e
S022A26  09-28 18:41:00.039  1132  1191 I Watchdog: HandlerChecker mCurrentMonitor: com.android.server.input.InputManagerService@2e6c225
S022A27  09-28 18:41:00.041  1132  1191 I Watchdog: HandlerChecker mCurrentMonitor: com.android.server.StorageManagerService@9061ffa
S022A28  09-28 18:41:00.043  1132  1191 I Watchdog: HandlerChecker mCurrentMonitor: com.android.server.media.MediaSessionService@efcfeeb
S022A29  09-28 18:41:00.043  1132  1191 I Watchdog: HandlerChecker mCurrentMonitor: com.android.server.media.MediaRouterService@c265448
S022A2A  09-28 18:41:00.044  1132  1191 I Watchdog: HandlerChecker mCurrentMonitor: com.android.server.media.projection.MediaProjectionManagerService@1b002e1
S022A2B  09-28 18:41:00.044  1132  1191 I Watchdog: HandlerChecker mCurrentMonitor: com.android.server.am.BatteryStatsService@3d7f806
S022FCD  09-28 18:41:30.037  1132  1198 D Watchdog: watchdog run timeout 222 : 0
S022FCE  09-28 18:41:30.037  1132  1198 D Watchdog: watchdog run waitState : 0

Die Methode monitor() selbst tut nichts Besonderes: Sie holt sich lediglich die Synchronisierungssperre, um zu prüfen, ob der Dienst gerade mit anderen Aufgaben beschäftigt ist.
Ist der Dienst über längere Zeit mit anderen Aufgaben beschäftigt, lässt sich mLock nicht erhalten.

frameworks/base/services/core/java/com/android/server/power/PowerManagerService.java

    /**
     * Watchdog monitor callback: acquires {@code mLock} and releases it again immediately.
     *
     * <p>The call only returns once the lock could be taken, so it blocks for as long as
     * another holder keeps {@code mLock}.
     */
    public void monitor() {
    
    
        // Grab and release lock for watchdog monitor to detect deadlocks.
        synchronized (mLock) {
    
    
            // Intentionally empty: acquiring the lock is the whole check.
        }
    }
2、evaluateCheckerCompletionLocked
    /**
     * Combines the completion states of all handler checkers into one value.
     *
     * @return the numerically largest state reported by any checker, or {@code COMPLETED}
     *         if no checker reports a larger one (presumably the state constants are
     *         ordered by severity with {@code COMPLETED} lowest; confirm against their
     *         declarations)
     */
    private int evaluateCheckerCompletionLocked() {
        int worst = COMPLETED;
        int index = 0;
        while (index < mHandlerCheckers.size()) {
            final HandlerChecker checker = mHandlerCheckers.get(index).checker();
            final int current = checker.getCompletionStateLocked();
            // Keep the larger of the running result and this checker's state.
            if (current > worst) {
                worst = current;
            }
            index++;
        }
        return worst;
    }
/**
 * Reports how far along the current check of this checker is.
 *
 * @return {@code COMPLETED} if the check has finished; otherwise, based on the uptime
 *         elapsed since {@code mStartTimeMillis}: {@code WAITING} below half of
 *         {@code mWaitMaxMillis}, {@code WAITED_HALF} below {@code mWaitMaxMillis}, and
 *         {@code OVERDUE} beyond that
 */
public int getCompletionStateLocked() {
    Slog.i(TAG, "getCompletionStateLocked mCompleted: " + mCompleted);
    // A check that finished (or was skipped) in time has already set mCompleted.
    if (mCompleted) {
        return COMPLETED;
    }

    // Still running: classify by how long the check has been blocked so far.
    final long elapsedMillis = SystemClock.uptimeMillis() - mStartTimeMillis;
    Slog.i(TAG, "getCompletionStateLocked latency: " + elapsedMillis);

    if (elapsedMillis < mWaitMaxMillis / 2) {
        // Author's note: with the 30000 ms wait in the watchdog loop and the 60000 ms
        // timeout seen in the log, this branch is not expected to be reached.
        return WAITING;
    }
    if (elapsedMillis < mWaitMaxMillis) {
        // First stage: blocked for at least half of the allowed time.
        return WAITED_HALF;
    }
    // Second stage: blocked for the full allowed time.
    return OVERDUE;
}

Supongo que te gusta

Origin blog.csdn.net/a396604593/article/details/133390501
Recomendado
Clasificación