Android Watchdog分析

初始化

Watchdog作为一个独立的线程在SystemServer进程中被初始化:

 /**
  * Excerpt of the bootstrap-services phase (the rest of the method body is elided).
  *
  * <p>Obtains the Watchdog via {@code Watchdog.getInstance()} and starts it, wrapping
  * the two calls in a "StartWatchdog" trace section.
  *
  * @param t trace helper used to open and close the "StartWatchdog" section
  */
 private void startBootstrapServices(@NonNull TimingsTraceAndSlog t) {
    
    
         // Start the watchdog as early as possible so we can crash the system server
        // if we deadlock during early boot
        t.traceBegin("StartWatchdog");
        final Watchdog watchdog = Watchdog.getInstance();
        watchdog.start();
        t.traceEnd();
 }

Watchdog类没有在定义时实现Runnable接口,但其实现了run()方法,类变量 private final Thread mThread; 在构造器中被初始化,watchdog.start();开始执行此线程。

构造器中为"foreground thread"、"main thread"、"ui thread"、"i/o thread"、"display thread"、"animation thread"、"surface animation thread"等线程添加了HandlerChecker,并通过addMonitor()注册了BinderThreadMonitor。

    /**
     * Private constructor (excerpt; part of the body is elided).
     *
     * <p>Creates the "watchdog" thread that will execute {@code run()}, registers one
     * HandlerChecker per common thread (foreground, main, ui, i/o, display, animation,
     * surface animation), each with DEFAULT_TIMEOUT, adds a BinderThreadMonitor, and
     * records this process's pid in mInterestingJavaPids.
     */
    private Watchdog() {
    
    
        // The thread is only created here; it is not started in this constructor.
        mThread = new Thread(this::run, "watchdog");
        // Initialize handler checkers for each common thread we want to check.  Note
        // that we are not currently checking the background thread, since it can
        // potentially hold longer running operations with no guarantees about the timeliness
        // of operations there.

        // The shared foreground thread is the main checker.  It is where we
        // will also dispatch monitor checks and do other work.
        mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
                "foreground thread", DEFAULT_TIMEOUT);
        mHandlerCheckers.add(mMonitorChecker);
        // Add checker for main thread.  We only do a quick check since there
        // can be UI running on the thread.
        mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
                "main thread", DEFAULT_TIMEOUT));
        // Add checker for shared UI thread.
        mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
                "ui thread", DEFAULT_TIMEOUT));
        // And also check IO thread.
        mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
                "i/o thread", DEFAULT_TIMEOUT));
        // And the display thread.
        mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(),
                "display thread", DEFAULT_TIMEOUT));
        // And the animation thread.
        mHandlerCheckers.add(new HandlerChecker(AnimationThread.getHandler(),
                "animation thread", DEFAULT_TIMEOUT));
        // And the surface animation thread.
        mHandlerCheckers.add(new HandlerChecker(SurfaceAnimationThread.getHandler(),
                "surface animation thread", DEFAULT_TIMEOUT));

        // Initialize monitor for Binder threads.
        addMonitor(new BinderThreadMonitor());

        // Our own process is always in the list of pids of interest.
        mInterestingJavaPids.add(Process.myPid());

        // See the notes on DEFAULT_TIMEOUT.
        assert DB ||
                DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS;

        mTraceErrorLogger = new TraceErrorLogger();
    }

AMS, PKMS, WMS等会在初始化时将自己注册到Watchdog中:AMS在构造器中调用addMonitor()和addThread(),PKMS在构造器中调用addThread(),WMS则在onInitReady()中调用addMonitor():

/**
 * Constructor excerpt (the rest of the body is elided): registers this service with the
 * Watchdog both as a monitor and, through its handler, as a watched thread.
 */
public ActivityManagerService() {
    
    
    // Register this object as a Watchdog monitor.
    Watchdog.getInstance().addMonitor(this);
    // Also have the Watchdog check mHandler, using the single-argument addThread overload.
    Watchdog.getInstance().addThread(mHandler);
}
/**
 * Constructor excerpt (the rest of the body is elided): asks the Watchdog to check this
 * service's handler with an explicit timeout.
 */
public PackageManagerService() {
    
    
    // Uses WATCHDOG_TIMEOUT instead of the single-argument addThread overload.
    Watchdog.getInstance().addThread(mHandler, WATCHDOG_TIMEOUT);
}
public WindowManagerService{
    
    
      public void onInitReady() {
    
    
        // Add ourself to the Watchdog monitors.
        Watchdog.getInstance().addMonitor(this);
    }
}
工作流程

Watchdog作为单独执行的线程,在run()方法中循环监测所有HandlerChecker的状态,导出异常进程的运行日志,必要时给当前进程(system_server)发送signal 9,杀掉此进程。

public void run{
    
    
        while (true) {
    
    
            synchronized (mLock) {
    
    
                //1. 遍历所有HandlerChecker
                for (int i=0; i<mHandlerCheckers.size(); i++) {
    
    
                    HandlerChecker hc = mHandlerCheckers.get(i);
                    hc.scheduleCheckLocked();
                }

                //2. mLock.wait(timeout);使当前线程处于等待状态,等待时间为timeout = CHECK_INTERVAL: 30s
                // NOTE: We use uptimeMillis() here because we do not want to increment the time we
                // wait while asleep. If the device is asleep then the thing that we are waiting
                // to timeout on is asleep as well and won't have a chance to run, causing a false
                // positive on when to kill things.
                long start = SystemClock.uptimeMillis();
                while (timeout > 0) {
    
    
                    Log.d(TAG, "run: timeout = " + timeout);
                    if (Debug.isDebuggerConnected()) {
    
    
                        debuggerWasConnected = 2;
                    }
                    try {
    
    
                        mLock.wait(timeout);
                        // Note: mHandlerCheckers and mMonitorChecker may have changed after waiting
                    } catch (InterruptedException e) {
    
    
                        Log.wtf(TAG, e);
                    }
                    if (Debug.isDebuggerConnected()) {
    
    
                        debuggerWasConnected = 2;
                    }
                    timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
                }

                //3. 监测HandlerChecker的完成状态
                final int waitState = evaluateCheckerCompletionLocked();
                if (waitState == COMPLETED) {
    
    
                    waitedHalf = false;
                    continue;
                } else if (waitState == WAITING) {
    
    
                    continue;
                } else if (waitState == WAITED_HALF) {
    
    
                    if (!waitedHalf) {
    
    
                        waitedHalf = true;
                        pids = new ArrayList<>(mInterestingJavaPids);
                        doWaitedHalfDump = true;
                    } else {
    
    
                        continue;
                    }
                } else {
    
    
                    // 存在超时的 HandlerChecker !!!
                    // something is overdue!
                    blockedCheckers = getBlockedCheckersLocked();
                    subject = describeCheckersLocked(blockedCheckers);
                    allowRestart = mAllowRestart;
                    pids = new ArrayList<>(mInterestingJavaPids);
                }
            } // END synchronized (mLock)

            // 4. 导出异常日志 ANR:/data/anr
            final File finalStack = ActivityManagerService.dumpStackTraces(
                    pids, processCpuTracker, new SparseArray<>(), nativePids,
                    tracesFileException, subject);

            // 5. 导出异常日志 dropbox:/data/system/dropbox/
            Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
    
    
                    public void run() {
    
    
                        if (mActivity != null) {
    
    
                            mActivity.addErrorToDropBox(
                                    "watchdog", null, "system_server", null, null, null,
                                    null, report.toString(), finalStack, null, null, null,
                                    errorId);
                        }
                    }
            };
            dropboxThread.start();
            try {
    
    
                dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
            } catch (InterruptedException ignored) {
    
    }

            // 6. 导出异常日志到 kernel log后关机(trigger kernel panic), 通过/proc/sysrq-trigger触发
            if (crashOnWatchdog) {
    
    
                // Trigger the kernel to dump all blocked threads, and backtraces
                // on all CPUs to the kernel log
                Slog.e(TAG, "Triggering SysRq for system_server watchdog");
                doSysRq('w');
                doSysRq('l');

                // wait until the above blocked threads be dumped into kernel log
                SystemClock.sleep(3000);

                doSysRq('c');
            }

            // 7. 向ActivityController汇报当前状态
            IActivityController controller;
            if (controller != null) {
    
    
                Slog.i(TAG, "Reporting stuck state to activity controller");
                try {
    
    
                    Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
                    // 1 = keep waiting, -1 = kill system
                    int res = controller.systemNotResponding(subject);
                    if (res >= 0) {
    
    
                        Slog.i(TAG, "Activity controller requested to coninue to wait");
                        waitedHalf = false;
                        continue;
                    }
                } catch (RemoteException e) {
    
    
                }
            }

            // 8. 判断是否需要杀掉当前进程(system_server进程) Process.killProcess(Process.myPid())
            // Only kill the process if the debugger is not attached.
            if (Debug.isDebuggerConnected()) {
    
    
                debuggerWasConnected = 2;
            }
            if (debuggerWasConnected >= 2) {
    
    
                Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
            } else if (debuggerWasConnected > 0) {
    
    
                Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
            } else if (!allowRestart) {
    
    
                Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
            } else {
    
    
                Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
                WatchdogDiagnostics.diagnoseCheckers(blockedCheckers);
                Slog.w(TAG, "*** GOODBYE!");
                if(SmartTraceUtils.isPerfettoDumpEnabled() && dueTime > SystemClock.uptimeMillis()){
    
    
                    long timeDelta = dueTime - SystemClock.uptimeMillis();
                    // wait until perfetto log to be dumped completely
                    Slog.i(TAG,"Sleep "+ timeDelta
                            +" ms to make sure perfetto log to be dumped completely");
                    SystemClock.sleep(timeDelta);
                }
                if (!Build.IS_USER && isCrashLoopFound()
                        && !WatchdogProperties.should_ignore_fatal_count().orElse(false)) {
    
    
                    breakCrashLoop();
                }
                Process.killProcess(Process.myPid());
                System.exit(10);
            }
            waitedHalf = false;
        }
    }
检测机制

Watchdog在初始化时将一些重要线程添加到HandlerChecker列表中,通过HandlerChecker对各个监测对象进行监测。

HandlerChecker大致可以分为两类:

  • Monitor Checker,用于检查Monitor对象可能发生的死锁。AMS, WMS等核心的系统服务都是Monitor对象,它们的monitor()方法由绑定在FgThread("foreground thread")上的mMonitorChecker调用。
  • Looper Checker,用于检查线程的消息队列是否长时间处于工作状态。Watchdog自身的消息队列,Ui, Io, Display这些全局的消息队列都是被检查的对象。此外,一些重要的线程的消息队列,也会加入到Looper Checker中,譬如AMS, PKMS,这些是在对应的对象初始化时加入的。
    /**
     * Registers a monitor with the watchdog by handing it to the monitor checker.
     * The hand-off is performed while holding {@code mLock}.
     *
     * @param monitor the monitor to register
     */
    public void addMonitor(Monitor monitor) {
        synchronized (mLock) {
            mMonitorChecker.addMonitorLocked(monitor);
        }
    }

    /**
     * Convenience overload: registers the given handler for checking with
     * {@code DEFAULT_TIMEOUT} as its timeout.
     *
     * @param thread handler of the thread to be checked
     */
    public void addThread(Handler thread) {
        addThread(thread, DEFAULT_TIMEOUT);
    }

HandlerChecker是Watchdog的内部类,实现了Runnable接口。

从上面Watchdog的工作流程中可以看到,Watchdog主要通过HandlerChecker的scheduleCheckLocked()方法监测线程状态。

在scheduleCheckLocked()方法开头,如果上一轮检查已经完成(mCompleted为true),会把mMonitorQueue中新注册的Monitor合并到类变量mMonitors中;mMonitors保存了该HandlerChecker需要检查的所有Monitor对象,如上文说的AMS, WMS等。

下面主要关注scheduleCheckLocked()方法中的两行代码:

  1. 通过*mHandler.getLooper().getQueue().isPolling()*方法判断Looper是否依然活跃而不是卡住。对于Looper Checker(没有Monitor的HandlerChecker)而言,如果消息队列正处于轮询(空闲等待)状态,说明线程没有阻塞,本轮无需再投递检查任务;如果被监测的消息队列一直闲不下来,则说明它可能已经阻塞了很长时间。

  2. mHandler.postAtFrontOfQueue(this); 将HandlerChecker自身置于消息队列最前面,优先运行。mHandler.postAtFrontOfQueue(Runnable r)的参数为Runnable对象,因此HandlerChecker中实现的run()方法会在被监测的mHandler所在线程中执行,并依次调用各Monitor实现的monitor()方法。monitor()的实现一般很简单,就是获取当前对象的锁;如果该锁已经被其他线程持有,monitor()会一直阻塞,直到Watchdog判定超时,这种情况下很可能是发生了死锁。

/**
 * Runnable that is posted to a monitored Handler (excerpt; fields and other members are
 * elided). When it runs, it calls every registered monitor and then marks itself completed.
 */
public final class HandlerChecker implements Runnable {
    
    
		/**
		 * Schedules one check on the monitored handler unless a check is already in
		 * flight, the checker is paused, or the target looper is polling and there are
		 * no monitors to run.
		 */
		public void scheduleCheckLocked() {
    
    
            if (mCompleted) {
    
    
                // Safe to update monitors in queue, Handler is not in the middle of work
                mMonitors.addAll(mMonitorQueue);
                mMonitorQueue.clear();
            }
            if ((mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling())
                    || (mPauseCount > 0)) {
    
    
                // Don't schedule until after resume OR
                // If the target looper has recently been polling, then
                // there is no reason to enqueue our checker on it since that
                // is as good as it not being deadlocked.  This avoid having
                // to do a context switch to check the thread. Note that we
                // only do this if we have no monitors since those would need to
                // be executed at this point.
                mCompleted = true;
                return;
            }
            if (!mCompleted) {
    
    
                // we already have a check in flight, so no need
                return;
            }

            // Start a new check: clear the completed flag, record the start time and
            // post this checker at the front of the target handler's queue.
            mCompleted = false;
            mCurrentMonitor = null;
            mStartTime = SystemClock.uptimeMillis();
            mHandler.postAtFrontOfQueue(this);
        }
    
        /**
         * Body of the check posted by scheduleCheckLocked(): calls monitor() on each entry
         * of mMonitors in order, then sets mCompleted under mLock.
         */
        @Override
        public void run() {
    
    
            // Once we get here, we ensure that mMonitors does not change even if we call
            // #addMonitorLocked because we first add the new monitors to mMonitorQueue and
            // move them to mMonitors on the next schedule when mCompleted is true, at which
            // point we have completed execution of this method.
            final int size = mMonitors.size();
            for (int i = 0 ; i < size ; i++) {
    
    
                synchronized (mLock) {
    
    
                    mCurrentMonitor = mMonitors.get(i);
                }
                // Called outside mLock, so a monitor() that blocks does not hold mLock.
                mCurrentMonitor.monitor();
            }

            synchronized (mLock) {
    
    
                mCompleted = true;
                mCurrentMonitor = null;
            }
        }
}
Monitor

Monitor是Watchdog的内部接口:

/** Excerpt of Watchdog showing only its nested monitor callback interface. */
public class Watchdog {

    /** Callback implemented by objects that want to be checked by the watchdog. */
    public interface Monitor {
        /** Performs this object's check; takes no arguments and returns nothing. */
        void monitor();
    }
}

AMS的monitor()实现:

public class ActivityManagerService extends IActivityManager.Stub
        implements Watchdog.Monitor, BatteryStatsImpl.BatteryCallback, ActivityManagerGlobalLock {

    /**
     * Watchdog probe: acquires and immediately releases this object's own lock. The call
     * returns only once the lock could be taken, which shows that it is not stuck.
     */
    public void monitor() {
        // Intentionally empty: acquiring the lock is the whole check.
        synchronized (this) { }
    }
}

猜你喜欢

转载自blog.csdn.net/qq_36063677/article/details/122109633
今日推荐