From e6f81cf1f69e0683f969238f921950befba8e6c3 Mon Sep 17 00:00:00 2001 From: Christopher Tate Date: Wed, 23 Oct 2013 17:28:27 -0700 Subject: Support different watchdog timeouts for different entities We need to be able to perform very lengthy operations on some threads (e.g. the I/O thread responsible for installing multi-gigabyte APKs) but still have long-run deadlock/hang detection applied to those threads. Previously the watchdog mechanism applied the same policy to all monitored threads: unresponsive after 60 seconds => restart the system. Now, each monitored entity can have its own independent timeout after which the watchdog declares deadlock and restarts the runtime. The halfway-finished intermediate thread stacks are dumped based on the specific entity's declared timeout, not the global 30 second checking interval. With that new mechanism in place, the Package Manager's lengthy-I/O thread watchdog timeout is raised to 10 minutes. Bug 11278188 Change-Id: I512599260009c31416b2385f778681e5b9597f05 --- services/java/com/android/server/Watchdog.java | 113 ++++++++++++++------- .../android/server/pm/PackageManagerService.java | 11 +- 2 files changed, 87 insertions(+), 37 deletions(-) (limited to 'services/java/com/android/server') diff --git a/services/java/com/android/server/Watchdog.java b/services/java/com/android/server/Watchdog.java index 616090e..8054788 100644 --- a/services/java/com/android/server/Watchdog.java +++ b/services/java/com/android/server/Watchdog.java @@ -58,7 +58,14 @@ public class Watchdog extends Thread { // Set this to true to have the watchdog record kernel thread stacks when it fires static final boolean RECORD_KERNEL_THREADS = true; - static final int TIME_TO_WAIT = DB ? 5*1000 : 30*1000; + static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000; + static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2; + + // These are temporally ordered: larger values as lateness increases + static final int COMPLETED = 0; + static final int WAITING = 1; + static final int WAITED_HALF = 2; + static final int OVERDUE = 3; static final String[] NATIVE_STACKS_OF_INTEREST = new String[] { "/system/bin/mediaserver", @@ -87,13 +94,17 @@ public class Watchdog extends Thread { public final class HandlerChecker implements Runnable { private final Handler mHandler; private final String mName; + private final long mWaitMax; private final ArrayList mMonitors = new ArrayList(); private boolean mCompleted; private Monitor mCurrentMonitor; + private long mStartTime; - HandlerChecker(Handler handler, String name) { + HandlerChecker(Handler handler, String name, long waitMaxMillis) { mHandler = handler; mName = name; + mWaitMax = waitMaxMillis; + mCompleted = true; } public void addMonitor(Monitor monitor) { @@ -111,13 +122,34 @@ public class Watchdog extends Thread { mCompleted = true; return; } + + if (!mCompleted) { + // we already have a check in flight, so no need + return; + } + mCompleted = false; mCurrentMonitor = null; + mStartTime = SystemClock.uptimeMillis(); mHandler.postAtFrontOfQueue(this); } - public boolean isCompletedLocked() { - return mCompleted; + public boolean isOverdueLocked() { + return (!mCompleted) && (SystemClock.uptimeMillis() > mStartTime + mWaitMax); + } + + public int getCompletionStateLocked() { + if (mCompleted) { + return COMPLETED; + } else { + long latency = SystemClock.uptimeMillis() - mStartTime; + if (latency < mWaitMax/2) { + return WAITING; + } else if (latency < mWaitMax) { + return WAITED_HALF; + } + } + return OVERDUE; } public Thread getThread() { @@ -186,16 +218,19 @@ public class Watchdog extends Thread { // The shared foreground thread is the main checker. It is where we // will also dispatch monitor checks and do other work. - mMonitorChecker = new HandlerChecker(FgThread.getHandler(), "foreground thread"); + mMonitorChecker = new HandlerChecker(FgThread.getHandler(), + "foreground thread", DEFAULT_TIMEOUT); mHandlerCheckers.add(mMonitorChecker); // Add checker for main thread. We only do a quick check since there // can be UI running on the thread. mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()), - "main thread")); + "main thread", DEFAULT_TIMEOUT)); // Add checker for shared UI thread. - mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(), "ui thread")); + mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(), + "ui thread", DEFAULT_TIMEOUT)); // And also check IO thread. - mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(), "i/o thread")); + mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(), + "i/o thread", DEFAULT_TIMEOUT)); } public void init(Context context, BatteryService battery, @@ -242,11 +277,15 @@ public class Watchdog extends Thread { } public void addThread(Handler thread, String name) { + addThread(thread, name, DEFAULT_TIMEOUT); + } + + public void addThread(Handler thread, String name, long timeoutMillis) { synchronized (this) { if (isAlive()) { throw new RuntimeException("Threads can't be added once the Watchdog is running"); } - mHandlerCheckers.add(new HandlerChecker(thread, name)); + mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis)); } } @@ -259,21 +298,20 @@ public class Watchdog extends Thread { pms.reboot(false, reason, false); } - private boolean haveAllCheckersCompletedLocked() { + private int evaluateCheckerCompletionLocked() { + int state = COMPLETED; for (int i=0; i getBlockedCheckersLocked() { ArrayList checkers = new ArrayList(); for (int i=0; i pids = new ArrayList(); - pids.add(Process.myPid()); - ActivityManagerService.dumpStackTraces(true, pids, null, null, - NATIVE_STACKS_OF_INTEREST); - waitedHalf = true; + } else if (waitState == WAITING) { + // still waiting but within their configured intervals; back off and recheck + continue; + } else if (waitState == WAITED_HALF) { + if (!waitedHalf) { + // We've waited half the deadlock-detection interval. Pull a stack + // trace and wait another half. + ArrayList pids = new ArrayList(); + pids.add(Process.myPid()); + ActivityManagerService.dumpStackTraces(true, pids, null, null, + NATIVE_STACKS_OF_INTEREST); + waitedHalf = true; + } continue; } + // something is overdue! blockedCheckers = getBlockedCheckersLocked(); subject = describeCheckersLocked(blockedCheckers); allowRestart = mAllowRestart; diff --git a/services/java/com/android/server/pm/PackageManagerService.java b/services/java/com/android/server/pm/PackageManagerService.java index e075862..a781d5f 100755 --- a/services/java/com/android/server/pm/PackageManagerService.java +++ b/services/java/com/android/server/pm/PackageManagerService.java @@ -221,6 +221,14 @@ public class PackageManagerService extends IPackageManager.Stub { static final int REMOVE_CHATTY = 1<<16; /** + * Timeout (in milliseconds) after which the watchdog should declare that + * our handler thread is wedged. The usual default for such things is one + * minute but we sometimes do very lengthy I/O operations on this thread, + * such as installing multi-gigabyte applications, so ours needs to be longer. + */ + private static final long WATCHDOG_TIMEOUT = 1000*60*10; // ten minutes + + /** * Whether verification is enabled by default. */ private static final boolean DEFAULT_VERIFY_ENABLE = true; @@ -1115,7 +1123,8 @@ public class PackageManagerService extends IPackageManager.Stub { synchronized (mPackages) { mHandlerThread.start(); mHandler = new PackageHandler(mHandlerThread.getLooper()); - Watchdog.getInstance().addThread(mHandler, mHandlerThread.getName()); + Watchdog.getInstance().addThread(mHandler, mHandlerThread.getName(), + WATCHDOG_TIMEOUT); File dataDir = Environment.getDataDirectory(); mAppDataDir = new File(dataDir, "data"); -- cgit v1.1