Keith,
The following patch allows kdb and nmi_watchdog to "live in harmony".
Prior to this, if kdb was configured, the nmi_watchdog
code was disabled, so if the system got into a "hard" hang
with interrupts disabled, there was no way to get into kdb.
With this code, nmi_watchdog is re-enabled on i386.
On a hard hang, the watchdog waits 5 seconds and then drops through
nmi_watchdog: it prints the NMI "oops" (in case all else fails) and
drops into kdb. Using this capability helped us uncover a pesky
pagebuf bug this afternoon.
Take a look and let me know what you think.
-Ted Kline
----------------------------------------
diff -u .base/linux/arch/i386/kernel/traps.c linux/arch/i386/kernel/traps.c
--- .base/linux/arch/i386/kernel/traps.c Thu Jun 1 11:09:10 2000
+++ linux/arch/i386/kernel/traps.c Fri Jun 2 13:34:44 2000
@@ -533,11 +533,10 @@
#if CONFIG_X86_IO_APIC
-#if defined(CONFIG_KDB)
-int nmi_watchdog = 0;
-#else
int nmi_watchdog = 1;
-#endif
+#if defined(CONFIG_KDB)
+int nmi_watchdog_in_kdb = 0;
+#endif /* defined(CONFIG_KDB) */
static int __init setup_nmi_watchdog(char *str)
{
@@ -550,10 +549,18 @@
extern spinlock_t console_lock;
static spinlock_t nmi_print_lock = SPIN_LOCK_UNLOCKED;
+#if defined(CONFIG_SMP) && defined(CONFIG_KDB)
+static void
+do_ack_apic_irq(void)
+{
+ ack_APIC_irq();
+}
+#endif
+
inline void nmi_watchdog_tick(struct pt_regs * regs)
{
/*
- * the best way to detect wether a CPU has a 'hard lockup' problem
+ * the best way to detect whether a CPU has a 'hard lockup' problem
* is to check it's local APIC timer IRQ counts. If they are not
* changing then that CPU has some problem.
*
@@ -584,35 +591,56 @@
* wait a few IRQs (5 seconds) before doing the oops ...
*/
alert_counter[cpu]++;
- if (alert_counter[cpu] == 5*HZ) {
+ if (alert_counter[cpu] >= 5*HZ) {
+ int s;
+
spin_lock(&nmi_print_lock);
+#if defined(CONFIG_SMP) && defined(CONFIG_KDB)
+ if (nmi_watchdog_in_kdb) {
+ spin_unlock(&nmi_print_lock);
+ /*
+ * We're already in the kernel debugger
+ * from another cpu, try to join in,
+ * otherwise skip a beat.
+ */
+ (void)kdb_ipi((kdb_eframe_t)regs,
+ do_ack_apic_irq);
+ return;
+ }
+#endif
/*
* We are in trouble anyway, lets at least try
* to get a message out.
*/
- spin_trylock(&console_lock);
+ s = spin_trylock(&console_lock);
spin_unlock(&console_lock);
+
printk("NMI Watchdog detected LOCKUP on CPU%d, registers:\n", cpu);
show_registers(regs);
+#if defined(CONFIG_KDB)
+ nmi_watchdog_in_kdb |= 1 << cpu;
+
+ spin_unlock(&nmi_print_lock);
+
+ kdb(KDB_REASON_WATCHDOG, 0, regs);
+
+ spin_lock(&nmi_print_lock);
+ nmi_watchdog_in_kdb &= ~(1 << cpu);
+ spin_unlock(&nmi_print_lock);
+
+#else /* ! defined(CONFIG_KDB) */
printk("console shuts up ...\n");
console_silent();
spin_unlock(&nmi_print_lock);
do_exit(SIGSEGV);
+#endif /* ! defined(CONFIG_KDB) */
}
} else {
last_irq_sums[cpu] = sum;
alert_counter[cpu] = 0;
}
}
-#endif
-
-#if defined(CONFIG_SMP) && defined(CONFIG_KDB)
-static void
-do_ack_apic_irq(void)
-{
- ack_APIC_irq();
-}
-#endif
+#endif /* CONFIG_X86_IO_APIC */
asmlinkage void do_nmi(struct pt_regs * regs, long error_code)
{
@@ -620,6 +648,7 @@
atomic_inc(&nmi_counter(smp_processor_id()));
+
#if defined(CONFIG_SMP) && defined(CONFIG_KDB)
/*
* Call the kernel debugger to see if this NMI is due
@@ -631,6 +660,14 @@
#endif
if (!(reason & 0xc0)) {
#if CONFIG_X86_IO_APIC
+#if defined(CONFIG_KDB)
+ /*
+ * If we're in Kdb, just let the
+ * NMI's "slide by".
+ */
+ if (kdb_active || nmi_watchdog_in_kdb)
+ return;
+#endif /* defined(CONFIG_KDB) */
/*
* Ok, so this is none of the documented NMI sources,
* so it must be the NMI watchdog.
@@ -640,9 +677,9 @@
return;
} else
unknown_nmi_error(reason, regs);
-#else
+#else /* ! CONFIG_X86_IO_APIC */
unknown_nmi_error(reason, regs);
-#endif
+#endif /* ! CONFIG_X86_IO_APIC */
return;
}
if (reason & 0x80)
diff -u .base/linux/include/linux/kdb.h linux/include/linux/kdb.h
--- .base/linux/include/linux/kdb.h Thu Jun 1 11:30:54 2000
+++ linux/include/linux/kdb.h Thu Jun 1 11:29:53 2000
@@ -117,7 +117,8 @@
KDB_REASON_SWITCH, /* CPU switch - regs valid*/
KDB_REASON_INT, /* KDB_ENTER trap - regs valid */
KDB_REASON_KEYBOARD, /* Keyboard entry - regs valid */
- KDB_REASON_NMI /* Non-maskable interrupt; regs valid */
+ KDB_REASON_NMI, /* Non-maskable interrupt; regs valid */
+ KDB_REASON_WATCHDOG /* Watchdog interrupt; regs valid */
} kdb_reason_t;
extern int kdb(kdb_reason_t reason, int error_code, kdb_eframe_t);
diff -u .base/linux/kdb/kdbmain.c linux/kdb/kdbmain.c
--- .base/linux/kdb/kdbmain.c Fri Jun 2 16:14:12 2000
+++ linux/kdb/kdbmain.c Fri Jun 2 16:13:03 2000
@@ -742,6 +742,11 @@
kdba_getpc(ef));
kdba_dumpregs(ef, NULL, NULL);
break;
+ case KDB_REASON_WATCHDOG:
+ kdb_printf("due to WatchDog Interrupt @ 0x%lx\n",
+ kdba_getpc(ef));
+ kdba_dumpregs(ef, NULL, NULL);
+ break;
case KDB_REASON_BREAK:
kdb_printf("due to Breakpoint @ 0x%lx\n", kdba_getpc(ef));
/*
@@ -866,7 +871,9 @@
*/
if (reason != KDB_REASON_SWITCH) {
if (kdb_active) {
- printk("kdb: debugger re-entered, allowing event to proceed\n");
+ printk(
+ "kdb: debugger re-entered (new reason = %d), allowing event to proceed\n",
+ reason);
return(0);
}
} else if (!kdb_active) {
|