xfs
[Top] [All Lists]

kdb & nmi_watchdog

To: kaos@xxxxxxxxxxxxxxxxxxxxxx
Subject: kdb & nmi_watchdog
From: jtk@xxxxxxx
Date: Fri, 2 Jun 2000 16:38:51 -0500 (CDT)
Cc: linux-xfs@xxxxxxxxxxx
Sender: owner-linux-xfs@xxxxxxxxxxx
Keith,
        The following patch allows kdb and nmi_watchdog to "live in harmony".

        Prior to this, if kdb was configured, the nmi_watchdog
        code was disabled, so if the system got into a "hard" hang
        with interrupts disabled, there was no way to get into kdb.

        With this code, nmi_watchdog is re-enabled on i386 systems.

        After a hard hang, the system will wait 5 seconds, drop through
        nmi_watchdog, print out the NMI "oops" (in case all else fails),
        and then drop into kdb.  Using this capability helped us uncover
        a pesky pagebuf bug this afternoon.

        Take a look-see and let me know what you think.

                -Ted Kline

----------------------------------------

diff -u .base/linux/arch/i386/kernel/traps.c linux/arch/i386/kernel/traps.c
--- .base/linux/arch/i386/kernel/traps.c        Thu Jun  1 11:09:10 2000
+++ linux/arch/i386/kernel/traps.c      Fri Jun  2 13:34:44 2000
@@ -533,11 +533,10 @@
 
 #if CONFIG_X86_IO_APIC
 
-#if defined(CONFIG_KDB)
-int nmi_watchdog = 0;
-#else
 int nmi_watchdog = 1;
-#endif
+#if    defined(CONFIG_KDB)
+int nmi_watchdog_in_kdb = 0;
+#endif /* defined(CONFIG_KDB) */
 
 static int __init setup_nmi_watchdog(char *str)
 {
@@ -550,10 +549,18 @@
 extern spinlock_t console_lock;
 static spinlock_t nmi_print_lock = SPIN_LOCK_UNLOCKED;
 
+#if defined(CONFIG_SMP) && defined(CONFIG_KDB)
+static void
+do_ack_apic_irq(void)
+{
+       ack_APIC_irq();
+}
+#endif
+
 inline void nmi_watchdog_tick(struct pt_regs * regs)
 {
        /*
-        * the best way to detect wether a CPU has a 'hard lockup' problem
+        * the best way to detect whether a CPU has a 'hard lockup' problem
         * is to check it's local APIC timer IRQ counts. If they are not
         * changing then that CPU has some problem.
         *
@@ -584,35 +591,56 @@
                 * wait a few IRQs (5 seconds) before doing the oops ...
                 */
                alert_counter[cpu]++;
-               if (alert_counter[cpu] == 5*HZ) {
+               if (alert_counter[cpu] >= 5*HZ) {
+                       int s;
+
                        spin_lock(&nmi_print_lock);
+#if defined(CONFIG_SMP) && defined(CONFIG_KDB)
+                       if (nmi_watchdog_in_kdb) {
+                               spin_unlock(&nmi_print_lock);
+                               /*
+                                * We're already in the kernel debugger
+                                * from another cpu, try to join in,
+                                * otherwise skip a beat.
+                                */
+                               (void)kdb_ipi((kdb_eframe_t)regs,
+                                                       do_ack_apic_irq);
+                               return;
+                       }
+#endif
                        /*
                         * We are in trouble anyway, lets at least try
                         * to get a message out.
                         */
-                       spin_trylock(&console_lock);
+                       s = spin_trylock(&console_lock);
                        spin_unlock(&console_lock);
+
                        printk("NMI Watchdog detected LOCKUP on CPU%d, registers:\n", cpu);
                        show_registers(regs);
+#if    defined(CONFIG_KDB)
+                       nmi_watchdog_in_kdb |= 1 << cpu;
+
+                       spin_unlock(&nmi_print_lock);
+
+                       kdb(KDB_REASON_WATCHDOG, 0, regs);
+
+                       spin_lock(&nmi_print_lock);
+                       nmi_watchdog_in_kdb &= ~(1 << cpu);
+                       spin_unlock(&nmi_print_lock);
+
+#else  /* ! defined(CONFIG_KDB) */
                        printk("console shuts up ...\n");
                        console_silent();
                        spin_unlock(&nmi_print_lock);
                        do_exit(SIGSEGV);
+#endif /* ! defined(CONFIG_KDB) */
                }
        } else {
                last_irq_sums[cpu] = sum;
                alert_counter[cpu] = 0;
        }
 }
-#endif
-
-#if defined(CONFIG_SMP) && defined(CONFIG_KDB)
-static void
-do_ack_apic_irq(void)
-{
-       ack_APIC_irq();
-}
-#endif
+#endif /* CONFIG_X86_IO_APIC */
 
 asmlinkage void do_nmi(struct pt_regs * regs, long error_code)
 {
@@ -620,6 +648,7 @@
 
 
        atomic_inc(&nmi_counter(smp_processor_id()));
+
 #if defined(CONFIG_SMP) && defined(CONFIG_KDB)
        /*
         * Call the kernel debugger to see if this NMI is due
@@ -631,6 +660,14 @@
 #endif
        if (!(reason & 0xc0)) {
 #if CONFIG_X86_IO_APIC
+#if    defined(CONFIG_KDB)
+               /*
+                * If we're in Kdb, just let the
+                * NMI's "slide by".
+                */
+               if (kdb_active || nmi_watchdog_in_kdb)
+                       return;
+#endif /* defined(CONFIG_KDB) */
                /*
                 * Ok, so this is none of the documented NMI sources,
                 * so it must be the NMI watchdog.
@@ -640,9 +677,9 @@
                        return;
                } else
                        unknown_nmi_error(reason, regs);
-#else
+#else  /* ! CONFIG_X86_IO_APIC */
                unknown_nmi_error(reason, regs);
-#endif
+#endif /* ! CONFIG_X86_IO_APIC */
                return;
        }
        if (reason & 0x80)

diff -u .base/linux/include/linux/kdb.h linux/include/linux/kdb.h
--- .base/linux/include/linux/kdb.h     Thu Jun  1 11:30:54 2000
+++ linux/include/linux/kdb.h   Thu Jun  1 11:29:53 2000
@@ -117,7 +117,8 @@
        KDB_REASON_SWITCH,              /* CPU switch - regs valid*/
        KDB_REASON_INT,                 /* KDB_ENTER trap - regs valid */
        KDB_REASON_KEYBOARD,            /* Keyboard entry - regs valid */
-       KDB_REASON_NMI                  /* Non-maskable interrupt; regs valid */
+       KDB_REASON_NMI,                 /* Non-maskable interrupt; regs valid */
+       KDB_REASON_WATCHDOG             /* Watchdog interrupt; regs valid */
 } kdb_reason_t;
 
 extern int   kdb(kdb_reason_t reason, int error_code, kdb_eframe_t);

diff -u .base/linux/kdb/kdbmain.c linux/kdb/kdbmain.c
--- .base/linux/kdb/kdbmain.c   Fri Jun  2 16:14:12 2000
+++ linux/kdb/kdbmain.c Fri Jun  2 16:13:03 2000
@@ -742,6 +742,11 @@
                          kdba_getpc(ef));
                kdba_dumpregs(ef, NULL, NULL);
                break;
+       case KDB_REASON_WATCHDOG:
+               kdb_printf("due to WatchDog Interrupt @ 0x%lx\n",
+                         kdba_getpc(ef));
+               kdba_dumpregs(ef, NULL, NULL);
+               break;
        case KDB_REASON_BREAK:
                kdb_printf("due to Breakpoint @ 0x%lx\n", kdba_getpc(ef));
                /*
@@ -866,7 +871,9 @@
         */
        if (reason != KDB_REASON_SWITCH) {
                if (kdb_active) {
-                       printk("kdb: debugger re-entered, allowing event to proceed\n");
+                       printk(
+    "kdb: debugger re-entered (new reason = %d), allowing event to proceed\n",
+                                                               reason);
                        return(0);
                }
        } else if (!kdb_active) {

<Prev in Thread] Current Thread [Next in Thread>
  • kdb & nmi_watchdog, jtk <=