Hi, Bharata. The patch looks good. Thanks first and foremost
to your team for all the hard work. I noticed the patch includes a few
fixes for bugs which shouldn't have been in the code, but were.
The only thing I would change is based on our conversation the
other night. After thinking about the __cpu_disable() option,
would it be faster to just memcpy() the irq_affinity[] into
saved_affinity[], then just walk the loop? I don't think speed
is a concern, but if you're saving 'em all anyway ...
Thanks again, this looks great. If you haven't already (I didn't
see it in the tree), go ahead and check this in directly to the
2.4 branch.
--Matt
Bharata B Rao wrote:
>
> Given below are patches for different files which are modified by us to
> provide non-disruptive dump support and some SMP fixes. These patches are
> against the files taken from sourceforge cvs of lkcd.
>
> Here is the description of the changes.
>
> Instead of stopping the other CPUs, send a CALL_FUNCTION_VECTOR IPI to
> make them spin while dump is in progress. Before freezing the other CPUs,
> the affinity of every IRQ in the IRQ descriptor table is changed to ensure
> that all subsequent IOAPIC interrupts are sent to the dumping cpu.
> This logic is now common for both panic dumps and non-disruptive dumps.
> Once dump completes the CPUs are released and IRQ affinities are restored
> to their original settings. In the panic dump case machine_restart takes
> care of true shutdown of the CPUs.
>
> Special case: If another CPU is handling an IRQ when the IPI comes in, avoid
> spinning inside the IPI handler. Instead, it's safer to make it spin inside
> schedule on the way up. This avoids possibilities of deadlock where a
> softirq executing on the dumping cpu attempts a global cli.
>
> Because the other CPUs could be spinning inside the IPI handler with
> interrupts disabled, NMI watchdog checks are ignored on the non-dumping
> CPUs.
>
> Tried this on a 4-way SMP for non-disruptive dumping as well as panic dumps
> from task-time context.
>
> TODO
>
> 1. Dumping from interrupt context doesn't work as yet. One of the approaches
> we are experimenting with is to use a separate kernel thread for the dump
> i/o.
> 2. We currently divert all interrupts to the dumping CPU. May consider merits
> of disabling all but the IRQs necessary for functioning of the dump i/o path.
> 3. Explore ways to avoid the scheduler hook overheads when dump is not active.
> 4. Since we stop scheduling altogether we could face problems with drivers
> that depend on worker threads in order to service i/o. Need to investigate
> this further.
> 5. For disruptive dumps, avoid deadlocks while dumping if another CPU doesn't
> respond to the IPI (Refer to Tony Dziedzic's earlier note)
>
> PATCHES
>
> drivers/block/dump.c
>
> --- /home/bharata/lkcd_cvs/2.4/drivers/block/dump.c Thu Sep 13 12:19:10
> 2001
> +++ dump.c Fri Sep 21 09:38:03 2001
> @@ -145,7 +145,7 @@
> * are lots of possibilities. This is a BITMASK value, not an index.
> *
> * 1: Try to keep the system running _after_ we are done
> - * dumping -- for non-disruptive dumps. (DUMP_NONDISRUPT)
> + * dumping -- for non-disruptive dumps. (DUMP_FLAGS_NONDISRUPT)
> *
> * -----------------------------------------------------------------------
> */
> @@ -212,7 +212,8 @@
> void *dump_page_buf; /* dump page buffer for memcpy()! */
> int dump_sector_size; /* sector size for dump_device */
> int dump_sector_bits; /* sector bits for dump_device */
> -int dump_in_progress = FALSE; /* when we're dumping, we're dumping */
> +volatile int dump_in_progress = FALSE; /* when we're dumping, we're dumping
> */
> +volatile int dumping_cpu = 0; /* cpu on which dump is progressing */
> dump_header_t dump_header; /* the primary dump header */
> dump_header_asm_t dump_header_asm; /* the arch-specific dump header */
> struct kiobuf *dump_iobuf; /* kiobuf for raw I/O to disk */
> @@ -558,15 +559,17 @@
> void
> dump_silence_system(void)
> {
> - /* do what each architecture wants first */
> - __dump_silence_system();
> + int cpu = smp_processor_id();
>
> /* we set this to FALSE so we don't ever re-enter this code! */
> dump_okay = FALSE;
> + dumping_cpu = cpu;
> dump_in_progress = TRUE;
> -
> save_flags(dump_save_flags);
> - sti(); /* enable interrupts just in case ... */
> +
> + /* do what each architecture wants first */
> + __dump_silence_system();
> +
> return;
> }
>
> @@ -586,12 +589,11 @@
> dump_okay = TRUE;
>
> /* reboot the system if this isn't a disrupted dump */
> - if ((panic_timeout > 0) && (!(dump_level & DUMP_FLAGS_NONDISRUPT))) {
> + if((panic_timeout > 0) && (!(dump_flags & DUMP_FLAGS_NONDISRUPT))) {
> DUMP_PRINT("Rebooting in %d seconds ...", panic_timeout);
> mdelay(panic_timeout * 1000);
> machine_restart(NULL);
> }
> -
> return;
> }
>
> ------------------------------------------------------------------
> arch/i386/kernel/dump.c
>
>
> --- /home/bharata/lkcd_cvs/2.4/arch/i386/kernel/dump.c Mon Aug 27 12:01:30
> 2001
> +++ dump_asm.c Fri Sep 21 10:28:59 2001
> @@ -24,6 +24,13 @@
> #include <linux/mm.h>
> #include <asm/processor.h>
>
> +#include <asm/hardirq.h>
> +#include <linux/irq.h>
> +
> +extern volatile int dump_in_progress;
> +extern unsigned long irq_affinity[NR_IRQS];
> +static int saved_affinity[NR_IRQS];
> +
> /*
> * Name: __dump_save_panic_regs()
> * Func: Save the EIP (really the RA). We may pass an argument later.
> @@ -99,52 +106,66 @@
> }
>
> /*
> - * Name: dump_stop_this_cpu()
> - * Func: Just like stop_this_cpu(), but we don't disable the APIC
> - * and we do the same operations for every CPU but 0. Now
> - * this _might_ change ...
> - *
> - * Also, the "hlt" instruction is going to be changed to be
> - * some sort of spinning kernel function so we can resume if
> - * this is a non-disruptive dump.
> - */
> -static void
> -dump_stop_this_cpu(void * dummy)
> -{
> -#ifdef CONFIG_SMP
> - int cpu;
> -
> - /*
> - * Remove this CPU:
> - */
> - cpu = smp_processor_id();
> -
> - /* don't stop CPU 0, per se, for now ... */
> - if (cpu) {
> - clear_bit(cpu, &cpu_online_map);
> - __cli();
> -#if 0
> - /* don't do this for now -- interrupts are scattered */
> - disable_local_APIC();
> -#endif
> - if (cpu_data[smp_processor_id()].hlt_works_ok)
> - for(;;) __asm__("hlt");
> - for (;;);
> + * Non dumping cpus will spin here. If a cpu is handling an irq when ipi is
> + * received, we let go of it here while making sure that it hits schedule
> + * on the way up and make it spin there instead.
> + */
> +static void dump_spin(void)
> +{
> + if (in_irq()) {
> + current->need_resched = 1;
> + } else {
> + while (dump_in_progress) ;
> + }
> + return;
> +}
> +
> +/*
> + * Routine to save the old irq affinities and change affinities of all irqs
> to
> + * the dumping cpu.
> + */
> +static void set_irq_affinity(void)
> +{
> + int i;
> + int cpu = smp_processor_id();
> + for (i = 0; i < NR_IRQS; i++) {
> + if (irq_desc[i].handler == NULL)
> + continue;
> + saved_affinity[i] = irq_affinity[i];
> + irq_affinity[i] = 1UL << cpu;
> + if (irq_desc[i].handler->set_affinity != NULL)
> + irq_desc[i].handler->set_affinity(i, irq_affinity[i]);
> + }
> +}
> +
> +/*
> + * Restore old irq affinities.
> + */
> +static void reset_irq_affinity(void)
> +{
> + int i;
> + for (i = 0; i < NR_IRQS; i++) {
> + if (irq_desc[i].handler == NULL)
> + continue;
> + irq_affinity[i] = saved_affinity[i];
> + if (irq_desc[i].handler->set_affinity != NULL)
> + irq_desc[i].handler->set_affinity(i,
> saved_affinity[i]);
> }
> -#endif
> }
>
> /*
> * Name: __dump_silence_system()
> * Func: Do an architecture-specific silencing of the system.
> + * - Change irq affinities
> + * - Wait for other cpus to come out of irq handling
> + * - Send CALL_FUNCTION_VECTOR ipi to other cpus to put them to spin
> */
> void
> __dump_silence_system(void)
> {
> -#if CONFIG_SMP
> - smp_call_function(dump_stop_this_cpu, (void *)NULL, 1, 0);
> - smp_num_cpus = 1;
> -#endif
> + set_irq_affinity();
> + synchronize_irq();
> + smp_call_function(dump_spin, NULL, 0, 0);
>
> /* return */
> return;
> @@ -157,6 +178,7 @@
> void
> __dump_resume_system(void)
> {
> + reset_irq_affinity();
> /* return */
> return;
> }
>
> ---------------------------------------------------------------
> kernel/sched.c
>
> --- /home/bharata/lkcd_cvs/2.4/kernel/sched.c Mon Aug 27 12:01:31 2001
> +++ sched.c Fri Sep 21 10:51:38 2001
> @@ -529,8 +529,9 @@
> */
> asmlinkage void schedule(void)
> {
> -#if defined(CONFIG_DUMP)
> - extern int dump_in_progress;
> +#if defined(CONFIG_DUMP) || defined(CONFIG_DUMP_MODULE)
> + extern int volatile dump_in_progress;
> + extern int volatile dumping_cpu;
> #endif
> struct schedule_data * sched_data;
> struct task_struct *prev, *next, *p;
> @@ -542,7 +543,7 @@
> prev = current;
> this_cpu = prev->processor;
>
> -#if defined(CONFIG_DUMP)
> +#if defined(CONFIG_DUMP) || defined(CONFIG_DUMP_MODULE)
> if (dump_in_progress) {
> goto dump_scheduling_disabled;
> }
> @@ -713,8 +714,15 @@
> scheduling_in_interrupt:
> printk("Scheduling in interrupt\n");
> BUG();
> -#if defined(CONFIG_DUMP)
> +#if defined(CONFIG_DUMP) || defined(CONFIG_DUMP_MODULE)
> dump_scheduling_disabled:
> + /*
> + * If this is not the dumping cpu, then spin right here
> + * till the dump is complete
> + */
> + if (this_cpu != dumping_cpu) {
> + while (dump_in_progress);
> + }
> #endif
> return;
> }
>
> ------------------------------------------------------------------
> arch/i386/kernel/traps.c
>
> --- traps.c.orig Thu Sep 20 18:55:53 2001
> +++ traps.c Thu Sep 20 18:45:55 2001
> @@ -19,6 +19,7 @@
> #include <linux/ptrace.h>
> #include <linux/timer.h>
> #include <linux/mm.h>
> +#include <linux/dump.h>
> #include <linux/init.h>
> #include <linux/delay.h>
> #include <linux/spinlock.h>
> @@ -56,6 +57,10 @@
> struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
> { 0, 0 }, { 0, 0 } };
>
> +#if defined(CONFIG_DUMP) || defined(CONFIG_DUMP_MODULE)
> +extern void (*dump_function_ptr)(char *, struct pt_regs *);
> +#endif
> +
> /*
> * The IDT has to be page-aligned to simplify the Pentium
> * F0 0F bug workaround.. We have a special link segment
> @@ -220,7 +225,11 @@
> spin_lock_irq(&die_lock);
> printk("%s: %04lx\n", str, err & 0xffff);
> show_registers(regs);
> -
> +#if defined(CONFIG_DUMP) || defined(CONFIG_DUMP_MODULE)
> + if (dump_function_ptr) {
> + dump_function_ptr((char *)str, regs);
> + }
> +#endif
> spin_unlock_irq(&die_lock);
> do_exit(SIGSEGV);
> }
> @@ -406,6 +415,11 @@
>
> static spinlock_t nmi_print_lock = SPIN_LOCK_UNLOCKED;
>
> +#ifdef CONFIG_DUMP
> +extern volatile int dump_in_progress;
> +extern volatile int dumping_cpu;
> +#endif
> +
> inline void nmi_watchdog_tick(struct pt_regs * regs)
> {
> /*
> @@ -432,6 +446,13 @@
> */
> int sum, cpu = smp_processor_id();
>
> +#ifdef CONFIG_DUMP
> + /*
> + * Ignore watchdog when dumping is in progress.
> + * Todo: consider using the touch_nmi_watchdog() approach instead
> + */
> + if (dump_in_progress && cpu != dumping_cpu) return;
> +#endif
> sum = apic_timer_irqs[cpu];
>
> if (last_irq_sums[cpu] == sum) {
> @@ -450,6 +471,12 @@
> printk("NMI Watchdog detected LOCKUP on CPU%d,
> registers:\n", cpu);
> show_registers(regs);
> printk("console shuts up ...\n");
> +#if defined(CONFIG_DUMP) || defined(CONFIG_DUMP_MODULE)
> + if (dump_function_ptr) {
> + dump_function_ptr("NMI Watchdog Detected",
> + regs);
> + }
> +#endif
> console_silent();
> spin_unlock(&nmi_print_lock);
> do_exit(SIGSEGV);
>
> ------------------------------------------------------------
> Regards,
> Crashdump Team, India.
>
> --
> Bharata B Rao,
> IBM Linux Technology Center,
> IBM Software Lab, Bangalore.
>
> Ph: 91-80-5262355 Ex: 3962
> Mail: bharata@xxxxxxxxxx
|