xfs
[Top] [All Lists]

[PATCH] xfs: use s_umount sema in xfs_sync_worker

To: xfs@xxxxxxxxxxx
Subject: [PATCH] xfs: use s_umount sema in xfs_sync_worker
From: Ben Myers <bpm@xxxxxxx>
Date: Mon, 14 May 2012 15:34:49 -0500
In-reply-to: <20120323174327.GU7762@xxxxxxx>
References: <20120323174327.GU7762@xxxxxxx>
User-agent: Mutt/1.5.20 (2009-06-14)
I'm still hitting this on a regular basis.  Here is some analysis from a recent
crash dump which you may want to skip.  The fix is at the end.

PID: 3862   TASK: ffff88022175c640  CPU: 0   COMMAND: "262"

Hrm.  Looks like test 262 was running this time.

PID: 27544  TASK: ffff88013544e040  CPU: 3   COMMAND: "kworker/3:0"
 #0 [ffff88016fdff930] machine_kexec at ffffffff810244e9
 #1 [ffff88016fdff9a0] crash_kexec at ffffffff8108d053
 #2 [ffff88016fdffa70] oops_end at ffffffff813ad1b8
 #3 [ffff88016fdffaa0] no_context at ffffffff8102bd48
 #4 [ffff88016fdffaf0] __bad_area_nosemaphore at ffffffff8102c04d
 #5 [ffff88016fdffb40] bad_area_nosemaphore at ffffffff8102c12e
 #6 [ffff88016fdffb50] do_page_fault at ffffffff813afaee
 #7 [ffff88016fdffc60] page_fault at ffffffff813ac635
    [exception RIP: xlog_get_lowest_lsn+0x30]
    RIP: ffffffffa04a9910  RSP: ffff88016fdffd10  RFLAGS: 00010246
    RAX: ffffc90014e48000  RBX: ffff88014d879980  RCX: ffff88014d879980
    RDX: ffff8802214ee4c0  RSI: 0000000000000000  RDI: 0000000000000000
    RBP: ffff88016fdffd10   R8: ffff88014d879a80   R9: 0000000000000000
    R10: 0000000000000001  R11: 0000000000000000  R12: ffff8802214ee400
    R13: ffff88014d879980  R14: 0000000000000000  R15: ffff88022fd96605
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
 #8 [ffff88016fdffd18] xlog_state_do_callback at ffffffffa04aa186 [xfs]
 #9 [ffff88016fdffd98] xlog_state_done_syncing at ffffffffa04aa568 [xfs]
#10 [ffff88016fdffdc8] xlog_iodone at ffffffffa04aad6e [xfs]
#11 [ffff88016fdffdf8] xfs_buf_iodone_work at ffffffffa044a541 [xfs]
#12 [ffff88016fdffe18] process_one_work at ffffffff8104c53b
#13 [ffff88016fdffe68] worker_thread at ffffffff8104f0e3
#14 [ffff88016fdffee8] kthread at ffffffff8105395e
#15 [ffff88016fdfff48] kernel_thread_helper at ffffffff813b3ae4

Here's the guy who crashed the system.  Let's find the superblock:

Ok, walked from the xlog_in_core_t in rcx to ic_log (struct log) to l_mp
(xfs_mount 0xffff88013f8f9800) to get m_super (struct super_block), which is at
0xffff88021d1c2c00. 

struct xfs_mount {
        m_super = 0xffff88021d1c2c00,
        m_tid = 0x0,
        m_ail = 0xffff880149ea58c0,
        m_sb = {
        sb_magicnum = 0x58465342,       <--- yep, correct xfs super magic.

His s_id is /dev/sdb4 which is not currently mounted.  Looks like this is my
scratch filesystem, as /dev/sdb3 is mounted at /mnt/test.

PID: 4025   TASK: ffff880220870300  CPU: 1   COMMAND: "umount"
 #0 [ffff88022167ba58] __schedule at ffffffff813aacac
 #1 [ffff88022167bba0] schedule at ffffffff813ab0c4
 #2 [ffff88022167bbb0] schedule_timeout at ffffffff813a92d5
 #3 [ffff88022167bc50] wait_for_common at ffffffff813aa527
 #4 [ffff88022167bce0] wait_for_completion at ffffffff813aa6a8
 #5 [ffff88022167bcf0] wait_on_work at ffffffff8104d8b5
 #6 [ffff88022167bd90] __cancel_work_timer at ffffffff8104da2a
 #7 [ffff88022167bdd0] cancel_delayed_work_sync at ffffffff8104db2d
 #8 [ffff88022167bde0] xfs_syncd_stop at ffffffffa045aa78 [xfs]
 #9 [ffff88022167be00] xfs_fs_put_super at ffffffffa0459228 [xfs]
#10 [ffff88022167be20] generic_shutdown_super at ffffffff81110bdc
#11 [ffff88022167be50] kill_block_super at ffffffff81110c8c
#12 [ffff88022167be70] deactivate_locked_super at ffffffff811111f5
#13 [ffff88022167be90] deactivate_super at ffffffff81111f41
#14 [ffff88022167beb0] mntput_no_expire at ffffffff8112be1d
#15 [ffff88022167bef0] sys_umount at ffffffff8112c546
#16 [ffff88022167bf80] system_call_fastpath at ffffffff813b27f9
    RIP: 00007f789d7eb8e7  RSP: 00007fff1a744f60  RFLAGS: 00010246
    RAX: 00000000000000a6  RBX: ffffffff813b27f9  RCX: 00007fff1a7456c8
    RDX: 0000000000000000  RSI: 0000000000000000  RDI: 0000000000610940
    RBP: 0000000000610940   R8: 0000000000610960   R9: 0000000000000001
    R10: 0000000000000000  R11: 0000000000000246  R12: 00007fff1a7461ce
    R13: 0000000000610920  R14: 0000000000610060  R15: 00000000006109a0
    ORIG_RAX: 00000000000000a6  CS: 0033  SS: 002b

We have an unmount running.  Is it the same filesystem as the log crash?

crash> bt -f ffff880220870300
...
 #8 [ffff88022167bde0] xfs_syncd_stop at ffffffffa045aa78 [xfs]
    ffff88022167bde8: 0000000000000000 ffff88013f8f9800 <-------- xfs_mount
    ffff88022167bdf8: ffff88022167be18 ffffffffa0459228 

0xffffffffa045aa60 <xfs_syncd_stop>:    push   %rbp                     *
0xffffffffa045aa61 <xfs_syncd_stop+0x1>:        mov    %rsp,%rbp
0xffffffffa045aa64 <xfs_syncd_stop+0x4>:        push   %rbx             *
0xffffffffa045aa65 <xfs_syncd_stop+0x5>:        mov    %rdi,%rbx
0xffffffffa045aa68 <xfs_syncd_stop+0x8>:        lea    0x370(%rdi),%rdi
0xffffffffa045aa6f <xfs_syncd_stop+0xf>:        sub    $0x8,%rsp

xfs_syncd_stop pushes rbp and rbx on to the stack...

 #9 [ffff88022167be00] xfs_fs_put_super at ffffffffa0459228 [xfs]
    ffff88022167be08: ffff880221a6e0a0 ffff88021d1c2c00 
    ffff88022167be18: ffff88022167be48 ffffffff81110bdc 

crash> dis xfs_fs_put_super
0xffffffffa0459200 <xfs_fs_put_super>:  push   %rbp
0xffffffffa0459201 <xfs_fs_put_super+0x1>:      mov    %rsp,%rbp
0xffffffffa0459204 <xfs_fs_put_super+0x4>:      push   %rbx
0xffffffffa0459205 <xfs_fs_put_super+0x5>:      sub    $0x8,%rsp
0xffffffffa0459209 <xfs_fs_put_super+0x9>:      mov    0x2f0(%rdi),%rbx         
*

The superblock is in rdi,
      [0x2f0] void *s_fs_info;  

We stuff the xfs_mount in s_fs_info, so then the xfs_mount should be in rbx
when xfs_syncd_stop pushes rbx onto the stack.

So the xfs_mount should be at ffff88013f8f9800, the same filesystem as the task
that crashed the system.

PID: 28181  TASK: ffff880001764200  CPU: 3   COMMAND: "kworker/3:10"
 #0 [ffff880001767b40] __schedule at ffffffff813aacac
 #1 [ffff880001767c88] schedule at ffffffff813ab0c4
 #2 [ffff880001767c98] xlog_wait at ffffffffa04a9f0a [xfs]
 #3 [ffff880001767cf8] _xfs_log_force_lsn at ffffffffa04ac95e [xfs]
 #4 [ffff880001767d88] xfs_trans_commit at ffffffffa04a9083 [xfs]
 #5 [ffff880001767dd8] xfs_fs_log_dummy at ffffffffa044f15b [xfs]
 #6 [ffff880001767df8] xfs_sync_worker at ffffffffa045ac4c [xfs]
 #7 [ffff880001767e18] process_one_work at ffffffff8104c53b
 #8 [ffff880001767e68] worker_thread at ffffffff8104f0e3
 #9 [ffff880001767ee8] kthread at ffffffff8105395e
#10 [ffff880001767f48] kernel_thread_helper at ffffffff813b3ae4

Is this xlog_wait the same filesystem?

 #2 [ffff880001767c98] xlog_wait at ffffffffa04a9f0a [xfs]
    ffff880001767ca0: 0000000000000001 ffff880001764200
    ffff880001767cb0: ffffffff81061940 ffff88014d879988
    ffff880001767cc0: ffff88014d879988 ffffffffa04ac2db
    ffff880001767cd0: ffff88014d879980 000000010000003f
    ffff880001767ce0: ffff8802214ee400 ffff8802214ee4a8
    ffff880001767cf0: ffff880001767d80 ffffffffa04ac95e
 #3 [ffff880001767cf8] _xfs_log_force_lsn at ffffffffa04ac95e [xfs]
    ffff880001767d00: ffff880001764200 0000000000011da0
    ffff880001767d10: 0000000000000000 0000000100000001
    ffff880001767d20: 0000000000000001 ffff880001764200
    ffff880001767d30: ffffffff81061940 dead000000100100
    ffff880001767d40: dead000000200200 0000000000000000
    ffff880001767d50: ffff88013f8f9800 ffff880206a80ea8
    ffff880001767d60: 0000000000000000 0000000000000000
    ffff880001767d70: ffff88013f8f9800 0000000000000008
    ffff880001767d80: ffff880001767dd0 ffffffffa04a9083
 #4 [ffff880001767d88] xfs_trans_commit at ffffffffa04a9083 [xfs]
    ffff880001767d90: ffff880149d2d800 ffff88022fd96305
    ffff880001767da0: 0000000000000004 0000000000000000
    ffff880001767db0: ffff880206a80ea8 0000000000000000
                        ^^^^^ xfs_trans_t
    ffff880001767dc0: ffff88022fd8d400 ffff88022fd96305
    ffff880001767dd0: ffff880001767df0 ffffffffa044f15b
 #5 [ffff880001767dd8] xfs_fs_log_dummy at ffffffffa044f15b [xfs]
    ffff880001767de0: ffff88013f8f9b70 ffff88013f8f9800
    ffff880001767df0: ffff880001767e10 ffffffffa045ac4c

crash> dis xfs_fs_log_dummy
0xffffffffa044f100 <xfs_fs_log_dummy>:  push   %rbp
0xffffffffa044f101 <xfs_fs_log_dummy+0x1>:      mov    $0x1,%edx
0xffffffffa044f106 <xfs_fs_log_dummy+0x6>:      mov    $0x1b,%esi
0xffffffffa044f10b <xfs_fs_log_dummy+0xb>:      mov    %rsp,%rbp
0xffffffffa044f10e <xfs_fs_log_dummy+0xe>:      push   %r12
0xffffffffa044f110 <xfs_fs_log_dummy+0x10>:     push   %rbx
0xffffffffa044f111 <xfs_fs_log_dummy+0x11>:     mov    %rdi,%rbx
0xffffffffa044f114 <xfs_fs_log_dummy+0x14>:     callq  0xffffffffa04a7ba0 
<_xfs_trans_alloc>    * (trans will be in rax)
0xffffffffa044f119 <xfs_fs_log_dummy+0x19>:     movzwl 0x7e(%rbx),%edx
0xffffffffa044f11d <xfs_fs_log_dummy+0x1d>:     xor    %r8d,%r8d
0xffffffffa044f120 <xfs_fs_log_dummy+0x20>:     xor    %ecx,%ecx
0xffffffffa044f122 <xfs_fs_log_dummy+0x22>:     xor    %esi,%esi
0xffffffffa044f124 <xfs_fs_log_dummy+0x24>:     mov    $0x1,%r9d
0xffffffffa044f12a <xfs_fs_log_dummy+0x2a>:     mov    %rax,%rdi
0xffffffffa044f12d <xfs_fs_log_dummy+0x2d>:     mov    %rax,%r12                
*
0xffffffffa044f130 <xfs_fs_log_dummy+0x30>:     sub    $0xffffff80,%edx
0xffffffffa044f133 <xfs_fs_log_dummy+0x33>:     callq  0xffffffffa04a8050 
<xfs_trans_reserve>
0xffffffffa044f138 <xfs_fs_log_dummy+0x38>:     test   %eax,%eax
0xffffffffa044f13a <xfs_fs_log_dummy+0x3a>:     mov    %eax,%ebx
0xffffffffa044f13c <xfs_fs_log_dummy+0x3c>:     jne    0xffffffffa044f168 
<xfs_fs_log_dummy+0x68>
0xffffffffa044f13e <xfs_fs_log_dummy+0x3e>:     mov    %r12,%rdi                
*
0xffffffffa044f141 <xfs_fs_log_dummy+0x41>:     mov    $0x20,%esi
0xffffffffa044f146 <xfs_fs_log_dummy+0x46>:     callq  0xffffffffa04a45b0 
<xfs_mod_sb>
0xffffffffa044f14b <xfs_fs_log_dummy+0x4b>:     orl    $0x8,0x68(%r12)
0xffffffffa044f151 <xfs_fs_log_dummy+0x51>:     mov    %r12,%rdi
0xffffffffa044f154 <xfs_fs_log_dummy+0x54>:     xor    %esi,%esi
0xffffffffa044f156 <xfs_fs_log_dummy+0x56>:     callq  0xffffffffa04a8df0 
<xfs_trans_commit>

Looks like the xfs_trans_t should be in r12.

crash> dis xfs_trans_commit
0xffffffffa04a8df0 <xfs_trans_commit>:  push   %rbp
0xffffffffa04a8df1 <xfs_trans_commit+0x1>:      mov    %rsp,%rbp                
*
0xffffffffa04a8df4 <xfs_trans_commit+0x4>:      sub    $0x40,%rsp
0xffffffffa04a8df8 <xfs_trans_commit+0x8>:      mov    %rbx,-0x28(%rbp)
0xffffffffa04a8dfc <xfs_trans_commit+0xc>:      mov    %r12,-0x20(%rbp)         
*

Which is put on the stack at -0x20 by xfs_trans_commit.
So that's ffff880206a80ea8.

crash> xfs_trans_t ffff880206a80ea8
struct xfs_trans_t {
  t_magic = 0x5452414e,
  t_logcb = {
    cb_next = 0x0,
    cb_func = 0,
    cb_arg = 0x0
  },
  t_type = 0x1b,
  t_log_res = 0x280,
  t_log_count = 0x1,
  t_blk_res = 0x0,
  t_blk_res_used = 0x0,
  t_rtx_res = 0x0,
  t_rtx_res_used = 0x0,
  t_ticket = 0xffff88020f721b50,
  t_lsn = 0x0,
  t_commit_lsn = 0x4,
  t_mountp = 0xffff88013f8f9800,                * xfs_mount_t

 #6 [ffff880001767df8] xfs_sync_worker at ffffffffa045ac4c [xfs]
    ffff880001767e00: ffff88013f8f9b70 ffff880221498ec0
    ffff880001767e10: ffff880001767e60 ffffffff8104c53b
 #7 [ffff880001767e18] process_one_work at ffffffff8104c53b

So that's the same xfs_mount as the one involved in the crash.  Looks like the
problem is that the sync worker is still running after the log has been torn
down, and it calls xfs_fs_log_dummy which generates log traffic.

/*
 * Every sync period we need to unpin all items, reclaim inodes and sync
 * disk quotas.  We might need to cover the log to indicate that the
 * filesystem is idle and not frozen.
 */
STATIC void
xfs_sync_worker(
        struct work_struct *work)
{
        struct xfs_mount *mp = container_of(to_delayed_work(work),
                                        struct xfs_mount, m_sync_work);
        int             error;

        /*
         * We shouldn't write/force the log if we are in the mount/unmount
         * process or on a read only filesystem. The workqueue still needs to be
         * active in both cases, however, because it is used for inode reclaim
         * during these times.  Testing MS_ACTIVE alone is racy: unmount can
         * clear it after we check it but before this work item finishes, and
         * the dummy transaction we commit would then complete against a log
         * that has already been torn down.  Take s_umount at the read level
         * instead to provide real exclusion with unmount; trylock so we never
         * block the workqueue behind a long-running unmount.
         */
        if (down_read_trylock(&mp->m_super->s_umount)) {
                if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
                        /* dgc: errors ignored here */
                        if (mp->m_super->s_frozen == SB_UNFROZEN &&
                            xfs_log_need_covered(mp))
                                error = xfs_fs_log_dummy(mp);
                        else
                                xfs_log_force(mp, 0);

                        /* start pushing all the metadata that is currently dirty */
                        xfs_ail_push_all(mp->m_ail);
                }
                up_read(&mp->m_super->s_umount);
        }

        /* queue us up again */
        xfs_syncd_queue_sync(mp);
}

Looks like checking for MS_ACTIVE is not adequate for preventing a race with
unmount.

Here is the fix.  I'll sign off on it once my testing is completed. -Ben

===================================================================

xfs: use s_umount sema in xfs_sync_worker

xfs_sync_worker checks the MS_ACTIVE flag in sb->s_flags to avoid doing work
during mount and unmount.  This flag can be cleared by unmount after the
xfs_sync_worker checks it but before the work is completed.  This has caused
crashes in the completion handler for the dummy transaction committed by
xfs_sync_worker:

PID: 27544  TASK: ffff88013544e040  CPU: 3   COMMAND: "kworker/3:0"
 #0 [ffff88016fdff930] machine_kexec at ffffffff810244e9
 #1 [ffff88016fdff9a0] crash_kexec at ffffffff8108d053
 #2 [ffff88016fdffa70] oops_end at ffffffff813ad1b8
 #3 [ffff88016fdffaa0] no_context at ffffffff8102bd48
 #4 [ffff88016fdffaf0] __bad_area_nosemaphore at ffffffff8102c04d
 #5 [ffff88016fdffb40] bad_area_nosemaphore at ffffffff8102c12e
 #6 [ffff88016fdffb50] do_page_fault at ffffffff813afaee
 #7 [ffff88016fdffc60] page_fault at ffffffff813ac635
    [exception RIP: xlog_get_lowest_lsn+0x30]
    RIP: ffffffffa04a9910  RSP: ffff88016fdffd10  RFLAGS: 00010246
    RAX: ffffc90014e48000  RBX: ffff88014d879980  RCX: ffff88014d879980
    RDX: ffff8802214ee4c0  RSI: 0000000000000000  RDI: 0000000000000000
    RBP: ffff88016fdffd10   R8: ffff88014d879a80   R9: 0000000000000000
    R10: 0000000000000001  R11: 0000000000000000  R12: ffff8802214ee400
    R13: ffff88014d879980  R14: 0000000000000000  R15: ffff88022fd96605
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
 #8 [ffff88016fdffd18] xlog_state_do_callback at ffffffffa04aa186 [xfs]
 #9 [ffff88016fdffd98] xlog_state_done_syncing at ffffffffa04aa568 [xfs]
#10 [ffff88016fdffdc8] xlog_iodone at ffffffffa04aad6e [xfs]
#11 [ffff88016fdffdf8] xfs_buf_iodone_work at ffffffffa044a541 [xfs]
#12 [ffff88016fdffe18] process_one_work at ffffffff8104c53b
#13 [ffff88016fdffe68] worker_thread at ffffffff8104f0e3
#14 [ffff88016fdffee8] kthread at ffffffff8105395e
#15 [ffff88016fdfff48] kernel_thread_helper at ffffffff813b3ae4

Protect xfs_sync_worker by using the s_umount semaphore at the read level to
provide exclusion with unmount while work is progressing.

Index: xfs/fs/xfs/xfs_sync.c
===================================================================
--- xfs.orig/fs/xfs/xfs_sync.c
+++ xfs/fs/xfs/xfs_sync.c
@@ -386,20 +386,23 @@ xfs_sync_worker(
         * We shouldn't write/force the log if we are in the mount/unmount
         * process or on a read only filesystem. The workqueue still needs to be
         * active in both cases, however, because it is used for inode reclaim
-        * during these times. hence use the MS_ACTIVE flag to avoid doing
-        * anything in these periods.
+        * during these times.  Use the s_umount semaphore to provide exclusion
+        * with unmount.
         */
-       if (!(mp->m_super->s_flags & MS_ACTIVE) &&
-           !(mp->m_flags & XFS_MOUNT_RDONLY)) {
-               /* dgc: errors ignored here */
-               if (mp->m_super->s_frozen == SB_UNFROZEN &&
-                   xfs_log_need_covered(mp))
-                       error = xfs_fs_log_dummy(mp);
-               else
-                       xfs_log_force(mp, 0);
+       if (down_read_trylock(&mp->m_super->s_umount)) {
+               if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+                       /* dgc: errors ignored here */
+                       if (mp->m_super->s_frozen == SB_UNFROZEN &&
+                           xfs_log_need_covered(mp))
+                               error = xfs_fs_log_dummy(mp);
+                       else
+                               xfs_log_force(mp, 0);
 
-               /* start pushing all the metadata that is currently dirty */
-               xfs_ail_push_all(mp->m_ail);
+                       /* start pushing all the metadata that is currently
+                        * dirty */
+                       xfs_ail_push_all(mp->m_ail);
+               }
+               up_read(&mp->m_super->s_umount);
        }
 
        /* queue us up again */

<Prev in Thread] Current Thread [Next in Thread>