xfs
[Top] [All Lists]

XFS unstable with little memory; OOPS in prune_dcache()

To: linux-xfs@xxxxxxxxxxx
Subject: XFS unstable with little memory; OOPS in prune_dcache()
From: Philipp Reisner <philipp.reisner@xxxxxxxxxx>
Date: Tue, 8 Nov 2005 14:25:06 +0100
Organization: LINBIT
Sender: linux-xfs-bounce@xxxxxxxxxxx
User-agent: KMail/1.8
Hi XFS gurus,

I experienced a series of kernel crashes, that are triggered by:

o XFS ( Kernel vanilla 2.6.13.4 )
o uniprocessor (compiled for K7) / no HIGHMEM
o little memory ( 256 MB Ram )
o 200GB XFS filesystem, 138GB used
o   4378907 inodes   ( 4 m)
o  30349548 dentries (30 m)

The system is used as a backup server. Each of the production systems
is backed up by hardlinking the previous day's backup, and then 
breaking the hardlinks and replacing the changed files by running rsync.

[ See rsbak3 at http://oss.linbit.com/ ]

The outcome is that we have mostly small files, but most of these inodes
have about 30 - 40 directory entries.

The OOPS happened during the backup run, after we changed the FS from
ext3 to XFS. 

The same machine ran perfectly stable with ext3, then we tested all
hardware components in multiple ways. In the end it turned out that
with one GB of memory it is also stable with XFS.

A reliable way to reproduce the crash is to run tiobench and an ls -lR
concurrently. It took about 30 minutes to one hour to trigger the OOPS.

BTW, XFS is much faster than ext3 with this highly hardlinked structure.

Here is the OOPS, and what I found out about it...

general protection fault: 0000 [#1]
Modules linked in: ipv6 bonding evdev parport_pc parport pcspkr via_rhine
ehci_hcd uhci_hcd generic 8139too 8139cp via_agp agpgart xfs dm_mirror
dm_snapshot dm_mod raid5 raid1 raid0 xor ide_disk via82cxxx ide_core md_mod
CPU:    0
EIP:    0060:[<c0169747>]    Not tainted VLI
EFLAGS: 00010216   (2.6.13sv-k7-up-lowmem)
EIP is at prune_dcache+0x37/0x160
eax: c039fa64   ebx: c0fc7d9c   ecx: c0bc7d30   edx: ffffffff
esi: 00000000   edi: 00000024   ebp: cdfeea60   esp: cd481eb8
ds: 007b   es: 007b   ss: 0068
Process kswapd0 (pid: 124, threadinfo=cd480000 task=cd42c590)

Stack: c074a794 cd481ef0 c013b068 cd481ef0 0000e8d0 00000000 000000e6
c0169bcf 00000080 c013f8f9 00000080 000000d0 00008116 003a3400 00000000
00000073 00000000 00000000 c039e74c 00000001 c039e620 00008115 c0140bee
00000020 Call Trace:
 [<c013b068>] get_dirty_limits+0x18/0xd0
 [<c0169bcf>] shrink_dcache_memory+0x1f/0x50
 [<c013f8f9>] shrink_slab+0x179/0x1c0
 [<c0140bee>] balance_pgdat+0x2ce/0x3a0
 [<c0140d9e>] kswapd+0xde/0x100
 [<c012b160>] autoremove_wake_function+0x0/0x60
 [<c012b160>] autoremove_wake_function+0x0/0x60
 [<c0140cc0>] kswapd+0x0/0x100
 [<c0100f51>] kernel_thread_helper+0x5/0x14
Code: 75 07 83 c4 10 5b 5e 5f c3 c7 04 24 94 a7 74 c0 e8 9f c2 fa ff 8b 0d
68 fa 39 c0 81 f9 64 fa 39 c0 74 df 8b 01 8b 51 04 89 50 04 <89> 02 89 49
04 89 09 a1 68 fa 39 c0 8d 44 20 00 ff 0d 70 fa 39


The OOPS happens in list_del_init(); it is marked in the C snippet.

/**
 * prune_dcache - shrink the dcache
 * @count: number of entries to try and free
 *
 * Shrink the dcache. This is done when we need
 * more memory, or simply when we need to unmount
 * something (at which point we need to unuse
 * all dentries).
 *
 * This function may fail to free any resources if
 * all the dentries are in use.
 *
 * Each iteration takes one entry from the tail (->prev) of the
 * dentry_unused list while holding dcache_lock.  The loop runs @count
 * times unless the list becomes empty first.  An entry whose d_count
 * is non-zero is left off the list, an entry flagged DCACHE_REFERENCED
 * has the flag cleared and is put back on the list, and any other
 * entry is passed to prune_one_dentry().
 */
 
static void prune_dcache(int count)
{
        /* dcache_lock is held across the whole scan, released at the end. */
        spin_lock(&dcache_lock);
        for (; count ; count--) {
                struct dentry *dentry;
                struct list_head *tmp;

                /*
                 * NOTE(review): presumably gives other tasks a chance to run
                 * and may drop/retake dcache_lock - confirm against the
                 * cond_resched_lock() definition.
                 */
                cond_resched_lock(&dcache_lock);

                /* Candidate is the last entry; head pointing at itself means empty. */
                tmp = dentry_unused.prev;
                if (tmp == &dentry_unused)
                        break;
                /* Unlink the candidate from dentry_unused before inspecting it. */
                list_del_init(tmp);   <=<<==<<<===<<<<==== 
                /* Start fetching the next candidate while this one is processed. */
                prefetch(dentry_unused.prev);
                dentry_stat.nr_unused--;
                /* Recover the dentry that embeds this d_lru list node. */
                dentry = list_entry(tmp, struct dentry, d_lru);

                spin_lock(&dentry->d_lock);
                /*
                 * We found an inuse dentry which was not removed from
                 * dentry_unused because of laziness during lookup.  Do not free
                 * it - just keep it off the dentry_unused list.
                 */
                if (atomic_read(&dentry->d_count)) {
                        spin_unlock(&dentry->d_lock);
                        continue;
                }
                /* If the dentry was recently referenced, don't free it. */
                if (dentry->d_flags & DCACHE_REFERENCED) {
                        /*
                         * Clear the flag and re-queue the dentry, restoring
                         * the nr_unused count decremented above.
                         * NOTE(review): assuming the usual list_add() head
                         * insertion, it lands at the opposite end from where
                         * this loop scans - confirm.
                         */
                        dentry->d_flags &= ~DCACHE_REFERENCED;
                        list_add(&dentry->d_lru, &dentry_unused);
                        dentry_stat.nr_unused++;
                        spin_unlock(&dentry->d_lock);
                        continue;
                }
                /*
                 * d_lock is still held here and is not released in this
                 * function on this path.
                 * NOTE(review): presumably prune_one_dentry() drops it -
                 * verify against its definition.
                 */
                prune_one_dentry(dentry);
        }
        spin_unlock(&dcache_lock);
}


Here is the same in GCC's assembler output:


prune_dcache:
        .stabn 68,0,395,.LM177-prune_dcache
.LM177:
        pushl   %edi
        pushl   %esi
        pushl   %ebx
        subl    $16, %esp
        movl    32(%esp), %edi
        .stabn 68,0,397,.LM178-prune_dcache
.LM178:
.LBB73:
        testl   %edi, %edi
        jne     .L263
.L209:
        .stabn 68,0,432,.LM179-prune_dcache
.LM179:
        addl    $16, %esp
        popl    %ebx
        popl    %esi
        popl    %edi
        ret
        .p2align 6,,7
.L263:
        .stabn 68,0,401,.LM180-prune_dcache
.LM180:
.LBB74:
        movl    $dcache_lock, (%esp)
        call    cond_resched_lock
        .stabn 68,0,403,.LM181-prune_dcache
.LM181:
        movl    dentry_unused+4, %ecx
        .stabn 68,0,404,.LM182-prune_dcache
.LM182:
        cmpl    $dentry_unused, %ecx
        je      .L209
        .stabs  "include/linux/list.h",132,0,0,.Ltext63
.Ltext63:
        .stabn 68,0,150,.LM183-prune_dcache
.LM183:
.LBB75:
.LBB76:
        movl    (%ecx), %eax
        movl    4(%ecx), %edx
        .stabn 68,0,151,.LM184-prune_dcache
.LM184:
        movl    %edx, 4(%eax)
        .stabn 68,0,152,.LM185-prune_dcache
.LM185:
        movl    %eax, (%edx)      <=<<==<<<===<<<<====
        .stabn 68,0,220,.LM186-prune_dcache
.LM186:
.LBE76:
        movl    %ecx, 4(%ecx)
        movl    %ecx, (%ecx)
        .stabs  "include/asm/processor.h",132,0,0,.Ltext64
.Ltext64:


The output of xfs_info

[root@anat:/tmp]# xfs_info /var/backup
meta-data=/var/backup            isize=256    agcount=16, agsize=3276800 blks
         =                       sectsz=512
data     =                       bsize=4096   blocks=52428800, imaxpct=25
         =                       sunit=0      swidth=0 blks, unwritten=1
naming   =version 2              bsize=4096
log      =internal               bsize=4096   blocks=25600, version=1
         =                       sectsz=512   sunit=0 blks
realtime =none                   extsz=65536  blocks=0, rtextents=0

-- 
: Dipl-Ing Philipp Reisner                      Tel +43-1-8178292-50 :
: LINBIT Information Technologies GmbH          Fax +43-1-8178292-82 :
: Schönbrunnerstr 244, 1120 Vienna, Austria    http://www.linbit.com :


<Prev in Thread] Current Thread [Next in Thread>