Hi XFS gurus,
I experienced a series of kernel crashes that are triggered by:
o XFS (vanilla kernel 2.6.13.4)
o uniprocessor (compiled for K7) / no HIGHMEM
o little memory (256 MB RAM)
o 200 GB XFS filesystem, 138 GB used
o 4378907 inodes (~4.4 million)
o 30349548 dentries (~30 million)
The system is used as a backup server. Each of the production systems
is backed up by hard-linking the previous day's backup and then breaking
the hard links of changed files and replacing them by running rsync.
[ See rsbak3 at http://oss.linbit.com/ ]
The outcome is that we have mostly small files, but most of the inodes
have about 30-40 directory entries; the pattern is sketched below.
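Conceptually, each backup run does something like the following (a
minimal sketch of the link/unlink pattern only, not the actual rsbak3
code; the path names are made up):

#include <stdio.h>
#include <unistd.h>

/*
 * Sketch of the rotate-and-relink backup scheme described above.
 * NOT rsbak3 itself; "backup.0" (today) and "backup.1" (yesterday)
 * are hypothetical paths.
 */
int main(void)
{
        /* Give yesterday's file a second name in today's tree:
         * both directory entries now point at the same inode. */
        if (link("backup.1/data.txt", "backup.0/data.txt") != 0) {
                perror("link");
                return 1;
        }

        /* If rsync finds the file changed, it effectively breaks the
         * hard link first, so yesterday's copy stays intact ... */
        if (unlink("backup.0/data.txt") != 0) {
                perror("unlink");
                return 1;
        }
        /* ... and then writes a fresh copy of the changed file here. */
        return 0;
}

Unchanged files keep accumulating one directory entry per retained day,
which is how a few million inodes end up behind some thirty million
dentries.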
The OOPS happened during the backup run, after we changed the FS from
ext3 to XFS.
The same machine ran perfectly stable with ext3, so we then tested all
hardware components in multiple ways. In the end it turned out that
with one GB of memory it is also stable with XFS.
A reliable way to reproduce the crash is to run tiobench and an ls -lR
concurrently; it takes about 30 minutes to one hour to trigger the OOPS.
BTW, XFS is much faster than ext3 with this highly hardlinked structure.
Here is the OOPS, and what I found out about it...
general protection fault: 0000 [#1]
Modules linked in: ipv6 bonding evdev parport_pc parport pcspkr via_rhine
ehci_hcd uhci_hcd generic 8139too 8139cp via_agp agpgart xfs dm_mirror
dm_snapshot dm_mod raid5 raid1 raid0 xor ide_disk via82cxxx ide_core md_mod
CPU: 0
EIP: 0060:[<c0169747>] Not tainted VLI
EFLAGS: 00010216 (2.6.13sv-k7-up-lowmem)
EIP is at prune_dcache+0x37/0x160
eax: c039fa64 ebx: c0fc7d9c ecx: c0bc7d30 edx: ffffffff
esi: 00000000 edi: 00000024 ebp: cdfeea60 esp: cd481eb8
ds: 007b es: 007b ss: 0068
Process kswapd0 (pid: 124, threadinfo=cd480000 task=cd42c590)
Stack: c074a794 cd481ef0 c013b068 cd481ef0 0000e8d0 00000000 000000e6
       c0169bcf 00000080 c013f8f9 00000080 000000d0 00008116 003a3400 00000000
       00000073 00000000 00000000 c039e74c 00000001 c039e620 00008115 c0140bee
       00000020
Call Trace:
[<c013b068>] get_dirty_limits+0x18/0xd0
[<c0169bcf>] shrink_dcache_memory+0x1f/0x50
[<c013f8f9>] shrink_slab+0x179/0x1c0
[<c0140bee>] balance_pgdat+0x2ce/0x3a0
[<c0140d9e>] kswapd+0xde/0x100
[<c012b160>] autoremove_wake_function+0x0/0x60
[<c012b160>] autoremove_wake_function+0x0/0x60
[<c0140cc0>] kswapd+0x0/0x100
[<c0100f51>] kernel_thread_helper+0x5/0x14
Code: 75 07 83 c4 10 5b 5e 5f c3 c7 04 24 94 a7 74 c0 e8 9f c2 fa ff 8b 0d
68 fa 39 c0 81 f9 64 fa 39 c0 74 df 8b 01 8b 51 04 89 50 04 <89> 02 89 49
04 89 09 a1 68 fa 39 c0 8d 44 20 00 ff 0d 70 fa 39
The OOPS happens in list_del_init(); the spot is marked in the C snippet
below.
/**
 * prune_dcache - shrink the dcache
 * @count: number of entries to try and free
 *
 * Shrink the dcache. This is done when we need
 * more memory, or simply when we need to unmount
 * something (at which point we need to unuse
 * all dentries).
 *
 * This function may fail to free any resources if
 * all the dentries are in use.
 */
static void prune_dcache(int count)
{
        spin_lock(&dcache_lock);
        for (; count ; count--) {
                struct dentry *dentry;
                struct list_head *tmp;

                cond_resched_lock(&dcache_lock);

                tmp = dentry_unused.prev;
                if (tmp == &dentry_unused)
                        break;
                list_del_init(tmp);        <=<<==<<<===<<<<====
                prefetch(dentry_unused.prev);
                dentry_stat.nr_unused--;
                dentry = list_entry(tmp, struct dentry, d_lru);

                spin_lock(&dentry->d_lock);
                /*
                 * We found an inuse dentry which was not removed from
                 * dentry_unused because of laziness during lookup. Do not free
                 * it - just keep it off the dentry_unused list.
                 */
                if (atomic_read(&dentry->d_count)) {
                        spin_unlock(&dentry->d_lock);
                        continue;
                }
                /* If the dentry was recently referenced, don't free it. */
                if (dentry->d_flags & DCACHE_REFERENCED) {
                        dentry->d_flags &= ~DCACHE_REFERENCED;
                        list_add(&dentry->d_lru, &dentry_unused);
                        dentry_stat.nr_unused++;
                        spin_unlock(&dentry->d_lock);
                        continue;
                }
                prune_one_dentry(dentry);
        }
        spin_unlock(&dcache_lock);
}
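For reference, here is what the marked list_del_init() expands to
(stock include/linux/list.h from 2.6.13; the struct definition is
repeated for context, and the comments relating each store to the
assembly below are mine):

struct list_head {
        struct list_head *next, *prev;
};

#define INIT_LIST_HEAD(ptr) do { \
        (ptr)->next = (ptr); (ptr)->prev = (ptr); \
} while (0)

static inline void __list_del(struct list_head *prev, struct list_head *next)
{
        next->prev = prev;      /* movl %edx, 4(%eax) */
        prev->next = next;      /* movl %eax, (%edx)  <- the marked store */
}

static inline void list_del_init(struct list_head *entry)
{
        __list_del(entry->prev, entry->next);   /* eax = entry->next,
                                                   edx = entry->prev */
        INIT_LIST_HEAD(entry);  /* movl %ecx, 4(%ecx); movl %ecx, (%ecx) */
}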
Here is the same spot in GCC's assembler output:
prune_dcache:
        .stabn 68,0,395,.LM177-prune_dcache
.LM177:
        pushl %edi
        pushl %esi
        pushl %ebx
        subl $16, %esp
        movl 32(%esp), %edi
        .stabn 68,0,397,.LM178-prune_dcache
.LM178:
.LBB73:
        testl %edi, %edi
        jne .L263
.L209:
        .stabn 68,0,432,.LM179-prune_dcache
.LM179:
        addl $16, %esp
        popl %ebx
        popl %esi
        popl %edi
        ret
        .p2align 6,,7
.L263:
        .stabn 68,0,401,.LM180-prune_dcache
.LM180:
.LBB74:
        movl $dcache_lock, (%esp)
        call cond_resched_lock
        .stabn 68,0,403,.LM181-prune_dcache
.LM181:
        movl dentry_unused+4, %ecx
        .stabn 68,0,404,.LM182-prune_dcache
.LM182:
        cmpl $dentry_unused, %ecx
        je .L209
        .stabs "include/linux/list.h",132,0,0,.Ltext63
.Ltext63:
        .stabn 68,0,150,.LM183-prune_dcache
.LM183:
.LBB75:
.LBB76:
        movl (%ecx), %eax
        movl 4(%ecx), %edx
        .stabn 68,0,151,.LM184-prune_dcache
.LM184:
        movl %edx, 4(%eax)
        .stabn 68,0,152,.LM185-prune_dcache
.LM185:
        movl %eax, (%edx)        <=<<==<<<===<<<<====
        .stabn 68,0,220,.LM186-prune_dcache
.LM186:
.LBE76:
        movl %ecx, 4(%ecx)
        movl %ecx, (%ecx)
        .stabs "include/asm/processor.h",132,0,0,.Ltext64
.Ltext64:
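If I decode the Code: bytes correctly, %ecx still holds tmp at the time
of the fault (c0bc7d30). %eax = tmp->next = c039fa64, which is exactly
&dentry_unused (the constant of the cmpl $dentry_unused instruction,
64 fa 39 c0, is visible in the Code: line), so the forward pointer of
the list tail is still sane. %edx = tmp->prev = ffffffff, and the
faulting instruction <89> 02 is movl %eax, (%edx), i.e. the
prev->next = next store. So the d_lru.prev of the dentry at the tail of
dentry_unused was overwritten with all-ones before prune_dcache()
walked the list.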
The output of xfs_info:
[root@anat:/tmp]# xfs_info /var/backup
meta-data=/var/backup            isize=256    agcount=16, agsize=3276800 blks
         =                       sectsz=512
data     =                       bsize=4096   blocks=52428800, imaxpct=25
         =                       sunit=0      swidth=0 blks, unwritten=1
naming   =version 2              bsize=4096
log      =internal               bsize=4096   blocks=25600, version=1
         =                       sectsz=512   sunit=0 blks
realtime =none                   extsz=65536  blocks=0, rtextents=0
--
: Dipl-Ing Philipp Reisner Tel +43-1-8178292-50 :
: LINBIT Information Technologies GmbH Fax +43-1-8178292-82 :
: Schönbrunnerstr 244, 1120 Vienna, Austria http://www.linbit.com :