
Re: bad performance on touch/cp file on XFS system

To: Dave Chinner <david@xxxxxxxxxxxxx>
Subject: Re: bad performance on touch/cp file on XFS system
From: Zhang Qiang <zhangqiang.buaa@xxxxxxxxx>
Date: Mon, 25 Aug 2014 16:47:39 +0800
Cc: xfs@xxxxxxxxxxx
In-reply-to: <20140825051801.GY26465@dastard>
References: <CAKEtwsWxZseS8M+O7vSR2FRXr4gjVQ0RDO8ok+jMPWq-8jPEeA@xxxxxxxxxxxxxx> <20140825051801.GY26465@dastard>
I have checked icount and ifree, and found that about 11.8 percent of the allocated inodes are free, so free inodes should not be too scarce.
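(For reference, from the sb 0 dump below: ifree / icount = 26202919 / 220619904 ≈ 0.119, i.e. roughly 26 million free inodes spread across about 220 million allocated ones.)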

Here's the detailed log; any new clue?

# mount /dev/sda4 /data1/
# xfs_info /data1/
meta-data="" Â Â Â Â Â Â Âisize=256 Â Âagcount=4, agsize=142272384 blks
         =                       sectsz=512   attr=2, projid32bit=0
data     =                       bsize=4096   blocks=569089536, imaxpct=5
         =                       sunit=0      swidth=0 blks
naming   =version 2              bsize=4096   ascii-ci=0
log      =internal               bsize=4096   blocks=277875, version=2
         =                       sectsz=512   sunit=0 blks, lazy-count=1
realtime =none                   extsz=4096   blocks=0, rtextents=0
# umount /dev/sda4
# xfs_db /dev/sda4
xfs_db> sb 0
xfs_db> p
magicnum = 0x58465342
blocksize = 4096
dblocks = 569089536
rblocks = 0
rextents = 0
uuid = 13ecf47b-52cf-4944-9a71-885bddc5e008
logstart = 536870916
rootino = 128
rbmino = 129
rsumino = 130
rextsize = 1
agblocks = 142272384
agcount = 4
rbmblocks = 0
logblocks = 277875
versionnum = 0xb4a4
sectsize = 512
inodesize = 256
inopblock = 16
fname = "\000\000\000\000\000\000\000\000\000\000\000\000"
blocklog = 12
sectlog = 9
inodelog = 8
inopblog = 4
agblklog = 28
rextslog = 0
inprogress = 0
imax_pct = 5
icount = 220619904
ifree = 26202919
fdblocks = 147805479
frextents = 0
uquotino = 0
gquotino = 0
qflags = 0
flags = 0
shared_vn = 0
inoalignmt = 2
unit = 0
width = 0
dirblklog = 0
logsectlog = 0
logsectsize = 0
logsunit = 1
features2 = 0xa
bad_features2 = 0xa
xfs_db> sb 1
xfs_db> p
magicnum = 0x58465342
blocksize = 4096
dblocks = 569089536
rblocks = 0
rextents = 0
uuid = 13ecf47b-52cf-4944-9a71-885bddc5e008
logstart = 536870916
rootino = 128
rbmino = null
rsumino = null
rextsize = 1
agblocks = 142272384
agcount = 4
rbmblocks = 0
logblocks = 277875
versionnum = 0xb4a4
sectsize = 512
inodesize = 256
inopblock = 16
fname = "\000\000\000\000\000\000\000\000\000\000\000\000"
blocklog = 12
sectlog = 9
inodelog = 8
inopblog = 4
agblklog = 28
rextslog = 0
inprogress = 1
imax_pct = 5
icount = 0
ifree = 0
fdblocks = 568811645
frextents = 0
uquotino = 0
gquotino = 0
qflags = 0
flags = 0
shared_vn = 0
inoalignmt = 2
unit = 0
width = 0
dirblklog = 0
logsectlog = 0
logsectsize = 0
logsunit = 1
features2 = 0xa
bad_features2 = 0xa
xfs_db> sb 2
xfs_db> p
magicnum = 0x58465342
blocksize = 4096
dblocks = 569089536
rblocks = 0
rextents = 0
uuid = 13ecf47b-52cf-4944-9a71-885bddc5e008
logstart = 536870916
rootino = null
rbmino = null
rsumino = null
rextsize = 1
agblocks = 142272384
agcount = 4
rbmblocks = 0
logblocks = 277875
versionnum = 0xb4a4
sectsize = 512
inodesize = 256
inopblock = 16
fname = "\000\000\000\000\000\000\000\000\000\000\000\000"
blocklog = 12
sectlog = 9
inodelog = 8
inopblog = 4
agblklog = 28
rextslog = 0
inprogress = 1
imax_pct = 5
icount = 0
ifree = 0
fdblocks = 568811645
frextents = 0
uquotino = 0
gquotino = 0
qflags = 0
flags = 0
shared_vn = 0
inoalignmt = 2
unit = 0
width = 0
dirblklog = 0
logsectlog = 0
logsectsize = 0
logsunit = 1
features2 = 0xa
bad_features2 = 0xa
xfs_db> sb 3
xfs_db> p
magicnum = 0x58465342
blocksize = 4096
dblocks = 569089536
rblocks = 0
rextents = 0
uuid = 13ecf47b-52cf-4944-9a71-885bddc5e008
logstart = 536870916
rootino = 128
rbmino = null
rsumino = null
rextsize = 1
agblocks = 142272384
agcount = 4
rbmblocks = 0
logblocks = 277875
versionnum = 0xb4a4
sectsize = 512
inodesize = 256
inopblock = 16
fname = "\000\000\000\000\000\000\000\000\000\000\000\000"
blocklog = 12
sectlog = 9
inodelog = 8
inopblog = 4
agblklog = 28
rextslog = 0
inprogress = 1
imax_pct = 5
icount = 0
ifree = 0
fdblocks = 568811645
frextents = 0
uquotino = 0
gquotino = 0
qflags = 0
flags = 0
shared_vn = 0
inoalignmt = 2
unit = 0
width = 0
dirblklog = 0
logsectlog = 0
logsectsize = 0
logsunit = 1
features2 = 0xa
bad_features2 = 0xa



2014-08-25 13:18 GMT+08:00 Dave Chinner <david@xxxxxxxxxxxxx>:
On Mon, Aug 25, 2014 at 11:34:34AM +0800, Zhang Qiang wrote:
> Dear XFS community & developers,
>
> I am using CentOS 6.3 with XFS as the base file system, on top of hardware
> RAID5 storage.
>
> Detailed environment as follows:
>     OS: CentOS 6.3
>     Kernel: kernel-2.6.32-279.el6.x86_64
>     XFS mount options (df output): /dev/sdb1 on /data type xfs
>     (rw,noatime,nodiratime,nobarrier)
>
> Detailed symptoms:
>
>     # df
>     Filesystem            Size  Used Avail Use% Mounted on
>     /dev/sda1              29G   17G   11G  61% /
>     /dev/sdb1             893G  803G   91G  90% /data
>     /dev/sda4             2.2T  1.6T  564G  75% /data1
>
>     # time touch /data1/1111
>     real    0m23.043s
>     user    0m0.001s
>     sys     0m0.349s
>
>     # perf top
>     Events: 6K cycles
>      16.96%  [xfs]              [k] xfs_inobt_get_rec
>      11.95%  [xfs]              [k] xfs_btree_increment
>      11.16%  [xfs]              [k] xfs_btree_get_rec
>       7.39%  [xfs]              [k] xfs_btree_get_block
>       5.02%  [xfs]              [k] xfs_dialloc
>       4.87%  [xfs]              [k] xfs_btree_rec_offset
>       4.33%  [xfs]              [k] xfs_btree_readahead
>       4.13%  [xfs]              [k] _xfs_buf_find
>       4.05%  [kernel]           [k] intel_idle
>       2.89%  [xfs]              [k] xfs_btree_rec_addr
>       1.04%  [kernel]           [k] kmem_cache_free
>
>
> It seems that some XFS kernel functions (xfs_inobt_get_rec,
> xfs_btree_increment, etc.) are consuming a lot of CPU time.
>
> I found a bug in Bugzilla [1]; is that the same issue as this one?

No.

> It would be greatly appreciated if you could give a constructive suggestion
> about this issue, as it's really hard to reproduce on another system and it's
> not possible to upgrade that machine while it's online.

You've got very few free inodes, widely distributed in the allocated
inode btree. The CPU time above is the btree search for the next
free inode.

This is the issue solved by this series of recent commits to add a
new on-disk free inode btree index:

53801fd xfs: enable the finobt feature on v5 superblocks
0c153c1 xfs: report finobt status in fs geometry
a3fa516 xfs: add finobt support to growfs
3efa4ff xfs: update the finobt on inode free
2b64ee5 xfs: refactor xfs_difree() inobt bits into xfs_difree_inobt() helper
6dd8638 xfs: use and update the finobt on inode allocation
0aa0a75 xfs: insert newly allocated inode chunks into the finobt
9d43b18 xfs: update inode allocation/free transaction reservations for finobt
aafc3c2 xfs: support the XFS_BTNUM_FINOBT free inode btree type
8e2c84d xfs: reserve v5 superblock read-only compat. feature bit for finobt
57bd3db xfs: refactor xfs_ialloc_btree.c to support multiple inobt numbers
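
(For reference only: on later kernels and xfsprogs releases that ship this
feature, the free inode btree is enabled at mkfs time together with v5
superblocks. A sketch of the invocation, assuming a recent mkfs.xfs and a
scratch device /dev/sdX1:

    # mkfs.xfs -m crc=1,finobt=1 /dev/sdX1

The exact option names depend on the xfsprogs version, and the feature is set
at mkfs time; it cannot be turned on for an existing filesystem.)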

Which is of no help to you, however, because it's not available in
any CentOS kernel.

There's really not much you can do to avoid the problem once you've
punched random freespace holes in the allocated inode btree. It
generally doesn't affect many people; those that it does affect are
normally using XFS as an object store indexed by a hard link farm
(e.g. various backup programs do this).

If you dump the superblock via xfs_db, the difference between icount
and ifree will give you an idea of how much "needle in a haystack"
searching is going on. You can probably narrow it down to a specific
AG by dumping the AGI headers and checking the same thing. Filling
in all the holes (by creating a bunch of zero-length files in the
appropriate AGs) might take some time, but it should make the
problem go away until you remove more files and create random
free inode holes again...
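
For example, a rough sketch of narrowing it down with xfs_db (opened
read-only; the AGI fields are assumed to be printed by xfs_db as count and
freecount, as in current xfsprogs):

    # xfs_db -r /dev/sda4
    xfs_db> agi 0
    xfs_db> p count freecount
    xfs_db> agi 1
    xfs_db> p count freecount
    ...

An AG with a large count but a comparatively small, scattered freecount is
where the expensive searches happen. A hypothetical way of filling the holes
is to create many empty files under a directory that lives in that AG (new
inodes are normally allocated in the same AG as their parent directory), for
example:

    # mkdir /data1/fill
    # for i in $(seq 1 100000); do touch /data1/fill/f$i; done

The directory name and the file count here are purely illustrative; the count
would need to be on the order of that AG's freecount to consume the scattered
free inodes.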

Cheers,

Dave.
--
Dave Chinner
david@xxxxxxxxxxxxx
