xfs_iunlink_remove: xfs_inotobp() returned error 22 -- debugging

符永涛 yongtaofu at gmail.com
Fri Apr 19 12:00:12 CDT 2013


Dear Eric,

I checked the RH srpm
https://content-web.rhn.redhat.com/rhn/public/NULL/kernel/2.6.32-279.19.1.el6/SRPMS/kernel-2.6.32-279.19.1.el6.src.rpm?__gda__=1366390847_8550b8568c50ea46b3180266b476353d&ext=.rpm
And the code is the same, as follows:

__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
{
    struct rwsem_waiter *waiter;
    struct task_struct *tsk;
    int woken;

    waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);

    if (!wakewrite) {
        if (waiter->flags & RWSEM_WAITING_FOR_WRITE)
            goto out;
        goto dont_wake_writers;
    }

    /* if we are allowed to wake writers try to grant a single write lock
     * if there's a writer at the front of the queue
     * - we leave the 'waiting count' incremented to signify potential
     *   contention
     */
    if (waiter->flags & RWSEM_WAITING_FOR_WRITE) {
        sem->activity = -1;
        list_del(&waiter->list);
        tsk = waiter->task;
        /* Don't touch waiter after ->task has been NULLed */
        smp_mb();
        waiter->task = NULL;
        wake_up_process(tsk);
        put_task_struct(tsk);
        goto out;
    }

    /* grant an infinite number of read locks to the front of the queue */
 dont_wake_writers:
    woken = 0;
    while (waiter->flags & RWSEM_WAITING_FOR_READ) {
        struct list_head *next = waiter->list.next;

        list_del(&waiter->list);
        tsk = waiter->task;
        smp_mb();
        waiter->task = NULL;
        wake_up_process(tsk);
        put_task_struct(tsk);
        woken++;
        if (list_empty(&sem->wait_list))
            break;
        waiter = list_entry(next, struct rwsem_waiter, list);
    }

    sem->activity += woken;

 out:
    return sem;
}


2013/4/20 符永涛 <yongtaofu at gmail.com>

> Hi Eric,
> Here's the server info:
> [root at 10.23.72.95 ~]# rpm -qa|grep kernel
> kernel-debug-debuginfo-2.6.32-279.19.1.el6.x86_64
> kernel-headers-2.6.32-279.19.1.el6.x86_64
> abrt-addon-kerneloops-2.0.8-6.el6.x86_64
> dracut-kernel-004-283.el6.noarch
> kernel-debuginfo-common-x86_64-2.6.32-279.19.1.el6.x86_64
> kernel-debuginfo-2.6.32-279.19.1.el6.x86_64
> kernel-debug-2.6.32-279.19.1.el6.x86_64
> kernel-devel-2.6.32-279.19.1.el6.x86_64
> libreport-plugin-kerneloops-2.0.9-5.el6.x86_64
> kernel-firmware-2.6.32-279.19.1.el6.noarch
> kernel-2.6.32-279.19.1.el6.x86_64
> kernel-debug-devel-2.6.32-279.19.1.el6.x86_64
> [root at 10.23.72.95 ~]# uname -a
> Linux 10.23.72.95 2.6.32-279.19.1.el6.x86_64 #1 SMP Fri Apr 19 10:44:52
> CST 2013 x86_64 x86_64 x86_64 GNU/Linux
> [root at 10.23.72.95 ~]#
>
> The kernel code looks like:
> __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
> {
>         struct rwsem_waiter *waiter;
>         struct task_struct *tsk;
>         int woken;
>
>         waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
>
>         if (!wakewrite) {
>                 if (waiter->flags & RWSEM_WAITING_FOR_WRITE)
>                         goto out;
>                 goto dont_wake_writers;
>         }
>
>         /* if we are allowed to wake writers try to grant a single write lock
>          * if there's a writer at the front of the queue
>          * - we leave the 'waiting count' incremented to signify potential
>          *   contention
>          */
>         if (waiter->flags & RWSEM_WAITING_FOR_WRITE) {
>                 sem->activity = -1;
>                 list_del(&waiter->list);
>                 tsk = waiter->task;
>                 /* Don't touch waiter after ->task has been NULLed */
>                 smp_mb();
>                 waiter->task = NULL;
>                 wake_up_process(tsk);
>                 put_task_struct(tsk);
>                 goto out;
>         }
>
>         /* grant an infinite number of read locks to the front of the queue */
>  dont_wake_writers:
>         woken = 0;
>         while (waiter->flags & RWSEM_WAITING_FOR_READ) {
>                 struct list_head *next = waiter->list.next;
>
>                 list_del(&waiter->list);
>                 tsk = waiter->task;
>                 smp_mb();
>                 waiter->task = NULL;
>                 wake_up_process(tsk);
>                 put_task_struct(tsk);
>                 woken++;
>                 if (list_empty(&sem->wait_list))
>                         break;
>                 waiter = list_entry(next, struct rwsem_waiter, list);
>         }
>
>         sem->activity += woken;
>
>  out:
>         return sem;
> }
>
> I use the srpm because I want to apply the trace patch. Can you help
> provide the official 279.19.1 srpm link?
> Thank you.
>
>
> 2013/4/20 Eric Sandeen <sandeen at sandeen.net>
>
>> On 4/19/13 8:18 AM, 符永涛 wrote:
>> > Dear Eric,
>> > If it's a racing issue, where is the lock introduced? I want to study
>> > the code from you. Thank you.
>> >
>>
>> essentially:
>>
>> xfs_remove()
>> {
>> ...
>>         xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
>> ...
>>         xfs_droplink()
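>>
>> The ILOCK taken there is an rw_semaphore under the covers, which is why
>> __rwsem_do_wake() is interesting here.  Roughly (a simplified sketch of
>> the call chain; helper names from mainline XFS, assumed to match
>> 2.6.32-279):
>>
>> xfs_ilock(ip, XFS_ILOCK_EXCL)
>>     -> mrupdate_nested(&ip->i_lock, ...)   /* mrlock_t wraps an rw_semaphore */
>>         -> down_write_nested(&ip->i_lock.mr_lock, ...)
>>
>> If a broken rwsem wakeup ever let two writers hold the ILOCK at the same
>> time, both could run xfs_iunlink() on the same inode, which would match
>> the duplicated iunlink events in the trace below.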
>>
>> You are 100% sure that you were running the 279.19.1 kernel?
>>
>> (I'm not very familiar with Oracle's clone of RHEL - I assume that they
>> have copied all of Red Hat's work verbatim, but I have not looked)
>>
>> Can you verify that in:
>>
>> __rwsem_do_wake()
>>
>> the undo target looks like:
>>
>>   out:
>>         return sem;
>>
>>
>>         /* undo the change to the active count, but check for a transition
>>          * 1->0 */
>>   undo:
>>         if (rwsem_atomic_update(-RWSEM_ACTIVE_BIAS, sem) & RWSEM_ACTIVE_MASK)
>>                 goto out;
>>         goto try_again;
>>
>>
>> thanks,
>> -Eric
>>
>> > 2013/4/19 符永涛 <yongtaofu at gmail.com>
>> >
>> >     Sure, the serious thing here is that it corrupts the unlinked list.
>> >     The inode 0x1bd33 which triggered the xfs shutdown is not 0x6b133.
>> >
>> >
>> >         2013/4/19 Eric Sandeen <sandeen at sandeen.net>
>> >
>> >         On 4/19/13 4:41 AM, 符永涛 wrote:
>> >         > Dear Brian and Eric,
>> >         >
>> >         > kernel kernel-2.6.32-279.19.1.el6.x86_64.rpm
>> >         > <http://mirror.linux.duke.edu/pub/centos/6.3/updates/x86_64/Packages/kernel-2.6.32-279.19.1.el6.x86_64.rpm>
>> >         > still has this problem.
>> >         > I built the kernel from this srpm:
>> >         > https://oss.oracle.com/ol6/SRPMS-updates/kernel-2.6.32-279.19.1.el6.src.rpm
>> >         >
>> >         > Today the shutdown happened again during the test.
>> >         > See logs below:
>> >         >
>> >         > /var/log/message
>> >         > Apr 19 16:40:05 10 kernel: XFS (sdb): xfs_iunlink_remove: xfs_inotobp() returned error 22.
>> >         > Apr 19 16:40:05 10 kernel: XFS (sdb): xfs_inactive: xfs_ifree returned error 22
>> >         > Apr 19 16:40:05 10 kernel: XFS (sdb): xfs_do_force_shutdown(0x1) called from line 1184 of file fs/xfs/xfs_vnodeops.c.  Return address = 0xffffffffa02d4bda
>> >         > Apr 19 16:40:05 10 kernel: XFS (sdb): I/O Error Detected. Shutting down filesystem
>> >         > Apr 19 16:40:05 10 kernel: XFS (sdb): Please umount the filesystem and rectify the problem(s)
>> >         > Apr 19 16:40:07 10 kernel: XFS (sdb): xfs_log_force: error 5 returned.
>> >         > Apr 19 16:40:37 10 kernel: XFS (sdb): xfs_log_force: error 5 returned.
>> >         >
>> >         > systemtap script output:
>> >         > --- xfs_imap -- module("xfs").function("xfs_imap@fs/xfs/xfs_ialloc.c:1257").return -- return=0x16
>> >         > vars: mp=0xffff88101801e800 tp=0xffff880ff143ac70 ino=0xffffffff imap=0xffff88100e93bc08 flags=0x0 agbno=? agino=? agno=? blks_per_cluster=? chunk_agbno=? cluster_agbno=? error=? offset=? offset_agbno=? __func__=[...]
>> >         > mp: m_agno_log = 0x5, m_agino_log = 0x20
>> >         > mp->m_sb: sb_agcount = 0x1c, sb_agblocks = 0xffffff0, sb_inopblog = 0x4, sb_agblklog = 0x1c, sb_dblocks = 0x1b4900000
>> >         > imap: im_blkno = 0x0, im_len = 0xe778, im_boffset = 0xd997
>> >         > kernel backtrace:
>> >         > Returning from:  0xffffffffa02b4260 : xfs_imap+0x0/0x280 [xfs]
>> >         > Returning to  :  0xffffffffa02b9d59 : xfs_inotobp+0x49/0xc0 [xfs]
>> >         >  0xffffffffa02b9ec1 : xfs_iunlink_remove+0xf1/0x360 [xfs]
>> >         >  0xffffffff814ede89
>> >         >  0x0 (inexact)
>> >         > user backtrace:
>> >         >  0x3ec260e5ad [/lib64/libpthread-2.12.so+0xe5ad/0x219000]
>> >         >
>> >         > --- xfs_iunlink_remove -- module("xfs").function("xfs_iunlink_remove@fs/xfs/xfs_inode.c:1681").return -- return=0x16
>> >         > vars: tp=0xffff880ff143ac70 ip=0xffff8811ed111000 next_ino=? mp=? agi=? dip=? agibp=? ibp=? agno=? agino=? next_agino=? last_ibp=? last_dip=0xffff881000000001 bucket_index=? offset=? last_offset=0xffffffffffff8811 error=? __func__=[...]
>> >         > ip: i_ino = 0x1bd33, i_flags = 0x0
>> >         > ip->i_d: di_nlink = 0x0, di_gen = 0x53068791
>> >         >
>> >         > debugfs events trace:
>> >         >
>> >         > https://docs.google.com/file/d/0B7n2C4T5tfNCREZtdC1yamc0RnM/edit?usp=sharing
>> >
>> >         Same issue, one file was unlinked twice in a race:
>> >
>> >         === ino 0x6b133 ===
>> >                    <...>-4477  [003]  2721.176790: xfs_iunlink: dev 8:16 ino 0x6b133
>> >                    <...>-4477  [003]  2721.176839: xfs_iunlink_remove: dev 8:16 ino 0x6b133
>> >                    <...>-4477  [009]  3320.127227: xfs_iunlink: dev 8:16 ino 0x6b133
>> >                    <...>-4477  [001]  3320.141126: xfs_iunlink_remove: dev 8:16 ino 0x6b133
>> >                    <...>-4477  [003]  7973.136368: xfs_iunlink: dev 8:16 ino 0x6b133
>> >                    <...>-4479  [018]  7973.158457: xfs_iunlink: dev 8:16 ino 0x6b133
>> >                    <...>-4479  [018]  7973.158497: xfs_iunlink_remove: dev 8:16 ino 0x6b133
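>> >
>> >         The AGI unlinked buckets are singly-linked lists threaded through
>> >         the on-disk inodes' di_next_unlinked fields, and xfs_iunlink()
>> >         always inserts at the head.  Roughly (a simplified sketch based
>> >         on mainline fs/xfs/xfs_inode.c, with buffer reads, logging and
>> >         error handling elided):
>> >
>> >                 /* point the new entry at the current bucket head ... */
>> >                 dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
>> >                 /* ... then point the bucket head at the new entry */
>> >                 agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
>> >
>> >         Insert the same inode twice and its di_next_unlinked ends up
>> >         pointing back at itself, so the single xfs_iunlink_remove() that
>> >         follows leaves the bucket in a bogus state.  A later unlink that
>> >         has to walk that bucket can then hand an invalid inode number to
>> >         xfs_inotobp(), which is one plausible way to end up with the
>> >         EINVAL (22) shutdown on a different inode (0x1bd33).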
>> >
>> >         -Eric
>> >
>> >
>> >
>> >
>> >     --
>> >     符永涛
>> >
>> >
>> >
>> >
>> > --
>> > 符永涛
>>
>>
>
>
> --
> 符永涛
>



-- 
符永涛