xfs
[Top] [All Lists]

Re: mkfs.xfs pagefault when removed storage during operation

To: xfs@xxxxxxxxxxx
Subject: Re: mkfs.xfs pagefault when removed storage during operation
From: Ajeet Yadav <ajeet.yadav.77@xxxxxxxxx>
Date: Wed, 2 Feb 2011 17:09:59 +0900
Dkim-signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=gamma; h=domainkey-signature:mime-version:in-reply-to:references:date :message-id:subject:from:to:content-type:content-transfer-encoding; bh=oituJmJOjG/LIobO130VnIASG+lY+wb3h7RxGD56DeA=; b=DJH4IoFABp3oAKuEVuVK6DskClsfi5QsDaVe0P9Gb5prZn/r2F7Dc/zs2NnVN76SZw BBSgk+76x8dVo0vHcfhWLHxG8bGFO/eu4K2C5PeQr6Rmg8Nzyf0C1YaBj2z9VYJRspcR Knq+QK/dQovCXrtINPUVeVdG+7e+Tz+KB9nAU=
Domainkey-signature: a=rsa-sha1; c=nofws; d=gmail.com; s=gamma; h=mime-version:in-reply-to:references:date:message-id:subject:from:to :content-type:content-transfer-encoding; b=nupAi5o0NuHlGfQFjHULjUpqnCK1A2Q+hDU7SHenpUBNIdINq3wDmyJ+rJgXTFDR/E Y+MznJn93rbT5/dSiVXwDtOP2ohROlUlrxh0pKVoBcCFhO3bTFzOd/kqjf+jrckqbQoI D62jdqwcDD0P3lW9MAK+/w7wMM3PgRMdorvKU=
In-reply-to: <AANLkTi=wi_Fhr5v1J4wopvFTY=hC2EA_QmJu4Uc_XgGs@xxxxxxxxxxxxxx>
References: <AANLkTi=wi_Fhr5v1J4wopvFTY=hC2EA_QmJu4Uc_XgGs@xxxxxxxxxxxxxx>
If I see the current sigfault, its easy to fix adding one more patch
to xfsprogs.

diff -Nurp xfsprogs-3.0.5/libxfs/rdwr.c xfsprogs-3.0.5-dirty/libxfs/rdwr.c
--- xfsprogs-3.0.5/libxfs/rdwr.c        2011-01-28 20:22:11.000000000 +0900
+++ xfsprogs-3.0.5-dirty/libxfs/rdwr.c  2011-02-02 16:59:16.000000000 +0900
@@ -207,10 +207,11 @@ libxfs_trace_readbuf(const char *func, c
 {
        xfs_buf_t       *bp = libxfs_readbuf(dev, blkno, len, flags);

-       bp->b_func = func;
-       bp->b_file = file;
-       bp->b_line = line;
-
+       if (bp){
+               bp->b_func = func;
+               bp->b_file = file;
+               bp->b_line = line;
+       }
        return bp;
 }

@@ -485,6 +486,7 @@ libxfs_readbuf(dev_t dev, xfs_daddr_t bl
                error = libxfs_readbufr(dev, blkno, bp, len, flags);
                if (error) {
                        libxfs_putbuf(bp);
+                       errno = error;
                        return NULL;
                }
        }
diff -Nurp xfsprogs-3.0.5/libxfs/trans.c xfsprogs-3.0.5-dirty/libxfs/trans.c
--- xfsprogs-3.0.5/libxfs/trans.c       2011-01-28 20:22:11.000000000 +0900
+++ xfsprogs-3.0.5-dirty/libxfs/trans.c 2011-02-02 17:00:42.000000000 +0900
@@ -508,6 +508,10 @@ libxfs_trans_read_buf(
        }

        bp = libxfs_readbuf(dev, blkno, len, flags);
+       if (!bp){
+               *bpp = NULL;
+               return errno;
+       }
 #ifdef XACT_DEBUG
        fprintf(stderr, "trans_read_buf buffer %p, transaction %p\n", bp, tp);
 #endif


But when I start reviewing the complete project w.r.t read() /
read64() / write() / write64() more importantly libxfs_readbufr() /
libxfs_writebufr().
I find error handing is broken at may places and I get my self lost in
m^n complexity also errno is lost.. therefore caller cannot examine
the exact error,

Back again I think, What if I exit on error ? Does xfsprogs uses
read() / write() error as a part of its functionality, for example
does xfs_repair uses these errors as a part of repair funtionality.

diff -Nurp xfsprogs/libxfs/rdwr.c xfsprogs-dirty/libxfs/rdwr.c
--- xfsprogs/libxfs/rdwr.c        2011-01-28 20:22:11.000000000 +0900
+++ xfsprogs-dirty/libxfs/rdwr.c  2011-02-02 16:42:32.000000000 +0900
@@ -458,8 +458,7 @@ libxfs_readbufr(dev_t dev, xfs_daddr_t b
        if (pread64(fd, bp->b_addr, bytes, LIBXFS_BBTOOFF64(blkno)) < 0) {
                fprintf(stderr, _("%s: read failed: %s\n"),
                        progname, strerror(errno));
-               if (flags & LIBXFS_EXIT_ON_FAILURE)
-                       exit(1);
+               exit(1);
                return errno;
        }
 #ifdef IO_DEBUG
@@ -501,8 +500,7 @@ libxfs_writebufr(xfs_buf_t *bp)
        if (sts < 0) {
                fprintf(stderr, _("%s: pwrite64 failed: %s\n"),
                        progname, strerror(errno));
-               if (bp->b_flags & LIBXFS_B_EXIT)
-                       exit(1);
+               exit(1);
                return errno;
        }
        else if (sts != bp->b_bcount) {


On Tue, Feb 1, 2011 at 8:06 PM, Ajeet Yadav <ajeet.yadav.77@xxxxxxxxx> wrote:
> We are testing mkfs.xfs and xfs_repair stability to look for crashes
> and other issues specially with removable devices.
> And unfortunately crashes does occur.
> Code inspection shows in most cases the caller does not handle
> libxfs_readbuf() for error cases i.e when return value = NULL.
>
> Now I need your suggestion.
> We should fix all such cases or the simplest way is to exit... if
> read() or write() fails with EIO errorno in libxfs_readbufr() and
> libxfs_writebufr().
> Fortunately these function already support exit, if we use flag
> LIBXFS_EXIT_ON_FAILURE, LIBXFS_B_EXIT but they are used selectively.
>
> The current problem is related to function libxfs_trans_read_buf()
>
>       bp = libxfs_readbuf(dev, blkno, len, flags);
> #ifdef XACT_DEBUG
>        fprintf(stderr, "trans_read_buf buffer %p, transaction %p\n", bp, tp);
> #endif
>        xfs_buf_item_init(bp, tp->t_mountp);
>        bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
>        bip->bli_recur = 0;
>        xfs_trans_add_item(tp, (xfs_log_item_t *)bip);
>
>        /* initialise b_fsprivate2 so we can find it incore */
>        XFS_BUF_SET_FSPRIVATE2(bp, tp);
>        *bpp = bp;
>        return 0;
>
> if  libxfs_readbuf() fails due to device removal or other error, bp = NULL.
> In function xfs_buf_item_init(bp, tp->t_mountp) as soon as bp is
> dereferenced occurs
>
> mkfs.xfs: unhandled page fault (11) at 0x00000070, code 0x017
>

<Prev in Thread] Current Thread [Next in Thread>