If I see the current sigfault, its easy to fix adding one more patch
to xfsprogs.
diff -Nurp xfsprogs-3.0.5/libxfs/rdwr.c xfsprogs-3.0.5-dirty/libxfs/rdwr.c
--- xfsprogs-3.0.5/libxfs/rdwr.c 2011-01-28 20:22:11.000000000 +0900
+++ xfsprogs-3.0.5-dirty/libxfs/rdwr.c 2011-02-02 16:59:16.000000000 +0900
@@ -207,10 +207,11 @@ libxfs_trace_readbuf(const char *func, c
{
xfs_buf_t *bp = libxfs_readbuf(dev, blkno, len, flags);
- bp->b_func = func;
- bp->b_file = file;
- bp->b_line = line;
-
+ if (bp){
+ bp->b_func = func;
+ bp->b_file = file;
+ bp->b_line = line;
+ }
return bp;
}
@@ -485,6 +486,7 @@ libxfs_readbuf(dev_t dev, xfs_daddr_t bl
error = libxfs_readbufr(dev, blkno, bp, len, flags);
if (error) {
libxfs_putbuf(bp);
+ errno = error;
return NULL;
}
}
diff -Nurp xfsprogs-3.0.5/libxfs/trans.c xfsprogs-3.0.5-dirty/libxfs/trans.c
--- xfsprogs-3.0.5/libxfs/trans.c 2011-01-28 20:22:11.000000000 +0900
+++ xfsprogs-3.0.5-dirty/libxfs/trans.c 2011-02-02 17:00:42.000000000 +0900
@@ -508,6 +508,10 @@ libxfs_trans_read_buf(
}
bp = libxfs_readbuf(dev, blkno, len, flags);
+ if (!bp){
+ *bpp = NULL;
+ return errno;
+ }
#ifdef XACT_DEBUG
fprintf(stderr, "trans_read_buf buffer %p, transaction %p\n", bp, tp);
#endif
But when I start reviewing the complete project w.r.t read() /
read64() / write() / write64() more importantly libxfs_readbufr() /
libxfs_writebufr().
I find error handing is broken at may places and I get my self lost in
m^n complexity also errno is lost.. therefore caller cannot examine
the exact error,
Back again I think, What if I exit on error ? Does xfsprogs uses
read() / write() error as a part of its functionality, for example
does xfs_repair uses these errors as a part of repair funtionality.
diff -Nurp xfsprogs/libxfs/rdwr.c xfsprogs-dirty/libxfs/rdwr.c
--- xfsprogs/libxfs/rdwr.c 2011-01-28 20:22:11.000000000 +0900
+++ xfsprogs-dirty/libxfs/rdwr.c 2011-02-02 16:42:32.000000000 +0900
@@ -458,8 +458,7 @@ libxfs_readbufr(dev_t dev, xfs_daddr_t b
if (pread64(fd, bp->b_addr, bytes, LIBXFS_BBTOOFF64(blkno)) < 0) {
fprintf(stderr, _("%s: read failed: %s\n"),
progname, strerror(errno));
- if (flags & LIBXFS_EXIT_ON_FAILURE)
- exit(1);
+ exit(1);
return errno;
}
#ifdef IO_DEBUG
@@ -501,8 +500,7 @@ libxfs_writebufr(xfs_buf_t *bp)
if (sts < 0) {
fprintf(stderr, _("%s: pwrite64 failed: %s\n"),
progname, strerror(errno));
- if (bp->b_flags & LIBXFS_B_EXIT)
- exit(1);
+ exit(1);
return errno;
}
else if (sts != bp->b_bcount) {
On Tue, Feb 1, 2011 at 8:06 PM, Ajeet Yadav <ajeet.yadav.77@xxxxxxxxx> wrote:
> We are testing mkfs.xfs and xfs_repair stability to look for crashes
> and other issues specially with removable devices.
> And unfortunately crashes does occur.
> Code inspection shows in most cases the caller does not handle
> libxfs_readbuf() for error cases i.e when return value = NULL.
>
> Now I need your suggestion.
> We should fix all such cases or the simplest way is to exit... if
> read() or write() fails with EIO errorno in libxfs_readbufr() and
> libxfs_writebufr().
> Fortunately these function already support exit, if we use flag
> LIBXFS_EXIT_ON_FAILURE, LIBXFS_B_EXIT but they are used selectively.
>
> The current problem is related to function libxfs_trans_read_buf()
>
> bp = libxfs_readbuf(dev, blkno, len, flags);
> #ifdef XACT_DEBUG
> fprintf(stderr, "trans_read_buf buffer %p, transaction %p\n", bp, tp);
> #endif
> xfs_buf_item_init(bp, tp->t_mountp);
> bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
> bip->bli_recur = 0;
> xfs_trans_add_item(tp, (xfs_log_item_t *)bip);
>
> /* initialise b_fsprivate2 so we can find it incore */
> XFS_BUF_SET_FSPRIVATE2(bp, tp);
> *bpp = bp;
> return 0;
>
> if libxfs_readbuf() fails due to device removal or other error, bp = NULL.
> In function xfs_buf_item_init(bp, tp->t_mountp) as soon as bp is
> dereferenced occurs
>
> mkfs.xfs: unhandled page fault (11) at 0x00000070, code 0x017
>
|