Hi. I still haven't determined whether this is an XFS problem or a raid5
problem, but here's a situation I see when raid5 is trying to resync an XFS
volume while people are thrashing it:
Jul 26 14:55:38 gigem101 kernel: raid5: in raid5_sync_request, bufsize=512 redone=6 rtrn=-5
Jul 26 14:55:38 gigem101 kernel: md: sync_request returned sectors=-5 (j=296588438) ... exiting
Jul 26 14:55:38 gigem101 kernel: raid5: resync aborted (err=-5)!
I'm running a 2.4.5-SGI_XFS_1.0.1 kernel, with md patched to print errors
(patch at end). The default behaviour was just to print "raid5: resync
aborted" with no other indication (which definitely needs to be fixed in the
md driver anyway). As you can see, the chunk size on the volume in question
is 512k. We have another one (identical hardware, 128k chunk) that synced
without trouble, since I kept users off it. This has only happened during
periods of heavy read/write activity.
I'm starting this on the XFS list only, since I know there have been issues
with chunk sizes >256k elsewhere. It doesn't look to me like this kernel has
the I/O optimization problem that's been discussed of late, since that seems
to be turned off for ALL md devices here.
The basic setup is 8 U160 SCSI 180GB Seagate disks on an Adaptec 29160
controller, 512k chunk, left-symmetric parity, 1.2 TB XFS filesystem.
----
Chris J. Bednar <http://optics.tamu.edu/~bednar/>
Director, Distributed Computing Product Group
http://AdvancedDataSolutions.com/
-kernel-2.4.5-raid5.printerr.patch---------
--- linux/drivers/md/md.c.printerr Sat Jul 21 05:25:52 2001
+++ linux/drivers/md/md.c Tue Jul 24 23:04:29 2001
@@ -3248,7 +3248,10 @@
err = down_interruptible(&mddev->resync_sem);
if (err)
+ {
+ printk ("md: down_interruptible err=%d ... exiting\n", err);
goto out_nolock;
+ }
recheck:
serialize = 0;
@@ -3309,6 +3312,7 @@
sectors = mddev->pers->sync_request(mddev, j);
if (sectors < 0) {
+ printk ("md: sync_request returned sectors=%d (j=%d)
... exiting\n", sectors, j);
err = sectors;
goto out;
}
--- linux/drivers/md/raid5.c.printerr Sat Jul 21 05:25:52 2001
+++ linux/drivers/md/raid5.c Tue Jul 24 23:06:42 2001
@@ -1167,6 +1167,7 @@
int data_disks = raid_disks-1;
int redone = 0;
int bufsize;
+ int rtrn;
sh = get_active_stripe(conf, sector_nr, 0, 0);
bufsize = sh->size;
@@ -1183,7 +1184,11 @@
handle_stripe(sh);
release_stripe(sh);
- return (bufsize>>9)-redone;
+ if ((rtrn = (bufsize>>9) - redone) < 0)
+ {
+ printk ("raid5: in raid5_sync_request, bufsize=%d redone=%d
rtrn=%d\n", bufsize, redone, rtrn);
+ }
+ return rtrn;
}
/*
@@ -1241,15 +1246,16 @@
{
raid5_conf_t *conf = data;
mddev_t *mddev = conf->mddev;
+ int errval = 0;
if (!conf->resync_parity)
return;
if (conf->resync_parity == 2)
return;
down(&mddev->recovery_sem);
- if (md_do_sync(mddev,NULL)) {
+ if ((errval = md_do_sync(mddev,NULL))) {
up(&mddev->recovery_sem);
- printk("raid5: resync aborted!\n");
+ printk("raid5: resync aborted (err=%d)!\n", errval);
return;
}
conf->resync_parity = 0;
|