
Re: 2.4.25 & xfs & ide write barriers

To: Nathan Scott <nathans@xxxxxxx>
Subject: Re: 2.4.25 & xfs & ide write barriers
From: Michael Lampe <Michael.Lampe@xxxxxxxxxxxxxxxxxxxxx>
Date: Mon, 01 Mar 2004 14:06:38 +0100
Cc: linux-xfs@xxxxxxxxxxx
In-reply-to: <20040229221924.GA731@frodo>
Organization: IWR Uni Heidelberg
References: <403E0A7A.3040505@xxxxxxxxxxxxxxxxxxxxx> <20040229221924.GA731@frodo>
Sender: linux-xfs-bounce@xxxxxxxxxxx
User-agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.6) Gecko/20040117
Nathan Scott wrote:
> On Thu, Feb 26, 2004 at 04:02:18PM +0100, Michael Lampe wrote:
>> fs/xfs/linux/xfs_buf.c already has
>>
>>  #ifdef RQ_WRITE_ORDERED
>>          if (flush)
>>                 set_bit(BH_Ordered_Flush, &bufferlist[cnt-1]->b_state);
>>  #endif
>>
>> in _pagebuf_page_io.
>>
>> Is this all I need (I have already applied the IDE write barrier patch)
>> to safely use disk write-back caching?
>
> (which IDE write barrier patch is that, out of curiosity?)

The one that SuSE ships with their latest kernel. Due to Jens Axboe, I think. (Attached.)

> This stuff is untested by anyone here at SGI, so YMMV.  It is
> also not going to work in the presence of unwritten extents
> (which were implemented after this change); some additional
> code would be needed there.

Pity, that. Being forced to disable write caching is a major performance 
killer.

-Michael
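
For reference, the caller's side of a barrier is just the commit-block
pattern from the fs/jbd/commit.c hunk in the attached patch. A minimal
sketch, assuming a buffer_head bh that is about to be written
synchronously (as in the jbd hunk below):

    /* Request a flush barrier around this write.  On a queue that was
     * registered with QUEUE_ORDERED_FLUSH this forces a cache flush
     * before and after the block; on any other queue the bit is
     * simply ignored. */
    set_bit(BH_Ordered_Flush, &bh->b_state);
    submit_bh(WRITE, bh);
    clear_bit(BH_Ordered_Flush, &bh->b_state);
    wait_on_buffer(bh);     /* in the flush case, returns only after
                               the post-flush has completed */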

diff -ru /usr/src/linux-2.4.25/drivers/block/elevator.c linux-2.4.25/drivers/block/elevator.c
--- /usr/src/linux-2.4.25/drivers/block/elevator.c      Fri Jun 13 16:51:32 2003
+++ linux-2.4.25/drivers/block/elevator.c       Wed Feb 25 18:23:18 2004
@@ -94,6 +94,9 @@
                if (__rq->elevator_sequence <= 0)
                        backmerge_only = 1;
 
+               if (__rq->cmd_flags & RQ_WRITE_ORDERED)
+                       break;
+
                if (__rq->waiting)
                        continue;
                if (__rq->rq_dev != bh->b_rdev)
@@ -156,6 +159,12 @@
        entry = &q->queue_head;
        while ((entry = entry->prev) != head) {
                struct request *__rq = blkdev_entry_to_request(entry);
+
+               /*
+                * we can neither merge nor insert before/with a flush
+                */
+               if (__rq->cmd_flags & RQ_WRITE_ORDERED)
+                       break;
 
                if (__rq->cmd != rw)
                        continue;
diff -ru /usr/src/linux-2.4.25/drivers/block/ll_rw_blk.c linux-2.4.25/drivers/block/ll_rw_blk.c
--- /usr/src/linux-2.4.25/drivers/block/ll_rw_blk.c     Fri Feb 13 15:43:59 2004
+++ linux-2.4.25/drivers/block/ll_rw_blk.c      Wed Feb 25 18:47:36 2004
@@ -263,6 +263,32 @@
 void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
 {
        q->make_request_fn = mfn;
+       q->ordered = QUEUE_ORDERED_NONE;
+}
+
+/**
+ * blk_queue_ordered - does this queue support ordered writes
+ * @q:     the request queue
+ * @flag:  see below
+ *
+ * Description:
+ *   For journalled file systems, doing ordered writes on a commit
+ *   block instead of explicitly doing wait_on_buffer (which is bad
+ *   for performance) can be a big win. Block drivers supporting this
+ *   feature should call this function and indicate so.
+ *
+ *   SCSI drivers usually need to support ordered tags, while others
+ *   may have to do a complete drive cache flush if they are using write
+ *   back caching (or not and lying about it)
+ *
+ *   With this in mind, the values are
+ *             QUEUE_ORDERED_NONE:     the default, doesn't support barrier
+ *             QUEUE_ORDERED_TAG:      supports ordered tags
+ *             QUEUE_ORDERED_FLUSH:    supports barrier through cache flush
+ **/
+void blk_queue_ordered(request_queue_t *q, int flag)
+{
+        q->ordered = flag;
 }
 
 /**
@@ -577,6 +603,7 @@
                list_del(&rq->queue);
                rl->count--;
                rl->pending[rw]++;
+               rq->cmd_flags = 0;
                rq->rq_status = RQ_ACTIVE;
                rq->cmd = rw;
                rq->special = NULL;
@@ -989,12 +1016,27 @@
        int latency;
        elevator_t *elevator = &q->elevator;
        int should_wake = 0;
+       int write_ordered = 0;
+ 
+       /* check for barrier requests the device can't handle */
+       if (buffer_ordered_tag(bh)) 
+               write_ordered = QUEUE_ORDERED_TAG;
+       else if (buffer_ordered_flush(bh)) 
+               write_ordered = QUEUE_ORDERED_FLUSH;
+       
+       if (write_ordered && q->ordered != write_ordered) {
+               if (buffer_ordered_hard(bh)) {
+                       set_bit(BH_IO_OPNOTSUPP, &bh->b_state);
+                       goto end_io;
+               }
+               write_ordered = 0;
+       }
 
        count = bh->b_size >> 9;
        sector = bh->b_rsector;
        sync = test_and_clear_bit(BH_Sync, &bh->b_state);
 
-       rw_ahead = 0;   /* normal case; gets changed below for READA */
+       latency = rw_ahead = 0; /* normal case; gets changed below for READA */
        switch (rw) {
                case READA:
 #if 0  /* bread() misinterprets failed READA attempts as IO errors on SMP */
@@ -1003,7 +1045,8 @@
                        rw = READ;      /* drop into READ */
                case READ:
                case WRITE:
-                       latency = elevator_request_latency(elevator, rw);
+                       if (!write_ordered)
+                               latency = elevator_request_latency(elevator, rw);
                        break;
                default:
                        BUG();
@@ -1136,6 +1179,9 @@
        }
 
 /* fill up the request-info, and add it to the queue */
+       if (write_ordered)
+               req->cmd_flags |= RQ_WRITE_ORDERED;
+
        req->elevator_sequence = latency;
        req->cmd = rw;
        req->errors = 0;
@@ -1637,3 +1683,4 @@
 EXPORT_SYMBOL(blk_max_pfn);
 EXPORT_SYMBOL(blk_seg_merge_ok);
 EXPORT_SYMBOL(blk_nohighio);
+EXPORT_SYMBOL(blk_queue_ordered);
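
Per the blk_queue_ordered() comment above, a driver opts in once at
queue-init time. A hypothetical sketch (the mydrv_* names are made up;
the real IDE hookup is the one-liner in the ide-probe.c hunk further
down):

    /* hypothetical block driver initialisation */
    static void mydrv_init_queue(mydrv_dev_t *dev)
    {
            request_queue_t *q = &dev->queue;

            blk_init_queue(q, mydrv_request_fn);

            /* write-back cache is enabled, but the device honours a
             * cache flush command, so advertise flush-based barriers */
            blk_queue_ordered(q, QUEUE_ORDERED_FLUSH);
    }
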
diff -ru /usr/src/linux-2.4.25/drivers/ide/ide-disk.c linux-2.4.25/drivers/ide/ide-disk.c
--- /usr/src/linux-2.4.25/drivers/ide/ide-disk.c        Fri Nov 28 19:26:20 2003
+++ linux-2.4.25/drivers/ide/ide-disk.c Wed Feb 25 18:23:18 2004
@@ -784,32 +784,7 @@
 
 static int idedisk_end_request (ide_drive_t *drive, int uptodate)
 {
-       struct request *rq;
-       unsigned long flags;
-       int ret = 1;
-
-       spin_lock_irqsave(&io_request_lock, flags);
-       rq = HWGROUP(drive)->rq;
-
-       /*
-        * decide whether to reenable DMA -- 3 is a random magic for now,
-        * if we DMA timeout more than 3 times, just stay in PIO
-        */
-       if (drive->state == DMA_PIO_RETRY && drive->retry_pio <= 3) {
-               drive->state = 0;
-               HWGROUP(drive)->hwif->ide_dma_on(drive);
-       }
-
-       if (!end_that_request_first(rq, uptodate, drive->name)) {
-               add_blkdev_randomness(MAJOR(rq->rq_dev));
-               blkdev_dequeue_request(rq);
-               HWGROUP(drive)->rq = NULL;
-               end_that_request_last(rq);
-               ret = 0;
-       }
-
-       spin_unlock_irqrestore(&io_request_lock, flags);
-       return ret;
+       return ide_end_request(drive, uptodate);
 }
 
 static u8 idedisk_dump_status (ide_drive_t *drive, const char *msg, u8 stat)
diff -ru /usr/src/linux-2.4.25/drivers/ide/ide-io.c linux-2.4.25/drivers/ide/ide-io.c
--- /usr/src/linux-2.4.25/drivers/ide/ide-io.c  Fri Nov 28 19:26:20 2003
+++ linux-2.4.25/drivers/ide/ide-io.c   Wed Feb 25 18:23:18 2004
@@ -56,6 +56,38 @@
 
 #include "ide_modes.h"
 
+/*
+ * preempt pending requests, and store this cache flush for immediate
+ * execution
+ */
+static struct request *ide_queue_flush_cmd(ide_drive_t *drive,
+                                          struct request *rq, int post)
+{
+       struct request *flush_rq = &HWGROUP(drive)->wrq;
+
+       list_del_init(&rq->queue);
+
+       memset(drive->special_buf, 0, sizeof(drive->special_buf));
+
+       ide_init_drive_cmd(flush_rq);
+
+       flush_rq->buffer = drive->special_buf;
+       flush_rq->special = rq;
+       flush_rq->buffer[0] = WIN_FLUSH_CACHE;
+
+       if (drive->id->cfs_enable_2 & 0x2400)
+               flush_rq->buffer[0] = WIN_FLUSH_CACHE_EXT;
+
+       if (!post) {
+               drive->doing_barrier = 1;
+               flush_rq->cmd_flags |= RQ_WRITE_PREFLUSH;
+       } else
+               flush_rq->cmd_flags |= RQ_WRITE_POSTFLUSH;
+
+       list_add(&flush_rq->queue, &drive->queue.queue_head);
+       return flush_rq;
+}
+
 /*
  *     ide_end_request         -       complete an IDE I/O
  *     @drive: IDE device for the I/O
@@ -85,10 +117,20 @@
 
        if (!end_that_request_first(rq, uptodate, drive->name)) {
                add_blkdev_randomness(MAJOR(rq->rq_dev));
-               blkdev_dequeue_request(rq);
                HWGROUP(drive)->rq = NULL;
-               end_that_request_last(rq);
                ret = 0;
+
+               /*
+                * if this is a write barrier, flush the write cache before
+                * signalling completion of this request.
+                */
+               if (rq->cmd_flags & RQ_WRITE_ORDERED)
+                       ide_queue_flush_cmd(drive, rq, 1);
+               else {
+                       blkdev_dequeue_request(rq);
+                       end_that_request_last(rq);
+               }
+
        }
 
        spin_unlock_irqrestore(&io_request_lock, flags);
@@ -188,6 +230,33 @@
        }
        spin_lock_irqsave(&io_request_lock, flags);
        blkdev_dequeue_request(rq);
+
+       /*
+        * if a cache flush fails, disable ordered write support
+        */
+       if (rq->cmd_flags & (RQ_WRITE_PREFLUSH | RQ_WRITE_POSTFLUSH)) {
+               struct request *real_rq = rq->special;
+
+               /*
+                * should we forcibly disable the write back caching?
+                */
+               if (err) {
+                       printk("%s: cache flushing failed. disable write back caching for journalled file systems\n", drive->name);
+                       blk_queue_ordered(&drive->queue, QUEUE_ORDERED_NONE);
+               }
+
+               if (rq->cmd_flags & RQ_WRITE_POSTFLUSH) {
+                       drive->doing_barrier = 0;
+                       end_that_request_last(real_rq);
+               } else {
+                       /*
+                        * just indicate that we did the pre flush
+                        */
+                       real_rq->cmd_flags |= RQ_WRITE_PREFLUSH;
+                       list_add(&real_rq->queue, &drive->queue.queue_head);
+               }
+       }
+
        HWGROUP(drive)->rq = NULL;
        end_that_request_last(rq);
        spin_unlock_irqrestore(&io_request_lock, flags);
@@ -246,8 +315,11 @@
        struct request *rq;
        u8 err;
 
+       if (drive == NULL)
+               return ide_stopped;
+
        err = ide_dump_status(drive, msg, stat);
-       if (drive == NULL || (rq = HWGROUP(drive)->rq) == NULL)
+       if ((rq = HWGROUP(drive)->rq) == NULL)
                return ide_stopped;
 
        hwif = HWIF(drive);
@@ -664,6 +736,15 @@
 repeat:        
        best = NULL;
        drive = hwgroup->drive;
+
+       /*
+        * drive is doing pre-flush, ordered write, post-flush sequence. even
+        * though that is 3 requests, it must be seen as a single transaction.
+        * we must not preempt this drive until that is complete
+        */
+       if (drive->doing_barrier)
+               return drive;
+
        do {
                if (!blk_queue_empty(&drive->queue) && (!drive->sleep || time_after_eq(jiffies, drive->sleep))) {
                        if (!best
@@ -806,7 +887,18 @@
                        printk(KERN_ERR "%s: Huh? nuking plugged queue\n", drive->name);
 
                rq = blkdev_entry_next_request(&drive->queue.queue_head);
+
+               /*
+                * if rq is a barrier write, issue pre cache flush if not
+                * already done
+                */
+               if (rq->cmd_flags & RQ_WRITE_ORDERED) {
+                       if (!(rq->cmd_flags & RQ_WRITE_PREFLUSH))
+                               rq = ide_queue_flush_cmd(drive, rq, 0);
+               }
+
                hwgroup->rq = rq;
+
                /*
                 * Some systems have trouble with IDE IRQs arriving while
                 * the driver is still setting things up.  So, here we disable
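
Taking the ide-io.c hunks together, one barrier write runs through the
following sequence (a summary of the code above, not additional code):

    /*
     * 1. __make_request() sees BH_Ordered_Flush and tags the request
     *    RQ_WRITE_ORDERED.
     * 2. When the request reaches the head of the queue it does not
     *    have RQ_WRITE_PREFLUSH yet, so ide_queue_flush_cmd(drive, rq, 0)
     *    pulls it off the queue, sets drive->doing_barrier and issues a
     *    WIN_FLUSH_CACHE (or WIN_FLUSH_CACHE_EXT) in its place.
     * 3. When the pre-flush completes, the real request is re-queued
     *    with RQ_WRITE_PREFLUSH set; choose_drive() refuses to preempt
     *    the drive while doing_barrier is set.
     * 4. The write itself completes; ide_end_request() sees
     *    RQ_WRITE_ORDERED and queues a post-flush instead of completing
     *    the request.
     * 5. Only when the post-flush completes does end_that_request_last()
     *    run, so the caller's wait_on_buffer() cannot return before the
     *    drive has emptied its write cache (assuming the drive honours
     *    the flush, as the blk_queue_ordered() comment notes).
     */
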
diff -ru /usr/src/linux-2.4.25/drivers/ide/ide-probe.c linux-2.4.25/drivers/ide/ide-probe.c
--- /usr/src/linux-2.4.25/drivers/ide/ide-probe.c       Fri Nov 28 19:26:20 2003
+++ linux-2.4.25/drivers/ide/ide-probe.c        Wed Feb 25 18:23:18 2004
@@ -981,6 +981,9 @@
        q->queuedata = HWGROUP(drive);
        blk_init_queue(q, do_ide_request);
        blk_queue_throttle_sectors(q, 1);
+
+       if (drive->media == ide_disk)
+               blk_queue_ordered(&drive->queue, QUEUE_ORDERED_FLUSH);
 }
 
 #undef __IRQ_HELL_SPIN
diff -ru /usr/src/linux-2.4.25/fs/jbd/commit.c linux-2.4.25/fs/jbd/commit.c
--- /usr/src/linux-2.4.25/fs/jbd/commit.c       Fri Feb 13 15:44:01 2004
+++ linux-2.4.25/fs/jbd/commit.c        Wed Feb 25 18:23:19 2004
@@ -648,7 +648,15 @@
                struct buffer_head *bh = jh2bh(descriptor);
                clear_bit(BH_Dirty, &bh->b_state);
                bh->b_end_io = journal_end_buffer_io_sync;
+
+               /* if we're on an ide device, setting BH_Ordered_Flush
+                  will force a write cache flush before and after the
+                  commit block.  Otherwise, it'll do nothing.  */
+
+               set_bit(BH_Ordered_Flush, &bh->b_state); 
                submit_bh(WRITE, bh);
+               clear_bit(BH_Ordered_Flush, &bh->b_state);
+
                wait_on_buffer(bh);
                put_bh(bh);             /* One for getblk() */
                journal_unlock_journal_head(descriptor);
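
(This is the same pattern the xfs_buf.c fragment quoted at the top of
this mail uses in _pagebuf_page_io, guarded by #ifdef RQ_WRITE_ORDERED.)
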
diff -ru /usr/src/linux-2.4.25/include/linux/blkdev.h linux-2.4.25/include/linux/blkdev.h
--- /usr/src/linux-2.4.25/include/linux/blkdev.h        Sun Feb 22 19:19:21 2004
+++ linux-2.4.25/include/linux/blkdev.h Wed Feb 25 18:23:19 2004
@@ -32,6 +32,7 @@
 
        kdev_t rq_dev;
        int cmd;                /* READ or WRITE */
+       unsigned long cmd_flags;
        int errors;
        unsigned long start_time;
        unsigned long sector;
@@ -48,6 +49,10 @@
        request_queue_t *q;
 };
 
+#define RQ_WRITE_ORDERED       1       /* ordered write */
+#define RQ_WRITE_PREFLUSH      2       /* pre-barrier flush */
+#define RQ_WRITE_POSTFLUSH     4       /* post-barrier flush */
+
 #include <linux/elevator.h>
 
 typedef int (merge_request_fn) (request_queue_t *q, 
@@ -145,6 +150,10 @@
        int                     can_throttle:1;
 
        unsigned long           bounce_pfn;
+       /*
+        * ordered write support
+        */
+       char                    ordered;
 
        /*
         * Is meant to protect the queue in the future instead of
@@ -174,6 +183,9 @@
        }
 }
 
+#define QUEUE_ORDERED_NONE    0       /* no support */
+#define QUEUE_ORDERED_TAG     1       /* supported by tags (fast) */
+#define QUEUE_ORDERED_FLUSH   2       /* supported by cache flush (ugh!) */
 extern unsigned long blk_max_low_pfn, blk_max_pfn;
 
 #define BLK_BOUNCE_HIGH                ((u64)blk_max_low_pfn << PAGE_SHIFT)
@@ -244,6 +256,7 @@
 extern void blk_cleanup_queue(request_queue_t *);
 extern void blk_queue_headactive(request_queue_t *, int);
 extern void blk_queue_throttle_sectors(request_queue_t *, int);
+extern void blk_queue_ordered(request_queue_t *, int);
 extern void blk_queue_make_request(request_queue_t *, make_request_fn *);
 extern void generic_unplug_device(void *);
 extern inline int blk_seg_merge_ok(struct buffer_head *, struct buffer_head *);
diff -ru /usr/src/linux-2.4.25/include/linux/fs.h linux-2.4.25/include/linux/fs.h
--- /usr/src/linux-2.4.25/include/linux/fs.h    Sun Feb 22 19:18:58 2004
+++ linux-2.4.25/include/linux/fs.h     Wed Feb 25 18:35:59 2004
@@ -224,7 +224,11 @@
        BH_JBD,         /* 1 if it has an attached journal_head */
        BH_Sync,        /* 1 if the buffer is a sync read */
        BH_Delay,       /* 1 if the buffer is delayed allocate */
-
+       BH_Ordered_Tag, /* 1 if this buffer is an ordered write barrier */
+       BH_Ordered_Flush,/* 1 if this buffer is a flush write barrier */
+       BH_Ordered_Hard,/* 1 if barrier required by the caller */
+       BH_IO_OPNOTSUPP,/* 1 if block layer rejected a barrier write */
+       
        BH_PrivateStart,/* not a state bit, but the first bit available
                         * for private allocation by other entities
                         */
@@ -287,6 +291,9 @@
 #define buffer_async(bh)       __buffer_state(bh,Async)
 #define buffer_launder(bh)     __buffer_state(bh,Launder)
 #define buffer_delay(bh)       __buffer_state(bh,Delay)
+#define buffer_ordered_tag(bh) __buffer_state(bh,Ordered_Tag)
+#define buffer_ordered_hard(bh)        __buffer_state(bh,Ordered_Hard)
+#define buffer_ordered_flush(bh)       __buffer_state(bh,Ordered_Flush)
 
 #define bh_offset(bh)          ((unsigned long)(bh)->b_data & ~PAGE_MASK)
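
The BH_Ordered_Hard/BH_IO_OPNOTSUPP pair above lets a caller insist on
a real barrier and detect when the queue cannot provide one. A
hypothetical sketch of such a caller (same assumptions as the jbd
pattern earlier):

    /* hypothetical: demand a barrier, detect refusal */
    set_bit(BH_Ordered_Flush, &bh->b_state);
    set_bit(BH_Ordered_Hard, &bh->b_state); /* fail rather than degrade */
    submit_bh(WRITE, bh);
    wait_on_buffer(bh);
    if (test_bit(BH_IO_OPNOTSUPP, &bh->b_state)) {
            /* the queue rejected the barrier (see __make_request);
             * fall back to plain wait_on_buffer ordering or turn the
             * drive's write cache off */
    }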
 
diff -ru /usr/src/linux-2.4.25/include/linux/ide.h linux-2.4.25/include/linux/ide.h
--- /usr/src/linux-2.4.25/include/linux/ide.h   Sun Feb 22 19:19:59 2004
+++ linux-2.4.25/include/linux/ide.h    Wed Feb 25 18:23:19 2004
@@ -747,6 +747,7 @@
        unsigned ata_flash      : 1;    /* 1=present, 0=default */
        unsigned dead           : 1;    /* 1=dead, no new attachments */
        unsigned id_read        : 1;    /* 1=id read from disk 0 = synthetic */
+       unsigned doing_barrier  : 1;    /* state, 1=currently doing flush */
        unsigned addressing;            /*      : 3;
                                         *  0=28-bit
                                         *  1=48-bit
@@ -792,6 +793,8 @@
        int             forced_lun;     /* if hdxlun was given at boot */
        int             lun;            /* logical unit */
        int             crc_count;      /* crc counter to reduce drive speed */
+
+       char            special_buf[4]; /* IDE_DRIVE_CMD, free use */
 } ide_drive_t;
 
 typedef struct ide_pio_ops_s {