xfs
[Top] [All Lists]

[PATCH 3/3] XFS: Sort delayed write buffers before dispatch

To: xfs@xxxxxxxxxxx
Subject: [PATCH 3/3] XFS: Sort delayed write buffers before dispatch
From: Dave Chinner <david@xxxxxxxxxxxxx>
Date: Sat, 2 Jan 2010 14:03:36 +1100
In-reply-to: <1262401416-19546-1-git-send-email-david@xxxxxxxxxxxxx>
References: <1262401416-19546-1-git-send-email-david@xxxxxxxxxxxxx>
Currently when the xfsbufd writes delayed write buffers, it pushes
them to disk in the order they come off the delayed write list. If
there are lots of buffers spread widely over the disk, this results
in overwhelming the elevator sort queues in the block layer and we
end up losing the possibility of merging adjacent buffers to minimise
the number of IOs.

Add a sort array to the buftarg so that we can do high level sorting
of the buffers once they are pulled off the delwri queue for
writeback. Currently this array can hold 4096 buffers at a time
which gives us a window 32 times larger than the default elevator
maximums for ordering buffers.

Ideally this should use a list sort rather than requiring an
external buffer to sort the buffers in, but for simplicity
just do it via the sort() function. Followup patches are needed to
take the list sort functions from the DRM and UBIFS code and
make it a common function and to utilise it. That will allow
sorting the entire delwri queue to be written in one go.

Signed-off-by: Dave Chinner <david@xxxxxxxxxxxxx>
---
 fs/xfs/linux-2.6/xfs_buf.c |  121 ++++++++++++++++++++++++++++++++------------
 fs/xfs/linux-2.6/xfs_buf.h |    5 ++
 2 files changed, 93 insertions(+), 33 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index aaefc33..d53d08b 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1644,12 +1644,18 @@ xfs_buf_delwri_promote(
        spinlock_t      *dwlk = &bp->b_target->bt_delwrite_lock;
        long            age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
 
-       spin_lock(dwlk);
        ASSERT(bp->b_flags & XBF_DELWRI);
        ASSERT(bp->b_flags & _XBF_DELWRI_Q);
-       list_del(&bp->b_list);
-       list_add(&bp->b_list, dwq);
+
+       /*
+        * Check the buffer age before locking the delayed write queue as we
+        * don't need to promote buffers that are already past the flush age.
+        */
+       if (bp->b_queuetime < jiffies - age)
+               return;
        bp->b_queuetime = jiffies - age;
+       spin_lock(dwlk);
+       list_move(&bp->b_list, dwq);
        spin_unlock(dwlk);
 }
 
@@ -1723,14 +1729,55 @@ xfs_buf_delwri_split(
 
 }
 
+/*
+ * Compare function is more complex than it needs to be because
+ * the return value is only 32 bits and we are doing comparisons
+ * on 64 bit values
+ */
+int
+xfs_buf_cmp(
+       const void      *a,
+       const void      *b)
+{
+       const struct xfs_buf    *ap = *(const struct xfs_buf**)a;
+       const struct xfs_buf    *bp = *(const struct xfs_buf**)b;
+       xfs_daddr_t             diff;
+
+       diff = ap->b_bn - bp->b_bn;
+       if (diff < 0)
+               return -1;
+       if (diff > 0)
+               return 1;
+       return 0;
+}
+
+int
+xfs_buf_delwri_sort(
+       xfs_buftarg_t   *target,
+       struct list_head *list)
+{
+       int     i = 0;
+
+       while (i < XFS_BUF_SORTBUF_SIZE && !list_empty(list)) {
+               struct xfs_buf  *bp = list_entry(list->next, xfs_buf_t, b_list);
+
+               ASSERT(target == bp->b_target);
+               list_del_init(&bp->b_list);
+               target->bt_sortbuf[i++] = bp;
+       }
+       sort(target->bt_sortbuf, i, sizeof(struct xfs_buf *), xfs_buf_cmp, NULL);
+
+       target->bt_sortbuf_num = i;
+       if (!list_empty(list))
+               return 1;
+       return 0;
+}
+
 STATIC int
 xfsbufd(
        void            *data)
 {
-       struct list_head tmp;
        xfs_buftarg_t   *target = (xfs_buftarg_t *)data;
-       int             count;
-       xfs_buf_t       *bp;
 
        current->flags |= PF_MEMALLOC;
 
@@ -1739,6 +1786,9 @@ xfsbufd(
        do {
                long    age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
                long    tout = age;
+               int     count = 0;
+               int     more = 0;
+               struct list_head tmp;
 
                if (unlikely(freezing(current))) {
                        set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
@@ -1753,15 +1803,14 @@ xfsbufd(
                schedule_timeout_interruptible(tout);
 
                xfs_buf_delwri_split(target, &tmp, age);
-               count = 0;
-               while (!list_empty(&tmp)) {
-                       bp = list_entry(tmp.next, xfs_buf_t, b_list);
-                       ASSERT(target == bp->b_target);
-
-                       list_del_init(&bp->b_list);
-                       xfs_buf_iostrategy(bp);
-                       count++;
-               }
+               do {
+                       int     i;
+                       more = xfs_buf_delwri_sort(target, &tmp);
+                       for (i = 0; i < target->bt_sortbuf_num; i++) {
+                               xfs_buf_iostrategy(target->bt_sortbuf[i]);
+                               count++;
+                       }
+               } while (more);
 
                if (as_list_len > 0)
                        purge_addresses();
@@ -1783,38 +1832,44 @@ xfs_flush_buftarg(
        xfs_buftarg_t   *target,
        int             wait)
 {
-       struct list_head tmp;
-       xfs_buf_t       *bp, *n;
+       xfs_buf_t       *bp;
        int             pincount = 0;
+       int             more = 0;
+       LIST_HEAD(tmp_list);
+       LIST_HEAD(wait_list);
 
        xfs_buf_runall_queues(xfsconvertd_workqueue);
        xfs_buf_runall_queues(xfsdatad_workqueue);
        xfs_buf_runall_queues(xfslogd_workqueue);
 
        set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
-       pincount = xfs_buf_delwri_split(target, &tmp, 0);
+       pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
 
        /*
-        * Dropped the delayed write list lock, now walk the temporary list
+        * Dropped the delayed write list lock, now walk the temporary list.
+        * All I/O is issued async and then if we need to wait for completion
+        * we do that after issuing all the IO.
         */
-       list_for_each_entry_safe(bp, n, &tmp, b_list) {
-               ASSERT(target == bp->b_target);
-               if (wait)
-                       bp->b_flags &= ~XBF_ASYNC;
-               else
-                       list_del_init(&bp->b_list);
-
-               xfs_buf_iostrategy(bp);
-       }
+       do {
+               int     i;
+               more = xfs_buf_delwri_sort(target, &tmp_list);
+               for (i = 0; i < target->bt_sortbuf_num; i++) {
+                       bp = target->bt_sortbuf[i];
+                       ASSERT(target == bp->b_target);
+                       if (wait) {
+                               bp->b_flags &= ~XBF_ASYNC;
+                               list_add(&bp->b_list, &wait_list);
+                       }
+                       xfs_buf_iostrategy(bp);
+               }
+       } while (more);
 
        if (wait)
                blk_run_address_space(target->bt_mapping);
 
-       /*
-        * Remaining list items must be flushed before returning
-        */
-       while (!list_empty(&tmp)) {
-               bp = list_entry(tmp.next, xfs_buf_t, b_list);
+       /* Now wait for IO to complete if required. */
+       while (!list_empty(&wait_list)) {
+               bp = list_entry(wait_list.next, xfs_buf_t, b_list);
 
                list_del_init(&bp->b_list);
                xfs_iowait(bp);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a7c6895..599708e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -128,6 +128,8 @@ typedef struct xfs_bufhash {
        spinlock_t              bh_lock;
 } xfs_bufhash_t;
 
+#define XFS_BUF_SORTBUF_SIZE   4096
+
 typedef struct xfs_buftarg {
        dev_t                   bt_dev;
        struct block_device     *bt_bdev;
@@ -147,6 +149,9 @@ typedef struct xfs_buftarg {
        struct list_head        bt_delwrite_queue;
        spinlock_t              bt_delwrite_lock;
        unsigned long           bt_flags;
+       int                     bt_sortbuf_num;
+       struct xfs_buf *        bt_sortbuf[XFS_BUF_SORTBUF_SIZE];
+
 } xfs_buftarg_t;
 
 /*
-- 
1.6.5

<Prev in Thread] Current Thread [Next in Thread>