xfs
[Top] [All Lists]

[PATCH 1/2] xfs: dynamic speculative EOF preallocation

To: xfs@xxxxxxxxxxx
Subject: [PATCH 1/2] xfs: dynamic speculative EOF preallocation
From: Dave Chinner <david@xxxxxxxxxxxxx>
Date: Mon, 29 Nov 2010 11:43:50 +1100
In-reply-to: <1290991431-20519-1-git-send-email-david@xxxxxxxxxxxxx>
References: <1290991431-20519-1-git-send-email-david@xxxxxxxxxxxxx>
From: Dave Chinner <dchinner@xxxxxxxxxx>

Currently the size of the speculative preallocation during delayed
allocation is fixed by either the allocsize mount option or a
default size. We are seeing a lot of cases where we need to
recommend using the allocsize mount option to prevent fragmentation
when buffered writes land in the same AG.

Rather than using a fixed preallocation size by default (up to 64k),
make it dynamic by basing it on the current inode size. That way the
EOF preallocation will increase as the file size increases.  Hence
for streaming writes we are much more likely to get large
preallocations exactly when we need it to reduce fragmentation.

For default settings, the size of the initial extents is determined
by the number of parallel writers and the amount of memory in the
machine. For 4GB RAM and 4 concurrent 32GB file writes:

EXT: FILE-OFFSET           BLOCK-RANGE          AG AG-OFFSET                 
TOTAL
   0: [0..1048575]:         1048672..2097247      0 (1048672..2097247)      
1048576
   1: [1048576..2097151]:   5242976..6291551      0 (5242976..6291551)      
1048576
   2: [2097152..4194303]:   12583008..14680159    0 (12583008..14680159)    
2097152
   3: [4194304..8388607]:   25165920..29360223    0 (25165920..29360223)    
4194304
   4: [8388608..16777215]:  58720352..67108959    0 (58720352..67108959)    
8388608
   5: [16777216..33554423]: 117440584..134217791  0 (117440584..134217791) 
16777208
   6: [33554424..50331511]: 184549056..201326143  0 (184549056..201326143) 
16777088
   7: [50331512..67108599]: 251657408..268434495  0 (251657408..268434495) 
16777088

and for 16 concurrent 16GB file writes:

 EXT: FILE-OFFSET           BLOCK-RANGE          AG AG-OFFSET                 
TOTAL
   0: [0..262143]:          2490472..2752615      0 (2490472..2752615)       
262144
   1: [262144..524287]:     6291560..6553703      0 (6291560..6553703)       
262144
   2: [524288..1048575]:    13631592..14155879    0 (13631592..14155879)     
524288
   3: [1048576..2097151]:   30408808..31457383    0 (30408808..31457383)    
1048576
   4: [2097152..4194303]:   52428904..54526055    0 (52428904..54526055)    
2097152
   5: [4194304..8388607]:   104857704..109052007  0 (104857704..109052007)  
4194304
   6: [8388608..16777215]:  209715304..218103911  0 (209715304..218103911)  
8388608
   7: [16777216..33554423]: 452984848..469762055  0 (452984848..469762055) 
16777208

Because it is hard to take back speculative preallocation, cases
where there are large slow growing log files on a nearly full
filesystem may cause premature ENOSPC. Hence as the filesystem nears
full, the maximum dynamic prealloc size is reduced according to this
table (based on 4k block size):

freespace       max prealloc size
  >5%             full extent (8GB)
  4-5%             2GB (8GB >> 2)
  3-4%             1GB (8GB >> 3)
  2-3%           512MB (8GB >> 4)
  1-2%           256MB (8GB >> 5)
  <1%            128MB (8GB >> 6)

This should reduce the amount of space held in speculative
preallocation for such cases.

The allocsize mount option turns off the dynamic behaviour and fixes
the prealloc size to whatever the mount option specifies. i.e. the
behaviour is unchanged.

Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
---
 fs/xfs/xfs_fsops.c |    1 +
 fs/xfs/xfs_iomap.c |   74 +++++++++++++++++++++++++++++++++++++++++++++-------
 fs/xfs/xfs_mount.c |   26 ++++++++++++-----
 fs/xfs/xfs_mount.h |   26 +++++++++++++++++-
 4 files changed, 107 insertions(+), 20 deletions(-)

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index be34ff2..6d17206 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -374,6 +374,7 @@ xfs_growfs_data_private(
                mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
        } else
                mp->m_maxicount = 0;
+       xfs_set_low_space_thresholds(mp);
 
        /* update secondary superblocks. */
        for (agno = 1; agno < nagcount; agno++) {
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 2057614..d7c2f05 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -389,6 +389,9 @@ error_out:
  * If the caller is doing a write at the end of the file, then extend the
  * allocation out to the file system's write iosize.  We clean up any extra
  * space left over when the file is closed in xfs_inactive().
+ *
+ * If we find we already have delalloc preallocation beyond EOF, don't do more
+ * preallocation as it is not needed.
  */
 STATIC int
 xfs_iomap_eof_want_preallocate(
@@ -405,6 +408,7 @@ xfs_iomap_eof_want_preallocate(
        xfs_filblks_t   count_fsb;
        xfs_fsblock_t   firstblock;
        int             n, error, imaps;
+       int             found_delalloc = 0;
 
        *prealloc = 0;
        if ((offset + count) <= ip->i_size)
@@ -427,11 +431,16 @@ xfs_iomap_eof_want_preallocate(
                        if ((imap[n].br_startblock != HOLESTARTBLOCK) &&
                            (imap[n].br_startblock != DELAYSTARTBLOCK))
                                return 0;
+
                        start_fsb += imap[n].br_blockcount;
                        count_fsb -= imap[n].br_blockcount;
+
+                       if (imap[n].br_startblock == DELAYSTARTBLOCK)
+                               found_delalloc = 1;
                }
        }
-       *prealloc = 1;
+       if (!found_delalloc)
+               *prealloc = 1;
        return 0;
 }
 
@@ -469,6 +478,7 @@ xfs_iomap_write_delay(
        extsz = xfs_get_extsz_hint(ip);
        offset_fsb = XFS_B_TO_FSBT(mp, offset);
 
+
        error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
                                ioflag, imap, XFS_WRITE_IMAPS, &prealloc);
        if (error)
@@ -476,9 +486,44 @@ xfs_iomap_write_delay(
 
 retry:
        if (prealloc) {
+               xfs_fileoff_t   alloc_blocks = 0;
+               /*
+                * If we don't have a user specified preallocation size, dynamically
+                * increase the preallocation size as the size of the file
+                * grows. Cap the maximum size at a single extent or less if
+                * the filesystem is near full. The closer the filesystem is to
+                * full, the smaller the maximum preallocation.
+                */
+               if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
+                       int shift = 0;
+                       int64_t freesp;
+
+                       alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size);
+                       alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
+                                       rounddown_pow_of_two(alloc_blocks));
+
+                       freesp = xfs_icsb_read(mp, XFS_ICSB_FDBLOCKS);
+                       if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
+                               shift = 2;
+                               if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
+                                       shift++;
+                               if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
+                                       shift++;
+                               if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
+                                       shift++;
+                               if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
+                                       shift++;
+                       }
+                       if (shift)
+                               alloc_blocks >>= shift;
+               }
+
+               if (alloc_blocks < mp->m_writeio_blocks)
+                       alloc_blocks = mp->m_writeio_blocks;
+
                aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
                ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
-               last_fsb = ioalign + mp->m_writeio_blocks;
+               last_fsb = ioalign + alloc_blocks;
        } else {
                last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
        }
@@ -496,22 +541,31 @@ retry:
                          XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
                          XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
                          &nimaps, NULL);
-       if (error && (error != ENOSPC))
+       switch (error) {
+       case 0:
+       case ENOSPC:
+       case EDQUOT:
+               break;
+       default:
                return XFS_ERROR(error);
+       }
 
        /*
-        * If bmapi returned us nothing, and if we didn't get back EDQUOT,
-        * then we must have run out of space - flush all other inodes with
-        * delalloc blocks and retry without EOF preallocation.
+        * If bmapi returned us nothing, we got either ENOSPC or EDQUOT.  For
+        * ENOSPC, flush all other inodes with delalloc blocks to free up
+        * some of the excess reserved metadata space. For both cases, retry
+        * without EOF preallocation.
         */
        if (nimaps == 0) {
                trace_xfs_delalloc_enospc(ip, offset, count);
                if (flushed)
-                       return XFS_ERROR(ENOSPC);
+                       return XFS_ERROR(error ? error : ENOSPC);
 
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               xfs_flush_inodes(ip);
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               if (error == ENOSPC) {
+                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+                       xfs_flush_inodes(ip);
+                       xfs_ilock(ip, XFS_ILOCK_EXCL);
+               }
 
                flushed = 1;
                error = 0;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 3905fc3..98cb266 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -315,14 +315,6 @@ xfs_icsb_sum(
        return percpu_counter_sum_positive(&mp->m_icsb[counter]);
 }
 
-static inline int64_t
-xfs_icsb_read(
-       struct xfs_mount        *mp,
-       int                     counter)
-{
-       return percpu_counter_read_positive(&mp->m_icsb[counter]);
-}
-
 void
 xfs_icsb_reinit_counters(
        struct xfs_mount        *mp)
@@ -1145,6 +1137,21 @@ xfs_set_rw_sizes(xfs_mount_t *mp)
 }
 
 /*
+ * precalculate the low space thresholds for dynamic speculative preallocation.
+ */
+void
+xfs_set_low_space_thresholds(
+       struct xfs_mount        *mp)
+{
+       int i;
+
+       for (i = 0; i < XFS_LOWSP_MAX; i++) {
+               mp->m_low_space[i] = mp->m_sb.sb_dblocks / 100 * (i + 1);
+       }
+}
+
+
+/*
  * Set whether we're using inode alignment.
  */
 STATIC void
@@ -1366,6 +1373,9 @@ xfs_mountfs(
         */
        xfs_set_rw_sizes(mp);
 
+       /* set the low space thresholds for dynamic preallocation */
+       xfs_set_low_space_thresholds(mp);
+
        /*
         * Set the inode cluster size.
         * This may still be overridden by the file system
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index c8ff435..9f99a62 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -75,8 +75,15 @@ enum {
        XFS_ICSB_MAX,
 };
 
-extern int     xfs_icsb_modify_inodes(struct xfs_mount *, int, int64_t, int);
-extern int     xfs_icsb_modify_free_blocks(struct xfs_mount *, int64_t, int);
+/* dynamic preallocation free space thresholds, 5% down to 1% */
+enum {
+       XFS_LOWSP_1_PCNT = 0,
+       XFS_LOWSP_2_PCNT,
+       XFS_LOWSP_3_PCNT,
+       XFS_LOWSP_4_PCNT,
+       XFS_LOWSP_5_PCNT,
+       XFS_LOWSP_MAX,
+};
 
 typedef struct xfs_mount {
        struct super_block      *m_super;
@@ -172,6 +179,8 @@ typedef struct xfs_mount {
                                                   on the next remount,rw */
        struct shrinker         m_inode_shrink; /* inode reclaim shrinker */
        struct percpu_counter   m_icsb[XFS_ICSB_MAX];
+       int64_t                 m_low_space[XFS_LOWSP_MAX];
+                                               /* low free space thresholds */
 } xfs_mount_t;
 
 /*
@@ -333,6 +342,19 @@ extern int xfs_icsb_init_counters(struct xfs_mount *);
 extern void    xfs_icsb_reinit_counters(struct xfs_mount *);
 extern void    xfs_icsb_destroy_counters(struct xfs_mount *);
 extern void    xfs_icsb_sync_counters(struct xfs_mount *);
+extern int     xfs_icsb_modify_inodes(struct xfs_mount *, int, int64_t, int);
+extern int     xfs_icsb_modify_free_blocks(struct xfs_mount *, int64_t, int);
+
+static inline int64_t
+xfs_icsb_read(
+       struct xfs_mount        *mp,
+       int                     counter)
+{
+       return percpu_counter_read_positive(&mp->m_icsb[counter]);
+}
+
+extern void    xfs_set_low_space_thresholds(struct xfs_mount *);
+
 
 #endif /* __KERNEL__ */
 
-- 
1.7.2.3

<Prev in Thread] Current Thread [Next in Thread>