[PATCH 06/34] xfs: dynamic speculative EOF preallocation
Alex Elder
aelder at sgi.com
Mon Dec 27 08:57:36 CST 2010
On Tue, 2010-12-21 at 18:29 +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner at redhat.com>
>
> Currently the size of the speculative preallocation during delayed
> allocation is fixed by either the allocsize mount option of a
> default size. We are seeing a lot of cases where we need to
> recommend using the allocsize mount option to prevent fragmentation
> when buffered writes land in the same AG.
This looks good. Logarithmic reduction in the size
as the file system gets close to full seems like a
reasonable heuristic. I have a few minor comments
below, which you can address at your option.
In any case:
Reviewed-by: Alex Elder <aelder at sgi.com>
>
> Rather than using a fixed preallocation size by default (up to 64k),
> make it dynamic by basing it on the current inode size. That way the
> EOF preallocation will increase as the file size increases. Hence
> for streaming writes we are much more likely to get large
> preallocations exactly when we need it to reduce fragementation.
>
> For default settings, the size of the initial extents is determined
> by the number of parallel writers and the amount of memory in the
> machine. For 4GB RAM and 4 concurrent 32GB file writes:
>
> EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL
> 0: [0..1048575]: 1048672..2097247 0 (1048672..2097247) 1048576
> 1: [1048576..2097151]: 5242976..6291551 0 (5242976..6291551) 1048576
> 2: [2097152..4194303]: 12583008..14680159 0 (12583008..14680159) 2097152
> 3: [4194304..8388607]: 25165920..29360223 0 (25165920..29360223) 4194304
> 4: [8388608..16777215]: 58720352..67108959 0 (58720352..67108959) 8388608
> 5: [16777216..33554423]: 117440584..134217791 0 (117440584..134217791) 16777208
> 6: [33554424..50331511]: 184549056..201326143 0 (184549056..201326143) 16777088
> 7: [50331512..67108599]: 251657408..268434495 0 (251657408..268434495) 16777088
>
> and for 16 concurrent 16GB file writes:
>
> EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL
> 0: [0..262143]: 2490472..2752615 0 (2490472..2752615) 262144
> 1: [262144..524287]: 6291560..6553703 0 (6291560..6553703) 262144
> 2: [524288..1048575]: 13631592..14155879 0 (13631592..14155879) 524288
> 3: [1048576..2097151]: 30408808..31457383 0 (30408808..31457383) 1048576
> 4: [2097152..4194303]: 52428904..54526055 0 (52428904..54526055) 2097152
> 5: [4194304..8388607]: 104857704..109052007 0 (104857704..109052007) 4194304
> 6: [8388608..16777215]: 209715304..218103911 0 (209715304..218103911) 8388608
> 7: [16777216..33554423]: 452984848..469762055 0 (452984848..469762055) 16777208
>
> Because it is hard to take back specualtive preallocation, cases
> where there are large slow growing log files on a nearly full
> filesystem may cause premature ENOSPC. Hence as the filesystem nears
> full, the maximum dynamic prealloc size іs reduced according to this
> table (based on 4k block size):
>
> freespace max prealloc size
> >5% full extent (8GB)
> 4-5% 2GB (8GB >> 2)
> 3-4% 1GB (8GB >> 3)
> 2-3% 512MB (8GB >> 4)
> 1-2% 256MB (8GB >> 5)
> <1% 128MB (8GB >> 6)
On really small filesystems this might be excessive.
On the other hand, you're already basing the size on
the file size so that should limit things just fine.
> This should reduce the amount of space held in speculative
> preallocation for such cases.
>
> The allocsize mount option turns off the dynamic behaviour and fixes
> the prealloc size to whatever the mount option specifies. i.e. the
> behaviour is unchanged.
>
> Signed-off-by: Dave Chinner <dchinner at redhat.com>
> ---
> fs/xfs/xfs_fsops.c | 1 +
> fs/xfs/xfs_iomap.c | 84 +++++++++++++++++++++++++++++++++++++++++++++------
> fs/xfs/xfs_mount.c | 21 +++++++++++++
> fs/xfs/xfs_mount.h | 14 ++++++++
> 4 files changed, 110 insertions(+), 10 deletions(-)
>
> diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
> index be34ff2..6d17206 100644
> --- a/fs/xfs/xfs_fsops.c
> +++ b/fs/xfs/xfs_fsops.c
> @@ -374,6 +374,7 @@ xfs_growfs_data_private(
> mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
> } else
> mp->m_maxicount = 0;
> + xfs_set_low_space_thresholds(mp);
>
> /* update secondary superblocks. */
> for (agno = 1; agno < nagcount; agno++) {
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 22b62a1..f36d2c8 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -267,6 +267,9 @@ error_out:
> * If the caller is doing a write at the end of the file, then extend the
> * allocation out to the file system's write iosize. We clean up any extra
> * space left over when the file is closed in xfs_inactive().
> + *
> + * If we find we already have delalloc preallocation beyond EOF, don't do more
> + * preallocation as it it not needed.
> */
> STATIC int
> xfs_iomap_eof_want_preallocate(
> @@ -282,6 +285,7 @@ xfs_iomap_eof_want_preallocate(
> xfs_filblks_t count_fsb;
> xfs_fsblock_t firstblock;
> int n, error, imaps;
> + int found_delalloc = 0;
>
> *prealloc = 0;
> if ((offset + count) <= ip->i_size)
> @@ -306,12 +310,60 @@ xfs_iomap_eof_want_preallocate(
> return 0;
> start_fsb += imap[n].br_blockcount;
> count_fsb -= imap[n].br_blockcount;
> +
> + if (imap[n].br_startblock == DELAYSTARTBLOCK)
> + found_delalloc = 1;
> }
> }
> - *prealloc = 1;
> + if (!found_delalloc)
> + *prealloc = 1;
Isn't this a separate change, possibly worthy of its
own commit?
> return 0;
> }
>
> +/*
> + * If we don't have a user specified preallocation size, dynamically increase
> + * the preallocation size as the size of the file grows. Cap the maximum size
> + * at a single extent or less if the filesystem is near full. The closer the
> + * filesystem is to full, the smaller the maximum prealocation.
> + */
> +STATIC xfs_fsblock_t
> +xfs_iomap_prealloc_size(
> + struct xfs_mount *mp,
> + struct xfs_inode *ip)
> +{
> + xfs_fsblock_t alloc_blocks = 0;
> +
> + if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
> + int shift = 0;
> + int64_t freesp;
> +
> + alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size);
Why not i_size_read()? Only matters for 32-bit, but
you're using do_div() below so you do seem to care
about it sometimes.
> + alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
> + rounddown_pow_of_two(alloc_blocks));
> +
> + freesp = percpu_counter_read_positive(
> + &mp->m_icsb[XFS_ICSB_FDBLOCKS]);
> + if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
> + shift = 2;
> + if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
> + shift++;
> + if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
> + shift++;
> + if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
> + shift++;
> + if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
> + shift++;
> + }
> + if (shift)
> + alloc_blocks >>= shift;
> + }
> +
> + if (alloc_blocks < mp->m_writeio_blocks)
> + alloc_blocks = mp->m_writeio_blocks;
> +
> + return alloc_blocks;
> +}
> +
> int
> xfs_iomap_write_delay(
> xfs_inode_t *ip,
> @@ -344,6 +396,7 @@ xfs_iomap_write_delay(
> extsz = xfs_get_extsz_hint(ip);
> offset_fsb = XFS_B_TO_FSBT(mp, offset);
>
> +
Extra blank line.
> error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
> imap, XFS_WRITE_IMAPS, &prealloc);
> if (error)
> @@ -351,9 +404,11 @@ xfs_iomap_write_delay(
>
> retry:
> if (prealloc) {
> + xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
> +
> aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
> ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
> - last_fsb = ioalign + mp->m_writeio_blocks;
> + last_fsb = ioalign + alloc_blocks;
> } else {
> last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
> }
> @@ -371,22 +426,31 @@ retry:
> XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
> XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
> &nimaps, NULL);
> - if (error && (error != ENOSPC))
> + switch (error) {
> + case 0:
> + case ENOSPC:
> + case EDQUOT:
> + break;
This (above and below, in this hunk) is also a separate
change, possibly worthy of its own small commit.
> + default:
> return XFS_ERROR(error);
> + }
>
> /*
> - * If bmapi returned us nothing, and if we didn't get back EDQUOT,
> - * then we must have run out of space - flush all other inodes with
> - * delalloc blocks and retry without EOF preallocation.
> + * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For
> + * ENOSPC, * flush all other inodes with delalloc blocks to free up
> + * some of the excess reserved metadata space. For both cases, retry
> + * without EOF preallocation.
> */
> if (nimaps == 0) {
> trace_xfs_delalloc_enospc(ip, offset, count);
> if (flushed)
> - return XFS_ERROR(ENOSPC);
> + return XFS_ERROR(error ? error : ENOSPC);
>
> - xfs_iunlock(ip, XFS_ILOCK_EXCL);
> - xfs_flush_inodes(ip);
> - xfs_ilock(ip, XFS_ILOCK_EXCL);
> + if (error == ENOSPC) {
> + xfs_iunlock(ip, XFS_ILOCK_EXCL);
> + xfs_flush_inodes(ip);
> + xfs_ilock(ip, XFS_ILOCK_EXCL);
> + }
>
> flushed = 1;
> error = 0;
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index d5710232..f1b094d 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -1101,6 +1101,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp)
> }
>
> /*
> + * precalculate the low space thresholds for dynamic speculative preallocation.
> + */
> +void
> +xfs_set_low_space_thresholds(
> + struct xfs_mount *mp)
> +{
> + int i;
> +
> + for (i = 0; i < XFS_LOWSP_MAX; i++) {
> + __uint64_t space = mp->m_sb.sb_dblocks;
> +
> + do_div(space, 100);
How about computing this once, outside the loop?
> + mp->m_low_space[i] = space * (i + 1);
> + }
> +}
> +
> +
> +/*
> * Set whether we're using inode alignment.
> */
> STATIC void
> @@ -1322,6 +1340,9 @@ xfs_mountfs(
> */
> xfs_set_rw_sizes(mp);
>
> + /* set the low space thresholds for dynamic preallocation */
> + xfs_set_low_space_thresholds(mp);
> +
> /*
> * Set the inode cluster size.
> * This may still be overridden by the file system
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 03ad25c6..7b42e04 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -75,6 +75,16 @@ enum {
> XFS_ICSB_MAX,
> };
>
> +/* dynamic preallocation free space thresholds, 5% down to 1% */
> +enum {
> + XFS_LOWSP_1_PCNT = 0,
Why not make 1% be 1, 2% be 2, etc.?
> + XFS_LOWSP_2_PCNT,
> + XFS_LOWSP_3_PCNT,
> + XFS_LOWSP_4_PCNT,
> + XFS_LOWSP_5_PCNT,
> + XFS_LOWSP_MAX,
> +};
> +
> typedef struct xfs_mount {
> struct super_block *m_super;
> xfs_tid_t m_tid; /* next unused tid for fs */
> @@ -169,6 +179,8 @@ typedef struct xfs_mount {
> on the next remount,rw */
> struct shrinker m_inode_shrink; /* inode reclaim shrinker */
> struct percpu_counter m_icsb[XFS_ICSB_MAX];
> + int64_t m_low_space[XFS_LOWSP_MAX];
> + /* low free space thresholds */
> } xfs_mount_t;
>
> /*
> @@ -333,6 +345,8 @@ extern void xfs_icsb_sync_counters(struct xfs_mount *);
> extern int xfs_icsb_modify_inodes(struct xfs_mount *, int, int64_t);
> extern int xfs_icsb_modify_free_blocks(struct xfs_mount *, int64_t, int);
>
> +extern void xfs_set_low_space_thresholds(struct xfs_mount *);
> +
> #endif /* __KERNEL__ */
>
> extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
More information about the xfs
mailing list