[PATCH 5/9] xfs: add configuration of error failure speed
Brian Foster
bfoster at redhat.com
Tue Feb 16 10:44:33 CST 2016
On Fri, Feb 05, 2016 at 12:23:23PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner at redhat.com>
>
> On reception of an error, we can fail immediately, perform a
> bounded number of retries, or retry indefinitely. The current
> behaviour is to retry forever.
>
> However, we'd like the ability to choose what behaviour we have, and
> that requires the ability to configure the behaviour through the new
> sysfs interfaces. Add configuration options for fail fast, slow or
> never to reflect the three choices above. Fail fast or fail never
> don't require any other options, but "fail slow" needs configuration
> to bound the retry behaviour. Add both a maximum retry count and a
> retry timeout so that we can bound by time and/or physical IO
> attempts.
>
> Finally, plumb these into xfs_buf_iodone error processing so that
> the error behaviour follows the selected configuration.
>
> Signed-off-by: Dave Chinner <dchinner at redhat.com>
> ---
> fs/xfs/xfs_buf.h | 23 ++++++++-
> fs/xfs/xfs_buf_item.c | 22 ++++++++-
> fs/xfs/xfs_mount.h | 2 +
> fs/xfs/xfs_sysfs.c | 128 ++++++++++++++++++++++++++++++++++++++++++++++++--
> 4 files changed, 169 insertions(+), 6 deletions(-)
>
...
> diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
> index 68e34d1..7afd4d5 100644
> --- a/fs/xfs/xfs_buf_item.c
> +++ b/fs/xfs/xfs_buf_item.c
...
> @@ -979,9 +982,25 @@ xfs_buf_iodone_callback_error(
> * Repeated failure on an async write. Take action according to the
> * error configuration we have been set up to use.
> */
> - if (cfg->fail_speed == XFS_ERR_FAIL_FAST)
> + switch (cfg->fail_speed) {
> + case XFS_ERR_FAIL_FAST:
> goto permanent_error;
>
> + case XFS_ERR_FAIL_SLOW:
> + if (++bp->b_retries > cfg->max_retries)
> + goto permanent_error;
> + if (!cfg->retry_timeout)
> + break;
> + if (time_after(jiffies,
> + cfg->retry_timeout + bp->b_first_retry_time))
> + goto permanent_error;
> + break;
> +
> + case XFS_ERR_FAIL_NEVER:
> + default:
> + break;
> + }
> +
I wonder a bit how granular this system needs to be in terms of user
interface, at least right now. For example, fail fast and fail never
just seem like variants of fail slow with particular tunables. Fail fast
is roughly equivalent to a retry count of one, whereas fail never
implies an infinite (e.g., -1) retry count. Do we really need the higher
level classification?
> /* still a transient error, higher layers will retry */
> xfs_buf_ioerror(bp, 0);
> xfs_buf_relse(bp);
> @@ -1023,6 +1042,7 @@ xfs_buf_iodone_callbacks(
> * retry state here in preparation for the next error that may occur.
> */
> bp->b_last_error = 0;
> + bp->b_retries = 0;
>
> xfs_buf_do_callbacks(bp);
> bp->b_fspriv = NULL;
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 9a61f39..2a3d178 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -62,6 +62,8 @@ enum {
> struct xfs_error_cfg {
> struct xfs_kobj kobj;
> int fail_speed;
> + int max_retries; /* INT_MAX = retry forever */
> + unsigned long retry_timeout; /* in jiffies, 0 = no timeout */
> };
>
> typedef struct xfs_mount {
> diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
> index 27487ce..51d9fa7 100644
> --- a/fs/xfs/xfs_sysfs.c
> +++ b/fs/xfs/xfs_sysfs.c
...
> @@ -330,6 +326,123 @@ to_error_cfg(struct kobject *kobject)
...
> +static ssize_t
> +retry_timeout_seconds_show(
> + struct kobject *kobject,
> + char *buf)
> +{
> + struct xfs_error_cfg *cfg = to_error_cfg(kobject);
> +
> + return snprintf(buf, PAGE_SIZE, "%ld\n",
Trailing whitespace here ^
Brian
> + jiffies_to_msecs(cfg->retry_timeout) * MSEC_PER_SEC);
> +}
> +
> +static ssize_t
> +retry_timeout_seconds_store(
> + struct kobject *kobject,
> + const char *buf,
> + size_t count)
> +{
> + struct xfs_error_cfg *cfg = to_error_cfg(kobject);
> + int ret;
> + int val;
> +
> + ret = kstrtoint(buf, 0, &val);
> + if (ret)
> + return ret;
> +
> + /* 1 day timeout maximum */
> + if (val < 0 || val > 86400)
> + return -EINVAL;
> +
> + cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC);
> + return count;
> +}
> +XFS_SYSFS_ATTR_RW(retry_timeout_seconds);
> +
> +static struct attribute *xfs_error_attrs[] = {
> + ATTR_LIST(failure_speed),
> + ATTR_LIST(max_retries),
> + ATTR_LIST(retry_timeout_seconds),
> + NULL,
> +};
> +
> +
> struct kobj_type xfs_error_cfg_ktype = {
> .release = xfs_sysfs_release,
> .sysfs_ops = &xfs_sysfs_ops,
> @@ -349,11 +462,15 @@ struct kobj_type xfs_error_ktype = {
> struct xfs_error_init {
> char *name;
> int fail_speed;
> + int max_retries;
> + int retry_timeout; /* in seconds */
> };
>
> static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = {
> { .name = "Default",
> .fail_speed = XFS_ERR_FAIL_NEVER,
> + .max_retries = INT_MAX,
> + .retry_timeout = 0,
> },
> };
>
> @@ -384,6 +501,9 @@ xfs_error_sysfs_init_class(
> goto out_error;
>
> cfg->fail_speed = init[i].fail_speed;
> + cfg->max_retries = init[i].max_retries;
> + cfg->retry_timeout = msecs_to_jiffies(
> + init[i].retry_timeout * MSEC_PER_SEC);
> }
> return 0;
>
> --
> 2.5.0
>
> _______________________________________________
> xfs mailing list
> xfs at oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs
More information about the xfs
mailing list