[PATCH] xfs: di_flushiter considered harmful
Markus Trippelsdorf
markus at trippelsdorf.de
Mon Jul 22 10:15:42 CDT 2013
On 2013.07.22 at 09:40 -0500, Mark Tinguely wrote:
> On 07/22/13 06:07, Markus Trippelsdorf wrote:
> > On 2013.07.22 at 20:18 +1000, Dave Chinner wrote:
> >> From: Dave Chinner<dchinner at redhat.com>
> >>
> >> When we made all inode updates transactional, we no longer needed
> >> the log recovery detection for inodes being newer on disk than the
> >> transaction being replayed - it was redundant as replay of the log
> >> would always result in the latest version of the inode being on
> >> disk. It was redundant, but left in place because it wasn't
> >> considered to be a problem.
> >>
> >> However, with the new "don't read inodes on create" optimisation,
> >> flushiter has come back to bite us. Essentially, the optimisation
> >> always initialises flushiter to zero in the create transaction,
> >> and so if we then crash and run recovery and the inode already on
> >> disk has a non-zero flushiter it will skip recovery of that inode.
> >> As a result, log recovery does the wrong thing and we end up with a
> >> corrupt filesystem.
> >>
> >> Because we have to support old kernel to new kernel upgrades, we
> >> can't just get rid of the flushiter support in log recovery as we
> >> might be upgrading from a kernel that doesn't have fully transactional
> >> inode updates. Unfortunately, for v4 superblocks there is no way to
> >> guarantee that log recovery knows about this fact.
> >>
> >> We cannot add a new inode format flag to say it's a "special inode
> >> create" because it won't be understood by older kernels and so
> >> recovery could do the wrong thing on downgrade. We cannot specially
> >> detect the combination of zero mode/non-zero flushiter on disk to
> >> non-zero mode, zero flushiter in the log item during recovery
> >> because wrapping of the flushiter can result in false detection.
> >>
> >> Hence that makes this "don't use flushiter" optimisation limited to
> >> a disk format that guarantees that we don't need it. And that means
> >> the only fix here is to limit the "no read IO on create"
> >> optimisation to version 5 superblocks....
> >
> > I think your patch misses the following part:
> >
>
>
> Dave's patch is limited to the new v5 (crc) superblock. The constraints
> that have to be dealt with, and the reason it is limited to the new v5
> superblock, are explained in the commit message.
>
> Going back to your 07/10/2013 message, your filesystem is:
>
> /dev/root on / type xfs (rw,relatime,attr2,inode64,logbsize=256k,noquota)
>
> or the non-crc v4 superblock with inode 2, which is probably why it is
> still failing for you.
>
> It seems to me that since we cannot fix this for inode 1/2, then besides
> this patch we have to revert patch cca9f93a52d and make it inode 3+ /
> superblock 5+ (crc) dependent.
Which is exactly what the hunk I've posted does.
Here's the combined patch:
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index 07d735a..e5869b5 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -39,6 +39,9 @@ typedef struct xfs_timestamp {
* There is a very similar struct icdinode in xfs_inode which matches the
* layout of the first 96 bytes of this structure, but is kept in native
* format instead of big endian.
+ *
+ * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed
+ * padding field for v3 inodes.
*/
typedef struct xfs_dinode {
__be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b78481f..5d7e344 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -896,7 +896,6 @@ xfs_dinode_to_disk(
to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
- to->di_flushiter = cpu_to_be16(from->di_flushiter);
to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
@@ -924,6 +923,9 @@ xfs_dinode_to_disk(
to->di_lsn = cpu_to_be64(from->di_lsn);
memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &from->di_uuid);
+ to->di_flushiter = 0;
+ } else {
+ to->di_flushiter = cpu_to_be16(from->di_flushiter);
}
}
@@ -1054,17 +1056,15 @@ xfs_iread(
/* shortcut IO on inode allocation if possible */
if ((iget_flags & XFS_IGET_CREATE) &&
- !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+ !(mp->m_flags & XFS_MOUNT_IKEEP) &&
+ xfs_sb_version_hascrc(&mp->m_sb)) {
/* initialise the on-disk inode core */
memset(&ip->i_d, 0, sizeof(ip->i_d));
ip->i_d.di_magic = XFS_DINODE_MAGIC;
ip->i_d.di_gen = prandom_u32();
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- ip->i_d.di_version = 3;
- ip->i_d.di_ino = ip->i_ino;
- uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
- } else
- ip->i_d.di_version = 2;
+ ip->i_d.di_version = 3;
+ ip->i_d.di_ino = ip->i_ino;
+ uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
return 0;
}
@@ -2882,12 +2882,18 @@ xfs_iflush_int(
__func__, ip->i_ino, ip->i_d.di_forkoff, ip);
goto corrupt_out;
}
+
/*
- * bump the flush iteration count, used to detect flushes which
- * postdate a log record during recovery. This is redundant as we now
- * log every change and hence this can't happen. Still, it doesn't hurt.
+ * Inode item log recovery for v1/v2 inodes are dependent on the
+ * di_flushiter count for correct sequencing. We bump the flush
+ * iteration count so we can detect flushes which postdate a log record
+ * during recovery. This is redundant as we now log every change and
+ * hence this can't happen but we need to still do it to ensure
+ * backwards compatibility with old kernels that predate logging all
+ * inode changes.
*/
- ip->i_d.di_flushiter++;
+ if (ip->i_d.di_version < 3)
+ ip->i_d.di_flushiter++;
/*
* Copy the dirty parts of the inode into the on-disk
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 6fcc910a..7681b19 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2592,8 +2592,16 @@ xlog_recover_inode_pass2(
goto error;
}
- /* Skip replay when the on disk inode is newer than the log one */
- if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
+ /*
+ * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
+ * are transactional and if ordering is necessary we can determine that
+ * more accurately by the LSN field in the V3 inode core. Don't trust
+ * the inode versions we might be changing them here - use the
+ * superblock flag to determine whether we need to look at di_flushiter
+ * to skip replay when the on disk inode is newer than the log one
+ */
+ if (!xfs_sb_version_hascrc(&mp->m_sb) &&
+ dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
/*
* Deal with the wrap case, DI_MAX_FLUSH is less
* than smaller numbers
@@ -2608,6 +2616,7 @@ xlog_recover_inode_pass2(
goto error;
}
}
+
/* Take the opportunity to reset the flush iteration count */
dicp->di_flushiter = 0;
--
Markus
More information about the xfs
mailing list