Hi,
I have tracked down the probable cause for processes hanging when acting
on an XFS filesystem, running on a system with a kernel compiled with
GCC 2.95.x. I have included a patch that seems to correct this problem.
The first mention of this bug was in the following thread:
http://oss.sgi.com/projects/linux-xfs/indexed/msg01455.html
This bug can be easily observed by untaring a large tarball on to the
XFS mount. For testing, I used the Debian base2_2.tgz tarball; it has a
good mix of large and many small files (eg. the /dev/* files).
ftp://ftp.us.debian.org:/debian/dists/potato/main/disks-i386/current/base2_2.tgz
This bug is only triggered when compiling a kernel using GCC 2.95.x with
the -march=i686 optimization flag enabled. Using -march=i386, -march=i486,
or -march=i586 will produce a working kernel.
In xfs_trans_ail.c, xfs_trans_push_ail() line 141, GCC 2.95.x generates
incorrect code for the following line:
while (((restarts < XFS_TRANS_PUSH_AIL_RESTARTS) &&
(XFS_LSN_CMP(lip->li_lsn, threshold_lsn) < 0))) {
More specifically, XFS_LSN_CMP(), really a macro wrapper for the static
inline function _lsn_cmp(), defined in xfs_log.h, is inlined incorrectly
by the i686 optimizer. The resulting code causes the condition in the
while() statement to always be false. Consequently the body of the
while() statement never executes.
Making _lsm_cmp() into a non-inlined function is a quick fix for the
problem. I don't know specifically what triggers this bug, so I can't
offer any suggestions for an elegant work around for an inlined
_lsm_cmp().
Cheers,
..h
---- Patch Starts ----
diff -urN linux.orig/Makefile linux/Makefile
--- linux.orig/Makefile Thu Feb 1 12:10:24 2001
+++ linux/Makefile Wed Feb 7 13:39:22 2001
@@ -29,7 +29,7 @@
#comment out this line if compiling on RH 7.0
#CC = $(CROSS_COMPILE)gcc -V egcs-2.91.66
# AND uncomment the following line
-CC = $(CROSS_COMPILE)kgcc
+CC = $(CROSS_COMPILE)gcc
CPP = $(CC) -E
AR = $(CROSS_COMPILE)ar
NM = $(CROSS_COMPILE)nm
diff -urN linux.orig/fs/xfs/xfs_log.h linux/fs/xfs/xfs_log.h
--- linux.orig/fs/xfs/xfs_log.h Tue Nov 14 18:40:34 2000
+++ linux/fs/xfs/xfs_log.h Wed Feb 7 13:30:17 2001
@@ -50,7 +50,8 @@
* By comparing each compnent, we don't have to worry about extra
* endian issues in treating two 32 bit numbers as one 64 bit number
*/
-static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2,
xfs_arch_t arch)
+
+static inline xfs_lsn_t _lsn_cmp_inline(xfs_lsn_t lsn1, xfs_lsn_t lsn2,
xfs_arch_t arch)
{
if (CYCLE_LSN(lsn1, arch) != CYCLE_LSN(lsn2, arch))
return (CYCLE_LSN(lsn1, arch)<CYCLE_LSN(lsn2, arch))? -999 :
999;
@@ -60,6 +61,19 @@
return 0;
}
+
+#define _lsn_cmp_inline(x,y,arch) _lsn_cmp(x,y,arch)
+/*
+ * Unfortunately, the __i686__ or __pentiumpro__ macros do not seem to be
+ * defined in gcc-2.95.x, so checking for GCC version will have to do for now.
+ */
+#ifdef __GNUC__
+# if ((__GNUC__ == 2) && (__GNUC_MINOR__ == 95))
+/* defined in xfs_trans_ail.c */
+# undef _lsn_cmp
+extern xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2, xfs_arch_t arch);
+# endif
+#endif
#define XFS_LSN_CMP_ARCH(x,y,arch) _lsn_cmp(x, y, arch)
#define XFS_LSN_CMP(x,y) XFS_LSN_CMP_ARCH(x,y,ARCH_NOCONVERT)
diff -urN linux.orig/fs/xfs/xfs_trans_ail.c linux/fs/xfs/xfs_trans_ail.c
--- linux.orig/fs/xfs/xfs_trans_ail.c Mon Sep 25 00:42:07 2000
+++ linux/fs/xfs/xfs_trans_ail.c Wed Feb 7 13:30:17 2001
@@ -59,6 +59,38 @@
#define xfs_ail_check(a)
#endif /* XFSDEBUG */
+#ifdef __GNUC__
+# if ((__GNUC__ == 2) && (__GNUC_MINOR__ == 95))
+/*
+ * Un-inlined version of _lsn_cmp(), to work around a gcc-2.95.x inlining
+ * bug. This bug is only invoked by the -march=i686 flag; other
+ * architecture specific optimizations, such as -march=i586, seem fine.
+ *
+ * I don't know where this function should belong; although the lsn macros
+ * are defined in xfs_log.h, it doesn't really belong with the xlog_*
+ * functions in xfs_log.c.
+ *
+ * HHP
+ */
+xfs_lsn_t _lsn_cmp(
+ xfs_lsn_t lsn1,
+ xfs_lsn_t lsn2,
+ xfs_arch_t arch)
+{
+ /* copy and pasted from xfs_log.h */
+ if (CYCLE_LSN(lsn1, arch) != CYCLE_LSN(lsn2, arch))
+ return (CYCLE_LSN(lsn1, arch)<CYCLE_LSN(lsn2, arch))? -999 :
999;
+
+ if (BLOCK_LSN(lsn1, arch) != BLOCK_LSN(lsn2, arch))
+ return (BLOCK_LSN(lsn1, arch)<BLOCK_LSN(lsn2, arch))? -999 :
999;
+
+ return 0;
+
+ /* return _lsn_cmp_inline( lsn1, lsn2, arch ); */
+}
+# endif
+#endif
+
/*
* This is called by the log manager code to determine the LSN
|