
File: [Development] / xfs-linux / xfs_rw.c

Revision 1.296, Mon Jan 24 21:38:02 2000 UTC by lord
Branch: MAIN
Changes since 1.295: +8 -6 lines

Merge from irix/irix6.5f to pingu/slinx-xfs
Merge of irix6.5f:irix:32165a created by doucette on 11/02/99
  xfs_dio_write_zero_rtarea doesn't need in+out offset & length parameters
  any more.  It's now only called for filesystems that don't allow unwritten
  extents; otherwise we just split the extent into normal & unwritten.
  xfs_dio_write does the locking for xfs_dio_write_zero_rtarea.
  It also needs to get the iolock exclusive when that routine will
  be called, otherwise there's a race on that extent.
  Also fix the realtime space reservation for realtime files with an
  extent size set.  When the realtime extent is allocated it's marked
  unwritten (if allowed) then the written part is converted to normal.
  Add a trace point for xfs_diordwr.

Merge of irix6.5f:irix:33486a created by doucette on 11/16/99
  In xfs_read() and xfs_write() cast count to ssize_t at the point of its
  comparison with 0, and fix the error value to EINVAL if negative.
  In xfs_write_file() change the type of count to size_t.
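  A minimal sketch of the check described above (illustrative only, not
  necessarily the exact diff that was merged):

	if ((ssize_t)count < 0)
		return XFS_ERROR(EINVAL);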

Merge of irix6.5f:irix:34672a created by doucette on 12/03/99
  In xfs_bmap assert that the realtime bit in iocore flags matches the
  inode's copy before we use it to control anything.

/*
 * Copyright (C) 1999 Silicon Graphics, Inc.  All Rights Reserved.
 * 
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as published
 * by the Free Software Foundation.
 * 
 * This program is distributed in the hope that it would be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  Further, any license provided herein,
 * whether implied or otherwise, is limited to this program in accordance with
 * the express provisions of the GNU General Public License.  Patent licenses,
 * if any, provided herein do not apply to combinations of this program with
 * other product or programs, or any other product whatsoever.  This program is
 * distributed without any warranty that the program is delivered free of the
 * rightful claim of any third person by way of infringement or the like.  See
 * the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston MA 02111-1307, USA.
 */
#ident "$Revision$"

#if defined(__linux__)
#include <xfs_linux.h>
#endif

#include <sys/param.h>
#include "xfs_buf.h"
#include <sys/uio.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/cred.h>
#include <sys/sysmacros.h>
#include <sys/pfdat.h>
#include <sys/uuid.h>
#include <sys/major.h>
#include <sys/grio.h>
#include <sys/pda.h>
#include <sys/dmi_kern.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/errno.h> 
#include <sys/fcntl.h>
#include <sys/var.h>
#include <sys/conf.h>
#include <sys/systm.h>
#include <sys/uthread.h>
#include <ksys/as.h>
#include <sys/kmem.h>
#include <sys/sema.h>
#include <ksys/vfile.h>
#include <sys/flock.h>
#include <sys/fs_subr.h>
#include <sys/dmi.h>
#include <sys/dmi_kern.h>
#include <sys/schedctl.h>
#include <sys/atomic_ops.h>
#include <sys/ktrace.h>
#include <sys/sysinfo.h>
#include <sys/ksa.h>
#include <ksys/sthread.h>
#include "xfs_macros.h"
#include "xfs_types.h"
#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir.h"
#include "xfs_dir2.h"
#include "xfs_mount.h"
#include "xfs_alloc_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_itable.h"
#include "xfs_btree.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_ialloc.h"
#include "xfs_attr_sf.h"
#include "xfs_dir_sf.h"
#include "xfs_dir2_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode_item.h"
#include "xfs_inode.h"
#include "xfs_error.h"
#include "xfs_bit.h"
#include "xfs_rw.h"
#include "xfs_quota.h"
#include "xfs_trans_space.h"
#include "xfs_dmapi.h"
#include "xfs_cxfs.h"
#include <limits.h>

/*
 * turning on UIOSZ_DEBUG in a DEBUG kernel causes each xfs_write/xfs_read
 * to set the write/read i/o size to a random valid value and instruments
 * the distribution.
 *
#define UIOSZ_DEBUG
 */

#ifdef UIOSZ_DEBUG
int uiodbg = 0;
int uiodbg_readiolog[XFS_UIO_MAX_READIO_LOG - XFS_UIO_MIN_READIO_LOG + 1] =
		{0, 0, 0, 0};
int uiodbg_writeiolog[XFS_UIO_MAX_WRITEIO_LOG - XFS_UIO_MIN_WRITEIO_LOG + 1] =
		{0, 0, 0, 0};
int uiodbg_switch = 0;
#endif

#define XFS_NUMVNMAPS 10	    /* number of uacc maps to pass to VM */

extern int xfs_nfs_io_units;
typedef	uuid_t		stream_id_t;
void daemonize(void); /* from linux/xfs_thread.c */
void set_thread_name(char *name); /* from linux/xfs_thread.c */
extern void griostrategy(xfs_buf_t *bp);  /* prototype -- where to find it? */
int grio_io_is_guaranteed( vfile_t *fp, stream_id_t *stream_id);  /* prototype -- where to find it? */
extern int			grio_monitor_start( sysarg_t );
int grio_monitor_io_start( stream_id_t *stream_id, __int64_t iosize);
int grio_monitor_io_end( stream_id_t *stream_id, int index );

int sthread_create(char *name,
				   caddr_t stack_addr,
				   uint_t stack_size,
				   uint_t flags,
				   uint_t pri,
				   uint_t schedflags,
				   st_func_t func,
				   void *arg0,
				   void *arg1,
				   void *arg2,
				   void *arg3);  /* from linux/xfs_thread.c */
/*
 * This lock is used by xfs_strat_write().
 * The xfs_strat_lock is initialized in xfs_init().
 */
lock_t	xfs_strat_lock;

/*
 * Variables for coordination with the xfs_strat daemon.
 * The xfsc_lock and xfsc_wait variables are initialized
 * in xfs_init();
 */
static int	xfsc_count;
static xfs_buf_t	*xfsc_list;
static int	xfsc_bufcount;
lock_t		xfsc_lock;
sv_t		xfsc_wait;

/*
 * Variables for coordination with the xfsd daemons.
 * The xfsd_lock and xfsd_wait variables are initialized
 * in xfs_init();
 */
static int	xfsd_count;
static xfs_buf_t	*xfsd_list;
static int	xfsd_bufcount;
lock_t		xfsd_lock;
sv_t		xfsd_wait;

/*
 * Zone allocator for xfs_gap_t structures.
 */
zone_t		*xfs_gap_zone;

#ifdef DEBUG
/*
 * Global trace buffer for xfs_strat_write() tracing.
 */
ktrace_t	*xfs_strat_trace_buf;
#endif

#if !defined(XFS_STRAT_TRACE)
#define	xfs_strat_write_bp_trace(tag, ip, bp)
#define	xfs_strat_write_subbp_trace(tag, io, bp, rbp, loff, lcnt, lblk)
#endif	/* !XFS_STRAT_TRACE */

STATIC int
xfs_zero_last_block(
	vnode_t		*vp,
	xfs_iocore_t	*io,
	off_t		offset,
	xfs_fsize_t	isize,
	cred_t		*credp,
	struct pm	*pmp);

STATIC void
xfs_zero_bp(
	xfs_buf_t	*bp,
	int	data_offset,
	int	data_len);

STATIC int
xfs_retrieved(
	uint		available,
	off_t		offset,
	size_t		count,
	uint		*total_retrieved,
	xfs_fsize_t	isize);

#ifndef DEBUG

#define	xfs_strat_write_check(io,off,count,imap,nimap)
#define	xfs_check_rbp(io,bp,rbp,locked)
#define	xfs_check_bp(io,bp)
#define	xfs_check_gap_list(ip)

#else /* DEBUG */

STATIC void
xfs_strat_write_check(
	xfs_iocore_t	*io,
	xfs_fileoff_t	offset_fsb,
	xfs_filblks_t	buf_fsb,
	xfs_bmbt_irec_t	*imap,
	int		imap_count);

STATIC void
xfs_check_rbp(
	xfs_iocore_t	*io,
	xfs_buf_t		*bp,
	xfs_buf_t		*rbp,
	int		locked);

STATIC void
xfs_check_bp(
	xfs_iocore_t	*io,
	xfs_buf_t		*bp);

STATIC void
xfs_check_gap_list(
	xfs_iocore_t	*ip);

#endif /* DEBUG */		      

STATIC int
xfs_build_gap_list(
	xfs_iocore_t	*ip,
	off_t		offset,
	size_t		count);

STATIC void
xfs_free_gap_list(
	xfs_iocore_t	*ip);

STATIC void
xfs_cmp_gap_list_and_zero(
	xfs_iocore_t	*ip,
	xfs_buf_t		*bp);

STATIC void
xfs_delete_gap_list(
	xfs_iocore_t	*ip,
	xfs_fileoff_t	offset_fsb,
	xfs_extlen_t	count_fsb);

STATIC void
xfs_strat_comp(void);

STATIC int
xfsd(void);

void
xfs_strat_write_iodone(
	xfs_buf_t		*bp);

STATIC int
xfs_dio_write_zero_rtarea(
	xfs_inode_t	*ip,
	struct xfs_buf	*bp,
	xfs_fileoff_t	offset_fsb,
	xfs_filblks_t	count_fsb);
#if defined(__sgi__)
extern int
grio_io_is_guaranteed(
	vfile_t *,
	stream_id_t	*);

extern void
griostrategy(
	xfs_buf_t	*);

extern int
grio_monitor_io_start( 
	stream_id_t *, 
	__int64_t);

extern int
grio_monitor_io_end(
	stream_id_t *,
	int);
#endif	

extern void xfs_error(
	xfs_mount_t *,
	int);

STATIC void
xfs_delalloc_cleanup(
	xfs_inode_t	*ip,
	xfs_fileoff_t	start_fsb,
	xfs_filblks_t	count_fsb);

extern void xfs_buf_iodone_callbacks(struct xfs_buf *);
extern void xlog_iodone(struct xfs_buf *);

/*
 * Round the given file offset down to the nearest read/write
 * size boundary.
 */
#define	XFS_READIO_ALIGN(io,off)	(((off) >> io->io_readio_log) \
					        << io->io_readio_log)
#define	XFS_WRITEIO_ALIGN(io,off)	(((off) >> io->io_writeio_log) \
					        << io->io_writeio_log)
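
/*
 * Illustrative example (values assumed, not taken from a real inode):
 * with io_readio_log == 16, i.e. a 64k read I/O size,
 * XFS_READIO_ALIGN(io, 0x12345) evaluates to (0x12345 >> 16) << 16 ==
 * 0x10000, the offset rounded down to the previous 64k boundary.
 */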

#if !defined(XFS_RW_TRACE)
#define	xfs_rw_enter_trace(tag, ip, uiop, ioflags)
#define	xfs_iomap_enter_trace(tag, ip, offset, count)
#define	xfs_iomap_map_trace(tag, ip, offset, count, bmapp, imapp)
#define xfs_inval_cached_trace(ip, offset, len, first, last)
#else
/*
 * Trace routine for the read/write path.  This is the routine entry trace.
 */
static void
xfs_rw_enter_trace(
	int		tag,	     
	xfs_iocore_t	*io,
	uio_t		*uiop,
	int		ioflags)
{
	xfs_inode_t	*ip = XFS_IO_INODE(io);

	if (!IO_IS_XFS(io) || (ip->i_rwtrace == NULL)) {
		return;
	}

	ktrace_enter(ip->i_rwtrace,
		     (void*)((unsigned long)tag),
		     (void*)ip,
		     (void*)((ip->i_d.di_size >> 32) & 0xffffffff),
		     (void*)(ip->i_d.di_size & 0xffffffff),
		     (void*)(((__uint64_t)uiop->uio_offset >> 32) &
			     0xffffffff),
		     (void*)(uiop->uio_offset & 0xffffffff),
		     (void*)uiop->uio_resid,
		     (void*)((unsigned long)ioflags),
		     (void*)((io->io_next_offset >> 32) & 0xffffffff),
		     (void*)(io->io_next_offset & 0xffffffff),
		     (void*)((unsigned long)((io->io_offset >> 32) &
					     0xffffffff)),
		     (void*)(io->io_offset & 0xffffffff),
		     (void*)((unsigned long)(io->io_size)),
		     (void*)((unsigned long)(io->io_last_req_sz)),
		     (void*)((unsigned long)((io->io_new_size >> 32) &
					     0xffffffff)),
		     (void*)(io->io_new_size & 0xffffffff));
}

static void
xfs_iomap_enter_trace(
	int		tag,
	xfs_iocore_t	*io,
	off_t		offset,
	size_t		count)
{
	xfs_inode_t	*ip = XFS_IO_INODE(io);

	if (!IO_IS_XFS(io) || (ip->i_rwtrace == NULL)) {
		return;
	}

	ktrace_enter(ip->i_rwtrace,
		     (void*)((unsigned long)tag),
		     (void*)ip,
		     (void*)((ip->i_d.di_size >> 32) & 0xffffffff),
		     (void*)(ip->i_d.di_size & 0xffffffff),
		     (void*)(((__uint64_t)offset >> 32) & 0xffffffff),
		     (void*)(offset & 0xffffffff),
		     (void*)((unsigned long)count),
		     (void*)((io->io_next_offset >> 32) & 0xffffffff),
		     (void*)(io->io_next_offset & 0xffffffff),
		     (void*)((io->io_offset >> 32) & 0xffffffff),
		     (void*)(io->io_offset & 0xffffffff),
		     (void*)((unsigned long)(io->io_size)),
		     (void*)((unsigned long)(io->io_last_req_sz)),
		     (void*)((io->io_new_size >> 32) & 0xffffffff),
		     (void*)(io->io_new_size & 0xffffffff),
		     (void*)0);
}

void
xfs_iomap_map_trace(
	int		tag,	     
	xfs_iocore_t	*io,
	off_t		offset,
	size_t		count,
	struct bmapval	*bmapp,
	xfs_bmbt_irec_t	*imapp)    
{
	xfs_inode_t	*ip = XFS_IO_INODE(io);

	if (!IO_IS_XFS(io) || (ip->i_rwtrace == NULL)) {
		return;
	}

	ktrace_enter(ip->i_rwtrace,
		     (void*)((unsigned long)tag),
		     (void*)ip,
		     (void*)((ip->i_d.di_size >> 32) & 0xffffffff),
		     (void*)(ip->i_d.di_size & 0xffffffff),
		     (void*)(((__uint64_t)offset >> 32) & 0xffffffff),
		     (void*)(offset & 0xffffffff),
		     (void*)((unsigned long)count),
		     (void*)((bmapp->offset >> 32) & 0xffffffff),
		     (void*)(bmapp->offset & 0xffffffff),
		     (void*)((unsigned long)(bmapp->length)),
		     (void*)((unsigned long)(bmapp->pboff)),
		     (void*)((unsigned long)(bmapp->pbsize)),
		     (void*)(bmapp->bn),
		     (void*)(__psint_t)(imapp->br_startoff),
		     (void*)((unsigned long)(imapp->br_blockcount)),
		     (void*)(__psint_t)(imapp->br_startblock));
}

static void
xfs_inval_cached_trace(
	xfs_iocore_t	*io,
	off_t		offset,
	off_t		len,
	off_t		first,
	off_t		last)
{
	xfs_inode_t	*ip = XFS_IO_INODE(io);

	if (!IO_IS_XFS(io) || (ip->i_rwtrace == NULL)) 
		return;
	ktrace_enter(ip->i_rwtrace,
		(void *)(__psint_t)XFS_INVAL_CACHED,
		(void *)ip,
		(void *)(((__uint64_t)offset >> 32) & 0xffffffff),
		(void *)(offset & 0xffffffff),
		(void *)(((__uint64_t)len >> 32) & 0xffffffff),
		(void *)(len & 0xffffffff),
		(void *)(((__uint64_t)first >> 32) & 0xffffffff),
		(void *)(first & 0xffffffff),
		(void *)(((__uint64_t)last >> 32) & 0xffffffff),
		(void *)(last & 0xffffffff),
		(void *)0,
		(void *)0,
		(void *)0,
		(void *)0,
		(void *)0,
		(void *)0);
}
#endif	/* XFS_RW_TRACE */

#ifdef DEBUG
/* ARGSUSED */
void
debug_print_vnmaps(vnmap_t *vnmap, int numvnmaps, int vnmap_flags)
{
	int i;

	for (i = 0; i < numvnmaps; i++, vnmap++) {
		cmn_err(CE_DEBUG,
"vaddr = 0x%llx, len = 0x%llx, flags = 0x%x\n  ovvaddr = 0x%llx len = 0x%llx offset = %lld\n",
			(uint64_t)vnmap->vnmap_vaddr,
			(uint64_t)vnmap->vnmap_len,
			vnmap->vnmap_flags,
			(uint64_t)vnmap->vnmap_ovvaddr,
			(uint64_t)vnmap->vnmap_ovlen,
			vnmap->vnmap_ovoffset);
	}
}
#endif

/*
 * Fill in the bmap structure to indicate how the next bp
 * should fit over the given extent.
 *
 * Everything here is in terms of file system blocks, not BBs.
 */
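/*
 * Illustrative example (numbers assumed): with iosize = 16 fsbs,
 * ioalign = 96 and an extent starting at fsb 100, the guessed alignment
 * falls before the extent, so the mapping is moved up to offset 100 and
 * iosize is trimmed to 12, keeping subsequent buffers on the original
 * iosize boundaries.
 */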
#if 1
void
xfs_next_bmap(
	xfs_mount_t	*mp,
	xfs_bmbt_irec_t	*imapp,
	struct bmapval	*bmapp,
	int		iosize,
	int		last_iosize,
	xfs_fileoff_t	ioalign,
	xfs_fileoff_t	last_offset,
	xfs_fileoff_t	req_offset,
	xfs_fsize_t	isize)
{
	__int64_t	extra_blocks;
	xfs_fileoff_t	size_diff;
	xfs_fileoff_t	ext_offset;
	xfs_fileoff_t	last_file_fsb;
	xfs_fsblock_t	start_block;

	/*
	 * Make sure that the request offset lies in the extent given.
	 */
	ASSERT(req_offset >= imapp->br_startoff);
	ASSERT(req_offset < (imapp->br_startoff + imapp->br_blockcount));

	if (last_offset == -1) {
		ASSERT(ioalign != -1);
		if (ioalign < imapp->br_startoff) {
			/*
			 * The alignment we guessed at can't
			 * happen on this extent, so align
			 * to the beginning of this extent.
			 * Subtract whatever we drop from the
			 * iosize so that we stay aligned on
			 * our iosize boundaries.
			 */
			size_diff = imapp->br_startoff - ioalign;
			iosize -= size_diff;
			ASSERT(iosize > 0);
			ext_offset = 0;
			bmapp->offset = imapp->br_startoff;
			ASSERT(bmapp->offset <= req_offset);
		} else {
			/*
			 * The alignment requested fits on this
			 * extent, so use it.
			 */
			ext_offset = ioalign - imapp->br_startoff;
			bmapp->offset = ioalign;
			ASSERT(bmapp->offset <= req_offset);
		}
	} else {
		/*
		 * This is one of a series of sequential accesses to the
		 * file.  Make sure to line up the buffer we specify
		 * so that it doesn't overlap the last one.  It should
		 * either be the same as the last one (if we need data
		 * from it) or follow immediately after the last one.
		 */
		ASSERT(ioalign == -1);
		if (last_offset >= imapp->br_startoff) {
			/*
			 * The last I/O was from the same extent
			 * that this one will at least start on.
			 * This assumes that we're going sequentially.
			 */
			if (req_offset < (last_offset + last_iosize)) {
				/*
				 * This request overlaps the buffer
				 * we used for the last request.  Just
				 * get that buffer again.
				 */
				ext_offset = last_offset -
					     imapp->br_startoff;
				bmapp->offset = last_offset;
				iosize = last_iosize;
			} else {
				/*
				 * This request does not overlap the buffer
				 * used for the last one.  Give it its own buffer.
				 */
				ext_offset = req_offset - imapp->br_startoff;
				bmapp->offset = req_offset;
			}
		} else {
			/*
			 * The last I/O was on a different extent than
			 * this one.  We start at the beginning of this one.
			 * This assumes that we're going sequentially.
			 */
			ext_offset = req_offset - imapp->br_startoff;
			bmapp->offset = req_offset;
		}

	}
	start_block = imapp->br_startblock;
	if (start_block == HOLESTARTBLOCK) {
		bmapp->bn = -1;
		bmapp->eof = BMAP_HOLE;
	} else if (start_block == DELAYSTARTBLOCK) {
		bmapp->bn = -1;
		bmapp->eof = BMAP_DELAY;
	} else {
		bmapp->bn = start_block + ext_offset;
		bmapp->eof = 0;
		if (imapp->br_state == XFS_EXT_UNWRITTEN)
			bmapp->eof |= BMAP_UNWRITTEN;
	}
	bmapp->length = iosize;
	
	/*
	 * If the iosize from our offset extends beyond the
	 * end of the extent, then trim down the length
	 * to match that of the extent.
	 */
	 extra_blocks = (off_t)(bmapp->offset + bmapp->length) -
	                (__uint64_t)(imapp->br_startoff +
				     imapp->br_blockcount);   
	 if (extra_blocks > 0) {
	    	bmapp->length -= extra_blocks;
		ASSERT(bmapp->length > 0);
	}

	/*
	 * If the iosize from our offset extends beyond the end
	 * of the file and the current extent is simply a hole,
	 * then trim down the length to match the
	 * size of the file.  This keeps us from going out too
	 * far into the hole at the EOF that extends to infinity.
	 */
	if (start_block == HOLESTARTBLOCK) {
		last_file_fsb = XFS_B_TO_FSB(mp, isize);
		extra_blocks = (off_t)(bmapp->offset + bmapp->length) -
			(__uint64_t)last_file_fsb;
		if (extra_blocks > 0) {
			bmapp->length -= extra_blocks;
			ASSERT(bmapp->length > 0);
		}
		ASSERT((bmapp->offset + bmapp->length) <= last_file_fsb);
	}

	bmapp->bsize = XFS_FSB_TO_B(mp, bmapp->length);
}
#endif /* !defined(__linux__) */

/*
 * xfs_retrieved() is a utility function used to calculate the
 * value of bmap.pbsize.
 *
 * Available is the number of bytes mapped by the current bmap.
 * Offset is the file offset of the current request by the user.
 * Count is the size of the current request by the user.
 * Total_retrieved is a running total of the number of bytes
 *  which have been setup for the user in this call so far.
 * Isize is the current size of the file being read.
 */
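/*
 * Worked example (illustrative numbers): with available = 65536,
 * offset = 0, count = 40000, *total_retrieved = 0 and isize = 100000,
 * the buffer maps more than was asked for, so retrieved is trimmed to
 * count - *total_retrieved = 40000; the file still has 100000 bytes
 * past the offset, so no further trimming happens, *total_retrieved
 * becomes 40000 and 40000 is returned for bmapp->pbsize.
 */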
#if 1 
STATIC int
xfs_retrieved(
	uint		available,
	off_t		offset,
	size_t		count,
	uint		*total_retrieved,
	xfs_fsize_t	isize)
{
	uint		retrieved;
	xfs_fsize_t	file_bytes_left;
	

	if ((available + *total_retrieved) > count) {
		/*
		 * This buffer will return more bytes
		 * than we asked for.  Trim retrieved
		 * so we can set bmapp->pbsize correctly.
		 */
		retrieved = count - *total_retrieved;
	} else {
		retrieved = available;
	}

	file_bytes_left = isize - (offset + *total_retrieved);
	if (file_bytes_left < retrieved) {
		/*
		 * The user has requested more bytes
		 * than there are in the file.  Trim
		 * down the number to those left in
		 * the file.
		 */
		retrieved = file_bytes_left;
	}

	*total_retrieved += retrieved;
	return retrieved;
}
#endif /* !defined(__linux__) */
/*
 * xfs_iomap_extra()
 *
 * This is called when the VM/chunk cache is trying to create a buffer
 * for a page which is beyond the end of the file.  If we're at the
 * start of the page we give it as much of a mapping as we can, but
 * if it comes back for the rest of the page we say there is nothing there.
 * This behavior is tied to the code in the VM/chunk cache (do_pdflush())
 * that will call here.
 */
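/*
 * For illustration: if the offset is not page aligned, it is the
 * "remainder" of a page already beyond EOF and the first branch below
 * hands back a zero-length BMAP_EOF mapping; if the offset is page
 * aligned, the second branch maps the range with XFS_BMAPI and returns
 * a real hole/delayed/allocated mapping, still marked BMAP_EOF.
 */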
#if 1 /* !defined(__linux__) */
STATIC int					/* error */
xfs_iomap_extra(
	xfs_iocore_t	*io,
	off_t		offset,
	size_t		count,
	struct bmapval	*bmapp,
	int		*nbmaps,
	struct pm	*pmp)
{
	xfs_fileoff_t	offset_fsb;
	xfs_fileoff_t	end_fsb;
	xfs_fsize_t	nisize;
	xfs_mount_t	*mp;
	int		nimaps;
	xfs_fsblock_t	firstblock;
	int		error;
	xfs_bmbt_irec_t	imap;

	mp = io->io_mount;
	nisize = io->io_new_size;
	if (nisize < XFS_SIZE(mp, io)) {
		nisize = XFS_SIZE(mp, io);
	}
	offset_fsb = XFS_B_TO_FSBT(mp, offset);

	if (poff(offset) != 0) {
		/*
		 * This is the 'remainder' of a page being mapped out.
		 * Since it is already beyond the EOF, there is no reason
		 * to bother.
		 */
		ASSERT(count < NBPP);
		*nbmaps = 1;
		bmapp->eof = BMAP_EOF;
		bmapp->bn = -1;
		bmapp->offset = XFS_FSB_TO_BB(mp, offset_fsb);
		bmapp->length = 0;
		bmapp->bsize = 0;
		bmapp->pboff = 0;
		bmapp->pbsize = 0;
		bmapp->pmp = pmp;
		if (io->io_flags & XFS_IOCORE_RT) {
			bmapp->pbdev = mp->m_rtdev;
		} else {
			bmapp->pbdev = mp->m_dev;
		}
	} else {
		/*
		 * A page is being mapped out so that it can be flushed.
		 * The page is beyond the EOF, but we need to return
		 * something to keep the chunk cache happy.
		 */
		ASSERT(count <= NBPP);
		end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
		nimaps = 1;
		firstblock = NULLFSBLOCK;
		error = XFS_BMAPI(mp, NULL, io, offset_fsb,
				  (xfs_filblks_t)(end_fsb - offset_fsb),
				  0, &firstblock, 0, &imap,
				  &nimaps, NULL);
		if (error) {
			return error;
		}
		ASSERT(nimaps == 1);
		*nbmaps = 1;
		bmapp->eof = BMAP_EOF;
		if (imap.br_startblock == HOLESTARTBLOCK) {
			bmapp->eof |= BMAP_HOLE;
			bmapp->bn = -1;
		} else if (imap.br_startblock == DELAYSTARTBLOCK) {
			bmapp->eof |= BMAP_DELAY;
			bmapp->bn = -1;
		} else {
			bmapp->bn = XFS_FSB_TO_DB_IO(io, imap.br_startblock);
			if (imap.br_state == XFS_EXT_UNWRITTEN)
				bmapp->eof |= BMAP_UNWRITTEN;
		}
		bmapp->offset = XFS_FSB_TO_BB(mp, offset_fsb);
		bmapp->length =	XFS_FSB_TO_BB(mp, imap.br_blockcount);
		ASSERT(bmapp->length > 0);
		bmapp->bsize = BBTOB(bmapp->length);
		bmapp->pboff = offset - BBTOOFF(bmapp->offset);
		ASSERT(bmapp->pboff >= 0);
		bmapp->pbsize = bmapp->bsize - bmapp->pboff;
		ASSERT(bmapp->pbsize > 0);
		bmapp->pmp = pmp;
		if (bmapp->pbsize > count) {
			bmapp->pbsize = count;
		}
		if (io->io_flags & XFS_IOCORE_RT) {
			bmapp->pbdev = mp->m_rtdev;
		} else {
			bmapp->pbdev = mp->m_dev;
		}
	}
	return 0;
}
#endif /* !defined(__linux__) */
/*
 * xfs_iomap_read()
 *
 * This is the main I/O policy routine for reads.  It fills in
 * the given bmapval structures to indicate what I/O requests
 * should be used to read in the portion of the file for the given
 * offset and count.
 *
 * The inode's I/O lock may be held SHARED here, but the inode lock
 * must be held EXCL because it protects the read ahead state variables
 * in the inode.
 * Bug 516806: The readahead state is now maintained by i_rlock; therefore,
 * the inode lock can be held in SHARED mode.  The only time we need it
 * in EXCL mode is when the inode is being read in for the first time.
 */
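/*
 * Illustrative behavior: a read that starts exactly at io_next_offset
 * and is no larger than io_last_req_sz is treated as sequential, so it
 * reuses io_size and lines its first buffer up against io_offset; a
 * first-time read smaller than io_readio_blocks falls back to that
 * minimum I/O size and is aligned with XFS_READIO_ALIGN.
 */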
#if  !defined(__linux__)
int					/* error */
xfs_iomap_read(
	xfs_iocore_t	*io,
	off_t		offset,
	size_t		count,
	struct bmapval	*bmapp,
	int		*nbmaps,
	struct pm	*pmp,
	int		*unlocked,
	unsigned int	lockmode)
{
	xfs_fileoff_t	offset_fsb;
	xfs_fileoff_t	ioalign;
	xfs_fileoff_t	last_offset;
	xfs_fileoff_t	last_required_offset;
	xfs_fileoff_t	next_offset;
	xfs_fileoff_t	last_fsb;
	xfs_fileoff_t	max_fsb;
	xfs_fsize_t	nisize;
	off_t		offset_page;
	off_t		aligned_offset;
	xfs_fsblock_t	firstblock;
	int		nimaps;
	int		error;
	unsigned int	iosize;
	unsigned int	last_iosize;
	unsigned int	retrieved_bytes;
	unsigned int	total_retrieved_bytes;
	short		filled_bmaps;
	short		read_aheads;
	int		x;
	xfs_mount_t	*mp;
	struct bmapval	*curr_bmapp;
	struct bmapval	*next_bmapp;
	struct bmapval	*last_bmapp;
	struct bmapval	*first_read_ahead_bmapp;
	struct bmapval	*next_read_ahead_bmapp;
	xfs_bmbt_irec_t	*curr_imapp;
	xfs_bmbt_irec_t	*last_imapp;
	xfs_bmbt_irec_t	imap[XFS_MAX_RW_NBMAPS];

	ASSERT(ismrlocked(io->io_lock, MR_UPDATE | MR_ACCESS) != 0);
	ASSERT(ismrlocked(io->io_iolock, MR_UPDATE | MR_ACCESS) != 0);
	xfs_iomap_enter_trace(XFS_IOMAP_READ_ENTER, io, offset, count);

	mp = io->io_mount;
	nisize = io->io_new_size;
	if (nisize < XFS_SIZE(mp, io)) {
		nisize = XFS_SIZE(mp, io);
	}
	offset_fsb = XFS_B_TO_FSBT(mp, offset);
	nimaps = sizeof(imap) / sizeof(imap[0]);
	last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)nisize));
	if (offset >= nisize) {
		/*
		 * The VM/chunk code is trying to map a page or part
		 * of a page to be pushed out that is beyond the end
		 * of the file.  We handle these cases separately so
		 * that they do not interfere with the normal path
		 * code.
		 */
		error = xfs_iomap_extra(io, offset, count, bmapp, nbmaps, pmp);
		return error;
	}
	/*
	 * Map out to the maximum possible file size.  This will return
	 * an extra hole we don't really care about at the end, but we
	 * won't do any read-ahead beyond the EOF anyway.  We do this
	 * so that the buffers we create here line up well with those
	 * created in xfs_iomap_write() which extend beyond the end of
	 * the file.
	 */
	max_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAX_FILE_OFFSET);
	firstblock = NULLFSBLOCK;
	error = XFS_BMAPI(mp, NULL, io, offset_fsb,
			  (xfs_filblks_t)(max_fsb - offset_fsb),
			  XFS_BMAPI_ENTIRE, &firstblock, 0, imap,
			  &nimaps, NULL);
	if (error) {
		return error;
	}

	XFS_UNLK_MAP_SHARED(mp, io, lockmode);
	*unlocked = 1;
	mutex_lock(&io->io_rlock, PINOD);
	if ((offset == io->io_next_offset) &&
	    (count <= io->io_last_req_sz)) {
		/*
		 * Sequential I/O of same size as last time.
	 	 */
		ASSERT(io->io_size > 0);
		iosize = io->io_size;
		ASSERT(iosize <= XFS_BB_TO_FSBT(mp, XFS_MAX_BMAP_LEN_BB));
		last_offset = io->io_offset;
		ioalign = -1;
	} else {
		/*
		 * The I/O size for the file has not yet been
		 * determined, so figure it out.
		 */
		if (XFS_B_TO_FSB(mp, count) <= io->io_readio_blocks) {
			/*
			 * The request is smaller than our
			 * minimum I/O size, so default to
			 * the minimum.  For these size requests
			 * we always want to align the requests
			 * to XFS_READ_SIZE boundaries as well.
			 */
			iosize = io->io_readio_blocks;
			ASSERT(iosize <=
			       XFS_BB_TO_FSBT(mp, XFS_MAX_BMAP_LEN_BB));
			aligned_offset = XFS_READIO_ALIGN(io, offset);
			ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
		} else {
			/*
			 * The request is bigger than our
			 * minimum I/O size and it's the
			 * first one in this sequence, so
			 * set the I/O size for the file
			 * now.
			 *
			 * In calculating the offset rounded down
			 * to a page, make sure to round down the
			 * fs block offset rather than the byte
			 * offset for the case where our block size
			 * is greater than the page size.  This way
			 * offset_page will always align to a fs block
			 * as well as a page.
			 *
			 * For the end of the I/O we need to round
			 * offset + count up to a page boundary and
			 * then round that up to a file system block
			 * boundary.
			 */
			offset_page = ctooff(offtoct(XFS_FSB_TO_B(mp,
							 offset_fsb)));
			last_fsb = XFS_B_TO_FSB(mp,
					ctooff(offtoc(offset + count)));
			iosize = last_fsb - XFS_B_TO_FSBT(mp, offset_page);
			if (iosize >
			    XFS_BB_TO_FSBT(mp, XFS_MAX_BMAP_LEN_BB)) {
				iosize = XFS_BB_TO_FSBT(mp,
							XFS_MAX_BMAP_LEN_BB);
			}
			ioalign = XFS_B_TO_FSB(mp, offset_page);
		}
		last_offset = -1;
	}

	/*
	 * Now we've got the I/O size and the last offset,
	 * so start figuring out how to align our
	 * buffers.
	 */
	xfs_next_bmap(mp, imap, bmapp, iosize, iosize, ioalign,
		      last_offset, offset_fsb, nisize);
	ASSERT((bmapp->length > 0) &&
	       (offset >= XFS_FSB_TO_B(mp, bmapp->offset)));
	
	if (XFS_FSB_TO_B(mp, bmapp->offset + bmapp->length) >= nisize) {
		bmapp->eof |= BMAP_EOF;
	}

	bmapp->pboff = offset - XFS_FSB_TO_B(mp, bmapp->offset);
	retrieved_bytes = bmapp->bsize - bmapp->pboff;
	total_retrieved_bytes = 0;
	bmapp->pbsize = xfs_retrieved(retrieved_bytes, offset, count,
				      &total_retrieved_bytes, nisize);
	xfs_iomap_map_trace(XFS_IOMAP_READ_MAP,
			    io, offset, count, bmapp, imap);

	/*
	 * Only map additional buffers if they've been asked for
	 * and the I/O being done is sequential and has reached the
	 * point where we need to initiate more read ahead or we didn't get
	 * the whole request in the first bmap.
	 */
	last_fsb = XFS_B_TO_FSB(mp, nisize);
	filled_bmaps = 1;
	last_required_offset = bmapp[0].offset;
	first_read_ahead_bmapp = NULL;
	if ((*nbmaps > 1) &&
	    (((offset == io->io_next_offset) &&
	      (offset != 0) &&
	      (offset_fsb >= io->io_reada_blkno)) ||
	     retrieved_bytes < count)) {
		curr_bmapp = &bmapp[0];
		next_bmapp = &bmapp[1];
		last_bmapp = &bmapp[*nbmaps - 1];
		curr_imapp = &imap[0];
		last_imapp = &imap[nimaps - 1];

		/*
		 * curr_bmapp is always the last one we filled
		 * in, and next_bmapp is always the next one
		 * to be filled in.
		 */
		while (next_bmapp <= last_bmapp) {
			next_offset = curr_bmapp->offset +
				      curr_bmapp->length;
			if (next_offset >= last_fsb) {
				/*
				 * We've mapped all the way to the EOF.
				 * Everything beyond there is inaccessible,
				 * so get out now.
				 */
				break;
			}

			last_iosize = curr_bmapp->length;
			if (next_offset <
			    (curr_imapp->br_startoff +
			     curr_imapp->br_blockcount)) {
				xfs_next_bmap(mp, curr_imapp,
					 next_bmapp, iosize, last_iosize, -1,
					 curr_bmapp->offset, next_offset,
					 nisize);
			} else {
				curr_imapp++;
				if (curr_imapp <= last_imapp) {
					xfs_next_bmap(mp,
					    curr_imapp, next_bmapp,
					    iosize, last_iosize, -1,
					    curr_bmapp->offset, next_offset,
					    nisize);	      
				} else {
					/*
					 * We're out of imaps.  We
					 * either hit the end of
					 * file or just didn't give
					 * enough of them to bmapi.
					 * The caller will just come
					 * back if we haven't done
					 * enough yet.
					 */
					break;
				}
			}
			
			filled_bmaps++;
			curr_bmapp = next_bmapp;
			next_bmapp++;
			ASSERT(curr_bmapp->length > 0);
		       
			/*
			 * Make sure to fill in the pboff and pbsize
			 * fields as long as the bmaps are required for
			 * the request (as opposed to strictly read-ahead).
			 */
			if (total_retrieved_bytes < count) {
				curr_bmapp->pboff = 0;
				curr_bmapp->pbsize =
					xfs_retrieved(curr_bmapp->bsize,
						      offset, count,
						      &total_retrieved_bytes,
						      nisize);
			}
			
			if (XFS_FSB_TO_B(mp, curr_bmapp->offset +
					 curr_bmapp->length) >= nisize) {
				curr_bmapp->eof |= BMAP_EOF;
			}
			xfs_iomap_map_trace(XFS_IOMAP_READ_MAP, io, offset,
					    count, curr_bmapp, curr_imapp);

 			/*
			 * Keep track of the offset of the last buffer
			 * needed to satisfy the I/O request.  This will
			 * be used for i_io_offset later.  Also record
			 * the first bmapp used to track a read ahead.
			 */
			if (XFS_FSB_TO_B(mp, curr_bmapp->offset) <
			    (offset + count)) {
				last_required_offset = curr_bmapp->offset;
			} else if (first_read_ahead_bmapp == NULL) {
				first_read_ahead_bmapp = curr_bmapp;
			}

		}

		/*
		 * If we're describing any read-ahead here, then move
		 * the read-ahead blkno up to the point where we've
		 * gone through half the read-ahead described here.
		 * This way we don't issue more read-ahead until we
		 * are half-way through the last read-ahead.
		 * 
		 * If we're not describing any read-ahead because the
		 * request is just fragmented, then move up the
		 * read-ahead blkno to just past what we're returning
		 * so that we can maybe start it on the next request.
		 */
		if (first_read_ahead_bmapp != NULL) {
			read_aheads = curr_bmapp - first_read_ahead_bmapp +1;
			next_read_ahead_bmapp = first_read_ahead_bmapp +
						(read_aheads / 2);
			io->io_reada_blkno = next_read_ahead_bmapp->offset;
		} else {
			io->io_reada_blkno = curr_bmapp->offset +
					    curr_bmapp->length;
		}
	} else if ((*nbmaps > 1) && (offset != io->io_offset)) {
		/*
		 * In this case the caller is not yet accessing the
		 * file sequentially, but set the read-ahead blkno
		 * so that we can tell if they start doing so.
		 */
		io->io_reada_blkno = bmapp[0].offset + bmapp[0].length;
	}

	ASSERT(iosize <= XFS_BB_TO_FSBT(mp, XFS_MAX_BMAP_LEN_BB));
	io->io_size = iosize;
	io->io_offset = last_required_offset;
	if (count > io->io_last_req_sz) {
		/*
		 * Record the "last request size" for the file.
		 * We don't let it shrink so that big requests
		 * that are not satisfied in one call here still
		 * record the full request size (not the smaller
		 * one that comes in to finish mapping the request).
		 */
		io->io_last_req_sz = count;
	}
	if (total_retrieved_bytes >= count) {
		/*
		 * We've mapped all of the caller's request, so
		 * the next request in a sequential read will
		 * come in the block this one ended on or the
		 * next if we consumed up to the very end of
		 * a block.
		 */
		io->io_next_offset = offset + count;
	} else {
		/*
		 * We didn't satisfy the entire request, so we
		 * can expect xfs_read_file() to come back with
		 * what is left of the request.
		 */
		io->io_next_offset = offset + total_retrieved_bytes;
	}
	mutex_unlock(&io->io_rlock);

	*nbmaps = filled_bmaps;
	for (x = 0; x < filled_bmaps; x++) {
		curr_bmapp = &bmapp[x];
		if (io->io_flags & XFS_IOCORE_RT) {
			curr_bmapp->pbdev = mp->m_rtdev;
		} else {
			curr_bmapp->pbdev = mp->m_dev;
		}
		ASSERT(curr_bmapp->offset <= XFS_B_TO_FSB(mp, nisize));
		curr_bmapp->offset = XFS_FSB_TO_BB(mp, curr_bmapp->offset);
		curr_bmapp->length = XFS_FSB_TO_BB(mp, curr_bmapp->length);
		ASSERT(curr_bmapp->length > 0);
		ASSERT((x == 0) ||
		       ((bmapp[x - 1].offset + bmapp[x - 1].length) ==
			curr_bmapp->offset));
		if (curr_bmapp->bn != -1) {
			curr_bmapp->bn = XFS_FSB_TO_DB_IO(io, curr_bmapp->bn);
		}
		curr_bmapp->pmp = pmp;
	}
	return 0;
}				
#endif /* !defined(__linux__) */

/* ARGSUSED */
#if !defined(__linux__)
int
xfs_vop_readbuf(bhv_desc_t 	*bdp,
		off_t		offset,
		ssize_t		len,
		int		ioflags,
		struct cred	*creds,
		flid_t		*fl,
		xfs_buf_t		**rbuf,
		int		*pboff,
		int		*pbsize)
{
	vnode_t		*vp;
	xfs_inode_t	*ip;
	int		error;
	struct bmapval	bmaps[2];
	int		nmaps;
	xfs_buf_t		*bp;
	extern void	chunkrelse(xfs_buf_t *bp);
	int		unlocked;
	int		lockmode;

	vp = BHV_TO_VNODE(bdp);
	ip = XFS_BHVTOI(bdp);
	*rbuf = NULL;
	*pboff = *pbsize = -1;
	error = 0;

	if (!(ioflags & IO_ISLOCKED))
		xfs_rwlockf(bdp, VRWLOCK_READ, 0);

	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		error = XFS_ERROR(EIO);
		goto out;
	}

	/*
	 * blow this off if mandatory locking or DMI are involved
	 */
	if ((vp->v_flag & (VENF_LOCKING|VFRLOCKS)) == (VENF_LOCKING|VFRLOCKS))
		goto out;

	if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) && !(ioflags & IO_INVIS))
		goto out;

	unlocked = 0;
	lockmode = xfs_ilock_map_shared(ip);

	if (offset >= ip->i_d.di_size) {
		xfs_iunlock_map_shared(ip, lockmode);
		goto out;
	}

	/*
	 * prohibit iomap read from giving us back our data in
	 * two buffers but let it set up read-ahead.  Turn off
	 * read-ahead for NFSv2.  Its I/O sizes are too small
	 * to be of any real benefit (8K reads, 32K read buffers).
	 */
	nmaps = (ioflags & IO_NFS) ? 1 : 2;

	error = xfs_iomap_read(&ip->i_iocore, offset, len, bmaps, &nmaps,
				NULL, &unlocked, lockmode);
	if (!unlocked)
		xfs_iunlock_map_shared(ip, lockmode);

	/*
	 * if the first bmap doesn't match the I/O request, forget it.
	 * This means that we can't fit the request into one buffer.
	 */
	if (error ||
	    ((bmaps[0].pbsize != len) &&
	     (bmaps[0].eof & BMAP_EOF) == 0))
		goto out;

	/*
	 * if the caller has specified that the I/O fit into
	 * one page and it doesn't, forget it.  The caller won't
	 * be able to use it.
	 */
	if ((ioflags & IO_ONEPAGE)
	    && pnum(offset) != pnum(offset + bmaps[0].pbsize-1)) {
		goto out;
	}

	bp = chunkread(vp, bmaps, nmaps, creds);

	if (XFS_BUF_ISERROR(bp)) {
		error = XFS_BUF_GETERROR(bp);
		ASSERT(error != EINVAL);
		/*
		 * b_relse functions like chunkhold
		 * expect B_DONE to be there.
		 */
		XFS_BUF_DONE(bp);
		XFS_BUF_STALE(bp);
		xfs_buf_relse(bp);
		goto out;
	}

	if ((bmaps[0].pboff + bmaps[0].pbsize) == bmaps[0].bsize)
		bp->b_relse = chunkrelse;

	*rbuf = bp;
	*pboff = bmaps[0].pboff;
	*pbsize = bmaps[0].pbsize;

	xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
out:
	if (!(ioflags & IO_ISLOCKED))
		xfs_rwunlockf(bdp, VRWLOCK_READ, 0);
	return XFS_ERROR(error);
}
#endif /* !defined(__linux__) */
/*
 * set of routines (xfs_lockdown_iopages, xfs_unlock_iopages,
 * xfs_mapped_biomove) to deal with i/o to or from mmapped files where
 * some or all of the user buffer is mmapped to the file passed in
 * as the target of the read/write system call.
 * 
 * there are 6 sets of deadlocks and possible problems.
 *
 * 1)	the i/o is a read, the user buffer lies in a region
 *	that is mapped to the file and not paged in.  the fault
 *	code calls VOP_READ.  But we deadlock if another thread
 *	has tried to write the file in the meantime because he's
 *	now waiting on the lock and the i/o lock has to be a barrier
 *	lock to prevent writer starvation.  this is addressed by calling
 *	the new VASOP, verifyvnmap that returns a set of maps indicating
 *	which ranges of the biomove'd virtual addresses are mmapped to
 *	the file.  the i/o path then breaks up the biomove (using
 *	xfs_mapped_biomove) into pieces and enables nested locking
 *	on the i/o lock during the biomove calls that could result
 *	in page faults that need to read data from the file.
 *
 * 2)   like above only the i/o is a write.  the page fault deadlocks
 *	instantly since we already hold the i/o lock in write mode and
 *	the xfs_read called by the page fault needs it in read mode.
 *	this one is handled as above by enabling nested locking around
 *	the appropriate biomove calls.
 *
 * 3)	the i/o is a read, the user buffer lies in a region
 *	that is mapped autogrow,shared, and the biomove filling
 *	the user buffer causes the file to grow.  the page fault
 *	triggered by the biomove needs to run xfs_write() and take
 *	the i/o lock in write mode.  this is addressed by making the
 *	read path smart enough to detect this condition using the
 *	information returned by the verifyvnmap VASOP and take the
 *	i/o lock in update mode at the beginning.  we then enable
 *	recursive locking (see xfs_ilock) to allow the fault to obtain
 *	the i/o lock regardless of who's waiting on it.
 *
 * 4)	deadlock in pagewait().  if the biomove faults and needs a
 *	page that exists but that page was created by the chunkread
 *	issued by xfs_read/xfs_write, the page won't be marked P_DONE
 *	(and usable by the fault code) until the i/o finishes and releases
 *	the buffer.  so the fault code will find the page and wait forever
 *	waiting for it to be marked P_DONE.  this case is handled by
 *	useracc'ing the dangerous pages before the chunkread so that
 *	the pages exist prior to the chunkread.  this case only happens
 *	if the range of file offsets touched by the i/o overlap with
 *	the range of file offsets associated with the mmapped virtual
 *	addresses touched by the biomove.
 *
 * 5)	buffer deadlock.  in the autogrow case, it's possible that
 *	there can be no page overlap but that the file blocks
 *	that need to be initialized for the autogrow by xfs_zeroeof_blocks()
 *	and the file blocks in the i/o will both wind up living in the same
 *	buffer set up by the i/o path.  this will deadlock because the
 *	buffer will be held by the i/o path when the biomove faults,
 *	causing the autogrow process to deadlock on the buffer semaphore.
 *	this case is handled like case #2.  we create the pages and cause
 *	the autogrow to happen before the chunkread so that the fault/autogrow
 *	code and the io path code don't have to use the same buffer at the
 *	same time.
 *
 * 6)	a write i/o that passes in a data buffer that is mmapped beyond
 *	the current EOF.  Even if nested locking is enabled, the write path
 *	assumes that because buffered writes are single-threaded only one
 *	buffered writer can use the gap list and other inode fields at once.
 *	this is addressed by faulting in the user virtual address associated
 *	with the largest file offset mapped by the buffer.  the fault
 *	occurs after the i/o lock is taken in write mode but before any real
 *	work is done.  this allows the VM system to issue a VOP_WRITE to
 *	grow the file to the appropriate point.  then the real write call
 *	executes after the file has been grown.  the fault is issued with
 *	nested locking enabled so the write-path inode fields are used
 *	serially while still holding onto the i/o lock to prevent a race
 *	with truncate().
 *
 * Situations this code does NOT attempt to deal with:
 *
 *	- the above situations only we're doing direct I/O instead of
 *	  buffered I/O.
 *
 *	- situations arising from the mappings changing while our i/o
 *	  is in progress.  it's possible that someone could remap part
 *	  of the buffer to make it dangerous after we've called to
 *	  verifyvnmap but before we've done all our biomoves.  fixing
 *	  this would require serializing i/o's and mmaps/munmaps
 *	  and I'd rather not do that.
 */
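/*
 * In outline, the dangerous biomoves described above are bracketed like
 * this (see xfs_mapped_biomove below for the real sequence):
 *
 *	xfs_enable_nested_locking();
 *	error = biomove(bp, pboff, partial_io_len, rw, uiop);
 *	xfs_disable_nested_locking();
 *
 * so that a page fault taken inside the biomove can recursively acquire
 * this inode's i/o lock instead of deadlocking on it.
 */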

/*
 * routines to enable/disable/query nested iolock locking and isolate
 * the curuthread references to make the Cellular Irix merge easier
 */
#ifndef __linux__
void
xfs_enable_nested_locking(void)
{
	ASSERT_ALWAYS(curuthread->ut_vnlock == 0);
	curuthread->ut_vnlock = UT_FSNESTED;
}

void
xfs_disable_nested_locking(void)
{
	ASSERT_ALWAYS(curuthread->ut_vnlock == UT_FSNESTED);
	curuthread->ut_vnlock = 0;
}

int
xfs_is_nested_locking_enabled(void)
{
	return curuthread && curuthread->ut_vnlock & UT_FSNESTED;
}
#else
void
xfs_enable_nested_locking(void)
{
}

void
xfs_disable_nested_locking(void)
{
}

int
xfs_is_nested_locking_enabled(void)
{
  return 0; 
}
#endif
/*
 * xfs_lockdown_iopages() - lock down any mmapped user pages required for
 *	this i/o.  this is either
 *
 *	1) pages which will be referenced by the biomove and whose backing
 *		file blocks will be directly read/written by the i/o
 *	2) pages which will be referenced by the biomove, whose backing
 *		file blocks will reside in the same buffer used by the i/o,
 *		and whose file blocks are beyond the current EOF causing
 *		the file to be grown as part of the biomove
 *
 *	if any pages are locked down, *useracced is set to 1 and a set
 *	of xfs_uaccmap_t's are returned indicating the set of address ranges
 *	of xfs_uaccmap_t's is returned indicating which address ranges
 *	possible after they are biomoved.
 *
 * returns ENOMEM if the number of pages being locked down exceeds
 * maxdmasz.  the pages should be unlocked ASAP after the biomove
 * using xfs_unlock_iopages().
 */
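/*
 * For example (illustrative): if the pages the biomove will touch are
 * backed by the same file blocks this buffer covers, they are
 * useracc'ed here so that they already exist before chunkread runs and
 * never have to wait on this buffer's own I/O; if the total to be
 * pinned exceeds maxdmasz pages, ENOMEM is returned instead.
 */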
/* ARGSUSED */
#if  !defined(__linux__) 
int
xfs_lockdown_iopages(
	struct bmapval	*bmapp,
	xfs_fsize_t	isize,		/* in - current inode size/eof */
	int		vnmapflags,	/* in - map flags */
	vnmap_t		**vnmapp,	/* in/out - vmaps array */
	int		*nvnmapp,	/* in/out - number of valid maps left */
	xfs_uaccmap_t	*uaccmap,	/* in - caller supplied useracc maps */
	int		*nuaccmapp,	/* out - number of filled in uaccmaps */
	int		*useracced)	/* out - did we useracc anything */
{
	int		nuaccmaps;
	vnmap_t		*vnmap = *vnmapp;
	uvaddr_t	useracc_startvaddr;
	uvaddr_t	useracc_endvaddr;
	size_t		useracc_len;	
	__psunsigned_t	uacc_startshift;	
	__psunsigned_t	uacc_endshift;	
	uvaddr_t	agac_vaddr_start;
	uvaddr_t	agac_vaddr_end;
	__psunsigned_t	start_trim;	
	off_t		bmap_end;
	off_t		bmap_start;
	off_t		vnmap_end;
	off_t		overlap_off_start;
	off_t		overlap_off_end;
	off_t		iomove_off_start;
	off_t		iomove_off_end;
	off_t		curr_offset;
	int		error;
	int		numpages;

	/*
	 * do we have overlapping pages we need to
	 * useracc?  don't have to worry about readahead
	 * buffers since those pages will be marked
	 * P_DONE by the buffer release function out of
	 * biodone when the i/o to the buffer finishes.
	 */
	ASSERT(*nuaccmapp >= *nvnmapp);
	ASSERT((vnmapflags & (AS_VNMAP_OVERLAP|AS_VNMAP_AUTOGROW)));

	error = numpages = *useracced = *nuaccmapp = nuaccmaps = 0;
	curr_offset = 0;
	bmap_start = BBTOOFF(bmapp[0].offset);
	bmap_end = bmap_start + BBTOOFF(bmapp[0].length);

	/*
	 * process all vnmaps up through the end of the current
	 * buffer
	 */
	while (vnmap && curr_offset < bmap_end &&
	       vnmap->vnmap_ovoffset < bmap_end) {
		/*
		 * skip over maps that aren't marked as overlap or
		 * autogrow
		 */
		if (!(vnmap->vnmap_flags &
		      (AS_VNMAP_OVERLAP|AS_VNMAP_AUTOGROW))) {
			(*nvnmapp)--;
			if (*nvnmapp >= 0)
				vnmap++;
			else
				vnmap = NULL;
			continue;
		}

		/*
		 * if we have an autogrow region and the iomove
		 * grows the file, we need to calculate if any part
		 * of the iomove that is beyond the EOF will land
		 * in this buffer.  if so, we need to lock those
		 * pages down now otherwise the xfs_write called by
		 * the fault code will need to zero the area of the
		 * file between the current and new EOF, but it has
		 * to get the buffer to do that.  the problem is
		 * the buffer will be held (already locked) by the
		 * i/o so we'll deadlock.
		 */
		if (isize >= 0 && vnmap->vnmap_flags & AS_VNMAP_AUTOGROW) {
			/*
			 * calculate file offsets touched by the iomove
			 * indicated in this vnmap
			 */
			start_trim = vnmap->vnmap_ovvaddr - vnmap->vnmap_vaddr;
			iomove_off_start = vnmap->vnmap_ovoffset - start_trim;
			iomove_off_end = iomove_off_start + vnmap->vnmap_len;

			/*
			 * determine if any of the uiomoved pages are in
			 * the buffer that will be set up by this bmap.
			 * we have to lock down any pages in the buffer
			 * between eof and the end of the uiomove to
			 * grow the file out to the end of the uiomove.
			 */
			if (isize < bmap_start) {
				overlap_off_start = MAX(bmap_start,
							iomove_off_start);
			} else {
				overlap_off_start = MIN(isize,
							iomove_off_start);
			}

			overlap_off_end = MIN(iomove_off_end, bmap_end);

			/*
			 * if so, set up the useracc range to span those
			 * pages.  the useracc range can only grow larger
			 * than that range.  it can't get smaller.
			 */
			if (overlap_off_end - overlap_off_start > 0) {
				agac_vaddr_start = vnmap->vnmap_vaddr +
					(iomove_off_start > overlap_off_start ?
					 iomove_off_start - overlap_off_start :
					 0);
				agac_vaddr_end = vnmap->vnmap_vaddr +
					vnmap->vnmap_len -
					 (iomove_off_end > overlap_off_end ?
					  iomove_off_end - overlap_off_end :
					  0);
			} else {
				agac_vaddr_start = (uvaddr_t) -1LL;
				agac_vaddr_end = (uvaddr_t) 0LL;
			}
		} else {
			agac_vaddr_start = (uvaddr_t) -1LL;
			agac_vaddr_end = (uvaddr_t) 0LL;
		}

		/*
		 * useracc the smallest possible range.  don't bother
		 * unless the overlap specified in the vnmap overlaps
		 * the file offsets specified for the buffer by the bmap.
		 */
		vnmap_end = vnmap->vnmap_ovoffset + vnmap->vnmap_ovlen;
		overlap_off_start = MAX(vnmap->vnmap_ovoffset, bmap_start);
		overlap_off_end = MIN(vnmap_end, bmap_end);

		if (vnmap->vnmap_flags & AS_VNMAP_OVERLAP &&
		    overlap_off_end - overlap_off_start > 0) {
			uacc_startshift = overlap_off_start -
						vnmap->vnmap_ovoffset;
			uacc_endshift = vnmap_end - overlap_off_end;
			ASSERT(overlap_off_start - vnmap->vnmap_ovoffset >= 0);
			ASSERT(vnmap_end - overlap_off_end >= 0);
			useracc_startvaddr = vnmap->vnmap_ovvaddr +
						uacc_startshift;
			useracc_endvaddr = vnmap->vnmap_ovvaddr +
						 vnmap->vnmap_ovlen -
						 uacc_endshift;
		} else {
			useracc_startvaddr = (uvaddr_t) -1LL;
			useracc_endvaddr = (uvaddr_t) 0LL;
		}

		/*
		 * enlarge range if necessary to include
		 * pages that have to be pinned because they
		 * would cause a zero-eof autogrow scenario
		 */
		useracc_startvaddr = MIN(useracc_startvaddr, agac_vaddr_start);
		useracc_endvaddr = MAX(useracc_endvaddr, agac_vaddr_end);

		if (useracc_startvaddr != (uvaddr_t) -1LL &&
		    useracc_endvaddr != (uvaddr_t) 0LL) {
			/*
			 * enable recursive locking so the fault handler won't
			 * block on the i/o lock when setting up the pages.
			 * round the lockdown range to page boundaries.
			 * make sure we don't pin more than maxdmasz pages
			 * per i/o.  that's not allowed.
			 */
			ASSERT(useracc_endvaddr > useracc_startvaddr);
			useracc_startvaddr = (uvaddr_t)
						ctob(btoct(useracc_startvaddr));
			useracc_endvaddr = (uvaddr_t)
						ctob(btoc(useracc_endvaddr));
			useracc_len = useracc_endvaddr - useracc_startvaddr;
			numpages += btoc(useracc_len);

			if (numpages > maxdmasz) {
				cmn_err(CE_WARN,
"xfs_lockdown_iopages needed to pin %d pages, maxdmasz = %d\nPlease increase maxdmasz and try again.\n",
					numpages, maxdmasz);
				error = XFS_ERROR(ENOMEM);
				break;
			}

			xfs_enable_nested_locking();
			error = useracc((void *)useracc_startvaddr, useracc_len,
					B_READ|B_PHYS, NULL);
			xfs_disable_nested_locking();
			if (!error) {
				*useracced = 1;
				uaccmap->xfs_uacstart = useracc_startvaddr;
				uaccmap->xfs_uaclen = useracc_len;
				uaccmap++;
				nuaccmaps++;
			}
		}

		/*
		 * have to check next vmap if this vmap ends
		 * before the buffer does
		 */
		if (bmap_end >= vnmap_end) {
			(*nvnmapp)--;
			if (*nvnmapp >= 0)
				vnmap++;
			else
				vnmap = NULL;
			curr_offset = vnmap_end;
		} else
			curr_offset = bmap_end;
	}

	*nuaccmapp = nuaccmaps;
	*vnmapp = vnmap;

	return error;
}
#endif /* !defined(__linux__) */
/*
 * xfs_unlock_iopages() - unlock the set of pages specified in the
 *	xfs_uaccmap_t's set up by xfs_lockdown_iopages().
 */
#if  !defined(__linux__) 
void
xfs_unlock_iopages(
	xfs_uaccmap_t	*uaccmap,	/* useracc maps */
	int		nuaccmaps)	/* number of filled in uaccmaps */

{
	int	i;

	for (i = 0; i < nuaccmaps; i++, uaccmap++) {
		ASSERT(uaccmap->xfs_uaclen <= ((size_t)-1));
		unuseracc((void *)uaccmap->xfs_uacstart,
			  (size_t)uaccmap->xfs_uaclen,
			  B_PHYS|B_READ);
	}

	return;
}
#endif /* !defined(__linux__) */
/*
 * handles biomoves of data where some of the user addresses are mapped to
 * the file being read/written.  each vnmap_t represents a range of addresses
 * mapped to a file.  that range needs to be biomoved with recursive locking
 * enabled so we don't deadlock on the i/o lock when faulting in the biomove.
 * however, we don't want recursive locking enabled on any other page since
 * we could screw up the locking on other inodes if we try and take the
 * i/o lock on a different inode where we don't hold the lock and we have
 * recursive locking enabled.
 *
 * note -- we could do one biomove if we were willing to add an
 * inode pointer in the uthread along with the vnlocks field.  this
 * code trades off increased complexity and slower execution speed
 * (the extra biomoves plus the cycles required in pas_verifyvnmap
 * to set up the additional vnmap's that we might not need if we
 * weren't breaking up our biomoves) suffered only in the danger cases
 * against memory bloat in the uthread structure that would be suffered by
 * all uthreads.
 *
 * vnmapp and nvnmapp are set to the first map that hasn't completely been
 * moved and the count of remaining valid maps respectively.
 */
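/*
 * Illustrative example (sizes assumed): for a 64k biomove where only
 * bytes 16k-32k of the user buffer are mmapped to this file, the move
 * is split into three biomoves: 0-16k normally, 16k-32k with nested
 * locking enabled, and 32k-64k normally once the vnmaps are exhausted.
 */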
#if !defined(__linux__)
int
xfs_mapped_biomove(
	struct xfs_buf	*bp,
	u_int		pboff,
	size_t		io_len,
	enum uio_rw	rw,
	struct uio	*uiop,
	vnmap_t		**vnmapp,
	int		*nvnmapp)
{
	uvaddr_t	io_end;
	uvaddr_t	current_vaddr;
	int		numvnmaps = *nvnmapp;
	vnmap_t		*vnmap = *vnmapp;
	size_t		partial_io_len;
	size_t		vmap_io_len;
	int		error = 0;

	ASSERT(uiop->uio_iovcnt == 1);

	/*
	 * if the requested biomove lies entirely before the first vnmap
	 * address range, just do a plain biomove
	 */
	current_vaddr = uiop->uio_iov->iov_base;
	io_end = current_vaddr + io_len;

	if (io_end < vnmap->vnmap_vaddr)
		return XFS_ERROR(biomove(bp, pboff, io_len, rw, uiop));

	/*
	 * move as much as we can (up to the first vnmap)
	 */
	if (current_vaddr < vnmap->vnmap_vaddr) {
		partial_io_len = MIN(io_len,
				 (__psunsigned_t) vnmap->vnmap_vaddr -
				 (__psunsigned_t) uiop->uio_iov->iov_base);
		if (error = biomove(bp, pboff, partial_io_len, rw, uiop))
			return XFS_ERROR(error);
		pboff += partial_io_len;
		io_len -= partial_io_len;
		current_vaddr += partial_io_len;
	}

	while (vnmap && current_vaddr < io_end) {
		/*
		 * move what we can of the first vnmap.  allow recursive
		 * io-lock locking in the biomove.
		 */
		ASSERT(uiop->uio_iov->iov_base >= vnmap->vnmap_vaddr);

		ASSERT(current_vaddr >= vnmap->vnmap_vaddr);
		vmap_io_len = vnmap->vnmap_len -
				((__psunsigned_t) uiop->uio_iov->iov_base -
				 (__psunsigned_t) vnmap->vnmap_vaddr);
		partial_io_len = MIN(vmap_io_len, io_len);

		xfs_enable_nested_locking();
		if (error = biomove(bp, pboff, partial_io_len, rw, uiop)) {
			xfs_disable_nested_locking();
			error = XFS_ERROR(error);
			break;
		}
		xfs_disable_nested_locking();
		pboff += partial_io_len;
		io_len -= partial_io_len;
		current_vaddr += partial_io_len;

		/*
		 * did we move the entire vnmap?  if so, look at the next map
		 * and move the data up to the next vnmap if there is one
		 * (or finish up the I/O if we're out of maps)
		 */
		ASSERT(current_vaddr <= vnmap->vnmap_vaddr + vnmap->vnmap_len);
		if (current_vaddr == vnmap->vnmap_vaddr + vnmap->vnmap_len) {
			numvnmaps--;
			if (numvnmaps > 0) {
				vnmap++;
				partial_io_len = MIN(
					(__psunsigned_t)vnmap->vnmap_vaddr -
					(__psunsigned_t)uiop->uio_iov->iov_base,
						     io_len);
			} else {
				vnmap = NULL;
				partial_io_len = io_len;
			}
			if (partial_io_len > 0) {
				if (error = biomove(bp, pboff, partial_io_len,
							rw, uiop)) {
					error = XFS_ERROR(error);
					break;
				}
				pboff += partial_io_len;
				io_len -= partial_io_len;
				current_vaddr += partial_io_len;
			}
		}
	}

	*nvnmapp = numvnmaps;
	*vnmapp = vnmap;

	return error;
}
#endif /* !defined(__linux__) */
/* ARGSUSED */		
#if !defined(__linux__)
int
xfs_read_file(
	xfs_iocore_t	*io,
	bhv_desc_t	*bdp,	      
	uio_t		*uiop,
	int		ioflag,
	cred_t		*credp,
	vnmap_t		*vnmaps,
	int		numvnmaps,
	const uint	vnmapflags,
	xfs_uaccmap_t	*uaccmaps,
	xfs_fsize_t	isize)
{
	struct bmapval	bmaps[XFS_MAX_RW_NBMAPS];
	struct bmapval	*bmapp;
	int		nbmaps;
	vnode_t		*vp;
	xfs_buf_t		*bp;
	int		read_bmaps;
	int		buffer_bytes_ok;
	int		error;
	int		unlocked;
	unsigned int	lockmode;
	xfs_mount_t	*mp;
	int		useracced = 0;
	vnmap_t		*cur_ldvnmap = vnmaps;
	int		num_ldvnmaps = numvnmaps;
	int		num_biovnmaps = numvnmaps;
	int		nuaccmaps;
	int		do_lockdown = vnmapflags & (AS_VNMAP_OVERLAP |
						    AS_VNMAP_AUTOGROW);
	vnmap_t		*cur_biovnmap = vnmaps;

	vp = BHV_TO_VNODE(bdp);
	mp = io->io_mount;

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);


	error = 0;
	buffer_bytes_ok = 0;
	XFSSTATS.xs_read_calls++;
	XFSSTATS.xs_read_bytes += uiop->uio_resid;

	/*
	 * Loop until uio->uio_resid, which is the number of bytes the
	 * caller has requested, goes to 0 or we get an error.  Each
	 * call to xfs_iomap_read tries to map as much of the request
	 * plus read-ahead as it can.  We must hold the inode lock
	 * exclusively when calling xfs_iomap_read.
	 * Bug 516806: i_rlock was introduced to protect the readahead state,
	 * so we do not need to hold the inode lock in exclusive
	 * mode except when we first read in the file and the extents
	 * are in btree format - xfs_ilock_map_shared takes care of it.
	 */
	do {
		lockmode = XFS_LCK_MAP_SHARED(mp, io);
		xfs_rw_enter_trace(XFS_READ_ENTER, io, uiop, ioflag);

		/*
		 * We've fallen off the end of the file, so
		 * just return with what we've done so far.
		 */
		if (uiop->uio_offset >= XFS_SIZE(mp, io)) {
			XFS_UNLK_MAP_SHARED(mp, io, lockmode);
			break;
		}
 
		unlocked = 0;
		nbmaps = mp->m_nreadaheads ;
		ASSERT(nbmaps <= sizeof(bmaps) / sizeof(bmaps[0]));
		/*
		 * XXX - rcc - we could make sure that if an overlap
		 * exists, we don't set up bmaps that are > maxdmasz
		 * pages
		 */
		error = xfs_iomap_read(io, uiop->uio_offset,
			uiop->uio_resid, bmaps, &nbmaps, uiop->uio_pmp,
			&unlocked, lockmode);

		if (!unlocked)
			XFS_UNLK_MAP_SHARED(mp, io, lockmode);

		if (error || (bmaps[0].pbsize == 0)) {
			break;
		}

		bmapp = &bmaps[0];
		read_bmaps = nbmaps;
		ASSERT(BBTOOFF(bmapp->offset) <= uiop->uio_offset);
		/*
		 * The first time through this loop we kick off I/O on
		 * all the bmaps described by the iomap_read call.
		 * Subsequent passes just wait for each buffer needed
		 * to satisfy this request to complete.  Buffers which
		 * are started in the first pass but are actually just
		 * read ahead buffers are never waited for, since uio_resid
		 * will go to 0 before we get to them.
		 *
		 * This works OK because iomap_read always tries to
		 * describe all the buffers we need to satisfy this
		 * read call plus the necessary read-ahead in the
		 * first call to it.
		 */
		while ((uiop->uio_resid != 0) && (nbmaps > 0)) {
			/*
			 * do we have overlapping pages we need to
			 * useracc?  don't have to worry about readahead
			 * buffers since those pages will be marked
			 * P_DONE by the buffer release function out of
			 * biodone when the i/o to the buffer finishes.
			 */
			if (cur_ldvnmap && do_lockdown) {
				nuaccmaps = numvnmaps;
				if (xfs_lockdown_iopages(bmapp, isize,
							vnmapflags,
							&cur_ldvnmap,
							&num_ldvnmaps,
							uaccmaps, &nuaccmaps,
							&useracced)) {
					if (useracced)
						xfs_unlock_iopages(uaccmaps,
								    nuaccmaps);
					error = XFS_ERROR(ENOMEM);
					useracced = 0;
					break;
				}
			}

			bp = chunkread(vp, bmapp, read_bmaps, credp);

			if (XFS_BUF_ISERROR(bp)) {
				error = XFS_BUF_GETERROR(bp);
				ASSERT(error != EINVAL);
				/*
				 * b_relse functions like chunkhold
				 * expect B_DONE to be there.
				 */
				XFS_BUF_DONE(bp);
				XFS_BUF_STALE(bp);
				xfs_buf_relse(bp);
				break;
			} else if (bp->b_resid != 0) {
				buffer_bytes_ok = 0;
				XFS_BUF_DONE(bp);
				XFS_BUF_STALE(bp);
				xfs_buf_relse(bp);
				break;
			} else {
				buffer_bytes_ok = 1;
				ASSERT((BBTOOFF(bmapp->offset) + bmapp->pboff)
				       == uiop->uio_offset);
				if (!cur_biovnmap) {
					error = biomove(bp, bmapp->pboff,
							bmapp->pbsize, UIO_READ,
							uiop);
				} else {
#pragma mips_frequency_hint NEVER
					/*
					 * break up the biomoves so that
					 * we never biomove across a region
					 * that might fault on more than
					 * one inode
					 */
					error = xfs_mapped_biomove(bp,
							bmapp->pboff,
							bmapp->pbsize,
							UIO_READ, uiop,
							&cur_biovnmap,
							&num_biovnmaps);
				}
				if (error) {
					XFS_BUF_DONE(bp);
					XFS_BUF_STALE(bp);
					xfs_buf_relse(bp);
					break;
				}
			}

			xfs_buf_relse(bp);

			if (useracced) {
				xfs_unlock_iopages(uaccmaps, nuaccmaps);
				useracced = 0;
			}

			XFSSTATS.xs_read_bufs++;
			read_bmaps = 1;
			nbmaps--;
			bmapp++;
		}

		if (useracced) {
			xfs_unlock_iopages(uaccmaps, nuaccmaps);
			useracced = 0;
		}
	} while (!error && (uiop->uio_resid != 0) && buffer_bytes_ok);

	return error;
}
#endif /* !defined(__linux__) */

/*	Core component of xfs read vop - this function is used by both
 *	xfs and cxfs to do the top part of a VOP_READ after filesystem
 *	specific locking and token management has been done.
 */

#if !defined(__linux__)
int
xfs_read_core(
	bhv_desc_t	*bdp,
	xfs_iocore_t	*io,
	uio_t		*uiop,
	int		ioflag,
	cred_t		*credp,
	flid_t		*fl,
	int		type,
	vnmap_t		*vnmaps,
	int		numvnmaps,
	const uint	vnmapflags,
	xfs_uaccmap_t	*uaccmaps,
	xfs_fsize_t	map_maxoffset,
	int		lflag,
	vrwlock_t	*lmode)
{
	vnode_t		*vp;
	off_t		n;
	off_t		offset;
	size_t		count, resid;
	xfs_fsize_t	isize;
	int		error;
	xfs_mount_t	*mp;

	vp = BHV_TO_VNODE(bdp);
	mp = io->io_mount;

	offset = uiop->uio_offset;
	count = uiop->uio_resid;

	/*
	 * if we're in a possible mmap autogrow case, check to
	 * see if a biomove is going to have to grow the file.
	 * if so, drop the iolock and re-obtain it in write mode.
	 * it's possible that someone might have grown the file
	 * while we were re-acquiring the lock.  if so, then we
	 * demote the iolock from exclusive back to shared and
	 * proceed onwards.
	 */
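	/*
	 * The upshot of the block below: isize stays -1 when no autogrow
	 * is possible (the mapping does not extend past EOF, or the file
	 * grew while we upgraded the lock), and in those cases the iolock
	 * ends up shared again.  Only when the mapping still reaches past
	 * EOF after re-locking do we keep the iolock exclusive and tell
	 * the caller, via *lmode, to unlock it as VRWLOCK_WRITE.
	 */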
	isize = -1;

	if (vnmapflags & AS_VNMAP_AUTOGROW) {
		XFS_ILOCK(mp, io, XFS_ILOCK_SHARED | XFS_SIZE_TOKEN_RD);

		isize = XFS_SIZE(mp, io);
		if (map_maxoffset <= isize) {
			XFS_IUNLOCK(mp, io,
				XFS_ILOCK_SHARED | XFS_SIZE_TOKEN_RD);
			isize = -1;
		} else {
			/*
			 * note, we don't have to worry about the
			 * multi-threaded ilock_nowait case above
			 * because the fault path will never biomove
			 * a page and cause an autogrow fault
			 */
			XFS_IUNLOCK(mp, io,
				XFS_ILOCK_SHARED | XFS_IOLOCK_SHARED |
				XFS_SIZE_TOKEN_RD | lflag);
			XFS_ILOCK(mp, io,
				XFS_ILOCK_SHARED | XFS_IOLOCK_EXCL |
				XFS_SIZE_TOKEN_RD | lflag);
			ioflag |= IO_LOCKED_EXCL;
			isize = XFS_SIZE(mp, io);
			if (map_maxoffset > isize) {
				XFS_IUNLOCK(mp, io,
					XFS_ILOCK_SHARED | XFS_SIZE_TOKEN_RD);
				*lmode = VRWLOCK_WRITE;
			} else {
				isize = -1;
				XFS_IUNLOCK(mp, io,
					XFS_ILOCK_SHARED | XFS_SIZE_TOKEN_RD);
				XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL);
			}
		}
	}


#ifndef SIM
	/* check for locks if some exist and mandatory locking is enabled */
	if ((vp->v_flag & (VENF_LOCKING|VFRLOCKS)) == 
	    (VENF_LOCKING|VFRLOCKS)) {
		error = XFS_CHECKLOCK(mp, bdp, vp, FREAD, offset, count, 
				uiop->uio_fmode, credp, fl, VRWLOCK_READ,
				ioflag);
		if (error)
			goto out;
	}
#endif

	if (offset < 0) {
		error = XFS_ERROR(EINVAL);
		goto out;
	}
	if ((ssize_t)count <= 0) {
		error = (ssize_t)count < 0 ? XFS_ERROR(EINVAL) : 0;
		goto out;
	}
	if (ioflag & IO_RSYNC) {
		/* First we sync the data */
		if ((ioflag & IO_SYNC) || (ioflag & IO_DSYNC)) {
			chunkpush(vp, offset, offset + count - 1, 0);
		}
		error = XFS_RSYNC(mp, io, ioflag, offset, offset + count);
		if (error == EFSCORRUPTED)
			goto out;
	}
	switch (type) {
	case IFREG:
		/*
		 * Don't allow reads to pass down counts which could
		 * overflow.  Be careful to record the part that we
		 * refuse so that we can add it back into uio_resid
		 * so that the caller will see a short read.
		 */
		n = XFS_MAX_FILE_OFFSET - offset;
		if (n <= 0) {
			error = 0;
			goto out;
		}
		if (n < uiop->uio_resid) {
			resid = uiop->uio_resid - n;
			uiop->uio_resid = n;
		} else {
			resid = 0;
		}
			    
#ifndef SIM
		if (DM_EVENT_ENABLED_IO(vp->v_vfsp, io, DM_EVENT_READ) &&
		    !(ioflag & IO_INVIS)) {
			vrwlock_t	locktype = VRWLOCK_READ;

			error = xfs_dm_send_data_event(DM_EVENT_READ, bdp,
					offset, count,
					UIO_DELAY_FLAG(uiop), &locktype);
			if (error)
				goto out;
		}
#endif /* SIM */

#ifndef SIM
		/*
		 * Respect preferred read size if indicated in uio structure.
		 * But if the read size has already been set, go with the
		 * smallest value.  Silently ignore requests that aren't
		 * within valid I/O size limits.
		 */
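		/*
		 * Worked example (block size assumed): with sb_blocklog ==
		 * 12 (4k blocks), a uio_readiolog of 16 that passes the
		 * range checks below gives io_readio_blocks = 1 << (16 - 12)
		 * = 16 blocks, i.e. a preferred read size of 64k.
		 */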

		if ((ioflag & IO_UIOSZ) &&
		    uiop->uio_readiolog != io->io_readio_log &&
		    uiop->uio_readiolog >= mp->m_sb.sb_blocklog &&
		    uiop->uio_readiolog >= XFS_UIO_MIN_READIO_LOG &&
		    uiop->uio_readiolog <= XFS_UIO_MAX_READIO_LOG) {
			XFS_ILOCK(mp, io, XFS_ILOCK_EXCL);
#if !(defined(DEBUG) && defined(UIOSZ_DEBUG))
			if (!(io->io_flags & XFS_IOCORE_UIOSZ) ||
			    uiop->uio_readiolog < io->io_readio_log) {
#endif /* ! (DEBUG && UIOSZ_DEBUG) */
				io->io_readio_log =  uiop->uio_readiolog;
				io->io_readio_blocks = 1 <<
						(int) (io->io_readio_log -
							mp->m_sb.sb_blocklog);
				/*
				 * set inode max io field to largest
				 * possible value that could have been
				 * applied to the inode
				 */
				if (!(io->io_flags & XFS_IOCORE_UIOSZ))  {
					io->io_max_io_log = MAX(io->io_max_io_log,
							MAX(mp->m_readio_log,
							    io->io_readio_log));
					io->io_flags |= XFS_IOCORE_UIOSZ;
				}
#if defined(DEBUG) && defined(UIOSZ_DEBUG)
				atomicAddInt(&uiodbg_switch, 1);
				atomicAddInt(
					&(uiodbg_readiolog[io->io_readio_log -
						XFS_UIO_MIN_READIO_LOG]),
					1);
#endif
#if !(defined(DEBUG) && defined(UIOSZ_DEBUG))
			}
#endif /* ! (DEBUG && UIOSZ_DEBUG) */
			XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL);
		}
#endif /* !SIM */

		if (ioflag & IO_DIRECT) {
			error = xfs_diordwr(bdp, io, uiop, ioflag, credp,
						B_READ, NULL, NULL);
		} else {
			error = xfs_read_file(io, bdp, uiop, ioflag, credp,
					      vnmaps, numvnmaps,
					      vnmapflags, uaccmaps, isize);
		}

		ASSERT(ismrlocked(io->io_iolock, MR_ACCESS | MR_UPDATE) != 0);
		/* don't update timestamps if doing invisible I/O */
		if (!(ioflag & IO_INVIS)) {
			XFS_CHGTIME(mp, io, XFS_ICHGTIME_ACC);
		}

		/*
		 * Add back whatever we refused to do because of file
		 * size limitations.
		 */
		uiop->uio_resid += resid;

		break;

	case IFDIR:
		error = XFS_ERROR(EISDIR);
		break;

	case IFLNK:
		error = XFS_ERROR(EINVAL);
		break;
	      
	case IFSOCK:
		error = XFS_ERROR(ENODEV);
		break;

	default:
		ASSERT(0);
		error = XFS_ERROR(EINVAL);
		break;
	}

out:
	return error;
}
#endif /* !defined(__linux__) */

#if 1
int
xfs_check_mapped_io(
	vnode_t		*vp,
	uio_t		*uiop,
	vnmap_t		**rvnmaps,
	int		*num_rvnmaps,
	int		*rvnmap_size,
	int		*rvnmap_flags,
	xfs_fsize_t 	*map_maxoffset,
	xfs_uaccmap_t	**uaccmaps)
{
	int			error;
	vasid_t			vasid;
	as_verifyvnmap_t	vnmap_args;
	as_verifyvnmapres_t	vnmap_res;

	as_lookup_current(&vasid);
	VAS_LOCK(vasid, AS_SHARED);

	vnmap_args.as_vp = vp;
	vnmap_args.as_vaddr = (uvaddr_t)
				ctob(btoct(uiop->uio_iov->iov_base));
	vnmap_args.as_len = uiop->uio_iov->iov_len +
			((__psunsigned_t) uiop->uio_iov->iov_base -
			 (__psunsigned_t) vnmap_args.as_vaddr);
	vnmap_args.as_offstart = uiop->uio_offset;
	vnmap_args.as_offend = uiop->uio_offset + uiop->uio_resid;
	vnmap_args.as_vnmap = *rvnmaps;
	vnmap_args.as_nmaps = *num_rvnmaps;

	if (error = VAS_VERIFYVNMAP(vasid, &vnmap_args, &vnmap_res)) {
		VAS_UNLOCK(vasid);
		return error;
	}

	VAS_UNLOCK(vasid);

	if (vnmap_res.as_rescodes) {
		if (vnmap_res.as_multimaps) {
			*rvnmaps = vnmap_res.as_multimaps;
		}
		*rvnmap_flags = vnmap_res.as_rescodes;
		*num_rvnmaps = vnmap_res.as_nmaps;
		*rvnmap_size = vnmap_res.as_mapsize;
		*map_maxoffset = vnmap_res.as_maxoffset;
		if (vnmap_res.as_nmaps > XFS_NUMVNMAPS) {
			if ((*uaccmaps = kmem_alloc(vnmap_res.as_nmaps *
						sizeof(xfs_uaccmap_t),
						KM_SLEEP)) == NULL) {
				if (vnmap_res.as_multimaps > 0)
					kmem_free(*rvnmaps, *rvnmap_size);

				return ENOMEM;
			}
		}
	} else {
		*rvnmaps = NULL;
		*uaccmaps = NULL;
	}

	return 0;
}
#endif /* 1 */
/*
 * xfs_read
 *
 * This is the XFS VOP_READ entry point.  It does some minimal
 * error checking and then switches out based on the file type.
 */
#if !defined(__linux__)
int
xfs_read(
	bhv_desc_t	*bdp,
	uio_t		*uiop,
	int		ioflag,
	cred_t		*credp,
	flid_t		*fl)
{
	xfs_inode_t	*ip;
	int		type;
	int		error;
	int		lflag;
	vrwlock_t	lmode;
	vnode_t 	*vp;
	xfs_fsize_t	map_maxoffset;
	vnmap_t			vnmaps[XFS_NUMVNMAPS];
	vnmap_t			*rvnmaps;
	int			num_rvnmaps;
	int			rvnmap_flags;
	int			rvnmap_size = 0;
	xfs_uaccmap_t		uaccmap_array[XFS_NUMVNMAPS];
	xfs_uaccmap_t		*uaccmaps;

#if defined(DEBUG) && defined(UIOSZ_DEBUG)
	/*
	 * Randomly set io size
	 */
	extern ulong_t	random(void);
	extern int	srandom(int);
	timespec_t	now;		/* current time */
	static int	seed = 0;	/* randomizing seed value */

	if (!seed) {
		nanotime(&now);
		seed = (int)now.tv_sec ^ (int)now.tv_nsec;
		srandom(seed);
	}
	ioflag |= IO_UIOSZ;
	uiop->uio_readiolog = (random() & 0x3) + XFS_UIO_MIN_READIO_LOG;
#endif

	vp = BHV_TO_VNODE(bdp);
	ip = XFS_BHVTOI(bdp);
	lmode = VRWLOCK_READ;
	lflag = 0;

	/*
	 * need to protect against deadlocks that can occur if the
	 * biomove touches a virtual address in user space that is
	 * mapped to the file being read.  This only works for
	 * read/write and pread/pwrite.  readv/writev lose.
	 * direct i/o loses too for now.
	 *
	 * note that if someone remaps the user buffer to this file
	 * while the I/O is in progress, we lose, too.  instant deadlock.
	 */
	rvnmaps = NULL;
	num_rvnmaps = 0;
	rvnmap_flags = 0;
	uaccmaps = NULL;

	if (uiop->uio_segflg == UIO_USERSPACE && uiop->uio_iovcnt == 1 &&
	    !(ioflag & IO_DIRECT) && VN_MAPPED(vp)) {
#pragma mips_frequency_hint NEVER

		rvnmaps = vnmaps;
		uaccmaps = uaccmap_array;
		num_rvnmaps = XFS_NUMVNMAPS;

		if (error = xfs_check_mapped_io(vp, uiop, &rvnmaps,
				&num_rvnmaps, &rvnmap_size, &rvnmap_flags,
				&map_maxoffset, &uaccmaps)) {
			return XFS_ERROR(error);
		}
	}


	/*
	 * check if we're in recursive lock mode (a read inside a biomove
	 * to a page that is mapped to ip and has faulted)
	 */
	lflag = xfs_is_nested_locking_enabled()
		 ? XFS_IOLOCK_NESTED
		 : 0;
	if (!(ioflag & IO_ISLOCKED)) {
		/* For calls from the paging system where the faulting
		 * process is multithreaded, try to grab the I/O lock,
		 * if it is already held, then we ask the paging system
		 * to try again by returning EAGAIN.  It's safe to return
		 * directly since the UIO_NOSPACE i/o never takes the
		 * aspacelock (VAS_LOCK) above.
		 */
		if ((uiop->uio_segflg == UIO_NOSPACE) &&
		    (ioflag & IO_MTTHREAD) && VN_MAPPED(vp)) {
			if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED|lflag)) {
				return XFS_ERROR(EAGAIN);
			}
		} else {
			xfs_rwlockf(bdp, VRWLOCK_READ, lflag);
		}
	}

	type = ip->i_d.di_mode & IFMT;
	ASSERT(type == IFDIR ||
	       ismrlocked(&ip->i_iolock, MR_ACCESS | MR_UPDATE) != 0);

	ASSERT(type == IFREG || type == IFDIR ||
	       type == IFLNK || type == IFSOCK);

	/*
	 * Not ready for in-line files yet.
	 */
	if (type == IFREG) {
		ASSERT((ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS) ||
		       (ip->i_d.di_format == XFS_DINODE_FMT_BTREE));
	}

	error = xfs_read_core(bdp, &ip->i_iocore, uiop, ioflag, credp, fl,
			      type, rvnmaps, num_rvnmaps, rvnmap_flags,
			      uaccmaps, map_maxoffset, lflag, &lmode);

	if (rvnmap_size > 0)
		kmem_free(rvnmaps, rvnmap_size);

	if (num_rvnmaps > XFS_NUMVNMAPS)
		kmem_free(uaccmaps, num_rvnmaps * sizeof(xfs_uaccmap_t));

	if (!(ioflag & IO_ISLOCKED))
		xfs_rwunlockf(bdp, lmode, lflag);

	return error;
}
#endif /* !defined(__linux__) */

/*
 * Map the given I/O size and I/O alignment over the given extent.
 * If we're at the end of the file and the underlying extent is
 * delayed alloc, make sure we extend out to the
 * next i_writeio_blocks boundary.  Otherwise make sure that we
 * are confined to the given extent.
 */
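/*
 * Worked example (numbers assumed): given an extent with br_startoff == 12
 * and br_blockcount == 6, an ioalign of 8 and an iosize of 16 blocks, the
 * requested alignment starts before the extent, so iosize is trimmed to 12
 * and bmapp->offset becomes 12; the trailing trim below then cuts the
 * length back to the 6 blocks actually covered by the extent.
 */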
/*ARGSUSED*/
#if 1
STATIC void
xfs_write_bmap(
	xfs_mount_t	*mp,
	xfs_bmbt_irec_t	*imapp,
	struct bmapval	*bmapp,
	int		iosize,
	xfs_fileoff_t	ioalign,
	xfs_fsize_t	isize)
{
	__int64_t	extra_blocks;
	xfs_fileoff_t	size_diff;
	xfs_fileoff_t	ext_offset;
	xfs_fsblock_t	start_block;
	
	if (ioalign < imapp->br_startoff) {
		/*
		 * The desired alignment doesn't end up on this
		 * extent.  Move up to the beginning of the extent.
		 * Subtract whatever we drop from the iosize so that
		 * we stay aligned on iosize boundaries.
		 */
		size_diff = imapp->br_startoff - ioalign;
		iosize -= (int)size_diff;
		ASSERT(iosize > 0);
		ext_offset = 0;
		bmapp->offset = imapp->br_startoff;
	} else {
		/*
		 * The alignment requested fits on this extent,
		 * so use it.
		 */
		ext_offset = ioalign - imapp->br_startoff;
		bmapp->offset = ioalign;
	}
	start_block = imapp->br_startblock;
	ASSERT(start_block != HOLESTARTBLOCK);
	if (start_block != DELAYSTARTBLOCK) {
		bmapp->bn = start_block + ext_offset;
		bmapp->eof = (imapp->br_state != XFS_EXT_UNWRITTEN) ?
					0 : BMAP_UNWRITTEN;
	} else {
		bmapp->bn = -1;
		bmapp->eof = BMAP_DELAY;
	}
	bmapp->length = iosize;

	/*
	 * If the iosize from our offset extends beyond the end of
	 * the extent, then trim down length to match that of the extent.
	 */
	extra_blocks = (off_t)(bmapp->offset + bmapp->length) -
		       (__uint64_t)(imapp->br_startoff +
				    imapp->br_blockcount);
	if (extra_blocks > 0) {
		bmapp->length -= extra_blocks;
		ASSERT(bmapp->length > 0);
	}

	bmapp->bsize = XFS_FSB_TO_B(mp, bmapp->length);
}
#endif /* 1 */
/*
 * This routine is called to handle zeroing any space in the last
 * block of the file that is beyond the EOF.  We do this since the
 * size is being increased without writing anything to that block
 * and we don't want anyone to read the garbage on the disk.
 */
/* ARGSUSED */
#if 1
STATIC int				/* error */
xfs_zero_last_block(
	vnode_t		*vp,
	xfs_iocore_t	*io,
	off_t		offset,
	xfs_fsize_t	isize,
	cred_t		*credp,
	struct pm	*pmp)
{
	xfs_fileoff_t	last_fsb;
	xfs_fileoff_t	next_fsb;
	xfs_fileoff_t	end_fsb;
	xfs_fsblock_t	firstblock;
	xfs_mount_t	*mp;
	xfs_buf_t		*bp;
	int		nimaps;
	int		zero_offset;
	int		zero_len;
	int		isize_fsb_offset;
	int		i;
	int		error;
	int		hole;
	pfd_t		*pfdp;
	xfs_bmbt_irec_t	imap;
	struct bmapval	bmap;

	ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0);
	ASSERT(offset > isize);

	mp = io->io_mount;

	/*
	 * If the file system block size is less than the page size,
	 * then there could be bytes in the last page after the last
	 * fsblock containing isize which have not been initialized.
	 * Since if such a page is in memory it will be marked P_DONE
	 * and may now be fully accessible, we need to zero any part of
	 * it which is beyond the old file size.  We don't need to send
	 * this out to disk, we're just initializing it to zeroes like
	 * we would have done in xfs_strat_read() had the size been bigger.
	 */
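	/*
	 * Illustration (page and block sizes assumed): with 4k blocks and
	 * 16k pages (NBPP == 0x4000), an isize of 0x5200 gives i ==
	 * poff(isize) == 0x1200, so the page_zero() below clears bytes
	 * 0x1200 through 0x3fff of the page containing the old EOF.
	 */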
	if ((mp->m_sb.sb_blocksize < NBPP) && ((i = poff(isize)) != 0)) {
		pfdp = pfind(vp, offtoct(isize), VM_ATTACH);
		if (pfdp != NULL) {
			page_zero(pfdp, NOCOLOR, i, (NBPP - i));

			/*
			 * Now we check to see if there are any holes in the
			 * page over the end of the file that are beyond the
			 * end of the file.  If so, we want to set the P_HOLE
			 * flag in the page and blow away any active mappings
			 * to it so that future faults on the page will cause
			 * the space where the holes are to be allocated.
			 * This keeps us from losing updates that are beyond
			 * the current end of file when the page is already
			 * in memory.
			 */
			next_fsb = XFS_B_TO_FSBT(mp, isize);
			end_fsb = XFS_B_TO_FSB(mp, ctooff(offtoc(isize)));
			hole = 0;
			while (next_fsb < end_fsb) {
				nimaps = 1;
				firstblock = NULLFSBLOCK;
				error = XFS_BMAPI(mp, NULL, io, next_fsb, 1, 0,
						  &firstblock, 0, &imap,
						  &nimaps, NULL);
				if (error) {
					pagefree(pfdp);
					return error;
				}
				ASSERT(nimaps > 0);
				if (imap.br_startblock == HOLESTARTBLOCK) {
					hole = 1;
					break;
				}
				next_fsb++;
			}
			if (hole) {
				/*
				 * In order to make processes notice the
				 * newly set P_HOLE flag, blow away any
				 * mappings to the file.  We have to drop
				 * the inode lock while doing this to avoid
				 * deadlocks with the chunk cache.
				 */
				if (VN_MAPPED(vp)) {
					XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL |
							    XFS_EXTSIZE_RD);
					VOP_PAGES_SETHOLE(vp, pfdp, 1, 1,
						ctooff(offtoct(isize)));
					XFS_ILOCK(mp, io, XFS_ILOCK_EXCL |
							  XFS_EXTSIZE_RD);
				}
			}
			pagefree(pfdp);
		}
	}

	isize_fsb_offset = XFS_B_FSB_OFFSET(mp, isize);
	if (isize_fsb_offset == 0) {
		/*
		 * There are no extra bytes in the last block to
		 * zero, so return.
		 */
		return 0;
	}

	last_fsb = XFS_B_TO_FSBT(mp, isize);
	nimaps = 1;
	firstblock = NULLFSBLOCK;
	error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, &firstblock, 0, &imap,
			  &nimaps, NULL);
	if (error) {
		return error;
	}
	ASSERT(nimaps > 0);
	/*
	 * If the block underlying isize is just a hole, then there
	 * is nothing to zero.
	 */
	if (imap.br_startblock == HOLESTARTBLOCK) {
		return 0;
	}
	/*
	 * Get a buffer for the last block, zero the part beyond the
	 * EOF, and write it out sync.  We need to drop the ilock
	 * while we do this so we don't deadlock when the buffer cache
	 * calls back to us.
	 */
	XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD);
	bmap.offset = XFS_FSB_TO_BB(mp, last_fsb);
	bmap.length = XFS_FSB_TO_BB(mp, 1);
	bmap.bsize = BBTOB(bmap.length);
	bmap.pboff = 0;
	bmap.pbsize = bmap.bsize;
	if (io->io_flags & XFS_IOCORE_RT) {
		bmap.pbdev = mp->m_rtdev;
	} else {
		bmap.pbdev = mp->m_dev;
	}
	bmap.eof = BMAP_EOF;
	bmap.pmp = pmp;
	if (imap.br_startblock != DELAYSTARTBLOCK) {
		bmap.bn = XFS_FSB_TO_DB_IO(io, imap.br_startblock);
		if (imap.br_state == XFS_EXT_UNWRITTEN)
			bmap.eof |= BMAP_UNWRITTEN;
	} else {
		bmap.bn = -1;
		bmap.eof |= BMAP_DELAY;
	}
	bp = chunkread(vp, &bmap, 1, credp);
	if (XFS_BUF_ISERROR(bp)) {
		error = XFS_BUF_GETERROR(bp);
		XFS_BUF_DONE(bp);
		XFS_BUF_STALE(bp);
		xfs_buf_relse(bp);
		XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
		return error;
	}
	zero_offset = isize_fsb_offset;
	zero_len = mp->m_sb.sb_blocksize - isize_fsb_offset;
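	/*
	 * For example (4k blocks assumed): an isize of 10000 gives
	 * isize_fsb_offset == 1808, so we zero bytes 1808 through 4095 of
	 * the block containing the old EOF before writing it out.
	 */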
	xfs_zero_bp(bp, zero_offset, zero_len);
	/*
	 * We don't want to start a transaction here, so don't
	 * push out a buffer over a delayed allocation extent.
	 * Also, we can get away with it since the space isn't
	 * allocated so it's faster anyway.
	 *
	 * We don't bother to call xfs_b*write here since this is
	 * just userdata, and we don't want to bring the filesystem
	 * down if they hit an error. Since these will go through
	 * xfsstrategy anyway, we have control over whether to let the
	 * buffer go through or not, in case of a forced shutdown.
	 */
	ASSERT(bp->b_vp);
	if (imap.br_startblock == DELAYSTARTBLOCK ||
	    imap.br_state == XFS_EXT_UNWRITTEN) {
		XFS_bdwrite(bp);
	} else {
		error = XFS_bwrite(bp);
	}

	XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
	return error;
}
#endif /* 1 */
/*
 * Zero any on disk space between the current EOF and the new,
 * larger EOF.  This handles the normal case of zeroing the remainder
 * of the last block in the file and the unusual case of zeroing blocks
 * out beyond the size of the file.  This second case only happens
 * with fixed size extents and when the system crashes before the inode
 * size was updated but after blocks were allocated.  Holes in the range
 * are left alone as holes.
 */
#if 1
int					/* error */
xfs_zero_eof(
	vnode_t		*vp,
	xfs_iocore_t	*io,
	off_t		offset,
	xfs_fsize_t	isize,
	cred_t		*credp,
	struct pm	*pmp)
{
	xfs_fileoff_t	start_zero_fsb;
	xfs_fileoff_t	end_zero_fsb;
	xfs_fileoff_t	prev_zero_fsb;
	xfs_fileoff_t	zero_count_fsb;
	xfs_fileoff_t	last_fsb;
	xfs_fsblock_t	firstblock;
	xfs_extlen_t	buf_len_fsb;
	xfs_extlen_t	prev_zero_count;
	xfs_mount_t	*mp;
	xfs_buf_t		*bp;
	int		nimaps;
	int		error;
	xfs_bmbt_irec_t	imap;
	struct bmapval	bmap;
	pfd_t		*pfdp;
	int		i;
	int		length;

	ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
	ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));

	mp = io->io_mount;

	/*
	 * First handle zeroing the block on which isize resides.
	 * We only zero a part of that block so it is handled specially.
	 */
	error = xfs_zero_last_block(vp, io, offset, isize, credp, pmp);
	if (error) {
		ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
		ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
		return error;
	}

	/*
	 * Calculate the range between the new size and the old
	 * where blocks needing to be zeroed may exist.  To get the
	 * block where the last byte in the file currently resides,
	 * we need to subtract one from the size and truncate back
	 * to a block boundary.  We subtract 1 in case the size is
	 * exactly on a block boundary.
	 */
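	/*
	 * Worked example (4k blocks assumed): if isize is exactly 8192 and
	 * the new offset is 20000, then last_fsb == 1, start_zero_fsb == 2
	 * and end_zero_fsb == 4, so blocks 2 through 4 are the candidates
	 * for zeroing below.
	 */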
	last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
	start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
	end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
	ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
	if (last_fsb == end_zero_fsb) {
		/*
		 * The size was only incremented on its last block.
		 * We took care of that above, so just return.
		 */
		return 0;
	}

	ASSERT(start_zero_fsb <= end_zero_fsb);
	prev_zero_fsb = NULLFILEOFF;
	prev_zero_count = 0;
	while (start_zero_fsb <= end_zero_fsb) {
		nimaps = 1;
		zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
		firstblock = NULLFSBLOCK;
		error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb,
				  0, &firstblock, 0, &imap, &nimaps, NULL);
		if (error) {
			ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
			ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
			return error;
		}
		ASSERT(nimaps > 0);

		if (imap.br_startblock == HOLESTARTBLOCK) {
			/* 
			 * This loop handles initializing pages that were
			 * partially initialized by the code below this 
			 * loop. It basically zeroes the part of the page
			 * that sits on a hole and sets the page as P_HOLE
			 * and calls remapf if it is a mapped file.
			 */	
			if ((prev_zero_fsb != NULLFILEOFF) && 
			    (dtopt(XFS_FSB_TO_BB(mp, prev_zero_fsb)) ==
			     dtopt(XFS_FSB_TO_BB(mp, imap.br_startoff)) ||
			     dtopt(XFS_FSB_TO_BB(mp, prev_zero_fsb + 
						     prev_zero_count)) ==
			     dtopt(XFS_FSB_TO_BB(mp, imap.br_startoff)))) {

				pfdp = pfind(vp, dtopt(XFS_FSB_TO_BB(mp, 
						imap.br_startoff)), VM_ATTACH);

				if (pfdp != NULL) {
					i = poff(XFS_FSB_TO_B(mp, 
							imap.br_startoff));
					length = MIN(NBPP - i, XFS_FSB_TO_B(mp, 
							 imap.br_blockcount));

					page_zero(pfdp, NOCOLOR, i, length);

					if (VN_MAPPED(vp))
					    VOP_PAGES_SETHOLE(vp, pfdp, 1, 1, 
						ctooff(offtoct(XFS_FSB_TO_B(mp, 
						imap.br_startoff))));
					pagefree(pfdp);
				}
			}
		   	prev_zero_fsb = NULLFILEOFF;
			prev_zero_count = 0;
		   	start_zero_fsb = imap.br_startoff +
					 imap.br_blockcount;
			ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
			continue;
		}

		/*
		 * There are blocks in the range requested.
		 * Zero them a single write at a time.  We actually
		 * don't zero the entire range returned if it is
		 * too big and simply loop around to get the rest.
		 * That is not the most efficient thing to do, but it
		 * is simple and this path should not be exercised often.
		 */
		buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount,
					      io->io_writeio_blocks);

		/*
		 * Drop the inode lock while we're doing the I/O.
		 * We'll still have the iolock to protect us.
		 */
		XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);

		bmap.offset = XFS_FSB_TO_BB(mp, imap.br_startoff);
		bmap.length = XFS_FSB_TO_BB(mp, buf_len_fsb);
		bmap.bsize = BBTOB(bmap.length);
		bmap.eof = BMAP_EOF;
		bmap.pmp = pmp;
		if (imap.br_startblock == DELAYSTARTBLOCK) {
			bmap.eof |= BMAP_DELAY;
			bmap.bn = -1;
		} else {
			bmap.bn = XFS_FSB_TO_DB_IO(io, imap.br_startblock);
			if (imap.br_state == XFS_EXT_UNWRITTEN)
				bmap.eof |= BMAP_UNWRITTEN;
		}
		if (io->io_flags & XFS_IOCORE_RT) {
			bmap.pbdev = mp->m_rtdev;
		} else {
			bmap.pbdev = mp->m_dev;
		}
		bp = getchunk(vp, &bmap, credp);

#ifdef _VCE_AVOIDANCE
		if (vce_avoidance) {
			extern void biozero(struct xfs_buf *, u_int, int);
			biozero(bp, 0, bmap.bsize);
		} else
#endif
		{	
			xfs_bp_mapin(bp);
			bzero(XFS_BUF_PTR(bp), XFS_BUF_COUNT(bp));
		}
		ASSERT(bp->b_vp);
		xfs_buftrace("XFS ZERO EOF", bp);
		if (imap.br_startblock == DELAYSTARTBLOCK ||
		    imap.br_state == XFS_EXT_UNWRITTEN) {
			XFS_bdwrite(bp);
		} else {
			error = XFS_bwrite(bp);
			if (error) {
				XFS_ILOCK(mp, io, XFS_ILOCK_EXCL);
				return error;
			}
		}

		prev_zero_fsb = start_zero_fsb;
		prev_zero_count = buf_len_fsb;
		start_zero_fsb = imap.br_startoff + buf_len_fsb;
		ASSERT(start_zero_fsb <= (end_zero_fsb + 1));

		XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
	}

	return 0;
}
#endif /* 1 */

#if 1
STATIC int
xfs_iomap_write(
	xfs_iocore_t	*io,
	off_t		offset,
	size_t		count,
	struct bmapval	*bmapp,
	int		*nbmaps,
	int		ioflag,
	struct pm	*pmp)
{
	xfs_fileoff_t	offset_fsb;
	xfs_fileoff_t	ioalign;
	xfs_fileoff_t	next_offset_fsb;
	xfs_fileoff_t	last_fsb;
	xfs_fileoff_t	bmap_end_fsb;
	xfs_fileoff_t	last_file_fsb;
	xfs_fileoff_t	start_fsb;
	xfs_filblks_t	count_fsb;
	off_t		aligned_offset;
	xfs_fsize_t	isize;
	xfs_fsblock_t	firstblock;
	__uint64_t	last_page_offset;
	int		nimaps;
	int		error;
	int		n;
	unsigned int	iosize;
	unsigned int	writing_bytes;
	short		filled_bmaps;
	short		x;
	short		small_write;
	size_t		count_remaining;
	xfs_mount_t	*mp;
	struct bmapval	*curr_bmapp;
	struct bmapval	*next_bmapp;
	struct bmapval	*last_bmapp;
	xfs_bmbt_irec_t	*curr_imapp;
	xfs_bmbt_irec_t	*last_imapp;
#define	XFS_WRITE_IMAPS	XFS_BMAP_MAX_NMAP
	xfs_bmbt_irec_t	imap[XFS_WRITE_IMAPS];
	int		aeof;

	ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0);

	xfs_iomap_enter_trace(XFS_IOMAP_WRITE_ENTER, io, offset, count);

	mp = io->io_mount;
/***
	ASSERT(! XFS_NOT_DQATTACHED(mp, ip));
***/

	isize = XFS_SIZE(mp, io);
	if (io->io_new_size > isize) {
		isize = io->io_new_size;
	}

	aeof = 0;
	offset_fsb = XFS_B_TO_FSBT(mp, offset);
	last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
	/*
	 * If the caller is doing a write at the end of the file,
	 * then extend the allocation (and the buffer used for the write)
	 * out to the file system's write iosize.  We clean up any extra
	 * space left over when the file is closed in xfs_inactive().
	 * We can only do this if we are sure that we will create buffers
	 * over all of the space we allocate beyond the end of the file.
	 * Not doing so would allow us to create delalloc blocks with
	 * no pages in memory covering them.  So, we need to check that
	 * there are not any real blocks in the area beyond the end of
	 * the file which we are optimistically going to preallocate. If
	 * there are then our buffers will stop when they encounter them
	 * and we may accidentally create delalloc blocks beyond them
	 * that we never cover with a buffer.  All of this is because
	 * we are not actually going to write the extra blocks preallocated
	 * at this point.
	 *
	 * We don't bother with this for sync writes, because we need
	 * to minimize the amount we write for good performance.
	 */
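	/*
	 * In other words, the loop below scans io_writeio_blocks worth of
	 * blocks starting at the block holding the last byte of this write;
	 * if any of them is already a real allocated block we jump to
	 * write_map without extending, otherwise last_fsb is pushed out to
	 * a full writeio-aligned chunk and aeof notes that we are
	 * allocating past the current EOF.
	 */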
	if (!(ioflag & IO_SYNC) && ((offset + count) > XFS_SIZE(mp, io))) {
		start_fsb = XFS_B_TO_FSBT(mp,
				  ((xfs_ufsize_t)(offset + count - 1)));
		count_fsb = io->io_writeio_blocks;
		while (count_fsb > 0) {
			nimaps = XFS_WRITE_IMAPS;
			firstblock = NULLFSBLOCK;
			error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb,
					  0, &firstblock, 0, imap, &nimaps,
					  NULL);
			if (error) {
				return error;
			}
			for (n = 0; n < nimaps; n++) {
				if ((imap[n].br_startblock != HOLESTARTBLOCK) &&
				    (imap[n].br_startblock != DELAYSTARTBLOCK)) {
					goto write_map;
				}
				start_fsb += imap[n].br_blockcount;
				count_fsb -= imap[n].br_blockcount;
				ASSERT(count_fsb < 0xffff000);
			}
		}
		iosize = io->io_writeio_blocks;
		aligned_offset = XFS_WRITEIO_ALIGN(io, (offset + count - 1));
		ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
		last_fsb = ioalign + iosize;
		aeof = 1;
	}
 write_map:
	nimaps = XFS_WRITE_IMAPS;
	firstblock = NULLFSBLOCK;

	/*
	 * Round the allocation request up to an m_dalign boundary if the
	 * file size is greater than 512K and we are allocating past the
	 * allocation eof.
	 */
	if (mp->m_dalign && (XFS_SIZE(mp, io) >= 524288) && aeof) {
		int eof;
		xfs_fileoff_t new_last_fsb;
		new_last_fsb = roundup(last_fsb, mp->m_dalign);
		error = XFS_BMAP_EOF(mp, io, new_last_fsb, XFS_DATA_FORK, &eof);
		if (error) {
			return error;
		}
		if (eof)
			last_fsb = new_last_fsb;
	}
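	/*
	 * E.g. (stripe unit assumed): with m_dalign == 64 blocks and
	 * last_fsb == 150, new_last_fsb is rounded up to 192; it replaces
	 * last_fsb only when the XFS_BMAP_EOF check above says that block
	 * lies beyond the file's currently allocated extents.
	 */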

	error = XFS_BMAPI(mp, NULL, io, offset_fsb,
			  (xfs_filblks_t)(last_fsb - offset_fsb),
			  XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
			  XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
			  &nimaps, NULL);
	/* 
	 * This can be EDQUOT, if nimaps == 0
	 */
	if (error) {
		return error;
	}
	/*
	 * If bmapi returned us nothing, and if we didn't get back EDQUOT,
	 * then we must have run out of space.
	 */
	if (nimaps == 0) {
		xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE,
				      io, offset, count);
		return XFS_ERROR(ENOSPC);
	}

	if (!(ioflag & IO_SYNC) ||
	    ((last_fsb - offset_fsb) >= io->io_writeio_blocks)) {
		/*
		 * For normal or large sync writes, align everything
		 * into i_writeio_blocks sized chunks.
		 */
		iosize = io->io_writeio_blocks;
		aligned_offset = XFS_WRITEIO_ALIGN(io, offset);
		ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
		small_write = 0;
	} else {
		/*
		 * For small sync writes try to minimize the amount
		 * of I/O we do.  Round down and up to the larger of
		 * page or block boundaries.  Set the small_write
		 * variable to 1 to indicate to the code below that
		 * we are not using the normal buffer alignment scheme.
		 */
		if (NBPP > mp->m_sb.sb_blocksize) {
			aligned_offset = ctooff(offtoct(offset));
			ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
			last_page_offset = ctob64(btoc64(offset + count));
			iosize = XFS_B_TO_FSBT(mp, last_page_offset -
					       aligned_offset);
		} else {
			ioalign = offset_fsb;
			iosize = last_fsb - offset_fsb;
		}
		small_write = 1;
	}
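	/*
	 * Small-sync-write example (page and block sizes assumed): with 16k
	 * pages and 4k blocks, a sync write of 100 bytes at offset 5000
	 * rounds aligned_offset down to 0 and last_page_offset up to 16384,
	 * so iosize becomes 4 blocks - one page's worth of I/O.
	 */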

	/*
	 * Now map our desired I/O size and alignment over the
	 * extents returned by xfs_bmapi().
	 */
	xfs_write_bmap(mp, imap, bmapp, iosize, ioalign, isize);
	ASSERT((bmapp->length > 0) &&
	       (offset >= XFS_FSB_TO_B(mp, bmapp->offset)));

	/*
	 * A bmap is the EOF bmap when it reaches to or beyond the new
	 * inode size.
	 */
	bmap_end_fsb = bmapp->offset + bmapp->length;
	if (XFS_FSB_TO_B(mp, bmap_end_fsb) >= isize) {
		bmapp->eof |= BMAP_EOF;
	}
	bmapp->pboff = offset - XFS_FSB_TO_B(mp, bmapp->offset);
	writing_bytes = bmapp->bsize - bmapp->pboff;
	if (writing_bytes > count) {
		/*
		 * The mapping is for more bytes than we're actually
		 * going to write, so trim writing_bytes so we can
		 * get bmapp->pbsize right.
		 */
		writing_bytes = count;
	}
	bmapp->pbsize = writing_bytes;

	xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP,
			    io, offset, count, bmapp, imap);

	/*
	 * Map more buffers if the first does not map the entire
	 * request.  We do this until we run out of bmaps, imaps,
	 * or bytes to write.
	 */
	last_file_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)isize));
	filled_bmaps = 1;
	if ((*nbmaps > 1) &&
	    ((nimaps > 1) || (bmapp->offset + bmapp->length <
	     imap[0].br_startoff + imap[0].br_blockcount)) &&
	    (writing_bytes < count)) {
		curr_bmapp = &bmapp[0];
		next_bmapp = &bmapp[1];
		last_bmapp = &bmapp[*nbmaps - 1];
		curr_imapp = &imap[0];
		last_imapp = &imap[nimaps - 1];
		count_remaining = count - writing_bytes;

		/*
		 * curr_bmapp is always the last one we filled
		 * in, and next_bmapp is always the next one to
		 * be filled in.
		 */
		while (next_bmapp <= last_bmapp) {
			next_offset_fsb = curr_bmapp->offset +
					  curr_bmapp->length;
			if (next_offset_fsb >= last_file_fsb) {
				/*
				 * We've gone beyond the region asked for
				 * by the caller, so we're done.
				 */
				break;
			}
			if (small_write) {
				iosize -= curr_bmapp->length;
				ASSERT((iosize > 0) ||
				       (curr_imapp == last_imapp));
				/*
				 * We have nothing more to write, so
				 * we're done.
				 */
				if (iosize == 0) {
					break;
				}
			}
			if (next_offset_fsb <
			    (curr_imapp->br_startoff +
			     curr_imapp->br_blockcount)) {
				/*
				 * I'm still on the same extent, so
				 * the last bmap must have ended on
				 * a writeio_blocks boundary.  Thus,
				 * we just start where the last one
				 * left off.
				 */
				ASSERT((XFS_FSB_TO_B(mp, next_offset_fsb) &
					((1 << (int) io->io_writeio_log) - 1))
					==0);
				xfs_write_bmap(mp, curr_imapp, next_bmapp,
					       iosize, next_offset_fsb,
					       isize);
			} else {
				curr_imapp++;
				if (curr_imapp <= last_imapp) {
					/*
					 * We're moving on to the next
					 * extent.  Since we try to end
					 * all buffers on writeio_blocks
					 * boundaries, round next_offset
					 * down to a writeio_blocks boundary
					 * before calling xfs_write_bmap().
					 *
					 * For small, sync writes we don't
					 * bother with the alignment stuff.
					 *
					 * XXXajs
					 * Adding a macro to writeio align
					 * fsblocks would be good to reduce
					 * the bit shifting here.
					 */
					if (small_write) {
						ioalign = next_offset_fsb;
					} else {
						aligned_offset =
							XFS_FSB_TO_B(mp,
							    next_offset_fsb);
						aligned_offset =
							XFS_WRITEIO_ALIGN(io,
							    aligned_offset);
						ioalign = XFS_B_TO_FSBT(mp,
							    aligned_offset);
					}
					xfs_write_bmap(mp, curr_imapp,
						       next_bmapp, iosize,
						       ioalign, isize);
				} else {
					/*
					 * We're out of imaps.  The caller
					 * will have to call again to map
					 * the rest of the write request.
					 */
					break;
				}
			}
			/*
			 * The write must start at offset 0 in this bmap
			 * since we're just continuing from the last
			 * buffer.  Thus the request offset in the buffer
			 * indicated by pboff must be 0.
			 */
			next_bmapp->pboff = 0;

			/*
			 * The request size within this buffer is the
			 * entire buffer unless the count of bytes to
			 * write runs out.
			 */
			writing_bytes = next_bmapp->bsize;
			if (writing_bytes > count_remaining) {
				writing_bytes = count_remaining;
			}
			next_bmapp->pbsize = writing_bytes;
			count_remaining -= writing_bytes;
			ASSERT(((long)count_remaining) >= 0);

			filled_bmaps++;
			curr_bmapp++;
			next_bmapp++;
			/*
			 * A bmap is the EOF bmap when it reaches to
			 * or beyond the new inode size.
			 */
			bmap_end_fsb = curr_bmapp->offset +
				       curr_bmapp->length;
			if (((xfs_ufsize_t)XFS_FSB_TO_B(mp, bmap_end_fsb)) >=
			    (xfs_ufsize_t)isize) {
				curr_bmapp->eof |= BMAP_EOF;
			}
			xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, io, offset,
					    count, curr_bmapp, curr_imapp);
		}
	}
	*nbmaps = filled_bmaps;
	for (x = 0; x < filled_bmaps; x++) {
		curr_bmapp = &bmapp[x];
		if (io->io_flags & XFS_IOCORE_RT) {
			curr_bmapp->pbdev = mp->m_rtdev;
		} else {
			curr_bmapp->pbdev = mp->m_dev;
		}
		curr_bmapp->offset = XFS_FSB_TO_BB(mp, curr_bmapp->offset);
		curr_bmapp->length = XFS_FSB_TO_BB(mp, curr_bmapp->length);
		ASSERT((x == 0) ||
		       ((bmapp[x - 1].offset + bmapp[x - 1].length) ==
			curr_bmapp->offset));
		if (curr_bmapp->bn != -1) {
			curr_bmapp->bn = XFS_FSB_TO_DB_IO(io, curr_bmapp->bn);
		}
		curr_bmapp->pmp = pmp;
	}

	return 0;
}
#endif /* 1 */

#if !defined(__linux__)
int
xfs_write_file(
	bhv_desc_t	*bdp,
	xfs_iocore_t	*io,
	uio_t		*uiop,
	int		ioflag,
	cred_t		*credp,
	xfs_lsn_t	*commit_lsn_p,
	vnmap_t		*vnmaps,
	int		numvnmaps,
	const uint	vnmapflags,
	xfs_uaccmap_t	*uaccmaps)
{
	struct bmapval	bmaps[XFS_MAX_RW_NBMAPS];
	struct bmapval	*bmapp;
	int		nbmaps;
	vnode_t		*vp;
	xfs_buf_t		*bp;
	int		error;
	int		eof_zeroed;
	int		fillhole;
	int		gaps_mapped;
	off_t		offset;
	size_t		count;
	int		read;
	xfs_fsize_t	isize;
	xfs_fsize_t	new_size;
	xfs_mount_t	*mp;
	int		fsynced;
	extern void	chunkrelse(xfs_buf_t*);
	int		useracced = 0;
	vnmap_t		*cur_ldvnmap = vnmaps;
	int		num_ldvnmaps = numvnmaps;
	int		num_biovnmaps = numvnmaps;
	int		nuaccmaps;
	vnmap_t		*cur_biovnmap = vnmaps;

	vp = BHV_TO_VNODE(bdp);
	mp = io->io_mount;


	error = 0;
	eof_zeroed = 0;
	gaps_mapped = 0;
	XFSSTATS.xs_write_calls++;
	XFSSTATS.xs_write_bytes += uiop->uio_resid;

	/*
	 * i_new_size is used by xfs_iomap_read() when the chunk
	 * cache code calls back into the file system through
	 * xfs_bmap().  This way we can tell where the end of
	 * file is going to be even though we haven't yet updated
	 * ip->i_d.di_size.  This is guarded by the iolock and the
	 * inode lock.  Either is sufficient for reading the value.
	 */
	new_size = uiop->uio_offset + uiop->uio_resid;

	/*
	 * i_write_offset is used by xfs_strat_read() when the chunk
	 * cache code calls back into the file system through
	 * xfs_strategy() to initialize a buffer.  We use it there
	 * to know how much of the buffer needs to be zeroed and how
	 * much will be initialized here by the write or will not need to
	 * be initialized because it will be beyond the inode size.
	 * This is protected by the io lock.
	 */
	io->io_write_offset = uiop->uio_offset;

	/*
	 * Loop until uiop->uio_resid, which is the number of bytes the
	 * caller has requested to write, goes to 0 or we get an error.
	 * Each call to xfs_iomap_write() tries to map as much of the
	 * request as it can in ip->i_writeio_blocks sized chunks.
	 */
	if (!((ioflag & (IO_NFS3|IO_NFS)) &&
	    uiop->uio_offset > XFS_SIZE(mp, io) &&
	    uiop->uio_offset - XFS_SIZE(mp, io) <= (xfs_nfs_io_units *
				 (1 << (int) MAX(io->io_writeio_log,
				   uiop->uio_writeiolog))))) {
		fillhole = 0;
		offset = uiop->uio_offset;
		count = uiop->uio_resid;
	} else  {
		/*
		 * Cope with NFS out-of-order writes.  If we're
		 * extending eof to a point within the indicated
		 * window, fill any holes between old and new eof.
		 * Set up offset/count so we deal with all the bytes
		 * between current eof and end of the new write.
		 */
		fillhole = 1;
		offset = XFS_SIZE(mp, io);
		count = uiop->uio_offset + uiop->uio_resid - offset;
	}
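	/*
	 * Example (sizes assumed): if EOF is at 65536 and an NFS write of
	 * 8192 bytes arrives at offset 131072, and that gap falls within
	 * the xfs_nfs_io_units window tested above, then fillhole is set
	 * and offset/count become 65536/73728 so the loop below covers
	 * both the hole and the newly written bytes.
	 */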
	fsynced = 0;

	do {
		XFS_ILOCK(mp, io, XFS_ILOCK_EXCL | XFS_EXTSIZE_WR);
		isize = XFS_SIZE(mp, io);
		if (new_size > isize) {
			io->io_new_size = new_size;
		}

		xfs_rw_enter_trace(XFS_WRITE_ENTER, io, uiop, ioflag);

		/*
		 * If this is the first pass through the loop, then map
		 * out all of the holes we might fill in with this write
		 * and list them in the inode's gap list.  This is for
		 * use by xfs_strat_read() in determining if the real
		 * blocks underlying a delalloc buffer have been initialized
		 * or not.  Since writes are single threaded, if the blocks
		 * were holes when we started and xfs_strat_read() is asked
		 * to read one in while we're still here in xfs_write_file(),
		 * then the block is not initialized.  Only we can
		 * initialize it and once we write out a buffer we remove
		 * any entries in the gap list which overlap that buffer.
		 */
		if (!gaps_mapped) {
			error = xfs_build_gap_list(io, offset, count);
			if (error) {
				goto error0;
			}
			gaps_mapped = 1;
		}

		/*
		 * If we've seeked past the EOF to do this write,
		 * then we need to make sure that any buffer overlapping
		 * the EOF is zeroed beyond the EOF.
		 */
		if (!eof_zeroed && uiop->uio_offset > isize && isize != 0) {
			error = xfs_zero_eof(vp, io, uiop->uio_offset, isize,
						credp, uiop->uio_pmp);
			if (error) {
				goto error0;
			}
			eof_zeroed = 1;
		}

		nbmaps = sizeof(bmaps) / sizeof(bmaps[0]);
		error = xfs_iomap_write(io, offset, count, bmaps,
					&nbmaps, ioflag, uiop->uio_pmp);
		XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL | XFS_EXTSIZE_WR);

		/*
	 	 * Clear out any read-ahead info since the write may
	 	 * have made it invalid.
	 	 */
		if (!error)
			XFS_INODE_CLEAR_READ_AHEAD(io);

		if (error == ENOSPC) {
			switch (fsynced) {
			case 0:
				VOP_FLUSH_PAGES(vp, 0,
					(off_t)XFS_LASTBYTE(mp, io) - 1, 0,
					FI_NONE, error);
				error = 0;
				fsynced = 1;
				continue;
			case 1:
				fsynced = 2;
				if (!(ioflag & IO_SYNC)) {
					ioflag |= IO_SYNC;
					error = 0;
					continue;
				}
				/* FALLTHROUGH */
			case 2:
			case 3:
				VFS_SYNC(vp->v_vfsp,
					SYNC_NOWAIT|SYNC_BDFLUSH|SYNC_FSDATA,
					get_current_cred(), error);
				error = 0;
				delay(HZ);
				fsynced++;
				continue;
			}
		}
		if (error || (bmaps[0].pbsize == 0)) {
			break;
		}

		fsynced = 0;
		bmapp = &bmaps[0];
		/*
		 * Each pass through the loop writes another buffer
		 * to the file.  For big requests, iomap_write will
		 * have given up multiple bmaps to use so we make fewer
		 * calls to it on big requests than if it only gave
		 * us one at a time.
		 *
		 * Error handling is a bit tricky because of delayed
		 * allocation.  We need to make sure that we create
		 * dirty buffers over all the delayed allocation
		 * extents created in xfs_iomap_write().  Thus, when
		 * we get an error we continue to process each of
		 * the bmaps returned by xfs_iomap_write().  Each is
		 * read in so that it is fully initialized and then
		 * written out without actually copying in the user's
		 * data.
		 */
		while (((uiop->uio_resid != 0) || (error != 0)) &&
		       (nbmaps > 0)) {
			/*
			 * do we have overlapping pages we need to
			 * useracc?  we don't have to worry about the autogrow
			 * case here since that was dealt with earlier
			 */
			if (cur_ldvnmap && vnmapflags & AS_VNMAP_OVERLAP) {
				nuaccmaps = numvnmaps;
				/*
				 * tell xfs_lockdown_iopages to skip
				 * the autogrow checking since any
				 * necessary file growing was handled
				 * at the beginning of xfs_write()
				 */
				if (error = xfs_lockdown_iopages(bmapp, -1,
							vnmapflags,
							&cur_ldvnmap,
							&num_ldvnmaps,
							uaccmaps, &nuaccmaps,
							&useracced)) {
					if (useracced)
						xfs_unlock_iopages(uaccmaps,
								    nuaccmaps);
					error = XFS_ERROR(ENOMEM);
					useracced = 0;
				}
			}

			/*
			 * If the write doesn't completely overwrite
			 * the buffer and we're not writing from
			 * the beginning of the buffer to the end
			 * of the file then we need to read the
			 * buffer.  We also always want to read the
			 * buffer if we've encountered an error and
			 * we're just cleaning up.
			 *
			 * Reading the buffer will send it to xfs_strategy
			 * which will take care of zeroing the holey
			 * parts of the buffer and coordinating with
			 * other, simultaneous writers.
			 */
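			/*
			 * E.g. a write covering only bytes 4096-8191 of a
			 * 16k buffer lying entirely below EOF satisfies
			 * neither test, so it takes the chunkread() path; a
			 * write covering the whole buffer, or one starting
			 * at the buffer's beginning at or beyond the old
			 * EOF, can use getchunk().  (Sizes assumed for
			 * illustration.)
			 */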
			if ((error != 0) ||
			    ((bmapp->pbsize != bmapp->bsize) &&
			    !((bmapp->pboff == 0) &&
			      (uiop->uio_offset >= isize)))) {
				bp = chunkread(vp, bmapp, 1, credp);
				read = 1;
			} else {
				bp = getchunk(vp, bmapp, credp);
				read = 0;
			}

			/*
			 * There is not much we can do with buffer errors.
			 * The assumption here is that the space underlying
			 * the buffer must now be allocated (even if it
			 * wasn't when we mapped the buffer) and we got an
			 * error reading from it.  In this case the blocks
			 * will remain unreadable, so we just toss the buffer
			 * and its associated pages.
			 */
			if (XFS_BUF_ISERROR(bp)) {
				error = XFS_BUF_GETERROR(bp);
				ASSERT(error != EINVAL);
				XFS_BUF_DONE(bp);
				XFS_BUF_STALE(bp);
				XFS_BUF_UNDELAYWRITE(bp);
				xfs_buf_relse(bp);
				bmapp++;
				nbmaps--;
				continue;
			}

			/*
			 * If we've already encountered an error, then
			 * write the buffers out without copying the user's
			 * data into them.  This way we get dirty buffers
			 * over our delayed allocation extents which
			 * have been initialized by xfs_strategy() since
			 * we forced the chunkread() above.
			 * We write the data out synchronously here so that
			 * we don't have to worry about having buffers
			 * possibly out beyond the EOF when we later flush
			 * or truncate the file.  We set the B_STALE bit so
			 * that the buffer will be decommissioned after it
			 * is synced out.
			 */
			if (error != 0) {
				XFS_BUF_STALE(bp);
				(void) bwrite(bp);
				bmapp++;
				nbmaps--;
				continue;
			}

			ASSERT(fillhole || fillhole == 0 &&
					BBTOOFF(bmapp->offset) + bmapp->pboff
						== uiop->uio_offset);

			/*
			 * zero the bytes up to the data being written
			 * but don't overwrite data we read in.  This
			 * zero-fills the buffers we set up for the NFS
			 * case to fill the holes between EOF and the new
			 * write.
			 */
			if (!read && BBTOOFF(bmapp->offset) + bmapp->pboff
					<  uiop->uio_offset)  {
				xfs_zero_bp(bp, bmapp->pboff,
					MIN(bmapp->pbsize, uiop->uio_offset -
					    (BBTOOFF(bmapp->offset) +
					     bmapp->pboff)));
			}
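			/*
			 * For instance (offsets assumed): if this buffer
			 * maps file bytes 65536-81919 and the user's data
			 * starts at uio_offset 73728, the call above zeroes
			 * the first 8192 bytes of the buffer so the hole
			 * between the old EOF and the written data reads
			 * back as zeroes.
			 */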

			/*
			 * biomove the data in the region to be written.
			 * In the NFS hole-filling case, don't fill
			 * anything until we hit the first buffer with
			 * data that we have to write.
			 */
			if (!fillhole) {
				if (!cur_biovnmap) {
					error = biomove(bp, bmapp->pboff,
							bmapp->pbsize,
							UIO_WRITE,
							uiop);
				} else {
#pragma mips_frequency_hint NEVER
					/*
					 * break up the biomoves so that
					 * we never biomove across a region
					 * that might fault on more than
					 * one inode
					 */
					error = xfs_mapped_biomove(bp,
							bmapp->pboff,
							bmapp->pbsize,
							UIO_WRITE, uiop,
							&cur_biovnmap,
							&num_biovnmaps);
				}
			} else if (BBTOOFF(bmapp->offset) + bmapp->pboff +
					bmapp->pbsize >= uiop->uio_offset)  {
				/*
				 * NFS - first buffer to be written.  biomove
				 * into the portion of the buffer that the
				 * user originally asked to write to.
				 */
				ASSERT(BBTOOFF(bmapp->offset) + bmapp->pboff
							<= uiop->uio_offset);
				if (!cur_biovnmap) {
					error = biomove(bp,
						  uiop->uio_offset -
						   BBTOOFF(bmapp->offset),
						  bmapp->pbsize -
						    (int)(uiop->uio_offset -
						      (BBTOOFF(bmapp->offset) +
						       bmapp->pboff)),
						  UIO_WRITE,
						  uiop);
				} else {
#pragma mips_frequency_hint NEVER
					/*
					 * break up the biomoves so that
					 * we never biomove across a region
					 * that might fault on more than
					 * one inode
					 */
					error = xfs_mapped_biomove(bp,
						  uiop->uio_offset -
						   BBTOOFF(bmapp->offset),
						  bmapp->pbsize -
						    (int)(uiop->uio_offset -
						      (BBTOOFF(bmapp->offset) +
						       bmapp->pboff)),
						  UIO_WRITE,
						  uiop,
						  &cur_biovnmap,
						  &num_biovnmaps);
				}

				/*
				 * turn off hole-filling code.  The rest
				 * of the buffers can be handled as per
				 * the usual write path.
				 */
				fillhole = 0;
			}

			/*
			 * reset offset/count to reflect the biomove
			 */
			if (!fillhole)  {
				offset = uiop->uio_offset;
				count = uiop->uio_resid;
			} else  {
				/*
				 * NFS - set offset to the beginning
				 * of the next area in the file to be
				 * copied or zero-filled.  Drop count
				 * by the amount we just zero-filled.
				 */
				offset = BBTOOFF(bmapp->offset) +
					 bmapp->pboff + bmapp->pbsize;
				count -= bmapp->pbsize;
			}

			/*
			 * Make sure that any gap list entries overlapping
			 * the buffer being written are removed now that
			 * we know that the blocks underlying the buffer
			 * will be initialized.  We don't need the inode
			 * lock to manipulate the gap list here, because
			 * we have the io lock held exclusively so no one
			 * else can get to xfs_strat_read() where we look
			 * at the list.
			 */
			xfs_delete_gap_list(io,
					    XFS_BB_TO_FSBT(mp, bp->b_offset),
					    XFS_B_TO_FSBT(mp, bp->b_bcount));
			ASSERT(bp->b_vp);

			if (error)  {
				/*
				 * If the buffer is already done then just
				 * mark it dirty without copying any more
				 * data into it.  It is already fully
				 * initialized.
				 * Otherwise, we must have getchunk()'d
				 * the buffer above.  Use chunkreread()
				 * to get it initialized by xfs_strategy()
				 * and then write it out.
				 * We write the data out synchronously here
				 * so that we don't have to worry about
				 * having buffers possibly out beyond the
				 * EOF when we later flush or truncate
				 * the file.  We set the B_STALE bit so
				 * that the buffer will be decommissioned
				 * after it is synced out.
				 */

				if (!XFS_BUF_ISDONE(bp)) {
					chunkreread(bp);
				}

				XFS_BUF_STALE(bp);
				(void) bwrite(bp);
			} else {

				if ((ioflag & IO_SYNC) || (ioflag & IO_DSYNC)) {
					if ((bmapp->pboff + bmapp->pbsize) ==
					    bmapp->bsize) {
						bp->b_relse = chunkrelse;
					}

					/* save the commit lsn for dsync
					 * writes so xfs_write can force the
					 * log up to the appropriate point.
					 * don't bother doing this for sync
					 * writes since xfs_write will have to
					 * kick off a sync xaction to log the
					 * timestamp updates anyway.
					 */

					if (ioflag & IO_DSYNC) {
						bp->b_fsprivate3 = commit_lsn_p;
						XFS_BUF_HOLD(bp);
					}
					error = bwrite(bp);
					if (ioflag & IO_DSYNC) {
						bp->b_fsprivate3 = NULL;
						XFS_BUF_UNHOLD(bp);
						xfs_buf_relse(bp);
					}
				} else {
					bdwrite(bp);
				}
			}

			if (useracced) {
				xfs_unlock_iopages(uaccmaps, nuaccmaps);
				useracced = 0;
			}

			/*
			 * If we've grown the file, get back the
			 * inode lock and move di_size up to the
			 * new size.  It may be that someone else
			 * made it even bigger, so be careful not
			 * to shrink it.
			 *
			 * No one could have shrunk the file, because
			 * we are holding the iolock exclusive.
			 *
			 * Have to update di_size after brelsing the buffer
			 * because if we are running low on buffers and 
			 * xfsd is trying to push out a delalloc buffer for
			 * our inode, then it grabs the ilock in exclusive 
			 * mode to do an allocation, and calls get_buf to
			 * read in a metabuffer (agf, agfl). If the
			 * metabuffer is in the buffer cache, but it gets 
			 * reused before we can grab the cpsema(), then
			 * we will sleep in get_buf waiting for it to be
			 * released whilst holding the ilock.
			 * If it so happens that the buffer was reused by
			 * the above code path, then we end up holding this 
			 * buffer locked whilst we try to get the ilock so we 
			 * end up deadlocking. (bug 504578).
			 * For the IO_SYNC writes, the di_size now gets logged
			 * and synced to disk in the transaction in xfs_write().
			 */  

			if (offset > isize) {
				isize = XFS_SETSIZE(mp, io, offset);
			}

			XFSSTATS.xs_write_bufs++;
			bmapp++;
			nbmaps--;
		}
	} while ((uiop->uio_resid > 0) && !error);

	/*
	 * Free up any remaining entries in the gap list, because the 
	 * list only applies to this write call.  Also clear the new_size
	 * field of the inode while we've got it locked.
	 */
	XFS_ILOCK(mp, io, XFS_ILOCK_EXCL);
error0:
	xfs_free_gap_list(io);
	io->io_new_size = 0;
	XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL);
	io->io_write_offset = 0;

	return error;
}
#endif /* !defined(__linux__) */
/*
 * This is a subroutine used by xfs_write() and other writers (xfs_fcntl)
 * to clear the setuid and setgid bits when a file is written.
 */
#if 1
int
xfs_write_clear_setuid(
	xfs_inode_t	*ip)
{
	xfs_mount_t	*mp;
	xfs_trans_t	*tp;
	int		error;

	mp = ip->i_mount;
	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
	if (error = xfs_trans_reserve(tp, 0,
				      XFS_WRITEID_LOG_RES(mp),
				      0, 0, 0)) {
		xfs_trans_cancel(tp, 0);
		return error;
	}
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_ihold(tp, ip);
	ip->i_d.di_mode &= ~ISUID;

	/*
	 * Note that we don't have to worry about mandatory
	 * file locking being disabled here because we only
	 * clear the ISGID bit if the Group execute bit is
	 * on, but if it was on then mandatory locking wouldn't
	 * have been enabled.
	 */
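	/*
	 * E.g. a mode of 02775 has the group-execute bit set, so ISGID is
	 * cleared below; a mode of 02664 (setgid without group execute,
	 * the mandatory-locking convention) keeps its ISGID bit.
	 */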
	if (ip->i_d.di_mode & (IEXEC >> 3)) {
		ip->i_d.di_mode &= ~ISGID;
	}
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	xfs_trans_set_sync(tp);
	error = xfs_trans_commit(tp, 0, NULL);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return 0;
}
#endif /* 1 */
/*
 * xfs_write
 *
 * This is the XFS VOP_WRITE entry point.  It does some minimal error
 * checking and then switches out based on the file type.
 */
#if !defined(__linux__) 
int
xfs_write(
	bhv_desc_t	*bdp,
	uio_t		*uiop,
	int		ioflag,
	cred_t		*credp,
	flid_t		*fl)
{
	xfs_inode_t	*ip;
	xfs_iocore_t	*io;
	xfs_mount_t	*mp;
	xfs_trans_t	*tp;
	int		type;
	off_t		offset;
	size_t		count;
	int		error, transerror;
	int		lflag;
	off_t		n;
	int		resid;
	off_t		savedsize;
	xfs_fsize_t	limit;
	xfs_fsize_t	map_maxoffset;
	int		eventsent;
	vnode_t 	*vp;
	xfs_lsn_t	commit_lsn;
	vnmap_t		vnmaps[XFS_NUMVNMAPS];
	vnmap_t		*rvnmaps;
	vnmap_t		*map;
	int		num_rvnmaps;
	int		rvnmap_flags;
	int		rvnmap_size = 0;
	int		i;
	xfs_uaccmap_t		uaccmap_array[XFS_NUMVNMAPS];
	xfs_uaccmap_t		*uaccmaps;

#if defined(DEBUG) && defined(UIOSZ_DEBUG)
	/*
	 * Randomly set io size
	 */
	extern ulong_t	random(void);
	extern int	srandom(int);
	timespec_t	now;		/* current time */
	static int	seed = 0;	/* randomizing seed value */

	if (!seed) {
		nanotime(&now);
		seed = (int)now.tv_sec ^ (int)now.tv_nsec;
		srandom(seed);
	}
	ioflag |= IO_UIOSZ;
	uiop->uio_writeiolog = (random() & 0x3) + XFS_UIO_MIN_WRITEIO_LOG;
#endif

	vp = BHV_TO_VNODE(bdp);
	ip = XFS_BHVTOI(bdp);
	io = &ip->i_iocore;
	mp = ip->i_mount;

	eventsent = 0;
	commit_lsn = -1;
	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return (EIO);

	/*
	 * need to protect against deadlocks that can occur if the
	 * biomove touches a virtual address in user space that is
	 * mapped to the file being read.  This only works for
	 * read/write and pread/pwrite.  readv/writev lose.
	 * direct i/o loses too for now.
	 *
	 * note that if someone remaps the user buffer to this file
	 * while the I/O is in progress, we lose, too.  instant deadlock.
	 */
	rvnmaps = NULL;
	num_rvnmaps = 0;
	rvnmap_flags = 0;
	uaccmaps = NULL;

	if (uiop->uio_segflg == UIO_USERSPACE && uiop->uio_iovcnt == 1 &&
	    !(ioflag & IO_DIRECT) && VN_MAPPED(vp)) {
#pragma mips_frequency_hint NEVER
		rvnmaps = vnmaps;
		uaccmaps = uaccmap_array;
		num_rvnmaps = XFS_NUMVNMAPS;

		if (error = xfs_check_mapped_io(vp, uiop, &rvnmaps,
				&num_rvnmaps, &rvnmap_size, &rvnmap_flags,
				&map_maxoffset, &uaccmaps)) {
			return XFS_ERROR(error);
		}
	}

	/*
	 * check if we're in recursive lock mode (a read inside a biomove
	 * to a page that is mapped to ip and has faulted)
	 */
	lflag = xfs_is_nested_locking_enabled()
		 ? XFS_IOLOCK_NESTED
		 : 0;

	if (!(ioflag & IO_ISLOCKED))
		xfs_rwlockf(bdp, (ioflag & IO_DIRECT) ?
			   VRWLOCK_WRITE_DIRECT : VRWLOCK_WRITE,
			   lflag);

	/*
	 * if the write will grow the file beyond the current
	 * eof, grow the file now by faulting in the page with
	 * the largest file offset.  That will zero-fill all
	 * pages in the buffer containing eof and will enable
	 * faults between the current eof and the new eof to
	 * read in zero'ed pages.
	 */
	if (rvnmap_flags & AS_VNMAP_AUTOGROW) {
#pragma mips_frequency_hint NEVER
		xfs_ilock(ip, XFS_ILOCK_SHARED);
		if (map_maxoffset <= ip->i_d.di_size) {
			xfs_iunlock(ip, XFS_ILOCK_SHARED);
		} else {
			xfs_iunlock(ip, XFS_ILOCK_SHARED);
			map = rvnmaps;
			for (i = 0; i < num_rvnmaps; i++) {
				if (!(map->vnmap_flags & AS_VNMAP_AUTOGROW))
					continue;
				xfs_enable_nested_locking();
				error = fubyte((char *) map->vnmap_ovvaddr +
						map->vnmap_ovlen - 1);
				xfs_disable_nested_locking();
				if (error == -1) {
					error = XFS_ERROR(EFAULT);
					goto out;
				}
			}
		}
	}

	type = ip->i_d.di_mode & IFMT;
	ASSERT(!(vp->v_vfsp->vfs_flag & VFS_RDONLY));
	ASSERT(type == IFDIR ||
	       ismrlocked(&ip->i_iolock, MR_UPDATE) ||
	       (ismrlocked(&ip->i_iolock, MR_ACCESS) &&
		(ioflag & IO_DIRECT)));

	ASSERT(type == IFREG || type == IFDIR ||
	       type == IFLNK || type == IFSOCK);

start:
	if (ioflag & IO_APPEND) {
		/*
		 * In append mode, start at the end of the file.
		 * Since I've got the iolock exclusive I can look
		 * at di_size.
		 */
		uiop->uio_offset = savedsize = ip->i_d.di_size;
	}

	offset = uiop->uio_offset;
	count = uiop->uio_resid;

	/* check for locks if some exist and mandatory locking is enabled */
	if ((vp->v_flag & (VENF_LOCKING|VFRLOCKS)) == 
	    (VENF_LOCKING|VFRLOCKS)) {
		error = XFS_CHECKLOCK(mp, bdp, vp, FWRITE, offset, count, 
				     uiop->uio_fmode, credp, fl, 
				     VRWLOCK_WRITE, ioflag);
		if (error)
			goto out;
	}

	if (offset < 0) {
		error = XFS_ERROR(EINVAL);
		goto out;
	}
	if ((ssize_t)count <= 0) {
		error = (ssize_t)count < 0 ? XFS_ERROR(EINVAL) : 0;
		goto out;
	}

	switch (type) {
	case IFREG:
		limit = ((uiop->uio_limit < XFS_MAX_FILE_OFFSET) ?
			 uiop->uio_limit : XFS_MAX_FILE_OFFSET);
		n = limit - uiop->uio_offset;
		if (n <= 0) {
			error = XFS_ERROR(EFBIG);
			goto out;
		}
		if (n < uiop->uio_resid) {
			resid = uiop->uio_resid - n;
			uiop->uio_resid = n;
		} else {
			resid = 0;
		}

		if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_WRITE) &&
		    !(ioflag & IO_INVIS) && !eventsent) {
			vrwlock_t	locktype;

			locktype = (ioflag & IO_DIRECT) ?
				VRWLOCK_WRITE_DIRECT:VRWLOCK_WRITE;

			error = xfs_dm_send_data_event(DM_EVENT_WRITE, bdp,
					offset, count,
					UIO_DELAY_FLAG(uiop), &locktype);
			if (error)
				goto out;
			eventsent = 1;
		}
		/*
		 *  The iolock was dropped and reacquired in
		 *  xfs_dm_send_data_event so we have to recheck the size
		 *  when appending.  We will only "goto start;" once,
		 *  since having sent the event prevents another call
		 *  to xfs_dm_send_data_event, which is what
		 *  allows the size to change in the first place.
		 */
		if ((ioflag & IO_APPEND) && savedsize != ip->i_d.di_size)
			goto start;
		/*
		 * implement osync == dsync option
		 */
		if (ioflag & IO_SYNC && mp->m_flags & XFS_MOUNT_OSYNCISDSYNC) {
			ioflag &= ~IO_SYNC;
			ioflag |= IO_DSYNC;
		}

		/*
		 * If we're writing the file then make sure to clear the
		 * setuid and setgid bits if the process is not being run
		 * by root.  This keeps people from modifying setuid and
		 * setgid binaries.  Don't allow this to happen if this
		 * file is a swap file (I know, weird).
		 */
		if (((ip->i_d.di_mode & ISUID) ||
		    ((ip->i_d.di_mode & (ISGID | (IEXEC >> 3))) ==
			(ISGID | (IEXEC >> 3)))) &&
		    !(vp->v_flag & VISSWAP) &&
		    !cap_able_cred(credp, CAP_FSETID)) {
			error = xfs_write_clear_setuid(ip);
			if (error) {
				goto out;
			}
		}

		/*
		 * Respect preferred write size if indicated in uio structure.
		 * But if the write size has already been set, go with the
		 * smallest value.  Silently ignore requests that aren't
		 * within valid I/O size limits.
		 */
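		/*
		 * For example, with 4k blocks (sb_blocklog == 12), a
		 * uio_writeiolog of 16 would set io_writeio_blocks to
		 * 1 << (16 - 12) == 16 blocks, i.e. 64k preferred writes.
		 */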
		if ((ioflag & IO_UIOSZ) &&
		    uiop->uio_writeiolog != io->io_writeio_log &&
		    uiop->uio_writeiolog >= mp->m_sb.sb_blocklog &&
		    uiop->uio_writeiolog >= XFS_UIO_MIN_WRITEIO_LOG &&
		    uiop->uio_writeiolog <= XFS_UIO_MAX_WRITEIO_LOG) {
			xfs_ilock(ip, XFS_ILOCK_EXCL);
#if !(defined(DEBUG) && defined(UIOSZ_DEBUG))
			if (!(io->io_flags & XFS_IOCORE_UIOSZ) ||
			    uiop->uio_writeiolog < io->io_writeio_log) {
#endif /* ! (DEBUG && UIOSZ_DEBUG) */
				io->io_writeio_log = uiop->uio_writeiolog;
				io->io_writeio_blocks = 1 <<
					(int) (io->io_writeio_log -
						mp->m_sb.sb_blocklog);
				/*
				 * set inode max io size to largest value
				 * that the inode could ever have had
				 */
				if (!(io->io_flags & XFS_IOCORE_UIOSZ)) {
					io->io_max_io_log = MAX(io->io_max_io_log,
							MAX(mp->m_writeio_log,
							    io->io_writeio_log));
					io->io_flags |= XFS_IOCORE_UIOSZ;
				}
#if defined(DEBUG) && defined(UIOSZ_DEBUG)
				atomicAddInt(&uiodbg_switch, 1);
				atomicAddInt(
					&(uiodbg_writeiolog[io->io_writeio_log -
						XFS_UIO_MIN_WRITEIO_LOG]),
					1);
#endif
#if !(defined(DEBUG) && defined(UIOSZ_DEBUG))
			}
#endif /* ! (DEBUG && UIOSZ_DEBUG) */
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
		}

retry:
		if (ioflag & IO_DIRECT) {
			error = xfs_diordwr(bdp, io, uiop, ioflag, credp,
						B_WRITE, NULL, NULL);
		} else {
			if (XFS_FORCED_SHUTDOWN(mp))
				error = EIO;
			else if ((ip->i_d.di_extsize) ||
				 (ip->i_iocore.io_flags & XFS_IOCORE_RT))
				error = EINVAL;
			else {
				error = 0;
				/*
				 * Make sure that the dquots are there
				 */
				if (XFS_IS_QUOTA_ON(mp)) {
					if (XFS_NOT_DQATTACHED(mp, ip)) {
						error = xfs_qm_dqattach(ip, 0);
					}
				}

				if (!error) {
					error = xfs_write_file(bdp,
						&ip->i_iocore,
						uiop, ioflag, credp,
						&commit_lsn,
						rvnmaps, num_rvnmaps,
						rvnmap_flags, uaccmaps);
				}
			}
		}

		if (error == ENOSPC &&
		    DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_NOSPACE) &&
		    !(ioflag & IO_INVIS)) {
			vrwlock_t	locktype;

			locktype = (ioflag & IO_DIRECT) ?
				VRWLOCK_WRITE_DIRECT:VRWLOCK_WRITE;

			VOP_RWUNLOCK(vp, locktype);
			error = dm_send_namesp_event(DM_EVENT_NOSPACE, bdp,
					DM_RIGHT_NULL, bdp, DM_RIGHT_NULL, NULL, NULL,
					0, 0, 0); /* Delay flag intentionally unused */
			VOP_RWLOCK(vp, locktype);
			if (error)
				goto out;

			offset = uiop->uio_offset;
			goto retry;
		} else if (error == ENOSPC) {
			if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)  {
				xfs_error(mp, 2);
			} else {
				xfs_error(mp, 1);
			}
		}

		/*
		 * Add back whatever we refused to do because of
		 * uio_limit.
		 */
		uiop->uio_resid += resid;

		/*
		 * We've done at least a partial write, so don't
		 * return an error on this call.  Also update the
		 * timestamps since we changed the file.
		 */
		if (count != uiop->uio_resid) {
			error = 0;
			/* don't update timestamps if doing invisible I/O */
			if (!(ioflag & IO_INVIS))
				xfs_ichgtime(ip,
					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
		}

		/*
		 * If the write was synchronous then we need to make
		 * sure that the inode modification time is permanent.
		 * We'll have updated the timestamp above, so here
		 * we use a synchronous transaction to log the inode.
		 * It's not fast, but it's necessary.
		 *
		 * If this is a dsync write and the size got changed
		 * non-transactionally, then we need to ensure that
		 * the size change gets logged in a synchronous
		 * transaction.  If an allocation transaction occurred
		 * without extending the size, then we have to force
		 * the log up to the proper point to ensure that the
		 * allocation is permanent.  We can't count on
		 * the fact that buffered writes lock out direct I/O
		 * writes because the direct I/O write could have extended
		 * the size non-transactionally and then finished just before
		 * we started.  xfs_write_file will think that the file
		 * didn't grow but the update isn't safe unless the
		 * size change is logged.
		 *
		 * If the vnode is a swap vnode, then don't do anything
		 * which could require allocating memory.
		 */
		if ((ioflag & IO_SYNC ||
		     (ioflag & IO_DSYNC && ip->i_update_size)) &&
		    !(vp->v_flag & VISSWAP)) {
			tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
			if (transerror = xfs_trans_reserve(tp, 0,
						      XFS_SWRITE_LOG_RES(mp),
						      0, 0, 0)) {
				xfs_trans_cancel(tp, 0);
				error = transerror;
				break;
			}
			xfs_ilock(ip, XFS_ILOCK_EXCL);
			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
			xfs_trans_ihold(tp, ip);
			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
			xfs_trans_set_sync(tp);
			transerror = xfs_trans_commit(tp, 0, &commit_lsn);
			if ( transerror )
				error = transerror;
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
		} else if ((ioflag & IO_DSYNC) && !(vp->v_flag & VISSWAP)) {
			/*
			 * force the log if we've committed a transaction
			 * against the inode or if someone else has and
			 * the commit record hasn't gone to disk (e.g.
			 * the inode is pinned).  This guarantees that
			 * all changes affecting the inode are permanent
			 * when we return.
			 */
			if (commit_lsn != -1)
				xfs_log_force(mp, (xfs_lsn_t)commit_lsn,
					      XFS_LOG_FORCE | XFS_LOG_SYNC );
			else if (xfs_ipincount(ip) > 0)
				xfs_log_force(mp, (xfs_lsn_t)0,
					      XFS_LOG_FORCE | XFS_LOG_SYNC );
		}
		if (ioflag & (IO_NFS|IO_NFS3)) {
			xfs_refcache_insert(ip);
		}
		break;

	case IFDIR:
		error = XFS_ERROR(EISDIR);
		break;

	case IFLNK:
		error = XFS_ERROR(EINVAL);
		break;

	case IFSOCK:
		error = XFS_ERROR(ENODEV);
		break;

	default:
		ASSERT(0);
		error = XFS_ERROR(EINVAL);
		break;
	}

out:
	if (rvnmap_size > 0)
		kmem_free(rvnmaps, rvnmap_size);

	if (num_rvnmaps > XFS_NUMVNMAPS)
		kmem_free(uaccmaps, num_rvnmaps * sizeof(xfs_uaccmap_t));

	if (!(ioflag & IO_ISLOCKED))
		xfs_rwunlockf(bdp, (ioflag & IO_DIRECT) ?
			     VRWLOCK_WRITE_DIRECT : VRWLOCK_WRITE,
			     lflag);

	return error;
}
#endif /* !defined(__linux__) */

/*
 * This is the XFS entry point for VOP_BMAP().
 * It simply switches based on the given flags
 * to either xfs_iomap_read() or xfs_iomap_write().
 * This cannot be used to grow a file or to read
 * beyond the end of the file.
 *
 * In the case of xfs_bmap, the caller is required to hold
 * the inode's iolock in at least shared mode for a read mapping
 * and exclusively for a write mapping.
 *
 * xfs_bmap2 allows the user to request that iolock be obtained
 * before doing the requested action and released afterwards.
 */
/* ARGSUSED */
#if 0
int
xfs_bmap(
	bhv_desc_t	*bdp,
	off_t		offset,
	ssize_t		count,
	int		flags,
	cred_t		*credp,
	struct bmapval	*bmapp,
	int		*nbmaps)
{
	xfs_inode_t	*ip;
	int		error;
	int		unlocked;
	int		lockmode;

	ip = XFS_BHVTOI(bdp);
	ASSERT((ip->i_d.di_mode & IFMT) == IFREG);
	ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
	       ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0));
	ASSERT((flags == B_READ) || (flags == B_WRITE));
	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		return (EIO);
	}

	if (flags == XFS_B_READ) {
		ASSERT(ismrlocked(&ip->i_iolock, MR_ACCESS | MR_UPDATE) != 0);
		unlocked = 0;
		lockmode = xfs_ilock_map_shared(ip);
		error = xfs_iomap_read(&ip->i_iocore, offset, count, bmapp,
				 nbmaps, NULL, &unlocked, lockmode);
		if (!unlocked)
			xfs_iunlock_map_shared(ip, lockmode);
	} else {
		ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		ASSERT(ip->i_d.di_size >= (offset + count));

		/* 
		 * Make sure that the dquots are there. This doesn't hold 
		 * the ilock across a disk read.
		 */
		if (XFS_IS_QUOTA_ON(ip->i_mount)) {
			if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) {
				if (error = xfs_qm_dqattach(ip,
						    XFS_QMOPT_ILOCKED)){
					xfs_iunlock(ip, XFS_ILOCK_EXCL);
					return error;
				}
			}
		}

		error = xfs_iomap_write(&ip->i_iocore, offset, count, bmapp,
					nbmaps, 0, NULL);

		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		if (!error)
			XFS_INODE_CLEAR_READ_AHEAD(&ip->i_iocore);
	}
	return error;
}
#endif /* 0 */
/*
 * Set up rbp so that it points to the memory attached to bp
 * from rbp_offset from the start of bp for rbp_len bytes.
 */
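/*
 * The subordinate buffer shares bp's pages (or its mapped memory)
 * rather than getting storage of its own; xfs_strat_read() and
 * xfs_strat_write_core() use this to issue I/O against the individual
 * extents underlying a larger buffer.
 */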
#if !defined(__linux__)
STATIC void
xfs_overlap_bp(
	xfs_buf_t	*bp,
	xfs_buf_t	*rbp,
	uint	rbp_offset,
	uint	rbp_len)
{
	int	pgbboff;
	int	bytes_off;
	pfd_t	*pfdp;

	
	if (BP_ISMAPPED(bp)) {
		/*
		 * The real buffer is already mapped, so just use
		 * its virtual memory for ourselves.
		 */
		XFS_BUF_PTR(rbp) = XFS_BUF_PTR(bp) + rbp_offset;
		rbp->b_bcount = rbp_len;
		rbp->b_bufsize = rbp_len;
	} else {
		/*
		 * The real buffer is not yet mapped to virtual memory.
		 * Just get the subordinate buffer's page pointers
		 * set up and make it a PAGEIO buffer like the real one.
		 *
		 * First find the first page of rbp.  We do this by
		 * walking the list of pages in bp until we find the
		 * one containing the start of rbp.  Note that neither
		 * bp nor rbp are required to start on page boundaries.
		 */
		bytes_off = rbp_offset + BBTOOFF(dpoff(bp->b_offset));
		pfdp = NULL;
		pfdp = getnextpg(bp, pfdp);
		ASSERT(pfdp != NULL);
		while (bytes_off >= NBPP) {
			pfdp = getnextpg(bp, pfdp);
			ASSERT(pfdp != NULL);
			bytes_off -= NBPP;
		}
		rbp->b_pages = pfdp;

		rbp->b_bcount = rbp_len;
		rbp->b_offset = bp->b_offset + BTOBB(rbp_offset);
		pgbboff = dpoff(rbp->b_offset);
		rbp->b_bufsize = ctob(dtop(pgbboff + BTOBB(rbp_len)));

		XFS_BUF_PAGEIO(rbp);

		if (pgbboff != 0) {
			bp_mapin(rbp);
		}
	}
	rbp->b_blkno = bp->b_blkno + BTOBB(rbp_offset);
	rbp->b_remain = 0;
	rbp->b_vp = bp->b_vp;
	rbp->b_edev = bp->b_edev;
	/* note XFS_BFLAGS must be member access for this to work */
	/* Think about changing this */
	XFS_BUF_BFLAGS(rbp) |= (XFS_BUF_ISUNCACHED(rbp) | XFS_BUF_ASYNC(rbp)); 
}
#endif /* !defined(__linux__) */

/*
 * Zero the given bp from data_offset from the start of it for data_len
 * bytes.
 */
#if 1 /* !defined(__linux__) */
STATIC void
xfs_zero_bp(
	xfs_buf_t	*bp,
	int	data_offset,
	int	data_len)
{
	pfd_t	*pfdp;
	caddr_t	page_addr;
	int	len;

	if (XFS_BUF_BP_ISMAPPED(bp)) {
		bzero(XFS_BUF_PTR(bp) + data_offset, data_len);
		return;
	}

	data_offset += BBTOOFF(dpoff(XFS_BUF_OFFSET(bp)));
	pfdp = NULL;
	pfdp = getnextpg(bp, pfdp);
	ASSERT(pfdp != NULL);
	while (data_offset >= NBPP) {
		pfdp = getnextpg(bp, pfdp);
		ASSERT(pfdp != NULL);
		data_offset -= NBPP;
	}
	ASSERT(data_offset >= 0);
	while (data_len > 0) {
		page_addr = page_mapin(pfdp, (XFS_BUF_ISUNCACHED(bp) ?
					      VM_UNCACHED : 0), 0);
		len = MIN(data_len, NBPP - data_offset);
		bzero(page_addr + data_offset, len);
		data_len -= len;
		data_offset = 0;
		page_mapout(page_addr);
		pfdp = getnextpg(bp, pfdp);
	}
}
#endif /* !defined(__linux__) */

/*
 * Verify that the gap list is properly sorted and that no entries
 * overlap.
 */
#ifdef DEBUG
STATIC void
xfs_check_gap_list(
	xfs_iocore_t	*io)
{
	xfs_gap_t	*last_gap;
	xfs_gap_t	*curr_gap;
	int		loops;

	last_gap = NULL;
	curr_gap = io->io_gap_list;
	loops = 0;
	while (curr_gap != NULL) {
		ASSERT(curr_gap->xg_count_fsb > 0);
		if (last_gap != NULL) {
			ASSERT((last_gap->xg_offset_fsb +
				last_gap->xg_count_fsb) <
			       curr_gap->xg_offset_fsb);
		}
		last_gap = curr_gap;
		curr_gap = curr_gap->xg_next;
		ASSERT(loops++ < 1000);
	}
}
#endif

/*
 * For the given inode, offset, and count of bytes, build a list
 * of xfs_gap_t structures in the inode's gap list describing the
 * holes in the file in the range described by the offset and count.
 *
 * The list must be empty when we start, and the inode lock must
 * be held exclusively.
 */
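/*
 * For example, on a 4k block filesystem a request covering the first
 * 32k of a file in which blocks 2-3 and 6 are holes would leave:
 *
 *	io_gap_list -> { xg_offset_fsb 2, xg_count_fsb 2 }
 *		    -> { xg_offset_fsb 6, xg_count_fsb 1 } -> NULL
 *
 * Entries are kept in ascending xg_offset_fsb order and never overlap.
 */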
STATIC int				/* error */
xfs_build_gap_list(
	xfs_iocore_t	*io,
	off_t		offset,
	size_t		count)
{
	xfs_fileoff_t	offset_fsb;
	xfs_fileoff_t	last_fsb;
	xfs_filblks_t	count_fsb;
	xfs_fsblock_t	firstblock;
	xfs_gap_t	*new_gap;
	xfs_gap_t	*last_gap;
	xfs_mount_t	*mp;
	int		i;
	int		error;
	int		nimaps;
#define	XFS_BGL_NIMAPS	8
	xfs_bmbt_irec_t	imaps[XFS_BGL_NIMAPS];
	xfs_bmbt_irec_t	*imapp;

	ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0);
	ASSERT(io->io_gap_list == NULL);

	mp = io->io_mount;
	offset_fsb = XFS_B_TO_FSBT(mp, offset);
	last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
	count_fsb = (xfs_filblks_t)(last_fsb - offset_fsb);
	ASSERT(count_fsb > 0);

	last_gap = NULL;
	while (count_fsb > 0) {
		nimaps = XFS_BGL_NIMAPS;
		firstblock = NULLFSBLOCK;
		error = XFS_BMAPI(mp, NULL, io, offset_fsb, count_fsb,
				  0, &firstblock, 0, imaps, &nimaps, NULL);
		if (error) {
			return error;
		}
		ASSERT(nimaps != 0);

		/*
		 * Look for the holes in the mappings returned by bmapi.
		 * Decrement count_fsb and increment offset_fsb as we go.
		 */
		for (i = 0; i < nimaps; i++) {
			imapp = &imaps[i];
			count_fsb -= imapp->br_blockcount;
			ASSERT(((long)count_fsb) >= 0);
			ASSERT(offset_fsb == imapp->br_startoff);
			offset_fsb += imapp->br_blockcount;
			ASSERT(offset_fsb <= last_fsb);
			ASSERT((offset_fsb < last_fsb) || (count_fsb == 0));

			/*
			 * Skip anything that is not a hole or
			 * unwritten.
			 */
			if (imapp->br_startblock != HOLESTARTBLOCK ||
			    imapp->br_state == XFS_EXT_UNWRITTEN) {
				continue;
			}

			/*
			 * We found a hole.  Now add an entry to the inode's
			 * gap list corresponding to it.  The list is
			 * a singly linked, NULL terminated list.  We add
			 * each entry to the end of the list so that it is
			 * sorted by file offset.
			 */
			new_gap = kmem_zone_alloc(xfs_gap_zone, KM_SLEEP);
			new_gap->xg_offset_fsb = imapp->br_startoff;
			new_gap->xg_count_fsb = imapp->br_blockcount;
			new_gap->xg_next = NULL;

			if (last_gap == NULL) {
				io->io_gap_list = new_gap;
			} else {
				last_gap->xg_next = new_gap;
			}
			last_gap = new_gap;
		}
	}
	xfs_check_gap_list(io);
	return 0;
}

/*
 * Remove or trim any entries in the inode's gap list which overlap
 * the given range.  I'm going to assume for now that we never give
 * a range which is actually in the middle of an entry (i.e. we'd need
 * to split it in two).  This is a valid assumption for now given the
 * use of this in xfs_write_file() where we start at the front and
 * move sequentially forward.
 */
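/*
 * For example, deleting the range starting at fsb 2 for 4 blocks from
 * a gap entry covering fsb 2 for 6 blocks trims that entry to start at
 * fsb 6 with a count of 2; an entry falling entirely inside the
 * deleted range is unlinked and returned to xfs_gap_zone.
 */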
#if !defined(__linux__)
STATIC void
xfs_delete_gap_list(
	xfs_iocore_t	*io,
	xfs_fileoff_t	offset_fsb,
	xfs_extlen_t	count_fsb)
{
	xfs_gap_t	*curr_gap;
	xfs_gap_t	*last_gap;
	xfs_gap_t	*next_gap;
	xfs_fileoff_t	gap_offset_fsb;
	xfs_extlen_t	gap_count_fsb;
	xfs_fileoff_t	gap_end_fsb;
	xfs_fileoff_t	end_fsb;

	last_gap = NULL;
	curr_gap = io->io_gap_list;
	while (curr_gap != NULL) {
		gap_offset_fsb = curr_gap->xg_offset_fsb;
		gap_count_fsb = curr_gap->xg_count_fsb;

		/*
		 * The entries are sorted by offset, so if we see
		 * one beyond our range we're done.
		 */
		end_fsb = offset_fsb + count_fsb;
		if (gap_offset_fsb >= end_fsb) {
			return;
		}

		gap_end_fsb = gap_offset_fsb + gap_count_fsb;
		if (gap_end_fsb <= offset_fsb) {
			/*
			 * This shouldn't be able to happen for now.
			 */
			ASSERT(0);
			last_gap = curr_gap;
			curr_gap = curr_gap->xg_next;
			continue;
		}

		/*
		 * We've got an overlap.  If the gap is entirely contained
		 * in the region then remove it.  If not, then shrink it
		 * by the amount overlapped.
		 */
		if (gap_end_fsb > end_fsb) {
			/*
			 * The region does not extend to the end of the gap.
			 * Shorten the gap by the amount in the region,
			 * and then we're done since we've reached the
			 * end of the region.
			 */
			ASSERT(gap_offset_fsb >= offset_fsb);
			curr_gap->xg_offset_fsb = end_fsb;
			curr_gap->xg_count_fsb = gap_end_fsb - end_fsb;
			return;
		}

		next_gap = curr_gap->xg_next;
		if (last_gap == NULL) {
			io->io_gap_list = next_gap;
		} else {
			ASSERT(0);
			ASSERT(last_gap->xg_next == curr_gap);
			last_gap->xg_next = next_gap;
		}
		kmem_zone_free(xfs_gap_zone, curr_gap);
		curr_gap = next_gap;
	}
}		    
/*
 * Free up all of the entries in the inode's gap list.  This requires
 * the inode lock to be held exclusively.
 */
#endif /* !defined(__linux__) */
#if !defined(__linux__)
STATIC void
xfs_free_gap_list(
	xfs_iocore_t	*io)
{
	xfs_gap_t	*curr_gap;
	xfs_gap_t	*next_gap;

	ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0);
	xfs_check_gap_list(io);

	curr_gap = io->io_gap_list;
	while (curr_gap != NULL) {
		next_gap = curr_gap->xg_next;
		kmem_zone_free(xfs_gap_zone, curr_gap);
		curr_gap = next_gap;
	}
	io->io_gap_list = NULL;
}
#endif /* !defined(__linux__) */
/*
 * Zero the parts of the buffer which overlap gaps in the inode's gap list.
 * Deal with everything in BBs since the buffer is not guaranteed to be block
 * aligned.
 */
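/*
 * For example, with 4k (8 BB) filesystem blocks, a gap at BB 16 for
 * 8 BBs overlapping a buffer at BB 12 for 16 BBs gives
 * zero_offset_bb == 4 and zero_len_bb == 8, so xfs_zero_bp() zeroes
 * bytes 2048 through 6143 of the buffer.
 */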
#if !defined(__linux__)
STATIC void
xfs_cmp_gap_list_and_zero(
	xfs_iocore_t	*io,
	xfs_buf_t		*bp)
{
	off_t		bp_offset_bb;
	int		bp_len_bb;
	off_t		gap_offset_bb;
	int		gap_len_bb;
	int		zero_offset_bb;
	int		zero_len_bb;
	xfs_gap_t	*curr_gap;
	xfs_mount_t	*mp;

	ASSERT(ismrlocked(io->io_lock, MR_UPDATE | MR_ACCESS) != 0);
	xfs_check_gap_list(io);

	bp_offset_bb = bp->b_offset;
	bp_len_bb = BTOBB(bp->b_bcount);
	mp = io->io_mount;
	curr_gap = io->io_gap_list;
	while (curr_gap != NULL) {
		gap_offset_bb = XFS_FSB_TO_BB(mp, curr_gap->xg_offset_fsb);
		gap_len_bb = XFS_FSB_TO_BB(mp, curr_gap->xg_count_fsb);

		/*
		 * Check to see if this gap is before the buffer starts.
		 */
		if ((gap_offset_bb + gap_len_bb) <= bp_offset_bb) {
			curr_gap = curr_gap->xg_next;
			continue;
		}

		/*
		 * Check to see if this gap is after the buffer ends.
		 * If it is then we're done since the list is sorted
		 * by gap offset.
		 */
		if (gap_offset_bb >= (bp_offset_bb + bp_len_bb)) {
			break;
		}

		/*
		 * We found a gap which overlaps the buffer.  Zero
		 * the portion of the buffer overlapping the gap.
		 */
		if (gap_offset_bb < bp_offset_bb) {
			/*
			 * The gap starts before the buffer, so we start
			 * zeroing from the start of the buffer.
			 */
			zero_offset_bb = 0;
			/*
			 * To calculate the amount of overlap, first
			 * subtract the portion of the gap which is before
			 * the buffer.  If the length is still longer than
			 * the buffer, then just zero the entire buffer.
			 */
			zero_len_bb = gap_len_bb -
				      (bp_offset_bb - gap_offset_bb);
			if (zero_len_bb > bp_len_bb) {
				zero_len_bb = bp_len_bb;
			}
			ASSERT(zero_len_bb > 0);
		} else {
			/*
			 * The gap starts at the beginning or in the middle
			 * of the buffer.  The offset into the buffer is
			 * the difference between the two offsets.
			 */
			zero_offset_bb = gap_offset_bb - bp_offset_bb;
			/*
			 * Figure out the length of the overlap.  If the
			 * gap extends beyond the end of the buffer, then
			 * just zero to the end of the buffer.  Otherwise
			 * just zero the length of the gap.
			 */
			if ((gap_offset_bb + gap_len_bb) >
			    (bp_offset_bb + bp_len_bb)) {
				zero_len_bb = bp_len_bb - zero_offset_bb;
			} else {
				zero_len_bb = gap_len_bb;
			}
		}

		/*
		 * Now that we've calculated the range of the buffer to
		 * zero, do it.
		 */
		xfs_zero_bp(bp, BBTOB(zero_offset_bb), BBTOB(zero_len_bb));

		curr_gap = curr_gap->xg_next;
	}
}
#endif /* !defined(__linux__) */


/*
 * "Read" in a buffer whose b_blkno is -1 or uninitialized.
 * If b_blkno is -1, this means that at the time the buffer was created
 * there was no underlying backing store for the range of the file covered
 * by the bp. An uninitialized buffer (with B_UNINITIAL set) indicates
 * that there is allocated storage, but all or some portions of the
 * underlying blocks have never been written.
 *
 * To figure out the current state of things, we lock the inode
 * and call xfs_bmapi() to look at the current extents format.
 * If we're over a hole, delayed allocation or uninitialized space we
 * simply zero the corresponding portions of the buffer.  For parts
 * over real disk space we need to read in the stuff from disk.
 *
 * We know that we can just use xfs_ilock(SHARED) rather than
 * xfs_ilock_map_shared() here, because the extents had to be
 * read in in order to create the buffer we're trying to write out.
 */
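/*
 * Portions over real disk blocks are read synchronously through a
 * subordinate buffer set up by xfs_overlap_bp(), and the result is
 * then run past the inode's gap list so that blocks which have never
 * actually been written come back as zeroes.
 */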
#if !defined(__linux__)
int
xfs_strat_read(
	xfs_iocore_t	*io,
	xfs_buf_t		*bp)
{
	xfs_fileoff_t	offset_fsb;
	xfs_fileoff_t   map_start_fsb;
	xfs_fileoff_t	imap_offset;
	xfs_fsblock_t	last_bp_bb;
	xfs_fsblock_t	last_map_bb;
	xfs_fsblock_t	firstblock;
	xfs_filblks_t	count_fsb;
	xfs_extlen_t	imap_blocks;
	xfs_fsize_t	isize;
	off_t		offset;
	off_t		end_offset;
	off_t		init_limit;
	int		x;
	caddr_t		datap;
	xfs_buf_t		*rbp;
	xfs_mount_t	*mp;
	int		count;
	int		block_off;
	int		data_bytes;
	int		data_offset;
	int		nimaps;
	int		error;
#define	XFS_STRAT_READ_IMAPS	XFS_BMAP_MAX_NMAP
	xfs_bmbt_irec_t	imap[XFS_STRAT_READ_IMAPS];
	
	ASSERT((bp->b_blkno == -1) || (XFS_BUF_ISUNINITIAL(bp)));
	mp = io->io_mount;
	offset_fsb = XFS_BB_TO_FSBT(mp, bp->b_offset);
	/*
	 * Only read up to the EOF or the current write offset.
	 * The idea here is to avoid initializing pages which are
	 * going to be immediately overwritten in xfs_write_file().
	 * The most important case is the sequential write case, where
	 * the new pages at the end of the file are sent here by
	 * chunk_patch().  We don't want to zero them since they
	 * are about to be overwritten.
	 *
	 * The io->io_write_offset field tells us the offset of any write
	 * in progress.  If it is 0 then we assume that no write is
	 * in progress.  If the write offset is within the file size,
	 * then the file size is the upper limit.  If the write offset
	 * is beyond the file size, then we only want to initialize the
	 * buffer up to the write offset.  Beyond that will either be
	 * overwritten or be beyond the new EOF.
	 */
	isize = XFS_SIZE(mp, io);
	offset = BBTOOFF(bp->b_offset);
	end_offset = offset + bp->b_bcount;

	if (io->io_write_offset == 0) {
		init_limit = isize;
	} else if (io->io_write_offset <= isize) {
		init_limit = isize;
	} else {
		init_limit = io->io_write_offset;
	}
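	/*
	 * In other words, init_limit is effectively the larger of the
	 * current file size and any in-progress write offset.
	 */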

	if (end_offset <= init_limit) {
		count = bp->b_bcount;
	} else {
		count = init_limit - offset;
	}

	if (count <= 0) {
		iodone(bp);
		return 0;
	}

	/*
	 * Since the buffer may not be file system block aligned, we
	 * can't do a simple shift to find the number of blocks underlying
	 * it.  Instead we subtract the first block it sits on from the
	 * block just past the last one it touches.
	 */
	count_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))) -
		    XFS_B_TO_FSBT(mp, offset);
	map_start_fsb = offset_fsb;
	XFS_ILOCK(mp, io, XFS_ILOCK_SHARED | XFS_EXTSIZE_RD);
	while (count_fsb != 0) {
		nimaps = XFS_STRAT_READ_IMAPS;
		firstblock = NULLFSBLOCK;
		error = XFS_BMAPI(mp, NULL, io, map_start_fsb, count_fsb, 0,
				  &firstblock, 0, imap, &nimaps, NULL);
		if (error) {
			XFS_IUNLOCK(mp, io, XFS_ILOCK_SHARED |
					    XFS_EXTSIZE_RD);
			xfs_bioerror_relse(bp);
			return error;
		}
		ASSERT(nimaps >= 1);
		
		for (x = 0; x < nimaps; x++) {
			imap_offset = imap[x].br_startoff;
			ASSERT(imap_offset == map_start_fsb);
			imap_blocks = imap[x].br_blockcount;
			ASSERT(imap_blocks <= count_fsb);
			/*
			 * Calculate the offset of this mapping in the
			 * buffer and the number of bytes of this mapping
			 * that are in the buffer.  If the block size is
			 * greater than the page size, then the buffer may
			 * not line up on file system block boundaries
			 * (e.g. pages being read in from chunk_patch()).
			 * In that case we need to account for the space
			 * in the file system blocks underlying the buffer
			 * that is not actually a part of the buffer.  This
			 * space is the space in the first block before the
			 * start of the buffer and the space in the last
			 * block after the end of the buffer.
			 */
			data_offset = XFS_FSB_TO_B(mp,
						   imap_offset - offset_fsb);
			data_bytes = XFS_FSB_TO_B(mp, imap_blocks);
			block_off = 0;

			if (mp->m_sb.sb_blocksize > NBPP) {
				/*
				 * If the buffer is actually fsb
				 * aligned then this will simply
				 * subtract 0 and do no harm.  If the
				 * current mapping is for the start of
				 * the buffer, then data offset will be
				 * zero so we don't need to subtract out
				 * any space at the beginning.
				 */
				if (data_offset > 0) {
					data_offset -= BBTOB(
							XFS_BB_FSB_OFFSET(mp,
							      bp->b_offset));
				}

				if (map_start_fsb == offset_fsb) {
					ASSERT(data_offset == 0);
					/*
					 * This is on the first block
					 * mapped, so it must be the start
					 * of the buffer.  Subtract out from
					 * the number of bytes the bytes
					 * between the start of the block
					 * and the start of the buffer.
					 */
					data_bytes -=
						BBTOB(XFS_BB_FSB_OFFSET(
							mp, bp->b_offset));

					/*
					 * Set block_off to the number of
					 * BBs that the buffer is offset
					 * from the start of this mapping.
					 */
					block_off = XFS_BB_FSB_OFFSET(mp,
							    bp->b_offset);
					ASSERT(block_off >= 0);
				}

				if (imap_blocks == count_fsb) {
					/*
					 * This mapping includes the last
					 * block to be mapped.  Subtract out
					 * from the number of bytes the bytes
					 * between the end of the buffer and
					 * the end of the block.  It may
					 * be the case that the buffer
					 * extends beyond the mapping (if
					 * it is beyond the end of the file),
					 * in which case no adjustment
					 * is necessary.
					 */
					last_bp_bb = bp->b_offset +
						BTOBB(bp->b_bcount);
					last_map_bb =
						XFS_FSB_TO_BB(mp,
							      (imap_offset +
							       imap_blocks));

					if (last_map_bb > last_bp_bb) {
						data_bytes -=
							BBTOB(last_map_bb -
							      last_bp_bb);
					}

				}
			}
			ASSERT(data_bytes > 0);
			ASSERT(data_offset >= 0);
			if ((imap[x].br_startblock == DELAYSTARTBLOCK) ||
			    (imap[x].br_startblock == HOLESTARTBLOCK) ||
			    (imap[x].br_state == XFS_EXT_UNWRITTEN)) {
				/*
				 * This is either a hole, a delayed alloc
				 * extent or uninitialized allocated space.
				 * Either way, just fill it with zeroes.
				 */
				datap = bp_mapin(bp);
				datap += data_offset;
				bzero(datap, data_bytes);
				if (!dpoff(bp->b_offset)) {
					bp_mapout(bp);
				}

			} else {
				/*
				 * The extent really exists on disk, so
				 * read it in.
				 */
				rbp = getrbuf(KM_SLEEP);
				xfs_overlap_bp(bp, rbp, data_offset,
					       data_bytes);
				rbp->b_blkno = XFS_FSB_TO_DB_IO(io,
						   imap[x].br_startblock) +
					       block_off;
				rbp->b_offset = XFS_FSB_TO_BB(mp,
							      imap_offset) +
						block_off;
				XFS_BUF_READ(rbp);
				XFS_BUF_UNASYNC(rbp);
				rbp->b_target = bp->b_target;

				xfs_check_rbp(io, bp, rbp, 1);
				(void) xfsbdstrat(mp, rbp);
				error = iowait(rbp);
				if (error) {
					XFS_BUF_ERROR(bp,error);
					ASSERT(bp->b_error != EINVAL);
				}

				/*
				 * Check to see if the block extent (or parts
				 * of it) have not yet been initialized and
				 * should therefore be zeroed.
				 */
				xfs_cmp_gap_list_and_zero(io, rbp);

				if (BP_ISMAPPED(rbp)) {
					bp_mapout(rbp);
				}

				freerbuf(rbp);
			}
			count_fsb -= imap_blocks;
			map_start_fsb += imap_blocks;
		}
	}
	XFS_IUNLOCK(mp, io, XFS_ILOCK_SHARED | XFS_EXTSIZE_RD);
	iodone(bp);
	return error;
}
#endif /* !defined(__linux__) */

#if defined(XFS_STRAT_TRACE)

void
xfs_strat_write_bp_trace(
	int		tag,
	xfs_inode_t	*ip,
	xfs_buf_t		*bp)
{
	if (ip->i_strat_trace == NULL) {
		return;
	}

	ktrace_enter(ip->i_strat_trace,
		     (void*)((__psunsigned_t)tag),
		     (void*)ip,
		     (void*)((__psunsigned_t)((ip->i_d.di_size >> 32) &
					     0xffffffff)),
		     (void*)(ip->i_d.di_size & 0xffffffff),
		     (void*)bp,
		     (void*)((__psunsigned_t)((bp->b_offset >> 32) &
					     0xffffffff)),
		     (void*)(bp->b_offset & 0xffffffff),
		     (void*)((__psunsigned_t)(bp->b_bcount)),
		     (void*)((__psunsigned_t)(bp->b_bufsize)),
		     (void*)(bp->b_blkno),
		     (void*)(__psunsigned_t)((XFS_BUF_BFLAGS(bp) >> 32) & 0xffffffff),
		     (void*)(XFS_BUF_BFLAGS(bp) & 0xffffffff),
		     (void*)(bp->b_pages),
		     (void*)(bp->b_pages->pf_pageno),
		     (void*)0,
		     (void*)0);

	ktrace_enter(xfs_strat_trace_buf,
		     (void*)((__psunsigned_t)tag),
		     (void*)ip,
		     (void*)((__psunsigned_t)((ip->i_d.di_size >> 32) &
					     0xffffffff)),
		     (void*)(ip->i_d.di_size & 0xffffffff),
		     (void*)bp,
		     (void*)((__psunsigned_t)((bp->b_offset >> 32) &
					     0xffffffff)),
		     (void*)(bp->b_offset & 0xffffffff),
		     (void*)((__psunsigned_t)(bp->b_bcount)),
		     (void*)((__psunsigned_t)(bp->b_bufsize)),
		     (void*)(bp->b_blkno),
		     (void*)(__psunsigned_t)((XFS_BUF_BFLAGS(bp) >> 32) & 0xffffffff),
		     (void*)(XFS_BUF_BFLAGS(bp) & 0xffffffff),
		     (void*)(bp->b_pages),
		     (void*)(bp->b_pages->pf_pageno),
		     (void*)0,
		     (void*)0);
}


void
xfs_strat_write_subbp_trace(
	int		tag,
	xfs_iocore_t	*io,
	xfs_buf_t		*bp,
	xfs_buf_t		*rbp,
	off_t		last_off,
	int		last_bcount,
	daddr_t		last_blkno)			    
{
	xfs_inode_t	*ip = XFS_IO_INODE(io);

	if (!IO_IS_XFS(io) || (ip->i_strat_trace == NULL)) {
		return;
	}

	ktrace_enter(ip->i_strat_trace,
		     (void*)((unsigned long)tag),
		     (void*)ip,
		     (void*)((unsigned long)((ip->i_d.di_size >> 32) &
					     0xffffffff)),
		     (void*)(ip->i_d.di_size & 0xffffffff),
		     (void*)bp,
		     (void*)rbp,
		     (void*)((unsigned long)((rbp->b_offset >> 32) &
					     0xffffffff)),
		     (void*)(rbp->b_offset & 0xffffffff),
		     (void*)((unsigned long)(rbp->b_bcount)),
		     (void*)(rbp->b_blkno),
		     (void*)((__psunsigned_t)XFS_BUF_BFLAGS(rbp)), /* lower 32 flags only */
		     (void*)(XFS_BUF_PTR(rbp)),
		     (void*)(bp->b_pages),
		     (void*)(last_off),
		     (void*)((unsigned long)(last_bcount)),
		     (void*)(last_blkno));

	ktrace_enter(xfs_strat_trace_buf,
		     (void*)((unsigned long)tag),
		     (void*)ip,
		     (void*)((unsigned long)((ip->i_d.di_size >> 32) &
					     0xffffffff)),
		     (void*)(ip->i_d.di_size & 0xffffffff),
		     (void*)bp,
		     (void*)rbp,
		     (void*)((unsigned long)((rbp->b_offset >> 32) &
					     0xffffffff)),
		     (void*)(rbp->b_offset & 0xffffffff),
		     (void*)((unsigned long)(rbp->b_bcount)),
		     (void*)(rbp->b_blkno),
		     (void*)((__psunsigned_t)XFS_BUF_BFLAGS(rbp)), /* lower 32 flags only */
		     (void*)(XFS_BUF_PTR(rbp)),
		     (void*)(bp->b_pages),
		     (void*)(last_off),
		     (void*)((unsigned long)(last_bcount)),
		     (void*)(last_blkno));
}
#endif /* XFS_STRAT_TRACE */

#ifdef DEBUG
/*
 * xfs_strat_write_check
 *
 * Make sure that there are blocks or delayed allocation blocks
 * underlying the entire area given.  The imap parameter is simply
 * given as a scratch area in order to reduce stack space.  No
 * values are returned within it.
 */
STATIC void
xfs_strat_write_check(
	xfs_iocore_t	*io,
	xfs_fileoff_t	offset_fsb,
	xfs_filblks_t	buf_fsb,
	xfs_bmbt_irec_t	*imap,
	int		imap_count)
{
	xfs_filblks_t	count_fsb;
	xfs_fsblock_t	firstblock;
	xfs_mount_t	*mp;
	int		nimaps;
	int		n;
	int		error;

	if (!IO_IS_XFS(io)) return;

	mp = io->io_mount;
	XFS_ILOCK(mp, io, XFS_ILOCK_SHARED | XFS_EXTSIZE_RD);
	count_fsb = 0;
	while (count_fsb < buf_fsb) {
		nimaps = imap_count;
		firstblock = NULLFSBLOCK;
		error = XFS_BMAPI(mp, NULL, io, (offset_fsb + count_fsb),
				  (buf_fsb - count_fsb), 0, &firstblock, 0,
				  imap, &nimaps, NULL);
		if (error) {
			XFS_IUNLOCK(mp, io, XFS_ILOCK_SHARED |
					    XFS_EXTSIZE_RD);
			return;
		}
		ASSERT(nimaps > 0);
		n = 0;
		while (n < nimaps) {
			ASSERT(imap[n].br_startblock != HOLESTARTBLOCK);
			count_fsb += imap[n].br_blockcount;
			ASSERT(count_fsb <= buf_fsb);
			n++;
		}
	}
	XFS_IUNLOCK(mp, io, XFS_ILOCK_SHARED | XFS_EXTSIZE_RD);
		
	return;
}
#endif /* DEBUG */

/*
 * xfs_strat_write_iodone -
 *	I/O completion for the first write of unwritten buffered data.
 *	Since this occurs in an interrupt thread, massage some bp info
 *	and queue to the xfs_strat daemon.
 */
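/*
 * Queued buffers go onto the circular xfsc_list, protected by
 * xfsc_lock; xfs_strat_comp() pulls them off and hands each one to
 * xfs_strat_complete_buf() to issue the conversion transaction.
 */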
#if !defined(__linux__)
void
xfs_strat_write_iodone(
	xfs_buf_t	*bp)
{
	int		s;

	ASSERT(XFS_BUF_ISUNINITIAL(bp));
	ASSERT(bp->b_vp);
	ASSERT(xfsc_count > 0);
	/*
	 * Delay I/O done work until the transaction is completed.
	 */
	bp->b_iodone = NULL;
	ASSERT(XFS_BUF_ISBUSY(bp));
	ASSERT(valusema(&bp->b_lock) <= 0);
	ASSERT(!(XFS_BUF_ISDONE(bp)));

	/*
	 * Queue to the xfsc_list.
	 */
	s = mp_mutex_spinlock(&xfsc_lock);
	/*
	 * Queue the buffer at the end of the list.
	 * Bump the inode count of the number of queued buffers.
	 */
	if (xfsc_list == NULL) {
		bp->av_forw = bp;
		bp->av_back = bp;
		xfsc_list = bp;
	} else {
		bp->av_back = xfsc_list->av_back;
		xfsc_list->av_back->av_forw = bp;
		xfsc_list->av_back = bp;
		bp->av_forw = xfsc_list;
	}
	buftrace("STRAT_WRITE_IODONE", bp);
	xfsc_bufcount++;
	(void)sv_signal(&xfsc_wait);
	mp_mutex_spinunlock(&xfsc_lock, s);
	return;
}
#endif /* !defined(__linux__) */


/* Issue transactions to convert a buffer range from unwritten
 * to written extents.
 */
#if !defined(__linux__)
void	xfs_strat_complete_buf(
	bhv_desc_t	*bdp,
	xfs_buf_t		*bp)
{
	xfs_inode_t	*ip;
	xfs_trans_t	*tp;
	xfs_mount_t	*mp;
	xfs_fileoff_t	offset_fsb;
	xfs_filblks_t	count_fsb;
	xfs_filblks_t	numblks_fsb;
	xfs_bmbt_irec_t	imap;
	int		nimaps;
	int		nres;
	int		error;
	int		committed;
	xfs_fsblock_t	firstfsb;
	xfs_bmap_free_t	free_list;

	/*	ASSERT((bp->b_flags & B_UNINITIAL) == B_UNINITIAL); */
	ASSERT(XFS_BUF_ISUNINITIAL(bp)); /* not quite the same as the line
					  * above, but it should be equivalent */
	ASSERT(bp->b_vp);
	buftrace("STRAT_WRITE_CMPL", bp);
	ip = XFS_BHVTOI(bdp);
	mp = ip->i_mount;
	offset_fsb = XFS_BB_TO_FSBT(mp, bp->b_offset);
	count_fsb = XFS_B_TO_FSB(mp, bp->b_bcount);
	do {
		nres = XFS_DIOSTRAT_SPACE_RES(mp, 0);
		xfs_strat_write_bp_trace(XFS_STRAT_UNINT_CMPL, ip, bp);
		/*
		 * Set up a transaction with which to allocate the
		 * backing store for the file.  Do allocations in a
		 * loop until we get some space in the range we are
		 * interested in.  The other space that might be
		 * allocated is in the delayed allocation extent
		 * on which we sit but before our buffer starts.
		 */
		tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
		error = xfs_trans_reserve(tp, nres,
				XFS_WRITE_LOG_RES(mp), 0,
				XFS_TRANS_PERM_LOG_RES,
				XFS_WRITE_LOG_COUNT);
		if (error) {
			xfs_trans_cancel(tp, 0);
			goto error0;
		}

		xfs_ilock(ip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
		xfs_trans_ihold(tp, ip);
		xfs_strat_write_bp_trace(XFS_STRAT_ENTER, ip, bp);

		/*
		 * Modify the unwritten extent state of the buffer.
		 */
		XFS_BMAP_INIT(&free_list, &firstfsb);
		nimaps = 1;
		error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
				  XFS_BMAPI_WRITE, &firstfsb,
				  1, &imap, &nimaps, &free_list);
		if (error)
			goto error_on_bmapi_transaction;

		error = xfs_bmap_finish(&(tp), &(free_list),
				firstfsb, &committed);
		if (error)
			goto error_on_bmapi_transaction;

		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES,
						NULL);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		if (error)
			goto error0;

		if ((numblks_fsb = imap.br_blockcount) == 0) {
			/*
			 * The extent size should always get bigger
			 * otherwise the loop is stuck.
			 */
			ASSERT(imap.br_blockcount);
			break;
		}
		offset_fsb += numblks_fsb;
		count_fsb -= numblks_fsb;
	} while (count_fsb > 0);

	XFS_BUF_UNUNINITIAL(bp);
	xfs_strat_write_bp_trace(XFS_STRAT_UNINT_CMPL, ip, bp);
	buftrace("STRAT_WRITE_CMPL", bp);
	/* fall through on normal completion */
	biodone(bp);
	return;

error_on_bmapi_transaction:
	xfs_bmap_cancel(&free_list);
	xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
error0:
	XFS_BUF_ERROR(bp,error);
	biodone(bp);
}
#endif /* !defined(__linux__) */

#if !defined(__linux__)
STATIC void
xfs_strat_comp(void)
{
	xfs_buf_t		*bp;
	xfs_buf_t		*forw;
	xfs_buf_t		*back;
	int		s;
	bhv_desc_t	*bdp;

#ifdef __linux__
	daemonize();
	set_thread_name("xfsc");
#endif /* __linux__ */

	s = mp_mutex_spinlock(&xfsc_lock);
	xfsc_count++;

	while (1) {
		while (xfsc_list == NULL) {
			mp_sv_wait(&xfsc_wait, PRIBIO, &xfsc_lock, s);
			s = mp_mutex_spinlock(&xfsc_lock);
		}

		/*
		 * Pull a buffer off of the list.
		 */
		bp = xfsc_list;
		ASSERT(bp);
		forw = bp->av_forw;
		back = bp->av_back;
		forw->av_back = back;
		back->av_forw = forw;
		if (forw == bp) {
			xfsc_list = NULL;
		} else {
			xfsc_list = forw;
		}
		xfsc_bufcount--;
		ASSERT(xfsc_bufcount >= 0);

		mp_mutex_spinunlock(&xfsc_lock, s);
		bp->av_forw = bp;
		bp->av_back = bp;
		/*		ASSERT((bp->b_flags & B_UNINITIAL) == B_UNINITIAL); */
		ASSERT(XFS_BUF_ISUNINITIAL(bp));
		ASSERT(bp->b_vp);

		bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(bp->b_vp),
					&xfs_vnodeops);
#ifdef CELL_CAPABLE
		if (bdp)
			xfs_strat_complete_buf(bdp, bp);
		else
			cxfs_strat_complete_buf(bp);
#else
		ASSERT(bdp);
		xfs_strat_complete_buf(bdp, bp);
#endif

		s = mp_mutex_spinlock(&xfsc_lock);
	}
}
#endif /* !defined(__linux__) */
/*
 * This is the completion routine for the heap-allocated buffers
 * used to write out a buffer which becomes fragmented during
 * xfs_strat_write().  It must coordinate with xfs_strat_write()
 * to properly mark the lead buffer as done when necessary and
 * to free the subordinate buffer.
 */
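/*
 * The lead buffer and its subordinates are linked through the
 * b_fsprivate (back) and b_fsprivate2 (forw) fields, roughly:
 *
 *	lead (B_LEADER) <-> newest rbp <-> ... <-> oldest rbp -> NULL
 *
 * The lead buffer's own b_fsprivate is never used for this list, so a
 * subordinate can always find the lead by walking back until it hits
 * a buffer with B_LEADER set.
 */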
#if !defined(__linux__)
STATIC void
xfs_strat_write_relse(
	xfs_buf_t	*rbp)
{
	int	s;
	xfs_buf_t	*leader;
	xfs_buf_t	*forw;
	xfs_buf_t	*back;
	

	s = mutex_spinlock(&xfs_strat_lock);
	ASSERT(XFS_BUF_ISDONE(rbp));

	forw = (xfs_buf_t*)rbp->b_fsprivate2;
	back = (xfs_buf_t*)rbp->b_fsprivate;
	ASSERT(back != NULL);
	ASSERT(((xfs_buf_t *)back->b_fsprivate2) == rbp);
	ASSERT((forw == NULL) || (((xfs_buf_t *)forw->b_fsprivate) == rbp));

	/*
	 * Pull ourselves from the list.
	 */
	back->b_fsprivate2 = forw;
	if (forw != NULL) {
		forw->b_fsprivate = back;
	}

	if ((forw == NULL) &&
	    (back->b_flags & B_LEADER) &&
	    !(back->b_flags & B_PARTIAL)) {
		/*
		 * We are the only buffer in the list and the lead buffer
		 * has cleared the B_PARTIAL bit to indicate that all
		 * subordinate buffers have been issued.  That means it
		 * is time to finish off the lead buffer.
		 */
		leader = back;
		if (rbp->b_flags & B_ERROR) {
			leader->b_flags |= B_ERROR;
			leader->b_error = XFS_ERROR(rbp->b_error);
			ASSERT(leader->b_error != EINVAL);
		}
		leader->b_flags &= ~B_LEADER;
		mutex_spinunlock(&xfs_strat_lock, s);

		iodone(leader);
	} else {
		/*
		 * Either there are still other buffers in the list or
		 * not all of the subordinate buffers have yet been issued.
		 * In this case just pass any errors on to the lead buffer.
		 */
		while (!(back->b_flags & B_LEADER)) {
			back = (xfs_buf_t*)back->b_fsprivate;
		}
		ASSERT(back != NULL);
		ASSERT(back->b_flags & B_LEADER);
		leader = back;
		if (rbp->b_flags & B_ERROR) {
			leader->b_flags |= B_ERROR;
			leader->b_error = XFS_ERROR(rbp->b_error);
			ASSERT(leader->b_error != EINVAL);
		}
		mutex_spinunlock(&xfs_strat_lock, s);
	}

	rbp->b_fsprivate = NULL;
	rbp->b_fsprivate2 = NULL;
	rbp->b_relse = NULL;

	if (BP_ISMAPPED(rbp)) {
		bp_mapout(rbp);
	}

	freerbuf(rbp);
}
#endif /* !defined(__linux__) */

#ifdef DEBUG
/*ARGSUSED*/
void
xfs_check_rbp(
	xfs_iocore_t	*io,
	xfs_buf_t		*bp,
	xfs_buf_t		*rbp,
	int		locked)
{
	xfs_mount_t	*mp;
	int		nimaps;
	xfs_bmbt_irec_t	imap;
	xfs_fileoff_t	rbp_offset_fsb;
	xfs_filblks_t	rbp_len_fsb;
	pfd_t		*pfdp;
	xfs_fsblock_t	firstblock;
	int		error;

	mp = io->io_mount;
	rbp_offset_fsb = XFS_BB_TO_FSBT(mp, rbp->b_offset);
	rbp_len_fsb = XFS_BB_TO_FSB(mp, rbp->b_offset+BTOBB(rbp->b_bcount)) -
		      XFS_BB_TO_FSBT(mp, rbp->b_offset);
	nimaps = 1;
	if (!locked) {
		XFS_ILOCK(mp, io, XFS_ILOCK_SHARED | XFS_EXTSIZE_RD);
	}
	firstblock = NULLFSBLOCK;
	error = XFS_BMAPI(mp, NULL, io, rbp_offset_fsb, rbp_len_fsb, 0,
			  &firstblock, 0, &imap, &nimaps, NULL);
	if (!locked) {
		XFS_IUNLOCK(mp, io, XFS_ILOCK_SHARED | XFS_EXTSIZE_RD);
	}
	if (error) {
		return;
	}

	ASSERT(imap.br_startoff == rbp_offset_fsb);
	ASSERT(imap.br_blockcount == rbp_len_fsb);
	ASSERT((XFS_FSB_TO_DB_IO(io, imap.br_startblock) +
		XFS_BB_FSB_OFFSET(mp, rbp->b_offset)) ==
	       rbp->b_blkno);

#ifndef __linux__
	if (rbp->b_flags & B_PAGEIO) {
		pfdp = NULL;
		pfdp = getnextpg(rbp, pfdp);
		ASSERT(pfdp != NULL);
		ASSERT(dtopt(rbp->b_offset) == pfdp->pf_pageno);
	}

	if (rbp->b_flags & B_MAPPED) {
		ASSERT(BTOBB(poff(XFS_BUF_PTR(rbp))) ==
		       dpoff(rbp->b_offset));
	}
#endif
}

/*
 * Verify that the given buffer is going to the right place in its
 * file.  Also check that it is properly mapped and points to the
 * right page.  We can only do a trylock from here in order to prevent
 * deadlocks, since this is called from the strategy routine.
 */
void
xfs_check_bp(
	xfs_iocore_t	*io,
	xfs_buf_t		*bp)
{
	xfs_mount_t	*mp;
	int		nimaps;
	xfs_bmbt_irec_t	imap[2];
	xfs_fileoff_t	bp_offset_fsb;
	xfs_filblks_t	bp_len_fsb;
	pfd_t		*pfdp;
	int		locked;
	xfs_fsblock_t	firstblock;
	int		error;
	int		bmapi_flags;

	if (!IO_IS_XFS(io))
		return;

	mp = io->io_mount;

#ifndef __linux__
	if (bp->b_flags & B_PAGEIO) {
		pfdp = NULL;
		pfdp = getnextpg(bp, pfdp);
		ASSERT(pfdp != NULL);
		ASSERT(dtopt(bp->b_offset) == pfdp->pf_pageno);
		if (dpoff(bp->b_offset)) {
			ASSERT(bp->b_flags & B_MAPPED);
		}
	}

	if (bp->b_flags & B_MAPPED) {
		ASSERT(BTOBB(poff(XFS_BUF_PTR(bp))) ==
		       dpoff(bp->b_offset));
	}
#endif

	bp_offset_fsb = XFS_BB_TO_FSBT(mp, bp->b_offset);
	bp_len_fsb = XFS_BB_TO_FSB(mp, bp->b_offset + BTOBB(bp->b_bcount)) -
		     XFS_BB_TO_FSBT(mp, bp->b_offset);
	ASSERT(bp_len_fsb > 0);
	if (XFS_BUF_ISUNINITIAL(bp)) {
		nimaps = 2;
		bmapi_flags = XFS_BMAPI_WRITE|XFS_BMAPI_IGSTATE;
	} else {
		nimaps = 1;
		bmapi_flags = 0;
	}

	locked = XFS_ILOCK_NOWAIT(mp, io, XFS_ILOCK_SHARED |
					  XFS_EXTSIZE_RD);
	if (!locked) {
		return;
	}
	firstblock = NULLFSBLOCK;
	error = XFS_BMAPI(mp, NULL, io, bp_offset_fsb, bp_len_fsb, bmapi_flags,
			  &firstblock, 0, imap, &nimaps, NULL);
	XFS_IUNLOCK(mp, io, XFS_ILOCK_SHARED | XFS_EXTSIZE_RD);

	if (error) {
		return;
	}

	ASSERT(nimaps == 1);
	ASSERT(imap->br_startoff == bp_offset_fsb);
	ASSERT(imap->br_blockcount == bp_len_fsb);
	ASSERT((XFS_FSB_TO_DB_IO(io, imap->br_startblock) +
		XFS_BB_FSB_OFFSET(mp, bp->b_offset)) ==
	       bp->b_blkno);
}
#endif /* DEBUG */


/*
 *	xfs_strat_write_unwritten is called for buffered
 *	writes of preallocated but unwritten extents. These
 *	require a transaction to indicate the extent has been
 *	written.
 *	The write is set up by a call to xfs_bmapi with
 *	an "ignore state" flag. After the I/O has completed,
 *	xfs_strat_write_iodone is called to queue the completed
 *	buffer to the xfs_strat daemon, which calls
 *	xfs_strat_complete_buf(). That routine calls xfs_bmapi()
 *	with a write flag and issues the required transaction.
 */
#if !defined(__linux__)
int
xfs_strat_write_unwritten(
	xfs_iocore_t	*io,
	xfs_buf_t		*bp)
{
	xfs_fileoff_t	offset_fsb;
	off_t		offset_fsb_bb;
	xfs_filblks_t	count_fsb;
	/* REFERENCED */
	xfs_mount_t	*mp;
	xfs_inode_t	*ip;
	xfs_bmap_free_t	free_list;
	xfs_fsblock_t	first_block;
	int		error;
	int		nimaps;
	xfs_bmbt_irec_t	imap[XFS_BMAP_MAX_NMAP];
#define	XFS_STRAT_WRITE_IMAPS	2

	/*
	 * If XFS_STRAT_WRITE_IMAPS is changed then the definition
	 * of XFS_STRATW_LOG_RES in xfs_trans.h must be changed to
	 * reflect the new number of extents that can actually be
	 * allocated in a single transaction.
	 */
	if (IO_IS_XFS(io)) {
		ip = XFS_IO_INODE(io);
	}
	mp = io->io_mount;
	error = 0;

	/*
	 * Drop the count of queued buffers. We need to do
	 * this before the bdstrat because callers of
	 * VOP_FLUSHINVAL_PAGES(), for example, may expect the queued_buf
	 * count to be down when it returns. See xfs_itruncate_start.
	 */
	atomicAddInt(&(io->io_queued_bufs), -1);
	ASSERT(io->io_queued_bufs >= 0);

	/*
	 * Don't proceed if we're forcing a shutdown.
	 */
	if (XFS_FORCED_SHUTDOWN(mp)) {
		return xfs_bioerror_relse(bp);
	}

	if (XFS_IS_QUOTA_ON(mp) && XFS_IO_INODE(io)) {
		if (XFS_NOT_DQATTACHED(mp, ip)) {
			if (error = xfs_qm_dqattach(ip, 0)) {
				return xfs_bioerror_relse(bp);
			}
		}
	}

	/*
	 * Compute the buffer's range in filesystem blocks.
	 */

	offset_fsb = XFS_BB_TO_FSBT(mp, bp->b_offset);
	count_fsb = XFS_B_TO_FSB(mp, bp->b_bcount);
	offset_fsb_bb = XFS_FSB_TO_BB(mp, offset_fsb);
	ASSERT((offset_fsb_bb == bp->b_offset) || (count_fsb == 1));

	XFS_ILOCK(mp, io, XFS_ILOCK_EXCL | XFS_EXTSIZE_WR);

	/*
	 * Modify the unwritten extent state of the buffer.
	 */
	XFS_BMAP_INIT(&(free_list), &(first_block));
	nimaps = 2;
	error = XFS_BMAPI(mp, NULL, io, offset_fsb, count_fsb,
			  XFS_BMAPI_WRITE|XFS_BMAPI_IGSTATE, &first_block,
			  0, imap, &nimaps, &free_list);
	if (error) {
		XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL | XFS_EXTSIZE_WR);
		goto error0;
	}

	ASSERT(nimaps == 1);
	if (bp->b_blkno < 0) {
		bp->b_blkno = XFS_FSB_TO_DB_IO(io, imap->br_startblock) +
				(bp->b_offset - offset_fsb_bb);
	}
	
	/*
	 * Before dropping the inode lock, clear any
	 * read-ahead state since in allocating space
	 * here we may have made it invalid.
	 */
	XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL | XFS_EXTSIZE_WR);
	XFS_INODE_CLEAR_READ_AHEAD(io);

	/*
	 * For an unwritten buffer, issue the write of the buffer now;
	 * the transaction which converts the extent state is issued
	 * later, from the I/O completion path.
	 */
	ASSERT((imap[0].br_startoff <= offset_fsb) &&
	       (imap[0].br_blockcount >=
	       (offset_fsb + count_fsb - imap[0].br_startoff)));
	ASSERT(bp->b_iodone == NULL);
	bp->b_iodone = xfs_strat_write_iodone;
	xfs_check_bp(io, bp);
	xfsbdstrat(mp, bp);
	return 0;

 error0:
	bp->b_flags |= B_ERROR;
	bp->b_error = error;
	biodone(bp);
	return error;
}
#endif /* !defined(__linux__) */

/*
 * This is called to convert all delayed allocation blocks in the given
 * range back to 'holes' in the file.  It is used when a buffer will not
 * be able to be written out due to disk errors in the allocation calls.
 */
#if !defined(__linux__)
STATIC void
xfs_delalloc_cleanup(
	xfs_inode_t	*ip,
	xfs_fileoff_t	start_fsb,
	xfs_filblks_t	count_fsb)
{
	xfs_fsblock_t	first_block;
	int		nimaps;
	int		done;
	int		error;
	int		n;
#define	XFS_CLEANUP_MAPS	4
	xfs_bmbt_irec_t	imap[XFS_CLEANUP_MAPS];

	ASSERT(count_fsb < 0xffff000);
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	while (count_fsb != 0) {
		first_block = NULLFSBLOCK;
		nimaps = XFS_CLEANUP_MAPS;
		error = xfs_bmapi(NULL, ip, start_fsb, count_fsb, 0,
				  &first_block, 1, imap, &nimaps, NULL);
		if (error) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
			return;
		}

		ASSERT(nimaps > 0);
		n = 0;
		while (n < nimaps) {
			if (imap[n].br_startblock == DELAYSTARTBLOCK) {
				if (!XFS_FORCED_SHUTDOWN(ip->i_mount))
					xfs_force_shutdown(ip->i_mount,
						XFS_METADATA_IO_ERROR);
				error = xfs_bunmapi(NULL, ip,
						    imap[n].br_startoff,
						    imap[n].br_blockcount,
						    0, 1, &first_block, NULL,
						    &done);
				if (error) {
					xfs_iunlock(ip, XFS_ILOCK_EXCL);
					return;
				}
				ASSERT(done);
			}
			start_fsb += imap[n].br_blockcount;
			count_fsb -= imap[n].br_blockcount;
			ASSERT(count_fsb < 0xffff000);
			n++;
		}
	}
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
#endif /* !defined(__linux__) */
/*
 *	xfs_strat_write is called for buffered writes which
 *	require a transaction. These cases are:
 *	- Delayed allocation (since allocation now takes place).
 *	- Writing a previously unwritten extent.
 */
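/*
 * Unwritten-extent buffers (B_UNINITIAL) are handed off to
 * xfs_strat_write_unwritten() by xfs_strat_write_core(); delayed
 * allocation buffers fall through to the allocation loop there.
 */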
#if !defined(__linux__)

int
xfs_strat_write(
	xfs_iocore_t	*io,
	xfs_buf_t		*bp)
{
	xfs_inode_t	*ip;

	ip = XFS_IO_INODE(io);

	/* Now make sure we still want to write out this buffer */
	if ((ip->i_d.di_nlink == 0) && (bp->b_vp->v_flag & VINACT)) {
		bp->b_flags |= B_STALE;
		atomicAddInt(&(io->io_queued_bufs), -1);
		biodone(bp);
		return 0;
	}

	return xfs_strat_write_core(io, bp, 1);
}
#endif /* !defined(__linux__) */
#if !defined(__linux__)
int
xfs_strat_write_core(
	xfs_iocore_t	*io,
	xfs_buf_t		*bp,
	int		is_xfs)
{
	xfs_fileoff_t	offset_fsb;
	off_t		offset_fsb_bb;
	xfs_fileoff_t   map_start_fsb;
	xfs_fileoff_t	imap_offset;
	xfs_fsblock_t	first_block;
	xfs_filblks_t	count_fsb;
	xfs_extlen_t	imap_blocks;
#ifdef DEBUG
	off_t		last_rbp_offset;
	xfs_extlen_t	last_rbp_bcount;
	daddr_t		last_rbp_blkno;
#endif
	/* REFERENCED */
	int		rbp_count;
	xfs_buf_t		*rbp;
	xfs_mount_t	*mp;
	xfs_inode_t	*ip;
	xfs_trans_t	*tp;
	int		error;
	xfs_bmap_free_t	free_list;
	xfs_bmbt_irec_t	*imapp;
	int		rbp_offset;
	int		rbp_len;
	int		set_lead;
	int		s;
	/* REFERENCED */
	int		loops;
	int		imap_index;
	int		nimaps;
	int		committed;
	xfs_lsn_t	commit_lsn;
	xfs_bmbt_irec_t	imap[XFS_BMAP_MAX_NMAP];
#define	XFS_STRAT_WRITE_IMAPS	2

	/*
	 * If XFS_STRAT_WRITE_IMAPS is changed then the definition
	 * of XFS_STRATW_LOG_RES in xfs_trans.h must be changed to
	 * reflect the new number of extents that can actually be
	 * allocated in a single transaction.
	 */

	 
	XFSSTATS.xs_xstrat_bytes += bp->b_bcount;
	if ((bp->b_flags & B_UNINITIAL) == B_UNINITIAL)
		return xfs_strat_write_unwritten(io, bp);

	if (is_xfs) {
		ip = XFS_IO_INODE(io);
	}

	mp = io->io_mount;
	set_lead = 0;
	rbp_count = 0;
	error = 0;
	bp->b_flags |= B_STALE;

	/*
	 * Drop the count of queued buffers. We need to do
	 * this before the bdstrat(s) because callers of
	 * VOP_FLUSHINVAL_PAGES(), for example, may expect the queued_buf
	 * count to be down when it returns. See xfs_itruncate_start.
	 */
	atomicAddInt(&(io->io_queued_bufs), -1);
	ASSERT(io->io_queued_bufs >= 0);

	/*
	 * Don't proceed if we're forcing a shutdown.
	 * We may not have bmap'd all the blocks needed.
	 */
	if (XFS_FORCED_SHUTDOWN(mp)) {
		return xfs_bioerror_relse(bp);
	}

	if (is_xfs && XFS_IS_QUOTA_ON(mp)) {
		if (XFS_NOT_DQATTACHED(mp, ip)) {
			if (error = xfs_qm_dqattach(ip, 0)) {
				return xfs_bioerror_relse(bp);
			}
		}
	}

	/*
	 * It is possible that the buffer does not start on a block
	 * boundary in the case where the system page size is less
	 * than the file system block size.  In this case, the buffer
	 * is guaranteed to be only a single page long, so we know
	 * that we will allocate the block for it in a single extent.
	 * Thus, the looping code below does not have to worry about
	 * this case.  It is only handled in the fast path code.
	 */
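	/*
	 * For example, with 16k blocks and 4k pages, a single-page
	 * buffer at file offset 20k starts 4k into its filesystem
	 * block, so bp->b_offset differs from offset_fsb_bb while
	 * count_fsb is 1, which is what the assertion below allows.
	 */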

	ASSERT(bp->b_blkno == -1);
	offset_fsb = XFS_BB_TO_FSBT(mp, bp->b_offset);
	count_fsb = XFS_B_TO_FSB(mp, bp->b_bcount);
	offset_fsb_bb = XFS_FSB_TO_BB(mp, offset_fsb);
	ASSERT((offset_fsb_bb == bp->b_offset) || (count_fsb == 1));
	xfs_strat_write_check(io, offset_fsb,
			      count_fsb, imap,
			      XFS_STRAT_WRITE_IMAPS);
	map_start_fsb = offset_fsb;
	while (count_fsb != 0) {
		/*
		 * Set up a transaction with which to allocate the
		 * backing store for the file.  Do allocations in a
		 * loop until we get some space in the range we are
		 * interested in.  The other space that might be allocated
		 * is in the delayed allocation extent on which we sit
		 * but before our buffer starts.
		 */
		nimaps = 0;
		loops = 0;
		while (nimaps == 0) {
			if (is_xfs) {
				tp = xfs_trans_alloc(mp,
						     XFS_TRANS_STRAT_WRITE);
				error = xfs_trans_reserve(tp, 0,
						XFS_WRITE_LOG_RES(mp),
						0, XFS_TRANS_PERM_LOG_RES,
						XFS_WRITE_LOG_COUNT);
				if (error) {
					xfs_trans_cancel(tp, 0);
					bp->b_flags |= B_ERROR;
					bp->b_error = error;
					goto error0;
				}

				ASSERT(error == 0);
				xfs_ilock(ip, XFS_ILOCK_EXCL);
				xfs_trans_ijoin(tp, ip,
						XFS_ILOCK_EXCL);
				xfs_trans_ihold(tp, ip);
				xfs_strat_write_bp_trace(XFS_STRAT_ENTER,
							 ip, bp);
			} else {
				tp = NULL;
				XFS_ILOCK(mp, io, XFS_ILOCK_EXCL |
						  XFS_EXTSIZE_WR);
			}

			/*
			 * Allocate the backing store for the file.
			 */
			XFS_BMAP_INIT(&(free_list),
				      &(first_block));
			nimaps = XFS_STRAT_WRITE_IMAPS;
			error = XFS_BMAPI(mp, tp, io, map_start_fsb, count_fsb,
					  XFS_BMAPI_WRITE, &first_block, 1,
					  imap, &nimaps, &free_list);
			if (error) {
				if (is_xfs) {
					xfs_bmap_cancel(&free_list);
					xfs_trans_cancel(tp,
						 (XFS_TRANS_RELEASE_LOG_RES |
						  XFS_TRANS_ABORT));
				}
				XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL |
						    XFS_EXTSIZE_WR);
				bp->b_flags |= B_ERROR;
				bp->b_error = error;
				goto error0;
			}
			ASSERT(loops++ <=
			       (offset_fsb +
				XFS_B_TO_FSB(mp, bp->b_bcount)));
			if (is_xfs) {
				error = xfs_bmap_finish(&(tp), &(free_list),
						first_block, &committed);
				if (error) {
					xfs_bmap_cancel(&free_list);
					xfs_trans_cancel(tp,
						 (XFS_TRANS_RELEASE_LOG_RES |
						  XFS_TRANS_ABORT));
					xfs_iunlock(ip, XFS_ILOCK_EXCL);
					bp->b_flags |= B_ERROR;
					bp->b_error = error;
					goto error0;
				}

				error = xfs_trans_commit(tp,
						 XFS_TRANS_RELEASE_LOG_RES,
						 &commit_lsn);
				if (error) {
					xfs_iunlock(ip, XFS_ILOCK_EXCL);
					bp->b_flags |= B_ERROR;
					bp->b_error = error;
					goto error0;
				}

				/*
				 * write the commit lsn if requested into the
				 * place pointed at by the buffer.  This is
				 * used by IO_DSYNC writes and b_fsprivate3
				 * should be a pointer to a stack (automatic)
				 * variable.  So be *very* careful if you muck
				 * with b_fsprivate3.
				 */
				if (bp->b_fsprivate3)
					*(xfs_lsn_t *)bp->b_fsprivate3 =
								commit_lsn;
			}

			/*
			 * Before dropping the lock, clear any read-ahead
			 * state since in allocating space here we may have
			 * made it invalid.
			 */
			XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL | XFS_EXTSIZE_WR);
			XFS_INODE_CLEAR_READ_AHEAD(io);
		}

		/*
		 * This is a quick check to see if the first time through
		 * was able to allocate a single extent over which to
		 * write.
		 */
		if ((map_start_fsb == offset_fsb) &&
		    (imap[0].br_blockcount == count_fsb)) {
			ASSERT(nimaps == 1);
			/*
			 * Set the buffer's block number to match
			 * what we allocated.  If the buffer does
			 * not start on a block boundary (can only
			 * happen if the block size is larger than
			 * the page size), then make sure to add in
			 * the offset of the buffer into the file system
			 * block to the disk block number to write.
			 */
			bp->b_blkno =
				XFS_FSB_TO_DB_IO(io, imap[0].br_startblock) +
				(bp->b_offset - offset_fsb_bb);
			if (is_xfs) {
				xfs_strat_write_bp_trace(XFS_STRAT_FAST,
							 ip, bp);
			}
			xfs_check_bp(io, bp);
#ifdef XFSRACEDEBUG
			delay_for_intr();
			delay(100);
#endif
			xfsbdstrat(mp, bp);

			XFSSTATS.xs_xstrat_quick++;
			return 0;
		}

		/*
		 * Bmap couldn't manage to lay the buffer out as
		 * one extent, so we need to do multiple writes
		 * to push the data to the multiple extents.
		 * Write out the subordinate bps asynchronously
		 * and have their completion functions coordinate
		 * with the code at the end of this function to
		 * deal with marking our bp as done when they have
		 * ALL completed.
		 */
		XFSSTATS.xs_xstrat_split++;
		imap_index = 0;
		if (!set_lead) {
			bp->b_flags |= B_LEADER | B_PARTIAL;
			set_lead = 1;
		}
		while (imap_index < nimaps) {
			rbp = getrbuf(KM_SLEEP);

			imapp = &(imap[imap_index]);
			ASSERT((imapp->br_startblock !=
				DELAYSTARTBLOCK) &&
			       (imapp->br_startblock !=
				HOLESTARTBLOCK));
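			/*
			 * Compute this extent's byte offset and length
			 * within the lead buffer, then clone that region of
			 * bp into the subordinate buffer rbp and point it
			 * at the newly allocated disk blocks.
			 */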
			imap_offset = imapp->br_startoff;
			rbp_offset = XFS_FSB_TO_B(mp,
						  imap_offset -
						  offset_fsb);
			imap_blocks = imapp->br_blockcount;
			ASSERT((imap_offset + imap_blocks) <=
			       (offset_fsb +
				XFS_B_TO_FSB(mp, bp->b_bcount)));
			rbp_len = XFS_FSB_TO_B(mp,
					       imap_blocks);
			xfs_overlap_bp(bp, rbp, rbp_offset,
				       rbp_len);
			rbp->b_blkno =
				XFS_FSB_TO_DB_IO(io, imapp->br_startblock);
			rbp->b_offset = XFS_FSB_TO_BB(mp,
						      imap_offset);
			rbp->b_target = bp->b_target;
			xfs_strat_write_subbp_trace(XFS_STRAT_SUB,
						    io, bp,
						    rbp,
						    last_rbp_offset,
						    last_rbp_bcount,
						    last_rbp_blkno);
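			/*
			 * Debug-only sanity checks: each subordinate buffer
			 * must pick up exactly where the previous one left
			 * off in the file, and its disk blocks must not
			 * overlap those of the previous buffer.
			 */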
#ifdef DEBUG
			xfs_check_rbp(io, bp, rbp, 0);
			if (rbp_count > 0) {
				ASSERT((last_rbp_offset +
					BTOBB(last_rbp_bcount)) ==
				       rbp->b_offset);
				ASSERT((rbp->b_blkno <
					last_rbp_blkno) ||
				       (rbp->b_blkno >=
					(last_rbp_blkno +
					 BTOBB(last_rbp_bcount))));
				if (rbp->b_blkno <
				    last_rbp_blkno) {
					ASSERT((rbp->b_blkno +
					      BTOBB(rbp->b_bcount)) <
					       last_rbp_blkno);
				}
			}
			last_rbp_offset = rbp->b_offset;
			last_rbp_bcount = rbp->b_bcount;
			last_rbp_blkno = rbp->b_blkno;
#endif
			/*
			 * Link the buffer into the list of subordinate
			 * buffers started at bp->b_fsprivate2.  The
			 * subordinate buffers use b_fsprivate and
			 * b_fsprivate2 for back and forw pointers, but
			 * the lead buffer cannot use b_fsprivate.
			 * A subordinate buffer can always find the lead
			 * buffer by searching back through the fsprivate
			 * fields until it finds the buffer marked with
			 * B_LEADER.
			 */
			s = mutex_spinlock(&xfs_strat_lock);
			rbp->b_fsprivate = bp;
			rbp->b_fsprivate2 = bp->b_fsprivate2;
			if (bp->b_fsprivate2 != NULL) {
				((xfs_buf_t*)(bp->b_fsprivate2))->b_fsprivate =
								rbp;
			}
			bp->b_fsprivate2 = rbp;
			mutex_spinunlock(&xfs_strat_lock, s);

			rbp->b_relse = xfs_strat_write_relse;
			rbp->b_flags |= B_ASYNC;

#ifdef XFSRACEDEBUG
			delay_for_intr();
			delay(100);
#endif
			xfsbdstrat(mp, rbp);
			map_start_fsb +=
				imapp->br_blockcount;
			count_fsb -= imapp->br_blockcount;
			ASSERT(count_fsb < 0xffff000);

			imap_index++;
		}
	}

	/*
	 * Now that we've issued all the partial I/Os, check to see
	 * if they've all completed.  If they have then mark the buffer
	 * as done, otherwise clear the B_PARTIAL flag in the buffer to
	 * indicate that the last subordinate buffer to complete should
	 * mark the buffer done.  Also, drop the count of queued buffers
	 * now that we know that all the space underlying the buffer has
	 * been allocated and it has really been sent out to disk.
	 *
	 * Use set_lead to tell whether we kicked off any partial I/Os
	 * or whether we jumped here after an error before issuing any.
	 */
 error0:
	if (error) {
		ASSERT(count_fsb != 0);
		/*
		 * Since we're never going to convert the remaining
		 * delalloc blocks beneath this buffer into real blocks,
		 * get rid of them now.
		 */
		ASSERT(is_xfs || XFS_FORCED_SHUTDOWN(mp));
		if (is_xfs) {
			xfs_delalloc_cleanup(ip, map_start_fsb, count_fsb);
		}
	}
	if (set_lead) {
		s = mutex_spinlock(&xfs_strat_lock);
		ASSERT((bp->b_flags & (B_DONE | B_PARTIAL)) == B_PARTIAL);
		ASSERT(bp->b_flags & B_LEADER);
		
		if (bp->b_fsprivate2 == NULL) {
			/*
			 * All of the subordinate buffers have completed.
			 * Call iodone() to note that the I/O has completed.
			 */
			bp->b_flags &= ~(B_PARTIAL | B_LEADER);
			mutex_spinunlock(&xfs_strat_lock, s);

			biodone(bp);
			return error;
		}

		bp->b_flags &= ~B_PARTIAL;
		mutex_spinunlock(&xfs_strat_lock, s);
	} else {
		biodone(bp);
	}
	return error;
}
#endif /* !defined(__linux__) */
/*
 * Force a shutdown of the filesystem instantly while keeping
 * the filesystem consistent. We don't do an unmount here; we just shut the
 * shop down and make sure that absolutely nothing persistent happens to
 * this filesystem after this point. 
 */
#if 1 /* !defined(__linux__) */
void
xfs_force_shutdown(
	xfs_mount_t	*mp,
	int		flags)
{
	int ntries;
	int logerror;
	extern dev_t rootdev;		/* from sys/systm.h */

#define XFS_MAX_DRELSE_RETRIES	10
	logerror = flags & XFS_LOG_IO_ERROR;

	/*
	 * No need to duplicate efforts.
	 */
	if (XFS_FORCED_SHUTDOWN(mp) && !logerror)
		return;

	if (XFS_MTOVFS(mp)->vfs_dev == rootdev)
		cmn_err(CE_PANIC, "Fatal error on root filesystem");

	/*
	 * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
	 * queue up anybody new on the log reservations, and wakes up
	 * everybody who's sleeping on log reservations and tells
	 * them the bad news.
	 */
	if (xfs_log_force_umount(mp, logerror))
		return;

	if (flags & XFS_CORRUPT_INCORE)
		cmn_err(CE_ALERT,
    "Corruption of in-memory data detected.  Shutting down filesystem: %s",
			mp->m_fsname);
	else
		cmn_err(CE_ALERT,
			"I/O Error Detected.  Shutting down filesystem: %s",
			mp->m_fsname);

	cmn_err(CE_ALERT,
		"Please umount the filesystem, and rectify the problem(s)");

	/*
	 * Release all delayed write buffers for this device.
	 * It wouldn't be a fatal error if we couldn't release all
	 * delwri bufs; in general they all get unpinned eventually.
	 */
	ntries = 0;
#ifdef XFSERRORDEBUG
	{
		int nbufs;
		while (nbufs = xfs_incore_relse(mp->m_ddev_targ, 1, 0)) {
			printf("XFS: released 0x%x bufs\n", nbufs);
			if (ntries >= XFS_MAX_DRELSE_RETRIES) {
				printf("XFS: ntries 0x%x\n", ntries);
				debug("ntries");
				break;
			}
			delay(++ntries * 5);
		}
	}
#else
	while (xfs_incore_relse(mp->m_ddev_targ, 1, 0)) {
		if (ntries >= XFS_MAX_DRELSE_RETRIES)
			break;
		delay(++ntries * 5);
	}

#endif

#if CELL_CAPABLE
	if (cell_enabled && !(flags & XFS_SHUTDOWN_REMOTE_REQ)) {
		extern void cxfs_force_shutdown(xfs_mount_t *, int); /*@@@*/

		/* 
		 * We're being called for a problem discovered locally.
		 * Tell CXFS to pass along the shutdown request.
		 */
		cxfs_force_shutdown(mp, flags);
	}
#endif /* CELL_CAPABLE */
}
#endif /* !defined(__linux__) */


/*
 * Called when we want to stop a buffer from getting written or read.
 * We attach the EIO error, muck with its flags, and call biodone
 * so that the proper iodone callbacks get called.
 */
#if 1 /* !defined(__linux__) */
int
xfs_bioerror(
	xfs_buf_t *bp)
{

#ifdef XFSERRORDEBUG
	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
#endif

	/*
	 * No need to wait until the buffer is unpinned.
	 * We aren't flushing it.
	 */
    xfs_buftrace("XFS IOERROR", bp);
	XFS_BUF_ERROR(bp, EIO);
	/*
	 * We're calling biodone, so clear the B_DONE flag.  Either way
	 * we have to call the iodone callback, and calling biodone
	 * probably is the best way since it takes care of
	 * GRIO as well.
	 */
	XFS_BUF_UNREAD(bp);
	XFS_BUF_UNDELAYWRITE(bp);
	XFS_BUF_UNDONE(bp);
	XFS_BUF_STALE(bp);

	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
	xfs_biodone(bp);
	
	return (EIO);
}
#endif /* !defined(__linux__) */

/*
 * Same as xfs_bioerror, except that we are releasing the buffer
 * here ourselves, and avoiding the biodone call.
 * This is meant for userdata errors; metadata bufs come with
 * iodone functions attached, so that we can track down errors.
 */
#if 1 /* !defined(__linux__) */
int
xfs_bioerror_relse(
	xfs_buf_t *bp)
{
	int64_t fl;

	ASSERT(bp->b_iodone != xfs_buf_iodone_callbacks);
	ASSERT(bp->b_iodone != xlog_iodone);

	xfs_buftrace("XFS IOERRELSE", bp);
	fl = XFS_BUF_BFLAGS(bp);
	/*
	 * No need to wait until the buffer is unpinned.
	 * We aren't flushing it.
	 *
	 * chunkhold expects B_DONE to be set, whether
	 * we actually finish the I/O or not. We don't want to
	 * change that interface.
	 */
	XFS_BUF_UNREAD(bp);
	XFS_BUF_UNDELAYWRITE(bp);
	XFS_BUF_DONE(bp);
	XFS_BUF_STALE(bp);
	XFS_BUF_CLR_IODONE_FUNC(bp);
 	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
	if (!(fl & XFS_B_ASYNC)) {
		/*
		 * Mark b_error and B_ERROR _both_.
		 * Lots of chunkcache code assumes that.
		 * There's no reason to mark error for
		 * ASYNC buffers.
		 */
		XFS_BUF_ERROR(bp, EIO);
		XFS_BUF_V_IODONESEMA(bp);
	} else {
		xfs_buf_relse(bp);
	}
	return (EIO);
}
#endif /* !defined(__linux__) */
/*
 * Prints out an ALERT message about an I/O error.
 */
void
xfs_ioerror_alert(
	char 			*func,
	struct xfs_mount	*mp,
	dev_t			dev,
	daddr_t			blkno)
{
	cmn_err(CE_ALERT,
 "I/O error in filesystem (\"%s\") meta-data dev 0x%x block 0x%x (\"%s\")",
		mp->m_fsname, 
		dev,
		blkno,
		func);
}

/*
 * This isn't an absolute requirement, but it is
 * just a good idea to call xfs_read_buf instead of
 * directly doing a read_buf call. For one, we shouldn't
 * be doing this disk read if we are in SHUTDOWN state anyway,
 * so this stops that from happening. Secondly, this does all
 * the error checking stuff and the brelse if appropriate for
 * the caller, so the code can be a little leaner.
 */
#if 1 /* !defined(__linux__) */
int
xfs_read_buf(
	struct xfs_mount *mp,
	buftarg_t	 *target,
	daddr_t		 blkno,
	int		 len,
	uint		 flags,
	xfs_buf_t	 **bpp)
{
	xfs_buf_t		 *bp;
	int 		 error;
	
	bp = xfs_buf_read(target, blkno, len, flags);
	error = bp ? XFS_BUF_GETERROR(bp) : 0;
	if (bp && !error && !XFS_FORCED_SHUTDOWN(mp)) {
		*bpp = bp;
	} else {
		*bpp = NULL;
		if (!error)
			error = XFS_ERROR(EIO);
		if (bp) {
			XFS_BUF_UNDONE(bp);
			XFS_BUF_UNDELAYWRITE(bp);
			XFS_BUF_STALE(bp);
			/* 
			 * brelse clears B_ERROR and b_error
			 */
			xfs_buf_relse(bp);
		}
	}
	return (error);
}
#endif /* !defined(__linux__) */
	
/*
 * Wrapper around bwrite() so that we can trap 
 * write errors, and act accordingly.
 */
#if 1 /*  !defined(__linux__) */
int
xfs_bwrite(
	struct xfs_mount *mp,
	struct xfs_buf	 *bp)
{
	int	error;

	/*
	 * XXXsup how does this work for quotas.
	 */
	ASSERT(bp->b_target);
	ASSERT(bp->b_vp == NULL);
	XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
	XFS_BUF_SET_FSPRIVATE3(bp, mp);

	if (error = XFS_bwrite(bp)) {
		ASSERT(mp);
		/* 
		 * Cannot put a buftrace here since if the buffer is not 
		 * B_HOLD then we will brelse() the buffer before returning 
		 * from bwrite and we could be tracing a buffer that has 
		 * been reused.
		 */
		xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
	}
	return (error);
}
#endif /* !defined(__linux__) */
/*
 * All xfs metadata buffers except log state machine buffers
 * get this attached as their b_bdstrat callback function. 
 * This is so that we can catch a buffer after it has been prematurely
 * unpinned as part of forcibly shutting down the filesystem.
 */
#if !defined(__linux__)
int
xfs_bdstrat_cb(struct xfs_buf *bp)
{

	xfs_mount_t	*mp;

	mp = bp->b_fsprivate3;

	ASSERT(bp->b_target);
	if (!XFS_FORCED_SHUTDOWN(mp)) {
		struct bdevsw *my_bdevsw;
		my_bdevsw =  bp->b_target->bdevsw;
		ASSERT(my_bdevsw != NULL);
		bp->b_bdstrat = NULL;
		bdstrat(my_bdevsw, bp);
		return 0;
	} else { 
		xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
		/*
		 * Metadata write that didn't get logged but 
		 * written delayed anyway. These aren't associated
		 * with a transaction, and can be ignored.
		 */
		if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
		    (XFS_BUF_ISREAD(bp)) == 0)
			return (xfs_bioerror_relse(bp));
		else
			return (xfs_bioerror(bp));
	}
}
#endif /* !defined(__linux__) */
/*
 * Wrapper around bdstrat so that we can stop data
 * from going to disk in case we are shutting down the filesystem.
 * Typically user data goes thru this path; one of the exceptions
 * is the superblock.
 */
#if !defined(__linux__)
int
xfsbdstrat(
	struct xfs_mount 	*mp,
	struct xfs_buf		*bp)
{
	int		dev_major = emajor(bp->b_edev);

	ASSERT(mp);
	ASSERT(bp->b_target);
	if (!XFS_FORCED_SHUTDOWN(mp)) {
		/*
		 * We want priority I/Os to non-XLV disks to go thru'
		 * griostrategy(). The rest of the I/Os follow the normal
		 * path, and are uncontrolled. If we want to rectify
		 * that, use griostrategy2.
		 */
		if ( (XFS_BUF_IS_GRIO(bp)) &&
				(dev_major != XLV_MAJOR) ) {
			griostrategy(bp);
		} else {
			struct bdevsw	*my_bdevsw;

			my_bdevsw = bp->b_target->bdevsw;
			bdstrat(my_bdevsw, bp);
		}
		return 0;
	}

	buftrace("XFSBDSTRAT IOERROR", bp);
	return (xfs_bioerror_relse(bp));
}
#endif /* !defined(__linux__) */

/*
 * xfs_strategy
 *
 * This is where all the I/O and all the REAL allocations take
 * place.  For buffers with -1 for their b_blkno field, we need
 * to do a bmap to figure out what to do with them.  If it's a
 * write we may need to do an allocation, while if it's a read
 * we may either need to read from disk or do some block zeroing.
 * If b_blkno specifies a real but previously unwritten block,
 * a transaction needs to be initiated to mark the block as
 * initialized. If b_blkno specifies a real block, then all
 * we need to do is pass the buffer on to the underlying driver.
 */
#if !defined(__linux__)
void
xfs_strategy(
	bhv_desc_t	*bdp,
	xfs_buf_t		*bp)
{
	xfs_inode_t	*ip;

	ip = XFS_BHVTOI(bdp);

	xfs_strat_core(&ip->i_iocore, bp);
}
#endif /* !defined(__linux__) */

#if !defined(__linux__)
void
xfs_strat_core(
	xfs_iocore_t	*io,
	xfs_buf_t		*bp)
{
	int		s;
	xfs_mount_t	*mp;

	mp = io->io_mount;
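	/*
	 * I/O to the data device uses the data device target; anything
	 * else is for the realtime device and uses the realtime target.
	 */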
	bp->b_target = (bp->b_edev == mp->m_dev) ? mp->m_ddev_targp :
						   &mp->m_rtdev_targ;
	/*
	 * If this is just a buffer whose underlying disk space
	 * is already allocated, then just do the requested I/O.
	 */
	buftrace("XFS_STRATEGY", bp);
	if (bp->b_blkno >= 0 && !(bp->b_flags & B_UNINITIAL)) {
		xfs_check_bp(io, bp);
		/*
		 * XXXsup We should probably ignore FORCED_SHUTDOWN
		 * in this case. The disk space is already allocated,
		 * and this seems like data loss that can be avoided.
		 * But I need to test it first.
		 */
		(void) xfsbdstrat(mp, bp);
		return;
	}

	/*
	 * If we're reading, then we need to find out how the
	 * portion of the file required for this buffer is laid
	 * out and zero/read in the appropriate data.
	 */
	if (bp->b_flags & B_READ) {
		xfs_strat_read(io, bp);
		return;
	}

	if (XFS_FORCED_SHUTDOWN(mp)) {
		xfs_bioerror_relse(bp);
		return;
	}
	/*
	 * Here we're writing the file and probably need to allocate
	 * some underlying disk space or to mark it as initialized.
	 * If the buffer is being written asynchronously by bdflush()
	 * then we queue it for the xfsds so that we won't put
	 * bdflush() to sleep.
	 */
	if ((bp->b_flags & (B_ASYNC | B_BDFLUSH)) == (B_ASYNC | B_BDFLUSH) &&
	    (xfsd_count > 0)) {
		s = mp_mutex_spinlock(&xfsd_lock);
		/*
		 * Queue the buffer at the end of the list.
		 * Bump the inode count of the number of queued buffers.
		 */
		if (xfsd_list == NULL) {
			bp->av_forw = bp;
			bp->av_back = bp;
			xfsd_list = bp;
		} else {
			bp->av_back = xfsd_list->av_back;
			xfsd_list->av_back->av_forw = bp;
			xfsd_list->av_back = bp;
			bp->av_forw = xfsd_list;
		}
		/*
		 * Store the behavior pointer where the xfsds can find
		 * it so that we don't have to lookup the XFS behavior
		 * from the vnode in the buffer again.
		 */
		bp->b_private = io;
		xfsd_bufcount++;
		ASSERT(io->io_queued_bufs >= 0);
		atomicAddInt(&(io->io_queued_bufs), 1);
		(void)sv_signal(&xfsd_wait);
		mp_mutex_spinunlock(&xfsd_lock, s);
	} else {
		/*
		 * We're not going to queue it for the xfsds, but bump the
		 * inode's count anyway so that we can tell that this
		 * buffer is still on its way out.
		 */
		ASSERT(io->io_queued_bufs >= 0);
		atomicAddInt(&(io->io_queued_bufs), 1);
		XFS_STRAT_WRITE(mp, io, bp);
	}
}
#endif /* !defined(__linux__) */
/*
 * This is called from xfs_init() to start the xfs daemons.
 * We'll start with a minimum of 4 of them, and add 1 for each
 * additional 128 MB of memory, up to a maximum of 13 daemons.
 * That should be enough.
 */
#if !defined(__linux__)
void
xfs_start_daemons(void)
{
	int	num_daemons;
	int	i;
	int	num_pages;
	extern int xfsd_pri;
	st_func_t	*func;

#define XFSD_SSIZE	(2*KTHREAD_DEF_STACKSZ)

	func = (st_func_t *)xfs_strat_comp;

	num_daemons = 4;
#if _MIPS_SIM != _ABI64
	/*
	 * For small memory systems we reduce the number of daemons
	 * to conserve memory.  For systems with less than 32 MB of
	 * memory we go with 2 daemons, and for systems with less
	 * than 48 MB of memory we go with 3.
	 */
	if (physmem < 8192) {
		num_daemons = 2;
	} else if (physmem < 12288) {
		num_daemons = 3;
	}
#endif
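	/*
	 * Assuming the 4 KB page size implied by the physmem thresholds
	 * above, 32768 pages is 128 MB: add one more daemon for each
	 * additional 128 MB of memory, capping the total at 13.
	 */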
	num_pages = (int)physmem - 32768;
	while ((num_pages > 0) && (num_daemons < 13)) {
		num_pages -= 32768;
		num_daemons++;
	}
	ASSERT(num_daemons <= 13);

	/*
	 * Start the xfs_strat_completion daemon, and the 
	 * xfsds. For now, use the same priority.
	 */
	sthread_create("xfsc", 0, XFSD_SSIZE, 0, xfsd_pri, KT_PS,
			(st_func_t *)func, 0, 0, 0, 0);
	for (i = 0; i < num_daemons; i++) {
		sthread_create("xfsd", 0, XFSD_SSIZE, 0, xfsd_pri, KT_PS,
				(st_func_t *)xfsd, 0, 0, 0, 0);
	}
#undef XFSD_SSIZE
	return;
}
#endif /* !defined(__linux__) */


#define MAX_BUF_EXAMINED 10

/*
 * This function purges the xfsd list of all bufs belonging to the
 * specified vnode.  This allows a file about to be deleted to
 * remove its buffers from the xfsd_list so it doesn't have to wait
 * for them to be pushed out to disk.
 */
#if !defined(_USING_PAGEBUF_T) 
void 
_xfs_xfsd_list_evict(bhv_desc_t * bdp)
{
	vnode_t		*vp;
	xfs_iocore_t	*io;
	
	int	s;
	int	cur_count;	/* 
				 * Count of buffers that have been processed
				 * since acquiring the spin lock
				 */
	int countdown;		/* 
				 * Count of buffers that have been processed
				 * since we started.  This will prevent non-
				 * termination if buffers are being added to
				 * the head of the list 
				 */
	xfs_buf_t	*bp;
	xfs_buf_t	*forw;
	xfs_buf_t	*back;
	xfs_buf_t	*next_bp;
	
	/* List and count of the saved buffers */
	xfs_buf_t	*bufs;
	unsigned int bufcount;

	/* Marker Buffers */
	xfs_buf_t	*cur_marker;
	xfs_buf_t	*end_marker;
	
	vp = BHV_TO_VNODE(bdp);
	
	/* Initialize bufcount and bufs */
	bufs = NULL;
	bufcount = 0;

	/* Allocate both markers at once... it's a little nicer. */
	cur_marker = (xfs_buf_t *)kmem_alloc(sizeof(xfs_buf_t)*2, KM_SLEEP);
	
	/* A little sketchy pointer-math, but should be ok. */
	end_marker = cur_marker + 1;

	s = mp_mutex_spinlock(&xfsd_lock);
	
	/* Make sure there are buffers to check */
	if (xfsd_list == NULL) {
		mp_mutex_spinunlock(&xfsd_lock, s);
		
		kmem_free(cur_marker, sizeof(xfs_buf_t)*2);
		return;
	}

	/* 
	 * Ok.  We know we're going to use the markers now, so let's 
	 * actually initialize them.  At Ray's suggestion, we'll use
	 * b_vp == -1 as the signifier that this is a marker.  We
	 * know a marker is unlinked if its av_forw and av_back pointers
	 * point back to the marker itself.
	 */

	cur_marker->b_vp = (vnode_t *)-1L;
	end_marker->b_vp = (vnode_t *)-1L;

	/* Now link end_marker onto the end of the list */

	end_marker->av_back = xfsd_list->av_back;
	xfsd_list->av_back->av_forw = end_marker;
	end_marker->av_forw = xfsd_list;
	xfsd_list->av_back = end_marker;

	xfsd_bufcount++;

	/* 
	 * Set the countdown to its initial value.  This will be a snapshot
	 * of the size of the list.  If we process this many buffers without
	 * finding the end_marker, then someone is putting buffers onto the
	 * head of the list.
	 */
	countdown = xfsd_bufcount;
	
	/* Zero the initial count */
	cur_count = 0;

	bp = xfsd_list;
 
	/* 
	 * Loop: Assumptions: the end_marker has been set, bp is set to the 
	 * current buf being examined, the xfsd_lock is held, cur_marker is
	 * <not> linked into the list.
	 */
	while (1) {
		/* We are processing a buffer.  Take note of that fact. */
		cur_count++;
		countdown--;
		
		if (bp == end_marker) {
			/* Unlink it from the list */
			
			/* 
			 * If it's the only thing on the list, NULL the 
			 * xfsd_list. Otherwise, unlink normally 
			 */
			if (bp->av_forw == bp) {
				xfsd_list = NULL;
			} else {
				forw = bp->av_forw;
				back = bp->av_back;
				forw->av_back = back;
				back->av_forw = forw;
			}
			
			xfsd_bufcount--;

			/* Move the head of the list forward if necessary */
			if (bp == xfsd_list)
				xfsd_list = bp->av_forw;
			
			break;
		}

		/* Check to see if this buffer should be removed */
		if (bp->b_vp == vp) {
			next_bp = bp->av_forw;
	    
			/* Remove the buffer from the xfsd_list */
			forw = bp->av_forw;
			back = bp->av_back;
			forw->av_back = back;
			back->av_forw = forw;

			/* 
			 * If we removed the head of the xfsd_list, move 
			 * it forward. 
			 */
			if (xfsd_list == bp) xfsd_list = next_bp;

			xfsd_bufcount--;
			
			/* 
			 * We can't remove all of the list, since we know 
			 * we have yet to see the end_marker; that's the
			 * only buffer we'll see that MIGHT be the sole
			 * occupant of the list.
			 */
			   
			/* Now add the buffer to the list of buffers to free */
			if (bufcount > 0) {			
				bufs->av_back->av_forw = bp;
				bp->av_back = bufs->av_back;
				bufs->av_back = bp;
				bp->av_forw = bufs;
				
				bufcount++;
			} else {
				bufs = bp;
				bp->av_forw = bp;
				bp->av_back = bp;
				
				bufcount = 1;
			}
			
			bp = next_bp;
		} else {
			bp = bp->av_forw;
		}
		
		/* Now, bp has been advanced. */
		
		/* Now before we iterate, make sure we haven't run too long */
		if (cur_count > MAX_BUF_EXAMINED) {
			/* 
			 * Stick the cur_marker into the current pos in the 
			 * list.  The only special case is if the current bp
			 * is the head of the list, in which case we have to
			 * repoint the head of the list at the marker.
			 */
			
			/* First, link the cur_marker before the new bp */
			cur_marker->av_forw = bp;
			cur_marker->av_back = bp->av_back;
			bp->av_back->av_forw = cur_marker;
			bp->av_back = cur_marker;
			
			xfsd_bufcount++;
			
			if (bp == xfsd_list)
				xfsd_list = cur_marker;
			
			/* Now, it's safe to release the lock... */
			mp_mutex_spinunlock(&xfsd_lock, s);
			
			/*
			 * Kill me! I'm here! KILL ME!!! (If an interrupt
			 * needs to happen, it can.  Now we won't blow
			 * realtime ;-).
			 */
			
			s = mp_mutex_spinlock(&xfsd_lock);
			
			/* Zero the current count */
			cur_count = 0;
			
			/* 
			 * Figure out if we SHOULD continue (if the end_marker
			 * has been removed, give up, unless it's the head of
			 * the otherwise empty list, since it's about to be
			 * dequeued and then we'll stop).
			 */
			if ((end_marker->av_forw == end_marker) && 
				(xfsd_list != end_marker)) {
				break;
			}
			
			/*
			 * Now determine if we should start at the marker or
			 * from the beginning of the list.  It can't be the
			 * only thing on the list, since the end_marker should
			 * be there too.
			 */
			if (cur_marker->av_forw == cur_marker) {
				bp = xfsd_list;
			} else {
				bp = cur_marker->av_forw;
				
				/* 
				 * Now dequeue the marker.  It might be the 
				 * head of the list, so we might have to move
				 * the list head... 
				 */
				
				forw = cur_marker->av_forw;
				back = cur_marker->av_back;
				forw->av_back = back;
				back->av_forw = forw;
				
				if (cur_marker == xfsd_list)
					xfsd_list = bp;
				
				xfsd_bufcount--;
				
				/* 
				 * We know we can't be the only buffer on the 
				 * list, as previously stated... so we 
				 * continue. 
				 */
			}
		}

		/*
		 * If countdown reaches zero without breaking by dequeuing the
		 * end_marker, someone is putting things on the front of the 
		 * list, so we'll quit to thwart their cunning attempt to keep
		 * us tied up in the list.  So dequeue the end_marker now and
		 * break.
		 */
		   
		/*
		 * Invariants at this point:  The end_marker is on the list.
		 * It may or may not be the head of the list.  cur_marker is
		 * NOT on the list.
		 */
		if (countdown == 0) {
			/* Unlink the end_marker from the list */
			
			if (end_marker->av_forw == end_marker) {
				xfsd_list = NULL;
			} else {
				forw = end_marker->av_forw;
				back = end_marker->av_back;
				forw->av_back = back;
				back->av_forw = forw;
			}
			
			xfsd_bufcount--;

			/* Move the head of the list forward if necessary */
			if (end_marker == xfsd_list) xfsd_list = end_marker->av_forw;
			
			break;
		}
	}
	
	mp_mutex_spinunlock(&xfsd_lock, s);
	kmem_free(cur_marker, sizeof(xfs_buf_t)*2);	
	
	/*
	 * At this point, bufs contains the list of buffers that would have
	 * been written to disk if we hadn't swiped them (which we did
	 * because they are part of a file being deleted, so they obviously
	 * shouldn't go to disk).  Now we need to mark them as done.
	 */
	
	bp = bufs;
	
	/* We use s as a counter.  It's handy, so there. */
	for (s = 0; s < bufcount; s++) {
		next_bp = bp->av_forw;
		
		bp->av_forw = bp;
		bp->av_back = bp;
		
		XFS_BUF_STALE(bp);
		io = bp->b_private;
		atomicAddInt(&(io->io_queued_bufs), -1);
		bp->b_private = NULL;
		
		/* Now call biodone. */
		biodone(bp);
		
		bp = next_bp;
	}
}
#endif /* !defined(_USING_PAGEBUF_T) */
/*
 * This is the main loop for the xfs daemons.
 * From here they wait in a loop for buffers which will
 * require transactions to write out and process them as they come.
 * This way we never force bdflush() to wait on one of our transactions,
 * thereby keeping the system happier and preventing buffer deadlocks.
 */
#if !defined(__linux__)
STATIC int
xfsd(void)
{
	int		s;
	xfs_buf_t		*bp;
	xfs_buf_t		*forw;
	xfs_buf_t		*back;
	xfs_iocore_t	*io;

#ifdef __linux__
	daemonize();
	set_thread_name("xfsd");
#endif /* __linux__ */

	s = mp_mutex_spinlock(&xfsd_lock);
	xfsd_count++;

	while (1) {
		while (xfsd_list == NULL) {
			mp_sv_wait(&xfsd_wait, PRIBIO, &xfsd_lock, s);
			s = mp_mutex_spinlock(&xfsd_lock);
		}

		/*
		 * Pull a buffer off of the list.
		 */
		bp = xfsd_list;
		forw = bp->av_forw;
		back = bp->av_back;
		forw->av_back = back;
		back->av_forw = forw;
		if (forw == bp) {
			xfsd_list = NULL;
		} else {
			xfsd_list = forw;
		}
		xfsd_bufcount--;
		ASSERT(xfsd_bufcount >= 0);

		bp->av_forw = bp;
		bp->av_back = bp;

		/* Now make sure we didn't just process a marker */
		if (bp->b_vp == (vnode_t *)-1L) {
			continue;
		}
		/*
		 * Don't give up the xfsd_lock until we have set the
		 * av_forw and av_back pointers, because otherwise 
		 * xfs_xfsd_list_evict() might be racing with us on
		 * a marker that it placed and we just removed.
		 */
		mp_mutex_spinunlock(&xfsd_lock, s);

		ASSERT((bp->b_flags & (B_BUSY | B_ASYNC | B_READ)) ==
		       (B_BUSY | B_ASYNC));
		XFSSTATS.xs_xfsd_bufs++;
		io = bp->b_private;
		bp->b_private = NULL;
		XFS_STRAT_WRITE(io->io_mount, io, bp);

		s = mp_mutex_spinlock(&xfsd_lock);
	}
}
#endif /* !defined(__linux__) */
/*
 * xfs_inval_cached_pages()
 * This routine is responsible for keeping direct I/O and buffered I/O
 * somewhat coherent.  From here we make sure that we're at least
 * temporarily holding the inode I/O lock exclusively and then call
 * the page cache to flush and invalidate any cached pages.  If there
 * are no cached pages this routine will be very quick.
 */
#if 1 /* !defined(__linux__) */
void
xfs_inval_cached_pages(
	vnode_t		*vp,
	xfs_iocore_t	*io,
	off_t		offset,
	off_t		len,
	void		*dio)		    
{
	xfs_dio_t	*diop = (xfs_dio_t *)dio;
	int		relock;
	__uint64_t	flush_end;
	xfs_mount_t	*mp;

	if (!VN_CACHED(vp)) {
		return;
	}

	mp = io->io_mount;

	/*
	 * We need to get the I/O lock exclusively in order
	 * to safely invalidate pages and mappings.
	 */
	relock = ismrlocked(io->io_iolock, MR_ACCESS);
	if (relock) {
		XFS_IUNLOCK(mp, io, XFS_IOLOCK_SHARED);
		XFS_ILOCK(mp, io, XFS_IOLOCK_EXCL);
	}

	/* Writing beyond EOF creates a hole that must be zeroed */
	if (diop && (offset > XFS_SIZE(mp, io))) {
		xfs_fsize_t	isize;

		XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
		isize = XFS_SIZE(mp, io);
		if (offset > isize) {
			xfs_zero_eof(vp, io, offset, isize, diop->xd_cr,
					diop->xd_pmp);
		}
		XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
	}

	/*
	 * Round up to the next page boundary and then back
	 * off by one byte.  We back off by one because this
	 * is a first byte/last byte interface rather than
	 * a start/len interface.  We round up to a page
	 * boundary because the page/chunk cache code is
	 * slightly broken and won't invalidate all the right
	 * buffers otherwise.
	 *
	 * We also have to watch out for overflow, so if we
	 * go over the maximum off_t value we just pull back
	 * to that max.
	 */
	flush_end = (__uint64_t)ctooff(offtoc(offset + len)) - 1;
	if (flush_end > (__uint64_t)LONGLONG_MAX) {
		flush_end = LONGLONG_MAX;
	}
	xfs_inval_cached_trace(io, offset, len, ctooff(offtoct(offset)),
		flush_end);
	VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(offset)), (off_t)flush_end,
			FI_REMAPF_LOCKED);
	if (relock) {
		XFS_IUNLOCK(mp, io, XFS_IOLOCK_EXCL);
		XFS_ILOCK(mp, io, XFS_IOLOCK_SHARED);
	}
}
#endif /* !defined(__linux__) */
/*
 * A user has written some portion of a realtime extent.  We need to zero
 * what remains, so the caller can mark the entire realtime extent as
 * written.  This is only used for filesystems that don't support unwritten
 * extents.
 */
#if  !defined(__linux__) 
STATIC int
xfs_dio_write_zero_rtarea(
	xfs_inode_t	*ip,
	struct xfs_buf	*bp,
	xfs_fileoff_t	offset_fsb,
	xfs_filblks_t	count_fsb)
{
	char		*buf;
	long		bufsize, remain_count;
	int		error;
	xfs_mount_t	*mp;
	struct bdevsw	*my_bdevsw;
	xfs_bmbt_irec_t	imaps[XFS_BMAP_MAX_NMAP], *imapp;
	xfs_buf_t		*nbp;
	int		reccount, sbrtextsize;
	xfs_fsblock_t	firstfsb;
	xfs_fileoff_t	zero_offset_fsb, limit_offset_fsb;
	xfs_fileoff_t	orig_zero_offset_fsb;
	xfs_filblks_t	zero_count_fsb;

	ASSERT(ip->i_d.di_flags & XFS_DIFLAG_REALTIME);
	mp = ip->i_mount;
	sbrtextsize = mp->m_sb.sb_rextsize;
	/* Arbitrarily limit the buffer size to 32 FS blocks or less. */
	if (sbrtextsize <= 32)
		bufsize = XFS_FSB_TO_B(mp, sbrtextsize);
	else
		bufsize = mp->m_sb.sb_blocksize * 32;
	ASSERT(sbrtextsize > 0 && bufsize > 0);
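	/*
	 * Round the zeroing range out to whole realtime extents.  For
	 * example, with sbrtextsize == 16 and a write covering blocks
	 * 20..27, zero_offset_fsb becomes 16 and limit_offset_fsb 32.
	 */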
	limit_offset_fsb = (((offset_fsb + count_fsb + sbrtextsize - 1)
				/ sbrtextsize ) * sbrtextsize );
	zero_offset_fsb = offset_fsb - (offset_fsb % sbrtextsize);
	orig_zero_offset_fsb = zero_offset_fsb;
	zero_count_fsb = limit_offset_fsb - zero_offset_fsb;
	reccount = 1;

	/* Discover the full realtime extent affected */

	error = xfs_bmapi(NULL, ip, zero_offset_fsb, 
			  zero_count_fsb, 0, &firstfsb, 0, imaps, 
			  &reccount, 0);
	imapp = &imaps[0];
	if (error)
		return error;

	buf = (char *)kmem_alloc(bufsize, KM_SLEEP|KM_CACHEALIGN);
	bzero(buf, bufsize);
	nbp = getphysbuf(bp->b_edev);
	nbp->b_grio_private = bp->b_grio_private;
						/* b_iopri */
     	nbp->b_error     = 0;
	nbp->b_edev	 = bp->b_edev;
	XFS_BUF_PTR(nbp) = buf;
	my_bdevsw	 = get_bdevsw(nbp->b_edev);
	ASSERT(my_bdevsw != NULL);

	/* Loop while there are blocks that need to be zero'ed */

	while (zero_offset_fsb < limit_offset_fsb) {
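		/*
		 * Zero only the pieces of the realtime extent that the
		 * caller is not writing: the region before offset_fsb and
		 * the region at or beyond offset_fsb + count_fsb.  The
		 * range being written itself is skipped.
		 */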
		remain_count = 0;
		if (zero_offset_fsb < offset_fsb)
			remain_count = offset_fsb - zero_offset_fsb;
		else if (zero_offset_fsb >= (offset_fsb + count_fsb))
			remain_count = limit_offset_fsb - zero_offset_fsb;
		else {
			zero_offset_fsb += count_fsb;
			continue;
		}
		remain_count = XFS_FSB_TO_B(mp, remain_count);
		XFS_BUF_BFLAGS(nbp)  = XFS_BUF_BFLAGS(bp);
		nbp->b_bcount    = (bufsize < remain_count) ? bufsize :
						remain_count;
 	    	nbp->b_error     = 0;
		nbp->b_blkno     = XFS_FSB_TO_BB(mp, imapp->br_startblock +
				    (zero_offset_fsb - orig_zero_offset_fsb));
		(void) bdstrat(my_bdevsw, nbp);
		if ((error = geterror(nbp)) != 0)
			break;
		biowait(nbp);
		/* Stolen directly from xfs_dio_write */
		nbp->b_flags &= ~B_GR_BUF;	/* Why? B_PRV_BUF? */
		if ((error = geterror(nbp)) != 0)
			break;
		else if (nbp->b_resid)
			nbp->b_bcount -= nbp->b_resid;
			
		zero_offset_fsb += XFS_B_TO_FSB(mp, nbp->b_bcount);
	}
	/* Clean up for the exit */
	nbp->b_flags		= 0;
	nbp->b_bcount		= 0;
	XFS_BUF_PTR(nbp)	= 0;
	nbp->b_grio_private	= 0;	/* b_iopri */
 	putphysbuf( nbp );
	kmem_free(buf, bufsize);

	return error;
}
#endif /* !defined(__linux__) */

/*
 * xfs_dio_read()
 *	This routine issues the calls to the disk device strategy routine
 *	for file system reads made using direct I/O from user space.
 *	The I/Os for each extent involved are issued at once.
 *
 * RETURNS:
 *	error 
 */
#if !defined(__linux__) 
int
xfs_dio_read(
	xfs_dio_t *diop)
{
	xfs_buf_t		*bp;
	xfs_iocore_t 	*io;
	xfs_trans_t	*tp;
	xfs_mount_t	*mp;
	xfs_bmbt_irec_t	imaps[XFS_BMAP_MAX_NMAP], *imapp;
	xfs_buf_t		*bps[XFS_BMAP_MAX_NMAP], *nbp;
	xfs_fileoff_t	offset_fsb;
	xfs_fsblock_t	firstfsb;
	xfs_filblks_t	count_fsb;
	xfs_bmap_free_t free_list;
	caddr_t		base;
	ssize_t		resid, count, totxfer;
	off_t		offset, offset_this_req, bytes_this_req, trail = 0;
	int		i, j, error, reccount;
	int		end_of_file, bufsissued, totresid;
	int		blk_algn, rt;
	int		unwritten;
	uint		lock_mode;
	xfs_fsize_t	new_size;

	bp = diop->xd_bp;
	io = diop->xd_io;
	mp = io->io_mount;
	blk_algn = diop->xd_blkalgn;
	base = XFS_BUF_PTR(bp);

	CHECK_GRIO_TIMESTAMP(bp, 40);
	
	error = resid = totxfer = end_of_file = 0;
	offset = BBTOOFF((off_t)bp->b_blkno);
	totresid = count = bp->b_bcount;

	/*
 	 * Determine if this file is using the realtime volume.
	 */
	rt = (io->io_flags & XFS_IOCORE_RT);

	/*
	 * Process the request until:
	 * 1) an I/O error occurs
	 * 2) end of file is reached.
	 * 3) end of device (driver error) occurs
	 * 4) request is completed.
	 */
	while (!error && !end_of_file && !resid && count) {
		offset_fsb = XFS_B_TO_FSBT(mp, offset);
		count_fsb  = XFS_B_TO_FSB(mp, count);

		tp = NULL;
		unwritten = 0;
retry:
		XFS_BMAP_INIT(&free_list, &firstfsb);
		/*
		 * Read requests will be issued 
		 * up to XFS_BMAP_MAX_MAP at a time.
		 */
		reccount = XFS_BMAP_MAX_NMAP;
		imapp = &imaps[0];
		CHECK_GRIO_TIMESTAMP(bp, 40);

		lock_mode = XFS_LCK_MAP_SHARED(mp, io);

		CHECK_GRIO_TIMESTAMP(bp, 40);

		/*
 		 * Issue the bmapi() call to get the extent info.
		 */
		CHECK_GRIO_TIMESTAMP(bp, 40);
		error = XFS_BMAPI(mp, tp, io, offset_fsb, count_fsb, 
				  0, &firstfsb, 0, imapp,
				  &reccount, &free_list);
		CHECK_GRIO_TIMESTAMP(bp, 40);

		XFS_UNLK_MAP_SHARED(mp, io, lock_mode);
		if (error)
			break;

                /*
                 * xfs_bmapi() did not return an error but the 
 		 * reccount was zero. This means that a delayed write is
		 * in progress and it is necessary to call xfs_bmapi() again
		 * to map the correct portion of the file.
                 */
                if ((!error) && (reccount == 0)) {
			goto retry;
                }

		/*
   		 * Run through each extent.
		 */
		bufsissued = 0;
		for (i = 0; (i < reccount) && (!end_of_file) && (count);
		     i++) {
			imapp = &imaps[i];
			unwritten = !(imapp->br_state == XFS_EXT_NORM);

			bytes_this_req =
				XFS_FSB_TO_B(mp, imapp->br_blockcount) -
				BBTOB(blk_algn);

			ASSERT(bytes_this_req);

			offset_this_req =
				XFS_FSB_TO_B(mp, imapp->br_startoff) +
				BBTOB(blk_algn); 

			/*
			 * Reduce request size, if it
			 * is longer than user buffer.
			 */
			if (bytes_this_req > count) {
				 bytes_this_req = count;
			}

			/*
			 * Check if this is the end of the file.
			 */
			new_size = offset_this_req + bytes_this_req;
			if (new_size > XFS_SIZE(mp, io)) {
				xfs_fsize_t	isize;

				/*
 			 	 * If trying to read past end of
 			 	 * file, shorten the request size.
				 */
				XFS_ILOCK(mp, io, XFS_ILOCK_SHARED);
				isize = XFS_SIZE(mp, io);
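				/*
				 * trail is the exact byte count up to EOF;
				 * the I/O size below is rounded up to a
				 * whole basic block so the last partial
				 * block is still read, and trail is used
				 * later so the rounded-up tail is not
				 * counted in the bytes transferred.
				 */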
				if (new_size > isize) {
				   if (isize > offset_this_req) {
					trail = isize - offset_this_req;
					bytes_this_req = trail;
					bytes_this_req &= ~BBMASK;
					bytes_this_req += BBSIZE;
				   } else {
					bytes_this_req =  0;
				   }

				   end_of_file = 1;

				   if (!bytes_this_req) {
					XFS_IUNLOCK(mp, io, XFS_ILOCK_SHARED);
					break;
				   }
				}
				XFS_IUNLOCK(mp, io, XFS_ILOCK_SHARED);
			}

			/*
 			 * Do not do I/O if there is a hole in the file.
			 * Do not read if the blocks are unwritten.
			 */
			if ((imapp->br_startblock == HOLESTARTBLOCK) ||
			    unwritten) {
				/*
 				 * Physio() has already mapped user address.
				 */
				bzero(base, bytes_this_req);

				/*
				 * Bump the transfer count.
				 */
				if (trail) 
					totxfer += trail;
				else
					totxfer += bytes_this_req;
			} else {
				/*
 				 * Setup I/O request for this extent.
				 */
				CHECK_GRIO_TIMESTAMP(bp, 40);
	     			bps[bufsissued++]= nbp = getphysbuf(bp->b_edev);
				CHECK_GRIO_TIMESTAMP(bp, 40);

	     			nbp->b_flags     = bp->b_flags;
				nbp->b_grio_private = bp->b_grio_private;
								/* b_iopri */

	     			nbp->b_error     = 0;
				nbp->b_target    = bp->b_target;
				if (rt) {
	     				nbp->b_blkno = XFS_FSB_TO_BB(mp,
						imapp->br_startblock);
				} else {
	     				nbp->b_blkno = XFS_FSB_TO_DADDR(mp,
						imapp->br_startblock) + 
						blk_algn;
				}
				ASSERT(bytes_this_req);
	     			nbp->b_bcount    = bytes_this_req;
	     			XFS_BUF_PTR(nbp) = base;
				/*
 				 * Issue I/O request.
				 */
				CHECK_GRIO_TIMESTAMP(nbp, 40);
				(void) xfsbdstrat(mp, nbp);
				
		    		if (error = geterror(nbp)) {
					biowait(nbp);
					nbp->b_flags = 0;
		     			XFS_BUF_PTR(nbp) = 0;
					nbp->b_grio_private = 0; /* b_iopri */
					putphysbuf( nbp );
					bps[bufsissued--] = 0;
					break;
		     		}
			}

			/*
			 * update pointers for next round.
			 */

	     		base   += bytes_this_req;
	     		offset += bytes_this_req;
	     		count  -= bytes_this_req;
			blk_algn= 0;

		} /* end of for loop */

		/*
		 * Wait for I/O completion and recover buffers.
		 */
		for (j = 0; j < bufsissued ; j++) {
	  		nbp = bps[j];
	    		biowait(nbp);
			nbp->b_flags &= ~B_GR_BUF;	/* Why? B_PRV_BUF? */

	     		if (!error)
				error = geterror(nbp);

	     		if (!error && !resid) {
				resid = nbp->b_resid;

				/*
				 * prevent adding up partial xfers
				 */
				if (trail && (j == (bufsissued -1 ))) {
					if (resid <= (nbp->b_bcount - trail) )
						totxfer += trail;
				} else {
					totxfer += (nbp->b_bcount - resid);
				}
			} 
	    	 	nbp->b_flags		= 0;
	     		nbp->b_bcount		= 0;
	     		XFS_BUF_PTR(nbp)	= 0;
	     		nbp->b_grio_private	= 0; /* b_iopri */
	    	 	putphysbuf( nbp );
	     	}
	} /* end of while loop */

	/*
 	 * Fill in resid count for original buffer.
	 * If any of the I/Os fail, the whole thing fails.
	 */
	if (error) {
		totxfer = 0;
	}

	bp->b_resid = totresid - totxfer;

	return (error);
}
#endif /* !defined(__linux__) */


/*
 * xfs_dio_write()
 *	This routine issues the calls to the disk device strategy routine
 *	for file system writes made using direct I/O from user space. The
 *	I/Os are issued one extent at a time.
 *
 * RETURNS:
 *	error
 */
#if !defined(__linux__)  
int
xfs_dio_write(
	xfs_dio_t *diop)
{
	xfs_buf_t		*bp;
	xfs_iocore_t	*io;
	xfs_inode_t 	*ip;
	xfs_trans_t	*tp;
			/* REFERENCED */
	vnode_t		*vp;
	xfs_mount_t	*mp;
	xfs_bmbt_irec_t	imaps[XFS_BMAP_MAX_NMAP], *imapp;
	xfs_buf_t		*nbp;
	xfs_fileoff_t	offset_fsb;
	xfs_fsblock_t	firstfsb;
	xfs_filblks_t	count_fsb, datablocks;
	xfs_bmap_free_t free_list;
	caddr_t		base;
	ssize_t		resid, count, totxfer;
	off_t		offset, offset_this_req, bytes_this_req;
	int		error, reccount, bmapi_flag, ioexcl;
	int		end_of_file, totresid, exist;
	int		blk_algn, rt, numrtextents, sbrtextsize, iprtextsize;
	int		committed, unwritten, using_quotas, nounwritten;
	xfs_fsize_t	new_size;
	int		nres;

	bp = diop->xd_bp;
	vp = BHV_TO_VNODE(diop->xd_bdp);
	io = diop->xd_io;
	blk_algn = diop->xd_blkalgn;
	mp = io->io_mount;
	ip = XFS_IO_INODE(io);

	base = XFS_BUF_PTR(bp);
	error = resid = totxfer = end_of_file = ioexcl = 0;
	offset = BBTOOFF((off_t)bp->b_blkno);
	numrtextents = iprtextsize = sbrtextsize = 0;
	totresid = count = bp->b_bcount;

	/*
 	 * Determine if this file is using the realtime volume.
	 */
	if ((rt = ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
		sbrtextsize = mp->m_sb.sb_rextsize;
		iprtextsize =
			ip->i_d.di_extsize ? ip->i_d.di_extsize : sbrtextsize;
	}
	if (using_quotas = XFS_IS_QUOTA_ON(mp)) {
		if (XFS_NOT_DQATTACHED(mp, ip)) {
			if (error = xfs_qm_dqattach(ip, 0)) 
				goto error0;
		}
	}
	nounwritten = XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) == 0;

	/*
	 * Process the request until:
	 * 1) an I/O error occurs
	 * 2) end of file is reached.
	 * 3) end of device (driver error) occurs
	 * 4) request is completed.
	 */
	while (!error && !end_of_file && !resid && count) {
		offset_fsb = XFS_B_TO_FSBT(mp, offset);
		count_fsb  = XFS_B_TO_FSB(mp, count);

		tp = NULL;
retry:
		XFS_BMAP_INIT(&free_list, &firstfsb);

		/*
		 * We need to call xfs_bmapi() in read-only mode first to
		 * determine the existing extents.  This is done so that the
		 * correct amount of space can be reserved in the transaction
		 * structure.  Also, a check is needed to see if the extents
		 * are for valid blocks but still unwritten.  If so, a
		 * transaction needs to be reserved as well.
		 */
		reccount = 1;

		xfs_ilock(ip, XFS_ILOCK_EXCL);

		error = xfs_bmapi(NULL, ip, offset_fsb, 
				  count_fsb, 0, &firstfsb, 0, imaps, 
				  &reccount, 0);

		if (error) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
			break;
		}

		/*
 		 * Get a pointer to the current extent map.
		 */
		imapp = &imaps[0];

		/*
		 * Check if the file extents already exist
		 */
		exist = imapp->br_startblock != DELAYSTARTBLOCK &&
			imapp->br_startblock != HOLESTARTBLOCK;

		reccount = 1;
		count_fsb = imapp->br_blockcount;

		/*
		 * If blocks are not yet allocated for this part of
		 * the file, allocate space for the transactions.
		 */
		if (!exist) {
			bmapi_flag = XFS_BMAPI_WRITE;
			if (rt) {
				/*
				 * Round out to a whole number of realtime
				 * extents.  We need the worst case here,
				 * aligning the start offset down and the
				 * end offset up.
				 */
				xfs_fileoff_t	s, e;

				s = offset_fsb / iprtextsize;
				s *= iprtextsize;
				e = roundup(offset_fsb + count_fsb,
					    iprtextsize);
				numrtextents = (e - s) / sbrtextsize;
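				/*
				 * Note that the range is aligned using the
				 * inode's extent size hint (iprtextsize),
				 * while the reservation below is counted in
				 * filesystem realtime extents (sbrtextsize),
				 * the unit xfs_trans_reserve() takes.
				 */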
				datablocks = 0;
			} else {
				/*
				 * If this is a write to the data
				 * partition, reserve the space.
				 */
				datablocks = count_fsb;
			}

			/*
 			 * Setup transaction.
 			 */
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
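			/*
			 * On a realtime file in a filesystem without
			 * unwritten extents, xfs_dio_write_zero_rtarea()
			 * will be called below to zero the rest of the
			 * extent, so trade the shared iolock for an
			 * exclusive one first and redo the mapping.
			 */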
			if (rt && nounwritten && !ioexcl) {
				xfs_iunlock(ip, XFS_IOLOCK_SHARED);
				xfs_ilock(ip, XFS_IOLOCK_EXCL);
				ioexcl = 1;
				goto retry;
			}
			tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);

			nres = XFS_DIOSTRAT_SPACE_RES(mp, datablocks);
			error = xfs_trans_reserve(tp, nres,
				   XFS_WRITE_LOG_RES(mp), numrtextents,
				   XFS_TRANS_PERM_LOG_RES,
				   XFS_WRITE_LOG_COUNT );
			xfs_ilock(ip, XFS_ILOCK_EXCL);

			if (error) {
				/*
				 * Ran out of file system space.
				 * Free the transaction structure.
				 */
				ASSERT(error == ENOSPC || 
				       XFS_FORCED_SHUTDOWN(mp));
				xfs_trans_cancel(tp, 0);
				xfs_iunlock(ip, XFS_ILOCK_EXCL);
				break;
			} 
			/* 
			 * quota reservations
			 */
			if (using_quotas &&
			    xfs_trans_reserve_blkquota(tp, ip, nres)) {
				error = XFS_ERROR(EDQUOT);
				xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
				xfs_iunlock(ip, XFS_ILOCK_EXCL);
				break;
			}
			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
			xfs_trans_ihold(tp, ip);

			if (offset < ip->i_d.di_size || rt)
				bmapi_flag |= XFS_BMAPI_PREALLOC;

			/*
 			 * Issue the bmapi() call to do actual file
			 * space allocation.
			 */
			CHECK_GRIO_TIMESTAMP(bp, 40);
			error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, 
				  bmapi_flag, &firstfsb, 0, imapp, &reccount,
				  &free_list);
			CHECK_GRIO_TIMESTAMP(bp, 40);

			if (error) 
				goto error_on_bmapi_transaction;

			/*
			 * Complete the bmapi() allocation transaction.
			 * The bmapi() unwritten-to-written changes will
			 * be committed after the writes are completed.
			 */
		    	error = xfs_bmap_finish(&tp, &free_list,
					    firstfsb, &committed);
			if (error) 
				goto error_on_bmapi_transaction;

			xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES,
				     NULL);
		} else if (ioexcl) {
			xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
			ioexcl = 0;
		}
		xfs_iunlock(ip, XFS_ILOCK_EXCL);

                /*
                 * xfs_bmapi() did not return an error but the 
 		 * reccount was zero. This means that a delayed write is
		 * in progress and it is necessary to call xfs_bmapi() again
		 * to map the correct portion of the file.
                 */
                if ((!error) && (reccount == 0)) {
			if (ioexcl) {
				xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
				ioexcl = 0;
			}
			goto retry;
                }

		imapp = &imaps[0];
		unwritten = imapp->br_state != XFS_EXT_NORM;

		bytes_this_req = XFS_FSB_TO_B(mp, imapp->br_blockcount) -
				BBTOB(blk_algn);

		ASSERT(bytes_this_req);

		offset_this_req = XFS_FSB_TO_B(mp, imapp->br_startoff) +
				BBTOB(blk_algn); 

		/*
		 * Reduce request size, if it
		 * is longer than user buffer.
		 */
		if (bytes_this_req > count) {
			 bytes_this_req = count;
		}

		/*
		 * Check if this is the end of the file.
		 */
		new_size = offset_this_req + bytes_this_req;
		if (new_size > ip->i_d.di_size) {
			/*
			 * File is being extended on a
			 * write, update the file size if
			 * someone else didn't make it even
			 * bigger.
			 */
	         	ASSERT((vp->v_flag & VISSWAP) == 0);
			xfs_ilock(ip, XFS_ILOCK_EXCL);
			if (new_size > ip->i_d.di_size) {
		 		ip->i_d.di_size = offset_this_req + 
							bytes_this_req;
				ip->i_update_core = 1;
				ip->i_update_size = 1;
			}
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
		}

		/*
		 * For realtime extents in filesystems that don't support
		 * unwritten extents we need to zero the part of the
		 * extent we're not writing.  If unwritten extents are
		 * supported the transaction after the write will leave
		 * the unwritten piece of the extent marked as such.
		 */
		if (ioexcl) {
			ASSERT(!unwritten);
			offset_fsb = XFS_B_TO_FSBT(mp, offset_this_req);
			count_fsb = XFS_B_TO_FSB(mp, bytes_this_req);
			error = xfs_dio_write_zero_rtarea(ip, bp, offset_fsb,
					count_fsb);
			xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
			ioexcl = 0;
			if (error)
				goto error0;
		}

		/*
 		 * Setup I/O request for this extent.
		 */
		CHECK_GRIO_TIMESTAMP(bp, 40);
		nbp = getphysbuf(bp->b_edev);
		CHECK_GRIO_TIMESTAMP(bp, 40);

	     	nbp->b_flags     = bp->b_flags;
		nbp->b_grio_private = bp->b_grio_private;
						/* b_iopri */

	     	nbp->b_error     = 0;
		nbp->b_target    = bp->b_target;
		nbp->b_blkno	 = XFS_FSB_TO_DB(ip, imapp->br_startblock) +
				   blk_algn;
		ASSERT(bytes_this_req);
	     	nbp->b_bcount    = bytes_this_req;
	     	XFS_BUF_PTR(nbp) = base;
		/*
 		 * Issue I/O request.
		 */
		CHECK_GRIO_TIMESTAMP(nbp, 40);
		(void) xfsbdstrat(mp, nbp);

    		if ((error = geterror(nbp)) == 0) {

			/*
			 * update pointers for next round.
			 */

     			base   += bytes_this_req;
     			offset += bytes_this_req;
     			count  -= bytes_this_req;
			blk_algn = 0;
     		}

		/*
		 * Wait for I/O completion and recover buffer.
		 */
		biowait(nbp);
		nbp->b_flags &= ~B_GR_BUF;	/* Why? B_PRV_BUF? */

		if (!error)
			error = geterror(nbp);

		if (!error && !resid) {
			resid = nbp->b_resid;

			/*
			 * prevent adding up partial xfers
			 */
			totxfer += (nbp->b_bcount - resid);
		} 
 		nbp->b_flags		= 0;
		nbp->b_bcount		= 0;
		XFS_BUF_PTR(nbp)	= 0;
		nbp->b_grio_private	= 0;	/* b_iopri */
 		putphysbuf( nbp );
		if (error)
			break;
		
		if (unwritten) {
			offset_fsb = XFS_B_TO_FSBT(mp, offset_this_req);
			count_fsb = XFS_B_TO_FSB(mp, bytes_this_req);
			/*
			 * Set up the xfs_bmapi() call to change the 
			 * extent from unwritten to written.
			 */
			tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);

			nres = XFS_DIOSTRAT_SPACE_RES(mp, 0);
			error = xfs_trans_reserve(tp, nres,
				   XFS_WRITE_LOG_RES(mp), 0,
				   XFS_TRANS_PERM_LOG_RES,
				   XFS_WRITE_LOG_COUNT );
			xfs_ilock(ip, XFS_ILOCK_EXCL);

			if (error) {
				/*
				 * Ran out of file system space.
				 * Free the transaction structure.
				 */
				ASSERT(error == ENOSPC || 
				       XFS_FORCED_SHUTDOWN(mp));
				xfs_trans_cancel(tp, 0);
				xfs_iunlock(ip, XFS_ILOCK_EXCL);
				break;
			} 
			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
			xfs_trans_ihold(tp, ip);

			/*
 			 * Issue the bmapi() call to change the extents
			 * to written.
			 */
			reccount = 1;
			CHECK_GRIO_TIMESTAMP(bp, 40);
			error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, 
				  XFS_BMAPI_WRITE, &firstfsb, 0, imapp,
				  &reccount, &free_list);
			CHECK_GRIO_TIMESTAMP(bp, 40);

			if (error) 
				goto error_on_bmapi_transaction;

			/*
			 * Complete the bmapi() allocation transaction.
			 * The bmapi() unwritten-to-written changes will
			 * be committed after the writes are completed.
			 */
		    	error = xfs_bmap_finish(&tp, &free_list,
					    firstfsb, &committed);
			if (error) 
				goto error_on_bmapi_transaction;

			xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES,
				     NULL);
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
		}
	} /* end of while loop */
	
	/*
 	 * Fill in resid count for original buffer.
	 * If any of the I/Os fail, the whole thing fails.
	 */
	if (error) {
		totxfer = 0;
	}

	bp->b_resid = totresid - totxfer;

	/*
 	 *  Update the inode timestamp.
 	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if ((ip->i_d.di_mode & (ISUID|ISGID)) &&
	    !cap_able_cred(diop->xd_cr, CAP_FSETID)) {
		ip->i_d.di_mode &= ~ISUID;
		/*
		 * Note that we don't have to worry about mandatory
		 * file locking being disabled here because we only
		 * clear the ISGID bit if the Group execute bit is
		 * on, but if it was on then mandatory locking wouldn't
		 * have been enabled.
		 */
		if (ip->i_d.di_mode & (IEXEC >> 3))
			ip->i_d.di_mode &= ~ISGID;
	}
	xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

 error0:
	if (ioexcl)
		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
	return (error);

 error_on_bmapi_transaction:
	xfs_bmap_cancel(&free_list);
	xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	goto error0;
}
#endif /* !defined(__linux__) */

/*
 * xfs_diostrat()
 *	This routine issues the calls to the disk device strategy routine
 *	for file system reads and writes made using direct I/O from user
 *	space. In the case of a write request the I/Os are issued one 
 *	extent at a time. In the case of a read request I/Os for each extent
 *	involved are issued at once.
 *
 *	This function is common to xfs and cxfs.
 *
 * RETURNS:
 *	0
 */
#if !defined(__linux__)
int
xfs_diostrat(
	xfs_buf_t	*bp)
{
	xfs_dio_t	*diop;
	xfs_iocore_t	*io;
	xfs_mount_t	*mp;
	vnode_t		*vp;
	off_t		offset;
	int		error;

	CHECK_GRIO_TIMESTAMP(bp, 40);

	diop = (xfs_dio_t *)bp->b_private;
	io = diop->xd_io;
	mp = io->io_mount;
	vp = BHV_TO_VNODE(diop->xd_bdp);
	
	offset = BBTOOFF((off_t)bp->b_blkno);
	ASSERT(!(bp->b_flags & B_DONE));
        ASSERT(ismrlocked(io->io_iolock, MR_ACCESS| MR_UPDATE) != 0);

	/*
 	 * Check if the request is on a file system block boundary.
	 */
	diop->xd_blkalgn = ((offset & mp->m_blockmask) != 0) ? 
		 		OFFTOBB(offset & mp->m_blockmask) : 0;

	/*
	 * We're going to access the disk directly.
	 * Blow anything in the range of the request out of the
	 * buffer cache first.  This isn't perfect because we allow
	 * simultaneous direct I/O writers and buffered readers, but
	 * it should be good enough.
	 */
	if (!(diop->xd_ioflag & IO_IGNCACHE) && VN_CACHED(vp)) {
		xfs_inval_cached_pages(vp, io,
					offset, XFS_BUF_COUNT(bp), diop);
	}

	/*
	 * Alignment checks are done in xfs_diordwr().
	 * Determine if the operation is a read or a write.
	 */
	if (bp->b_flags & B_READ) {
		error = XFS_DIO_READ(mp, diop);
	} else {
		error = XFS_DIO_WRITE(mp, diop);
	}

	/*
	 * Issue completion on the original buffer.
	 */
	XFS_BUF_ERROR(bp, error);
	biodone(bp);

        ASSERT(ismrlocked(io->io_iolock, MR_ACCESS| MR_UPDATE) != 0);

	return (0);
}
#endif /* !defined(__linux__) */

/*
 * xfs_diordwr()
 *	This routine sets up a buf structure to be used to perform 
 * 	direct I/O operations to user space. The user-specified
 *	parameters are checked for alignment and size limitations. A buf
 *	structure is allocated and biophysio() is called.
 *
 *	This function is common to xfs and cxfs.
 *
 * RETURNS:
 * 	 0 on success
 * 	errno on error
 */
#if !defined(__linux__)
int
xfs_diordwr(
	bhv_desc_t	*bdp,
	xfs_iocore_t	*io,
	uio_t		*uiop,
	int		ioflag,
	cred_t		*credp,
	uint64_t	rw,
	off_t		*u_start,
	size_t		*u_length)
{
	extern 		zone_t	*grio_buf_data_zone;

	vnode_t		*vp;
	xfs_dio_t	dio;
	xfs_mount_t	*mp;
	uuid_t		stream_id;
	xfs_buf_t		*bp;
	int		error, index;
	__int64_t	iosize;
	extern int	scache_linemask;
	int		guartype = -1;

	vp = BHV_TO_VNODE(bdp);
	mp = io->io_mount;
	xfs_rw_enter_trace(rw & B_READ ? XFS_DIORD_ENTER : XFS_DIOWR_ENTER,
		XFS_BHVTOI(bdp), uiop, ioflag);

	/*
 	 * Check that the user buffer address is on a secondary cache
	 * line offset, while file offset and
 	 * request size are both multiples of file system block size. 
	 * This prevents the need for read/modify/write operations.
	 *
	 * This enforces the alignment restrictions indicated by 
 	 * the F_DIOINFO fcntl call.
	 *
	 * We make an exception for swap I/O and trusted clients like
	 * cachefs.  Swap I/O will always be page aligned and all the
	 * blocks will already be allocated, so we don't need to worry
	 * about read/modify/write stuff.  Cachefs ensures that it only
	 * reads back data which it has written, so we don't need to
	 * worry about block zeroing and such.
 	 */
	if (!(vp->v_flag & VISSWAP) && !(ioflag & IO_TRUSTEDDIO) &&
	    ((((long)(uiop->uio_iov->iov_base)) & scache_linemask) ||
	     (uiop->uio_offset & mp->m_blockmask) ||
	     (uiop->uio_resid & mp->m_blockmask))) {

		/*
		 * if the user tries to start reading at the
		 * end of the file, just return 0.
		 */
		if ((rw & B_READ) &&
		    (uiop->uio_offset == XFS_SIZE(mp, io))) {
			return (0);
		}
		return XFS_ERROR(EINVAL);
	}
	/*
	 * This ASSERT should catch bad addresses being passed in by
	 * trusted callers.
	 */
	ASSERT(!(((long)(uiop->uio_iov->iov_base)) & scache_linemask));

	/*
 	 * Do maxio check.
 	 */
	if (uiop->uio_resid > ctooff(v.v_maxdmasz - 1)) {
		return XFS_ERROR(EINVAL);
	}

	/*
 	 * Allocate local buf structure.
	 */
	if (io->io_flags & XFS_IOCORE_RT) {
		bp = getphysbuf(mp->m_rtdev);
		bp->b_target = &mp->m_rtdev_targ;
	} else {
		bp = getphysbuf(mp->m_dev);
		bp->b_target = mp->m_ddev_targp;
	}

	/*
 	 * Use xfs_dio_t structure to pass file/credential
	 * information to file system strategy routine.
	 */

	dio.xd_bp = bp;
	dio.xd_bdp = bdp;
	dio.xd_io = io;
	dio.xd_cr = credp;
	dio.xd_ioflag = ioflag;
	dio.xd_length = 0;
	dio.xd_start = 0;
	dio.xd_pmp = uiop->uio_pmp;
	bp->b_private = &dio;

	bp->b_grio_private = NULL;		/* b_iopri = 0 */
	bp->b_flags &= ~(B_GR_BUF|B_PRV_BUF);	/* lo pri queue */

	/*
	 * Check if this is a guaranteed rate I/O
	 */
	if (ioflag & IO_PRIORITY) {

		guartype = grio_io_is_guaranteed(uiop->uio_fp, &stream_id);

		/*
		 * Get priority level if this is a multilevel request.
		 * The level is stored in b_iopri, except if the request
		 * is controlled by griostrategy.
		 */
		if (uiop->uio_fp->vf_flag & FPRIO) {
			bp->b_flags |= B_PRV_BUF;
			VFILE_GETPRI(uiop->uio_fp, bp->b_iopri);
			/*
			 * Take care of some other thread racing
			 * and clearing FPRIO.
			 */
			if (bp->b_iopri == 0) 
				bp->b_flags &= ~B_PRV_BUF;
		}

		if (guartype == -1) {
			/*
			 * grio is not configured into the kernel, but FPRIO
			 * is set.
			 */
		} else if (guartype) {

			short prval = bp->b_iopri;

			bp->b_flags |= B_GR_BUF;
			ASSERT((bp->b_grio_private == NULL) || 
						(bp->b_flags & B_PRV_BUF));
			bp->b_grio_private = 
				kmem_zone_alloc(grio_buf_data_zone, KM_SLEEP);
			ASSERT(BUF_GRIO_PRIVATE(bp));
			COPY_STREAM_ID(stream_id,BUF_GRIO_PRIVATE(bp)->grio_id);
			SET_GRIO_IOPRI(bp, prval);
			iosize =  uiop->uio_iov[0].iov_len;
			index = grio_monitor_io_start(&stream_id, iosize);
			INIT_GRIO_TIMESTAMP(bp);
		} else {
			/*
			 * FPRIORITY|FPRIO was set when we looked,
			 * but FPRIORITY is not set anymore.
			 */
		}
	}

	/*
 	 * Perform I/O operation.
	 */
	error = biophysio(xfs_diostrat, bp, bp->b_edev, rw, 
		(daddr_t)OFFTOBB(uiop->uio_offset), uiop);

	/*
	 * Tear down any guaranteed rate I/O state, then free the local
	 * buf structure.
	 */
	if (ioflag & IO_PRIORITY) {
		bp->b_flags &= ~(B_PRV_BUF|B_GR_BUF);
		if (guartype > 0) {
			grio_monitor_io_end(&stream_id, index);
#ifdef GRIO_DEBUG
			CHECK_GRIO_TIMESTAMP(bp, 400);
#endif
			ASSERT(BUF_GRIO_PRIVATE(bp));
			kmem_zone_free(grio_buf_data_zone, BUF_GRIO_PRIVATE(bp));
		}
		bp->b_grio_private = NULL;
	}

	ASSERT((bp->b_flags & B_MAPPED) == 0);
	XFS_BUF_ZEROFLAGS(bp);
	XFS_BUF_PTR(bp) = 0;
	putphysbuf(bp);

	/* CXFS needs the unwritten range covered by the write */
	if (u_start) {
		*u_start = dio.xd_start;
		*u_length = dio.xd_length;
	}

	return (error);
}
#endif /* !defined(__linux__) */



lock_t		xfs_refcache_lock;
xfs_inode_t	**xfs_refcache;
int		xfs_refcache_size;
int		xfs_refcache_index;
int		xfs_refcache_busy;
int		xfs_refcache_count;
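
/*
 * Inode reference cache.
 *
 * xfs_refcache is a round-robin table of inode pointers, protected by
 * xfs_refcache_lock.  Each cached inode holds an extra vnode reference,
 * which keeps recently used inodes (typically those touched by NFS
 * service) from being reclaimed between operations.  xfs_refcache_index
 * is the next slot to be recycled, xfs_refcache_count is the number of
 * occupied slots, and xfs_refcache_busy is bumped by unmount while it
 * purges the cache so that no new entries are added.  The routines
 * below insert entries and purge them per-inode, per-mount, and a few
 * at a time from the sync code.
 */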

/*
 * Insert the given inode into the reference cache.  The cache holds an
 * extra reference on the inode's vnode until the entry is purged or
 * recycled.  The caller must hold the inode's iolock in update mode.
 */
#if 1
void
xfs_refcache_insert(
	xfs_inode_t	*ip)
{
	int		s;
	vnode_t		*vp;
	xfs_inode_t	*release_ip;
	xfs_inode_t	**refcache;

	ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE));

	/*
	 * If an unmount is busy blowing entries out of the cache,
	 * then don't bother.
	 */
	if (xfs_refcache_busy) {
		return;
	}

	/*
	 * If the inode is already in the refcache, then don't bother
	 * with it.
	 */
	if (ip->i_refcache != NULL) {
		return;
	}

	vp = XFS_ITOV(ip);
	ASSERT(vp->v_count > 0);
	VN_HOLD(vp);

	/*
	 * We allocate the reference cache on first use so that we don't
	 * waste the memory on systems not being used as NFS servers.
	 * The allocation is done here, before taking the spinlock, since
	 * a KM_SLEEP allocation may sleep.
	 */
	if (xfs_refcache == NULL) {
		refcache = (xfs_inode_t **)kmem_zalloc(xfs_refcache_size *
						       sizeof(xfs_inode_t *),
						       KM_SLEEP);
	} else {
		refcache = NULL;
	}

	s = mp_mutex_spinlock(&xfs_refcache_lock);

	/*
	 * If we allocated memory for the refcache above and the global
	 * table is still unallocated, then install our allocation.
	 * Otherwise we'll free the memory below.
	 */
	if (refcache != NULL) {
		if (xfs_refcache == NULL) {
			xfs_refcache = refcache;
			refcache = NULL;
		}
	}

	/*
	 * If an unmount is busy clearing out the cache or the filesystem
	 * is being taken offline, don't add new entries to the cache.
	 */
	if ((xfs_refcache_busy) || (vp->v_vfsp->vfs_flag & VFS_OFFLINE)) {
		mp_mutex_spinunlock(&xfs_refcache_lock, s);
		VN_RELE(vp);
		/*
		 * If we allocated memory for the refcache above but someone
		 * else beat us to using it, then free the memory now.
		 */
		if (refcache != NULL) {
			kmem_free(refcache,
				  xfs_refcache_size * sizeof(xfs_inode_t *));
		}
		return;
	}
	release_ip = xfs_refcache[xfs_refcache_index];
	if (release_ip != NULL) {
		release_ip->i_refcache = NULL;
		xfs_refcache_count--;
		ASSERT(xfs_refcache_count >= 0);
	}
	xfs_refcache[xfs_refcache_index] = ip;
	ASSERT(ip->i_refcache == NULL);
	ip->i_refcache = &(xfs_refcache[xfs_refcache_index]);
	xfs_refcache_count++;
	ASSERT(xfs_refcache_count <= xfs_refcache_size);
	xfs_refcache_index++;
	if (xfs_refcache_index == xfs_refcache_size) {
		xfs_refcache_index = 0;
	}
	mp_mutex_spinunlock(&xfs_refcache_lock, s);

	/*
	 * Save the pointer to the inode to be released so that we can
	 * VN_RELE it once we've dropped our inode locks in xfs_rwunlock().
	 * The pointer may be NULL, but that's OK.
	 */
	ip->i_release = release_ip;

	/*
	 * If we allocated memory for the refcache above but someone
	 * else beat us to using it, then free the memory now.
	 */
	if (refcache != NULL) {
		kmem_free(refcache,
			  xfs_refcache_size * sizeof(xfs_inode_t *));
	}
	return;
}
#endif /* !defined(__linux__) */

/*
 * If the given inode is in the reference cache, purge its entry and
 * release the reference on the vnode.
 */
#if 1 
void
xfs_refcache_purge_ip(
	xfs_inode_t	*ip)
{
	int	s;
	vnode_t	*vp;

	/*
	 * If the inode isn't pointing to its entry in the cache, then
	 * it isn't in the cache and there is nothing to do.
	 */
	if (ip->i_refcache == NULL) {
		return;
	}

	s = mp_mutex_spinlock(&xfs_refcache_lock);
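	/*
	 * Check again now that we hold the lock; another thread purging
	 * the cache may have already cleared our entry.
	 */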
	if (ip->i_refcache == NULL) {
		mp_mutex_spinunlock(&xfs_refcache_lock, s);
		return;
	}

	/*
	 * Clear both our pointer to the cache entry and its pointer
	 * back to us.
	 */
	ASSERT(*(ip->i_refcache) == ip);
	*(ip->i_refcache) = NULL;
	ip->i_refcache = NULL;
	xfs_refcache_count--;
	ASSERT(xfs_refcache_count >= 0);
	mp_mutex_spinunlock(&xfs_refcache_lock, s);

	vp = XFS_ITOV(ip);
	ASSERT(vp->v_count > 1);
	VN_RELE(vp);

	return;
}
#endif /* !defined(__linux__) */

/*
 * This is called from the XFS unmount code to purge all entries for the
 * given mount from the cache.  It uses the refcache busy counter to
 * make sure that new entries are not added to the cache as we purge them.
 */
#if 1 
void
xfs_refcache_purge_mp(
	xfs_mount_t	*mp)
{
	int		s;
	vnode_t		*vp;
	int		i;
	xfs_inode_t	*ip;

	if (xfs_refcache == NULL) {
		return;
	}

	s = mp_mutex_spinlock(&xfs_refcache_lock);
	/*
	 * Bumping the busy counter keeps new entries from being added
	 * to the cache.  We use a counter since multiple unmounts could
	 * be in here simultaneously.
	 */
	xfs_refcache_busy++;

	for (i = 0; i < xfs_refcache_size; i++) {
		ip = xfs_refcache[i];
		if ((ip != NULL) && (ip->i_mount == mp)) {
			xfs_refcache[i] = NULL;
			ip->i_refcache = NULL;
			xfs_refcache_count--;
			ASSERT(xfs_refcache_count >= 0);
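			/*
			 * Drop the spinlock around the VN_RELE, since
			 * dropping what may be the last reference can
			 * re-enter the filesystem and sleep.
			 */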
			mp_mutex_spinunlock(&xfs_refcache_lock, s);
			vp = XFS_ITOV(ip);
			VN_RELE(vp);

			s = mp_mutex_spinlock(&xfs_refcache_lock);
		} else {
			/*
			 * Make sure we don't hold the lock for too long:
			 * every 16 entries, briefly drop and reacquire it
			 * so others get a chance at it.
			 */
			if ((i & 15) == 0) {
				mp_mutex_spinunlock(&xfs_refcache_lock, s);
				s = mp_mutex_spinlock(&xfs_refcache_lock);
			}
		}
	}

	xfs_refcache_busy--;
	ASSERT(xfs_refcache_busy >= 0);
	mp_mutex_spinunlock(&xfs_refcache_lock, s);
}
#endif /* !defined(__linux__) */

/*
 * This is called from the XFS sync code to ensure that the refcache
 * is emptied out over time.  We purge a small number of entries with
 * each call.
 */
#if 1
void
xfs_refcache_purge_some(void)
{
	int		s;
	int		i;
	xfs_inode_t	*ip;
	int		iplist_index;
#define	XFS_REFCACHE_PURGE_COUNT	10
	xfs_inode_t	*iplist[XFS_REFCACHE_PURGE_COUNT];

	if ((xfs_refcache == NULL) || (xfs_refcache_count == 0)) {
		return;
	}

	iplist_index = 0;
	s = mp_mutex_spinlock(&xfs_refcache_lock);

	/*
	 * Store any inodes we find in the next several entries
	 * into the iplist array to be released after dropping
	 * the spinlock.  We always start looking from the currently
	 * oldest place in the cache.  We move the refcache index
	 * forward as we go so that we are sure to eventually clear
	 * out the entire cache when the system goes idle.
	 */
	for (i = 0; i < XFS_REFCACHE_PURGE_COUNT; i++) {
		ip = xfs_refcache[xfs_refcache_index];
		if (ip != NULL) {
			xfs_refcache[xfs_refcache_index] = NULL;
			ip->i_refcache = NULL;
			xfs_refcache_count--;
			ASSERT(xfs_refcache_count >= 0);
			iplist[iplist_index] = ip;
			iplist_index++;
		}
		xfs_refcache_index++;
		if (xfs_refcache_index == xfs_refcache_size) {
			xfs_refcache_index = 0;
		}
	}

	mp_mutex_spinunlock(&xfs_refcache_lock, s);

	/*
	 * Now drop the vnode references on the inodes we collected.  We
	 * do this after releasing the spinlock since VN_RELE may sleep.
	 */
	for (i = 0; i < iplist_index; i++) {
		VN_RELE(XFS_ITOV(iplist[i]));
	}
}
#endif /* !defined(__linux__) */
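
/*
 * Illustrative sketch only (the real caller lives in the XFS sync code,
 * not in this file): a periodic pass that keeps the cache drained on an
 * otherwise idle system could look like the hypothetical helper below.
 *
 *	void
 *	xfs_example_refcache_drain(void)
 *	{
 *		if (xfs_refcache_count != 0)
 *			xfs_refcache_purge_some();
 *	}
 *
 * Each call releases at most XFS_REFCACHE_PURGE_COUNT entries, starting
 * at the currently oldest slot, so repeated passes eventually empty the
 * cache once the system stops inserting new entries.
 */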