File: [Development] / xfs-linux / linux-2.4 / Attic / xfs_lrw.c
Revision 1.124, Tue Feb 26 00:33:20 2002 UTC (15 years, 7 months ago) by nathans
Branch: MAIN
Changes since 1.123: +9 -10 lines
Merge of 2.4.18-xfs:slinx:112528b by nathans.
remove several no-longer-used fields (iosize) from the xfs_iocore_t.
/*
* Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it would be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* Further, this software is distributed without any warranty that it is
* free of the rightful claim of any third person regarding infringement
* or the like. Any license provided herein, whether implied or
* otherwise, applies only to this software file. Patent licenses, if
* any, provided herein do not apply to combinations of this program with
* other software, or any other product whatsoever.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write the Free Software Foundation, Inc., 59
* Temple Place - Suite 330, Boston MA 02111-1307, USA.
*
* Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
* Mountain View, CA 94043, or:
*
* http://www.sgi.com
*
* For further information regarding this notice, see:
*
* http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
*/
/*
* fs/xfs/linux/xfs_lrw.c (Linux Read Write stuff)
*
*/
#include <xfs.h>
#include <linux/pagemap.h>
#include <linux/capability.h>
#include <linux/xfs_iops.h>
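/* Round a byte offset down to the mount's write-iosize boundary. */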
#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
<< mp->m_writeio_log)
#define XFS_STRAT_WRITE_IMAPS 2
STATIC int xfs_iomap_read(xfs_iocore_t *, loff_t, size_t, int, pb_bmap_t *,
int *, struct pm *);
STATIC int xfs_iomap_write(xfs_iocore_t *, loff_t, size_t, pb_bmap_t *,
int *, int, struct pm *);
STATIC int xfs_iomap_write_delay(xfs_iocore_t *, loff_t, size_t, pb_bmap_t *,
int *, int, int);
STATIC int xfs_iomap_write_direct(xfs_iocore_t *, loff_t, size_t, pb_bmap_t *,
int *, int, int);
STATIC int _xfs_imap_to_bmap(xfs_iocore_t *, xfs_off_t, xfs_bmbt_irec_t *,
pb_bmap_t *, int, int);
#ifndef DEBUG
#define xfs_strat_write_check(io,off,count,imap,nimap)
#else /* DEBUG */
void
xfs_strat_write_check(
xfs_iocore_t *io,
xfs_fileoff_t offset_fsb,
xfs_filblks_t buf_fsb,
xfs_bmbt_irec_t *imap,
int imap_count);
#endif /* DEBUG */
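/*
 * Read from a file via the VOP interface. Dispatches to the direct
 * I/O or buffered path and accounts the result in the uio.
 */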
ssize_t /* error (positive) */
xfs_read(
bhv_desc_t *bdp,
uio_t *uiop,
int ioflag,
cred_t *credp,
flid_t *fl)
{
ssize_t ret;
int error = 0;
xfs_fsize_t n;
xfs_inode_t *ip;
struct file *filp = uiop->uio_fp;
struct inode *linux_ip = filp->f_dentry->d_inode;
char *buf;
size_t size;
loff_t *offsetp;
xfs_iocore_t *io;
xfs_mount_t *mp;
ASSERT(uiop); /* we only support exactly 1 */
ASSERT(uiop->uio_iovcnt == 1); /* iov in a uio on linux */
ASSERT(uiop->uio_iov);
buf = uiop->uio_iov->iov_base;
size = uiop->uio_iov->iov_len;
offsetp = &uiop->uio_offset;
ip = XFS_BHVTOI(bdp);
io = &(ip->i_iocore);
mp = io->io_mount;
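/*
 * Direct I/O requires the user buffer, the file offset and the
 * length to be suitably aligned; an unaligned request starting at
 * exactly EOF just returns 0, anything else unaligned fails.
 */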
if (filp->f_flags & O_DIRECT) {
if (((__psint_t)buf & (linux_ip->i_sb->s_blocksize - 1)) ||
(uiop->uio_offset & mp->m_blockmask) ||
(size & mp->m_blockmask)) {
if (uiop->uio_offset == XFS_SIZE(mp, io)) {
return (0);
}
return XFS_ERROR(EINVAL);
}
}
n = XFS_MAX_FILE_OFFSET - *offsetp;
if ((n <= 0) || (size == 0))
return 0;
if (n < size)
size = n;
if (XFS_FORCED_SHUTDOWN(mp)) {
return EIO;
}
XFS_ILOCK(mp, io, XFS_IOLOCK_SHARED);
#ifdef CONFIG_HAVE_XFS_DMAPI
if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) &&
!(filp->f_mode & FINVIS)) {
vrwlock_t locktype = VRWLOCK_READ;
error = xfs_dm_send_data_event(DM_EVENT_READ, bdp,
*offsetp, size,
FILP_DELAY_FLAG(filp),
&locktype);
if (error) {
XFS_IUNLOCK(mp, io, XFS_IOLOCK_SHARED);
return error;
}
}
#endif /* CONFIG_HAVE_XFS_DMAPI */
if (filp->f_flags & O_DIRECT) {
/* Flush and keep lock to keep out buffered writers */
fs_flush_pages(bdp, *offsetp, *offsetp + size, 0, FI_NONE);
ret = pagebuf_direct_file_read(filp, buf, size, offsetp,
linvfs_pb_bmap);
} else {
ret = generic_file_read(filp, buf, size, offsetp);
}
XFS_IUNLOCK(mp, io, XFS_IOLOCK_SHARED);
/*
 * In either case above, ret >= 0 is the number of bytes read;
 * ret < 0 is an error.
 */
if (ret > 0) {
uiop->uio_resid = size - ret;
} else {
/* return positive error */
error = -(int)ret;
}
if (!(filp->f_mode & FINVIS))
XFS_CHGTIME(mp, io, XFS_ICHGTIME_ACC);
ASSERT (error >= 0);
return error;
}
/*
* This routine is called to handle zeroing any space in the last
* block of the file that is beyond the EOF. We do this since the
* size is being increased without writing anything to that block
* and we don't want anyone to read the garbage on the disk.
*/
/* We don't want the IRIX poff */
#define poff(x) ((x) & (PAGE_CACHE_SIZE - 1))
/* ARGSUSED */
STATIC int /* error (positive) */
xfs_zero_last_block(
struct inode *ip,
xfs_iocore_t *io,
xfs_off_t offset,
xfs_fsize_t isize,
xfs_fsize_t end_size,
struct pm *pmp)
{
xfs_fileoff_t last_fsb;
xfs_fileoff_t next_fsb;
xfs_fileoff_t end_fsb;
xfs_fsblock_t firstblock;
xfs_mount_t *mp;
page_buf_t *pb;
int nimaps;
int zero_offset;
int zero_len;
int isize_fsb_offset;
int i;
int error = 0;
int hole;
xfs_bmbt_irec_t imap;
loff_t loff;
size_t lsize;
pb_bmap_t pbmap;
int npbmaps = 1;
ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0);
ASSERT(offset > isize);
mp = io->io_mount;
/*
 * If the file system block size is less than the page size,
 * then there could be bytes in the last page, after the last
 * fsblock containing isize, which have not been initialized.
 * Since such a page is fully accessible if it is in memory,
 * we need to zero any part of it which is beyond the old
 * file size. We don't need to send this out to disk, we're
 * just initializing it to zeroes like we would have done in
 * xfs_strat_read() had the size been bigger.
 */
if ((mp->m_sb.sb_blocksize < NBPP) && ((i = poff(isize)) != 0)) {
struct page *page;
page = find_lock_page(ip->i_mapping, isize >> PAGE_CACHE_SHIFT);
if (page) {
memset((void *)kmap(page)+i, 0, PAGE_SIZE-i);
kunmap(page);
/*
 * Now we check to see if the part of the page beyond
 * the end of the file covers any holes. If so, we want
 * to set the P_HOLE flag in the page and blow away any
 * active mappings to it so that future faults on the
 * page will cause the space where the holes are to be
 * allocated. This keeps us from losing updates that are
 * beyond the current end of file when the page is
 * already in memory.
 */
next_fsb = XFS_B_TO_FSBT(mp, isize);
end_fsb = XFS_B_TO_FSB(mp, ctooff(offtoc(isize)));
hole = 0;
while (next_fsb < end_fsb) {
nimaps = 1;
firstblock = NULLFSBLOCK;
error = XFS_BMAPI(mp, NULL, io, next_fsb, 1, 0,
&firstblock, 0, &imap,
&nimaps, NULL);
if (error) {
UnlockPage(page);
page_cache_release(page);
return error;
}
ASSERT(nimaps > 0);
if (imap.br_startblock == HOLESTARTBLOCK) {
hole = 1;
break;
}
next_fsb++;
}
if (hole) {
printk("xfs_zero_last_block: hole found? need more implementation\n");
#ifndef linux
/*
* In order to make processes notice the
* newly set P_HOLE flag, blow away any
* mappings to the file. We have to drop
* the inode lock while doing this to avoid
* deadlocks with the chunk cache.
*/
if (VN_MAPPED(vp)) {
XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL |
XFS_EXTSIZE_RD);
VOP_PAGES_SETHOLE(vp, pfdp, 1, 1,
ctooff(offtoct(isize)));
XFS_ILOCK(mp, io, XFS_ILOCK_EXCL |
XFS_EXTSIZE_RD);
}
#endif
}
UnlockPage(page);
page_cache_release(page);
}
}
isize_fsb_offset = XFS_B_FSB_OFFSET(mp, isize);
if (isize_fsb_offset == 0) {
/*
* There are no extra bytes in the last block on disk to
* zero, so return.
*/
return 0;
}
last_fsb = XFS_B_TO_FSBT(mp, isize);
nimaps = 1;
firstblock = NULLFSBLOCK;
error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, &firstblock, 0, &imap,
&nimaps, NULL);
if (error) {
return error;
}
ASSERT(nimaps > 0);
/*
* If the block underlying isize is just a hole, then there
* is nothing to zero.
*/
if (imap.br_startblock == HOLESTARTBLOCK)
{
return 0;
}
/*
* Get a pagebuf for the last block, zero the part beyond the
* EOF, and write it out sync. We need to drop the ilock
* while we do this so we don't deadlock when the buffer cache
* calls back to us.
*/
XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD);
loff = XFS_FSB_TO_B(mp, last_fsb);
lsize = BBTOB(XFS_FSB_TO_BB(mp, 1));
zero_offset = isize_fsb_offset;
zero_len = mp->m_sb.sb_blocksize - isize_fsb_offset;
/*
* Realtime needs work here
*/
pb = pagebuf_lookup(ip, loff, lsize, PBF_ENTER_PAGES);
if (!pb) {
error = ENOMEM;
XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
return error;
}
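/* Realtime inodes do their file data I/O to the realtime device. */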
if (io->io_flags & XFS_IOCORE_RT) {
pb->pb_dev = mp->m_rtdev;
}
if ((imap.br_startblock > 0) &&
(imap.br_startblock != DELAYSTARTBLOCK)) {
pb->pb_bn = XFS_FSB_TO_DB_IO(io, imap.br_startblock);
if (imap.br_state == XFS_EXT_UNWRITTEN) {
printk("xfs_zero_last_block: unwritten?\n");
}
if (PBF_NOT_DONE(pb)) {
/* pagebuf functions return negative errors */
if ((error = -pagebuf_iostart(pb, PBF_READ))) {
pagebuf_rele(pb);
goto out_lock;
}
}
}
npbmaps = _xfs_imap_to_bmap(io, offset, &imap, &pbmap, nimaps, npbmaps);
error = -pagebuf_iozero(ip, pb, zero_offset, zero_len, end_size, &pbmap);
pagebuf_rele(pb);
out_lock:
XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
ASSERT(error >= 0);
return error;
}
/*
* Zero any on disk space between the current EOF and the new,
* larger EOF. This handles the normal case of zeroing the remainder
* of the last block in the file and the unusual case of zeroing blocks
* out beyond the size of the file. This second case only happens
* with fixed size extents and when the system crashes before the inode
* size was updated but after blocks were allocated. If fill is set,
* then any holes in the range are filled and zeroed. If not, the holes
* are left alone as holes.
*/
int /* error (positive) */
xfs_zero_eof(
vnode_t *vp,
xfs_iocore_t *io,
xfs_off_t offset, /* starting I/O offset */
xfs_fsize_t isize, /* current inode size */
xfs_fsize_t end_size, /* terminal inode size */
struct pm *pmp)
{
struct inode *ip = LINVFS_GET_IP(vp);
xfs_fileoff_t start_zero_fsb;
xfs_fileoff_t end_zero_fsb;
xfs_fileoff_t prev_zero_fsb;
xfs_fileoff_t zero_count_fsb;
xfs_fileoff_t last_fsb;
xfs_fsblock_t firstblock;
xfs_extlen_t buf_len_fsb;
xfs_extlen_t prev_zero_count;
xfs_mount_t *mp;
page_buf_t *pb;
int nimaps;
int error = 0;
xfs_bmbt_irec_t imap;
loff_t loff;
size_t lsize;
pb_bmap_t pbmap;
int npbmaps = 1;
ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
mp = io->io_mount;
/*
* First handle zeroing the block on which isize resides.
* We only zero a part of that block so it is handled specially.
*/
error = xfs_zero_last_block(ip, io, offset, isize, end_size, pmp);
if (error) {
ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
return error;
}
/*
* Calculate the range between the new size and the old
* where blocks needing to be zeroed may exist. To get the
* block where the last byte in the file currently resides,
* we need to subtract one from the size and truncate back
* to a block boundary. We subtract 1 in case the size is
* exactly on a block boundary.
*/
last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
if (last_fsb == end_zero_fsb) {
/*
* The size was only incremented on its last block.
* We took care of that above, so just return.
*/
return 0;
}
ASSERT(start_zero_fsb <= end_zero_fsb);
prev_zero_fsb = NULLFILEOFF;
prev_zero_count = 0;
/*
* Maybe change this loop to do the bmapi call and
* loop while we split the mappings into pagebufs?
*/
while (start_zero_fsb <= end_zero_fsb) {
nimaps = 1;
zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
firstblock = NULLFSBLOCK;
error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb,
0, &firstblock, 0, &imap, &nimaps, NULL);
if (error) {
ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
return error;
}
ASSERT(nimaps > 0);
if (imap.br_startblock == HOLESTARTBLOCK)
{
/*
* This loop handles initializing pages that were
* partially initialized by the code below this
* loop. It basically zeroes the part of the page
* that sits on a hole and sets the page as P_HOLE
* and calls remapf if it is a mapped file.
*/
prev_zero_fsb = NULLFILEOFF;
prev_zero_count = 0;
start_zero_fsb = imap.br_startoff +
imap.br_blockcount;
ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
continue;
}
/*
* There are blocks in the range requested.
* Zero them a single write at a time. We actually
* don't zero the entire range returned if it is
* too big and simply loop around to get the rest.
* That is not the most efficient thing to do, but it
* is simple and this path should not be exercised often.
*/
buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount,
mp->m_writeio_blocks);
/*
* Drop the inode lock while we're doing the I/O.
* We'll still have the iolock to protect us.
*/
XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
loff = XFS_FSB_TO_B(mp, start_zero_fsb);
lsize = XFS_FSB_TO_B(mp, buf_len_fsb);
/*
* real-time files need work here
*/
pb = pagebuf_lookup(ip, loff, lsize, PBF_ENTER_PAGES);
if (!pb) {
error = ENOMEM;
goto out_lock;
}
if (imap.br_startblock != DELAYSTARTBLOCK) {
pb->pb_bn = XFS_FSB_TO_DB_IO(io, imap.br_startblock);
if (imap.br_state == XFS_EXT_UNWRITTEN) {
printk("xfs_zero_eof: unwritten? what do we do here?\n");
}
}
npbmaps = _xfs_imap_to_bmap(io, offset, &imap, &pbmap, nimaps, npbmaps);
/* pagebuf_iozero returns negative error */
error = -pagebuf_iozero(ip, pb, 0, lsize, end_size, &pbmap);
pagebuf_rele(pb);
if (error) {
goto out_lock;
}
prev_zero_fsb = start_zero_fsb;
prev_zero_count = buf_len_fsb;
start_zero_fsb = imap.br_startoff + buf_len_fsb;
ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
}
return 0;
out_lock:
XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
ASSERT(error >= 0);
return error;
}
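/*
 * Write to a file via the VOP interface. Handles direct and buffered
 * writes, zeroing between EOF and the write offset, clearing of the
 * setuid/setgid bits, and O_SYNC semantics.
 */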
ssize_t /* error (positive) */
xfs_write(
bhv_desc_t *bdp,
uio_t *uiop,
int ioflags,
cred_t *credp,
flid_t *fl)
{
xfs_inode_t *xip;
struct file *filp = uiop->uio_fp;
struct inode *ip = filp->f_dentry->d_inode;
loff_t *offsetp = &uiop->uio_offset;
xfs_mount_t *mp;
xfs_trans_t *tp;
ssize_t ret;
int error = 0;
xfs_fsize_t isize;
xfs_fsize_t n, limit = XFS_MAX_FILE_OFFSET;
xfs_iocore_t *io;
vnode_t *vp;
int iolock;
int direct = ioflags & O_DIRECT;
#ifdef CONFIG_HAVE_XFS_DMAPI
int eventsent = 0;
loff_t savedsize = *offsetp;
#endif
vrwlock_t locktype;
char *buf;
size_t size;
unsigned int mode;
ASSERT(uiop); /* we only support exactly 1 */
ASSERT(uiop->uio_iovcnt == 1); /* iov in a uio on linux */
ASSERT(uiop->uio_iov);
vp = BHV_TO_VNODE(bdp);
xip = XFS_BHVTOI(bdp);
buf = uiop->uio_iov->iov_base;
size = uiop->uio_iov->iov_len;
if (size == 0)
return 0;
io = &(xip->i_iocore);
mp = io->io_mount;
xfs_check_frozen(mp, bdp, ioflags, XFS_FREEZE_WRITE);
if (XFS_FORCED_SHUTDOWN(xip->i_mount)) {
return EIO;
}
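/*
 * Direct writers can run concurrently, so they take the iolock
 * shared; buffered writers take it exclusive.
 */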
if (direct) {
if (((__psint_t)buf & (ip->i_sb->s_blocksize - 1)) ||
(uiop->uio_offset & mp->m_blockmask) ||
(size & mp->m_blockmask)) {
return XFS_ERROR(EINVAL);
}
iolock = XFS_IOLOCK_SHARED;
locktype = VRWLOCK_WRITE_DIRECT;
} else {
iolock = XFS_IOLOCK_EXCL;
locktype = VRWLOCK_WRITE;
}
xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
isize = xip->i_d.di_size;
#ifdef CONFIG_HAVE_XFS_DMAPI
start:
#endif
n = limit - *offsetp;
if (n <= 0) {
xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
return EFBIG;
}
if (n < size)
size = n;
#ifdef CONFIG_HAVE_XFS_DMAPI
if ((DM_EVENT_ENABLED_IO(vp->v_vfsp, io, DM_EVENT_WRITE) &&
!(filp->f_mode & FINVIS) && !eventsent)) {
error = xfs_dm_send_data_event(DM_EVENT_WRITE, bdp,
*offsetp, size,
FILP_DELAY_FLAG(filp), &locktype);
if (error) {
xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
return error;
}
eventsent = 1;
/*
 * The iolock was dropped and reacquired in
 * xfs_dm_send_data_event, so we have to recheck the size
 * when appending. We will only "goto start;" once,
 * since having sent the event prevents another call
 * to xfs_dm_send_data_event, which is what
 * allows the size to change in the first place.
 */
if ((ioflags & O_APPEND) && savedsize != xip->i_d.di_size) {
*offsetp = isize = xip->i_d.di_size;
goto start;
}
}
#endif /* CONFIG_HAVE_XFS_DMAPI */
/*
 * On Linux, generic_file_write updates the times even if
 * no data is copied in, so long as the write had a size.
 *
 * We must update the XFS timestamps too, since revalidate
 * will copy the Linux inode times back over them.
 */
if (size) {
if (!(filp->f_mode & FINVIS))
xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
}
/*
* If the offset is beyond the size of the file, we have a couple
* of things to do. First, if there is already space allocated
* we need to either create holes or zero the disk or ...
*
* If there is a page where the previous size lands, we need
* to zero it out up to the new size.
*/
if (!direct && (*offsetp > isize && isize)) {
error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offsetp,
isize, *offsetp + size, NULL);
if (error) {
xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
return(error);
}
}
xfs_iunlock(xip, XFS_ILOCK_EXCL);
#ifdef CONFIG_HAVE_XFS_DMAPI
retry:
#endif
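/*
 * For direct writes, toss any cached pages over the range so stale
 * buffered data cannot shadow what we write to disk.
 */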
if (direct) {
xfs_inval_cached_pages(vp, &xip->i_iocore, *offsetp,
(xfs_off_t) size, (void *)vp);
}
/*
* pagebuf_generic_file_write will return positive if bytes
* written, negative if error. We'll live with (-) error
* for the moment, but flip error sign before we pass it up
*/
ret = pagebuf_generic_file_write(filp, buf, size, offsetp,
linvfs_pb_bmap);
#ifdef CONFIG_HAVE_XFS_DMAPI
if ((ret == -ENOSPC) &&
DM_EVENT_ENABLED_IO(vp->v_vfsp, io, DM_EVENT_NOSPACE) &&
!(filp->f_mode & FINVIS)) {
xfs_rwunlock(bdp, locktype);
error = dm_send_namesp_event(DM_EVENT_NOSPACE, bdp,
DM_RIGHT_NULL, bdp, DM_RIGHT_NULL, NULL, NULL,
0, 0, 0); /* Delay flag intentionally unused */
xfs_rwlock(bdp, locktype);
if (error)
return error;
*offsetp = ip->i_size;
goto retry;
}
#endif /* CONFIG_HAVE_XFS_DMAPI */
if (ret <= 0) {
/*
 * ret from pagebuf_generic_file_write <= 0 is an
 * error; flip it positive, then bail out.
 */
xfs_rwunlock(bdp, locktype);
error = -(int)ret;
return(error);
}
/*
* ret > 0 == number of bytes written by pagebuf_generic_file_write()
* Keep track of any unwritten bytes in uio_resid.
*/
uiop->uio_resid = size - ret;
/* JIMJIM Lock? around the stuff below if Linux doesn't lock above */
/* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
mode = (ip->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
/* were any of the setuid/setgid bits set? */
mode &= ip->i_mode;
if (mode && !capable(CAP_FSETID)) {
ip->i_mode &= ~mode;
xfs_write_clear_setuid(xip);
}
if (*offsetp > xip->i_d.di_size) {
XFS_SETSIZE(mp, io, *offsetp);
}
/* Handle various SYNC-type writes */
if (ioflags & O_SYNC) {
/* Flush all inode data buffers */
error = -fsync_inode_buffers(ip);
if (error)
goto out;
/*
* If we're treating this as O_DSYNC and we have not updated the
* size, force the log.
*/
if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC)
&& !(xip->i_update_size)) {
/*
* If an allocation transaction occurred
* without extending the size, then we have to force
* the log up to the proper point to ensure that the
* allocation is permanent. We can't count on
* the fact that buffered writes lock out direct I/O
* writes - the direct I/O write could have extended
* the size nontransactionally, then finished before
* we started. xfs_write_file will think that the file
* didn't grow but the update isn't safe unless the
* size change is logged.
*
* Force the log if we've committed a transaction
* against the inode or if someone else has and
* the commit record hasn't gone to disk (e.g.
* the inode is pinned). This guarantees that
* all changes affecting the inode are permanent
* when we return.
*/
xfs_inode_log_item_t *iip;
xfs_lsn_t lsn;
iip = xip->i_itemp;
if (iip && iip->ili_last_lsn) {
lsn = iip->ili_last_lsn;
xfs_log_force(mp, lsn,
XFS_LOG_FORCE | XFS_LOG_SYNC);
} else if (xfs_ipincount(xip) > 0) {
xfs_log_force(mp, (xfs_lsn_t)0,
XFS_LOG_FORCE | XFS_LOG_SYNC);
}
} else {
/*
* O_SYNC or O_DSYNC _with_ a size update are handled
* the same way.
*
* If the write was synchronous then we need to make
* sure that the inode modification time is permanent.
* We'll have updated the timestamp above, so here
* we use a synchronous transaction to log the inode.
* It's not fast, but it's necessary.
*
* If this is a dsync write and the size got changed
* non-transactionally, then we need to ensure that
* the size change gets logged in a synchronous
* transaction.
*/
tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
if ((error = xfs_trans_reserve(tp, 0,
XFS_SWRITE_LOG_RES(mp),
0, 0, 0))) {
/* Transaction reserve failed */
xfs_trans_cancel(tp, 0);
} else {
/* Transaction reserve successful */
xfs_ilock(xip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
xfs_trans_ihold(tp, xip);
xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp, 0, (xfs_lsn_t)0);
xfs_iunlock(xip, XFS_ILOCK_EXCL);
}
}
} /* (ioflags & O_SYNC) */
out:
/*
* If we are coming from an nfsd thread then insert into the
* reference cache.
*/
if (!strcmp(current->comm, "nfsd"))
xfs_refcache_insert(xip);
/* Drop lock this way - the old refcache release is in here */
xfs_rwunlock(bdp, locktype);
ASSERT(ret >= 0);
return(error);
}
/*
* xfs_bmap() is the same as the irix xfs_bmap from xfs_rw.c
* execpt for slight changes to the params
*/
int
xfs_bmap(bhv_desc_t *bdp,
xfs_off_t offset,
ssize_t count,
int flags,
struct cred *cred,
pb_bmap_t *pbmapp,
int *npbmaps)
{
xfs_inode_t *ip;
int error;
int unlocked;
int lockmode;
int fsynced = 0;
vnode_t *vp;
ip = XFS_BHVTOI(bdp);
ASSERT((ip->i_d.di_mode & IFMT) == IFREG);
ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0));
ASSERT((flags & PBF_READ) || (flags & PBF_WRITE));
if (XFS_FORCED_SHUTDOWN(ip->i_iocore.io_mount))
return XFS_ERROR(EIO);
if (flags & PBF_READ) {
unlocked = 0;
lockmode = xfs_ilock_map_shared(ip);
error = xfs_iomap_read(&ip->i_iocore, offset, count,
XFS_BMAPI_ENTIRE, pbmapp, npbmaps, NULL);
xfs_iunlock_map_shared(ip, lockmode);
} else { /* PBF_WRITE */
ASSERT(flags & PBF_WRITE);
vp = BHV_TO_VNODE(bdp);
xfs_ilock(ip, XFS_ILOCK_EXCL);
/*
* Make sure that the dquots are there. This doesn't hold
* the ilock across a disk read.
*/
if (XFS_IS_QUOTA_ON(ip->i_mount)) {
if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) {
if ((error = xfs_qm_dqattach(ip, XFS_QMOPT_ILOCKED))) {
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return XFS_ERROR(error);
}
}
}
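/*
 * On ENOSPC (unless we are nfsd) retry with progressively
 * heavier flushing: first the inode's own dirty buffers, then
 * with PBF_SYNC set, then a filesystem-wide sync.
 */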
retry:
error = xfs_iomap_write(&ip->i_iocore, offset, count,
pbmapp, npbmaps, flags, NULL);
/* xfs_iomap_write unlocks/locks/unlocks */
if ((error == ENOSPC) && strcmp(current->comm, "nfsd")) {
switch (fsynced) {
case 0:
if (ip->i_delayed_blks) {
fsync_inode_buffers(LINVFS_GET_IP(vp));
fsynced = 1;
} else {
fsynced = 2;
flags |= PBF_SYNC;
}
error = 0;
xfs_ilock(ip, XFS_ILOCK_EXCL);
goto retry;
case 1:
fsynced = 2;
if (!(flags & PBF_SYNC)) {
flags |= PBF_SYNC;
error = 0;
xfs_ilock(ip, XFS_ILOCK_EXCL);
goto retry;
}
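/* FALLTHROUGH */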
case 2:
VFS_SYNC(vp->v_vfsp,
SYNC_NOWAIT|SYNC_DELWRI|SYNC_BDFLUSH|SYNC_FSDATA,
NULL, error);
error = 0;
/**
delay(HZ);
**/
fsynced++;
xfs_ilock(ip, XFS_ILOCK_EXCL);
goto retry;
}
}
}
return XFS_ERROR(error);
}
int
xfs_strategy(bhv_desc_t *bdp,
xfs_off_t offset,
ssize_t count,
int flags,
struct cred *cred,
pb_bmap_t *pbmapp,
int *npbmaps)
{
xfs_inode_t *ip;
xfs_iocore_t *io;
xfs_mount_t *mp;
int error;
xfs_fileoff_t offset_fsb;
xfs_fileoff_t end_fsb;
xfs_fileoff_t map_start_fsb;
xfs_fileoff_t last_block;
xfs_fsblock_t first_block;
xfs_bmap_free_t free_list;
xfs_filblks_t count_fsb;
int committed, i, loops, nimaps;
int is_xfs = 1; /* This will be a variable at some point */
xfs_bmbt_irec_t imap[XFS_MAX_RW_NBMAPS];
xfs_trans_t *tp;
ip = XFS_BHVTOI(bdp);
io = &ip->i_iocore;
mp = ip->i_mount;
/* is_xfs = IO_IS_XFS(io); */
ASSERT((ip->i_d.di_mode & IFMT) == IFREG);
ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
((io->io_flags & XFS_IOCORE_RT) != 0));
ASSERT((flags & PBF_READ) || (flags & PBF_WRITE));
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
ASSERT(flags & PBF_WRITE);
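/*
 * First see whether real blocks already back this range; if the
 * mapping is neither a hole nor delalloc, convert it for the
 * caller and return without allocating.
 */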
offset_fsb = XFS_B_TO_FSBT(mp, offset);
nimaps = min(XFS_MAX_RW_NBMAPS, *npbmaps);
end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
first_block = NULLFSBLOCK;
XFS_ILOCK(mp, io, XFS_ILOCK_SHARED | XFS_EXTSIZE_RD);
error = XFS_BMAPI(mp, NULL, io, offset_fsb,
(xfs_filblks_t)(end_fsb - offset_fsb),
XFS_BMAPI_ENTIRE, &first_block, 0, imap,
&nimaps, NULL);
XFS_IUNLOCK(mp, io, XFS_ILOCK_SHARED | XFS_EXTSIZE_RD);
if (error) {
return XFS_ERROR(error);
}
if (nimaps && !ISNULLSTARTBLOCK(imap[0].br_startblock)) {
*npbmaps = _xfs_imap_to_bmap(&ip->i_iocore, offset, imap,
pbmapp, nimaps, *npbmaps);
return 0;
}
/*
* Make sure that the dquots are there.
*/
if (XFS_IS_QUOTA_ON(mp)) {
if (XFS_NOT_DQATTACHED(mp, ip)) {
if ((error = xfs_qm_dqattach(ip, 0))) {
return XFS_ERROR(error);
}
}
}
XFS_STATS_ADD(xfsstats.xs_xstrat_bytes,
XFS_FSB_TO_B(mp, imap[0].br_blockcount));
offset_fsb = imap[0].br_startoff;
count_fsb = imap[0].br_blockcount;
map_start_fsb = offset_fsb;
while (count_fsb != 0) {
/*
* Set up a transaction with which to allocate the
* backing store for the file. Do allocations in a
* loop until we get some space in the range we are
* interested in. The other space that might be allocated
* is in the delayed allocation extent on which we sit
* but before our buffer starts.
*/
nimaps = 0;
loops = 0;
while (nimaps == 0) {
if (is_xfs) {
tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
error = xfs_trans_reserve(tp, 0,
XFS_WRITE_LOG_RES(mp),
0, XFS_TRANS_PERM_LOG_RES,
XFS_WRITE_LOG_COUNT);
if (error) {
xfs_trans_cancel(tp, 0);
goto error0;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip,
XFS_ILOCK_EXCL);
xfs_trans_ihold(tp, ip);
} else {
tp = NULL;
XFS_ILOCK(mp, io, XFS_ILOCK_EXCL |
XFS_EXTSIZE_WR);
}
/*
* Allocate the backing store for the file.
*/
XFS_BMAP_INIT(&(free_list),
&(first_block));
nimaps = XFS_STRAT_WRITE_IMAPS;
/*
* Ensure we don't go beyond eof - it is possible
* the extents changed since we did the read call,
* we dropped the ilock in the interim.
*/
end_fsb = XFS_B_TO_FSB(mp, XFS_SIZE(mp, io));
xfs_bmap_last_offset(NULL, ip, &last_block,
XFS_DATA_FORK);
last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
if ((map_start_fsb + count_fsb) > last_block) {
count_fsb = last_block - map_start_fsb;
if (count_fsb == 0) {
if (is_xfs) {
xfs_bmap_cancel(&free_list);
xfs_trans_cancel(tp,
(XFS_TRANS_RELEASE_LOG_RES |
XFS_TRANS_ABORT));
}
XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL |
XFS_EXTSIZE_WR);
return XFS_ERROR(EAGAIN);
}
}
error = XFS_BMAPI(mp, tp, io, map_start_fsb, count_fsb,
XFS_BMAPI_WRITE, &first_block, 1,
imap, &nimaps, &free_list);
if (error) {
xfs_bmap_cancel(&free_list);
xfs_trans_cancel(tp,
(XFS_TRANS_RELEASE_LOG_RES |
XFS_TRANS_ABORT));
XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL |
XFS_EXTSIZE_WR);
goto error0;
}
if (is_xfs) {
error = xfs_bmap_finish(&(tp), &(free_list),
first_block, &committed);
if (error) {
xfs_bmap_cancel(&free_list);
xfs_trans_cancel(tp,
(XFS_TRANS_RELEASE_LOG_RES |
XFS_TRANS_ABORT));
xfs_iunlock(ip, XFS_ILOCK_EXCL);
goto error0;
}
error = xfs_trans_commit(tp,
XFS_TRANS_RELEASE_LOG_RES,
NULL);
if (error) {
xfs_iunlock(ip, XFS_ILOCK_EXCL);
goto error0;
}
}
if (nimaps == 0) {
XFS_IUNLOCK(mp, io,
XFS_ILOCK_EXCL|XFS_EXTSIZE_WR);
} /* else hold 'till we maybe loop again below */
}
/*
* See if we were able to allocate an extent that
* covers at least part of the user's requested size.
*/
offset_fsb = XFS_B_TO_FSBT(mp, offset);
for(i = 0; i < nimaps; i++) {
int maps;
if (offset_fsb >= imap[i].br_startoff &&
(offset_fsb < (imap[i].br_startoff + imap[i].br_blockcount))) {
XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL | XFS_EXTSIZE_WR);
maps = min(nimaps, *npbmaps);
*npbmaps = _xfs_imap_to_bmap(io, offset, &imap[i],
pbmapp, maps, *npbmaps);
XFS_STATS_INC(xfsstats.xs_xstrat_quick);
return 0;
}
count_fsb -= imap[i].br_blockcount; /* for next bmapi,
if needed. */
}
/*
* We didn't get an extent the caller can write into so
* loop around and try starting after the last imap we got back.
*/
nimaps--; /* Index of last entry */
ASSERT(nimaps >= 0);
ASSERT(offset_fsb >= imap[nimaps].br_startoff + imap[nimaps].br_blockcount);
ASSERT(count_fsb);
offset_fsb = imap[nimaps].br_startoff + imap[nimaps].br_blockcount;
map_start_fsb = offset_fsb;
XFS_STATS_INC(xfsstats.xs_xstrat_split);
XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_WR);
}
ASSERT(0); /* Should never get here */
error0:
if (error) {
ASSERT(count_fsb != 0);
ASSERT(is_xfs || XFS_FORCED_SHUTDOWN(mp));
}
return XFS_ERROR(error);
}
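/*
 * Convert extent records (imap) into pagebuf mappings (pbmapp),
 * translating fsblock numbers into disk addresses and flagging
 * holes, delayed allocations, unwritten extents and EOF.
 */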
STATIC int
_xfs_imap_to_bmap(
xfs_iocore_t *io,
xfs_off_t offset,
xfs_bmbt_irec_t *imap,
pb_bmap_t *pbmapp,
int imaps, /* Number of imap entries */
int pbmaps) /* Number of pbmap entries */
{
xfs_mount_t *mp;
xfs_fsize_t nisize;
int im, pbm;
xfs_fsblock_t start_block;
mp = io->io_mount;
nisize = XFS_SIZE(mp, io);
if (io->io_new_size > nisize)
nisize = io->io_new_size;
for (im=0, pbm=0; im < imaps && pbm < pbmaps; im++,pbmapp++,imap++,pbm++) {
if (io->io_flags & XFS_IOCORE_RT) {
pbmapp->pbm_dev = mp->m_rtdev;
} else {
pbmapp->pbm_dev = mp->m_dev;
}
pbmapp->pbm_offset = XFS_FSB_TO_B(mp, imap->br_startoff);
pbmapp->pbm_delta = offset - pbmapp->pbm_offset;
pbmapp->pbm_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount);
pbmapp->pbm_flags = 0;
start_block = imap->br_startblock;
if (start_block == HOLESTARTBLOCK) {
pbmapp->pbm_bn = PAGE_BUF_DADDR_NULL;
pbmapp->pbm_flags = PBMF_HOLE;
} else if (start_block == DELAYSTARTBLOCK) {
pbmapp->pbm_bn = PAGE_BUF_DADDR_NULL;
pbmapp->pbm_flags = PBMF_DELAY;
} else {
pbmapp->pbm_bn = XFS_FSB_TO_DB_IO(io, start_block);
if (imap->br_state == XFS_EXT_UNWRITTEN)
pbmapp->pbm_flags |= PBMF_UNWRITTEN;
}
/* pbm_offset and pbm_bsize are already byte values */
if ((pbmapp->pbm_offset + pbmapp->pbm_bsize)
    >= nisize) {
pbmapp->pbm_flags |= PBMF_EOF;
}
offset += pbmapp->pbm_bsize - pbmapp->pbm_delta;
}
return(pbm); /* Return the number filled */
}
STATIC int
xfs_iomap_read(
xfs_iocore_t *io,
loff_t offset,
size_t count,
int flags,
pb_bmap_t *pbmapp,
int *npbmaps,
struct pm *pmp)
{
xfs_fileoff_t offset_fsb;
xfs_fileoff_t end_fsb;
xfs_fsblock_t firstblock;
int nimaps;
int error;
xfs_mount_t *mp;
xfs_bmbt_irec_t imap[XFS_MAX_RW_NBMAPS];
ASSERT(ismrlocked(io->io_lock, MR_UPDATE | MR_ACCESS) != 0);
/** ASSERT(ismrlocked(io->io_iolock, MR_UPDATE | MR_ACCESS) != 0); **/
/* xfs_iomap_enter_trace(XFS_IOMAP_READ_ENTER, io, offset, count); */
mp = io->io_mount;
offset_fsb = XFS_B_TO_FSBT(mp, offset);
nimaps = sizeof(imap) / sizeof(imap[0]);
nimaps = min(nimaps, *npbmaps); /* Don't ask for more than caller has */
end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
firstblock = NULLFSBLOCK;
error = XFS_BMAPI(mp, NULL, io, offset_fsb,
(xfs_filblks_t)(end_fsb - offset_fsb),
flags, &firstblock, 0, imap,
&nimaps, NULL);
if (error) {
return XFS_ERROR(error);
}
if(nimaps) {
*npbmaps = _xfs_imap_to_bmap(io, offset, imap, pbmapp, nimaps,
*npbmaps);
} else
*npbmaps = 0;
return XFS_ERROR(error);
}
/*
 * xfs_iomap_write: return pagebuf_bmap_t's telling higher layers
 * where to write.
 * There are 2 main cases:
 *   1. the extents already exist
 *   2. we must allocate.
 * There are 3 cases when we allocate:
 *   - delayed allocation (doesn't really allocate or use transactions)
 *   - direct allocation (no previous delayed allocation)
 *   - converting delayed allocations to real ones
 */
STATIC int
xfs_iomap_write(
xfs_iocore_t *io,
loff_t offset,
size_t count,
pb_bmap_t *pbmapp,
int *npbmaps,
int ioflag,
struct pm *pmp)
{
int maps;
int error = 0;
#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
int found;
int flags = 0;
maps = *npbmaps;
if (!maps)
goto out;
/*
* If we have extents that are allocated for this range,
* return them.
*/
found = 0;
error = xfs_iomap_read(io, offset, count, flags, pbmapp, npbmaps, NULL);
if (error)
goto out;
/*
* If we found mappings and they can just have data written
* without conversion,
* let the caller write these and call us again.
*
* If we have a HOLE or UNWRITTEN, proceed down lower to
* get the space or to convert to written.
*/
if (*npbmaps) {
if (!(pbmapp->pbm_flags & PBMF_HOLE)) {
*npbmaps = 1; /* Only checked the first one. */
/* We could check more, ... */
goto out;
}
}
found = *npbmaps;
*npbmaps = maps; /* Restore to original requested */
if (ioflag & PBF_DIRECT) {
error = xfs_iomap_write_direct(io, offset, count, pbmapp,
npbmaps, ioflag, found);
} else {
error = xfs_iomap_write_delay(io, offset, count, pbmapp,
npbmaps, ioflag, found);
}
out:
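/* We are entered with the ilock held; drop it on every path out. */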
XFS_IUNLOCK(io->io_mount, io, XFS_ILOCK_EXCL);
XFS_INODE_CLEAR_READ_AHEAD(io);
return XFS_ERROR(error);
}
#ifdef DEBUG
/*
* xfs_strat_write_check
*
* Make sure that there are blocks or delayed allocation blocks
* underlying the entire area given. The imap parameter is simply
* given as a scratch area in order to reduce stack space. No
* values are returned within it.
*/
void
xfs_strat_write_check(
xfs_iocore_t *io,
xfs_fileoff_t offset_fsb,
xfs_filblks_t buf_fsb,
xfs_bmbt_irec_t *imap,
int imap_count)
{
xfs_filblks_t count_fsb;
xfs_fsblock_t firstblock;
xfs_mount_t *mp;
int nimaps;
int n;
int error;
if (!IO_IS_XFS(io)) return;
mp = io->io_mount;
count_fsb = 0;
while (count_fsb < buf_fsb) {
nimaps = imap_count;
firstblock = NULLFSBLOCK;
error = XFS_BMAPI(mp, NULL, io, (offset_fsb + count_fsb),
(buf_fsb - count_fsb), 0, &firstblock, 0,
imap, &nimaps, NULL);
if (error) {
return;
}
ASSERT(nimaps > 0);
n = 0;
while (n < nimaps) {
ASSERT(imap[n].br_startblock != HOLESTARTBLOCK);
count_fsb += imap[n].br_blockcount;
ASSERT(count_fsb <= buf_fsb);
n++;
}
}
return;
}
#endif /* DEBUG */
/*
* Map the given I/O size and I/O alignment over the given extent.
* If we're at the end of the file and the underlying extent is
* delayed alloc, make sure we extend out to the
* next i_writeio_blocks boundary. Otherwise make sure that we
* are confined to the given extent.
*/
/*ARGSUSED*/
STATIC void
xfs_write_bmap(
xfs_mount_t *mp,
xfs_iocore_t *io,
xfs_bmbt_irec_t *imapp,
pb_bmap_t *pbmapp,
int iosize,
xfs_fileoff_t ioalign,
xfs_fsize_t isize)
{
__int64_t extra_blocks;
xfs_fileoff_t size_diff;
xfs_fileoff_t ext_offset;
xfs_fsblock_t start_block;
int length; /* length of this mapping in blocks */
xfs_off_t offset; /* logical block offset of this mapping */
if (ioalign < imapp->br_startoff) {
/*
* The desired alignment doesn't end up on this
* extent. Move up to the beginning of the extent.
* Subtract whatever we drop from the iosize so that
* we stay aligned on iosize boundaries.
*/
size_diff = imapp->br_startoff - ioalign;
iosize -= (int)size_diff;
ASSERT(iosize > 0);
ext_offset = 0;
offset = imapp->br_startoff;
pbmapp->pbm_offset = XFS_FSB_TO_B(mp, imapp->br_startoff);
} else {
/*
* The alignment requested fits on this extent,
* so use it.
*/
ext_offset = ioalign - imapp->br_startoff;
offset = ioalign;
pbmapp->pbm_offset = XFS_FSB_TO_B(mp, ioalign);
}
start_block = imapp->br_startblock;
ASSERT(start_block != HOLESTARTBLOCK);
if (start_block != DELAYSTARTBLOCK) {
pbmapp->pbm_bn = XFS_FSB_TO_DB_IO(io, start_block + ext_offset);
if (imapp->br_state == XFS_EXT_UNWRITTEN) {
pbmapp->pbm_flags = PBMF_UNWRITTEN;
}
} else {
pbmapp->pbm_bn = PAGE_BUF_DADDR_NULL;
pbmapp->pbm_flags = PBMF_DELAY;
}
length = iosize;
/*
* If the iosize from our offset extends beyond the end of
* the extent, then trim down length to match that of the extent.
*/
extra_blocks = (xfs_off_t)(offset + length) -
(__uint64_t)(imapp->br_startoff +
imapp->br_blockcount);
if (extra_blocks > 0) {
length -= extra_blocks;
ASSERT(length > 0);
}
pbmapp->pbm_bsize = XFS_FSB_TO_B(mp, length);
}
STATIC int
xfs_iomap_write_delay(
xfs_iocore_t *io,
loff_t offset,
size_t count,
pb_bmap_t *pbmapp,
int *npbmaps,
int ioflag,
int found)
{
xfs_fileoff_t offset_fsb;
xfs_fileoff_t ioalign;
xfs_fileoff_t last_fsb;
xfs_fileoff_t start_fsb;
xfs_filblks_t count_fsb;
xfs_off_t aligned_offset;
xfs_fsize_t isize;
xfs_fsblock_t firstblock;
__uint64_t last_page_offset;
int nimaps;
int error;
int n;
unsigned int iosize;
short small_write;
xfs_mount_t *mp;
#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
int aeof;
#ifdef DELALLOC_BUG
unsigned int writing_bytes;
#endif
ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0);
/* xfs_iomap_enter_trace(XFS_IOMAP_WRITE_ENTER, io, offset, count); */
mp = io->io_mount;
/***
ASSERT(! XFS_NOT_DQATTACHED(mp, ip));
***/
isize = XFS_SIZE(mp, io);
if (io->io_new_size > isize) {
isize = io->io_new_size;
}
aeof = 0;
offset_fsb = XFS_B_TO_FSBT(mp, offset);
last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
/*
* If the caller is doing a write at the end of the file,
* then extend the allocation (and the buffer used for the write)
* out to the file system's write iosize. We clean up any extra
* space left over when the file is closed in xfs_inactive().
* We can only do this if we are sure that we will create buffers
* over all of the space we allocate beyond the end of the file.
* Not doing so would allow us to create delalloc blocks with
* no pages in memory covering them. So, we need to check that
* there are not any real blocks in the area beyond the end of
* the file which we are optimistically going to preallocate. If
* there are then our buffers will stop when they encounter them
* and we may accidentally create delalloc blocks beyond them
* that we never cover with a buffer. All of this is because
* we are not actually going to write the extra blocks preallocated
* at this point.
*
* We don't bother with this for sync writes, because we need
* to minimize the amount we write for good performance.
*/
if (!(ioflag & PBF_SYNC) && ((offset + count) > XFS_SIZE(mp, io))) {
start_fsb = XFS_B_TO_FSBT(mp,
((xfs_ufsize_t)(offset + count - 1)));
count_fsb = mp->m_writeio_blocks;
while (count_fsb > 0) {
nimaps = XFS_WRITE_IMAPS;
firstblock = NULLFSBLOCK;
error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb,
0, &firstblock, 0, imap, &nimaps,
NULL);
if (error) {
return error;
}
for (n = 0; n < nimaps; n++) {
if ((imap[n].br_startblock != HOLESTARTBLOCK) &&
(imap[n].br_startblock != DELAYSTARTBLOCK)) {
goto write_map;
}
start_fsb += imap[n].br_blockcount;
count_fsb -= imap[n].br_blockcount;
ASSERT(count_fsb < 0xffff000);
}
}
iosize = mp->m_writeio_blocks;
aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
last_fsb = ioalign + iosize;
aeof = 1;
}
write_map:
nimaps = XFS_WRITE_IMAPS;
firstblock = NULLFSBLOCK;
/*
* roundup the allocation request to m_dalign boundary if file size
* is greater than 512K and we are allocating past the allocation eof
*/
if (mp->m_dalign && (XFS_SIZE(mp, io) >= mp->m_dalign) && aeof) {
int eof;
xfs_fileoff_t new_last_fsb;
new_last_fsb = roundup_64(last_fsb, mp->m_dalign);
error = XFS_BMAP_EOF(mp, io, new_last_fsb, XFS_DATA_FORK, &eof);
if (error) {
return error;
}
if (eof) {
last_fsb = new_last_fsb;
}
}
error = XFS_BMAPI(mp, NULL, io, offset_fsb,
(xfs_filblks_t)(last_fsb - offset_fsb),
XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
&nimaps, NULL);
/*
* This can be EDQUOT, if nimaps == 0
*/
if (error) {
return XFS_ERROR(error);
}
/*
* If bmapi returned us nothing, and if we didn't get back EDQUOT,
* then we must have run out of space.
*/
if (nimaps == 0) {
/* xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE,
io, offset, count); */
return XFS_ERROR(ENOSPC);
}
if (!(ioflag & PBF_SYNC) ||
((last_fsb - offset_fsb) >= mp->m_writeio_blocks)) {
/*
* For normal or large sync writes, align everything
* into i_writeio_blocks sized chunks.
*/
iosize = mp->m_writeio_blocks;
aligned_offset = XFS_WRITEIO_ALIGN(mp, offset);
ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
small_write = 0;
/* XXX - Are we shrinking? XXXXX */
} else {
/*
* For small sync writes try to minimize the amount
* of I/O we do. Round down and up to the larger of
* page or block boundaries. Set the small_write
* variable to 1 to indicate to the code below that
* we are not using the normal buffer alignment scheme.
*/
if (NBPP > mp->m_sb.sb_blocksize) {
ASSERT(!(offset & PAGE_MASK));
aligned_offset = offset;
ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
ASSERT(!((offset + count) & PAGE_MASK));
last_page_offset = offset + count;
iosize = XFS_B_TO_FSBT(mp, last_page_offset -
aligned_offset);
} else {
ioalign = offset_fsb;
iosize = last_fsb - offset_fsb;
}
small_write = 1;
/* XXX - Are we shrinking? XXXXX */
}
/*
* Now map our desired I/O size and alignment over the
* extents returned by xfs_bmapi().
*/
xfs_write_bmap(mp, io, imap, pbmapp, iosize, ioalign, isize);
pbmapp->pbm_delta = offset - pbmapp->pbm_offset;
ASSERT((pbmapp->pbm_bsize > 0)
&& (pbmapp->pbm_bsize - pbmapp->pbm_delta > 0));
/*
* A bmap is the EOF bmap when it reaches to or beyond the new
* inode size.
*/
if ((pbmapp->pbm_offset + pbmapp->pbm_bsize) >= isize) {
pbmapp->pbm_flags |= PBMF_EOF;
}
/* xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP,
io, offset, count, bmapp, imap); */
/* On IRIX, we walk more imaps filling in more bmaps. On Linux
just handle one for now. To find the code on IRIX,
look in xfs_iomap_write() in xfs_rw.c. */
*npbmaps = 1;
return 0;
}
STATIC int
xfs_iomap_write_direct(
xfs_iocore_t *io,
loff_t offset,
size_t count,
pb_bmap_t *pbmapp,
int *npbmaps,
int ioflag,
int found)
{
xfs_inode_t *ip = XFS_IO_INODE(io);
xfs_mount_t *mp;
xfs_fileoff_t offset_fsb;
xfs_fileoff_t last_fsb;
xfs_filblks_t count_fsb;
xfs_fsize_t isize;
xfs_fsblock_t firstfsb;
int nimaps, maps;
int error;
xfs_trans_t *tp;
#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS], *imapp;
xfs_bmap_free_t free_list;
int aeof;
int bmapi_flags;
xfs_filblks_t datablocks;
int rt;
int committed;
int numrtextents;
uint resblks;
int rtextsize;
maps = min(XFS_WRITE_IMAPS, *npbmaps);
nimaps = maps;
mp = io->io_mount;
isize = XFS_SIZE(mp, io);
if (io->io_new_size > isize)
isize = io->io_new_size;
if ((offset + count) > isize) {
aeof = 1;
} else {
aeof = 0;
}
offset_fsb = XFS_B_TO_FSBT(mp, offset);
last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
count_fsb = last_fsb - offset_fsb;
if (found && (pbmapp->pbm_flags & PBMF_HOLE)) {
xfs_fileoff_t map_last_fsb;
map_last_fsb = XFS_B_TO_FSB(mp,
(pbmapp->pbm_bsize + pbmapp->pbm_offset));
if (map_last_fsb < last_fsb) {
last_fsb = map_last_fsb;
count_fsb = last_fsb - offset_fsb;
}
ASSERT(count_fsb > 0);
}
/*
* roundup the allocation request to m_dalign boundary if file size
* is greater than 512K and we are allocating past the allocation eof
*/
if (!found && mp->m_dalign && (isize >= 524288) && aeof) {
int eof;
xfs_fileoff_t new_last_fsb;
new_last_fsb = roundup_64(last_fsb, mp->m_dalign);
printk("xfs_iomap_write_direct: about to XFS_BMAP_EOF %Ld\n",
new_last_fsb);
error = XFS_BMAP_EOF(mp, io, new_last_fsb, XFS_DATA_FORK, &eof);
if (error) {
goto error_out;
}
if (eof)
last_fsb = new_last_fsb;
}
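/*
 * XFS_BMAPI_DIRECT_IO is set and then immediately masked back out
 * below; direct I/O allocation hints stay disabled for now.
 */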
bmapi_flags = XFS_BMAPI_WRITE|XFS_BMAPI_DIRECT_IO|XFS_BMAPI_ENTIRE;
bmapi_flags &= ~XFS_BMAPI_DIRECT_IO;
/*
* determine if this is a realtime file
*/
if ((rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) != 0) {
rtextsize = mp->m_sb.sb_rextsize;
} else
rtextsize = 0;
error = 0;
/*
* allocate file space for the bmapp entries passed in.
*/
/*
* determine if reserving space on
* the data or realtime partition.
*/
if (rt) {
numrtextents = (count_fsb + rtextsize - 1);
do_div(numrtextents, rtextsize);
datablocks = 0;
} else {
datablocks = count_fsb;
numrtextents = 0;
}
/*
* allocate and setup the transaction
*/
tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks);
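/*
 * Drop the ilock across the reservation; xfs_trans_reserve can
 * block waiting for log space, and the lock is retaken below.
 */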
xfs_iunlock(ip, XFS_ILOCK_EXCL);
error = xfs_trans_reserve(tp,
resblks,
XFS_WRITE_LOG_RES(mp),
numrtextents,
XFS_TRANS_PERM_LOG_RES,
XFS_WRITE_LOG_COUNT);
/*
* check for running out of space
*/
if (error) {
/*
* Free the transaction structure.
*/
xfs_trans_cancel(tp, 0);
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
if (error) {
/* Deferred from the trans_reserve failure above: we must
 * return with the ilock held, so it was retaken first. */
goto error_out;
}
if (XFS_IS_QUOTA_ON(mp)) {
if (xfs_trans_reserve_quota(tp,
ip->i_udquot,
ip->i_gdquot,
resblks, 0, 0)) {
error = (EDQUOT);
goto error1;
}
nimaps = 1;
} else {
nimaps = 2;
}
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_ihold(tp, ip);
/*
* issue the bmapi() call to allocate the blocks
*/
XFS_BMAP_INIT(&free_list, &firstfsb);
imapp = &imap[0];
error = XFS_BMAPI(mp, tp, io, offset_fsb, count_fsb,
bmapi_flags, &firstfsb, 1, imapp, &nimaps, &free_list);
if (error) {
goto error0;
}
/*
* complete the transaction
*/
error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
if (error) {
goto error0;
}
error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
if (error) {
goto error_out;
}
/* copy any maps to caller's array and return any error. */
if (nimaps == 0) {
error = (ENOSPC);
goto error_out;
}
maps = min(nimaps, maps);
*npbmaps = _xfs_imap_to_bmap(io, offset, &imap[0], pbmapp, maps, *npbmaps);
if(*npbmaps) {
/*
* this is new since xfs_iomap_read
* didn't find it.
*/
if (*npbmaps != 1) {
printk("NEED MORE WORK FOR MULTIPLE BMAPS (which are new)\n");
}
}
goto out;
error0: /* Cancel bmap, unlock inode, and cancel trans */
xfs_bmap_cancel(&free_list);
error1: /* Just cancel transaction */
xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
*npbmaps = 0; /* nothing set-up here */
error_out:
out: /* Just return error and any tracing at end of routine */
return XFS_ERROR(error);
}
/*
* All xfs metadata buffers except log state machine buffers
* get this attached as their b_bdstrat callback function.
* This is so that we can catch a buffer
* after prematurely unpinning it to forcibly shutdown the filesystem.
*/
int
xfs_bdstrat_cb(struct xfs_buf *bp)
{
xfs_mount_t *mp;
mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
if (!XFS_FORCED_SHUTDOWN(mp)) {
pagebuf_iorequest(bp);
return 0;
} else {
xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
/*
* Metadata write that didn't get logged but
* written delayed anyway. These aren't associated
* with a transaction, and can be ignored.
*/
if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
(XFS_BUF_ISREAD(bp)) == 0)
return (xfs_bioerror_relse(bp));
else
return (xfs_bioerror(bp));
}
}
/*
* Wrapper around bdstrat so that we can stop data
* from going to disk in case we are shutting down the filesystem.
* Typically user data goes thru this path; one of the exceptions
* is the superblock.
*/
int
xfsbdstrat(
struct xfs_mount *mp,
struct xfs_buf *bp)
{
ASSERT(mp);
if (!XFS_FORCED_SHUTDOWN(mp)) {
if (XFS_BUF_IS_GRIO(bp)) {
printk("xfsbdstrat needs grio_strategy\n");
} else {
pagebuf_iorequest(bp);
}
return 0;
}
xfs_buftrace("XFSBDSTRAT IOERROR", bp);
return (xfs_bioerror_relse(bp));
}
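/* Flush all delayed-write buffers for the given target and wait. */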
void
XFS_bflush(buftarg_t target)
{
pagebuf_delwri_flush(target.pb_targ, PBDF_WAIT, NULL);
}
/* Push all fs state out to disk
*/
void
XFS_log_write_unmount_ro(bhv_desc_t *bdp)
{
xfs_mount_t *mp;
int pincount = 0;
int count = 0;
int error;
mp = XFS_BHVTOM(bdp);
xfs_refcache_purge_mp(mp);
xfs_binval(mp->m_ddev_targ);
do {
xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
VFS_SYNC(XFS_MTOVFS(mp), SYNC_ATTR|SYNC_WAIT, sys_cred, error);
pagebuf_delwri_flush(mp->m_ddev_targ.pb_targ,
PBDF_WAIT, &pincount);
if (pincount == 0) { delay(50); count++; }
} while (count < 2);
/* Ok now write out an unmount record */
xfs_log_unmount_write(mp);
xfs_unmountfs_writesb(mp);
}
/*
* In these two situations we disregard the readonly mount flag and
* temporarily enable writes (we must, to ensure metadata integrity).
*/
STATIC int
xfs_is_read_only(xfs_mount_t *mp)
{
if (is_read_only(mp->m_dev) || is_read_only(mp->m_logdev)) {
cmn_err(CE_NOTE,
"XFS: write access unavailable, cannot proceed.");
return EROFS;
}
cmn_err(CE_NOTE,
"XFS: write access will be enabled during mount.");
XFS_MTOVFS(mp)->vfs_flag &= ~VFS_RDONLY;
return 0;
}
int
xfs_recover_read_only(xlog_t *log)
{
cmn_err(CE_NOTE, "XFS: WARNING: "
"recovery required on readonly filesystem.");
return xfs_is_read_only(log->l_mp);
}
int
xfs_quotacheck_read_only(xfs_mount_t *mp)
{
cmn_err(CE_NOTE, "XFS: WARNING: "
"quotacheck required on readonly filesystem.");
return xfs_is_read_only(mp);
}