#ident	"@(#)ufs_vnodeops.c 1.1 94/10/31 SMI"

/*
 * Copyright (c) 1989, 1990 by Sun Microsystems, Inc.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vfs_stat.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/uio.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmmeter.h>
#include <sys/trace.h>
#include <sys/unistd.h>
#include <sys/stat.h>
#include <sys/filio.h>		/* FIOLFS */
#include <sys/vaccess.h>	/* FIOLFS */
#include <sys/lockfs.h>		/* FIOLFS */
#include <sys/filai.h>		/* FIOAI */

#include <specfs/fifo.h>
#include <ufs/fs.h>
#include <ufs/inode.h>
#include <ufs/mount.h>
#include <ufs/fsdir.h>
#include <ufs/lockf.h>		/* Defines constants for the locking code */
#include <ufs/lockfs.h>		/* FIOLFS */
#ifdef QUOTA
#include <ufs/quota.h>
#endif
#include <sys/dirent.h>		/* must be AFTER <ufs/fsdir>! */

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/rm.h>
#include <vm/swap.h>

#include <krpc/lockmgr.h>

#define	ISVDEV(t) ((t == VCHR) || (t == VBLK) || (t == VFIFO))
static int ufs_open();
static int ufs_close();
static int ufs_rdwr();
static int ufs_ioctl();
static int ufs_select();
static int ufs_getattr();
static int ufs_setattr();
static int ufs_access();
static int ufs_lookup();
static int ufs_create();
static int ufs_remove();
static int ufs_link();
static int ufs_rename();
static int ufs_mkdir();
static int ufs_rmdir();
static int ufs_readdir();
static int ufs_symlink();
static int ufs_readlink();
static int ufs_fsync();
static int ufs_inactive();
static int ufs_lockctl();
static int ufs_fid();
static int ufs_getpage();
static int ufs_putpage();
static int ufs_map();
static int ufs_cmp();
static int ufs_realvp();
static int ufs_cntl();
static int ufs_badop();
/*
 * ulockfs intercepts
 * Substituted for normal VOP entry points in ufs_vnodeops below
 */
static int ufs_l_open();
static int ufs_l_close();
static int ufs_l_rdwr();
static int ufs_l_select();
static int ufs_l_getattr();
static int ufs_l_setattr();
static int ufs_l_access();
static int ufs_l_lookup();
static int ufs_l_create();
static int ufs_l_remove();
static int ufs_l_link();
static int ufs_l_rename();
static int ufs_l_mkdir();
static int ufs_l_rmdir();
static int ufs_l_readdir();
static int ufs_l_symlink();
static int ufs_l_readlink();
static int ufs_l_fsync();
static int ufs_l_inactive();
static int ufs_l_lockctl();
static int ufs_l_fid();
static int ufs_l_getpage();
static int ufs_l_putpage();
static int ufs_l_map();
static int ufs_l_cntl();
/*
 * Replace standard entries with ulockfs intercepts
 */
struct vnodeops ufs_vnodeops = {
	ufs_l_open,
	ufs_l_close,
	ufs_l_rdwr,
	ufs_ioctl,
	ufs_l_select,
	ufs_l_getattr,
	ufs_l_setattr,
	ufs_l_access,
	ufs_l_lookup,
	ufs_l_create,
	ufs_l_remove,
	ufs_l_link,
	ufs_l_rename,
	ufs_l_mkdir,
	ufs_l_rmdir,
	ufs_l_readdir,
	ufs_l_symlink,
	ufs_l_readlink,
	ufs_l_fsync,
	ufs_l_inactive,
	ufs_l_lockctl,
	ufs_l_fid,
	ufs_l_getpage,
	ufs_l_putpage,
	ufs_l_map,
	ufs_badop,		/* dump */
	ufs_cmp,
	ufs_realvp,
	ufs_l_cntl,
};
/*
 * FORCED UNMOUNT ENTRY POINTS
 * Alternate vnodeops branch table substituted for ufs_vnodeops
 */
static int ufs_eio();
static int ufs_f_close();
static int ufs_f_inactive();
struct vnodeops ufs_forcedops = {
	ufs_eio,		/* ufs_open, */
	ufs_f_close,
	ufs_eio,		/* ufs_rdwr, */
	ufs_eio,		/* ufs_ioctl, */
	ufs_eio,		/* ufs_select, */
	ufs_eio,		/* ufs_getattr, */
	ufs_eio,		/* ufs_setattr, */
	ufs_eio,		/* ufs_access, */
	ufs_eio,		/* ufs_lookup, */
	ufs_eio,		/* ufs_create, */
	ufs_eio,		/* ufs_remove, */
	ufs_eio,		/* ufs_link, */
	ufs_eio,		/* ufs_rename, */
	ufs_eio,		/* ufs_mkdir, */
	ufs_eio,		/* ufs_rmdir, */
	ufs_eio,		/* ufs_readdir, */
	ufs_eio,		/* ufs_symlink, */
	ufs_eio,		/* ufs_readlink, */
	ufs_eio,		/* ufs_fsync, */
	ufs_f_inactive,
	ufs_eio,		/* ufs_lockctl, */
	ufs_eio,		/* ufs_fid, */
	ufs_eio,		/* ufs_getpage, */
	ufs_eio,		/* ufs_putpage, */
	ufs_eio,		/* ufs_map, */
	ufs_badop,		/* dump */
	ufs_cmp,
	ufs_eio,		/* ufs_realvp, */
	ufs_eio,		/* ufs_cntl, */
};
/*
 * FORCED UNMOUNT VOP ROUTINES
 * VOP calls for inodes belonging to forcibly unmounted file systems
 * enter one of the following routines.
 */
static int
ufs_eio()
{
	return (EIO);
}

/*ARGSUSED*/
static int
ufs_f_close(vp, flag, count, cred)
	struct vnode *vp;
	int flag;
	int count;
	struct ucred *cred;
{
	return (0);
}

/*ARGSUSED*/
static int
ufs_f_inactive(vp, cred)
	struct vnode *vp;
	struct ucred *cred;
{
	iinactive(VTOI(vp));
	return (0);
}
/*
 * ULOCKFS MACROS
 */
/*
 * ulockfs intercept routines surround the normal ufs VOP call with a
 * locking wrapper by using the following wrapper macro
 */
#define	ULOCKFS(VP, VAID, VOPCALL) \
{ \
	int reterr; \
	struct mount *mp; \
\
	if (reterr = ufs_lockfs_begin(VP, VAID, &mp)) \
		return (reterr); \
	reterr = (VOPCALL); \
	ufs_lockfs_end(VAID, mp); \
	return (reterr); \
}

int lock(), unlock();
void test_lock(), kill_proc_locks();
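/*
 * For illustration only: an intercept built from this macro would look
 * roughly like the sketch below (each ufs_l_* routine wraps the ufs_*
 * entry point of the same name and takes the same argument list):
 *
 *	static int
 *	ufs_l_getattr(vp, vap, cred)
 *		struct vnode *vp;
 *		struct vattr *vap;
 *		struct ucred *cred;
 *	{
 *		ULOCKFS(vp, VA_GETATTR, ufs_getattr(vp, vap, cred));
 *	}
 *
 * ufs_lockfs_begin() can fail (or wait) while the file system is
 * locked via the FIOLFS ioctl handled below; ufs_lockfs_end() closes
 * the protected region.
 */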
/*ARGSUSED*/
|
|
static int
|
|
ufs_open(vpp, flag, cred)
|
|
struct vnode **vpp;
|
|
int flag;
|
|
struct ucred *cred;
|
|
{
|
|
register int error, cmd;
|
|
register struct inode *ip;
|
|
struct eflock ld; /* Holder for an I/O lock */
|
|
|
|
VFS_RECORD((*vpp)->v_vfsp, VS_OPEN, VS_CALL);
|
|
ip = VTOI(*vpp);
|
|
/*
 * Mandatory file and record locking (MFRL).  MFRL is enforced when
 * the set-GID bit is set and the group-execute bit is clear (hey, I
 * didn't come up with this scheme!).  When enabled, reads and writes
 * are checked to see whether they would violate an existing lock on
 * the file.  The failure mode depends on the O_NDELAY flag on the
 * file descriptor: when it is set, EAGAIN is returned; when it is
 * clear, the process blocks until there are no conflicting locks.
 * In either case, EDEADLK is returned if a deadlock would occur.
 */
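/*
 * For example, a regular file with mode 2644 (ISGID set, group
 * execute clear) is subject to mandatory locking here, while mode
 * 2755 (group execute set) keeps ordinary advisory semantics.
 */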
|
|
if (((ip->i_mode & ISGID) != 0) && ((ip->i_mode & IFMT) == IFREG) &&
|
|
((ip->i_mode & (IEXEC >> 3)) == 0)) {
|
|
ld.l_type = F_WRLCK;
|
|
ld.l_start = 0;
|
|
ld.l_len = 0x7fffffff;
|
|
ld.l_whence = 0;
|
|
cmd = F_SETLK;
|
|
/* XXX need a better way to get pid */
|
|
if ((error = lock(*vpp, &ld, cmd, u.u_procp->p_pid, IO_LOCK)) !=
|
|
0) {
|
|
/* to make it SVID compliant return EAGAIN */
|
|
if (error == EACCES)
|
|
error = EAGAIN;
|
|
return (error);
|
|
}
|
|
} else {
|
|
cmd = 0;
|
|
}
|
|
if (cmd) {
|
|
ld.l_type = F_UNLCK;
|
|
cmd = F_SETLK;
|
|
/* XXX need a better way to get pid */
|
|
(void) unlock(*vpp, &ld, cmd, u.u_procp->p_pid, IO_LOCK);
|
|
}
|
|
return (0);
|
|
}
|
|
|
|
/*ARGSUSED*/
|
|
static int
|
|
ufs_close(vp, flag, count, cred)
|
|
struct vnode *vp;
|
|
int flag;
|
|
int count;
|
|
struct ucred *cred;
|
|
{
|
|
VFS_RECORD(vp->v_vfsp, VS_CLOSE, VS_CALL);
|
|
return (0);
|
|
}
|
|
|
|
|
|
/*
|
|
* read or write a vnode
|
|
*/
|
|
/*ARGSUSED*/
|
|
static int
|
|
ufs_rdwr(vp, uiop, rw, ioflag, cred)
|
|
struct vnode *vp;
|
|
struct uio *uiop;
|
|
enum uio_rw rw;
|
|
int ioflag;
|
|
struct ucred *cred;
|
|
{
|
|
register struct inode *ip;
|
|
int error;
|
|
int didlock; /* TRUE if the inode was locked. */
|
|
int cmd; /* I/O lock command, zero if no lock */
|
|
struct eflock ld; /* Holder for an I/O lock */
|
|
|
|
ip = VTOI(vp);
|
|
|
|
/*
 * Mandatory file and record locking (MFRL).  MFRL is enforced when
 * the set-GID bit is set and the group-execute bit is clear (hey, I
 * didn't come up with this scheme!).  When enabled, reads and writes
 * are checked to see whether they would violate an existing lock on
 * the file.  The failure mode depends on the O_NDELAY flag on the
 * file descriptor: when it is set, EAGAIN is returned; when it is
 * clear, the process blocks until there are no conflicting locks.
 * In either case, EDEADLK is returned if a deadlock would occur.
 */
|
|
if (((ip->i_mode & ISGID) != 0) && ((ip->i_mode & IFMT) == IFREG) &&
|
|
((ip->i_mode & (IEXEC >> 3)) == 0)) {
|
|
ld.l_type = (rw == UIO_WRITE) ? F_WRLCK : F_RDLCK;
|
|
ld.l_start = uiop->uio_offset;
|
|
ld.l_len = uiop->uio_resid;
|
|
ld.l_whence = 0;
|
|
cmd = (ioflag & IO_NDELAY) ? F_SETLK : F_SETLKW;
|
|
/* XXX need a better way to get pid */
|
|
error = lock(vp, &ld, cmd, u.u_procp->p_pid, IO_LOCK);
|
|
if (error != 0) {
|
|
/* to make it SVID compliant return EAGAIN */
|
|
if (error == EACCES)
|
|
error = EAGAIN;
|
|
return (error);
|
|
}
|
|
} else
|
|
cmd = 0; /* No lock set */
|
|
|
|
if ((ioflag & IO_APPEND) != 0 && (rw == UIO_WRITE) &&
|
|
(ip->i_mode & IFMT) == IFREG) {
|
|
/*
|
|
* In append mode start at end of file after locking it.
|
|
*/
|
|
didlock = 1;
|
|
ILOCK(ip);
|
|
uiop->uio_offset = ip->i_size;
|
|
} else
|
|
didlock = 0;
|
|
|
|
error = rwip(ip, uiop, rw, ioflag);
|
|
ITIMES(ip);
|
|
|
|
if (didlock)
|
|
IUNLOCK(ip);
|
|
if (cmd) {
|
|
ld.l_type = F_UNLCK;
|
|
cmd = F_SETLK;
|
|
/* XXX need a better way to get pid */
|
|
(void) unlock(vp, &ld, cmd, u.u_procp->p_pid, IO_LOCK);
|
|
}
|
|
|
|
return (error);
|
|
}
|
|
|
|
/*
|
|
* Don't cache write blocks to files with the sticky bit set.
|
|
* Used to keep swap files from blowing the page cache on a server.
|
|
*/
|
|
int stickyhack = 1;
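/*
 * With stickyhack on, ufs_lookup() below flags vnodes whose modes
 * have ISVTX set but no execute bit with VISSWAP, and rwip() then
 * pushes pages of such "swap-like" vnodes out and frees them (the
 * IS_SWAPVP() cases) rather than letting them crowd the page cache.
 */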
|
|
|
|
/*
|
|
* Bytes / inode allowed in the disk queue.
|
|
*/
|
|
int ufs_WRITES = 512 * 1024;
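/*
 * i_writes tracks the bytes of write I/O outstanding on an inode:
 * ufs_writelbn() below adds to it before starting a write and
 * ufs_writedone() subtracts at interrupt time, waking any writer
 * sleeping in rwip() once the count falls back to ufs_WRITES or less.
 */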
|
|
|
|
#ifdef MULTIPROCESSOR
|
|
/*
|
|
* release the kernel lock during uiomove.
|
|
*/
|
|
int ufs_uiomove_nolock = 1;
|
|
#endif
|
|
|
|
/*
|
|
* prefault segmkmap mapping in rwip to avoid traps.
|
|
*/
|
|
int dogetmapflt = 1;
|
|
|
|
/*
 * The idea behind the freebehind stuff is this:
 * We want caching, but we don't want large i/o's to blow everything
 * else out.  Furthermore, it is more expensive (cpu-wise) to wait for
 * the pager to free up memory; it is faster to have the process free
 * up its own memory.
 *
 * The knobs associated with this stuff are:
 *
 *	freebehind		on/off switch for both read and write
 *	write_free		on/off for unconditional frees upon write
 *				completion
 *	pages_before_pager	the pager turns on at 'lotsfree'; we turn
 *				on at 'lotsfree + pages_before_pager'.
 *				This wants to be at least a cluster's
 *				worth.
 *	smallfile		don't free behind at offsets less than this.
 */
|
|
int freebehind = 1;
|
|
int pages_before_pager = 30; /* 1 cluster on a sun4c, 2 on all others */
|
|
int write_free = 0;
|
|
int smallfile = 32 * 1024;
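/*
 * Taken together: free-behind triggers only for sequential reads past
 * smallfile bytes into the file (the dofree test in rwip()), and the
 * pages are actually freed only once freemem drops below
 * lotsfree + pages_before_pager, i.e. shortly before the pager itself
 * would start reclaiming.
 */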
|
|
#ifdef MULTIPROCESSOR
|
|
int ufs_lock_released = 0;
|
|
#endif
|
|
|
|
/*
|
|
* rwip does the real work of read or write requests for ufs.
|
|
*/
|
|
static int
|
|
rwip(ip, uio, rw, ioflag)
|
|
register struct inode *ip;
|
|
register struct uio *uio;
|
|
enum uio_rw rw;
|
|
int ioflag;
|
|
{
|
|
register u_int off;
|
|
register addr_t base;
|
|
register int n, on, mapon;
|
|
register struct fs *fs;
|
|
struct vnode *vp;
|
|
int type, error, pagecreate;
|
|
u_int flags;
|
|
int iupdat_flag;
|
|
long old_blocks;
|
|
int adjust_resid = 0;
|
|
int dofree;
|
|
extern int freemem, lotsfree, pages_before_pager;
|
|
int orig_resid = 0;
|
|
int last = 0;
|
|
#ifdef MULTIPROCESSOR
|
|
int klock_released;
|
|
#endif
|
|
#ifdef LWP
|
|
extern int runthreads;
|
|
#endif
|
|
extern caddr_t segmap_getmapflt();
|
|
|
|
if (rw != UIO_READ && rw != UIO_WRITE)
|
|
panic("rwip");
|
|
type = ip->i_mode & IFMT;
|
|
if (type != IFREG && type != IFDIR && type != IFLNK)
|
|
panic("rwip type");
|
|
if (uio->uio_offset < 0 || (uio->uio_offset + uio->uio_resid) < 0)
|
|
return (EINVAL);
|
|
if (uio->uio_resid == 0)
|
|
return (0);
|
|
|
|
trace6(TR_UFS_RWIP, ip, uio, rw, ioflag, uio->uio_offset,
|
|
TRC_RWIP_ENTER);
|
|
|
|
ILOCK(ip);
|
|
|
|
if (rw == UIO_WRITE) {
|
|
if (type == IFREG && uio->uio_offset + uio->uio_resid >
|
|
u.u_rlimit[RLIMIT_FSIZE].rlim_cur) {
|
|
if (uio->uio_offset >=
|
|
u.u_rlimit[RLIMIT_FSIZE].rlim_cur) {
|
|
psignal(u.u_procp, SIGXFSZ);
|
|
error = EFBIG;
|
|
goto out;
|
|
} else {
|
|
adjust_resid = uio->uio_resid;
|
|
uio->uio_resid =
|
|
u.u_rlimit[RLIMIT_FSIZE].rlim_cur - uio->uio_offset;
|
|
adjust_resid -= uio->uio_resid;
|
|
}
|
|
}
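/*
 * adjust_resid remembers how many bytes were trimmed from this
 * request to honor RLIMIT_FSIZE; it is stored back into uio_resid
 * at "out:" so the caller sees them as unwritten, and SIGXFSZ is
 * posted there if the truncated write otherwise succeeded.
 */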
|
|
ip->i_flag |= INOACC; /* don't update ref time in getpage */
|
|
} else {
|
|
if (!ULOCKFS_IS_NOIACC(ITOU(ip)))
|
|
ip->i_flag |= IACC;
|
|
}
|
|
|
|
if (ioflag & IO_SYNC) {
|
|
ip->i_flag |= ISYNC;
|
|
old_blocks = ip->i_blocks;
|
|
iupdat_flag = 0;
|
|
}
|
|
fs = ip->i_fs;
|
|
vp = ITOV(ip);
|
|
do {
|
|
off = uio->uio_offset & MAXBMASK;
|
|
mapon = uio->uio_offset & MAXBOFFSET;
|
|
on = blkoff(fs, uio->uio_offset);
|
|
n = MIN(fs->fs_bsize - on, uio->uio_resid);
|
|
|
|
if (rw == UIO_READ) {
|
|
int diff = ip->i_size - uio->uio_offset;
|
|
|
|
VFS_RECORD(ITOV(ip)->v_vfsp, VS_READ, VS_CALL);
|
|
if (diff <= 0) {
|
|
error = 0;
|
|
goto out;
|
|
}
|
|
if (diff < n)
|
|
n = diff;
|
|
dofree = freebehind &&
|
|
ip->i_nextr == (off & PAGEMASK) && off > smallfile;
|
|
} else {
|
|
int s;
|
|
|
|
/*
|
|
* Limit the amount of memory that this inode can use.
|
|
* Protected because the count is modified at interrupt
|
|
* level.
|
|
*/
|
|
s = splbio();
|
|
while (ufs_WRITES && ip->i_writes > ufs_WRITES) {
|
|
(void) sleep((caddr_t)&ip->i_writes, PZERO);
|
|
}
|
|
(void) splx(s);
|
|
VFS_RECORD(ITOV(ip)->v_vfsp, VS_WRITE, VS_CALL);
|
|
}
|
|
|
|
if ( dogetmapflt && rw == UIO_READ
|
|
&& off + MAXBSIZE <= ip->i_size )
|
|
base = segmap_getmapflt(segkmap, vp, off);
|
|
else
|
|
base = segmap_getmap(segkmap, vp, off);
|
|
|
|
trace6(TR_UFS_RWIP, ip, uio, rw, ioflag, uio->uio_offset,
|
|
TRC_RWIP_GETMAP);
|
|
|
|
if (rw == UIO_WRITE) {
|
|
if (uio->uio_offset + n > ip->i_size) {
|
|
/*
|
|
* We are extending the length of the file.
|
|
* bmap is used so that we are sure that
|
|
* if we need to allocate new blocks, that it
|
|
* is done here before we up the file size.
|
|
*/
|
|
error = bmap_write(ip,
|
|
(daddr_t)lblkno(fs, uio->uio_offset),
|
|
0, (daddr_t*)0, (int*)0,
|
|
(int)(on + n), mapon == 0);
|
|
|
|
if (error && (on % fs->fs_fsize)) {
|
|
int llbn = lblkno(fs, ip->i_size-1);
|
|
int olbn = lblkno(fs, uio->uio_offset);
|
|
|
|
/*
|
|
* If the offset is within the same
|
|
* block as the last byte of the file,
|
|
* fill out the rest of that block
|
|
* or frag for Posix */
|
|
|
|
if (llbn == olbn) {
|
|
int avail=
|
|
blksize(fs, ip, llbn) - on;
|
|
n = MIN(MAX(0,avail), n);
|
|
orig_resid = uio->uio_resid - n;
|
|
uio->uio_resid = n;
|
|
error = 0;
|
|
}
|
|
}
|
|
trace6(TR_UFS_RWIP, ip, uio, rw, ioflag,
|
|
uio->uio_offset, TRC_RWIP_BMAPALLOC);
|
|
if (error) {
|
|
(void) segmap_release(segkmap, base, 0);
|
|
/*
|
|
* For Posix. If the last write worked
|
|
* and now we're out of space return
|
|
* the number of bytes written.
|
|
*/
|
|
if (last > 0 && error == ENOSPC)
|
|
error = 0;
|
|
break;
|
|
}
|
|
ip->i_size = uio->uio_offset + n;
|
|
iupdat_flag = 1;
|
|
/*
|
|
* If we are writing from the beginning of
|
|
* the mapping, we can just create the
|
|
* pages without having to read them.
|
|
*/
|
|
if (mapon == 0) {
|
|
segmap_pagecreate(segkmap, base,
|
|
(u_int)n, 0);
|
|
pagecreate = 1;
|
|
} else
|
|
pagecreate = 0;
|
|
} else if (n == MAXBSIZE) {
|
|
/*
|
|
* Going to do a whole mapping's worth,
|
|
* so we can just create the pages w/o
|
|
* having to read them in. But before
|
|
* we do that, we need to make sure any
|
|
* needed blocks are allocated first.
|
|
*/
|
|
error = bmap_write(ip,
|
|
(daddr_t)lblkno(fs, uio->uio_offset),
|
|
0, (daddr_t*)0, (int*)0, (int)(on + n), 1);
|
|
trace6(TR_UFS_RWIP, ip, uio, rw, ioflag,
|
|
uio->uio_offset, TRC_RWIP_BMAPALLOC);
|
|
|
|
if (error && (on % fs->fs_fsize)) {
|
|
int llbn = lblkno(fs, ip->i_size-1);
|
|
int olbn = lblkno(fs, uio->uio_offset);
|
|
|
|
/*
|
|
* If the offset is within the same
|
|
* block as the last byte of the file,
|
|
* fill out the rest of that block
|
|
* or frag for Posix */
|
|
|
|
if (llbn == olbn) {
|
|
int avail=
|
|
blksize(fs, ip, llbn) - on;
|
|
n = MIN(MAX(0,avail), n);
|
|
orig_resid = uio->uio_resid - n;
|
|
uio->uio_resid = n;
|
|
error = 0;
|
|
}
|
|
}
|
|
|
|
if (error) {
|
|
(void) segmap_release(segkmap, base, 0);
|
|
if (last > 0 && error == ENOSPC)
|
|
error = 0;
|
|
break;
|
|
}
|
|
segmap_pagecreate(segkmap, base, (u_int)n, 0);
|
|
pagecreate = 1;
|
|
} else
|
|
pagecreate = 0;
|
|
} else
|
|
pagecreate = 0;
|
|
|
|
#ifdef MULTIPROCESSOR
|
|
#ifdef LWP
|
|
if (ufs_uiomove_nolock && n >= 512 && (runthreads == 0)) {
|
|
#else
|
|
if (ufs_uiomove_nolock && n >= 512) {
|
|
#endif
|
|
ufs_lock_released++;
|
|
klock_exit();
|
|
klock_released = 1;
|
|
} else
|
|
klock_released = 0;
|
|
error = uiomove(base + mapon, n, rw, uio);
|
|
if (klock_released) {
|
|
klock_enter();
|
|
ufs_lock_released--;
|
|
}
|
|
#else
|
|
error = uiomove(base + mapon, n, rw, uio);
|
|
#endif
|
|
|
|
if (pagecreate && uio->uio_offset <
|
|
roundup(off + mapon + n, PAGESIZE)) {
|
|
/*
|
|
* We created pages w/o initializing them completely,
|
|
* thus we need to zero the part that wasn't set up.
|
|
* This happens on most EOF write cases and if
|
|
* we had some sort of error during the uiomove.
|
|
*/
|
|
int nzero, nmoved;
|
|
|
|
nmoved = uio->uio_offset - (off + mapon);
|
|
ASSERT(nmoved >= 0 && nmoved <= n);
|
|
nzero = roundup(on + n, PAGESIZE) - nmoved;
|
|
ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
|
|
(void) kzero(base + mapon + nmoved, (u_int)nzero);
|
|
}
|
|
trace6(TR_UFS_RWIP, ip, uio, rw, ioflag, uio->uio_offset,
|
|
TRC_RWIP_UIOMOVE);
|
|
|
|
if (error == 0) {
|
|
int free;
|
|
|
|
flags = 0;
|
|
if (rw == UIO_WRITE) {
|
|
if (write_free ||
|
|
(freebehind &&
|
|
freemem < lotsfree + pages_before_pager)) {
|
|
free = SM_FREE;
|
|
} else {
|
|
free = 0;
|
|
}
|
|
/*
|
|
* Force write back for synchronous write cases.
|
|
*/
|
|
if ((ioflag & IO_SYNC) || type == IFDIR) {
|
|
/*
|
|
* If the sticky bit is set but the
|
|
* execute bit is not set, we do a
|
|
* synchronous write back and free
|
|
* the page when done. We set up swap
|
|
* files to be handled this way to
|
|
* prevent servers from keeping around
|
|
* the client's swap pages too long.
|
|
* XXX - there ought to be a better way.
|
|
*/
|
|
if (IS_SWAPVP(vp)) {
|
|
flags = SM_WRITE | SM_FREE |
|
|
SM_DONTNEED;
|
|
} else {
|
|
iupdat_flag = 1;
|
|
flags = SM_WRITE | free;
|
|
}
|
|
} else if (n + on == MAXBSIZE ||
|
|
IS_SWAPVP(vp)) {
|
|
/*
|
|
* Have written a whole block.
|
|
* Start an asynchronous write and
|
|
* mark the buffer to indicate that
|
|
* it won't be needed again soon.
|
|
*/
|
|
flags = SM_WRITE | SM_ASYNC | free;
|
|
}
|
|
ip->i_flag |= IUPD | ICHG;
|
|
if (u.u_ruid != 0 && (ip->i_mode & (IEXEC |
|
|
(IEXEC >> 3) | (IEXEC >> 6))) != 0) {
|
|
/*
|
|
* Clear Set-UID & Set-GID bits on
|
|
* successful write if not super-user
|
|
* and at least one of the execute bits
|
|
* is set. If we always clear Set-GID,
|
|
* mandatory file and record locking is
|
|
* unusable.
|
|
*/
|
|
ip->i_mode &= ~(ISUID | ISGID);
|
|
}
|
|
} else if (rw == UIO_READ) {
|
|
if (freebehind && dofree &&
|
|
freemem < lotsfree + pages_before_pager) {
|
|
flags = SM_FREE | SM_DONTNEED;
|
|
}
|
|
}
|
|
error = segmap_release(segkmap, base, flags);
|
|
} else {
|
|
(void) segmap_release(segkmap, base, 0);
|
|
}
|
|
trace6(TR_UFS_RWIP, ip, uio, rw, ioflag, uio->uio_offset,
|
|
TRC_RWIP_RELEASE);
|
|
|
|
/*
|
|
* For Posix conformance
|
|
*/
|
|
if (orig_resid) {
|
|
uio->uio_resid = orig_resid;
|
|
if(error == ENOSPC)
|
|
error = 0;
|
|
break;
|
|
}
|
|
|
|
last = n;
|
|
|
|
} while (error == 0 && uio->uio_resid > 0 && n != 0);
|
|
|
|
/*
|
|
* If we are doing synchronous write the only time we should
|
|
* not be sync'ing the ip here is if we have the stickyhack
|
|
* activated, the file is marked with the sticky bit and
|
|
* no exec bit, the file length has not been changed and
|
|
* no new blocks have been allocated during this write.
|
|
*/
|
|
if ((ioflag & IO_SYNC) != 0 && rw == UIO_WRITE &&
|
|
(iupdat_flag != 0 || old_blocks != ip->i_blocks)) {
|
|
iupdat(ip, 1);
|
|
trace6(TR_UFS_RWIP, ip, uio, rw, ioflag, uio->uio_offset,
|
|
TRC_RWIP_IUPDAT);
|
|
}
|
|
|
|
out:
|
|
ip->i_flag &= ~(ISYNC | INOACC);
|
|
IUNLOCK(ip);
|
|
if (!error && adjust_resid) {
|
|
uio->uio_resid = adjust_resid;
|
|
psignal(u.u_procp, SIGXFSZ);
|
|
}
|
|
trace6(TR_UFS_RWIP, ip, uio, rw, ioflag, uio->uio_offset,
|
|
TRC_RWIP_RETURN);
|
|
return (error);
|
|
}
|
|
|
|
/*ARGSUSED*/
|
|
static int
|
|
ufs_ioctl(vp, com, data, flag, cred)
|
|
struct vnode *vp;
|
|
int com;
|
|
caddr_t data;
|
|
int flag;
|
|
struct ucred *cred;
|
|
{
|
|
int error;
|
|
|
|
VFS_RECORD(vp->v_vfsp, VS_IOCTL, VS_CALL);
|
|
|
|
switch (com) {
|
|
case FIOLFS:
|
|
/*
|
|
* file system locking
|
|
*/
|
|
if ((error = ufs_lockfs_hold(vp->v_vfsp)) == 0) {
|
|
error = ufs_fiolfs(vp, (struct lockfs **)data);
|
|
ufs_lockfs_rele(vp->v_vfsp);
|
|
}
|
|
break;
|
|
case FIOLFSS:
|
|
/*
|
|
* file system lock status
|
|
*/
|
|
if ((error = ufs_lockfs_hold(vp->v_vfsp)) == 0) {
|
|
error = ufs_fiolfss(vp, (struct lockfs **)data);
|
|
ufs_lockfs_rele(vp->v_vfsp);
|
|
}
|
|
break;
|
|
case FIOFFS:
|
|
/*
|
|
* file system flush (push w/invalidate)
|
|
*/
|
|
if ((error = ufs_lockfs_hold(vp->v_vfsp)) == 0) {
|
|
error = ufs_fioffs(vp, (struct lockfs **)data);
|
|
ufs_lockfs_rele(vp->v_vfsp);
|
|
}
|
|
break;
|
|
case FIOAI:
|
|
/*
|
|
* file allocation information
|
|
*/
|
|
ULOCKFS(vp, VA_GETATTR,
|
|
ufs_fioai(vp, (struct filai **)data));
|
|
/* NOTREACHED */
|
|
break;
|
|
case FIODUTIMES:
|
|
/*
|
|
* file allocation information
|
|
*/
|
|
ULOCKFS(vp, VA_CHANGE,
|
|
ufs_fiodutimes(vp, (struct timeval **)data));
|
|
/* NOTREACHED */
|
|
break;
|
|
case FIODIO:
|
|
/*
|
|
* file system meta/user data delayed io
|
|
*/
|
|
ULOCKFS(vp, VA_WRITE,
|
|
ufs_fiodio(vp, (u_long **)data));
|
|
/* NOTREACHED */
|
|
break;
|
|
case FIODIOS:
|
|
/*
|
|
* file system meta/user data delayed io status
|
|
*/
|
|
ULOCKFS(vp, VA_READ,
|
|
ufs_fiodios(vp, (u_long **)data));
|
|
/* NOTREACHED */
|
|
break;
|
|
default:
|
|
error = ENOTTY;
|
|
break;
|
|
}
|
|
return (error);
|
|
}
|
|
|
|
/*ARGSUSED*/
|
|
static int
|
|
ufs_select(vp, which, cred)
|
|
struct vnode *vp;
|
|
int which;
|
|
struct ucred *cred;
|
|
{
|
|
|
|
VFS_RECORD(vp->v_vfsp, VS_SELECT, VS_CALL);
|
|
return (EINVAL);
|
|
}
|
|
|
|
/*ARGSUSED*/
|
|
static int
|
|
ufs_getattr(vp, vap, cred)
|
|
struct vnode *vp;
|
|
register struct vattr *vap;
|
|
struct ucred *cred;
|
|
{
|
|
register struct inode *ip;
|
|
|
|
VFS_RECORD(vp->v_vfsp, VS_GETATTR, VS_CALL);
|
|
|
|
ip = VTOI(vp);
|
|
/*
|
|
* Mark correct time in inode.
|
|
*/
|
|
ITIMES(ip);
|
|
/*
|
|
* Copy from inode table.
|
|
*/
|
|
vap->va_type = IFTOVT(ip->i_mode);
|
|
vap->va_mode = ip->i_mode;
|
|
vap->va_uid = ip->i_uid;
|
|
vap->va_gid = ip->i_gid;
|
|
vap->va_fsid = ip->i_dev;
|
|
vap->va_nodeid = ip->i_number;
|
|
vap->va_nlink = ip->i_nlink;
|
|
vap->va_size = ip->i_size;
|
|
vap->va_atime = ip->i_atime;
|
|
vap->va_mtime = ip->i_mtime;
|
|
vap->va_ctime = ip->i_ctime;
|
|
vap->va_rdev = ip->i_rdev;
|
|
vap->va_blocks = ip->i_blocks;
|
|
|
|
switch (ip->i_mode & IFMT) {
|
|
|
|
case IFBLK:
|
|
vap->va_blocksize = MAXBSIZE; /* was BLKDEV_IOSIZE */
|
|
break;
|
|
|
|
case IFCHR:
|
|
vap->va_blocksize = MAXBSIZE;
|
|
break;
|
|
|
|
default:
|
|
vap->va_blocksize = vp->v_vfsp->vfs_bsize;
|
|
break;
|
|
}
|
|
return (0);
|
|
}
|
|
|
|
static int
|
|
ufs_setattr(vp, vap, cred)
|
|
register struct vnode *vp;
|
|
register struct vattr *vap;
|
|
struct ucred *cred;
|
|
{
|
|
register struct inode *ip;
|
|
int chtime = 0;
|
|
int error = 0;
|
|
|
|
VFS_RECORD(vp->v_vfsp, VS_SETATTR, VS_CALL);
|
|
|
|
/*
|
|
* Cannot set these attributes
|
|
*/
|
|
if ((vap->va_nlink != -1) || (vap->va_blocksize != -1) ||
|
|
(vap->va_rdev != -1) || (vap->va_blocks != -1) ||
|
|
(vap->va_fsid != -1) || (vap->va_nodeid != -1) ||
|
|
((int)vap->va_type != -1)) {
|
|
return (EINVAL);
|
|
}
|
|
|
|
ip = VTOI(vp);
|
|
ILOCK(ip);
|
|
/*
|
|
* Change file access modes. Must be owner or su.
|
|
*/
|
|
if (vap->va_mode != (u_short)-1) {
|
|
error = OWNER(cred, ip);
|
|
if (error)
|
|
goto out;
|
|
ip->i_mode &= IFMT;
|
|
ip->i_mode |= vap->va_mode & ~IFMT;
|
|
if (cred->cr_uid != 0) {
|
|
if ((ip->i_mode & IFMT) != IFDIR)
|
|
/* DBE_FAST_OSYNC */
|
|
if (ip->i_mode & (IEXEC | (IEXEC>>3) | (IEXEC>>6)))
|
|
/* DBE_FAST_OSYNC */
|
|
ip->i_mode &= ~ISVTX;
|
|
if (!groupmember((int)ip->i_gid))
|
|
ip->i_mode &= ~ISGID;
|
|
}
|
|
ip->i_flag |= ICHG;
|
|
}
|
|
/*
|
|
* To change file ownership, must be su.
|
|
* To change group ownership, must be su or owner and in target group.
|
|
* This is now enforced in chown1() below.
|
|
*/
|
|
if ((vap->va_uid != (uid_t)-1) || (vap->va_gid != (gid_t)-1)) {
|
|
error = chown1(ip, vap->va_uid, vap->va_gid);
|
|
if (error)
|
|
goto out;
|
|
}
|
|
/*
|
|
* Truncate file. Must have write permission (checked above vnode
|
|
* layer) and not be a directory.
|
|
*/
|
|
if (vap->va_size != (u_long)-1) {
|
|
if ((ip->i_mode & IFMT) == IFDIR) {
|
|
error = EISDIR;
|
|
goto out;
|
|
}
|
|
if ((error = itrunc(ip, vap->va_size)) != 0) {
|
|
goto out;
|
|
}
|
|
}
|
|
/*
|
|
* Change file access or modified times.
|
|
*/
|
|
if (vap->va_atime.tv_sec != -1) {
|
|
if (cred->cr_uid != ip->i_uid && cred->cr_uid != 0) {
|
|
error = iaccess(ip, IWRITE);
|
|
if (error)
|
|
goto out;
|
|
}
|
|
ip->i_atime = vap->va_atime;
|
|
chtime++;
|
|
}
|
|
if (vap->va_mtime.tv_sec != -1) {
|
|
/*
|
|
* Allow SysV-compatible option to set access and
|
|
* modified times to the current time if root, owner,
|
|
* or write access.
|
|
*
|
|
* XXX - va_mtime.tv_usec == -1 flags this.
|
|
*/
|
|
if (cred->cr_uid != ip->i_uid && cred->cr_uid != 0) {
|
|
error = iaccess(ip, IWRITE);
|
|
if (error)
|
|
goto out;
|
|
}
|
|
if (vap->va_mtime.tv_usec == -1) {
|
|
ip->i_atime = time;
|
|
ip->i_mtime = time;
|
|
} else {
|
|
ip->i_mtime = vap->va_mtime;
|
|
}
|
|
ip->i_flag |= IMODTIME;
|
|
chtime++;
|
|
}
|
|
if (chtime) {
|
|
ip->i_ctime = time;
|
|
ip->i_flag |= IMOD;
|
|
}
|
|
out:
|
|
iupdat(ip, 1); /* XXX - should be async for perf */
|
|
IUNLOCK(ip);
|
|
return (error);
|
|
}
|
|
|
|
/*
|
|
* Perform chown operation on inode ip;
|
|
* inode must be locked prior to call.
|
|
*/
|
|
static int
|
|
chown1(ip, uid, gid)
|
|
register struct inode *ip;
|
|
uid_t uid;
|
|
gid_t gid;
|
|
{
|
|
#ifdef QUOTA
|
|
register long change;
|
|
#endif
|
|
|
|
if (uid == (uid_t)-1)
|
|
uid = ip->i_uid;
|
|
if (gid == (gid_t)-1)
|
|
gid = ip->i_gid;
|
|
|
|
/*
|
|
* If:
|
|
* 1) not the owner of the file, or
|
|
* 2) trying to change the owner of the file, or
|
|
* 3) trying to change the group of the file to a group not in the
|
|
* process' group set,
|
|
* then must be super-user.
|
|
* Check super-user last, and use "suser", so that the accounting
|
|
* file's "used super-user privileges" flag is properly set.
|
|
*/
|
|
if ((u.u_uid != uid || uid != ip->i_uid || !groupmember((int)gid)) &&
|
|
!suser())
|
|
return (EPERM);
|
|
|
|
#ifdef QUOTA
|
|
if (ip->i_uid == uid) /* this just speeds things a little */
|
|
change = 0;
|
|
else
|
|
change = ip->i_blocks;
|
|
(void) chkdq(ip, -change, 1);
|
|
(void) chkiq(VFSTOM(ip->i_vnode.v_vfsp), ip, (int)ip->i_uid, 1);
|
|
dqrele(ip->i_dquot);
|
|
#endif
|
|
ip->i_uid = uid;
|
|
ip->i_gid = gid;
|
|
ip->i_flag |= ICHG;
|
|
if (u.u_uid != 0)
|
|
ip->i_mode &= ~(ISUID|ISGID);
|
|
#ifdef QUOTA
|
|
ip->i_dquot = getinoquota(ip);
|
|
(void) chkdq(ip, change, 1);
|
|
(void) chkiq(VFSTOM(ip->i_vnode.v_vfsp), (struct inode *)NULL,
|
|
(int)uid, 1);
|
|
#endif
|
|
return (0);
|
|
}
|
|
|
|
/*ARGSUSED*/
|
|
static int
|
|
ufs_access(vp, mode, cred)
|
|
struct vnode *vp;
|
|
int mode;
|
|
struct ucred *cred;
|
|
{
|
|
register struct inode *ip;
|
|
int error;
|
|
|
|
VFS_RECORD(vp->v_vfsp, VS_ACCESS, VS_CALL);
|
|
|
|
ip = VTOI(vp);
|
|
ILOCK(ip);
|
|
error = iaccess(ip, mode);
|
|
IUNLOCK(ip);
|
|
return (error);
|
|
}
|
|
|
|
/*ARGSUSED*/
|
|
static int
|
|
ufs_readlink(vp, uiop, cred)
|
|
struct vnode *vp;
|
|
struct uio *uiop;
|
|
struct ucred *cred;
|
|
{
|
|
register struct inode *ip;
|
|
register int error;
|
|
|
|
VFS_RECORD(vp->v_vfsp, VS_READLINK, VS_CALL);
|
|
|
|
if (vp->v_type != VLNK)
|
|
return (EINVAL);
|
|
ip = VTOI(vp);
|
|
if (ip->i_flag & IFASTSYMLNK) {
|
|
ILOCK(ip);
|
|
if (!ULOCKFS_IS_NOIACC(ITOU(ip)))
|
|
ip->i_flag |= IACC;
|
|
error = uiomove((caddr_t)&ip->i_db[1],
|
|
(int) MIN(ip->i_size, uiop->uio_resid),
|
|
UIO_READ, uiop);
|
|
IUNLOCK(ip);
|
|
} else {
|
|
int size; /* number of bytes read */
|
|
caddr_t basep; /* pointer to input data */
|
|
ino_t ino;
|
|
long igen;
|
|
|
|
ino = ip->i_number;
|
|
igen = ip->i_gen;
|
|
size = uiop->uio_resid;
|
|
basep = uiop->uio_iov->iov_base;
|
|
|
|
error = rwip(ip, uiop, UIO_READ, 0);
|
|
if (error == 0 && ip->i_number == ino &&
|
|
ip->i_gen == igen);
|
|
else goto out;
|
|
|
|
size -= uiop->uio_resid;
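/*
 * If the whole link fit within FSL_SIZE bytes and was read in a
 * single pass, cache a copy in the inode's direct block area and
 * mark it IFASTSYMLNK so later readlinks are answered from the
 * inode itself; the data page can then be freed.
 */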
|
|
if (ip->i_size <= FSL_SIZE && ip->i_size == size) {
|
|
if (uiop->uio_segflg == UIO_USERSPACE ||
|
|
uiop->uio_segflg == UIO_USERISPACE)
|
|
error = copyin(basep,
|
|
(caddr_t) &ip->i_db[1],
|
|
(u_int) ip->i_size);
|
|
else
|
|
error = kcopy(basep,
|
|
(caddr_t) &ip->i_db[1],
|
|
(u_int) ip->i_size);
|
|
if (error == 0) {
|
|
ip->i_flag |= IFASTSYMLNK;
|
|
/* free page */
|
|
(void) VOP_PUTPAGE(ITOV(ip),
|
|
(caddr_t) 0, PAGESIZE,
|
|
(B_DONTNEED | B_FREE | B_FORCE |
|
|
B_ASYNC), cred);
|
|
} else {
|
|
int i;
|
|
/* error, clear garbage left behind */
|
|
for (i = 1; i < NDADDR && ip->i_db[i]; i++)
|
|
ip->i_db[i] = 0;
|
|
for (i = 0; i < NIADDR && ip->i_ib[i]; i++)
|
|
ip->i_ib[i] = 0;
|
|
}
|
|
}
|
|
}
|
|
out:
|
|
ITIMES(ip);
|
|
return (error);
|
|
|
|
}
|
|
|
|
/*ARGSUSED*/
|
|
static int
|
|
ufs_fsync(vp, cred)
|
|
struct vnode *vp;
|
|
struct ucred *cred;
|
|
{
|
|
register struct inode *ip;
|
|
int err;
|
|
|
|
VFS_RECORD(vp->v_vfsp, VS_FSYNC, VS_CALL);
|
|
|
|
ip = VTOI(vp);
|
|
ILOCK(ip);
|
|
err = syncip(ip, 0, 1); /* do synchronous writes */
|
|
if (!err)
|
|
err = sync_indir(ip); /* write back any other inode data */
|
|
IUNLOCK(ip);
|
|
return (err);
|
|
}
|
|
|
|
/*ARGSUSED*/
|
|
static int
|
|
ufs_inactive(vp, cred)
|
|
struct vnode *vp;
|
|
struct ucred *cred;
|
|
{
|
|
|
|
VFS_RECORD(vp->v_vfsp, VS_INACTIVE, VS_CALL);
|
|
|
|
iinactive(VTOI(vp));
|
|
return (0);
|
|
}
|
|
|
|
|
|
/*
|
|
* Unix file system operations having to do with directory manipulation.
|
|
*/
|
|
/*ARGSUSED*/
|
|
static int
|
|
ufs_lookup(dvp, nm, vpp, cred, pnp, flags)
|
|
struct vnode *dvp;
|
|
char *nm;
|
|
struct vnode **vpp;
|
|
struct ucred *cred;
|
|
struct pathname *pnp;
|
|
int flags;
|
|
{
|
|
register struct inode *ip;
|
|
struct inode *xip;
|
|
register int error;
|
|
|
|
VFS_RECORD(dvp->v_vfsp, VS_LOOKUP, VS_CALL);
|
|
|
|
ip = VTOI(dvp);
|
|
error = dirlook(ip, nm, &xip);
|
|
ITIMES(ip);
|
|
if (error == 0) {
|
|
ip = xip;
|
|
*vpp = ITOV(ip);
|
|
if ((ip->i_mode & ISVTX) && !(ip->i_mode & (IEXEC | IFDIR)) &&
|
|
stickyhack) {
|
|
(*vpp)->v_flag |= VISSWAP;
|
|
} else {
|
|
(*vpp)->v_flag &= ~VISSWAP;
|
|
}
|
|
ITIMES(ip);
|
|
IUNLOCK(ip);
|
|
/*
|
|
* If vnode is a device return special vnode instead
|
|
*/
|
|
if (ISVDEV((*vpp)->v_type)) {
|
|
struct vnode *newvp;
|
|
|
|
newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type);
|
|
VN_RELE(*vpp);
|
|
*vpp = newvp;
|
|
}
|
|
}
|
|
return (error);
|
|
}
|
|
|
|
static int
|
|
ufs_create(dvp, nm, vap, exclusive, mode, vpp, cred)
|
|
struct vnode *dvp;
|
|
char *nm;
|
|
struct vattr *vap;
|
|
enum vcexcl exclusive;
|
|
int mode;
|
|
struct vnode **vpp;
|
|
struct ucred *cred;
|
|
{
|
|
register int error;
|
|
register struct inode *ip;
|
|
struct inode *xip;
|
|
|
|
VFS_RECORD(dvp->v_vfsp, VS_CREATE, VS_CALL);
|
|
|
|
switch ((int) vap->va_type) {
|
|
/* Must be super-user to create a non-FIFO special device */
|
|
case (int) VBLK:
|
|
case (int) VCHR:
|
|
if (cred->cr_uid != 0)
|
|
return (EPERM);
|
|
else
|
|
break;
|
|
|
|
/* Can't create directories - use ufs_mkdir instead. */
|
|
case (int) VDIR:
|
|
return (EISDIR);
|
|
}
|
|
|
|
xip = (struct inode *)0;
|
|
ip = VTOI(dvp);
|
|
|
|
/* Must be super-user to set sticky bit */
|
|
if (cred->cr_uid != 0)
|
|
vap->va_mode &= ~VSVTX;
|
|
|
|
error = direnter(ip, nm, DE_CREATE, (struct inode *)0,
|
|
(struct inode *)0, vap, &xip);
|
|
ITIMES(ip);
|
|
ip = xip;
|
|
/*
|
|
* If file exists and this is a nonexclusive create,
|
|
* check for not directory and access permissions.
|
|
* If create/read-only an existing directory, allow it.
|
|
*/
|
|
if (error == EEXIST) {
|
|
if (exclusive == NONEXCL) {
|
|
if (((ip->i_mode & IFMT) == IFDIR) && (mode & IWRITE)) {
|
|
error = EISDIR;
|
|
} else if (mode) {
|
|
error = iaccess(ip, mode);
|
|
} else {
|
|
error = 0;
|
|
}
|
|
}
|
|
if (error) {
|
|
iput(ip);
|
|
} else if (((ip->i_mode&IFMT) == IFREG) && (vap->va_size == 0)){
|
|
/*
|
|
* Truncate regular files, if required
|
|
*/
|
|
(void) itrunc(ip, (u_long)0);
|
|
}
|
|
}
|
|
if (error) {
|
|
return (error);
|
|
}
|
|
*vpp = ITOV(ip);
|
|
ITIMES(ip);
|
|
IUNLOCK(ip);
|
|
/*
|
|
* If vnode is a device return special vnode instead
|
|
*/
|
|
if (ISVDEV((*vpp)->v_type)) {
|
|
struct vnode *newvp;
|
|
|
|
newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type);
|
|
VN_RELE(*vpp);
|
|
*vpp = newvp;
|
|
}
|
|
|
|
if (vap != (struct vattr *)0) {
|
|
(void) VOP_GETATTR(*vpp, vap, cred);
|
|
}
|
|
return (error);
|
|
}
|
|
|
|
/*ARGSUSED*/
|
|
static int
|
|
ufs_remove(vp, nm, cred)
|
|
struct vnode *vp;
|
|
char *nm;
|
|
struct ucred *cred;
|
|
{
|
|
register int error;
|
|
register struct inode *ip;
|
|
|
|
VFS_RECORD(vp->v_vfsp, VS_REMOVE, VS_CALL);
|
|
|
|
ip = VTOI(vp);
|
|
error = dirremove(ip, nm, (struct inode *)0, 0);
|
|
ITIMES(ip);
|
|
return (error);
|
|
}
|
|
|
|
/*
|
|
* Link a file or a directory.
|
|
* If source is a directory, must be superuser.
|
|
*/
|
|
/*ARGSUSED*/
|
|
static int
|
|
ufs_link(vp, tdvp, tnm, cred)
|
|
struct vnode *vp;
|
|
register struct vnode *tdvp;
|
|
char *tnm;
|
|
struct ucred *cred;
|
|
{
|
|
register struct inode *sip;
|
|
register int error;
|
|
struct vnode *realvp;
|
|
|
|
if (VOP_REALVP(vp, &realvp) == 0) {
|
|
vp = realvp;
|
|
}
|
|
|
|
VFS_RECORD(vp->v_vfsp, VS_LINK, VS_CALL);
|
|
|
|
sip = VTOI(vp);
|
|
if (((sip->i_mode & IFMT) == IFDIR) && !suser()) {
|
|
return (EPERM);
|
|
}
|
|
error = direnter(VTOI(tdvp), tnm, DE_LINK,
|
|
(struct inode *)0, sip, (struct vattr *)0, (struct inode **)0);
|
|
ITIMES(sip);
|
|
ITIMES(VTOI(tdvp));
|
|
return (error);
|
|
}
|
|
|
|
/*
|
|
* Rename a file or directory.
|
|
* We are given the vnode and entry string of the source and the
|
|
* vnode and entry string of the place we want to move the source to
|
|
* (the target). The essential operation is:
|
|
* unlink(target);
|
|
* link(source, target);
|
|
* unlink(source);
|
|
* but "atomically". Can't do full commit without saving state in the inode
|
|
* on disk, which isn't feasible at this time. Best we can do is always
|
|
* guarantee that the TARGET exists.
|
|
*/
|
|
/*ARGSUSED*/
|
|
static int
|
|
ufs_rename(sdvp, snm, tdvp, tnm, cred)
|
|
struct vnode *sdvp; /* old (source) parent vnode */
|
|
char *snm; /* old (source) entry name */
|
|
struct vnode *tdvp; /* new (target) parent vnode */
|
|
char *tnm; /* new (target) entry name */
|
|
struct ucred *cred;
|
|
{
|
|
struct inode *sip; /* source inode */
|
|
register struct inode *sdp; /* old (source) parent inode */
|
|
register struct inode *tdp; /* new (target) parent inode */
|
|
register int error;
|
|
struct vnode *realvp;
|
|
|
|
VFS_RECORD(sdvp->v_vfsp, VS_RENAME, VS_CALL);
|
|
|
|
if (VOP_REALVP(tdvp, &realvp) == 0) {
|
|
tdvp = realvp;
|
|
}
|
|
|
|
sdp = VTOI(sdvp);
|
|
tdp = VTOI(tdvp);
|
|
/*
|
|
* Make sure we can delete the source entry.
|
|
*/
|
|
error = iaccess(sdp, IWRITE);
|
|
if (error) {
|
|
return (error);
|
|
}
|
|
/*
|
|
* Look up inode of file we're supposed to rename.
|
|
*/
|
|
error = dirlook(sdp, snm, &sip);
|
|
if (error) {
|
|
return (error);
|
|
}
|
|
|
|
IUNLOCK(sip); /* unlock inode (it's held) */
|
|
/*
|
|
* Check for renaming '.' or '..' or alias of '.'
|
|
*/
|
|
if ((strcmp(snm, ".") == 0) || (strcmp(snm, "..") == 0) ||
|
|
(sdp == sip)) {
|
|
error = EINVAL;
|
|
goto out;
|
|
}
|
|
/*
|
|
* If the source parent directory is "sticky", then the user must
* either own the file, own the directory, or be the super-user.
|
|
*/
|
|
if ((sdp->i_mode & ISVTX) && cred->cr_uid != 0 &&
|
|
cred->cr_uid != sdp->i_uid && sip->i_uid != cred->cr_uid) {
|
|
error = EPERM;
|
|
goto out;
|
|
}
|
|
/*
|
|
* Link source to the target.
|
|
*/
|
|
error = direnter(tdp, tnm, DE_RENAME,
|
|
sdp, sip, (struct vattr *)0, (struct inode **)0);
|
|
if (error) {
|
|
/*
|
|
* ESAME isn't really an error; it indicates that the
|
|
* operation should not be done because the source and target
|
|
* are the same file, but that no error should be reported.
|
|
*/
|
|
if (error == ESAME)
|
|
error = 0;
|
|
goto out;
|
|
}
|
|
|
|
/*
|
|
* Unlink the source.
|
|
* Remove the source entry. Dirremove checks that the entry
|
|
* still reflects sip, and returns an error if it doesn't.
|
|
* If the entry has changed just forget about it.
|
|
* Release the source inode.
|
|
*/
|
|
error = dirremove(sdp, snm, sip, 0);
|
|
if (error == ENOENT) {
|
|
error = 0;
|
|
} else if (error) {
|
|
goto out;
|
|
}
|
|
|
|
out:
|
|
ITIMES(sdp);
|
|
ITIMES(tdp);
|
|
irele(sip);
|
|
return (error);
|
|
}
|
|
|
|
/*ARGSUSED*/
|
|
static int
|
|
ufs_mkdir(dvp, nm, vap, vpp, cred)
|
|
struct vnode *dvp;
|
|
char *nm;
|
|
register struct vattr *vap;
|
|
struct vnode **vpp;
|
|
struct ucred *cred;
|
|
{
|
|
register struct inode *ip;
|
|
struct inode *xip;
|
|
register int error;
|
|
|
|
VFS_RECORD(dvp->v_vfsp, VS_MKDIR, VS_CALL);
|
|
|
|
ip = VTOI(dvp);
|
|
/*
|
|
* New directory inherits the set-gid bit from the parent.
|
|
*/
|
|
vap->va_mode &= ~VSGID;
|
|
if (ip->i_mode & ISGID)
|
|
vap->va_mode |= VSGID;
|
|
|
|
error = direnter(ip, nm, DE_CREATE,
|
|
(struct inode *)0, (struct inode *)0, vap, &xip);
|
|
ITIMES(ip);
|
|
if (error == 0) {
|
|
ip = xip;
|
|
*vpp = ITOV(ip);
|
|
ITIMES(ip);
|
|
IUNLOCK(ip);
|
|
} else if (error == EEXIST) {
|
|
iput(xip);
|
|
}
|
|
return (error);
|
|
}
|
|
|
|
/*ARGSUSED*/
|
|
static int
|
|
ufs_rmdir(vp, nm, cred)
|
|
struct vnode *vp;
|
|
char *nm;
|
|
struct ucred *cred;
|
|
{
|
|
register struct inode *ip;
|
|
register int error;
|
|
|
|
VFS_RECORD(vp->v_vfsp, VS_RMDIR, VS_CALL);
|
|
|
|
ip = VTOI(vp);
|
|
error = dirremove(ip, nm, (struct inode *)0, 1);
|
|
ITIMES(ip);
|
|
return (error);
|
|
}
|
|
|
|
/*ARGSUSED*/
|
|
static int
|
|
ufs_readdir(vp, uiop, cred)
|
|
struct vnode *vp;
|
|
struct uio *uiop;
|
|
struct ucred *cred;
|
|
{
|
|
register struct iovec *iovp;
|
|
register struct inode *ip;
|
|
register struct direct *idp;
|
|
register struct dirent *odp;
|
|
register u_int offset;
|
|
register int incount = 0;
|
|
register int outcount = 0;
|
|
register u_int bytes_wanted, total_bytes_wanted;
|
|
caddr_t outbuf;
|
|
u_int bufsize;
|
|
int error = 0;
|
|
struct fbuf *fbp;
|
|
int fastalloc;
|
|
static caddr_t dirbufp;
|
|
|
|
VFS_RECORD(vp->v_vfsp, VS_READDIR, VS_CALL);
|
|
|
|
ip = VTOI(vp);
|
|
iovp = uiop->uio_iov;
|
|
total_bytes_wanted = iovp->iov_len;
|
|
|
|
/* Force offset to be valid (to guard against bogus lseek() values) */
|
|
offset = uiop->uio_offset & ~(DIRBLKSIZ - 1);
|
|
|
|
/* Quit if at end of file */
|
|
if (offset >= ip->i_size)
|
|
return (0);
|
|
|
|
/*
|
|
* Get space to change directory entries into fs independent format.
|
|
* Do fast alloc for the most commonly used request size (filesystem
|
|
* block size).
|
|
*/
|
|
fastalloc = (total_bytes_wanted == MAXBSIZE);
|
|
bufsize = total_bytes_wanted + sizeof (struct dirent);
|
|
if (fastalloc)
|
|
outbuf = new_kmem_fast_alloc(
|
|
&dirbufp, (int)bufsize, 1, KMEM_SLEEP);
|
|
else
|
|
outbuf = new_kmem_alloc(bufsize, KMEM_SLEEP);
|
|
odp = (struct dirent *)outbuf;
|
|
|
|
ILOCK(ip);
|
|
nextblk:
|
|
bytes_wanted = total_bytes_wanted;
|
|
|
|
/* Truncate request to file size */
|
|
if (offset + bytes_wanted > ip->i_size)
|
|
bytes_wanted = ip->i_size - offset;
|
|
|
|
/* Comply with MAXBSIZE boundary restrictions of fbread() */
|
|
if ((offset & MAXBOFFSET) + bytes_wanted > MAXBSIZE)
|
|
bytes_wanted = MAXBSIZE - (offset & MAXBOFFSET);
|
|
|
|
/* Read in the next chunk */
|
|
if (error = fbread(vp, offset, bytes_wanted, S_OTHER, &fbp))
|
|
goto out;
|
|
|
|
incount = 0;
|
|
idp = (struct direct *)fbp->fb_addr;
|
|
|
|
/* Transform to file-system independent format */
|
|
while (incount < bytes_wanted) {
|
|
extern char *strcpy();
|
|
|
|
/* Skip to requested offset and skip empty entries */
|
|
if (idp->d_ino != 0 && offset >= uiop->uio_offset) {
|
|
odp->d_fileno = idp->d_ino;
|
|
odp->d_namlen = idp->d_namlen;
|
|
(void) strcpy(odp->d_name, idp->d_name);
|
|
odp->d_reclen = DIRSIZ(odp);
|
|
odp->d_off = offset + idp->d_reclen;
|
|
outcount += odp->d_reclen;
|
|
/* Got as many bytes as requested, quit */
|
|
if (outcount > total_bytes_wanted) {
|
|
outcount -= odp->d_reclen;
|
|
/* Buffer too small to take any entry */
|
|
if (outcount == 0) {
|
|
fbrelse(fbp, S_OTHER);
|
|
error = EINVAL;
|
|
goto out;
|
|
}
|
|
break;
|
|
}
|
|
odp = (struct dirent *)((int)odp + odp->d_reclen);
|
|
}
|
|
if (idp->d_reclen) {
|
|
incount += idp->d_reclen;
|
|
offset += idp->d_reclen;
|
|
idp = (struct direct *)((int)idp + idp->d_reclen);
|
|
} else {
|
|
offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1);
|
|
break;
|
|
}
|
|
}
|
|
/* Release the chunk */
|
|
fbrelse(fbp, S_OTHER);
|
|
|
|
/* Read whole block, but got no entries, read another if not eof */
|
|
if (offset < ip->i_size && !outcount)
|
|
goto nextblk;
|
|
|
|
/* Copy out the entry data */
|
|
if (error = uiomove(outbuf, outcount, UIO_READ, uiop))
|
|
goto out;
|
|
|
|
uiop->uio_offset = offset;
|
|
if (!ULOCKFS_IS_NOIACC(ITOU(ip)))
|
|
ip->i_flag |= IACC;
|
|
out:
|
|
ITIMES(ip);
|
|
IUNLOCK(ip);
|
|
if (fastalloc)
|
|
kmem_fast_free(&dirbufp, outbuf);
|
|
else
|
|
kmem_free(outbuf, bufsize);
|
|
return (error);
|
|
}
|
|
|
|
/*
|
|
* Old form of the ufs_readdir op. Returns directory entries directly
|
|
* from the disk in the 4.2 structure instead of the new sys/dirent.h
|
|
* structure. This routine is called directly by the old getdirentries
|
|
* system call when it discovers it is dealing with a ufs filesystem.
|
|
* The reason for this mess is to avoid large performance penalties
|
|
* that occur during conversion from the old format to the new and
|
|
* back again.
|
|
*/
|
|
|
|
/*ARGSUSED*/
|
|
int
|
|
old_ufs_readdir(vp, uiop, cred)
|
|
struct vnode *vp;
|
|
register struct uio *uiop;
|
|
struct ucred *cred;
|
|
{
|
|
register struct iovec *iovp;
|
|
register unsigned count;
|
|
register struct inode *ip;
|
|
int error;
|
|
struct mount *mp;
|
|
|
|
if (error = ufs_lockfs_begin(vp, VA_READDIR, &mp))
|
|
return (error);
|
|
|
|
ip = VTOI(vp);
|
|
iovp = uiop->uio_iov;
|
|
count = iovp->iov_len;
|
|
if ((uiop->uio_iovcnt != 1) || (count < DIRBLKSIZ) ||
|
|
(uiop->uio_offset & (DIRBLKSIZ -1))) {
|
|
error = EINVAL;
|
|
goto out;
|
|
}
|
|
count &= ~(DIRBLKSIZ - 1);
|
|
uiop->uio_resid -= iovp->iov_len - count;
|
|
iovp->iov_len = count;
|
|
error = rwip(ip, uiop, UIO_READ, 0);
|
|
ITIMES(ip);
|
|
out:
|
|
ufs_lockfs_end(VA_READDIR, mp);
|
|
return (error);
|
|
}
|
|
|
|
/*ARGSUSED*/
|
|
static int
|
|
ufs_symlink(dvp, lnm, vap, tnm, cred)
|
|
register struct vnode *dvp;
|
|
char *lnm;
|
|
struct vattr *vap;
|
|
char *tnm;
|
|
struct ucred *cred;
|
|
{
|
|
struct inode *ip;
|
|
register int error;
|
|
register struct fs *fs;
|
|
|
|
VFS_RECORD(dvp->v_vfsp, VS_SYMLINK, VS_MISS);
|
|
|
|
/* check for space availability - need at least 1 fragment */
|
|
fs = VTOI(dvp)->i_fs;
|
|
if (cred->cr_uid == 0) {
|
|
if ((fs->fs_cstotal.cs_nbfree == 0) &&
|
|
(fs->fs_cstotal.cs_nffree == 0))
|
|
return (ENOSPC);
|
|
} else
|
|
if (freespace(fs, fs->fs_minfree) <= 0)
|
|
return (ENOSPC);
|
|
|
|
ip = (struct inode *)0;
|
|
vap->va_type = VLNK;
|
|
vap->va_rdev = 0;
|
|
error = direnter(VTOI(dvp), lnm, DE_CREATE,
|
|
(struct inode *)0, (struct inode *)0, vap, &ip);
|
|
if (error == 0) {
|
|
error = rdwri(UIO_WRITE, ip, tnm, strlen(tnm),
|
|
(off_t)0, UIO_SYSSPACE, (int *)0);
|
|
if (error) {
|
|
idrop(ip);
|
|
error = dirremove(VTOI(dvp), lnm,
|
|
(struct inode *) 0, 0);
|
|
goto out;
|
|
}
|
|
}
|
|
if (error == 0) {
|
|
/* create a fast symbolic link */
|
|
if (ip->i_size <= FSL_SIZE) {
|
|
if (kcopy((caddr_t) tnm, (caddr_t) &ip->i_db[1],
|
|
(u_int) ip->i_size) == 0)
|
|
ip->i_flag |= IFASTSYMLNK;
|
|
else {
|
|
int i;
|
|
/* error, clear garbage left behind */
|
|
for (i = 1; i < NDADDR && ip->i_db[i]; i++)
|
|
ip->i_db[i] = 0;
|
|
for (i = 0; i < NIADDR && ip->i_ib[i]; i++)
|
|
ip->i_ib[i] = 0;
|
|
}
|
|
/*
|
|
* nice to free the page here, but don't bother because
|
|
* symbolic links are seldom created
|
|
*/
|
|
}
|
|
}
|
|
|
|
if (error == 0 || error == EEXIST)
|
|
iput(ip);
|
|
out:
|
|
ITIMES(VTOI(dvp));
|
|
return (error);
|
|
}
|
|
|
|
/*
|
|
* Ufs specific routine used to do ufs io.
|
|
*/
|
|
int
|
|
rdwri(rw, ip, base, len, offset, seg, aresid)
|
|
enum uio_rw rw;
|
|
struct inode *ip;
|
|
caddr_t base;
|
|
int len;
|
|
off_t offset;
|
|
int seg;
|
|
int *aresid;
|
|
{
|
|
struct uio auio;
|
|
struct iovec aiov;
|
|
register int error;
|
|
|
|
aiov.iov_base = base;
|
|
aiov.iov_len = len;
|
|
auio.uio_iov = &aiov;
|
|
auio.uio_iovcnt = 1;
|
|
auio.uio_offset = offset;
|
|
auio.uio_segflg = seg;
|
|
auio.uio_resid = len;
|
|
error = rwip(ip, &auio, rw, 0);
|
|
if (aresid) {
|
|
*aresid = auio.uio_resid;
|
|
} else if (auio.uio_resid) {
|
|
error = EIO;
|
|
}
|
|
return (error);
|
|
}
|
|
|
|
/*
|
|
* Record-locking requests are passed to the local Lock-Manager daemon.
|
|
*/
|
|
|
|
extern void kill_proc_locks();
|
|
|
|
/*ARGSUSED*/
|
|
static int
|
|
ufs_lockctl(vp, ld, cmd, cred, clid)
|
|
struct vnode *vp;
|
|
struct eflock *ld;
|
|
int cmd;
|
|
struct ucred *cred;
|
|
int clid;
|
|
{
|
|
VFS_RECORD(vp->v_vfsp, VS_LOCKCTL, VS_CALL);
|
|
|
|
if (cmd != F_RGETLK && cmd != F_RSETLK && cmd != F_RSETLKW) {
|
|
if (vp->v_type == VBLK || vp->v_type == VCHR ||
|
|
vp->v_type == VFIFO)
|
|
return (EINVAL);
|
|
} else {
|
|
if (vp->v_type == VBLK || vp->v_type == VFIFO)
|
|
return (EINVAL);
|
|
}
|
|
|
|
switch (cmd) {
|
|
case F_GETLK :
|
|
test_lock(vp, ld, cmd, clid, FILE_LOCK);
|
|
return (0);
|
|
case F_RGETLK :
|
|
test_lock(vp, ld, cmd, clid, LOCKMGR);
|
|
return (0);
|
|
case F_SETLK :
|
|
case F_SETLKW :
|
|
if (ld->l_type == F_UNLCK)
|
|
return (unlock(vp, ld, cmd, clid, FILE_LOCK));
|
|
else
|
|
return (lock(vp, ld, cmd, clid, FILE_LOCK));
|
|
case F_RSETLK :
|
|
case F_RSETLKW :
|
|
if (ld->l_type == F_UNLCK)
|
|
return (unlock(vp, ld, cmd, clid, LOCKMGR));
|
|
else if (ld->l_type == F_UNLKSYS) {
|
|
kill_proc_locks(clid, ld->l_rsys);
|
|
return (0);
|
|
} else
|
|
return (lock(vp, ld, cmd, clid, LOCKMGR));
|
|
default:
|
|
return (EINVAL);
|
|
}
|
|
}
|
|
|
|
static int
|
|
ufs_fid(vp, fidpp)
|
|
struct vnode *vp;
|
|
struct fid **fidpp;
|
|
{
|
|
register struct ufid *ufid;
|
|
|
|
VFS_RECORD(vp->v_vfsp, VS_FID, VS_CALL);
|
|
|
|
ufid = (struct ufid *)new_kmem_zalloc(sizeof (struct ufid), KMEM_SLEEP);
|
|
ufid->ufid_len = sizeof (struct ufid) - sizeof (u_short);
|
|
ufid->ufid_ino = VTOI(vp)->i_number;
|
|
ufid->ufid_gen = VTOI(vp)->i_gen;
|
|
*fidpp = (struct fid *)ufid;
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* For read purposes, this has to be bsize * maxcontig.
|
|
* For write purposes, this can be larger.
|
|
*/
|
|
#define RD_CLUSTSZ(fs) (fs->fs_bsize * fs->fs_maxcontig)
|
|
#define WR_CLUSTSZ(fs) (fs->fs_bsize * fs->fs_maxcontig)
|
|
|
|
int ufs_nocluster = 0;
|
|
int ufs_ra = 1;
|
|
int ufs_lostpage; /* number of times we lost original page */
|
|
|
|
/*
|
|
* Called from pvn_getpages or ufs_getpage to get a particular page.
|
|
* When we are called the inode is already locked. If rw == S_WRITE
|
|
* and the block is not currently allocated we need to allocate the
|
|
* needed block(s).
|
|
*
|
|
* Clustering notes: when we detect sequential access, we switch to cluster
|
|
* sized chunks of I/O. The steady state should be that we do clusters
|
|
* in the readahead case; we'll only do one synchronous read at the beginning.
|
|
* fs_maxcontig controls the cluster size and is bounded by maxphys.
|
|
*
|
|
* We handle bsize >= PAGESIZE here; others go to oldufs_getapage().
|
|
*
|
|
* TODO
|
|
* Think about mmap() writes and lastw/nextr interaction
|
|
*/
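/*
 * For example, with an 8K fs_bsize and an fs_maxcontig of 7,
 * RD_CLUSTSZ() works out to 56K, so steady-state sequential reads
 * issue one 56K read-ahead per cluster (still subject to the maxphys
 * bound noted above).
 */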
|
|
/*ARGSUSED*/
|
|
ufs_getapage(vp, off, protp, pl, plsz, seg, addr, rw, cred)
|
|
struct vnode *vp;
|
|
register u_int off;
|
|
u_int *protp;
|
|
struct page *pl[]; /* NULL if async IO is requested */
|
|
u_int plsz;
|
|
struct seg *seg;
|
|
addr_t addr;
|
|
enum seg_rw rw;
|
|
struct ucred *cred;
|
|
{
|
|
register struct inode *ip;
|
|
register struct fs *fs;
|
|
u_int xlen;
|
|
struct buf *bp, *bp2;
|
|
struct vnode *devvp;
|
|
struct page *pp, *pp2, **ppp, *pagefound;
|
|
daddr_t lbn, bn;
|
|
u_int io_off, io_len;
|
|
int len, boff;
|
|
int err, do2ndread;
|
|
dev_t dev;
|
|
|
|
VFS_RECORD(vp->v_vfsp, VS_GETPAGE, VS_CALL);
|
|
|
|
ip = VTOI(vp);
|
|
fs = ip->i_fs;
|
|
if (ufs_nocluster || fs->fs_bsize < PAGESIZE) {
|
|
return (oldufs_getapage(vp, off, protp, pl,
|
|
plsz, seg, addr, rw, cred));
|
|
}
|
|
devvp = ip->i_devvp;
|
|
dev = devvp->v_rdev;
|
|
|
|
reread:
|
|
err = 0;
|
|
bp = NULL;
|
|
bp2 = NULL;
|
|
pagefound = NULL;
|
|
do2ndread = ufs_ra && ip->i_nextr == off;
|
|
if (pl != NULL)
|
|
pl[0] = NULL;
|
|
/*
|
|
* It may seem that only writes need to do the bmap().  Not so -
* the protp needs to be made read-only if the page is backed by a hole.
|
|
* XXX - it might be possible to fix this.
|
|
*/
|
|
lbn = lblkno(fs, off);
|
|
boff = blkoff(fs, off);
|
|
if (rw == S_WRITE) {
|
|
err = bmap_write(ip, lbn, boff,
|
|
&bn, &len, (int)blksize(fs, ip, lbn), 0);
|
|
} else {
|
|
err = bmap_read(ip, lbn, boff, &bn, &len);
|
|
if (bn == UFS_HOLE) {
|
|
if (protp != NULL)
|
|
*protp &= ~PROT_WRITE;
|
|
do2ndread = 0;
|
|
}
|
|
}
|
|
if (err)
|
|
goto out;
|
|
if (!do2ndread)
|
|
len = MIN(fs->fs_bsize, len);
|
|
|
|
again:
|
|
if ((pagefound = page_find(vp, off)) == NULL) {
|
|
if (bn == UFS_HOLE) {
|
|
/*
|
|
* Block for this page is not allocated
|
|
* and the page was not found.
|
|
*/
|
|
if (pl != NULL) {
|
|
/*
|
|
* If we need a page, allocate and
|
|
* return a zero page. This assumes
|
|
* that for "async" faults it is not
|
|
* worth it to create the page now.
|
|
*/
|
|
pp = rm_allocpage(seg, addr, PAGESIZE, 1);
|
|
trace6(TR_SEG_ALLOCPAGE, seg,
|
|
(u_int)addr & PAGEMASK, TRC_SEG_UNK,
|
|
vp, off, pp);
|
|
if (page_enter(pp, vp, off))
|
|
panic("ufs_getapage page_enter");
|
|
pagezero(pp, 0, PAGESIZE);
|
|
page_unlock(pp);
|
|
pl[0] = pp;
|
|
pl[1] = NULL;
|
|
u.u_ru.ru_minflt++;
|
|
}
|
|
} else {
|
|
/*
|
|
* Need to really do disk IO to get the page(s).
|
|
*/
|
|
VFS_RECORD(vp->v_vfsp, VS_GETPAGE, VS_MISS);
|
|
|
|
pp = pvn_kluster(vp, off, seg, addr, &io_off, &io_len,
|
|
off, (u_int) len, 0);
|
|
|
|
/*
|
|
* Somebody has entered the page before us, so
|
|
* just use it.
|
|
*/
|
|
if (pp == NULL)
|
|
goto again;
|
|
|
|
if (pl != NULL) {
|
|
register int sz;
|
|
|
|
if (plsz >= io_len) {
|
|
/*
|
|
* Everything fits, set up to load
|
|
* up and hold all the pages.
|
|
*/
|
|
pp2 = pp;
|
|
sz = io_len;
|
|
} else {
|
|
/*
|
|
* Set up to load plsz worth
|
|
* starting at the needed page.
|
|
*/
|
|
for (pp2 = pp; pp2->p_offset != off;
|
|
pp2 = pp2->p_next) {
|
|
ASSERT(pp2->p_next->p_offset !=
|
|
pp->p_offset);
|
|
}
|
|
sz = plsz;
|
|
}
|
|
|
|
ppp = pl;
|
|
do {
|
|
PAGE_HOLD(pp2);
|
|
*ppp++ = pp2;
|
|
pp2 = pp2->p_next;
|
|
sz -= PAGESIZE;
|
|
} while (sz > 0);
|
|
*ppp = NULL; /* terminate list */
|
|
}
|
|
|
|
bp = pageio_setup(pp, io_len, devvp, pl == NULL ?
|
|
(B_ASYNC | B_READ) : B_READ);
|
|
|
|
bp->b_dev = dev;
|
|
bp->b_blkno = fsbtodb(fs, bn) + btodb(boff);
|
|
bp->b_un.b_addr = 0;
|
|
|
|
/*
|
|
* Zero part of page which we are not
|
|
* going to be reading from disk now.
|
|
* pp->p_prev is usually the same page unless pp heads
* a list of pages, as with exec.
|
|
*
|
|
* The only way this can happen, I think, is
|
|
* at the end of file, so I turn off readahead.
|
|
*/
|
|
xlen = io_len & PAGEOFFSET;
|
|
if (xlen != 0) {
|
|
pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
|
|
do2ndread = 0;
|
|
}
|
|
|
|
(*bdevsw[major(dev)].d_strategy)(bp);
|
|
|
|
/*
|
|
* Set up where to do the next readahead.
|
|
*/
|
|
ip->i_nextrio = off + (io_len & PAGEMASK);
|
|
u.u_ru.ru_majflt++;
|
|
if (seg == segkmap)
|
|
u.u_ru.ru_inblock++; /* count as `read' */
|
|
cnt.v_pgin++;
|
|
cnt.v_pgpgin += btopr(io_len);
|
|
}
|
|
}
|
|
|
|
ip->i_nextr = (off + fs->fs_bsize) & ~(fs->fs_bsize - 1);
|
|
|
|
/*
|
|
* XXX - This can get out of sync if a page has been stolen away in
|
|
* the previous cluster. Because we don't resync, this can result in
|
|
* two sync reads above; one for the stolen page and another on the
|
|
* following cluster.
|
|
*/
|
|
if (do2ndread &&
|
|
ip->i_nextrio - off <= RD_CLUSTSZ(fs) &&
|
|
ip->i_nextrio < ip->i_size) {
|
|
addr_t addr2;
|
|
|
|
io_off = ip->i_nextrio;
|
|
addr2 = addr + (io_off - off);
|
|
/*
|
|
* Read-ahead case (bsize >= PAGESIZE)
|
|
* If addr is now in a different seg,
|
|
* don't bother trying with read-ahead.
|
|
*/
|
|
if (addr2 >= seg->s_base + seg->s_size) {
|
|
pp2 = NULL;
|
|
goto out;
|
|
}
|
|
lbn = lblkno(fs, io_off);
|
|
boff = blkoff(fs, io_off);
|
|
err = bmap_read(ip, lbn, boff, &bn, &len);
|
|
if (err || bn == UFS_HOLE)
|
|
goto out;
|
|
|
|
pp2 = pvn_kluster(vp, io_off, seg, addr2,
|
|
&io_off, &io_len, io_off, (u_int) len, 1);
|
|
if (pp2 == NULL)
|
|
goto out;
|
|
bp2 = pageio_setup(pp2, io_len, devvp,
|
|
(B_ASYNC | B_READ));
|
|
bp2->b_dev = dev;
|
|
ASSERT(ip->i_nextrio == pp2->p_offset);
|
|
bp2->b_blkno = fsbtodb(fs, bn) + btodb(boff);
|
|
bp2->b_un.b_addr = 0;
|
|
|
|
/*
|
|
* Zero part of page which we are not
|
|
* going to be reading from disk now
|
|
* if it hasn't already been done.
|
|
*/
|
|
if (xlen = (io_len & PAGEOFFSET))
|
|
pagezero(pp2->p_prev, xlen, PAGESIZE - xlen);
|
|
/*
|
|
* Two cases where io_len < blksz.
|
|
* (1) We ran out of memory.
|
|
* (2) The page is already in memory.
|
|
*/
|
|
ip->i_nextrio = (io_off + io_len) & PAGEMASK;
|
|
|
|
(*bdevsw[major(dev)].d_strategy)(bp2);
|
|
|
|
/*
|
|
* Should we bill read ahead to extra faults?
|
|
*/
|
|
u.u_ru.ru_majflt++;
|
|
if (seg == segkmap)
|
|
u.u_ru.ru_inblock++; /* count as `read' */
|
|
cnt.v_pgin++;
|
|
cnt.v_pgpgin += btopr(io_len);
|
|
}
|
|
|
|
out:
|
|
if (pl == NULL) {
|
|
return (err);
|
|
}
|
|
|
|
if (bp != NULL) {
|
|
if (err == 0)
|
|
err = biowait(bp);
|
|
else
|
|
(void) biowait(bp);
|
|
pageio_done(bp);
|
|
}
|
|
|
|
if (pagefound != NULL) {
|
|
register int s;
|
|
|
|
/*
|
|
* We need to be careful here because if the page was
|
|
* previously on the free list, we might have already
|
|
* lost it at interrupt level.
|
|
*/
|
|
s = splvm();
|
|
if (pagefound->p_vnode == vp && pagefound->p_offset == off) {
|
|
/*
|
|
* If the page is still intransit or if
|
|
* it is on the free list call page_lookup
|
|
* to try and wait for / reclaim the page.
|
|
*/
|
|
if (pagefound->p_intrans || pagefound->p_free)
|
|
pagefound = page_lookup(vp, off);
|
|
}
|
|
if (pagefound == NULL || pagefound->p_offset != off ||
|
|
pagefound->p_vnode != vp || pagefound->p_gone) {
|
|
(void) splx(s);
|
|
ufs_lostpage++;
|
|
goto reread;
|
|
}
|
|
PAGE_HOLD(pagefound);
|
|
(void) splx(s);
|
|
pl[0] = pagefound;
|
|
pl[1] = NULL;
|
|
u.u_ru.ru_minflt++;
|
|
}
|
|
|
|
if (err) {
|
|
for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
|
|
PAGE_RELE(*ppp);
|
|
}
|
|
|
|
return (err);
|
|
}
|
|
|
|
/*
|
|
* Return all the pages from [off..off+len) in given file
|
|
*/
|
|
static int
|
|
ufs_getpage(vp, off, len, protp, pl, plsz, seg, addr, rw, cred)
|
|
struct vnode *vp;
|
|
u_int off, len;
|
|
u_int *protp;
|
|
struct page *pl[];
|
|
u_int plsz;
|
|
struct seg *seg;
|
|
addr_t addr;
|
|
enum seg_rw rw;
|
|
struct ucred *cred;
|
|
{
|
|
struct inode *ip = VTOI(vp);
|
|
int err;
|
|
extern freemem, lotsfree;
|
|
|
|
/*
|
|
* Normally fail if faulting beyond EOF, *except* if this
|
|
* is an internal access of ufs data. This condition is
|
|
* detected by testing the faulting segment against segkmap.
|
|
* Since accessing the file through segkmap is only done
|
|
* in places in the kernel which have knowledge of the
|
|
* current file length, these places deal with EOF themselves.
|
|
* For example, bmap may be faulting in pages beyond the
|
|
* current EOF when it is creating pages needed for extending
|
|
* the length of the file.
|
|
*/
|
|
if (off + len > ip->i_size + PAGEOFFSET && seg != segkmap)
|
|
return (EFAULT); /* beyond EOF */
|
|
|
|
if (protp != NULL)
|
|
*protp = PROT_ALL;
|
|
|
|
ILOCK(ip);
|
|
if (len <= PAGESIZE) {
|
|
err = ufs_getapage(vp, off, protp, pl, plsz, seg, addr,
|
|
rw, cred);
|
|
} else {
|
|
err = pvn_getpages(ufs_getapage, vp, off, len, protp, pl, plsz,
|
|
seg, addr, rw, cred);
|
|
}
|
|
/*
|
|
* If the inode is not already marked for IACC (in rwip() for read)
|
|
* and the inode is not marked for no access time update (in rwip()
|
|
* for write) then update the inode access time and mod time now.
|
|
*/
|
|
if ((ip->i_flag & (IACC | INOACC)) == 0) {
|
|
if (rw != S_OTHER) {
|
|
if (!ULOCKFS_IS_NOIACC(ITOU(ip)))
|
|
ip->i_flag |= IACC;
|
|
}
|
|
if (rw == S_WRITE) {
|
|
ip->i_flag |= IUPD;
|
|
}
|
|
ITIMES(ip);
|
|
}
|
|
IUNLOCK(ip);
|
|
|
|
return (err);
|
|
}
|
|
|
|
/*
|
|
* Called at interrupt level.
|
|
*/
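/*
* Note on the write throttle: when ufs_WRITES is non-zero, ufs_writelbn
* below adds each write's byte count to ip->i_writes before starting the
* I/O and points b_iodone here; this completion routine subtracts the
* count back out and, once the outstanding total drops to ufs_WRITES or
* below, wakes up anyone sleeping on &ip->i_writes (the throttling sleep
* itself presumably lives in the write path elsewhere in ufs).
*/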
|
|
static int
|
|
ufs_writedone(bp)
|
|
register struct buf *bp;
|
|
{
|
|
register struct inode *ip;
|
|
|
|
ASSERT(bp->b_pages);
|
|
ip = VTOI(bp->b_pages->p_vnode); /* gag me */
|
|
bp->b_flags &= ~B_CALL;
|
|
bp->b_iodone = NULL;
|
|
bp->b_flags |= B_DONE;
|
|
if (ip->i_writes > 0) {
|
|
ip->i_writes -= bp->b_bcount + bp->b_resid;
|
|
if (ip->i_writes <= ufs_WRITES)
|
|
wakeup((caddr_t)&ip->i_writes);
|
|
}
|
|
/*
|
|
* Stolen from biodone()
|
|
*/
|
|
if (bp->b_flags & B_ASYNC) {
|
|
if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
|
|
swdone(bp);
|
|
else
|
|
brelse(bp);
|
|
} else if (bp->b_flags & B_WANTED) {
|
|
bp->b_flags &= ~B_WANTED;
|
|
wakeup((caddr_t)bp);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
|
|
* XXX - Has to be exported for 4K FS support.
|
|
*/
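/*
* Roughly: ufs_writelbn() starts a write of `len' bytes from the page
* list `pp' to device block `bn' on ip's device.  `pgoff' is passed
* through as bp->b_un.b_addr, the byte offset within the page at which
* the transfer starts; the old small-bsize path below uses it to write
* the second half of a page as a separate I/O.  For B_ASYNC requests the
* pages are cleaned up from the iodone path, otherwise this routine
* waits for the I/O and calls pageio_done() itself.
*/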
|
|
/* static */
|
|
int
|
|
ufs_writelbn(ip, bn, pp, len, pgoff, flags)
|
|
register struct inode *ip;
|
|
daddr_t bn;
|
|
struct page *pp;
|
|
u_int len;
|
|
u_int pgoff;
|
|
int flags;
|
|
{
|
|
struct buf *bp;
|
|
int err;
|
|
|
|
bp = pageio_setup(pp, len, ip->i_devvp, B_WRITE | flags);
|
|
if (bp == NULL) {
|
|
pvn_fail(pp, B_WRITE | flags);
|
|
return (ENOMEM);
|
|
}
|
|
if (ufs_WRITES) {
|
|
int s;
|
|
|
|
/*
|
|
* protected because the completion interrupt changes this.
|
|
*/
|
|
s = splbio();
|
|
ip->i_writes += len;
|
|
(void) splx(s);
|
|
bp->b_flags |= B_CALL;
|
|
bp->b_iodone = ufs_writedone;
|
|
}
|
|
|
|
bp->b_dev = ip->i_dev;
|
|
bp->b_blkno = bn;
|
|
bp->b_un.b_addr = (addr_t)pgoff;
|
|
|
|
(*bdevsw[major(ip->i_dev)].d_strategy)(bp);
|
|
u.u_ru.ru_oublock++;
|
|
|
|
/*
|
|
* If async, assume that pvn_done will
|
|
* handle the pages when IO is done
|
|
*/
|
|
if (flags & B_ASYNC) {
|
|
return (0);
|
|
}
|
|
|
|
err = biowait(bp);
|
|
pageio_done(bp);
|
|
|
|
return (err);
|
|
}
|
|
|
|
/*
* Macro to be used to see if it is safe to ILOCK the inode.
* This is needed because the pageout daemon cannot afford to
* wait for an inode lock since the process that has the inode
* lock may need more memory from the pageout daemon to complete
* its work. This is used to prevent deadlocking situations.
*/
#define ICHECK(ip) ((NOMEMWAIT()) && ((ip)->i_flag & ILOCKED) && \
((ip)->i_owner != uniqpid()))

int ufs_delay = 1; /* patchable while running */
|
|
|
|
/*
|
|
* Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
|
|
* If len == 0, do from off to EOF.
|
|
*
|
|
* The normal cases should be len == 0 & off == 0 (entire vp list),
|
|
* len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
|
|
* (from pageout).
|
|
*
|
|
* Note that for ufs it is possible to have dirty pages beyond
|
|
* roundup(ip->i_size, PAGESIZE). This can happen if the file
|
|
* length is long enough to involve indirect blocks (which are
|
|
* always fs->fs_bsize'd) and PAGESIZE < bsize while the length
|
|
* is such that roundup(blkoff(fs, ip->i_size), PAGESIZE) < bsize.
|
|
*/
|
|
|
|
/*ARGSUSED*/
|
|
static int
|
|
ufs_putpage(vp, off, len, flags, cred)
|
|
register struct vnode *vp;
|
|
u_int off, len;
|
|
int flags;
|
|
struct ucred *cred;
|
|
{
|
|
register struct inode *ip;
|
|
register struct page *pp;
|
|
register struct fs *fs;
|
|
struct page *dirty, *io_list;
|
|
register u_int io_off, io_len;
|
|
daddr_t lbn, dbn;
|
|
daddr_t bn;
|
|
int bmaplen, boff;
|
|
int vpcount, err;
|
|
|
|
#ifdef VFSSTATS
|
|
VFS_RECORD(vp->v_vfsp, VS_PUTPAGE, VS_CALL);
|
|
#endif
|
|
|
|
ip = VTOI(vp);
|
|
fs = ip->i_fs;
|
|
if (ufs_nocluster || fs->fs_bsize < PAGESIZE) {
|
|
return (oldufs_putpage(vp, off, len, flags, cred));
|
|
}
|
|
if (vp->v_pages == NULL) {
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* If (clustering) AND
|
|
* (it's a normal write, i.e., normal flags) AND
|
|
* (we're doing a portion of the file) AND
|
|
* (we've delayed less than a clusters worth) AND
|
|
* (this is the 1st chunk OR this chunk is contig w/the last chunk) THEN
|
|
* delay this chunk; we'll push it later.
|
|
*/
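/*
* For illustration: with MAXBSIZE-sized pushes arriving in file order
* (the usual segmap_release case), the first delayed push records
* i_delayoff/i_delaylen, each contiguous successor just grows
* i_delaylen, and nothing is written until either the accumulated
* length would reach WR_CLUSTSZ(fs) or a non-contiguous push arrives,
* at which point the delayed range is pushed by the code further down.
*/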
|
|
if (ufs_delay && (flags & B_ASYNC) &&
|
|
(flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0 && len &&
|
|
(ip->i_delaylen + len) < WR_CLUSTSZ(fs) &&
|
|
(ip->i_delaylen == 0 || ip->i_delayoff + ip->i_delaylen == off)) {
|
|
if (ip->i_delaylen == 0)
|
|
ip->i_delayoff = off;
|
|
ip->i_delaylen += len;
|
|
return (0);
|
|
}
|
|
vpcount = vp->v_count;
|
|
VN_HOLD(vp);
|
|
|
|
again:
|
|
|
|
/*
|
|
* Cannot afford to sleep on inode now, give up
|
|
*/
|
|
if (ICHECK(ip)) {
|
|
err = ENOMEM;
|
|
goto errout;
|
|
}
|
|
|
|
/*
|
|
* Hold inode lock for duration of push
|
|
*/
|
|
ILOCK(ip);
|
|
if (len == 0) {
|
|
/*
|
|
* Search the entire vp list for pages >= off
|
|
*/
|
|
dirty = pvn_vplist_dirty(vp, off, flags);
|
|
ip->i_delaylen = ip->i_delayoff = 0;
|
|
} else {
|
|
u_int offlo, offhi, offclust;
|
|
u_int d_len, d_off;
|
|
|
|
/*
|
|
* if (delayed pages)
|
|
* if (current request not in/adjacent to delayed pages)
|
|
* push old pages
|
|
* else
|
|
* start at beginning of delayed pages
|
|
* do [offlo..off+len) clustered up to off + WR_CLUSTSZ
|
|
*
|
|
* We play fast and loose with EOF here; counting on the
|
|
* fact that range_dirty will just not find the pages.
|
|
*/
|
|
offlo = off;
|
|
offhi = off + len;
|
|
offclust = MAX(offhi, off + WR_CLUSTSZ(fs));
|
|
if (ip->i_delaylen) {
|
|
d_off = ip->i_delayoff;
|
|
d_len = ip->i_delaylen;
|
|
ip->i_delayoff = ip->i_delaylen = 0;
|
|
if (off < d_off || off > d_off + d_len) {
|
|
int e;
|
|
|
|
if (e = ufs_putpage(vp, d_off,
|
|
d_len, B_NODELAY|B_ASYNC, cred)) {
|
|
printf("PP: vp=%x off=%d len=%d e=%d\n",
|
|
vp, d_off, d_len, e);
|
|
}
|
|
} else {
|
|
offlo = d_off;
|
|
}
|
|
}
|
|
dirty = pvn_range_dirty(vp, offlo, offhi,
|
|
offlo, offclust, flags);
|
|
}
|
|
|
|
/*
|
|
* Now pp will have the list of kept dirty pages marked for
|
|
* write back. All the pages on the pp list need to still
|
|
* be dealt with here. Verify that we really can do the
* write back to the filesystem; if we cannot and there are
* dirty pages, return an error condition.
|
|
*/
|
|
err = fs->fs_ronly && dirty != NULL ? EROFS : 0;
|
|
|
|
if (dirty != NULL) {
|
|
/*
|
|
* If the modified time on the inode has not already been
|
|
* set elsewhere (i.e. for write/setattr) or this is
|
|
* a call from msync (B_FORCE) we set the time now.
|
|
* This gives us approximate modified times for mmap'ed files
|
|
* which are modified via stores in the user address space.
|
|
*/
|
|
if ((ip->i_flag & IMODTIME) == 0 || (flags & B_FORCE) != 0) {
|
|
ip->i_flag |= IUPD;
|
|
ITIMES(ip);
|
|
}
|
|
/*
|
|
* file system was modified
|
|
*/
|
|
LOCKFS_SET_MOD(UTOL(ITOU(ip)));
|
|
}
|
|
|
|
/*
|
|
* Handle all the dirty pages.
|
|
*
|
|
* Clustering changes: instead of grabbing a blocks worth,
|
|
* take whatever the extent tells us to.
|
|
*
|
|
* This code *assumes* that the list is in increasing order.
|
|
* There's a performance hit if it's not.
|
|
*/
|
|
pp = NULL;
|
|
while (err == 0 && dirty != NULL) {
|
|
io_off = dirty->p_offset;
|
|
lbn = lblkno(fs, io_off);
|
|
boff = blkoff(fs, io_off);
|
|
/*
|
|
* Normally the blocks should already be allocated for
|
|
* any dirty pages; we only need to use bmap_read (S_OTHER)
|
|
* here and we should not get back a bn == UFS_HOLE.
|
|
*/
|
|
if (err = bmap_read(ip, lbn, boff, &bn, &bmaplen)) {
|
|
break;
|
|
}
|
|
if (bn == UFS_HOLE) {
|
|
if (!IS_SWAPVP(vp)) {
|
|
printf("ip=%x lbn=%d boff=%d off=%d poff=%d\n",
|
|
ip, lbn, boff, off, io_off);
|
|
panic("ufs_putpage hole");
|
|
}
|
|
/*
|
|
* Allocate for "holey" ufs file now.
|
|
* XXX - should redo the anon code to
|
|
* synchronously insure that all the
|
|
* needed backing store is allocated.
|
|
*/
|
|
err = bmap_write(ip, lbn, boff,
|
|
&bn, &bmaplen, (int)blksize(fs, ip, lbn), 1);
|
|
if (err) {
|
|
break;
|
|
}
|
|
ASSERT(bn != UFS_HOLE);
|
|
}
|
|
|
|
VFS_RECORD(vp->v_vfsp, VS_PUTPAGE, VS_MISS);
|
|
/*
|
|
* Pull off up to clustsize as long as it's contig.
|
|
* bmaplen tells everything we need to know.
|
|
* The list from pvn_xxx is sorted, all we have to check
|
|
* for are gaps.
|
|
*/
|
|
ASSERT(bmaplen > 0); /* leave this in */
|
|
pp = io_list = dirty;
|
|
io_len = 0;
|
|
do {
|
|
io_len += PAGESIZE;
|
|
bmaplen -= PAGESIZE;
|
|
pp = pp->p_next;
|
|
} while (bmaplen > 0 &&
|
|
pp != dirty && pp->p_offset == io_off + io_len);
|
|
|
|
/*
|
|
* Might have hit a gap or run out of extent.
|
|
* Have to break the list right before pp.
|
|
* No spls because the pages are held.
|
|
*/
|
|
if (pp != dirty &&
|
|
(pp->p_offset != io_off + io_len || bmaplen <= 0)) {
|
|
struct page *tail;
|
|
|
|
dirty = pp;
|
|
tail = io_list->p_prev;
|
|
pp = pp->p_prev;
|
|
tail->p_next = dirty;
|
|
dirty->p_prev = tail;
|
|
io_list->p_prev = pp;
|
|
pp->p_next = io_list;
|
|
} else {
|
|
dirty = NULL;
|
|
}
|
|
|
|
/*
|
|
* Might have gone too far (bmaplen is negative).
|
|
* We could have several full blocks and then a frag.
|
|
*/
|
|
if (bmaplen < 0)
|
|
io_len += bmaplen;
|
|
|
|
dbn = fsbtodb(fs, bn) + btodb(boff);
|
|
err = ufs_writelbn(ip, dbn, io_list, io_len, 0, flags);
|
|
pp = NULL;
|
|
}
|
|
IUNLOCK(ip);
|
|
|
|
if (err != 0) {
|
|
if (pp != NULL)
|
|
pvn_fail(pp, B_WRITE | flags);
|
|
if (dirty != NULL)
|
|
pvn_fail(dirty, B_WRITE | flags);
|
|
} else if (off == 0 && (len == 0 || len >= ip->i_size)) {
|
|
/*
|
|
* If doing "synchronous invalidation", make
|
|
* sure that all the pages are actually gone.
|
|
*
|
|
* We change len (possibly) from i_size to 0. This will
|
|
* make sure we get *all* the pages, including pages that
|
|
* may be past EOF. The other path may miss them.
|
|
*/
|
|
if ((flags & (B_INVAL | B_ASYNC)) == B_INVAL &&
|
|
((vp->v_pages != NULL) && (vp->v_pages->p_lckcnt == 0))) {
|
|
len = 0;
|
|
goto again;
|
|
}
|
|
/*
|
|
* We have just sync'ed back all the pages
|
|
* on the inode, turn off the IMODTIME flag.
|
|
*/
|
|
ip->i_flag &= ~IMODTIME;
|
|
}
|
|
|
|
/*
|
|
* Instead of using VN_RELE here we are careful to only call
|
|
* the inactive routine if the vnode reference count is now zero,
|
|
* but it wasn't zero coming into putpage. This is to prevent
|
|
* recursively calling the inactive routine on a vnode that
|
|
* is already considered in the `inactive' state.
|
|
* XXX - inactive is a relative term here (sigh).
|
|
*/
|
|
errout:
|
|
if (--vp->v_count == 0 && vpcount > 0)
|
|
iinactive(ip);
|
|
return (err);
|
|
}
|
|
|
|
/*ARGSUSED*/
|
|
static int
|
|
ufs_map(vp, off, as, addrp, len, prot, maxprot, flags, cred)
|
|
struct vnode *vp;
|
|
u_int off;
|
|
struct as *as;
|
|
addr_t *addrp;
|
|
u_int len;
|
|
u_int prot, maxprot;
|
|
u_int flags;
|
|
struct ucred *cred;
|
|
{
|
|
struct segvn_crargs vn_a;
|
|
|
|
VFS_RECORD(vp->v_vfsp, VS_MAP, VS_CALL);
|
|
|
|
if ((int)off < 0 || (int)(off + len) < 0)
|
|
return (EINVAL);
|
|
|
|
if (vp->v_type != VREG)
|
|
return (ENODEV);
|
|
|
|
if ((flags & MAP_FIXED) == 0) {
|
|
map_addr(addrp, len, (off_t)off, 1);
|
|
if (*addrp == NULL)
|
|
return (ENOMEM);
|
|
} else {
|
|
/*
|
|
* User specified address - blow away any previous mappings
|
|
*/
|
|
(void) as_unmap(as, *addrp, len);
|
|
}
|
|
|
|
vn_a.vp = vp;
|
|
vn_a.offset = off;
|
|
vn_a.type = flags & MAP_TYPE;
|
|
vn_a.prot = prot;
|
|
vn_a.maxprot = maxprot;
|
|
vn_a.cred = cred;
|
|
vn_a.amp = NULL;
|
|
|
|
return (as_map(as, *addrp, len, segvn_create, (caddr_t)&vn_a));
|
|
}
|
|
|
|
static int
|
|
ufs_cmp(vp1, vp2)
|
|
struct vnode *vp1, *vp2;
|
|
{
|
|
|
|
VFS_RECORD(vp1->v_vfsp, VS_CMP, VS_CALL);
|
|
return (vp1 == vp2);
|
|
}
|
|
|
|
/*ARGSUSED*/
|
|
static int
|
|
ufs_realvp(vp, vpp)
|
|
struct vnode *vp;
|
|
struct vnode **vpp;
|
|
{
|
|
|
|
VFS_RECORD(vp->v_vfsp, VS_REALVP, VS_CALL);
|
|
return (EINVAL);
|
|
}
|
|
|
|
static int
|
|
ufs_badop()
|
|
{
|
|
|
|
panic("ufs_badop");
|
|
}
|
|
|
|
/*ARGSUSED*/
|
|
static int
|
|
ufs_cntl(vp, cmd, idata, odata, iflag, oflag)
|
|
struct vnode *vp;
|
|
int cmd, iflag, oflag;
|
|
caddr_t idata, odata;
|
|
|
|
{
|
|
/*
|
|
* Currently we only allow a cmd passed in and an int passed out
|
|
*/
|
|
ASSERT(odata && oflag == CNTL_INT32);
|
|
switch (cmd) {
|
|
default:
|
|
return (EINVAL);
|
|
case _PC_LINK_MAX:
|
|
*(int *)odata = MAXLINK;
|
|
break;
|
|
case _PC_MAX_CANON:
|
|
*(int *)odata = CANBSIZ;
|
|
break;
|
|
case _PC_NAME_MAX:
|
|
*(int *)odata = MAXNAMLEN;
|
|
break;
|
|
case _PC_PATH_MAX:
|
|
*(int *)odata = MAXPATHLEN;
|
|
break;
|
|
case _PC_PIPE_BUF:
|
|
*(int *)odata = fifoinfo.fifobuf;
|
|
break;
|
|
case _PC_VDISABLE:
|
|
*(int *)odata = VDISABLE;
|
|
break;
|
|
case _PC_CHOWN_RESTRICTED:
|
|
*(int *)odata = 1;
|
|
break;
|
|
case _PC_NO_TRUNC:
|
|
*(int *)odata = 1;
|
|
break;
|
|
}
|
|
return (0);
|
|
}
|
|
|
|
#ifndef REMOVE_OLD_UFS
|
|
/*
|
|
* This stuff is obsolete. Here for compat but we're phasing out
|
|
* 4K file systems.
|
|
*/
|
|
|
|
/*
|
|
* Called from pvn_getpages or ufs_getpage to get a particular page.
|
|
* When we are called the inode is already locked. If rw == S_WRITE
|
|
* and the block is not currently allocated we need to allocate the
|
|
* needed block(s).
|
|
*
|
|
* bsize is either 4k or 8k. To handle the case of 4k bsize and 8k pages
|
|
* we will do two reads to get the data and don't bother with read ahead.
|
|
* Thus having 4k file systems on a Sun-3 works, but it is not recommended.
|
|
*
|
|
* XXX - should handle arbitrary file system block and page sizes.
|
|
*/
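/*
* Example of the two-read (multi_io) case handled here: with 8k pages
* on a 4k file system, one page spans two logical blocks, so the page
* is filled by one strategy call for `bn' and, when the second block
* exists, another for `bn2' at a page offset of bsize; read-ahead is
* skipped entirely in that configuration.
*/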
|
|
/*ARGSUSED*/
|
|
oldufs_getapage(vp, off, protp, pl, plsz, seg, addr, rw, cred)
|
|
struct vnode *vp;
|
|
register u_int off;
|
|
u_int *protp;
|
|
struct page *pl[]; /* NULL if async IO is requested */
|
|
u_int plsz;
|
|
struct seg *seg;
|
|
addr_t addr;
|
|
enum seg_rw rw;
|
|
struct ucred *cred;
|
|
{
|
|
register struct inode *ip;
|
|
register struct fs *fs;
|
|
register int bsize;
|
|
u_int xlen;
|
|
struct buf *bp, *bp2;
|
|
struct vnode *devvp;
|
|
struct page *pp, *pp2, **ppp, *pagefound;
|
|
daddr_t lbn, bn, bn2;
|
|
u_int io_off, io_len;
|
|
u_int lbnoff, blksz;
|
|
int err, nio, do2ndread, pgoff;
|
|
int multi_io;
|
|
dev_t dev;
|
|
|
|
VFS_RECORD(vp->v_vfsp, VS_GETPAGE, VS_CALL);
|
|
|
|
ip = VTOI(vp);
|
|
fs = ip->i_fs;
|
|
bsize = fs->fs_bsize;
|
|
devvp = ip->i_devvp;
|
|
dev = devvp->v_rdev;
|
|
multi_io = (PAGESIZE > bsize);
|
|
|
|
reread:
|
|
bp = NULL;
|
|
bp2 = NULL;
|
|
pagefound = NULL;
|
|
pgoff = 0;
|
|
lbn = lblkno(fs, off);
|
|
lbnoff = off & fs->fs_bmask;
|
|
if (pl != NULL)
|
|
pl[0] = NULL;
|
|
|
|
err = bmap(ip, lbn, &bn, &bn2, (int)blksize(fs, ip, lbn), rw, 0);
|
|
if (err)
|
|
goto out;
|
|
|
|
if (bn == UFS_HOLE && protp != NULL)
|
|
*protp &= ~PROT_WRITE;
|
|
|
|
if (multi_io) {
|
|
if (bsize != PAGESIZE / 2) {
|
|
/*
|
|
* This should have been prevented at mount time
|
|
* XXX - need to rewrite to avoid this restriction.
|
|
*/
|
|
panic("ufs_getapage bad bsize");
|
|
/* NOTREACHED */
|
|
}
|
|
|
|
if (bn2 == UFS_HOLE && ip->i_size > lbnoff + bsize) {
|
|
/*
|
|
* Try bmap with bn2 as the primary block now.
|
|
*/
|
|
err = bmap(ip, lbn + 1, &bn2, (daddr_t *)0,
|
|
(int)blksize(fs, ip, lbn + 1), rw, 0);
|
|
if (err)
|
|
goto out;
|
|
}
|
|
|
|
/*
|
|
* See if we are going to need to do a 2nd read
|
|
* to handle the bsize == PAGESIZE / 2 case.
|
|
*/
|
|
if (bn != UFS_HOLE && bn2 != UFS_HOLE &&
|
|
lbnoff + bsize < ip->i_size) {
|
|
nio = 2;
|
|
do2ndread = 1;
|
|
} else {
|
|
nio = 1;
|
|
do2ndread = 0;
|
|
if (bn2 == UFS_HOLE && lbnoff + bsize < ip->i_size)
|
|
*protp &= ~PROT_WRITE;
|
|
}
|
|
} else {
|
|
nio = 1;
|
|
if (ufs_ra && ip->i_nextr == off && bn2 != UFS_HOLE &&
|
|
lbnoff + bsize < ip->i_size) {
|
|
do2ndread = 1;
|
|
} else {
|
|
do2ndread = 0;
|
|
}
|
|
}
|
|
|
|
again:
|
|
if ((pagefound = page_find(vp, off)) == NULL) {
|
|
/*
|
|
* Compute the size of the block we actually want
|
|
* to read to be the smaller of a page boundary
|
|
* or the ufs acquired block size (i.e., we don't
|
|
* want to try and read the next page beyond EOF).
|
|
*/
|
|
blksz = MIN(roundup(ip->i_size, PAGESIZE) - lbnoff,
|
|
blksize(fs, ip, lbn));
|
|
|
|
if (bn == UFS_HOLE || off >= lbnoff + blksz) {
|
|
/*
|
|
* Block for this page is not allocated
|
|
* and the page was not found.
|
|
*/
|
|
if (pl != NULL) {
|
|
/*
|
|
* If we need a page, allocate and
|
|
* return a zero page. This assumes
|
|
* that for "async" faults it is not
|
|
* worth it to create the page now.
|
|
*/
|
|
pp = rm_allocpage(seg, addr, PAGESIZE, 1);
|
|
trace6(TR_SEG_ALLOCPAGE, seg,
|
|
(u_int)addr & PAGEMASK, TRC_SEG_UNK,
|
|
vp, off, pp);
|
|
if (page_enter(pp, vp, off))
|
|
panic("ufs_getapage page_enter");
|
|
pagezero(pp, 0, PAGESIZE);
|
|
page_unlock(pp);
|
|
pl[0] = pp;
|
|
pl[1] = NULL;
|
|
u.u_ru.ru_minflt++;
|
|
}
|
|
} else {
|
|
/*
|
|
* Need to really do disk IO to get the page(s).
|
|
*/
|
|
VFS_RECORD(vp->v_vfsp, VS_GETPAGE, VS_MISS);
|
|
|
|
pp = pvn_kluster(vp, off, seg, addr, &io_off, &io_len,
|
|
lbnoff, blksz, 0);
|
|
/*
|
|
* Somebody has entered the page before us, so
|
|
* just use it.
|
|
*/
|
|
if (pp == NULL)
|
|
goto again;
|
|
|
|
if (pl != NULL) {
|
|
register int sz;
|
|
|
|
if (plsz >= io_len) {
|
|
/*
|
|
* Everything fits, set up to load
|
|
* up and hold all the pages.
|
|
*/
|
|
pp2 = pp;
|
|
sz = io_len;
|
|
} else {
|
|
/*
|
|
* Set up to load plsz worth
|
|
* starting at the needed page.
|
|
*/
|
|
for (pp2 = pp; pp2->p_offset != off;
|
|
pp2 = pp2->p_next) {
|
|
ASSERT(pp2->p_next->p_offset !=
|
|
pp->p_offset);
|
|
}
|
|
sz = plsz;
|
|
}
|
|
|
|
ppp = pl;
|
|
do {
|
|
PAGE_HOLD(pp2);
|
|
*ppp++ = pp2;
|
|
pp2 = pp2->p_next;
|
|
sz -= PAGESIZE;
|
|
} while (sz > 0);
|
|
*ppp = NULL; /* terminate list */
|
|
}
|
|
|
|
if (nio > 1)
|
|
pp->p_nio = nio;
|
|
bp = pageio_setup(pp, io_len, devvp, pl == NULL ?
|
|
(B_ASYNC | B_READ) : B_READ);
|
|
|
|
bp->b_dev = dev;
|
|
bp->b_blkno = fsbtodb(fs, bn) +
|
|
btodb(blkoff(fs, io_off));
|
|
bp->b_un.b_addr = 0;
|
|
|
|
/*
|
|
* Zero part of page which we are not
|
|
* going to be reading from disk now.
|
|
* pp->p_prev is usually the same page unless there is
* a list of pages, as with exec.
|
|
*/
|
|
xlen = io_len & PAGEOFFSET;
|
|
if (xlen != 0)
|
|
pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
|
|
|
|
(*bdevsw[major(dev)].d_strategy)(bp);
|
|
|
|
ip->i_nextr = io_off + io_len;
|
|
u.u_ru.ru_majflt++;
|
|
if (seg == segkmap)
|
|
u.u_ru.ru_inblock++; /* count as `read' */
|
|
cnt.v_pgin++;
|
|
cnt.v_pgpgin += btopr(io_len);
|
|
}
|
|
}
|
|
|
|
lbn++;
|
|
lbnoff += fs->fs_bsize;
|
|
|
|
if (do2ndread && !(multi_io && pagefound != NULL)) {
|
|
addr_t addr2;
|
|
|
|
addr2 = addr + (lbnoff - off);
|
|
|
|
/*
|
|
* Compute the size of the block we actually want
|
|
* to read to be the smaller of a page boundary
|
|
* or the ufs acquired block size (i.e., we don't
|
|
* want to try and read the next page beyond EOF).
|
|
*/
|
|
blksz = MIN(roundup(ip->i_size, PAGESIZE) - lbnoff,
|
|
blksize(fs, ip, lbn));
|
|
|
|
if (multi_io) {
|
|
/*
|
|
* Second block for same page (bsize < PAGESIZE)
|
|
*/
|
|
pp2 = pp;
|
|
if (nio < 2) {
|
|
/*
|
|
* The first block was a hole, set up
|
|
* the page properly for io now. Otherwise,
|
|
* the page should already be marked as
|
|
* being paged in with an nio value of 2.
|
|
*/
|
|
page_lock(pp2);
|
|
PAGE_HOLD(pp2);
|
|
pp2->p_intrans = 1;
|
|
pp2->p_pagein = 1;
|
|
}
|
|
io_len = blksz;
|
|
pgoff = bsize;
|
|
} else {
|
|
/*
|
|
* Read-ahead case (bsize >= PAGESIZE)
|
|
* If addr is now in a different seg,
|
|
* don't bother trying with read-ahead.
|
|
*/
|
|
if (addr2 >= seg->s_base + seg->s_size) {
|
|
pp2 = NULL;
|
|
} else {
|
|
pp2 = pvn_kluster(vp, lbnoff, seg, addr2,
|
|
&io_off, &io_len, lbnoff, blksz, 1);
|
|
}
|
|
pgoff = 0;
|
|
}
|
|
|
|
if (pp2 != NULL) {
|
|
/*
|
|
* Do a synchronous read here only if a page
|
|
* list was given to this routine and the
|
|
* block size is smaller than the page size.
|
|
*/
|
|
bp2 = pageio_setup(pp2, io_len, devvp,
|
|
(pl != NULL && multi_io) ?
|
|
B_READ : (B_ASYNC | B_READ));
|
|
|
|
bp2->b_dev = dev;
|
|
bp2->b_blkno = fsbtodb(fs, bn2);
|
|
bp2->b_un.b_addr = (caddr_t)pgoff;
|
|
|
|
/*
|
|
* Zero part of page which we are not
|
|
* going to be reading from disk now
|
|
* if it hasn't already been done.
|
|
*/
|
|
xlen = (io_len + pgoff) & PAGEOFFSET;
|
|
if ((xlen != 0) && !multi_io)
|
|
pagezero(pp2->p_prev, xlen, PAGESIZE - xlen);
|
|
|
|
(*bdevsw[major(dev)].d_strategy)(bp2);
|
|
|
|
/*
|
|
* Should we bill read ahead to extra faults?
|
|
*/
|
|
u.u_ru.ru_majflt++;
|
|
if (seg == segkmap)
|
|
u.u_ru.ru_inblock++; /* count as `read' */
|
|
cnt.v_pgin++;
|
|
cnt.v_pgpgin += btopr(io_len);
|
|
}
|
|
}
|
|
|
|
out:
|
|
if (pl == NULL)
|
|
return (err);
|
|
|
|
if (bp != NULL) {
|
|
if (err == 0)
|
|
err = biowait(bp);
|
|
else
|
|
(void) biowait(bp);
|
|
pageio_done(bp);
|
|
}
|
|
|
|
/*
|
|
* Only wait for the second read operation
|
|
* when it is required for getting a page.
|
|
*/
|
|
if (multi_io && bp2 != NULL) {
|
|
if (err == 0)
|
|
err = biowait(bp2);
|
|
else
|
|
(void) biowait(bp2);
|
|
pageio_done(bp2);
|
|
}
|
|
|
|
if (pagefound != NULL) {
|
|
register int s;
|
|
|
|
/*
|
|
* We need to be careful here because if the page was
|
|
* previously on the free list, we might have already
|
|
* lost it at interrupt level.
|
|
*/
|
|
s = splvm();
|
|
if (pagefound->p_vnode == vp && pagefound->p_offset == off) {
|
|
/*
|
|
* If the page is still intransit or if
|
|
* it is on the free list call page_lookup
|
|
* to try and wait for / reclaim the page.
|
|
*/
|
|
if (pagefound->p_intrans || pagefound->p_free)
|
|
pagefound = page_lookup(vp, off);
|
|
}
|
|
if (pagefound == NULL || pagefound->p_offset != off ||
|
|
pagefound->p_vnode != vp || pagefound->p_gone) {
|
|
(void) splx(s);
|
|
ufs_lostpage++;
|
|
goto reread;
|
|
}
|
|
PAGE_HOLD(pagefound);
|
|
(void) splx(s);
|
|
pl[0] = pagefound;
|
|
pl[1] = NULL;
|
|
u.u_ru.ru_minflt++;
|
|
ip->i_nextr = off + PAGESIZE;
|
|
}
|
|
|
|
if (err) {
|
|
for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
|
|
PAGE_RELE(*ppp);
|
|
}
|
|
|
|
return (err);
|
|
}
|
|
|
|
/*
|
|
* Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
|
|
* If len == 0, do from off to EOF.
|
|
*
|
|
* The normal cases should be len == 0 & off == 0 (entire vp list),
|
|
* len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
|
|
* (from pageout).
|
|
*
|
|
* Note that for ufs it is possible to have dirty pages beyond
|
|
* roundup(ip->i_size, PAGESIZE). This can happen if the file
|
|
* length is long enough to involve indirect blocks (which are
|
|
* always fs->fs_bsize'd) and PAGESIZE < bsize while the length
|
|
* is such that roundup(blkoff(fs, ip->i_size), PAGESIZE) < bsize.
|
|
*/
|
|
/*ARGSUSED*/
|
|
int
|
|
oldufs_putpage(vp, off, len, flags, cred)
|
|
register struct vnode *vp;
|
|
u_int off, len;
|
|
int flags;
|
|
struct ucred *cred;
|
|
{
|
|
register struct inode *ip;
|
|
register struct page *pp;
|
|
register struct fs *fs;
|
|
struct page *dirty, *io_list;
|
|
register u_int io_off, io_len;
|
|
daddr_t lbn, bn, bn2;
|
|
u_int lbn_off;
|
|
int bsize, bsize2;
|
|
int vpcount;
|
|
int err;
|
|
|
|
#ifdef VFSSTATS
|
|
VFS_RECORD(vp->v_vfsp, VS_PUTPAGE, VS_CALL);
|
|
#endif
|
|
|
|
ip = VTOI(vp);
|
|
|
|
if (vp->v_pages == NULL || off >= ip->i_size)
|
|
return (0);
|
|
|
|
vpcount = vp->v_count;
|
|
VN_HOLD(vp);
|
|
fs = ip->i_fs;
|
|
|
|
again:
|
|
|
|
/*
|
|
* Cannot afford to sleep on inode now, give up
|
|
*/
|
|
if (ICHECK(ip)) {
|
|
err = ENOMEM;
|
|
goto errout;
|
|
}
|
|
|
|
/*
|
|
* Hold inode lock for duration of push
|
|
*/
|
|
ilock(ip);
|
|
if (len == 0) {
|
|
/*
|
|
* Search the entire vp list for pages >= off
|
|
*/
|
|
dirty = pvn_vplist_dirty(vp, off, flags);
|
|
} else {
|
|
/*
|
|
* Do a range from [off...off + len) via page_find.
|
|
* We set limits so that we kluster to bsize boundaries.
|
|
*/
|
|
if (off >= ip->i_size) {
|
|
dirty = NULL;
|
|
} else {
|
|
u_int fsize, eoff;
|
|
|
|
/*
|
|
* Use MAXBSIZE rounding to get indirect block pages
|
|
* which might be beyond roundup(ip->i_size, PAGESIZE).
|
|
*/
|
|
fsize = (ip->i_size + MAXBOFFSET) & MAXBMASK;
|
|
eoff = MIN(off + len, fsize);
|
|
dirty = pvn_range_dirty(vp, off, eoff,
|
|
(u_int)(off & fs->fs_bmask),
|
|
(u_int)((eoff + fs->fs_bsize - 1) & fs->fs_bmask),
|
|
flags);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Now pp will have the list of kept dirty pages marked for
|
|
* write back. All the pages on the pp list need to still
|
|
* be dealt with here. Verify that we really can do the
* write back to the filesystem; if we cannot and there are
* dirty pages, return an error condition.
|
|
*/
|
|
if (fs->fs_ronly && dirty != NULL)
|
|
err = EROFS;
|
|
else
|
|
err = 0;
|
|
|
|
if (dirty != NULL) {
|
|
/*
|
|
* Destroy the read ahead value now
|
|
* since we are really going to write
|
|
*/
|
|
ip->i_nextr = 0;
|
|
|
|
/*
|
|
* If the modified time on the inode has not already been
|
|
* set elsewhere (i.e. for write/setattr) or this is
|
|
* a call from msync (B_FORCE) we set the time now.
|
|
* This gives us approximate modified times for mmap'ed files
|
|
* which are modified via stores in the user address space.
|
|
*/
|
|
if ((ip->i_flag & IMODTIME) == 0 || (flags & B_FORCE) != 0) {
|
|
ip->i_flag |= IUPD;
|
|
ITIMES(ip);
|
|
}
|
|
/*
|
|
* file system was modified
|
|
*/
|
|
LOCKFS_SET_MOD(UTOL(ITOU(ip)));
|
|
}
|
|
|
|
/*
|
|
* Handle all the dirty pages.
|
|
*/
|
|
pp = NULL;
|
|
while (err == 0 && dirty != NULL) {
|
|
/*
|
|
* Pull off a contiguous chunk that fits in one lbn.
|
|
*/
|
|
|
|
io_off = dirty->p_offset;
|
|
lbn = lblkno(fs, io_off);
|
|
bsize = blksize(fs, ip, lbn);
|
|
/*
|
|
* Normally the blocks should already be allocated for
|
|
* any dirty pages, we only need to use S_OTHER
|
|
* here and we should not get back a bn == UFS_HOLE.
|
|
*/
|
|
err = bmap(ip, lbn, &bn, &bn2, bsize, S_OTHER, 1);
|
|
if (err) {
|
|
break;
|
|
}
|
|
if (bn == UFS_HOLE) {
|
|
if (!IS_SWAPVP(vp) && fs->fs_bsize >= PAGESIZE)
|
|
panic("ufs_putpage hole");
|
|
/*
|
|
* Allocate for "holey" ufs file now.
|
|
* XXX - should redo the anon code to
|
|
* synchronously insure that all the
|
|
* needed backing store is allocated.
|
|
*/
|
|
err = bmap(ip, lbn, &bn, &bn2, bsize, S_WRITE, 1);
|
|
if (err) {
|
|
break;
|
|
}
|
|
ASSERT(bn != UFS_HOLE);
|
|
}
|
|
|
|
VFS_RECORD(vp->v_vfsp, VS_PUTPAGE, VS_MISS);
|
|
|
|
pp = io_list = dirty;
|
|
io_len = PAGESIZE;
|
|
lbn_off = lbn << fs->fs_bshift;
|
|
page_sub(&dirty, pp);
|
|
|
|
while (dirty != NULL && dirty->p_offset < lbn_off + bsize &&
|
|
dirty->p_offset == io_off + io_len) {
|
|
pp = dirty;
|
|
page_sub(&dirty, pp);
|
|
/*
|
|
* Add the page to the end of the list. page_sortadd
|
|
* can do this without walking the list.
|
|
*/
|
|
page_sortadd(&io_list, pp);
|
|
io_len += PAGESIZE;
|
|
}
|
|
|
|
/* IO may be asynch, so need to set nio first */
|
|
if (fs->fs_bsize < PAGESIZE && ip->i_size > lbn_off + bsize) {
|
|
pp->p_nio = lblkno(fs, PAGESIZE);
|
|
} else {
|
|
pp->p_nio = 0;
|
|
/*
|
|
* Check for page length rounding problems
|
|
*/
|
|
if (io_off + io_len > lbn_off + bsize) {
|
|
ASSERT((io_off + io_len) - (lbn_off + bsize) <
|
|
PAGESIZE);
|
|
io_len = lbn_off + bsize - io_off;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Should zero any bytes beyond EOF,
|
|
* but it's not worth the work now.
|
|
*/
|
|
|
|
/*
|
|
* See if we need to do a 2nd bmap operation.
|
|
* This is needed if nio is non-zero and we
|
|
* didn't get a bn back from the 1st bmap().
|
|
*/
|
|
if (pp->p_nio) {
|
|
ASSERT(pp->p_nio == 2); /* XXX */
|
|
++lbn;
|
|
bsize2 = blksize(fs, ip, lbn);
|
|
if (bn2 == UFS_HOLE) {
|
|
/*
|
|
* Allocate backing store only if this is
|
|
* a swap vnode in case someone is using
|
|
* a "holey" ufs swap file with bsize <
|
|
* PAGESIZE (e.g., a 4k fs w/ 8k pages).
|
|
* XXX - should redo the anon code to
|
|
* synchronously insure that all the
|
|
* needed backing store is allocated.
|
|
*/
|
|
err = bmap(ip, lbn, &bn2, (daddr_t *)NULL,
|
|
bsize2, IS_SWAPVP(vp)? S_WRITE:S_OTHER, 1);
|
|
|
|
if (err) {
|
|
pvn_fail(pp, B_WRITE | flags);
|
|
break;
|
|
}
|
|
}
|
|
if (bn2 == UFS_HOLE)
|
|
pp->p_nio = 1;
|
|
|
|
/*
|
|
* Ok, now do it.
|
|
*/
|
|
err = ufs_writelbn(ip, fsbtodb(fs, bn), pp,
|
|
(u_int)bsize, 0, flags);
|
|
if (err == 0 && bn2 != UFS_HOLE) {
|
|
err = ufs_writelbn(ip, fsbtodb(fs, bn2), pp,
|
|
(u_int)bsize2, (u_int)fs->fs_bsize, flags);
|
|
pp = NULL;
|
|
}
|
|
} else {
|
|
bn = fsbtodb(fs, bn) + btodb(io_off - lbn_off);
|
|
err = ufs_writelbn(ip, bn, io_list, io_len, 0, flags);
|
|
pp = NULL;
|
|
}
|
|
}
|
|
iunlock(ip);
|
|
|
|
if (err != 0) {
|
|
if (pp != NULL)
|
|
pvn_fail(pp, B_WRITE | flags);
|
|
if (dirty != NULL)
|
|
pvn_fail(dirty, B_WRITE | flags);
|
|
} else if (off == 0 && (len == 0 || len >= ip->i_size)) {
|
|
/*
|
|
* If doing "synchronous invalidation", make
|
|
* sure that all the pages are actually gone.
|
|
*/
|
|
if ((flags & (B_INVAL | B_ASYNC)) == B_INVAL &&
|
|
((vp->v_pages != NULL) && (vp->v_pages->p_lckcnt == 0)))
|
|
goto again;
|
|
/*
|
|
* We have just sync'ed back all the pages
|
|
* on the inode, turn off the IMODTIME flag.
|
|
*/
|
|
ip->i_flag &= ~IMODTIME;
|
|
}
|
|
|
|
/*
|
|
* Instead of using VN_RELE here we are careful to only call
|
|
* the inactive routine if the vnode reference count is now zero,
|
|
* but it wasn't zero coming into putpage. This is to prevent
|
|
* recursively calling the inactive routine on a vnode that
|
|
* is already considered in the `inactive' state.
|
|
* XXX - inactive is a relative term here (sigh).
|
|
*/
|
|
errout:
|
|
if (--vp->v_count == 0 && vpcount > 0)
|
|
iinactive(ip);
|
|
return (err);
|
|
}
|
|
#else
|
|
oldufs_putpage(vp, off, len, flags, cred)
|
|
register struct vnode *vp;
|
|
u_int off, len;
|
|
int flags;
|
|
struct ucred *cred;
|
|
{
|
|
return (ENOSYS);
|
|
}
|
|
|
|
oldufs_getapage(vp, off, protp, pl, plsz, seg, addr, rw, cred)
|
|
struct vnode *vp;
|
|
register u_int off;
|
|
u_int *protp;
|
|
struct page *pl[]; /* NULL if async IO is requested */
|
|
u_int plsz;
|
|
struct seg *seg;
|
|
addr_t addr;
|
|
enum seg_rw rw;
|
|
struct ucred *cred;
|
|
{
|
|
return (ENOSYS);
|
|
}
|
|
#endif /* REMOVE_OLD_UFS */
|
|
/*
|
|
* ULOCKFS Intercept Routines
|
|
* VOP calls are intercepted and wrapped with lockfs code.
|
|
*/
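/*
* The ULOCKFS() wrapper used by the intercepts below is defined in the
* lockfs header, not in this file.  Conceptually it brackets the wrapped
* ufs_xxx() call with the bookkeeping routines at the bottom of this
* file, roughly (illustrative sketch only, not the actual expansion):
*
*	struct mount *mp;
*	int error;
*
*	if (error = ufs_lockfs_begin(vp, vaid, &mp))
*		return (error);
*	error = ufs_xxx(...);		(the wrapped VOP routine)
*	ufs_lockfs_end(vaid, mp);
*	return (error);
*/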
|
|
static int
|
|
ufs_l_open(vpp, flag, cred)
|
|
struct vnode **vpp;
|
|
int flag;
|
|
struct ucred *cred;
|
|
{
|
|
ULOCKFS(*vpp, VA_OPEN,
|
|
ufs_open(vpp, flag, cred));
|
|
}
|
|
static int
|
|
ufs_l_close(vp, flag, count, cred)
|
|
struct vnode *vp;
|
|
int flag;
|
|
int count;
|
|
struct ucred *cred;
|
|
{
|
|
ULOCKFS(vp, VA_CLOSE,
|
|
ufs_close(vp, flag, count, cred));
|
|
}
|
|
static int
|
|
ufs_l_rdwr(vp, uiop, rw, ioflag, cred)
|
|
struct vnode *vp;
|
|
struct uio *uiop;
|
|
enum uio_rw rw;
|
|
int ioflag;
|
|
struct ucred *cred;
|
|
{
|
|
if (rw == UIO_READ) {
|
|
ULOCKFS(vp, VA_READ,
|
|
ufs_rdwr(vp, uiop, rw, ioflag, cred));
|
|
} else {
|
|
ULOCKFS(vp, VA_WRITE,
|
|
ufs_rdwr(vp, uiop, rw, ioflag, cred));
|
|
}
|
|
}
|
|
static int
|
|
ufs_l_select(vp, which, cred)
|
|
struct vnode *vp;
|
|
int which;
|
|
struct ucred *cred;
|
|
{
|
|
ULOCKFS(vp, VA_SELECT,
|
|
ufs_select(vp, which, cred));
|
|
}
|
|
static int
|
|
ufs_l_getattr(vp, vap, cred)
|
|
struct vnode *vp;
|
|
register struct vattr *vap;
|
|
struct ucred *cred;
|
|
{
|
|
ULOCKFS(vp, VA_GETATTR,
|
|
ufs_getattr(vp, vap, cred));
|
|
}
|
|
static int
|
|
ufs_l_setattr(vp, vap, cred)
|
|
register struct vnode *vp;
|
|
register struct vattr *vap;
|
|
struct ucred *cred;
|
|
{
|
|
if (vap->va_size != (u_long)-1) {
|
|
ULOCKFS(vp, VA_TRUNC,
|
|
ufs_setattr(vp, vap, cred));
|
|
} else {
|
|
ULOCKFS(vp, VA_CHANGE,
|
|
ufs_setattr(vp, vap, cred));
|
|
}
|
|
}
|
|
static int
|
|
ufs_l_access(vp, mode, cred)
|
|
struct vnode *vp;
|
|
int mode;
|
|
struct ucred *cred;
|
|
{
|
|
ULOCKFS(vp, VA_ACCESS,
|
|
ufs_access(vp, mode, cred));
|
|
}
|
|
static int
|
|
ufs_l_readlink(vp, uiop, cred)
|
|
struct vnode *vp;
|
|
struct uio *uiop;
|
|
struct ucred *cred;
|
|
{
|
|
ULOCKFS(vp, VA_READLINK,
|
|
ufs_readlink(vp, uiop, cred));
|
|
}
|
|
static int
|
|
ufs_l_fsync(vp, cred)
|
|
struct vnode *vp;
|
|
struct ucred *cred;
|
|
{
|
|
ULOCKFS(vp, VA_FSYNC,
|
|
ufs_fsync(vp, cred));
|
|
}
|
|
static int
|
|
ufs_l_inactive(vp, cred)
|
|
struct vnode *vp;
|
|
struct ucred *cred;
|
|
{
|
|
ULOCKFS(vp, VA_INACTIVE,
|
|
ufs_inactive(vp, cred));
|
|
}
|
|
static int
|
|
ufs_l_lookup(dvp, nm, vpp, cred, pnp, flags)
|
|
struct vnode *dvp;
|
|
char *nm;
|
|
struct vnode **vpp;
|
|
struct ucred *cred;
|
|
struct pathname *pnp;
|
|
int flags;
|
|
{
|
|
ULOCKFS(dvp, VA_LOOKUP,
|
|
ufs_lookup(dvp, nm, vpp, cred, pnp, flags));
|
|
}
|
|
static int
|
|
ufs_l_create(dvp, nm, vap, exclusive, mode, vpp, cred)
|
|
struct vnode *dvp;
|
|
char *nm;
|
|
struct vattr *vap;
|
|
enum vcexcl exclusive;
|
|
int mode;
|
|
struct vnode **vpp;
|
|
struct ucred *cred;
|
|
{
|
|
ULOCKFS(dvp, VA_CREATE,
|
|
ufs_create(dvp, nm, vap, exclusive, mode, vpp, cred));
|
|
}
|
|
static int
|
|
ufs_l_remove(vp, nm, cred)
|
|
struct vnode *vp;
|
|
char *nm;
|
|
struct ucred *cred;
|
|
{
|
|
ULOCKFS(vp, VA_REMOVE,
|
|
ufs_remove(vp, nm, cred));
|
|
}
|
|
static int
|
|
ufs_l_link(vp, tdvp, tnm, cred)
|
|
struct vnode *vp;
|
|
register struct vnode *tdvp;
|
|
char *tnm;
|
|
struct ucred *cred;
|
|
{
|
|
ULOCKFS(vp, VA_LINK,
|
|
ufs_link(vp, tdvp, tnm, cred));
|
|
}
|
|
static int
|
|
ufs_l_rename(sdvp, snm, tdvp, tnm, cred)
|
|
struct vnode *sdvp;
|
|
char *snm;
|
|
struct vnode *tdvp;
|
|
char *tnm;
|
|
struct ucred *cred;
|
|
{
|
|
ULOCKFS(sdvp, VA_RENAME,
|
|
ufs_rename(sdvp, snm, tdvp, tnm, cred));
|
|
}
|
|
static int
|
|
ufs_l_mkdir(dvp, nm, vap, vpp, cred)
|
|
struct vnode *dvp;
|
|
char *nm;
|
|
register struct vattr *vap;
|
|
struct vnode **vpp;
|
|
struct ucred *cred;
|
|
{
|
|
ULOCKFS(dvp, VA_MKDIR,
|
|
ufs_mkdir(dvp, nm, vap, vpp, cred));
|
|
}
|
|
static int
|
|
ufs_l_rmdir(vp, nm, cred)
|
|
struct vnode *vp;
|
|
char *nm;
|
|
struct ucred *cred;
|
|
{
|
|
ULOCKFS(vp, VA_RMDIR,
|
|
ufs_rmdir(vp, nm, cred));
|
|
}
|
|
static int
|
|
ufs_l_readdir(vp, uiop, cred)
|
|
struct vnode *vp;
|
|
struct uio *uiop;
|
|
struct ucred *cred;
|
|
{
|
|
ULOCKFS(vp, VA_READDIR,
|
|
ufs_readdir(vp, uiop, cred));
|
|
}
|
|
static int
|
|
ufs_l_symlink(dvp, lnm, vap, tnm, cred)
|
|
register struct vnode *dvp;
|
|
char *lnm;
|
|
struct vattr *vap;
|
|
char *tnm;
|
|
struct ucred *cred;
|
|
{
|
|
ULOCKFS(dvp, VA_SYMLINK,
|
|
ufs_symlink(dvp, lnm, vap, tnm, cred));
|
|
}
|
|
static int
|
|
ufs_l_lockctl(vp, ld, cmd, cred, clid)
|
|
struct vnode *vp;
|
|
struct eflock *ld;
|
|
int cmd;
|
|
struct ucred *cred;
|
|
int clid;
|
|
{
|
|
ULOCKFS(vp, VA_LOCKCTL,
|
|
ufs_lockctl(vp, ld, cmd, cred, clid));
|
|
}
|
|
static int
|
|
ufs_l_fid(vp, fidpp)
|
|
struct vnode *vp;
|
|
struct fid **fidpp;
|
|
{
|
|
ULOCKFS(vp, VA_FID,
|
|
ufs_fid(vp, fidpp));
|
|
}
|
|
static int
|
|
ufs_l_getpage(vp, off, len, protp, pl, plsz, seg, addr, rw, cred)
|
|
struct vnode *vp;
|
|
u_int off, len;
|
|
u_int *protp;
|
|
struct page *pl[];
|
|
u_int plsz;
|
|
struct seg *seg;
|
|
addr_t addr;
|
|
enum seg_rw rw;
|
|
struct ucred *cred;
|
|
{
|
|
int vaccess;
|
|
|
|
if (seg->s_ops != &segvn_ops)
|
|
vaccess = VA_GETPRIVATE;
|
|
else if (((struct segvn_data *)seg->s_data)->type != MAP_SHARED)
|
|
vaccess = VA_GETPRIVATE;
|
|
else if (rw == S_OTHER)
|
|
vaccess = VA_GETWRITE;
|
|
else if ((*seg->s_ops->checkprot)(seg, addr, len, PROT_WRITE) != 0)
|
|
vaccess = VA_GETREAD;
|
|
else
|
|
vaccess = VA_GETWRITE;
|
|
|
|
ULOCKFS(vp, vaccess,
|
|
ufs_getpage(vp, off, len, protp, pl, plsz, seg, addr, rw,
|
|
cred));
|
|
}
|
|
static int
|
|
ufs_l_putpage(vp, off, len, flags, cred)
|
|
register struct vnode *vp;
|
|
u_int off, len;
|
|
int flags;
|
|
struct ucred *cred;
|
|
{
|
|
ULOCKFS(vp, VA_PUTPAGE,
|
|
ufs_putpage(vp, off, len, flags, cred));
|
|
}
|
|
static int
|
|
ufs_l_map(vp, off, as, addrp, len, prot, maxprot, flags, cred)
|
|
struct vnode *vp;
|
|
u_int off;
|
|
struct as *as;
|
|
addr_t *addrp;
|
|
u_int len;
|
|
u_int prot, maxprot;
|
|
u_int flags;
|
|
struct ucred *cred;
|
|
{
|
|
ULOCKFS(vp, VA_MAP,
|
|
ufs_map(vp, off, as, addrp, len, prot, maxprot, flags, cred));
|
|
}
|
|
static int
|
|
ufs_l_cntl(vp, cmd, idata, odata, iflag, oflag)
|
|
struct vnode *vp;
|
|
int cmd, iflag, oflag;
|
|
caddr_t idata, odata;
|
|
|
|
{
|
|
ULOCKFS(vp, VA_CNTL,
|
|
ufs_cntl(vp, cmd, idata, odata, iflag, oflag));
|
|
}
|
|
|
|
/*
|
|
* ULOCKFS ROUTINES
|
|
*/
|
|
|
|
/*
|
|
* ufs_lockfs_end
|
|
* Called at end of every VOP call
|
|
*/
|
|
ufs_lockfs_end(vaid, mp)
|
|
int vaid;
|
|
struct mount *mp;
|
|
{
|
|
struct ulockfs *ul = mp->m_ul;
|
|
|
|
/*
|
|
* if there are no more of these accesses outstanding
|
|
*/
|
|
if (--(ul->ul_vacount[vaid]) == 0)
|
|
/*
|
|
* lock in progress for this access
|
|
*/
|
|
if (ul->ul_vamask & (1<<vaid))
|
|
/*
|
|
* awaken locking process
|
|
*/
|
|
if (ul->ul_flags & ULOCKFS_VAWANT) {
|
|
ul->ul_flags &= ~ULOCKFS_VAWANT;
|
|
wakeup((caddr_t)mp);
|
|
}
|
|
}
|
|
|
|
int lockfs_interruptible = 0;
|
|
/*
|
|
* ufs_lockfs_begin
|
|
* Called at the beginning of every VOP call
|
|
*/
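/*
* If the current lock's access mask pends this kind of access, the
* caller sleeps on the mount point until ufs_thaw() wakes pended
* accesses, with three escapes: a recursive call made while the caller
* already holds the inode lock proceeds anyway, a hard-locked file
* system fails with EIO, and nfsd requests are bounced with EIO (and
* EAGAIN in u.u_XXX[0]) rather than being held up.  With
* lockfs_interruptible set, the sleep can also be broken by a signal.
*/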
|
|
ufs_lockfs_begin(vp, vaid, mpp)
|
|
struct vnode *vp;
|
|
int vaid;
|
|
struct mount **mpp;
|
|
{
|
|
struct mount *mp = (struct mount *)(vp->v_vfsp->vfs_data);
|
|
struct ulockfs *ul = mp->m_ul;
|
|
|
|
*mpp = mp;
|
|
|
|
/*
|
|
* current lock wants this access pended
|
|
*/
|
|
while (ul->ul_vamask & (1<<vaid)) {
|
|
/*
|
|
* can't pend it because it is recursive
|
|
* e.g., VOP_RDWR causing VOP_GETPAGE
|
|
*/
|
|
if ((VTOI(vp)->i_flag & ILOCKED) &&
|
|
(u.u_procp == (struct proc *)(VTOI(vp)->i_owner)))
|
|
break;
|
|
/*
|
|
* return EIO if hard locked
|
|
*/
|
|
if (LOCKFS_IS_HLOCK(UTOL(ul)))
|
|
return (EIO);
|
|
/*
|
|
* Don't pend nfsd's. Return EIO (with EAGAIN in u.u_XXX[0])
* and nfsd will drop the request.
|
|
*/
|
|
if (u.u_XXX[0] == ENOTBLK) {
|
|
u.u_XXX[0] = EAGAIN;
|
|
return (EIO);
|
|
}
|
|
if (lockfs_interruptible) {
|
|
int smask;
|
|
int s;
|
|
int interrupted;
|
|
|
|
/*
|
|
* pend access interruptibly (for some signals)
|
|
* See rpc/clnt_kudp.c. This is like an nfs mount
|
|
* with the intr option.
|
|
*/
|
|
s = splhigh();
|
|
smask = u.u_procp->p_sigmask;
|
|
u.u_procp->p_sigmask |= ~(sigmask(SIGHUP) |
|
|
sigmask(SIGINT) | sigmask(SIGQUIT) |
|
|
sigmask(SIGTERM));
|
|
interrupted = sleep((caddr_t)mp, PLOCK+PCATCH);
|
|
u.u_procp->p_sigmask = smask;
|
|
(void) splx(s);
|
|
if (interrupted)
|
|
return (EINTR);
|
|
} else
|
|
(void) sleep((caddr_t)mp, PZERO);
|
|
}
|
|
/*
|
|
* inc 'access in progress' count for this access
|
|
*/
|
|
ul->ul_vacount[vaid]++;
|
|
return (0);
|
|
}
|
|
/*
* Lock types are really indexes into the lockfs_vamask array.
* The accesses locked by a lock type can be changed by altering
* the mask, or by adding a new mask and incrementing LOCKFS_MAXLOCK.
*/
#define LOCKFS_MAXMASK (32)
u_long lockfs_vamask[LOCKFS_MAXMASK] = { LOCKFS_ULOCK_MASK,
LOCKFS_WLOCK_MASK,
LOCKFS_NLOCK_MASK,
LOCKFS_DLOCK_MASK,
LOCKFS_HLOCK_MASK };
u_long lockfs_maxlock = LOCKFS_MAXLOCK;
|
|
|
|
/*
|
|
* ufs_fiolfs
|
|
* file system locking ioctl handler
|
|
*/
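/*
* The sequence below is: mark the lock busy, copy in and check the
* requested lock (ufs_getlfd), freeze the file system so new accesses
* pend (ufs_freeze), wait for outstanding accesses to drain
* (ufs_quiesce), flush everything currently dirty (ufs_flush),
* reconcile in-core state with disk when coming out of a write lock
* (ufs_reconcile), then thaw down to the new lock and wake up pended
* processes (ufs_thaw).
*/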
|
|
static int
|
|
ufs_fiolfs(vp, lfup)
|
|
struct vnode *vp; /* vnode for some inode on fs */
|
|
struct lockfs **lfup; /* address of user lockfs struct */
|
|
{
|
|
int error; /* error return */
|
|
struct mount *mp; /* mount point of vp */
|
|
struct ulockfs *ul; /* ulockfs struct for mp */
|
|
struct lockfs *lfc; /* lockfs struct in ulockfs */
|
|
struct lockfs lfs; /* save current lock */
|
|
struct lockfs lfd; /* desired lock */
|
|
|
|
/*
|
|
* must be superuser
|
|
*/
|
|
if (!suser())
|
|
return (EPERM);
|
|
|
|
/*
|
|
* mount point, ufs lockfs, and current lockfs
|
|
*/
|
|
mp = (struct mount *)(vp->v_vfsp->vfs_data);
|
|
ul = mp->m_ul;
|
|
lfc = UTOL(ul);
|
|
|
|
/*
|
|
* if not already busy or hlocked, mark lock as busy
|
|
*/
|
|
if (LOCKFS_IS_BUSY(lfc))
|
|
return (EBUSY);
|
|
if (LOCKFS_IS_HLOCK(lfc))
|
|
return (EIO);
|
|
LOCKFS_SET_BUSY(lfc);
|
|
|
|
/*
|
|
* get and check the user's lockfs struct
|
|
*/
|
|
if (error = ufs_getlfd(vp, lfup, &lfd, lfc))
|
|
goto erridle;
|
|
|
|
/*
|
|
* Freeze the file system (pend future accesses)
|
|
*/
|
|
if (error = ufs_freeze(mp, &lfd, &lfs))
|
|
goto erridle;
|
|
|
|
/*
|
|
* Quiesce (wait for outstanding accesses to finish)
|
|
*/
|
|
if (error = ufs_quiesce(mp))
|
|
goto errout;
|
|
|
|
/*
|
|
* at least everything *currently* dirty goes out
|
|
*/
|
|
if (!LOCKFS_IS_ULOCK(lfc))
|
|
if (error = ufs_flush(mp))
|
|
goto errout;
|
|
|
|
/*
|
|
* reconcile superblock and inodes if fs was wlock'ed
|
|
*/
|
|
if (LOCKFS_IS_WLOCK(&lfs))
|
|
if (error = ufs_reconcile(mp))
|
|
goto errout;
|
|
|
|
/*
|
|
* thaw down to lfd.lf_lock (wakeup pended processes)
|
|
*/
|
|
if (error = ufs_thaw(mp))
|
|
goto errout;
|
|
|
|
/*
|
|
* idle the lock struct
|
|
*/
|
|
LOCKFS_CLR_BUSY(lfc);
|
|
|
|
/*
|
|
* free current comment
|
|
*/
|
|
kmem_free((caddr_t)lfs.lf_comment, (u_int)lfs.lf_comlen);
|
|
|
|
/*
|
|
* return status (such as the new key)
|
|
*/
|
|
return (ufs_fiolfss(vp, lfup));
|
|
|
|
errout:
|
|
/*
|
|
* if possible, apply original lock and clean up lock things
|
|
*/
|
|
ufs_unfreeze(mp, &lfs);
|
|
(void) ufs_thaw(mp);
|
|
erridle:
|
|
LOCKFS_CLR_BUSY(lfc);
|
|
|
|
return (error);
|
|
}
|
|
/*
|
|
* ufs_fioffs
|
|
* ioctl handler for FIOFFS
|
|
*/
|
|
static int
|
|
ufs_fioffs(vp, lfup)
|
|
struct vnode *vp; /* some vnode on fs */
|
|
struct lockfs **lfup; /* user's struct (must be NULL) */
|
|
{
|
|
/*
|
|
* no struct needed, yet
|
|
*/
|
|
if (*lfup != NULL)
|
|
return (EINVAL);
|
|
/*
|
|
* at least everything *currently* dirty goes out
|
|
*/
|
|
return (ufs_flush((struct mount *)(vp->v_vfsp->vfs_data)));
|
|
}
|
|
/*
|
|
* ufs_fiolfss
|
|
* ioctl handler for FIOLFSS
|
|
*/
|
|
static int
|
|
ufs_fiolfss(vp, lfup)
|
|
struct vnode *vp; /* some vnode on fs */
|
|
struct lockfs **lfup; /* user's lockfs struct */
|
|
{
|
|
int error;
|
|
u_int comlen; /* length of user's comment buf */
|
|
struct mount *mp;
|
|
struct ulockfs *ul;
|
|
struct lockfs *lfc; /* current lockfs struct */
|
|
struct lockfs lfu; /* copy of user's lockfs struct */
|
|
|
|
/*
|
|
* mount point and ulockfs and lockfs structs
|
|
*/
|
|
mp = (struct mount *)(vp->v_vfsp->vfs_data);
|
|
ul = mp->m_ul;
|
|
lfc = UTOL(ul);
|
|
|
|
/*
|
|
* get user's lockfs struct
|
|
*/
|
|
if (error = copyin((caddr_t)*lfup, (caddr_t)&lfu,
|
|
(u_int)(sizeof (struct lockfs))))
|
|
goto errout;
|
|
|
|
/*
|
|
* length of comment to return
|
|
*/
|
|
if (lfu.lf_comlen > lfc->lf_comlen)
|
|
comlen = lfc->lf_comlen;
|
|
else
|
|
comlen = lfu.lf_comlen;
|
|
|
|
/*
|
|
* return current lockfs struct to user
|
|
*/
|
|
lfu.lf_lock = lfc->lf_lock;
|
|
lfu.lf_key = lfc->lf_key;
|
|
lfu.lf_flags = lfc->lf_flags;
|
|
if (lfu.lf_comlen = comlen)
|
|
if (error = copyout(lfc->lf_comment, lfu.lf_comment, comlen))
|
|
goto errout;
|
|
error = copyout((caddr_t)&lfu, (caddr_t)*lfup,
|
|
(u_int)(sizeof (struct lockfs)));
|
|
errout:
|
|
return (error);
|
|
}
|
|
|
|
/*
|
|
* ufs_freeze
|
|
* pend future accesses for current lock and desired lock
|
|
*/
|
|
ufs_freeze(mp, lfd, lfs)
|
|
struct mount *mp;
|
|
struct lockfs *lfd; /* desired lock */
|
|
struct lockfs *lfs; /* save current lock here */
|
|
{
|
|
struct ulockfs *ul = mp->m_ul;
|
|
struct lockfs *lfc = UTOL(ul); /* current lock */
|
|
|
|
/*
|
|
* save current lock
|
|
*/
|
|
bcopy((caddr_t)lfc, (caddr_t)lfs, (u_int)sizeof (struct lockfs));
|
|
|
|
/*
|
|
* move over selected lock fields into lockfs struct
|
|
*/
|
|
lfc->lf_lock = lfd->lf_lock;
|
|
lfc->lf_key = lfd->lf_key;
|
|
lfc->lf_comlen = lfd->lf_comlen;
|
|
lfc->lf_comment = lfd->lf_comment;
|
|
|
|
/*
|
|
* pend current and desired lock's vop accesses for now
|
|
*/
|
|
ul->ul_vamask |= lockfs_vamask[lfc->lf_lock];
|
|
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* ufs_unfreeze
|
|
* lock failed, reset the old lock
|
|
*/
|
|
ufs_unfreeze(mp, lfr)
|
|
struct mount *mp;
|
|
struct lockfs *lfr; /* reset this lock */
|
|
{
|
|
u_int comlen;
|
|
caddr_t comment;
|
|
struct ulockfs *ul = mp->m_ul;
|
|
struct lockfs *lff = UTOL(ul); /* from this failed lock */
|
|
|
|
/*
|
|
* can't unfreeze a hlock
|
|
*/
|
|
if (LOCKFS_IS_HLOCK(lff)) {
|
|
/*
|
|
* free up comment from reset lock
|
|
*/
|
|
comlen = lfr->lf_comlen;
|
|
comment = lfr->lf_comment;
|
|
goto errout;
|
|
} else {
|
|
/*
|
|
* free up comment from failed lock
|
|
*/
|
|
comlen = lff->lf_comlen;
|
|
comment = lff->lf_comment;
|
|
}
|
|
|
|
/*
|
|
* move over the LOCKFS_MOD flag
|
|
*/
|
|
if (LOCKFS_IS_MOD(lff))
|
|
LOCKFS_SET_MOD(lfr);
|
|
/*
|
|
* reset lock
|
|
*/
|
|
bcopy((caddr_t)lfr, (caddr_t)lff, (u_int)sizeof (struct lockfs));
|
|
|
|
/*
|
|
* reset vop access mask
|
|
*/
|
|
ul->ul_vamask = lockfs_vamask[lfr->lf_lock];
|
|
|
|
errout:
|
|
kmem_free(comment, comlen);
|
|
}
|
|
/*
|
|
* ufs_quiesce
|
|
* wait for outstanding accesses to finish
|
|
*/
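/*
* For each access type whose bit is set in ul_vamask, wait until its
* outstanding-call count drops to zero.  ULOCKFS_VAWANT asks
* ufs_lockfs_end() to issue a wakeup on the mount point when the last
* such access finishes; the sleep is interruptible, so a caught signal
* makes the lock attempt fail with EINTR.
*/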
|
|
ufs_quiesce(mp)
|
|
struct mount *mp; /* mount point */
|
|
{
|
|
int i; /* index */
|
|
u_long vamask; /* access mask */
|
|
struct ulockfs *ul = mp->m_ul; /* mp's ulockfs */
|
|
|
|
/*
|
|
* for each access
|
|
*/
|
|
for (i = 0, vamask = ul->ul_vamask; i < VA_MAX; ++i) {
|
|
/*
|
|
* if these accesses should finish
|
|
*/
|
|
if (vamask & (1<<i))
|
|
/*
|
|
* wait for outstanding ones to finish
|
|
*/
|
|
while (ul->ul_vacount[i]) {
|
|
ul->ul_flags |= ULOCKFS_VAWANT;
|
|
if (sleep((caddr_t)mp, PLOCK+PCATCH))
|
|
return (EINTR);
|
|
}
|
|
}
|
|
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* ufs_thaw
|
|
* thaw file system lock down to current value
|
|
*/
|
|
ufs_thaw(mp)
|
|
struct mount *mp;
|
|
{
|
|
int error = 0;
|
|
struct ulockfs *ul = mp->m_ul;
|
|
struct lockfs *lfc = UTOL(ul);
|
|
int noidel = ULOCKFS_IS_NOIDEL(ul);
|
|
|
|
/*
|
|
* if wlock or hlock
|
|
*/
|
|
if (LOCKFS_IS_WLOCK(lfc) || LOCKFS_IS_HLOCK(lfc)) {
|
|
|
|
/*
|
|
* don't keep access times
|
|
* don't free deleted files
|
|
* if superblock writes are allowed, limit them to me for now
|
|
*/
|
|
ul->ul_flags |= (ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
|
|
if (ul->ul_sbowner != (struct proc *)-1)
|
|
ul->ul_sbowner = u.u_procp;
|
|
|
|
/*
|
|
* wait for writes for deleted files and superblock updates
|
|
*/
|
|
if (error = ufs_flush(mp))
|
|
goto errout;
|
|
|
|
/*
|
|
* no one can write the superblock
|
|
*/
|
|
ul->ul_sbowner = (struct proc *)-1;
|
|
|
|
/*
|
|
* reset modified
|
|
*/
|
|
LOCKFS_CLR_MOD(lfc);
|
|
|
|
/*
|
|
* special processing for wlock/hlock
|
|
*/
|
|
if (LOCKFS_IS_WLOCK(lfc))
|
|
if (error = ufs_thaw_wlock(mp))
|
|
goto errout;
|
|
if (LOCKFS_IS_HLOCK(lfc))
|
|
while (ufs_thaw_hlock(mp))
|
|
if (error = ufs_flush(mp))
|
|
goto errout;
|
|
} else {
|
|
|
|
/*
|
|
* okay to keep access times
|
|
* okay to free deleted files
|
|
* okay to write the superblock
|
|
*/
|
|
ul->ul_flags &= ~(ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
|
|
ul->ul_sbowner = NULL;
|
|
|
|
/*
|
|
* flush in case deleted files are in memory
|
|
*/
|
|
if (noidel)
|
|
if (error = ufs_flush(mp))
|
|
goto errout;
|
|
}
|
|
|
|
/*
|
|
* allow all accesses except those needed for this lock
|
|
*/
|
|
ul->ul_vamask = lockfs_vamask[lfc->lf_lock];
|
|
|
|
/*
|
|
* wakeup any pended accesses (appropriate ones will sleep again)
|
|
*/
|
|
errout:
|
|
wakeup((caddr_t)mp);
|
|
return (error);
|
|
}
|
|
|
|
/*
|
|
* ufs_flush
|
|
* flush at least everything that is currently dirty
|
|
*/
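/*
* Roughly: purge the dnlc (and quota records when QUOTA is configured),
* sync and invalidate every in-core inode belonging to this fs, push the
* buffer cache and the block device's page cache, synchronously write
* the superblock and summary information, invalidate what is left, and
* finally recompute the file system clean flag.  Any error seen along
* the way is remembered (the last one wins) and returned.
*/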
|
|
ufs_flush(mp)
|
|
struct mount *mp;
|
|
{
|
|
int error;
|
|
int saverror = 0;
|
|
struct fs *fs = mp->m_bufp->b_un.b_fs;
|
|
union ihead *ih;
|
|
struct inode *ip;
|
|
ino_t *inop; /* array of ino_t's (0 terminated) */
|
|
ino_t *cinop; /* pointer into array */
|
|
u_long tino; /* total length of array */
|
|
|
|
/*
|
|
* get rid of dnlc entries
|
|
*/
|
|
(void) dnlc_purge();
|
|
|
|
#ifdef QUOTA
|
|
/*
|
|
* flush quota records
|
|
*/
|
|
(void) qsync(mp);
|
|
#endif /* QUOTA */
|
|
|
|
/*
|
|
* flush and synchronously invalidate page cache and inodes
|
|
*/
|
|
for (ih = ihead; ih < &ihead[INOHSZ]; ih++) {
|
|
ufs_getino(mp, ih, &inop, &tino);
|
|
for (cinop = inop; *cinop; ++cinop) {
|
|
if (error = iget(mp->m_dev, fs, *cinop, &ip)) {
|
|
saverror = error;
|
|
continue;
|
|
}
|
|
if ((error = syncip(ip, B_ASYNC, 0)) == 0)
|
|
error = syncip(ip, B_INVAL, 0);
|
|
if (error)
|
|
saverror = error;
|
|
(void) iput(ip);
|
|
}
|
|
/*
|
|
* free the array of inode numbers
|
|
*/
|
|
if (inop != NULL)
|
|
kmem_free((caddr_t)inop, (u_int)tino * sizeof (ino_t));
|
|
}
|
|
|
|
/*
|
|
* Push buf cache and block device page cache
|
|
*/
|
|
if (error = VOP_PUTPAGE(mp->m_devvp, 0, 0, B_ASYNC, u.u_cred))
|
|
saverror = error;
|
|
(void) bflush(mp->m_devvp);
|
|
|
|
/*
|
|
* synchronously flush superblock and summary info
|
|
*/
|
|
if (fs->fs_ronly == 0) {
|
|
fs->fs_fmod = 0;
|
|
(void) sbupdate(mp);
|
|
}
|
|
|
|
/*
|
|
* synchronously flush and invalidate buf and page cache
|
|
*/
|
|
if (error = VOP_PUTPAGE(mp->m_devvp, 0, 0, B_INVAL, u.u_cred))
|
|
saverror = error;
|
|
(void) bsinval(mp->m_devvp);
|
|
|
|
/*
|
|
* set the clean flag
|
|
*/
|
|
ufs_checkclean(mp);
|
|
|
|
return (saverror);
|
|
}
|
|
/*
|
|
* ufs_thaw_wlock
|
|
* special processing when thawing down to wlock
|
|
*/
|
|
ufs_thaw_wlock(mp)
|
|
struct mount *mp;
|
|
{
|
|
int s;
|
|
union ihead *ih;
|
|
struct inode *ip;
|
|
struct vnode *vp;
|
|
struct page *pp;
|
|
ino_t *inop; /* array of ino_t's (0 terminated) */
|
|
ino_t *cinop; /* pointer into array */
|
|
u_long tino; /* total entries in inop */
|
|
int mlocks = 0;
|
|
struct fs *fs = mp->m_bufp->b_un.b_fs;
|
|
|
|
/*
|
|
* look for mlock'ed pages
|
|
*/
|
|
for (ih = ihead; ih < &ihead[INOHSZ]; ih++) {
|
|
ufs_getino(mp, ih, &inop, &tino);
|
|
for (cinop = inop; *cinop; ++cinop) {
|
|
if (iget(mp->m_dev, fs, *cinop, &ip))
|
|
continue;
|
|
if (fs->fs_ronly)
|
|
ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG);
|
|
vp = ITOV(ip);
|
|
if ((vp->v_type != VCHR) && (vp->v_type != VSOCK)) {
|
|
s = splvm();
|
|
if (pp = vp->v_pages)
|
|
do {
|
|
mlocks += pp->p_lckcnt;
|
|
pp = pp->p_vpnext;
|
|
} while (pp != vp->v_pages);
|
|
(void) splx(s);
|
|
}
|
|
(void) iput(ip);
|
|
}
|
|
if (inop != NULL)
|
|
kmem_free((caddr_t)inop, (u_int)tino * sizeof (ino_t));
|
|
}
|
|
|
|
return ((mlocks) ? EPERM : 0);
|
|
}
|
|
/*
|
|
* ufs_thaw_hlock
|
|
* special processing when thawing down to hlock
|
|
*/
|
|
ufs_thaw_hlock(mp)
|
|
struct mount *mp;
|
|
{
|
|
int s;
|
|
union ihead *ih;
|
|
struct inode *ip;
|
|
struct vnode *vp;
|
|
struct page *pp;
|
|
int reflush; /* reflush the file system */
|
|
ino_t *inop; /* array of ino_t's (0 terminated) */
|
|
ino_t *cinop; /* pointer into array */
|
|
u_long tino; /* total entries in inop */
|
|
struct fs *fs = mp->m_bufp->b_un.b_fs;
|
|
extern u_int pages_pp_locked;
|
|
|
|
/*
|
|
* clear i_flags and page locks and page mods just in case an
|
|
* error prevented them from being cleared during ufs_flush()
|
|
*/
|
|
for (ih = ihead, reflush = 0; ih < &ihead[INOHSZ]; ih++) {
|
|
ufs_getino(mp, ih, &inop, &tino);
|
|
for (cinop = inop; *cinop; ++cinop) {
|
|
if (iget(mp->m_dev, fs, *cinop, &ip))
|
|
continue;
|
|
ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG);
|
|
vp = ITOV(ip);
|
|
if ((vp->v_type != VCHR) && (vp->v_type != VSOCK)) {
|
|
s = splvm();
|
|
if (pp = vp->v_pages)
|
|
do {
|
|
reflush = 1;
|
|
if (pp->p_lckcnt)
|
|
--pages_pp_locked;
|
|
pp->p_lckcnt = 0;
|
|
hat_pagesync(pp);
|
|
pp->p_mod = 0;
|
|
pp->p_ref = 0;
|
|
pp = pp->p_vpnext;
|
|
} while (pp != vp->v_pages);
|
|
(void) splx(s);
|
|
}
|
|
(void) iput(ip);
|
|
}
|
|
if (inop != NULL)
|
|
kmem_free((caddr_t)inop, (u_int)tino * sizeof (ino_t));
|
|
}
|
|
return (reflush);
|
|
}
|
|
|
|
/*
|
|
* ufs_reconcile_ip
|
|
* reconcile ondisk inode with incore inode
|
|
*/
|
|
ufs_reconcile_ip(mp, ip)
|
|
struct mount *mp;
|
|
struct inode *ip; /* incore inode */
|
|
{
|
|
int i;
|
|
int ndaddr;
|
|
int niaddr;
|
|
struct dinode *dp; /* ondisk inode */
|
|
struct buf *bp = NULL;
|
|
struct fs *fs = mp->m_bufp->b_un.b_fs;
|
|
|
|
/*
|
|
* BIG BOO-BOO, reconciliation fails
|
|
*/
|
|
if (ip->i_flag & (IMOD|IMODACC|IACC|IUPD|ICHG))
|
|
return (EPERM);
|
|
|
|
/*
|
|
* get the dinode
|
|
*/
|
|
bp = bread(ip->i_devvp, (daddr_t)fsbtodb(fs, itod(fs, ip->i_number)),
|
|
(int)fs->fs_bsize);
|
|
if (bp->b_flags & B_ERROR) {
|
|
brelse(bp);
|
|
return (EIO);
|
|
}
|
|
dp = bp->b_un.b_dino;
|
|
dp += itoo(fs, ip->i_number);
|
|
|
|
/*
|
|
* some fields are not allowed to change
|
|
*/
|
|
if ((ip->i_mode != dp->di_mode) ||
|
|
(ip->i_uid != dp->di_uid) ||
|
|
(ip->i_gid != dp->di_gid)) {
|
|
brelse(bp);
|
|
return (EACCES);
|
|
}
|
|
|
|
/*
|
|
* and some are allowed to change
|
|
*/
|
|
ip->i_size = dp->di_size;
|
|
ip->i_ic.ic_flags = dp->di_ic.ic_flags;
|
|
ip->i_blocks = dp->di_blocks;
|
|
ip->i_gen = dp->di_gen;
|
|
ip->i_nlink = dp->di_nlink;
|
|
if (ip->i_flag & IFASTSYMLNK) {
|
|
ndaddr = 1;
|
|
niaddr = 0;
|
|
} else {
|
|
ndaddr = NDADDR;
|
|
niaddr = NIADDR;
|
|
}
|
|
for (i = 0; i < ndaddr; ++i)
|
|
ip->i_db[i] = dp->di_db[i];
|
|
for (i = 0; i < niaddr; ++i)
|
|
ip->i_ib[i] = dp->di_ib[i];
|
|
|
|
brelse(bp);
|
|
return (0);
|
|
}
|
|
/*
|
|
* ufs_reconcile_inodes
|
|
* reconcile all incore inodes for this fs with ondisk inodes
|
|
*/
|
|
ufs_reconcile_inodes(mp)
|
|
struct mount *mp;
|
|
{
|
|
int error = 0;
|
|
struct fs *fs = mp->m_bufp->b_un.b_fs;
|
|
union ihead *ih;
|
|
struct inode *ip;
|
|
u_long tino;
|
|
ino_t *inop; /* array of ino_t's */
|
|
ino_t *cinop; /* pointer into array */
|
|
|
|
/*
|
|
* scan inode hash and reconcile all inodes found for this fs
|
|
*/
|
|
for (ih = ihead; (error == 0) && (ih < &ihead[INOHSZ]); ih++) {
|
|
ufs_getino(mp, ih, &inop, &tino);
|
|
for (cinop = inop; (error == 0) && *cinop; ++cinop) {
|
|
if ((error = iget(mp->m_dev, fs, *cinop, &ip)) == 0) {
|
|
error = ufs_reconcile_ip(mp, ip);
|
|
(void) iput(ip);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* free the array of inode numbers
|
|
*/
|
|
if (inop != NULL)
|
|
kmem_free((caddr_t)inop, (u_int)tino * sizeof (ino_t));
|
|
}
|
|
return (error);
|
|
}
|
|
/*
|
|
* ufs_getino
|
|
* return array of ino_t's for inodes on the hash for given fs
|
|
*/
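/*
* The array is sized by trial: start with 32 entries and, if the hash
* chain holds more inodes for this fs than fit (leaving room for the
* terminating zero), free the array, double it and rescan.  The caller
* frees the returned array using the returned length.
*/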
|
|
ufs_getino(mp, ih, inopp, tinop)
|
|
struct mount *mp;
|
|
union ihead *ih;
|
|
ino_t **inopp;
|
|
u_long *tinop;
|
|
{
|
|
struct inode *ip;
|
|
struct inode *aip = (struct inode *)ih;
|
|
struct fs *fs = mp->m_bufp->b_un.b_fs;
|
|
ino_t *inop = NULL;
|
|
u_long tino = 16;
|
|
u_long nino;
|
|
|
|
/*
|
|
* allocate an array of inode numbers (null terminated)
|
|
*/
|
|
again:
|
|
if (inop)
|
|
kmem_free((caddr_t)inop, (u_int)tino * sizeof (ino_t));
|
|
tino <<= 1;
|
|
inop = (ino_t *)kmem_zalloc((u_int)tino * sizeof (ino_t));
|
|
|
|
/*
|
|
* fill in the array from the inodes for fs on hash ih
|
|
*/
|
|
for (ip = aip->i_forw, nino = 0; ip && ip != aip; ip = ip->i_forw) {
|
|
if (ip->i_fs != fs) continue;
|
|
if (nino == (tino-1))
|
|
goto again;
|
|
*(inop + nino++) = ip->i_number;
|
|
}
|
|
|
|
/*
|
|
* return the array
|
|
*/
|
|
*inopp = inop;
|
|
*tinop = tino;
|
|
}
|
|
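
/*
 * Illustrative sketch (not part of the original source, never compiled):
 * the contract a caller of ufs_getino() is expected to follow, as
 * ufs_reconcile_inodes() above does.  The returned array is zero
 * terminated, holds at most tino - 1 inode numbers, and must be freed
 * with the same tino * sizeof (ino_t) length it was allocated with.
 * mp and ih are the caller's mount and hash bucket.
 */
#if 0
	ino_t *inop, *cinop;
	u_long tino;

	ufs_getino(mp, ih, &inop, &tino);
	for (cinop = inop; *cinop; ++cinop)
		;	/* look up *cinop with iget()/iput() */
	if (inop != NULL)
		kmem_free((caddr_t)inop, (u_int)tino * sizeof (ino_t));
#endif
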
/*
 * ufs_reconcile
 *	reconcile ondisk superblock/inodes with any incore
 */
ufs_reconcile(mp)
	struct mount *mp;
{
	int error = 0;

	/*
	 * get rid of as much inmemory data as possible
	 */
	if (error = ufs_flush(mp))
		goto errout;

	/*
	 * reconcile the superblock and inodes
	 */
	if (error = ufs_reconcile_fs(mp))
		goto errout;
	if (error = ufs_reconcile_inodes(mp))
		goto errout;

	/*
	 * get rid of as much inmemory data as possible
	 */
	if (error = ufs_flush(mp))
		goto errout;
errout:

	return (error);
}

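/*
 * Illustrative sketch (not part of the original source, never compiled):
 * where ufs_reconcile() is presumably run -- the FIOLFS unlock path,
 * before a hard-locked or error-locked file system is let back into
 * service.  Only the ufs_reconcile() call itself is taken from this
 * file; the surrounding control flow and names are assumptions.
 */
#if 0
	/* hypothetical unlock handling */
	if (error = ufs_reconcile(mp))
		return (error);		/* refuse, leave the fs locked */
#endif
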
/*
 * ufs_reconcile_fs
 *	reconcile incore superblock with ondisk superblock
 */
ufs_reconcile_fs(mp)
	struct mount *mp;
{
	int i;
	int error;
	struct fs *mfs;			/* in-memory superblock */
	struct fs *dfs;			/* on-disk superblock */
	struct buf *bp;			/* on-disk superblock buf */

	/*
	 * BIG BOO-BOO: the in-memory superblock has been modified
	 * (fs_fmod set), so it cannot be reconciled from the disk copy.
	 */
	mfs = mp->m_bufp->b_un.b_fs;
	if (mfs->fs_fmod)
		return (EPERM);

	/*
	 * get the on-disk copy of the superblock
	 */
	bp = bread(mp->m_devvp, SBLOCK, (int)mfs->fs_sbsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	dfs = bp->b_un.b_fs;

	/*
	 * if superblock has changed too much, abort
	 */
	if ((mfs->fs_sblkno != dfs->fs_sblkno) ||
	    (mfs->fs_cblkno != dfs->fs_cblkno) ||
	    (mfs->fs_iblkno != dfs->fs_iblkno) ||
	    (mfs->fs_dblkno != dfs->fs_dblkno) ||
	    (mfs->fs_cgoffset != dfs->fs_cgoffset) ||
	    (mfs->fs_cgmask != dfs->fs_cgmask) ||
	    (mfs->fs_bsize != dfs->fs_bsize) ||
	    (mfs->fs_fsize != dfs->fs_fsize) ||
	    (mfs->fs_frag != dfs->fs_frag) ||
	    (mfs->fs_bmask != dfs->fs_bmask) ||
	    (mfs->fs_fmask != dfs->fs_fmask) ||
	    (mfs->fs_bshift != dfs->fs_bshift) ||
	    (mfs->fs_fshift != dfs->fs_fshift) ||
	    (mfs->fs_fragshift != dfs->fs_fragshift) ||
	    (mfs->fs_fsbtodb != dfs->fs_fsbtodb) ||
	    (mfs->fs_sbsize != dfs->fs_sbsize) ||
	    (mfs->fs_nindir != dfs->fs_nindir) ||
	    (mfs->fs_nspf != dfs->fs_nspf) ||
	    (mfs->fs_trackskew != dfs->fs_trackskew) ||
	    (mfs->fs_cgsize != dfs->fs_cgsize) ||
	    (mfs->fs_ntrak != dfs->fs_ntrak) ||
	    (mfs->fs_nsect != dfs->fs_nsect) ||
	    (mfs->fs_spc != dfs->fs_spc) ||
	    (mfs->fs_cpg != dfs->fs_cpg) ||
	    (mfs->fs_ipg != dfs->fs_ipg) ||
	    (mfs->fs_fpg != dfs->fs_fpg) ||
	    (mfs->fs_postblformat != dfs->fs_postblformat) ||
	    (mfs->fs_magic != dfs->fs_magic)) {
		brelse(bp);
		return (EACCES);
	}

	/*
	 * get new summary info
	 */
	if (error = ufs_getsummaryinfo(mp, dfs)) {
		brelse(bp);
		return (error);
	}

	/*
	 * release old summary info and update in-memory superblock
	 */
	kmem_free((caddr_t)mfs->fs_csp[0], (u_int)mfs->fs_cssize);
	for (i = 0; i < MAXCSBUFS; ++i)
		mfs->fs_csp[i] = dfs->fs_csp[i];

	/*
	 * update fields allowed to change
	 */
	mfs->fs_size = dfs->fs_size;
	mfs->fs_dsize = dfs->fs_dsize;
	mfs->fs_ncg = dfs->fs_ncg;
	mfs->fs_minfree = dfs->fs_minfree;
	mfs->fs_rotdelay = dfs->fs_rotdelay;
	mfs->fs_rps = dfs->fs_rps;
	mfs->fs_maxcontig = dfs->fs_maxcontig;
	mfs->fs_maxbpg = dfs->fs_maxbpg;
	mfs->fs_csmask = dfs->fs_csmask;
	mfs->fs_csshift = dfs->fs_csshift;
	mfs->fs_optim = dfs->fs_optim;
	mfs->fs_csaddr = dfs->fs_csaddr;
	mfs->fs_cssize = dfs->fs_cssize;
	mfs->fs_ncyl = dfs->fs_ncyl;
	mfs->fs_cstotal = dfs->fs_cstotal;

	/* XXX What to do about sparecon? */

	/*
	 * ondisk clean flag overrides inmemory clean flag iff == FSBAD
	 */
	if (FSOKAY != (fs_get_state(dfs) + dfs->fs_time))
		mfs->fs_clean = FSBAD;
	if (dfs->fs_clean == FSBAD)
		mfs->fs_clean = FSBAD;
	brelse(bp);
	return (0);
}

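/*
 * Note on the clean-flag checks above (descriptive, not from the original
 * source; the state-word convention is an assumption based on usual UFS
 * practice): fs_state is written as FSOKAY - fs_time, so
 * fs_get_state(dfs) + dfs->fs_time == FSOKAY only when the state word and
 * timestamp were written together.  The resulting rule is:
 *
 *	ondisk superblock		incore fs_clean
 *	state + time != FSOKAY		forced to FSBAD
 *	fs_clean == FSBAD		forced to FSBAD
 *	otherwise			left unchanged
 */
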
/*
 * ufs_getlfd
 *	copy desired-lock struct from user to kernel space
 */
ufs_getlfd(vp, lfup, lfd, lfc)
	struct vnode *vp;		/* vnode on fs to be locked */
	struct lockfs **lfup;		/* address in user space */
	struct lockfs *lfd;		/* desired lock */
	struct lockfs *lfc;		/* current lock */
{
	int error = 0;
	u_int comlen = 0;
	caddr_t comment = NULL;

	/*
	 * copy user's lockfs struct into kernel memory
	 */
	if (error = copyin((caddr_t)*lfup, (caddr_t)lfd,
	    (u_int)(sizeof (struct lockfs))))
		goto errout;

	/*
	 * check key
	 */
	if (!LOCKFS_IS_ULOCK(lfc))
		if (lfd->lf_key != lfc->lf_key) {
			error = EINVAL;
			goto errout;
		}
	lfd->lf_key = lfc->lf_key + 1;

	/*
	 * check bounds -- lf_lock is index into array of access masks
	 */
	if (lfd->lf_lock >= lockfs_maxlock) {
		error = EINVAL;
		goto errout;
	}

	/*
	 * can't wlock fs with accounting or local swap file
	 */
	if (LOCKFS_IS_WLOCK(lfd)) {
#ifdef SYSACCT
		if (error = ufs_checkaccton(vp))
			goto errout;
#endif /* SYSACCT */
		if (error = ufs_checkswapon(vp))
			goto errout;
	}

	/*
	 * no input flags defined
	 */
	if (lfd->lf_flags != 0) {
		error = EINVAL;
		goto errout;
	}

	/*
	 * get comment
	 */
	if (comlen = lfd->lf_comlen) {
		if (comlen > LOCKFS_MAXCOMMENTLEN) {
			error = ENAMETOOLONG;
			goto errout;
		}
		comment = (caddr_t)kmem_alloc(comlen);
		if (error = copyin(lfd->lf_comment, comment, comlen))
			goto errout;
		lfd->lf_comment = comment;
	}

	return (error);
errout:
	if (comment)
		kmem_free(comment, comlen);
	return (error);
}

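/*
 * Illustrative sketch (not part of the original source, never compiled):
 * a user-level caller of the lockfs ioctl that this routine services,
 * assuming the FIOLFS request code declared in <sys/filio.h>.  The lock
 * value, the open descriptor fd, and the user-level headers needed for
 * ioctl()/strlen() are assumptions; only lf_lock, lf_key, lf_flags,
 * lf_comlen and lf_comment are taken from the checks in ufs_getlfd()
 * itself.
 */
#if 0
	struct lockfs lf;

	bzero((caddr_t)&lf, sizeof (lf));
	lf.lf_lock = 1;				/* assumed write-lock value */
	lf.lf_key = 0;				/* ignored while fs is unlocked */
	lf.lf_flags = 0;			/* no input flags are defined */
	lf.lf_comment = "backup in progress";
	lf.lf_comlen = strlen(lf.lf_comment) + 1;
	if (ioctl(fd, FIOLFS, &lf) < 0)
		perror("FIOLFS");
#endif
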
#ifdef SYSACCT
/*
 * ufs_checkaccton
 *	check if accounting is turned on on this fs
 */
extern struct vnode *acctp;
extern struct vnode *savacctp;
ufs_checkaccton(vp)
	struct vnode *vp;
{
	if (acctp && acctp->v_vfsp == vp->v_vfsp)
		return (EDEADLK);
	if (savacctp && savacctp->v_vfsp == vp->v_vfsp)
		return (EDEADLK);
	return (0);
}
#endif /* SYSACCT */

/*
 * ufs_checkswapon
 *	check if local swapping is to a file on this fs
 */
extern struct swapinfo *swapinfo;
ufs_checkswapon(vp)
	struct vnode *vp;
{
	struct swapinfo *sip;

	for (sip = swapinfo; sip; sip = sip->si_next)
		if (sip->si_vp->v_vfsp == vp->v_vfsp)
			return (EDEADLK);
	return (0);
}
/*
 * ufs_lockfs_hold
 *	take a hold on the per-fs lockfs state so a forced unmount cannot
 *	complete underneath the caller; fails with EIO once one has begun
 */
ufs_lockfs_hold(vfsp)
	struct vfs *vfsp;
{
	struct mount *mp;
	struct ulockfs *ul;

	if ((mp = (struct mount *)vfsp->vfs_data) == NULL)
		return (EIO);
	ul = mp->m_ul;

	if (ul->ul_flags & ULOCKFS_FUMOUNT)
		return (EIO);
	ul->ul_hold++;
	return (0);
}
/*
 * ufs_lockfs_rele
 *	drop a hold taken by ufs_lockfs_hold; wake anyone waiting for the
 *	hold count to drain
 */
ufs_lockfs_rele(vfsp)
	struct vfs *vfsp;
{
	struct ulockfs *ul;

	ul = ((struct mount *)(vfsp->vfs_data))->m_ul;

	if (ul->ul_hold-- == 1)
		if (ul->ul_flags & ULOCKFS_WANT) {
			ul->ul_flags &= ~ULOCKFS_WANT;
			wakeup((caddr_t)ul);
		}
}
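
/*
 * Illustrative sketch (not part of the original source, never compiled):
 * the hold/release pairing the two routines above exist for.  A caller
 * that must keep the file system from disappearing under a forced
 * unmount takes a hold first and drops it when done; the work done in
 * between is an assumption.
 */
#if 0
	if (ufs_lockfs_hold(vfsp))
		return (EIO);		/* forced unmount already under way */
	/* ... operate on the file system ... */
	ufs_lockfs_rele(vfsp);
#endif
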
/*
 * ufs_lockfs_fumount
 *	mark the fs as being forcibly unmounted and wait for outstanding
 *	holds to drain; an interrupted sleep backs the flag out again
 */
ufs_lockfs_fumount(ul)
	struct ulockfs *ul;
{
	ul->ul_flags |= ULOCKFS_FUMOUNT;
	while (ul->ul_hold) {
		ul->ul_flags |= ULOCKFS_WANT;
		if (sleep((caddr_t)ul, PLOCK+PCATCH)) {
			ul->ul_flags &= ~ULOCKFS_FUMOUNT;
			return (EINTR);
		}
	}
	return (0);
}
/*
 * ufs_fioai
 *	file allocation information
 */
ufs_fioai(vp, faip)
	struct vnode *vp;		/* file's vnode */
	struct filai **faip;		/* user address of struct filai */
{
	int error;
	int boff;			/* offset within file system block */
	int na;				/* # allocations returned */
	int ne;				/* # entries left in array */
	size_t size;			/* byte length of range */
	daddr_t off;			/* byte offset into file */
	daddr_t lbn;			/* logical fs block */
	daddr_t bn;			/* disk sector number */
	daddr_t bor;			/* beginning of range (sector) */
	daddr_t lor;			/* length of range (sector) */
	daddr_t lof;			/* length of file (sector) */
	struct filai fai;		/* copy of user's filai */
	struct fs *fs;			/* file system (superblock) */
	struct inode *ip;		/* vnode's inode */
	daddr_t *da;			/* address of user array */

	/*
	 * inode and superblock
	 */
	ip = VTOI(vp);
	fs = ITOF(ip);

	/*
	 * get user's filai struct
	 */
	if (error = copyin((caddr_t)*faip, (caddr_t)&fai, (u_int)sizeof (fai)))
		return (error);

	ILOCK(ip);
	/*
	 * range checks
	 *	offset >= 2G || size >= 2G || (offset+size) >= 2G
	 *	offset >= length of file
	 *
	 */
	na = 0;
	if ((size = fai.fai_size) == 0)
		size = ip->i_size - fai.fai_off;

	if ((int)fai.fai_off < 0)
		goto errrange;
	if ((int)size < 0)
		goto errrange;
	if ((int)(fai.fai_off + size) < 0)
		goto errrange;
	if (fai.fai_off >= ip->i_size)
		goto errrange;

	/*
	 * beginning of range in sectors
	 * length of range in sectors
	 * length of file in sectors
	 */
	bor = btodb(fai.fai_off);
	off = dbtob(bor);
	lor = btodb(size) + ((size & (DEV_BSIZE-1)) ? 1 : 0);
	lof = btodb(ip->i_size) + ((ip->i_size & (DEV_BSIZE-1)) ? 1 : 0);
	if (lof < (bor + lor))
		lor = lof - bor;

	/*
	 * return allocation info until:
	 *	array fills
	 *	range is covered (end of file accounted for above)
	 */
	ne = fai.fai_num;
	da = fai.fai_daddr;
	while (lor && ne) {

		/*
		 * file system block and offset within block
		 */
		lbn = lblkno(fs, off);
		boff = blkoff(fs, off);

		/*
		 * get frag address and convert to disk address
		 */
		if (error = bmap(ip, lbn, &bn, (daddr_t *)NULL,
		    DEV_BSIZE, S_READ, 1))
			goto errout;
		if (bn == UFS_HOLE)
			bn = FILAI_HOLE;
		else
			bn = fsbtodb(fs, bn) + btodb(boff);

		/*
		 * return disk addresses.
		 * (file system blocks are contiguous on disk)
		 */
		do {
			if (error = suword((caddr_t)da, (int)bn))
				goto errout;
			if (bn != FILAI_HOLE)
				bn++;
			off += DEV_BSIZE;
			na++;
			da++;
			lor--;
			ne--;
		} while ((lbn == lblkno(fs, off)) && lor && ne);
	}
	/*
	 * update # of entries returned and current offset
	 */
	fai.fai_off = off;
errrange:
	fai.fai_num = na;
	if (error = copyout((caddr_t)&fai, (caddr_t)*faip, sizeof (fai)))
		goto errout;

errout:
	IUNLOCK(ip);
	return (error);
}
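
/*
 * Illustrative sketch (not part of the original source, never compiled):
 * a user-level caller of the allocation-info ioctl that ufs_fioai()
 * services, assuming the FIOAI request code from <sys/filai.h>.  The
 * array size, the open descriptor fd, and the user-level headers are
 * assumptions.  A zero fai_size means "to end of file"; on return
 * fai_num holds the number of sector addresses filled in, fai_off has
 * advanced past them, and holes read back as FILAI_HOLE.  For the sector
 * arithmetic above: a 1537-byte range spans btodb(1537) == 3 whole
 * 512-byte sectors plus a partial one, so lor is rounded up to 4.
 */
#if 0
	struct filai fai;
	daddr_t addrs[32];

	fai.fai_off = 0;		/* start of file */
	fai.fai_size = 0;		/* 0 means to end of file */
	fai.fai_num = 32;		/* room for 32 sector addresses */
	fai.fai_daddr = addrs;
	if (ioctl(fd, FIOAI, &fai) < 0)
		perror("FIOAI");
#endif
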
/*
 * ufs_fiodutimes
 *	set access/modified times but not change time.  Also, delay the update
 */
ufs_fiodutimes(vp, tvp)
	struct vnode *vp;		/* file's vnode */
	struct timeval **tvp;		/* user address of struct timeval */
{
	int error = 0;
	struct inode *ip;
	struct timeval tv;

	if (!suser())
		return (EPERM);

	ip = VTOI(vp);

	/*
	 * if NULL, use current time
	 */
	if (*tvp) {
		if (error = copyin((caddr_t)*tvp, (caddr_t)&tv, sizeof (tv)))
			return (error);
		if (tv.tv_usec < 0 || tv.tv_usec >= 1000000)
			return (EINVAL);
	} else
		tv = time;

	ILOCK(ip);
	ITIMES(ip);
	ip->i_atime = tv;
	ip->i_flag |= IMODACC;
	IUNLOCK(ip);

	return (error);
}
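
/*
 * Note on the flag used above (descriptive, not from the original source):
 * IMODACC records that the access time changed while letting the inode
 * update itself be deferred -- the "delay the update" promised in the
 * header comment.  It is also one of the pending-change bits that makes
 * ufs_reconcile_ip() earlier in this file refuse to reconcile an inode.
 */
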
/*
 * ufs_fiodios
 *	return status of metadata updates
 */
ufs_fiodios(vp, diop)
	struct vnode *vp;		/* file's vnode */
	u_long **diop;			/* m_dio returned here */
{
	u_long dio;

	dio = (ITOM(VTOI(vp)))->m_dio & ~MDIO_LOCK;
	return (suword((caddr_t)*diop, (int)(dio)));
}
/*
 * ufs_fiodio
 *	sandbag metadata updates
 */
ufs_fiodio(vp, diop)
	struct vnode *vp;		/* file's vnode */
	u_long **diop;			/* dio flags */
{
	int error;
	int clean;
	u_long dio;
	struct inode *ip;
	struct fs *fs;
	struct mount *mp;
	struct buf *bp;

	/*
	 * check input conditions
	 */
	if (!suser())
		return (EPERM);

	error = copyin((caddr_t)*diop, (caddr_t)&dio, (u_int)(sizeof (u_long)));
	if (error)
		return (error);

	if (dio > 1)
		return (EINVAL);

	/*
	 * setup
	 */
	ip = VTOI(vp);
	fs = ITOF(ip);
	mp = ITOM(ip);

	/*
	 * lock access to the dio field
	 */
	while (mp->m_dio & MDIO_LOCK)
		if (sleep((caddr_t)mp, PLOCK+PCATCH))
			return (EINTR);
	if (mp->m_dio == dio)
		goto out;
	mp->m_dio = dio | MDIO_LOCK;

	/*
	 * enable/disable clean flag processing
	 */
	if ((mp->m_dio & MDIO_ON) || (ufs_flush(mp)))
		clean = FSSUSPEND;
	else
		clean = FSACTIVE;
	if (fs->fs_ronly == 0) {
		bp = getblk(mp->m_devvp, SBLOCK, (int)fs->fs_sbsize);
		if (fs->fs_clean != FSBAD) {
			fs->fs_clean = clean;
			ufs_sbwrite(mp, fs, bp);
		} else
			brelse(bp);
	}
out:
	mp->m_dio &= ~MDIO_LOCK;
	wakeup((caddr_t)mp);

	return (0);
}
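
/*
 * Note on the m_dio protocol above (descriptive, not from the original
 * source): MDIO_LOCK is a busy bit serializing changes to m_dio --
 * updaters sleep while it is set and wakeup() waiters when they clear it
 * -- and ufs_fiodios() masks it out of what it reports.  While delayed
 * metadata updates are enabled (MDIO_ON), or if the initial flush fails,
 * the ondisk clean flag is parked at FSSUSPEND; turning them off after a
 * successful flush restores FSACTIVE, and a file system already marked
 * FSBAD is left untouched.
 */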