#ident "@(#)ufs_vnodeops.c 1.1 94/10/31 SMI"
/*
* Copyright (c) 1989, 1990 by Sun Microsystems, Inc.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vfs_stat.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/uio.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmmeter.h>
#include <sys/trace.h>
#include <sys/unistd.h>
#include <sys/stat.h>
#include <sys/filio.h> /* FIOLFS */
#include <sys/vaccess.h> /* FIOLFS */
#include <sys/lockfs.h> /* FIOLFS */
#include <sys/filai.h> /* FIOAI */
#include <specfs/fifo.h>
#include <ufs/fs.h>
#include <ufs/inode.h>
#include <ufs/mount.h>
#include <ufs/fsdir.h>
#include <ufs/lockf.h> /* Defines constants for the locking code */
#include <ufs/lockfs.h> /* FIOLFS */
#ifdef QUOTA
#include <ufs/quota.h>
#endif
#include <sys/dirent.h> /* must be AFTER <ufs/fsdir>! */
#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/rm.h>
#include <vm/swap.h>
#include <krpc/lockmgr.h>
#define ISVDEV(t) ((t == VCHR) || (t == VBLK) || (t == VFIFO))
static int ufs_open();
static int ufs_close();
static int ufs_rdwr();
static int ufs_ioctl();
static int ufs_select();
static int ufs_getattr();
static int ufs_setattr();
static int ufs_access();
static int ufs_lookup();
static int ufs_create();
static int ufs_remove();
static int ufs_link();
static int ufs_rename();
static int ufs_mkdir();
static int ufs_rmdir();
static int ufs_readdir();
static int ufs_symlink();
static int ufs_readlink();
static int ufs_fsync();
static int ufs_inactive();
static int ufs_lockctl();
static int ufs_fid();
static int ufs_getpage();
static int ufs_putpage();
static int ufs_map();
static int ufs_cmp();
static int ufs_realvp();
static int ufs_cntl();
static int ufs_badop();
/*
* ulockfs intercepts
* Substituted for normal VOP entry points in ufs_vnodeops below
*/
static int ufs_l_open();
static int ufs_l_close();
static int ufs_l_rdwr();
static int ufs_l_select();
static int ufs_l_getattr();
static int ufs_l_setattr();
static int ufs_l_access();
static int ufs_l_lookup();
static int ufs_l_create();
static int ufs_l_remove();
static int ufs_l_link();
static int ufs_l_rename();
static int ufs_l_mkdir();
static int ufs_l_rmdir();
static int ufs_l_readdir();
static int ufs_l_symlink();
static int ufs_l_readlink();
static int ufs_l_fsync();
static int ufs_l_inactive();
static int ufs_l_lockctl();
static int ufs_l_fid();
static int ufs_l_getpage();
static int ufs_l_putpage();
static int ufs_l_map();
static int ufs_l_cntl();
/*
* Replace standard entries with ulockfs intercepts
*/
struct vnodeops ufs_vnodeops = {
ufs_l_open,
ufs_l_close,
ufs_l_rdwr,
ufs_ioctl,
ufs_l_select,
ufs_l_getattr,
ufs_l_setattr,
ufs_l_access,
ufs_l_lookup,
ufs_l_create,
ufs_l_remove,
ufs_l_link,
ufs_l_rename,
ufs_l_mkdir,
ufs_l_rmdir,
ufs_l_readdir,
ufs_l_symlink,
ufs_l_readlink,
ufs_l_fsync,
ufs_l_inactive,
ufs_l_lockctl,
ufs_l_fid,
ufs_l_getpage,
ufs_l_putpage,
ufs_l_map,
ufs_badop, /* dump */
ufs_cmp,
ufs_realvp,
ufs_l_cntl,
};
/*
* FORCED UNMOUNT ENTRY POINTS
* Alternate vnodeops branch table substituted for ufs_vnodeops
*/
static int ufs_eio();
static int ufs_f_close();
static int ufs_f_inactive();
struct vnodeops ufs_forcedops = {
ufs_eio, /* ufs_open, */
ufs_f_close,
ufs_eio, /* ufs_rdwr, */
ufs_eio, /* ufs_ioctl, */
ufs_eio, /* ufs_select, */
ufs_eio, /* ufs_getattr, */
ufs_eio, /* ufs_setattr, */
ufs_eio, /* ufs_access, */
ufs_eio, /* ufs_lookup, */
ufs_eio, /* ufs_create, */
ufs_eio, /* ufs_remove, */
ufs_eio, /* ufs_link, */
ufs_eio, /* ufs_rename, */
ufs_eio, /* ufs_mkdir, */
ufs_eio, /* ufs_rmdir, */
ufs_eio, /* ufs_readdir, */
ufs_eio, /* ufs_symlink, */
ufs_eio, /* ufs_readlink, */
ufs_eio, /* ufs_fsync, */
ufs_f_inactive,
ufs_eio, /* ufs_lockctl, */
ufs_eio, /* ufs_fid, */
ufs_eio, /* ufs_getpage, */
ufs_eio, /* ufs_putpage, */
ufs_eio, /* ufs_map, */
ufs_badop, /* dump */
ufs_cmp,
ufs_eio, /* ufs_realvp, */
ufs_eio, /* ufs_cntl, */
};
/*
* FORCED UNMOUNT VOP ROUTINES
* VOP calls for inodes belonging to forcibly unmounted file systems
* enter one of the following routines.
*/
static int
ufs_eio()
{
return (EIO);
}
/*ARGSUSED*/
static int
ufs_f_close(vp, flag, count, cred)
struct vnode *vp;
int flag;
int count;
struct ucred *cred;
{
return (0);
}
/*ARGSUSED*/
static int
ufs_f_inactive(vp, cred)
struct vnode *vp;
struct ucred *cred;
{
iinactive(VTOI(vp));
return (0);
}
/*
* ULOCKFS MACROS
*/
/*
* ulockfs intercept routines surround the normal ufs VOP call with a
* locking wrapper by using the following wrapper macro
*/
#define ULOCKFS(VP, VAID, VOPCALL) \
{ \
int reterr; \
struct mount *mp; \
\
if (reterr = ufs_lockfs_begin(VP, VAID, &mp)) \
return (reterr); \
reterr = (VOPCALL); \
ufs_lockfs_end(VAID, mp); \
return (reterr); \
}
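/*
 * For illustration only (a sketch; the real intercepts appear later in
 * this file, and the VA_OPEN id used here is hypothetical): an intercept
 * such as ufs_l_open simply wraps the underlying VOP in ULOCKFS,
 *
 * static int
 * ufs_l_open(vpp, flag, cred)
 * struct vnode **vpp;
 * int flag;
 * struct ucred *cred;
 * {
 * ULOCKFS(*vpp, VA_OPEN, ufs_open(vpp, flag, cred));
 * }
 *
 * so that every operation is bracketed by ufs_lockfs_begin() and
 * ufs_lockfs_end().
 */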
int lock(), unlock();
void test_lock(), kill_proc_locks();
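/*
 * Open a file. If mandatory file/record locking applies to the file,
 * probe for a conflicting lock before allowing the open.
 */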
/*ARGSUSED*/
static int
ufs_open(vpp, flag, cred)
struct vnode **vpp;
int flag;
struct ucred *cred;
{
register int error, cmd;
register struct inode *ip;
struct eflock ld; /* Holder for an I/O lock */
VFS_RECORD((*vpp)->v_vfsp, VS_OPEN, VS_CALL);
ip = VTOI(*vpp);
/*
* Mandatory file and record locking stuff. MFRL is enforced
* when the SGID bit is set and the XGRP bit is reset (hey, I
* didn't come up with this scheme!). When enabled, reads and
* writes are checked to see if they would 'violate' an existing
* lock on the file. Failure modes are determined by the state
* of the O_NDELAY flag on the file descriptor: when set, the
* error EAGAIN is returned; when reset, the process blocks until
* there are no blocking locks. In either case, if a deadlock
* would occur, EDEADLK is returned.
*/
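/*
 * I.e., ISGID (02000) set and group-execute (00010) clear on a
 * regular file, the SVID convention for enabling mandatory locking.
 */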
if (((ip->i_mode & ISGID) != 0) && ((ip->i_mode & IFMT) == IFREG) &&
((ip->i_mode & (IEXEC >> 3)) == 0)) {
ld.l_type = F_WRLCK;
ld.l_start = 0;
ld.l_len = 0x7fffffff;
ld.l_whence = 0;
cmd = F_SETLK;
/* XXX need a better way to get pid */
if ((error = lock(*vpp, &ld, cmd, u.u_procp->p_pid, IO_LOCK)) !=
0) {
/* to make it SVID compliant return EAGAIN */
if (error == EACCES)
error = EAGAIN;
return (error);
}
} else {
cmd = 0;
}
if (cmd) {
ld.l_type = F_UNLCK;
cmd = F_SETLK;
/* XXX need a better way to get pid */
(void) unlock(*vpp, &ld, cmd, u.u_procp->p_pid, IO_LOCK);
}
return (0);
}
/*ARGSUSED*/
static int
ufs_close(vp, flag, count, cred)
struct vnode *vp;
int flag;
int count;
struct ucred *cred;
{
VFS_RECORD(vp->v_vfsp, VS_CLOSE, VS_CALL);
return (0);
}
/*
* read or write a vnode
*/
/*ARGSUSED*/
static int
ufs_rdwr(vp, uiop, rw, ioflag, cred)
struct vnode *vp;
struct uio *uiop;
enum uio_rw rw;
int ioflag;
struct ucred *cred;
{
register struct inode *ip;
int error;
int didlock; /* TRUE if the inode was locked. */
int cmd; /* I/O lock command, zero if no lock */
struct eflock ld; /* Holder for an I/O lock */
ip = VTOI(vp);
/*
* Mandatory file and record locking stuff. MFRL is enforced
* when the SGID bit is set and the XGRP bit is reset (hey, I
* didn't come up with this scheme!). When enabled, reads and
* writes are checked to see if they would 'violate' an existing
* lock on the file. Failure modes are determined by the state
* of the O_NDELAY flag on the file descriptor: when set, the
* error EAGAIN is returned; when reset, the process blocks until
* there are no blocking locks. In either case, if a deadlock
* would occur, EDEADLK is returned.
*/
if (((ip->i_mode & ISGID) != 0) && ((ip->i_mode & IFMT) == IFREG) &&
((ip->i_mode & (IEXEC >> 3)) == 0)) {
ld.l_type = (rw == UIO_WRITE) ? F_WRLCK : F_RDLCK;
ld.l_start = uiop->uio_offset;
ld.l_len = uiop->uio_resid;
ld.l_whence = 0;
cmd = (ioflag & IO_NDELAY) ? F_SETLK : F_SETLKW;
/* XXX need a better way to get pid */
error = lock(vp, &ld, cmd, u.u_procp->p_pid, IO_LOCK);
if (error != 0) {
/* to make it SVID compliant return EAGAIN */
if (error == EACCES)
error = EAGAIN;
return (error);
}
} else
cmd = 0; /* No lock set */
if ((ioflag & IO_APPEND) != 0 && (rw == UIO_WRITE) &&
(ip->i_mode & IFMT) == IFREG) {
/*
* In append mode start at end of file after locking it.
*/
didlock = 1;
ILOCK(ip);
uiop->uio_offset = ip->i_size;
} else
didlock = 0;
error = rwip(ip, uiop, rw, ioflag);
ITIMES(ip);
if (didlock)
IUNLOCK(ip);
if (cmd) {
ld.l_type = F_UNLCK;
cmd = F_SETLK;
/* XXX need a better way to get pid */
(void) unlock(vp, &ld, cmd, u.u_procp->p_pid, IO_LOCK);
}
return (error);
}
/*
* Don't cache write blocks to files with the sticky bit set.
* Used to keep swap files from blowing the page cache on a server.
*/
int stickyhack = 1;
/*
* Bytes / inode allowed in the disk queue.
*/
int ufs_WRITES = 512 * 1024;
#ifdef MULTIPROCESSOR
/*
* release the kernel lock during uiomove.
*/
int ufs_uiomove_nolock = 1;
#endif
/*
* prefault the segkmap mapping in rwip to avoid traps.
*/
int dogetmapflt = 1;
/*
* The idea behind the freebehind stuff is this:
* We want caching but we don't want large i/o's to blow everything else
* out. Furthermore, it is more expensive (cpu wise) to wait for the
* pager to free up memory; it's faster to have the process free up
* its own memory.
*
* The knobs associated with this stuff are:
*
* freebehind on/off switch for both read and write
* write_free on/off for unconditional free's upon write completion
* pages_before_pager the pager turns on at 'lotsfree'; we turn on at
* 'lotsfree + pages_before_pager'. This wants to be
* at least a cluster's worth.
* smallfile don't free behind at offsets less than this.
*/
int freebehind = 1;
int pages_before_pager = 30; /* 1 cluster on a sun4c, 2 on all others */
int write_free = 0;
int smallfile = 32 * 1024;
#ifdef MULTIPROCESSOR
int ufs_lock_released = 0;
#endif
/*
* rwip does the real work of read or write requests for ufs.
*/
static int
rwip(ip, uio, rw, ioflag)
register struct inode *ip;
register struct uio *uio;
enum uio_rw rw;
int ioflag;
{
register u_int off;
register addr_t base;
register int n, on, mapon;
register struct fs *fs;
struct vnode *vp;
int type, error, pagecreate;
u_int flags;
int iupdat_flag;
long old_blocks;
int adjust_resid = 0;
int dofree;
extern int freemem, lotsfree, pages_before_pager;
int orig_resid = 0;
int last = 0;
#ifdef MULTIPROCESSOR
int klock_released;
#endif
#ifdef LWP
extern int runthreads;
#endif
extern caddr_t segmap_getmapflt();
if (rw != UIO_READ && rw != UIO_WRITE)
panic("rwip");
type = ip->i_mode & IFMT;
if (type != IFREG && type != IFDIR && type != IFLNK)
panic("rwip type");
if (uio->uio_offset < 0 || (uio->uio_offset + uio->uio_resid) < 0)
return (EINVAL);
if (uio->uio_resid == 0)
return (0);
trace6(TR_UFS_RWIP, ip, uio, rw, ioflag, uio->uio_offset,
TRC_RWIP_ENTER);
ILOCK(ip);
if (rw == UIO_WRITE) {
if (type == IFREG && uio->uio_offset + uio->uio_resid >
u.u_rlimit[RLIMIT_FSIZE].rlim_cur) {
if (uio->uio_offset >=
u.u_rlimit[RLIMIT_FSIZE].rlim_cur) {
psignal(u.u_procp, SIGXFSZ);
error = EFBIG;
goto out;
} else {
adjust_resid = uio->uio_resid;
uio->uio_resid =
u.u_rlimit[RLIMIT_FSIZE].rlim_cur - uio->uio_offset;
adjust_resid -= uio->uio_resid;
}
}
ip->i_flag |= INOACC; /* don't update ref time in getpage */
} else {
if (!ULOCKFS_IS_NOIACC(ITOU(ip)))
ip->i_flag |= IACC;
}
if (ioflag & IO_SYNC) {
ip->i_flag |= ISYNC;
old_blocks = ip->i_blocks;
iupdat_flag = 0;
}
fs = ip->i_fs;
vp = ITOV(ip);
do {
off = uio->uio_offset & MAXBMASK;
mapon = uio->uio_offset & MAXBOFFSET;
on = blkoff(fs, uio->uio_offset);
n = MIN(fs->fs_bsize - on, uio->uio_resid);
if (rw == UIO_READ) {
int diff = ip->i_size - uio->uio_offset;
VFS_RECORD(ITOV(ip)->v_vfsp, VS_READ, VS_CALL);
if (diff <= 0) {
error = 0;
goto out;
}
if (diff < n)
n = diff;
dofree = freebehind &&
ip->i_nextr == (off & PAGEMASK) && off > smallfile;
} else {
int s;
/*
* Limit the amount of memory that this inode can use
* Protected because the count is modified at interrupt
* level.
*/
s = splbio();
while (ufs_WRITES && ip->i_writes > ufs_WRITES) {
(void) sleep((caddr_t)&ip->i_writes, PZERO);
}
(void) splx(s);
VFS_RECORD(ITOV(ip)->v_vfsp, VS_WRITE, VS_CALL);
}
if (dogetmapflt && rw == UIO_READ &&
off + MAXBSIZE <= ip->i_size)
base = segmap_getmapflt(segkmap, vp, off);
else
base = segmap_getmap(segkmap, vp, off);
trace6(TR_UFS_RWIP, ip, uio, rw, ioflag, uio->uio_offset,
TRC_RWIP_GETMAP);
if (rw == UIO_WRITE) {
if (uio->uio_offset + n > ip->i_size) {
/*
* We are extending the length of the file.
* bmap is used so that we are sure that
* if we need to allocate new blocks, that it
* is done here before we up the file size.
*/
error = bmap_write(ip,
(daddr_t)lblkno(fs, uio->uio_offset),
0, (daddr_t*)0, (int*)0,
(int)(on + n), mapon == 0);
if (error && (on % fs->fs_fsize)) {
int llbn = lblkno(fs, ip->i_size-1);
int olbn = lblkno(fs, uio->uio_offset);
/*
* If the offset is within the same
* block as the last byte of the file,
* fill out the rest of that block
* or frag for Posix */
if (llbn == olbn) {
int avail=
blksize(fs, ip, llbn) - on;
n = MIN(MAX(0,avail), n);
orig_resid = uio->uio_resid - n;
uio->uio_resid = n;
error = 0;
}
}
trace6(TR_UFS_RWIP, ip, uio, rw, ioflag,
uio->uio_offset, TRC_RWIP_BMAPALLOC);
if (error) {
(void) segmap_release(segkmap, base, 0);
/*
* For Posix. If the last write worked
* and now we're out of space return
* the number of bytes written.
*/
if (last > 0 && error == ENOSPC)
error = 0;
break;
}
ip->i_size = uio->uio_offset + n;
iupdat_flag = 1;
/*
* If we are writing from the beginning of
* the mapping, we can just create the
* pages without having to read them.
*/
if (mapon == 0) {
segmap_pagecreate(segkmap, base,
(u_int)n, 0);
pagecreate = 1;
} else
pagecreate = 0;
} else if (n == MAXBSIZE) {
/*
* Going to do a whole mapping's worth,
* so we can just create the pages w/o
* having to read them in. But before
* we do that, we need to make sure any
* needed blocks are allocated first.
*/
error = bmap_write(ip,
(daddr_t)lblkno(fs, uio->uio_offset),
0, (daddr_t*)0, (int*)0, (int)(on + n), 1);
trace6(TR_UFS_RWIP, ip, uio, rw, ioflag,
uio->uio_offset, TRC_RWIP_BMAPALLOC);
if (error && (on % fs->fs_fsize)) {
int llbn = lblkno(fs, ip->i_size-1);
int olbn = lblkno(fs, uio->uio_offset);
/*
* If the offset is within the same
* block as the last byte of the file,
* fill out the rest of that block
* or frag for Posix */
if (llbn == olbn) {
int avail=
blksize(fs, ip, llbn) - on;
n = MIN(MAX(0,avail), n);
orig_resid = uio->uio_resid - n;
uio->uio_resid = n;
error = 0;
}
}
if (error) {
(void) segmap_release(segkmap, base, 0);
if (last > 0 && error == ENOSPC)
error = 0;
break;
}
segmap_pagecreate(segkmap, base, (u_int)n, 0);
pagecreate = 1;
} else
pagecreate = 0;
} else
pagecreate = 0;
#ifdef MULTIPROCESSOR
#ifdef LWP
if (ufs_uiomove_nolock && n >= 512 && (runthreads == 0)) {
#else
if (ufs_uiomove_nolock && n >= 512) {
#endif
ufs_lock_released++;
klock_exit();
klock_released = 1;
} else
klock_released = 0;
error = uiomove(base + mapon, n, rw, uio);
if (klock_released) {
klock_enter();
ufs_lock_released--;
}
#else
error = uiomove(base + mapon, n, rw, uio);
#endif
if (pagecreate && uio->uio_offset <
roundup(off + mapon + n, PAGESIZE)) {
/*
* We created pages w/o initializing them completely,
* thus we need to zero the part that wasn't set up.
* This happens on most EOF write cases and if
* we had some sort of error during the uiomove.
*/
int nzero, nmoved;
nmoved = uio->uio_offset - (off + mapon);
ASSERT(nmoved >= 0 && nmoved <= n);
nzero = roundup(on + n, PAGESIZE) - nmoved;
ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
(void) kzero(base + mapon + nmoved, (u_int)nzero);
}
trace6(TR_UFS_RWIP, ip, uio, rw, ioflag, uio->uio_offset,
TRC_RWIP_UIOMOVE);
if (error == 0) {
int free;
flags = 0;
if (rw == UIO_WRITE) {
if (write_free ||
(freebehind &&
freemem < lotsfree + pages_before_pager)) {
free = SM_FREE;
} else {
free = 0;
}
/*
* Force write back for synchronous write cases.
*/
if ((ioflag & IO_SYNC) || type == IFDIR) {
/*
* If the sticky bit is set but the
* execute bit is not set, we do a
* synchronous write back and free
* the page when done. We set up swap
* files to be handled this way to
* prevent servers from keeping around
* the client's swap pages too long.
* XXX - there ought to be a better way.
*/
if (IS_SWAPVP(vp)) {
flags = SM_WRITE | SM_FREE |
SM_DONTNEED;
} else {
iupdat_flag = 1;
flags = SM_WRITE | free;
}
} else if (n + on == MAXBSIZE ||
IS_SWAPVP(vp)) {
/*
* Have written a whole block.
* Start an asynchronous write and
* mark the buffer to indicate that
* it won't be needed again soon.
*/
flags = SM_WRITE | SM_ASYNC | free;
}
ip->i_flag |= IUPD | ICHG;
if (u.u_ruid != 0 && (ip->i_mode & (IEXEC |
(IEXEC >> 3) | (IEXEC >> 6))) != 0) {
/*
* Clear Set-UID & Set-GID bits on
* successful write if not super-user
* and at least one of the execute bits
* is set. If we always clear Set-GID,
* mandatory file and record locking is
* unusable.
*/
ip->i_mode &= ~(ISUID | ISGID);
}
} else if (rw == UIO_READ) {
if (freebehind && dofree &&
freemem < lotsfree + pages_before_pager) {
flags = SM_FREE | SM_DONTNEED;
}
}
error = segmap_release(segkmap, base, flags);
} else {
(void) segmap_release(segkmap, base, 0);
}
trace6(TR_UFS_RWIP, ip, uio, rw, ioflag, uio->uio_offset,
TRC_RWIP_RELEASE);
/*
* For Posix conformance
*/
if (orig_resid) {
uio->uio_resid = orig_resid;
if (error == ENOSPC)
error = 0;
break;
}
last = n;
} while (error == 0 && uio->uio_resid > 0 && n != 0);
/*
* If we are doing synchronous write the only time we should
* not be sync'ing the ip here is if we have the stickyhack
* activated, the file is marked with the sticky bit and
* no exec bit, the file length has not been changed and
* no new blocks have been allocated during this write.
*/
if ((ioflag & IO_SYNC) != 0 && rw == UIO_WRITE &&
(iupdat_flag != 0 || old_blocks != ip->i_blocks)) {
iupdat(ip, 1);
trace6(TR_UFS_RWIP, ip, uio, rw, ioflag, uio->uio_offset,
TRC_RWIP_IUPDAT);
}
out:
ip->i_flag &= ~(ISYNC | INOACC);
IUNLOCK(ip);
if (!error && adjust_resid) {
uio->uio_resid = adjust_resid;
psignal(u.u_procp, SIGXFSZ);
}
trace6(TR_UFS_RWIP, ip, uio, rw, ioflag, uio->uio_offset,
TRC_RWIP_RETURN);
return (error);
}
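/*
 * ioctl on a ufs vnode: file system lock/status (FIOLFS/FIOLFSS),
 * flush (FIOFFS), allocation info (FIOAI), time update (FIODUTIMES),
 * and delayed-io control/status (FIODIO/FIODIOS).
 */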
/*ARGSUSED*/
static int
ufs_ioctl(vp, com, data, flag, cred)
struct vnode *vp;
int com;
caddr_t data;
int flag;
struct ucred *cred;
{
int error;
VFS_RECORD(vp->v_vfsp, VS_IOCTL, VS_CALL);
switch (com) {
case FIOLFS:
/*
* file system locking
*/
if ((error = ufs_lockfs_hold(vp->v_vfsp)) == 0) {
error = ufs_fiolfs(vp, (struct lockfs **)data);
ufs_lockfs_rele(vp->v_vfsp);
}
break;
case FIOLFSS:
/*
* file system lock status
*/
if ((error = ufs_lockfs_hold(vp->v_vfsp)) == 0) {
error = ufs_fiolfss(vp, (struct lockfs **)data);
ufs_lockfs_rele(vp->v_vfsp);
}
break;
case FIOFFS:
/*
* file system flush (push w/invalidate)
*/
if ((error = ufs_lockfs_hold(vp->v_vfsp)) == 0) {
error = ufs_fioffs(vp, (struct lockfs **)data);
ufs_lockfs_rele(vp->v_vfsp);
}
break;
case FIOAI:
/*
* file allocation information
*/
ULOCKFS(vp, VA_GETATTR,
ufs_fioai(vp, (struct filai **)data));
/* NOTREACHED */
break;
case FIODUTIMES:
/*
* set file access and modification times
*/
ULOCKFS(vp, VA_CHANGE,
ufs_fiodutimes(vp, (struct timeval **)data));
/* NOTREACHED */
break;
case FIODIO:
/*
* file system meta/user data delayed io
*/
ULOCKFS(vp, VA_WRITE,
ufs_fiodio(vp, (u_long **)data));
/* NOTREACHED */
break;
case FIODIOS:
/*
* file system meta/user data delayed io status
*/
ULOCKFS(vp, VA_READ,
ufs_fiodios(vp, (u_long **)data));
/* NOTREACHED */
break;
default:
error = ENOTTY;
break;
}
return (error);
}
/*ARGSUSED*/
static int
ufs_select(vp, which, cred)
struct vnode *vp;
int which;
struct ucred *cred;
{
VFS_RECORD(vp->v_vfsp, VS_SELECT, VS_CALL);
return (EINVAL);
}
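/*
 * Return the attributes of a file, copied from its in-core inode.
 */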
/*ARGSUSED*/
static int
ufs_getattr(vp, vap, cred)
struct vnode *vp;
register struct vattr *vap;
struct ucred *cred;
{
register struct inode *ip;
VFS_RECORD(vp->v_vfsp, VS_GETATTR, VS_CALL);
ip = VTOI(vp);
/*
* Mark correct time in inode.
*/
ITIMES(ip);
/*
* Copy from inode table.
*/
vap->va_type = IFTOVT(ip->i_mode);
vap->va_mode = ip->i_mode;
vap->va_uid = ip->i_uid;
vap->va_gid = ip->i_gid;
vap->va_fsid = ip->i_dev;
vap->va_nodeid = ip->i_number;
vap->va_nlink = ip->i_nlink;
vap->va_size = ip->i_size;
vap->va_atime = ip->i_atime;
vap->va_mtime = ip->i_mtime;
vap->va_ctime = ip->i_ctime;
vap->va_rdev = ip->i_rdev;
vap->va_blocks = ip->i_blocks;
switch (ip->i_mode & IFMT) {
case IFBLK:
vap->va_blocksize = MAXBSIZE; /* was BLKDEV_IOSIZE */
break;
case IFCHR:
vap->va_blocksize = MAXBSIZE;
break;
default:
vap->va_blocksize = vp->v_vfsp->vfs_bsize;
break;
}
return (0);
}
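/*
 * Set file attributes: mode, owner/group, size (truncation), and
 * access/modify times, with the appropriate permission checks.
 */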
static int
ufs_setattr(vp, vap, cred)
register struct vnode *vp;
register struct vattr *vap;
struct ucred *cred;
{
register struct inode *ip;
int chtime = 0;
int error = 0;
VFS_RECORD(vp->v_vfsp, VS_SETATTR, VS_CALL);
/*
* Cannot set these attributes
*/
if ((vap->va_nlink != -1) || (vap->va_blocksize != -1) ||
(vap->va_rdev != -1) || (vap->va_blocks != -1) ||
(vap->va_fsid != -1) || (vap->va_nodeid != -1) ||
((int)vap->va_type != -1)) {
return (EINVAL);
}
ip = VTOI(vp);
ILOCK(ip);
/*
* Change file access modes. Must be owner or su.
*/
if (vap->va_mode != (u_short)-1) {
error = OWNER(cred, ip);
if (error)
goto out;
ip->i_mode &= IFMT;
ip->i_mode |= vap->va_mode & ~IFMT;
if (cred->cr_uid != 0) {
if ((ip->i_mode & IFMT) != IFDIR)
/* DBE_FAST_OSYNC */
if (ip->i_mode & (IEXEC | (IEXEC>>3) | (IEXEC>>6)))
/* DBE_FAST_OSYNC */
ip->i_mode &= ~ISVTX;
if (!groupmember((int)ip->i_gid))
ip->i_mode &= ~ISGID;
}
ip->i_flag |= ICHG;
}
/*
* To change file ownership, must be su.
* To change group ownership, must be su or owner and in target group.
* This is now enforced in chown1() below.
*/
if ((vap->va_uid != (uid_t)-1) || (vap->va_gid != (gid_t)-1)) {
error = chown1(ip, vap->va_uid, vap->va_gid);
if (error)
goto out;
}
/*
* Truncate file. Must have write permission (checked above vnode
* layer) and not be a directory.
*/
if (vap->va_size != (u_long)-1) {
if ((ip->i_mode & IFMT) == IFDIR) {
error = EISDIR;
goto out;
}
if ((error = itrunc(ip, vap->va_size)) != 0) {
goto out;
}
}
/*
* Change file access or modified times.
*/
if (vap->va_atime.tv_sec != -1) {
if (cred->cr_uid != ip->i_uid && cred->cr_uid != 0) {
error = iaccess(ip, IWRITE);
if (error)
goto out;
}
ip->i_atime = vap->va_atime;
chtime++;
}
if (vap->va_mtime.tv_sec != -1) {
/*
* Allow SysV-compatible option to set access and
* modified times to the current time if root, owner,
* or write access.
*
* XXX - va_mtime.tv_usec == -1 flags this.
*/
if (cred->cr_uid != ip->i_uid && cred->cr_uid != 0) {
error = iaccess(ip, IWRITE);
if (error)
goto out;
}
if (vap->va_mtime.tv_usec == -1) {
ip->i_atime = time;
ip->i_mtime = time;
} else {
ip->i_mtime = vap->va_mtime;
}
ip->i_flag |= IMODTIME;
chtime++;
}
if (chtime) {
ip->i_ctime = time;
ip->i_flag |= IMOD;
}
out:
iupdat(ip, 1); /* XXX - should be async for perf */
IUNLOCK(ip);
return (error);
}
/*
* Perform chown operation on inode ip;
* inode must be locked prior to call.
*/
static int
chown1(ip, uid, gid)
register struct inode *ip;
uid_t uid;
gid_t gid;
{
#ifdef QUOTA
register long change;
#endif
if (uid == (uid_t)-1)
uid = ip->i_uid;
if (gid == (gid_t)-1)
gid = ip->i_gid;
/*
* If:
* 1) not the owner of the file, or
* 2) trying to change the owner of the file, or
* 3) trying to change the group of the file to a group not in the
* process' group set,
* then must be super-user.
* Check super-user last, and use "suser", so that the accounting
* file's "used super-user privileges" flag is properly set.
*/
if ((u.u_uid != uid || uid != ip->i_uid || !groupmember((int)gid)) &&
!suser())
return (EPERM);
#ifdef QUOTA
if (ip->i_uid == uid) /* this just speeds things a little */
change = 0;
else
change = ip->i_blocks;
(void) chkdq(ip, -change, 1);
(void) chkiq(VFSTOM(ip->i_vnode.v_vfsp), ip, (int)ip->i_uid, 1);
dqrele(ip->i_dquot);
#endif
ip->i_uid = uid;
ip->i_gid = gid;
ip->i_flag |= ICHG;
if (u.u_uid != 0)
ip->i_mode &= ~(ISUID|ISGID);
#ifdef QUOTA
ip->i_dquot = getinoquota(ip);
(void) chkdq(ip, change, 1);
(void) chkiq(VFSTOM(ip->i_vnode.v_vfsp), (struct inode *)NULL,
(int)uid, 1);
#endif
return (0);
}
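/*
 * Check access permission on the file for the given mode.
 */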
/*ARGSUSED*/
static int
ufs_access(vp, mode, cred)
struct vnode *vp;
int mode;
struct ucred *cred;
{
register struct inode *ip;
int error;
VFS_RECORD(vp->v_vfsp, VS_ACCESS, VS_CALL);
ip = VTOI(vp);
ILOCK(ip);
error = iaccess(ip, mode);
IUNLOCK(ip);
return (error);
}
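/*
 * Read a symbolic link. Fast symbolic links are copied straight out
 * of the inode; otherwise the link is read with rwip() and, if small
 * enough, cached back into the inode as a fast symlink.
 */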
/*ARGSUSED*/
static int
ufs_readlink(vp, uiop, cred)
struct vnode *vp;
struct uio *uiop;
struct ucred *cred;
{
register struct inode *ip;
register int error;
VFS_RECORD(vp->v_vfsp, VS_READLINK, VS_CALL);
if (vp->v_type != VLNK)
return (EINVAL);
ip = VTOI(vp);
if (ip->i_flag & IFASTSYMLNK) {
ILOCK(ip);
if (!ULOCKFS_IS_NOIACC(ITOU(ip)))
ip->i_flag |= IACC;
error = uiomove((caddr_t)&ip->i_db[1],
(int) MIN(ip->i_size, uiop->uio_resid),
UIO_READ, uiop);
IUNLOCK(ip);
} else {
int size; /* no. of bytes read */
caddr_t basep; /* pointer to input data */
ino_t ino;
long igen;
ino = ip->i_number;
igen = ip->i_gen;
size = uiop->uio_resid;
basep = uiop->uio_iov->iov_base;
error = rwip(ip, uiop, UIO_READ, 0);
if (error != 0 || ip->i_number != ino ||
ip->i_gen != igen)
goto out;
size -= uiop->uio_resid;
if (ip->i_size <= FSL_SIZE && ip->i_size == size) {
if (uiop->uio_segflg == UIO_USERSPACE ||
uiop->uio_segflg == UIO_USERISPACE)
error = copyin(basep,
(caddr_t) &ip->i_db[1],
(u_int) ip->i_size);
else
error = kcopy(basep,
(caddr_t) &ip->i_db[1],
(u_int) ip->i_size);
if (error == 0) {
ip->i_flag |= IFASTSYMLNK;
/* free page */
(void) VOP_PUTPAGE(ITOV(ip),
(caddr_t) 0, PAGESIZE,
(B_DONTNEED | B_FREE | B_FORCE |
B_ASYNC), cred);
} else {
int i;
/* error, clear garbage left behind */
for (i = 1; i < NDADDR && ip->i_db[i]; i++)
ip->i_db[i] = 0;
for (i = 0; i < NIADDR && ip->i_ib[i]; i++)
ip->i_ib[i] = 0;
}
}
}
out:
ITIMES(ip);
return (error);
}
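/*
 * Synchronously write back a file's dirty pages and other inode data.
 */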
/*ARGSUSED*/
static int
ufs_fsync(vp, cred)
struct vnode *vp;
struct ucred *cred;
{
register struct inode *ip;
int err;
VFS_RECORD(vp->v_vfsp, VS_FSYNC, VS_CALL);
ip = VTOI(vp);
ILOCK(ip);
err = syncip(ip, 0, 1); /* do synchronous writes */
if (!err)
err = sync_indir(ip); /* write back any other inode data */
IUNLOCK(ip);
return (err);
}
/*ARGSUSED*/
static int
ufs_inactive(vp, cred)
struct vnode *vp;
struct ucred *cred;
{
VFS_RECORD(vp->v_vfsp, VS_INACTIVE, VS_CALL);
iinactive(VTOI(vp));
return (0);
}
/*
* Unix file system operations having to do with directory manipulation.
*/
/*ARGSUSED*/
static int
ufs_lookup(dvp, nm, vpp, cred, pnp, flags)
struct vnode *dvp;
char *nm;
struct vnode **vpp;
struct ucred *cred;
struct pathname *pnp;
int flags;
{
register struct inode *ip;
struct inode *xip;
register int error;
VFS_RECORD(dvp->v_vfsp, VS_LOOKUP, VS_CALL);
ip = VTOI(dvp);
error = dirlook(ip, nm, &xip);
ITIMES(ip);
if (error == 0) {
ip = xip;
*vpp = ITOV(ip);
if ((ip->i_mode & ISVTX) && !(ip->i_mode & (IEXEC | IFDIR)) &&
stickyhack) {
(*vpp)->v_flag |= VISSWAP;
} else {
(*vpp)->v_flag &= ~VISSWAP;
}
ITIMES(ip);
IUNLOCK(ip);
/*
* If vnode is a device return special vnode instead
*/
if (ISVDEV((*vpp)->v_type)) {
struct vnode *newvp;
newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type);
VN_RELE(*vpp);
*vpp = newvp;
}
}
return (error);
}
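/*
 * Create a file. Special devices require super-user; directories must
 * be made with ufs_mkdir. An existing file is handled according to
 * the nonexclusive-create rules below.
 */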
static int
ufs_create(dvp, nm, vap, exclusive, mode, vpp, cred)
struct vnode *dvp;
char *nm;
struct vattr *vap;
enum vcexcl exclusive;
int mode;
struct vnode **vpp;
struct ucred *cred;
{
register int error;
register struct inode *ip;
struct inode *xip;
VFS_RECORD(dvp->v_vfsp, VS_CREATE, VS_CALL);
switch ((int) vap->va_type) {
/* Must be super-user to create a non-FIFO special device */
case (int) VBLK:
case (int) VCHR:
if (cred->cr_uid != 0)
return (EPERM);
else
break;
/* Can't create directories - use ufs_mkdir instead. */
case (int) VDIR:
return (EISDIR);
}
xip = (struct inode *)0;
ip = VTOI(dvp);
/* Must be super-user to set sticky bit */
if (cred->cr_uid != 0)
vap->va_mode &= ~VSVTX;
error = direnter(ip, nm, DE_CREATE, (struct inode *)0,
(struct inode *)0, vap, &xip);
ITIMES(ip);
ip = xip;
/*
* If the file exists and this is a nonexclusive create,
* check that it is not a directory being opened for writing
* and check access permissions.
* A read-only create of an existing directory is allowed.
*/
if (error == EEXIST) {
if (exclusive == NONEXCL) {
if (((ip->i_mode & IFMT) == IFDIR) && (mode & IWRITE)) {
error = EISDIR;
} else if (mode) {
error = iaccess(ip, mode);
} else {
error = 0;
}
}
if (error) {
iput(ip);
} else if (((ip->i_mode&IFMT) == IFREG) && (vap->va_size == 0)){
/*
* Truncate regular files, if required
*/
(void) itrunc(ip, (u_long)0);
}
}
if (error) {
return (error);
}
*vpp = ITOV(ip);
ITIMES(ip);
IUNLOCK(ip);
/*
* If vnode is a device return special vnode instead
*/
if (ISVDEV((*vpp)->v_type)) {
struct vnode *newvp;
newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type);
VN_RELE(*vpp);
*vpp = newvp;
}
if (vap != (struct vattr *)0) {
(void) VOP_GETATTR(*vpp, vap, cred);
}
return (error);
}
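/*
 * Remove (unlink) a file from a directory.
 */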
/*ARGSUSED*/
static int
ufs_remove(vp, nm, cred)
struct vnode *vp;
char *nm;
struct ucred *cred;
{
register int error;
register struct inode *ip;
VFS_RECORD(vp->v_vfsp, VS_REMOVE, VS_CALL);
ip = VTOI(vp);
error = dirremove(ip, nm, (struct inode *)0, 0);
ITIMES(ip);
return (error);
}
/*
* Link a file or a directory.
* If source is a directory, must be superuser.
*/
/*ARGSUSED*/
static int
ufs_link(vp, tdvp, tnm, cred)
struct vnode *vp;
register struct vnode *tdvp;
char *tnm;
struct ucred *cred;
{
register struct inode *sip;
register int error;
struct vnode *realvp;
if (VOP_REALVP(vp, &realvp) == 0) {
vp = realvp;
}
VFS_RECORD(vp->v_vfsp, VS_LINK, VS_CALL);
sip = VTOI(vp);
if (((sip->i_mode & IFMT) == IFDIR) && !suser()) {
return (EPERM);
}
error = direnter(VTOI(tdvp), tnm, DE_LINK,
(struct inode *)0, sip, (struct vattr *)0, (struct inode **)0);
ITIMES(sip);
ITIMES(VTOI(tdvp));
return (error);
}
/*
* Rename a file or directory.
* We are given the vnode and entry string of the source and the
* vnode and entry string of the place we want to move the source to
* (the target). The essential operation is:
* unlink(target);
* link(source, target);
* unlink(source);
* but "atomically". Can't do full commit without saving state in the inode
* on disk, which isn't feasible at this time. Best we can do is always
* guarantee that the TARGET exists.
*/
/*ARGSUSED*/
static int
ufs_rename(sdvp, snm, tdvp, tnm, cred)
struct vnode *sdvp; /* old (source) parent vnode */
char *snm; /* old (source) entry name */
struct vnode *tdvp; /* new (target) parent vnode */
char *tnm; /* new (target) entry name */
struct ucred *cred;
{
struct inode *sip; /* source inode */
register struct inode *sdp; /* old (source) parent inode */
register struct inode *tdp; /* new (target) parent inode */
register int error;
struct vnode *realvp;
VFS_RECORD(sdvp->v_vfsp, VS_RENAME, VS_CALL);
if (VOP_REALVP(tdvp, &realvp) == 0) {
tdvp = realvp;
}
sdp = VTOI(sdvp);
tdp = VTOI(tdvp);
/*
* Make sure we can delete the source entry.
*/
error = iaccess(sdp, IWRITE);
if (error) {
return (error);
}
/*
* Look up inode of file we're supposed to rename.
*/
error = dirlook(sdp, snm, &sip);
if (error) {
return (error);
}
IUNLOCK(sip); /* unlock inode (it's held) */
/*
* Check for renaming '.' or '..' or alias of '.'
*/
if ((strcmp(snm, ".") == 0) || (strcmp(snm, "..") == 0) ||
(sdp == sip)) {
error = EINVAL;
goto out;
}
/*
* If the source parent directory is "sticky", then the user must
* either own the file, own the directory, or be the
* super-user.
*/
if ((sdp->i_mode & ISVTX) && cred->cr_uid != 0 &&
cred->cr_uid != sdp->i_uid && sip->i_uid != cred->cr_uid) {
error = EPERM;
goto out;
}
/*
* Link source to the target.
*/
error = direnter(tdp, tnm, DE_RENAME,
sdp, sip, (struct vattr *)0, (struct inode **)0);
if (error) {
/*
* ESAME isn't really an error; it indicates that the
* operation should not be done because the source and target
* are the same file, but that no error should be reported.
*/
if (error == ESAME)
error = 0;
goto out;
}
/*
* Unlink the source.
* Remove the source entry. Dirremove checks that the entry
* still reflects sip, and returns an error if it doesn't.
* If the entry has changed just forget about it.
* Release the source inode.
*/
error = dirremove(sdp, snm, sip, 0);
if (error == ENOENT) {
error = 0;
} else if (error) {
goto out;
}
out:
ITIMES(sdp);
ITIMES(tdp);
irele(sip);
return (error);
}
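/*
 * Make a directory. The new directory inherits the set-gid bit of
 * its parent.
 */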
/*ARGSUSED*/
static int
ufs_mkdir(dvp, nm, vap, vpp, cred)
struct vnode *dvp;
char *nm;
register struct vattr *vap;
struct vnode **vpp;
struct ucred *cred;
{
register struct inode *ip;
struct inode *xip;
register int error;
VFS_RECORD(dvp->v_vfsp, VS_MKDIR, VS_CALL);
ip = VTOI(dvp);
/*
* New directory inherits the set-gid bit from the parent.
*/
vap->va_mode &= ~VSGID;
if (ip->i_mode & ISGID)
vap->va_mode |= VSGID;
error = direnter(ip, nm, DE_CREATE,
(struct inode *)0, (struct inode *)0, vap, &xip);
ITIMES(ip);
if (error == 0) {
ip = xip;
*vpp = ITOV(ip);
ITIMES(ip);
IUNLOCK(ip);
} else if (error == EEXIST) {
iput(xip);
}
return (error);
}
/*ARGSUSED*/
static int
ufs_rmdir(vp, nm, cred)
struct vnode *vp;
char *nm;
struct ucred *cred;
{
register struct inode *ip;
register int error;
VFS_RECORD(vp->v_vfsp, VS_RMDIR, VS_CALL);
ip = VTOI(vp);
error = dirremove(ip, nm, (struct inode *)0, 1);
ITIMES(ip);
return (error);
}
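/*
 * Read directory entries, converting the on-disk struct direct format
 * into the file-system independent struct dirent format.
 */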
/*ARGSUSED*/
static int
ufs_readdir(vp, uiop, cred)
struct vnode *vp;
struct uio *uiop;
struct ucred *cred;
{
register struct iovec *iovp;
register struct inode *ip;
register struct direct *idp;
register struct dirent *odp;
register u_int offset;
register int incount = 0;
register int outcount = 0;
register u_int bytes_wanted, total_bytes_wanted;
caddr_t outbuf;
u_int bufsize;
int error = 0;
struct fbuf *fbp;
int fastalloc;
static caddr_t dirbufp;
VFS_RECORD(vp->v_vfsp, VS_READDIR, VS_CALL);
ip = VTOI(vp);
iovp = uiop->uio_iov;
total_bytes_wanted = iovp->iov_len;
/* Force offset to be valid (to guard against bogus lseek() values) */
offset = uiop->uio_offset & ~(DIRBLKSIZ - 1);
/* Quit if at end of file */
if (offset >= ip->i_size)
return (0);
/*
* Get space to change directory entries into fs independent format.
* Do fast alloc for the most commonly used request size (filesystem
* block size).
*/
fastalloc = (total_bytes_wanted == MAXBSIZE);
bufsize = total_bytes_wanted + sizeof (struct dirent);
if (fastalloc)
outbuf = new_kmem_fast_alloc(
&dirbufp, (int)bufsize, 1, KMEM_SLEEP);
else
outbuf = new_kmem_alloc(bufsize, KMEM_SLEEP);
odp = (struct dirent *)outbuf;
ILOCK(ip);
nextblk:
bytes_wanted = total_bytes_wanted;
/* Truncate request to file size */
if (offset + bytes_wanted > ip->i_size)
bytes_wanted = ip->i_size - offset;
/* Comply with MAXBSIZE boundary restrictions of fbread() */
if ((offset & MAXBOFFSET) + bytes_wanted > MAXBSIZE)
bytes_wanted = MAXBSIZE - (offset & MAXBOFFSET);
/* Read in the next chunk */
if (error = fbread(vp, offset, bytes_wanted, S_OTHER, &fbp))
goto out;
incount = 0;
idp = (struct direct *)fbp->fb_addr;
/* Transform to file-system independent format */
while (incount < bytes_wanted) {
extern char *strcpy();
/* Skip to requested offset and skip empty entries */
if (idp->d_ino != 0 && offset >= uiop->uio_offset) {
odp->d_fileno = idp->d_ino;
odp->d_namlen = idp->d_namlen;
(void) strcpy(odp->d_name, idp->d_name);
odp->d_reclen = DIRSIZ(odp);
odp->d_off = offset + idp->d_reclen;
outcount += odp->d_reclen;
/* Got as many bytes as requested, quit */
if (outcount > total_bytes_wanted) {
outcount -= odp->d_reclen;
/* Buffer too small to take any entry */
if (outcount == 0) {
fbrelse(fbp, S_OTHER);
error = EINVAL;
goto out;
}
break;
}
odp = (struct dirent *)((int)odp + odp->d_reclen);
}
if (idp->d_reclen) {
incount += idp->d_reclen;
offset += idp->d_reclen;
idp = (struct direct *)((int)idp + idp->d_reclen);
} else {
offset = (offset + DIRBLKSIZ) & ~(DIRBLKSIZ-1);
break;
}
}
/* Release the chunk */
fbrelse(fbp, S_OTHER);
/* Read whole block, but got no entries, read another if not eof */
if (offset < ip->i_size && !outcount)
goto nextblk;
/* Copy out the entry data */
if (error = uiomove(outbuf, outcount, UIO_READ, uiop))
goto out;
uiop->uio_offset = offset;
if (!ULOCKFS_IS_NOIACC(ITOU(ip)))
ip->i_flag |= IACC;
out:
ITIMES(ip);
IUNLOCK(ip);
if (fastalloc)
kmem_fast_free(&dirbufp, outbuf);
else
kmem_free(outbuf, bufsize);
return (error);
}
/*
* Old form of the ufs_readdir op. Returns directory entries directly
* from the disk in the 4.2 structure instead of the new sys/dirent.h
* structure. This routine is called directly by the old getdirentries
* system call when it discovers it is dealing with a ufs filesystem.
* The reason for this mess is to avoid large performance penalties
* that occur during conversion from the old format to the new and
* back again.
*/
/*ARGSUSED*/
int
old_ufs_readdir(vp, uiop, cred)
struct vnode *vp;
register struct uio *uiop;
struct ucred *cred;
{
register struct iovec *iovp;
register unsigned count;
register struct inode *ip;
int error;
struct mount *mp;
if (error = ufs_lockfs_begin(vp, VA_READDIR, &mp))
return (error);
ip = VTOI(vp);
iovp = uiop->uio_iov;
count = iovp->iov_len;
if ((uiop->uio_iovcnt != 1) || (count < DIRBLKSIZ) ||
(uiop->uio_offset & (DIRBLKSIZ -1))) {
error = EINVAL;
goto out;
}
count &= ~(DIRBLKSIZ - 1);
uiop->uio_resid -= iovp->iov_len - count;
iovp->iov_len = count;
error = rwip(ip, uiop, UIO_READ, 0);
ITIMES(ip);
out:
ufs_lockfs_end(VA_READDIR, mp);
return (error);
}
/*ARGSUSED*/
static int
ufs_symlink(dvp, lnm, vap, tnm, cred)
register struct vnode *dvp;
char *lnm;
struct vattr *vap;
char *tnm;
struct ucred *cred;
{
struct inode *ip;
register int error;
register struct fs *fs;
VFS_RECORD(dvp->v_vfsp, VS_SYMLINK, VS_MISS);
/* check for space availability - need at least 1 fragment */
fs = VTOI(dvp)->i_fs;
if (cred->cr_uid == 0) {
if ((fs->fs_cstotal.cs_nbfree == 0) &&
(fs->fs_cstotal.cs_nffree == 0))
return (ENOSPC);
} else
if (freespace(fs, fs->fs_minfree) <= 0)
return (ENOSPC);
ip = (struct inode *)0;
vap->va_type = VLNK;
vap->va_rdev = 0;
error = direnter(VTOI(dvp), lnm, DE_CREATE,
(struct inode *)0, (struct inode *)0, vap, &ip);
if (error == 0) {
error = rdwri(UIO_WRITE, ip, tnm, strlen(tnm),
(off_t)0, UIO_SYSSPACE, (int *)0);
if (error) {
idrop(ip);
error = dirremove(VTOI(dvp), lnm,
(struct inode *) 0, 0);
goto out;
}
}
if (error == 0) {
/* create a fast symbolic link */
if (ip->i_size <= FSL_SIZE) {
if (kcopy((caddr_t) tnm, (caddr_t) &ip->i_db[1],
(u_int) ip->i_size) == 0)
ip->i_flag |= IFASTSYMLNK;
else {
int i;
/* error, clear garbage left behind */
for (i = 1; i < NDADDR && ip->i_db[i]; i++)
ip->i_db[i] = 0;
for (i = 0; i < NIADDR && ip->i_ib[i]; i++)
ip->i_ib[i] = 0;
}
/*
* nice to free the page here, but don't bother because
* symbolic links are seldom created
*/
}
}
if (error == 0 || error == EEXIST)
iput(ip);
out:
ITIMES(VTOI(dvp));
return (error);
}
/*
* Ufs specific routine used to do ufs io.
*/
int
rdwri(rw, ip, base, len, offset, seg, aresid)
enum uio_rw rw;
struct inode *ip;
caddr_t base;
int len;
off_t offset;
int seg;
int *aresid;
{
struct uio auio;
struct iovec aiov;
register int error;
aiov.iov_base = base;
aiov.iov_len = len;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = offset;
auio.uio_segflg = seg;
auio.uio_resid = len;
error = rwip(ip, &auio, rw, 0);
if (aresid) {
*aresid = auio.uio_resid;
} else if (auio.uio_resid) {
error = EIO;
}
return (error);
}
/*
* Record-locking requests are passed to the local Lock-Manager daemon.
*/
extern void kill_proc_locks();
/*ARGSUSED*/
static int
ufs_lockctl(vp, ld, cmd, cred, clid)
struct vnode *vp;
struct eflock *ld;
int cmd;
struct ucred *cred;
int clid;
{
VFS_RECORD(vp->v_vfsp, VS_LOCKCTL, VS_CALL);
if (cmd != F_RGETLK && cmd != F_RSETLK && cmd != F_RSETLKW) {
if (vp->v_type == VBLK || vp->v_type == VCHR ||
vp->v_type == VFIFO)
return (EINVAL);
} else {
if (vp->v_type == VBLK || vp->v_type == VFIFO)
return (EINVAL);
}
switch (cmd) {
case F_GETLK :
test_lock(vp, ld, cmd, clid, FILE_LOCK);
return (0);
case F_RGETLK :
test_lock(vp, ld, cmd, clid, LOCKMGR);
return (0);
case F_SETLK :
case F_SETLKW :
if (ld->l_type == F_UNLCK)
return (unlock(vp, ld, cmd, clid, FILE_LOCK));
else
return (lock(vp, ld, cmd, clid, FILE_LOCK));
case F_RSETLK :
case F_RSETLKW :
if (ld->l_type == F_UNLCK)
return (unlock(vp, ld, cmd, clid, LOCKMGR));
else if (ld->l_type == F_UNLKSYS) {
kill_proc_locks(clid, ld->l_rsys);
return (0);
} else
return (lock(vp, ld, cmd, clid, LOCKMGR));
default:
return (EINVAL);
}
}
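/*
 * Build a file identifier (inode number plus generation number) for
 * this vnode.
 */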
static int
ufs_fid(vp, fidpp)
struct vnode *vp;
struct fid **fidpp;
{
register struct ufid *ufid;
VFS_RECORD(vp->v_vfsp, VS_FID, VS_CALL);
ufid = (struct ufid *)new_kmem_zalloc(sizeof (struct ufid), KMEM_SLEEP);
ufid->ufid_len = sizeof (struct ufid) - sizeof (u_short);
ufid->ufid_ino = VTOI(vp)->i_number;
ufid->ufid_gen = VTOI(vp)->i_gen;
*fidpp = (struct fid *)ufid;
return (0);
}
/*
* For read purposes, this has to be bsize * maxcontig.
* For write purposes, this can be larger.
*/
#define RD_CLUSTSZ(fs) (fs->fs_bsize * fs->fs_maxcontig)
#define WR_CLUSTSZ(fs) (fs->fs_bsize * fs->fs_maxcontig)
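/*
 * For example (illustrative numbers only): on an 8K-block file system
 * with fs_maxcontig == 7, RD_CLUSTSZ is 56K of contiguous i/o;
 * fs_maxcontig itself is bounded by maxphys, as noted below.
 */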
int ufs_nocluster = 0;
int ufs_ra = 1;
int ufs_lostpage; /* number of times we lost original page */
/*
* Called from pvn_getpages or ufs_getpage to get a particular page.
* When we are called the inode is already locked. If rw == S_WRITE
* and the block is not currently allocated we need to allocate the
* needed block(s).
*
* Clustering notes: when we detect sequential access, we switch to cluster
* sized chunks of I/O. The steady state should be that we do clusters
* in the readahead case; we'll only do one synchronous read at the beginning.
* fs_maxcontig controls the cluster size and is bounded by maxphys.
*
* We handle bsize >= PAGESIZE here; others go to oldufs_getapage().
*
* TODO
* Think about mmap() writes and lastw/nextr interaction
*/
/*ARGSUSED*/
ufs_getapage(vp, off, protp, pl, plsz, seg, addr, rw, cred)
struct vnode *vp;
register u_int off;
u_int *protp;
struct page *pl[]; /* NULL if async IO is requested */
u_int plsz;
struct seg *seg;
addr_t addr;
enum seg_rw rw;
struct ucred *cred;
{
register struct inode *ip;
register struct fs *fs;
u_int xlen;
struct buf *bp, *bp2;
struct vnode *devvp;
struct page *pp, *pp2, **ppp, *pagefound;
daddr_t lbn, bn;
u_int io_off, io_len;
int len, boff;
int err, do2ndread;
dev_t dev;
VFS_RECORD(vp->v_vfsp, VS_GETPAGE, VS_CALL);
ip = VTOI(vp);
fs = ip->i_fs;
if (ufs_nocluster || fs->fs_bsize < PAGESIZE) {
return (oldufs_getapage(vp, off, protp, pl,
plsz, seg, addr, rw, cred));
}
devvp = ip->i_devvp;
dev = devvp->v_rdev;
reread:
err = 0;
bp = NULL;
bp2 = NULL;
pagefound = NULL;
do2ndread = ufs_ra && ip->i_nextr == off;
if (pl != NULL)
pl[0] = NULL;
/*
* It may seem that only writes need to do the bmap(). Not so -
* the protp needs to be made readonly if the page is backed by a hole.
* XXX - it might be possible to fix this.
*/
lbn = lblkno(fs, off);
boff = blkoff(fs, off);
if (rw == S_WRITE) {
err = bmap_write(ip, lbn, boff,
&bn, &len, (int)blksize(fs, ip, lbn), 0);
} else {
err = bmap_read(ip, lbn, boff, &bn, &len);
if (bn == UFS_HOLE) {
if (protp != NULL)
*protp &= ~PROT_WRITE;
do2ndread = 0;
}
}
if (err)
goto out;
if (!do2ndread)
len = MIN(fs->fs_bsize, len);
again:
if ((pagefound = page_find(vp, off)) == NULL) {
if (bn == UFS_HOLE) {
/*
* Block for this page is not allocated
* and the page was not found.
*/
if (pl != NULL) {
/*
* If we need a page, allocate and
* return a zero page. This assumes
* that for "async" faults it is not
* worth it to create the page now.
*/
pp = rm_allocpage(seg, addr, PAGESIZE, 1);
trace6(TR_SEG_ALLOCPAGE, seg,
(u_int)addr & PAGEMASK, TRC_SEG_UNK,
vp, off, pp);
if (page_enter(pp, vp, off))
panic("ufs_getapage page_enter");
pagezero(pp, 0, PAGESIZE);
page_unlock(pp);
pl[0] = pp;
pl[1] = NULL;
u.u_ru.ru_minflt++;
}
} else {
/*
* Need to really do disk IO to get the page(s).
*/
VFS_RECORD(vp->v_vfsp, VS_GETPAGE, VS_MISS);
pp = pvn_kluster(vp, off, seg, addr, &io_off, &io_len,
off, (u_int) len, 0);
/*
* Somebody has entered the page before us, so
* just use it.
*/
if (pp == NULL)
goto again;
if (pl != NULL) {
register int sz;
if (plsz >= io_len) {
/*
* Everything fits, set up to load
* up and hold all the pages.
*/
pp2 = pp;
sz = io_len;
} else {
/*
* Set up to load plsz worth
* starting at the needed page.
*/
for (pp2 = pp; pp2->p_offset != off;
pp2 = pp2->p_next) {
ASSERT(pp2->p_next->p_offset !=
pp->p_offset);
}
sz = plsz;
}
ppp = pl;
do {
PAGE_HOLD(pp2);
*ppp++ = pp2;
pp2 = pp2->p_next;
sz -= PAGESIZE;
} while (sz > 0);
*ppp = NULL; /* terminate list */
}
bp = pageio_setup(pp, io_len, devvp, pl == NULL ?
(B_ASYNC | B_READ) : B_READ);
bp->b_dev = dev;
bp->b_blkno = fsbtodb(fs, bn) + btodb(boff);
bp->b_un.b_addr = 0;
/*
* Zero part of page which we are not
* going to be reading from disk now.
* pp->p_prev is usually the same page unless we
* have a list of pages, as with exec.
*
* The only way this can happen, I think, is
* at the end of file, so I turn off readahead.
*/
xlen = io_len & PAGEOFFSET;
if (xlen != 0) {
pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
do2ndread = 0;
}
(*bdevsw[major(dev)].d_strategy)(bp);
/*
* Set up where to do the next readahead.
*/
ip->i_nextrio = off + (io_len & PAGEMASK);
u.u_ru.ru_majflt++;
if (seg == segkmap)
u.u_ru.ru_inblock++; /* count as `read' */
cnt.v_pgin++;
cnt.v_pgpgin += btopr(io_len);
}
}
ip->i_nextr = (off + fs->fs_bsize) & ~(fs->fs_bsize - 1);
/*
* XXX - This can get out of sync if a page has been stolen away in
* the previous cluster. Because we don't resync, this can result in
* two sync reads above; one for the stolen page and another on the
* following cluster.
*/
if (do2ndread &&
ip->i_nextrio - off <= RD_CLUSTSZ(fs) &&
ip->i_nextrio < ip->i_size) {
addr_t addr2;
io_off = ip->i_nextrio;
addr2 = addr + (io_off - off);
/*
* Read-ahead case (bsize >= PAGESIZE)
* If addr is now in a different seg,
* don't bother trying with read-ahead.
*/
if (addr2 >= seg->s_base + seg->s_size) {
pp2 = NULL;
goto out;
}
lbn = lblkno(fs, io_off);
boff = blkoff(fs, io_off);
err = bmap_read(ip, lbn, boff, &bn, &len);
if (err || bn == UFS_HOLE)
goto out;
pp2 = pvn_kluster(vp, io_off, seg, addr2,
&io_off, &io_len, io_off, (u_int) len, 1);
if (pp2 == NULL)
goto out;
bp2 = pageio_setup(pp2, io_len, devvp,
(B_ASYNC | B_READ));
bp2->b_dev = dev;
ASSERT(ip->i_nextrio == pp2->p_offset);
bp2->b_blkno = fsbtodb(fs, bn) + btodb(boff);
bp2->b_un.b_addr = 0;
/*
* Zero part of page which we are not
* going to be reading from disk now
* if it hasn't already been done.
*/
if (xlen = (io_len & PAGEOFFSET))
pagezero(pp2->p_prev, xlen, PAGESIZE - xlen);
/*
* Two cases where io_len < blksz.
* (1) We ran out of memory.
* (2) The page is already in memory.
*/
ip->i_nextrio = (io_off + io_len) & PAGEMASK;
(*bdevsw[major(dev)].d_strategy)(bp2);
/*
* Should we bill read ahead to extra faults?
*/
u.u_ru.ru_majflt++;
if (seg == segkmap)
u.u_ru.ru_inblock++; /* count as `read' */
cnt.v_pgin++;
cnt.v_pgpgin += btopr(io_len);
}
out:
if (pl == NULL) {
return (err);
}
if (bp != NULL) {
if (err == 0)
err = biowait(bp);
else
(void) biowait(bp);
pageio_done(bp);
}
if (pagefound != NULL) {
register int s;
/*
* We need to be careful here because if the page was
* previously on the free list, we might have already
* lost it at interrupt level.
*/
s = splvm();
if (pagefound->p_vnode == vp && pagefound->p_offset == off) {
/*
* If the page is still intransit or if
* it is on the free list call page_lookup
* to try and wait for / reclaim the page.
*/
if (pagefound->p_intrans || pagefound->p_free)
pagefound = page_lookup(vp, off);
}
if (pagefound == NULL || pagefound->p_offset != off ||
pagefound->p_vnode != vp || pagefound->p_gone) {
(void) splx(s);
ufs_lostpage++;
goto reread;
}
PAGE_HOLD(pagefound);
(void) splx(s);
pl[0] = pagefound;
pl[1] = NULL;
u.u_ru.ru_minflt++;
}
if (err) {
for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
PAGE_RELE(*ppp);
}
return (err);
}
/*
* Return all the pages from [off..off+len) in given file
*/
static int
ufs_getpage(vp, off, len, protp, pl, plsz, seg, addr, rw, cred)
struct vnode *vp;
u_int off, len;
u_int *protp;
struct page *pl[];
u_int plsz;
struct seg *seg;
addr_t addr;
enum seg_rw rw;
struct ucred *cred;
{
struct inode *ip = VTOI(vp);
int err;
extern freemem, lotsfree;
/*
* Normally fail if faulting beyond EOF, *except* if this
* is an internal access of ufs data. This condition is
* detected by testing the faulting segment against segkmap.
* Since accessing the file through segkmap is only done
* in places in the kernel which have knowledge of the
* current file length, these places deal with EOF themselves.
* For example, bmap may be faulting in pages beyond the
* current EOF when it is creating pages needed for extending
* the length of the file.
*/
if (off + len > ip->i_size + PAGEOFFSET && seg != segkmap)
return (EFAULT); /* beyond EOF */
if (protp != NULL)
*protp = PROT_ALL;
ILOCK(ip);
if (len <= PAGESIZE) {
err = ufs_getapage(vp, off, protp, pl, plsz, seg, addr,
rw, cred);
} else {
err = pvn_getpages(ufs_getapage, vp, off, len, protp, pl, plsz,
seg, addr, rw, cred);
}
/*
* If the inode is not already marked for IACC (in rwip() for read)
* and the inode is not marked for no access time update (in rwip()
* for write) then update the inode access time and mod time now.
*/
if ((ip->i_flag & (IACC | INOACC)) == 0) {
if (rw != S_OTHER) {
if (!ULOCKFS_IS_NOIACC(ITOU(ip)))
ip->i_flag |= IACC;
}
if (rw == S_WRITE) {
ip->i_flag |= IUPD;
}
ITIMES(ip);
}
IUNLOCK(ip);
return (err);
}
/*
* Called at interrupt level.
*/
static int
ufs_writedone(bp)
register struct buf *bp;
{
register struct inode *ip;
ASSERT(bp->b_pages);
ip = VTOI(bp->b_pages->p_vnode); /* gag me */
bp->b_flags &= ~B_CALL;
bp->b_iodone = NULL;
bp->b_flags |= B_DONE;
if (ip->i_writes > 0) {
ip->i_writes -= bp->b_bcount + bp->b_resid;
if (ip->i_writes <= ufs_WRITES)
wakeup((caddr_t)&ip->i_writes);
}
/*
* Stolen from biodone()
*/
if (bp->b_flags & B_ASYNC) {
if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
swdone(bp);
else
brelse(bp);
} else if (bp->b_flags & B_WANTED) {
bp->b_flags &= ~B_WANTED;
wakeup((caddr_t)bp);
}
}
/*
* Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
* XXX - Has to be exported for 4K FS support.
*/
/* static */
int
ufs_writelbn(ip, bn, pp, len, pgoff, flags)
register struct inode *ip;
daddr_t bn;
struct page *pp;
u_int len;
u_int pgoff;
int flags;
{
struct buf *bp;
int err;
bp = pageio_setup(pp, len, ip->i_devvp, B_WRITE | flags);
if (bp == NULL) {
pvn_fail(pp, B_WRITE | flags);
return (ENOMEM);
}
if (ufs_WRITES) {
int s;
/*
* protected because the completion interrupt changes this.
*/
s = splbio();
ip->i_writes += len;
(void) splx(s);
bp->b_flags |= B_CALL;
bp->b_iodone = ufs_writedone;
}
bp->b_dev = ip->i_dev;
bp->b_blkno = bn;
bp->b_un.b_addr = (addr_t)pgoff;
(*bdevsw[major(ip->i_dev)].d_strategy)(bp);
u.u_ru.ru_oublock++;
/*
* If async, assume that pvn_done will
* handle the pages when IO is done
*/
if (flags & B_ASYNC) {
return (0);
}
err = biowait(bp);
pageio_done(bp);
return (err);
}
/*
* Macro to be used to see if it is safe to ILOCK the inode.
* This is needed because the pageout daemon cannot afford to
* wait for an inode lock since the process that has the inode
* lock may need more memory from the pageout daemon to complete
* its work. This is used to prevent deadlocking situations.
*/
#define ICHECK(ip) ((NOMEMWAIT()) && ((ip)->i_flag & ILOCKED) && \
((ip)->i_owner != uniqpid()))
int ufs_delay = 1; /* patchable while running */
/*
* Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
* If len == 0, do from off to EOF.
*
* The normal cases should be len == 0 & off == 0 (entire vp list),
* len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
* (from pageout).
*
* Note that for ufs it is possible to have dirty pages beyond
* roundup(ip->i_size, PAGESIZE). This can happen if the file
* length is long enough to involve indirect blocks (which are
* always fs->fs_bsize'd) and PAGESIZE < bsize while the length
* is such that roundup(blkoff(fs, ip->i_size), PAGESIZE) < bsize.
*/
/*ARGSUSED*/
static int
ufs_putpage(vp, off, len, flags, cred)
register struct vnode *vp;
u_int off, len;
int flags;
struct ucred *cred;
{
register struct inode *ip;
register struct page *pp;
register struct fs *fs;
struct page *dirty, *io_list;
register u_int io_off, io_len;
daddr_t lbn, dbn;
daddr_t bn;
int bmaplen, boff;
int vpcount, err;
#ifdef VFSSTATS
VFS_RECORD(vp->v_vfsp, VS_PUTPAGE, VS_CALL);
#endif
ip = VTOI(vp);
fs = ip->i_fs;
if (ufs_nocluster || fs->fs_bsize < PAGESIZE) {
return (oldufs_putpage(vp, off, len, flags, cred));
}
if (vp->v_pages == NULL) {
return (0);
}
/*
* If (clustering) AND
* (it's a normal write, i.e., normal flags) AND
* (we're doing a portion of the file) AND
* (we've delayed less than a cluster's worth) AND
* (this is the 1st chunk OR this chunk is contig w/the last chunk) THEN
* delay this chunk; we'll push it later.
*/
if (ufs_delay && (flags & B_ASYNC) &&
(flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0 && len &&
(ip->i_delaylen + len) < WR_CLUSTSZ(fs) &&
(ip->i_delaylen == 0 || ip->i_delayoff + ip->i_delaylen == off)) {
if (ip->i_delaylen == 0)
ip->i_delayoff = off;
ip->i_delaylen += len;
return (0);
}
vpcount = vp->v_count;
VN_HOLD(vp);
again:
/*
* Cannot afford to sleep on inode now, give up
*/
if (ICHECK(ip)) {
err = ENOMEM;
goto errout;
}
/*
* Hold inode lock for duration of push
*/
ILOCK(ip);
if (len == 0) {
/*
* Search the entire vp list for pages >= off
*/
dirty = pvn_vplist_dirty(vp, off, flags);
ip->i_delaylen = ip->i_delayoff = 0;
} else {
u_int offlo, offhi, offclust;
u_int d_len, d_off;
/*
* if (delayed pages)
* if (current request not in/adjacent to delayed pages)
* push old pages
* else
* start at beginning of delayed pages
* do [offlo..off+len) clustered up to off + WR_CLUSTSZ
*
* We play fast and loose with EOF here; counting on the
* fact that range_dirty will just not find the pages.
*/
offlo = off;
offhi = off + len;
offclust = MAX(offhi, off + WR_CLUSTSZ(fs));
if (ip->i_delaylen) {
d_off = ip->i_delayoff;
d_len = ip->i_delaylen;
ip->i_delayoff = ip->i_delaylen = 0;
if (off < d_off || off > d_off + d_len) {
int e;
if (e = ufs_putpage(vp, d_off,
d_len, B_NODELAY|B_ASYNC, cred)) {
printf("PP: vp=%x off=%d len=%d e=%d\n",
vp, d_off, d_len, e);
}
} else {
offlo = d_off;
}
}
dirty = pvn_range_dirty(vp, offlo, offhi,
offlo, offclust, flags);
}
/*
* Now pp will have the list of kept dirty pages marked for
* write back. All the pages on the pp list need to still
* be dealt with here. Verify that we really can do the
* write back to the filesystem and if not and we have some
* dirty pages, return an error condition.
*/
err = fs->fs_ronly && dirty != NULL ? EROFS : 0;
if (dirty != NULL) {
/*
* If the modified time on the inode has not already been
* set elsewhere (i.e. for write/setattr) or this is
* a call from msync (B_FORCE) we set the time now.
* This gives us approximate modified times for mmap'ed files
* which are modified via stores in the user address space.
*/
if ((ip->i_flag & IMODTIME) == 0 || (flags & B_FORCE) != 0) {
ip->i_flag |= IUPD;
ITIMES(ip);
}
/*
* file system was modified
*/
LOCKFS_SET_MOD(UTOL(ITOU(ip)));
}
/*
* Handle all the dirty pages.
*
* Clustering changes: instead of grabbing a blocks worth,
* take whatever the extent tells us to.
*
* This code *assumes* that the list is in increasing order.
* There's a performance hit if it's not.
*/
pp = NULL;
while (err == 0 && dirty != NULL) {
io_off = dirty->p_offset;
lbn = lblkno(fs, io_off);
boff = blkoff(fs, io_off);
/*
* Normally the blocks should already be allocated for
* any dirty pages; we only need to use bmap_read (S_OTHER)
* here and we should not get back a bn == UFS_HOLE.
*/
if (err = bmap_read(ip, lbn, boff, &bn, &bmaplen)) {
break;
}
if (bn == UFS_HOLE) {
if (!IS_SWAPVP(vp)) {
printf("ip=%x lbn=%d boff=%d off=%d poff=%d\n",
ip, lbn, boff, off, io_off);
panic("ufs_putpage hole");
}
/*
* Allocate for "holey" ufs file now.
* XXX - should redo the anon code to
* synchronously insure that all the
* needed backing store is allocated.
*/
err = bmap_write(ip, lbn, boff,
&bn, &bmaplen, (int)blksize(fs, ip, lbn), 1);
if (err) {
break;
}
ASSERT(bn != UFS_HOLE);
}
VFS_RECORD(vp->v_vfsp, VS_PUTPAGE, VS_MISS);
/*
* Pull off up to clustsize as long as it's contig.
* bmaplen tells everything we need to know.
* The list from pvn_xxx is sorted, all we have to check
* for are gaps.
*/
ASSERT(bmaplen > 0); /* leave this in */
pp = io_list = dirty;
io_len = 0;
do {
io_len += PAGESIZE;
bmaplen -= PAGESIZE;
pp = pp->p_next;
} while (bmaplen > 0 &&
pp != dirty && pp->p_offset == io_off + io_len);
/*
* Might have hit a gap or run out of extent.
* Have to break the list right before pp.
* No spls because the pages are held.
*/
if (pp != dirty &&
(pp->p_offset != io_off + io_len || bmaplen <= 0)) {
struct page *tail;
dirty = pp;
tail = io_list->p_prev;
pp = pp->p_prev;
tail->p_next = dirty;
dirty->p_prev = tail;
io_list->p_prev = pp;
pp->p_next = io_list;
} else {
dirty = NULL;
}
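		/*
		 * At this point io_list is a circular list holding just
		 * the contiguous pages [io_off, io_off + io_len) and
		 * dirty points at the remaining pages (NULL when the run
		 * consumed the whole list).
		 */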
/*
		 * Might have gone too far (bmaplen is negative).
* We could have several full blocks and then a frag.
*/
if (bmaplen < 0)
io_len += bmaplen;
dbn = fsbtodb(fs, bn) + btodb(boff);
err = ufs_writelbn(ip, dbn, io_list, io_len, 0, flags);
pp = NULL;
}
IUNLOCK(ip);
if (err != 0) {
if (pp != NULL)
pvn_fail(pp, B_WRITE | flags);
if (dirty != NULL)
pvn_fail(dirty, B_WRITE | flags);
} else if (off == 0 && (len == 0 || len >= ip->i_size)) {
/*
* If doing "synchronous invalidation", make
* sure that all the pages are actually gone.
*
* We change len (possibly) from i_size to 0. This will
* make sure we get *all* the pages, including pages that
* may be past EOF. The other path may miss them.
*/
if ((flags & (B_INVAL | B_ASYNC)) == B_INVAL &&
((vp->v_pages != NULL) && (vp->v_pages->p_lckcnt == 0))) {
len = 0;
goto again;
}
/*
* We have just sync'ed back all the pages
* on the inode, turn off the IMODTIME flag.
*/
ip->i_flag &= ~IMODTIME;
}
/*
* Instead of using VN_RELE here we are careful to only call
* the inactive routine if the vnode reference count is now zero,
* but it wasn't zero coming into putpage. This is to prevent
* recursively calling the inactive routine on a vnode that
* is already considered in the `inactive' state.
* XXX - inactive is a relative term here (sigh).
*/
errout:
if (--vp->v_count == 0 && vpcount > 0)
iinactive(ip);
return (err);
}
/*ARGSUSED*/
static int
ufs_map(vp, off, as, addrp, len, prot, maxprot, flags, cred)
struct vnode *vp;
u_int off;
struct as *as;
addr_t *addrp;
u_int len;
u_int prot, maxprot;
u_int flags;
struct ucred *cred;
{
struct segvn_crargs vn_a;
VFS_RECORD(vp->v_vfsp, VS_MAP, VS_CALL);
if ((int)off < 0 || (int)(off + len) < 0)
return (EINVAL);
if (vp->v_type != VREG)
return (ENODEV);
if ((flags & MAP_FIXED) == 0) {
map_addr(addrp, len, (off_t)off, 1);
if (*addrp == NULL)
return (ENOMEM);
} else {
/*
* User specified address - blow away any previous mappings
*/
(void) as_unmap(as, *addrp, len);
}
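	/*
	 * Hand the mapping to the segvn segment driver.  vn_a.amp ==
	 * NULL presumably means there is no pre-existing anon map;
	 * segvn is expected to allocate one if copy-on-write pages are
	 * ever needed.  The file pages themselves come and go through
	 * ufs_getpage/ufs_putpage.
	 */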
vn_a.vp = vp;
vn_a.offset = off;
vn_a.type = flags & MAP_TYPE;
vn_a.prot = prot;
vn_a.maxprot = maxprot;
vn_a.cred = cred;
vn_a.amp = NULL;
return (as_map(as, *addrp, len, segvn_create, (caddr_t)&vn_a));
}
static int
ufs_cmp(vp1, vp2)
struct vnode *vp1, *vp2;
{
VFS_RECORD(vp1->v_vfsp, VS_CMP, VS_CALL);
return (vp1 == vp2);
}
/*ARGSUSED*/
static int
ufs_realvp(vp, vpp)
struct vnode *vp;
struct vnode **vpp;
{
VFS_RECORD(vp->v_vfsp, VS_REALVP, VS_CALL);
return (EINVAL);
}
static int
ufs_badop()
{
panic("ufs_badop");
}
/*ARGSUSED*/
static int
ufs_cntl(vp, cmd, idata, odata, iflag, oflag)
struct vnode *vp;
int cmd, iflag, oflag;
caddr_t idata, odata;
{
/*
* Currently we only allow a cmd passed in and an int passed out
*/
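	/*
	 * The _PC_* cases below presumably back pathconf(2) and
	 * fpathconf(2); every value is a compile-time constant except
	 * _PC_PIPE_BUF, which is taken from the fifo driver's fifoinfo.
	 */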
ASSERT(odata && oflag == CNTL_INT32);
switch (cmd) {
default:
return (EINVAL);
case _PC_LINK_MAX:
*(int *)odata = MAXLINK;
break;
case _PC_MAX_CANON:
*(int *)odata = CANBSIZ;
break;
case _PC_NAME_MAX:
*(int *)odata = MAXNAMLEN;
break;
case _PC_PATH_MAX:
*(int *)odata = MAXPATHLEN;
break;
case _PC_PIPE_BUF:
*(int *)odata = fifoinfo.fifobuf;
break;
case _PC_VDISABLE:
*(int *)odata = VDISABLE;
break;
case _PC_CHOWN_RESTRICTED:
*(int *)odata = 1;
break;
case _PC_NO_TRUNC:
*(int *)odata = 1;
break;
}
return (0);
}
#ifndef REMOVE_OLD_UFS
/*
 * This stuff is obsolete.  It is kept for compatibility, but we're
 * phasing out 4K file systems.
*/
/*
* Called from pvn_getpages or ufs_getpage to get a particular page.
* When we are called the inode is already locked. If rw == S_WRITE
* and the block is not currently allocated we need to allocate the
* needed block(s).
*
* bsize is either 4k or 8k. To handle the case of 4k bsize and 8k pages
* we will do two reads to get the data and don't bother with read ahead.
* Thus having 4k file systems on a Sun-3 works, but it is not recommended.
*
 * XXX - should handle arbitrary file system block and page sizes.
*/
/*ARGSUSED*/
oldufs_getapage(vp, off, protp, pl, plsz, seg, addr, rw, cred)
struct vnode *vp;
register u_int off;
u_int *protp;
struct page *pl[]; /* NULL if async IO is requested */
u_int plsz;
struct seg *seg;
addr_t addr;
enum seg_rw rw;
struct ucred *cred;
{
register struct inode *ip;
register struct fs *fs;
register int bsize;
u_int xlen;
struct buf *bp, *bp2;
struct vnode *devvp;
struct page *pp, *pp2, **ppp, *pagefound;
daddr_t lbn, bn, bn2;
u_int io_off, io_len;
u_int lbnoff, blksz;
int err, nio, do2ndread, pgoff;
int multi_io;
dev_t dev;
VFS_RECORD(vp->v_vfsp, VS_GETPAGE, VS_CALL);
ip = VTOI(vp);
fs = ip->i_fs;
bsize = fs->fs_bsize;
devvp = ip->i_devvp;
dev = devvp->v_rdev;
multi_io = (PAGESIZE > bsize);
reread:
bp = NULL;
bp2 = NULL;
pagefound = NULL;
pgoff = 0;
lbn = lblkno(fs, off);
lbnoff = off & fs->fs_bmask;
if (pl != NULL)
pl[0] = NULL;
err = bmap(ip, lbn, &bn, &bn2, (int)blksize(fs, ip, lbn), rw, 0);
if (err)
goto out;
if (bn == UFS_HOLE && protp != NULL)
*protp &= ~PROT_WRITE;
if (multi_io) {
if (bsize != PAGESIZE / 2) {
/*
* This should have been prevented at mount time
* XXX - need to rewrite to avoid this restriction.
*/
panic("ufs_getapage bad bsize");
/* NOTREACHED */
}
if (bn2 == UFS_HOLE && ip->i_size > lbnoff + bsize) {
/*
* Try bmap with bn2 as the primary block now.
*/
err = bmap(ip, lbn + 1, &bn2, (daddr_t *)0,
(int)blksize(fs, ip, lbn + 1), rw, 0);
if (err)
goto out;
}
/*
* See if we are going to need to do a 2nd read
* to handle the bsize == PAGESIZE / 2 case.
*/
if (bn != UFS_HOLE && bn2 != UFS_HOLE &&
lbnoff + bsize < ip->i_size) {
nio = 2;
do2ndread = 1;
} else {
nio = 1;
do2ndread = 0;
if (bn2 == UFS_HOLE && lbnoff + bsize < ip->i_size)
*protp &= ~PROT_WRITE;
}
} else {
nio = 1;
if (ufs_ra && ip->i_nextr == off && bn2 != UFS_HOLE &&
lbnoff + bsize < ip->i_size) {
do2ndread = 1;
} else {
do2ndread = 0;
}
}
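	/*
	 * At this point nio is the number of i/o operations needed to
	 * fill the page (2 only when bsize == PAGESIZE/2 and both
	 * halves are allocated and within the file), and do2ndread
	 * says whether a second read will be issued, either for the
	 * other half of the page or as read-ahead.
	 */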
again:
if ((pagefound = page_find(vp, off)) == NULL) {
/*
* Compute the size of the block we actually want
* to read to be the smaller of a page boundary
* or the ufs acquired block size (i.e., we don't
* want to try and read the next page beyond EOF).
*/
blksz = MIN(roundup(ip->i_size, PAGESIZE) - lbnoff,
blksize(fs, ip, lbn));
if (bn == UFS_HOLE || off >= lbnoff + blksz) {
/*
* Block for this page is not allocated
* and the page was not found.
*/
if (pl != NULL) {
/*
* If we need a page, allocate and
* return a zero page. This assumes
* that for "async" faults it is not
* worth it to create the page now.
*/
pp = rm_allocpage(seg, addr, PAGESIZE, 1);
trace6(TR_SEG_ALLOCPAGE, seg,
(u_int)addr & PAGEMASK, TRC_SEG_UNK,
vp, off, pp);
if (page_enter(pp, vp, off))
panic("ufs_getapage page_enter");
pagezero(pp, 0, PAGESIZE);
page_unlock(pp);
pl[0] = pp;
pl[1] = NULL;
u.u_ru.ru_minflt++;
}
} else {
/*
* Need to really do disk IO to get the page(s).
*/
VFS_RECORD(vp->v_vfsp, VS_GETPAGE, VS_MISS);
pp = pvn_kluster(vp, off, seg, addr, &io_off, &io_len,
lbnoff, blksz, 0);
/*
* Somebody has entered the page before us, so
* just use it.
*/
if (pp == NULL)
goto again;
if (pl != NULL) {
register int sz;
if (plsz >= io_len) {
/*
* Everything fits, set up to load
* up and hold all the pages.
*/
pp2 = pp;
sz = io_len;
} else {
/*
* Set up to load plsz worth
* starting at the needed page.
*/
for (pp2 = pp; pp2->p_offset != off;
pp2 = pp2->p_next) {
ASSERT(pp2->p_next->p_offset !=
pp->p_offset);
}
sz = plsz;
}
ppp = pl;
do {
PAGE_HOLD(pp2);
*ppp++ = pp2;
pp2 = pp2->p_next;
sz -= PAGESIZE;
} while (sz > 0);
*ppp = NULL; /* terminate list */
}
if (nio > 1)
pp->p_nio = nio;
bp = pageio_setup(pp, io_len, devvp, pl == NULL ?
(B_ASYNC | B_READ) : B_READ);
bp->b_dev = dev;
bp->b_blkno = fsbtodb(fs, bn) +
btodb(blkoff(fs, io_off));
bp->b_un.b_addr = 0;
/*
* Zero part of page which we are not
* going to be reading from disk now.
			 * pp->p_prev is usually the same page unless there
			 * is a list of pages, as with exec.
*/
xlen = io_len & PAGEOFFSET;
if (xlen != 0)
pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
(*bdevsw[major(dev)].d_strategy)(bp);
ip->i_nextr = io_off + io_len;
u.u_ru.ru_majflt++;
if (seg == segkmap)
u.u_ru.ru_inblock++; /* count as `read' */
cnt.v_pgin++;
cnt.v_pgpgin += btopr(io_len);
}
}
lbn++;
lbnoff += fs->fs_bsize;
if (do2ndread && !(multi_io && pagefound != NULL)) {
addr_t addr2;
addr2 = addr + (lbnoff - off);
/*
* Compute the size of the block we actually want
* to read to be the smaller of a page boundary
* or the ufs acquired block size (i.e., we don't
* want to try and read the next page beyond EOF).
*/
blksz = MIN(roundup(ip->i_size, PAGESIZE) - lbnoff,
blksize(fs, ip, lbn));
if (multi_io) {
/*
* Second block for same page (bsize < PAGESIZE)
*/
pp2 = pp;
if (nio < 2) {
/*
* The first block was a hole, set up
* the page properly for io now. Otherwise,
* the page should already be marked as
* being paged in with an nio value of 2.
*/
page_lock(pp2);
PAGE_HOLD(pp2);
pp2->p_intrans = 1;
pp2->p_pagein = 1;
}
io_len = blksz;
pgoff = bsize;
} else {
/*
* Read-ahead case (bsize >= PAGESIZE)
* If addr is now in a different seg,
* don't bother trying with read-ahead.
*/
if (addr2 >= seg->s_base + seg->s_size) {
pp2 = NULL;
} else {
pp2 = pvn_kluster(vp, lbnoff, seg, addr2,
&io_off, &io_len, lbnoff, blksz, 1);
}
pgoff = 0;
}
if (pp2 != NULL) {
/*
* Do a synchronous read here only if a page
* list was given to this routine and the
* block size is smaller than the page size.
*/
bp2 = pageio_setup(pp2, io_len, devvp,
(pl != NULL && multi_io) ?
B_READ : (B_ASYNC | B_READ));
bp2->b_dev = dev;
bp2->b_blkno = fsbtodb(fs, bn2);
bp2->b_un.b_addr = (caddr_t)pgoff;
/*
* Zero part of page which we are not
* going to be reading from disk now
* if it hasn't already been done.
*/
xlen = (io_len + pgoff) & PAGEOFFSET;
if ((xlen != 0) && !multi_io)
pagezero(pp2->p_prev, xlen, PAGESIZE - xlen);
(*bdevsw[major(dev)].d_strategy)(bp2);
/*
* Should we bill read ahead to extra faults?
*/
u.u_ru.ru_majflt++;
if (seg == segkmap)
u.u_ru.ru_inblock++; /* count as `read' */
cnt.v_pgin++;
cnt.v_pgpgin += btopr(io_len);
}
}
out:
if (pl == NULL)
return (err);
if (bp != NULL) {
if (err == 0)
err = biowait(bp);
else
(void) biowait(bp);
pageio_done(bp);
}
/*
* Only wait for the second read operation
* when it is required for getting a page.
*/
if (multi_io && bp2 != NULL) {
if (err == 0)
err = biowait(bp2);
else
(void) biowait(bp2);
pageio_done(bp2);
}
if (pagefound != NULL) {
register int s;
/*
* We need to be careful here because if the page was
* previously on the free list, we might have already
* lost it at interrupt level.
*/
s = splvm();
if (pagefound->p_vnode == vp && pagefound->p_offset == off) {
/*
* If the page is still intransit or if
* it is on the free list call page_lookup
* to try and wait for / reclaim the page.
*/
if (pagefound->p_intrans || pagefound->p_free)
pagefound = page_lookup(vp, off);
}
if (pagefound == NULL || pagefound->p_offset != off ||
pagefound->p_vnode != vp || pagefound->p_gone) {
(void) splx(s);
ufs_lostpage++;
goto reread;
}
PAGE_HOLD(pagefound);
(void) splx(s);
pl[0] = pagefound;
pl[1] = NULL;
u.u_ru.ru_minflt++;
ip->i_nextr = off + PAGESIZE;
}
if (err) {
for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
PAGE_RELE(*ppp);
}
return (err);
}
/*
* Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
* If len == 0, do from off to EOF.
*
* The normal cases should be len == 0 & off == 0 (entire vp list),
* len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
* (from pageout).
*
* Note that for ufs it is possible to have dirty pages beyond
* roundup(ip->i_size, PAGESIZE). This can happen if the file
* length is long enough to involve indirect blocks (which are
* always fs->fs_bsize'd) and PAGESIZE < bsize while the length
* is such that roundup(blkoff(fs, ip->i_size), PAGESIZE) < bsize.
*/
/*ARGSUSED*/
int
oldufs_putpage(vp, off, len, flags, cred)
register struct vnode *vp;
u_int off, len;
int flags;
struct ucred *cred;
{
register struct inode *ip;
register struct page *pp;
register struct fs *fs;
struct page *dirty, *io_list;
register u_int io_off, io_len;
daddr_t lbn, bn, bn2;
u_int lbn_off;
int bsize, bsize2;
int vpcount;
int err;
#ifdef VFSSTATS
VFS_RECORD(vp->v_vfsp, VS_PUTPAGE, VS_CALL);
#endif
ip = VTOI(vp);
if (vp->v_pages == NULL || off >= ip->i_size)
return (0);
vpcount = vp->v_count;
VN_HOLD(vp);
fs = ip->i_fs;
again:
/*
* Cannot afford to sleep on inode now, give up
*/
if (ICHECK(ip)) {
err = ENOMEM;
goto errout;
}
/*
* Hold inode lock for duration of push
*/
ilock(ip);
if (len == 0) {
/*
* Search the entire vp list for pages >= off
*/
dirty = pvn_vplist_dirty(vp, off, flags);
} else {
/*
* Do a range from [off...off + len) via page_find.
* We set limits so that we kluster to bsize boundaries.
*/
if (off >= ip->i_size) {
dirty = NULL;
} else {
u_int fsize, eoff;
/*
* Use MAXBSIZE rounding to get indirect block pages
			 * which might be beyond roundup(ip->i_size, PAGESIZE).
*/
fsize = (ip->i_size + MAXBOFFSET) & MAXBMASK;
eoff = MIN(off + len, fsize);
dirty = pvn_range_dirty(vp, off, eoff,
(u_int)(off & fs->fs_bmask),
(u_int)((eoff + fs->fs_bsize - 1) & fs->fs_bmask),
flags);
}
}
/*
	 * Now dirty will have the list of kept dirty pages marked for
	 * write back.  All the pages on the dirty list still need to
	 * be dealt with here.  Verify that we can really do the
	 * write back to the filesystem; if we cannot and we have some
	 * dirty pages, return an error condition.
*/
if (fs->fs_ronly && dirty != NULL)
err = EROFS;
else
err = 0;
if (dirty != NULL) {
/*
* Destroy the read ahead value now
* since we are really going to write
*/
ip->i_nextr = 0;
/*
* If the modified time on the inode has not already been
* set elsewhere (i.e. for write/setattr) or this is
* a call from msync (B_FORCE) we set the time now.
* This gives us approximate modified times for mmap'ed files
* which are modified via stores in the user address space.
*/
if ((ip->i_flag & IMODTIME) == 0 || (flags & B_FORCE) != 0) {
ip->i_flag |= IUPD;
ITIMES(ip);
}
/*
* file system was modified
*/
LOCKFS_SET_MOD(UTOL(ITOU(ip)));
}
/*
* Handle all the dirty pages.
*/
pp = NULL;
while (err == 0 && dirty != NULL) {
/*
* Pull off a contiguous chunk that fits in one lbn.
*/
io_off = dirty->p_offset;
lbn = lblkno(fs, io_off);
bsize = blksize(fs, ip, lbn);
/*
* Normally the blocks should already be allocated for
* any dirty pages, we only need to use S_OTHER
* here and we should not get back a bn == UFS_HOLE.
*/
err = bmap(ip, lbn, &bn, &bn2, bsize, S_OTHER, 1);
if (err) {
break;
}
if (bn == UFS_HOLE) {
if (!IS_SWAPVP(vp) && fs->fs_bsize >= PAGESIZE)
panic("ufs_putpage hole");
/*
* Allocate for "holey" ufs file now.
* XXX - should redo the anon code to
* synchronously insure that all the
* needed backing store is allocated.
*/
err = bmap(ip, lbn, &bn, &bn2, bsize, S_WRITE, 1);
if (err) {
break;
}
ASSERT(bn != UFS_HOLE);
}
VFS_RECORD(vp->v_vfsp, VS_PUTPAGE, VS_MISS);
pp = io_list = dirty;
io_len = PAGESIZE;
lbn_off = lbn << fs->fs_bshift;
page_sub(&dirty, pp);
while (dirty != NULL && dirty->p_offset < lbn_off + bsize &&
dirty->p_offset == io_off + io_len) {
pp = dirty;
page_sub(&dirty, pp);
/*
* Add the page to the end of the list. page_sortadd
* can do this without walking the list.
*/
page_sortadd(&io_list, pp);
io_len += PAGESIZE;
}
/* IO may be asynch, so need to set nio first */
if (fs->fs_bsize < PAGESIZE && ip->i_size > lbn_off + bsize) {
pp->p_nio = lblkno(fs, PAGESIZE);
} else {
pp->p_nio = 0;
/*
* Check for page length rounding problems
*/
if (io_off + io_len > lbn_off + bsize) {
ASSERT((io_off + io_len) - (lbn_off + bsize) <
PAGESIZE);
io_len = lbn_off + bsize - io_off;
}
}
/*
* Should zero any bytes beyond EOF,
* but it's not worth the work now.
*/
/*
* See if we need to do a 2nd bmap operation.
* This is needed if nio is non-zero and we
* didn't get a bn back from the 1st bmap().
*/
if (pp->p_nio) {
ASSERT(pp->p_nio == 2); /* XXX */
++lbn;
bsize2 = blksize(fs, ip, lbn);
if (bn2 == UFS_HOLE) {
/*
* Allocate backing store only if this is
* a swap vnode in case someone is using
* a "holey" ufs swap file with bsize <
* PAGESIZE (e.g., a 4k fs w/ 8k pages).
* XXX - should redo the anon code to
* synchronously insure that all the
* needed backing store is allocated.
*/
err = bmap(ip, lbn, &bn2, (daddr_t *)NULL,
bsize2, IS_SWAPVP(vp)? S_WRITE:S_OTHER, 1);
if (err) {
pvn_fail(pp, B_WRITE | flags);
break;
}
}
if (bn2 == UFS_HOLE)
pp->p_nio = 1;
/*
* Ok, now do it.
*/
err = ufs_writelbn(ip, fsbtodb(fs, bn), pp,
(u_int)bsize, 0, flags);
if (err == 0 && bn2 != UFS_HOLE) {
err = ufs_writelbn(ip, fsbtodb(fs, bn2), pp,
(u_int)bsize2, (u_int)fs->fs_bsize, flags);
pp = NULL;
}
} else {
bn = fsbtodb(fs, bn) + btodb(io_off - lbn_off);
err = ufs_writelbn(ip, bn, io_list, io_len, 0, flags);
pp = NULL;
}
}
iunlock(ip);
if (err != 0) {
if (pp != NULL)
pvn_fail(pp, B_WRITE | flags);
if (dirty != NULL)
pvn_fail(dirty, B_WRITE | flags);
} else if (off == 0 && (len == 0 || len >= ip->i_size)) {
/*
* If doing "synchronous invalidation", make
* sure that all the pages are actually gone.
*/
if ((flags & (B_INVAL | B_ASYNC)) == B_INVAL &&
((vp->v_pages != NULL) && (vp->v_pages->p_lckcnt == 0)))
goto again;
/*
* We have just sync'ed back all the pages
* on the inode, turn off the IMODTIME flag.
*/
ip->i_flag &= ~IMODTIME;
}
/*
* Instead of using VN_RELE here we are careful to only call
* the inactive routine if the vnode reference count is now zero,
* but it wasn't zero coming into putpage. This is to prevent
* recursively calling the inactive routine on a vnode that
* is already considered in the `inactive' state.
* XXX - inactive is a relative term here (sigh).
*/
errout:
if (--vp->v_count == 0 && vpcount > 0)
iinactive(ip);
return (err);
}
#else
oldufs_putpage(vp, off, len, flags, cred)
register struct vnode *vp;
u_int off, len;
int flags;
struct ucred *cred;
{
return (ENOSYS);
}
oldufs_getapage(vp, off, protp, pl, plsz, seg, addr, rw, cred)
struct vnode *vp;
register u_int off;
u_int *protp;
struct page *pl[]; /* NULL if async IO is requested */
u_int plsz;
struct seg *seg;
addr_t addr;
enum seg_rw rw;
struct ucred *cred;
{
return (ENOSYS);
}
#endif /* REMOVE_OLD_UFS */
/*
* ULOCKFS Intercept Routines
* VOP calls are intercepted and wrapped with lockfs code.
*/
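/*
 * Each ufs_l_* routine below expands the ULOCKFS() macro, which is
 * expected to bracket the real ufs_* call with ufs_lockfs_begin() and
 * ufs_lockfs_end() (see further below) for the given VA_* access type,
 * so that an active file system lock can pend the access.
 */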
static int
ufs_l_open(vpp, flag, cred)
struct vnode **vpp;
int flag;
struct ucred *cred;
{
ULOCKFS(*vpp, VA_OPEN,
ufs_open(vpp, flag, cred));
}
static int
ufs_l_close(vp, flag, count, cred)
struct vnode *vp;
int flag;
int count;
struct ucred *cred;
{
ULOCKFS(vp, VA_CLOSE,
ufs_close(vp, flag, count, cred));
}
static int
ufs_l_rdwr(vp, uiop, rw, ioflag, cred)
struct vnode *vp;
struct uio *uiop;
enum uio_rw rw;
int ioflag;
struct ucred *cred;
{
if (rw == UIO_READ) {
ULOCKFS(vp, VA_READ,
ufs_rdwr(vp, uiop, rw, ioflag, cred));
} else {
ULOCKFS(vp, VA_WRITE,
ufs_rdwr(vp, uiop, rw, ioflag, cred));
}
}
static int
ufs_l_select(vp, which, cred)
struct vnode *vp;
int which;
struct ucred *cred;
{
ULOCKFS(vp, VA_SELECT,
ufs_select(vp, which, cred));
}
static int
ufs_l_getattr(vp, vap, cred)
struct vnode *vp;
register struct vattr *vap;
struct ucred *cred;
{
ULOCKFS(vp, VA_GETATTR,
ufs_getattr(vp, vap, cred));
}
static int
ufs_l_setattr(vp, vap, cred)
register struct vnode *vp;
register struct vattr *vap;
struct ucred *cred;
{
if (vap->va_size != (u_long)-1) {
ULOCKFS(vp, VA_TRUNC,
ufs_setattr(vp, vap, cred));
} else {
ULOCKFS(vp, VA_CHANGE,
ufs_setattr(vp, vap, cred));
}
}
static int
ufs_l_access(vp, mode, cred)
struct vnode *vp;
int mode;
struct ucred *cred;
{
ULOCKFS(vp, VA_ACCESS,
ufs_access(vp, mode, cred));
}
static int
ufs_l_readlink(vp, uiop, cred)
struct vnode *vp;
struct uio *uiop;
struct ucred *cred;
{
ULOCKFS(vp, VA_READLINK,
ufs_readlink(vp, uiop, cred));
}
static int
ufs_l_fsync(vp, cred)
struct vnode *vp;
struct ucred *cred;
{
ULOCKFS(vp, VA_FSYNC,
ufs_fsync(vp, cred));
}
static int
ufs_l_inactive(vp, cred)
struct vnode *vp;
struct ucred *cred;
{
ULOCKFS(vp, VA_INACTIVE,
ufs_inactive(vp, cred));
}
static int
ufs_l_lookup(dvp, nm, vpp, cred, pnp, flags)
struct vnode *dvp;
char *nm;
struct vnode **vpp;
struct ucred *cred;
struct pathname *pnp;
int flags;
{
ULOCKFS(dvp, VA_LOOKUP,
ufs_lookup(dvp, nm, vpp, cred, pnp, flags));
}
static int
ufs_l_create(dvp, nm, vap, exclusive, mode, vpp, cred)
struct vnode *dvp;
char *nm;
struct vattr *vap;
enum vcexcl exclusive;
int mode;
struct vnode **vpp;
struct ucred *cred;
{
ULOCKFS(dvp, VA_CREATE,
ufs_create(dvp, nm, vap, exclusive, mode, vpp, cred));
}
static int
ufs_l_remove(vp, nm, cred)
struct vnode *vp;
char *nm;
struct ucred *cred;
{
ULOCKFS(vp, VA_REMOVE,
ufs_remove(vp, nm, cred));
}
static int
ufs_l_link(vp, tdvp, tnm, cred)
struct vnode *vp;
register struct vnode *tdvp;
char *tnm;
struct ucred *cred;
{
ULOCKFS(vp, VA_LINK,
ufs_link(vp, tdvp, tnm, cred));
}
static int
ufs_l_rename(sdvp, snm, tdvp, tnm, cred)
struct vnode *sdvp;
char *snm;
struct vnode *tdvp;
char *tnm;
struct ucred *cred;
{
ULOCKFS(sdvp, VA_RENAME,
ufs_rename(sdvp, snm, tdvp, tnm, cred));
}
static int
ufs_l_mkdir(dvp, nm, vap, vpp, cred)
struct vnode *dvp;
char *nm;
register struct vattr *vap;
struct vnode **vpp;
struct ucred *cred;
{
ULOCKFS(dvp, VA_MKDIR,
ufs_mkdir(dvp, nm, vap, vpp, cred));
}
static int
ufs_l_rmdir(vp, nm, cred)
struct vnode *vp;
char *nm;
struct ucred *cred;
{
ULOCKFS(vp, VA_RMDIR,
ufs_rmdir(vp, nm, cred));
}
static int
ufs_l_readdir(vp, uiop, cred)
struct vnode *vp;
struct uio *uiop;
struct ucred *cred;
{
ULOCKFS(vp, VA_READDIR,
ufs_readdir(vp, uiop, cred));
}
static int
ufs_l_symlink(dvp, lnm, vap, tnm, cred)
register struct vnode *dvp;
char *lnm;
struct vattr *vap;
char *tnm;
struct ucred *cred;
{
ULOCKFS(dvp, VA_SYMLINK,
ufs_symlink(dvp, lnm, vap, tnm, cred));
}
static int
ufs_l_lockctl(vp, ld, cmd, cred, clid)
struct vnode *vp;
struct eflock *ld;
int cmd;
struct ucred *cred;
int clid;
{
ULOCKFS(vp, VA_LOCKCTL,
ufs_lockctl(vp, ld, cmd, cred, clid));
}
static int
ufs_l_fid(vp, fidpp)
struct vnode *vp;
struct fid **fidpp;
{
ULOCKFS(vp, VA_FID,
ufs_fid(vp, fidpp));
}
static int
ufs_l_getpage(vp, off, len, protp, pl, plsz, seg, addr, rw, cred)
struct vnode *vp;
u_int off, len;
u_int *protp;
struct page *pl[];
u_int plsz;
struct seg *seg;
addr_t addr;
enum seg_rw rw;
struct ucred *cred;
{
int vaccess;
if (seg->s_ops != &segvn_ops)
vaccess = VA_GETPRIVATE;
else if (((struct segvn_data *)seg->s_data)->type != MAP_SHARED)
vaccess = VA_GETPRIVATE;
else if (rw == S_OTHER)
vaccess = VA_GETWRITE;
else if ((*seg->s_ops->checkprot)(seg, addr, len, PROT_WRITE) != 0)
vaccess = VA_GETREAD;
else
vaccess = VA_GETWRITE;
ULOCKFS(vp, vaccess,
ufs_getpage(vp, off, len, protp, pl, plsz, seg, addr, rw,
cred));
}
static int
ufs_l_putpage(vp, off, len, flags, cred)
register struct vnode *vp;
u_int off, len;
int flags;
struct ucred *cred;
{
ULOCKFS(vp, VA_PUTPAGE,
ufs_putpage(vp, off, len, flags, cred));
}
static int
ufs_l_map(vp, off, as, addrp, len, prot, maxprot, flags, cred)
struct vnode *vp;
u_int off;
struct as *as;
addr_t *addrp;
u_int len;
u_int prot, maxprot;
u_int flags;
struct ucred *cred;
{
ULOCKFS(vp, VA_MAP,
ufs_map(vp, off, as, addrp, len, prot, maxprot, flags, cred));
}
static int
ufs_l_cntl(vp, cmd, idata, odata, iflag, oflag)
struct vnode *vp;
int cmd, iflag, oflag;
caddr_t idata, odata;
{
ULOCKFS(vp, VA_CNTL,
ufs_cntl(vp, cmd, idata, odata, iflag, oflag));
}
/*
* ULOCKFS ROUTINES
*/
/*
* ufs_lockfs_end
* Called at end of every VOP call
*/
ufs_lockfs_end(vaid, mp)
int vaid;
struct mount *mp;
{
struct ulockfs *ul = mp->m_ul;
/*
* if there are no more of these accesses outstanding
*/
if (--(ul->ul_vacount[vaid]) == 0)
/*
* lock in progress for this access
*/
if (ul->ul_vamask & (1<<vaid))
/*
* awaken locking process
*/
if (ul->ul_flags & ULOCKFS_VAWANT) {
ul->ul_flags &= ~ULOCKFS_VAWANT;
wakeup((caddr_t)mp);
}
}
int lockfs_interruptible = 0;
/*
* ufs_lockfs_begin
 * Called at the beginning of every VOP call
*/
ufs_lockfs_begin(vp, vaid, mpp)
struct vnode *vp;
int vaid;
struct mount **mpp;
{
struct mount *mp = (struct mount *)(vp->v_vfsp->vfs_data);
struct ulockfs *ul = mp->m_ul;
*mpp = mp;
/*
* current lock wants this access pended
*/
while (ul->ul_vamask & (1<<vaid)) {
/*
* can't pend it because it is recursive
* e.g., VOP_RDWR causing VOP_GETPAGE
*/
if ((VTOI(vp)->i_flag & ILOCKED) &&
(u.u_procp == (struct proc *)(VTOI(vp)->i_owner)))
break;
/*
* return EIO if hard locked
*/
if (LOCKFS_IS_HLOCK(UTOL(ul)))
return (EIO);
/*
		 * Don't pend nfsd's.  Return EIO, with EAGAIN in
		 * u.u_XXX[0], and nfsd will drop the request.
*/
if (u.u_XXX[0] == ENOTBLK) {
u.u_XXX[0] = EAGAIN;
return (EIO);
}
if (lockfs_interruptible) {
int smask;
int s;
int interrupted;
/*
* pend access interruptibly (for some signals)
* See rpc/clnt_kudp.c. This is like an nfs mount
* with the intr option.
*/
s = splhigh();
smask = u.u_procp->p_sigmask;
u.u_procp->p_sigmask |= ~(sigmask(SIGHUP) |
sigmask(SIGINT) | sigmask(SIGQUIT) |
sigmask(SIGTERM));
interrupted = sleep((caddr_t)mp, PLOCK+PCATCH);
u.u_procp->p_sigmask = smask;
(void) splx(s);
if (interrupted)
return (EINTR);
} else
(void) sleep((caddr_t)mp, PZERO);
}
/*
* inc 'access in progress' count for this access
*/
ul->ul_vacount[vaid]++;
return (0);
}
/*
* Lock types are really indexes into the lockfs_vamask array.
* The accesses locked by a lock type can be changed by altering
* the mask, or by adding a new mask and incrementing LOCKFS_MAXLOCK.
*/
#define LOCKFS_MAXMASK (32)
u_long lockfs_vamask[LOCKFS_MAXMASK] = { LOCKFS_ULOCK_MASK,
LOCKFS_WLOCK_MASK,
LOCKFS_NLOCK_MASK,
LOCKFS_DLOCK_MASK,
LOCKFS_HLOCK_MASK };
u_long lockfs_maxlock = LOCKFS_MAXLOCK;
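/*
 * Entries beyond the five initializers above default to zero, so such
 * a lock type pends no accesses; lockfs_maxlock bounds the lf_lock
 * value accepted by ufs_getlfd().
 */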
/*
* ufs_fiolfs
* file system locking ioctl handler
*/
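/*
 * Rough sequence (see the step comments in the body): mark the lock
 * busy, copy in and validate the caller's lockfs struct, freeze the
 * file system so new accesses pend, quiesce outstanding accesses,
 * flush (and reconcile when leaving a write lock), then thaw down to
 * the new lock and return the updated lockfs struct to the user.
 */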
static int
ufs_fiolfs(vp, lfup)
struct vnode *vp; /* vnode for some inode on fs */
struct lockfs **lfup; /* address of user lockfs struct */
{
int error; /* error return */
struct mount *mp; /* mount point of vp */
struct ulockfs *ul; /* ulockfs struct for mp */
struct lockfs *lfc; /* lockfs struct in ulockfs */
struct lockfs lfs; /* save current lock */
struct lockfs lfd; /* desired lock */
/*
* must be superuser
*/
if (!suser())
return (EPERM);
/*
* mount point, ufs lockfs, and current lockfs
*/
mp = (struct mount *)(vp->v_vfsp->vfs_data);
ul = mp->m_ul;
lfc = UTOL(ul);
/*
* if not already busy or hlocked, mark lock as busy
*/
if (LOCKFS_IS_BUSY(lfc))
return (EBUSY);
if (LOCKFS_IS_HLOCK(lfc))
return (EIO);
LOCKFS_SET_BUSY(lfc);
/*
* get and check the user's lockfs struct
*/
if (error = ufs_getlfd(vp, lfup, &lfd, lfc))
goto erridle;
/*
* Freeze the file system (pend future accesses)
*/
if (error = ufs_freeze(mp, &lfd, &lfs))
goto erridle;
/*
* Quiesce (wait for outstanding accesses to finish)
*/
if (error = ufs_quiesce(mp))
goto errout;
/*
* at least everything *currently* dirty goes out
*/
if (!LOCKFS_IS_ULOCK(lfc))
if (error = ufs_flush(mp))
goto errout;
/*
* reconcile superblock and inodes if fs was wlock'ed
*/
if (LOCKFS_IS_WLOCK(&lfs))
if (error = ufs_reconcile(mp))
goto errout;
/*
* thaw down to lfd.lf_lock (wakeup pended processes)
*/
if (error = ufs_thaw(mp))
goto errout;
/*
* idle the lock struct
*/
LOCKFS_CLR_BUSY(lfc);
/*
* free current comment
*/
kmem_free((caddr_t)lfs.lf_comment, (u_int)lfs.lf_comlen);
/*
* return status (such as the new key)
*/
return (ufs_fiolfss(vp, lfup));
errout:
/*
* if possible, apply original lock and clean up lock things
*/
ufs_unfreeze(mp, &lfs);
(void) ufs_thaw(mp);
erridle:
LOCKFS_CLR_BUSY(lfc);
return (error);
}
/*
* ufs_fioffs
* ioctl handler for FIOFFS
*/
static int
ufs_fioffs(vp, lfup)
struct vnode *vp; /* some vnode on fs */
struct lockfs **lfup; /* user's struct (must be NULL) */
{
/*
* no struct needed, yet
*/
if (*lfup != NULL)
return (EINVAL);
/*
* at least everything *currently* dirty goes out
*/
return (ufs_flush((struct mount *)(vp->v_vfsp->vfs_data)));
}
/*
* ufs_fiolfss
* ioctl handler for FIOLFSS
*/
static int
ufs_fiolfss(vp, lfup)
struct vnode *vp; /* some vnode on fs */
struct lockfs **lfup; /* user's lockfs struct */
{
int error;
u_int comlen; /* length of user's comment buf */
struct mount *mp;
struct ulockfs *ul;
struct lockfs *lfc; /* current lockfs struct */
struct lockfs lfu; /* copy of user's lockfs struct */
/*
* mount point and ulockfs and lockfs structs
*/
mp = (struct mount *)(vp->v_vfsp->vfs_data);
ul = mp->m_ul;
lfc = UTOL(ul);
/*
* get user's lockfs struct
*/
if (error = copyin((caddr_t)*lfup, (caddr_t)&lfu,
(u_int)(sizeof (struct lockfs))))
goto errout;
/*
* length of comment to return
*/
if (lfu.lf_comlen > lfc->lf_comlen)
comlen = lfc->lf_comlen;
else
comlen = lfu.lf_comlen;
/*
* return current lockfs struct to user
*/
lfu.lf_lock = lfc->lf_lock;
lfu.lf_key = lfc->lf_key;
lfu.lf_flags = lfc->lf_flags;
if (lfu.lf_comlen = comlen)
if (error = copyout(lfc->lf_comment, lfu.lf_comment, comlen))
goto errout;
error = copyout((caddr_t)&lfu, (caddr_t)*lfup,
(u_int)(sizeof (struct lockfs)));
errout:
return (error);
}
/*
* ufs_freeze
* pend future accesses for current lock and desired lock
*/
ufs_freeze(mp, lfd, lfs)
struct mount *mp;
struct lockfs *lfd; /* desired lock */
struct lockfs *lfs; /* save current lock here */
{
struct ulockfs *ul = mp->m_ul;
struct lockfs *lfc = UTOL(ul); /* current lock */
/*
* save current lock
*/
bcopy((caddr_t)lfc, (caddr_t)lfs, (u_int)sizeof (struct lockfs));
/*
* move over selected lock fields into lockfs struct
*/
lfc->lf_lock = lfd->lf_lock;
lfc->lf_key = lfd->lf_key;
lfc->lf_comlen = lfd->lf_comlen;
lfc->lf_comment = lfd->lf_comment;
/*
* pend current and desired lock's vop accesses for now
*/
ul->ul_vamask |= lockfs_vamask[lfc->lf_lock];
return (0);
}
/*
* ufs_unfreeze
* lock failed, reset the old lock
*/
ufs_unfreeze(mp, lfr)
struct mount *mp;
struct lockfs *lfr; /* reset this lock */
{
u_int comlen;
caddr_t comment;
struct ulockfs *ul = mp->m_ul;
struct lockfs *lff = UTOL(ul); /* from this failed lock */
/*
* can't unfreeze a hlock
*/
if (LOCKFS_IS_HLOCK(lff)) {
/*
* free up comment from reset lock
*/
comlen = lfr->lf_comlen;
comment = lfr->lf_comment;
goto errout;
} else {
/*
* free up comment from failed lock
*/
comlen = lff->lf_comlen;
comment = lff->lf_comment;
}
/*
* move over the LOCKFS_MOD flag
*/
if (LOCKFS_IS_MOD(lff))
LOCKFS_SET_MOD(lfr);
/*
* reset lock
*/
bcopy((caddr_t)lfr, (caddr_t)lff, (u_int)sizeof (struct lockfs));
/*
* reset vop access mask
*/
ul->ul_vamask = lockfs_vamask[lfr->lf_lock];
errout:
kmem_free(comment, comlen);
}
/*
* ufs_quiesce
* wait for outstanding accesses to finish
*/
ufs_quiesce(mp)
struct mount *mp; /* mount point */
{
int i; /* index */
u_long vamask; /* access mask */
struct ulockfs *ul = mp->m_ul; /* mp's ulockfs */
/*
* for each access
*/
for (i = 0, vamask = ul->ul_vamask; i < VA_MAX; ++i) {
/*
* if these accesses should finish
*/
if (vamask & (1<<i))
/*
* wait for outstanding ones to finish
*/
while (ul->ul_vacount[i]) {
ul->ul_flags |= ULOCKFS_VAWANT;
if (sleep((caddr_t)mp, PLOCK+PCATCH))
return (EINTR);
}
}
return (0);
}
/*
* ufs_thaw
* thaw file system lock down to current value
*/
ufs_thaw(mp)
struct mount *mp;
{
int error = 0;
struct ulockfs *ul = mp->m_ul;
struct lockfs *lfc = UTOL(ul);
int noidel = ULOCKFS_IS_NOIDEL(ul);
/*
* if wlock or hlock
*/
if (LOCKFS_IS_WLOCK(lfc) || LOCKFS_IS_HLOCK(lfc)) {
/*
* don't keep access times
* don't free deleted files
* if superblock writes are allowed, limit them to me for now
*/
ul->ul_flags |= (ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
if (ul->ul_sbowner != (struct proc *)-1)
ul->ul_sbowner = u.u_procp;
/*
* wait for writes for deleted files and superblock updates
*/
if (error = ufs_flush(mp))
goto errout;
/*
* no one can write the superblock
*/
ul->ul_sbowner = (struct proc *)-1;
/*
* reset modified
*/
LOCKFS_CLR_MOD(lfc);
/*
* special processing for wlock/hlock
*/
if (LOCKFS_IS_WLOCK(lfc))
if (error = ufs_thaw_wlock(mp))
goto errout;
if (LOCKFS_IS_HLOCK(lfc))
while (ufs_thaw_hlock(mp))
if (error = ufs_flush(mp))
goto errout;
} else {
/*
* okay to keep access times
* okay to free deleted files
* okay to write the superblock
*/
ul->ul_flags &= ~(ULOCKFS_NOIACC|ULOCKFS_NOIDEL);
ul->ul_sbowner = NULL;
/*
* flush in case deleted files are in memory
*/
if (noidel)
if (error = ufs_flush(mp))
goto errout;
}
/*
* allow all accesses except those needed for this lock
*/
ul->ul_vamask = lockfs_vamask[lfc->lf_lock];
/*
* wakeup any pended accesses (appropriate ones will sleep again)
*/
errout:
wakeup((caddr_t)mp);
return (error);
}
/*
* ufs_flush
* flush at least everything that is currently dirty
*/
ufs_flush(mp)
struct mount *mp;
{
int error;
int saverror = 0;
struct fs *fs = mp->m_bufp->b_un.b_fs;
union ihead *ih;
struct inode *ip;
ino_t *inop; /* array of ino_t's (0 terminated) */
ino_t *cinop; /* pointer into array */
u_long tino; /* total length of array */
/*
* get rid of dnlc entries
*/
(void) dnlc_purge();
#ifdef QUOTA
/*
* flush quota records
*/
(void) qsync(mp);
#endif /* QUOTA */
/*
* flush and synchronously invalidate page cache and inodes
*/
for (ih = ihead; ih < &ihead[INOHSZ]; ih++) {
ufs_getino(mp, ih, &inop, &tino);
for (cinop = inop; *cinop; ++cinop) {
if (error = iget(mp->m_dev, fs, *cinop, &ip)) {
saverror = error;
continue;
}
if ((error = syncip(ip, B_ASYNC, 0)) == 0)
error = syncip(ip, B_INVAL, 0);
if (error)
saverror = error;
(void) iput(ip);
}
/*
* free the array of inode numbers
*/
if (inop != NULL)
kmem_free((caddr_t)inop, (u_int)tino * sizeof (ino_t));
}
/*
* Push buf cache and block device page cache
*/
if (error = VOP_PUTPAGE(mp->m_devvp, 0, 0, B_ASYNC, u.u_cred))
saverror = error;
(void) bflush(mp->m_devvp);
/*
* synchronously flush superblock and summary info
*/
if (fs->fs_ronly == 0) {
fs->fs_fmod = 0;
(void) sbupdate(mp);
}
/*
* synchronously flush and invalidate buf and page cache
*/
if (error = VOP_PUTPAGE(mp->m_devvp, 0, 0, B_INVAL, u.u_cred))
saverror = error;
(void) bsinval(mp->m_devvp);
/*
* set the clean flag
*/
ufs_checkclean(mp);
return (saverror);
}
/*
* ufs_thaw_wlock
* special processing when thawing down to wlock
*/
ufs_thaw_wlock(mp)
struct mount *mp;
{
int s;
union ihead *ih;
struct inode *ip;
struct vnode *vp;
struct page *pp;
ino_t *inop; /* array of ino_t's (0 terminated) */
ino_t *cinop; /* pointer into array */
u_long tino; /* total entries in inop */
int mlocks = 0;
struct fs *fs = mp->m_bufp->b_un.b_fs;
/*
* look for mlock'ed pages
*/
for (ih = ihead; ih < &ihead[INOHSZ]; ih++) {
ufs_getino(mp, ih, &inop, &tino);
for (cinop = inop; *cinop; ++cinop) {
if (iget(mp->m_dev, fs, *cinop, &ip))
continue;
if (fs->fs_ronly)
ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG);
vp = ITOV(ip);
if ((vp->v_type != VCHR) && (vp->v_type != VSOCK)) {
s = splvm();
if (pp = vp->v_pages)
do {
mlocks += pp->p_lckcnt;
pp = pp->p_vpnext;
} while (pp != vp->v_pages);
(void) splx(s);
}
(void) iput(ip);
}
if (inop != NULL)
kmem_free((caddr_t)inop, (u_int)tino * sizeof (ino_t));
}
return ((mlocks) ? EPERM : 0);
}
/*
* ufs_thaw_hlock
* special processing when thawing down to hlock
*/
ufs_thaw_hlock(mp)
struct mount *mp;
{
int s;
union ihead *ih;
struct inode *ip;
struct vnode *vp;
struct page *pp;
int reflush; /* reflush the file system */
ino_t *inop; /* array of ino_t's (0 terminated) */
ino_t *cinop; /* pointer into array */
u_long tino; /* total entries in inop */
struct fs *fs = mp->m_bufp->b_un.b_fs;
extern u_int pages_pp_locked;
/*
* clear i_flags and page locks and page mods just in case an
* error prevented them from being cleared during ufs_flush()
*/
for (ih = ihead, reflush = 0; ih < &ihead[INOHSZ]; ih++) {
ufs_getino(mp, ih, &inop, &tino);
for (cinop = inop; *cinop; ++cinop) {
if (iget(mp->m_dev, fs, *cinop, &ip))
continue;
ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG);
vp = ITOV(ip);
if ((vp->v_type != VCHR) && (vp->v_type != VSOCK)) {
s = splvm();
if (pp = vp->v_pages)
do {
reflush = 1;
if (pp->p_lckcnt)
--pages_pp_locked;
pp->p_lckcnt = 0;
hat_pagesync(pp);
pp->p_mod = 0;
pp->p_ref = 0;
pp = pp->p_vpnext;
} while (pp != vp->v_pages);
(void) splx(s);
}
(void) iput(ip);
}
if (inop != NULL)
kmem_free((caddr_t)inop, (u_int)tino * sizeof (ino_t));
}
return (reflush);
}
/*
* ufs_reconcile_ip
* reconcile ondisk inode with incore inode
*/
ufs_reconcile_ip(mp, ip)
struct mount *mp;
struct inode *ip; /* incore inode */
{
int i;
int ndaddr;
int niaddr;
struct dinode *dp; /* ondisk inode */
struct buf *bp = NULL;
struct fs *fs = mp->m_bufp->b_un.b_fs;
/*
* BIG BOO-BOO, reconciliation fails
*/
if (ip->i_flag & (IMOD|IMODACC|IACC|IUPD|ICHG))
return (EPERM);
/*
* get the dinode
*/
bp = bread(ip->i_devvp, (daddr_t)fsbtodb(fs, itod(fs, ip->i_number)),
(int)fs->fs_bsize);
if (bp->b_flags & B_ERROR) {
brelse(bp);
return (EIO);
}
dp = bp->b_un.b_dino;
dp += itoo(fs, ip->i_number);
/*
* some fields are not allowed to change
*/
if ((ip->i_mode != dp->di_mode) ||
(ip->i_uid != dp->di_uid) ||
(ip->i_gid != dp->di_gid)) {
brelse(bp);
return (EACCES);
}
/*
* and some are allowed to change
*/
ip->i_size = dp->di_size;
ip->i_ic.ic_flags = dp->di_ic.ic_flags;
ip->i_blocks = dp->di_blocks;
ip->i_gen = dp->di_gen;
ip->i_nlink = dp->di_nlink;
if (ip->i_flag & IFASTSYMLNK) {
ndaddr = 1;
niaddr = 0;
} else {
ndaddr = NDADDR;
niaddr = NIADDR;
}
for (i = 0; i < ndaddr; ++i)
ip->i_db[i] = dp->di_db[i];
for (i = 0; i < niaddr; ++i)
ip->i_ib[i] = dp->di_ib[i];
brelse(bp);
return (0);
}
/*
* ufs_reconcile_inodes
* reconcile all incore inodes for this fs with ondisk inodes
*/
ufs_reconcile_inodes(mp)
struct mount *mp;
{
int error = 0;
struct fs *fs = mp->m_bufp->b_un.b_fs;
union ihead *ih;
struct inode *ip;
u_long tino;
ino_t *inop; /* array of ino_t's */
ino_t *cinop; /* pointer into array */
/*
* scan inode hash and reconcile all inodes found for this fs
*/
for (ih = ihead; (error == 0) && (ih < &ihead[INOHSZ]); ih++) {
ufs_getino(mp, ih, &inop, &tino);
for (cinop = inop; (error == 0) && *cinop; ++cinop) {
if ((error = iget(mp->m_dev, fs, *cinop, &ip)) == 0) {
error = ufs_reconcile_ip(mp, ip);
(void) iput(ip);
}
}
/*
* free the array of inode numbers
*/
if (inop != NULL)
kmem_free((caddr_t)inop, (u_int)tino * sizeof (ino_t));
}
return (error);
}
/*
* ufs_getino
* return array of ino_t's for inodes on the hash for given fs
*/
ufs_getino(mp, ih, inopp, tinop)
struct mount *mp;
union ihead *ih;
ino_t **inopp;
u_long *tinop;
{
struct inode *ip;
struct inode *aip = (struct inode *)ih;
struct fs *fs = mp->m_bufp->b_un.b_fs;
ino_t *inop = NULL;
u_long tino = 16;
u_long nino;
/*
* allocate an array of inode numbers (null terminated)
*/
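	/*
	 * The first pass allocates 32 slots (tino starts at 16 and is
	 * doubled below); whenever a hash chain holds more inodes for
	 * this fs than fit, the array is freed and retried at double
	 * the size.  One slot is always left zero as the terminator.
	 */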
again:
if (inop)
kmem_free((caddr_t)inop, (u_int)tino * sizeof (ino_t));
tino <<= 1;
inop = (ino_t *)kmem_zalloc((u_int)tino * sizeof (ino_t));
/*
* fill in the array from the inodes for fs on hash ih
*/
for (ip = aip->i_forw, nino = 0; ip && ip != aip; ip = ip->i_forw) {
		if (ip->i_fs != fs)
			continue;
if (nino == (tino-1))
goto again;
*(inop + nino++) = ip->i_number;
}
/*
* return the array
*/
*inopp = inop;
*tinop = tino;
}
/*
* ufs_reconcile
* reconcile ondisk superblock/inodes with any incore
*/
ufs_reconcile(mp)
struct mount *mp;
{
int error = 0;
/*
* get rid of as much inmemory data as possible
*/
if (error = ufs_flush(mp))
goto errout;
/*
* reconcile the superblock and inodes
*/
if (error = ufs_reconcile_fs(mp))
goto errout;
if (error = ufs_reconcile_inodes(mp))
goto errout;
/*
* get rid of as much inmemory data as possible
*/
if (error = ufs_flush(mp))
goto errout;
errout:
return (error);
}
/*
* ufs_reconcile_fs
* reconcile incore superblock with ondisk superblock
*/
ufs_reconcile_fs(mp)
struct mount *mp;
{
int i;
int error;
struct fs *mfs; /* in-memory superblock */
struct fs *dfs; /* on-disk superblock */
struct buf *bp; /* on-disk superblock buf */
/*
* BIG BOO-BOO
*/
mfs = mp->m_bufp->b_un.b_fs;
if (mfs->fs_fmod)
return (EPERM);
/*
* get the on-disk copy of the superblock
*/
bp = bread(mp->m_devvp, SBLOCK, (int)mfs->fs_sbsize);
if (bp->b_flags & B_ERROR) {
brelse(bp);
return (EIO);
}
dfs = bp->b_un.b_fs;
/*
* if superblock has changed too much, abort
*/
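	/*
	 * These are static geometry and layout fields; if any of them
	 * differ, the on-disk superblock presumably describes a
	 * different (or rebuilt) file system, so reconciliation is
	 * refused with EACCES rather than adopting state that no longer
	 * matches the incore assumptions.
	 */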
if ((mfs->fs_sblkno != dfs->fs_sblkno) ||
(mfs->fs_cblkno != dfs->fs_cblkno) ||
(mfs->fs_iblkno != dfs->fs_iblkno) ||
(mfs->fs_dblkno != dfs->fs_dblkno) ||
(mfs->fs_cgoffset != dfs->fs_cgoffset) ||
(mfs->fs_cgmask != dfs->fs_cgmask) ||
(mfs->fs_bsize != dfs->fs_bsize) ||
(mfs->fs_fsize != dfs->fs_fsize) ||
(mfs->fs_frag != dfs->fs_frag) ||
(mfs->fs_bmask != dfs->fs_bmask) ||
(mfs->fs_fmask != dfs->fs_fmask) ||
(mfs->fs_bshift != dfs->fs_bshift) ||
(mfs->fs_fshift != dfs->fs_fshift) ||
(mfs->fs_fragshift != dfs->fs_fragshift) ||
(mfs->fs_fsbtodb != dfs->fs_fsbtodb) ||
(mfs->fs_sbsize != dfs->fs_sbsize) ||
(mfs->fs_nindir != dfs->fs_nindir) ||
(mfs->fs_nspf != dfs->fs_nspf) ||
(mfs->fs_trackskew != dfs->fs_trackskew) ||
(mfs->fs_cgsize != dfs->fs_cgsize) ||
(mfs->fs_ntrak != dfs->fs_ntrak) ||
(mfs->fs_nsect != dfs->fs_nsect) ||
(mfs->fs_spc != dfs->fs_spc) ||
(mfs->fs_cpg != dfs->fs_cpg) ||
(mfs->fs_ipg != dfs->fs_ipg) ||
(mfs->fs_fpg != dfs->fs_fpg) ||
(mfs->fs_postblformat != dfs->fs_postblformat) ||
(mfs->fs_magic != dfs->fs_magic)) {
brelse(bp);
return (EACCES);
}
/*
* get new summary info
*/
if (error = ufs_getsummaryinfo(mp, dfs)) {
brelse(bp);
return (error);
}
/*
* release old summary info and update in-memory superblock
*/
kmem_free((caddr_t)mfs->fs_csp[0], (u_int)mfs->fs_cssize);
for (i = 0; i < MAXCSBUFS; ++i)
mfs->fs_csp[i] = dfs->fs_csp[i];
/*
* update fields allowed to change
*/
mfs->fs_size = dfs->fs_size;
mfs->fs_dsize = dfs->fs_dsize;
mfs->fs_ncg = dfs->fs_ncg;
mfs->fs_minfree = dfs->fs_minfree;
mfs->fs_rotdelay = dfs->fs_rotdelay;
mfs->fs_rps = dfs->fs_rps;
mfs->fs_maxcontig = dfs->fs_maxcontig;
mfs->fs_maxbpg = dfs->fs_maxbpg;
mfs->fs_csmask = dfs->fs_csmask;
mfs->fs_csshift = dfs->fs_csshift;
mfs->fs_optim = dfs->fs_optim;
mfs->fs_csaddr = dfs->fs_csaddr;
mfs->fs_cssize = dfs->fs_cssize;
mfs->fs_ncyl = dfs->fs_ncyl;
mfs->fs_cstotal = dfs->fs_cstotal;
/* XXX What to do about sparecon? */
/*
* ondisk clean flag overrides inmemory clean flag iff == FSBAD
*/
if (FSOKAY != (fs_get_state(dfs) + dfs->fs_time))
mfs->fs_clean = FSBAD;
if (dfs->fs_clean == FSBAD)
mfs->fs_clean = FSBAD;
brelse(bp);
return (0);
}
/*
* ufs_getlfd
* copy desired-lock struct from user to kernel space
*/
ufs_getlfd(vp, lfup, lfd, lfc)
struct vnode *vp; /* vnode on fs to be locked */
struct lockfs **lfup; /* address in user space */
struct lockfs *lfd; /* desired lock */
struct lockfs *lfc; /* current lock */
{
int error = 0;
u_int comlen = 0;
caddr_t comment = NULL;
/*
* copy user's lockfs struct into kernel memory
*/
if (error = copyin((caddr_t)*lfup, (caddr_t)lfd,
(u_int)(sizeof (struct lockfs))))
goto errout;
/*
* check key
*/
if (!LOCKFS_IS_ULOCK(lfc))
if (lfd->lf_key != lfc->lf_key) {
error = EINVAL;
goto errout;
}
lfd->lf_key = lfc->lf_key + 1;
/*
* check bounds -- lf_lock is index into array of access masks
*/
if (lfd->lf_lock >= lockfs_maxlock) {
error = EINVAL;
goto errout;
}
/*
* can't wlock fs with accounting or local swap file
*/
if (LOCKFS_IS_WLOCK(lfd)) {
#ifdef SYSACCT
if (error = ufs_checkaccton(vp))
goto errout;
#endif /* SYSACCT */
if (error = ufs_checkswapon(vp))
goto errout;
}
/*
* no input flags defined
*/
if (lfd->lf_flags != 0) {
error = EINVAL;
goto errout;
}
/*
* get comment
*/
if (comlen = lfd->lf_comlen) {
if (comlen > LOCKFS_MAXCOMMENTLEN) {
error = ENAMETOOLONG;
goto errout;
}
comment = (caddr_t)kmem_alloc(comlen);
if (error = copyin(lfd->lf_comment, comment, comlen))
goto errout;
lfd->lf_comment = comment;
}
return (error);
errout:
if (comment)
kmem_free(comment, comlen);
return (error);
}
#ifdef SYSACCT
/*
* ufs_checkaccton
 * check if accounting is turned on for this fs
*/
extern struct vnode *acctp;
extern struct vnode *savacctp;
ufs_checkaccton(vp)
struct vnode *vp;
{
if (acctp && acctp->v_vfsp == vp->v_vfsp)
return (EDEADLK);
if (savacctp && savacctp->v_vfsp == vp->v_vfsp)
return (EDEADLK);
return (0);
}
#endif /* SYSACCT */
/*
* ufs_checkswapon
* check if local swapping is to file on this fs
*/
extern struct swapinfo *swapinfo;
ufs_checkswapon(vp)
struct vnode *vp;
{
struct swapinfo *sip;
for (sip = swapinfo; sip; sip = sip->si_next)
if (sip->si_vp->v_vfsp == vp->v_vfsp)
return (EDEADLK);
return (0);
}
/*
* ufs_lockfs_hold
*/
ufs_lockfs_hold(vfsp)
struct vfs *vfsp;
{
struct mount *mp;
struct ulockfs *ul;
if ((mp = (struct mount *)vfsp->vfs_data) == NULL)
return (EIO);
ul = mp->m_ul;
if (ul->ul_flags & ULOCKFS_FUMOUNT)
return (EIO);
ul->ul_hold++;
return (0);
}
/*
* ufs_lockfs_rele
*/
ufs_lockfs_rele(vfsp)
struct vfs *vfsp;
{
struct ulockfs *ul;
ul = ((struct mount *)(vfsp->vfs_data))->m_ul;
if (ul->ul_hold-- == 1)
if (ul->ul_flags & ULOCKFS_WANT) {
ul->ul_flags &= ~ULOCKFS_WANT;
wakeup((caddr_t)ul);
}
}
/*
* ufs_lockfs_fumount
*/
ufs_lockfs_fumount(ul)
struct ulockfs *ul;
{
ul->ul_flags |= ULOCKFS_FUMOUNT;
while (ul->ul_hold) {
ul->ul_flags |= ULOCKFS_WANT;
		if (sleep((caddr_t)ul, PLOCK+PCATCH)) {
ul->ul_flags &= ~ULOCKFS_FUMOUNT;
return (EINTR);
}
}
return (0);
}
/*
* ufs_fioai
* file allocation information
*/
ufs_fioai(vp, faip)
struct vnode *vp; /* file's vnode */
struct filai **faip; /* user address of struct filai */
{
int error;
int boff; /* offset within file system block */
int na; /* # allocations returned */
int ne; /* # entries left in array */
size_t size; /* byte length of range */
daddr_t off; /* byte offset into file */
daddr_t lbn; /* logical fs block */
daddr_t bn; /* disk sector number */
daddr_t bor; /* beginning of range (sector) */
daddr_t lor; /* length of range (sector) */
daddr_t lof; /* length of file (sector) */
struct filai fai; /* copy of users filai */
struct fs *fs; /* file system (superblock) */
struct inode *ip; /* vnode's inode */
daddr_t *da; /* address of user array */
/*
* inode and superblock
*/
ip = VTOI(vp);
fs = ITOF(ip);
/*
	 * get user's filai struct
*/
if (error = copyin((caddr_t)*faip, (caddr_t)&fai, (u_int)sizeof (fai)))
return (error);
ILOCK(ip);
/*
* range checks
* offset >= 2G || size >= 2G || (offset+size) >= 2G
* offset >= length of file
*
*/
na = 0;
if ((size = fai.fai_size) == 0)
size = ip->i_size - fai.fai_off;
if ((int)fai.fai_off < 0)
goto errrange;
if ((int)size < 0)
goto errrange;
if ((int)(fai.fai_off + size) < 0)
goto errrange;
if (fai.fai_off >= ip->i_size)
goto errrange;
/*
* beginning of range in sectors
* length of range in sectors
* length of file in sectors
*/
bor = btodb(fai.fai_off);
off = dbtob(bor);
lor = btodb(size) + ((size & (DEV_BSIZE-1)) ? 1 : 0);
lof = btodb(ip->i_size) + ((ip->i_size & (DEV_BSIZE-1)) ? 1 : 0);
if (lof < (bor + lor))
lor = lof - bor;
/*
* return allocation info until:
* array fills
* range is covered (end of file accounted for above)
*/
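	/*
	 * Each bmap() below resolves one file system block; since the
	 * frags of a block are contiguous on disk, the inner loop hands
	 * back one disk address per DEV_BSIZE sector without re-mapping
	 * (e.g., with an 8k block size and 512-byte sectors that is up
	 * to 16 entries per bmap() call).
	 */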
ne = fai.fai_num;
da = fai.fai_daddr;
while (lor && ne) {
/*
* file system block and offset within block
*/
lbn = lblkno(fs, off);
boff = blkoff(fs, off);
/*
* get frag address and convert to disk address
*/
if (error = bmap(ip, lbn, &bn, (daddr_t *)NULL,
DEV_BSIZE, S_READ, 1))
goto errout;
if (bn == UFS_HOLE)
bn = FILAI_HOLE;
else
bn = fsbtodb(fs, bn) + btodb(boff);
/*
* return disk addresses.
* (file system blocks are contiguous on disk)
*/
do {
if (error = suword((caddr_t)da, (int)bn))
goto errout;
if (bn != FILAI_HOLE)
bn++;
off += DEV_BSIZE;
na++;
da++;
lor--;
ne--;
} while ((lbn == lblkno(fs, off)) && lor && ne);
}
/*
* update # of entries returned and current offset
*/
fai.fai_off = off;
errrange:
fai.fai_num = na;
if (error = copyout((caddr_t)&fai, (caddr_t)*faip, sizeof (fai)))
goto errout;
errout:
IUNLOCK(ip);
return (error);
}
/*
* ufs_fiodutimes
* set access/modified times but not change time. Also, delay the update
*/
ufs_fiodutimes(vp, tvp)
struct vnode *vp; /* file's vnode */
struct timeval **tvp; /* user address of struct timeval */
{
int error = 0;
struct inode *ip;
struct timeval tv;
if (!suser())
return (EPERM);
ip = VTOI(vp);
/*
* if NULL, use current time
*/
if (*tvp) {
if (error = copyin((caddr_t)*tvp, (caddr_t)&tv, sizeof (tv)))
return (error);
if (tv.tv_usec < 0 || tv.tv_usec >= 1000000)
return (EINVAL);
} else
tv = time;
ILOCK(ip);
ITIMES(ip);
ip->i_atime = tv;
ip->i_flag |= IMODACC;
IUNLOCK(ip);
return (error);
}
/*
* ufs_fiodios
* return status of metadata updates
*/
ufs_fiodios(vp, diop)
struct vnode *vp; /* file's vnode */
u_long **diop; /* m_dio returned here */
{
u_long dio;
dio = (ITOM(VTOI(vp)))->m_dio & ~MDIO_LOCK;
return (suword((caddr_t)*diop, (int)(dio)));
}
/*
* ufs_fiodio
* sandbag metadata updates
*/
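/*
 * A reading of the code below: dio may only be 0 or 1; while it is on
 * (MDIO_ON, presumably the low bit) or when the flush fails, the
 * on-disk clean flag is left at FSSUSPEND rather than FSACTIVE,
 * marking the file system as one whose metadata updates are being
 * held back.
 */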
ufs_fiodio(vp, diop)
struct vnode *vp; /* file's vnode */
u_long **diop; /* dio flags */
{
int error;
int clean;
u_long dio;
struct inode *ip;
struct fs *fs;
struct mount *mp;
struct buf *bp;
/*
* check input conditions
*/
if (!suser())
return (EPERM);
error = copyin((caddr_t)*diop, (caddr_t)&dio, (u_int)(sizeof (u_long)));
if (error)
return (error);
if (dio > 1)
return (EINVAL);
/*
* setup
*/
ip = VTOI(vp);
fs = ITOF(ip);
mp = ITOM(ip);
/*
* lock access to the dio field
*/
while (mp->m_dio & MDIO_LOCK)
if (sleep((caddr_t)mp, PLOCK+PCATCH))
return (EINTR);
if (mp->m_dio == dio)
goto out;
mp->m_dio = dio | MDIO_LOCK;
/*
* enable/disable clean flag processing
*/
if ((mp->m_dio & MDIO_ON) || (ufs_flush(mp)))
clean = FSSUSPEND;
else
clean = FSACTIVE;
if (fs->fs_ronly == 0) {
bp = getblk(mp->m_devvp, SBLOCK, (int)fs->fs_sbsize);
if (fs->fs_clean != FSBAD) {
fs->fs_clean = clean;
ufs_sbwrite(mp, fs, bp);
} else
brelse(bp);
}
out:
mp->m_dio &= ~MDIO_LOCK;
wakeup((caddr_t)mp);
return (0);
}