#ident "@(#)nfs_vnodeops.c 1.1 94/10/31 SMI"
/*
* Copyright (c) 1988 by Sun Microsystems, Inc.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/vfs_stat.h>
#include <sys/file.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/kernel.h>
#include <sys/mman.h>
#include <netinet/in.h>
#include <sys/proc.h>
#include <sys/pathname.h>
#include <sys/dirent.h>
#include <sys/conf.h>
#include <sys/debug.h>
#include <sys/unistd.h>
#include <sys/mount.h>
#include <sys/vmmeter.h>
#include <sys/trace.h>
#include <sys/syslog.h>
#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>
#include <rpc/xdr.h>
#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/rnode.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/rm.h>
#include <vm/swap.h>
#include <krpc/lockmgr.h>
#ifdef NFSDEBUG
extern int nfsdebug;
#endif
struct vnode *makenfsnode();
struct vnode *dnlc_lookup();
char *newname();
int setdirgid();
u_int setdirmode();
/*
* Do close to open consistency checking on all filesystems.
* If this boolean is false, CTO checking can be selectively
* turned off by setting actimeo to -1 at mount time.
*/
int nfs_cto = 1;
/*
* Error flags used to pass information about certain special errors
* back from do_bio() to nfs_getapage() (yuck).
*/
#define NFS_CACHEINVALERR -99
#define NFS_EOF -98
#define ISVDEV(t) ((t == VBLK) || (t == VCHR) || (t == VFIFO))
/*
* These are the vnode ops routines which implement the vnode interface to
* the networked file system. These routines just take their parameters,
* make them look networkish by putting the right info into interface structs,
* and then calling the appropriate remote routine(s) to do the work.
*
* Note on directory name lookup caching: we desire that all operations
* on a given client machine come out the same with or without the cache.
* To correctly do this, we serialize all operations on a given directory,
* by using RLOCK and RUNLOCK around rfscalls to the server. This way,
* we cannot get into races with ourself that would cause invalid information
* in the cache. Other clients (or the server itself) can cause our
* cached information to become invalid, the same as with data pages.
* Also, if we do detect a stale fhandle, we purge the directory cache
* relative to that vnode. This way, the user won't get burned by the
* cache repeatedly.
*/
static int nfs_open();
static int nfs_close();
static int nfs_rdwr();
static int nfs_ioctl();
static int nfs_select();
static int nfs_getattr();
static int nfs_setattr();
static int nfs_access();
static int nfs_lookup();
static int nfs_create();
static int nfs_remove();
static int nfs_link();
static int nfs_rename();
static int nfs_mkdir();
static int nfs_rmdir();
static int nfs_readdir();
static int nfs_symlink();
static int nfs_readlink();
static int nfs_fsync();
static int nfs_inactive();
static int nfs_lockctl();
static int nfs_noop();
static int nfs_getpage();
static int nfs_putpage();
static int nfs_map();
static int nfs_dump();
static int nfs_cmp();
static int nfs_realvp();
static int nfs_cntl();
struct vnodeops nfs_vnodeops = {
nfs_open,
nfs_close,
nfs_rdwr,
nfs_ioctl,
nfs_select,
nfs_getattr,
nfs_setattr,
nfs_access,
nfs_lookup,
nfs_create,
nfs_remove,
nfs_link,
nfs_rename,
nfs_mkdir,
nfs_rmdir,
nfs_readdir,
nfs_symlink,
nfs_readlink,
nfs_fsync,
nfs_inactive,
nfs_lockctl,
nfs_noop,
nfs_getpage,
nfs_putpage,
nfs_map,
nfs_dump,
nfs_cmp,
nfs_realvp,
nfs_cntl,
};
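/*
 * Open a file.  When close-to-open consistency is in effect, get fresh
 * attributes from the server so that stale cached data can be detected.
 * Also track the number of outstanding opens on the rnode and discard
 * a leftover credential when a new open starts with no pages cached.
 */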
/*ARGSUSED*/
static int
nfs_open(vpp, flag, cred)
register struct vnode **vpp;
int flag;
struct ucred *cred;
{
int error;
struct vattr va;
register struct rnode *rp;
#ifdef NFSDEBUG
dprint(nfsdebug, 4, "nfs_open %s %x flag %d\n",
vtomi(*vpp)->mi_hostname, *vpp, flag);
#endif
VFS_RECORD((*vpp)->v_vfsp, VS_OPEN, VS_CALL);
error = 0;
/*
* if close-to-open consistency checking is turned off
* we can avoid the over the wire getattr.
*/
if (nfs_cto || !vtomi(*vpp)->mi_nocto) {
/*
* Force a call to the server to get fresh attributes
* so we can check caches. This is required for close-to-open
* consistency.
*/
error = nfs_getattr_otw(*vpp, &va, cred);
if (error == 0) {
nfs_cache_check(*vpp, va.va_mtime, va.va_size);
nfs_attrcache_va(*vpp, &va);
} else if (error == ESTALE) {
/*
* If we fail because of a stale NFS file handle,
* restart the system call (nfs_getattr_otw will
* have flushed all relevant caches).
*/
u.u_eosys = RESTARTSYS;
}
}
/*
* Track outstanding opens so that we know when to free the
* credential on the rnode. We cannot use the vnode reference
* count because it can be nonzero even if the file is not
* open (e.g. the dnlc).
*/
if (error == 0) {
rp = vtor(*vpp);
/*
* If there were no outstanding opens before now, and there
* was a cred on this rnode, and there are no outstanding
* pages, then we should remove the cred. It is possible
* the previous cred did not have the same access to
* the file that this user does.
*/
if (rp->r_opencnt++ == 0 && rp->r_cred &&
(*vpp)->v_pages == NULL) {
cred = rp->r_cred;
rp->r_cred = NULL;
crfree(cred);
}
}
return (error);
}
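/*
 * Close a file.  Only the last close of the file structure does any
 * work: unlinked-open files and files with pending write errors have
 * their pages invalidated, and files open for writing are flushed to
 * give "sync on close" semantics when close-to-open checking is on.
 */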
static int
nfs_close(vp, flag, count, cred)
struct vnode *vp;
int flag;
int count;
struct ucred *cred;
{
register struct rnode *rp;
int error = 0;
VFS_RECORD(vp->v_vfsp, VS_CLOSE, VS_CALL);
if (count > 1)
return (0);
#ifdef NFSDEBUG
dprint(nfsdebug, 4, "nfs_close %s %x flag %d\n",
vtomi(vp)->mi_hostname, vp, flag);
#endif
VFS_RECORD(vp->v_vfsp, VS_CLOSE, VS_MISS);
rp = vtor(vp);
/*
* If the file is an unlinked file, then flush the lookup
* cache so that nfs_inactive will be called if this is
* the last reference. Otherwise, if close-to-open
* consistency is turned on and the file was open
* for writing or we had an asynchronous write error, we
* force the "sync on close" semantic by calling nfs_putpage.
*/
if (rp->r_unldvp != NULL || rp->r_error) {
(void) nfs_putpage(vp, 0, 0, B_INVAL, cred);
dnlc_purge_vp(vp);
error = rp->r_error;
rp->r_error = 0;
} else if ((nfs_cto || !vtomi(vp)->mi_nocto) &&
(flag & FWRITE)) {
(void) nfs_putpage(vp, 0, 0, 0, cred);
if (rp->r_error) {
(void) nfs_putpage(vp, 0, 0, B_INVAL, cred);
dnlc_purge_vp(vp);
error = rp->r_error;
rp->r_error = 0;
}
}
/*
* Remove this open reference.
*/
rp->r_opencnt--;
return (flag & FWRITE? error : 0);
}
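/*
 * Read from or write to a regular file.  Data normally moves through
 * the page cache via segkmap, one MAXBSIZE window at a time.  Files
 * marked VNOCACHE (by file locking) bypass VM on writes and go
 * directly over the wire with nfswrite.
 */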
static int
nfs_rdwr(vp, uio, rw, ioflag, cred)
register struct vnode *vp;
struct uio *uio;
enum uio_rw rw;
int ioflag;
struct ucred *cred;
{
int error = 0;
struct rnode *rp;
u_int off;
addr_t base;
u_int flags;
register int n, on;
int eof = 0;
int adjust_resid = 0;
#ifdef NFSDEBUG
dprint(nfsdebug, 4,
"nfs_rdwr: %s %x rw %s offset %x len %d cred 0x%x\n",
vtomi(vp)->mi_hostname, vp, rw == UIO_READ ? "READ" : "WRITE",
uio->uio_offset, uio->uio_iov->iov_len, cred);
#endif
if (vp->v_type != VREG) {
return (EISDIR);
}
if (uio->uio_resid == 0) {
return (0);
}
rp = vtor(vp);
if (rw == UIO_WRITE || (rw == UIO_READ && rp->r_cred == NULL)) {
crhold(cred);
if (rp->r_cred) {
crfree(rp->r_cred);
}
rp->r_cred = cred;
}
#ifdef notdef
if (ioflag & IO_UNIT) {
RLOCK(rp);
}
#endif
/* Fix bug 1045993, huey */
if (vp->v_flag & VNOCACHE) {
struct vattr va;
error = nfs_getattr_otw(vp, &va, cred);
if (error)
goto out;
}
if ((ioflag & IO_APPEND) && rw == UIO_WRITE) {
struct vattr va;
RLOCK(rp);
error = nfsgetattr(vp, &va, cred);
if (error)
goto out;
uio->uio_offset = va.va_size;
}
if (uio->uio_offset < 0 || (uio->uio_offset + uio->uio_resid) < 0) {
error = EINVAL;
goto out;
}
if (rw == UIO_WRITE &&
uio->uio_offset+uio->uio_resid >
u.u_rlimit[RLIMIT_FSIZE].rlim_cur) {
if (uio->uio_offset >= u.u_rlimit[RLIMIT_FSIZE].rlim_cur) {
psignal(u.u_procp, SIGXFSZ);
error = EFBIG;
goto out;
} else {
adjust_resid = uio->uio_resid;
uio->uio_resid = u.u_rlimit[RLIMIT_FSIZE].rlim_cur -
uio->uio_offset;
adjust_resid -= uio->uio_resid;
}
}
RLOCK(rp);
do {
off = uio->uio_offset & MAXBMASK; /* mapping offset */
on = uio->uio_offset & MAXBOFFSET; /* Relative offset */
n = MIN(MAXBSIZE - on, uio->uio_resid);
if (rw == UIO_READ) {
int diff;
VFS_RECORD(vp->v_vfsp, VS_READ, VS_CALL);
if (!(vp->v_flag & VNOCACHE) && page_find(vp, off)) {
(void) nfs_validate_caches(vp, cred);
}
diff = rp->r_size - uio->uio_offset;
if (diff <= 0) {
break;
}
if (diff < n) {
n = diff;
eof = 1;
}
} else { /* UIO_WRITE */
/*
* Keep returning errors on rnode until
* rnode goes away.
*/
/*
 * Fix Bug 1030884:
 * Completely gross hack to make writes to the middle
 * of a file succeed if the remote filesystem is full.
 * Hack also in place in do_bio().
 */
if (rp->r_error && !(rp->r_error == ENOSPC
&& rp->r_attr.va_size >=
off + n)){
error = rp->r_error;
break;
}
VFS_RECORD(vp->v_vfsp, VS_WRITE, VS_CALL);
/*
* For file locking: bypass VM to retain consistency
* in case only part of the file is locked and we don't
* want to write a whole page.
*
* XXX: size of the kmem_alloc may affect performance
*/
if (vp->v_flag & VNOCACHE) {
caddr_t buf;
int count, org_offset;
u_int bufsize = MIN(uio->uio_resid, PAGESIZE);
buf = new_kmem_alloc(bufsize, KMEM_SLEEP);
while ((uio->uio_resid > 0) && (!error)) {
count = MIN(uio->uio_resid, PAGESIZE);
org_offset = (int) uio->uio_offset;
error = uiomove(buf, count, UIO_WRITE,
uio);
rp->r_error = error = nfswrite(vp, buf,
(u_int)org_offset,
(long)count, rp->r_cred);
}
kmem_free(buf, bufsize);
break;
}
}
base = segmap_getmap(segkmap, vp, off);
error = (rw == UIO_READ)? uiomove(base + on, n, UIO_READ, uio):
writerp(vp, (base + on), n, uio);
if (error == 0) {
flags = 0;
if (rw == UIO_WRITE) {
/*
* Invalidate if entry is not to be cached.
*/
if (vp->v_flag & VNOCACHE)
flags = SM_WRITE | SM_INVAL;
else {
rp->r_flags |= RDIRTY;
if (n + on == MAXBSIZE ||
IS_SWAPVP(vp)) {
/*
* Have written a whole block.
* Start an asynchronous write
* and mark the buffer to
* indicate that it won't be
* needed again soon.
*/
flags = SM_WRITE | SM_ASYNC |
SM_DONTNEED;
}
}
if (ioflag & IO_SYNC) {
flags &= ~SM_ASYNC;
flags |= SM_WRITE;
}
} else {
if (vp->v_flag & VNOCACHE)
flags = SM_INVAL;
else {
/*
* If read a whole block or read to eof,
* won't need this buffer again soon.
*/
if (n + on == MAXBSIZE ||
uio->uio_offset == rp->r_size)
flags = SM_DONTNEED;
}
}
error = segmap_release(segkmap, base, flags);
} else {
(void) segmap_release(segkmap, base, 0);
}
} while (error == 0 && uio->uio_resid > 0 && !eof);
RUNLOCK(rp);
if (!error && adjust_resid) {
uio->uio_resid = adjust_resid;
psignal (u.u_procp, SIGXFSZ);
}
out:
if ((ioflag & IO_APPEND) && rw == UIO_WRITE) {
RUNLOCK(rp);
}
#ifdef notdef
if (ioflag & IO_UNIT) {
RUNLOCK(rp);
}
#endif
#ifdef NFSDEBUG
dprint(nfsdebug, 5, "nfs_rdwr returning %d\n", error);
#endif
return (error);
}
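/*
 * Copy write data from the uio into the kernel mapping set up by
 * nfs_rdwr, at most a page at a time.  Pages that will be entirely
 * overwritten, or that start a write at or beyond end of file, are
 * created without being read from the server; any part left
 * uninitialized is zeroed.
 */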
static int
writerp(vp, base, tcount, uio)
struct vnode *vp;
addr_t base; /* base address kernel addr space */
int tcount; /* Total bytes to move - < MAXBSIZE */
struct uio *uio;
{
struct rnode *rp = vtor(vp);
int pagecreate;
register int n;
register int offset;
int error;
#ifdef NFSDEBUG
dprint(nfsdebug, 4,
"writerp: vp 0x%x base 0x%x offset %d tcount %d\n", vp, base,
uio->uio_offset, tcount);
#endif
ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
ASSERT(((u_int)base & MAXBOFFSET) + tcount <= MAXBSIZE);
/*
* Move bytes in at most PAGESIZE chunks. We must avoid
* spanning pages in uiomove() because page faults may cause
* the cache to be invalidated out from under us. The r_size is not
* updated until after the uiomove. If we push the last page of a
* file before r_size is correct, we will lose the data written past
* the current (and invalid) r_size.
*/
do {
offset = uio->uio_offset;
pagecreate = 0;
/*
* n is the number of bytes required to satisfy the request
* or the number of bytes to fill out the page.
*/
n = MIN((PAGESIZE - ((u_int)base & PAGEOFFSET)), tcount);
/*
* Check to see if we can skip reading in the page
* and just allocate the memory. We can do this
* if we are going to rewrite the entire mapping
* or if we are going to write to or beyond the current
* end of file from the beginning of the mapping.
*/
if (((u_int)base & PAGEOFFSET) == 0 && (n == PAGESIZE ||
((offset + n) >= rp->r_size))) {
segmap_pagecreate(segkmap, base, (u_int)n, 0);
pagecreate = 1;
}
error = uiomove(base, n, UIO_WRITE, uio);
n = uio->uio_offset - offset; /* n = # of bytes written */
base += n;
tcount -= n;
/*
* If we created pages w/o initializing them completely,
* we need to zero the part that wasn't set up.
* This happens in most EOF write cases and if
* we had some sort of error during the uiomove.
*/
if (pagecreate &&
((uio->uio_offset & PAGEOFFSET) || n == 0)) {
(void) kzero(base, (u_int)(PAGESIZE - n));
}
/*
* r_size is the maximum number of
* bytes known to be in the file.
* Make sure it is at least as high as the
* last byte we just wrote into the buffer.
*/
if (rp->r_size < uio->uio_offset) {
rp->r_size = uio->uio_offset;
}
} while (tcount > 0 && error == 0);
#ifdef NFSDEBUG
dprint(nfsdebug, 5, "writerp returning %d\n", error);
#endif
return (error);
}
/*
* Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
*/
static int
nfs_writelbn(rp, pp, off, len, flags)
register struct rnode *rp;
struct page *pp;
u_int off;
u_int len;
int flags;
{
struct buf *bp;
int err;
bp = pageio_setup(pp, len, rtov(rp), B_WRITE | flags);
if (bp == NULL) {
pvn_fail(pp, B_WRITE | flags);
return (ENOMEM);
}
bp->b_dev = 0;
bp->b_blkno = btodb(off);
bp_mapin(bp);
err = nfs_strategy(bp);
u.u_ru.ru_oublock++;
#ifdef NFSDEBUG
dprint(nfsdebug, 5,
"nfs_writelbn %s blkno %d pp %x len %d flags %x error %d\n",
vtomi(rtov(rp))->mi_hostname, btodb(off), pp, len, flags, err);
#endif
return (err);
}
/*
* Write to file. Writes to remote server in largest size
* chunks that the server can handle. Write is synchronous.
*/
static int
nfswrite(vp, base, offset, count, cred)
struct vnode *vp;
caddr_t base;
u_int offset;
long count;
struct ucred *cred;
{
int error;
struct nfswriteargs wa;
struct nfsattrstat *ns;
int tsize;
#ifdef NFSDEBUG
dprint(nfsdebug, 4, "nfswrite %s %x offset = %d, count = %d\n",
vtomi(vp)->mi_hostname, vp, offset, count);
#endif
VFS_RECORD(vp->v_vfsp, VS_PUTPAGE, VS_MISS);
ns = (struct nfsattrstat *)new_kmem_alloc(sizeof (*ns), KMEM_SLEEP);
/*
* Temporarily invalidate attr cache since we know mtime will change
*/
INVAL_ATTRCACHE(vp);
do {
tsize = MIN(vtomi(vp)->mi_curwrite, count);
wa.wa_data = base;
wa.wa_fhandle = *vtofh(vp);
wa.wa_begoff = offset;
wa.wa_totcount = tsize;
wa.wa_count = tsize;
wa.wa_offset = offset;
error = rfscall(vtomi(vp), RFS_WRITE, xdr_writeargs,
(caddr_t)&wa, xdr_attrstat, (caddr_t)ns, cred);
if (error == ENFS_TRYAGAIN) {
error = 0;
continue;
}
if (!error) {
error = geterrno(ns->ns_status);
/*
* Can't check for stale fhandle and purge caches
* here because pages are held by nfs_getpage.
*/
}
#ifdef NFSDEBUG
dprint(nfsdebug, 3, "nfswrite: sent %d of %d, error %d\n",
tsize, count, error);
#endif
count -= tsize;
base += tsize;
offset += tsize;
} while (!error && count);
if (!error) {
nfs_attrcache(vp, &ns->ns_attr);
} else {
/*
* Since we invalidated the cache above without first
* purging cached pages we have to put it back in the
* "timed-out" state.
*/
PURGE_ATTRCACHE(vp);
}
kmem_free((caddr_t)ns, sizeof (*ns));
switch (error) {
case 0:
case EDQUOT:
case EINTR:
break;
case ENOSPC:
printf("NFS write error: on host %s remote file system full\n",
vtomi(vp)->mi_hostname);
break;
default:
printf("NFS write error %d on host %s fh ",
error, vtomi(vp)->mi_hostname);
printfhandle((caddr_t)vtofh(vp));
printf("\n");
break;
}
#ifdef NFSDEBUG
dprint(nfsdebug, 5, "nfswrite: returning %d\n", error);
#endif
return (error);
}
/*
* Print a file handle
*/
printfhandle(fh)
caddr_t fh;
{
int i;
int fhint[NFS_FHSIZE / sizeof (int)];
bcopy(fh, (caddr_t)fhint, sizeof (fhint));
for (i = 0; i < (sizeof (fhint) / sizeof (int)); i++) {
printf("%x ", fhint[i]);
}
}
/*
* Read from a file. Reads data in largest chunks our interface can handle.
*/
static int
nfsread(vp, base, offset, count, residp, cred, vap)
struct vnode *vp;
caddr_t base;
u_int offset;
long count;
long *residp;
struct ucred *cred;
struct vattr *vap;
{
int error;
struct nfsreadargs ra;
struct nfsrdresult rr;
register int tsize;
#ifdef NFSDEBUG
dprint(nfsdebug, 4, "nfsread %s %x offset = %d, totcount = %d\n",
vtomi(vp)->mi_hostname, vp, offset, count);
#endif
VFS_RECORD(vp->v_vfsp, VS_GETPAGE, VS_MISS);
do {
do {
tsize = MIN(vtomi(vp)->mi_curread, count);
rr.rr_data = base;
ra.ra_fhandle = *vtofh(vp);
ra.ra_offset = offset;
ra.ra_totcount = tsize;
ra.ra_count = tsize;
error = rfscall(vtomi(vp), RFS_READ,
xdr_readargs, (caddr_t)&ra,
xdr_rdresult, (caddr_t)&rr,
cred);
} while (error == ENFS_TRYAGAIN);
if (!error) {
error = geterrno(rr.rr_status);
/*
* Can't purge caches here because pages are held by
* nfs_getpage.
*/
}
#ifdef NFSDEBUG
dprint(nfsdebug, 3, "nfsread: got %d of %d, error %d\n",
tsize, count, error);
#endif
if (!error) {
count -= rr.rr_count;
base += rr.rr_count;
offset += rr.rr_count;
}
} while (!error && count && rr.rr_count == tsize);
*residp = count;
if (!error) {
nattr_to_vattr(vp, &rr.rr_attr, vap);
}
#ifdef NFSDEBUG
dprint(nfsdebug, 5, "nfsread: returning %d, resid %d\n",
error, *residp);
#endif
return (error);
}
/*ARGSUSED*/
static int
nfs_ioctl(vp, com, data, flag, cred)
struct vnode *vp;
int com;
caddr_t data;
int flag;
struct ucred *cred;
{
VFS_RECORD(vp->v_vfsp, VS_IOCTL, VS_CALL);
return (EOPNOTSUPP);
}
/*ARGSUSED*/
static int
nfs_select(vp, which, cred)
struct vnode *vp;
int which;
struct ucred *cred;
{
VFS_RECORD(vp->v_vfsp, VS_SELECT, VS_CALL);
return (EOPNOTSUPP);
}
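/*
 * Get file attributes.  Dirty pages are pushed to the server first so
 * that the modification time returned is correct.
 */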
static int
nfs_getattr(vp, vap, cred)
struct vnode *vp;
struct vattr *vap;
struct ucred *cred;
{
int error;
struct rnode *rp;
#ifdef NFSDEBUG
dprint(nfsdebug, 4, "nfs_getattr %s %x\n", vtomi(vp)->mi_hostname, vp);
#endif
rp = vtor(vp);
if (rp->r_flags & RDIRTY) {
/*
* Since we know we have pages which are dirty because
* we went thru rwvp for writing, we sync pages so the
* mod time is right. Note that if a page which is mapped
* in user land is modified, the page will not be flushed
* until the next sync or appropriate fsync or msync operation.
*/
(void) nfs_putpage(vp, 0, 0, 0, cred);
}
error = nfsgetattr(vp, vap, cred);
#ifdef NFSDEBUG
dprint(nfsdebug, 5, "nfs_getattr: returns %d\n", error);
#endif
return (error);
}
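/*
 * Set file attributes.  A size change truncates cached pages locally
 * before the SETATTR request is sent to the server.
 */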
static int
nfs_setattr(vp, vap, cred)
register struct vnode *vp;
register struct vattr *vap;
struct ucred *cred;
{
int error;
struct nfssaargs args;
struct nfsattrstat *ns;
#ifdef NFSDEBUG
dprint(nfsdebug, 4, "nfs_setattr %s %x\n", vtomi(vp)->mi_hostname, vp);
#endif
VFS_RECORD(vp->v_vfsp, VS_SETATTR, VS_CALL);
ns = (struct nfsattrstat *)new_kmem_alloc(sizeof (*ns), KMEM_SLEEP);
if ((vap->va_nlink != -1) || (vap->va_blocksize != -1) ||
(vap->va_rdev != -1) || (vap->va_blocks != -1) ||
(vap->va_ctime.tv_sec != -1) || (vap->va_ctime.tv_usec != -1)) {
error = EINVAL;
} else {
RLOCK(vtor(vp));
if (vap->va_size != -1) {
#ifdef TRACE
{
struct vattr oldvap;
nfsgetattr(vp, &oldvap, cred);
trace3(TR_MP_TRUNC, vp, vap->va_size,
oldvap.va_size);
}
#endif /* TRACE */
pvn_vptrunc(vp, (u_int)vap->va_size, (u_int)(PAGESIZE -
(vap->va_size & PAGEOFFSET)));
(vtor(vp))->r_size = vap->va_size;
}
(void) nfs_putpage(vp, 0, 0, 0, cred);
RUNLOCK(vtor(vp));
/*
* Allow SysV-compatible option to set access and
* modified times if root, owner, or write access.
*
* XXX - For now, va_mtime.tv_usec == -1 flags this.
*
* XXX - Until an NFS Protocol Revision, this may be
* simulated by setting the client time in the
* tv_sec field of the access and modified times
* and setting the tv_usec field of the modified
* time to an invalid value (1000000). This
* may be detected by servers modified to do the
* right thing, but will not be disastrous on
* unmodified servers.
*/
if ((vap->va_mtime.tv_sec != -1) &&
(vap->va_mtime.tv_usec == -1)) {
vap->va_atime = time;
vap->va_mtime.tv_sec = time.tv_sec;
vap->va_mtime.tv_usec = 1000000;
}
vattr_to_sattr(vap, &args.saa_sa);
args.saa_fh = *vtofh(vp);
VFS_RECORD(vp->v_vfsp, VS_SETATTR, VS_MISS);
error = rfscall(vtomi(vp), RFS_SETATTR, xdr_saargs,
(caddr_t)&args, xdr_attrstat, (caddr_t)ns, cred);
if (error == 0) {
error = geterrno(ns->ns_status);
if (error == 0) {
nfs_cache_check(vp, ns->ns_attr.na_mtime,
ns->ns_attr.na_size);
nfs_attrcache(vp, &ns->ns_attr);
} else {
PURGE_ATTRCACHE(vp);
PURGE_STALE_FH(error, vp);
}
} else
PURGE_ATTRCACHE(vp);
}
kmem_free((caddr_t)ns, sizeof (*ns));
#ifdef NFSDEBUG
dprint(nfsdebug, 5, "nfs_setattr: returning %d\n", error);
#endif
return (error);
}
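/*
 * Check access permissions against the cached attributes, using the
 * owner, group, or public bits as appropriate.  The super-user always
 * gets access.
 */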
static int
nfs_access(vp, mode, cred)
struct vnode *vp;
int mode;
struct ucred *cred;
{
struct vattr va;
int *gp;
#ifdef NFSDEBUG
dprint(nfsdebug, 4, "nfs_access %s %x mode %d uid %d\n",
vtomi(vp)->mi_hostname, vp, mode, cred->cr_uid);
#endif
VFS_RECORD(vp->v_vfsp, VS_ACCESS, VS_CALL);
u.u_error = nfsgetattr(vp, &va, cred);
if (u.u_error) {
return (u.u_error);
}
/*
* If you're the super-user,
* you always get access.
*/
if (cred->cr_uid == 0)
return (0);
/*
* Access check is based on only
* one of owner, group, public.
* If not owner, then check group.
* If not a member of the group,
* then check public access.
*/
if (cred->cr_uid != va.va_uid) {
mode >>= 3;
if (cred->cr_gid == va.va_gid)
goto found;
gp = cred->cr_groups;
for (; gp < &cred->cr_groups[NGROUPS] && *gp != NOGROUP; gp++)
if (va.va_gid == *gp)
goto found;
mode >>= 3;
}
found:
if ((va.va_mode & mode) == mode) {
return (0);
}
u.u_error = EACCES;
#ifdef NFSDEBUG
dprint(nfsdebug, 5, "nfs_access: returning %d\n", u.u_error);
#endif
return (EACCES);
}
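/*
 * Read the target of a symbolic link from the server.
 */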
static int
nfs_readlink(vp, uiop, cred)
struct vnode *vp;
struct uio *uiop;
struct ucred *cred;
{
int error;
struct nfsrdlnres rl;
#ifdef NFSDEBUG
dprint(nfsdebug, 4, "nfs_readlink %s %x\n", vtomi(vp)->mi_hostname, vp);
#endif
VFS_RECORD(vp->v_vfsp, VS_READLINK, VS_CALL);
if (vp->v_type != VLNK)
return (ENXIO);
rl.rl_data = (char *)new_kmem_alloc(NFS_MAXPATHLEN, KMEM_SLEEP);
VFS_RECORD(vp->v_vfsp, VS_READLINK, VS_MISS);
error = rfscall(vtomi(vp), RFS_READLINK, xdr_fhandle,
(caddr_t)vtofh(vp), xdr_rdlnres, (caddr_t)&rl, cred);
if (!error) {
error = geterrno(rl.rl_status);
if (!error) {
error = uiomove(rl.rl_data, (int)rl.rl_count,
UIO_READ, uiop);
} else {
PURGE_STALE_FH(error, vp);
}
}
kmem_free((caddr_t)rl.rl_data, NFS_MAXPATHLEN);
#ifdef NFSDEBUG
dprint(nfsdebug, 5, "nfs_readlink: returning %d\n", error);
#endif
return (error);
}
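/*
 * Flush all dirty pages for this file to the server.
 */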
/*ARGSUSED*/
static int
nfs_fsync(vp, cred)
struct vnode *vp;
struct ucred *cred;
{
register struct rnode *rp;
#ifdef NFSDEBUG
dprint(nfsdebug, 4, "nfs_fsync %s %x\n", vtomi(vp)->mi_hostname, vp);
#endif
VFS_RECORD(vp->v_vfsp, VS_FSYNC, VS_CALL);
rp = vtor(vp);
RLOCK(rp); /* XXX-VLS Why? */
(void)nfs_putpage(vp, 0, 0, 0, cred);
RUNLOCK(rp);
return (rp->r_error);
}
/*
* Weirdness: if the file was removed while it was open it got renamed
* (by nfs_remove) instead. Here we remove the renamed file.
*/
/*ARGSUSED*/
static int
nfs_inactive(vp, cred)
register struct vnode *vp;
struct ucred *cred;
{
register struct rnode *rp;
struct nfsdiropargs da;
enum nfsstat status;
int error = 0;
struct rnode *unlrp;
#ifdef NFSDEBUG
dprint(nfsdebug, 4, "nfs_inactive %s, %x\n",
vtomi(vp)->mi_hostname, vp);
#endif
VFS_RECORD(vp->v_vfsp, VS_INACTIVE, VS_CALL);
rp = vtor(vp);
if (vp->v_count != 0) {
return (0);
}
redo:
if (rp->r_unldvp != NULL) {
/*
* Lock down directory where unlinked-open file got renamed.
* This keeps a lookup from finding this rnode.
* Fix bug 1034328 - corbin
* Lock rnode down until we are finished doing the remove
* to prevent a race condition.
* The locking sequence is important here to prevent deadlock.
*/
unlrp = vtor(rp->r_unldvp);
RLOCK(unlrp);
RLOCK(rp);
if (vp->v_count != 0) {
RUNLOCK(rp);
RUNLOCK(unlrp);
return(0);
}
if (rp->r_unldvp == NULL) {
RUNLOCK(rp);
RUNLOCK(unlrp);
goto redo;
}
rp->r_flags &= ~RDIRTY;
trace1(TR_MP_TRUNC0, vp);
pvn_vptrunc(vp, 0, 0); /* toss all pages */
/*
* Do the remove operation on the renamed file
*/
setdiropargs(&da, rp->r_unlname, rp->r_unldvp);
VFS_RECORD(vp->v_vfsp, VS_INACTIVE, VS_MISS);
error = rfscall(vtomi(rp->r_unldvp), RFS_REMOVE,
xdr_diropargs, (caddr_t)&da,
xdr_enum, (caddr_t)&status, rp->r_unlcred);
if (error == 0)
error = geterrno(status);
/*
* Release stuff held for the remove
*/
VN_RELE(rp->r_unldvp);
rp->r_unldvp = NULL;
kmem_free((caddr_t)rp->r_unlname, NFS_MAXNAMLEN);
rp->r_unlname = NULL;
crfree(rp->r_unlcred);
rp->r_unlcred = NULL;
RUNLOCK(rp); /* Fix bug 1034328 */
RUNLOCK(unlrp);
} else {
if (vp->v_pages != 0) {
if (rp->r_error) {
(void) nfs_putpage(vp, 0, 0, B_INVAL, cred);
dnlc_purge_vp(vp);
rp->r_error = 0;
} else {
(void) nfs_putpage(vp, 0, 0, 0, cred);
if (rp->r_error) {
(void) nfs_putpage(vp, 0, 0, B_INVAL,
cred);
dnlc_purge_vp(vp);
rp->r_error = 0;
}
}
}
}
/*
* Check to be sure that the rnode has not been grabbed before
* freeing it.
*/
if (vp->v_count == 0) {
if (rp->r_unldvp != NULL) {
goto redo;
}
rp_rmhash(rp);
rfree(rp);
}
#ifdef NFSDEBUG
dprint(nfsdebug, 5, "nfs_inactive done\n");
#endif
return (error);
}
int nfs_dnlc = 1; /* use dnlc */
/*
* Remote file system operations having to do with directory manipulation.
*/
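/*
 * Look up a name in a directory.  The dnlc is consulted first; on a
 * miss the lookup goes over the wire and the result is entered in the
 * dnlc.  Device nodes are wrapped in a special vnode via specvp.
 */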
/* ARGSUSED */
static int
nfs_lookup(dvp, nm, vpp, cred, pnp, flags)
struct vnode *dvp;
char *nm;
struct vnode **vpp;
struct ucred *cred;
struct pathname *pnp;
int flags;
{
int error;
struct nfsdiropargs da;
struct nfsdiropres *dr;
#ifdef NFSDEBUG
dprint(nfsdebug, 4, "nfs_lookup %s %x '%s'\n",
vtomi(dvp)->mi_hostname, dvp, nm);
#endif
VFS_RECORD(dvp->v_vfsp, VS_LOOKUP, VS_CALL);
/*
* Before checking dnlc, validate caches
*/
error = nfs_validate_caches(dvp, cred);
if (error) {
return (error);
}
RLOCK(vtor(dvp));
*vpp = (struct vnode *)dnlc_lookup(dvp, nm, cred);
if (*vpp) {
VN_HOLD(*vpp);
/*
* Make sure we can search this directory (after the
* fact). It's done here because over the wire lookups
* verify permissions on the server. VOP_ACCESS will
* one day go over the wire, so let's use it sparingly.
*/
error = VOP_ACCESS(dvp, VEXEC, cred);
if (error) {
VN_RELE(*vpp);
RUNLOCK(vtor(dvp));
return (error);
}
} else {
VFS_RECORD(dvp->v_vfsp, VS_LOOKUP, VS_MISS);
dr = (struct nfsdiropres *)
new_kmem_alloc(sizeof (*dr), KMEM_SLEEP);
setdiropargs(&da, nm, dvp);
error = rfscall(vtomi(dvp), RFS_LOOKUP, xdr_diropargs,
(caddr_t)&da, xdr_diropres, (caddr_t)dr, cred);
if (error == 0) {
error = geterrno(dr->dr_status);
PURGE_STALE_FH(error, dvp);
}
if (error == 0) {
*vpp = makenfsnode(&dr->dr_fhandle,
&dr->dr_attr, dvp->v_vfsp);
if (nfs_dnlc && (vtomi(*vpp)->mi_noac == 0)) {
dnlc_enter(dvp, nm, *vpp, cred);
}
} else {
*vpp = (struct vnode *)0;
}
kmem_free((caddr_t)dr, sizeof (*dr));
}
/*
* If vnode is a device create special vnode
*/
if (!error && ISVDEV((*vpp)->v_type)) {
struct vnode *newvp;
newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type);
VN_RELE(*vpp);
*vpp = newvp;
}
RUNLOCK(vtor(dvp));
#ifdef NFSDEBUG
dprint(nfsdebug, 5, "nfs_lookup returning %d vp = %x\n", error, *vpp);
#endif
return (error);
}
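/*
 * Create a regular file, or (by encoding the type in the mode bits) a
 * device special file.  A lookup is done first to learn whether the
 * file already exists and which group-id should be used; the gid is
 * fixed up afterwards if the server did not set it correctly.
 */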
/*ARGSUSED*/
static int
nfs_create(dvp, nm, va, exclusive, mode, vpp, cred)
struct vnode *dvp;
char *nm;
struct vattr *va;
enum vcexcl exclusive;
int mode;
struct vnode **vpp;
struct ucred *cred;
{
int error;
struct nfscreatargs args;
struct nfsdiropres *dr;
int file_exist = 0; /* Fix bug 1065361 */
#ifdef NFSDEBUG
dprint(nfsdebug, 4, "nfs_create %s %x '%s' excl=%d, mode=%o\n",
vtomi(dvp)->mi_hostname, dvp, nm, exclusive, mode);
#endif
VFS_RECORD(dvp->v_vfsp, VS_CREATE, VS_CALL);
/*
* This is buggy: there is a race between the lookup and the
* create. We should send the exclusive flag over the wire.
*/
/* Fix bug 1065361: huey
* Need to do an nfs_lookup to determine if file exists
* or not, to see if the preferred gid should be used.
*/
error = nfs_lookup(dvp, nm, vpp, cred,
(struct pathname *)NULL, 0);
if (!error) {
VN_RELE(*vpp);
if (exclusive == EXCL)
return (EEXIST);
file_exist = 1;
}
*vpp = (struct vnode *)0;
dr = (struct nfsdiropres *)new_kmem_alloc(sizeof (*dr), KMEM_SLEEP);
setdiropargs(&args.ca_da, nm, dvp);
/*
* Decide what the group-id of the created file should be.
* Set it in attribute list as advisory...then do a setattr
* if the server didn't get it right the first time.
*/
va->va_gid = (short) setdirgid(dvp);
/*
* This is a completely gross hack to make mknod
* work over the wire until we can wack the protocol
*/
#define IFCHR 0020000 /* character special */
#define IFBLK 0060000 /* block special */
#define IFSOCK 0140000 /* socket */
if (va->va_type == VCHR) {
va->va_mode |= IFCHR;
va->va_size = (u_long)va->va_rdev;
} else if (va->va_type == VBLK) {
va->va_mode |= IFBLK;
va->va_size = (u_long)va->va_rdev;
} else if (va->va_type == VFIFO) {
va->va_mode |= IFCHR; /* xtra kludge for namedpipe */
va->va_size = (u_long)NFS_FIFO_DEV; /* blech */
} else if (va->va_type == VSOCK) {
va->va_mode |= IFSOCK;
}
vattr_to_sattr(va, &args.ca_sa);
VFS_RECORD(dvp->v_vfsp, VS_CREATE, VS_MISS);
RLOCK(vtor(dvp));
dnlc_remove(dvp, nm);
error = rfscall(vtomi(dvp), RFS_CREATE, xdr_creatargs, (caddr_t)&args,
xdr_diropres, (caddr_t)dr, cred);
PURGE_ATTRCACHE(dvp); /* mod time changed */
if (!error) {
error = geterrno(dr->dr_status);
if (!error) {
short gid;
*vpp = makenfsnode(&dr->dr_fhandle, &dr->dr_attr,
dvp->v_vfsp);
if (va->va_size == 0) {
(vtor(*vpp))->r_size = 0;
if (((*vpp)->v_pages != NULL) &&
((*vpp)->v_type != VCHR) &&
((*vpp)->v_type != VSOCK)) {
RLOCK(vtor(*vpp));
(vtor(*vpp))->r_flags &= ~RDIRTY;
(vtor(*vpp))->r_error = 0;
pvn_vptrunc(*vpp, 0, 0);
RUNLOCK(vtor(*vpp));
}
}
if (nfs_dnlc && (vtomi(*vpp)->mi_noac == 0)) {
dnlc_enter(dvp, nm, *vpp, cred);
}
/*
* Make sure the gid was set correctly.
* If not, try to set it (but don't lose
* any sleep over it).
*/
gid = va->va_gid;
nattr_to_vattr(*vpp, &dr->dr_attr, va);
/* Fix bug 1065361, huey
 * The gid fixup logic should apply only to newly
 * created files.
 */
if ((gid != va->va_gid) && (!file_exist)) {
struct vattr vattr;
vattr_null(&vattr);
vattr.va_gid = gid;
(void) nfs_setattr(*vpp, &vattr, cred);
va->va_gid = gid;
}
/*
* If vnode is a device create special vnode
*/
if (ISVDEV((*vpp)->v_type)) {
struct vnode *newvp = specvp(
*vpp, (*vpp)->v_rdev, (*vpp)->v_type);
VN_RELE(*vpp);
*vpp = newvp;
}
} else {
PURGE_STALE_FH(error, dvp);
}
}
RUNLOCK(vtor(dvp));
kmem_free((caddr_t)dr, sizeof (*dr));
#ifdef NFSDEBUG
dprint(nfsdebug, 5, "nfs_create returning %d\n", error);
#endif
return (error);
}
/*
* Weirdness: if the vnode to be removed is open
* we rename it instead of removing it and nfs_inactive
* will remove the new name.
*/
static int
nfs_remove(dvp, nm, cred)
struct vnode *dvp;
char *nm;
struct ucred *cred;
{
int error;
struct nfsdiropargs da;
enum nfsstat status;
struct vnode *vp;
struct vnode *oldvp;
struct vnode *realvp;
char *tmpname;
#ifdef NFSDEBUG
dprint(nfsdebug, 4, "nfs_remove %s %x '%s'\n",
vtomi(dvp)->mi_hostname, dvp, nm);
#endif
VFS_RECORD(dvp->v_vfsp, VS_REMOVE, VS_CALL);
status = NFS_OK;
error = nfs_lookup(dvp, nm, &vp, cred, (struct pathname *) NULL, 0);
/*
* Lookup may have returned a non-nfs vnode!
* get the real vnode.
*/
if (error == 0 && VOP_REALVP(vp, &realvp) == 0) {
oldvp = vp;
vp = realvp;
} else {
oldvp = NULL;
}
if (error == 0 && vp != NULL) {
RLOCK(vtor(dvp));
/*
* We need to flush the name cache so we can
* check the real reference count on the vnode
*/
dnlc_purge_vp(vp);
if ((vp->v_count > 1) && vtor(vp)->r_unldvp == NULL) {
tmpname = newname();
error = nfs_rename(dvp, nm, dvp, tmpname, cred);
if (error) {
kmem_free((caddr_t)tmpname, NFS_MAXNAMLEN);
} else {
VN_HOLD(dvp);
vtor(vp)->r_unldvp = dvp;
vtor(vp)->r_unlname = tmpname;
if (vtor(vp)->r_unlcred != NULL) {
crfree(vtor(vp)->r_unlcred);
}
crhold(cred);
vtor(vp)->r_unlcred = cred;
}
} else {
vtor(vp)->r_flags &= ~RDIRTY;
trace1(TR_MP_TRUNC0, vp);
pvn_vptrunc(vp, 0, 0); /* toss all pages */
setdiropargs(&da, nm, dvp);
VFS_RECORD(dvp->v_vfsp, VS_REMOVE, VS_MISS);
error = rfscall(vtomi(dvp), RFS_REMOVE, xdr_diropargs,
(caddr_t)&da, xdr_enum, (caddr_t)&status,
cred);
PURGE_ATTRCACHE(dvp); /* mod time changed */
PURGE_ATTRCACHE(vp); /* link count changed */
PURGE_STALE_FH(error ? error : geterrno(status), dvp);
}
RUNLOCK(vtor(dvp));
if (oldvp) {
VN_RELE(oldvp);
} else {
VN_RELE(vp);
}
}
if (error == 0) {
error = geterrno(status);
}
#ifdef NFSDEBUG
dprint(nfsdebug, 5, "nfs_remove: returning %d\n", error);
#endif
return (error);
}
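/*
 * Create a hard link to an existing file.
 */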
static int
nfs_link(vp, tdvp, tnm, cred)
struct vnode *vp;
struct vnode *tdvp;
char *tnm;
struct ucred *cred;
{
int error;
struct nfslinkargs args;
enum nfsstat status;
struct vnode *realvp;
if (VOP_REALVP(vp, &realvp) == 0) {
vp = realvp;
}
#ifdef NFSDEBUG
dprint(nfsdebug, 4, "nfs_link from %s %x to %s %x '%s'\n",
vtomi(vp)->mi_hostname, vp, vtomi(tdvp)->mi_hostname, tdvp, tnm);
#endif
VFS_RECORD(vp->v_vfsp, VS_LINK, VS_CALL);
VFS_RECORD(vp->v_vfsp, VS_LINK, VS_MISS);
args.la_from = *vtofh(vp);
setdiropargs(&args.la_to, tnm, tdvp);
RLOCK(vtor(tdvp));
error = rfscall(vtomi(vp), RFS_LINK, xdr_linkargs, (caddr_t)&args,
xdr_enum, (caddr_t)&status, cred);
PURGE_ATTRCACHE(tdvp); /* mod time changed */
PURGE_ATTRCACHE(vp); /* link count changed */
RUNLOCK(vtor(tdvp));
if (!error) {
error = geterrno(status);
PURGE_STALE_FH(error, vp);
PURGE_STALE_FH(error, tdvp);
}
#ifdef NFSDEBUG
dprint(nfsdebug, 5, "nfs_link returning %d\n", error);
#endif
return (error);
}
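/*
 * Rename a file or directory.
 */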
static int
nfs_rename(odvp, onm, ndvp, nnm, cred)
struct vnode *odvp;
char *onm;
struct vnode *ndvp;
char *nnm;
struct ucred *cred;
{
int error;
enum nfsstat status;
struct nfsrnmargs args;
struct vnode *realvp;
#ifdef NFSDEBUG
dprint(nfsdebug, 4, "nfs_rename from %s %x '%s' to %s %x '%s'\n",
vtomi(odvp)->mi_hostname, odvp, onm,
vtomi(ndvp)->mi_hostname, ndvp, nnm);
#endif
VFS_RECORD(odvp->v_vfsp, VS_RENAME, VS_CALL);
if (VOP_REALVP(ndvp, &realvp) == 0) {
ndvp = realvp;
}
if (!strcmp(onm, ".") || !strcmp(onm, "..") || !strcmp(nnm, ".") ||
!strcmp (nnm, "..")) {
error = EINVAL;
} else {
RLOCK(vtor(odvp));
dnlc_remove(odvp, onm);
dnlc_remove(ndvp, nnm);
if (ndvp != odvp) {
RLOCK(vtor(ndvp));
}
setdiropargs(&args.rna_from, onm, odvp);
setdiropargs(&args.rna_to, nnm, ndvp);
VFS_RECORD(odvp->v_vfsp, VS_RENAME, VS_MISS);
error = rfscall(vtomi(odvp), RFS_RENAME, xdr_rnmargs,
(caddr_t)&args, xdr_enum, (caddr_t)&status,
cred);
PURGE_ATTRCACHE(odvp); /* mod time changed */
PURGE_ATTRCACHE(ndvp); /* mod time changed */
RUNLOCK(vtor(odvp));
if (ndvp != odvp) {
RUNLOCK(vtor(ndvp));
}
if (!error) {
error = geterrno(status);
PURGE_STALE_FH(error, odvp);
PURGE_STALE_FH(error, ndvp);
}
}
#ifdef NFSDEBUG
dprint(nfsdebug, 5, "nfs_rename returning %d\n", error);
#endif
return (error);
}
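/*
 * Make a directory, then fix up the group-id and set-gid bit if the
 * server did not get them right.
 */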
static int
nfs_mkdir(dvp, nm, va, vpp, cred)
struct vnode *dvp;
char *nm;
register struct vattr *va;
struct vnode **vpp;
struct ucred *cred;
{
int error;
struct nfscreatargs args;
struct nfsdiropres *dr;
#ifdef NFSDEBUG
dprint(nfsdebug, 4, "nfs_mkdir %s %x '%s'\n",
vtomi(dvp)->mi_hostname, dvp, nm);
#endif
VFS_RECORD(dvp->v_vfsp, VS_MKDIR, VS_CALL);
VFS_RECORD(dvp->v_vfsp, VS_MKDIR, VS_MISS);
dr = (struct nfsdiropres *)
new_kmem_alloc(sizeof (*dr), KMEM_SLEEP);
setdiropargs(&args.ca_da, nm, dvp);
/*
* Decide what the group-id and set-gid bit of the created directory
* should be. May have to do a setattr to get the gid right.
*/
va->va_gid = (short) setdirgid(dvp);
va->va_mode = (u_short) setdirmode(dvp, va->va_mode);
vattr_to_sattr(va, &args.ca_sa);
RLOCK(vtor(dvp));
dnlc_remove(dvp, nm);
error = rfscall(vtomi(dvp), RFS_MKDIR, xdr_creatargs, (caddr_t)&args,
xdr_diropres, (caddr_t)dr, cred);
PURGE_ATTRCACHE(dvp); /* mod time changed */
RUNLOCK(vtor(dvp));
if (!error) {
error = geterrno(dr->dr_status);
PURGE_STALE_FH(error, dvp);
}
if (!error) {
short gid;
/*
* The attributes returned by RFS_MKDIR are now correct and
* may be safely used by the clients.
*/
*vpp = makenfsnode(&dr->dr_fhandle, &dr->dr_attr, dvp->v_vfsp);
PURGE_ATTRCACHE(*vpp);
if (nfs_dnlc && (vtomi(*vpp)->mi_noac == 0)) {
dnlc_enter(dvp, nm, *vpp, cred);
}
/*
* Make sure the gid was set correctly.
* If not, try to set it (but don't lose
* any sleep over it).
*/
gid = va->va_gid;
nattr_to_vattr(*vpp, &dr->dr_attr, va);
if (gid != va->va_gid) {
vattr_null(va);
va->va_gid = gid;
(void) nfs_setattr(*vpp, va, cred);
}
} else {
*vpp = (struct vnode *)0;
}
kmem_free((caddr_t)dr, sizeof (*dr));
#ifdef NFSDEBUG
dprint(nfsdebug, 5, "nfs_mkdir returning %d\n", error);
#endif
return (error);
}
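/*
 * Remove a directory.
 */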
static int
nfs_rmdir(dvp, nm, cred)
struct vnode *dvp;
char *nm;
struct ucred *cred;
{
int error;
enum nfsstat status;
struct nfsdiropargs da;
#ifdef NFSDEBUG
dprint(nfsdebug, 4, "nfs_rmdir %s %x '%s'\n",
vtomi(dvp)->mi_hostname, dvp, nm);
#endif
VFS_RECORD(dvp->v_vfsp, VS_RMDIR, VS_CALL);
VFS_RECORD(dvp->v_vfsp, VS_RMDIR, VS_MISS);
setdiropargs(&da, nm, dvp);
RLOCK(vtor(dvp));
dnlc_purge_vp(dvp);
error = rfscall(vtomi(dvp), RFS_RMDIR, xdr_diropargs, (caddr_t)&da,
xdr_enum, (caddr_t)&status, cred);
PURGE_ATTRCACHE(dvp); /* mod time changed */
RUNLOCK(vtor(dvp));
if (!error) {
error = geterrno(status);
PURGE_STALE_FH(error, dvp);
}
#ifdef NFSDEBUG
dprint(nfsdebug, 5, "nfs_rmdir returning %d\n", error);
#endif
return (error);
}
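/*
 * Create a symbolic link.
 */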
static int
nfs_symlink(dvp, lnm, tva, tnm, cred)
struct vnode *dvp;
char *lnm;
struct vattr *tva;
char *tnm;
struct ucred *cred;
{
int error;
struct nfsslargs args;
enum nfsstat status;
#ifdef NFSDEBUG
dprint(nfsdebug, 4, "nfs_symlink %s %x '%s' to '%s'\n",
vtomi(dvp)->mi_hostname, dvp, lnm, tnm);
#endif
VFS_RECORD(dvp->v_vfsp, VS_SYMLINK, VS_CALL);
VFS_RECORD(dvp->v_vfsp, VS_SYMLINK, VS_MISS);
setdiropargs(&args.sla_from, lnm, dvp);
vattr_to_sattr(tva, &args.sla_sa);
args.sla_tnm = tnm;
error = rfscall(vtomi(dvp), RFS_SYMLINK, xdr_slargs, (caddr_t)&args,
xdr_enum, (caddr_t)&status, cred);
PURGE_ATTRCACHE(dvp); /* mod time changed */
if (!error) {
error = geterrno(status);
PURGE_STALE_FH(error, dvp);
}
#ifdef NFSDEBUG
dprint(nfsdebug, 5, "nfs_sysmlink: returning %d\n", error);
#endif
return (error);
}
/*
* Read directory entries.
* There are some weird things to look out for here. The uio_offset
* field is either 0 or it is the offset returned from a previous
* readdir. It is an opaque value used by the server to find the
* correct directory block to read. The byte count must be at least
* vtoblksz(vp) bytes. The count field is the number of blocks to
* read on the server. This is advisory only, the server may return
* only one block's worth of entries. Entries may be compressed on
* the server.
*/
static int
nfs_readdir(vp, uiop, cred)
struct vnode *vp;
register struct uio *uiop;
struct ucred *cred;
{
int error = 0;
struct iovec *iovp;
unsigned alloc_count, count;
struct nfsrddirargs rda;
struct nfsrddirres rd;
struct rnode *rp;
VFS_RECORD(vp->v_vfsp, VS_READDIR, VS_CALL);
rp = vtor(vp);
/*
* N.B.: it appears here that we're treating the directory
* cookie as an offset. Not true. It's simply that getdents
* passes us the cookie to use in the uio_offset field of a
* uio structure.
*/
if ((rp->r_lastcookie == (u_long)uiop->uio_offset) &&
(rp->r_flags & REOF) && (timercmp(&time, &rp->r_attrtime, <))) {
return (0);
}
iovp = uiop->uio_iov;
alloc_count = count = iovp->iov_len;
#ifdef NFSDEBUG
dprint(nfsdebug, 4, "nfs_readdir %s %x count %d offset %ld\n",
vtomi(vp)->mi_hostname, vp, count, uiop->uio_offset);
#endif
/*
* XXX We should do some kind of test for count >= DEV_BSIZE
*/
if (uiop->uio_iovcnt != 1) {
return (EINVAL);
}
rda.rda_offset = uiop->uio_offset;
rd.rd_entries = (struct dirent *)
new_kmem_alloc(alloc_count, KMEM_SLEEP);
rda.rda_fh = *vtofh(vp);
do {
count = MIN(count, vtomi(vp)->mi_curread);
rda.rda_count = count;
rd.rd_size = count;
VFS_RECORD(vp->v_vfsp, VS_READDIR, VS_MISS);
error = rfscall(vtomi(vp), RFS_READDIR, xdr_rddirargs,
(caddr_t)&rda, xdr_getrddirres, (caddr_t)&rd,
cred);
} while (error == ENFS_TRYAGAIN);
if (!error) {
error = geterrno(rd.rd_status);
PURGE_STALE_FH(error, vp);
}
if (!error) {
/*
* move dir entries to user land
*/
if (rd.rd_size) {
error = uiomove((caddr_t)rd.rd_entries,
(int)rd.rd_size, UIO_READ, uiop);
rda.rda_offset = rd.rd_offset;
uiop->uio_offset = rd.rd_offset;
}
if (rd.rd_eof) {
rp->r_flags |= REOF;
rp->r_lastcookie = uiop->uio_offset;
}
}
kmem_free((caddr_t)rd.rd_entries, alloc_count);
#ifdef NFSDEBUG
dprint(nfsdebug, 5, "nfs_readdir: returning %d resid %d, offset %ld\n",
error, uiop->uio_resid, uiop->uio_offset);
#endif
return (error);
}
static struct buf async_bufhead = {
B_HEAD,
(struct buf *)NULL, (struct buf *)NULL,
&async_bufhead, &async_bufhead,
};
static int async_daemon_ready; /* number of async biod's ready */
static int async_daemon_count; /* number of existing biod's */
int nfs_wakeup_one_biod = 1;
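/*
 * Start I/O on a page list.  Asynchronous requests are queued on
 * async_bufhead for the biod (async_daemon) processes when one is
 * ready; otherwise the I/O is done directly by do_bio().
 */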
static int
nfs_strategy(bp)
register struct buf *bp;
{
#ifdef NFSDEBUG
dprint(nfsdebug, 4, "nfs_strategy bp %x lbn %d\n", bp, bp->b_blkno);
#endif
if (async_daemon_ready > 0 && (bp->b_flags & B_ASYNC)) {
binstailfree(bp, &async_bufhead);
async_daemon_ready--;
if (nfs_wakeup_one_biod == 1) {
wakeup_one((caddr_t)&async_bufhead);
} else {
wakeup((caddr_t)&async_bufhead);
}
return (0);
} else {
return (do_bio(bp));
}
}
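/*
 * Kernel body of an NFS async I/O daemon (biod): loop pulling queued
 * asynchronous buffers off async_bufhead and performing the I/O with
 * do_bio().  On a signal, abort or drain any remaining requests and
 * exit.
 */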
async_daemon()
{
register struct buf *bp;
struct rnode *rp;
char exitsig;
relvm(u.u_procp); /* First, release resources */
async_daemon_count++;
if (setjmp(&u.u_qsave)) {
if (async_daemon_count == 0) {
/*
* We already were processing requests below
* and we were signaled again. So this time,
* just give up and abort all the requests.
*/
while ((bp = async_bufhead.b_actf) != &async_bufhead) {
bremfree(bp);
bp->b_flags |= B_ERROR;
/*
* Since we are always ASYNC pvn_done
* will free the buf.
*/
pvn_done(bp);
}
} else {
async_daemon_count--;
async_daemon_ready--;
/*
* If we were the last async daemon,
* process all the queued requests.
*/
if (async_daemon_count == 0) {
while ((bp = async_bufhead.b_actf) !=
&async_bufhead) {
bremfree(bp);
rp = vtor(bp->b_vp);
/*
* Since we are ASYNC do_bio will
* free the bp.
*/
if (do_bio(bp) == NFS_CACHEINVALERR) {
nfs_purge_caches(rtov(rp));
}
}
}
}
exitsig = u.u_procp->p_cursig;
if (exitsig != SIGTERM && exitsig != SIGKILL) {
log(LOG_WARNING,
"async_daemon (pid %d) exiting on signal %d\n",
u.u_procp->p_pid, exitsig);
}
exit(0);
}
for (;;) {
async_daemon_ready++;
while ((bp = async_bufhead.b_actf) == &async_bufhead) {
(void) sleep((caddr_t)&async_bufhead, PZERO + 1);
}
bremfree(bp);
rp = vtor(bp->b_vp);
if (do_bio(bp) == NFS_CACHEINVALERR) {
nfs_purge_caches(rtov(rp));
}
}
}
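/*
 * Do the read or write described by the buffer over the wire and
 * release the pages.  Reads past EOF are zero-filled; the special
 * returns NFS_EOF and NFS_CACHEINVALERR tell the caller about EOF and
 * stale cached data respectively.
 */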
static int
do_bio(bp)
register struct buf *bp;
{
register struct rnode *rp = vtor(bp->b_vp);
struct vattr va;
long count;
int error;
int read, async;
read = bp->b_flags & B_READ;
async = bp->b_flags & B_ASYNC;
#ifdef NFSDEBUG
dprint(nfsdebug, 4,
"do_bio: addr %x, blk %ld, offset %ld, size %ld, B_READ %x B_ASYNC %x\n",
bp->b_un.b_addr, bp->b_blkno, dbtob(bp->b_blkno),
bp->b_bcount, read, async);
#endif
if (read) {
error = bp->b_error = nfsread(bp->b_vp, bp->b_un.b_addr,
(u_int)dbtob(bp->b_blkno), bp->b_bcount,
&bp->b_resid, rp->r_cred, &va);
if (!error) {
if (bp->b_resid) {
/*
* Didn't get it all because we hit EOF,
* zero all the memory beyond the EOF.
*/
bzero(bp->b_un.b_addr +
(bp->b_bcount - bp->b_resid),
(u_int)bp->b_resid);
}
if (bp->b_resid == bp->b_bcount &&
dbtob(bp->b_blkno) >= rp->r_size) {
/*
* We didn't read anything at all as we are
* past EOF. Return an error indicator back
* but don't destroy the pages (yet).
*/
error = NFS_EOF;
}
}
} else {
/*
* If the write fails all future writes will get
* an error until the file is closed or munmapped.
*/
/*
* Fix bug 1030884
* Completely gross hack to make writes to the middle
* of a file succeed if the remote filesystem is full.
* Hack also in place in nfs_rdwr().
*/
if (rp->r_error == 0 || rp->r_error == ENOSPC) {
count = MIN(bp->b_bcount, rp->r_size -
dbtob(bp->b_blkno));
if (count < 0) {
panic("do_bio: write count < 0");
}
if (rp->r_error == ENOSPC &&
(count + dbtob(bp->b_blkno) >
rp->r_attr.va_size))
error = bp->b_error = rp->r_error;
else
rp->r_error = error = bp->b_error = nfswrite(bp->b_vp,
bp->b_un.b_addr, (u_int)dbtob(bp->b_blkno),
count, rp->r_cred);
} else
error = bp->b_error = rp->r_error;
}
if (!error && read) {
if (!CACHE_VALID(rp, va.va_mtime, va.va_size)) {
/*
* read, if cache is not valid mark this bp
* with an error so it will be freed by pvn_done
* and return a special error, NFS_CACHEINVALERR,
* so caller can flush caches and re-do the operation.
*/
error = NFS_CACHEINVALERR;
bp->b_error = EINVAL;
} else {
nfs_attrcache_va(rtov(rp), &va);
}
}
if (error != 0 && error != NFS_EOF) {
bp->b_flags |= B_ERROR;
}
/*
* Call pvn_done() to free the bp and pages. If not ASYNC
* then we have to call pageio_done() to free the bp.
*/
pvn_done(bp);
if (!async) {
pageio_done(bp);
}
#ifdef NFSDEBUG
dprint(nfsdebug, 5, "do_bio: error %d, bp %x B_READ %x B_ASYNC %d\n",
error, bp, read, async);
#endif
return (error);
}
static int
nfs_noop()
{
return (EREMOTE);
}
/*
* Record-locking requests are passed to the local Lock-Manager daemon.
*/
static int
nfs_lockctl(vp, ld, cmd, cred, clid)
struct vnode *vp;
struct flock *ld;
int cmd;
struct ucred *cred;
int clid;
{
lockhandle_t lh;
struct eflock eld;
int error;
ASSERT(sizeof (lh.lh_id) == sizeof (fhandle_t));
VFS_RECORD(vp->v_vfsp, VS_LOCKCTL, VS_CALL);
/*
* If we are setting a lock mark the vnode VNOCACHE so the page
* cache does not give inconsistent results on locked files shared
* between clients. The VNOCACHE flag is never turned off as long
* as the vnode is active because it is hard to figure out when the
* last lock is gone.
* XXX - what if some already has the vnode mapped in?
*/
/*
 * Fix bug 1052330: the old condition was
 *	if (((vp->v_flag & VNOCACHE) == 0) && ...
 */
if ((ld->l_type != F_UNLCK) && (cmd != F_GETLK)) {
vp->v_flag |= VNOCACHE;
(void)nfs_putpage(vp, 0, 0, B_INVAL, cred);
PURGE_ATTRCACHE(vp);
}
lh.lh_vp = vp;
lh.lh_servername = vtomi(vp)->mi_hostname;
bcopy((caddr_t)vtofh(vp), (caddr_t)&lh.lh_id, sizeof (fhandle_t));
eld.l_type = ld->l_type;
eld.l_whence = ld->l_whence;
eld.l_start = ld->l_start;
eld.l_len = ld->l_len;
eld.l_pid = ld->l_pid;
eld.l_xxx = ld->l_xxx;
error = klm_lockctl(&lh, &eld, cmd, cred, clid);
if (cmd == F_GETLK) {
ld->l_type = eld.l_type;
if (eld.l_type != F_UNLCK) {
ld->l_whence = eld.l_whence;
ld->l_start = eld.l_start;
ld->l_len = eld.l_len;
ld->l_pid = eld.l_pid;
ld->l_xxx = eld.l_xxx;
}
}
return (error);
}
int nfs_nra = 1; /* number of pages to read ahead */
int nfs_lostpage; /* number of times we lost original page */
/*
* Called from pvn_getpages or nfs_getpage to get a particular page.
* When we are called the rnode has already been locked by nfs_getpage.
*/
/*ARGSUSED*/
static int
nfs_getapage(vp, off, protp, pl, plsz, seg, addr, rw, cred)
struct vnode *vp;
u_int off;
u_int *protp;
struct page *pl[]; /* NULL if async IO is requested */
u_int plsz;
struct seg *seg;
addr_t addr;
enum seg_rw rw;
struct ucred *cred;
{
register struct rnode *rp;
register u_int bsize;
struct buf *bp;
struct page *pp, *pp2, **ppp, *pagefound;
daddr_t lbn;
u_int io_off, io_len;
u_int blksize, blkoff;
int err;
int readahead;
rp = vtor(vp);
#ifdef NFSDEBUG
dprint(nfsdebug, 4,
"nfs_getapage: vp %x size %d off %d pl %x addr %x\n",
vp, rp->r_size, off, pl, addr);
#endif
VFS_RECORD(vp->v_vfsp, VS_GETPAGE, VS_CALL);
bsize = vp->v_vfsp->vfs_bsize;
reread:
err = 0;
lbn = off / bsize;
blkoff = lbn * bsize;
if (rp->r_nextr == off && !(vp->v_flag & VNOCACHE)) {
readahead = nfs_nra;
} else {
readahead = 0;
}
#ifdef NFSDEBUG
dprint(nfsdebug, 1, "nfs_getapage: nextr %d off %d size %d ra %d ",
rp->r_nextr, off, rp->r_size, readahead);
#endif
again:
if ((pagefound = page_find(vp, off)) == NULL) {
/*
* Need to go to server to get a block
*/
if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
/*
* If less than a block left in
* file read less than a block.
*/
if (rp->r_size <= off) {
/*
* Trying to access beyond EOF,
* set up to get at least one page.
*/
blksize = off + PAGESIZE - blkoff;
} else {
blksize = rp->r_size - blkoff;
}
} else {
blksize = bsize;
}
pp = pvn_kluster(vp, off, seg, addr, &io_off, &io_len,
blkoff, blksize, 0);
/*
* Somebody has entered the page before us, so
* just use it.
*/
if (pp == NULL)
goto again;
if (pl != NULL) {
register int sz;
if (plsz >= io_len) {
/*
* Everything fits, set up to load
* up and hold all the pages.
*/
pp2 = pp;
sz = io_len;
} else {
/*
* Set up to load plsz worth
* starting at the needed page.
*/
for (pp2 = pp; pp2->p_offset != off;
pp2 = pp2->p_next) {
ASSERT(pp2->p_next->p_offset !=
pp->p_offset);
}
sz = plsz;
}
ppp = pl;
do {
PAGE_HOLD(pp2);
*ppp++ = pp2;
pp2 = pp2->p_next;
sz -= PAGESIZE;
} while (sz > 0);
*ppp = NULL; /* terminate list */
}
/*
* Now round the request size up to page boundaries.
* This insures that the entire page will be
* initialized to zeroes if EOF is encountered.
*/
io_len = ptob(btopr(io_len));
bp = pageio_setup(pp, io_len, vp, pl == NULL ?
(B_ASYNC | B_READ) : B_READ);
bp->b_blkno = btodb(io_off);
bp->b_dev = 0;
bp_mapin(bp);
/*
* If doing a write beyond what we believe is EOF,
* don't bother trying to read the pages from the
* server, we'll just zero the pages here. We
* don't check that the rw flag is S_WRITE here
* because some implementations may attempt a
* read access to the buffer before copying data.
*/
if (io_off >= rp->r_size && seg == segkmap) {
bzero(bp->b_un.b_addr, io_len);
pvn_done(bp);
if (pl != NULL)
pageio_done(bp);
} else {
err = nfs_strategy(bp);
}
/* bp is now invalid! */
if (err == NFS_EOF) {
/*
* If doing a write system call just return
* zeroed pages, else user tried to get pages
* beyond EOF, return error. We don't check
* that the rw flag is S_WRITE here because
* some implementations may attempt a read
* access to the buffer before copying data.
*/
if (seg == segkmap) {
err = 0;
} else {
err = EFAULT;
}
}
rp->r_nextr = io_off + io_len;
u.u_ru.ru_majflt++;
if (seg == segkmap)
u.u_ru.ru_inblock++; /* count as `read' operation */
cnt.v_pgin++;
cnt.v_pgpgin += btopr(io_len);
#ifdef NFSDEBUG
dprint(nfsdebug, 1, "OTW\n");
#endif
}
while (!err && readahead > 0 && (blkoff + bsize < rp->r_size)) {
addr_t addr2;
readahead--;
lbn++;
blkoff += bsize;
addr2 = addr + (blkoff - off);
if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
/*
* If less than a block left in
* file read less than a block.
*/
blksize = rp->r_size - blkoff;
} else {
blksize = bsize;
}
/*
* If addr is now in a different seg,
* don't bother trying with read-ahead.
*/
if (addr2 >= seg->s_base + seg->s_size) {
pp2 = NULL;
#ifdef NFSDEBUG
dprint(nfsdebug, 1, "nfs_getapage: ra out of seg\n");
#endif
} else {
VFS_RECORD(vp->v_vfsp, VS_GETPAGE, VS_CALL);
pp2 = pvn_kluster(vp, blkoff, seg, addr2,
&io_off, &io_len, blkoff, blksize, 1);
#ifdef NFSDEBUG
if (pp2 == NULL) {
dprint(nfsdebug, 1,
"nfs_getapage: RA CACHE off %d size %d\n",
off, rp->r_size);
}
#endif
}
if (pp2 != NULL) {
/*
* Now round the request size up to page boundaries.
* This insures that the entire page will be
* initialized to zeroes if EOF is encountered.
*/
io_len = ptob(btopr(io_len));
bp = pageio_setup(pp2, io_len, vp, B_READ | B_ASYNC);
bp->b_dev = 0;
bp->b_blkno = btodb(io_off);
bp_mapin(bp);
#ifdef NFSDEBUG
dprint(nfsdebug, 1,
"nfs_getapage: RA OTW off %d size %d\n",
off, rp->r_size);
#endif
err = nfs_strategy(bp); /* bp is now invalid! */
/*
* Ignore all read ahead errors except those
* that might invalidate the primary read.
*/
if (err != NFS_EOF && err != NFS_CACHEINVALERR) {
err = 0;
}
u.u_ru.ru_majflt++;
if (seg == segkmap)
u.u_ru.ru_inblock++; /* count as `read' */
cnt.v_pgin++;
cnt.v_pgpgin += btopr(io_len);
}
}
if (pagefound != NULL) {
register int s;
#ifdef NFSDEBUG
dprint(nfsdebug, 1, "CACHE\n");
#endif
/*
* We need to be careful here because if the page was
* previously on the free list, we might have already
* lost it at interrupt level.
*/
s = splvm();
if (pagefound->p_vnode == vp && pagefound->p_offset == off) {
/*
* If the page is intransit or if
* it is on the free list call page_lookup
* to try and wait for / reclaim the page.
*/
if (pagefound->p_intrans || pagefound->p_free)
pagefound = page_lookup(vp, off);
}
if (pagefound == NULL || pagefound->p_offset != off ||
pagefound->p_vnode != vp || pagefound->p_gone) {
(void) splx(s);
nfs_lostpage++;
goto reread;
}
if (pl != NULL) {
PAGE_HOLD(pagefound);
pl[0] = pagefound;
pl[1] = NULL;
u.u_ru.ru_minflt++;
rp->r_nextr = off + PAGESIZE;
}
(void) splx(s);
}
if (err && pl != NULL) {
for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
PAGE_RELE(*ppp);
}
#ifdef NFSDEBUG
dprint(nfsdebug, 5, "nfs_getapage: returning %d\n", err);
#endif
return (err);
}
/*
* Return all the pages from [off..off+len) in file
*/
static int
nfs_getpage(vp, off, len, protp, pl, plsz, seg, addr, rw, cred)
struct vnode *vp;
u_int off, len;
u_int *protp;
struct page *pl[];
u_int plsz;
struct seg *seg;
addr_t addr;
enum seg_rw rw;
struct ucred *cred;
{
struct rnode *rp = vtor(vp);
int err;
if (protp != NULL)
*protp = PROT_ALL;
RLOCK(rp);
if (rp->r_cred == NULL) {
if (cred == NULL) {
cred = u.u_cred; /* XXX need real cred! */
}
crhold(cred);
rp->r_cred = cred;
}
/*
* Now validate that the caches are up to date.
*/
(void) nfs_validate_caches(vp, rp->r_cred);
/*
* If we are getting called as a side effect of a nfs_rdwr()
* write operation the local file size might not be extended yet.
* In this case we want to be able to return pages of zeroes.
*/
if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
RUNLOCK(rp);
return (EFAULT); /* beyond EOF */
}
retry:
if (len <= PAGESIZE)
err = nfs_getapage(vp, off, protp, pl, plsz, seg, addr,
rw, cred);
else
err = pvn_getpages(nfs_getapage, vp, off, len, protp, pl, plsz,
seg, addr, rw, cred);
switch (err) {
case NFS_CACHEINVALERR:
case NFS_EOF:
nfs_purge_caches(vp);
goto retry;
case ESTALE:
PURGE_STALE_FH(err, vp);
}
RUNLOCK(rp);
return (err);
}
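/*
 * Flush dirty pages in the range [off..off+len) (the whole file if
 * len is zero) to the server, klustering them into block-sized writes
 * with nfs_writelbn.
 */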
/*ARGSUSED*/
static int
nfs_putpage(vp, off, len, flags, cred)
struct vnode *vp;
u_int off;
u_int len;
int flags;
struct ucred *cred;
{
register struct rnode *rp;
register struct page *pp;
struct page *dirty, *io_list;
register u_int io_off, io_len;
daddr_t lbn;
u_int lbn_off;
u_int bsize;
int vpcount;
int err = 0;
if (len == 0 && (flags & B_INVAL) == 0 &&
(vp->v_vfsp->vfs_flag & VFS_RDONLY)) {
return (0);
}
rp = vtor(vp);
if ((vp->v_pages == NULL) || (vp->v_type == VCHR) ||
(vp->v_type == VSOCK))
/* Fix bug 1047557, huey
 * This condition was removed because the rnode's
 * attributes may not be up to date, so old pages
 * might otherwise not be deleted. */
/* || (off >= rp->r_size)) */
return (0);
VFS_RECORD(vp->v_vfsp, VS_PUTPAGE, VS_CALL);
bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
vpcount = vp->v_count;
if (vp->v_count == 0) {
((struct mntinfo *)(vp->v_vfsp->vfs_data))->mi_refct++;
}
VN_HOLD(vp);
again:
if (len == 0) {
/*
* We refuse to act on behalf of the pageout daemon to push
* out a page to a rnode which is currently locked.
*/
if ((rp->r_flags & RLOCKED) && u.u_procp == &proc[2]) {
err = EWOULDBLOCK; /* XXX */
goto out;
}
/*
* Search the entire vp list for pages >= off
*/
RLOCK(rp);
dirty = pvn_vplist_dirty(vp, off, flags);
if (dirty == NULL && off == 0 && (flags & B_ASYNC) == 0) {
/*
* No dirty pages over the whole vnode, clear RDIRTY
* flag. This is the only safe place to do this since
* there is a possibility that we will sleep flushing
* the pages in a non-NULL list, and someone else
* could come in and write another ASYNC block.
*/
rp->r_flags &= ~RDIRTY;
}
RUNLOCK(rp);
} else {
/*
* Do a range from [off...off + len) via page_find.
* We set limits so that we kluster to bsize boundaries.
*/
if (off >= rp->r_size) {
dirty = NULL;
} else {
u_int fsize, eoff, offlo, offhi;
fsize = (rp->r_size + PAGEOFFSET) & PAGEMASK;
eoff = MIN(off + len, fsize);
offlo = (off / bsize) * bsize;
offhi = roundup(eoff, bsize);
dirty = pvn_range_dirty(vp, off, eoff, offlo, offhi,
flags);
}
}
/*
* Destroy read ahead value (since we are really going to write)
* and save credentials for async writes.
*/
if (dirty != NULL) {
rp->r_nextr = 0;
if (rp->r_cred == NULL) {
if (cred == NULL) {
cred = u.u_cred; /* XXX need real cred! */
}
crhold(cred);
if (rp->r_cred) {
crfree(rp->r_cred);
}
rp->r_cred = cred;
}
}
/*
* Now pp will have the list of kept dirty pages marked for
* write back. It will also handle invalidation and freeing
* of pages that are not dirty. All the pages on the list
* returned need to still be dealt with here.
*/
/*
* Handle all the dirty pages not yet dealt with.
*/
while ((pp = dirty) != NULL) {
/*
* Pull off a contiguous chunk that fits in one lbn
*/
io_off = pp->p_offset;
lbn = io_off / bsize;
page_sub(&dirty, pp);
io_list = pp;
io_len = PAGESIZE;
lbn_off = lbn * bsize;
while (dirty != NULL && dirty->p_offset < lbn_off + bsize &&
dirty->p_offset == io_off + io_len) {
pp = dirty;
page_sub(&dirty, pp);
page_sortadd(&io_list, pp);
io_len += PAGESIZE;
}
/*
* Check for page length rounding problems
*/
if (io_off + io_len > lbn_off + bsize) {
ASSERT((io_off+io_len) - (lbn_off+bsize) < PAGESIZE);
io_len = lbn_off + bsize - io_off;
}
err = nfs_writelbn(rp, io_list, io_off, io_len, flags);
if (err)
break;
}
if (err != 0) {
if (dirty != NULL)
pvn_fail(dirty, B_WRITE | flags);
} else if (off == 0 && (len == 0 || len >= rp->r_size)) {
/*
* If doing "synchronous invalidation", make
* sure that all the pages are actually gone.
*/
if ((flags & (B_INVAL | B_ASYNC)) == B_INVAL &&
((vp->v_pages != NULL) && (vp->v_pages->p_lckcnt == 0)))
goto again;
}
out:
/*
* Instead of using VN_RELE here we are careful to only call
* the inactive routine if the vnode reference count is now zero,
* but it wasn't zero coming into putpage. This is to prevent
* recursively calling the inactive routine on a vnode that
* is already considered in the `inactive' state.
* XXX - inactive is a relative term here (sigh).
*/
if (--vp->v_count == 0) {
if (vpcount > 0) {
(void) nfs_inactive(vp, rp->r_cred);
} else {
((struct mntinfo *)(vp->v_vfsp->vfs_data))->mi_refct--;
}
}
return (err);
}
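/*
 * Map the file into the caller's address space via segvn.  Mapping is
 * refused if the file has been marked VNOCACHE by file locking.
 */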
/*ARGSUSED*/
static int
nfs_map(vp, off, as, addrp, len, prot, maxprot, flags, cred)
struct vnode *vp;
u_int off;
struct as *as;
addr_t *addrp;
u_int len;
u_int prot, maxprot;
u_int flags;
struct ucred *cred;
{
struct segvn_crargs vn_a;
VFS_RECORD(vp->v_vfsp, VS_MAP, VS_CALL);
if ((int)off < 0 || (int)(off + len) < 0)
return (EINVAL);
if (vp->v_type != VREG)
return (ENODEV);
/*
* Check to see if the vnode is currently marked as not cachable.
* If so, we have to refuse the map request as this violates the
* don't cache attribute.
*/
if (vp->v_flag & VNOCACHE)
return (EIO);
if ((flags & MAP_FIXED) == 0) {
map_addr(addrp, len, (off_t)off, 1);
if (*addrp == NULL)
return (ENOMEM);
} else {
/*
* User specified address - blow away any previous mappings
*/
(void) as_unmap(as, *addrp, len);
}
vn_a.vp = vp;
vn_a.offset = off;
vn_a.type = flags & MAP_TYPE;
vn_a.prot = prot;
vn_a.maxprot = maxprot;
vn_a.cred = cred;
vn_a.amp = NULL;
return (as_map(as, *addrp, len, segvn_create, (caddr_t)&vn_a));
}
static int
nfs_cmp(vp1, vp2)
struct vnode *vp1, *vp2;
{
VFS_RECORD(vp1->v_vfsp, VS_CMP, VS_CALL);
return (vp1 == vp2);
}
/*ARGSUSED*/
static int
nfs_realvp(vp, vpp)
struct vnode *vp;
struct vnode **vpp;
{
VFS_RECORD(vp->v_vfsp, VS_REALVP, VS_CALL);
return (EINVAL);
}
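/*
 * Return pathconf information from the mount point's cached pathcnf
 * structure.
 */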
/*ARGSUSED*/
static int
nfs_cntl(vp, cmd, idata, odata, iflag, oflag)
struct vnode *vp;
int cmd, iflag, oflag;
caddr_t idata, odata;
{
int error = 0;
/*
* This looks a little weird because it's written in a general
* manner but we make use of only one case. If cntl() ever gets
* widely used, the outer switch will make more sense.
*/
switch (cmd) {
default:
return (EINVAL);
case _PC_LINK_MAX:
case _PC_NAME_MAX:
case _PC_PATH_MAX:
case _PC_CHOWN_RESTRICTED:
case _PC_NO_TRUNC: {
struct mntinfo *mi;
struct pathcnf *pc;
if (!(mi = vtomi(vp)) || !(pc = mi->mi_pathconf))
return (EINVAL);
ASSERT(oflag == CNTL_INT32);
error = _PC_ISSET(cmd, pc->pc_mask); /* error or bool */
switch (cmd) {
case _PC_LINK_MAX:
*(int*)odata = pc->pc_link_max;
break;
case _PC_NAME_MAX:
*(int*)odata = pc->pc_name_max;
break;
case _PC_PATH_MAX:
*(int*)odata = pc->pc_path_max;
break;
case _PC_CHOWN_RESTRICTED:
*(int*)odata = error; /* see above */
break;
case _PC_NO_TRUNC:
*(int*)odata = error; /* see above */
break;
}
return (error ? EINVAL : 0);
}
}
}