#ident  "@(#)spec_vnodeops.c 1.1 94/10/31 SMI"

/*
 * Copyright (c) 1988 by Sun Microsystems, Inc.
 */


#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/kernel.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/stream.h>
#include <sys/stropts.h>
#include <sys/mman.h>
#include <sys/debug.h>
#include <sys/unistd.h>
#include <sys/termios.h>
#include <sys/vmmeter.h>
#include <specfs/snode.h>

#include <krpc/lockmgr.h>

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/as.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_dev.h>
#include <vm/seg_vn.h>
#include <vm/swap.h>

static	int spec_open();
static	int spec_close();
static	int spec_rdwr();
static	int spec_ioctl();
static	int spec_select();
static	int spec_getattr();
static	int spec_inactive();
static	int spec_noop();
static	int spec_getpage();
static	int spec_putpage();
static	int spec_map();
static	int spec_dump();
static  int spec_cmp();
/*
 * Used directly in fifo_vnodeops
 */
int spec_setattr();
int spec_access();
int spec_link();
int spec_lockctl();
int spec_fsync();
int spec_fid();
int spec_realvp();
int spec_cntl();

struct vnodeops spec_vnodeops = {
	spec_open,
	spec_close,
	spec_rdwr,
	spec_ioctl,
	spec_select,
	spec_getattr,
	spec_setattr,
	spec_access,
	spec_noop,	/* lookup */
	spec_noop,	/* create */
	spec_noop,	/* remove */
	spec_link,
	spec_noop,	/* rename */
	spec_noop,	/* mkdir */
	spec_noop,	/* rmdir */
	spec_noop,	/* readdir */
	spec_noop,	/* symlink */
	spec_noop,	/* readlink */
	spec_fsync,
	spec_inactive,
	spec_lockctl,
	spec_fid,
	spec_getpage,
	spec_putpage,
	spec_map,
	spec_dump,
	spec_cmp,
	spec_realvp,
	spec_cntl,
};

/*
 * open a special file (device)
 * Some weird stuff here having to do with clone and indirect devices:
 * When a file lookup operation happens (e.g. ufs_lookup) and the vnode has
 * type VDEV specvp() is used to return a spec vnode instead.  Then when
 * the VOP_OPEN routine is called, we get control here.  When we do the
 * device open routine there are several possible strange results:
 * 1) An indirect device will return the error EAGAIN on open and return
 *    a new dev number.  We have to make that into a spec vnode and call
 *    open on it again.
 * 2) The clone device driver will return the error EEXIST and return a
 *    new dev number.  As above, we build a new vnode and call open again,
 *    explicitly asking the open routine to do a clone open.
 * 3) A clone device will return a new dev number on open but no error.
 *    In this case we just make a new spec vnode out of the new dev number
 *    and return that.
 * The last two cases differ in that the decision to clone arises outside
 * of the target device in 2) and from within in 3).
 *
 * TODO: extend case 2) to apply to all character devices, not just streams
 * devices.
 */
#define	MAX_S_SIZE \
		((1 << sizeof (off_t) * NBBY - DEV_BSHIFT - 1) - 1)
/*ARGSUSED*/
static int
spec_open(vpp, flag, cred)
	struct vnode **vpp;
	int flag;
	struct ucred *cred;
{
	register struct snode *sp;
	dev_t dev;
	dev_t newdev;
	int sflag = 0;
	register int error;

	sp = VTOS(*vpp);

	/*
	 * Do open protocol for special type.
	 */
	dev = sp->s_dev;

	switch ((*vpp)->v_type) {

	case VCHR:
		newdev = dev;
		error = 0;
		for (;;) {
			register struct vnode *nvp;

			dev = newdev;
			if ((u_int)major(dev) >= nchrdev)
				return (ENXIO);

			while (isclosing(dev, (*vpp)->v_type))
				if ( sleep((caddr_t)sp, PSLEP|PCATCH))
					return (EINTR);

			if (cdevsw[major(dev)].d_str) {
				/*
				 * Open the stream.  Stropen handles
				 * the mechanics of cloning itself.
				 * In particular, it builds a fresh
				 * vnode for the cloned instance and
				 * does streams-specific cross-linking.
				 */
				error = stropen(vpp, flag, sflag);
				sp = VTOS(*vpp);
				break;
			} else
				error = (*cdevsw[major(dev)].d_open)(dev,
					flag, &newdev);

			/*
			 * If this is an indirect device or a forced clone,
			 * we need to do the open again.  In both cases,
			 * we insist that newdev differ from dev, to help
			 * avoid infinite regress.
			 */
			if (newdev == dev ||
			    (error != 0 && error != EAGAIN && error != EEXIST))
				break;

			/*
			 * Allocate new snode with new device.  Release old
			 * snode. Set vpp to point to new one.  This snode will
			 * go away when the last reference to it goes away.
			 * Warning: if you stat this, and try to match it with
			 * a name in the filesystem you will fail, unless you
			 * had previously put names in that match.
			 */
			nvp = makespecvp(newdev, VCHR);
			sp = VTOS(nvp);
			VN_RELE(*vpp);
			*vpp = nvp;

			/* If we've completed a clone open, we're done. */
			if (error == 0)
				break;
			else
				sflag = error == EEXIST ? CLONEOPEN : 0;
		}
		break;

	case VFIFO:
		printf("spec_open: got a VFIFO???\n");
		/* fall through to... */

	case VSOCK:
		error = EOPNOTSUPP;
		break;

	case VBLK:
		/*
		 * The block device sizing was already done in specvp().
		 * However, we still need to verify that we can open the
		 * block device here (since specvp was called as part of a
		 * "lookup", not an "open", and e.g. "stat"ing a block special
		 * file with an illegal major device number should be legal).
		 *
		 * With loadable drivers, removable media devices, or
		 * metadevices, the block device sizing might need to be
		 * done again, as the open will likely find a changed size
		 * to the device.
		 *
		 * If the special file for a device is opened before the
		 * driver is loaded, or a lookup on /dev is done, there
		 * might be an snode around with s_size == 0. In this case,
		 * we need to resize the device.
		 *
		 * Another way of putting it is that the XXsize function
		 * reports the current size, if any, of a device, and doesn't
		 * imply any other action that the driver will take, while
		 * the open might imply sizing.
		 */

		if ((u_int)major(dev) >= nblkdev)
			error = ENXIO;
		else
			error = (*bdevsw[major(dev)].d_open)(dev, flag);
		if (error == 0) {
			struct snode *sptmp;
			int (*size)() = bdevsw[major(dev)].d_psize;

			sptmp = VTOS(bdevvp(dev));
			if (size != NULL) {
				int rsize = (*size)(dev);
				if (rsize == -1)
					sptmp->s_size = 0;
				else
					sptmp->s_size =
					    dbtob(MIN(rsize, MAX_S_SIZE));
			}
			VN_RELE(STOV(sptmp));
		}
		break;
	default:
		panic("spec_open: type not VCHR or VBLK");
		break;
	}
	if (error == 0)
		sp->s_count++;		/* one more open reference */
	return (error);
}

/*ARGSUSED*/
static int
spec_close(vp, flag, count, cred)
	struct vnode *vp;
	int flag;
	int count;
	struct ucred *cred;
{
	register struct snode *sp;
	dev_t dev;

	if (count > 1)
		return (0);

	/*
	 * setjmp in case close is interrupted
	 */
	if (setjmp(&u.u_qsave)) {
		sp = VTOS(vp);	/* recompute - I don't trust setjmp/longjmp */
		sp->s_flag &= ~SCLOSING;
		wakeup((caddr_t)sp);
		return (EINTR);
	}

	sp = VTOS(vp);
	sp->s_count--;		/* one fewer open reference */

	/*
	 * Only call the close routine when the last open
	 * reference through any [s, v]node goes away.
	 */
	if (stillopen(sp->s_dev, vp->v_type))
		return (0);

	dev = sp->s_dev;

	switch (vp->v_type) {

	case VCHR:
		/*
		 * Mark this device as closing, so that opens will wait until
		 * the close finishes.  Since the close may block, this
		 * prevents an open from getting in while the close is blocked,
		 * and then getting surprised when the close finishes and
		 * potentially clears out the driver's state.
		 *
		 * XXX - really should be done on all devices, but for now we
		 * only do it on streams (as that's the one case where the
		 * close blocks before the close routine is called, and thus
		 * the one case where the close routine really can't protect
		 * itself).
		 */
		/*
		 * If it's a stream, call stream close routine.
		 */
		if (cdevsw[major(dev)].d_str) {
			sp->s_flag |= SCLOSING;
			strclose(vp, flag);
			sp->s_flag &= ~SCLOSING;
			wakeup((caddr_t)sp);
		} else
			(void) (*cdevsw[major(dev)].d_close)(dev, flag);
		break;

	case VBLK:
		/*
		 * On last close of a block device, we flush back
		 * and invalidate any in core buffers to help make
		 * the spec vnode inactive ASAP if it is not currently
		 * held by someone else for something (e.g., swapping).
		 */
		bflush(sp->s_bdevvp);
		binval(sp->s_bdevvp);
		(void) (*bdevsw[major(dev)].d_close)(dev, flag);
		break;

	case VFIFO:
		printf("spec_close: got a VFIFO???\n");
		break;
	}

	return (0);
}

/*
 * read or write a spec vnode
 */
/*ARGSUSED*/
static int
spec_rdwr(vp, uiop, rw, ioflag, cred)
	struct vnode *vp;
	register struct uio *uiop;
	enum uio_rw rw;
	int ioflag;
	struct ucred *cred;
{
	register struct snode *sp;
	register addr_t base;
	register u_int off;
	struct vnode *blkvp;
	dev_t dev;
	register int n, on;
	u_int flags;
	u_int bdevsize;
	int pagecreate;
	int error;
	extern int mem_no;

	sp = VTOS(vp);
	dev = (dev_t)sp->s_dev;
	if (rw != UIO_READ && rw != UIO_WRITE)
		panic("spec_rdwr");
	if (rw == UIO_READ && uiop->uio_resid == 0)
		return (0);
	n = uiop->uio_resid;
	/*
	 * If this I/O will carry us over the 2GB threshold,
	 * switch automatically to block mode if possible.
	 *
	 * XXX We switch if the I/O leaves us exactly at 2GB,
	 * which is arguably wrong, but the old code didn't
	 * allow such I/O's anyway, so there is no compatibility
	 * problem.
	 */
	if (vp->v_type == VCHR && (uiop->uio_fmode & FSETBLK) == 0 &&
	    mem_no != major(dev) && vp->v_stream == NULL &&
	    uiop->uio_offset >= 0 && uiop->uio_offset + n < 0 &&
	    uiop->uio_offset % DEV_BSIZE == 0) {
		uiop->uio_fmode |= FSETBLK;
		uiop->uio_offset = btodb(uiop->uio_offset);
	}
	if (uiop->uio_fmode & FSETBLK) {
		if (n % DEV_BSIZE != 0)
			return (EINVAL);
		n = btodb(n);
	}
	if ((uiop->uio_offset < 0 ||
	    (n != 0 && uiop->uio_offset + n - 1 < 0)) &&
	    !(vp->v_type == VCHR &&
	    (mem_no == major(dev) || vp->v_stream != NULL))) {
		return (EINVAL);
	}

	if (rw == UIO_READ)
		smark(sp, SACC);

	if (vp->v_type == VCHR) {
		if (rw == UIO_READ) {
			if (cdevsw[major(dev)].d_str) {
				int saverr = u.u_error;

				u.u_error = 0;
				strread(vp, uiop);
				error = u.u_error;
				u.u_error = saverr;
			} else
				error = (*cdevsw[major(dev)].d_read)(dev, uiop);
		} else {
			smark(sp, SUPD|SCHG);
			if (cdevsw[major(dev)].d_str) {
				int saverr = u.u_error;

				u.u_error = 0;
				strwrite(vp, uiop);
				error = u.u_error;
				u.u_error = saverr;
			} else
				error = (*cdevsw[major(dev)].d_write)(dev,
						uiop);
		}
		return (error);
	}

	if (vp->v_type != VBLK)
		return (EOPNOTSUPP);

	if (uiop->uio_resid == 0)
		return (0);

	error = 0;
	blkvp = sp->s_bdevvp;
	bdevsize = sp->s_size;
	do {
		int diff;

		off = uiop->uio_offset & MAXBMASK;
		on = uiop->uio_offset & MAXBOFFSET;
		n = MIN(MAXBSIZE - on, uiop->uio_resid);
		pagecreate = 0;
		diff = bdevsize - uiop->uio_offset;

		if (diff <= 0)
			break;
		if (diff < n)
			n = diff;

		base = segmap_getmap(segkmap, blkvp, off);

		/*
		 * Check to see if we can skip reading in the page
		 * and just allocate the memory.  We can do this
		 * if we are going to rewrite the entire mapping
		 * or if we are going to write to end of the device
		 * from the beginning of the mapping.
		 */
		if (rw == UIO_WRITE && (n == MAXBSIZE ||
		    (on == 0 && (off + n) == bdevsize))) {
			SNLOCK(sp);
			segmap_pagecreate(segkmap, base + on, (u_int)n, 0);
			SNUNLOCK(sp);
			pagecreate = 1;
		}

		error = uiomove(base + on, n, rw, uiop);

		if (pagecreate && uiop->uio_offset <
		    roundup(off + on + n, PAGESIZE)) {
			/*
			 * We created pages w/o initializing them completely,
			 * thus we need to zero the part that wasn't set up.
			 * This can happen if we write to the end of the device
			 * or if we had some sort of error during the uiomove.
			 */
			int nzero, nmoved;

			nmoved = uiop->uio_offset - (off + on);
			ASSERT(nmoved >= 0 && nmoved <= n);
			nzero = roundup(on + n, PAGESIZE) - nmoved;
			ASSERT(nzero > 0 && on + nmoved + nzero <= MAXBSIZE);
			(void) kzero(base + on + nmoved, (u_int)nzero);
		}

		if (error == 0) {
			flags = 0;
			if (rw == UIO_WRITE) {
				/*
				 * Force write back for synchronous write cases.
				 */
				if (ioflag & IO_SYNC) {
					flags = SM_WRITE;
				} else if (n + on == MAXBSIZE ||
				    IS_SWAPVP(vp)) {
					/*
					 * Have written a whole block.
					 * Start an asynchronous write and
					 * mark the buffer to indicate that
					 * it won't be needed again soon.
					 * Push swap files here, since it
					 * won't happen anywhere else.
					 */
					flags = SM_WRITE | SM_ASYNC |
					    SM_DONTNEED;
				}
				smark(sp, SUPD|SCHG);
			} else if (rw == UIO_READ) {
				/*
				 * If read a whole block, won't need this
				 * buffer again soon.  Don't mark it with
				 * SM_FREE, as that can lead to a deadlock
				 * if the block corresponds to a u-page.
				 * (The keep count never drops to zero, so
				 * waiting for "i/o to complete" never
				 * terminates; this points out a flaw in
				 * our locking strategy.)
				 */
				if (n + on == MAXBSIZE)
					flags = SM_DONTNEED;
			}
			error = segmap_release(segkmap, base, flags);
		} else {
			(void) segmap_release(segkmap, base, 0);
		}

	} while (error == 0 && uiop->uio_resid > 0 && n != 0);

	return (error);
}

/*ARGSUSED*/
static int
spec_ioctl(vp, com, data, flag, cred)
	struct vnode *vp;
	int com;
	caddr_t data;
	int flag;
	struct ucred *cred;
{
	register struct snode *sp;

	sp = VTOS(vp);
	if (vp->v_type != VCHR)
		panic("spec_ioctl");
	if (cdevsw[major(sp->s_dev)].d_str) {
		int saverr = u.u_error;
		int error;

		u.u_error = 0;
		strioctl(vp, com, data, flag);
		error = u.u_error;
		u.u_error = saverr;
		return (error);
	}
	return ((*cdevsw[major(sp->s_dev)].d_ioctl)
	    (sp->s_dev, com, data, flag));
}

/*ARGSUSED*/
static int
spec_select(vp, which, cred)
	struct vnode *vp;
	int which;
	struct ucred *cred;
{
	register struct snode *sp;

	sp = VTOS(vp);
	if (vp->v_type != VCHR)
		panic("spec_select");
	if (cdevsw[major(sp->s_dev)].d_str)
		return (strselect(vp, which));
	else
		return ((*cdevsw[major(sp->s_dev)].d_select)(sp->s_dev, which));
}

static int
spec_inactive(vp, cred)
	struct vnode *vp;
	struct ucred *cred;
{
	struct snode *sp;
	int error;

	sp = VTOS(vp);

	/* XXX before removing the snode reset stream */
	if (vp->v_type == VCHR && vp->v_stream)
		vp->v_stream->sd_vnode = other_specvp(vp);

	/* must sunsave() first to prevent a race when spec_fsync() sleeps */
	sunsave(sp);

	if (sp->s_realvp && (sp->s_bdevvp == NULL || !IS_SWAPVP(sp->s_bdevvp)))
		(void) spec_fsync(vp, cred);
	if (vp->v_type == VBLK && vp->v_pages != NULL) {
		/*
		 * Device is no longer referenced by anyone.
		 * Destroy all the old pages (which BTW don't
		 * count against the vnode reference count) so
		 * we can, for instance, change floppy disks.
		 */
		error = spec_putpage(sp->s_bdevvp, 0, 0, B_INVAL,
		    (struct ucred *)0);
	} else {
		error = 0;
	}

	/* now free the realvp (no longer done by sunsave()) */
	if (sp->s_realvp) {
		VN_RELE(sp->s_realvp);
		sp->s_realvp = NULL;
		if (sp->s_bdevvp)
			VN_RELE(sp->s_bdevvp);
	}

	kmem_free((caddr_t)sp, sizeof (*sp));
	return (error);
}

static int
spec_getattr(vp, vap, cred)
	struct vnode *vp;
	register struct vattr *vap;
	struct ucred *cred;
{
	int error;
	register struct snode *sp;
	register struct vnode *realvp;

	sp = VTOS(vp);
	if ((realvp = sp->s_realvp) == NULL) {
		/*
		 * No real vnode behind this one.
		 * Set the device size from snode.
		 * Set times to the present.
		 * Set blocksize based on type in the unreal vnode.
		 */
		bzero((caddr_t)vap, sizeof (*vap));
		vap->va_size = sp->s_size;
		vap->va_rdev = sp->s_dev;
		vap->va_type = vp->v_type;
		vap->va_nodeid = ++fake_vno;
	} else {
		extern int dump_no;
		error = VOP_GETATTR(realvp, vap, cred);
		if (error != 0)
			return (error);
		/* if this is the dump file, copy the size, too */
		/* XXX there should be a more general way of doing this */
		if (vp->v_type == VCHR && dump_no == major(sp->s_dev))
			vap->va_size = sp->s_size;
	}
	/* set current times from snode, even if older than vnode */
	vap->va_atime = sp->s_atime;
	vap->va_mtime = sp->s_mtime;
	vap->va_ctime = sp->s_ctime;

	/* set device-dependent blocksizes */
	switch (vap->va_type) {
	case VBLK:
		vap->va_blocksize = MAXBSIZE;		/* was BLKDEV_IOSIZE */
		break;

	case VCHR:
		vap->va_blocksize = MAXBSIZE;
		break;
	}
	return (0);
}

int
spec_setattr(vp, vap, cred)
	struct vnode *vp;
	register struct vattr *vap;
	struct ucred *cred;
{
	register struct snode *sp;
	register struct vnode *realvp;
	int error;
	register int chtime = 0;

	sp = VTOS(vp);
	if ((realvp = sp->s_realvp) == NULL)
		error = 0;			/* no real vnode to update */
	else
		error = VOP_SETATTR(realvp, vap, cred);
	if (error == 0) {
		/* if times were changed, update snode */
		if (vap->va_mtime.tv_sec != -1) {

			/*
			 * If SysV-compatible option to set access and
			 * modified times if root, owner, or write access,
			 * need to read back the new times in order to
			 * keep the snode times in sync.  If VOP_GETATTR()
			 * fails, use current client time as an approximation.
			 *
			 * XXX - va_mtime.tv_usec == -1 flags this.
			 */
			if (vap->va_mtime.tv_usec == -1) {
				struct vattr vtmp;

				if ((realvp == NULL) ||
				    VOP_GETATTR(realvp, &vtmp, cred) != 0) {
					/* if error, simulate server time */
					sp->s_mtime = time;
					sp->s_atime = time;
					sp->s_ctime = time;
				} else {
					sp->s_mtime = vtmp.va_mtime;
					sp->s_atime = vtmp.va_atime;
					sp->s_ctime = vtmp.va_ctime;
				}
				goto no_chtime;
			}

			sp->s_mtime = vap->va_mtime;
			chtime++;
		}
		if (vap->va_atime.tv_sec != -1) {
			sp->s_atime = vap->va_atime;
			chtime++;
		}
		if (chtime)
			sp->s_ctime = time;
	}
no_chtime:
	return (error);
}

int
spec_access(vp, mode, cred)
	struct vnode *vp;
	int mode;
	struct ucred *cred;
{
	register struct vnode *realvp;

	if ((realvp = VTOS(vp)->s_realvp) != NULL)
		return (VOP_ACCESS(realvp, mode, cred));
	else
		return (0);	/* allow all access */
}

int
spec_link(vp, tdvp, tnm, cred)
	struct vnode *vp;
	struct vnode *tdvp;
	char *tnm;
	struct ucred *cred;
{
	register struct vnode *realvp;

	if ((realvp = VTOS(vp)->s_realvp) != NULL)
		return (VOP_LINK(realvp, tdvp, tnm, cred));
	else
		return (ENOENT); /* can't link to something non-existent */
}

/*
 * In order to sync out the snode times without multi-client problems,
 * make sure the times written out are never earlier than the times
 * already set in the vnode.
 */
int
spec_fsync(vp, cred)
	struct vnode *vp;
	struct ucred *cred;
{
	register int error = 0;
	register struct snode *sp;
	register struct vnode *realvp;
	struct vattr *vap;
	struct vattr *vatmp;
	int err;

	sp = VTOS(vp);
	/*
	 * If times didn't change on a non-block
	 * special file, don't flush anything.
	 */
	if ((sp->s_flag & (SACC|SUPD|SCHG)) == 0 && vp->v_type != VBLK)
		return (0);
	sp->s_flag &= ~(SACC|SUPD|SCHG);

	/*
	 * If the vnode represents a block device and it is a "shadow"
	 * vnode, then flush all pages associated with the "common" vnode.
	 */
	if (vp->v_type == VBLK && sp->s_bdevvp != vp &&
	    sp->s_bdevvp->v_pages != NULL)
		error = spec_putpage(sp->s_bdevvp, 0, 0, 0,
		    (struct ucred *)0);

	/*
	 * If no real vnode to update, don't flush anything
	 */
	if ((realvp = sp->s_realvp) == NULL)
		return (error);

	vatmp = (struct vattr *)new_kmem_alloc(sizeof (*vatmp), KMEM_SLEEP);
	err = VOP_GETATTR(realvp, vatmp, cred);
	if (err == 0) {
		vap = (struct vattr *)new_kmem_alloc(sizeof (*vap), KMEM_SLEEP);
		vattr_null(vap);
		vap->va_atime = timercmp(&vatmp->va_atime, &sp->s_atime, >) ?
		    vatmp->va_atime : sp->s_atime;
		vap->va_mtime = timercmp(&vatmp->va_mtime, &sp->s_mtime, >) ?
		    vatmp->va_mtime : sp->s_mtime;
		VOP_SETATTR(realvp, vap, cred);
		kmem_free((caddr_t)vap, sizeof (*vap));
	}
	kmem_free((caddr_t)vatmp, sizeof (*vatmp));
	(void) VOP_FSYNC(realvp, cred);
	return (error);
}

static int
spec_dump(vp, addr, bn, count)
	struct vnode *vp;
	caddr_t addr;
	int bn;
	int count;
{

	return ((*bdevsw[major(vp->v_rdev)].d_dump)
	    (vp->v_rdev, addr, bn, count));
}

static int
spec_noop()
{

	return (EINVAL);
}

/*
 * Record-locking requests are passed back to the real vnode handler.
 */
int
spec_lockctl(vp, ld, cmd, cred, clid)
	struct vnode *vp;
	struct flock *ld;
	int cmd;
	struct ucred *cred;
	int clid;
{
	register struct vnode *realvp;

	if ((realvp = VTOS(vp)->s_realvp) != NULL)
		return (VOP_LOCKCTL(realvp, ld, cmd, cred, clid));
	else
		return (EINVAL);	/* can't lock this, it doesn't exist */
}

int
spec_fid(vp, fidpp)
	struct vnode *vp;
	struct fid **fidpp;
{
	register struct vnode *realvp;

	if ((realvp = VTOS(vp)->s_realvp) != NULL)
		return (VOP_FID(realvp, fidpp));
	else
		return (EINVAL);	/* you lose */
}

/*
 * klustsize should be a multiple of PAGESIZE and <= MAXPHYS.
 */
#define	KLUSTSIZE	(56 * 1024)
int klustsize = KLUSTSIZE;
int spec_ra = 1;
int spec_lostpage;	/* number of times we lost original page */

/*
 * Called from pvn_getpages or spec_getpage to get a particular page.
 * When we are called the snode is already locked.
 */
/*ARGSUSED*/
static int
spec_getapage(vp, off, protp, pl, plsz, seg, addr, rw, cred)
	register struct vnode *vp;
	u_int off, *protp;
	struct page *pl[];
	u_int plsz;
	struct seg *seg;
	addr_t addr;
	enum seg_rw rw;
	struct ucred *cred;
{
	register struct snode *sp;
	struct buf *bp, *bp2;
	struct page *pp, *pp2, **ppp, *pagefound;
	u_int io_off, io_len;
	u_int blksz, blkoff;
	int dora, err;
	u_int xlen;

	sp = VTOS(vp);

reread:
	err = 0;
	bp = NULL;
	bp2 = NULL;

	if (spec_ra && sp->s_nextr == off)
		dora = 1;
	else
		dora = 0;

	/*
	 * We SNLOCK here to try and allow more concurrent access
	 * to the snode.  We release the lock as soon as we know
	 * we won't be allocating more pages for the vnode.
	 * NB: It's possible that the snode was already locked by
	 * this process (e.g. we were called through pvn_getpages),
	 * thus we are assuming that SNLOCK is recursive.
	 */
	SNLOCK(sp);
again:
	if ((pagefound = page_find(vp, off)) == NULL) {
		/*
		 * Need to really do disk IO to get the page.
		 */
		blkoff = (off / klustsize) * klustsize;
		if (blkoff + klustsize <= sp->s_size)
			blksz = klustsize;
		else
			blksz = sp->s_size - blkoff;

		pp = pvn_kluster(vp, off, seg, addr, &io_off, &io_len,
		    blkoff, blksz, 0);
		/*
		 * Somebody has entered the page before us, so
		 * just use it.
		 */
		if (pp == NULL)
			goto again;

		if (!dora)
			SNUNLOCK(sp);

		if (pl != NULL) {
			register int sz;

			if (plsz >= io_len) {
				/*
				 * Everything fits, set up to load
				 * up and hold all the pages.
				 */
				pp2 = pp;
				sz = io_len;
			} else {
				/*
				 * Set up to load plsz worth
				 * starting at the needed page.
				 */
				for (pp2 = pp; pp2->p_offset != off;
				    pp2 = pp2->p_next) {
					ASSERT(pp2->p_next->p_offset !=
					    pp->p_offset);
				}
				sz = plsz;
			}

			ppp = pl;
			do {
				PAGE_HOLD(pp2);
				*ppp++ = pp2;
				pp2 = pp2->p_next;
				sz -= PAGESIZE;
			} while (sz > 0);
			*ppp = NULL;		/* terminate list */
		}

		bp = pageio_setup(pp, io_len, vp, pl == NULL ?
		    (B_ASYNC | B_READ) : B_READ);

		bp->b_dev = vp->v_rdev;
		bp->b_blkno = btodb(io_off);

		/*
		 * Zero part of page which we are not
		 * going to be reading from disk now.
		 */
		xlen = io_len & PAGEOFFSET;
		if (xlen != 0)
			pagezero(pp->p_prev, xlen, PAGESIZE - xlen);

		(*bdevsw[major(vp->v_rdev)].d_strategy)(bp);

		sp->s_nextr = io_off + io_len;
		u.u_ru.ru_majflt++;
		if (seg == segkmap)
			u.u_ru.ru_inblock++;	/* count as `read' operation */
		cnt.v_pgin++;
		cnt.v_pgpgin += btopr(io_len);
	} else if (!dora)
		SNUNLOCK(sp);

	if (dora) {
		u_int off2;
		addr_t addr2;

		off2 = ((off / klustsize) + 1) * klustsize;
		addr2 = addr + (off2 - off);

		/*
		 * If addr is now in a different seg or we are past
		 * EOF then don't bother trying with read-ahead.
		 */
		if (addr2 >= seg->s_base + seg->s_size || off2 >= sp->s_size) {
			pp2 = NULL;
		} else {
			if (off2 + klustsize <= sp->s_size)
				blksz = klustsize;
			else
				blksz = sp->s_size - off2;

			pp2 = pvn_kluster(vp, off2, seg, addr2, &io_off,
			    &io_len, off2, blksz, 1);
		}

		SNUNLOCK(sp);

		if (pp2 != NULL) {
			bp2 = pageio_setup(pp2, io_len, vp, B_READ | B_ASYNC);

			bp2->b_dev = vp->v_rdev;
			bp2->b_blkno = btodb(io_off);

			/*
			 * Zero part of page which we are not
			 * going to be reading from disk now.
			 */
			xlen = io_len & PAGEOFFSET;
			if (xlen != 0)
				pagezero(pp2->p_prev, xlen, PAGESIZE - xlen);

			(*bdevsw[major(vp->v_rdev)].d_strategy)(bp2);

			/*
			 * Should we bill read ahead to extra faults?
			 */
			u.u_ru.ru_majflt++;
			if (seg == segkmap)
				u.u_ru.ru_inblock++;	/* count as `read' */
			cnt.v_pgin++;
			cnt.v_pgpgin += btopr(io_len);
		}
	}

	if (bp != NULL && pl != NULL) {
		err = biowait(bp);
		pageio_done(bp);
	} else if (pagefound != NULL) {
		register int s;

		/*
		 * We need to be careful here because if the page was
		 * previously on the free list, we might have already
		 * lost it at interrupt level.
		 */
		s = splvm();
		if (pagefound->p_vnode == vp && pagefound->p_offset == off) {
			/*
			 * If the page is still intransit or if
			 * it is on the free list call page_lookup
			 * to try and wait for / reclaim the page.
			 */
			if (pagefound->p_intrans || pagefound->p_free)
				pagefound = page_lookup(vp, off);
		}
		if (pagefound == NULL || pagefound->p_offset != off ||
		    pagefound->p_vnode != vp || pagefound->p_gone) {
			(void) splx(s);
			spec_lostpage++;
			goto reread;
		}
		if (pl != NULL) {
			PAGE_HOLD(pagefound);
			pl[0] = pagefound;
			pl[1] = NULL;
			u.u_ru.ru_minflt++;
			sp->s_nextr = off + PAGESIZE;
		}
		(void) splx(s);
	}

	if (err && pl != NULL) {
		for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
			PAGE_RELE(*ppp);
	}

	return (err);
}

/*
 * Return all the pages from [off..off+len) in block device
 */
static int
spec_getpage(vp, off, len, protp, pl, plsz, seg, addr, rw, cred)
	struct vnode *vp;
	u_int off, len;
	u_int *protp;
	struct page *pl[];
	u_int plsz;
	struct seg *seg;
	addr_t addr;
	enum seg_rw rw;
	struct ucred *cred;
{
	struct snode *sp = VTOS(vp);
	int err;

	if (vp->v_type != VBLK || sp->s_bdevvp != vp)
		panic("spec_getpage");

	if (off + len - PAGEOFFSET > sp->s_size)
		return (EFAULT);	/* beyond EOF */

	if (protp != NULL)
		*protp = PROT_ALL;

	if (len <= PAGESIZE)
		err = spec_getapage(vp, off, protp, pl, plsz, seg, addr,
		    rw, cred);
	else {
		SNLOCK(sp);
		err = pvn_getpages(spec_getapage, vp, off, len, protp, pl,
		    plsz, seg, addr, rw, cred);
		SNUNLOCK(sp);
	}

	return (err);
}

/*
 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
 */
static int
spec_wrtblk(vp, pp, off, len, flags)
	struct vnode *vp;
	struct page *pp;
	u_int off, len;
	int flags;
{
	struct buf *bp;
	int err;

	bp = pageio_setup(pp, len, vp, B_WRITE | flags);
	if (bp == NULL) {
		pvn_fail(pp, B_WRITE | flags);
		return (ENOMEM);
	}

	bp->b_dev = vp->v_rdev;
	bp->b_blkno = btodb(off);

	(*bdevsw[major(vp->v_rdev)].d_strategy)(bp);
	u.u_ru.ru_oublock++;

	/*
	 * If async, assume that pvn_done will
	 * handle the pages when IO is done
	 */
	if ((flags & B_ASYNC) != 0)
		return (0);

	err = biowait(bp);
	pageio_done(bp);

	return (err);
}

/*
 * Flags are composed of {B_ASYNC, B_INVAL, B_DIRTY B_FREE, B_DONTNEED}
 * If len == 0, do from off to EOF.
 *
 * The normal cases should be len == 0 & off == 0 (entire vp list),
 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
 * (from pageout).
 */
/*ARGSUSED*/
static int
spec_putpage(vp, off, len, flags, cred)
	register struct vnode *vp;
	u_int off;
	u_int len;
	int flags;
	struct ucred *cred;
{
	register struct snode *sp;
	register struct page *pp;
	struct page *dirty, *io_list;
	register u_int io_off, io_len;
	int vpcount;
	int err = 0;

	sp = VTOS(vp);
	if (vp->v_pages == NULL || off >= sp->s_size)
		return (0);

	if (vp->v_type != VBLK || sp->s_bdevvp != vp)
		panic("spec_putpage");

	vpcount = vp->v_count;
	VN_HOLD(vp);

again:
	if (len == 0) {
		/*
		 * We refuse to act on behalf of the pageout daemon to push
		 * out a page to a snode which is currently locked.
		 */
		if ((sp->s_flag & SLOCKED) && u.u_procp == &proc[2]) {
			err = EWOULDBLOCK;		/* XXX */
			goto out;
		}

		/*
		 * Search the entire vp list for pages >= off.
		 * We lock the snode here to prevent us from having
		 * multiple instances of pvn_vplist_dirty working
		 * on the same vnode active at the same time.
		 */
		SNLOCK(sp);
		dirty = pvn_vplist_dirty(vp, off, flags);
		SNUNLOCK(sp);
	} else {
		/*
		 * Do a range from [off...off + len) via page_find.
		 * We set limits so that we kluster to klustsize boundaries.
		 */
		if (off >= sp->s_size) {
			dirty = NULL;
		} else {
			u_int fsize, eoff, offlo, offhi;

			fsize = (sp->s_size + PAGEOFFSET) & PAGEMASK;
			eoff = MIN(off + len, fsize);
			offlo = (off / klustsize) * klustsize;
			offhi = roundup(eoff, klustsize);
			dirty = pvn_range_dirty(vp, off, eoff, offlo, offhi,
			    flags);
		}
	}

	/*
	 * Now pp will have the list of kept dirty pages marked for
	 * write back.  It will also, handle invalidation and freeing
	 * of pages that are not dirty.  All the pages on the list
	 * returned need to still be dealt with here.
	 */

	/*
	 * Destroy read ahead value (since we are really going to write)
	 */
	if (dirty != NULL)
		sp->s_nextr = 0;

	/*
	 * Handle all the dirty pages not yet dealt with.
	 */
	while ((pp = dirty) != NULL) {
		/*
		 * Pull off a contiguous chunk
		 */
		page_sub(&dirty, pp);
		io_list = pp;
		io_off = pp->p_offset;
		io_len = PAGESIZE;
		while (dirty != NULL && dirty->p_offset == io_off + io_len) {
			pp = dirty;
			page_sub(&dirty, pp);
			page_sortadd(&io_list, pp);
			io_len += PAGESIZE;
			if (io_len >= klustsize - PAGEOFFSET)
				break;
		}
		/*
		 * Check for page length rounding problems
		 */
		if (io_off + io_len > sp->s_size) {
			ASSERT((io_off + io_len) - sp->s_size < PAGESIZE);
			io_len = sp->s_size - io_off;
		}
		err = spec_wrtblk(vp, io_list, io_off, io_len, flags);
		if (err)
			break;
	}

	if (err != 0) {
		if (dirty != NULL)
			pvn_fail(dirty, B_WRITE | flags);
	} else if (off == 0 && (len == 0 || len >= sp->s_size)) {
		/*
		 * If doing "synchronous invalidation", make
		 * sure that all the pages are actually gone.
		 */
		if ((flags & (B_INVAL | B_ASYNC)) == B_INVAL &&
		    (vp->v_pages != NULL))
			goto again;
	}

out:
	/*
	 * Instead of using VN_RELE here we are careful to only call
	 * the inactive routine if the vnode reference count is now zero,
	 * but it wasn't zero coming into putpage.  This is to prevent
	 * recursively calling the inactive routine on a vnode that
	 * is already considered in the `inactive' state.
	 * XXX - inactive is a relative term here (sigh).
	 */
	if (--vp->v_count == 0 && vpcount > 0)
		(void) spec_inactive(vp, cred);
	return (err);
}

/*
 * This routine is called through the cdevsw[] table to handle
 * traditional mmap'able devices that support a d_mmap function.
 */
/*ARGSUSED*/
int
spec_segmap(dev, off, as, addrp, len, prot, maxprot, flags, cred)
	dev_t dev;
	u_int off;
	struct as *as;
	addr_t *addrp;
	u_int len;
	u_int prot, maxprot;
	u_int flags;
	struct ucred *cred;
{
	struct segdev_crargs dev_a;
	int (*mapfunc)();
	register int i;

	if ((mapfunc = cdevsw[major(dev)].d_mmap) == NULL)
		return (ENODEV);

	/*
	 * Character devices that support the d_mmap
	 * interface can only be mmap'ed shared.
	 */
	if ((flags & MAP_TYPE) != MAP_SHARED)
		return (EINVAL);

	/*
	 * Check to insure that the entire range is
	 * legal and we are not trying to map in
	 * more than the device will let us.
	 */
	for (i = 0; i < len; i += PAGESIZE) {
		if ((*mapfunc)(dev, off + i, maxprot) == -1)
			return (ENXIO);
	}

	if ((flags & MAP_FIXED) == 0) {
		/*
		 * Pick an address w/o worrying about
		 * any vac alignment contraints.
		 */
		map_addr(addrp, len, (off_t)off, 0);
		if (*addrp == NULL)
			return (ENOMEM);
	} else {
		/*
		 * User specified address -
		 * Blow away any previous mappings.
		 */
		(void) as_unmap(as, *addrp, len);
	}

	dev_a.mapfunc = mapfunc;
	dev_a.dev = dev;
	dev_a.offset = off;
	dev_a.prot = prot;
	dev_a.maxprot = maxprot;

	return (as_map(as, *addrp, len, segdev_create, (caddr_t)&dev_a));
}

static int
spec_map(vp, off, as, addrp, len, prot, maxprot, flags, cred)
	struct vnode *vp;
	u_int off;
	struct as *as;
	addr_t *addrp;
	u_int len;
	u_int prot, maxprot;
	u_int flags;
	struct ucred *cred;
{

	if (vp->v_type == VCHR) {
		int (*segmap)();
		dev_t dev = vp->v_rdev;

		/*
		 * Character device, let the device driver
		 * pick the appropriate segment driver.
		 */
		segmap = cdevsw[major(dev)].d_segmap;
		if (segmap == NULL) {
			if (cdevsw[major(dev)].d_mmap == NULL)
				return (ENODEV);

			/*
			 * For cdevsw[] entries that specify a d_mmap
			 * function but don't have a d_segmap function,
			 * we default to spec_segmap for compatibility.
			 */
			segmap = spec_segmap;
		}

		return ((*segmap)(dev, off, as, addrp, len, prot, maxprot,
		    flags, cred));
	} else if (vp->v_type == VBLK) {
		struct segvn_crargs vn_a;

		/*
		 * Block device, use the underlying bdevvp name for pages.
		 */
		if ((int)off < 0 || (int)(off + len) < 0)
			return (EINVAL);

		if ((flags & MAP_FIXED) == 0) {
			map_addr(addrp, len, (off_t)off, 1);
			if (*addrp == NULL)
				return (ENOMEM);
		} else {
			/*
			 * User specified address -
			 * Blow away any previous mappings.
			 */
			(void) as_unmap(as, *addrp, len);
		}

		ASSERT(VTOS(vp)->s_bdevvp != NULL);

		vn_a.vp = VTOS(vp)->s_bdevvp;
		vn_a.offset = off;
		vn_a.type = flags & MAP_TYPE;
		vn_a.prot = prot;
		vn_a.maxprot = maxprot;
		vn_a.cred = cred;
		vn_a.amp = NULL;

		return (as_map(as, *addrp, len, segvn_create, (caddr_t)&vn_a));
	} else {
		return (ENODEV);
	}
}

static int
spec_cmp(vp1, vp2)
	struct vnode *vp1, *vp2;
{

	return (vp1 == vp2);
}

int
spec_realvp(vp, vpp)
	struct vnode *vp;
	struct vnode **vpp;
{
	extern struct vnodeops spec_vnodeops;
	extern struct vnodeops fifo_vnodeops;
	struct vnode *rvp;

	if (vp &&
	    (vp->v_op == &spec_vnodeops || vp->v_op == &fifo_vnodeops)) {
		vp = VTOS(vp)->s_realvp;
	}
	if (vp && VOP_REALVP(vp, &rvp) == 0) {
		vp = rvp;
	}
	*vpp = vp;
	return (0);
}

spec_cntl(vp, cmd, idata, odata, iflg, oflg)
	struct vnode *vp;
	int cmd, iflg, oflg;
	caddr_t idata, odata;
{
	struct vnode *realvp;
	int error;

	switch (cmd) {
	/*
	 * ask the dev for this one
	 */
	case _PC_MAX_INPUT:
		if (vp->v_type == VCHR && vp->v_stream) {
			ASSERT(odata && oflg == CNTL_INT32);
			return (VOP_IOCTL(vp, TIOCISIZE, odata, 0, 0));
		} else if ((realvp = other_specvp(vp)) &&
		    realvp->v_type == VCHR && realvp->v_stream) {
			ASSERT(odata && oflg == CNTL_INT32);
			vp->v_stream = realvp->v_stream;
			return (VOP_IOCTL(vp, TIOCISIZE, odata, 0, 0));
		} else {
			/*
			 * This is for posix conformance.  Max input will
			 * always be at least 1 char.  Used to return EINVAL
			 */
			*odata = 1;
			return(0);
		}

	/*
	 * ask the supporting fs for everything else
	 */
	default:
		if (error = VOP_REALVP(vp, &realvp))
			return (error);
		return (VOP_CNTL(realvp, cmd, idata, odata, iflg, oflg));
	}
	/*NOTREACHED*/
}