seta75D
2021-10-11 18:20:23 -03:00
commit 2e8a93c394
13448 changed files with 2891753 additions and 0 deletions

15
sys/specfs/Makefile Normal file

@@ -0,0 +1,15 @@
#
# @(#)Makefile 1.1 92/07/30 SMI
#
HFILES=fifo.h fifonode.h snode.h
HDIR=$(DESTDIR)/usr/include/specfs
all: $(HFILES)
clean:
install: $(HFILES)
install_h: $(HFILES)
install -d -m 755 $(HDIR)
install -m 444 $(HFILES) $(HDIR)

576
sys/specfs/bdev_dsort.c Normal file

@@ -0,0 +1,576 @@
#ident "@(#)bdev_dsort.c 1.1 92/07/30 SMI" /* from UCB 4.3 81/03/09 */
/*
* Seek sort for disks. We depend on the driver
* which calls us using b_resid as the current cylinder number.
*
* The argument dp structure holds a b_actf activity chain pointer
* on which we keep two queues, sorted in ascending cylinder order.
* The first queue holds those requests which are positioned after
* the current cylinder (in the first request); the second holds
* requests which came in after their cylinder number was passed.
* Thus we implement a one way scan, retracting after reaching the
* end of the drive to the first request on the second queue,
* at which time it becomes the first queue.
*
* A one-way scan is natural because of the way UNIX read-ahead
* blocks are allocated.
*
* This implementation also allows certain page-oriented operations
* to 'kluster' up into a single request.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <vm/page.h>
#include <sys/kmem_alloc.h>
#define b_cylin b_resid
/*
* A kluster structure manages pools of buffers which are klustered together
* with a single primary buffer. The single primary buffer is an original
* buffer that has been modified to perform I/O for all the buffers in the
* pool.
*
* The traditional action of disksort() is to sort a buffer into a disk
* activity queue, using the b_resid field as a sort key and using the
* av_forw tag as the forward link field. The av_back field for each buffer
* is available for the driver to use as it sees fit.
*
* Diagrammatically, this is how this looks:
*
* queue header:
* front -----------> buf --------> buf --------> buf ---> 0
* back -----------------------------------------^
*
*
* When a certain set of conditions is met (see below), instead of sorting
* a new buffer into this queue, we try to modify a buffer that is
* currently in the queue to do the I/O operation for the new buffer.
*
*
* In this case, we allocate a kluster structure and modify things such
* that things look like this:
*
* queue header:
*   front -----------> buf ------> buf --------> buf ---> 0
*   back ----------------------------------------^
*                                  ^
* klust struct                     |
*   prime buf ---------------------/
*   front ------------> buf
*   tail ---------------^
*
* The kluster structure also maintains a copy of the original b_bcount
* field for the primary buffer. This is so that when this arrangement is
* decommissioned (either by calling klustdone() or klustbust()), that
* the original primary buffer structure ends up looking the same as
* if it had never been operated on by the klustering code.
*
* The conditions for which klustering might apply are these:
*
* 1) The driver wishing to use klustering calls the klustsort()
* function instead of the disksort() function. The klustsort()
* function behaves as disksort() did, except that it takes a third
* argument: the exclusive integer upper limit (in bytes)
* to which a kluster operation can be increased. The
* old disksort() interface is maintained by having it turn right
* around and call klustsort() with this argument as zero.
*
* 2) The driver uses the b_resid sort key to sort by absolute
* logical block. Historically, the sort key has been just the
* drive cylinder number for the request. This allows a number
* of requests for a drive to be partially sorted with respect
* to the drive layout, and is more or less optimal for devices
* where the notion of cylinder is still meaningful (SA-450, ST-506,
* ESDI, and SMD devices), but is not particularly meaningful for
* devices which are logically addressed (SCSI and IPI).
*
* 3) A number of conditions for both the buffer already in the
* queue and the new buffer to be sorted or klustered must be met.
* These form a fairly limiting and restrictive set of conditions.
*
* + The buffer in the queue is not the head of the queue
* (i.e., isn't the 'active' request).
*
* + The b_dev fields of both buffers are the same.
*
* + The buffer being added has only B_PAGEIO set of the
* flags B_KLUSTER, B_REMAPPED, B_PAGEIO and B_READ.
*
* + The buffer already in the queue has only B_PAGEIO set
* of the flags B_REMAPPED, B_PAGEIO and B_READ.
*
* + The b_un.b_addr field of both buffers is zero.
*
* + The b_bcount field of both buffers is mmu page aligned.
*
* + The logical block number for the buffer already in the
* queue plus btodb() of its b_bcount field
* equals the b_blkno field of the buffer to be added
* (i.e., a logically contiguous set of disk blocks is
* maintained)
*
* + The b_bcount field of the buffer in the queue plus the
* b_bcount field of the buffer to be added does not equal
* or exceed the maximum as passed by the driver.
*
* The intent of these conditions is to ensure that all buffers are
* pure page-oriented operations, are write operations only, are for
* logically contiguous areas of the device, and do not exceed some
* count limitation specified by the driver, before allowing a new
* buffer to be klustered with a buffer already in the queue rather
* than being sorted into the queue.
*
* If these conditions are met, a routine is called which attempts
* to add the new buffer to a list of buffers that are klustered
* with the buffer already in the queue. If this is successful, the
* buffer in the queue is modified to 'own' the list of pages
* for the new buffer, and its b_bcount field is adjusted to
* reflect the new size of the data area being managed for I/O.
*
* The klustsort() routine returns the value 1 if the buffer that
* it had been passed was klustered, else 0 (in which case the
* buffer has just been sorted into the activity queue). The
* primary buffer's b_flags field has B_KLUSTER set in it to
* note that this is the primary buffer of a kluster of buffers.
*
* 4) When I/O completes for a buffer marked B_KLUSTER, the driver
* calls the function klustdone() (instead of iodone()). klustdone()
* breaks apart the list of pages from the primary buffer and restores
* them to their original 'owners', restores the b_bcount field for
* the primary kluster buffer, clears the B_KLUSTER flag in the
* primary buffer, and calls iodone() for all buffers that were part
* of this kluster. If the primary buffer had either a residual count
* set in b_resid, or the flag B_ERROR was set, all buffers that were
* part of the kluster have B_ERROR set, and b_resid set equal to their
* b_bcount field. klustdone() returns the integer number of buffers
* that had all been klustered together.
*
* Optionally, if a driver wishes to retry failed I/O operations on each
* buffer from a kluster singly (in order to isolate the actual error
* more precisely), the function klustbust() is provided. The driver
* passes the primary buffer to klustbust(), which performs the same
* restoration of the pages to their rightful owners and of the
* b_bcount field of the primary buffer. It leaves the buffers
* linked together as a forward linked list (through the av_forw field)
* starting from the primary buffer. The driver can then do as it
* pleases with this chain.
*/
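/*
 * A minimal sketch of the driver-side protocol described above. The
 * xxstrategy()/xxintr() routines, the xxunit structure, and the
 * XX_KLUSTMAX limit are hypothetical stand-ins, not part of this file:
 *
 *	xxstrategy(bp)
 *		struct buf *bp;
 *	{
 *		struct xxunit *un = ...;
 *		bp->b_resid = dkblock(bp);	(sort key: logical block)
 *		s = splbio();
 *		(void) klustsort(&un->un_dp, bp, XX_KLUSTMAX);
 *		if (!un->un_busy)
 *			xxstart(un);
 *		(void) splx(s);
 *	}
 *
 *	xxintr(un)	(completion of the request at un->un_dp.b_actf)
 *	{
 *		struct buf *bp = un->un_dp.b_actf;
 *		un->un_dp.b_actf = bp->av_forw;
 *		if (hard_error && (bp->b_flags & B_KLUSTER))
 *			klustbust(bp);		(then retry each buf singly)
 *		else
 *			(void) klustdone(bp);	(iodone() for whole kluster)
 *	}
 */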
struct kluster {
struct kluster *klust_next; /* next in a list of kluster structs */
struct buf *klust_head; /* head of list of indirect bufs */
struct buf *klust_tail; /* tail of list of indirect bufs */
struct buf *klust_prime; /* primary kluster buffer */
int klust_pcount; /* primary buf's original b_bcount */
};
static int kluston = 1;
static int klust_buf_flag_chk = B_REMAPPED|B_PAGEIO|B_READ;
static int nklusters;
#define KLUSTMAXPSIZE 128
static struct kluster *klustfree, *klustbusy;
int klustsort(), klustdone();
void klustbust();
static int klustadd();
disksort(dp, bp)
struct diskhd *dp;
struct buf *bp;
{
(void) klustsort(dp, bp, 0);
}
/*
* Perform traditional sorting into a disk activity queue.
*
* If desired, instead of sorting a buffer into the queue,
* see if it can instead have its I/O operation joined up
* with the I/O operation of a buffer already in the queue.
*/
int
klustsort(dp, bp, maxbcount)
struct diskhd *dp;
register struct buf *bp;
int maxbcount;
{
register struct buf *ap;
/*
* If nothing on the activity queue, then
* we become the only thing.
*/
ap = dp->b_actf;
if (ap == NULL) {
dp->b_actf = bp;
dp->b_actl = bp;
bp->av_forw = NULL;
return (0);
}
/*
* Check to see whether the requested buffer is eligible
* to become a candidate for klustering.
*/
/*
* If we lie after the first (currently active)
* request, then we must locate the second request list
* and add ourselves to it.
*/
if (bp->b_cylin < ap->b_cylin) {
while (ap->av_forw) {
/*
* Check for an ``inversion'' in the
* normally ascending cylinder numbers,
* indicating the start of the second request list.
*/
if (ap->av_forw->b_cylin < ap->b_cylin) {
/*
* Search the second request list
* for the first request at a larger
* cylinder number. We go before that;
* if there is no such request, we go at end.
*/
do {
if (bp->b_cylin < ap->av_forw->b_cylin)
goto insert;
ap = ap->av_forw;
} while (ap->av_forw);
goto insert; /* after last */
}
ap = ap->av_forw;
}
/*
* No inversions... we will go after the last, and
* be the first request in the second request list.
*/
goto insert;
}
/*
* Request is at/after the current request...
* sort in the first request list.
*/
while (ap->av_forw) {
/*
* We want to go after the current request
* if there is an inversion after it (i.e. it is
* the end of the first request list), or if
* the next request is a larger cylinder than our request.
*/
if (ap->av_forw->b_cylin < ap->b_cylin ||
bp->b_cylin < ap->av_forw->b_cylin)
goto insert;
ap = ap->av_forw;
}
/*
* Neither a second list nor a larger
* request... we go at the end of the first list,
* which is the same as the end of the whole shebang.
*/
insert:
/*
* See if we can kluster bp with ap
*
* Note that this will probably not kluster
* with any device that sorts by anything other
* than logical block number. Historically, the
* b_cylin field has been used to sort to
* the granularity of the cylinder number. However,
* in order to take advantage of putting together
* this one-way elevator sorting and checking for
* the opportunity to kluster up requests at the
* same time, we had to make some simplifying
* assumptions here. Therefore, if somebody
* calls klustsort() directly, it is assumed
* that if they have gone to the effort of
* stating that they wish to be eligible for
* kluster checking (by setting the maxbcount
* argument to nonzero), then they must use
* a sort token in b_resid (b_cylin) that
* matches the dkblock(bp) value.
*/
if (kluston && maxbcount != 0 && ap != dp->b_actf &&
(ap->b_dev == bp->b_dev) &&
((bp->b_flags & (klust_buf_flag_chk|B_KLUSTER)) == B_PAGEIO) &&
((ap->b_flags & klust_buf_flag_chk) == B_PAGEIO) &&
(ap->b_un.b_addr == (caddr_t) 0) &&
(bp->b_un.b_addr == (caddr_t) 0) &&
(((ap->b_bcount | bp->b_bcount) & PAGEOFFSET) == 0) &&
(ap->b_blkno + btodb(ap->b_bcount) == bp->b_blkno) &&
(ap->b_bcount + bp->b_bcount < maxbcount)) {
if (klustadd(ap, bp) != 0) {
return (1);
}
}
bp->av_forw = ap->av_forw;
ap->av_forw = bp;
if (ap == dp->b_actl)
dp->b_actl = bp;
return (0);
}
/*
* Add a new buffer to the passed kluster buf (if possible).
* If this is a brand new kluster being started, find a kluster
* structure and save the original starting buffer's b_bcount
* tag in it (for later restoration upon i/o completion).
* If we cannot find a free kluster structure, allocate another
* one, but don't sweat it if there isn't any memory available.
* Also limit ourselves to the very generous overall limit of
* 128 kluster structures.
*
* Returns 1 if it was able to kluster, else 0.
* Called only by klustsort().
*
*/
static int
klustadd(bp, nbp)
register struct buf *bp, *nbp;
{
register int s;
register struct page *ppl, *nppl;
register struct kluster *kp;
s = splvm();
if ((bp->b_flags & B_KLUSTER) == 0) {
if ((kp = klustfree) == NULL) {
if (nklusters >= KLUSTMAXPSIZE) {
(void) splx(s);
return (0);
}
klustfree = (struct kluster *)
new_kmem_zalloc(sizeof (*kp), KMEM_NOSLEEP);
if ((kp = klustfree) == NULL) {
(void) splx(s);
return (0);
}
nklusters++;
}
klustfree = kp->klust_next;
kp->klust_next = klustbusy;
kp->klust_head = nbp;
kp->klust_prime = bp;
kp->klust_pcount = bp->b_bcount;
klustbusy = kp;
bp->b_flags |= B_KLUSTER;
} else {
for (kp = klustbusy; kp != NULL; kp = kp->klust_next) {
if (kp->klust_prime == bp) {
break;
}
}
if (kp == NULL) {
(void) splx(s);
/*
* This should be a panic....
*/
return (0);
}
kp->klust_tail->av_forw = nbp;
}
kp->klust_tail = nbp;
nbp->av_forw = 0;
bp->b_bcount += nbp->b_bcount;
ppl = bp->b_pages->p_prev;
nppl = nbp->b_pages->p_prev;
nppl->p_next = bp->b_pages;
bp->b_pages->p_prev = nppl;
ppl->p_next = nbp->b_pages;
nbp->b_pages->p_prev = ppl;
(void) splx(s);
/*
* The av_back field of the buffer we are adding to the kluster
* chain saves the original last page pointer for the previous buffer.
*/
nbp->av_back = (struct buf *) ppl;
return (1);
}
/*
*
* Bust apart a klustered set of buffers and
* decommission the active kluster structure.
*
* Upon return from this function the argument
* buffer passed will be the head of a forward
* linked list of buffers that are the real
* buffers that constituted the kluster.
* The linkage is through the av_forw tag.
*/
void
klustbust(bp)
register struct buf *bp;
{
register struct page *pp;
struct page *first_pp_prev;
register struct kluster *kp, *kpr;
register int s;
if ((bp->b_flags & B_KLUSTER) == 0) {
bp->av_forw = (struct buf *) NULL;
return;
}
kpr = (struct kluster *) NULL;
s = splvm();
kp = klustbusy;
while (kp != (struct kluster *) NULL) {
if (kp->klust_prime == bp)
break;
kpr = kp;
kp = kp->klust_next;
}
if (kp == NULL) {
(void) splx(s);
bp->b_flags &= ~B_KLUSTER;
bp->av_forw = (struct buf *) NULL;
/*
* This should be a logged warning..
*/
return;
}
/*
* Restore the original buffer's b_bcount field
* and point forward link at the chain of saved
* buffers that made up the rest of the kluster
*/
bp->b_bcount = kp->klust_pcount;
bp->av_forw = kp->klust_head;
/*
* Put the kluster structure back on the free list
*/
if (kpr) {
kpr->klust_next = kp->klust_next;
} else {
klustbusy = kp->klust_next;
}
kp->klust_next = klustfree;
klustfree = kp;
(void) splx(s);
bp->b_flags &= ~B_KLUSTER;
/*
* If the action of doing I/O caused the buffer to
* be mapped in, map it back out again.
*
* We don't need to worry about it having been
* mapped in before hand because if it had been
* it wouldn't have been eligible for klustering
* to begin with.
*/
bp_mapout(bp);
/*
* Walk the chain and bust out the pages and restore them to
* their original owners. The p_prev page for any given buffer
* (except the last one in the chain) had been saved in the
* *next* buffer's av_back field. For the last buffer in the
* chain, the restoration pointer is the original
* bp->b_pages->p_prev of the first buffer.
*/
first_pp_prev = bp->b_pages->p_prev;
while (bp) {
if (bp->av_forw) {
pp = (struct page *) bp->av_forw->av_back;
} else {
pp = first_pp_prev;
}
pp->p_next = bp->b_pages;
bp->b_pages->p_prev = pp;
bp = bp->av_forw;
}
}
/*
* Break apart a kluster into its original set
* of buffers and call iodone(). Return an integer
* count of the number of buffers passed to iodone().
*/
int
klustdone(bp)
register struct buf *bp;
{
register struct buf *nbp;
register i, err;
/*
* If this doesn't appear to be a kluster buf, call
* iodone() anyhow for the buffer and return a
* count of 1 to say that one buf was passed to
* iodone().
*/
if ((bp->b_flags & B_KLUSTER) == 0) {
iodone(bp);
return (1);
}
/*
* Bust out the kluster chain and
* 'finish' off the chain of bufs
* that klustbust sets up.
*
* It is considered an error if a
* kluster operation finishes with
* a non-zero residual. In any
* case, if an error condition is
* set upon the kluster buf, it
* is propagated to all buffers.
* Further, we do not count that
* any i/o was done, period.
*/
err = ((bp->b_flags & B_ERROR) || bp->b_resid);
klustbust(bp);
i = 0;
while (bp) {
nbp = bp->av_forw;
if (err) {
bp->b_flags |= B_ERROR;
bp->b_resid = bp->b_bcount;
} else {
bp->b_resid = 0;
}
bp->av_forw = bp->av_back = 0;
iodone(bp);
bp = nbp;
i++;
}
return (i);
}
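The one-way elevator insertion above is easiest to see in isolation. Below is a minimal user-space sketch of the same two-queue insertion logic, using simplified stand-in structures in place of the kernel's struct buf and struct diskhd (not code from this commit):

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-ins for the kernel's struct buf and struct diskhd. */
struct tbuf {
	int cylin;		/* sort key (b_cylin / b_resid) */
	struct tbuf *forw;	/* forward link (av_forw) */
};
struct tqueue {
	struct tbuf *actf;	/* head: the active request */
	struct tbuf *actl;	/* tail */
};

/* One-way elevator insertion, following klustsort() above
 * (with the klustering checks omitted). */
static void
toy_disksort(struct tqueue *dp, struct tbuf *bp)
{
	struct tbuf *ap = dp->actf;

	if (ap == NULL) {		/* empty queue: become only entry */
		dp->actf = dp->actl = bp;
		bp->forw = NULL;
		return;
	}
	if (bp->cylin < ap->cylin) {	/* already passed: second queue */
		while (ap->forw) {
			/* an inversion marks the second queue's start */
			if (ap->forw->cylin < ap->cylin) {
				do {
					if (bp->cylin < ap->forw->cylin)
						goto insert;
					ap = ap->forw;
				} while (ap->forw);
				goto insert;
			}
			ap = ap->forw;
		}
		goto insert;	/* becomes first entry of second queue */
	}
	while (ap->forw) {		/* sort into the first queue */
		if (ap->forw->cylin < ap->cylin ||
		    bp->cylin < ap->forw->cylin)
			goto insert;
		ap = ap->forw;
	}
insert:
	bp->forw = ap->forw;
	ap->forw = bp;
	if (ap == dp->actl)
		dp->actl = bp;
}

int
main(void)
{
	struct tqueue q = { NULL, NULL };
	int keys[] = { 50, 70, 20, 60, 10 };	/* head starts at 50 */
	struct tbuf *bp;
	int i;

	for (i = 0; i < 5; i++) {
		bp = malloc(sizeof (*bp));
		bp->cylin = keys[i];
		toy_disksort(&q, bp);
	}
	/* One ascending sweep, then retract: prints "50 60 70 10 20" */
	for (bp = q.actf; bp; bp = bp->forw)
		printf("%d ", bp->cylin);
	printf("\n");
	return (0);
}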

69
sys/specfs/fifo.h Normal file

@@ -0,0 +1,69 @@
/* @(#)fifo.h 1.1 92/07/30 SMI; */
#ifndef _specfs_fifo_h
#define _specfs_fifo_h
#ifdef KERNEL
/*
* Configuration Parameters
*
* These parameters are tuned by editing the system configuration file.
* The following lines establish the default values.
*/
#ifndef FIFOCNT
#define FIFOCNT 10 /* number of simultaneously open fifos */
#endif
/*
* The following parameters are assumed not to require tuning.
*/
#define FIFOBUF 4096 /* max # bytes stored in a fifo */
#define FIFOMAX ~0 /* largest size of a single write to a fifo */
#define FIFOBSZ 4096 /* number of data bytes in each fifo data buffer */
#define FIFOMNB (FIFOBUF*(FIFOCNT+1)) /* # bytes allowed for all fifos */
/*
* NOTE: When FIFOBUF == FIFOBSZ, a single buffer is used for each fifo.
* Multiple, linked buffers will be used if FIFOBUF > FIFOBSZ.
* In this case, FIFOBUF should be a multiple of FIFOBSZ and,
* in order to minimize unnecessary fragmentation, FIFOBSZ should
* probably be set to a power of 2 minus 4 (e.g., 4092).
*
* Note, also, that this decision is made at compile-time so
* the run-time modification of fifoinfo parameters is dangerous.
*/
#if (FIFOBUF > FIFOBSZ)
struct fifo_bufhdr {
struct fifo_bufhdr *fb_next; /* ptr to next buffer */
char fb_data[1];
};
#define FIFO_BUFHDR_SIZE (sizeof (struct fifo_bufhdr *))
#define FIFO_BUFFER_SIZE (fifoinfo.fifobsz + FIFO_BUFHDR_SIZE)
#else /*(FIFOBUF == FIFOBSZ)*/
struct fifo_bufhdr {
union {
struct fifo_bufhdr *fu_next;
char fu_data[1]; /* must be at first byte in buffer */
} fb_u;
};
#define fb_next fb_u.fu_next
#define fb_data fb_u.fu_data
#define FIFO_BUFFER_SIZE (fifoinfo.fifobsz)
#endif
/*
* Fifo information structure.
*/
struct fifoinfo {
int fifobuf, /* max # bytes stored in a fifo */
fifomax, /* largest size of a single write to a fifo */
fifobsz, /* # of data bytes in each fifo data buffer */
fifomnb; /* max # bytes reserved for all fifos */
};
int fifo_alloc; /* total number of bytes reserved for fifos */
struct fifoinfo fifoinfo; /* fifo parameters */
#endif KERNEL
#endif /*!_specfs_fifo_h*/
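With the defaults above the derived limits work out as follows; the short sketch below (ordinary user-space C, assuming the default values, not part of the header) also shows the offset arithmetic the fifo read/write code uses to find a byte inside a chain of FIFOBSZ-sized buffers:

#include <stdio.h>

#define FIFOCNT	10			/* defaults from fifo.h */
#define FIFOBUF	4096
#define FIFOBSZ	4096
#define FIFOMNB	(FIFOBUF*(FIFOCNT+1))

int
main(void)
{
	/* Default configuration: one 4096-byte buffer per fifo and a
	 * system-wide cap of eleven buffers' worth of fifo data. */
	printf("per-fifo max:    %d bytes\n", FIFOBUF);	/* 4096 */
	printf("system-wide cap: %d bytes\n", FIFOMNB);	/* 45056 */

	/* In a FIFOBUF > FIFOBSZ configuration the data lives in a chain
	 * of buffers; an offset is resolved by walking (offset / FIFOBSZ)
	 * fb_next links and indexing fb_data at (offset % FIFOBSZ).
	 * E.g. with FIFOBSZ = 4092 (a power of 2 minus 4): */
	int wptr = 9000, bsz = 4092;
	printf("offset %d -> buffer #%d, byte %d\n",
	    wptr, wptr / bsz, wptr % bsz);	/* buffer #2, byte 816 */
	return (0);
}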

744
sys/specfs/fifo_vnodeops.c Normal file

@@ -0,0 +1,744 @@
/* @(#)fifo_vnodeops.c 1.1 92/07/30 SMI */
/*
* Copyright (c) 1987 by Sun Microsystems, Inc.
*/
/*
* System V-compatible FIFO implementation.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/file.h>
#include <sys/errno.h>
#include <sys/signal.h>
#include <sys/unistd.h>
#include <specfs/fifo.h>
#include <specfs/snode.h>
#include <specfs/fifonode.h>
#include <krpc/lockmgr.h>
#define SANITY /* do sanity checks */
static struct fifo_bufhdr *fifo_bufalloc();
static struct fifo_bufhdr *fifo_buffree();
static int fifo_open();
static int fifo_close();
static int fifo_rdwr();
static int fifo_select();
static int fifo_getattr();
static int fifo_inactive();
static int fifo_invalop();
static int fifo_cmp();
static int fifo_badop();
static int fifo_cntl();
extern int spec_setattr();
extern int spec_access();
extern int spec_link();
extern int spec_lockctl();
extern int spec_fsync();
extern int spec_fid();
extern int spec_realvp();
struct vnodeops fifo_vnodeops = {
fifo_open,
fifo_close,
fifo_rdwr,
fifo_badop, /* ioctl */
fifo_select,
fifo_getattr,
spec_setattr,
spec_access,
fifo_invalop, /* lookup */
fifo_invalop, /* create */
fifo_invalop, /* remove */
spec_link,
fifo_invalop, /* rename */
fifo_invalop, /* mkdir */
fifo_invalop, /* rmdir */
fifo_invalop, /* readdir */
fifo_invalop, /* symlink */
fifo_invalop, /* readlink */
spec_fsync,
fifo_inactive,
spec_lockctl,
spec_fid,
fifo_badop, /* getpage */
fifo_badop, /* putpage */
fifo_invalop, /* map */
fifo_invalop, /* dump */
fifo_cmp,
spec_realvp,
fifo_cntl,
};
/*
* open a fifo -- sleep until there is at least one reader and one writer
*/
/*ARGSUSED*/
static int
fifo_open(vpp, flag, cred)
struct vnode **vpp;
int flag;
struct ucred *cred;
{
register struct fifonode *fp;
/*
* Setjmp in case open is interrupted.
* If it is, close and return error.
*/
if (setjmp(&u.u_qsave)) {
(void) fifo_close(*vpp, flag & FMASK, 1, cred);
return (EINTR);
}
fp = VTOF(*vpp);
if (flag & FREAD) {
if (fp->fn_rcnt++ == 0)
/* if any writers waiting, wake them up */
wakeup((caddr_t) &fp->fn_rcnt);
}
if (flag & FWRITE) {
if ((flag & (FNDELAY|FNONBIO|FNBIO)) && (fp->fn_rcnt == 0))
return (ENXIO);
if (fp->fn_wcnt++ == 0)
/* if any readers waiting, wake them up */
wakeup((caddr_t) &fp->fn_wcnt);
}
if (flag & FREAD) {
while (fp->fn_wcnt == 0) {
/* if no delay, or data in fifo, open is complete */
if ((flag & (FNDELAY|FNONBIO|FNBIO)) || fp->fn_size)
return (0);
(void) sleep((caddr_t) &fp->fn_wcnt, PPIPE);
}
}
if (flag & FWRITE) {
while (fp->fn_rcnt == 0)
(void) sleep((caddr_t) &fp->fn_rcnt, PPIPE);
}
return (0);
}
/*
* close a fifo
* On final close, all buffered data goes away
*/
/*ARGSUSED*/
static int
fifo_close(vp, flag, count, cred)
struct vnode *vp;
int flag;
int count;
struct ucred *cred;
{
register struct fifonode *fp;
register struct fifo_bufhdr *bp;
if (count > 1)
return (0);
fp = VTOF(vp);
if (flag & FREAD) {
if (--fp->fn_rcnt == 0) {
if (fp->fn_flag & FIFO_WBLK) {
fp->fn_flag &= ~FIFO_WBLK;
wakeup((caddr_t) &fp->fn_wcnt);
}
/* wake up any sleeping exception select()s */
if (fp->fn_xsel) {
curpri = PPIPE;
selwakeup(fp->fn_xsel, fp->fn_flag&FIFO_XCOLL);
fp->fn_flag &= ~FIFO_XCOLL;
fp->fn_xsel = (struct proc *)0;
}
}
}
if (flag & FWRITE) {
if ((--fp->fn_wcnt == 0) && (fp->fn_flag & FIFO_RBLK)) {
fp->fn_flag &= ~FIFO_RBLK;
wakeup((caddr_t) &fp->fn_rcnt);
}
}
if ((fp->fn_rcnt == 0) && (fp->fn_wcnt == 0)) {
/* free all buffers associated with this fifo */
bp = fp->fn_buf;
while (bp != NULL)
bp = fifo_buffree(bp, fp);
/* update times only if there were bytes flushed from fifo */
if (fp->fn_size != 0)
FIFOMARK(fp, SUPD|SCHG);
fp->fn_buf = (struct fifo_bufhdr *) NULL;
fp->fn_rptr = 0;
fp->fn_wptr = 0;
fp->fn_size = 0;
}
return (0);
}
/*
* read/write a fifo
*/
/*ARGSUSED*/
static int
fifo_rdwr(vp, uiop, rw, ioflag, cred)
struct vnode *vp;
struct uio *uiop;
enum uio_rw rw;
int ioflag;
struct ucred *cred;
{
register struct fifonode *fp;
register struct fifo_bufhdr *bp;
register u_int count;
register int off;
register unsigned i;
register int rval = 0;
int ocnt = uiop->uio_resid; /* save original request size */
#ifdef SANITY
if (uiop->uio_offset != 0)
printf("fifo_rdwr: non-zero offset: %d\n", uiop->uio_offset);
#endif SANITY
fp = VTOF(vp);
FIFOLOCK(fp);
if (rw == UIO_WRITE) { /* UIO_WRITE */
/*
* fifoinfo.fifobuf: max number of bytes buffered per open pipe
* fifoinfo.fifomax: max size of single write to a pipe
*
* If the count is less than fifoinfo.fifobuf, it must occur
* atomically. If it does not currently fit in the
* kernel pipe buffer, either: sleep, if no form of no-delay
* mode is on; return -1 and EAGAIN, if POSIX-style no-delay
* mode is on (FNONBIO set); return -1 and EWOULDBLOCK, if
* 4.2-style no-delay mode is on (FNDELAY set); return 0, if
* S5-style no-delay mode is on (FNBIO set).
*
* If the count is greater than fifoinfo.fifobuf, it will be
* non-atomic (FNDELAY, FNONBIO, and FNBIO clear). If FNDELAY,
* FNONBIO, or FNBIO is set, write as much as will fit into the
* kernel pipe buffer and return the number of bytes written.
*
* If the count is greater than fifoinfo.fifomax, return EINVAL.
*/
if ((unsigned)uiop->uio_resid > fifoinfo.fifomax) {
rval = EINVAL;
goto rdwrdone;
}
while (count = uiop->uio_resid) {
if (fp->fn_rcnt == 0) {
/* no readers anymore! */
psignal(u.u_procp, SIGPIPE);
rval = EPIPE;
goto rdwrdone;
}
if ((count + fp->fn_size) > fifoinfo.fifobuf) {
if (uiop->uio_fmode & (FNDELAY|FNBIO|FNONBIO)) {
/*
* Non-blocking I/O.
*/
if (count <= fifoinfo.fifobuf) {
/*
* Write will be satisfied
* atomically, later.
* If data was moved, return OK.
* Else:
* If POSIX-style non-blocking
* I/O, return -1 and EAGAIN,
* if 4.2-style non-blocking
* I/O, return -1 and
* EWOULDBLOCK, otherwise
* return 0.
*/
if (ocnt != uiop->uio_resid)
goto rdwrdone;
if (uiop->uio_fmode & FNDELAY)
rval = EWOULDBLOCK;
if (uiop->uio_fmode & FNONBIO)
rval = EAGAIN;
goto rdwrdone;
} else if (fp->fn_size >=
fifoinfo.fifobuf) {
/*
* Write will never be atomic.
* At this point, it cannot
* even be partial. However,
* some portion of the write
* may already have succeeded.
* If so, uio_resid reflects
* this.
*/
if ((uiop->uio_fmode&FNONBIO) &&
(ocnt == uiop->uio_resid))
rval = EAGAIN;
if ((uiop->uio_fmode&FNDELAY) &&
(ocnt == uiop->uio_resid))
rval = EWOULDBLOCK;
goto rdwrdone;
}
} else {
/*
* Blocking I/O.
*/
if ((count <= fifoinfo.fifobuf) ||
(fp->fn_size >= fifoinfo.fifobuf)) {
/*
* Sleep until there is room for this request.
* On wakeup, go back to the top of the loop.
*/
fp->fn_flag |= FIFO_WBLK;
FIFOUNLOCK(fp);
(void) sleep((caddr_t)
&fp->fn_wcnt, PPIPE);
FIFOLOCK(fp);
goto wrloop;
}
}
/* at this point, can do a partial write */
count = fifoinfo.fifobuf - fp->fn_size;
}
/*
* Can write 'count' bytes to pipe now. Make sure
* there is enough space in the allocated buffer list.
* If not, try to allocate more.
* If allocation does not succeed immediately, go back
* to the top of the loop to make sure everything is
* still cool.
*/
#ifdef SANITY
if ((fp->fn_wptr - fp->fn_rptr) != fp->fn_size)
printf(
"fifo_write: ptr mismatch...size:%d wptr:%d rptr:%d\n",
fp->fn_size, fp->fn_wptr, fp->fn_rptr);
if (fp->fn_rptr > fifoinfo.fifobsz)
printf("fifo_write: rptr too big...rptr:%d\n",
fp->fn_rptr);
if (fp->fn_wptr > (fp->fn_nbuf * fifoinfo.fifobsz))
printf(
"fifo_write: wptr too big...wptr:%d nbuf:%d\n",
fp->fn_wptr, fp->fn_nbuf);
#endif SANITY
while (((fp->fn_nbuf * fifoinfo.fifobsz) - fp->fn_wptr)
< count) {
if ((bp = fifo_bufalloc(fp)) == NULL) {
goto wrloop; /* fifonode unlocked */
}
/* new buffer...tack it on the end of the list */
bp->fb_next = (struct fifo_bufhdr *) NULL;
if (fp->fn_buf == (struct fifo_bufhdr *) NULL) {
fp->fn_buf = bp;
} else {
fp->fn_bufend->fb_next = bp;
}
fp->fn_bufend = bp;
}
/*
* There is now enough space to write 'count' bytes.
* Find append point and copy new data.
*/
bp = fp->fn_buf;
for (off = fp->fn_wptr; off >= fifoinfo.fifobsz;
off -= fifoinfo.fifobsz)
bp = bp->fb_next;
while (count) {
i = fifoinfo.fifobsz - off;
i = MIN(count, i);
if (rval =
uiomove(&bp->fb_data[off], (int) i,
UIO_WRITE, uiop)){
/* error during copy from user space */
/* NOTE:LEAVE ALLOCATED BUFS FOR NOW */
goto rdwrdone;
}
fp->fn_size += i;
fp->fn_wptr += i;
count -= i;
off = 0;
bp = bp->fb_next;
}
FIFOMARK(fp, SUPD|SCHG); /* update mod times */
/* wake up any sleeping readers */
if (fp->fn_flag & FIFO_RBLK) {
fp->fn_flag &= ~FIFO_RBLK;
curpri = PPIPE;
wakeup((caddr_t) &fp->fn_rcnt);
}
/* wake up any sleeping read selectors */
if (fp->fn_rsel) {
curpri = PPIPE;
selwakeup(fp->fn_rsel, fp->fn_flag&FIFO_RCOLL);
fp->fn_flag &= ~FIFO_RCOLL;
fp->fn_rsel = (struct proc *)0;
}
wrloop: /* bottom of write 'while' loop */
continue;
}
} else { /* UIO_READ */
/*
* Handle zero-length reads specially here
*/
if ((count = uiop->uio_resid) == 0) {
goto rdwrdone;
}
while ((i = fp->fn_size) == 0) {
if (fp->fn_wcnt == 0) {
/* no data in pipe and no writers...(EOF) */
goto rdwrdone;
}
/*
* No data in pipe, but writer is there;
* if POSIX-style no-delay, return EAGAIN,
* if 4.2-style no-delay, return EWOULDBLOCK,
* if S5-style, return 0.
*/
if (uiop->uio_fmode & FNONBIO) {
rval = EAGAIN;
goto rdwrdone;
}
if (uiop->uio_fmode & FNDELAY) {
rval = EWOULDBLOCK;
goto rdwrdone;
}
if (uiop->uio_fmode & FNBIO)
goto rdwrdone;
fp->fn_flag |= FIFO_RBLK;
FIFOUNLOCK(fp);
(void) sleep((caddr_t) &fp->fn_rcnt, PPIPE);
FIFOLOCK(fp);
/* loop to make sure there is still a writer */
}
#ifdef SANITY
if ((fp->fn_wptr - fp->fn_rptr) != fp->fn_size)
printf(
"fifo_read: ptr mismatch...size:%d wptr:%d rptr:%d\n",
fp->fn_size, fp->fn_wptr, fp->fn_rptr);
if (fp->fn_rptr > fifoinfo.fifobsz)
printf("fifo_read: rptr too big...rptr:%d\n",
fp->fn_rptr);
if (fp->fn_wptr > (fp->fn_nbuf * fifoinfo.fifobsz))
printf("fifo_read: wptr too big...wptr:%d nbuf:%d\n",
fp->fn_wptr, fp->fn_nbuf);
#endif SANITY
/*
* Get offset into first buffer at which to start getting data.
* Truncate read, if necessary, to amount of data available.
*/
off = fp->fn_rptr;
bp = fp->fn_buf;
count = MIN(count, i); /* smaller of pipe size and read size */
while (count) {
i = fifoinfo.fifobsz - off;
i = MIN(count, i);
if (rval =
uiomove(&bp->fb_data[off], (int)i, UIO_READ, uiop)){
goto rdwrdone;
}
fp->fn_size -= i;
fp->fn_rptr += i;
count -= i;
off = 0;
#ifdef SANITY
if (fp->fn_rptr > fifoinfo.fifobsz)
printf(
"fifo_read: rptr after uiomove too big...rptr:%d\n",
fp->fn_rptr);
#endif SANITY
if (fp->fn_rptr == fifoinfo.fifobsz) {
fp->fn_rptr = 0;
bp = fifo_buffree(bp, fp);
fp->fn_buf = bp;
fp->fn_wptr -= fifoinfo.fifobsz;
}
/*
* At this point, if fp->fn_size is zero, there may be
* an allocated, but unused, buffer. [In this case,
* fp->fn_rptr == fp->fn_wptr != 0.]
* NOTE: FOR NOW, LEAVE THIS EXTRA BUFFER ALLOCATED.
* NOTE: fifo_buffree() CAN'T HANDLE A BUFFER NOT 1ST.
*/
}
FIFOMARK(fp, SACC); /* update the access times */
/* wake up any sleeping writers */
if (fp->fn_flag & FIFO_WBLK) {
fp->fn_flag &= ~FIFO_WBLK;
curpri = PPIPE;
wakeup((caddr_t) &fp->fn_wcnt);
}
/* wake up any sleeping write selectors */
if (fp->fn_wsel) {
curpri = PPIPE;
selwakeup(fp->fn_wsel, fp->fn_flag&FIFO_WCOLL);
fp->fn_flag &= ~FIFO_WCOLL;
fp->fn_wsel = (struct proc *)0;
}
} /* end of UIO_READ code */
rdwrdone:
FIFOUNLOCK(fp);
uiop->uio_offset = 0; /* guarantee that f_offset stays 0 */
return (rval);
}
static int
fifo_getattr(vp, vap, cred)
struct vnode *vp;
struct vattr *vap;
struct ucred *cred;
{
register int error;
register struct snode *sp;
sp = VTOS(vp);
error = VOP_GETATTR(sp->s_realvp, vap, cred);
if (!error) {
/* set current times from snode, even if older than vnode */
vap->va_atime = sp->s_atime;
vap->va_mtime = sp->s_mtime;
vap->va_ctime = sp->s_ctime;
/* size should reflect the number of unread bytes in pipe */
vap->va_size = (VTOF(vp))->fn_size;
vap->va_blocksize = fifoinfo.fifobuf;
}
return (error);
}
/*
* test for fifo selections
*/
/*ARGSUSED*/
static int
fifo_select(vp, flag, cred)
struct vnode *vp;
int flag;
struct ucred *cred;
{
register struct fifonode *fp;
fp = VTOF(vp);
switch (flag) {
case FREAD:
if (fp->fn_size != 0) /* anything to read? */
return (1);
if (fp->fn_rsel && fp->fn_rsel->p_wchan == (caddr_t)&selwait)
fp->fn_flag |= FIFO_RCOLL;
else
fp->fn_rsel = u.u_procp;
break;
case FWRITE:
/* is there room to write? (and are there any readers?) */
if ((fp->fn_size < fifoinfo.fifobuf) && (fp->fn_rcnt > 0))
return (1);
if (fp->fn_wsel && fp->fn_wsel->p_wchan == (caddr_t)&selwait)
fp->fn_flag |= FIFO_WCOLL;
else
fp->fn_wsel = u.u_procp;
break;
case 0:
if (fp->fn_rcnt == 0) /* no readers anymore? */
return (1); /* exceptional condition */
if (fp->fn_xsel && fp->fn_xsel->p_wchan == (caddr_t)&selwait)
fp->fn_flag |= FIFO_XCOLL;
else
fp->fn_xsel = u.u_procp;
break;
}
return (0);
}
static int
fifo_inactive(vp, cred)
struct vnode *vp;
struct ucred *cred;
{
register struct snode *sp;
sp = VTOS(vp);
/* must sunsave() first to prevent a race when spec_fsync() sleeps */
sunsave(sp);
(void) spec_fsync(vp, cred);
/* now free the realvp (no longer done by sunsave()) */
if (sp->s_realvp) {
VN_RELE(sp->s_realvp);
sp->s_realvp = NULL;
}
kmem_free((caddr_t)VTOF(vp), (u_int)sizeof (struct fifonode));
return (0);
}
static int
fifo_cmp(vp1, vp2)
struct vnode *vp1, *vp2;
{
return (vp1 == vp2);
}
static int
fifo_invalop()
{
return (EINVAL);
}
static int
fifo_badop()
{
panic("fifo_badop");
}
/*
* allocate a buffer for a fifo
* return NULL if it had to sleep
*/
static struct fifo_bufhdr *
fifo_bufalloc(fp)
register struct fifonode *fp;
{
register struct fifo_bufhdr *bp;
if (fifo_alloc >= fifoinfo.fifomnb) {
/*
* Impose a system-wide maximum on buffered data in pipes.
* NOTE: This could lead to deadlock!
*/
FIFOUNLOCK(fp);
(void) sleep((caddr_t) &fifo_alloc, PPIPE);
FIFOLOCK(fp);
return ((struct fifo_bufhdr *)NULL);
}
/* the call to kmem_alloc() might sleep, so leave fifonode locked */
fifo_alloc += FIFO_BUFFER_SIZE;
bp = (struct fifo_bufhdr *)
new_kmem_alloc((u_int)FIFO_BUFFER_SIZE, KMEM_SLEEP);
fp->fn_nbuf++;
return ((struct fifo_bufhdr *) bp);
}
/*
* deallocate a fifo buffer
*/
static struct fifo_bufhdr *
fifo_buffree(bp, fp)
struct fifo_bufhdr *bp;
struct fifonode *fp;
{
register struct fifo_bufhdr *nbp;
fp->fn_nbuf--;
/*
* NOTE: THE FOLLOWING ONLY WORKS IF THE FREED BUFFER WAS THE 1ST ONE.
*/
if (fp->fn_bufend == bp) {
fp->fn_bufend = (struct fifo_bufhdr *) NULL;
nbp = (struct fifo_bufhdr *) NULL;
} else
nbp = bp->fb_next;
kmem_free((caddr_t)bp, (u_int)FIFO_BUFFER_SIZE);
if (fifo_alloc >= fifoinfo.fifomnb) {
curpri = PPIPE;
wakeup((caddr_t) &fifo_alloc);
}
fifo_alloc -= FIFO_BUFFER_SIZE;
return (nbp);
}
/*
* construct a fifonode that can masquerade as an snode
*/
struct snode *
fifosp(vp)
struct vnode *vp;
{
register struct fifonode *fp;
struct vattr va;
fp = (struct fifonode *)new_kmem_zalloc(sizeof (*fp), KMEM_SLEEP);
FTOV(fp)->v_op = &fifo_vnodeops;
/* init the times in the snode to those in the vnode */
(void) VOP_GETATTR(vp, &va, u.u_cred);
FTOS(fp)->s_atime = va.va_atime;
FTOS(fp)->s_mtime = va.va_mtime;
FTOS(fp)->s_ctime = va.va_ctime;
return (FTOS(fp));
}
/*
* perform fifo specific control operations
*/
static int
fifo_cntl(vp, cmd, idata, odata, iflg, oflg)
struct vnode *vp;
int cmd, iflg, oflg;
caddr_t idata, odata;
{
struct vnode *realvp;
int error;
switch (cmd) {
case _PC_PIPE_BUF:
*(int *)odata = fifoinfo.fifobuf;
break;
/*
* ask the supporting fs for everything else
*/
default:
if (error = VOP_REALVP(vp, &realvp))
return (error);
return (VOP_CNTL(realvp, cmd, idata, odata, iflg, oflg));
}
return (0);
}
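The open and no-delay semantics implemented above survive in modern POSIX FIFOs. A small user-space demonstration (assuming a POSIX system; not code from this commit):

#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/stat.h>

int
main(void)
{
	const char *path = "/tmp/demo_fifo";
	int fd;

	if (mkfifo(path, 0600) == -1 && errno != EEXIST) {
		perror("mkfifo");
		return (1);
	}
	/* Non-blocking open for write with no reader attached fails with
	 * ENXIO, matching the ENXIO return in fifo_open() above when a
	 * no-delay flag is set and fn_rcnt is zero. */
	fd = open(path, O_WRONLY | O_NONBLOCK);
	if (fd == -1)
		printf("write-open, no reader: %s\n", strerror(errno));
	else
		(void) close(fd);

	/* Non-blocking open for read succeeds; with no writer attached a
	 * read returns 0, matching the "no data in pipe and no
	 * writers...(EOF)" branch of fifo_rdwr().  With a writer attached
	 * but no data it would instead fail with EAGAIN (the FNONBIO
	 * branch). */
	fd = open(path, O_RDONLY | O_NONBLOCK);
	if (fd != -1) {
		char c;
		ssize_t n = read(fd, &c, 1);
		if (n == 0)
			printf("read, no writer: EOF\n");
		else if (n == -1)
			printf("read: %s\n", strerror(errno));
		(void) close(fd);
	}
	(void) unlink(path);
	return (0);
}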

48
sys/specfs/fifonode.h Normal file

@@ -0,0 +1,48 @@
/* @(#)fifonode.h 1.1 92/07/30 SMI; */
#ifndef _specfs_fifonode_h
#define _specfs_fifonode_h
#ifdef KERNEL
/* fifonodes start with an snode so that 'spec_xxx()' routines work */
struct fifonode {
struct snode fn_snode; /* must be first */
struct fifo_bufhdr *fn_buf; /* ptr to first buffer */
struct fifo_bufhdr *fn_bufend; /* ptr to last buffer */
struct proc *fn_rsel; /* ptr to read selector */
struct proc *fn_wsel; /* ptr to write selector */
struct proc *fn_xsel; /* ptr to exception selector */
u_long fn_size; /* number of bytes in fifo */
short fn_wcnt; /* number of open writers */
short fn_rcnt; /* number of open readers */
short fn_wptr; /* write offset */
short fn_rptr; /* read offset */
short fn_flag; /* (see below) */
short fn_nbuf; /* number of buffers allocated */
};
#define fn_vnode fn_snode.s_vnode
/* bits in fn_flag in fifonode */
#define FIFO_RBLK 0x0001 /* blocked readers */
#define FIFO_WBLK 0x0002 /* blocked writers */
#define FIFO_RCOLL 0x0004 /* more than one read selector */
#define FIFO_WCOLL 0x0008 /* more than one write selector */
#define FIFO_XCOLL 0x0010 /* more than one exception selector */
/*
* Convert between fifonode, snode, and vnode pointers
*/
#define VTOF(VP) ((struct fifonode *)(VP)->v_data)
#define FTOV(FP) (&(FP)->fn_vnode)
#define FTOS(FP) (&(FP)->fn_snode)
/* define fifonode handling routines */
#define FIFOMARK(fp, x) smark(FTOS(fp), x)
#define FIFOLOCK(fp) SNLOCK(FTOS(fp))
#define FIFOUNLOCK(fp) SNUNLOCK(FTOS(fp))
#endif KERNEL
#endif /*!_specfs_fifonode_h*/
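The layout trick that lets fifo code hand a fifonode to generic spec_xxx() routines is ordinary C: a pointer to a structure also points at its first member. A standalone sketch with simplified stand-in types (not the real snode/fifonode):

#include <stdio.h>

/* Simplified stand-ins for struct snode / struct fifonode. */
struct toy_snode {
	int s_flag;
};
struct toy_fifonode {
	struct toy_snode fn_snode;	/* must be first */
	int fn_size;
};

/* The FTOS() macro above is exactly this kind of conversion. */
#define TOY_FTOS(fp)	(&(fp)->fn_snode)

int
main(void)
{
	struct toy_fifonode f = { { 0 }, 42 };
	struct toy_snode *sp = TOY_FTOS(&f);

	/* A routine written for snodes can now operate on the fifonode's
	 * embedded snode in place. */
	sp->s_flag |= 0x01;
	printf("flag=0x%x size=%d\n", f.fn_snode.s_flag, f.fn_size);
	return (0);
}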

123
sys/specfs/snode.h Normal file

@@ -0,0 +1,123 @@
/* @(#)snode.h 1.1 92/07/30 SMI */
#ifndef _specfs_snode_h
#define _specfs_snode_h
/*
* Copyright (c) 1987 by Sun Microsystems, Inc.
*/
/*
* The SNODE represents a special file in any filesystem. There is
* one snode for each active special file. Filesystems which support
* special files use specvp(vp, dev) to convert a normal vnode to a
* special vnode in the ops create, mkdir, and lookup.
*
* To handle having multiple snodes which represent the same
* underlying block device vnode without cache aliasing problems,
* the s_bdevvp is used to point to the "common" vnode used for
* caching data. If an snode is created internally by the kernel,
* then the s_realvp field is NULL and s_bdevvp points to s_vnode.
* The other snodes which are created as a result of a lookup of a
* device in a file system have s_realvp pointing to the vp which
* represents the device in the file system while the s_bdevvp points
* into the "common" vnode for the block device in another snode.
*/
struct snode {
struct snode *s_next; /* must be first */
struct vnode s_vnode; /* vnode associated with this snode */
struct vnode *s_realvp; /* vnode for the fs entry (if any) */
struct vnode *s_bdevvp; /* blk device vnode (for caching) */
u_short s_flag; /* flags, see below */
dev_t s_dev; /* device the snode represents */
daddr_t s_nextr; /* next byte read offset (read-ahead) */
daddr_t s_size; /* block device size in bytes */
struct timeval s_atime; /* time of last access */
struct timeval s_mtime; /* time of last modification */
struct timeval s_ctime; /* time of last attributes change */
int s_count; /* count of opened references */
long s_owner; /* index of process locking snode */
long s_lckcnt; /* number of processes locking snode */
};
/* flags */
#define SLOCKED 0x01 /* snode is locked */
#define SUPD 0x02 /* update device modification time */
#define SACC 0x04 /* update device access time */
#define SCLOSING 0x08 /* device is being closed */
#define SWANT 0x10 /* some process waiting on lock */
#define SCHG 0x40 /* update device change time */
/*
* Convert between vnode and snode
*/
#define VTOS(vp) ((struct snode *)((vp)->v_data))
#define STOV(sp) (&(sp)->s_vnode)
#ifdef KERNEL
/*
* Lock and unlock snodes.
*/
#define SNLOCK(sp) { \
while (((sp)->s_flag & SLOCKED) && \
(sp)->s_owner != uniqpid()) { \
(sp)->s_flag |= SWANT; \
(void) sleep((caddr_t)(sp), PINOD); \
} \
(sp)->s_owner = uniqpid(); \
(sp)->s_lckcnt++; \
(sp)->s_flag |= SLOCKED; \
masterprocp->p_swlocks++; \
}
#define SNUNLOCK(sp) { \
if (--(sp)->s_lckcnt < 0) \
panic("SNUNLOCK"); \
masterprocp->p_swlocks--; \
if ((sp)->s_lckcnt == 0) { \
(sp)->s_flag &= ~SLOCKED; \
if ((sp)->s_flag & SWANT) { \
(sp)->s_flag &= ~SWANT; \
wakeup((caddr_t)(sp)); \
} \
} \
}
/*
* Construct a spec vnode for a given device that shadows a particular
* "real" vnode.
*/
extern struct vnode *specvp();
/*
* Construct a spec vnode for a given device that shadows nothing.
*/
extern struct vnode *makespecvp();
/*
* Find any other spec vnode that refers to the same device as another vnode.
*/
extern struct vnode *other_specvp();
/*
* Find and hold the spec vnode that refers to the given device.
*/
extern struct vnode *slookup();
/*
* Snode lookup stuff.
* These routines maintain a table of snodes hashed by dev so
* that the snode for a dev can be found if it already exists.
* NOTE: STABLESIZE must be a power of 2 for STABLEHASH to work!
*/
#define STABLESIZE 16
#define STABLEHASH(dev) ((major(dev) + minor(dev)) & (STABLESIZE - 1))
extern struct snode *stable[];
extern struct vnodeops spec_vnodeops;
#endif KERNEL
#endif /*!_specfs_snode_h*/
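STABLEHASH relies on STABLESIZE being a power of two, so the & mask is equivalent to a modulo without the division. A quick user-space sketch (major and minor passed as plain ints rather than extracted from a dev_t):

#include <stdio.h>

#define STABLESIZE	16	/* must be a power of 2 */
#define STABLEHASH(maj, min)	(((maj) + (min)) & (STABLESIZE - 1))

int
main(void)
{
	int devs[][2] = { { 7, 0 }, { 7, 1 }, { 32, 2 }, { 13, 19 } };
	int i;

	for (i = 0; i < 4; i++)
		printf("major %2d minor %2d -> bucket %2d\n",
		    devs[i][0], devs[i][1],
		    STABLEHASH(devs[i][0], devs[i][1]));
	/* (7+0)&15 = 7, (7+1)&15 = 8, (32+2)&15 = 2, (13+19)&15 = 0 */
	return (0);
}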

35
sys/specfs/spec_clone.c Normal file

@@ -0,0 +1,35 @@
#ifndef lint
static char sccsid[] = "@(#)spec_clone.c 1.1 92/07/30 Copyr 1986 Sun Micro";
#endif lint
/*
* Copyright (c) 1986 by Sun Microsystems, Inc.
*/
/*
* Clone device driver. Forces a clone open of some other
* character device. Since its purpose in life is to force
* some other device to clone itself, there's no need for
* anything other than the open routine here.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
/*
* Do a clone open. The (major number of the) device to be cloned
* is specified by minor(dev). We tell spec_open to do the work
* by returning EEXIST after naming the device to clone.
*/
/* ARGSUSED */
cloneopen(dev, flag, newdevp)
dev_t dev;
int flag;
dev_t *newdevp;
{
/* Convert to the device to be cloned. */
*newdevp = makedev(minor(dev), 0);
return (EEXIST);
}
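The conversion in cloneopen() is just device-number arithmetic: the minor of the clone device names the major of the device to clone. A standalone sketch using a toy 8-bit major/minor encoding (the encoding and the clone major number 37 below are illustrative assumptions, not the kernel's real macros):

#include <stdio.h>

/* Toy 8-bit encoding for illustration only. */
#define toy_major(d)	(((d) >> 8) & 0xff)
#define toy_minor(d)	((d) & 0xff)
#define toy_makedev(x, y)	(((x) << 8) | (y))

int
main(void)
{
	/* Opening clone device (37, 11): cloneopen() names device
	 * (11, 0) as the one to clone and returns EEXIST so that
	 * spec_open() retries the open against the new device. */
	int clonedev = toy_makedev(37, 11);
	int newdev = toy_makedev(toy_minor(clonedev), 0);

	printf("clone dev (%d,%d) -> target dev (%d,%d)\n",
	    toy_major(clonedev), toy_minor(clonedev),
	    toy_major(newdev), toy_minor(newdev));
	return (0);
}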

356
sys/specfs/spec_subr.c Normal file

@@ -0,0 +1,356 @@
#ident "@(#)spec_subr.c 1.1 92/07/30 SMI"
/*LINTLIBRARY*/
/*
* Copyright (c) 1987 by Sun Microsystems, Inc.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/trace.h>
#include <specfs/snode.h>
/*
* Find an appropriate snode.
*/
static struct snode *sfind();
/*
* Returns a special vnode for the given dev. The vnode is the
* one which is "common" to all the snodes which represent the
* same block device.
*/
struct vnode *
bdevvp(dev)
dev_t dev;
{
return (specvp((struct vnode *)NULL, dev, VBLK));
}
/*
* Return a shadow special vnode for the given dev.
* If no snode exists for this dev create one and put it
* in a table hashed by dev, realvp. If the snode for
* this dev is already in the table return it (ref count is
* incremented by sfind). The snode will be flushed from the
* table when spec_inactive calls sunsave.
*
* vp will be NULL if this is a block device that is
* to be shared via the s_bdevvp field in the snodes.
*/
struct vnode *
specvp(vp, dev, type)
struct vnode *vp;
dev_t dev;
enum vtype type;
{
register struct snode *sp;
extern struct snode *fifosp();
struct vattr va;
if ((sp = sfind(dev, vp, type)) == NULL) {
if (vp && vp->v_type == VFIFO) {
sp = fifosp(vp);
} else {
sp = (struct snode *)
new_kmem_zalloc(sizeof (*sp), KMEM_SLEEP);
STOV(sp)->v_op = &spec_vnodeops;
/* init the times in the snode to those in the vnode */
if (vp && VOP_GETATTR(vp, &va, u.u_cred) == 0) {
sp->s_atime = va.va_atime;
sp->s_mtime = va.va_mtime;
sp->s_ctime = va.va_ctime;
}
}
sp->s_realvp = vp;
sp->s_dev = dev;
trace3(TR_MP_SNODE, STOV(sp), dev, 0);
STOV(sp)->v_rdev = dev;
STOV(sp)->v_count = 1;
STOV(sp)->v_data = (caddr_t)sp;
if (vp != (struct vnode *)NULL) {
VN_HOLD(vp);
STOV(sp)->v_type = vp->v_type;
STOV(sp)->v_vfsp = vp->v_vfsp;
if (vp->v_type == VBLK) {
sp->s_bdevvp = bdevvp(dev);
sp->s_size = VTOS(sp->s_bdevvp)->s_size;
}
} else {
/* must be a `real' block device */
int (*size)();
long rsize;
STOV(sp)->v_type = VBLK;
STOV(sp)->v_vfsp = NULL;
sp->s_bdevvp = STOV(sp);
if ((major(dev) < nblkdev) &&
(size = bdevsw[major(dev)].d_psize)) {
rsize = (*size)(dev);
if (rsize == -1) /* did size fail? */
sp->s_size = 0;
else
sp->s_size = dbtob(rsize);
} else {
sp->s_size = 0;
}
}
ssave(sp);
}
return (STOV(sp));
}
/*
* Return a special vnode for the given dev; no vnode is supplied
* for it to shadow.
* If no snode exists for this dev (with a NULL realvp), create one
* and put it in a table hashed by dev, NULL. If the snode for
* this dev is already in the table return it (ref count is
* incremented by sfind). The snode will be flushed from the
* table when spec_inactive calls sunsave.
*/
struct vnode *
makespecvp(dev, type)
dev_t dev;
enum vtype type;
{
register struct snode *sp;
struct timeval ut;
if ((sp = sfind(dev, (struct vnode *)NULL, type)) == NULL) {
sp = (struct snode *)
new_kmem_zalloc((u_int)sizeof (*sp), KMEM_SLEEP);
STOV(sp)->v_op = &spec_vnodeops;
STOV(sp)->v_type = type;
STOV(sp)->v_rdev = dev;
STOV(sp)->v_count = 1;
STOV(sp)->v_data = (caddr_t)sp;
STOV(sp)->v_vfsp = NULL;
if (type == VBLK) {
sp->s_bdevvp = bdevvp(dev);
/* XXX - verify */
/* Possibly a VN_HOLD here */
sp->s_size = VTOS(sp->s_bdevvp)->s_size;
}
sp->s_realvp = NULL;
sp->s_dev = dev;
uniqtime(&ut);
sp->s_atime = ut;
sp->s_mtime = ut;
sp->s_ctime = ut;
trace3(TR_MP_SNODE, STOV(sp), dev, 1);
ssave(sp);
}
return (STOV(sp));
}
/*
* Snode lookup stuff.
* These routines maintain a table of snodes hashed by dev so
* that the snode for a dev can be found if it already exists.
*/
struct snode *stable[STABLESIZE];
/*
* Put a snode in the table
*/
static
ssave(sp)
struct snode *sp;
{
sp->s_next = stable[STABLEHASH(sp->s_dev)];
stable[STABLEHASH(sp->s_dev)] = sp;
}
/*
* Remove a snode from the hash table.
* The realvp is not released here because spec_inactive() still
* needs it to do a spec_fsync().
*/
sunsave(sp)
struct snode *sp;
{
struct snode *st;
struct snode *stprev = NULL;
st = stable[STABLEHASH(sp->s_dev)];
while (st != NULL) {
if (st == sp) {
if (stprev == NULL) {
stable[STABLEHASH(sp->s_dev)] = st->s_next;
} else {
stprev->s_next = st->s_next;
}
break;
}
stprev = st;
st = st->s_next;
}
}
/*
* Check to see how many open references there are in the snode table for
* a given device of a given type; if there are any, return 1, otherwise
* return 0.
*/
int
stillopen(dev, type)
register dev_t dev;
register enum vtype type;
{
register struct snode *st;
register int count;
count = 0;
for (st = stable[STABLEHASH(dev)]; st != NULL; st = st->s_next) {
if (st->s_dev == dev && STOV(st)->v_type == type)
count += st->s_count;
}
return (count != 0);
}
/*
* Check to see how many references there are in the snode table for
* a given device of a given type; if there are any, return 1, otherwise
* return 0.
*/
int
stillref(dev, type)
register dev_t dev;
register enum vtype type;
{
register struct snode *st;
register int count;
count = 0;
for (st = stable[STABLEHASH(dev)]; st != NULL; st = st->s_next) {
if (st->s_dev == dev && STOV(st)->v_type == type)
count += STOV(st)->v_count;
}
return (count != 0);
}
/*
* Check to see whether a given device of a given type is currently being
* closed; if so, return 1, otherwise return 0.
*/
int
isclosing(dev, type)
register dev_t dev;
register enum vtype type;
{
register struct snode *st;
for (st = stable[STABLEHASH(dev)]; st != NULL; st = st->s_next) {
if (st->s_dev == dev && STOV(st)->v_type == type &&
(st->s_flag & SCLOSING))
return (1);
}
return (0);
}
/*
* Check to see if there is an snode in the table referring to a given device
* other than the one the vnode provided is associated with. If so, return
* it.
*/
struct vnode *
other_specvp(vp)
register struct vnode *vp;
{
struct snode *sp;
register dev_t dev;
register struct snode *st;
register struct vnode *nvp;
sp = VTOS(vp);
dev = sp->s_dev;
st = stable[STABLEHASH(dev)];
while (st != NULL) {
if (st->s_dev == dev && (nvp = STOV(st)) != vp &&
nvp->v_type == vp->v_type)
return (nvp);
st = st->s_next;
}
return (NULL);
}
/*
* Lookup a snode by type and dev; return a pointer to the vnode in that snode.
*/
struct vnode *
slookup(type, dev)
enum vtype type;
dev_t dev;
{
register struct snode *st;
register struct vnode *nvp;
st = stable[STABLEHASH(dev)];
while (st != NULL) {
if (st->s_dev == dev) {
nvp = STOV(st);
if (nvp->v_type == type) {
VN_HOLD(nvp);
return (nvp);
}
}
st = st->s_next;
}
return (NULL);
}
/*
* Lookup a snode by <dev, vp, type>
*/
static struct snode *
sfind(dev, vp, type)
dev_t dev;
struct vnode *vp;
enum vtype type;
{
register struct snode *st;
st = stable[STABLEHASH(dev)];
while (st != NULL) {
if ((st->s_dev == dev) && STOV(st)->v_type == type &&
((st->s_realvp && vp && VN_CMP(st->s_realvp, vp)) ||
(st->s_realvp == NULL && vp == NULL))) {
VN_HOLD(STOV(st));
return (st);
}
st = st->s_next;
}
return (NULL);
}
/*
* Mark the accessed, updated, or changed times in an snode
* with the current (unique) time
*/
smark(sp, flag)
register struct snode *sp;
register int flag;
{
struct timeval ut;
uniqtime(&ut);
sp->s_flag |= flag;
if (flag & SACC)
sp->s_atime = ut;
if (flag & SUPD)
sp->s_mtime = ut;
if (flag & SCHG) {
sp->s_ctime = ut;
}
}
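ssave() and sunsave() above are a textbook singly-linked hash chain: push at the bucket head, unlink with a trailing pointer. A compact user-space sketch of the same pattern, with plain ints standing in for snodes:

#include <stdio.h>
#include <stddef.h>

struct node {
	int dev;
	struct node *next;
};

#define TBLSZ	16
#define HASH(dev)	((dev) & (TBLSZ - 1))

static struct node *table[TBLSZ];

static void
save(struct node *np)		/* cf. ssave(): push at bucket head */
{
	np->next = table[HASH(np->dev)];
	table[HASH(np->dev)] = np;
}

static void
unsave(struct node *np)		/* cf. sunsave(): unlink from bucket */
{
	struct node *st = table[HASH(np->dev)], *prev = NULL;

	while (st != NULL) {
		if (st == np) {
			if (prev == NULL)
				table[HASH(np->dev)] = st->next;
			else
				prev->next = st->next;
			break;
		}
		prev = st;
		st = st->next;
	}
}

int
main(void)
{
	struct node a = { 3, NULL }, b = { 19, NULL };	/* both bucket 3 */
	struct node *st;

	save(&a);
	save(&b);		/* bucket 3: b -> a */
	unsave(&b);		/* head-of-chain removal case */
	for (st = table[3]; st != NULL; st = st->next)
		printf("dev %d\n", st->dev);	/* prints: dev 3 */
	return (0);
}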

155
sys/specfs/spec_vfsops.c Normal file

@@ -0,0 +1,155 @@
/* @(#)spec_vfsops.c 1.1 92/07/30 SMI */
/*
* Copyright (c) 1988 by Sun Microsystems, Inc.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/bootconf.h>
#include <specfs/snode.h>
#include <sys/reboot.h>
#include <vm/swap.h>
#include <sun/dklabel.h>
static int spec_sync();
static int spec_badop();
static int spec_root();
static int spec_mountroot();
static int spec_swapvp();
struct vfsops spec_vfsops = {
spec_badop, /* mount */
spec_badop, /* unmount */
spec_root,
spec_badop, /* statfs */
spec_sync,
spec_badop, /* vget */
spec_mountroot,
spec_swapvp,
};
static int
spec_badop()
{
panic("spec_badop");
}
/*
* Run though all the snodes and force write back
* of all dirty pages on the block devices.
*/
/*ARGSUSED*/
static int
spec_sync(vfsp)
struct vfs *vfsp;
{
static int spec_lock;
register struct snode **spp, *sp;
register struct vnode *vp;
if (spec_lock)
return (0);
spec_lock++;
for (spp = stable; spp < &stable[STABLESIZE]; spp++) {
for (sp = *spp; sp != (struct snode *)NULL; sp = sp->s_next) {
vp = STOV(sp);
/*
* Don't bother sync'ing a vp if it
* is part of virtual swap device.
*/
if (IS_SWAPVP(vp))
continue;
if (vp->v_type == VBLK && vp->v_pages)
(void) VOP_PUTPAGE(vp, 0, 0, B_ASYNC,
(struct ucred *)0);
}
}
spec_lock = 0;
return (0);
}
/*ARGSUSED*/
static int
spec_root(vfsp, vpp, name)
struct vfs *vfsp;
struct vnode **vpp;
char *name;
{
return (EINVAL);
}
/*ARGSUSED*/
static int
spec_mountroot(vfsp, vpp, name)
struct vfs *vfsp;
struct vnode **vpp;
char *name;
{
return (EINVAL);
}
/*ARGSUSED*/
static int
spec_swapvp(vfsp, vpp, name)
struct vfs *vfsp;
struct vnode **vpp;
char *name;
{
extern char *strcpy();
extern dev_t getblockdev();
extern struct vnodeops spec_vnodeops;
char *cp;
dev_t dev;
if ((*name == '\0') || (boothowto & RB_ASKNAME)){
/*
* No swap name specified, use root dev partition "b"
* if it is a block device, otherwise fail.
* XXX - should look through device list or something here
* if root is not local.
*/
if (rootvp->v_op == &spec_vnodeops &&
(boothowto & RB_ASKNAME) == 0) {
dev = makedev(major(rootvp->v_rdev),
(minor(rootvp->v_rdev) & ~(NDKMAP - 1)) | 1);
/*
* kernel strcpy (unlike libc strcpy) returns a
* pointer to the null byte of the destination string.
*/
cp = strcpy(name, rootfs.bo_name);
*(cp - 1) = 'b'; /* change last char to 'b' */
} else {
retry:
if (!(dev = getblockdev("swap", name))) {
return (ENODEV);
}
/*
* Check for swap on root device
*/
if (rootvp->v_op == &spec_vnodeops &&
dev == rootvp->v_rdev) {
char resp[128];
printf("Swapping on root device, ok? ");
gets(resp);
if (*resp != 'y' && *resp != 'Y') {
goto retry;
}
}
}
} else if (!(dev = getblockdev("swap", name))) {
return (ENODEV);
}
*vpp = bdevvp(dev);
return (0);
}
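The partition-"b" computation in spec_swapvp() clears the low partition bits of the root device's minor number and ors in 1. A worked example as a standalone sketch, assuming the SunOS convention of NDKMAP = 8 partitions ('a'..'h') per drive:

#include <stdio.h>

#define NDKMAP	8	/* assumed: partitions per drive, 'a'..'h' */

int
main(void)
{
	/* Root on unit 3, partition 'a': minor = 3*NDKMAP + 0 = 24. */
	int rootminor = 3 * NDKMAP + 0;
	/* spec_swapvp(): same unit, partition 'b' (index 1). */
	int swapminor = (rootminor & ~(NDKMAP - 1)) | 1;

	printf("root minor %d ('%c') -> swap minor %d ('%c')\n",
	    rootminor, 'a' + (rootminor & (NDKMAP - 1)),
	    swapminor, 'a' + (swapminor & (NDKMAP - 1)));
	return (0);
}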

1532
sys/specfs/spec_vnodeops.c Normal file

File diff suppressed because it is too large