#ident	"@(#)bdev_dsort.c 1.1 92/07/30 SMI"	/* from UCB 4.3 81/03/09 */

/*
 * Seek sort for disks.  We depend on the driver
 * which calls us using b_resid as the current cylinder number.
 *
 * The argument dp structure holds a b_actf activity chain pointer
 * on which we keep two queues, sorted in ascending cylinder order.
 * The first queue holds those requests which are positioned after
 * the current cylinder (in the first request); the second holds
 * requests which came in after their cylinder number was passed.
 * Thus we implement a one way scan, retracting after reaching the
 * end of the drive to the first request on the second queue,
 * at which time it becomes the first queue.
 *
 * A one-way scan is natural because of the way UNIX read-ahead
 * blocks are allocated.
 *
 * This implementation also allows certain page-oriented operations
 * to 'kluster' up into a single request.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <vm/page.h>
#include <sys/kmem_alloc.h>

#define	b_cylin	b_resid

/*
 * A kluster structure manages pools of buffers which are klustered together
 * with a single primary buffer. The single primary buffer is an original
 * buffer that has been modified to perform I/O for all the buffers in the
 * pool.
 *
 * The traditional action of disksort() is to sort a buffer into a disk
 * activity queue, using the b_resid field as a sort key and using the
 * av_forw tag as the forward link field. The av_back field for each buffer
 * is available for the driver to use as it sees fit.
 *
 * Diagrammatically, this is how this looks:
 *
 * queue header:
 *	front -----------> buf --------> buf --------> buf ---> 0
 *	back  -----------------------------------------^
 *
 *
 * When a certain set of conditions are met (see below), instead of sorting
 * a new buffer into this queue, we instead try and modify a buffer that is
 * currently in the queue to do the I/O operation for the new buffer
 *
 *
 * In this case, we allocate a kluster structure and modify things such
 * that things look like this:
 *
 * queue header:
 *	front -----------> buf ------> buf --------> buf ---> 0
 *	back  ------------------------/ ^ /-----------^
 *					|
 *			klust struct	|
 *			 prime buf -----/
 *			 front ------------> buf
 *			 tail ----------------^
 *
 * The kluster structure also maintains a copy of the original b_bcount
 * field for the primary buffer. This is so that when this arrangement is
 * decommissioned (either by calling klustdone() or klustbust()), that
 * the original primary buffer structure ends up looking the same as
 * if it had never been operated on by the klustering code.
 *
 * The conditions for which klustering might apply are these:
 *
 *	1) The driver wishing to use klustering calls the klustsort()
 *	function instead of the disksort() function. The klustsort()
 *	function functions as disksort did, except that it takes a third
 *	argument which is the non-inclusive integer maximum upper limit
 *	(in bytes) that a kluster operation can be increased to. The
 *	old disksort() interface is maintained by haveing it turn right
 *	around and call klustsort() with this argument as zero.
 *
 *	2) The driver uses the b_resid sort key to sort by absolute
 *	logical block. Historically, the sort key has been just the
 *	drive cylinder number for the request. This allows a number
 *	of requests for a drive to be partially sorted with respect
 *	to the drive layout, and is more or less optimal for devices
 *	where the notion of cylinder is stil meaningful (SA-450, ST-506,
 *	ESDI, and SMD devices), but is not particularly meaningful for
 *	devices which are logically addressed (SCSI and IPI).
 *
 *	3) A number or condtions for both the buffer already in the
 *	queue and the new buffer to be sorted or klustered are met.
 *	These are a fairly limiting and restrictive set of conditions.
 *
 *		+ The buffer in the queue is not the head of the queue
 *		(i.e., isn't the 'active' request).
 *
 *		+ The the b_dev fields of both buffers are the same
 *
 *		+ The buffer being added has only B_PAGEIO set of the
 *		flags B_KLUSTER, B_REMAPPED, B_PAGEIO and B_READ.
 *
 *		+ The buffer already in the queue has only B_PAGEIO set
 *		of the flage B_REMAPPED, B_PAGEIO and B_READ.
 *
 *		+ The b_un.b_addr field of both buffers is zero.
 *
 *		+ The b_bcount field of both buffers is mmu page aligned.
 *
 *		+ The logical block number for the buffer already in the
 *		queue plus the quantity btodb() of its b_bcount field
 *		equal the b_blkno field of the buffer to be added
 *		(i.e., a logical contiguous set of disk blocks are
 *		maintained)
 *
 *		+ The the b_bcount field of the buffer in the queue plus
 *		b_bcount field of the buffer to be added does not equal
 *		or exceed the maximum as passed by the driver.
 *
 *	The intent of these conditions are to ensure that all buffers are
 *	pure page-oriented operations, are write operations only, are for
 *	logically contiguous areas of the device, and do not exceed some
 *	count limitation specified by the driver, before allowing a new
 *	buffer to be klustered with a buffer already in the queue rather
 *	than being sorted into the queue.
 *
 *	If these conditions are met, a routine is called which attempts
 *	to add the new buffer to a list of buffers that are klustered
 *	with the buffer already in the queue. If this is successful, the
 *	buffer in the queue is modified to 'own' the list of pages
 *	for the new buffer, and its b_bcount field is adjusted to
 *	reflect the new size of the data area being managed for I/O.
 *
 *	The klustsort() routine returns the value 1 if the buffer that
 *	it had been passed was klustered, else 0 (in which case the
 *	buffer has just been sorted into the activity queue). The
 *	primary buffer's b_flags field has B_KLUSTER set in it to
 *	note that this is the primary buffer of a kluster of buffers.
 *
 *	4) When I/O completes for a buffer marked B_KLUSTER, the driver
 *	calls the function klustdone() (instead of iodone()). klustdone()
 *	breaks apart the list of pages from the primary buffer and restores
 *	them to their original 'owners', restores the b_bcount field for
 *	the primary kluster buffer, clears the B_KLUSTER flag in the
 *	primary buffer, and calls iodone() for all buffers that were part
 *	of this kluster. If the primary buffer had either a residual count
 *	set in b_resid, or the flag B_ERROR was set, all buffers that were
 *	part of the kluster have B_ERROR set, and b_resid set equal to their
 *	b_bcount field. klustdone() returns the integer number of buffers
 *	that had all been klustered together.
 *
 *	Optionally, if a driver wishes to retry failed I/O operations on each
 *	buffer from a kluster singly (in order to isolate the actual error
 *	more precisely), the function klustbust() is provided. The driver
 *	passes the primary buffer to klustbust(), which performs the same
 *	restoration of pages to their rightful owners and the b_bcount field
 *	back to the primary driver. It leaves the buffers linked together
 *	as a forward linked list of buffers (through the av_forw) field
 *	starting from the primary buffer. The driver can then do as it
 *	pleases with this chain.
 */

struct kluster {
	struct kluster	*klust_next;	/* next in a list of kluster structs */
	struct buf	*klust_head;	/* head of list of indirect bufs */
	struct buf	*klust_tail;	/* tail of list of indirect bufs */
	struct buf	*klust_prime;	/* primary kluster buffer */
	int		klust_pcount;	/* primary buf's original b_bcount */
};

static int kluston = 1;
static int klust_buf_flag_chk = B_REMAPPED|B_PAGEIO|B_READ;
static int nklusters;
#define	KLUSTMAXPSIZE	128
static struct kluster *klustfree, *klustbusy;

int klustsort(), klustdone();
void klustbust();
static int klustadd();

disksort(dp, bp)
struct diskhd *dp;
struct buf *bp;
{
	(void) klustsort(dp, bp, 0);
}

/*
 * Perform traditional sorting into a disk activity queue.
 *
 * If desired, instead of sorting a buffer into the queue,
 * see if it can instead have its I/O operation joined up
 * with the I/O operation of a buffer already in the queue.
 */

int
klustsort(dp, bp, maxbcount)
struct diskhd *dp;
register struct buf *bp;
int maxbcount;
{
	register struct buf *ap;

	/*
	 * If nothing on the activity queue, then
	 * we become the only thing.
	 */
	ap = dp->b_actf;
	if (ap == NULL) {
		dp->b_actf = bp;
		dp->b_actl = bp;
		bp->av_forw = NULL;
		return (0);
	}

	/*
	 * Check to see whether the requested buffer is eligible
	 * to become a candidate for klustering.
	 */

	/*
	 * If we lie after the first (currently active)
	 * request, then we must locate the second request list
	 * and add ourselves to it.
	 */
	if (bp->b_cylin < ap->b_cylin) {
		while (ap->av_forw) {
			/*
			 * Check for an ``inversion'' in the
			 * normally ascending cylinder numbers,
			 * indicating the start of the second request list.
			 */
			if (ap->av_forw->b_cylin < ap->b_cylin) {
				/*
				 * Search the second request list
				 * for the first request at a larger
				 * cylinder number.  We go before that;
				 * if there is no such request, we go at end.
				 */
				do {
					if (bp->b_cylin < ap->av_forw->b_cylin)
						goto insert;
					ap = ap->av_forw;
				} while (ap->av_forw);
				goto insert;		/* after last */
			}
			ap = ap->av_forw;
		}
		/*
		 * No inversions... we will go after the last, and
		 * be the first request in the second request list.
		 */
		goto insert;
	}
	/*
	 * Request is at/after the current request...
	 * sort in the first request list.
	 */
	while (ap->av_forw) {
		/*
		 * We want to go after the current request
		 * if there is an inversion after it (i.e. it is
		 * the end of the first request list), or if
		 * the next request is a larger cylinder than our request.
		 */
		if (ap->av_forw->b_cylin < ap->b_cylin ||
		    bp->b_cylin < ap->av_forw->b_cylin)
			goto insert;
		ap = ap->av_forw;
	}
	/*
	 * Neither a second list nor a larger
	 * request... we go at the end of the first list,
	 * which is the same as the end of the whole schebang.
	 */
insert:
	/*
	 * See if we can kluster bp with ap
	 *
	 * Note that this will probably not kluster
	 * with any device that sorts by anything other
	 * than logical block number. Historically, the
	 * b_cylin field has been used to sort via to
	 * the granularity of cylinder number. However,
	 * in order to take advantage of putting together
	 * this one-way elevator sorting and checking for
	 * the opportunity to kluster up requests at the
	 * same time, we had to make some simplifying
	 * assumptions here. Therefore, if somebody
	 * calls klustsort() directly, it is assumed
	 * that if they have gone to the effort of
	 * stating that they wish to be eligible for
	 * kluster checking (by setting the maxbcount
	 * argument to nonzero), then they must use
	 * a sort token in b_resid (b_cylin) that
	 * matches the dkblock(bp) value.
	 */

	if (kluston && maxbcount != 0 && ap != dp->b_actf &&
	    (ap->b_dev == bp->b_dev) &&
	    ((bp->b_flags & (klust_buf_flag_chk|B_KLUSTER)) == B_PAGEIO) &&
	    ((ap->b_flags & klust_buf_flag_chk) == B_PAGEIO) &&
	    (ap->b_un.b_addr == (caddr_t) 0) &&
	    (bp->b_un.b_addr == (caddr_t) 0) &&
	    (((ap->b_bcount | bp->b_bcount) & PAGEOFFSET) == 0) &&
	    (ap->b_blkno + btodb(ap->b_bcount) == bp->b_blkno) &&
	    (ap->b_bcount + bp->b_bcount < maxbcount)) {
		if (klustadd(ap, bp) != 0) {
			return (1);
		}
	}

	bp->av_forw = ap->av_forw;
	ap->av_forw = bp;
	if (ap == dp->b_actl)
		dp->b_actl = bp;
	return (0);
}

/*
 * Add a new buffer to the passed kluster buf (if possible).
 * If this is a brand new kluster being started, find a kluster
 * structure and save the original starting buffer's b_bcount
 * tag in it (for later restoration upon i/o completion).
 * If we cannot find a free kluster structure, allocate another
 * one, but don't sweat it if there isn't any memory available.
 * Also limit ourselves to the very generous overall limit of
 * 128 kluster structures.
 *
 * Returns 1 if was able kluster, else 0.
 * Called only by klustsort().
 *
 */


static int
klustadd(bp, nbp)
register struct buf *bp, *nbp;
{
	register int s;
	register struct page *ppl, *nppl;
	register struct kluster *kp;

	s = splvm();
	if ((bp->b_flags & B_KLUSTER) == 0) {
		if ((kp = klustfree) == NULL) {
			if (nklusters >= KLUSTMAXPSIZE) {
				(void) splx(s);
				return (0);
			}
			klustfree = (struct kluster *)
			    new_kmem_zalloc(sizeof (*kp), KMEM_NOSLEEP);
			if ((kp = klustfree) == NULL) {
				(void) splx(s);
				return (0);
			}
			nklusters++;
		} else {
			klustfree = kp->klust_next;
		}

		klustfree = kp->klust_next;
		kp->klust_next = klustbusy;

		kp->klust_head = nbp;
		kp->klust_prime = bp;
		kp->klust_pcount = bp->b_bcount;

		klustbusy = kp;
		bp->b_flags |= B_KLUSTER;
	} else {
		for (kp = klustbusy; kp != NULL; kp = kp->klust_next) {
			if (kp->klust_prime == bp) {
				break;
			}
		}
		if (kp == NULL) {
			(void) splx(s);
			/*
			 * This should be a panic....
			 */
			return (0);
		}
		kp->klust_tail->av_forw = nbp;
	}

	kp->klust_tail = nbp;
	nbp->av_forw = 0;
	bp->b_bcount += nbp->b_bcount;

	ppl = bp->b_pages->p_prev;
	nppl = nbp->b_pages->p_prev;

	nppl->p_next = bp->b_pages;
	bp->b_pages->p_prev = nppl;
	ppl->p_next = nbp->b_pages;
	nbp->b_pages->p_prev = ppl;

	(void) splx(s);

	/*
	 * The av_back field of the buffer we are adding to the kluster
	 * chain saves the original last page pointer for the previous buffer.
	 */

	nbp->av_back = (struct buf *) ppl;

	return (1);
}

/*
 *
 * Bust apart a klustered set of buffers and
 * decommission the active kluster structure.
 *
 * Upon return from this function the argument
 * buffer passed will be the head of a forward
 * linked list of buffers that are the real
 * buffers that constituted the kluster.
 * The linkage is through the av_forw tag.
 */

void
klustbust(bp)
register struct buf *bp;
{
	register struct page *pp;
	struct page *first_pp_prev;
	register struct kluster *kp, *kpr;
	register int s;

	if ((bp->b_flags & B_KLUSTER) == 0) {
		bp->av_forw = (struct buf *) NULL;
		return;
	}

	kpr = (struct kluster *) NULL;
	s = splvm();

	kp = klustbusy;
	while (kp != (struct kluster *) NULL) {
		if (kp->klust_prime == bp)
			break;
		kpr = kp;
		kp = kp->klust_next;
	}

	if (kp == NULL) {
		(void) splx(s);
		bp->b_flags &= ~B_KLUSTER;
		bp->av_forw = (struct buf *) NULL;
		/*
		 * This should be a logged warning..
		 */
		return;
	}

	/*
	 * Restore original buffer's b_count field
	 * and point forward link at the chain of saved
	 * buffers that made up the rest of the kluster
	 */

	bp->b_bcount = kp->klust_pcount;
	bp->av_forw = kp->klust_head;

	/*
	 * Put the kluster structure back on the free list
	 */

	if (kpr) {
		kpr->klust_next = kp->klust_next;
	} else {
		klustbusy = kp->klust_next;
	}
	kp->klust_next = klustfree;
	klustfree = kp;
	(void) splx(s);

	bp->b_flags &= ~B_KLUSTER;

	/*
	 * If the action of doing I/O caused the buffer to
	 * be mapped in, map it back out again.
	 *
	 * We don't need to worry about it having been
	 * mapped in before hand because if it had been
	 * it wouldn't have been eligible for klustering
	 * to begin with.
	 */

	bp_mapout(bp);

	/*
	 * Walk the chain and bust out the pages and restore them to
	 * their original owners. The p_prev page for any given buffer
	 * (except the last one in the chain) had been saved in the
	 * *next* buffers' av_back field. The last buffer in the chain's
	 * bp->b_pages->p_prev is obviously the bp->b_pages->p_prev
	 * for the first buffer.
	 */

	first_pp_prev = bp->b_pages->p_prev;

	while (bp) {
		if (bp->av_forw) {
			pp = (struct page *) bp->av_forw->av_back;
		} else {
			pp = first_pp_prev;
		}
		pp->p_next = bp->b_pages;
		bp->b_pages->p_prev = pp;
		bp = bp->av_forw;
	}
}

/*
 * Break apart a kluster into its original set of
 * of buffers and call iodone. Return an integer
 * count of the number of buffers passed to iodone
 */

int
klustdone(bp)
register struct buf *bp;
{
	register struct buf *nbp;
	register i, err;

	/*
	 * If this doesn't appear to be a kluster buf, call
	 * iodone() anyhow for the buffer and return a
	 * count of 1 to say that one buf was passed to
	 * iodone().
	 */

	if ((bp->b_flags & B_KLUSTER) == 0) {
		iodone(bp);
		return (1);
	}

	/*
	 * Bust out the kluster chain and
	 * 'finish' off the chain of bufs
	 * that klustbust sets up.
	 *
	 * It is considered an error if a
	 * kluster operation finishes with
	 * a non-zero residual. In any
	 * case, if an error condition is
	 * set upon the kluster buf, it
	 * is propagated to all buffers.
	 * Further, we do not count that
	 * any i/o was done, period.
	 */

	err = ((bp->b_flags & B_ERROR) || bp->b_resid);

	klustbust(bp);

	i = 0;
	while (bp) {
		nbp = bp->av_forw;
		if (err) {
			bp->b_flags |= B_ERROR;
			bp->b_resid = bp->b_bcount;
		} else {
			bp->b_resid = 0;
		}
		bp->av_forw = bp->av_back = 0;
		iodone(bp);
		bp = nbp;
		i++;
	}
	return (i);
}