634 lines
16 KiB
C
634 lines
16 KiB
C
/* @(#)vm_swap.c 1.1 92/07/30 SMI */
|
|
|
|
#ident "$SunId: @(#)vm_swap.c 1.2 91/02/19 SMI [RMTC] $"
|
|
|
|
/*
|
|
* Copyright (c) 1988, 1989 by Sun Microsystems, Inc.
|
|
*/
|
|
|
|
/*
|
|
* Virtual swap device
|
|
*
|
|
* The virtual swap device consists of the logical concatenation of one
|
|
* or more physical swap areas. It provides a logical array of anon
|
|
* slots, each of which corresponds to a page of swap space.
|
|
*
|
|
* Each physical swap area has an associated anon array representing
|
|
* its physical storage. These anon arrays are logically concatenated
|
|
* sequentially to form the overall swap device anon array. Thus, the
|
|
* offset of a given entry within this logical array is computed as the
|
|
* sum of the sizes of each area preceding the entry plus the offset
|
|
* within the area containing the entry.
|
|
*
|
|
* The anon array entries for unused swap slots within an area are
|
|
* linked together into a free list. Allocation proceeds by finding a
|
|
* suitable area (attempting to balance use among all the areas) and
|
|
* then returning the first free entry within the area. Thus, there's
|
|
* no linear relation between offset within the swap device and the
|
|
* address (within its segment(s)) of the page that the slot backs;
|
|
* instead, it's an arbitrary one-to-one mapping.
|
|
*
|
|
* Associated with each swap area is a swapinfo structure. These
|
|
* structures are linked into a linear list that determines the
|
|
* ordering of swap areas in the logical swap device. Each contains a
|
|
* pointer to the corresponding anon array, the area's size, and its
|
|
* associated vnode.
|
|
*/
|
|
|
|
#include <sys/param.h>
|
|
#include <sys/systm.h>
|
|
#include <sys/user.h>
|
|
#include <sys/vfs.h>
|
|
#include <sys/vnode.h>
|
|
#include <sys/file.h>
|
|
#include <sys/uio.h>
|
|
#include <sys/conf.h>
|
|
#include <sys/bootconf.h>
|
|
#include <sys/trace.h>
|
|
|
|
#include <vm/hat.h>
|
|
#include <vm/anon.h>
|
|
#include <vm/page.h>
|
|
#include <vm/swap.h>
|
|
|
|
/* these includes are used for the "fake" swap support of /dev/drum */
|
|
#include <sun/mem.h>
|
|
#include <specfs/snode.h>
|
|
|
|
static struct swapinfo *silast;
|
|
struct swapinfo *swapinfo;
|
|
|
|
/*
|
|
* To balance the load among multiple swap areas, we don't allow
|
|
* more than swap_maxcontig allocations to be satisfied from a
|
|
* single swap area before moving on to the next swap area. This
|
|
* effectively "interleaves" allocations among the many swap areas.
|
|
*/
|
|
int swap_maxcontig = 1024 * 1024 / PAGESIZE; /* 1MB of pages */
|
|
|
|
extern int klustsize; /* from spec_vnodeops.c */
|
|
int swap_order = 1; /* see swap_alloc,free */
|
|
|
|
#define MINIROOTSIZE 14000 /* ~7 Meg */
|
|
|
|
/*
|
|
* Initialize a new swapinfo structure.
|
|
*/
|
|
static int
|
|
swapinfo_init(vp, npages, skip)
|
|
struct vnode *vp;
|
|
register u_int npages;
|
|
u_int skip;
|
|
{
|
|
register struct anon *ap, *ap2;
|
|
register struct swapinfo **sipp, *nsip;
|
|
|
|
for (sipp = &swapinfo; nsip = *sipp; sipp = &nsip->si_next)
|
|
if (nsip->si_vp == vp)
|
|
return (EBUSY); /* swap device already in use */
|
|
|
|
nsip = (struct swapinfo *)new_kmem_zalloc(
|
|
sizeof (struct swapinfo), KMEM_SLEEP);
|
|
nsip->si_vp = vp;
|
|
nsip->si_size = ptob(npages);
|
|
/*
|
|
* Don't indirect through NULL if called with npages < skip (too tacky)
|
|
*/
|
|
if (npages < skip)
|
|
npages = skip;
|
|
nsip->si_anon = (struct anon *)new_kmem_zalloc(
|
|
npages * sizeof (struct anon), KMEM_SLEEP);
|
|
nsip->si_eanon = &nsip->si_anon[npages - 1];
|
|
#ifdef RECORD_USAGE
|
|
/*
|
|
* Monitoring of swap space usage is enabled, so malloc
|
|
* a parallel array to hold the PID responsible for
|
|
* causing the anon page to be created.
|
|
*/
|
|
nsip->si_pid = (short *)
|
|
new_kmem_zalloc(npages * sizeof (short), KMEM_SLEEP);
|
|
#endif RECORD_USAGE
|
|
npages -= skip;
|
|
|
|
/*
|
|
* ap2 now points to the first usable slot in the swap area.
|
|
* Set up free list links so that the head of the list is at
|
|
* the front of the usable portion of the array.
|
|
*/
|
|
ap = nsip->si_eanon;
|
|
ap2 = nsip->si_anon + skip;
|
|
while (--ap >= ap2)
|
|
ap->un.an_next = ap + 1;
|
|
if (npages == 0) /* if size was <= skip */
|
|
nsip->si_free = NULL;
|
|
else
|
|
nsip->si_free = ap + 1;
|
|
anoninfo.ani_free += npages;
|
|
anoninfo.ani_max += npages;
|
|
|
|
*sipp = nsip;
|
|
if (silast == NULL) /* first swap device */
|
|
silast = nsip;
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* Initialize a swap vnode.
|
|
*/
|
|
int
|
|
swap_init(vp)
|
|
struct vnode *vp;
|
|
{
|
|
struct vattr vattr;
|
|
u_int skip;
|
|
int err;
|
|
|
|
err = VOP_GETATTR(vp, &vattr, u.u_cred); /* XXX - u.u_cred? */
|
|
if (err) {
|
|
printf("swap_init: getattr failed, errno %d\n", err);
|
|
return (err);
|
|
}
|
|
|
|
/*
|
|
* To prevent swap I/O requests from crossing the boundary
|
|
* between swap areas, we erect a "fence" between areas by
|
|
* not allowing the first page of each swap area to be used.
|
|
* (This also prevents us from scribbling on the disk label
|
|
* if the swap partition is the first partition on the disk.)
|
|
* This may not be strictly necessary, since swap_blksize also
|
|
* prevents requests from crossing the boundary.
|
|
*
|
|
* If swapping on the root filesystem, don't put swap blocks that
|
|
* correspond to the miniroot filesystem on the swap free list.
|
|
*/
|
|
if (rootvp == vp)
|
|
skip = btoc(roundup(dbtob(MINIROOTSIZE), klustsize));
|
|
else
|
|
skip = 1;
|
|
|
|
err = swapinfo_init(vp, (u_int)btop(vattr.va_size), skip);
|
|
|
|
if (!err)
|
|
vp->v_flag |= VISSWAP;
|
|
return (err);
|
|
}
|
|
|
|
/*
|
|
* This routine is used to fake npages worth of swap space.
|
|
* These pages will have no backing and cannot be paged out any where.
|
|
*/
|
|
swap_cons(npages)
|
|
u_int npages;
|
|
{
|
|
|
|
if (swapinfo_init((struct vnode *)NULL, npages, 0) != 0)
|
|
panic("swap_cons");
|
|
}
|
|
|
|
/*
|
|
* Points to the location (close to) the last block handed to
|
|
* swap_free. The theory is that if you free one in this area,
|
|
* you'll probably free more, so use the hint as a starting point.
|
|
* hint is reset on each free to the block that preceeds the one
|
|
* freed (or the block freed, if we can't find the block before it).
|
|
* It is also reset if it points at block that is allocated.
|
|
*
|
|
* XXX - swap_free and swap_alloc both manipulate hint; the free
|
|
* lists are now protected with splswap(). Don't call into these routines
|
|
* from higher level interrupts!
|
|
*/
|
|
static struct {
|
|
struct anon *ap; /* pointer to the last freed */
|
|
struct swapinfo *sip; /* swap list for which hint is valid */
|
|
} hint;
|
|
|
|
int swap_hit; /* hint helped */
|
|
int swap_miss; /* hint was no good */
|
|
|
|
|
|
/*
|
|
* Allocate a single page from the virtual swap device.
|
|
*/
|
|
struct anon *
|
|
swap_alloc()
|
|
{
|
|
struct swapinfo *sip = silast;
|
|
struct anon *ap;
|
|
|
|
do {
|
|
ap = sip->si_free;
|
|
if (ap) {
|
|
/*
|
|
* can't condition this on swap_order since some
|
|
* idiot might turn it on and off. It's not cool
|
|
* to have the hint point at an allocated block.
|
|
*/
|
|
if (hint.sip == sip && hint.ap == ap)
|
|
hint.sip = NULL;
|
|
sip->si_free = ap->un.an_next;
|
|
if (++sip->si_allocs >= swap_maxcontig) {
|
|
sip->si_allocs = 0;
|
|
if (sip == silast) {
|
|
silast = sip->si_next;
|
|
if (silast == NULL)
|
|
silast = swapinfo;
|
|
}
|
|
} else {
|
|
silast = sip;
|
|
}
|
|
# ifdef TRACE
|
|
{
|
|
struct vnode *vp;
|
|
u_int off;
|
|
|
|
swap_xlate(ap, &vp, &off);
|
|
trace3(TR_MP_SWAP, vp, off, ap);
|
|
}
|
|
# endif TRACE
|
|
#ifdef RECORD_USAGE
|
|
if (u.u_procp) {
|
|
/* swap monitoring is on - record the current PID */
|
|
sip->si_pid[ap - sip->si_anon] = u.u_procp->p_pid;
|
|
}
|
|
#endif RECORD_USAGE
|
|
return (ap);
|
|
}
|
|
/*
|
|
* No more free anon slots here.
|
|
*/
|
|
sip->si_allocs = 0;
|
|
sip = sip->si_next;
|
|
if (sip == NULL)
|
|
sip = swapinfo;
|
|
} while (sip != silast);
|
|
return ((struct anon *)NULL);
|
|
}
|
|
|
|
/*
|
|
* Free a swap page.
|
|
* List is maintained in sorted order. Worst case is a linear search on the
|
|
* list; we maintain a hint to mitigate this.
|
|
*
|
|
* Pointing the hint at the most recently free'd anon struct makes it
|
|
* really fast to free anon pages in ascending order.
|
|
*
|
|
* Pointing the hint at the anon struct that is just *before* this makes
|
|
* it really fast to free anon pages in descending order, at nearly zero
|
|
* cost.
|
|
*
|
|
* This alogrithm points the hint at the anon struct that points to
|
|
* the one most recently free'd. When freeing a block of anon structs
|
|
* presented in ascending order, the hint advances one block behind
|
|
* the blocks as they are free'd. When freeing a block of anon structs
|
|
* precented in descending order -- which happens if a large hunk of
|
|
* memory is allocated in reverse order then free'd in forward order,
|
|
* common enough to be a problem -- the hint remains pointing at the
|
|
* anon struct that ends up pointing at each of the free'd blocks
|
|
* in order. This is worth an example.
|
|
*
|
|
* Assume anons #2 and #9 are free, the hint points to anon #2, and
|
|
* #2's "next" pointer goes to #9. Now, we present a set of swap_free
|
|
* requests for blocks #8 through #3, in descending order. This results
|
|
* in a series of hits on the hint, which just keeps pointing at #2.
|
|
* The previous algorithm would have set the hint to each block as
|
|
* it came in, resulting in worst-case behavior as the list had to
|
|
* be scanned from the front.
|
|
*/
|
|
void
|
|
swap_free(ap)
|
|
struct anon *ap;
|
|
{
|
|
register struct swapinfo *sip = silast;
|
|
register struct anon *tap, **tapp;
|
|
register struct anon *tap_hint;
|
|
|
|
/*
|
|
* Find the swap area containing ap and then put
|
|
* ap at the head of that area's free list.
|
|
*/
|
|
do {
|
|
if (sip->si_anon <= ap && ap <= sip->si_eanon) {
|
|
/*
|
|
ap->un.an_next = sip->si_free;
|
|
sip->si_free = ap;
|
|
*/
|
|
/*
|
|
* old unordered way
|
|
*/
|
|
if (!swap_order) {
|
|
ap->un.an_next = sip->si_free;
|
|
sip->si_free = ap;
|
|
#ifdef RECORD_USAGE
|
|
/* Swap monitoring is on - undo the PID */
|
|
sip->si_pid[ap - sip->si_anon] = 0;
|
|
#endif RECORD_USAGE
|
|
return;
|
|
}
|
|
/*
|
|
* Do it in order; use hint if possible
|
|
*/
|
|
tap = hint.ap;
|
|
if (hint.sip == sip && tap < ap) {
|
|
/*
|
|
* The anon we are freeing
|
|
* follows the hint tap somewhere.
|
|
* save the hint and advance
|
|
* to the next free anon.
|
|
*/
|
|
tapp = &tap->un.an_next;
|
|
tap_hint = tap;
|
|
tap = tap->un.an_next;
|
|
swap_hit++;
|
|
} else {
|
|
/*
|
|
* Wrong swapinfo, or
|
|
* the anon being free'd
|
|
* preceeds the hint.
|
|
* must start scanning
|
|
* from the front of the
|
|
* list. The best hint we
|
|
* can seed with is the
|
|
* anon we are freeing.
|
|
*/
|
|
tapp = &sip->si_free;
|
|
tap = sip->si_free;
|
|
tap_hint = ap;
|
|
swap_miss++;
|
|
}
|
|
/*
|
|
* advance tap until it is greater
|
|
* than the incoming anon.
|
|
*/
|
|
while (tap && tap < ap) {
|
|
tapp = &tap->un.an_next;
|
|
tap_hint = tap;
|
|
tap = tap->un.an_next;
|
|
}
|
|
*tapp = ap;
|
|
ap->un.an_next = tap;
|
|
#ifdef RECORD_USAGE
|
|
/* Swap monitoring is on - undo the PID */
|
|
sip->si_pid[ap - sip->si_anon] = 0;
|
|
#endif RECORD_USAGE
|
|
hint.sip = sip;
|
|
hint.ap = tap_hint;
|
|
return;
|
|
}
|
|
sip = sip->si_next;
|
|
if (sip == NULL)
|
|
sip = swapinfo;
|
|
} while (sip != silast);
|
|
panic("swap_free");
|
|
/* NOTREACHED */
|
|
}
|
|
|
|
/*
|
|
* Return the <vnode, offset> pair
|
|
* corresponding to the given anon struct.
|
|
*/
|
|
void
|
|
swap_xlate(ap, vpp, offsetp)
|
|
struct anon *ap;
|
|
struct vnode **vpp;
|
|
u_int *offsetp;
|
|
{
|
|
register struct swapinfo *sip = silast;
|
|
|
|
do {
|
|
if (sip->si_anon <= ap && ap <= sip->si_eanon) {
|
|
*offsetp = ptob(ap - sip->si_anon);
|
|
*vpp = sip->si_vp;
|
|
return;
|
|
}
|
|
sip = sip->si_next;
|
|
if (sip == NULL)
|
|
sip = swapinfo;
|
|
} while (sip != silast);
|
|
panic("swap_xlate");
|
|
/* NOTREACHED */
|
|
}
|
|
|
|
/*
|
|
* Like swap_xlate, but return a status instead of panic'ing.
|
|
* Used by dump routines when we know we may be corrupted.
|
|
*/
|
|
swap_xlate_nopanic(ap, vpp, offsetp)
|
|
struct anon *ap;
|
|
struct vnode **vpp;
|
|
u_int *offsetp;
|
|
{
|
|
register struct swapinfo *sip = swapinfo;
|
|
|
|
do {
|
|
if (sip->si_anon <= ap && ap <= sip->si_eanon) {
|
|
*offsetp = (ap - sip->si_anon) << PAGESHIFT;
|
|
*vpp = sip->si_vp;
|
|
return (1);
|
|
}
|
|
} while (sip = sip->si_next);
|
|
|
|
/* Couldn't find it; return failure */
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* Return the anon struct corresponding for the given
|
|
* <vnode, offset> if it is part of the virtual swap device.
|
|
*/
|
|
struct anon *
|
|
swap_anon(vp, offset)
|
|
struct vnode *vp;
|
|
u_int offset;
|
|
{
|
|
register struct swapinfo *sip = silast;
|
|
|
|
if (vp && sip) {
|
|
do {
|
|
if (vp == sip->si_vp && offset < sip->si_size)
|
|
return (sip->si_anon + (offset >> PAGESHIFT));
|
|
sip = sip->si_next;
|
|
if (sip == NULL)
|
|
sip = swapinfo;
|
|
} while (sip != silast);
|
|
}
|
|
/*
|
|
* Note - we don't return the anon structure for
|
|
* fake'd anon slots which have no real vp.
|
|
*/
|
|
return ((struct anon *)NULL);
|
|
}
|
|
|
|
/*
|
|
* swread and swwrite implement the /dev/drum device, an indirect,
|
|
* user visible, device to allow reading of the (virtual) swap device.
|
|
*/
|
|
|
|
/*ARGSUSED*/
|
|
swread(dev, uio)
|
|
dev_t dev;
|
|
struct uio *uio;
|
|
{
|
|
|
|
return (sw_rdwr(uio, UIO_READ));
|
|
}
|
|
|
|
/*ARGSUSED*/
|
|
swwrite(dev, uio)
|
|
dev_t dev;
|
|
struct uio *uio;
|
|
{
|
|
|
|
return (sw_rdwr(uio, UIO_WRITE));
|
|
}
|
|
|
|
/*
|
|
* Handle all the work of reading "fake" swap pages that are in memory.
|
|
*/
|
|
static int
|
|
fake_sw_rdwr(uio, rw, cred)
|
|
register struct uio *uio;
|
|
enum uio_rw rw;
|
|
struct ucred *cred;
|
|
{
|
|
struct page *pp;
|
|
struct vnode *memvp;
|
|
int nbytes;
|
|
u_int off;
|
|
int err;
|
|
extern int mem_no;
|
|
|
|
nbytes = uio->uio_resid;
|
|
off = uio->uio_offset;
|
|
memvp = makespecvp(makedev(mem_no, M_MEM), VCHR);
|
|
|
|
do {
|
|
/*
|
|
* Find the page corresponding to the "fake" name
|
|
* and then read the corresponding page from /dev/mem.
|
|
*/
|
|
pp = page_find((struct vnode *)NULL, (u_int)(off & PAGEMASK));
|
|
if (pp == NULL) {
|
|
err = EIO;
|
|
break;
|
|
}
|
|
uio->uio_offset = ptob(page_pptonum(pp)) + (off & PAGEOFFSET);
|
|
|
|
if ((off & PAGEOFFSET) == 0)
|
|
uio->uio_resid = MIN(PAGESIZE, nbytes);
|
|
else
|
|
uio->uio_resid = min(ptob(btopr(off)) - off,
|
|
(u_int)nbytes);
|
|
nbytes -= uio->uio_resid;
|
|
off += uio->uio_resid;
|
|
err = VOP_RDWR(memvp, uio, rw, 0, cred);
|
|
} while (err == 0 && nbytes > 0 && uio->uio_resid == 0);
|
|
|
|
VN_RELE(memvp);
|
|
return (err);
|
|
}
|
|
|
|
/*
|
|
* Common routine used to break up reads and writes to the
|
|
* (virtual) swap device to the underlying vnode(s). This is
|
|
* used to implement the user visable /dev/drum interface.
|
|
*/
|
|
static int
|
|
sw_rdwr(uio, rw)
|
|
register struct uio *uio;
|
|
enum uio_rw rw;
|
|
{
|
|
register struct swapinfo *sip = swapinfo;
|
|
int nbytes = uio->uio_resid;
|
|
u_int off = 0;
|
|
int err = 0;
|
|
|
|
do {
|
|
if (uio->uio_offset >= off &&
|
|
uio->uio_offset < off + sip->si_size)
|
|
break;
|
|
off += sip->si_size;
|
|
} while (sip = sip->si_next);
|
|
|
|
if (sip) {
|
|
uio->uio_offset -= off;
|
|
do {
|
|
uio->uio_resid = MIN(sip->si_size - uio->uio_offset,
|
|
nbytes);
|
|
nbytes -= uio->uio_resid;
|
|
if (sip->si_vp)
|
|
err = VOP_RDWR(sip->si_vp, uio, rw, 0,
|
|
u.u_cred);
|
|
else
|
|
err = fake_sw_rdwr(uio, rw, u.u_cred);
|
|
uio->uio_offset = 0;
|
|
} while (err == 0 && nbytes > 0 && uio->uio_resid == 0 &&
|
|
(sip = sip->si_next));
|
|
uio->uio_resid = nbytes + uio->uio_resid;
|
|
}
|
|
|
|
return (err);
|
|
}
|
|
|
|
/*
|
|
* System call swapon(name) enables swapping on device name,
|
|
* Return EBUSY if already swapping on this device.
|
|
*/
|
|
swapon()
|
|
{
|
|
register struct a {
|
|
char *name;
|
|
} *uap = (struct a *)u.u_ap;
|
|
struct vnode *vp;
|
|
|
|
if (!suser())
|
|
return;
|
|
uap = (struct a *)u.u_ap;
|
|
if (u.u_error = lookupname(uap->name, UIOSEG_USER, FOLLOW_LINK,
|
|
(struct vnode **)NULL, &vp))
|
|
return;
|
|
|
|
switch (vp->v_type) {
|
|
case VBLK: {
|
|
struct vnode *nvp;
|
|
|
|
nvp = bdevvp(vp->v_rdev);
|
|
VN_RELE(vp);
|
|
vp = nvp;
|
|
/*
|
|
* Call the partition's open routine, to give it a chance to
|
|
* check itself for consistency (e.g., for scrambled disk
|
|
* labels). (The open isn't otherwise required.)
|
|
*/
|
|
if (u.u_error = VOP_OPEN(&vp, FREAD|FWRITE, u.u_cred))
|
|
goto out;
|
|
break;
|
|
}
|
|
|
|
case VREG:
|
|
if (vp->v_vfsp->vfs_flag & VFS_RDONLY) {
|
|
u.u_error = EROFS;
|
|
goto out;
|
|
}
|
|
if (u.u_error = VOP_ACCESS(vp, VREAD|VWRITE, u.u_cred))
|
|
goto out;
|
|
if (u.u_error = VOP_OPEN(&vp, FREAD|FWRITE, u.u_cred))
|
|
goto out;
|
|
break;
|
|
|
|
case VDIR:
|
|
u.u_error = EISDIR;
|
|
goto out;
|
|
|
|
case VCHR:
|
|
case VSOCK:
|
|
default:
|
|
u.u_error = EOPNOTSUPP;
|
|
goto out;
|
|
}
|
|
u.u_error = swap_init(vp);
|
|
out:
|
|
if (u.u_error) {
|
|
VN_RELE(vp);
|
|
}
|
|
}
|