2021-10-11 18:20:23 -03:00

634 lines
16 KiB
C

/* @(#)vm_swap.c 1.1 92/07/30 SMI */
#ident "$SunId: @(#)vm_swap.c 1.2 91/02/19 SMI [RMTC] $"
/*
* Copyright (c) 1988, 1989 by Sun Microsystems, Inc.
*/
/*
* Virtual swap device
*
* The virtual swap device consists of the logical concatenation of one
* or more physical swap areas. It provides a logical array of anon
* slots, each of which corresponds to a page of swap space.
*
* Each physical swap area has an associated anon array representing
* its physical storage. These anon arrays are logically concatenated
* sequentially to form the overall swap device anon array. Thus, the
* offset of a given entry within this logical array is computed as the
* sum of the sizes of each area preceding the entry plus the offset
* within the area containing the entry.
*
* The anon array entries for unused swap slots within an area are
* linked together into a free list. Allocation proceeds by finding a
* suitable area (attempting to balance use among all the areas) and
* then returning the first free entry within the area. Thus, there's
* no linear relation between offset within the swap device and the
* address (within its segment(s)) of the page that the slot backs;
* instead, it's an arbitrary one-to-one mapping.
*
* Associated with each swap area is a swapinfo structure. These
* structures are linked into a linear list that determines the
* ordering of swap areas in the logical swap device. Each contains a
* pointer to the corresponding anon array, the area's size, and its
* associated vnode.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/uio.h>
#include <sys/conf.h>
#include <sys/bootconf.h>
#include <sys/trace.h>
#include <vm/hat.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <vm/swap.h>
/* these includes are used for the "fake" swap support of /dev/drum */
#include <sun/mem.h>
#include <specfs/snode.h>
/*
* silast is the swap area at which swap_alloc, swap_free, swap_xlate
* and swap_anon begin their circular walk of the swap area list; it
* is set to the first area created and thereafter moved by swap_alloc.
* swapinfo is the head of that list (linked through si_next); new
* areas are appended at the tail by swapinfo_init.
*/
static struct swapinfo *silast;
struct swapinfo *swapinfo;
/*
* To balance the load among multiple swap areas, we don't allow
* more than swap_maxcontig allocations to be satisfied from a
* single swap area before moving on to the next swap area. This
* effectively "interleaves" allocations among the many swap areas.
*/
int swap_maxcontig = 1024 * 1024 / PAGESIZE; /* 1MB of pages */
extern int klustsize; /* from spec_vnodeops.c; rounds the miniroot skip in swap_init */
int swap_order = 1; /* nonzero: swap_free keeps free lists address-sorted; zero: push on head */
#define MINIROOTSIZE 14000 /* ~7 Meg; converted with dbtob() in swap_init */
/*
 * Create the swapinfo structure for the swap area backed by vp and
 * append it to the tail of the swap area list.
 *
 * npages is the size of the area in pages and skip is the number of
 * leading slots that must never be handed out.  Returns EBUSY when an
 * area with the same vnode is already on the list, otherwise 0.
 */
static int
swapinfo_init(vp, npages, skip)
	struct vnode *vp;
	register u_int npages;
	u_int skip;
{
	register struct anon *slot, *first;
	register struct swapinfo **linkp, *area;

	/*
	 * Walk to the end of the list, rejecting a vnode that is
	 * already in use as a swap device.  On exit linkp addresses
	 * the terminating si_next pointer.
	 */
	linkp = &swapinfo;
	while ((area = *linkp) != NULL) {
		if (area->si_vp == vp)
			return (EBUSY);
		linkp = &area->si_next;
	}

	area = (struct swapinfo *)new_kmem_zalloc(
	    sizeof (struct swapinfo), KMEM_SLEEP);
	area->si_vp = vp;
	area->si_size = ptob(npages);

	/*
	 * Always allocate at least skip slots so that the pointer
	 * arithmetic below stays inside the array even when the
	 * area is smaller than the region being skipped.
	 */
	if (npages < skip)
		npages = skip;
	area->si_anon = (struct anon *)new_kmem_zalloc(
	    npages * sizeof (struct anon), KMEM_SLEEP);
	area->si_eanon = &area->si_anon[npages - 1];
#ifdef RECORD_USAGE
	/*
	 * Swap usage monitoring: a parallel array records the PID
	 * that caused each anon slot to be allocated.
	 */
	area->si_pid = (short *)
	    new_kmem_zalloc(npages * sizeof (short), KMEM_SLEEP);
#endif /* RECORD_USAGE */

	/* From here on npages counts only the usable slots. */
	npages -= skip;

	/*
	 * Thread the usable slots into a free list running from the
	 * first usable slot up to the last slot of the array.  The
	 * last slot keeps the NULL link it got from the zeroed
	 * allocation.
	 */
	first = area->si_anon + skip;
	for (slot = area->si_eanon - 1; slot >= first; slot--)
		slot->un.an_next = slot + 1;
	if (npages != 0)
		area->si_free = first;
	else
		area->si_free = NULL;	/* nothing usable: size was <= skip */

	anoninfo.ani_free += npages;
	anoninfo.ani_max += npages;

	*linkp = area;
	if (silast == NULL)		/* first swap device */
		silast = area;
	return (0);
}
/*
* Initialize a swap vnode.
*/
int
swap_init(vp)
struct vnode *vp;
{
struct vattr vattr;
u_int skip;
int err;
err = VOP_GETATTR(vp, &vattr, u.u_cred); /* XXX - u.u_cred? */
if (err) {
printf("swap_init: getattr failed, errno %d\n", err);
return (err);
}
/*
* To prevent swap I/O requests from crossing the boundary
* between swap areas, we erect a "fence" between areas by
* not allowing the first page of each swap area to be used.
* (This also prevents us from scribbling on the disk label
* if the swap partition is the first partition on the disk.)
* This may not be strictly necessary, since swap_blksize also
* prevents requests from crossing the boundary.
*
* If swapping on the root filesystem, don't put swap blocks that
* correspond to the miniroot filesystem on the swap free list.
*/
if (rootvp == vp)
skip = btoc(roundup(dbtob(MINIROOTSIZE), klustsize));
else
skip = 1;
err = swapinfo_init(vp, (u_int)btop(vattr.va_size), skip);
if (!err)
vp->v_flag |= VISSWAP;
return (err);
}
/*
* This routine is used to fake npages worth of swap space.
* These pages will have no backing and cannot be paged out any where.
*/
swap_cons(npages)
u_int npages;
{
if (swapinfo_init((struct vnode *)NULL, npages, 0) != 0)
panic("swap_cons");
}
/*
* Insertion hint used by swap_free and invalidated by swap_alloc.
*
* Points to the location (close to) the last block handed to
* swap_free. The theory is that if you free one in this area,
* you'll probably free more, so use the hint as a starting point.
* The hint is reset on each free to the block that precedes the one
* freed (or the block freed, if we can't find the block before it).
* It is also reset if it points at a block that is allocated.
*
* XXX - swap_free and swap_alloc both manipulate hint; the free
* lists are now protected with splswap(). Don't call into these routines
* from higher level interrupts!
*
* NOTE(review): no splswap() call is visible in swap_alloc or swap_free
* in this file; presumably the callers raise the priority -- confirm.
*/
static struct {
struct anon *ap; /* slot recorded by the last ordered swap_free */
struct swapinfo *sip; /* swap area for which ap is valid; NULL when invalidated */
} hint;
int swap_hit; /* count of ordered frees that could start from the hint */
int swap_miss; /* count of ordered frees that had to scan from the list head */
/*
 * Hand out one free slot from the virtual swap device.
 *
 * The search starts at silast and walks the swap areas circularly.
 * Returns the anon slot taken from the head of the chosen area's free
 * list, or NULL when every area is exhausted.
 */
struct anon *
swap_alloc()
{
	struct swapinfo *area = silast;
	struct anon *slot;

	do {
		slot = area->si_free;
		if (slot == NULL) {
			/*
			 * This area is exhausted; restart its run
			 * counter and move on, wrapping at the end.
			 */
			area->si_allocs = 0;
			area = area->si_next;
			if (area == NULL)
				area = swapinfo;
			continue;
		}

		/*
		 * The hint must never be left pointing at an
		 * allocated slot.  This is done regardless of
		 * swap_order because that switch may be flipped
		 * at any time.
		 */
		if (hint.sip == area && hint.ap == slot)
			hint.sip = NULL;
		area->si_free = slot->un.an_next;

		/*
		 * Interleave among areas: once swap_maxcontig slots
		 * in a row have come from this area, start the next
		 * search at the following area.
		 */
		area->si_allocs++;
		if (area->si_allocs < swap_maxcontig) {
			silast = area;
		} else {
			area->si_allocs = 0;
			if (area == silast) {
				if (area->si_next != NULL)
					silast = area->si_next;
				else
					silast = swapinfo;
			}
		}
#ifdef TRACE
		{
			struct vnode *tvp;
			u_int toff;

			swap_xlate(slot, &tvp, &toff);
			trace3(TR_MP_SWAP, tvp, toff, slot);
		}
#endif /* TRACE */
#ifdef RECORD_USAGE
		/* swap monitoring is on - record the current PID */
		if (u.u_procp) {
			area->si_pid[slot - area->si_anon] =
			    u.u_procp->p_pid;
		}
#endif /* RECORD_USAGE */
		return (slot);
	} while (area != silast);

	return ((struct anon *)NULL);
}
/*
* Free a swap page.
* List is maintained in sorted order. Worst case is a linear search on the
* list; we maintain a hint to mitigate this.
*
* Pointing the hint at the most recently free'd anon struct makes it
* really fast to free anon pages in ascending order.
*
* Pointing the hint at the anon struct that is just *before* this makes
* it really fast to free anon pages in descending order, at nearly zero
* cost.
*
* This algorithm points the hint at the anon struct that points to
* the one most recently free'd. When freeing a block of anon structs
* presented in ascending order, the hint advances one block behind
* the blocks as they are free'd. When freeing a block of anon structs
* presented in descending order -- which happens if a large hunk of
* memory is allocated in reverse order then free'd in forward order,
* common enough to be a problem -- the hint remains pointing at the
* anon struct that ends up pointing at each of the free'd blocks
* in order. This is worth an example.
*
* Assume anons #2 and #9 are free, the hint points to anon #2, and
* #2's "next" pointer goes to #9. Now, we present a set of swap_free
* requests for blocks #8 through #3, in descending order. This results
* in a series of hits on the hint, which just keeps pointing at #2.
* The previous algorithm would have set the hint to each block as
* it came in, resulting in worst-case behavior as the list had to
* be scanned from the front.
*/
void
swap_free(ap)
	struct anon *ap;
{
	register struct swapinfo *area = silast;
	register struct anon *next, **linkp;
	register struct anon *prev;

	/*
	 * Locate the swap area whose anon array contains ap, then
	 * return ap to that area's free list.  Panics if no area
	 * claims the slot.
	 */
	do {
		if (ap < area->si_anon || ap > area->si_eanon) {
			/* Not in this area; try the next, wrapping. */
			area = area->si_next;
			if (area == NULL)
				area = swapinfo;
			continue;
		}

		if (!swap_order) {
			/* Unordered mode: push on the head of the list. */
			ap->un.an_next = area->si_free;
			area->si_free = ap;
		} else {
			/*
			 * Ordered mode: keep the list sorted by
			 * address.  If the hint belongs to this area
			 * and lies below ap, the scan can start at
			 * the hint; otherwise it starts at the head
			 * of the list, and the best seed for the new
			 * hint is ap itself.
			 */
			prev = hint.ap;
			if (hint.sip != area || prev >= ap) {
				linkp = &area->si_free;
				prev = ap;
				swap_miss++;
			} else {
				linkp = &prev->un.an_next;
				swap_hit++;
			}

			/*
			 * Step forward until the next free slot is
			 * not below ap, remembering the slot we last
			 * stepped over.
			 */
			for (next = *linkp; next != NULL && next < ap;
			    next = *linkp) {
				prev = next;
				linkp = &next->un.an_next;
			}

			/* Splice ap in ahead of next. */
			*linkp = ap;
			ap->un.an_next = next;

			hint.sip = area;
			hint.ap = prev;
		}
#ifdef RECORD_USAGE
		/* Swap monitoring is on - undo the PID */
		area->si_pid[ap - area->si_anon] = 0;
#endif /* RECORD_USAGE */
		return;
	} while (area != silast);

	panic("swap_free");
	/* NOTREACHED */
}
/*
 * Translate an anon slot into the <vnode, offset> pair that names
 * its backing store.  The results are stored through vpp and offsetp.
 * Panics if ap lies in no swap area.
 */
void
swap_xlate(ap, vpp, offsetp)
	struct anon *ap;
	struct vnode **vpp;
	u_int *offsetp;
{
	register struct swapinfo *area = silast;

	do {
		if (ap < area->si_anon || ap > area->si_eanon) {
			/* Not in this area; advance circularly. */
			area = area->si_next;
			if (area == NULL)
				area = swapinfo;
			continue;
		}
		*vpp = area->si_vp;
		*offsetp = ptob(ap - area->si_anon);
		return;
	} while (area != silast);

	panic("swap_xlate");
	/* NOTREACHED */
}
/*
* Like swap_xlate, but return a status instead of panic'ing.
* Used by dump routines when we know we may be corrupted.
*/
swap_xlate_nopanic(ap, vpp, offsetp)
struct anon *ap;
struct vnode **vpp;
u_int *offsetp;
{
register struct swapinfo *sip = swapinfo;
do {
if (sip->si_anon <= ap && ap <= sip->si_eanon) {
*offsetp = (ap - sip->si_anon) << PAGESHIFT;
*vpp = sip->si_vp;
return (1);
}
} while (sip = sip->si_next);
/* Couldn't find it; return failure */
return (0);
}
/*
 * Map a <vnode, offset> pair back to its anon slot.
 *
 * Returns the slot when vp backs one of the swap areas and offset
 * falls inside that area; otherwise returns NULL.  Faked anon slots,
 * which have no real vnode, are never returned.
 */
struct anon *
swap_anon(vp, offset)
	struct vnode *vp;
	u_int offset;
{
	register struct swapinfo *area = silast;

	if (vp == NULL || area == NULL)
		return ((struct anon *)NULL);

	do {
		if (area->si_vp == vp && offset < area->si_size)
			return (&area->si_anon[offset >> PAGESHIFT]);
		area = area->si_next;
		if (area == NULL)
			area = swapinfo;
	} while (area != silast);

	return ((struct anon *)NULL);
}
/*
 * swread and swwrite implement the /dev/drum device, an indirect,
 * user visible, device to allow reading of the (virtual) swap device.
 */

/*
 * sw_rdwr is defined further down as a static function.  Declare it
 * before its first use so the call below is not taken as an implicit
 * declaration of an external function, which would conflict with the
 * later static definition.
 */
static int sw_rdwr();

/*
 * Read entry point for /dev/drum; dev is unused.
 * Returns the error code from sw_rdwr.
 */
/*ARGSUSED*/
int
swread(dev, uio)
	dev_t dev;
	struct uio *uio;
{
	return (sw_rdwr(uio, UIO_READ));
}
/*ARGSUSED*/
swwrite(dev, uio)
dev_t dev;
struct uio *uio;
{
return (sw_rdwr(uio, UIO_WRITE));
}
/*
 * Handle all the work of reading or writing "fake" swap pages that
 * are in memory.
 *
 * The request described by uio is split into pieces that do not cross
 * a page boundary.  Each piece is located with page_find and then
 * transferred through the memory special device.
 *
 * Returns 0 or an error code (EIO when no page is found for an
 * offset).  On return uio->uio_resid holds every byte of the request
 * that was not transferred: the residual of the last piece plus the
 * part of the request that was never attempted.  uio->uio_offset is
 * left at a physical memory offset; the caller is expected to reset it.
 */
static int
fake_sw_rdwr(uio, rw, cred)
	register struct uio *uio;
	enum uio_rw rw;
	struct ucred *cred;
{
	struct page *pp;
	struct vnode *memvp;
	int nbytes;		/* part of the request not yet attempted */
	int leftover = 0;	/* residual of the most recent piece */
	u_int off;
	int err;
	extern int mem_no;

	nbytes = uio->uio_resid;
	off = uio->uio_offset;
	memvp = makespecvp(makedev(mem_no, M_MEM), VCHR);

	do {
		/*
		 * Find the page corresponding to the "fake" name
		 * and then read the corresponding page from /dev/mem.
		 */
		pp = page_find((struct vnode *)NULL, (u_int)(off & PAGEMASK));
		if (pp == NULL) {
			err = EIO;
			break;
		}
		uio->uio_offset = ptob(page_pptonum(pp)) + (off & PAGEOFFSET);

		/* Transfer at most up to the end of the current page. */
		if ((off & PAGEOFFSET) == 0)
			uio->uio_resid = MIN(PAGESIZE, nbytes);
		else
			uio->uio_resid = min(ptob(btopr(off)) - off,
			    (u_int)nbytes);
		nbytes -= uio->uio_resid;
		off += uio->uio_resid;

		err = VOP_RDWR(memvp, uio, rw, 0, cred);
		leftover = uio->uio_resid;
	} while (err == 0 && nbytes > 0 && leftover == 0);

	/*
	 * Report the true residual.  Without adding nbytes back, a
	 * short transfer or a failure on a later page would make the
	 * caller believe the unattempted bytes had been moved.
	 */
	uio->uio_resid = leftover + nbytes;

	VN_RELE(memvp);
	return (err);
}
/*
* Common routine used to break up reads and writes to the
* (virtual) swap device to the underlying vnode(s). This is
* used to implement the user visable /dev/drum interface.
*/
static int
sw_rdwr(uio, rw)
register struct uio *uio;
enum uio_rw rw;
{
register struct swapinfo *sip = swapinfo;
int nbytes = uio->uio_resid;
u_int off = 0;
int err = 0;
do {
if (uio->uio_offset >= off &&
uio->uio_offset < off + sip->si_size)
break;
off += sip->si_size;
} while (sip = sip->si_next);
if (sip) {
uio->uio_offset -= off;
do {
uio->uio_resid = MIN(sip->si_size - uio->uio_offset,
nbytes);
nbytes -= uio->uio_resid;
if (sip->si_vp)
err = VOP_RDWR(sip->si_vp, uio, rw, 0,
u.u_cred);
else
err = fake_sw_rdwr(uio, rw, u.u_cred);
uio->uio_offset = 0;
} while (err == 0 && nbytes > 0 && uio->uio_resid == 0 &&
(sip = sip->si_next));
uio->uio_resid = nbytes + uio->uio_resid;
}
return (err);
}
/*
 * System call swapon(name): enable swapping on the named device or
 * file.  Restricted to the super-user.  The outcome is reported in
 * u.u_error; EBUSY means swapping is already enabled on the device.
 * On failure the vnode reference is dropped; on success it is kept.
 */
swapon()
{
	register struct a {
		char	*name;
	} *uap;
	struct vnode *vp;

	if (!suser())
		return;

	uap = (struct a *)u.u_ap;
	u.u_error = lookupname(uap->name, UIOSEG_USER, FOLLOW_LINK,
	    (struct vnode **)NULL, &vp);
	if (u.u_error)
		return;

	if (vp->v_type == VBLK) {
		struct vnode *bvp;

		/* Swap the looked-up vnode for the block device vnode. */
		bvp = bdevvp(vp->v_rdev);
		VN_RELE(vp);
		vp = bvp;
		/*
		 * Call the partition's open routine, to give it a
		 * chance to check itself for consistency (e.g., for
		 * scrambled disk labels).  (The open isn't otherwise
		 * required.)
		 */
		u.u_error = VOP_OPEN(&vp, FREAD|FWRITE, u.u_cred);
	} else if (vp->v_type == VREG) {
		/* A regular file must be writable and accessible. */
		if (vp->v_vfsp->vfs_flag & VFS_RDONLY) {
			u.u_error = EROFS;
		} else {
			u.u_error = VOP_ACCESS(vp, VREAD|VWRITE, u.u_cred);
			if (u.u_error == 0)
				u.u_error = VOP_OPEN(&vp, FREAD|FWRITE,
				    u.u_cred);
		}
	} else if (vp->v_type == VDIR) {
		u.u_error = EISDIR;
	} else {
		/* VCHR, VSOCK and anything else cannot be swapped on. */
		u.u_error = EOPNOTSUPP;
	}

	if (u.u_error == 0)
		u.u_error = swap_init(vp);

	if (u.u_error) {
		VN_RELE(vp);
	}
}