/* @(#)vm_swap.c 1.1 92/07/30 SMI */ #ident "$SunId: @(#)vm_swap.c 1.2 91/02/19 SMI [RMTC] $" /* * Copyright (c) 1988, 1989 by Sun Microsystems, Inc. */ /* * Virtual swap device * * The virtual swap device consists of the logical concatenation of one * or more physical swap areas. It provides a logical array of anon * slots, each of which corresponds to a page of swap space. * * Each physical swap area has an associated anon array representing * its physical storage. These anon arrays are logically concatenated * sequentially to form the overall swap device anon array. Thus, the * offset of a given entry within this logical array is computed as the * sum of the sizes of each area preceding the entry plus the offset * within the area containing the entry. * * The anon array entries for unused swap slots within an area are * linked together into a free list. Allocation proceeds by finding a * suitable area (attempting to balance use among all the areas) and * then returning the first free entry within the area. Thus, there's * no linear relation between offset within the swap device and the * address (within its segment(s)) of the page that the slot backs; * instead, it's an arbitrary one-to-one mapping. * * Associated with each swap area is a swapinfo structure. These * structures are linked into a linear list that determines the * ordering of swap areas in the logical swap device. Each contains a * pointer to the corresponding anon array, the area's size, and its * associated vnode. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* these includes are used for the "fake" swap support of /dev/drum */ #include #include static struct swapinfo *silast; struct swapinfo *swapinfo; /* * To balance the load among multiple swap areas, we don't allow * more than swap_maxcontig allocations to be satisfied from a * single swap area before moving on to the next swap area. This * effectively "interleaves" allocations among the many swap areas. */ int swap_maxcontig = 1024 * 1024 / PAGESIZE; /* 1MB of pages */ extern int klustsize; /* from spec_vnodeops.c */ int swap_order = 1; /* see swap_alloc,free */ #define MINIROOTSIZE 14000 /* ~7 Meg */ /* * Initialize a new swapinfo structure. */ static int swapinfo_init(vp, npages, skip) struct vnode *vp; register u_int npages; u_int skip; { register struct anon *ap, *ap2; register struct swapinfo **sipp, *nsip; for (sipp = &swapinfo; nsip = *sipp; sipp = &nsip->si_next) if (nsip->si_vp == vp) return (EBUSY); /* swap device already in use */ nsip = (struct swapinfo *)new_kmem_zalloc( sizeof (struct swapinfo), KMEM_SLEEP); nsip->si_vp = vp; nsip->si_size = ptob(npages); /* * Don't indirect through NULL if called with npages < skip (too tacky) */ if (npages < skip) npages = skip; nsip->si_anon = (struct anon *)new_kmem_zalloc( npages * sizeof (struct anon), KMEM_SLEEP); nsip->si_eanon = &nsip->si_anon[npages - 1]; #ifdef RECORD_USAGE /* * Monitoring of swap space usage is enabled, so malloc * a parallel array to hold the PID responsible for * causing the anon page to be created. */ nsip->si_pid = (short *) new_kmem_zalloc(npages * sizeof (short), KMEM_SLEEP); #endif RECORD_USAGE npages -= skip; /* * ap2 now points to the first usable slot in the swap area. * Set up free list links so that the head of the list is at * the front of the usable portion of the array. */ ap = nsip->si_eanon; ap2 = nsip->si_anon + skip; while (--ap >= ap2) ap->un.an_next = ap + 1; if (npages == 0) /* if size was <= skip */ nsip->si_free = NULL; else nsip->si_free = ap + 1; anoninfo.ani_free += npages; anoninfo.ani_max += npages; *sipp = nsip; if (silast == NULL) /* first swap device */ silast = nsip; return (0); } /* * Initialize a swap vnode. */ int swap_init(vp) struct vnode *vp; { struct vattr vattr; u_int skip; int err; err = VOP_GETATTR(vp, &vattr, u.u_cred); /* XXX - u.u_cred? */ if (err) { printf("swap_init: getattr failed, errno %d\n", err); return (err); } /* * To prevent swap I/O requests from crossing the boundary * between swap areas, we erect a "fence" between areas by * not allowing the first page of each swap area to be used. * (This also prevents us from scribbling on the disk label * if the swap partition is the first partition on the disk.) * This may not be strictly necessary, since swap_blksize also * prevents requests from crossing the boundary. * * If swapping on the root filesystem, don't put swap blocks that * correspond to the miniroot filesystem on the swap free list. */ if (rootvp == vp) skip = btoc(roundup(dbtob(MINIROOTSIZE), klustsize)); else skip = 1; err = swapinfo_init(vp, (u_int)btop(vattr.va_size), skip); if (!err) vp->v_flag |= VISSWAP; return (err); } /* * This routine is used to fake npages worth of swap space. * These pages will have no backing and cannot be paged out any where. */ swap_cons(npages) u_int npages; { if (swapinfo_init((struct vnode *)NULL, npages, 0) != 0) panic("swap_cons"); } /* * Points to the location (close to) the last block handed to * swap_free. The theory is that if you free one in this area, * you'll probably free more, so use the hint as a starting point. * hint is reset on each free to the block that preceeds the one * freed (or the block freed, if we can't find the block before it). * It is also reset if it points at block that is allocated. * * XXX - swap_free and swap_alloc both manipulate hint; the free * lists are now protected with splswap(). Don't call into these routines * from higher level interrupts! */ static struct { struct anon *ap; /* pointer to the last freed */ struct swapinfo *sip; /* swap list for which hint is valid */ } hint; int swap_hit; /* hint helped */ int swap_miss; /* hint was no good */ /* * Allocate a single page from the virtual swap device. */ struct anon * swap_alloc() { struct swapinfo *sip = silast; struct anon *ap; do { ap = sip->si_free; if (ap) { /* * can't condition this on swap_order since some * idiot might turn it on and off. It's not cool * to have the hint point at an allocated block. */ if (hint.sip == sip && hint.ap == ap) hint.sip = NULL; sip->si_free = ap->un.an_next; if (++sip->si_allocs >= swap_maxcontig) { sip->si_allocs = 0; if (sip == silast) { silast = sip->si_next; if (silast == NULL) silast = swapinfo; } } else { silast = sip; } # ifdef TRACE { struct vnode *vp; u_int off; swap_xlate(ap, &vp, &off); trace3(TR_MP_SWAP, vp, off, ap); } # endif TRACE #ifdef RECORD_USAGE if (u.u_procp) { /* swap monitoring is on - record the current PID */ sip->si_pid[ap - sip->si_anon] = u.u_procp->p_pid; } #endif RECORD_USAGE return (ap); } /* * No more free anon slots here. */ sip->si_allocs = 0; sip = sip->si_next; if (sip == NULL) sip = swapinfo; } while (sip != silast); return ((struct anon *)NULL); } /* * Free a swap page. * List is maintained in sorted order. Worst case is a linear search on the * list; we maintain a hint to mitigate this. * * Pointing the hint at the most recently free'd anon struct makes it * really fast to free anon pages in ascending order. * * Pointing the hint at the anon struct that is just *before* this makes * it really fast to free anon pages in descending order, at nearly zero * cost. * * This alogrithm points the hint at the anon struct that points to * the one most recently free'd. When freeing a block of anon structs * presented in ascending order, the hint advances one block behind * the blocks as they are free'd. When freeing a block of anon structs * precented in descending order -- which happens if a large hunk of * memory is allocated in reverse order then free'd in forward order, * common enough to be a problem -- the hint remains pointing at the * anon struct that ends up pointing at each of the free'd blocks * in order. This is worth an example. * * Assume anons #2 and #9 are free, the hint points to anon #2, and * #2's "next" pointer goes to #9. Now, we present a set of swap_free * requests for blocks #8 through #3, in descending order. This results * in a series of hits on the hint, which just keeps pointing at #2. * The previous algorithm would have set the hint to each block as * it came in, resulting in worst-case behavior as the list had to * be scanned from the front. */ void swap_free(ap) struct anon *ap; { register struct swapinfo *sip = silast; register struct anon *tap, **tapp; register struct anon *tap_hint; /* * Find the swap area containing ap and then put * ap at the head of that area's free list. */ do { if (sip->si_anon <= ap && ap <= sip->si_eanon) { /* ap->un.an_next = sip->si_free; sip->si_free = ap; */ /* * old unordered way */ if (!swap_order) { ap->un.an_next = sip->si_free; sip->si_free = ap; #ifdef RECORD_USAGE /* Swap monitoring is on - undo the PID */ sip->si_pid[ap - sip->si_anon] = 0; #endif RECORD_USAGE return; } /* * Do it in order; use hint if possible */ tap = hint.ap; if (hint.sip == sip && tap < ap) { /* * The anon we are freeing * follows the hint tap somewhere. * save the hint and advance * to the next free anon. */ tapp = &tap->un.an_next; tap_hint = tap; tap = tap->un.an_next; swap_hit++; } else { /* * Wrong swapinfo, or * the anon being free'd * preceeds the hint. * must start scanning * from the front of the * list. The best hint we * can seed with is the * anon we are freeing. */ tapp = &sip->si_free; tap = sip->si_free; tap_hint = ap; swap_miss++; } /* * advance tap until it is greater * than the incoming anon. */ while (tap && tap < ap) { tapp = &tap->un.an_next; tap_hint = tap; tap = tap->un.an_next; } *tapp = ap; ap->un.an_next = tap; #ifdef RECORD_USAGE /* Swap monitoring is on - undo the PID */ sip->si_pid[ap - sip->si_anon] = 0; #endif RECORD_USAGE hint.sip = sip; hint.ap = tap_hint; return; } sip = sip->si_next; if (sip == NULL) sip = swapinfo; } while (sip != silast); panic("swap_free"); /* NOTREACHED */ } /* * Return the pair * corresponding to the given anon struct. */ void swap_xlate(ap, vpp, offsetp) struct anon *ap; struct vnode **vpp; u_int *offsetp; { register struct swapinfo *sip = silast; do { if (sip->si_anon <= ap && ap <= sip->si_eanon) { *offsetp = ptob(ap - sip->si_anon); *vpp = sip->si_vp; return; } sip = sip->si_next; if (sip == NULL) sip = swapinfo; } while (sip != silast); panic("swap_xlate"); /* NOTREACHED */ } /* * Like swap_xlate, but return a status instead of panic'ing. * Used by dump routines when we know we may be corrupted. */ swap_xlate_nopanic(ap, vpp, offsetp) struct anon *ap; struct vnode **vpp; u_int *offsetp; { register struct swapinfo *sip = swapinfo; do { if (sip->si_anon <= ap && ap <= sip->si_eanon) { *offsetp = (ap - sip->si_anon) << PAGESHIFT; *vpp = sip->si_vp; return (1); } } while (sip = sip->si_next); /* Couldn't find it; return failure */ return (0); } /* * Return the anon struct corresponding for the given * if it is part of the virtual swap device. */ struct anon * swap_anon(vp, offset) struct vnode *vp; u_int offset; { register struct swapinfo *sip = silast; if (vp && sip) { do { if (vp == sip->si_vp && offset < sip->si_size) return (sip->si_anon + (offset >> PAGESHIFT)); sip = sip->si_next; if (sip == NULL) sip = swapinfo; } while (sip != silast); } /* * Note - we don't return the anon structure for * fake'd anon slots which have no real vp. */ return ((struct anon *)NULL); } /* * swread and swwrite implement the /dev/drum device, an indirect, * user visible, device to allow reading of the (virtual) swap device. */ /*ARGSUSED*/ swread(dev, uio) dev_t dev; struct uio *uio; { return (sw_rdwr(uio, UIO_READ)); } /*ARGSUSED*/ swwrite(dev, uio) dev_t dev; struct uio *uio; { return (sw_rdwr(uio, UIO_WRITE)); } /* * Handle all the work of reading "fake" swap pages that are in memory. */ static int fake_sw_rdwr(uio, rw, cred) register struct uio *uio; enum uio_rw rw; struct ucred *cred; { struct page *pp; struct vnode *memvp; int nbytes; u_int off; int err; extern int mem_no; nbytes = uio->uio_resid; off = uio->uio_offset; memvp = makespecvp(makedev(mem_no, M_MEM), VCHR); do { /* * Find the page corresponding to the "fake" name * and then read the corresponding page from /dev/mem. */ pp = page_find((struct vnode *)NULL, (u_int)(off & PAGEMASK)); if (pp == NULL) { err = EIO; break; } uio->uio_offset = ptob(page_pptonum(pp)) + (off & PAGEOFFSET); if ((off & PAGEOFFSET) == 0) uio->uio_resid = MIN(PAGESIZE, nbytes); else uio->uio_resid = min(ptob(btopr(off)) - off, (u_int)nbytes); nbytes -= uio->uio_resid; off += uio->uio_resid; err = VOP_RDWR(memvp, uio, rw, 0, cred); } while (err == 0 && nbytes > 0 && uio->uio_resid == 0); VN_RELE(memvp); return (err); } /* * Common routine used to break up reads and writes to the * (virtual) swap device to the underlying vnode(s). This is * used to implement the user visable /dev/drum interface. */ static int sw_rdwr(uio, rw) register struct uio *uio; enum uio_rw rw; { register struct swapinfo *sip = swapinfo; int nbytes = uio->uio_resid; u_int off = 0; int err = 0; do { if (uio->uio_offset >= off && uio->uio_offset < off + sip->si_size) break; off += sip->si_size; } while (sip = sip->si_next); if (sip) { uio->uio_offset -= off; do { uio->uio_resid = MIN(sip->si_size - uio->uio_offset, nbytes); nbytes -= uio->uio_resid; if (sip->si_vp) err = VOP_RDWR(sip->si_vp, uio, rw, 0, u.u_cred); else err = fake_sw_rdwr(uio, rw, u.u_cred); uio->uio_offset = 0; } while (err == 0 && nbytes > 0 && uio->uio_resid == 0 && (sip = sip->si_next)); uio->uio_resid = nbytes + uio->uio_resid; } return (err); } /* * System call swapon(name) enables swapping on device name, * Return EBUSY if already swapping on this device. */ swapon() { register struct a { char *name; } *uap = (struct a *)u.u_ap; struct vnode *vp; if (!suser()) return; uap = (struct a *)u.u_ap; if (u.u_error = lookupname(uap->name, UIOSEG_USER, FOLLOW_LINK, (struct vnode **)NULL, &vp)) return; switch (vp->v_type) { case VBLK: { struct vnode *nvp; nvp = bdevvp(vp->v_rdev); VN_RELE(vp); vp = nvp; /* * Call the partition's open routine, to give it a chance to * check itself for consistency (e.g., for scrambled disk * labels). (The open isn't otherwise required.) */ if (u.u_error = VOP_OPEN(&vp, FREAD|FWRITE, u.u_cred)) goto out; break; } case VREG: if (vp->v_vfsp->vfs_flag & VFS_RDONLY) { u.u_error = EROFS; goto out; } if (u.u_error = VOP_ACCESS(vp, VREAD|VWRITE, u.u_cred)) goto out; if (u.u_error = VOP_OPEN(&vp, FREAD|FWRITE, u.u_cred)) goto out; break; case VDIR: u.u_error = EISDIR; goto out; case VCHR: case VSOCK: default: u.u_error = EOPNOTSUPP; goto out; } u.u_error = swap_init(vp); out: if (u.u_error) { VN_RELE(vp); } }