#ident "@(#)spec_vnodeops.c 1.1 94/10/31 SMI" /* * Copyright (c) 1988 by Sun Microsystems, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int spec_open(); static int spec_close(); static int spec_rdwr(); static int spec_ioctl(); static int spec_select(); static int spec_getattr(); static int spec_inactive(); static int spec_noop(); static int spec_getpage(); static int spec_putpage(); static int spec_map(); static int spec_dump(); static int spec_cmp(); /* * Used directly in fifo_vnodeops */ int spec_setattr(); int spec_access(); int spec_link(); int spec_lockctl(); int spec_fsync(); int spec_fid(); int spec_realvp(); int spec_cntl(); struct vnodeops spec_vnodeops = { spec_open, spec_close, spec_rdwr, spec_ioctl, spec_select, spec_getattr, spec_setattr, spec_access, spec_noop, /* lookup */ spec_noop, /* create */ spec_noop, /* remove */ spec_link, spec_noop, /* rename */ spec_noop, /* mkdir */ spec_noop, /* rmdir */ spec_noop, /* readdir */ spec_noop, /* symlink */ spec_noop, /* readlink */ spec_fsync, spec_inactive, spec_lockctl, spec_fid, spec_getpage, spec_putpage, spec_map, spec_dump, spec_cmp, spec_realvp, spec_cntl, }; /* * open a special file (device) * Some weird stuff here having to do with clone and indirect devices: * When a file lookup operation happens (e.g. ufs_lookup) and the vnode has * type VDEV specvp() is used to return a spec vnode instead. Then when * the VOP_OPEN routine is called, we get control here. When we do the * device open routine there are several possible strange results: * 1) An indirect device will return the error EAGAIN on open and return * a new dev number. We have to make that into a spec vnode and call * open on it again. * 2) The clone device driver will return the error EEXIST and return a * new dev number. As above, we build a new vnode and call open again, * explicitly asking the open routine to do a clone open. * 3) A clone device will return a new dev number on open but no error. * In this case we just make a new spec vnode out of the new dev number * and return that. * The last two cases differ in that the decision to clone arises outside * of the target device in 2) and from within in 3). * * TODO: extend case 2) to apply to all character devices, not just streams * devices. */ #define MAX_S_SIZE \ ((1 << sizeof (off_t) * NBBY - DEV_BSHIFT - 1) - 1) /*ARGSUSED*/ static int spec_open(vpp, flag, cred) struct vnode **vpp; int flag; struct ucred *cred; { register struct snode *sp; dev_t dev; dev_t newdev; int sflag = 0; register int error; sp = VTOS(*vpp); /* * Do open protocol for special type. */ dev = sp->s_dev; switch ((*vpp)->v_type) { case VCHR: newdev = dev; error = 0; for (;;) { register struct vnode *nvp; dev = newdev; if ((u_int)major(dev) >= nchrdev) return (ENXIO); while (isclosing(dev, (*vpp)->v_type)) if ( sleep((caddr_t)sp, PSLEP|PCATCH)) return (EINTR); if (cdevsw[major(dev)].d_str) { /* * Open the stream. Stropen handles * the mechanics of cloning itself. * In particular, it builds a fresh * vnode for the cloned instance and * does streams-specific cross-linking. */ error = stropen(vpp, flag, sflag); sp = VTOS(*vpp); break; } else error = (*cdevsw[major(dev)].d_open)(dev, flag, &newdev); /* * If this is an indirect device or a forced clone, * we need to do the open again. 
/*ARGSUSED*/
static int
spec_close(vp, flag, count, cred)
	struct vnode *vp;
	int flag;
	int count;
	struct ucred *cred;
{
	register struct snode *sp;
	dev_t dev;

	if (count > 1)
		return (0);

	/*
	 * setjmp in case close is interrupted
	 */
	if (setjmp(&u.u_qsave)) {
		sp = VTOS(vp);	/* recompute - I don't trust setjmp/longjmp */
		sp->s_flag &= ~SCLOSING;
		wakeup((caddr_t)sp);
		return (EINTR);
	}
	sp = VTOS(vp);
	sp->s_count--;			/* one fewer open reference */

	/*
	 * Only call the close routine when the last open
	 * reference through any [s, v]node goes away.
	 */
	if (stillopen(sp->s_dev, vp->v_type))
		return (0);

	dev = sp->s_dev;
	switch (vp->v_type) {

	case VCHR:
		/*
		 * Mark this device as closing, so that opens will wait until
		 * the close finishes. Since the close may block, this
		 * prevents an open from getting in while the close is
		 * blocked, and then getting surprised when the close
		 * finishes and potentially clears out the driver's state.
		 *
		 * XXX - really should be done on all devices, but for now we
		 * only do it on streams (as that's the one case where the
		 * close blocks before the close routine is called, and thus
		 * the one case where the close routine really can't protect
		 * itself).
		 */
		/*
		 * If it's a stream, call stream close routine.
		 */
		if (cdevsw[major(dev)].d_str) {
			sp->s_flag |= SCLOSING;
			strclose(vp, flag);
			sp->s_flag &= ~SCLOSING;
			wakeup((caddr_t)sp);
		} else
			(void) (*cdevsw[major(dev)].d_close)(dev, flag);
		break;

	case VBLK:
		/*
		 * On last close of a block device, we flush back
		 * and invalidate any in core buffers to help make
		 * the spec vnode inactive ASAP if it is not currently
		 * held by someone else for something (e.g., swapping).
		 */
		bflush(sp->s_bdevvp);
		binval(sp->s_bdevvp);
		(void) (*bdevsw[major(dev)].d_close)(dev, flag);
		break;

	case VFIFO:
		printf("spec_close: got a VFIFO???\n");
		break;
	}
	return (0);
}
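/*
 * Note on the SCLOSING protocol above: the sleep() on the snode in the
 * VCHR arm of spec_open() is the other half of this interlock.  While a
 * streams close is blocked with SCLOSING set, an opener found by
 * isclosing() sleeps on the snode address; both the normal path and the
 * interrupted (setjmp) path of spec_close() clear the flag and issue the
 * wakeup() so the opener can retry against the driver's settled state.
 */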
/*
 * read or write a spec vnode
 */
/*ARGSUSED*/
static int
spec_rdwr(vp, uiop, rw, ioflag, cred)
	struct vnode *vp;
	register struct uio *uiop;
	enum uio_rw rw;
	int ioflag;
	struct ucred *cred;
{
	register struct snode *sp;
	register addr_t base;
	register u_int off;
	struct vnode *blkvp;
	dev_t dev;
	register int n, on;
	u_int flags;
	u_int bdevsize;
	int pagecreate;
	int error;
	extern int mem_no;

	sp = VTOS(vp);
	dev = (dev_t)sp->s_dev;
	if (rw != UIO_READ && rw != UIO_WRITE)
		panic("spec_rdwr");
	if (rw == UIO_READ && uiop->uio_resid == 0)
		return (0);
	n = uiop->uio_resid;

	/*
	 * If this I/O will carry us over the 2GB threshold,
	 * switch automatically to block mode if possible.
	 *
	 * XXX We switch if the I/O leaves us exactly at 2GB,
	 * which is arguably wrong, but the old code didn't
	 * allow such I/O's anyway, so there is no compatibility
	 * problem.
	 */
	if (vp->v_type == VCHR &&
	    (uiop->uio_fmode & FSETBLK) == 0 &&
	    mem_no != major(dev) &&
	    vp->v_stream == NULL &&
	    uiop->uio_offset >= 0 &&
	    uiop->uio_offset + n < 0 &&
	    uiop->uio_offset % DEV_BSIZE == 0) {
		uiop->uio_fmode |= FSETBLK;
		uiop->uio_offset = btodb(uiop->uio_offset);
	}
	if (uiop->uio_fmode & FSETBLK) {
		if (n % DEV_BSIZE != 0)
			return (EINVAL);
		n = btodb(n);
	}
	if ((uiop->uio_offset < 0 ||
	    (n != 0 && uiop->uio_offset + n - 1 < 0)) &&
	    !(vp->v_type == VCHR &&
	    (mem_no == major(dev) || vp->v_stream != NULL))) {
		return (EINVAL);
	}

	if (rw == UIO_READ)
		smark(sp, SACC);
	if (vp->v_type == VCHR) {
		if (rw == UIO_READ) {
			if (cdevsw[major(dev)].d_str) {
				int saverr = u.u_error;

				u.u_error = 0;
				strread(vp, uiop);
				error = u.u_error;
				u.u_error = saverr;
			} else
				error = (*cdevsw[major(dev)].d_read)(dev,
				    uiop);
		} else {
			smark(sp, SUPD|SCHG);
			if (cdevsw[major(dev)].d_str) {
				int saverr = u.u_error;

				u.u_error = 0;
				strwrite(vp, uiop);
				error = u.u_error;
				u.u_error = saverr;
			} else
				error = (*cdevsw[major(dev)].d_write)(dev,
				    uiop);
		}
		return (error);
	}

	if (vp->v_type != VBLK)
		return (EOPNOTSUPP);
	if (uiop->uio_resid == 0)
		return (0);
	error = 0;
	blkvp = sp->s_bdevvp;
	bdevsize = sp->s_size;
	do {
		int diff;

		off = uiop->uio_offset & MAXBMASK;
		on = uiop->uio_offset & MAXBOFFSET;
		n = MIN(MAXBSIZE - on, uiop->uio_resid);
		pagecreate = 0;

		diff = bdevsize - uiop->uio_offset;
		if (diff <= 0)
			break;
		if (diff < n)
			n = diff;

		base = segmap_getmap(segkmap, blkvp, off);

		/*
		 * Check to see if we can skip reading in the page
		 * and just allocate the memory. We can do this
		 * if we are going to rewrite the entire mapping
		 * or if we are going to write to end of the device
		 * from the beginning of the mapping.
		 */
		if (rw == UIO_WRITE && (n == MAXBSIZE ||
		    (on == 0 && (off + n) == bdevsize))) {
			SNLOCK(sp);
			segmap_pagecreate(segkmap, base + on, (u_int)n, 0);
			SNUNLOCK(sp);
			pagecreate = 1;
		}
		error = uiomove(base + on, n, rw, uiop);

		if (pagecreate &&
		    uiop->uio_offset < roundup(off + on + n, PAGESIZE)) {
			/*
			 * We created pages w/o initializing them completely,
			 * thus we need to zero the part that wasn't set up.
			 * This can happen if we write to the end of the
			 * device or if we had some sort of error during
			 * the uiomove.
			 */
			int nzero, nmoved;

			nmoved = uiop->uio_offset - (off + on);
			ASSERT(nmoved >= 0 && nmoved <= n);
			nzero = roundup(on + n, PAGESIZE) - nmoved;
			ASSERT(nzero > 0 && on + nmoved + nzero <= MAXBSIZE);
			(void) kzero(base + on + nmoved, (u_int)nzero);
		}

		if (error == 0) {
			flags = 0;
			if (rw == UIO_WRITE) {
				/*
				 * Force write back for synchronous
				 * write cases.
				 */
				if (ioflag & IO_SYNC) {
					flags = SM_WRITE;
				} else if (n + on == MAXBSIZE ||
				    IS_SWAPVP(vp)) {
					/*
					 * Have written a whole block.
					 * Start an asynchronous write and
					 * mark the buffer to indicate that
					 * it won't be needed again soon.
					 * Push swap files here, since it
					 * won't happen anywhere else.
					 */
					flags = SM_WRITE | SM_ASYNC |
					    SM_DONTNEED;
				}
				smark(sp, SUPD|SCHG);
			} else if (rw == UIO_READ) {
				/*
				 * If read a whole block, won't need this
				 * buffer again soon. Don't mark it with
				 * SM_FREE, as that can lead to a deadlock
				 * if the block corresponds to a u-page.
				 * (The keep count never drops to zero, so
				 * waiting for "i/o to complete" never
				 * terminates; this points out a flaw in
				 * our locking strategy.)
				 */
				if (n + on == MAXBSIZE)
					flags = SM_DONTNEED;
			}
			error = segmap_release(segkmap, base, flags);
		} else {
			(void) segmap_release(segkmap, base, 0);
		}
	} while (error == 0 && uiop->uio_resid > 0 && n != 0);

	return (error);
}
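/*
 * A worked example of the chunking arithmetic in the VBLK loop above,
 * assuming the usual 8K MAXBSIZE (the constant is machine-dependent, so
 * the numbers are illustrative only).  For uio_offset = 13312 and
 * uio_resid = 12000 on a sufficiently large device:
 *
 *	pass 1:	off = 13312 & MAXBMASK   = 8192
 *		on  = 13312 & MAXBOFFSET = 5120
 *		n   = MIN(8192 - 5120, 12000) = 3072
 *	pass 2:	off = 16384, on = 0, n = MIN(8192, 8928) = 8192
 *	pass 3:	off = 24576, on = 0, n = 736
 *
 * Each pass maps one MAXBSIZE window of the bdevvp through segkmap and
 * moves at most one window's worth of data; only a write that covers a
 * whole window (or runs to the end of the device) takes the
 * segmap_pagecreate() shortcut.
 */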
/*ARGSUSED*/
static int
spec_ioctl(vp, com, data, flag, cred)
	struct vnode *vp;
	int com;
	caddr_t data;
	int flag;
	struct ucred *cred;
{
	register struct snode *sp;

	sp = VTOS(vp);
	if (vp->v_type != VCHR)
		panic("spec_ioctl");
	if (cdevsw[major(sp->s_dev)].d_str) {
		int saverr = u.u_error;
		int error;

		u.u_error = 0;
		strioctl(vp, com, data, flag);
		error = u.u_error;
		u.u_error = saverr;
		return (error);
	}
	return ((*cdevsw[major(sp->s_dev)].d_ioctl)
	    (sp->s_dev, com, data, flag));
}

/*ARGSUSED*/
static int
spec_select(vp, which, cred)
	struct vnode *vp;
	int which;
	struct ucred *cred;
{
	register struct snode *sp;

	sp = VTOS(vp);
	if (vp->v_type != VCHR)
		panic("spec_select");
	if (cdevsw[major(sp->s_dev)].d_str)
		return (strselect(vp, which));
	else
		return ((*cdevsw[major(sp->s_dev)].d_select)(sp->s_dev,
		    which));
}

static int
spec_inactive(vp, cred)
	struct vnode *vp;
	struct ucred *cred;
{
	struct snode *sp;
	int error;

	sp = VTOS(vp);

	/* XXX before removing the snode reset stream */
	if (vp->v_type == VCHR && vp->v_stream)
		vp->v_stream->sd_vnode = other_specvp(vp);

	/* must sunsave() first to prevent a race when spec_fsync() sleeps */
	sunsave(sp);
	if (sp->s_realvp &&
	    (sp->s_bdevvp == NULL || !IS_SWAPVP(sp->s_bdevvp)))
		(void) spec_fsync(vp, cred);

	if (vp->v_type == VBLK && vp->v_pages != NULL) {
		/*
		 * Device is no longer referenced by anyone.
		 * Destroy all the old pages (which BTW don't
		 * count against the vnode reference count) so
		 * we can, for instance, change floppy disks.
		 */
		error = spec_putpage(sp->s_bdevvp, 0, 0, B_INVAL,
		    (struct ucred *)0);
	} else {
		error = 0;
	}

	/* now free the realvp (no longer done by sunsave()) */
	if (sp->s_realvp) {
		VN_RELE(sp->s_realvp);
		sp->s_realvp = NULL;
		if (sp->s_bdevvp)
			VN_RELE(sp->s_bdevvp);
	}
	kmem_free((caddr_t)sp, sizeof (*sp));
	return (error);
}

static int
spec_getattr(vp, vap, cred)
	struct vnode *vp;
	register struct vattr *vap;
	struct ucred *cred;
{
	int error;
	register struct snode *sp;
	register struct vnode *realvp;

	sp = VTOS(vp);
	if ((realvp = sp->s_realvp) == NULL) {
		/*
		 * No real vnode behind this one.
		 * Set the device size from snode.
		 * Set times to the present.
		 * Set blocksize based on type in the unreal vnode.
		 */
		bzero((caddr_t)vap, sizeof (*vap));
		vap->va_size = sp->s_size;
		vap->va_rdev = sp->s_dev;
		vap->va_type = vp->v_type;
		vap->va_nodeid = ++fake_vno;
	} else {
		extern int dump_no;

		error = VOP_GETATTR(realvp, vap, cred);
		if (error != 0)
			return (error);
		/* if this is the dump file, copy the size, too */
		/* XXX there should be a more general way of doing this */
		if (vp->v_type == VCHR && dump_no == major(sp->s_dev))
			vap->va_size = sp->s_size;
	}
	/* set current times from snode, even if older than vnode */
	vap->va_atime = sp->s_atime;
	vap->va_mtime = sp->s_mtime;
	vap->va_ctime = sp->s_ctime;
	/* set device-dependent blocksizes */
	switch (vap->va_type) {

	case VBLK:
		vap->va_blocksize = MAXBSIZE;	/* was BLKDEV_IOSIZE */
		break;

	case VCHR:
		vap->va_blocksize = MAXBSIZE;
		break;
	}
	return (0);
}
int
spec_setattr(vp, vap, cred)
	struct vnode *vp;
	register struct vattr *vap;
	struct ucred *cred;
{
	register struct snode *sp;
	register struct vnode *realvp;
	int error;
	register int chtime = 0;

	sp = VTOS(vp);
	if ((realvp = sp->s_realvp) == NULL)
		error = 0;		/* no real vnode to update */
	else
		error = VOP_SETATTR(realvp, vap, cred);
	if (error == 0) {
		/* if times were changed, update snode */
		if (vap->va_mtime.tv_sec != -1) {
			/*
			 * If SysV-compatible option to set access and
			 * modified times if root, owner, or write access,
			 * need to read back the new times in order to
			 * keep the snode times in sync. If VOP_GETATTR()
			 * fails, use current client time as an
			 * approximation.
			 *
			 * XXX - va_mtime.tv_usec == -1 flags this.
			 */
			if (vap->va_mtime.tv_usec == -1) {
				struct vattr vtmp;

				if ((realvp == NULL) ||
				    VOP_GETATTR(realvp, &vtmp, cred) != 0) {
					/* if error, simulate server time */
					sp->s_mtime = time;
					sp->s_atime = time;
					sp->s_ctime = time;
				} else {
					sp->s_mtime = vtmp.va_mtime;
					sp->s_atime = vtmp.va_atime;
					sp->s_ctime = vtmp.va_ctime;
				}
				goto no_chtime;
			}
			sp->s_mtime = vap->va_mtime;
			chtime++;
		}
		if (vap->va_atime.tv_sec != -1) {
			sp->s_atime = vap->va_atime;
			chtime++;
		}
		if (chtime)
			sp->s_ctime = time;
	}
no_chtime:
	return (error);
}

int
spec_access(vp, mode, cred)
	struct vnode *vp;
	int mode;
	struct ucred *cred;
{
	register struct vnode *realvp;

	if ((realvp = VTOS(vp)->s_realvp) != NULL)
		return (VOP_ACCESS(realvp, mode, cred));
	else
		return (0);	/* allow all access */
}

int
spec_link(vp, tdvp, tnm, cred)
	struct vnode *vp;
	struct vnode *tdvp;
	char *tnm;
	struct ucred *cred;
{
	register struct vnode *realvp;

	if ((realvp = VTOS(vp)->s_realvp) != NULL)
		return (VOP_LINK(realvp, tdvp, tnm, cred));
	else
		return (ENOENT);  /* can't link to something non-existent */
}

/*
 * In order to sync out the snode times without multi-client problems,
 * make sure the times written out are never earlier than the times
 * already set in the vnode.
 */
int
spec_fsync(vp, cred)
	struct vnode *vp;
	struct ucred *cred;
{
	register int error = 0;
	register struct snode *sp;
	register struct vnode *realvp;
	struct vattr *vap;
	struct vattr *vatmp;
	int err;

	sp = VTOS(vp);
	/*
	 * If times didn't change on a non-block
	 * special file, don't flush anything.
	 */
	if ((sp->s_flag & (SACC|SUPD|SCHG)) == 0 && vp->v_type != VBLK)
		return (0);
	sp->s_flag &= ~(SACC|SUPD|SCHG);
	/*
	 * If the vnode represents a block device and it is a "shadow"
	 * vnode, then flush all pages associated with the "common" vnode.
	 */
	if (vp->v_type == VBLK && sp->s_bdevvp != vp &&
	    sp->s_bdevvp->v_pages != NULL)
		error = spec_putpage(sp->s_bdevvp, 0, 0, 0,
		    (struct ucred *)0);

	/*
	 * If no real vnode to update, don't flush anything
	 */
	if ((realvp = sp->s_realvp) == NULL)
		return (error);

	vatmp = (struct vattr *)new_kmem_alloc(sizeof (*vatmp), KMEM_SLEEP);
	err = VOP_GETATTR(realvp, vatmp, cred);
	if (err == 0) {
		vap = (struct vattr *)new_kmem_alloc(sizeof (*vap),
		    KMEM_SLEEP);
		vattr_null(vap);
		vap->va_atime =
		    timercmp(&vatmp->va_atime, &sp->s_atime, >) ?
		    vatmp->va_atime : sp->s_atime;
		vap->va_mtime =
		    timercmp(&vatmp->va_mtime, &sp->s_mtime, >) ?
		    vatmp->va_mtime : sp->s_mtime;
		VOP_SETATTR(realvp, vap, cred);
		kmem_free((caddr_t)vap, sizeof (*vap));
	}
	kmem_free((caddr_t)vatmp, sizeof (*vatmp));
	(void) VOP_FSYNC(realvp, cred);
	return (error);
}
static int
spec_dump(vp, addr, bn, count)
	struct vnode *vp;
	caddr_t addr;
	int bn;
	int count;
{
	return ((*bdevsw[major(vp->v_rdev)].d_dump)
	    (vp->v_rdev, addr, bn, count));
}

static int
spec_noop()
{
	return (EINVAL);
}

/*
 * Record-locking requests are passed back to the real vnode handler.
 */
int
spec_lockctl(vp, ld, cmd, cred, clid)
	struct vnode *vp;
	struct flock *ld;
	int cmd;
	struct ucred *cred;
	int clid;
{
	register struct vnode *realvp;

	if ((realvp = VTOS(vp)->s_realvp) != NULL)
		return (VOP_LOCKCTL(realvp, ld, cmd, cred, clid));
	else
		return (EINVAL);  /* can't lock this, it doesn't exist */
}

int
spec_fid(vp, fidpp)
	struct vnode *vp;
	struct fid **fidpp;
{
	register struct vnode *realvp;

	if ((realvp = VTOS(vp)->s_realvp) != NULL)
		return (VOP_FID(realvp, fidpp));
	else
		return (EINVAL);	/* you lose */
}

/*
 * klustsize should be a multiple of PAGESIZE and <= MAXPHYS.
 */
#define	KLUSTSIZE	(56 * 1024)

int klustsize = KLUSTSIZE;
int spec_ra = 1;
int spec_lostpage;	/* number of times we lost original page */
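/*
 * Example of the kluster window arithmetic used below (the numbers are
 * illustrative only; nothing is assumed beyond klustsize being a multiple
 * of PAGESIZE).  With klustsize = 56K = 57344 and a fault at off = 131072:
 *
 *	blkoff = (131072 / 57344) * 57344 = 114688
 *	blksz  = 57344 if the device is at least 172032 bytes long,
 *		 otherwise s_size - 114688 (clipped at end of device)
 *
 * and the read-ahead window, taken when spec_ra is set and the fault hit
 * s_nextr, starts at the next klustsize boundary:
 *
 *	off2 = ((131072 / 57344) + 1) * 57344 = 172032
 */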
/*
 * Called from pvn_getpages or spec_getpage to get a particular page.
 * When we are called the snode is already locked.
 */
/*ARGSUSED*/
static int
spec_getapage(vp, off, protp, pl, plsz, seg, addr, rw, cred)
	register struct vnode *vp;
	u_int off, *protp;
	struct page *pl[];
	u_int plsz;
	struct seg *seg;
	addr_t addr;
	enum seg_rw rw;
	struct ucred *cred;
{
	register struct snode *sp;
	struct buf *bp, *bp2;
	struct page *pp, *pp2, **ppp, *pagefound;
	u_int io_off, io_len;
	u_int blksz, blkoff;
	int dora, err;
	u_int xlen;

	sp = VTOS(vp);
reread:
	err = 0;
	bp = NULL;
	bp2 = NULL;

	if (spec_ra && sp->s_nextr == off)
		dora = 1;
	else
		dora = 0;

	/*
	 * We SNLOCK here to try and allow more concurrent access
	 * to the snode. We release the lock as soon as we know
	 * we won't be allocating more pages for the vnode.
	 * NB: It's possible that the snode was already locked by
	 * this process (e.g. we were called through pvn_getpages),
	 * thus we are assuming that SNLOCK is recursive.
	 */
	SNLOCK(sp);
again:
	if ((pagefound = page_find(vp, off)) == NULL) {
		/*
		 * Need to really do disk IO to get the page.
		 */
		blkoff = (off / klustsize) * klustsize;
		if (blkoff + klustsize <= sp->s_size)
			blksz = klustsize;
		else
			blksz = sp->s_size - blkoff;

		pp = pvn_kluster(vp, off, seg, addr, &io_off, &io_len,
		    blkoff, blksz, 0);

		/*
		 * Somebody has entered the page before us, so
		 * just use it.
		 */
		if (pp == NULL)
			goto again;

		if (!dora)
			SNUNLOCK(sp);

		if (pl != NULL) {
			register int sz;

			if (plsz >= io_len) {
				/*
				 * Everything fits, set up to load
				 * up and hold all the pages.
				 */
				pp2 = pp;
				sz = io_len;
			} else {
				/*
				 * Set up to load plsz worth
				 * starting at the needed page.
				 */
				for (pp2 = pp; pp2->p_offset != off;
				    pp2 = pp2->p_next) {
					ASSERT(pp2->p_next->p_offset !=
					    pp->p_offset);
				}
				sz = plsz;
			}
			ppp = pl;
			do {
				PAGE_HOLD(pp2);
				*ppp++ = pp2;
				pp2 = pp2->p_next;
				sz -= PAGESIZE;
			} while (sz > 0);
			*ppp = NULL;		/* terminate list */
		}

		bp = pageio_setup(pp, io_len, vp,
		    pl == NULL ? (B_ASYNC | B_READ) : B_READ);
		bp->b_dev = vp->v_rdev;
		bp->b_blkno = btodb(io_off);

		/*
		 * Zero part of page which we are not
		 * going to be reading from disk now.
		 */
		xlen = io_len & PAGEOFFSET;
		if (xlen != 0)
			pagezero(pp->p_prev, xlen, PAGESIZE - xlen);

		(*bdevsw[major(vp->v_rdev)].d_strategy)(bp);

		sp->s_nextr = io_off + io_len;
		u.u_ru.ru_majflt++;
		if (seg == segkmap)
			u.u_ru.ru_inblock++;	/* count as `read' operation */
		cnt.v_pgin++;
		cnt.v_pgpgin += btopr(io_len);
	} else if (!dora)
		SNUNLOCK(sp);

	if (dora) {
		u_int off2;
		addr_t addr2;

		off2 = ((off / klustsize) + 1) * klustsize;
		addr2 = addr + (off2 - off);

		/*
		 * If addr is now in a different seg or we are past
		 * EOF then don't bother trying with read-ahead.
		 */
		if (addr2 >= seg->s_base + seg->s_size ||
		    off2 >= sp->s_size) {
			pp2 = NULL;
		} else {
			if (off2 + klustsize <= sp->s_size)
				blksz = klustsize;
			else
				blksz = sp->s_size - off2;

			pp2 = pvn_kluster(vp, off2, seg, addr2,
			    &io_off, &io_len, off2, blksz, 1);
		}
		SNUNLOCK(sp);

		if (pp2 != NULL) {
			bp2 = pageio_setup(pp2, io_len, vp,
			    B_READ | B_ASYNC);
			bp2->b_dev = vp->v_rdev;
			bp2->b_blkno = btodb(io_off);

			/*
			 * Zero part of page which we are not
			 * going to be reading from disk now.
			 */
			xlen = io_len & PAGEOFFSET;
			if (xlen != 0)
				pagezero(pp2->p_prev, xlen,
				    PAGESIZE - xlen);

			(*bdevsw[major(vp->v_rdev)].d_strategy)(bp2);

			/*
			 * Should we bill read ahead to extra faults?
			 */
			u.u_ru.ru_majflt++;
			if (seg == segkmap)
				u.u_ru.ru_inblock++;	/* count as `read' */
			cnt.v_pgin++;
			cnt.v_pgpgin += btopr(io_len);
		}
	}

	if (bp != NULL && pl != NULL) {
		err = biowait(bp);
		pageio_done(bp);
	} else if (pagefound != NULL) {
		register int s;

		/*
		 * We need to be careful here because if the page was
		 * previously on the free list, we might have already
		 * lost it at interrupt level.
		 */
		s = splvm();
		if (pagefound->p_vnode == vp &&
		    pagefound->p_offset == off) {
			/*
			 * If the page is still intransit or if
			 * it is on the free list call page_lookup
			 * to try and wait for / reclaim the page.
			 */
			if (pagefound->p_intrans || pagefound->p_free)
				pagefound = page_lookup(vp, off);
		}
		if (pagefound == NULL || pagefound->p_offset != off ||
		    pagefound->p_vnode != vp || pagefound->p_gone) {
			(void) splx(s);
			spec_lostpage++;
			goto reread;
		}
		if (pl != NULL) {
			PAGE_HOLD(pagefound);
			pl[0] = pagefound;
			pl[1] = NULL;
			u.u_ru.ru_minflt++;
			sp->s_nextr = off + PAGESIZE;
		}
		(void) splx(s);
	}

	if (err && pl != NULL) {
		for (ppp = pl; *ppp != NULL; *ppp++ = NULL)
			PAGE_RELE(*ppp);
	}
	return (err);
}
/*
 * Return all the pages from [off..off+len) in block device
 */
static int
spec_getpage(vp, off, len, protp, pl, plsz, seg, addr, rw, cred)
	struct vnode *vp;
	u_int off, len;
	u_int *protp;
	struct page *pl[];
	u_int plsz;
	struct seg *seg;
	addr_t addr;
	enum seg_rw rw;
	struct ucred *cred;
{
	struct snode *sp = VTOS(vp);
	int err;

	if (vp->v_type != VBLK || sp->s_bdevvp != vp)
		panic("spec_getpage");

	if (off + len - PAGEOFFSET > sp->s_size)
		return (EFAULT);	/* beyond EOF */

	if (protp != NULL)
		*protp = PROT_ALL;

	if (len <= PAGESIZE)
		err = spec_getapage(vp, off, protp, pl, plsz,
		    seg, addr, rw, cred);
	else {
		SNLOCK(sp);
		err = pvn_getpages(spec_getapage, vp, off, len, protp,
		    pl, plsz, seg, addr, rw, cred);
		SNUNLOCK(sp);
	}
	return (err);
}

/*
 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
 */
static int
spec_wrtblk(vp, pp, off, len, flags)
	struct vnode *vp;
	struct page *pp;
	u_int off, len;
	int flags;
{
	struct buf *bp;
	int err;

	bp = pageio_setup(pp, len, vp, B_WRITE | flags);
	if (bp == NULL) {
		pvn_fail(pp, B_WRITE | flags);
		return (ENOMEM);
	}
	bp->b_dev = vp->v_rdev;
	bp->b_blkno = btodb(off);
	(*bdevsw[major(vp->v_rdev)].d_strategy)(bp);
	u.u_ru.ru_oublock++;

	/*
	 * If async, assume that pvn_done will
	 * handle the pages when IO is done
	 */
	if ((flags & B_ASYNC) != 0)
		return (0);

	err = biowait(bp);
	pageio_done(bp);
	return (err);
}

/*
 * Flags are composed of {B_ASYNC, B_INVAL, B_DIRTY, B_FREE, B_DONTNEED}
 * If len == 0, do from off to EOF.
 *
 * The normal cases should be len == 0 & off == 0 (entire vp list),
 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
 * (from pageout).
 */
/*ARGSUSED*/
static int
spec_putpage(vp, off, len, flags, cred)
	register struct vnode *vp;
	u_int off;
	u_int len;
	int flags;
	struct ucred *cred;
{
	register struct snode *sp;
	register struct page *pp;
	struct page *dirty, *io_list;
	register u_int io_off, io_len;
	int vpcount;
	int err = 0;

	sp = VTOS(vp);
	if (vp->v_pages == NULL || off >= sp->s_size)
		return (0);

	if (vp->v_type != VBLK || sp->s_bdevvp != vp)
		panic("spec_putpage");

	vpcount = vp->v_count;
	VN_HOLD(vp);

again:
	if (len == 0) {
		/*
		 * We refuse to act on behalf of the pageout daemon to push
		 * out a page to a snode which is currently locked.
		 */
		if ((sp->s_flag & SLOCKED) && u.u_procp == &proc[2]) {
			err = EWOULDBLOCK;	/* XXX */
			goto out;
		}
		/*
		 * Search the entire vp list for pages >= off.
		 * We lock the snode here to prevent us from having
		 * multiple instances of pvn_vplist_dirty working
		 * on the same vnode active at the same time.
		 */
		SNLOCK(sp);
		dirty = pvn_vplist_dirty(vp, off, flags);
		SNUNLOCK(sp);
	} else {
		/*
		 * Do a range from [off...off + len) via page_find.
		 * We set limits so that we kluster to klustsize boundaries.
		 */
		if (off >= sp->s_size) {
			dirty = NULL;
		} else {
			u_int fsize, eoff, offlo, offhi;

			fsize = (sp->s_size + PAGEOFFSET) & PAGEMASK;
			eoff = MIN(off + len, fsize);
			offlo = (off / klustsize) * klustsize;
			offhi = roundup(eoff, klustsize);
			dirty = pvn_range_dirty(vp, off, eoff,
			    offlo, offhi, flags);
		}
	}

	/*
	 * Now pp will have the list of kept dirty pages marked for
	 * write back. It will also handle invalidation and freeing
	 * of pages that are not dirty. All the pages on the list
	 * returned need to still be dealt with here.
	 */

	/*
	 * Destroy read ahead value (since we are really going to write)
	 */
	if (dirty != NULL)
		sp->s_nextr = 0;

	/*
	 * Handle all the dirty pages not yet dealt with.
	 */
	while ((pp = dirty) != NULL) {
		/*
		 * Pull off a contiguous chunk
		 */
		page_sub(&dirty, pp);
		io_list = pp;
		io_off = pp->p_offset;
		io_len = PAGESIZE;
		while (dirty != NULL &&
		    dirty->p_offset == io_off + io_len) {
			pp = dirty;
			page_sub(&dirty, pp);
			page_sortadd(&io_list, pp);
			io_len += PAGESIZE;
			if (io_len >= klustsize - PAGEOFFSET)
				break;
		}
		/*
		 * Check for page length rounding problems
		 */
		if (io_off + io_len > sp->s_size) {
			ASSERT((io_off + io_len) - sp->s_size < PAGESIZE);
			io_len = sp->s_size - io_off;
		}
		err = spec_wrtblk(vp, io_list, io_off, io_len, flags);
		if (err)
			break;
	}

	if (err != 0) {
		if (dirty != NULL)
			pvn_fail(dirty, B_WRITE | flags);
	} else if (off == 0 && (len == 0 || len >= sp->s_size)) {
		/*
		 * If doing "synchronous invalidation", make
		 * sure that all the pages are actually gone.
		 */
		if ((flags & (B_INVAL | B_ASYNC)) == B_INVAL &&
		    (vp->v_pages != NULL))
			goto again;
	}
out:
	/*
	 * Instead of using VN_RELE here we are careful to only call
	 * the inactive routine if the vnode reference count is now zero,
	 * but it wasn't zero coming into putpage. This is to prevent
	 * recursively calling the inactive routine on a vnode that
	 * is already considered in the `inactive' state.
	 * XXX - inactive is a relative term here (sigh).
	 */
	if (--vp->v_count == 0 && vpcount > 0)
		(void) spec_inactive(vp, cred);
	return (err);
}
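/*
 * The chunking loop in spec_putpage() above builds each I/O request from
 * pages that are contiguous in device offset: starting from the first
 * dirty page it keeps appending the next page while dirty->p_offset
 * continues the run, stops once io_len reaches roughly klustsize, and
 * then clips the final chunk so io_off + io_len never extends past s_size
 * before handing the list to spec_wrtblk().  Non-contiguous dirty pages
 * simply start a new chunk on the next trip through the loop.
 */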
/*
 * This routine is called through the cdevsw[] table to handle
 * traditional mmap'able devices that support a d_mmap function.
 */
/*ARGSUSED*/
int
spec_segmap(dev, off, as, addrp, len, prot, maxprot, flags, cred)
	dev_t dev;
	u_int off;
	struct as *as;
	addr_t *addrp;
	u_int len;
	u_int prot, maxprot;
	u_int flags;
	struct ucred *cred;
{
	struct segdev_crargs dev_a;
	int (*mapfunc)();
	register int i;

	if ((mapfunc = cdevsw[major(dev)].d_mmap) == NULL)
		return (ENODEV);

	/*
	 * Character devices that support the d_mmap
	 * interface can only be mmap'ed shared.
	 */
	if ((flags & MAP_TYPE) != MAP_SHARED)
		return (EINVAL);

	/*
	 * Check to ensure that the entire range is
	 * legal and we are not trying to map in
	 * more than the device will let us.
	 */
	for (i = 0; i < len; i += PAGESIZE) {
		if ((*mapfunc)(dev, off + i, maxprot) == -1)
			return (ENXIO);
	}

	if ((flags & MAP_FIXED) == 0) {
		/*
		 * Pick an address w/o worrying about
		 * any vac alignment constraints.
		 */
		map_addr(addrp, len, (off_t)off, 0);
		if (*addrp == NULL)
			return (ENOMEM);
	} else {
		/*
		 * User specified address -
		 * Blow away any previous mappings.
		 */
		(void) as_unmap(as, *addrp, len);
	}

	dev_a.mapfunc = mapfunc;
	dev_a.dev = dev;
	dev_a.offset = off;
	dev_a.prot = prot;
	dev_a.maxprot = maxprot;

	return (as_map(as, *addrp, len, segdev_create, (caddr_t)&dev_a));
}
static int
spec_map(vp, off, as, addrp, len, prot, maxprot, flags, cred)
	struct vnode *vp;
	u_int off;
	struct as *as;
	addr_t *addrp;
	u_int len;
	u_int prot, maxprot;
	u_int flags;
	struct ucred *cred;
{
	if (vp->v_type == VCHR) {
		int (*segmap)();
		dev_t dev = vp->v_rdev;

		/*
		 * Character device, let the device driver
		 * pick the appropriate segment driver.
		 */
		segmap = cdevsw[major(dev)].d_segmap;
		if (segmap == NULL) {
			if (cdevsw[major(dev)].d_mmap == NULL)
				return (ENODEV);
			/*
			 * For cdevsw[] entries that specify a d_mmap
			 * function but don't have a d_segmap function,
			 * we default to spec_segmap for compatibility.
			 */
			segmap = spec_segmap;
		}
		return ((*segmap)(dev, off, as, addrp, len, prot,
		    maxprot, flags, cred));
	} else if (vp->v_type == VBLK) {
		struct segvn_crargs vn_a;

		/*
		 * Block device, use the underlying bdevvp name for pages.
		 */
		if ((int)off < 0 || (int)(off + len) < 0)
			return (EINVAL);

		if ((flags & MAP_FIXED) == 0) {
			map_addr(addrp, len, (off_t)off, 1);
			if (*addrp == NULL)
				return (ENOMEM);
		} else {
			/*
			 * User specified address -
			 * Blow away any previous mappings.
			 */
			(void) as_unmap(as, *addrp, len);
		}

		ASSERT(VTOS(vp)->s_bdevvp != NULL);
		vn_a.vp = VTOS(vp)->s_bdevvp;
		vn_a.offset = off;
		vn_a.type = flags & MAP_TYPE;
		vn_a.prot = prot;
		vn_a.maxprot = maxprot;
		vn_a.cred = cred;
		vn_a.amp = NULL;
		return (as_map(as, *addrp, len, segvn_create,
		    (caddr_t)&vn_a));
	} else {
		return (ENODEV);
	}
}

static int
spec_cmp(vp1, vp2)
	struct vnode *vp1, *vp2;
{
	return (vp1 == vp2);
}

int
spec_realvp(vp, vpp)
	struct vnode *vp;
	struct vnode **vpp;
{
	extern struct vnodeops spec_vnodeops;
	extern struct vnodeops fifo_vnodeops;
	struct vnode *rvp;

	if (vp && (vp->v_op == &spec_vnodeops ||
	    vp->v_op == &fifo_vnodeops)) {
		vp = VTOS(vp)->s_realvp;
	}
	if (vp && VOP_REALVP(vp, &rvp) == 0) {
		vp = rvp;
	}
	*vpp = vp;
	return (0);
}

int
spec_cntl(vp, cmd, idata, odata, iflg, oflg)
	struct vnode *vp;
	int cmd, iflg, oflg;
	caddr_t idata, odata;
{
	struct vnode *realvp;
	int error;

	switch (cmd) {

	/*
	 * ask the dev for this one
	 */
	case _PC_MAX_INPUT:
		if (vp->v_type == VCHR && vp->v_stream) {
			ASSERT(odata && oflg == CNTL_INT32);
			return (VOP_IOCTL(vp, TIOCISIZE, odata, 0, 0));
		} else if ((realvp = other_specvp(vp)) &&
		    realvp->v_type == VCHR && realvp->v_stream) {
			ASSERT(odata && oflg == CNTL_INT32);
			vp->v_stream = realvp->v_stream;
			return (VOP_IOCTL(vp, TIOCISIZE, odata, 0, 0));
		} else {
			/*
			 * This is for posix conformance. Max input will
			 * always be at least 1 char. Used to return EINVAL.
			 */
			*odata = 1;
			return (0);
		}

	/*
	 * ask the supporting fs for everything else
	 */
	default:
		if (error = VOP_REALVP(vp, &realvp))
			return (error);
		return (VOP_CNTL(realvp, cmd, idata, odata, iflg, oflg));
	}
	/*NOTREACHED*/
}