1124 lines
29 KiB
C
1124 lines
29 KiB
C
#ifndef lint
/* SCCS identification string; omitted when building under lint. */
static char sccsid[] = "@(#)md.c 1.1 94/10/31 SMI";
#endif
|
|
|
|
#include "md.h"
|
|
#if NMD > 0
|
|
|
|
/*
|
|
* Md - is the meta-disk driver. It sits below the UFS file system
|
|
* but above the 'real' disk drivers, xy, sd etc.
|
|
*
|
|
* To the UFS software, md looks like a normal driver, since it has
|
|
* the normal kinds of entries in the bdevsw and cdevsw arrays. So
|
|
* UFS accesses md in the usual ways, in particular, the strategy
|
|
* routine, mdstrategy(), gets called by fbiwrite(), ufs_getapage(),
|
|
* and ufs_write().
|
|
*
|
|
* Md maintains an array of minor devices (meta-partitions). Each
|
|
* meta partition stands for a matrix of real partitions, in rows
|
|
* which are not necessarily of equal length. Md maintains a table,
|
|
* with one entry for each meta-partition, which lists the rows and
|
|
* columns of actual partitions, and the job of the strategy routine
|
|
* is to translate from the meta-partition device and block numbers
|
|
* known to UFS into the actual partitions' device and block numbers.
|
|
*
|
|
* See below, in mdstrategy(), mdreal(), and mddone() for details of
|
|
* this translation.
|
|
*/
|
|
|
|
/*
 * Compile-time switches.
 *
 * MD_STATS > 0 compiles in the request trace table and the high-water
 * counters; MD_DEBUG > 0 additionally prints each new high-water mark
 * (the MD_DEBUG printfs sit inside the MD_STATS sections).
 */
#define	MD_DEBUG	1
#define	MD_STATS	0
|
|
|
|
/*
|
|
* Driver for Virtual Disk.
|
|
*/
|
|
#include <sys/param.h>
|
|
#include <sys/systm.h>
|
|
#include <sys/dk.h>
|
|
#include <sys/buf.h>
|
|
#include <sys/conf.h>
|
|
#include <sys/user.h>
|
|
#include <sys/map.h>
|
|
#include <sys/vmmac.h>
|
|
#include <sys/uio.h>
|
|
#include <sys/kernel.h>
|
|
#include <sys/dkbad.h>
|
|
#include <sys/file.h>
|
|
#include <sys/trace.h>
|
|
#include <sun/dkio.h>
|
|
|
|
#include <vm/hat.h>
|
|
#include <vm/seg.h>
|
|
#include <vm/as.h>
|
|
|
|
#include <machine/psl.h>
|
|
#include <machine/mmu.h>
|
|
#include <machine/cpu.h>
|
|
|
|
#include <sundev/mbvar.h>
|
|
#include <sundev/mdreg.h>
|
|
|
|
/*
|
|
* Local variables.
|
|
*/
|
|
struct md_save *md_free_list = (struct md_save *)0;
|
|
struct buf *md_raw_free_list = (struct buf *)0;
|
|
int md_alloc_size = 16;
|
|
int md_in_use = 0;
|
|
int md_raw_in_use = 0;
|
|
caddr_t md_parity_block = (caddr_t)0;
|
|
struct buf *md_parity_buf = (struct buf *)0;
|
|
|
|
#if MD_STATS > 0
/*
 * Circular trace of requests, filled in by mdstrategy(), mdreal() and
 * mddone().  Index 0 of the two-element arrays describes the request as
 * it arrived at mdstrategy(); index 1 describes it after translation
 * (device/block set by mdreal(), second timestamp set by mddone()).
 */
#define	MAX_TRACE	2048
struct {
	dev_t		tr_dev[2];	/* meta device, real device */
	int		tr_flags;	/* b_flags of the incoming buf */
	int		tr_blk[2];	/* meta block, real block */
	struct buf	*tr_bufp[2];	/* original buf, saved duplicate */
	struct md_save	*tr_save;	/* save structure for the request */
	long		tr_secs;	/* time.tv_sec at entry */
	long		tr_usecs[2];	/* time.tv_usec at entry / completion */
} md_trace[MAX_TRACE];
int	md_trace_index = 0;		/* next md_trace slot to fill */

/*
 * High-water marks for the number of raw-fragment bufs and md_save
 * structures that were in use at the same time.
 */
int	max_md_raw_in_use = 0;
int	max_md_in_use = 0;
#endif /* MD_STATS > 0 */
|
|
|
|
int mdprobe(), mdslave(), mdattach(), mdgo(), mddone(), mdpoll();
|
|
|
|
struct mb_driver mdcdriver = {
|
|
mdprobe, mdslave, mdattach, mdgo, mddone, mdpoll,
|
|
sizeof (struct mddevice), "md", 0, "mdc", 0, MDR_BIODMA | MDR_PSEUDO,
|
|
};
|
|
|
|
/*
|
|
* Bufs used by physio().
|
|
*/
|
|
struct mdunit md_units[NMD];
|
|
|
|
/*
|
|
* See the description in mdreg.h for the function of each field of the
|
|
* md_struct structure shown here.
|
|
*/
|
|
struct md_struct md_conf[NMD];
|
|
|
|
|
|
/*
|
|
* Determine existence of a device.
|
|
*
|
|
* Called by:
|
|
*
|
|
*/
|
|
mdprobe(reg, ctlr)
|
|
caddr_t reg;
|
|
int ctlr;
|
|
{
|
|
printf("mdprobe(reg 0x%x, ctlr 0x%x)\n", reg, ctlr);
|
|
|
|
return (sizeof (struct mddevice));
|
|
}
|
|
|
|
/*
|
|
* See if a slave unit exists.
|
|
*
|
|
* Called by:
|
|
*
|
|
*/
|
|
mdslave(md, reg)
|
|
register struct mb_device *md;
|
|
caddr_t reg;
|
|
{
|
|
printf("mdslave(md 0x%x, reg 0x%x)\n", md, reg);
|
|
|
|
return (1);
|
|
}
|
|
|
|
/*
 * Attach entry point (third slot of mdcdriver).
 *
 * Only logs the mb_device pointer it was given; no per-drive setup is
 * done here.
 */
mdattach(md)
	register struct mb_device *md;
{
	printf("mdattach(md 0x%x)\n", md);
}
|
|
|
|
|
|
mdvalid(mdstruct)
|
|
struct md_struct *mdstruct;
|
|
{
|
|
int row;
|
|
struct md_row *mdrow;
|
|
int i;
|
|
dev_t real_dev;
|
|
int size;
|
|
|
|
|
|
/*
|
|
* Make sure that we have a somewhat valid Virtual Disk description
|
|
* record for this minor device of the Virtual Disk.
|
|
*/
|
|
if (mdstruct->md_mirror) {
|
|
printf("mdvalid: md_mirror 0x%x\n", mdstruct->md_mirror);
|
|
}
|
|
for (row = 0; row < mdstruct->md_rows; row++) {
|
|
mdrow = &mdstruct->md_real_row[row];
|
|
if ((mdrow->md_row_disks < 0) ||
|
|
(mdrow->md_row_disks > MAX_MD_DISKS)) {
|
|
printf("mdvalid: illegal entry md_ndisks 0x%x MAX_MD_DISKS 0x%x\n",
|
|
mdrow->md_row_disks, MAX_MD_DISKS);
|
|
return (ENXIO);
|
|
}
|
|
if (mdrow->md_end_block & 0xf) {
|
|
printf("mdvalid: not a complete block 0x%x\n",
|
|
mdrow->md_end_block);
|
|
}
|
|
for (i = 0; i < mdrow->md_row_disks; i++) {
|
|
real_dev = mdrow->md_real_disks[i];
|
|
/*
|
|
* Must check the major device for validity. The
|
|
* underlying block device driver will check the
|
|
* minor device as if it were a regular open.
|
|
*/
|
|
if (major(real_dev) >= nblkdev) {
|
|
printf("mdvalid: bad real disk 0x%x\n",
|
|
real_dev);
|
|
return (ENXIO);
|
|
}
|
|
size = (*bdevsw[major(real_dev)].d_psize)(real_dev);
|
|
if (size < mdrow->md_end_block) {
|
|
printf("mdvalid: dev 0x%x size 0x%x end_block 0x%x\n",
|
|
real_dev, size, mdrow->md_end_block);
|
|
return (ENXIO);
|
|
}
|
|
}
|
|
if ((mdrow->md_width <= 0) ||
|
|
(mdrow->md_width > mdrow->md_row_disks)) {
|
|
printf("mdvalid: illegal md_width 0x%x md_ndisks 0x%x\n",
|
|
mdrow->md_width, mdrow->md_row_disks);
|
|
return (ENXIO);
|
|
}
|
|
}
|
|
/*
|
|
* Make sure that this Unit of the Virtual Disk was really configured.
|
|
* If not, then all we allow is an ioctl.
|
|
*/
|
|
if ((mdstruct->md_ndisks <= 0) || (mdstruct->md_rows <= 0)) {
|
|
printf("mdvalid: no entry: may only open for ioctl\n");
|
|
mdstruct->md_status |= MD_IOCOPS;
|
|
return (ENXIO);
|
|
}
|
|
mdstruct->md_status &= ~MD_IOCOPS;
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* This routine opens one minor device of the Virtual Disk System. The
|
|
* particular minor device of the Virtual Disk which is being opened is
|
|
* described by one entry in the md_struct structure.
|
|
*
|
|
* Called by:
|
|
*
|
|
*/
|
|
mdopen(dev, flag)
|
|
dev_t dev;
|
|
int flag;
|
|
{
|
|
struct md_struct *mdstruct;
|
|
struct md_row *mdrow;
|
|
dev_t real_dev;
|
|
int row;
|
|
int i;
|
|
int err;
|
|
|
|
trace2(TR_MD_OPEN, dev, flag);
|
|
printf("mdopen(dev 0x%x, flag 0x%x)\n", dev, flag);
|
|
|
|
/*
|
|
* Make sure that the minor device is a valid part of the Virtual
|
|
* Disk subsystem.
|
|
*/
|
|
if (MD_MINOR(dev) >= NMD) {
|
|
printf("mdopen: dev 0x%x bad minor device 0x%x\n",
|
|
dev, MD_MINOR(dev));
|
|
return (ENXIO);
|
|
}
|
|
if (md_units[MD_MINOR(dev)].un_rtab == (struct buf *)0) {
|
|
md_units[MD_MINOR(dev)].un_rtab =
|
|
(struct buf *)new_kmem_alloc(
|
|
sizeof (struct buf), KMEM_SLEEP);
|
|
bzero(md_units[MD_MINOR(dev)].un_rtab, sizeof (struct buf));
|
|
}
|
|
mdstruct = &md_conf[MD_MINOR(dev)];
|
|
if ((err = mdvalid(mdstruct)) != 0) {
|
|
printf("mdopen: dev 0x%x bad\n", dev);
|
|
}
|
|
/*
|
|
* This device is only to be opened for ioctls.
|
|
*/
|
|
if (mdstruct->md_status & MD_IOCOPS) return (0);
|
|
/*
|
|
* This device has not been properly set up.
|
|
*/
|
|
if ((mdstruct->md_status & MD_SETUP) == 0) {
|
|
printf("mdopen: bad open\n");
|
|
return (ENXIO);
|
|
}
|
|
if (mdstruct->md_parity_interval != 0)
|
|
printf("mdopen: md_parity_interval %d\n",
|
|
mdstruct->md_parity_interval);
|
|
/*
|
|
* Make sure that each real disk listed in the md_struct entry for
|
|
* this unit is plausible, and also make sure that we can open each
|
|
* underlying real disk
|
|
*
|
|
*/
|
|
for (row = 0; row < mdstruct->md_rows; row++) {
|
|
mdrow = &mdstruct->md_real_row[row];
|
|
for (i = 0; i < mdrow->md_row_disks; i++) {
|
|
real_dev = mdrow->md_real_disks[i];
|
|
/*
|
|
* Must check the major device for validity. The
|
|
* underlying block device driver will check the
|
|
* minor device as if it were a regular open.
|
|
*/
|
|
if (major(real_dev) >= nblkdev) {
|
|
printf("mdopen: row minor 0x%x bad real disk 0x%x\n",
|
|
i, real_dev);
|
|
return (ENXIO);
|
|
}
|
|
err = (*bdevsw[major(real_dev)].d_open)(real_dev, flag);
|
|
if (err) {
|
|
printf("\tmopen: row minor 0x%x real_dev 0x%x bad open 0x%x\n",
|
|
i, real_dev, err);
|
|
return (err);
|
|
}
|
|
printf("mdopen: opened 0x%x\n", real_dev);
|
|
}
|
|
}
|
|
return (err);
|
|
}
|
|
|
|
/*
|
|
* This routine returns the size of a logical partition. It is called
|
|
* from the device switch at normal priority.
|
|
*
|
|
* Called by:
|
|
*
|
|
*/
|
|
int
|
|
mdsize(dev)
|
|
dev_t dev;
|
|
{
|
|
struct md_struct *mdstruct;
|
|
struct md_row *mdrow;
|
|
int i;
|
|
int row;
|
|
int total_size = 0;
|
|
int size = 0;
|
|
dev_t real_dev;
|
|
|
|
printf("mdsize(dev 0x%x)\n", dev);
|
|
|
|
|
|
/*
|
|
* Make sure that the minor device is a valid part of the Virtual
|
|
* Disk subsystem.
|
|
*/
|
|
if (MD_MINOR(dev) >= NMD) {
|
|
printf("mdsize: dev 0x%x bad minor device 0x%x\n",
|
|
dev, MD_MINOR(dev));
|
|
return (ENXIO);
|
|
}
|
|
mdstruct = &md_conf[MD_MINOR(dev)];
|
|
|
|
for (row = 0; row < mdstruct->md_rows; row++) {
|
|
mdrow = &mdstruct->md_real_row[row];
|
|
for (i = 0; i < mdrow->md_row_disks; i++) {
|
|
real_dev = mdrow->md_real_disks[i];
|
|
/*
|
|
* Must check the major device for validity. The
|
|
* underlying block device driver will check the
|
|
* minor device as if it were a regular open.
|
|
*/
|
|
if (major(real_dev) >= nblkdev) {
|
|
printf("mdsize: dev 0x%x bad real disk 0x%x\n",
|
|
dev, real_dev);
|
|
return (ENXIO);
|
|
}
|
|
printf("mdsize: dev 0x%x real disk 0x%x",
|
|
dev, real_dev);
|
|
size = (*bdevsw[major(real_dev)].d_psize)(real_dev);
|
|
printf(" size 0x%x\n", size);
|
|
total_size += size;
|
|
}
|
|
}
|
|
trace2(TR_MD_SIZE, dev, total_size);
|
|
printf("mdsize: total_size 0x%x\n", total_size);
|
|
return (total_size);
|
|
}
|
|
|
|
/*
|
|
* Create and return the address of a buffer structure which we
|
|
* can use to hold a fragment of a longer raw read/write/
|
|
*/
|
|
struct buf *
|
|
md_raw_get()
|
|
{
|
|
struct buf *mdr;
|
|
|
|
/*
|
|
* This will panic higher up if I can't get one, so don't
|
|
* test for NULL returns here.
|
|
*/
|
|
mdr = (struct buf *)new_kmem_fast_alloc((caddr_t *)&md_raw_free_list,
|
|
sizeof (*mdr), md_alloc_size, KMEM_SLEEP);
|
|
bzero((caddr_t)mdr, sizeof (*mdr));
|
|
md_raw_in_use++;
|
|
#if MD_STATS > 0
|
|
if (md_raw_in_use > max_md_raw_in_use) {
|
|
max_md_raw_in_use = md_raw_in_use;
|
|
#if MD_DEBUG > 0
|
|
printf("md_raw_get: max_md_raw_in_use 0x%x\n",
|
|
max_md_raw_in_use);
|
|
#endif
|
|
}
|
|
#endif
|
|
return (mdr);
|
|
}
|
|
|
|
/*
|
|
* Reclaim a buffer structure.
|
|
*/
|
|
md_raw_free(mdr)
|
|
struct buf *mdr;
|
|
{
|
|
kmem_fast_free((caddr_t *)&md_raw_free_list, (caddr_t)mdr);
|
|
md_raw_in_use--;
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* Create and return the address of a save structure in which we
|
|
* save the details of the current transfer.
|
|
*/
|
|
struct md_save *
|
|
mdget()
|
|
{
|
|
struct md_save *mds;
|
|
|
|
/*
|
|
* This will panic higher up if I can't get one, so don't
|
|
* test for NULL returns here.
|
|
*/
|
|
mds = (struct md_save *)new_kmem_fast_zalloc((caddr_t *)&md_free_list,
|
|
sizeof (*mds), md_alloc_size, KMEM_SLEEP);
|
|
md_in_use++;
|
|
#if MD_STATS > 0
|
|
if (md_in_use > max_md_in_use) {
|
|
max_md_in_use = md_in_use;
|
|
#if MD_DEBUG > 0
|
|
printf("md_get: max_md_in_use 0x%x\n",
|
|
max_md_in_use);
|
|
#endif
|
|
}
|
|
#endif
|
|
return (mds);
|
|
}
|
|
|
|
mdfree(mds)
|
|
struct md_save *mds;
|
|
{
|
|
kmem_fast_free((caddr_t *)&md_free_list, (caddr_t)mds);
|
|
md_in_use--;
|
|
return;
|
|
}
|
|
|
|
mdsavedump(mds)
|
|
struct md_save *mds;
|
|
{
|
|
printf(" mdsavedump: mds 0x%x md_bp 0x%x md_frags 0x%x md_frag_chars 0x%x\n",
|
|
mds,
|
|
mds->md_bp,
|
|
mds->md_frags,
|
|
mds->md_frag_chars);
|
|
mddumpbuf("mdsavedump", &(mds->md_buf));
|
|
|
|
}
|
|
|
|
mddumpbuf(str, bp)
|
|
char *str;
|
|
struct buf *bp;
|
|
{
|
|
printf(" mddumpbuf(%s): bp 0x%x av_back 0x%x b_flags 0x%x b_bcount 0x%x b_error 0x%x b_dev 0x%x b_blkno 0x%x b_iodone 0x%x b_addr 0x%x\n",
|
|
str,
|
|
bp,
|
|
bp->av_back,
|
|
bp->b_flags,
|
|
bp->b_bcount,
|
|
bp->b_error,
|
|
bp->b_dev,
|
|
bp->b_blkno,
|
|
bp->b_iodone,
|
|
bp->b_un.b_addr);
|
|
}
|
|
|
|
/*
|
|
* This routine is the high level interface to the Virtual Disk. It
|
|
* performs reads and writes on the disk using the buf as the method
|
|
* of communication. It is called from the device switch for block
|
|
* operations and via physio() for raw operations. It is called at
|
|
* normal priority.
|
|
*
|
|
* We save the original buffer header so that it is still in the
|
|
* buffer cache in case the kernel comes looking for it and create
|
|
* a new buffer header in *mdsave which is a duplicate of the old.
|
|
*
|
|
* We also save a pointer back to the original buffer header so that
|
|
* on completion we can go and make it look as though it went through
|
|
* the driver before sending it to iodone().
|
|
*
|
|
* Called by:
|
|
* ufs_getapage();
|
|
* ufs_writelbn()
|
|
*/
|
|
mdstrategy(bp)
|
|
register struct buf *bp;
|
|
{
|
|
struct md_struct *mdstruct;
|
|
struct md_save *mdsave;
|
|
int s;
|
|
daddr_t fragment;
|
|
long total_count, this_count;
|
|
daddr_t this_blkno;
|
|
caddr_t this_address;
|
|
struct buf *mdraw;
|
|
dev_t dev,
|
|
mirror_dev = 0;
|
|
int primary = 1;
|
|
|
|
|
|
dev = bp->b_dev;
|
|
/*
|
|
* Make sure that the minor device is a valid part of the Virtual
|
|
* Disk subsystem.
|
|
*/
|
|
if (MD_MINOR(dev) >= NMD) {
|
|
printf("mdstrategy: dev 0x%x bad minor device 0x%x\n",
|
|
dev, MD_MINOR(dev));
|
|
bp->b_flags |= B_ERROR;
|
|
iodone(bp);
|
|
return;
|
|
}
|
|
mdstruct = &md_conf[MD_MINOR(dev)];
|
|
if ((mdstruct->md_status & MD_SETUP) == 0) {
|
|
printf("mdstrategy: dev 0x%x not set up\n", dev);
|
|
bp->b_flags |= B_ERROR;
|
|
iodone(bp);
|
|
return;
|
|
}
|
|
while (mdstruct != (struct md_struct *)0) {
|
|
s = spl6();
|
|
/*
|
|
* Save essential information.
|
|
*/
|
|
MD_SAVE(mdsave, bp);
|
|
if (primary) {
|
|
bp->av_back = (struct buf *)mdsave;
|
|
mdsave->md_bp = bp;
|
|
} else {
|
|
mdsave->md_bp = (struct buf *)0;
|
|
mdsave->md_buf.b_dev = mirror_dev;
|
|
}
|
|
(void) splx(s);
|
|
|
|
#if MD_STATS > 0
|
|
mdsave->md_trace_ind = md_trace_index;
|
|
md_trace[md_trace_index].tr_flags = bp->b_flags;
|
|
md_trace[md_trace_index].tr_save = mdsave;
|
|
md_trace[md_trace_index].tr_bufp[0] = bp;
|
|
md_trace[md_trace_index].tr_bufp[1] = &(mdsave->md_buf);
|
|
md_trace[md_trace_index].tr_dev[0] = bp->b_dev;
|
|
md_trace[md_trace_index].tr_blk[0] = bp->b_blkno;
|
|
md_trace[md_trace_index].tr_secs = time.tv_sec;
|
|
md_trace[md_trace_index].tr_usecs[0] = time.tv_usec;
|
|
md_trace_index =
|
|
(md_trace_index == (MAX_TRACE-1)) ? 0 : md_trace_index + 1;
|
|
#endif MD_STATS
|
|
|
|
/*
|
|
* Make sure that completion comes back to us, so we can clean up.
|
|
*/
|
|
mdsave->md_buf.b_iodone = mddone;
|
|
mdsave->md_buf.b_flags |= B_CALL;
|
|
|
|
/*
|
|
* Remember where it's saved.
|
|
*/
|
|
mdsave->md_buf.av_back = (struct buf *)mdsave;
|
|
|
|
/*
|
|
* Handle very long read/write operations correctly. The catch
|
|
* is that we can be called from physio() which hands us a
|
|
* buffer asking for a read or write which is up to 63kb in
|
|
* length. It's OK to send this to the driver in one piece
|
|
* on a single unstriped partition, but we have to break it up
|
|
* into file system block-sized reads and writes since we are
|
|
* striping. the read/write might look like this:
|
|
*
|
|
* _________________________________________________
|
|
* : | | | | | | | :
|
|
* : |<--------------------------------------->| :
|
|
* :___|___|_______|_______|_______|_______|_____|_:
|
|
* n 8k 8k 8k 8k m
|
|
*
|
|
* The method is to do one read/write of (n) bytes, several of
|
|
* 8k, and then one of (m) bytes. The essential is not to have
|
|
* a single read/write which spans file system blocks, so all
|
|
* we have to do is to break up the original request into single
|
|
* buffer reads/writes, and pass them down to mdreal() which
|
|
* will figure out to which real partition and block number to
|
|
* distribute them. Unfortunately, this is a little slower
|
|
* than passing a single huge read/write down.
|
|
*/
|
|
fragment = bp->b_blkno & 0xf;
|
|
if ((bp->b_bcount > ((0x10 - fragment) * DEV_BSIZE))) {
|
|
for (total_count = bp->b_bcount,
|
|
this_count = (0x10 - fragment) * DEV_BSIZE,
|
|
this_blkno = bp->b_blkno,
|
|
this_address = bp->b_un.b_addr;
|
|
total_count > 0; ) {
|
|
s = spl6();
|
|
/*
|
|
* Send down a buffer for the current fragment
|
|
* of the raw read/write.
|
|
*/
|
|
mdraw = md_raw_get();
|
|
(void) splx(s);
|
|
*mdraw = *bp;
|
|
mdraw->b_flags = bp->b_flags | B_RAW_FRAG | B_CALL;
|
|
mdraw->av_back = (struct buf *)mdsave;
|
|
mdraw->b_bcount = this_count;
|
|
if (primary)
|
|
mdraw->b_dev = bp->b_dev;
|
|
else
|
|
mdraw->b_dev = mirror_dev;
|
|
mdraw->b_un.b_addr = this_address;
|
|
mdraw->b_blkno = this_blkno;
|
|
mdraw->b_iodone = mddone;
|
|
mdraw->b_pages = (struct page *)0;
|
|
mdraw->b_chain = (struct buf *)0;
|
|
mdsave->md_frags++;
|
|
mdreal (mdraw);
|
|
/*
|
|
* Update the parameters for the next fragment.
|
|
*/
|
|
total_count = total_count - this_count;
|
|
this_address = this_address + this_count;
|
|
this_blkno = this_blkno + (this_count / DEV_BSIZE);
|
|
this_count = min(total_count, 0x10 * DEV_BSIZE);
|
|
}
|
|
goto mdstrategyout;
|
|
}
|
|
/*
|
|
* Call the real driver.
|
|
*/
|
|
mdreal (&(mdsave->md_buf));
|
|
mdstrategyout:
|
|
if ((mdstruct->md_mirror) && ((bp->b_flags & B_READ) == 0)) {
|
|
mirror_dev = mdstruct->md_mirror;
|
|
mdstruct = &md_conf[MD_MINOR(mirror_dev)];
|
|
}
|
|
|
|
else
|
|
mdstruct = (struct md_struct *)0;
|
|
primary = 0;
|
|
}
|
|
return;
|
|
}
|
|
|
|
mdread_parity(dev, sector)
|
|
dev_t dev;
|
|
daddr_t sector;
|
|
{
|
|
printf("mdread_parity(dev 0x%x sector 0x%x)\n",
|
|
dev, sector);
|
|
|
|
if (md_parity_block == (caddr_t)0) {
|
|
md_parity_block = new_kmem_alloc(0x10 * DEV_BSIZE, KMEM_SLEEP);
|
|
printf("mdread_parity: md_parity_block 0x%x\n", md_parity_block);
|
|
}
|
|
|
|
if (md_parity_buf == (struct buf *)0) {
|
|
md_parity_buf = (struct buf *)new_kmem_alloc(
|
|
sizeof (struct buf), KMEM_SLEEP);
|
|
printf("mdread_parity: md_parity_buf 0x%x\n", md_parity_buf);
|
|
md_parity_buf->b_forw = (struct buf *)0;
|
|
md_parity_buf->b_back = (struct buf *)0;
|
|
md_parity_buf->av_forw = (struct buf *)0;
|
|
md_parity_buf->av_back = (struct buf *)0;
|
|
md_parity_buf->b_chain = (struct buf *)0;
|
|
md_parity_buf->b_bcount = 0x10 * DEV_BSIZE;
|
|
md_parity_buf->b_bufsize = 0x10 * DEV_BSIZE;
|
|
md_parity_buf->b_error = 0;
|
|
md_parity_buf->b_iodone = (int)0;
|
|
md_parity_buf->b_vp = (struct vnode *)0;
|
|
md_parity_buf->b_pages = (struct page *)0;
|
|
md_parity_buf->b_chain = (struct buf *)0;
|
|
md_parity_buf->b_mbinfo = 0;
|
|
}
|
|
md_parity_buf->b_dev = dev;
|
|
md_parity_buf->b_blkno = sector;
|
|
md_parity_buf->b_un.b_addr = md_parity_block;
|
|
md_parity_buf->b_resid = 0;
|
|
(*bdevsw[major(md_parity_buf->b_dev)].d_strategy)(md_parity_buf);
|
|
}
|
|
|
|
/*
|
|
* Accept the duplicate buffer header which initially contains the
|
|
* same information (dev and block number) passed down from the UFS
|
|
* layer.
|
|
*
|
|
* Go to the md_conf[] table to find out the correct real partition
|
|
* dev and block number for this buffer. Calculate which real disk
|
|
* to give the request to, and invoke its strategy routine.
|
|
*/
|
|
mdreal(bp)
|
|
register struct buf *bp;
|
|
{
|
|
struct md_struct *mdstruct;
|
|
struct md_row *mdrow;
|
|
dev_t real_dev,
|
|
parity_device;
|
|
dev_t dev;
|
|
int minor_device;
|
|
daddr_t sector,
|
|
real_sector,
|
|
fragment,
|
|
min_block,
|
|
blk_in_row;
|
|
int row_index;
|
|
|
|
trace4(TR_MD_REAL, bp, bp->b_blkno, bp->b_dev, bp->b_flags);
|
|
|
|
dev = bp->b_dev;
|
|
/*
|
|
* Make sure that the minor device is a valid part of the Virtual
|
|
* Disk subsystem.
|
|
*/
|
|
if (MD_MINOR(dev) >= NMD) {
|
|
printf("mdreal: dev 0x%x bad minor device 0x%x\n",
|
|
dev, MD_MINOR(dev));
|
|
printf("mdreal: bp 0x%x b_flags 0x%x av_back 0x%x\n",
|
|
bp, bp->b_flags, bp-> av_back);
|
|
panic ("mdreal: bad real device");
|
|
}
|
|
mdstruct = &md_conf[MD_MINOR(dev)];
|
|
|
|
/*
|
|
* Fake calculation to check what is below.
|
|
*/
|
|
mdrow = &mdstruct->md_real_row[0];
|
|
|
|
/*
|
|
* Do a real calculation to derive the minor device of the
|
|
* Virtual Disk, which in turn will let us derive the
|
|
* device/minor of the underlying real device.
|
|
*/
|
|
if (mdstruct->md_parity_interval == 0) {
|
|
for (row_index = 0; row_index < mdstruct->md_rows; row_index++) {
|
|
mdrow = &mdstruct->md_real_row[row_index];
|
|
if (bp->b_blkno < mdrow->md_cum_blocks) break;
|
|
}
|
|
min_block = mdrow->md_cum_blocks - mdrow->md_blocks;
|
|
if ((bp->b_blkno < min_block) || (bp->b_blkno > mdrow->md_cum_blocks))
|
|
printf("mdreal: block 0x%x max 0x%x min 0x%x\n",
|
|
bp->b_blkno,
|
|
mdrow->md_cum_blocks,
|
|
min_block);
|
|
blk_in_row = bp->b_blkno - min_block;
|
|
fragment = blk_in_row & 0xf;
|
|
sector = blk_in_row >> 4;
|
|
if (bp->b_bcount > ((0x10 - fragment) * DEV_BSIZE)) {
|
|
mddumpbuf("mdreal(fragment)", bp);
|
|
printf("mdreal: should panic count 0x%x fragment 0x%x",
|
|
bp->b_bcount, fragment);
|
|
}
|
|
minor_device = sector % mdrow->md_width;
|
|
real_sector = sector / mdrow->md_width;
|
|
bp->b_blkno = (real_sector << 4) + fragment;
|
|
real_dev = mdrow->md_real_disks[minor_device];
|
|
bp->b_dev = real_dev;
|
|
} else {
|
|
for (row_index = 0; row_index < mdstruct->md_rows; row_index++) {
|
|
mdrow = &mdstruct->md_real_row[row_index];
|
|
if (bp->b_blkno < mdrow->md_cum_data_blocks) break;
|
|
}
|
|
min_block = mdrow->md_cum_data_blocks - mdrow->md_data_blocks;
|
|
if ((bp->b_blkno < min_block) || (bp->b_blkno > mdrow->md_cum_data_blocks))
|
|
printf("mdreal: block 0x%x max 0x%x min 0x%x\n",
|
|
bp->b_blkno,
|
|
mdrow->md_cum_data_blocks,
|
|
min_block);
|
|
blk_in_row = bp->b_blkno - min_block;
|
|
fragment = blk_in_row & 0xf;
|
|
sector = blk_in_row >> 4;
|
|
if (bp->b_bcount > ((0x10 - fragment) * DEV_BSIZE)) {
|
|
mddumpbuf("mdreal(fragment)", bp);
|
|
printf("mdreal: should panic count 0x%x fragment 0x%x",
|
|
bp->b_bcount, fragment);
|
|
}
|
|
minor_device = sector % (mdrow->md_width - 1);
|
|
real_sector = sector / (mdrow->md_width - 1);
|
|
parity_device = real_sector % mdrow->md_width;
|
|
mdread_parity (mdrow->md_real_disks[parity_device], real_sector << 4);
|
|
if (minor_device >= parity_device) {
|
|
minor_device = minor_device + 1;
|
|
}
|
|
bp->b_blkno = (real_sector << 4) + fragment;
|
|
real_dev = mdrow->md_real_disks[minor_device];
|
|
bp->b_dev = real_dev;
|
|
}
|
|
|
|
trace4(TR_MD_REAL, bp, bp->b_blkno, bp->b_dev, bp->b_flags);
|
|
|
|
#if MD_STATS > 0
|
|
md_trace[bp->av_back->md_trace_ind].tr_dev[1] = bp->b_dev;
|
|
md_trace[bp->av_back->md_trace_ind].tr_blk[1] = bp->b_blkno;
|
|
#endif MD_STATS
|
|
(*bdevsw[major(bp->b_dev)].d_strategy)(bp);
|
|
}
|
|
|
|
/*
|
|
* This routine performs raw read operations. It is called from the
|
|
* device switch at normal priority. It uses a per-unit buffer for
|
|
* the operation.
|
|
*
|
|
* The main catch is that the *uio struct which is passed to us may
|
|
* specify a read which spans two buffers, which would be contiguous
|
|
* on a single partition, but not on a striped partition.
|
|
*
|
|
* Called by:
|
|
*
|
|
*/
|
|
mdread(dev, uio)
|
|
dev_t dev;
|
|
struct uio *uio;
|
|
{
|
|
int unit;
|
|
struct mdunit *un;
|
|
struct buf *bp;
|
|
int length;
|
|
int error;
|
|
|
|
trace2(TR_MDREAD, dev, uio);
|
|
|
|
length = uio->uio_iov->iov_len;
|
|
if ((unit = MD_MINOR(dev)) >= NMD) {
|
|
printf("mdread: bad unit 0x%x\n", unit);
|
|
return (ENXIO);
|
|
}
|
|
un = &md_units[unit];
|
|
bp = un->un_rtab;
|
|
error = physio(mdstrategy, bp, dev, B_READ, minphys, uio);
|
|
if (error) {
|
|
printf("mdread: error 0x%x\n", error);
|
|
}
|
|
if (uio->uio_resid != 0) {
|
|
printf("mdread: uio_resid 0x%x length 0x%x b_bcount 0x%x b_resid 0x%x\n",
|
|
uio->uio_resid, length, bp->b_bcount, bp->b_resid);
|
|
}
|
|
return (error);
|
|
}
|
|
|
|
/*
|
|
* This routine performs raw write operations. It is called from the
|
|
* device switch at normal priority. It uses a per-unit buffer for
|
|
* the operation.
|
|
*
|
|
* The main catch is that the *uio struct which is passed to us may
|
|
* specify a write which spans two buffers, which would be contiguous
|
|
* on a single partition, but not on a striped partition.
|
|
* Called by:
|
|
*
|
|
*/
|
|
mdwrite(dev, uio)
|
|
dev_t dev;
|
|
struct uio *uio;
|
|
{
|
|
int unit;
|
|
struct mdunit *un;
|
|
struct buf *bp;
|
|
int length;
|
|
int error;
|
|
|
|
trace2(TR_MDWRITE, dev, uio);
|
|
|
|
length = uio->uio_iov->iov_len;
|
|
if ((unit = MD_MINOR(dev)) >= NMD) {
|
|
printf("mdwrite: bad unit 0x%x\n", unit);
|
|
return (ENXIO);
|
|
}
|
|
un = &md_units[unit];
|
|
bp = un->un_rtab;
|
|
error = physio(mdstrategy, bp, dev, B_WRITE, minphys, uio);
|
|
if (error) {
|
|
printf("mdwrite: error 0x%x\n", error);
|
|
}
|
|
if (uio->uio_resid != 0) {
|
|
printf("mdwrite: uio_resid 0x%x length 0x%x b_bcount 0x%x b_resid 0x%x\n",
|
|
uio->uio_resid, length, bp->b_bcount, bp->b_resid);
|
|
}
|
|
return (error);
|
|
}
|
|
|
|
/*
|
|
* This routine finishes a buf-oriented operation. It is called from
|
|
* the underlying driver level. Buffer are sent here by biodone().
|
|
*
|
|
* We use the backwards pointer in the buffer header to find the md_save
|
|
* structure. If this is a single buffer derived from a single buffer,
|
|
* we make the original buffer look as though is has been completed,
|
|
* and pass it back to biodone().
|
|
*
|
|
* If the original buffer was a long read/write and had to be split
|
|
* into several buffers, then we count them off as they arrive, and
|
|
* when all are in, we proceed as above.
|
|
*
|
|
* Called by:
|
|
* biodone()
|
|
*
|
|
*/
|
|
mddone(bp)
|
|
struct buf *bp;
|
|
{
|
|
struct md_save *mdsave;
|
|
int s;
|
|
|
|
/*
|
|
* We should never see the B_DONE flag, since biodone ought
|
|
* to call us without setting it.
|
|
*/
|
|
if (bp->b_flags & B_DONE) {
|
|
panic ("mddone: B_DONE\n");
|
|
}
|
|
/*
|
|
* First, locate the MD save structure that holds the information
|
|
* for this buf.
|
|
*/
|
|
mdsave = (struct md_save *)bp->av_back;
|
|
if (&(mdsave->md_buf) != bp) {
|
|
/*
|
|
* Such a buffer can be completely illegal, or it can be one
|
|
* of several fragments of a long raw read/write.
|
|
*/
|
|
if (bp->b_flags & B_RAW_FRAG) {
|
|
/*
|
|
* If this buffer is one fragment of a long raw read/write
|
|
* then get rid of it, and if it is the final fragment
|
|
* signal completion.
|
|
*/
|
|
if (mdsave->md_frags <= 0) {
|
|
printf("mddone: md_frags 0\n");
|
|
panic ("mddone: md_frags");
|
|
}
|
|
mdsave->md_frags = mdsave->md_frags - 1;
|
|
mdsave->md_frag_chars = mdsave->md_frag_chars + bp->b_bcount;
|
|
if ((mdsave->md_frag_chars < 0) ||
|
|
(mdsave->md_frag_chars > mdsave->md_buf.b_bcount)) {
|
|
printf("mddone: md_frag_chars 0x%x b_bcount 0x%x\n",
|
|
mdsave->md_frag_chars, mdsave->md_buf.b_bcount);
|
|
panic ("mddone: md_frag_chars");
|
|
}
|
|
s = spl6();
|
|
md_raw_free (bp);
|
|
if (mdsave->md_frags == 0) {
|
|
if (mdsave->md_frag_chars != mdsave->md_buf.b_bcount) {
|
|
printf("mddone: md_frag_chars 0x%x b_bcount 0x%x\n",
|
|
mdsave->md_frag_chars,
|
|
mdsave->md_buf.b_bcount);
|
|
panic ("mddone: md_frag_chars");
|
|
}
|
|
if (mdsave->md_bp != (struct buf *)0) {
|
|
bp = mdsave->md_bp;
|
|
bp->b_resid =
|
|
bp->b_bcount - mdsave->md_frag_chars;
|
|
bp->b_flags &= ~B_CALL;
|
|
trace4(TR_MD_DONE, bp, bp->b_blkno,
|
|
bp->b_dev, bp->b_flags);
|
|
iodone(bp);
|
|
}
|
|
|
|
mdfree (mdsave);
|
|
(void) splx(s);
|
|
return;
|
|
}
|
|
(void) splx(s);
|
|
return;
|
|
} else {
|
|
mddumpbuf("mddone(2)", bp);
|
|
mdsavedump(mdsave);
|
|
panic ("mddone: bad bp");
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
* Now, restore the saved information, so that to the rest of the
|
|
* kernel, it looks as thought nothing has changed.
|
|
*/
|
|
#if MD_STATS > 0
|
|
md_trace[mdsave->md_trace_ind].tr_usecs[1] = time.tv_usec;
|
|
#endif MD_STATS
|
|
s = spl6();
|
|
if (mdsave->md_bp != (struct buf *)0) {
|
|
bp = mdsave->md_bp;
|
|
MD_RESTORE(bp, mdsave);
|
|
/*
|
|
* Make sure biodone() does not try to call back twice.
|
|
*/
|
|
bp->b_flags &= ~B_CALL;
|
|
iodone(bp);
|
|
} else {
|
|
mdfree (mdsave);
|
|
}
|
|
trace4(TR_MD_DONE, bp, bp->b_blkno, bp->b_dev, bp->b_flags);
|
|
(void) splx(s);
|
|
}
|
|
|
|
/*
|
|
* This routine implements the ioctl calls for the Virtual Disk System.
|
|
* It is called from the device switch at normal priority.
|
|
*
|
|
* Called by:
|
|
*
|
|
*/
|
|
/* ARGSUSED */
|
|
mdioctl(dev, cmd, data, flag)
|
|
dev_t dev;
|
|
int cmd, flag;
|
|
caddr_t data;
|
|
{
|
|
struct md_struct *mdstruct;
|
|
struct md_struct *md_data;
|
|
struct dk_info *info;
|
|
int status;
|
|
|
|
printf("mdioctl(dev 0x%x, cmd 0x%x, data 0x%x, flag 0x%x)\n",
|
|
dev, cmd, data, flag);
|
|
trace4(TR_MD_IOCTL, dev, cmd, data, flag);
|
|
/*
|
|
* Make sure that the minor device is a valid part of the Virtual
|
|
* Disk subsystem.
|
|
*/
|
|
if (MD_MINOR(dev) >= NMD) {
|
|
printf("mdioctl: dev 0x%x bad minor device 0x%x\n",
|
|
dev, MD_MINOR(dev));
|
|
return (ENXIO);
|
|
}
|
|
mdstruct = &md_conf[MD_MINOR(dev)];
|
|
|
|
switch (cmd) {
|
|
|
|
/*
|
|
* Return info concerning the controller.
|
|
*/
|
|
case DKIOCINFO:
|
|
printf("mdioctl: get info\n");
|
|
info = (struct dk_info *)data;
|
|
info->dki_ctlr = 0;
|
|
info->dki_unit = 0;
|
|
info->dki_ctype = DKC_MD;
|
|
info->dki_flags = 0;
|
|
return (0);
|
|
|
|
case MD_IOCGET: /* anyone can read the Virtual Disk table */
|
|
printf("mdioctl: get Virtual Disk table\n");
|
|
md_data = (struct md_struct *)data;
|
|
*md_data = *mdstruct;
|
|
return (0);
|
|
|
|
case MD_IOCSET: /* only superuser can set Virtual Disk table */
|
|
printf("mdioctl: set Virtual Disk table\n");
|
|
if (!suser()) {
|
|
printf("mdioctl: dev 0x%x not superuser\n",
|
|
dev);
|
|
return (EPERM);
|
|
}
|
|
md_data = (struct md_struct *)data;
|
|
if (md_data->md_status == MD_CLEAR) {
|
|
printf("mdioctl: clearing device\n");
|
|
mdstruct->md_status &= ~MD_SETUP;
|
|
mdstruct->md_status |= MD_IOCOPS;
|
|
return (0);
|
|
}
|
|
if ((status = mdvalid (md_data)) != 0) {
|
|
printf("mdioctl: invalid ioctl\n");
|
|
mdstruct->md_status &= ~MD_SETUP;
|
|
return (status);
|
|
}
|
|
*mdstruct = *md_data;
|
|
mdstruct->md_status |= MD_SETUP;
|
|
return (0);
|
|
|
|
default:
|
|
printf("mdioctl: dev 0x%x invalid cmd 0x%x\n",
|
|
dev, cmd);
|
|
return (EINVAL);
|
|
}
|
|
}
|
|
|
|
/*
 * Crash-dump entry point.  A dump routine is expected to write memory
 * to the disk when the system is in trouble; for the Virtual Disk this
 * is an empty placeholder and nothing is written.
 */
mddump()
{
	/* Not implemented. */
}
|
|
|
|
/*
 * "Go" entry point (fourth slot of mdcdriver).
 *
 * Only logs the mb_device pointer it was given; requests reach the real
 * drivers through mdstrategy()/mdreal() instead.
 */
mdgo(md)
	register struct mb_device *md;
{
	printf("mdgo: md 0x%x\n", md);
}
|
|
|
|
/*
 * Poll entry point (last slot of mdcdriver).
 *
 * Logs the call and always returns 0; no underlying driver is examined.
 */
int
mdpoll()
{
	int claimed = 0;

	printf("mdpoll()\n");
	return (claimed);
}
|
|
|
|
#endif NMD
|