Files
Arquivotheca.SunOS-4.1.4/sys/sun4m/memerr.c
seta75D ff309bfe1c Init
2021-10-11 18:37:13 -03:00

420 lines
9.8 KiB
C

#ident "@(#)memerr.c 1.1 94/10/31 SMI"
/*LINTLIBRARY*/
/*
* Copyright (c) 1989 by Sun Microsystems, Inc.
*/
#include <sys/param.h>
#include <machine/memerr.h>
#include <sys/user.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <vm/page.h>
#include <sys/vnode.h>
#include <machine/buserr.h>
#include <machine/mmu.h>
/*
 * External routines used by the error handlers below: the DVMA
 * enable/disable pair brackets the ECC self-test in testecc(), and the
 * hat_page*() routines are used when a bad page is given up.
 */
extern void enable_dvma();
extern void disable_dvma();
extern void hat_pagesync();
extern void hat_pageunload();
/*
 * Tunables.
 *
 * report_ce_console is not referenced in the part of the file visible
 * here; its original comment is retained as-is.
 */
int report_ce_console = 0; /* don't print messages on console */
/*
 * report_ce: when nonzero, sun4m_memerr_init() sets EFER_EI in addition
 * to EFER_EE (i.e. correctable errors are reported, per the comment in
 * that routine).
 */
int report_ce = 1;
/*
 * efervalue: bits OR-ed into the register at EFER_VADDR by
 * sun4m_memerr_init().  The initial 0x101 is kept and the EFER_* bits
 * are added to it.
 * NOTE(review): the meaning of the individual 0x101 bits is not visible
 * in this file -- confirm against <machine/memerr.h>.
 */
u_int efervalue = 0x101;
/*
* Memory error handling for sun4m
*/
sun4m_memerr_init()
{
#ifndef SAS
if (report_ce) /* Enables ECC checking, reports on CEs */
efervalue = efervalue | EFER_EE | EFER_EI;
else /* Else just turn on checking */
efervalue = efervalue | EFER_EE;
#ifdef DEBUG
printf("efer set to 0x%x\n", efervalue);
#endif
*(u_int *) EFER_VADDR = *(u_int *)EFER_VADDR | efervalue;
#endif
}
/*
 * sun4m_ebe_handler -- memory error handler hook for sun4m.
 *
 * The body is intentionally or provisionally empty: all five arguments
 * are accepted and ignored, and nothing is returned.  The parameter list
 * matches msparc_ebe_handler() further down in this file.
 */
/* ARGSUSED */
void
sun4m_ebe_handler(type, reg, vaddr, fault, rp)
u_int type, reg;
addr_t vaddr;
unsigned fault; /* only meaningful if type == MERR_SYNC */
struct regs *rp; /* ditto */
{
}
/*
 * correctable selects which value testecc() loads into the ECC
 * diagnostic register: 0x85 when nonzero, 0x61 when zero.
 * NOTE(review): presumably these produce a correctable and an
 * uncorrectable error respectively -- confirm against the hardware manual.
 */
int correctable = 1;
/* Scratch words that testecc() writes while generate mode is active. */
u_int testloc[2];
/*
 * testecc -- ECC self-test.
 *
 * Sequence (order matters; these are device-register and memory stores):
 *   1. disable DVMA,
 *   2. put the diagnostic register (EFDR_VADDR) into generate mode,
 *   3. store zero into both words of testloc,
 *   4. clear the diagnostic register and re-enable DVMA,
 *   5. read testloc back and post the result in u.u_r.u_rv.R_val1:
 *      -1 if testloc[0] != 0x80000000 AND testloc[1] != 0, otherwise 0.
 *
 * NOTE(review): the function is declared int but has no return
 * statement; the only result is the value stored in u.u_r.u_rv.R_val1.
 * NOTE(review): because the final test uses &&, -1 is posted only when
 * BOTH words differ from the expected values -- confirm that || was not
 * intended.
 */
int
testecc()
{
/*
* Try generating an ECC error by turning on the diagnostic register
*/
/* Assume uniprocessor and dvma has finished */
/* Also assume cache is off */
disable_dvma();
if (correctable)
*(u_int *) EFDR_VADDR = EFDR_GENERATE_MODE | 0x85;
else
*(u_int *) EFDR_VADDR = EFDR_GENERATE_MODE | 0x61;
/* These stores happen while generate mode is active. */
testloc[0] = 0x00000000;
testloc[1] = 0x00000000;
#if 0
vac_flush(&testloc[0], 4);
vac_flush(&testloc[1], 4);
#endif
/* Leave generate mode before anything else touches memory. */
*(u_int *) EFDR_VADDR = 0;
enable_dvma();
/* Read back and post the result in the u-area return value. */
if ((testloc[0] != 0x80000000) && (testloc[1] != 0))
u.u_r.u_rv.R_val1 = -1;
else
u.u_r.u_rv.R_val1 = 0;
}
/*
 * fix_nc_ecc -- recover from a non-correctable ECC error by "retiring"
 * the page that contains it.
 *
 * efsr, efar0, efar1: presumably the contents of the ECC fault status
 * and fault address registers at the time of the error (they are only
 * masked with the EFSR_ and EFAR[01]_ constants here).
 *
 * Returns 0 when the page was given up via page_giveup(), -1 when
 * recovery is refused:
 *   - EFSR_ME is set in efsr (multiple errors),
 *   - EFAR0_S is set in efar0 (supervisor-mode access),
 *   - the page number has no page structure,
 *   - the page has no vnode,
 *   - the page has i/o in progress (p_intrans),
 *   - the page is modified and hat_kill_procs() returns nonzero.
 */
int
fix_nc_ecc(efsr, efar0, efar1)
	u_int efsr, efar0, efar1;
{
	u_int pagenum;
	struct page *pp;

	/* Assemble the physical page number from the two address registers. */
	pagenum = (((efar0 & EFAR0_PA) << EFAR0_MID_SHFT) |
	    ((efar1 & EFAR1_PGNUM) >> 4));

	if (efsr & EFSR_ME) {
		printf("Multiple ECC Errors\n");
		return (-1);
	}
	if (efar0 & EFAR0_S) {
		printf("ECC error recovery: Supervisor mode\n");
		return (-1);
	}

	pp = (struct page *) page_numtopp(pagenum);
	/*
	 * This must be a comparison.  The previous "pp = 0" assignment
	 * threw away the lookup result, never took this error path, and
	 * then dereferenced a null pointer at pp->p_vnode below.
	 */
	if (pp == (struct page *) 0) {
		printf("Ecc error recovery: no page structure\n");
		return (-1);
	}
	if (pp->p_vnode == 0) {
		printf("ECC error recovery: no vnode\n");
		return (-1);
	}
	if (pp->p_intrans) {
		printf("Ecc error recovery: i/o in progress\n");
		return (-1);
	}

	/* Collect the modified bit before deciding who has to be killed. */
	hat_pagesync(pp);
	if (pp->p_mod && hat_kill_procs(pp, pagenum))
		return (-1);

	page_giveup(pagenum, pp);
	return (0);
}
/*
 * page_giveup -- take a page permanently out of service.
 *
 * Unloads every translation to the page, aborts it, pulls it off the
 * free list if it ended up there, and sets p_lock so it is not handed
 * out again.  pagenum is used only for the console message.
 */
page_giveup(pagenum, pp)
	u_int pagenum;
	struct page *pp;
{
	hat_pageunload(pp);
	page_abort(pp);

	if (pp->p_free) {
		page_unfree(pp);
	}
	pp->p_lock = 1;

	printf("page %x marked out of service.\n", ptob(pagenum));
}
#if defined (SUN4M_35)
extern void enable_dvma();
extern void disable_dvma();
extern void hat_pagesync();
extern void hat_pageunload();
extern void msparc1_memerr_init_asm();
extern void msparc1_memerr_disable_asm();
extern void msparc_memerr_init_asm();
extern void msparc_memerr_disable_asm();
extern int tsunami;
/*
 * msparc memory error (parity) handling
 */
/*
 * msparc_memerr_init -- enable parity checking.
 *
 * Does nothing but print a notice when use_pe is zero.  Otherwise it
 * calls the assembly init routine, choosing the msparc1 variant when
 * the tsunami flag is set.
 */
void
msparc_memerr_init()
{
	extern int use_pe;

	if (!use_pe) {
		printf("parity checking disabled\n");
		return;
	}

	if (tsunami) {
		msparc1_memerr_init_asm();
		return;
	}
	msparc_memerr_init_asm();
}
void
msparc_memerr_disable()
{
if (tsunami)
msparc1_memerr_disable_asm();
else
msparc_memerr_disable_asm();
}
#define USER 0x10000 /* user-mode flag added to type in trap.c */
/* ARGSUSED */
int
msparc_ebe_handler(type, reg, addr, fault, rp)
u_int type;
u_int reg;
addr_t addr;
unsigned fault; /* only meaningful if type == MERR_SYNC */
struct regs *rp; /* ditto */
{
u_int ctx;
struct pte pte;
struct pte *ppte = &pte;
int transient;
u_int paddr;
paddr = va2pa(addr);
switch (type) {
case MERR_SYNC:
if (reg & MMU_SFSR_PERR) {
printf("Synchronous parity error\n");
if (X_FAULT_TYPE(reg) == FT_TRANS_ERROR) {
/* Parity errors during table walks are unrecoverable */
panic("parity error during table walk");
} else {
printf("Attempting recovery..\n");
transient = (msparc_parerr_reset(paddr) != -1);
ctx = mmu_getctx();
*(u_int *) ppte = mmu_probe(addr);
printf(" ctx=0x%x vaddr=0x%x paddr=0x%x pte=0x%x transient=%d\n",
ctx, addr, paddr, *(u_int *)ppte, transient);
if (msparc_parerr_recover(addr, ppte, transient) == -1)
panic("unrecoverable parity error");
}
printf("System operation can continue.\n");
return (0);
} else if (reg & (MMU_SFSR_TO | MMU_SFSR_BE)) {
/*
* If it's the result of a user process getting a
* timeout or bus error on the sbus, just return and
* let the process be sent a bus error signal.
*/
if (fault & USER)
return (-1);
}
ctx = mmu_getctx();
*(u_int *) ppte = mmu_probe(addr);
printf("Non-parity Synchronous memory error\n");
printf(" ctx=0x%x vaddr=0x%x pte=0x%x fsr=0x%x\n",
ctx, addr, *(u_int *)ppte, reg);
panic("sync memory error");
/*NOTREACHED*/
break;
case MERR_ASYNC:
printf("Asynchronous memory error\n");
printf("addr=0x%x reg=0x%x\n", (int)addr, (int)reg);
panic("async memory error");
/*NOTREACHED*/
break;
default:
panic("msparc_ebe_handler: invalid type");
break;
}
}
/*
 * Patterns to use to determine if a location has a hard or soft parity
 * error.
 *
 * msparc_parerr_reset() walks this table from the top, storing each
 * value at the suspect location and reading it back, until it reaches
 * the zero entry.  The zero is therefore an end-of-list marker as well
 * as a pattern, and it must stay the last element; no other entry may
 * be zero.
 */
static u_int parerr_patterns[] = {
0xAAAAAAAA, /* Alternating ones and zeroes */
0x55555555, /* Alternate the other way */
0x01010101, /* Walking ones ... */
0x02020202, /* ... four bytes at once ... */
0x04040404, /* ... from right to left */
0x08080808,
0x10101010,
0x20202020,
0x40404040,
0x80808080,
0x7f7f7f7f, /* And now walking zeros, from left to right */
0xbfbfbfbf,
0xdfdfdfdf,
0xefefefef,
0xf7f7f7f7,
0xfbfbfbfb,
0xfdfdfdfd,
0xfefefefe,
0xffffffff, /* All ones */
0x00000000, /* All zeroes -- must be last! */
};
/*
 * Reset a parity error so that we can continue operation.
 * Also, see if we get another parity error at the same location.
 *
 * addr is passed straight to stphys()/ldphys(); the caller in this file
 * hands in the result of va2pa(), i.e. a physical address.
 *
 * Every entry of parerr_patterns, including the terminating all-zeroes
 * word, is stored at addr and read back.  Any mismatch marks the error
 * as permanent.
 *
 * Return 0 if error reset (transient), -1 if not (permanent).
 * %%% We need to test all the words in a cache line, if cache is on.
 */
extern void stphys();
extern u_int ldphys();
int
msparc_parerr_reset(addr)
	addr_t addr;
{
	int retval = 0;
	u_int *p, i;

	/*
	 * Test the word with successive patterns, to see if the parity
	 * error is a permanent problem.
	 * NOTE(review): memerr_disable()/memerr_init() are defined outside
	 * this file; presumably they switch error reporting off and back on
	 * around the test -- confirm.
	 */
	memerr_disable();
	for (p = parerr_patterns; ; p++) {
		stphys((u_int)addr, *p);	/* store the pattern */
		i = ldphys((u_int)addr);
		if (i != *p) {
			printf("Parity error at 0x%x with pattern 0x%x\n", addr, *p);
			retval++;
		}
		/*
		 * The zero entry is both the last pattern and the list
		 * terminator.  Stop only after it has been tested; a plain
		 * "*p" loop condition would skip the all-zeroes pattern.
		 */
		if (*p == 0)
			break;
	}
	memerr_init();

	/*
	 * Convert return value to what caller expects.
	 */
	if (retval)
		retval = -1;
	printf("Parity error at %x is %s.\n", addr, retval ? "permanent" : "transient");
	return (retval);
}
/*
 * Recover from a parity error. Returns 0 if successful, -1 if not.
 *
 * vaddr:  faulting virtual address (only passed on to hat_kill_procs()).
 * pte:    page table entry for the page; supplies the physical page
 *         number.
 * pageok: handed on to msparc_page_giveup(); nonzero means the page may
 *         go back into service.
 */
int
msparc_parerr_recover(vaddr, pte, pageok)
	addr_t vaddr;
	struct pte *pte;
	int pageok;
{
	struct page *pg;

	/*
	 * A page with no page structure or no vnode is a kernel page.
	 * There is no copy of its data on disk, so it cannot be
	 * regenerated.  A page with i/o in progress is not recoverable
	 * here either.
	 */
	pg = page_numtopp(pte->PhysicalPageNumber);
	if (pg == (struct page *) 0) {
		printf("parity recovery: no page structure\n");
		return (-1);
	}
	if (pg->p_vnode == 0) {
		printf("parity recovery: no vnode\n");
		return (-1);
	}
	if (pg->p_intrans) {
		printf("parity recovery: i/o in progress\n");
		return (-1);
	}

	/*
	 * Sync first, so the modified bit below reflects every mapping
	 * of the page and not just this one.
	 */
	hat_pagesync(pg);

	/*
	 * A modified page means the disk copy is stale, so every process
	 * using the page has to be killed.  If that fails (e.g. the
	 * kernel is using the page) report failure and let the caller
	 * panic.
	 */
	if (pg->p_mod && hat_kill_procs(pg, (u_int)vaddr) != 0) {
		return (-1);
	}

	/*
	 * Give the page up only after the processes are gone; once it is
	 * given up it no longer exists in their address spaces.  A
	 * retried fetch will fault the page back in.
	 */
	msparc_page_giveup(pte, pg, pageok);
	return (0);
}
/*
 * Give up the page that contains the address that took the parity
 * error.  The page is always unloaded and aborted.  When pageok is zero
 * (the error is permanent) the page is additionally taken off the free
 * list if necessary and locked, so that it is never used again (until
 * we reboot).
 */
msparc_page_giveup(ppte, pp, pageok)
	struct pte *ppte;
	struct page *pp;
	int pageok;
{
	char *state;

	hat_pageunload(pp);
	page_abort(pp);

	if (pageok) {
		state = "back in";
	} else {
		/*
		 * Retry failed: lock the page so the machine can stay up
		 * without the bad location affecting future processes.
		 *
		 * XXX do we need to splvm() between the abort and the lock?
		 */
		if (pp->p_free) {
			page_unfree(pp);
		}
		pp->p_lock = 1;
		state = "marked out of";
	}

	printf("page %x %s service.\n",
	    ptob(ppte->PhysicalPageNumber), state);
}
#endif SUN4M_35