PCL fixes, more perf tweaks: shift, prtn, add16

- changed EXPCL macro to set and clear bits, not just set them
- changed shift instructions: made CLEARCL conditional; this means keys are only updated once per shift, not twice
- fixed prtn to update the stack free pointer after values are fetched, in case a fault occurs and prtn is restarted
- changed prtn to invalidate the brp cache only if the ring changes
- changed prtn to inline, to avoid register save/restore
- changed argt to only update the rp word offset in the sf header
- added hack to pcl to correct wrapped RP for cpu.pcl case 42
- added cgt inline proc, used in 2 places; generates better code (store to utempa is avoided altogether)
- added pimh (also used for pima) inline proc to avoid stores
- inlined invalidate_brp
- changed add16 implementation while looking at the code generated
- ran faster according to Jeff's timers: 59/52 for old/new
2026-02-26 08:13:56 +00:00 · 2007-09-15 00:00:00 -04:00
parent d99d16932d
commit 2fb2e1a348
1 changed files with 86 additions and 84 deletions
--- a/em.c
+++ b/em.c
@@ -199,13 +199,11 @@ static void macheck (unsigned short p300vec, unsigned short chkvec, unsigned int
 #define XCLEARC CLEARC
 #define XSETC   SETC

-/* EXPCL sets both the C and L bits for shift instructions
-
-   NOTE: unlike EXPC, this doesn't clear anything - bits must be cleared
-   before executing these macros! */
+/* EXPCL sets both the C and L bits for shift instructions */

 #define EXPCL(onoff) \
-  if ((onoff)) crs[KEYS] |= 0120000
+  if ((onoff)) crs[KEYS] |= 0120000; \
+  else crs[KEYS] &= ~0120000

 #define SETCL crs[KEYS] |= 0120000
 #define CLEARCL crs[KEYS] &= ~0120000
@@ -549,11 +547,8 @@ static unsigned short physmem[MEMSIZE]; /* system's physical memory */
   increment the whole thing.

   DIAG cpu.pcl test 42 does check for segment wraparound, so -DFAST
-   will cause this test to fail.
-
-   Update: when cpuid=40 (6650), cpu.pcl test 42 *expects* 32-bit
-   increment on RP (segment gets incremented too)!
- */
+   will cause this test to fail (but, see hack in pcl which fixes it).
+*/

 #ifdef FAST
 #define RPADD(n) (RP+n)
@@ -617,7 +612,7 @@ static struct {

 /* invalidates all entries in the mapva supercache */

-void invalidate_brp() {
+void inline invalidate_brp() {
  int i;

  for (i=0; i < BRP_SIZE; i++)
@@ -2335,23 +2330,24 @@ static ea_t stex(unsigned int extsize) {
 }

 /* for PRTN, load values into temps first so that if any faults occur,
-   PRTN can be restarted
+   PRTN can be restarted.  After all the temps are loaded, the stack
+   free pointer can be updated since no further faults can occur.

-   XXX: the order of this look wrong - stack free pointer shouldn't
-   be updated if a fault occurs fetching base registers
- */
+   If changing rings, make sure to invalidate the brp supercache. */

-static void prtn() {
+static inline void prtn() {
  unsigned short stackrootseg;
  ea_t newrp,newsb,newlb;
  unsigned short keys;

  stackrootseg = get16(*(unsigned int *)(crs+SB)+1);
-  put32(*(unsigned int *)(crs+SB), MAKEVA(stackrootseg,0));
  newrp = get32(*(unsigned int *)(crs+SB)+2);
  newsb = get32(*(unsigned int *)(crs+SB)+4);
  newlb = get32(*(unsigned int *)(crs+SB)+6);
  keys = get16(*(unsigned int *)(crs+SB)+8);
+  put32(*(unsigned int *)(crs+SB), MAKEVA(stackrootseg,0));
+  if ((newrp ^ RP) & RINGMASK32)
+    invalidate_brp();
  RP = newrp | (RP & RINGMASK32);
  *(unsigned int *)(crs+SB) = newsb;
  *(unsigned int *)(crs+LB) = newlb;
@@ -2534,11 +2530,16 @@ static argt() {
       advance Y to the next arg displacement in the stack.  Y
       has to be advanced last because the PB store may fault.
       If it does, the ARGT starts over, and this argument will
-       have to be transferred again. */
+       have to be transferred again.
+
+       The full 32-bit rp is incremented, which is technically
+       wrong but faster, but because PCL DIAG 42 specifically
+       checks for segment wraparound, only update the 16-bit
+       word offset in the stack frame header. */

    if (advancepb) {
      rp += 2;
-      put32(rp, stackfp+2);
+      put16(rp & 0xffff, stackfp+3);
      crs[XL] = lastarg;
    }
    if (advancey) {
@@ -2582,6 +2583,16 @@ static pcl (ea_t ecbea) {
  }
 #endif

+  /* this hack makes DIAG cpu.pcl happy: RP is only supposed to
+     have the 16-bit word offset increment, but we do 32-bits
+     for speed.  If RPL == 0 during PCL, it means it used to be
+     segno/177776 and was incremented to segno+1/0.  So fiddle
+     RP here to pass diags.  In practice, RP wraparound is
+     just stupid and slow. */
+
+  if (RPL == 0)      /* did RP wrap? */
+    RP -= (1<<16);   /* yes, subtract 1 from seg # */
+
  /* get segment access; mapva ensures either read or gate */

  pa = mapva(ecbea, RP, PACC, &access);
@@ -3617,7 +3628,6 @@ static inline arfa(int n, int val) {

 static inline unsigned int lrs(unsigned int val, short scount) {

-  CLEARCL;
  if (scount <= 32) {
    EXPCL(val & (((unsigned int)0x80000000) >> (32-scount)));
    return (*(int *)&val) >> scount;
@@ -3625,13 +3635,13 @@ static inline unsigned int lrs(unsigned int val, short scount) {
    SETCL;
    return 0xFFFFFFFF;
  } else
+    CLEARCL;
    return 0;
 }

 static inline unsigned int lls(unsigned int val, short scount) {
  int templ;

-  CLEARCL;
  if (scount < 32) {
    templ = 0x80000000;
    templ = templ >> scount;         /* create mask */
@@ -3647,21 +3657,21 @@ static inline unsigned int lls(unsigned int val, short scount) {

 static inline unsigned int lll(unsigned int val, short scount) {

-  CLEARCL;
  if (scount <= 32) {
    EXPCL(val & (((unsigned int)0x80000000) >> (scount-1)));
    return val << scount;
  } else
+    CLEARCL;
    return 0;
 }

 static inline unsigned int lrl(unsigned int val, short scount) {

-  CLEARCL;
  if (scount <= 32) {
    EXPCL(val & (((unsigned int)0x80000000) >> (32-scount)));
    return val >> scount;
  } else
+    CLEARCL;
    return 0;
 }

@@ -3669,22 +3679,22 @@ static inline unsigned int lrl(unsigned int val, short scount) {

 static inline unsigned short arl (unsigned short val, short scount) {

-  CLEARCL;
  if (scount <= 16) {
    EXPCL(val & (((unsigned short)0x8000) >> (16-scount)));
    return val >> scount;
  } else {
+    CLEARCL;
    return 0;
  }
 }

 static inline unsigned short all (unsigned short val, short scount) {

-  CLEARCL;
  if (scount <= 16) {
    EXPCL(val & (((unsigned short)0x8000) >> (scount-1)));
    return val << scount;
  } else {
+    CLEARCL;
    return 0;
  }
 }
@@ -3693,7 +3703,6 @@ static inline unsigned short als (unsigned short val, short scount) {

  short tempa;

-  CLEARCL;
  if (scount <= 15) {
    tempa = 0100000;
    tempa = tempa >> scount;         /* create mask */
@@ -3702,14 +3711,12 @@ static inline unsigned short als (unsigned short val, short scount) {
    EXPCL(!(tempa == -1 || tempa == 0));
    return val << scount;
  }
-  if (val != 0)
-    SETCL;
+  EXPCL(val != 0);
  return 0;
 }

 static inline unsigned short ars (unsigned short val, short scount) {

-  CLEARCL;
  if (scount <= 16) {
    EXPCL(val & (((unsigned short)0x8000) >> (16-scount)));
    return (*(short *)&val) >> scount;
@@ -3717,6 +3724,7 @@ static inline unsigned short ars (unsigned short val, short scount) {
    SETCL;
    return 0xFFFF;
  } else
+    CLEARCL;
    return 0;
 }

@@ -3724,7 +3732,6 @@ static inline unsigned short ars (unsigned short val, short scount) {

 static inline unsigned int lrr(unsigned int val, short scount) {

-  CLEARCL;
  scount = ((scount-1)%32)+1;         /* make scount 1-32 */
  EXPCL(val & (((unsigned int)0x80000000) >> (32-scount)));
  return (val >> scount) | (val << (32-scount));
@@ -3732,7 +3739,6 @@ static inline unsigned int lrr(unsigned int val, short scount) {

 static inline unsigned int llr(unsigned int val, short scount) {

-  CLEARCL;
  scount = ((scount-1)%32)+1;         /* make scount 1-32 */
  EXPCL(val & (((unsigned int)0x80000000) >> (scount-1)));
  return (val << scount) | (val >> (32-scount));
@@ -3742,7 +3748,6 @@ static inline unsigned int llr(unsigned int val, short scount) {

 static inline unsigned int alr(unsigned short val, short scount) {

-  CLEARCL;
  scount = ((scount-1)%16)+1;         /* make scount 1-16 */
  EXPCL(val & (((unsigned short)0x8000) >> (scount-1)));
  return (val << scount) | (val >> (16-scount));
@@ -3750,7 +3755,6 @@ static inline unsigned int alr(unsigned short val, short scount) {

 static inline unsigned int arr(unsigned short val, short scount) {

-  CLEARCL;
  scount = ((scount-1)%16)+1;         /* make scount 1-16 */
  EXPCL(val & (((unsigned short)0x8000) >> (16-scount)));
  return (val >> scount) | (val << (16-scount));
@@ -3824,36 +3828,31 @@ static int add32(unsigned int *a1, unsigned int a2, unsigned int a3, ea_t ea) {

 static int add16(unsigned short *a1, unsigned short a2, unsigned short a3, ea_t ea) {

-  unsigned short uorig, uresult;
-  unsigned int utemp;
-  short link, eq, lt;
+  unsigned short uorig;
+  unsigned int uresult;
+  int keybits, oflow;

  stopwatch_push(&sw_add16);
-  crs[KEYS] &= ~0120300;
-  link = eq = lt = 0;
  uorig = *a1;                             /* save original for sign check */
-  utemp = uorig;                           /* expand to higher precision */
-  utemp += a2;                             /* double-precision add */
-  utemp += a3;                             /* again, for subtract */
-  uresult = utemp;                         /* truncate result to result size */
+  uresult = uorig;                         /* expand to higher precision */
+  uresult += a2;                           /* double-precision add */
+  uresult += a3;                           /* again, for subtract */
+  keybits = (uresult & 0x10000) >> 3;      /* set L-bit if carry occurred */  
+  uresult &= 0xFFFF;                       /* truncate result */
  *a1 = uresult;                           /* store result */
-  if (utemp & 0x10000)                     /* set L-bit if carry occurred */
-    link = 020000;  
  if (uresult == 0)                        /* set EQ? */
-    eq = 0100; 
-  if (((~uorig ^ a2) & (uorig ^ uresult) & 0x8000) == 0) { /* no overflow */
-    if (*(int *)&uresult < 0)
-      lt = 0200;
-    crs[KEYS] = crs[KEYS] | link | eq | lt;
-  } else {
-    if (*(int *)&uresult >= 0)
-      lt = 0200;
-    crs[KEYS] = crs[KEYS] | link | eq | lt;
+    keybits |= 0100; 
+  oflow = (((~uorig ^ a2) & (uorig ^ uresult) & 0x8000) != 0); /* overflow! */
+  if (oflow)
+    uresult = ~uresult;
+  keybits |= (uresult & 0x8000) >> 8;      /* set LT if result negative */
+  crs[KEYS] = crs[KEYS] & ~0120300 | keybits;
+  if (oflow)
    mathexception('i', FC_INT_OFLOW, ea);
-  }
  stopwatch_pop(&sw_add16);
 }

+
 static inline adlr(int dr) {

  if (crs[KEYS] & 020000)
@@ -3864,6 +3863,33 @@ static inline adlr(int dr) {
  }
 }

+
+static inline cgt(unsigned short n) {
+  unsigned short utempa;
+
+  utempa = iget16(RP);              /* get number of words */
+  if (1 <= n && n < utempa)
+    RPL = iget16(RPADD(n));
+  else
+    RP = RPADD(utempa);
+}
+
+
+static inline pimh(int dr) {
+  int templ, templ2;
+
+  templ = crsl[dr];
+  /* NOTE: PIMH could be implemented as a left shift, but Prime DIAG
+     tests require a swap - hence the "or" below */
+  crsl[dr] = (crsl[dr] << 16) | (crsl[dr] >> 16);
+  /* check that bits 1-16 were equal to bit 17 before PIMH */
+  templ2 = (templ << 16) >> 16;
+  if (templ2 == templ)
+    CLEARC;
+  else
+    mathexception('i', FC_INT_OFLOW, 0);
+}
+
 /* NOTE: PMA manuals say the range for absolute RF addressing is
   0-'377, but this does not allow addressing a machine with 8 user
   register sets.  The range should probably be an emulator config
@@ -4731,11 +4757,7 @@ d_iab:  /* 000201 */

 d_cgt:  /* 001314 */
  TRACE(T_FLOW, " CGT\n");
-  utempa = iget16(RP);              /* get number of words */
-  if (1 <= crs[A] && crs[A] < utempa)
-    RPL = iget16(RPADD(crs[A]));
-  else
-    RP = RPADD(utempa);
+  cgt(crs[A]);
  goto fetch;

 d_pida:  /* 000115 */
@@ -4753,13 +4775,7 @@ d_pidl:  /* 000305 */

 d_pima: /* 000015 */
  TRACE(T_FLOW, " PIMA\n");
-  templ = *(int *)(crsl+GR2);
-  crsl[GR2] = (crsl[GR2] << 16) | (crsl[GR2] >> 16);
-  templ2 = (templ << 16) >> 16;
-  if (templ != templ2)
-    mathexception('i', FC_INT_OFLOW, 0);
-  else
-    CLEARC;
+  pimh(GR2);
  goto fetch;

 d_piml:  /* 000301 */
@@ -6753,7 +6769,6 @@ d_gen1:
    if (crs[KEYS] & 010000) {          /* V/I mode */
      crsl[GR2] = lrs(crsl[GR2], scount);
    } else {
-      CLEARCL;
      utempa = crs[B] & 0x8000;        /* save B bit 1 */
      if (scount <= 31) {
 	templ = (crs[A]<<16) | ((crs[B] & 0x7FFF)<<1);
@@ -6765,6 +6780,7 @@ d_gen1:
 	*(int *)(crs+A) = 0xFFFF7FFF | utempa;
 	SETCL;
      } else {
+	CLEARCL;
 	*(int *)(crs+A) = utempa;
      }
    }
@@ -6822,7 +6838,6 @@ d_gen1:
    if (crs[KEYS] & 010000)                /* V/I mode */
      crsl[GR2] = lls(crsl[GR2], scount);
    else {
-      CLEARCL;
      utempa = crs[B] & 0x8000;            /* save B bit 1 */
      if (scount < 31) {
 	utempl = (crs[A]<<16) | ((crs[B] & 0x7FFF)<<1);
@@ -7242,11 +7257,7 @@ imode:

    case 0026:
      TRACE(T_FLOW, " CGT\n");
-      utempa = iget16(RP);              /* get number of words */
-      if (1 <= crs[dr*2] && crs[dr*2] < utempa)
-	RPL = iget16(INCVA(RP,crs[dr*2]));
-      else
-	RPL += utempa;
+      cgt(crs[dr*2]);
      break;

    case 0040:
@@ -7691,16 +7702,7 @@ imode:

    case 0051:
      TRACE(T_FLOW, " PIMH\n");
-      templ = crsl[dr];
-      /* NOTE: PIMH could be implemented as a left shift, but Prime DIAG
-	 tests require a swap - hence the "or" below */
-      crsl[dr] = (crsl[dr] << 16) | (crsl[dr] >> 16);
-      /* check that bits 1-16 were equal to bit 17 before PIMH */
-      templ2 = (templ << 16) >> 16;
-      if (templ2 != templ)
-	mathexception('i', FC_INT_OFLOW, 0);
-      else
-	CLEARC;
+      pimh(dr);
      break;

    case 0133:
@@ -8522,7 +8524,7 @@ imode:
    case 1:
 imodepcl:
      stopwatch_push(&sw_pcl);
-      TRACE(T_FLOW|T_PCL, "#%d %o/%o: PCL %o/%o %s\n", gvp->instcount, RPH, RPL-2, ea>>16, ea&0xFFFF, searchloadmap(ea, 'e'));
+      TRACE(T_FLOW|T_PCL, "#%d %o/%:0o: PCL %o/%o %s\n", gvp->instcount, RPH, RPL-2, ea>>16, ea&0xFFFF, searchloadmap(ea, 'e'));
      if (gvp->numtraceprocs > 0 && TRACEUSER)
 	for (i=0; i<gvp->numtraceprocs; i++)
 	  if (traceprocs[i].ecb == (ea & 0xFFFFFFF) && traceprocs[i].sb == -1) {