a2cf7d
commit e905212627350d54b58426214b5a54ddc852b0c9
a2cf7d
Author: Paul A. Clarke <pc@us.ibm.com>
a2cf7d
Date:   Fri Aug 2 22:47:57 2019 -0400
a2cf7d
a2cf7d
    [powerpc] SET_RESTORE_ROUND improvements
a2cf7d
    
a2cf7d
    SET_RESTORE_ROUND uses libc_feholdsetround_ppc_ctx and
a2cf7d
    libc_feresetround_ppc_ctx to bracket a block of code where the floating point
a2cf7d
    rounding mode must be set to a certain value.
a2cf7d
    
a2cf7d
    For the *prologue*, libc_feholdsetround_ppc_ctx is used and performs:
a2cf7d
    1. Read/save FPSCR.
a2cf7d
    2. Create new value for FPSCR with new rounding mode and enables cleared.
a2cf7d
    3. If new value is different than current value,
a2cf7d
       a. If transitioning from a state where some exceptions enabled,
a2cf7d
          enter "ignore exceptions / non-stop" mode.
a2cf7d
       b. Write new value to FPSCR.
a2cf7d
       c. Put a mark on the wall indicating the FPSCR was changed.
a2cf7d
    
a2cf7d
    (1) uses the 'mffs' instruction.  On POWER9, the lighter weight 'mffsl'
a2cf7d
    instruction can be used, but it doesn't return all of the bits in the FPSCR.
a2cf7d
    fegetenv_status uses 'mffsl' on POWER9, 'mffs' otherwise, and can thus be
a2cf7d
    used instead of fegetenv_register.
a2cf7d
    (3b) uses 'mtfsf 0b11111111' to write the entire FPSCR, so it must
a2cf7d
    instead use 'mtfsf 0b00000011' to write just the enables and the mode,
a2cf7d
    because some of the rest of the bits are not valid if 'mffsl' was used.
a2cf7d
    fesetenv_mode uses 'mtfsf 0b00000011' on POWER9, 'mtfsf 0b11111111'
a2cf7d
    otherwise.
a2cf7d
    
a2cf7d
    For the *epilogue*, libc_feresetround_ppc_ctx checks the mark on the wall, then
a2cf7d
    calls libc_feresetround_ppc, which just calls __libc_femergeenv_ppc with
a2cf7d
    parameters such that it performs:
a2cf7d
    1. Retrieve saved value of FPSCR, saved in prologue above.
a2cf7d
    2. Read FPSCR.
a2cf7d
    3. Create new value of FPSCR where:
a2cf7d
       - Summary bits and exception indicators = current OR saved.
a2cf7d
       - Rounding mode and enables = saved.
a2cf7d
       - Status bits = current.
a2cf7d
    4. If transitioning from some exceptions enabled to none,
a2cf7d
       enter "ignore exceptions / non-stop" mode.
a2cf7d
    5. If transitioning from no exceptions enabled to some,
a2cf7d
       enter "catch exceptions" mode.
a2cf7d
    6. Write new value to FPSCR.
a2cf7d
    
a2cf7d
    The summary bits are hardwired to the exception indicators, so there is no
a2cf7d
    need to restore any saved summary bits.
a2cf7d
    The exception indicator bits, which are sticky and remain set unless
a2cf7d
    explicitly cleared, would only need to be restored if the code block
a2cf7d
    might explicitly clear any of them.  This is certainly not expected.
a2cf7d
    
a2cf7d
    So, the only bits that need to be restored are the enables and the mode.
a2cf7d
    If it is the case that only those bits are to be restored, there is no need to
a2cf7d
    read the FPSCR.  Steps (2) and (3) are unnecessary, and step (6) only needs to
a2cf7d
    write the bits being restored.
a2cf7d
    
a2cf7d
    We know we are transitioning out of "ignore exceptions" mode, so step (4) is
a2cf7d
    unnecessary, and in step (5), we only need to check the state we are
a2cf7d
    entering.
a2cf7d
a2cf7d
diff --git a/sysdeps/powerpc/fpu/fenv_private.h b/sysdeps/powerpc/fpu/fenv_private.h
a2cf7d
index 945ab98018450092..b0149aa243e69f5a 100644
a2cf7d
--- a/sysdeps/powerpc/fpu/fenv_private.h
a2cf7d
+++ b/sysdeps/powerpc/fpu/fenv_private.h
a2cf7d
@@ -132,7 +132,17 @@ libc_fesetenv_ppc (const fenv_t *envp)
a2cf7d
 static __always_inline void
a2cf7d
 libc_feresetround_ppc (fenv_t *envp)
a2cf7d
 {
a2cf7d
-  __libc_femergeenv_ppc (envp, _FPU_MASK_TRAPS_RN, _FPU_MASK_FRAC_INEX_RET_CC);
a2cf7d
+  fenv_union_t new = { .fenv = *envp };
a2cf7d
+
a2cf7d
+  /* If the old env has no enabled exceptions and the new env has any enabled
a2cf7d
+     exceptions, then unmask SIGFPE in the MSR FE0/FE1 bits.  This will put the
a2cf7d
+     hardware into "precise mode" and may cause the FPU to run slower on some
a2cf7d
+     hardware.  */
a2cf7d
+  if ((new.l & _FPU_ALL_TRAPS) != 0)
a2cf7d
+    (void) __fe_nomask_env_priv ();
a2cf7d
+
a2cf7d
+  /* Atomically enable and raise (if appropriate) exceptions set in `new'.  */
a2cf7d
+  fesetenv_mode (new.fenv);
a2cf7d
 }
a2cf7d
 
a2cf7d
 static __always_inline int
a2cf7d
@@ -176,9 +186,30 @@ libc_feholdsetround_ppc_ctx (struct rm_ctx *ctx, int r)
a2cf7d
 {
a2cf7d
   fenv_union_t old, new;
a2cf7d
 
a2cf7d
+  old.fenv = fegetenv_status ();
a2cf7d
+
a2cf7d
+  new.l = (old.l & ~(FPSCR_ENABLES_MASK|FPSCR_RN_MASK)) | r;
a2cf7d
+
a2cf7d
+  ctx->env = old.fenv;
a2cf7d
+  if (__glibc_unlikely (new.l != old.l))
a2cf7d
+    {
a2cf7d
+      if ((old.l & _FPU_ALL_TRAPS) != 0)
a2cf7d
+	(void) __fe_mask_env ();
a2cf7d
+      fesetenv_mode (new.fenv);
a2cf7d
+      ctx->updated_status = true;
a2cf7d
+    }
a2cf7d
+  else
a2cf7d
+    ctx->updated_status = false;
a2cf7d
+}
a2cf7d
+
a2cf7d
+static __always_inline void
a2cf7d
+libc_feholdsetround_noex_ppc_ctx (struct rm_ctx *ctx, int r)
a2cf7d
+{
a2cf7d
+  fenv_union_t old, new;
a2cf7d
+
a2cf7d
   old.fenv = fegetenv_register ();
a2cf7d
 
a2cf7d
-  new.l = (old.l & _FPU_MASK_TRAPS_RN) | r;
a2cf7d
+  new.l = (old.l & ~(FPSCR_ENABLES_MASK|FPSCR_RN_MASK)) | r;
a2cf7d
 
a2cf7d
   ctx->env = old.fenv;
a2cf7d
   if (__glibc_unlikely (new.l != old.l))
a2cf7d
@@ -218,6 +249,9 @@ libc_feresetround_ppc_ctx (struct rm_ctx *ctx)
a2cf7d
 #define libc_feholdsetround_ctx          libc_feholdsetround_ppc_ctx
a2cf7d
 #define libc_feholdsetroundf_ctx         libc_feholdsetround_ppc_ctx
a2cf7d
 #define libc_feholdsetroundl_ctx         libc_feholdsetround_ppc_ctx
a2cf7d
+#define libc_feholdsetround_noex_ctx     libc_feholdsetround_noex_ppc_ctx
a2cf7d
+#define libc_feholdsetround_noexf_ctx    libc_feholdsetround_noex_ppc_ctx
a2cf7d
+#define libc_feholdsetround_noexl_ctx    libc_feholdsetround_noex_ppc_ctx
a2cf7d
 #define libc_feresetround_ctx            libc_feresetround_ppc_ctx
a2cf7d
 #define libc_feresetroundf_ctx           libc_feresetround_ppc_ctx
a2cf7d
 #define libc_feresetroundl_ctx           libc_feresetround_ppc_ctx