|
|
|
@@ -685,7 +685,7 @@ lp_build_sub(struct lp_build_context *bld, |
|
|
|
/** |
|
|
|
* Normalized multiplication. |
|
|
|
* |
|
|
|
* There are several approaches here (using 8-bit normalized multiplication as |
|
|
|
* There are several approaches for (using 8-bit normalized multiplication as |
|
|
|
* an example): |
|
|
|
* |
|
|
|
* - alpha plus one |
|
|
|
@@ -694,7 +694,7 @@ lp_build_sub(struct lp_build_context *bld, |
|
|
|
* |
|
|
|
* a*b/255 ~= (a*(b + 1)) >> 8 |
|
|
|
* |
|
|
|
* which is the fastest method that satisfies the following OpenGL criteria |
|
|
|
* which is the fastest method that satisfies the following OpenGL criteria of |
|
|
|
* |
|
|
|
* 0*0 = 0 and 255*255 = 255 |
|
|
|
* |
|
|
|
@@ -710,7 +710,7 @@ lp_build_sub(struct lp_build_context *bld, |
|
|
|
* |
|
|
|
* note that just by itself it doesn't satisfy the OpenGL criteria, as |
|
|
|
* 255*255 = 254, so the special case b = 255 must be accounted for, or roundoff |
|
|
|
* must be used |
|
|
|
* must be used. |
|
|
|
* |
|
|
|
* - geometric series plus rounding |
|
|
|
* |
|
|
|
@@ -719,7 +719,9 @@ lp_build_sub(struct lp_build_context *bld, |
|
|
|
* |
|
|
|
* t/255 ~= (t + (t >> 8) + 0x80) >> 8 |
|
|
|
* |
|
|
|
* achieving the exact results |
|
|
|
* achieving the exact results. |
|
|
|
* |
|
|
|
* |
|
|
|
* |
|
|
|
* @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995, |
|
|
|
* ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf |
|
|
|
@@ -733,8 +735,7 @@ lp_build_mul_norm(struct gallivm_state *gallivm, |
|
|
|
{ |
|
|
|
LLVMBuilderRef builder = gallivm->builder; |
|
|
|
struct lp_build_context bld; |
|
|
|
unsigned bits; |
|
|
|
LLVMValueRef shift; |
|
|
|
unsigned n; |
|
|
|
LLVMValueRef half; |
|
|
|
LLVMValueRef ab; |
|
|
|
|
|
|
|
@@ -744,29 +745,28 @@ lp_build_mul_norm(struct gallivm_state *gallivm, |
|
|
|
|
|
|
|
lp_build_context_init(&bld, gallivm, wide_type); |
|
|
|
|
|
|
|
bits = wide_type.width / 2; |
|
|
|
n = wide_type.width / 2; |
|
|
|
if (wide_type.sign) { |
|
|
|
--bits; |
|
|
|
--n; |
|
|
|
} |
|
|
|
|
|
|
|
shift = lp_build_const_int_vec(gallivm, wide_type, bits); |
|
|
|
/* |
|
|
|
* TODO: for 16bits normalized SSE2 vectors we could consider using PMULHUW |
|
|
|
* http://ssp.impulsetrain.com/2011/07/03/multiplying-normalized-16-bit-numbers-with-sse2/ |
|
|
|
*/ |
|
|
|
|
|
|
|
#if 0 |
|
|
|
|
|
|
|
/* a*b/255 ~= (a*(b + 1)) >> 256 */ |
|
|
|
/* XXX: This would not work for signed types */ |
|
|
|
assert(!wide_type.sign); |
|
|
|
b = LLVMBuildAdd(builder, b, lp_build_const_int_vec(gallivm, wide_type, 1), ""); |
|
|
|
ab = LLVMBuildMul(builder, a, b, ""); |
|
|
|
/* |
|
|
|
* a*b / (2**n - 1) ~= (a*b + (a*b >> n) + half) >> n |
|
|
|
*/ |
|
|
|
|
|
|
|
#else |
|
|
|
|
|
|
|
/* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */ |
|
|
|
ab = LLVMBuildMul(builder, a, b, ""); |
|
|
|
ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, shift, ""), ""); |
|
|
|
ab = LLVMBuildAdd(builder, ab, lp_build_shr_imm(&bld, ab, n), ""); |
|
|
|
|
|
|
|
/* Add rounding term */ |
|
|
|
half = lp_build_const_int_vec(gallivm, wide_type, 1 << (bits - 1)); |
|
|
|
/* |
|
|
|
* half = sgn(ab) * 0.5 * (2 ** n) = sgn(ab) * (1 << (n - 1)) |
|
|
|
*/ |
|
|
|
|
|
|
|
half = lp_build_const_int_vec(gallivm, wide_type, 1 << (n - 1)); |
|
|
|
if (wide_type.sign) { |
|
|
|
LLVMValueRef minus_half = LLVMBuildNeg(builder, half, ""); |
|
|
|
LLVMValueRef sign = lp_build_shr_imm(&bld, half, wide_type.width - 1); |
|
|
|
@@ -774,9 +774,8 @@ lp_build_mul_norm(struct gallivm_state *gallivm, |
|
|
|
} |
|
|
|
ab = LLVMBuildAdd(builder, ab, half, ""); |
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
ab = LLVMBuildLShr(builder, ab, shift, ""); |
|
|
|
/* Final division */ |
|
|
|
ab = lp_build_shr_imm(&bld, ab, n); |
|
|
|
|
|
|
|
return ab; |
|
|
|
} |
|
|
|
@@ -988,14 +987,28 @@ lp_build_lerp_simple(struct lp_build_context *bld, |
|
|
|
|
|
|
|
delta = lp_build_sub(bld, v1, v0); |
|
|
|
|
|
|
|
res = lp_build_mul(bld, x, delta); |
|
|
|
|
|
|
|
if (normalized) { |
|
|
|
if (bld->type.sign) { |
|
|
|
res = lp_build_shr_imm(bld, res, half_width - 1); |
|
|
|
} else { |
|
|
|
if (!bld->type.sign) { |
|
|
|
/* |
|
|
|
* Scale x from [0, 2**n - 1] to [0, 2**n] by adding the |
|
|
|
* most-significant-bit to the least-significant bit, so that |
|
|
|
* later we can just divide by 2**n instead of 2**n - 1. |
|
|
|
*/ |
|
|
|
x = lp_build_add(bld, x, lp_build_shr_imm(bld, x, half_width - 1)); |
|
|
|
|
|
|
|
/* (x * delta) >> n */ |
|
|
|
res = lp_build_mul(bld, x, delta); |
|
|
|
res = lp_build_shr_imm(bld, res, half_width); |
|
|
|
} else { |
|
|
|
/* |
|
|
|
* The rescaling trick above doesn't work for signed numbers, so |
|
|
|
* use the 2**n - 1 divison approximation in lp_build_mul_norm |
|
|
|
* instead. |
|
|
|
*/ |
|
|
|
res = lp_build_mul_norm(bld->gallivm, bld->type, x, delta); |
|
|
|
} |
|
|
|
} else { |
|
|
|
res = lp_build_mul(bld, x, delta); |
|
|
|
} |
|
|
|
|
|
|
|
res = lp_build_add(bld, v0, res); |
|
|
|
@@ -1022,7 +1035,6 @@ lp_build_lerp(struct lp_build_context *bld, |
|
|
|
LLVMValueRef v0, |
|
|
|
LLVMValueRef v1) |
|
|
|
{ |
|
|
|
LLVMBuilderRef builder = bld->gallivm->builder; |
|
|
|
const struct lp_type type = bld->type; |
|
|
|
LLVMValueRef res; |
|
|
|
|
|
|
|
@@ -1034,8 +1046,6 @@ lp_build_lerp(struct lp_build_context *bld, |
|
|
|
struct lp_type wide_type; |
|
|
|
struct lp_build_context wide_bld; |
|
|
|
LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh; |
|
|
|
unsigned bits; |
|
|
|
LLVMValueRef shift; |
|
|
|
|
|
|
|
assert(type.length >= 2); |
|
|
|
|
|
|
|
@@ -1054,22 +1064,6 @@ lp_build_lerp(struct lp_build_context *bld, |
|
|
|
lp_build_unpack2(bld->gallivm, type, wide_type, v0, &v0l, &v0h); |
|
|
|
lp_build_unpack2(bld->gallivm, type, wide_type, v1, &v1l, &v1h); |
|
|
|
|
|
|
|
/* |
|
|
|
* Scale x from [0, 255] to [0, 256] |
|
|
|
*/ |
|
|
|
|
|
|
|
bits = type.width - 1; |
|
|
|
if (type.sign) { |
|
|
|
--bits; |
|
|
|
} |
|
|
|
|
|
|
|
shift = lp_build_const_int_vec(bld->gallivm, wide_type, bits - 1); |
|
|
|
|
|
|
|
xl = lp_build_add(&wide_bld, xl, |
|
|
|
LLVMBuildAShr(builder, xl, shift, "")); |
|
|
|
xh = lp_build_add(&wide_bld, xh, |
|
|
|
LLVMBuildAShr(builder, xh, shift, "")); |
|
|
|
|
|
|
|
/* |
|
|
|
* Lerp both halves. |
|
|
|
*/ |