3 files changed, 119 insertions, 104 deletions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 666e569..33338f6 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -245,16 +245,17 @@ lp_build_rho(struct lp_build_sample_context *bld,
       LLVMValueRef cubesize;
       LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
       /*
-       * If we have derivs too then we have per-pixel cube_rho - doesn't matter
-       * though until we do per-pixel lod.
        * Cube map code did already everything except size mul and per-quad extraction.
        */
+      rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+                                      perquadf_bld->type, cube_rho, 0);
+      if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
+         rho = lp_build_sqrt(perquadf_bld, rho);
+      }
       /* Could optimize this for single quad just skip the broadcast */
       cubesize = lp_build_extract_broadcast(gallivm, bld->float_size_in_type,
-                                            coord_bld->type, float_size, index0);
-      rho_vec = lp_build_mul(coord_bld, cubesize, cube_rho);
-      rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
-                                      perquadf_bld->type, rho_vec, 0);
+                                            perquadf_bld->type, float_size, index0);
+      rho = lp_build_mul(perquadf_bld, cubesize, rho);
    }
    else if (derivs && !(bld->static_texture_state->target == PIPE_TEXTURE_CUBE)) {
       LLVMValueRef ddmax[3], ddx[3], ddy[3];
@@ -1356,25 +1357,31 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
                      LLVMValueRef *face,
                      LLVMValueRef *face_s,
                      LLVMValueRef *face_t,
-                     LLVMValueRef *rho)
+                     LLVMValueRef *rho,
+                     boolean need_derivs)
 {
    struct lp_build_context *coord_bld = &bld->coord_bld;
    LLVMBuilderRef builder = bld->gallivm->builder;
    struct gallivm_state *gallivm = bld->gallivm;
    LLVMValueRef si, ti, ri;
-   boolean need_derivs = TRUE;
 
    if (1 || coord_bld->type.length > 4) {
       /*
        * Do per-pixel face selection. We cannot however (as we used to do)
        * simply calculate the derivs afterwards (which is very bogus for
-       * explicit derivs anyway) because the values would be "random" when
-       * not all pixels lie on the same face. Hence just transform the derivs
-       * (or rather only the dmax values), which works both for implicit and
-       * explicit derivatives and doesn't add much math (except need to
-       * calculate derivs for 3 instead of 2 coords and have a couple more selects
-       * but cuts some minor math elsewhere). The derivs don't need mirroring,
-       * just selection, since noone cares about the sign.
+       * explicit derivs btw) because the values would be "random" when
+       * not all pixels lie on the same face. So what we do here is just
+       * calculate the derivatives after scaling the coords by the absolute
+       * value of the inverse major axis, and essentially do rho calculation
+       * steps as if it were a 3d texture. This is perfect if all pixels hit
+       * the same face, but not so great at edges, I believe the max error
+       * should be sqrt(2) with no_rho_approx or 2 otherwise (essentially measuring
+       * the 3d distance between 2 points on the cube instead of measuring up/down
+       * the edge). Still this is possibly a win over just selecting the same face
+       * for all pixels. Unfortunately, something like that doesn't work for
+       * explicit derivatives.
+       * TODO: handle explicit derivatives by transforming them alongside coords
+       * somehow.
        */
       struct lp_build_context *cint_bld = &bld->int_coord_bld;
       struct lp_type intctype = cint_bld->type;
@@ -1392,95 +1399,111 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
       LLVMValueRef facex = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_X);
       LLVMValueRef facey = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Y);
       LLVMValueRef facez = lp_build_const_int_vec(gallivm, intctype, PIPE_TEX_FACE_POS_Z);
-      LLVMValueRef dmax[3], dmaxsnew, dmaxtnew;
 
       assert(PIPE_TEX_FACE_NEG_X == PIPE_TEX_FACE_POS_X + 1);
       assert(PIPE_TEX_FACE_NEG_Y == PIPE_TEX_FACE_POS_Y + 1);
       assert(PIPE_TEX_FACE_NEG_Z == PIPE_TEX_FACE_POS_Z + 1);
 
       /*
-       * TODO do this only when needed.
+       * get absolute value (for x/y/z face selection) and sign bit
+       * (for mirroring minor coords and pos/neg face selection)
+       * of the original coords.
        */
-      if (need_derivs && !derivs) {
-         LLVMValueRef ddx_ddy[2], tmp[2];
-         /*
-          * This isn't quite the same as the "ordinary" path since
-          * we need to extract the ds/dt/dr values before further processing.
-          */
-         static const unsigned char swizzle11[] = { /* no-op swizzle */
+      as = lp_build_abs(&bld->coord_bld, s);
+      at = lp_build_abs(&bld->coord_bld, t);
+      ar = lp_build_abs(&bld->coord_bld, r);
+
+      /*
+       * major face determination: select x if x > y else select y
+       * select z if z >= max(x,y) else select previous result
+       * if some axis are the same we chose z over y, y over x - the
+       * dx10 spec seems to ask for it while OpenGL doesn't care (if we
+       * wouldn't care could save a select or two if using different
+       * compares and doing at_g_as_ar last since tnewx and tnewz are the
+       * same).
+       */
+      as_ge_at = lp_build_cmp(coord_bld, PIPE_FUNC_GREATER, as, at);
+      maxasat = lp_build_max(coord_bld, as, at);
+      ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat);
+
+      if (need_derivs) {
+         LLVMValueRef ddx_ddy[2], tmp[2], rho_vec;
+         static const unsigned char swizzle0[] = { /* no-op swizzle */
             0, LP_BLD_SWIZZLE_DONTCARE,
             LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
          };
-         static const unsigned char swizzle12[] = {
-            2, LP_BLD_SWIZZLE_DONTCARE,
+         static const unsigned char swizzle1[] = {
+            1, LP_BLD_SWIZZLE_DONTCARE,
             LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
          };
-         static const unsigned char swizzle21[] = { /* no-op swizzle */
-            0, LP_BLD_SWIZZLE_DONTCARE,
-            2, LP_BLD_SWIZZLE_DONTCARE
+         static const unsigned char swizzle01[] = { /* no-op swizzle */
+            0, 1,
+            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
          };
-         static const unsigned char swizzle22[] = {
-            1, LP_BLD_SWIZZLE_DONTCARE,
-            3, LP_BLD_SWIZZLE_DONTCARE
+         static const unsigned char swizzle23[] = {
+            2, 3,
+            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
+         };
+         static const unsigned char swizzle02[] = {
+            0, 2,
+            LP_BLD_SWIZZLE_DONTCARE, LP_BLD_SWIZZLE_DONTCARE
          };
 
+         /*
+          * scale the s/t/r coords pre-select/mirror so we can calculate
+          * "reasonable" derivs.
+          */
+         ma = lp_build_select(coord_bld, as_ge_at, s, t);
+         ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
+         ima = lp_build_cube_imapos(coord_bld, ma);
+         s = lp_build_mul(coord_bld, s, ima);
+         t = lp_build_mul(coord_bld, t, ima);
+         r = lp_build_mul(coord_bld, r, ima);
+
+         /*
+          * This isn't quite the same as the "ordinary" (3d deriv) path since we
+          * know the texture is square which simplifies things (we can omit the
+          * size mul which happens very early completely here and do it at the
+          * very end).
+          */
          ddx_ddy[0] = lp_build_packed_ddx_ddy_twocoord(coord_bld, s, t);
          ddx_ddy[1] = lp_build_packed_ddx_ddy_onecoord(coord_bld, r);
-         ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
-         ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
 
-         tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle21);
-         tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle22);
-         dmax[0] = lp_build_max(coord_bld, tmp[0], tmp[1]);
-         dmax[1] = lp_build_swizzle_aos(coord_bld, dmax[0], swizzle12);
+         if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
+            ddx_ddy[0] = lp_build_mul(coord_bld, ddx_ddy[0], ddx_ddy[0]);
+            ddx_ddy[1] = lp_build_mul(coord_bld, ddx_ddy[1], ddx_ddy[1]);
+         }
+         else {
+            ddx_ddy[0] = lp_build_abs(coord_bld, ddx_ddy[0]);
+            ddx_ddy[1] = lp_build_abs(coord_bld, ddx_ddy[1]);
+         }
 
-         tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle11);
-         tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle12);
-         dmax[2] = lp_build_max(coord_bld, tmp[0], tmp[1]);
-      }
-      else if (need_derivs) {
-         LLVMValueRef abs_ddx[3], abs_ddy[3];
-         abs_ddx[0] = lp_build_abs(coord_bld, derivs->ddx[0]);
-         abs_ddx[1] = lp_build_abs(coord_bld, derivs->ddx[1]);
-         abs_ddx[2] = lp_build_abs(coord_bld, derivs->ddx[2]);
-         abs_ddy[0] = lp_build_abs(coord_bld, derivs->ddy[0]);
-         abs_ddy[1] = lp_build_abs(coord_bld, derivs->ddy[1]);
-         abs_ddy[2] = lp_build_abs(coord_bld, derivs->ddy[2]);
-         dmax[0] = lp_build_max(coord_bld, abs_ddx[0], abs_ddy[0]);
-         dmax[1] = lp_build_max(coord_bld, abs_ddx[1], abs_ddy[1]);
-         dmax[2] = lp_build_max(coord_bld, abs_ddx[2], abs_ddy[2]);
+         tmp[0] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle01);
+         tmp[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[0], swizzle23);
+         tmp[2] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
+
+         if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
+            rho_vec = lp_build_add(coord_bld, tmp[0], tmp[1]);
+            rho_vec = lp_build_add(coord_bld, rho_vec, tmp[2]);
+         }
+         else {
+            rho_vec = lp_build_max(coord_bld, tmp[0], tmp[1]);
+            rho_vec = lp_build_max(coord_bld, rho_vec, tmp[2]);
+         }
+
+         tmp[0] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
+         tmp[1] = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
+         *rho = lp_build_max(coord_bld, tmp[0], tmp[1]);
       }
 
       si = LLVMBuildBitCast(builder, s, lp_build_vec_type(gallivm, intctype), "");
       ti = LLVMBuildBitCast(builder, t, lp_build_vec_type(gallivm, intctype), "");
       ri = LLVMBuildBitCast(builder, r, lp_build_vec_type(gallivm, intctype), "");
-
-      /*
-       * get absolute value (for x/y/z face selection) and sign bit
-       * (for mirroring minor coords and pos/neg face selection)
-       * of the original coords.
-       */
-      as = lp_build_abs(&bld->coord_bld, s);
-      at = lp_build_abs(&bld->coord_bld, t);
-      ar = lp_build_abs(&bld->coord_bld, r);
       signs = LLVMBuildAnd(builder, si, signmask, "");
       signt = LLVMBuildAnd(builder, ti, signmask, "");
       signr = LLVMBuildAnd(builder, ri, signmask, "");
 
       /*
-       * major face determination: select x if x > y else select y
-       * select z if z >= max(x,y) else select previous result
-       * if some axis are the same we chose z over y, y over x - the
-       * dx10 spec seems to ask for it while OpenGL doesn't care (if we
-       * wouldn't care could save a select or two if using different
-       * compares and doing at_g_as_ar last since tnewx and tnewz are the
-       * same).
-       */
-      as_ge_at = lp_build_cmp(coord_bld, PIPE_FUNC_GREATER, as, at);
-      maxasat = lp_build_max(coord_bld, as, at);
-      ar_ge_as_at = lp_build_cmp(coord_bld, PIPE_FUNC_GEQUAL, ar, maxasat);
-
-      /*
        * compute all possible new s/t coords
        * snewx = signs * -r;
        * tnewx = -t;
@@ -1510,23 +1533,19 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
        */
 
       /* select/mirror */
+      if (!need_derivs) {
+         ma = lp_build_select(coord_bld, as_ge_at, s, t);
+      }
       *face_s = lp_build_select(cint_bld, as_ge_at, snewx, snewy);
       *face_t = lp_build_select(cint_bld, as_ge_at, tnewx, tnewy);
-      ma = lp_build_select(coord_bld, as_ge_at, s, t);
       *face = lp_build_select(cint_bld, as_ge_at, facex, facey);
-      if (need_derivs) {
-         dmaxsnew = lp_build_select(coord_bld, as_ge_at, dmax[2], dmax[0]);
-         dmaxtnew = lp_build_select(coord_bld, as_ge_at, dmax[1], dmax[2]);
-      }
 
+      if (!need_derivs) {
+         ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
+      }
       *face_s = lp_build_select(cint_bld, ar_ge_as_at, snewz, *face_s);
       *face_t = lp_build_select(cint_bld, ar_ge_as_at, tnewz, *face_t);
-      ma = lp_build_select(coord_bld, ar_ge_as_at, r, ma);
       *face = lp_build_select(cint_bld, ar_ge_as_at, facez, *face);
-      if (need_derivs) {
-         dmaxsnew = lp_build_select(coord_bld, ar_ge_as_at, dmax[0], dmaxsnew);
-         dmaxtnew = lp_build_select(coord_bld, ar_ge_as_at, dmax[1], dmaxtnew);
-      }
 
       *face_s = LLVMBuildBitCast(builder, *face_s,
                                lp_build_vec_type(gallivm, coord_bld->type), "");
@@ -1542,26 +1561,15 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
       signma = LLVMBuildLShr(builder, mai, signshift, "");
       *face = LLVMBuildOr(builder, *face, signma, "face");
 
-      ima = lp_build_cube_imapos(coord_bld, ma);
-
       /* project coords */
-      *face_s = lp_build_mul(coord_bld, *face_s, ima);
+      if (!need_derivs) {
+         ima = lp_build_cube_imapos(coord_bld, ma);
+         *face_s = lp_build_mul(coord_bld, *face_s, ima);
+         *face_t = lp_build_mul(coord_bld, *face_t, ima);
+      }
+
       *face_s = lp_build_add(coord_bld, *face_s, posHalf);
-      *face_t = lp_build_mul(coord_bld, *face_t, ima);
       *face_t = lp_build_add(coord_bld, *face_t, posHalf);
-
-      /* project derivs */
-      if (need_derivs) {
-         /*
-          * we do some optimization here, since we know it's square
-          * we can do the max before projection (and before size mul,
-          * which the so-called "rho" is missing here).
-          * For explicit derivs this is fully per-pixel vector, for implicit
-          * derivs only the first value per quad contains useful values.
-          */
-         *rho = lp_build_max(coord_bld, dmaxsnew, dmaxtnew);
-         *rho = lp_build_mul(coord_bld, *rho, ima);
-      }
    }
 
    else {
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index 72af813..cde8ce9 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -437,7 +437,8 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
                      LLVMValueRef *face,
                      LLVMValueRef *face_s,
                      LLVMValueRef *face_t,
-                     LLVMValueRef *rho);
+                     LLVMValueRef *rho,
+                     boolean need_derivs);
 
 
 void
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index d2cc0f3..ced2103 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -1102,7 +1102,13 @@ lp_build_sample_common(struct lp_build_sample_context *bld,
     */
    if (target == PIPE_TEXTURE_CUBE) {
       LLVMValueRef face, face_s, face_t;
-      lp_build_cube_lookup(bld, *s, *t, *r, derivs, &face, &face_s, &face_t, &cube_rho);
+      boolean need_derivs;
+      need_derivs = ((min_filter != mag_filter ||
+                      mip_filter != PIPE_TEX_MIPFILTER_NONE) &&
+                      !bld->static_sampler_state->min_max_lod_equal &&
+                      !explicit_lod);
+      lp_build_cube_lookup(bld, *s, *t, *r, derivs, &face, &face_s, &face_t,
+                           &cube_rho, need_derivs);
       *s = face_s; /* vec */
       *t = face_t; /* vec */
       /* use 'r' to indicate cube face */