This also adds some code to handle per-quad LODs for fetches wider than 4, because otherwise I'd have to integrate the texelFetch function into the splitting code (it is not used outside texelFetch yet, though). Passes piglit fs-texelFetch-2D; fails fs-texelFetchOffset-2D due to what I believe is a test error (results are undefined for out-of-bounds fetches: we return whatever is at offset 0, whereas the test expects [0,0,0,1]). Texel offsets are only handled by texelFetch for now, though the interface can handle them for everything. Reviewed-by: José Fonseca <jfonseca@vmware.com>

12 years ago · 0b6554ba6f
--- a/src/gallium/auxiliary/draw/draw_llvm_sample.c
+++ b/src/gallium/auxiliary/draw/draw_llvm_sample.c
@@ -171,9 +171,10 @@ static void
 draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
                                       struct gallivm_state *gallivm,
                                       struct lp_type type,
                                       boolean is_fetch,
                                       unsigned unit,
                                       unsigned num_coords,
                                       const LLVMValueRef *coords,
                                       const LLVMValueRef *offsets,
                                       const struct lp_derivatives *derivs,
                                       LLVMValueRef lod_bias, /* optional */
                                       LLVMValueRef explicit_lod, /* optional */
@@ -187,8 +188,10 @@ draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
                       &sampler->dynamic_state.static_state[unit],
                       &sampler->dynamic_state.base,
                       type,
                       is_fetch,
                       unit,
                       num_coords, coords,
                       coords,
                       offsets,
                       derivs,
                       lod_bias, explicit_lod,
                       texel);
@@ -213,7 +216,7 @@ draw_llvm_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base,
   lp_build_size_query_soa(gallivm,
                           &sampler->dynamic_state.static_state[unit],
                           &sampler->dynamic_state.base,
 			   type,
                           type,
                           unit,
                           explicit_lod,
                           sizes_out);
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -186,8 +186,8 @@ lp_build_rho(struct lp_build_sample_context *bld,
             const struct lp_derivatives *derivs)
 {
   struct gallivm_state *gallivm = bld->gallivm;
   struct lp_build_context *int_size_bld = &bld->int_size_bld;
   struct lp_build_context *float_size_bld = &bld->float_size_bld;
   struct lp_build_context *int_size_bld = &bld->int_size_in_bld;
   struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
   struct lp_build_context *float_bld = &bld->float_bld;
   struct lp_build_context *coord_bld = &bld->coord_bld;
   struct lp_build_context *perquadf_bld = &bld->perquadf_bld;
@@ -316,7 +316,7 @@ lp_build_rho(struct lp_build_sample_context *bld,
         }
      }
      rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
                                      perquadf_bld->type, rho);
                                      perquadf_bld->type, rho, 0);
   }
   else {
      if (dims <= 1) {
@@ -517,7 +517,7 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
   else {
      if (explicit_lod) {
         lod = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
                                         perquadf_bld->type, explicit_lod);
                                         perquadf_bld->type, explicit_lod, 0);
      }
      else {
         LLVMValueRef rho;
@@ -562,7 +562,7 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
         /* add shader lod bias */
         if (lod_bias) {
            lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
                  perquadf_bld->type, lod_bias);
                  perquadf_bld->type, lod_bias, 0);
            lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
         }
      }
@@ -725,7 +725,6 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,

 /**
 * Return pointer to a single mipmap level.
 * \param data_array  array of pointers to mipmap levels
 * \param level  integer mipmap level
 */
 LLVMValueRef
@@ -743,6 +742,55 @@ lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
   return data_ptr;
 }

 /**
 * Build the vector of offsets to the selected mip level(s).
 *
 * How \p level is interpreted depends on bld->num_lods:
 *  - 1:           a single scalar level, whose offset is broadcast;
 *  - length / 4:  one level per quad (element i of \p level feeds quad i);
 *  - length:      one level per pixel.
 * Offsets are loaded from bld->mip_offsets.
 *
 * \param level  integer mipmap level(s)
 * \return the offsets, built with bld->int_coord_bld
 */
 LLVMValueRef
 lp_build_get_mip_offsets(struct lp_build_sample_context *bld,
                         LLVMValueRef level)
 {
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context *icoord_bld = &bld->int_coord_bld;
   const int per_quad = (bld->num_lods == bld->coord_bld.type.length / 4);
   LLVMValueRef gep_idx[2];
   LLVMValueRef result, slot;
   unsigned lane_step, n;

   gep_idx[0] = lp_build_const_int32(gallivm, 0);

   /* Single lod: one load, then splat it. */
   if (bld->num_lods == 1) {
      gep_idx[1] = level;
      slot = LLVMBuildGEP(builder, bld->mip_offsets, gep_idx, 2, "");
      slot = LLVMBuildLoad(builder, slot, "");
      return lp_build_broadcast_scalar(icoord_bld, slot);
   }

   if (!per_quad) {
      assert (bld->num_lods == bld->coord_bld.type.length);
   }

   /*
    * Per-quad values land in the first lane of each group of four and are
    * spread over the group afterwards; per-pixel values go to their own lane.
    */
   lane_step = per_quad ? 4 : 1;

   result = icoord_bld->undef;
   for (n = 0; n < bld->num_lods; n++) {
      LLVMValueRef src_lane = lp_build_const_int32(gallivm, n);
      LLVMValueRef dst_lane = lp_build_const_int32(gallivm, lane_step * n);

      gep_idx[1] = LLVMBuildExtractElement(builder, level, src_lane, "");
      slot = LLVMBuildGEP(builder, bld->mip_offsets, gep_idx, 2, "");
      slot = LLVMBuildLoad(builder, slot, "");
      result = LLVMBuildInsertElement(builder, result, slot, dst_lane, "");
   }

   if (per_quad) {
      result = lp_build_swizzle_scalar_aos(icoord_bld, result, 0);
   }
   return result;
 }


 /**
 * Codegen equivalent for u_minify().
@@ -780,12 +828,44 @@ lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
                              LLVMValueRef stride_array, LLVMValueRef level)
 {
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef indexes[2], stride;
   LLVMValueRef indexes[2], stride, stride1;
   indexes[0] = lp_build_const_int32(bld->gallivm, 0);
   indexes[1] = level;
   stride = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
   stride = LLVMBuildLoad(builder, stride, "");
   stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride);
   if (bld->num_lods == 1) {
      indexes[1] = level;
      stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
      stride1 = LLVMBuildLoad(builder, stride1, "");
      stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride1);
   }
   else if (bld->num_lods == bld->coord_bld.type.length / 4) {
      LLVMValueRef stride1;
      unsigned i;

      /*
       * Per-quad levels: load one stride per quad and place it in the
       * first lane of that quad's group of four (lane 4 * i), the same
       * layout lp_build_get_mip_offsets() builds before the identical
       * swizzle. Writing to lane i instead would leave lanes 4, 8, ...
       * undefined, which the swizzle below (presumably replicating
       * element 0 of each group of four) would then spread over every
       * quad but the first.
       */
      stride = bld->int_coord_bld.undef;
      for (i = 0; i < bld->num_lods; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         LLVMValueRef indexo = lp_build_const_int32(bld->gallivm, 4 * i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
         stride1 = LLVMBuildLoad(builder, stride1, "");
         stride = LLVMBuildInsertElement(builder, stride, stride1, indexo, "");
      }
      stride = lp_build_swizzle_scalar_aos(&bld->int_coord_bld, stride, 0);
   }
   else {
      LLVMValueRef stride1;
      unsigned i;

      assert (bld->num_lods == bld->coord_bld.type.length);

      stride = bld->int_coord_bld.undef;
      for (i = 0; i < bld->coord_bld.type.length; i++) {
         LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
         indexes[1] = LLVMBuildExtractElement(builder, level, indexi, "");
         stride1 = LLVMBuildGEP(builder, stride_array, indexes, 2, "");
         stride1 = LLVMBuildLoad(builder, stride1, "");
         stride = LLVMBuildInsertElement(builder, stride, stride1, indexi, "");
      }
   }
   return stride;
 }

@@ -805,12 +885,102 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
   const unsigned dims = bld->dims;
   LLVMValueRef ilevel_vec;

   ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel);

   /*
    * Compute width, height, depth at mipmap level 'ilevel'
    */
   *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec);
   if (bld->num_lods == 1) {
      ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel);
      *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec);
   }
   else {
      LLVMValueRef int_size_vec;
      LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
      unsigned num_quads = bld->coord_bld.type.length / 4;
      unsigned i;

      if (bld->num_lods == num_quads) {
         /*
          * XXX: this should be #ifndef SANE_INSTRUCTION_SET.
          * intel "forgot" the variable shift count instruction until avx2.
          * A harmless 8x32 shift gets translated into 32 instructions
          * (16 extracts, 8 scalar shifts, 8 inserts), llvm is apparently
          * unable to recognize if there are really just 2 different shift
          * count values. So do the shift 4-wide before expansion.
          */
         struct lp_build_context bld4;
         struct lp_type type4;

         type4 = bld->int_coord_bld.type;
         type4.length = 4;

         lp_build_context_init(&bld4, bld->gallivm, type4);

         if (bld->dims == 1) {
            assert(bld->int_size_in_bld.type.length == 1);
            int_size_vec = lp_build_broadcast_scalar(&bld4,
                                                     bld->int_size);
         }
         else {
            assert(bld->int_size_in_bld.type.length == 4);
            int_size_vec = bld->int_size;
         }

         for (i = 0; i < num_quads; i++) {
            LLVMValueRef ileveli;
            LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);

            ileveli = lp_build_extract_broadcast(bld->gallivm,
                                                 bld->perquadi_bld.type,
                                                 bld4.type,
                                                 ilevel,
                                                 indexi);
            tmp[i] = lp_build_minify(&bld4, int_size_vec, ileveli);
         }
         /*
          * out_size is [w0, h0, d0, _, w1, h1, d1, _, ...] vector for dims > 1,
          * [w0, w0, w0, w0, w1, w1, w1, w1, ...] otherwise.
          */
         *out_size = lp_build_concat(bld->gallivm,
                                     tmp,
                                     bld4.type,
                                     num_quads);
      }
      else {
         /*
          * Per-pixel lod: every pixel selects its own mip level.
          *
          * FIXME: this is terrible and results in _huge_ vector
          * (for the dims > 1 case).
          * Should refactor this (together with extract_image_sizes) and do
          * something more useful. Could for instance if we have width,height
          * with 4-wide vector pack all elements into a 8xi16 vector
          * (on which we can still do useful math) instead of using a 16xi32
          * vector.
          * FIXME: some callers can't handle this yet.
          * For dims == 1 this will create [w0, w1, w2, w3, ...] vector.
          * For dims > 1 this will create [w0, h0, d0, _, w1, h1, d1, _, ...] vector.
          */
         assert(bld->num_lods == bld->coord_bld.type.length);
         if (bld->dims == 1) {
            /*
             * Check the incoming size type, as the per-quad branch does:
             * it is bld->int_size that must be scalar here, while
             * int_size_bld describes the (wider) per-lod result.
             */
            assert(bld->int_size_in_bld.type.length == 1);
            int_size_vec = lp_build_broadcast_scalar(&bld->int_coord_bld,
                                                     bld->int_size);
            /* vector shift with variable shift count alert... */
            *out_size = lp_build_minify(&bld->int_coord_bld, int_size_vec, ilevel);
         }
         else {
            LLVMValueRef ilevel1;
            /* Minify the incoming size once per pixel, each with its own level. */
            for (i = 0; i < bld->num_lods; i++) {
               LLVMValueRef indexi = lp_build_const_int32(bld->gallivm, i);
               ilevel1 = lp_build_extract_broadcast(bld->gallivm, bld->int_coord_type,
                                                    bld->int_size_in_bld.type, ilevel, indexi);
               tmp[i] = lp_build_minify(&bld->int_size_in_bld, bld->int_size, ilevel1);
            }
            /*
             * The concatenated sizes are this function's result; they must be
             * stored through out_size (not just kept in a local), otherwise
             * the caller reads an unset value.
             */
            *out_size = lp_build_concat(bld->gallivm,
                                        tmp,
                                        bld->int_size_in_bld.type,
                                        bld->num_lods);
         }
      }
   }

   if (dims >= 2) {
      *row_stride_vec = lp_build_get_level_stride_vec(bld,
@@ -836,7 +1006,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
 */
 void
 lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
                             struct lp_type size_type,
                             struct lp_build_context *size_bld,
                             struct lp_type coord_type,
                             LLVMValueRef size,
                             LLVMValueRef *out_width,
@@ -845,24 +1015,56 @@ lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
 {
   const unsigned dims = bld->dims;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
   struct lp_type size_type = size_bld->type;

   if (bld->num_lods == 1) {
      *out_width = lp_build_extract_broadcast(bld->gallivm,
                                              size_type,
                                              coord_type,
                                              size,
                                              LLVMConstInt(i32t, 0, 0));
      if (dims >= 2) {
         *out_height = lp_build_extract_broadcast(bld->gallivm,
                                                  size_type,
                                                  coord_type,
                                                  size,
                                                  LLVMConstInt(i32t, 1, 0));
         if (dims == 3) {
            *out_depth = lp_build_extract_broadcast(bld->gallivm,
                                                    size_type,
                                                    coord_type,
                                                    size,
                                                    LLVMConstInt(i32t, 2, 0));
         }
      }
   }
   else {
      unsigned num_quads = bld->coord_bld.type.length / 4;

   *out_width = lp_build_extract_broadcast(bld->gallivm,
                                           size_type,
                                           coord_type,
                                           size,
                                           LLVMConstInt(i32t, 0, 0));
   if (dims >= 2) {
      *out_height = lp_build_extract_broadcast(bld->gallivm,
                                               size_type,
                                               coord_type,
                                               size,
                                               LLVMConstInt(i32t, 1, 0));
      if (dims == 3) {
         *out_depth = lp_build_extract_broadcast(bld->gallivm,
                                                 size_type,
                                                 coord_type,
                                                 size,
                                                 LLVMConstInt(i32t, 2, 0));
      if (dims == 1) {
         *out_width = size;
      }
      else if (bld->num_lods == num_quads) {
         *out_width = lp_build_swizzle_scalar_aos(size_bld, size, 0);
         if (dims >= 2) {
            *out_height = lp_build_swizzle_scalar_aos(size_bld, size, 1);
            if (dims == 3) {
               *out_depth = lp_build_swizzle_scalar_aos(size_bld, size, 2);
            }
         }
      }
      else {
         /*
          * Per-pixel lod: size holds one group of four per pixel, so each
          * dimension is gathered from its own channel of every group
          * (0 = width, 1 = height, 2 = depth), matching the channel
          * numbering used by the per-quad branch.
          */
         assert(bld->num_lods == bld->coord_type.length);
         *out_width = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                coord_type, size, 0);
         if (dims >= 2) {
            *out_height = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                    coord_type, size, 1);
            if (dims == 3) {
               *out_depth = lp_build_pack_aos_scalars(bld->gallivm, size_type,
                                                      coord_type, size, 2);
            }
         }
      }
   }
 }
@@ -886,7 +1088,7 @@ lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
   LLVMValueRef depth;

   lp_build_extract_image_sizes(bld,
                                bld->float_size_type,
                                &bld->float_size_bld,
                                bld->coord_type,
                                flt_size,
                                &width,
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -210,6 +210,9 @@ struct lp_build_sample_context
   /** SIMD vector width */
   unsigned vector_width;

   /** number of lod values (valid are 1, length/4, length) */
   unsigned num_lods;

   /** regular scalar float type */
   struct lp_type float_type;
   struct lp_build_context float_bld;
@@ -230,10 +233,18 @@ struct lp_build_sample_context
   struct lp_build_context int_coord_bld;

   /** Unsigned integer texture size */
   struct lp_type int_size_in_type;
   struct lp_build_context int_size_in_bld;

   /** Float incoming texture size */
   struct lp_type float_size_in_type;
   struct lp_build_context float_size_in_bld;

   /** Unsigned integer texture size (might be per quad) */
   struct lp_type int_size_type;
   struct lp_build_context int_size_bld;

   /** Unsigned integer texture size */
   /** Float texture size (might be per quad) */
   struct lp_type float_size_type;
   struct lp_build_context float_size_bld;

@@ -298,6 +309,7 @@ texture_dims(enum pipe_texture_target tex)
 {
   switch (tex) {
   case PIPE_TEXTURE_1D:
   case PIPE_BUFFER:
      return 1;
   case PIPE_TEXTURE_2D:
   case PIPE_TEXTURE_RECT:
@@ -355,6 +367,11 @@ lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
                          LLVMValueRef level);


 LLVMValueRef
 lp_build_get_mip_offsets(struct lp_build_sample_context *bld,
                         LLVMValueRef level);


 void
 lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
                            LLVMValueRef ilevel,
@@ -365,7 +382,7 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,

 void
 lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
                             struct lp_type size_type,
                             struct lp_build_context *size_bld,
                             struct lp_type coord_type,
                             LLVMValueRef size,
                             LLVMValueRef *out_width,
@@ -418,9 +435,10 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
                    const struct lp_sampler_static_state *static_state,
                    struct lp_sampler_dynamic_state *dynamic_state,
                    struct lp_type fp_type,
                    boolean is_fetch,
                    unsigned unit,
                    unsigned num_coords,
                    const LLVMValueRef *coords,
                    const LLVMValueRef *offsets,
                    const struct lp_derivatives *derivs,
                    LLVMValueRef lod_bias,
                    LLVMValueRef explicit_lod,
@@ -448,7 +466,6 @@ lp_build_size_query_soa(struct gallivm_state *gallivm,
 void
 lp_build_sample_nop(struct gallivm_state *gallivm, 
                    struct lp_type type,
                    unsigned num_coords,
                    const LLVMValueRef *coords,
                    LLVMValueRef texel_out[4]);

--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -539,7 +539,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
   i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type);

   lp_build_extract_image_sizes(bld,
                                bld->int_size_type,
                                &bld->int_size_bld,
                                bld->int_coord_type,
                                int_size,
                                &width_vec,
@@ -661,7 +661,7 @@ lp_build_sample_image_nearest_afloat(struct lp_build_sample_context *bld,
   flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);

   lp_build_extract_image_sizes(bld,
                                bld->float_size_type,
                                &bld->float_size_bld,
                                bld->coord_type,
                                flt_size,
                                &width_vec,
@@ -994,7 +994,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
   i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type);

   lp_build_extract_image_sizes(bld,
                                bld->int_size_type,
                                &bld->int_size_bld,
                                bld->int_coord_type,
                                int_size,
                                &width_vec,
@@ -1175,7 +1175,7 @@ lp_build_sample_image_linear_afloat(struct lp_build_sample_context *bld,
   flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);

   lp_build_extract_image_sizes(bld,
                                bld->float_size_type,
                                &bld->float_size_bld,
                                bld->coord_type,
                                flt_size,
                                &width_vec,
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -610,7 +610,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
   LLVMValueRef x, y, z;

   lp_build_extract_image_sizes(bld,
                                bld->int_size_type,
                                &bld->int_size_bld,
                                bld->int_coord_type,
                                size,
                                &width_vec, &height_vec, &depth_vec);
@@ -618,7 +618,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
   flt_size = lp_build_int_to_float(&bld->float_size_bld, size);

   lp_build_extract_image_sizes(bld,
                                bld->float_size_type,
                                &bld->float_size_bld,
                                bld->coord_type,
                                flt_size,
                                &flt_width_vec, &flt_height_vec, &flt_depth_vec);
@@ -695,7 +695,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
   int chan;

   lp_build_extract_image_sizes(bld,
                                bld->int_size_type,
                                &bld->int_size_bld,
                                bld->int_coord_type,
                                size,
                                &width_vec, &height_vec, &depth_vec);
@@ -703,7 +703,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
   flt_size = lp_build_int_to_float(&bld->float_size_bld, size);

   lp_build_extract_image_sizes(bld,
                                bld->float_size_type,
                                &bld->float_size_bld,
                                bld->coord_type,
                                flt_size,
                                &flt_width_vec, &flt_height_vec, &flt_depth_vec);
@@ -1157,6 +1157,120 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
 }


 /**
 * Texel fetch function.
 * In contrast to general sampling there is no filtering, no coord minification,
 * lod (if any) is always explicit uint, coords are uints (in terms of texel units)
 * directly to be applied to the selected mip level (after adding texel offsets).
 * This function handles texel fetch for all targets where texel fetch is supported
 * (no cube maps, but 1d, 2d, 3d are supported, arrays and buffers should be too).
 *
 * Note that this function modifies \p bld: num_lods is always rewritten, and
 * when an explicit lod is used on a non-buffer target the size types and
 * their build contexts are re-initialized as well.
 *
 * \param bld           sampler build context
 * \param unit          texture unit, passed on to lp_build_nearest_mip_level()
 * \param coords        integer texel coordinates; entries 0..2 are all read
 *                      here regardless of the texture dimensionality
 * \param explicit_lod  optional lod; ignored for PIPE_BUFFER targets
 * \param offsets       texel offsets; the array itself is always indexed,
 *                      individual entries may be NULL (no offset)
 * \param colors_out    receives the fetched channels
 */
 static void
 lp_build_fetch_texel(struct lp_build_sample_context *bld,
                     unsigned unit,
                     const LLVMValueRef *coords,
                     LLVMValueRef explicit_lod,
                     const LLVMValueRef *offsets,
                     LLVMValueRef *colors_out)
 {
   struct lp_build_context *perquadi_bld = &bld->perquadi_bld;
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   unsigned dims = bld->dims, chan;
   LLVMValueRef size, ilevel;
   LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
   LLVMValueRef x = coords[0], y = coords[1], z = coords[2];
   LLVMValueRef width, height, depth, i, j;
   LLVMValueRef offset, out_of_bounds, out1;

   /* XXX just like ordinary sampling, we don't handle per-pixel lod (yet). */
   if (explicit_lod && bld->static_state->target != PIPE_BUFFER) {
      /* could also avoid this if there are no mipmaps */
      /* XXX temporary hack until ordinary sampling handles per-quad lod the same */
      /*
       * Switch the context to one lod per quad. With more than one lod the
       * size vectors become as long as the coord vectors, so the size types
       * and their build contexts have to be set up again.
       */
      bld->num_lods = bld->coord_type.length / 4;
      bld->float_size_type = bld->float_size_in_type;
      bld->float_size_type.length = bld->num_lods > 1 ? bld->coord_type.length :
                                      bld->float_size_in_type.length;
      bld->int_size_type = lp_int_type(bld->float_size_type);
      lp_build_context_init(&bld->int_size_bld, bld->gallivm, bld->int_size_type);
      lp_build_context_init(&bld->float_size_bld, bld->gallivm, bld->float_size_type);

      /*
       * Reduce the explicit lod from the coord type to the per-quad integer
       * type (channel 0 is selected), then let lp_build_nearest_mip_level()
       * turn it into the level to use (NOTE(review): presumably this clamps
       * to the valid level range -- confirm against that helper).
       */
      ilevel = lp_build_pack_aos_scalars(bld->gallivm, int_coord_bld->type,
                                         perquadi_bld->type, explicit_lod, 0);
      lp_build_nearest_mip_level(bld, unit, ilevel, &ilevel);
   }
   else {
      /* No lod, or a buffer: a single lod, fixed at level 0. */
      bld->num_lods = 1;
      ilevel = lp_build_const_int32(bld->gallivm, 0);
   }
   /* Sizes and strides of the selected level(s), split into per-dimension vectors. */
   lp_build_mipmap_level_sizes(bld, ilevel,
                               &size,
                               &row_stride_vec, &img_stride_vec);
   lp_build_extract_image_sizes(bld, &bld->int_size_bld, int_coord_bld->type,
                                size, &width, &height, &depth);

   /* This is a lot like border sampling */
   /*
    * For each dimension in use: add the optional texel offset, then OR into
    * out_of_bounds the results of (coord < 0) and (coord >= level size).
    */
   if (offsets[0]) {
      /* XXX coords are really unsigned, offsets are signed */
      x = lp_build_add(int_coord_bld, x, offsets[0]);
   }
   out_of_bounds = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
   out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
   out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);

   if (dims >= 2) {
      if (offsets[1]) {
         y = lp_build_add(int_coord_bld, y, offsets[1]);
      }
      out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
      out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
      out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
      out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);

      if (dims >= 3) {
         if (offsets[2]) {
            z = lp_build_add(int_coord_bld, z, offsets[2]);
         }
         out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
         out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
         out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
         out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
      }
   }

   /* Offset of the texel within its level (plus the i/j outputs used by the fetch). */
   lp_build_sample_offset(int_coord_bld,
                          bld->format_desc,
                          x, y, z, row_stride_vec, img_stride_vec,
                          &offset, &i, &j);

   /* Buffers have no mip levels, so only other targets add the level offset. */
   if (bld->static_state->target != PIPE_BUFFER) {
      offset = lp_build_add(int_coord_bld, offset,
                            lp_build_get_mip_offsets(bld, ilevel));
   }

   /*
    * Mask the offset with the out-of-bounds result so that out-of-bounds
    * lanes fetch from offset 0 instead of an arbitrary address
    * (NOTE(review): relies on lp_build_andnot(a, b) computing a & ~b --
    * confirm).
    */
   offset = lp_build_andnot(int_coord_bld, offset, out_of_bounds);

   lp_build_fetch_rgba_soa(bld->gallivm,
                           bld->format_desc,
                           bld->texel_type,
                           bld->base_ptr, offset,
                           i, j,
                           colors_out);

   /* Deliberately disabled: would force out-of-bounds results to zero. */
   if (0) {
      /*
       * Not needed except for ARB_robust_buffer_access_behavior.
       * Could use min/max above instead of out-of-bounds comparisons
       * (in fact cast to unsigned and min only is sufficient)
       * if we don't care about the result returned for out-of-bounds.
       */
      for (chan = 0; chan < 4; chan++) {
         colors_out[chan] = lp_build_select(&bld->texel_bld, out_of_bounds,
                                            bld->texel_bld.zero, colors_out[chan]);
      }
   }
 }


 /**
 * Do shadow test/comparison.
 * \param p  the texcoord Z (aka R, aka P) component
@@ -1209,7 +1323,6 @@ lp_build_sample_compare(struct lp_build_sample_context *bld,
 void
 lp_build_sample_nop(struct gallivm_state *gallivm,
                    struct lp_type type,
                    unsigned num_coords,
                    const LLVMValueRef *coords,
                    LLVMValueRef texel_out[4])
 {
@@ -1227,6 +1340,7 @@ lp_build_sample_nop(struct gallivm_state *gallivm,
 * 'texel' will return a vector of four LLVMValueRefs corresponding to
 * R, G, B, A.
 * \param type  vector float type to use for coords, etc.
 * \param is_fetch  if this is a texel fetch instruction.
 * \param derivs  partial derivatives of (s,t,r,q) with respect to x and y
 */
 void
@@ -1234,9 +1348,10 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
                    const struct lp_sampler_static_state *static_state,
                    struct lp_sampler_dynamic_state *dynamic_state,
                    struct lp_type type,
                    boolean is_fetch,
                    unsigned unit,
                    unsigned num_coords,
                    const LLVMValueRef *coords,
                    const LLVMValueRef *offsets,
                    const struct lp_derivatives *derivs,
                    LLVMValueRef lod_bias, /* optional */
                    LLVMValueRef explicit_lod, /* optional */
@@ -1272,20 +1387,28 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
   bld.int_type = lp_type_int(32);
   bld.coord_type = type;
   bld.int_coord_type = lp_int_type(type);
   bld.float_size_type = lp_type_float(32);
   bld.float_size_type.length = dims > 1 ? 4 : 1;
   bld.int_size_type = lp_int_type(bld.float_size_type);
   bld.float_size_in_type = lp_type_float(32);
   bld.float_size_in_type.length = dims > 1 ? 4 : 1;
   bld.int_size_in_type = lp_int_type(bld.float_size_in_type);
   bld.texel_type = type;
   bld.perquadf_type = type;
   /* we want native vector size to be able to use our intrinsics */
   bld.perquadf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
   bld.perquadi_type = lp_int_type(bld.perquadf_type);

   bld.num_lods = 1;
   bld.float_size_type = bld.float_size_in_type;
   bld.float_size_type.length = bld.num_lods > 1 ? type.length :
                                   bld.float_size_in_type.length;
   bld.int_size_type = lp_int_type(bld.float_size_type);

   lp_build_context_init(&bld.float_bld, gallivm, bld.float_type);
   lp_build_context_init(&bld.float_vec_bld, gallivm, type);
   lp_build_context_init(&bld.int_bld, gallivm, bld.int_type);
   lp_build_context_init(&bld.coord_bld, gallivm, bld.coord_type);
   lp_build_context_init(&bld.int_coord_bld, gallivm, bld.int_coord_type);
   lp_build_context_init(&bld.int_size_in_bld, gallivm, bld.int_size_in_type);
   lp_build_context_init(&bld.float_size_in_bld, gallivm, bld.float_size_in_type);
   lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type);
   lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type);
   lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type);
@@ -1311,7 +1434,7 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
      bld.int_size = tex_width;
   }
   else {
      bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_bld.undef,
      bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_in_bld.undef,
                                            tex_width, LLVMConstInt(i32t, 0, 0), "");
      if (dims >= 2) {
         bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
@@ -1327,7 +1450,6 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
      /* For debug: no-op texture sampling */
      lp_build_sample_nop(gallivm,
                          bld.texel_type,
                          num_coords,
                          coords,
                          texel_out);
   }
@@ -1352,6 +1474,18 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
                      static_state->wrap_t);
      }

      if (is_fetch) {
         lp_build_fetch_texel(&bld, unit, coords,
                              explicit_lod, offsets,
                              texel_out);

         if (static_state->target != PIPE_BUFFER) {
            apply_sampler_swizzle(&bld, texel_out);
         }

         return;
      }

      lp_build_sample_common(&bld, unit,
                             &s, &t, &r,
                             derivs, lod_bias, explicit_lod,
@@ -1450,20 +1584,25 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
            bld4.int_type = lp_type_int(32);
            bld4.coord_type = type4;
            bld4.int_coord_type = lp_int_type(type4);
            bld4.float_size_type = lp_type_float(32);
            bld4.float_size_type.length = dims > 1 ? 4 : 1;
            bld4.int_size_type = lp_int_type(bld4.float_size_type);
            bld4.float_size_in_type = lp_type_float(32);
            bld4.float_size_in_type.length = dims > 1 ? 4 : 1;
            bld4.int_size_in_type = lp_int_type(bld4.float_size_in_type);
            bld4.float_size_type = bld4.float_size_in_type;
            bld4.int_size_type =  bld4.int_size_in_type;
            bld4.texel_type = type4;
            bld4.perquadf_type = type4;
            /* we want native vector size to be able to use our intrinsics */
            bld4.perquadf_type.length = 1;
            bld4.perquadi_type = lp_int_type(bld4.perquadf_type);
            bld4.num_lods = 1;

            lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type);
            lp_build_context_init(&bld4.float_vec_bld, gallivm, type4);
            lp_build_context_init(&bld4.int_bld, gallivm, bld4.int_type);
            lp_build_context_init(&bld4.coord_bld, gallivm, bld4.coord_type);
            lp_build_context_init(&bld4.int_coord_bld, gallivm, bld4.int_coord_type);
            lp_build_context_init(&bld4.int_size_in_bld, gallivm, bld4.int_size_in_type);
            lp_build_context_init(&bld4.float_size_in_bld, gallivm, bld4.float_size_in_type);
            lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type);
            lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type);
            lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type);
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
@@ -554,15 +554,16 @@ lp_build_transpose_aos(struct gallivm_state *gallivm,


 /**
 * Pack first element of aos values,
 * Pack n-th element of aos values,
 * pad out to destination size.
 * i.e. x1 _ _ _ x2 _ _ _ will become x1 x2 _ _
 * i.e. x1 y1 _ _ x2 y2 _ _ will become x1 x2 _ _
 */
 LLVMValueRef
 lp_build_pack_aos_scalars(struct gallivm_state *gallivm,
                          struct lp_type src_type,
                          struct lp_type dst_type,
                          const LLVMValueRef src)
                          const LLVMValueRef src,
                          unsigned channel)
 {
   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
   LLVMValueRef undef = LLVMGetUndef(i32t);
@@ -574,7 +575,7 @@ lp_build_pack_aos_scalars(struct gallivm_state *gallivm,
   assert(num_src <= num_dst);

   for (i = 0; i < num_src; i++) {
      shuffles[i] = LLVMConstInt(i32t, i * 4, 0);
      shuffles[i] = LLVMConstInt(i32t, i * 4 + channel, 0);
   }
   for (i = num_src; i < num_dst; i++) {
      shuffles[i] = undef;
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
@@ -117,7 +117,8 @@ LLVMValueRef
 lp_build_pack_aos_scalars(struct gallivm_state *gallivm,
                          struct lp_type src_type,
                          struct lp_type dst_type,
                          const LLVMValueRef src);
                          const LLVMValueRef src,
                          unsigned channel);


 LLVMValueRef
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
@@ -334,6 +334,66 @@ lp_build_emit_fetch(

 }


 LLVMValueRef
 lp_build_emit_fetch_texoffset(
   struct lp_build_tgsi_context *bld_base,
   const struct tgsi_full_instruction *inst,
   unsigned tex_off_op,
   const unsigned chan_index)
 {
   const struct tgsi_texture_offset *off = &inst->TexOffsets[tex_off_op];
   struct tgsi_full_src_register reg;
   unsigned swizzle;
   LLVMValueRef res;
   enum tgsi_opcode_type stype = TGSI_TYPE_SIGNED;

   /* convert offset "register" to ordinary register so can use normal emit funcs */
   memset(&reg, 0, sizeof(reg));
   reg.Register.File = off->File;
   reg.Register.Index = off->Index;
   reg.Register.SwizzleX = off->SwizzleX;
   reg.Register.SwizzleY = off->SwizzleY;
   reg.Register.SwizzleZ = off->SwizzleZ;

   if (chan_index == LP_CHAN_ALL) {
      swizzle = ~0;
   } else {
      swizzle = tgsi_util_get_src_register_swizzle(&reg.Register, chan_index);
      if (swizzle > 2) {
         assert(0 && "invalid swizzle in emit_fetch_texoffset()");
         return bld_base->base.undef;
      }
   }

   assert(off->Index <= bld_base->info->file_max[off->File]);

   if (bld_base->emit_fetch_funcs[off->File]) {
      res = bld_base->emit_fetch_funcs[off->File](bld_base, &reg, stype,
                                                           swizzle);
   } else {
      assert(0 && "invalid src register in emit_fetch_texoffset()");
      return bld_base->base.undef;
   }

   /*
    * Swizzle the argument
    */

   if (swizzle == ~0) {
      res = bld_base->emit_swizzle(bld_base, res,
                                   off->SwizzleX,
                                   off->SwizzleY,
                                   off->SwizzleZ,
                                   /* there's no 4th channel */
                                   off->SwizzleX);
   }

   return res;

 }


 boolean
 lp_build_tgsi_llvm(
   struct lp_build_tgsi_context * bld_base,
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -172,9 +172,10 @@ struct lp_build_sampler_soa
   (*emit_fetch_texel)( const struct lp_build_sampler_soa *sampler,
                        struct gallivm_state *gallivm,
                        struct lp_type type,
                        boolean is_fetch,
                        unsigned unit,
                        unsigned num_coords,
                        const LLVMValueRef *coords,
                        const LLVMValueRef *offsets,
                        const struct lp_derivatives *derivs,
                        LLVMValueRef lod_bias, /* optional */
                        LLVMValueRef explicit_lod, /* optional */
@@ -555,6 +556,14 @@ lp_build_emit_fetch(
   unsigned src_op,
   const unsigned chan_index);


 LLVMValueRef
 lp_build_emit_fetch_texoffset(
   struct lp_build_tgsi_context *bld_base,
   const struct tgsi_full_instruction *inst,
   unsigned tex_off_op,
   const unsigned chan_index);

 boolean
 lp_build_tgsi_llvm(
   struct lp_build_tgsi_context * bld_base,
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -1146,7 +1146,8 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
   unsigned unit;
   LLVMValueRef lod_bias, explicit_lod;
   LLVMValueRef oow = NULL;
   LLVMValueRef coords[3];
   LLVMValueRef coords[4];
   LLVMValueRef offsets[3] = { NULL };
   struct lp_derivatives derivs;
   unsigned num_coords;
   unsigned dims;
@@ -1225,7 +1226,7 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
      if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED)
         coords[i] = lp_build_mul(&bld->bld_base.base, coords[i], oow);
   }
   for (i = num_coords; i < 3; i++) {
   for (i = num_coords; i < 4; i++) {
      coords[i] = bld->bld_base.base.undef;
   }

@@ -1285,15 +1286,111 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
      unit = inst->Src[1].Register.Index;
   }

   /* some advanced gather instructions (txgo) would require 4 offsets */
   if (inst->Texture.NumOffsets == 1) {
      unsigned dim;
      for (dim = 0; dim < dims; dim++) {
         offsets[dim] = lp_build_emit_fetch_texoffset(&bld->bld_base, inst, 0, dim );
      }
   }

   bld->sampler->emit_fetch_texel(bld->sampler,
                                  bld->bld_base.base.gallivm,
                                  bld->bld_base.base.type,
                                  unit, num_coords, coords,
                                  FALSE,
                                  unit, coords,
                                  offsets,
                                  &derivs,
                                  lod_bias, explicit_lod,
                                  texel);
 }

 /**
 * Emit code for a texel fetch (wired to TGSI_OPCODE_TXF via txf_emit()).
 *
 * Coordinates come from channels 0..num_coords-1 of source operand 0; for
 * every target except TGSI_TEXTURE_BUFFER the explicit lod is read from
 * channel 3 of the same operand. The sampler unit is the register index of
 * source operand 1. If the instruction carries exactly one texel offset,
 * one offset value per texture dimension is fetched as well. The actual
 * fetch is delegated to bld->sampler->emit_fetch_texel() with is_fetch TRUE.
 *
 * \param bld    SoA build context
 * \param inst   the texture instruction
 * \param texel  receives the four result channels; filled with undef when
 *               no sampler generator is available
 */
 static void
 emit_txf( struct lp_build_tgsi_soa_context *bld,
           const struct tgsi_full_instruction *inst,
           LLVMValueRef *texel)
 {
   const unsigned target = inst->Texture.Texture;
   LLVMValueRef undef_ivec = LLVMGetUndef(bld->bld_base.base.int_vec_type);
   LLVMValueRef lod = NULL;
   LLVMValueRef coords[3];
   LLVMValueRef offsets[3] = { NULL };
   struct lp_derivatives derivs;
   unsigned sampler_unit;
   unsigned num_coords;
   unsigned dims;
   unsigned i;

   if (!bld->sampler) {
      _debug_printf("warning: found texture instruction but no sampler generator supplied\n");
      for (i = 0; i < 4; i++) {
         texel[i] = undef_ivec;
      }
      return;
   }

   /* No real derivatives are computed here; pass undef placeholders. */
   derivs.ddx_ddy[0] = undef_ivec;
   derivs.ddx_ddy[1] = undef_ivec;

   /* Number of spatial dimensions of the target. */
   switch (target) {
   case TGSI_TEXTURE_1D:
   case TGSI_TEXTURE_BUFFER:
   case TGSI_TEXTURE_1D_ARRAY:
      dims = 1;
      break;
   case TGSI_TEXTURE_2D:
   case TGSI_TEXTURE_RECT:
   case TGSI_TEXTURE_2D_ARRAY:
      dims = 2;
      break;
   case TGSI_TEXTURE_3D:
      dims = 3;
      break;
   default:
      assert(0);
      return;
   }

   /* Array targets take one extra coordinate on top of the dimensions. */
   num_coords = dims;
   if (target == TGSI_TEXTURE_1D_ARRAY || target == TGSI_TEXTURE_2D_ARRAY) {
      num_coords++;
   }

   /* always have lod except for buffers ? */
   if (target != TGSI_TEXTURE_BUFFER) {
      lod = lp_build_emit_fetch( &bld->bld_base, inst, 0, 3 );
   }

   /* Fetch the used coordinates, pad the remaining slots with undef. */
   for (i = 0; i < 3; i++) {
      if (i < num_coords) {
         coords[i] = lp_build_emit_fetch( &bld->bld_base, inst, 0, i );
      }
      else {
         coords[i] = undef_ivec;
      }
   }

   sampler_unit = inst->Src[1].Register.Index;

   if (inst->Texture.NumOffsets == 1) {
      for (i = 0; i < dims; i++) {
         offsets[i] = lp_build_emit_fetch_texoffset(&bld->bld_base, inst, 0, i );
      }
   }

   bld->sampler->emit_fetch_texel(bld->sampler,
                                  bld->bld_base.base.gallivm,
                                  bld->bld_base.base.type,
                                  TRUE,
                                  sampler_unit, coords,
                                  offsets,
                                  &derivs,
                                  NULL, lod,
                                  texel);
 }

 static void
 emit_txq( struct lp_build_tgsi_soa_context *bld,
          const struct tgsi_full_instruction *inst,
@@ -1755,6 +1852,17 @@ txq_emit(
   emit_txq(bld, emit_data->inst, emit_data->output);
 }

 /**
 * Opcode action callback for TXF: forwards the instruction and its output
 * array to emit_txf() using the SoA context obtained from bld_base.
 * The action argument is not used.
 */
 static void
 txf_emit(
   const struct lp_build_tgsi_action * action,
   struct lp_build_tgsi_context * bld_base,
   struct lp_build_emit_data * emit_data)
 {
   emit_txf(lp_soa_context(bld_base), emit_data->inst, emit_data->output);
 }

 static void
 cal_emit(
   const struct lp_build_tgsi_action * action,
@@ -2126,6 +2234,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
   bld.bld_base.op_actions[TGSI_OPCODE_TXL].emit = txl_emit;
   bld.bld_base.op_actions[TGSI_OPCODE_TXP].emit = txp_emit;
   bld.bld_base.op_actions[TGSI_OPCODE_TXQ].emit = txq_emit;
   bld.bld_base.op_actions[TGSI_OPCODE_TXF].emit = txf_emit;

   lp_exec_mask_init(&bld.exec_mask, &bld.bld_base.base);

--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -293,6 +293,7 @@ tgsi_opcode_infer_src_type( uint opcode )
   case TGSI_OPCODE_USHR:
   case TGSI_OPCODE_SHL:
   case TGSI_OPCODE_TXQ:
   case TGSI_OPCODE_TXF:
      return TGSI_TYPE_UNSIGNED;
   case TGSI_OPCODE_MOD:
   case TGSI_OPCODE_I2F:
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -175,9 +175,11 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
      return 0;
   case PIPE_CAP_SCALED_RESOLVE:
      return 0;
   /* this is a lie: we could support arbitrarily large offsets */
   case PIPE_CAP_MIN_TEXEL_OFFSET:
      return -8;
   case PIPE_CAP_MAX_TEXEL_OFFSET:
      return 0;
      return 7;
   case PIPE_CAP_CONDITIONAL_RENDER:
      return 1;
   case PIPE_CAP_TEXTURE_BARRIER:
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.c
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.c
@@ -176,9 +176,10 @@ static void
 lp_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
                                     struct gallivm_state *gallivm,
                                     struct lp_type type,
                                     boolean is_fetch,
                                     unsigned unit,
                                     unsigned num_coords,
                                     const LLVMValueRef *coords,
                                     const LLVMValueRef *offsets,
                                     const struct lp_derivatives *derivs,
                                     LLVMValueRef lod_bias, /* optional */
                                     LLVMValueRef explicit_lod, /* optional */
@@ -189,7 +190,7 @@ lp_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
   assert(unit < PIPE_MAX_SAMPLERS);
   
   if (LP_PERF & PERF_NO_TEX) {
      lp_build_sample_nop(gallivm, type, num_coords, coords, texel);
      lp_build_sample_nop(gallivm, type, coords, texel);
      return;
   }

@@ -197,8 +198,10 @@ lp_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
                       &sampler->dynamic_state.static_state[unit],
                       &sampler->dynamic_state.base,
                       type,
                       is_fetch,
                       unit,
                       num_coords, coords,
                       coords,
                       offsets,
                       derivs,
                       lod_bias, explicit_lod,
                       texel);