14 years ago · 912dc8ff09
--- a/src/gallium/auxiliary/vl/vl_mpeg12_bitstream.c
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_bitstream.c
@@ -55,7 +55,6 @@
 #include <pipe/p_video_state.h>

 #include "vl_vlc.h"
 #include "vl_zscan.h"
 #include "vl_mpeg12_bitstream.h"

 /* take num bits from the high part of bit_buf and zero extend them */
@@ -64,12 +63,6 @@
 /* take num bits from the high part of bit_buf and sign extend them */
 #define SBITS(buf,num) (((int32_t)(buf)) >> (32 - (num)))

 /*
  * Clamp the integer lvalue val to the range [-2048, 2047] in place.
  *
  * The range test is a single unsigned comparison: biasing by 2048 maps the
  * valid range onto [0, 4095], and every value outside it (negative results
  * wrap to large unsigned numbers) compares greater than 4095.  The bias is
  * added in uint32_t so that values close to INT_MAX cannot cause a signed
  * overflow.  The argument is parenthesized but is still evaluated more than
  * once, so it must not have side effects.
  */
 #define SATURATE(val)                              \
 do {                                               \
   if (((uint32_t)(val) + 2048u) > 4095u)           \
      (val) = ((val) > 0) ? 2047 : -2048;           \
 } while (0)

 /* macroblock modes */
 #define MACROBLOCK_INTRA 1
 #define MACROBLOCK_PATTERN 2
@@ -721,7 +714,7 @@ get_chroma_dc_dct_diff(struct vl_mpg12_bs *bs)
 }

 static inline void
 get_intra_block_B14(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quantizer_scale, short *dest)
 get_intra_block_B14(struct vl_mpg12_bs *bs, int quantizer_scale, short *dest)
 {
   int i, val;
   const DCTtab *tab;
@@ -742,12 +735,10 @@ get_intra_block_B14(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quan
      normal_code:
         bs->vlc.buf <<= tab->len;
         bs->vlc.bits += tab->len + 1;
         val = (tab->level * quantizer_scale * quant_matrix[i]) >> 4;
         val = tab->level * quantizer_scale;

         /* if (bitstream_get (1)) val = -val; */
         val = (val ^ vl_vlc_sbits(&bs->vlc, 1)) - vl_vlc_sbits(&bs->vlc, 1);

         SATURATE (val);
         dest[i] = val;

         bs->vlc.buf <<= 1;
@@ -771,9 +762,8 @@ get_intra_block_B14(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quan

         vl_vlc_dumpbits(&bs->vlc, 12);
         vl_vlc_needbits(&bs->vlc);
         val = (vl_vlc_sbits(&bs->vlc, 12) * quantizer_scale * quant_matrix[i]) / 16;
         val = vl_vlc_sbits(&bs->vlc, 12) * quantizer_scale;

         SATURATE (val);
         dest[i] = val;

         vl_vlc_dumpbits(&bs->vlc, 12);
@@ -811,7 +801,7 @@ get_intra_block_B14(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quan
 }

 static inline void
 get_intra_block_B15(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quantizer_scale, short *dest)
 get_intra_block_B15(struct vl_mpg12_bs *bs, int quantizer_scale, short *dest)
 {
   int i, val;
   const DCTtab * tab;
@@ -831,12 +821,10 @@ get_intra_block_B15(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quan
         normal_code:
            bs->vlc.buf <<= tab->len;
            bs->vlc.bits += tab->len + 1;
            val = (tab->level * quantizer_scale * quant_matrix[i]) >> 4;
            val = tab->level * quantizer_scale;

            /* if (bitstream_get (1)) val = -val; */
            val = (val ^ vl_vlc_sbits(&bs->vlc, 1)) - vl_vlc_sbits(&bs->vlc, 1);

            SATURATE (val);
            dest[i] = val;

            bs->vlc.buf <<= 1;
@@ -859,9 +847,8 @@ get_intra_block_B15(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quan

            vl_vlc_dumpbits(&bs->vlc, 12);
            vl_vlc_needbits(&bs->vlc);
            val = (vl_vlc_sbits(&bs->vlc, 12) * quantizer_scale * quant_matrix[i]) / 16;
            val = vl_vlc_sbits(&bs->vlc, 12) * quantizer_scale;

            SATURATE (val);
            dest[i] = val;

            vl_vlc_dumpbits(&bs->vlc, 12);
@@ -900,7 +887,7 @@ get_intra_block_B15(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quan
 }

 static inline void
 get_non_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quantizer_scale, short *dest)
 get_non_intra_block(struct vl_mpg12_bs *bs, int quantizer_scale, short *dest)
 {
   int i, val;
   const DCTtab *tab;
@@ -927,12 +914,10 @@ get_non_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quan
      normal_code:
         bs->vlc.buf <<= tab->len;
         bs->vlc.bits += tab->len + 1;
         val = ((2*tab->level+1) * quantizer_scale * quant_matrix[i]) >> 5;
         val = ((2*tab->level+1) * quantizer_scale) >> 1;

         /* if (bitstream_get (1)) val = -val; */
         val = (val ^ vl_vlc_sbits(&bs->vlc, 1)) - vl_vlc_sbits(&bs->vlc, 1);

         SATURATE (val);
         dest[i] = val;

         bs->vlc.buf <<= 1;
@@ -960,9 +945,8 @@ get_non_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quan
         vl_vlc_dumpbits(&bs->vlc, 12);
         vl_vlc_needbits(&bs->vlc);
         val = 2 * (vl_vlc_sbits(&bs->vlc, 12) + vl_vlc_sbits(&bs->vlc, 1)) + 1;
         val = (val * quantizer_scale * quant_matrix[i]) / 32;
         val = (val * quantizer_scale) / 2;

         SATURATE (val);
         dest[i] = val;

         vl_vlc_dumpbits(&bs->vlc, 12);
@@ -999,7 +983,7 @@ get_non_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quan
 }

 static inline void
 get_mpeg1_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quantizer_scale, short *dest)
 get_mpeg1_intra_block(struct vl_mpg12_bs *bs, int quantizer_scale, short *dest)
 {
   int i, val;
   const DCTtab * tab;
@@ -1020,7 +1004,7 @@ get_mpeg1_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], int qu
      normal_code:
         bs->vlc.buf <<= tab->len;
         bs->vlc.bits += tab->len + 1;
         val = (tab->level * quantizer_scale * quant_matrix[i]) >> 4;
         val = tab->level * quantizer_scale;

         /* oddification */
         val = (val - 1) | 1;
@@ -1028,7 +1012,6 @@ get_mpeg1_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], int qu
         /* if (bitstream_get (1)) val = -val; */
         val = (val ^ vl_vlc_sbits(&bs->vlc, 1)) - vl_vlc_sbits(&bs->vlc, 1);

         SATURATE (val);
         dest[i] = val;

         bs->vlc.buf <<= 1;
@@ -1057,12 +1040,11 @@ get_mpeg1_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], int qu
            vl_vlc_dumpbits(&bs->vlc, 8);
            val = vl_vlc_ubits(&bs->vlc, 8) + 2 * val;
         }
         val = (val * quantizer_scale * quant_matrix[i]) / 16;
         val = val * quantizer_scale;

         /* oddification */
         val = (val + ~SBITS (val, 1)) | 1;

         SATURATE (val);
         dest[i] = val;

         vl_vlc_dumpbits(&bs->vlc, 8);
@@ -1099,7 +1081,7 @@ get_mpeg1_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], int qu
 }

 static inline void
 get_mpeg1_non_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quantizer_scale, short *dest)
 get_mpeg1_non_intra_block(struct vl_mpg12_bs *bs, int quantizer_scale, short *dest)
 {
   int i, val;
   const DCTtab * tab;
@@ -1126,7 +1108,7 @@ get_mpeg1_non_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], in
      normal_code:
         bs->vlc.buf <<= tab->len;
         bs->vlc.bits += tab->len + 1;
         val = ((2*tab->level+1) * quantizer_scale * quant_matrix[i]) >> 5;
         val = ((2*tab->level+1) * quantizer_scale) >> 1;

         /* oddification */
         val = (val - 1) | 1;
@@ -1134,7 +1116,6 @@ get_mpeg1_non_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], in
         /* if (bitstream_get (1)) val = -val; */
         val = (val ^ vl_vlc_sbits(&bs->vlc, 1)) - vl_vlc_sbits(&bs->vlc, 1);

         SATURATE (val);
         dest[i] = val;

         bs->vlc.buf <<= 1;
@@ -1167,12 +1148,11 @@ get_mpeg1_non_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], in
            val = vl_vlc_ubits(&bs->vlc, 8) + 2 * val;
         }
         val = 2 * (val + SBITS (val, 1)) + 1;
         val = (val * quantizer_scale * quant_matrix[i]) / 32;
         val = (val * quantizer_scale) / 2;

         /* oddification */
         val = (val + ~SBITS (val, 1)) | 1;

         SATURATE (val);
         dest[i] = val;

         vl_vlc_dumpbits(&bs->vlc, 8);
@@ -1209,7 +1189,7 @@ get_mpeg1_non_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], in
 }

 static inline void
 slice_intra_DCT(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc * picture, const int quant_matrix[64], int cc,
 slice_intra_DCT(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc * picture, int cc,
                 unsigned x, unsigned y, enum pipe_mpeg12_dct_type coding, int quantizer_scale, int dc_dct_pred[3])
 {
   short dest[64];
@@ -1228,14 +1208,14 @@ slice_intra_DCT(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc * pictur
      dc_dct_pred[cc] += get_chroma_dc_dct_diff(bs);

   memset(dest, 0, sizeof(int16_t) * 64);
   dest[0] = dc_dct_pred[cc] << (3 - picture->intra_dc_precision);
   dest[0] = dc_dct_pred[cc];
   if (picture->mpeg1) {
      if (picture->picture_coding_type != D_TYPE)
          get_mpeg1_intra_block(bs, quant_matrix, quantizer_scale, dest);
          get_mpeg1_intra_block(bs, quantizer_scale, dest);
   } else if (picture->intra_vlc_format)
      get_intra_block_B15(bs, quant_matrix, quantizer_scale, dest);
      get_intra_block_B15(bs, quantizer_scale, dest);
   else
      get_intra_block_B14(bs, quant_matrix, quantizer_scale, dest);
      get_intra_block_B14(bs, quantizer_scale, dest);

   memcpy(bs->ycbcr_buffer[cc], dest, sizeof(int16_t) * 64);

@@ -1245,7 +1225,7 @@ slice_intra_DCT(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc * pictur
 }

 static inline void
 slice_non_intra_DCT(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc * picture, const int quant_matrix[64], int cc,
 slice_non_intra_DCT(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc * picture, int cc,
                    unsigned x, unsigned y,  enum pipe_mpeg12_dct_type coding, int quantizer_scale)
 {
   short dest[64];
@@ -1257,9 +1237,9 @@ slice_non_intra_DCT(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc * pi

   memset(dest, 0, sizeof(int16_t) * 64);
   if (picture->mpeg1)
      get_mpeg1_non_intra_block(bs, quant_matrix, quantizer_scale, dest);
      get_mpeg1_non_intra_block(bs, quantizer_scale, dest);
   else
      get_non_intra_block(bs, quant_matrix, quantizer_scale, dest);
      get_non_intra_block(bs, quantizer_scale, dest);

   memcpy(bs->ycbcr_buffer[cc], dest, sizeof(int16_t) * 64);

@@ -1571,8 +1551,7 @@ slice_init(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc * picture,
 }

 static inline bool
 decode_slice(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc *picture,
             const int intra_quantizer_matrix[64], const int non_intra_quantizer_matrix[64])
 decode_slice(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc *picture)
 {
   enum pipe_video_field_select default_field_select;
   struct pipe_motionvector mv_fwd, mv_bwd;
@@ -1659,12 +1638,12 @@ decode_slice(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc *picture,
         mv_bwd.top.weight = mv_bwd.bottom.weight = PIPE_VIDEO_MV_WEIGHT_MIN;

         // unrolled loop of 6 block(i) calls in macroblock()
         slice_intra_DCT(bs, picture, intra_quantizer_matrix, 0, x*2+0, y*2+0, dct_type, quantizer_scale, dc_dct_pred);
         slice_intra_DCT(bs, picture, intra_quantizer_matrix, 0, x*2+1, y*2+0, dct_type, quantizer_scale, dc_dct_pred);
         slice_intra_DCT(bs, picture, intra_quantizer_matrix, 0, x*2+0, y*2+1, dct_type, quantizer_scale, dc_dct_pred);
         slice_intra_DCT(bs, picture, intra_quantizer_matrix, 0, x*2+1, y*2+1, dct_type, quantizer_scale, dc_dct_pred);
         slice_intra_DCT(bs, picture, intra_quantizer_matrix, 1, x, y, PIPE_MPEG12_DCT_TYPE_FRAME, quantizer_scale, dc_dct_pred);
         slice_intra_DCT(bs, picture, intra_quantizer_matrix, 2, x, y, PIPE_MPEG12_DCT_TYPE_FRAME, quantizer_scale, dc_dct_pred);
         slice_intra_DCT(bs, picture, 0, x*2+0, y*2+0, dct_type, quantizer_scale, dc_dct_pred);
         slice_intra_DCT(bs, picture, 0, x*2+1, y*2+0, dct_type, quantizer_scale, dc_dct_pred);
         slice_intra_DCT(bs, picture, 0, x*2+0, y*2+1, dct_type, quantizer_scale, dc_dct_pred);
         slice_intra_DCT(bs, picture, 0, x*2+1, y*2+1, dct_type, quantizer_scale, dc_dct_pred);
         slice_intra_DCT(bs, picture, 1, x, y, PIPE_MPEG12_DCT_TYPE_FRAME, quantizer_scale, dc_dct_pred);
         slice_intra_DCT(bs, picture, 2, x, y, PIPE_MPEG12_DCT_TYPE_FRAME, quantizer_scale, dc_dct_pred);

         if (picture->picture_coding_type == D_TYPE) {
            vl_vlc_needbits(&bs->vlc);
@@ -1722,17 +1701,17 @@ decode_slice(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc *picture,

            // TODO  optimize not fully used for idct accel only mc.
            if (coded_block_pattern & 0x20)
               slice_non_intra_DCT(bs, picture, non_intra_quantizer_matrix, 0, x*2+0, y*2+0, dct_type, quantizer_scale); // cc0  luma 0
               slice_non_intra_DCT(bs, picture, 0, x*2+0, y*2+0, dct_type, quantizer_scale); // cc0  luma 0
            if (coded_block_pattern & 0x10)
               slice_non_intra_DCT(bs, picture, non_intra_quantizer_matrix, 0, x*2+1, y*2+0, dct_type, quantizer_scale); // cc0 luma 1
               slice_non_intra_DCT(bs, picture, 0, x*2+1, y*2+0, dct_type, quantizer_scale); // cc0 luma 1
            if (coded_block_pattern & 0x08)
               slice_non_intra_DCT(bs, picture, non_intra_quantizer_matrix, 0, x*2+0, y*2+1, dct_type, quantizer_scale); // cc0 luma 2
               slice_non_intra_DCT(bs, picture, 0, x*2+0, y*2+1, dct_type, quantizer_scale); // cc0 luma 2
            if (coded_block_pattern & 0x04)
               slice_non_intra_DCT(bs, picture, non_intra_quantizer_matrix, 0, x*2+1, y*2+1, dct_type, quantizer_scale); // cc0 luma 3
               slice_non_intra_DCT(bs, picture, 0, x*2+1, y*2+1, dct_type, quantizer_scale); // cc0 luma 3
            if (coded_block_pattern & 0x2)
               slice_non_intra_DCT(bs, picture, non_intra_quantizer_matrix, 1, x, y, PIPE_MPEG12_DCT_TYPE_FRAME, quantizer_scale); // cc1 croma
               slice_non_intra_DCT(bs, picture, 1, x, y, PIPE_MPEG12_DCT_TYPE_FRAME, quantizer_scale); // cc1 croma
            if (coded_block_pattern & 0x1)
               slice_non_intra_DCT(bs, picture, non_intra_quantizer_matrix, 2, x, y, PIPE_MPEG12_DCT_TYPE_FRAME, quantizer_scale); // cc2 croma
               slice_non_intra_DCT(bs, picture, 2, x, y, PIPE_MPEG12_DCT_TYPE_FRAME, quantizer_scale); // cc2 croma
         }

         dc_dct_pred[0] = dc_dct_pred[1] = dc_dct_pred[2] = 0;
@@ -1845,12 +1824,6 @@ void
 vl_mpg12_bs_decode(struct vl_mpg12_bs *bs, unsigned num_bytes, const void *buffer,
                   struct pipe_mpeg12_picture_desc *picture, unsigned num_ycbcr_blocks[3])
 {
   int intra_quantizer_matrix[64];
   int non_intra_quantizer_matrix[64];

   const int *scan;
   unsigned i;

   assert(bs);
   assert(num_ycbcr_blocks);
   assert(buffer && num_bytes);
@@ -1859,11 +1832,5 @@ vl_mpg12_bs_decode(struct vl_mpg12_bs *bs, unsigned num_bytes, const void *buffe

   vl_vlc_init(&bs->vlc, buffer, num_bytes);

   scan = picture->alternate_scan ? vl_zscan_alternate : vl_zscan_normal;
   for (i = 0; i < 64; ++i) {
      intra_quantizer_matrix[i] = picture->intra_quantizer_matrix[scan[i]];
      non_intra_quantizer_matrix[i] = picture->non_intra_quantizer_matrix[scan[i]];
   }

   while(decode_slice(bs, picture, intra_quantizer_matrix, non_intra_quantizer_matrix));
   while(decode_slice(bs, picture));
 }
--- a/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
@@ -312,8 +312,21 @@ vl_mpeg12_buffer_map(struct pipe_video_decode_buffer *buffer)

      vl_mpg12_bs_set_buffers(&buf->bs, ycbcr_stream, buf->texels, mv_stream);
   } else {
      for (i = 0; i < VL_MAX_PLANES; ++i)
      static const uint8_t dummy_quant[64] = {
         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
      };

      for (i = 0; i < VL_MAX_PLANES; ++i) {
         vl_zscan_set_layout(&buf->zscan[i], dec->zscan_linear);
         vl_zscan_upload_quant(&buf->zscan[i], dummy_quant, dummy_quant);
      }
   }
 }

@@ -365,6 +378,7 @@ vl_mpeg12_buffer_decode_bitstream(struct pipe_video_decode_buffer *buffer,
                                  unsigned num_ycbcr_blocks[3])
 {
   struct vl_mpeg12_buffer *buf = (struct vl_mpeg12_buffer*)buffer;
   uint8_t intra_quantizer_matrix[64];
   struct vl_mpeg12_decoder *dec;
   unsigned i;

@@ -373,8 +387,13 @@ vl_mpeg12_buffer_decode_bitstream(struct pipe_video_decode_buffer *buffer,
   dec = (struct vl_mpeg12_decoder *)buf->base.decoder;
   assert(dec);

   for (i = 0; i < VL_MAX_PLANES; ++i)
   memcpy(intra_quantizer_matrix, picture->intra_quantizer_matrix, sizeof(intra_quantizer_matrix));
   intra_quantizer_matrix[0] = 1 << (7 - picture->intra_dc_precision);

   for (i = 0; i < VL_MAX_PLANES; ++i) {
      vl_zscan_set_layout(&buf->zscan[i], picture->alternate_scan ? dec->zscan_alternate : dec->zscan_normal);
      vl_zscan_upload_quant(&buf->zscan[i], intra_quantizer_matrix, picture->non_intra_quantizer_matrix);
   }

   vl_mpg12_bs_decode(&buf->bs, num_bytes, data, picture, num_ycbcr_blocks);
 }
--- a/src/gallium/auxiliary/vl/vl_zscan.c
+++ b/src/gallium/auxiliary/vl/vl_zscan.c
@@ -136,11 +136,11 @@ create_vert_shader(struct vl_zscan *zscan)
   ureg_MUL(shader, ureg_writemask(o_vpos, TGSI_WRITEMASK_XY), ureg_src(tmp), scale);
   ureg_MOV(shader, ureg_writemask(o_vpos, TGSI_WRITEMASK_ZW), ureg_imm1f(shader, 1.0f));

   ureg_MUL(shader, ureg_writemask(tmp, TGSI_WRITEMASK_XZ), ureg_scalar(instance, TGSI_SWIZZLE_X),
   ureg_MUL(shader, ureg_writemask(tmp, TGSI_WRITEMASK_XW), ureg_scalar(instance, TGSI_SWIZZLE_X),
            ureg_imm1f(shader, 1.0f / zscan->blocks_per_line));

   ureg_FRC(shader, ureg_writemask(tmp, TGSI_WRITEMASK_Y), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X));
   ureg_FLR(shader, ureg_writemask(tmp, TGSI_WRITEMASK_Z), ureg_src(tmp));
   ureg_FLR(shader, ureg_writemask(tmp, TGSI_WRITEMASK_W), ureg_src(tmp));

   for (i = 0; i < zscan->num_channels; ++i) {
      ureg_ADD(shader, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_Y),
@@ -149,7 +149,8 @@ create_vert_shader(struct vl_zscan *zscan)
      ureg_MAD(shader, ureg_writemask(o_vtex[i], TGSI_WRITEMASK_X), vrect,
               ureg_imm1f(shader, 1.0f / zscan->blocks_per_line), ureg_src(tmp));
      ureg_MOV(shader, ureg_writemask(o_vtex[i], TGSI_WRITEMASK_Y), vrect);
      ureg_MUL(shader, ureg_writemask(o_vtex[i], TGSI_WRITEMASK_Z), ureg_src(tmp),
      ureg_MOV(shader, ureg_writemask(o_vtex[i], TGSI_WRITEMASK_Z), vpos);
      ureg_MUL(shader, ureg_writemask(o_vtex[i], TGSI_WRITEMASK_W), ureg_src(tmp),
               ureg_imm1f(shader, (float)zscan->blocks_per_line / zscan->blocks_total));
   }

@@ -165,10 +166,10 @@ create_frag_shader(struct vl_zscan *zscan)
   struct ureg_program *shader;
   struct ureg_src vtex[zscan->num_channels];

   struct ureg_src src, scan, quant;
   struct ureg_src samp_src, samp_scan, samp_quant;

   struct ureg_dst tmp[zscan->num_channels];
   struct ureg_dst fragment;
   struct ureg_dst quant, fragment;

   unsigned i;

@@ -179,12 +180,13 @@ create_frag_shader(struct vl_zscan *zscan)
   for (i = 0; i < zscan->num_channels; ++i)
      vtex[i] = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_GENERIC, VS_O_VTEX + i, TGSI_INTERPOLATE_LINEAR);

   src = ureg_DECL_sampler(shader, 0);
   scan = ureg_DECL_sampler(shader, 1);
   quant = ureg_DECL_sampler(shader, 2);
   samp_src = ureg_DECL_sampler(shader, 0);
   samp_scan = ureg_DECL_sampler(shader, 1);
   samp_quant = ureg_DECL_sampler(shader, 2);

   for (i = 0; i < zscan->num_channels; ++i)
      tmp[i] = ureg_DECL_temporary(shader);
   quant = ureg_DECL_temporary(shader);

   fragment = ureg_DECL_output(shader, TGSI_SEMANTIC_COLOR, 0);

@@ -194,17 +196,18 @@ create_frag_shader(struct vl_zscan *zscan)
    * fragment = tex(tmp, 0) * quant
    */
   for (i = 0; i < zscan->num_channels; ++i)
      ureg_TEX(shader, ureg_writemask(tmp[i], TGSI_WRITEMASK_X), TGSI_TEXTURE_2D, vtex[i], scan);
      ureg_TEX(shader, ureg_writemask(tmp[i], TGSI_WRITEMASK_X), TGSI_TEXTURE_2D, vtex[i], samp_scan);

   for (i = 0; i < zscan->num_channels; ++i)
      ureg_MOV(shader, ureg_writemask(tmp[i], TGSI_WRITEMASK_Y), ureg_scalar(vtex[i], TGSI_SWIZZLE_Z));
      ureg_MOV(shader, ureg_writemask(tmp[i], TGSI_WRITEMASK_Y), ureg_scalar(vtex[i], TGSI_SWIZZLE_W));

   for (i = 0; i < zscan->num_channels; ++i)
      ureg_TEX(shader, tmp[i], TGSI_TEXTURE_2D, ureg_src(tmp[i]), src);
   for (i = 0; i < zscan->num_channels; ++i) {
      ureg_TEX(shader, ureg_writemask(tmp[0], TGSI_WRITEMASK_X << i), TGSI_TEXTURE_2D, ureg_src(tmp[i]), samp_src);
      ureg_TEX(shader, ureg_writemask(quant, TGSI_WRITEMASK_X << i), TGSI_TEXTURE_3D, vtex[i], samp_quant);
   }

   // TODO: Fetch quant and use it
   for (i = 0; i < zscan->num_channels; ++i)
      ureg_MUL(shader, ureg_writemask(fragment, TGSI_WRITEMASK_X << i), ureg_src(tmp[i]), ureg_imm1f(shader, 1.0f));
   ureg_MUL(shader, quant, ureg_src(quant), ureg_imm1f(shader, 16.0f));
   ureg_MUL(shader, fragment, ureg_src(tmp[0]), ureg_src(quant));

   for (i = 0; i < zscan->num_channels; ++i)
      ureg_release_temporary(shader, tmp[i]);
@@ -283,7 +286,7 @@ init_state(struct vl_zscan *zscan)
      memset(&sampler, 0, sizeof(sampler));
      sampler.wrap_s = PIPE_TEX_WRAP_REPEAT;
      sampler.wrap_t = PIPE_TEX_WRAP_REPEAT;
      sampler.wrap_r = PIPE_TEX_WRAP_REPEAT;
      sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
      sampler.min_img_filter = PIPE_TEX_FILTER_NEAREST;
      sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
      sampler.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
@@ -413,15 +416,6 @@ error_resource:
   return NULL;
 }

 #if 0
 // TODO
 struct pipe_sampler_view *
 vl_zscan_normal(struct pipe_context *pipe, unsigned blocks_per_line);

 struct pipe_sampler_view *
 vl_zscan_alternate(struct pipe_context *pipe, unsigned blocks_per_line);
 #endif

 bool
 vl_zscan_init(struct vl_zscan *zscan, struct pipe_context *pipe,
              unsigned buffer_width, unsigned buffer_height,
@@ -457,16 +451,13 @@ vl_zscan_cleanup(struct vl_zscan *zscan)
   cleanup_state(zscan);
 }

 #if 0
 // TODO
 void
 vl_zscan_upload_quant(struct vl_zscan *zscan, ...);
 #endif

 bool
 vl_zscan_init_buffer(struct vl_zscan *zscan, struct vl_zscan_buffer *buffer,
                     struct pipe_sampler_view *src, struct pipe_surface *dst)
 {
   struct pipe_resource res_tmpl, *res;
   struct pipe_sampler_view sv_tmpl;

   assert(zscan && buffer);

   memset(buffer, 0, sizeof(struct vl_zscan_buffer));
@@ -489,6 +480,28 @@ vl_zscan_init_buffer(struct vl_zscan *zscan, struct vl_zscan_buffer *buffer,
   buffer->fb_state.nr_cbufs = 1;
   pipe_surface_reference(&buffer->fb_state.cbufs[0], dst);

   memset(&res_tmpl, 0, sizeof(res_tmpl));
   res_tmpl.target = PIPE_TEXTURE_3D;
   res_tmpl.format = PIPE_FORMAT_R8_UNORM;
   res_tmpl.width0 = BLOCK_WIDTH * zscan->blocks_per_line;
   res_tmpl.height0 = BLOCK_HEIGHT;
   res_tmpl.depth0 = 2;
   res_tmpl.array_size = 1;
   res_tmpl.usage = PIPE_USAGE_IMMUTABLE;
   res_tmpl.bind = PIPE_BIND_SAMPLER_VIEW;

   res = zscan->pipe->screen->resource_create(zscan->pipe->screen, &res_tmpl);
   if (!res)
      return false;

   memset(&sv_tmpl, 0, sizeof(sv_tmpl));
   u_sampler_view_default_template(&sv_tmpl, res, res->format);
   sv_tmpl.swizzle_r = sv_tmpl.swizzle_g = sv_tmpl.swizzle_b = sv_tmpl.swizzle_a = TGSI_SWIZZLE_X;
   buffer->quant = zscan->pipe->create_sampler_view(zscan->pipe, res, &sv_tmpl);
   pipe_resource_reference(&res, NULL);
   if (!buffer->quant)
      return false;

   return true;
 }

@@ -512,6 +525,65 @@ vl_zscan_set_layout(struct vl_zscan_buffer *buffer, struct pipe_sampler_view *la
   pipe_sampler_view_reference(&buffer->layout, layout);
 }

 /**
  * Fill the quant texture of a zscan buffer with the two quantizer matrices.
  *
  * The whole texture (BLOCK_WIDTH * blocks_per_line wide, BLOCK_HEIGHT high,
  * two slices deep) is mapped for writing with its old contents discarded.
  * Each matrix is then written once per block of the line, so every block
  * column holds its own copy.  The non-intra matrix goes into the first
  * BLOCK_HEIGHT rows of the mapping, the intra matrix into the rows that
  * follow.
  *
  * If the transfer cannot be created or mapped the function returns without
  * writing anything; no error is reported to the caller.
  *
  * \param buffer            zscan buffer whose quant texture is written
  * \param intra_matrix      64 entries, read as BLOCK_HEIGHT rows of BLOCK_WIDTH
  * \param non_intra_matrix  64 entries, same layout as intra_matrix
  */
 void
 vl_zscan_upload_quant(struct vl_zscan_buffer *buffer,
                       const uint8_t intra_matrix[64],
                       const uint8_t non_intra_matrix[64])
 {
   struct pipe_context *pipe;
   struct pipe_transfer *buf_transfer;
   unsigned block, row, col, pitch, num_blocks;
   uint8_t *intra, *non_intra;

   struct pipe_box rect =
   {
      0, 0, 0,
      BLOCK_WIDTH,
      BLOCK_HEIGHT,
      2
   };

   assert(buffer);
   assert(intra_matrix);
   assert(non_intra_matrix);

   pipe = buffer->zscan->pipe;
   num_blocks = buffer->zscan->blocks_per_line;

   /* one matrix copy per block of the line */
   rect.width *= num_blocks;

   buf_transfer = pipe->get_transfer
   (
      pipe, buffer->quant->texture,
      0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD,
      &rect
   );
   if (!buf_transfer)
      return;

   pitch = buf_transfer->stride;

   non_intra = pipe->transfer_map(pipe, buf_transfer);
   if (non_intra) {
      /*
       * NOTE(review): the intra slice is located by assuming the second
       * depth slice starts exactly BLOCK_HEIGHT rows after the first one;
       * confirm that this matches the layer stride of the transfer.
       */
      intra = non_intra + BLOCK_HEIGHT * pitch;

      for (row = 0; row < BLOCK_HEIGHT; ++row) {
         uint8_t *intra_row = intra + row * pitch;
         uint8_t *non_intra_row = non_intra + row * pitch;
         const uint8_t *intra_src = intra_matrix + row * BLOCK_WIDTH;
         const uint8_t *non_intra_src = non_intra_matrix + row * BLOCK_WIDTH;

         for (block = 0; block < num_blocks; ++block) {
            for (col = 0; col < BLOCK_WIDTH; ++col) {
               intra_row[block * BLOCK_WIDTH + col] = intra_src[col];
               non_intra_row[block * BLOCK_WIDTH + col] = non_intra_src[col];
            }
         }
      }

      pipe->transfer_unmap(pipe, buf_transfer);
   }

   /* the transfer is released whether or not mapping succeeded */
   pipe->transfer_destroy(pipe, buf_transfer);
 }

 void
 vl_zscan_render(struct vl_zscan_buffer *buffer, unsigned num_instances)
 {
@@ -523,10 +595,10 @@ vl_zscan_render(struct vl_zscan_buffer *buffer, unsigned num_instances)

   zscan->pipe->bind_rasterizer_state(zscan->pipe, zscan->rs_state);
   zscan->pipe->bind_blend_state(zscan->pipe, zscan->blend);
   zscan->pipe->bind_fragment_sampler_states(zscan->pipe, 2, zscan->samplers);
   zscan->pipe->bind_fragment_sampler_states(zscan->pipe, 3, zscan->samplers);
   zscan->pipe->set_framebuffer_state(zscan->pipe, &buffer->fb_state);
   zscan->pipe->set_viewport_state(zscan->pipe, &buffer->viewport);
   zscan->pipe->set_fragment_sampler_views(zscan->pipe, 2, &buffer->src);
   zscan->pipe->set_fragment_sampler_views(zscan->pipe, 3, &buffer->src);
   zscan->pipe->bind_vs_state(zscan->pipe, zscan->vs);
   zscan->pipe->bind_fs_state(zscan->pipe, zscan->fs);
   util_draw_arrays_instanced(zscan->pipe, PIPE_PRIM_QUADS, 0, 4, 0, num_instances);
--- a/src/gallium/auxiliary/vl/vl_zscan.h
+++ b/src/gallium/auxiliary/vl/vl_zscan.h
@@ -53,8 +53,6 @@ struct vl_zscan
   void *samplers[3];

   void *vs, *fs;

   struct pipe_sampler_view *quant;
 };

 struct vl_zscan_buffer
@@ -84,11 +82,6 @@ vl_zscan_init(struct vl_zscan *zscan, struct pipe_context *pipe,
 void
 vl_zscan_cleanup(struct vl_zscan *zscan);

 #if 0
 void
 vl_zscan_upload_quant(struct vl_zscan *zscan, ...);
 #endif

 bool
 vl_zscan_init_buffer(struct vl_zscan *zscan, struct vl_zscan_buffer *buffer,
                     struct pipe_sampler_view *src, struct pipe_surface *dst);
@@ -99,6 +92,11 @@ vl_zscan_cleanup_buffer(struct vl_zscan_buffer *buffer);
 void
 vl_zscan_set_layout(struct vl_zscan_buffer *buffer, struct pipe_sampler_view *layout);

 /**
  * Write the intra and non-intra quantizer matrices into the quant texture
  * of the given zscan buffer.
  *
  * Both matrices hold 64 entries.  Nothing is returned, so a failed upload
  * is not reported to the caller.
  */
 void
 vl_zscan_upload_quant(struct vl_zscan_buffer *buffer,
                       const uint8_t intra_matrix[64],
                       const uint8_t non_intra_matrix[64]);

 void
 vl_zscan_render(struct vl_zscan_buffer *buffer, unsigned num_instances);