
Initial pass at vertex shader on SPU using TGSI VM

All of the code is wired in on the SPU side, but it is not called from
the PPU yet.  Instruction / declaration fetch still needs to be
implemented in spu_exec.c.
tags/mesa_20090313
Ian Romanick, 17 years ago
parent commit 524bba17a7

src/mesa/pipe/cell/common.h  (+38 -0)

@@ -83,6 +83,9 @@
#define CELL_CMD_STATE_SAMPLER 12
#define CELL_CMD_STATE_TEXTURE 13
#define CELL_CMD_STATE_VERTEX_INFO 14
#define CELL_CMD_STATE_VIEWPORT 15
#define CELL_CMD_STATE_VS_ARRAY_INFO 16
#define CELL_CMD_VS_EXECUTE 17


#define CELL_NUM_BUFFERS 4
@@ -116,6 +119,41 @@ struct cell_command_clear_surface
} ALIGN16_ATTRIB;


/**
* Array info used by the vertex shader's vertex puller.
*/
struct cell_array_info
{
void *base; /**< Base address of the 0th element. */
uint attr; /**< Attribute that this state is for. */
uint pitch; /**< Byte pitch from one entry to the next. */
enum pipe_format format; /**< Pipe format of each entry. */
} ALIGN16_ATTRIB;
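

For context, the PPU side is expected to fill in one of these per vertex
attribute and send it with CELL_CMD_STATE_VS_ARRAY_INFO. A minimal sketch of
that setup (the PPU wiring is not part of this commit, so send_command() and
the buffer names here are hypothetical):

   /* Hypothetical PPU-side state update -- not in this commit. */
   struct cell_array_info info ALIGN16_ATTRIB;
   info.base   = vertex_buffer;              /* address of element 0 */
   info.attr   = 0;                          /* e.g. the position attribute */
   info.pitch  = 3 * sizeof(float);          /* tightly packed float[3] */
   info.format = PIPE_FORMAT_R32G32B32_FLOAT;
   send_command(CELL_CMD_STATE_VS_ARRAY_INFO, &info, sizeof(info));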


struct cell_shader_info
{
unsigned processor;
unsigned num_outputs;

void *declarations;
unsigned num_declarations;
void *instructions;
unsigned num_instructions;
void *uniforms;
} ALIGN16_ATTRIB;


struct cell_command_vs
{
struct cell_shader_info shader;
void *elts;
unsigned num_elts;
unsigned bytes_per_elt;
void *vOut;
} ALIGN16_ATTRIB;
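

A CELL_CMD_VS_EXECUTE command would then bundle the shader tokens, the element
list, and the output vertex pointer. Again a hedged PPU-side sketch under the
same caveat (send_command() and the token/buffer names are hypothetical):

   /* Hypothetical PPU-side dispatch -- not in this commit. */
   struct cell_command_vs cmd ALIGN16_ATTRIB;
   cmd.shader.processor        = PIPE_SHADER_VERTEX;
   cmd.shader.num_outputs      = num_vs_outputs;
   cmd.shader.declarations     = decl_tokens;
   cmd.shader.num_declarations = num_decls;
   cmd.shader.instructions     = inst_tokens;
   cmd.shader.num_instructions = num_insts;
   cmd.shader.uniforms         = constant_buffer;
   cmd.elts          = index_buffer;
   cmd.num_elts      = num_indices;
   cmd.bytes_per_elt = 4;                    /* 32-bit indices */
   cmd.vOut          = output_vertices;
   send_command(CELL_CMD_VS_EXECUTE, &cmd, sizeof(cmd));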


struct cell_command_render
{
uint opcode; /**< CELL_CMD_RENDER */

src/mesa/pipe/cell/spu/Makefile  (+5 -1)

@@ -20,7 +20,11 @@ SOURCES = \
spu_render.c \
spu_texture.c \
spu_tile.c \
spu_tri.c
spu_tri.c \
spu_exec.c \
spu_util.c \
spu_vertex_fetch.c \
spu_vertex_shader.c

SPU_OBJECTS = $(SOURCES:.c=.o) \


src/mesa/pipe/cell/spu/spu_exec.c  (+2355 -0; diff suppressed because it is too large)


src/mesa/pipe/cell/spu/spu_exec.h  (+171 -0)

@@ -0,0 +1,171 @@
/**************************************************************************
*
* Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/

#if !defined SPU_EXEC_H
#define SPU_EXEC_H

#include "pipe/p_compiler.h"
#include "pipe/tgsi/exec/tgsi_exec.h"

#if defined __cplusplus
extern "C" {
#endif

/**
* Registers may be treated as float, signed int or unsigned int.
*/
union spu_exec_channel
{
float f[QUAD_SIZE];
int i[QUAD_SIZE];
unsigned u[QUAD_SIZE];
};

/**
* A vector[RGBA] of channels[4 pixels]
*/
struct spu_exec_vector
{
union spu_exec_channel xyzw[NUM_CHANNELS];
};

/**
* For fragment programs, information for computing fragment input
* values from plane equation of the triangle/line.
*/
struct spu_interp_coef
{
float a0[NUM_CHANNELS]; /* in an xyzw layout */
float dadx[NUM_CHANNELS];
float dady[NUM_CHANNELS];
};


struct softpipe_tile_cache; /**< Opaque to TGSI */

/**
* Information for sampling textures, which must be implemented
* by code outside the TGSI executor.
*/
struct spu_sampler
{
const struct pipe_sampler_state *state;
struct pipe_texture *texture;
/** Get samples for four fragments in a quad */
void (*get_samples)(struct spu_sampler *sampler,
const float s[QUAD_SIZE],
const float t[QUAD_SIZE],
const float p[QUAD_SIZE],
float lodbias,
float rgba[NUM_CHANNELS][QUAD_SIZE]);
void *pipe; /*XXX temporary*/
struct softpipe_tile_cache *cache;
};


/**
* Run-time virtual machine state for executing TGSI shader.
*/
struct spu_exec_machine
{
/*
* 32 program temporaries
* 4 internal temporaries
* 1 address
*/
struct spu_exec_vector Temps[TGSI_EXEC_NUM_TEMPS
+ TGSI_EXEC_NUM_ADDRS + 1]
ALIGN16_ATTRIB;

struct spu_exec_vector *Addrs;

struct spu_sampler *Samplers;

float Imms[TGSI_EXEC_NUM_IMMEDIATES][4];
unsigned ImmLimit;
float (*Consts)[4];
struct spu_exec_vector *Inputs;
struct spu_exec_vector *Outputs;
unsigned Processor;

/* GEOMETRY processor only. */
unsigned *Primitives;

/* FRAGMENT processor only. */
const struct spu_interp_coef *InterpCoefs;
struct spu_exec_vector QuadPos;

/* Conditional execution masks */
uint CondMask; /**< For IF/ELSE/ENDIF */
uint LoopMask; /**< For BGNLOOP/ENDLOOP */
uint ContMask; /**< For loop CONT statements */
uint FuncMask; /**< For function calls */
uint ExecMask; /**< = CondMask & LoopMask */

/** Condition mask stack (for nested conditionals) */
uint CondStack[TGSI_EXEC_MAX_COND_NESTING];
int CondStackTop;

/** Loop mask stack (for nested loops) */
uint LoopStack[TGSI_EXEC_MAX_LOOP_NESTING];
int LoopStackTop;

/** Loop continue mask stack (see comments in tgsi_exec.c) */
uint ContStack[TGSI_EXEC_MAX_LOOP_NESTING];
int ContStackTop;

/** Function execution mask stack (for executing subroutine code) */
uint FuncStack[TGSI_EXEC_MAX_CALL_NESTING];
int FuncStackTop;

/** Function call stack for saving/restoring the program counter */
uint CallStack[TGSI_EXEC_MAX_CALL_NESTING];
int CallStackTop;

struct tgsi_full_instruction *Instructions;
uint NumInstructions;

struct tgsi_full_declaration *Declarations;
uint NumDeclarations;
};


extern void
spu_exec_machine_init(struct spu_exec_machine *mach,
uint numSamplers,
struct spu_sampler *samplers,
unsigned processor);

extern uint
spu_exec_machine_run( struct spu_exec_machine *mach );


#if defined __cplusplus
} /* extern "C" */
#endif

#endif /* SPU_EXEC_H */
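
The two entry points above mirror the PPU-side TGSI interpreter's
tgsi_exec_machine_init()/_run() pair. Typical use on the SPU looks roughly
like this (a sketch; the real call sites are in spu_vertex_shader.c below,
and instruction/declaration fetch is still unimplemented in spu_exec.c):

   struct spu_exec_machine machine;

   spu_exec_machine_init(&machine, PIPE_MAX_SAMPLERS,
                         NULL /* samplers */, PIPE_SHADER_VERTEX);
   machine.Instructions    = instructions;
   machine.NumInstructions = num_instructions;
   machine.Declarations    = declarations;
   machine.NumDeclarations = num_declarations;
   /* Point Inputs/Outputs/Consts at caller-owned, 16-byte aligned
    * storage, then interpret the shader over a quad of four vertices: */
   spu_exec_machine_run(&machine);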

src/mesa/pipe/cell/spu/spu_main.c  (+28 -0)

@@ -36,6 +36,7 @@
#include "spu_render.h"
#include "spu_texture.h"
#include "spu_tile.h"
#include "spu_vertex_shader.h"
#include "pipe/cell/common.h"
#include "pipe/p_defines.h"

@@ -50,6 +51,7 @@ boolean Debug = FALSE;

struct spu_global spu;

struct spu_vs_context draw;

/**
* Tell the PPU that this SPU has finished copying a buffer to
@@ -264,6 +266,18 @@ cmd_state_vertex_info(const struct vertex_info *vinfo)
}


static void
cmd_state_vs_array_info(const struct cell_array_info *vs_info)
{
const unsigned attr = vs_info->attr;

ASSERT(attr < PIPE_ATTRIB_MAX);
draw.vertex_fetch.src_ptr[attr] = vs_info->base;
draw.vertex_fetch.pitch[attr] = vs_info->pitch;
draw.vertex_fetch.format[attr] = vs_info->format;
draw.vertex_fetch.dirty = 1;
}


static void
cmd_finish(void)
@@ -374,6 +388,20 @@ cmd_batch(uint opcode)
cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
pos += (1 + sizeof(struct vertex_info) / 4);
break;
case CELL_CMD_STATE_VIEWPORT:
(void) memcpy(& draw.viewport, &buffer[pos+1],
sizeof(struct pipe_viewport_state));
pos += (1 + sizeof(struct pipe_viewport_state) / 4);
break;
case CELL_CMD_STATE_VS_ARRAY_INFO:
cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
pos += (1 + sizeof(struct cell_array_info) / 4);
break;
case CELL_CMD_VS_EXECUTE:
spu_execute_vertex_shader(&draw,
(struct cell_command_vs *) &buffer[pos+1]);
pos += (1 + sizeof(struct cell_command_vs) / 4);
break;
default:
printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, buffer[pos]);
ASSERT(0);

src/mesa/pipe/cell/spu/spu_util.c  (+165 -0)

@@ -0,0 +1,165 @@
#include "pipe/p_util.h"
#include "pipe/p_shader_tokens.h"
#include "pipe/tgsi/util/tgsi_parse.h"
//#include "tgsi_build.h"
#include "pipe/tgsi/util/tgsi_util.h"

unsigned
tgsi_util_get_src_register_swizzle(
const struct tgsi_src_register *reg,
unsigned component )
{
switch( component ) {
case 0:
return reg->SwizzleX;
case 1:
return reg->SwizzleY;
case 2:
return reg->SwizzleZ;
case 3:
return reg->SwizzleW;
default:
assert( 0 );
}
return 0;
}

unsigned
tgsi_util_get_src_register_extswizzle(
const struct tgsi_src_register_ext_swz *reg,
unsigned component )
{
switch( component ) {
case 0:
return reg->ExtSwizzleX;
case 1:
return reg->ExtSwizzleY;
case 2:
return reg->ExtSwizzleZ;
case 3:
return reg->ExtSwizzleW;
default:
assert( 0 );
}
return 0;
}

unsigned
tgsi_util_get_full_src_register_extswizzle(
const struct tgsi_full_src_register *reg,
unsigned component )
{
unsigned swizzle;

/*
* First, calculate the extended swizzle for a given channel. This will give
* us either a channel index into the simple swizzle or a constant 1 or 0.
*/
swizzle = tgsi_util_get_src_register_extswizzle(
&reg->SrcRegisterExtSwz,
component );

assert (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X);
assert (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y);
assert (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z);
assert (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W);
assert (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W);
assert (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W);

/*
* Second, calculate the simple swizzle for the unswizzled channel index.
* Leave the constants intact, they are not affected by the simple swizzle.
*/
if( swizzle <= TGSI_SWIZZLE_W ) {
swizzle = tgsi_util_get_src_register_swizzle(
&reg->SrcRegister,
component );
}

return swizzle;
}

unsigned
tgsi_util_get_src_register_extnegate(
const struct tgsi_src_register_ext_swz *reg,
unsigned component )
{
switch( component ) {
case 0:
return reg->NegateX;
case 1:
return reg->NegateY;
case 2:
return reg->NegateZ;
case 3:
return reg->NegateW;
default:
assert( 0 );
}
return 0;
}

void
tgsi_util_set_src_register_extnegate(
struct tgsi_src_register_ext_swz *reg,
unsigned negate,
unsigned component )
{
switch( component ) {
case 0:
reg->NegateX = negate;
break;
case 1:
reg->NegateY = negate;
break;
case 2:
reg->NegateZ = negate;
break;
case 3:
reg->NegateW = negate;
break;
default:
assert( 0 );
}
}

unsigned
tgsi_util_get_full_src_register_sign_mode(
const struct tgsi_full_src_register *reg,
unsigned component )
{
unsigned sign_mode;

if( reg->SrcRegisterExtMod.Absolute ) {
/* Consider only the post-abs negation. */

if( reg->SrcRegisterExtMod.Negate ) {
sign_mode = TGSI_UTIL_SIGN_SET;
}
else {
sign_mode = TGSI_UTIL_SIGN_CLEAR;
}
}
else {
/* Accumulate the three negations. */

unsigned negate;

negate = reg->SrcRegister.Negate;
if( tgsi_util_get_src_register_extnegate( &reg->SrcRegisterExtSwz, component ) ) {
negate = !negate;
}
if( reg->SrcRegisterExtMod.Negate ) {
negate = !negate;
}

if( negate ) {
sign_mode = TGSI_UTIL_SIGN_TOGGLE;
}
else {
sign_mode = TGSI_UTIL_SIGN_KEEP;
}
}

return sign_mode;
}
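
These helpers appear to be a local copy of pipe/tgsi/util/tgsi_util.c so the
SPU executor can resolve swizzles and sign modes without the full TGSI utility
code. The consumer side lives in spu_exec.c (suppressed above); applying a
sign mode to a fetched channel looks roughly like this sketch (fabsf from
<math.h>):

   /* Illustrative only -- the real consumer is in spu_exec.c. */
   switch (tgsi_util_get_full_src_register_sign_mode(reg, chan)) {
   case TGSI_UTIL_SIGN_CLEAR:  value = fabsf(value);   break; /* abs */
   case TGSI_UTIL_SIGN_SET:    value = -fabsf(value);  break; /* -abs */
   case TGSI_UTIL_SIGN_TOGGLE: value = -value;         break; /* negate */
   case TGSI_UTIL_SIGN_KEEP:                           break; /* as-is */
   }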

src/mesa/pipe/cell/spu/spu_vertex_fetch.c  (+493 -0)

@@ -0,0 +1,493 @@
/**************************************************************************
*
* Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/

/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
*/

#include "pipe/p_util.h"
#include "pipe/p_state.h"
#include "pipe/p_shader_tokens.h"
#include "spu_exec.h"
#include "spu_vertex_shader.h"


#define DRAW_DBG 0


/**
* Fetch a float[4] vertex attribute from memory, doing format/type
* conversion as needed.
*
* This is probably needed/duplicated elsewhere, e.g. format
* conversion, texture sampling etc.
*/
#define FETCH_ATTRIB( NAME, SZ, CVT ) \
static void \
fetch_##NAME(const void *ptr, float *attrib) \
{ \
static const float defaults[4] = { 0,0,0,1 }; \
int i; \
\
for (i = 0; i < SZ; i++) { \
attrib[i] = CVT; \
} \
\
for (; i < 4; i++) { \
attrib[i] = defaults[i]; \
} \
}
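
For reference, FETCH_ATTRIB(R32G32_FLOAT, 2, CVT_32_FLOAT) expands to roughly
the following, padding missing components to (x, y, 0, 1):

   static void
   fetch_R32G32_FLOAT(const void *ptr, float *attrib)
   {
      static const float defaults[4] = { 0, 0, 0, 1 };
      int i;

      for (i = 0; i < 2; i++)
         attrib[i] = ((float *) ptr)[i];     /* CVT_32_FLOAT */

      for (; i < 4; i++)
         attrib[i] = defaults[i];            /* default z and w */
   }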

#define CVT_64_FLOAT (float) ((double *) ptr)[i]
#define CVT_32_FLOAT ((float *) ptr)[i]

#define CVT_8_USCALED (float) ((unsigned char *) ptr)[i]
#define CVT_16_USCALED (float) ((unsigned short *) ptr)[i]
#define CVT_32_USCALED (float) ((unsigned int *) ptr)[i]

#define CVT_8_SSCALED (float) ((char *) ptr)[i]
#define CVT_16_SSCALED (float) ((short *) ptr)[i]
#define CVT_32_SSCALED (float) ((int *) ptr)[i]

#define CVT_8_UNORM (float) ((unsigned char *) ptr)[i] / 255.0f
#define CVT_16_UNORM (float) ((unsigned short *) ptr)[i] / 65535.0f
#define CVT_32_UNORM (float) ((unsigned int *) ptr)[i] / 4294967295.0f

#define CVT_8_SNORM (float) ((char *) ptr)[i] / 127.0f
#define CVT_16_SNORM (float) ((short *) ptr)[i] / 32767.0f
#define CVT_32_SNORM (float) ((int *) ptr)[i] / 2147483647.0f

FETCH_ATTRIB( R64G64B64A64_FLOAT, 4, CVT_64_FLOAT )
FETCH_ATTRIB( R64G64B64_FLOAT, 3, CVT_64_FLOAT )
FETCH_ATTRIB( R64G64_FLOAT, 2, CVT_64_FLOAT )
FETCH_ATTRIB( R64_FLOAT, 1, CVT_64_FLOAT )

FETCH_ATTRIB( R32G32B32A32_FLOAT, 4, CVT_32_FLOAT )
FETCH_ATTRIB( R32G32B32_FLOAT, 3, CVT_32_FLOAT )
FETCH_ATTRIB( R32G32_FLOAT, 2, CVT_32_FLOAT )
FETCH_ATTRIB( R32_FLOAT, 1, CVT_32_FLOAT )

FETCH_ATTRIB( R32G32B32A32_USCALED, 4, CVT_32_USCALED )
FETCH_ATTRIB( R32G32B32_USCALED, 3, CVT_32_USCALED )
FETCH_ATTRIB( R32G32_USCALED, 2, CVT_32_USCALED )
FETCH_ATTRIB( R32_USCALED, 1, CVT_32_USCALED )

FETCH_ATTRIB( R32G32B32A32_SSCALED, 4, CVT_32_SSCALED )
FETCH_ATTRIB( R32G32B32_SSCALED, 3, CVT_32_SSCALED )
FETCH_ATTRIB( R32G32_SSCALED, 2, CVT_32_SSCALED )
FETCH_ATTRIB( R32_SSCALED, 1, CVT_32_SSCALED )

FETCH_ATTRIB( R32G32B32A32_UNORM, 4, CVT_32_UNORM )
FETCH_ATTRIB( R32G32B32_UNORM, 3, CVT_32_UNORM )
FETCH_ATTRIB( R32G32_UNORM, 2, CVT_32_UNORM )
FETCH_ATTRIB( R32_UNORM, 1, CVT_32_UNORM )

FETCH_ATTRIB( R32G32B32A32_SNORM, 4, CVT_32_SNORM )
FETCH_ATTRIB( R32G32B32_SNORM, 3, CVT_32_SNORM )
FETCH_ATTRIB( R32G32_SNORM, 2, CVT_32_SNORM )
FETCH_ATTRIB( R32_SNORM, 1, CVT_32_SNORM )

FETCH_ATTRIB( R16G16B16A16_USCALED, 4, CVT_16_USCALED )
FETCH_ATTRIB( R16G16B16_USCALED, 3, CVT_16_USCALED )
FETCH_ATTRIB( R16G16_USCALED, 2, CVT_16_USCALED )
FETCH_ATTRIB( R16_USCALED, 1, CVT_16_USCALED )

FETCH_ATTRIB( R16G16B16A16_SSCALED, 4, CVT_16_SSCALED )
FETCH_ATTRIB( R16G16B16_SSCALED, 3, CVT_16_SSCALED )
FETCH_ATTRIB( R16G16_SSCALED, 2, CVT_16_SSCALED )
FETCH_ATTRIB( R16_SSCALED, 1, CVT_16_SSCALED )

FETCH_ATTRIB( R16G16B16A16_UNORM, 4, CVT_16_UNORM )
FETCH_ATTRIB( R16G16B16_UNORM, 3, CVT_16_UNORM )
FETCH_ATTRIB( R16G16_UNORM, 2, CVT_16_UNORM )
FETCH_ATTRIB( R16_UNORM, 1, CVT_16_UNORM )

FETCH_ATTRIB( R16G16B16A16_SNORM, 4, CVT_16_SNORM )
FETCH_ATTRIB( R16G16B16_SNORM, 3, CVT_16_SNORM )
FETCH_ATTRIB( R16G16_SNORM, 2, CVT_16_SNORM )
FETCH_ATTRIB( R16_SNORM, 1, CVT_16_SNORM )

FETCH_ATTRIB( R8G8B8A8_USCALED, 4, CVT_8_USCALED )
FETCH_ATTRIB( R8G8B8_USCALED, 3, CVT_8_USCALED )
FETCH_ATTRIB( R8G8_USCALED, 2, CVT_8_USCALED )
FETCH_ATTRIB( R8_USCALED, 1, CVT_8_USCALED )

FETCH_ATTRIB( R8G8B8A8_SSCALED, 4, CVT_8_SSCALED )
FETCH_ATTRIB( R8G8B8_SSCALED, 3, CVT_8_SSCALED )
FETCH_ATTRIB( R8G8_SSCALED, 2, CVT_8_SSCALED )
FETCH_ATTRIB( R8_SSCALED, 1, CVT_8_SSCALED )

FETCH_ATTRIB( R8G8B8A8_UNORM, 4, CVT_8_UNORM )
FETCH_ATTRIB( R8G8B8_UNORM, 3, CVT_8_UNORM )
FETCH_ATTRIB( R8G8_UNORM, 2, CVT_8_UNORM )
FETCH_ATTRIB( R8_UNORM, 1, CVT_8_UNORM )

FETCH_ATTRIB( R8G8B8A8_SNORM, 4, CVT_8_SNORM )
FETCH_ATTRIB( R8G8B8_SNORM, 3, CVT_8_SNORM )
FETCH_ATTRIB( R8G8_SNORM, 2, CVT_8_SNORM )
FETCH_ATTRIB( R8_SNORM, 1, CVT_8_SNORM )

FETCH_ATTRIB( A8R8G8B8_UNORM, 4, CVT_8_UNORM )
//FETCH_ATTRIB( R8G8B8A8_UNORM, 4, CVT_8_UNORM )



static spu_fetch_func get_fetch_func( enum pipe_format format )
{
#if 0
{
char tmp[80];
pf_sprint_name(tmp, format);
_mesa_printf("%s: %s\n", __FUNCTION__, tmp);
}
#endif

switch (format) {
case PIPE_FORMAT_R64_FLOAT:
return fetch_R64_FLOAT;
case PIPE_FORMAT_R64G64_FLOAT:
return fetch_R64G64_FLOAT;
case PIPE_FORMAT_R64G64B64_FLOAT:
return fetch_R64G64B64_FLOAT;
case PIPE_FORMAT_R64G64B64A64_FLOAT:
return fetch_R64G64B64A64_FLOAT;

case PIPE_FORMAT_R32_FLOAT:
return fetch_R32_FLOAT;
case PIPE_FORMAT_R32G32_FLOAT:
return fetch_R32G32_FLOAT;
case PIPE_FORMAT_R32G32B32_FLOAT:
return fetch_R32G32B32_FLOAT;
case PIPE_FORMAT_R32G32B32A32_FLOAT:
return fetch_R32G32B32A32_FLOAT;

case PIPE_FORMAT_R32_UNORM:
return fetch_R32_UNORM;
case PIPE_FORMAT_R32G32_UNORM:
return fetch_R32G32_UNORM;
case PIPE_FORMAT_R32G32B32_UNORM:
return fetch_R32G32B32_UNORM;
case PIPE_FORMAT_R32G32B32A32_UNORM:
return fetch_R32G32B32A32_UNORM;

case PIPE_FORMAT_R32_USCALED:
return fetch_R32_USCALED;
case PIPE_FORMAT_R32G32_USCALED:
return fetch_R32G32_USCALED;
case PIPE_FORMAT_R32G32B32_USCALED:
return fetch_R32G32B32_USCALED;
case PIPE_FORMAT_R32G32B32A32_USCALED:
return fetch_R32G32B32A32_USCALED;

case PIPE_FORMAT_R32_SNORM:
return fetch_R32_SNORM;
case PIPE_FORMAT_R32G32_SNORM:
return fetch_R32G32_SNORM;
case PIPE_FORMAT_R32G32B32_SNORM:
return fetch_R32G32B32_SNORM;
case PIPE_FORMAT_R32G32B32A32_SNORM:
return fetch_R32G32B32A32_SNORM;

case PIPE_FORMAT_R32_SSCALED:
return fetch_R32_SSCALED;
case PIPE_FORMAT_R32G32_SSCALED:
return fetch_R32G32_SSCALED;
case PIPE_FORMAT_R32G32B32_SSCALED:
return fetch_R32G32B32_SSCALED;
case PIPE_FORMAT_R32G32B32A32_SSCALED:
return fetch_R32G32B32A32_SSCALED;

case PIPE_FORMAT_R16_UNORM:
return fetch_R16_UNORM;
case PIPE_FORMAT_R16G16_UNORM:
return fetch_R16G16_UNORM;
case PIPE_FORMAT_R16G16B16_UNORM:
return fetch_R16G16B16_UNORM;
case PIPE_FORMAT_R16G16B16A16_UNORM:
return fetch_R16G16B16A16_UNORM;

case PIPE_FORMAT_R16_USCALED:
return fetch_R16_USCALED;
case PIPE_FORMAT_R16G16_USCALED:
return fetch_R16G16_USCALED;
case PIPE_FORMAT_R16G16B16_USCALED:
return fetch_R16G16B16_USCALED;
case PIPE_FORMAT_R16G16B16A16_USCALED:
return fetch_R16G16B16A16_USCALED;

case PIPE_FORMAT_R16_SNORM:
return fetch_R16_SNORM;
case PIPE_FORMAT_R16G16_SNORM:
return fetch_R16G16_SNORM;
case PIPE_FORMAT_R16G16B16_SNORM:
return fetch_R16G16B16_SNORM;
case PIPE_FORMAT_R16G16B16A16_SNORM:
return fetch_R16G16B16A16_SNORM;

case PIPE_FORMAT_R16_SSCALED:
return fetch_R16_SSCALED;
case PIPE_FORMAT_R16G16_SSCALED:
return fetch_R16G16_SSCALED;
case PIPE_FORMAT_R16G16B16_SSCALED:
return fetch_R16G16B16_SSCALED;
case PIPE_FORMAT_R16G16B16A16_SSCALED:
return fetch_R16G16B16A16_SSCALED;

case PIPE_FORMAT_R8_UNORM:
return fetch_R8_UNORM;
case PIPE_FORMAT_R8G8_UNORM:
return fetch_R8G8_UNORM;
case PIPE_FORMAT_R8G8B8_UNORM:
return fetch_R8G8B8_UNORM;
case PIPE_FORMAT_R8G8B8A8_UNORM:
return fetch_R8G8B8A8_UNORM;

case PIPE_FORMAT_R8_USCALED:
return fetch_R8_USCALED;
case PIPE_FORMAT_R8G8_USCALED:
return fetch_R8G8_USCALED;
case PIPE_FORMAT_R8G8B8_USCALED:
return fetch_R8G8B8_USCALED;
case PIPE_FORMAT_R8G8B8A8_USCALED:
return fetch_R8G8B8A8_USCALED;

case PIPE_FORMAT_R8_SNORM:
return fetch_R8_SNORM;
case PIPE_FORMAT_R8G8_SNORM:
return fetch_R8G8_SNORM;
case PIPE_FORMAT_R8G8B8_SNORM:
return fetch_R8G8B8_SNORM;
case PIPE_FORMAT_R8G8B8A8_SNORM:
return fetch_R8G8B8A8_SNORM;

case PIPE_FORMAT_R8_SSCALED:
return fetch_R8_SSCALED;
case PIPE_FORMAT_R8G8_SSCALED:
return fetch_R8G8_SSCALED;
case PIPE_FORMAT_R8G8B8_SSCALED:
return fetch_R8G8B8_SSCALED;
case PIPE_FORMAT_R8G8B8A8_SSCALED:
return fetch_R8G8B8A8_SSCALED;

case PIPE_FORMAT_A8R8G8B8_UNORM:
return fetch_A8R8G8B8_UNORM;

case 0:
return NULL; /* not sure why this is needed */

default:
assert(0);
return NULL;
}
}


static void
transpose_4x4( float *out, const float *in )
{
/* This can be achieved in 12 sse instructions, plus the final
* stores I guess. This is probably a bit more than that - maybe
* 32 or so?
*/
out[0] = in[0]; out[1] = in[4]; out[2] = in[8]; out[3] = in[12];
out[4] = in[1]; out[5] = in[5]; out[6] = in[9]; out[7] = in[13];
out[8] = in[2]; out[9] = in[6]; out[10] = in[10]; out[11] = in[14];
out[12] = in[3]; out[13] = in[7]; out[14] = in[11]; out[15] = in[15];
}
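
The transpose turns four fetched AoS attribute vectors into the SoA layout
the executor's quad registers use. For example:

   /* Four vertices, AoS: p[v] holds { x, y, z, w } for vertex v. */
   float p[4][4], soa[4][4];
   transpose_4x4((float *) soa, (const float *) p);
   /* Now soa[0] holds x0 x1 x2 x3, soa[1] holds y0 y1 y2 y3, etc. */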



static void fetch_xyz_rgb( struct spu_vs_context *draw,
struct spu_exec_machine *machine,
const unsigned *elts,
unsigned count )
{
assert(count <= 4);

// _mesa_printf("%s\n", __FUNCTION__);

/* loop over vertex attributes (vertex shader inputs)
*/

const unsigned *pitch = draw->vertex_fetch.pitch;
const ubyte **src = draw->vertex_fetch.src_ptr;
int i;

for (i = 0; i < 4; i++) {
{
const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
float *out = &machine->Inputs[0].xyzw[0].f[i];
out[0] = in[0];
out[4] = in[1];
out[8] = in[2];
out[12] = 1.0f;
}

{
const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
float *out = &machine->Inputs[1].xyzw[0].f[i];
out[0] = in[0];
out[4] = in[1];
out[8] = in[2];
out[12] = 1.0f;
}
}
}




static void fetch_xyz_rgb_st( struct spu_vs_context *draw,
struct spu_exec_machine *machine,
const unsigned *elts,
unsigned count )
{
assert(count <= 4);

/* loop over vertex attributes (vertex shader inputs)
*/

const unsigned *pitch = draw->vertex_fetch.pitch;
const ubyte **src = draw->vertex_fetch.src_ptr;
int i;

for (i = 0; i < 4; i++) {
{
const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
float *out = &machine->Inputs[0].xyzw[0].f[i];
out[0] = in[0];
out[4] = in[1];
out[8] = in[2];
out[12] = 1.0f;
}

{
const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
float *out = &machine->Inputs[1].xyzw[0].f[i];
out[0] = in[0];
out[4] = in[1];
out[8] = in[2];
out[12] = 1.0f;
}

{
const float *in = (const float *)(src[2] + elts[i] * pitch[2]);
float *out = &machine->Inputs[2].xyzw[0].f[i];
out[0] = in[0];
out[4] = in[1];
out[8] = 0.0f;
out[12] = 1.0f;
}
}
}




/**
* Fetch vertex attributes for 'count' vertices.
*/
static void generic_vertex_fetch( struct spu_vs_context *draw,
struct spu_exec_machine *machine,
const unsigned *elts,
unsigned count )
{
unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
unsigned attr;

assert(count <= 4);

// _mesa_printf("%s %d\n", __FUNCTION__, count);

/* loop over vertex attributes (vertex shader inputs)
*/
for (attr = 0; attr < nr_attrs; attr++) {

const unsigned pitch = draw->vertex_fetch.pitch[attr];
const ubyte *src = draw->vertex_fetch.src_ptr[attr];
const spu_fetch_func fetch = draw->vertex_fetch.fetch[attr];
unsigned i;
float p[4][4];


/* Fetch four attributes for four vertices.
*
* Could fetch directly into AOS format, but this is meant to be
* a prototype for an sse implementation, which would have
* difficulties doing that.
*/
for (i = 0; i < count; i++)
fetch( src + elts[i] * pitch, p[i] );

/* Be nice and zero out any missing vertices:
*/
for (/* empty */; i < 4; i++)
p[i][0] = p[i][1] = p[i][2] = p[i][3] = 0;
/* Transpose/swizzle into sse-friendly format. Currently
* assuming that all vertex shader inputs are float[4], but this
* isn't true -- if the vertex shader only wants tex0.xy, we
* could optimize for that.
*
* To do so fully without codegen would probably require an
* excessive number of fetch functions, but we could at least
* minimize the transpose step:
*/
transpose_4x4( (float *)&machine->Inputs[attr].xyzw[0].f[0], (float *)p );
}
}


void spu_update_vertex_fetch( struct spu_vs_context *draw )
{
unsigned i;

for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
draw->vertex_fetch.fetch[i] =
get_fetch_func(draw->vertex_fetch.format[i]);
}

draw->vertex_fetch.fetch_func = generic_vertex_fetch;

switch (draw->vertex_fetch.nr_attrs) {
case 2:
if (draw->vertex_fetch.format[0] == PIPE_FORMAT_R32G32B32_FLOAT &&
draw->vertex_fetch.format[1] == PIPE_FORMAT_R32G32B32_FLOAT)
draw->vertex_fetch.fetch_func = fetch_xyz_rgb;
break;
case 3:
if (draw->vertex_fetch.format[0] == PIPE_FORMAT_R32G32B32_FLOAT &&
draw->vertex_fetch.format[1] == PIPE_FORMAT_R32G32B32_FLOAT &&
draw->vertex_fetch.format[2] == PIPE_FORMAT_R32G32_FLOAT)
draw->vertex_fetch.fetch_func = fetch_xyz_rgb_st;
break;
default:
break;
}
}
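
So a two-attribute xyz/rgb layout short-circuits to the specialized fetch
path, and anything else falls back to the generic per-attribute loop. A
sketch of the effect:

   /* Hypothetical setup: position + color, both float[3]. */
   draw->vertex_fetch.nr_attrs  = 2;
   draw->vertex_fetch.format[0] = PIPE_FORMAT_R32G32B32_FLOAT;
   draw->vertex_fetch.format[1] = PIPE_FORMAT_R32G32B32_FLOAT;
   spu_update_vertex_fetch(draw);
   /* fetch_func now points at fetch_xyz_rgb instead of
    * generic_vertex_fetch. */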

src/mesa/pipe/cell/spu/spu_vertex_shader.c  (+224 -0)

@@ -0,0 +1,224 @@
/**************************************************************************
*
* Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/

/*
* Authors:
* Keith Whitwell <keith@tungstengraphics.com>
* Brian Paul
* Ian Romanick <idr@us.ibm.com>
*/

#include "pipe/p_util.h"
#include "pipe/p_state.h"
#include "pipe/p_shader_tokens.h"
#include "spu_vertex_shader.h"
#include "spu_exec.h"
#include "pipe/draw/draw_private.h"
#include "pipe/draw/draw_context.h"
#include "pipe/cell/common.h"

#define DBG_VS 0


static INLINE unsigned
compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr)
{
unsigned mask = 0;
unsigned i;

/* Do the hardwired planes first:
*/
if (-clip[0] + clip[3] < 0) mask |= CLIP_RIGHT_BIT;
if ( clip[0] + clip[3] < 0) mask |= CLIP_LEFT_BIT;
if (-clip[1] + clip[3] < 0) mask |= CLIP_TOP_BIT;
if ( clip[1] + clip[3] < 0) mask |= CLIP_BOTTOM_BIT;
if (-clip[2] + clip[3] < 0) mask |= CLIP_FAR_BIT;
if ( clip[2] + clip[3] < 0) mask |= CLIP_NEAR_BIT;

/* Followed by any remaining ones:
*/
for (i = 6; i < nr; i++) {
if (dot4(clip, plane[i]) < 0)
mask |= (1<<i);
}

return mask;
}
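
Each hardwired test checks one clip-space half-space; a vertex passes the
right plane when x <= w, i.e. -x + w >= 0. A quick worked case:

   /* clip = (2, 0, 0, 1): x > w, so only the right plane fails:
    *   -2 + 1 < 0   ->  CLIP_RIGHT_BIT set
    *    2 + 1 >= 0  ->  left plane passes, and likewise for the rest,
    * so compute_clipmask(clip, plane, 6) == CLIP_RIGHT_BIT. */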


/**
* Transform vertices with the current vertex program/shader
* Up to four vertices can be shaded at a time.
* \param vbuffer the input vertex data
* \param elts indexes of four input vertices
* \param count number of vertices to shade [1..4]
* \param vOut array of pointers to four output vertices
*/
static void
run_vertex_program(struct spu_vs_context *draw,
unsigned elts[4], unsigned count,
struct vertex_header *vOut[])
{
struct spu_exec_machine *machine = &draw->machine;
unsigned int j;

ALIGN16_DECL(struct spu_exec_vector, inputs, PIPE_ATTRIB_MAX);
ALIGN16_DECL(struct spu_exec_vector, outputs, PIPE_ATTRIB_MAX);
const float *scale = draw->viewport.scale;
const float *trans = draw->viewport.translate;

assert(count <= 4);

/* Consts does not require 16 byte alignment. */
ASSERT_ALIGN16(draw->constants);
machine->Consts = (float (*)[4]) draw->constants;

machine->Inputs = ALIGN16_ASSIGN(inputs);
machine->Outputs = ALIGN16_ASSIGN(outputs);

spu_vertex_fetch( draw, machine, elts, count );

/* run shader */
spu_exec_machine_run( machine );


/* store machine results */
for (j = 0; j < count; j++) {
unsigned slot;
float x, y, z, w;

/* Handle attr[0] (position) specially:
*
* XXX: Computing the clipmask should be done in the vertex
* program as a set of DP4 instructions appended to the
* user-provided code.
*/
x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];

vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane,
draw->nr_planes);
vOut[j]->edgeflag = 1;

/* divide by w */
w = 1.0f / w;
x *= w;
y *= w;
z *= w;

/* Viewport mapping */
vOut[j]->data[0][0] = x * scale[0] + trans[0];
vOut[j]->data[0][1] = y * scale[1] + trans[1];
vOut[j]->data[0][2] = z * scale[2] + trans[2];
vOut[j]->data[0][3] = w;

#if DBG_VS
printf("output[%d]win: %f %f %f %f\n", j,
vOut[j]->data[0][0],
vOut[j]->data[0][1],
vOut[j]->data[0][2],
vOut[j]->data[0][3]);
#endif
/* Remaining attributes are packed into sequential post-transform
* vertex attrib slots.
*/
for (slot = 1; slot < draw->num_vs_outputs; slot++) {
vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
#if DBG_VS
printf("output[%d][%d]: %f %f %f %f\n", j, slot,
vOut[j]->data[slot][0],
vOut[j]->data[slot][1],
vOut[j]->data[slot][2],
vOut[j]->data[slot][3]);
#endif
}
} /* loop over vertices */
}


static void
spu_bind_vertex_shader(struct spu_vs_context *draw,
void *uniforms,
void *planes,
unsigned nr_planes,
unsigned num_outputs
)
{
draw->constants = (float (*)[4]) uniforms;

if (planes != NULL)
(void) memcpy(draw->plane, planes, sizeof(float) * 4 * nr_planes);
draw->nr_planes = nr_planes;
draw->num_vs_outputs = num_outputs;

/* specify the shader to interpret/execute */
spu_exec_machine_init(&draw->machine,
PIPE_MAX_SAMPLERS,
NULL /*samplers*/,
PIPE_SHADER_VERTEX);
}


void
spu_execute_vertex_shader(struct spu_vs_context *draw,
const struct cell_command_vs *vs)
{
unsigned i;
unsigned j;

draw->machine.Instructions = (struct tgsi_full_instruction *)
vs->shader.instructions;
draw->machine.NumInstructions = vs->shader.num_instructions;

draw->machine.Declarations = (struct tgsi_full_declaration *)
vs->shader.declarations;
draw->machine.NumDeclarations = vs->shader.num_declarations;

spu_bind_vertex_shader(draw, vs->shader.uniforms,
NULL, 0 /* no user clip planes yet */,
vs->shader.num_outputs);
for (i = 0; i < vs->num_elts; i += 4) {
const unsigned batch_size = MIN2(vs->num_elts - i, 4);
unsigned elts[4];

for (j = 0; j < batch_size; j++) {
switch (vs->bytes_per_elt) {
case 1: elts[j] = ((unsigned char *) vs->elts)[i + j]; break;
case 2: elts[j] = ((unsigned short *)vs->elts)[i + j]; break;
case 4: elts[j] = ((unsigned int *) vs->elts)[i + j]; break;
}
}

run_vertex_program(draw, elts, batch_size,
(struct vertex_header (*)[]) vs->vOut);
}
}

src/mesa/pipe/cell/spu/spu_vertex_shader.h  (+61 -0)

@@ -0,0 +1,61 @@
#ifndef SPU_VERTEX_SHADER_H
#define SPU_VERTEX_SHADER_H

#include "pipe/p_format.h"
#include "spu_exec.h"

struct spu_vs_context;

typedef void (*spu_fetch_func)(const void *ptr, float *attrib);
typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw,
struct spu_exec_machine *machine,
const unsigned *elts,
unsigned count );

struct spu_vs_context {
struct pipe_viewport_state viewport;

struct {
const ubyte *src_ptr[PIPE_ATTRIB_MAX];
unsigned pitch[PIPE_ATTRIB_MAX];
enum pipe_format format[PIPE_ATTRIB_MAX];
unsigned nr_attrs;
boolean dirty;

spu_fetch_func fetch[PIPE_ATTRIB_MAX];
spu_full_fetch_func fetch_func;
} vertex_fetch;
/* Clip derived state:
*/
float plane[12][4];
unsigned nr_planes;

struct spu_exec_machine machine;
const float (*constants)[4];

unsigned num_vs_outputs;
};

extern void spu_update_vertex_fetch(struct spu_vs_context *draw);

static INLINE void spu_vertex_fetch(struct spu_vs_context *draw,
struct spu_exec_machine *machine,
const unsigned *elts,
unsigned count)
{
if (draw->vertex_fetch.dirty) {
spu_update_vertex_fetch(draw);
draw->vertex_fetch.dirty = 0;
}
(*draw->vertex_fetch.fetch_func)(draw, machine, elts, count);
}

struct cell_command_vs;

extern void
spu_execute_vertex_shader(struct spu_vs_context *draw,
const struct cell_command_vs *vs);

#endif /* SPU_VERTEX_SHADER_H */
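
Taken together, the dirty flag gives a lazy-validate pattern: state commands
only record new array info, and the next draw revalidates. The expected flow,
as a sketch of the sequence (the PPU never issues these commands yet):

   /* 1. CELL_CMD_STATE_VS_ARRAY_INFO: cmd_state_vs_array_info() stores
    *    base/pitch/format and sets vertex_fetch.dirty = 1.
    * 2. CELL_CMD_VS_EXECUTE: spu_execute_vertex_shader() runs batches of
    *    four elements; spu_vertex_fetch() sees dirty, rebuilds the
    *    per-attribute fetch[] table via spu_update_vertex_fetch(),
    *    clears dirty, then dispatches fetch_func. */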
