Clone of mesa.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

tgsi_ppc.c 40KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363
  1. /**************************************************************************
  2. *
  3. * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  4. * All Rights Reserved.
  5. *
  6. * Permission is hereby granted, free of charge, to any person obtaining a
  7. * copy of this software and associated documentation files (the
  8. * "Software"), to deal in the Software without restriction, including
  9. * without limitation the rights to use, copy, modify, merge, publish,
  10. * distribute, sub license, and/or sell copies of the Software, and to
  11. * permit persons to whom the Software is furnished to do so, subject to
  12. * the following conditions:
  13. *
  14. * The above copyright notice and this permission notice (including the
  15. * next paragraph) shall be included in all copies or substantial portions
  16. * of the Software.
  17. *
  18. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  19. * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  20. * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
  21. * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
  22. * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  23. * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  24. * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  25. *
  26. **************************************************************************/
  27. /**
  28. * TGSI to PowerPC code generation.
  29. */
  30. #include "pipe/p_config.h"
  31. #if defined(PIPE_ARCH_PPC)
  32. #include "pipe/p_debug.h"
  33. #include "pipe/p_shader_tokens.h"
  34. #include "util/u_math.h"
  35. #include "util/u_memory.h"
  36. #include "util/u_sse.h"
  37. #include "tgsi/tgsi_parse.h"
  38. #include "tgsi/tgsi_util.h"
  39. #include "tgsi_dump.h"
  40. #include "tgsi_exec.h"
  41. #include "tgsi_ppc.h"
  42. #include "rtasm/rtasm_ppc.h"
  43. /**
  44. * Since it's pretty much impossible to form PPC vector immediates, load
  45. * them from memory here:
  46. */
  47. const float ppc_builtin_constants[] ALIGN16_ATTRIB = {
  48. 1.0f, -128.0f, 128.0, 0.0
  49. };
  /* Channel indices within a TGSI register. */
  #define CHAN_X 0
  #define CHAN_Y 1
  #define CHAN_Z 2
  #define CHAN_W 3

  /** Loop header: step CHAN over every channel index. */
  #define FOR_EACH_CHANNEL( CHAN ) \
     for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)

  /** Non-zero if channel CHAN is set in the write mask of INST's dst[0]. */
  #define IS_DST0_CHANNEL_ENABLED( INST, CHAN ) \
     ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))

  /** 'if' header: body runs only when CHAN is enabled in dst[0]. */
  #define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN ) \
     if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))

  /**
   * Loop header: step CHAN over the channels enabled in dst[0]'s write mask.
   * Expands to a 'for' followed by an 'if', so the statement that follows
   * is the body of both.
   */
  #define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN ) \
     FOR_EACH_CHANNEL( CHAN ) \
        IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )

  /**
   * Number of TGSI temporaries (indices [0..MAX_PPC_TEMPS-1]) that are kept
   * in real PPC vector registers instead of memory.
   */
  #define MAX_PPC_TEMPS 3
  68. /**
  69. * Context/state used during code gen.
  70. */
  71. struct gen_context
  72. {
  73. struct ppc_function *f;
  74. int inputs_reg; /**< GP register pointing to input params */
  75. int outputs_reg; /**< GP register pointing to output params */
  76. int temps_reg; /**< GP register pointing to temporary "registers" */
  77. int immed_reg; /**< GP register pointing to immediates buffer */
  78. int const_reg; /**< GP register pointing to constants buffer */
  79. int builtins_reg; /**< GP register pointint to built-in constants */
  80. int offset_reg; /**< used to reduce redundant li instructions */
  81. int offset_value;
  82. int one_vec; /**< vector register with {1.0, 1.0, 1.0, 1.0} */
  83. int bit31_vec; /**< vector register with {1<<31, 1<<31, 1<<31, 1<<31} */
  84. /**
  85. * Map TGSI temps to PPC vector temps.
  86. * We have 32 PPC vector regs. Use 16 of them for storing 4 TGSI temps.
  87. * XXX currently only do this for TGSI temps [0..MAX_PPC_TEMPS-1].
  88. */
  89. int temps_map[MAX_PPC_TEMPS][4];
  90. /**
  91. * Cache of src registers.
  92. * This is used to avoid redundant load instructions.
  93. */
  94. struct {
  95. struct tgsi_full_src_register src;
  96. uint chan;
  97. uint vec;
  98. } regs[12]; /* 3 src regs, 4 channels */
  99. uint num_regs;
  100. };
  101. /**
  102. * Initialize code generation context.
  103. */
  104. static void
  105. init_gen_context(struct gen_context *gen, struct ppc_function *func)
  106. {
  107. uint i;
  108. memset(gen, 0, sizeof(*gen));
  109. gen->f = func;
  110. gen->inputs_reg = ppc_reserve_register(func, 3); /* first function param */
  111. gen->outputs_reg = ppc_reserve_register(func, 4); /* second function param */
  112. gen->temps_reg = ppc_reserve_register(func, 5); /* ... */
  113. gen->immed_reg = ppc_reserve_register(func, 6);
  114. gen->const_reg = ppc_reserve_register(func, 7);
  115. gen->builtins_reg = ppc_reserve_register(func, 8);
  116. gen->one_vec = -1;
  117. gen->bit31_vec = -1;
  118. gen->offset_reg = -1;
  119. gen->offset_value = -9999999;
  120. for (i = 0; i < MAX_PPC_TEMPS; i++) {
  121. gen->temps_map[i][0] = ppc_allocate_vec_register(gen->f);
  122. gen->temps_map[i][1] = ppc_allocate_vec_register(gen->f);
  123. gen->temps_map[i][2] = ppc_allocate_vec_register(gen->f);
  124. gen->temps_map[i][3] = ppc_allocate_vec_register(gen->f);
  125. }
  126. }
  127. /**
  128. * Is the given TGSI register stored as a real PPC vector register?
  129. */
  130. static boolean
  131. is_ppc_vec_temporary(const struct tgsi_full_src_register *reg)
  132. {
  133. return (reg->SrcRegister.File == TGSI_FILE_TEMPORARY &&
  134. reg->SrcRegister.Index < MAX_PPC_TEMPS);
  135. }
  136. /**
  137. * Is the given TGSI register stored as a real PPC vector register?
  138. */
  139. static boolean
  140. is_ppc_vec_temporary_dst(const struct tgsi_full_dst_register *reg)
  141. {
  142. return (reg->DstRegister.File == TGSI_FILE_TEMPORARY &&
  143. reg->DstRegister.Index < MAX_PPC_TEMPS);
  144. }
  145. /**
  146. * All PPC vector load/store instructions form an effective address
  147. * by adding the contents of two registers. For example:
  148. * lvx v2,r8,r9 # v2 = memory[r8 + r9]
  149. * stvx v2,r8,r9 # memory[r8 + r9] = v2;
  150. * So our lvx/stvx instructions are typically preceded by an 'li' instruction
  151. * to load r9 (above) with an immediate (an offset).
  152. * This code emits that 'li' instruction, but only if the offset value is
  153. * different than the previous 'li'.
  154. * This optimization seems to save about 10% in the instruction count.
  155. * Note that we need to unconditionally emit an 'li' inside basic blocks
  156. * (such as inside loops).
  157. */
  158. static int
  159. emit_li_offset(struct gen_context *gen, int offset)
  160. {
  161. if (gen->offset_reg <= 0) {
  162. /* allocate a GP register for storing load/store offset */
  163. gen->offset_reg = ppc_allocate_register(gen->f);
  164. }
  165. /* emit new 'li' if offset is changing */
  166. if (gen->offset_value < 0 || gen->offset_value != offset) {
  167. gen->offset_value = offset;
  168. ppc_li(gen->f, gen->offset_reg, offset);
  169. }
  170. return gen->offset_reg;
  171. }
  172. /**
  173. * Forces subsequent emit_li_offset() calls to emit an 'li'.
  174. * To be called at the top of basic blocks.
  175. */
  176. static void
  177. reset_li_offset(struct gen_context *gen)
  178. {
  179. gen->offset_value = -9999999;
  180. }
  181. /**
  182. * Load the given vector register with {value, value, value, value}.
  183. * The value must be in the ppu_builtin_constants[] array.
  184. * We wouldn't need this if there was a simple way to load PPC vector
  185. * registers with immediate values!
  186. */
  187. static void
  188. load_constant_vec(struct gen_context *gen, int dst_vec, float value)
  189. {
  190. uint pos;
  191. for (pos = 0; pos < Elements(ppc_builtin_constants); pos++) {
  192. if (ppc_builtin_constants[pos] == value) {
  193. int offset = pos * 4;
  194. int offset_reg = emit_li_offset(gen, offset);
  195. /* Load 4-byte word into vector register.
  196. * The vector slot depends on the effective address we load from.
  197. * We know that our builtins start at a 16-byte boundary so we
  198. * know that 'swizzle' tells us which vector slot will have the
  199. * loaded word. The other vector slots will be undefined.
  200. */
  201. ppc_lvewx(gen->f, dst_vec, gen->builtins_reg, offset_reg);
  202. /* splat word[pos % 4] across the vector reg */
  203. ppc_vspltw(gen->f, dst_vec, dst_vec, pos % 4);
  204. return;
  205. }
  206. }
  207. assert(0 && "Need to add new constant to ppc_builtin_constants array");
  208. }
  209. /**
  210. * Return index of vector register containing {1.0, 1.0, 1.0, 1.0}.
  211. */
  212. static int
  213. gen_one_vec(struct gen_context *gen)
  214. {
  215. if (gen->one_vec < 0) {
  216. gen->one_vec = ppc_allocate_vec_register(gen->f);
  217. load_constant_vec(gen, gen->one_vec, 1.0f);
  218. }
  219. return gen->one_vec;
  220. }
  221. /**
  222. * Return index of vector register containing {1<<31, 1<<31, 1<<31, 1<<31}.
  223. */
  224. static int
  225. gen_get_bit31_vec(struct gen_context *gen)
  226. {
  227. if (gen->bit31_vec < 0) {
  228. gen->bit31_vec = ppc_allocate_vec_register(gen->f);
  229. ppc_vspltisw(gen->f, gen->bit31_vec, -1);
  230. ppc_vslw(gen->f, gen->bit31_vec, gen->bit31_vec, gen->bit31_vec);
  231. }
  232. return gen->bit31_vec;
  233. }
  234. /**
  235. * Register fetch. Return PPC vector register with result.
  236. */
  237. static int
  238. emit_fetch(struct gen_context *gen,
  239. const struct tgsi_full_src_register *reg,
  240. const unsigned chan_index)
  241. {
  242. uint swizzle = tgsi_util_get_full_src_register_extswizzle(reg, chan_index);
  243. int dst_vec = -1;
  244. switch (swizzle) {
  245. case TGSI_EXTSWIZZLE_X:
  246. case TGSI_EXTSWIZZLE_Y:
  247. case TGSI_EXTSWIZZLE_Z:
  248. case TGSI_EXTSWIZZLE_W:
  249. switch (reg->SrcRegister.File) {
  250. case TGSI_FILE_INPUT:
  251. {
  252. int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
  253. int offset_reg = emit_li_offset(gen, offset);
  254. dst_vec = ppc_allocate_vec_register(gen->f);
  255. ppc_lvx(gen->f, dst_vec, gen->inputs_reg, offset_reg);
  256. }
  257. break;
  258. case TGSI_FILE_TEMPORARY:
  259. if (is_ppc_vec_temporary(reg)) {
  260. /* use PPC vec register */
  261. dst_vec = gen->temps_map[reg->SrcRegister.Index][swizzle];
  262. }
  263. else {
  264. /* use memory-based temp register "file" */
  265. int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16;
  266. int offset_reg = emit_li_offset(gen, offset);
  267. dst_vec = ppc_allocate_vec_register(gen->f);
  268. ppc_lvx(gen->f, dst_vec, gen->temps_reg, offset_reg);
  269. }
  270. break;
  271. case TGSI_FILE_IMMEDIATE:
  272. {
  273. int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
  274. int offset_reg = emit_li_offset(gen, offset);
  275. dst_vec = ppc_allocate_vec_register(gen->f);
  276. /* Load 4-byte word into vector register.
  277. * The vector slot depends on the effective address we load from.
  278. * We know that our immediates start at a 16-byte boundary so we
  279. * know that 'swizzle' tells us which vector slot will have the
  280. * loaded word. The other vector slots will be undefined.
  281. */
  282. ppc_lvewx(gen->f, dst_vec, gen->immed_reg, offset_reg);
  283. /* splat word[swizzle] across the vector reg */
  284. ppc_vspltw(gen->f, dst_vec, dst_vec, swizzle);
  285. }
  286. break;
  287. case TGSI_FILE_CONSTANT:
  288. {
  289. int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4;
  290. int offset_reg = emit_li_offset(gen, offset);
  291. dst_vec = ppc_allocate_vec_register(gen->f);
  292. /* Load 4-byte word into vector register.
  293. * The vector slot depends on the effective address we load from.
  294. * We know that our constants start at a 16-byte boundary so we
  295. * know that 'swizzle' tells us which vector slot will have the
  296. * loaded word. The other vector slots will be undefined.
  297. */
  298. ppc_lvewx(gen->f, dst_vec, gen->const_reg, offset_reg);
  299. /* splat word[swizzle] across the vector reg */
  300. ppc_vspltw(gen->f, dst_vec, dst_vec, swizzle);
  301. }
  302. break;
  303. default:
  304. assert( 0 );
  305. }
  306. break;
  307. case TGSI_EXTSWIZZLE_ZERO:
  308. ppc_vzero(gen->f, dst_vec);
  309. break;
  310. case TGSI_EXTSWIZZLE_ONE:
  311. {
  312. int one_vec = gen_one_vec(gen);
  313. dst_vec = ppc_allocate_vec_register(gen->f);
  314. ppc_vmove(gen->f, dst_vec, one_vec);
  315. }
  316. break;
  317. default:
  318. assert( 0 );
  319. }
  320. assert(dst_vec >= 0);
  321. {
  322. uint sign_op = tgsi_util_get_full_src_register_sign_mode(reg, chan_index);
  323. if (sign_op != TGSI_UTIL_SIGN_KEEP) {
  324. int bit31_vec = gen_get_bit31_vec(gen);
  325. int dst_vec2;
  326. if (is_ppc_vec_temporary(reg)) {
  327. /* need to use a new temp */
  328. dst_vec2 = ppc_allocate_vec_register(gen->f);
  329. }
  330. else {
  331. dst_vec2 = dst_vec;
  332. }
  333. switch (sign_op) {
  334. case TGSI_UTIL_SIGN_CLEAR:
  335. /* vec = vec & ~bit31 */
  336. ppc_vandc(gen->f, dst_vec2, dst_vec, bit31_vec);
  337. break;
  338. case TGSI_UTIL_SIGN_SET:
  339. /* vec = vec | bit31 */
  340. ppc_vor(gen->f, dst_vec2, dst_vec, bit31_vec);
  341. break;
  342. case TGSI_UTIL_SIGN_TOGGLE:
  343. /* vec = vec ^ bit31 */
  344. ppc_vxor(gen->f, dst_vec2, dst_vec, bit31_vec);
  345. break;
  346. default:
  347. assert(0);
  348. }
  349. return dst_vec2;
  350. }
  351. }
  352. return dst_vec;
  353. }
  354. /**
  355. * Test if two TGSI src registers refer to the same memory location.
  356. * We use this to avoid redundant register loads.
  357. */
  358. static boolean
  359. equal_src_locs(const struct tgsi_full_src_register *a, uint chan_a,
  360. const struct tgsi_full_src_register *b, uint chan_b)
  361. {
  362. int swz_a, swz_b;
  363. int sign_a, sign_b;
  364. if (a->SrcRegister.File != b->SrcRegister.File)
  365. return FALSE;
  366. if (a->SrcRegister.Index != b->SrcRegister.Index)
  367. return FALSE;
  368. swz_a = tgsi_util_get_full_src_register_extswizzle(a, chan_a);
  369. swz_b = tgsi_util_get_full_src_register_extswizzle(b, chan_b);
  370. if (swz_a != swz_b)
  371. return FALSE;
  372. sign_a = tgsi_util_get_full_src_register_sign_mode(a, chan_a);
  373. sign_b = tgsi_util_get_full_src_register_sign_mode(b, chan_b);
  374. if (sign_a != sign_b)
  375. return FALSE;
  376. return TRUE;
  377. }
  378. /**
  379. * Given a TGSI src register and channel index, return the PPC vector
  380. * register containing the value. We use a cache to prevent re-loading
  381. * the same register multiple times.
  382. * \return index of PPC vector register with the desired src operand
  383. */
  384. static int
  385. get_src_vec(struct gen_context *gen,
  386. struct tgsi_full_instruction *inst, int src_reg, uint chan)
  387. {
  388. const const struct tgsi_full_src_register *src =
  389. &inst->FullSrcRegisters[src_reg];
  390. int vec;
  391. uint i;
  392. /* check the cache */
  393. for (i = 0; i < gen->num_regs; i++) {
  394. if (equal_src_locs(&gen->regs[i].src, gen->regs[i].chan, src, chan)) {
  395. /* cache hit */
  396. assert(gen->regs[i].vec >= 0);
  397. return gen->regs[i].vec;
  398. }
  399. }
  400. /* cache miss: allocate new vec reg and emit fetch/load code */
  401. vec = emit_fetch(gen, src, chan);
  402. gen->regs[gen->num_regs].src = *src;
  403. gen->regs[gen->num_regs].chan = chan;
  404. gen->regs[gen->num_regs].vec = vec;
  405. gen->num_regs++;
  406. assert(gen->num_regs <= Elements(gen->regs));
  407. assert(vec >= 0);
  408. return vec;
  409. }
  410. /**
  411. * Clear the src operand cache. To be called at the end of each emit function.
  412. */
  413. static void
  414. release_src_vecs(struct gen_context *gen)
  415. {
  416. uint i;
  417. for (i = 0; i < gen->num_regs; i++) {
  418. const const struct tgsi_full_src_register src = gen->regs[i].src;
  419. if (!is_ppc_vec_temporary(&src)) {
  420. ppc_release_vec_register(gen->f, gen->regs[i].vec);
  421. }
  422. }
  423. gen->num_regs = 0;
  424. }
  425. static int
  426. get_dst_vec(struct gen_context *gen,
  427. const struct tgsi_full_instruction *inst,
  428. unsigned chan_index)
  429. {
  430. const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[0];
  431. if (is_ppc_vec_temporary_dst(reg)) {
  432. int vec = gen->temps_map[reg->DstRegister.Index][chan_index];
  433. return vec;
  434. }
  435. else {
  436. return ppc_allocate_vec_register(gen->f);
  437. }
  438. }
  439. /**
  440. * Register store. Store 'src_vec' at location indicated by 'reg'.
  441. * \param free_vec Should the src_vec be released when done?
  442. */
  443. static void
  444. emit_store(struct gen_context *gen,
  445. int src_vec,
  446. const struct tgsi_full_instruction *inst,
  447. unsigned chan_index,
  448. boolean free_vec)
  449. {
  450. const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[0];
  451. switch (reg->DstRegister.File) {
  452. case TGSI_FILE_OUTPUT:
  453. {
  454. int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
  455. int offset_reg = emit_li_offset(gen, offset);
  456. ppc_stvx(gen->f, src_vec, gen->outputs_reg, offset_reg);
  457. }
  458. break;
  459. case TGSI_FILE_TEMPORARY:
  460. if (is_ppc_vec_temporary_dst(reg)) {
  461. if (!free_vec) {
  462. int dst_vec = gen->temps_map[reg->DstRegister.Index][chan_index];
  463. if (dst_vec != src_vec)
  464. ppc_vmove(gen->f, dst_vec, src_vec);
  465. }
  466. free_vec = FALSE;
  467. }
  468. else {
  469. int offset = (reg->DstRegister.Index * 4 + chan_index) * 16;
  470. int offset_reg = emit_li_offset(gen, offset);
  471. ppc_stvx(gen->f, src_vec, gen->temps_reg, offset_reg);
  472. }
  473. break;
  474. #if 0
  475. case TGSI_FILE_ADDRESS:
  476. emit_addrs(
  477. func,
  478. xmm,
  479. reg->DstRegister.Index,
  480. chan_index );
  481. break;
  482. #endif
  483. default:
  484. assert( 0 );
  485. }
  486. #if 0
  487. switch( inst->Instruction.Saturate ) {
  488. case TGSI_SAT_NONE:
  489. break;
  490. case TGSI_SAT_ZERO_ONE:
  491. /* assert( 0 ); */
  492. break;
  493. case TGSI_SAT_MINUS_PLUS_ONE:
  494. assert( 0 );
  495. break;
  496. }
  497. #endif
  498. if (free_vec)
  499. ppc_release_vec_register(gen->f, src_vec);
  500. }
  501. static void
  502. emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
  503. {
  504. int v0, v1;
  505. uint chan_index;
  506. v0 = get_src_vec(gen, inst, 0, CHAN_X);
  507. v1 = ppc_allocate_vec_register(gen->f);
  508. switch (inst->Instruction.Opcode) {
  509. case TGSI_OPCODE_RSQ:
  510. /* v1 = 1.0 / sqrt(v0) */
  511. ppc_vrsqrtefp(gen->f, v1, v0);
  512. break;
  513. case TGSI_OPCODE_RCP:
  514. /* v1 = 1.0 / v0 */
  515. ppc_vrefp(gen->f, v1, v0);
  516. break;
  517. default:
  518. assert(0);
  519. }
  520. FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
  521. emit_store(gen, v1, inst, chan_index, FALSE);
  522. }
  523. release_src_vecs(gen);
  524. ppc_release_vec_register(gen->f, v1);
  525. }
  526. static void
  527. emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst)
  528. {
  529. uint chan_index;
  530. FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
  531. int v0 = get_src_vec(gen, inst, 0, chan_index); /* v0 = srcreg[0] */
  532. int v1 = get_dst_vec(gen, inst, chan_index);
  533. switch (inst->Instruction.Opcode) {
  534. case TGSI_OPCODE_ABS:
  535. /* turn off the most significant bit of each vector float word */
  536. {
  537. int bit31_vec = gen_get_bit31_vec(gen);
  538. ppc_vandc(gen->f, v1, v0, bit31_vec); /* v1 = v0 & ~bit31 */
  539. }
  540. break;
  541. case TGSI_OPCODE_FLOOR:
  542. ppc_vrfim(gen->f, v1, v0); /* v1 = floor(v0) */
  543. break;
  544. case TGSI_OPCODE_FRAC:
  545. ppc_vrfim(gen->f, v1, v0); /* tmp = floor(v0) */
  546. ppc_vsubfp(gen->f, v1, v0, v1); /* v1 = v0 - v1 */
  547. break;
  548. case TGSI_OPCODE_EXPBASE2:
  549. ppc_vexptefp(gen->f, v1, v0); /* v1 = 2^v0 */
  550. break;
  551. case TGSI_OPCODE_LOGBASE2:
  552. /* XXX this may be broken! */
  553. ppc_vlogefp(gen->f, v1, v0); /* v1 = log2(v0) */
  554. break;
  555. case TGSI_OPCODE_MOV:
  556. case TGSI_OPCODE_SWZ:
  557. if (v0 != v1)
  558. ppc_vmove(gen->f, v1, v0);
  559. break;
  560. default:
  561. assert(0);
  562. }
  563. emit_store(gen, v1, inst, chan_index, TRUE); /* store v0 */
  564. }
  565. release_src_vecs(gen);
  566. }
  567. static void
  568. emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst)
  569. {
  570. int zero_vec = -1;
  571. uint chan;
  572. if (inst->Instruction.Opcode == TGSI_OPCODE_MUL) {
  573. zero_vec = ppc_allocate_vec_register(gen->f);
  574. ppc_vzero(gen->f, zero_vec);
  575. }
  576. FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
  577. /* fetch src operands */
  578. int v0 = get_src_vec(gen, inst, 0, chan);
  579. int v1 = get_src_vec(gen, inst, 1, chan);
  580. int v2 = get_dst_vec(gen, inst, chan);
  581. /* emit binop */
  582. switch (inst->Instruction.Opcode) {
  583. case TGSI_OPCODE_ADD:
  584. ppc_vaddfp(gen->f, v2, v0, v1);
  585. break;
  586. case TGSI_OPCODE_SUB:
  587. ppc_vsubfp(gen->f, v2, v0, v1);
  588. break;
  589. case TGSI_OPCODE_MUL:
  590. ppc_vmaddfp(gen->f, v2, v0, v1, zero_vec);
  591. break;
  592. case TGSI_OPCODE_MIN:
  593. ppc_vminfp(gen->f, v2, v0, v1);
  594. break;
  595. case TGSI_OPCODE_MAX:
  596. ppc_vmaxfp(gen->f, v2, v0, v1);
  597. break;
  598. default:
  599. assert(0);
  600. }
  601. /* store v2 */
  602. emit_store(gen, v2, inst, chan, TRUE);
  603. }
  604. if (inst->Instruction.Opcode == TGSI_OPCODE_MUL)
  605. ppc_release_vec_register(gen->f, zero_vec);
  606. release_src_vecs(gen);
  607. }
  608. static void
  609. emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst)
  610. {
  611. uint chan;
  612. FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
  613. /* fetch src operands */
  614. int v0 = get_src_vec(gen, inst, 0, chan);
  615. int v1 = get_src_vec(gen, inst, 1, chan);
  616. int v2 = get_src_vec(gen, inst, 2, chan);
  617. int v3 = get_dst_vec(gen, inst, chan);
  618. /* emit ALU */
  619. switch (inst->Instruction.Opcode) {
  620. case TGSI_OPCODE_MAD:
  621. ppc_vmaddfp(gen->f, v3, v0, v1, v2); /* v3 = v0 * v1 + v2 */
  622. break;
  623. case TGSI_OPCODE_LRP:
  624. ppc_vsubfp(gen->f, v3, v1, v2); /* v3 = v1 - v2 */
  625. ppc_vmaddfp(gen->f, v3, v0, v3, v2); /* v3 = v0 * v3 + v2 */
  626. break;
  627. default:
  628. assert(0);
  629. }
  630. /* store v3 */
  631. emit_store(gen, v3, inst, chan, TRUE);
  632. }
  633. release_src_vecs(gen);
  634. }
  635. /**
  636. * Vector comparisons, resulting in 1.0 or 0.0 values.
  637. */
  638. static void
  639. emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst)
  640. {
  641. uint chan;
  642. int one_vec = gen_one_vec(gen);
  643. FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
  644. /* fetch src operands */
  645. int v0 = get_src_vec(gen, inst, 0, chan);
  646. int v1 = get_src_vec(gen, inst, 1, chan);
  647. int v2 = get_dst_vec(gen, inst, chan);
  648. boolean complement = FALSE;
  649. switch (inst->Instruction.Opcode) {
  650. case TGSI_OPCODE_SNE:
  651. complement = TRUE;
  652. /* fall-through */
  653. case TGSI_OPCODE_SEQ:
  654. ppc_vcmpeqfpx(gen->f, v2, v0, v1); /* v2 = v0 == v1 ? ~0 : 0 */
  655. break;
  656. case TGSI_OPCODE_SGE:
  657. complement = TRUE;
  658. /* fall-through */
  659. case TGSI_OPCODE_SLT:
  660. ppc_vcmpgtfpx(gen->f, v2, v1, v0); /* v2 = v1 > v0 ? ~0 : 0 */
  661. break;
  662. case TGSI_OPCODE_SLE:
  663. complement = TRUE;
  664. /* fall-through */
  665. case TGSI_OPCODE_SGT:
  666. ppc_vcmpgtfpx(gen->f, v2, v0, v1); /* v2 = v0 > v1 ? ~0 : 0 */
  667. break;
  668. default:
  669. assert(0);
  670. }
  671. /* v2 is now {0,0,0,0} or {~0,~0,~0,~0} */
  672. if (complement)
  673. ppc_vandc(gen->f, v2, one_vec, v2); /* v2 = one_vec & ~v2 */
  674. else
  675. ppc_vand(gen->f, v2, one_vec, v2); /* v2 = one_vec & v2 */
  676. /* store v2 */
  677. emit_store(gen, v2, inst, chan, TRUE);
  678. }
  679. release_src_vecs(gen);
  680. }
  681. static void
  682. emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst)
  683. {
  684. int v0, v1, v2;
  685. uint chan_index;
  686. v2 = ppc_allocate_vec_register(gen->f);
  687. ppc_vzero(gen->f, v2); /* v2 = {0, 0, 0, 0} */
  688. v0 = get_src_vec(gen, inst, 0, CHAN_X); /* v0 = src0.XXXX */
  689. v1 = get_src_vec(gen, inst, 1, CHAN_X); /* v1 = src1.XXXX */
  690. ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
  691. v0 = get_src_vec(gen, inst, 0, CHAN_Y); /* v0 = src0.YYYY */
  692. v1 = get_src_vec(gen, inst, 1, CHAN_Y); /* v1 = src1.YYYY */
  693. ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
  694. v0 = get_src_vec(gen, inst, 0, CHAN_Z); /* v0 = src0.ZZZZ */
  695. v1 = get_src_vec(gen, inst, 1, CHAN_Z); /* v1 = src1.ZZZZ */
  696. ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
  697. if (inst->Instruction.Opcode == TGSI_OPCODE_DP4) {
  698. v0 = get_src_vec(gen, inst, 0, CHAN_W); /* v0 = src0.WWWW */
  699. v1 = get_src_vec(gen, inst, 1, CHAN_W); /* v1 = src1.WWWW */
  700. ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */
  701. }
  702. else if (inst->Instruction.Opcode == TGSI_OPCODE_DPH) {
  703. v1 = get_src_vec(gen, inst, 1, CHAN_W); /* v1 = src1.WWWW */
  704. ppc_vaddfp(gen->f, v2, v2, v1); /* v2 = v2 + v1 */
  705. }
  706. FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) {
  707. emit_store(gen, v2, inst, chan_index, FALSE); /* store v2, free v2 later */
  708. }
  709. release_src_vecs(gen);
  710. ppc_release_vec_register(gen->f, v2);
  711. }
  /**
   * Emit an approximation of vr = pow(va, vb), using
   *    pow(a, b) ~= exp2(log2(a) * b)
   * Two scratch vector registers are allocated and released again.
   */
  static void
  ppc_vec_pow(struct ppc_function *f, int vr, int va, int vb)
  {
     const int log_vec = ppc_allocate_vec_register(f);
     const int addend_vec = ppc_allocate_vec_register(f);

     ppc_vzero(f, addend_vec);                           /* addend = 0 */
     ppc_vlogefp(f, log_vec, va);                        /* t = log2(va) */
     ppc_vmaddfp(f, log_vec, log_vec, vb, addend_vec);   /* t = t * vb + 0 */
     ppc_vexptefp(f, vr, log_vec);                       /* vr = 2^t */

     ppc_release_vec_register(f, log_vec);
     ppc_release_vec_register(f, addend_vec);
  }
  726. static void
  727. emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst)
  728. {
  729. int one_vec = gen_one_vec(gen);
  730. /* Compute X */
  731. if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
  732. emit_store(gen, one_vec, inst, CHAN_X, FALSE);
  733. }
  734. /* Compute Y, Z */
  735. if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
  736. IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
  737. int x_vec;
  738. int zero_vec = ppc_allocate_vec_register(gen->f);
  739. x_vec = get_src_vec(gen, inst, 0, CHAN_X); /* x_vec = src[0].x */
  740. ppc_vzero(gen->f, zero_vec); /* zero = {0,0,0,0} */
  741. ppc_vmaxfp(gen->f, x_vec, x_vec, zero_vec); /* x_vec = max(x_vec, 0) */
  742. if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
  743. emit_store(gen, x_vec, inst, CHAN_Y, FALSE);
  744. }
  745. if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
  746. int y_vec, w_vec;
  747. int z_vec = ppc_allocate_vec_register(gen->f);
  748. int pow_vec = ppc_allocate_vec_register(gen->f);
  749. int pos_vec = ppc_allocate_vec_register(gen->f);
  750. int p128_vec = ppc_allocate_vec_register(gen->f);
  751. int n128_vec = ppc_allocate_vec_register(gen->f);
  752. y_vec = get_src_vec(gen, inst, 0, CHAN_Y); /* y_vec = src[0].y */
  753. ppc_vmaxfp(gen->f, y_vec, y_vec, zero_vec); /* y_vec = max(y_vec, 0) */
  754. w_vec = get_src_vec(gen, inst, 0, CHAN_W); /* w_vec = src[0].w */
  755. /* clamp W to [-128, 128] */
  756. load_constant_vec(gen, p128_vec, 128.0f);
  757. load_constant_vec(gen, n128_vec, -128.0f);
  758. ppc_vmaxfp(gen->f, w_vec, w_vec, n128_vec); /* w = max(w, -128) */
  759. ppc_vminfp(gen->f, w_vec, w_vec, p128_vec); /* w = min(w, 128) */
  760. /* if temp.x > 0
  761. * z = pow(tmp.y, tmp.w)
  762. * else
  763. * z = 0.0
  764. */
  765. ppc_vec_pow(gen->f, pow_vec, y_vec, w_vec); /* pow = pow(y, w) */
  766. ppc_vcmpgtfpx(gen->f, pos_vec, x_vec, zero_vec); /* pos = x > 0 */
  767. ppc_vand(gen->f, z_vec, pow_vec, pos_vec); /* z = pow & pos */
  768. emit_store(gen, z_vec, inst, CHAN_Z, FALSE);
  769. ppc_release_vec_register(gen->f, z_vec);
  770. ppc_release_vec_register(gen->f, pow_vec);
  771. ppc_release_vec_register(gen->f, pos_vec);
  772. ppc_release_vec_register(gen->f, p128_vec);
  773. ppc_release_vec_register(gen->f, n128_vec);
  774. }
  775. ppc_release_vec_register(gen->f, zero_vec);
  776. }
  777. /* Compute W */
  778. if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
  779. emit_store(gen, one_vec, inst, CHAN_W, FALSE);
  780. }
  781. release_src_vecs(gen);
  782. }
  783. static void
  784. emit_exp(struct gen_context *gen, struct tgsi_full_instruction *inst)
  785. {
  786. const int one_vec = gen_one_vec(gen);
  787. int src_vec;
  788. /* get src arg */
  789. src_vec = get_src_vec(gen, inst, 0, CHAN_X);
  790. /* Compute X = 2^floor(src) */
  791. if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
  792. int dst_vec = get_dst_vec(gen, inst, CHAN_X);
  793. int tmp_vec = ppc_allocate_vec_register(gen->f);
  794. ppc_vrfim(gen->f, tmp_vec, src_vec); /* tmp = floor(src); */
  795. ppc_vexptefp(gen->f, dst_vec, tmp_vec); /* dst = 2 ^ tmp */
  796. emit_store(gen, dst_vec, inst, CHAN_X, TRUE);
  797. ppc_release_vec_register(gen->f, tmp_vec);
  798. }
  799. /* Compute Y = src - floor(src) */
  800. if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
  801. int dst_vec = get_dst_vec(gen, inst, CHAN_Y);
  802. int tmp_vec = ppc_allocate_vec_register(gen->f);
  803. ppc_vrfim(gen->f, tmp_vec, src_vec); /* tmp = floor(src); */
  804. ppc_vsubfp(gen->f, dst_vec, src_vec, tmp_vec); /* dst = src - tmp */
  805. emit_store(gen, dst_vec, inst, CHAN_Y, TRUE);
  806. ppc_release_vec_register(gen->f, tmp_vec);
  807. }
  808. /* Compute Z = RoughApprox2ToX(src) */
  809. if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
  810. int dst_vec = get_dst_vec(gen, inst, CHAN_Z);
  811. ppc_vexptefp(gen->f, dst_vec, src_vec); /* dst = 2 ^ src */
  812. emit_store(gen, dst_vec, inst, CHAN_Z, TRUE);
  813. }
  814. /* Compute W = 1.0 */
  815. if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
  816. emit_store(gen, one_vec, inst, CHAN_W, FALSE);
  817. }
  818. release_src_vecs(gen);
  819. }
  820. static void
  821. emit_log(struct gen_context *gen, struct tgsi_full_instruction *inst)
  822. {
  823. const int bit31_vec = gen_get_bit31_vec(gen);
  824. const int one_vec = gen_one_vec(gen);
  825. int src_vec, abs_vec;
  826. /* get src arg */
  827. src_vec = get_src_vec(gen, inst, 0, CHAN_X);
  828. /* compute abs(src) */
  829. abs_vec = ppc_allocate_vec_register(gen->f);
  830. ppc_vandc(gen->f, abs_vec, src_vec, bit31_vec); /* abs = src & ~bit31 */
  831. if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) &&
  832. IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
  833. /* compute tmp = floor(log2(abs)) */
  834. int tmp_vec = ppc_allocate_vec_register(gen->f);
  835. ppc_vlogefp(gen->f, tmp_vec, abs_vec); /* tmp = log2(abs) */
  836. ppc_vrfim(gen->f, tmp_vec, tmp_vec); /* tmp = floor(tmp); */
  837. /* Compute X = tmp */
  838. if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
  839. emit_store(gen, tmp_vec, inst, CHAN_X, FALSE);
  840. }
  841. /* Compute Y = abs / 2^tmp */
  842. if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
  843. const int zero_vec = ppc_allocate_vec_register(gen->f);
  844. ppc_vzero(gen->f, zero_vec);
  845. ppc_vexptefp(gen->f, tmp_vec, tmp_vec); /* tmp = 2 ^ tmp */
  846. ppc_vrefp(gen->f, tmp_vec, tmp_vec); /* tmp = 1 / tmp */
  847. /* tmp = abs * tmp + zero */
  848. ppc_vmaddfp(gen->f, tmp_vec, abs_vec, tmp_vec, zero_vec);
  849. emit_store(gen, tmp_vec, inst, CHAN_Y, FALSE);
  850. ppc_release_vec_register(gen->f, zero_vec);
  851. }
  852. ppc_release_vec_register(gen->f, tmp_vec);
  853. }
  854. /* Compute Z = RoughApproxLog2(abs) */
  855. if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
  856. int dst_vec = get_dst_vec(gen, inst, CHAN_Z);
  857. ppc_vlogefp(gen->f, dst_vec, abs_vec); /* dst = log2(abs) */
  858. emit_store(gen, dst_vec, inst, CHAN_Z, TRUE);
  859. }
  860. /* Compute W = 1.0 */
  861. if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
  862. emit_store(gen, one_vec, inst, CHAN_W, FALSE);
  863. }
  864. ppc_release_vec_register(gen->f, abs_vec);
  865. release_src_vecs(gen);
  866. }
  867. static void
  868. emit_pow(struct gen_context *gen, struct tgsi_full_instruction *inst)
  869. {
  870. int s0_vec = get_src_vec(gen, inst, 0, CHAN_X);
  871. int s1_vec = get_src_vec(gen, inst, 1, CHAN_X);
  872. int pow_vec = ppc_allocate_vec_register(gen->f);
  873. int chan;
  874. ppc_vec_pow(gen->f, pow_vec, s0_vec, s1_vec);
  875. FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) {
  876. emit_store(gen, pow_vec, inst, chan, FALSE);
  877. }
  878. ppc_release_vec_register(gen->f, pow_vec);
  879. release_src_vecs(gen);
  880. }
  881. static void
  882. emit_xpd(struct gen_context *gen, struct tgsi_full_instruction *inst)
  883. {
  884. int x0_vec, y0_vec, z0_vec;
  885. int x1_vec, y1_vec, z1_vec;
  886. int zero_vec, tmp_vec;
  887. int tmp2_vec;
  888. zero_vec = ppc_allocate_vec_register(gen->f);
  889. ppc_vzero(gen->f, zero_vec);
  890. tmp_vec = ppc_allocate_vec_register(gen->f);
  891. tmp2_vec = ppc_allocate_vec_register(gen->f);
  892. if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
  893. IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
  894. x0_vec = get_src_vec(gen, inst, 0, CHAN_X);
  895. x1_vec = get_src_vec(gen, inst, 1, CHAN_X);
  896. }
  897. if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
  898. IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
  899. y0_vec = get_src_vec(gen, inst, 0, CHAN_Y);
  900. y1_vec = get_src_vec(gen, inst, 1, CHAN_Y);
  901. }
  902. if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
  903. IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
  904. z0_vec = get_src_vec(gen, inst, 0, CHAN_Z);
  905. z1_vec = get_src_vec(gen, inst, 1, CHAN_Z);
  906. }
  907. IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) {
  908. /* tmp = y0 * z1 */
  909. ppc_vmaddfp(gen->f, tmp_vec, y0_vec, z1_vec, zero_vec);
  910. /* tmp = tmp - z0 * y1*/
  911. ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, z0_vec, y1_vec);
  912. emit_store(gen, tmp_vec, inst, CHAN_X, FALSE);
  913. }
  914. IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) {
  915. /* tmp = z0 * x1 */
  916. ppc_vmaddfp(gen->f, tmp_vec, z0_vec, x1_vec, zero_vec);
  917. /* tmp = tmp - x0 * z1 */
  918. ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, x0_vec, z1_vec);
  919. emit_store(gen, tmp_vec, inst, CHAN_Y, FALSE);
  920. }
  921. IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) {
  922. /* tmp = x0 * y1 */
  923. ppc_vmaddfp(gen->f, tmp_vec, x0_vec, y1_vec, zero_vec);
  924. /* tmp = tmp - y0 * x1 */
  925. ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, y0_vec, x1_vec);
  926. emit_store(gen, tmp_vec, inst, CHAN_Z, FALSE);
  927. }
  928. /* W is undefined */
  929. ppc_release_vec_register(gen->f, tmp_vec);
  930. ppc_release_vec_register(gen->f, zero_vec);
  931. release_src_vecs(gen);
  932. }
  933. static int
  934. emit_instruction(struct gen_context *gen,
  935. struct tgsi_full_instruction *inst)
  936. {
  937. switch (inst->Instruction.Opcode) {
  938. case TGSI_OPCODE_MOV:
  939. case TGSI_OPCODE_SWZ:
  940. case TGSI_OPCODE_ABS:
  941. case TGSI_OPCODE_FLOOR:
  942. case TGSI_OPCODE_FRAC:
  943. case TGSI_OPCODE_EXPBASE2:
  944. case TGSI_OPCODE_LOGBASE2:
  945. emit_unaryop(gen, inst);
  946. break;
  947. case TGSI_OPCODE_RSQ:
  948. case TGSI_OPCODE_RCP:
  949. emit_scalar_unaryop(gen, inst);
  950. break;
  951. case TGSI_OPCODE_ADD:
  952. case TGSI_OPCODE_SUB:
  953. case TGSI_OPCODE_MUL:
  954. case TGSI_OPCODE_MIN:
  955. case TGSI_OPCODE_MAX:
  956. emit_binop(gen, inst);
  957. break;
  958. case TGSI_OPCODE_SEQ:
  959. case TGSI_OPCODE_SNE:
  960. case TGSI_OPCODE_SLT:
  961. case TGSI_OPCODE_SGT:
  962. case TGSI_OPCODE_SLE:
  963. case TGSI_OPCODE_SGE:
  964. emit_inequality(gen, inst);
  965. break;
  966. case TGSI_OPCODE_MAD:
  967. case TGSI_OPCODE_LRP:
  968. emit_triop(gen, inst);
  969. break;
  970. case TGSI_OPCODE_DP3:
  971. case TGSI_OPCODE_DP4:
  972. case TGSI_OPCODE_DPH:
  973. emit_dotprod(gen, inst);
  974. break;
  975. case TGSI_OPCODE_LIT:
  976. emit_lit(gen, inst);
  977. break;
  978. case TGSI_OPCODE_LOG:
  979. emit_log(gen, inst);
  980. break;
  981. case TGSI_OPCODE_EXP:
  982. emit_exp(gen, inst);
  983. break;
  984. case TGSI_OPCODE_POW:
  985. emit_pow(gen, inst);
  986. break;
  987. case TGSI_OPCODE_XPD:
  988. emit_xpd(gen, inst);
  989. break;
  990. case TGSI_OPCODE_END:
  991. /* normal end */
  992. return 1;
  993. default:
  994. return 0;
  995. }
  996. return 1;
  997. }
  998. static void
  999. emit_declaration(
  1000. struct ppc_function *func,
  1001. struct tgsi_full_declaration *decl )
  1002. {
  1003. if( decl->Declaration.File == TGSI_FILE_INPUT ) {
  1004. #if 0
  1005. unsigned first, last, mask;
  1006. unsigned i, j;
  1007. first = decl->DeclarationRange.First;
  1008. last = decl->DeclarationRange.Last;
  1009. mask = decl->Declaration.UsageMask;
  1010. for( i = first; i <= last; i++ ) {
  1011. for( j = 0; j < NUM_CHANNELS; j++ ) {
  1012. if( mask & (1 << j) ) {
  1013. switch( decl->Declaration.Interpolate ) {
  1014. case TGSI_INTERPOLATE_CONSTANT:
  1015. emit_coef_a0( func, 0, i, j );
  1016. emit_inputs( func, 0, i, j );
  1017. break;
  1018. case TGSI_INTERPOLATE_LINEAR:
  1019. emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
  1020. emit_coef_dadx( func, 1, i, j );
  1021. emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
  1022. emit_coef_dady( func, 3, i, j );
  1023. emit_mul( func, 0, 1 ); /* x * dadx */
  1024. emit_coef_a0( func, 4, i, j );
  1025. emit_mul( func, 2, 3 ); /* y * dady */
  1026. emit_add( func, 0, 4 ); /* x * dadx + a0 */
  1027. emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
  1028. emit_inputs( func, 0, i, j );
  1029. break;
  1030. case TGSI_INTERPOLATE_PERSPECTIVE:
  1031. emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
  1032. emit_coef_dadx( func, 1, i, j );
  1033. emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
  1034. emit_coef_dady( func, 3, i, j );
  1035. emit_mul( func, 0, 1 ); /* x * dadx */
  1036. emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
  1037. emit_coef_a0( func, 5, i, j );
  1038. emit_rcp( func, 4, 4 ); /* 1.0 / w */
  1039. emit_mul( func, 2, 3 ); /* y * dady */
  1040. emit_add( func, 0, 5 ); /* x * dadx + a0 */
  1041. emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */
  1042. emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */
  1043. emit_inputs( func, 0, i, j );
  1044. break;
  1045. default:
  1046. assert( 0 );
  1047. break;
  1048. }
  1049. }
  1050. }
  1051. }
  1052. #endif
  1053. }
  1054. }
  1055. static void
  1056. emit_prologue(struct ppc_function *func)
  1057. {
  1058. /* XXX set up stack frame */
  1059. }
  1060. static void
  1061. emit_epilogue(struct ppc_function *func)
  1062. {
  1063. ppc_comment(func, -4, "Epilogue:");
  1064. ppc_return(func);
  1065. /* XXX restore prev stack frame */
  1066. #if 0
  1067. debug_printf("PPC: Emitted %u instructions\n", func->num_inst);
  1068. #endif
  1069. }
  1070. /**
  1071. * Translate a TGSI vertex/fragment shader to PPC code.
  1072. *
  1073. * \param tokens the TGSI input shader
  1074. * \param func the output PPC code/function
  1075. * \param immediates buffer to place immediates, later passed to PPC func
  1076. * \return TRUE for success, FALSE if translation failed
  1077. */
  1078. boolean
  1079. tgsi_emit_ppc(const struct tgsi_token *tokens,
  1080. struct ppc_function *func,
  1081. float (*immediates)[4],
  1082. boolean do_swizzles )
  1083. {
  1084. static int use_ppc_asm = -1;
  1085. struct tgsi_parse_context parse;
  1086. /*boolean instruction_phase = FALSE;*/
  1087. unsigned ok = 1;
  1088. uint num_immediates = 0;
  1089. struct gen_context gen;
  1090. uint ic = 0;
  1091. if (use_ppc_asm < 0) {
  1092. /* If GALLIUM_NOPPC is set, don't use PPC codegen */
  1093. use_ppc_asm = !debug_get_bool_option("GALLIUM_NOPPC", FALSE);
  1094. }
  1095. if (!use_ppc_asm)
  1096. return FALSE;
  1097. if (0) {
  1098. debug_printf("\n********* TGSI->PPC ********\n");
  1099. tgsi_dump(tokens, 0);
  1100. }
  1101. util_init_math();
  1102. init_gen_context(&gen, func);
  1103. emit_prologue(func);
  1104. tgsi_parse_init( &parse, tokens );
  1105. while (!tgsi_parse_end_of_tokens(&parse) && ok) {
  1106. tgsi_parse_token(&parse);
  1107. switch (parse.FullToken.Token.Type) {
  1108. case TGSI_TOKEN_TYPE_DECLARATION:
  1109. if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
  1110. emit_declaration(func, &parse.FullToken.FullDeclaration );
  1111. }
  1112. break;
  1113. case TGSI_TOKEN_TYPE_INSTRUCTION:
  1114. if (func->print) {
  1115. _debug_printf("# ");
  1116. ic++;
  1117. tgsi_dump_instruction(&parse.FullToken.FullInstruction, ic);
  1118. }
  1119. ok = emit_instruction(&gen, &parse.FullToken.FullInstruction);
  1120. if (!ok) {
  1121. debug_printf("failed to translate tgsi opcode %d to PPC (%s)\n",
  1122. parse.FullToken.FullInstruction.Instruction.Opcode,
  1123. parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
  1124. "vertex shader" : "fragment shader");
  1125. }
  1126. break;
  1127. case TGSI_TOKEN_TYPE_IMMEDIATE:
  1128. /* splat each immediate component into a float[4] vector for SoA */
  1129. {
  1130. const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
  1131. uint i;
  1132. assert(size <= 4);
  1133. assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
  1134. for (i = 0; i < size; i++) {
  1135. immediates[num_immediates][i] =
  1136. parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
  1137. }
  1138. num_immediates++;
  1139. }
  1140. break;
  1141. default:
  1142. ok = 0;
  1143. assert( 0 );
  1144. }
  1145. }
  1146. emit_epilogue(func);
  1147. tgsi_parse_free( &parse );
  1148. if (ppc_num_instructions(func) == 0) {
  1149. /* ran out of memory for instructions */
  1150. ok = FALSE;
  1151. }
  1152. if (!ok)
  1153. debug_printf("TGSI->PPC translation failed\n");
  1154. return ok;
  1155. }
  1156. #endif /* PIPE_ARCH_PPC */