
swr: [rasterizer core] 16-wide tile store nearly completed

* All format combinations coded
* Fully emulated on AVX2 and AVX (see the emulation sketch after this list)
* Known issue: the MSAA sample locations need to be adjusted for 8x2
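The SIMD16_EMU_AVX512_* wrappers in simd16intrin.h (retargeted below from raw _mm256_* intrinsics to the 8-wide _simd_* layer) provide that emulation: each 16-wide operation is carried out as two 8-wide operations. What follows is a minimal, hypothetical sketch of the pattern under assumed names (simd16_sketch, SIMD16_EMU_2_SKETCH, simd16_add_ps_sketch); it is not the actual simd16intrin.h definition, whose simd16scalar layout and macros differ in detail.

#include <immintrin.h>

// Hypothetical 16-wide float vector emulated as two 8-wide AVX halves.
struct simd16_sketch
{
    __m256 lo;   // lanes 0..7
    __m256 hi;   // lanes 8..15
};

// Two-operand emulation wrapper in the spirit of SIMD16_EMU_AVX512_2:
// apply the 8-wide operation op8 to each half independently.
#define SIMD16_EMU_2_SKETCH(name, op8)                          \
    inline simd16_sketch name(simd16_sketch a, simd16_sketch b) \
    {                                                           \
        simd16_sketch result;                                   \
        result.lo = op8(a.lo, b.lo);                            \
        result.hi = op8(a.hi, b.hi);                            \
        return result;                                          \
    }

// Example instantiation: a 16-wide add built from the 8-wide AVX intrinsic.
SIMD16_EMU_2_SKETCH(simd16_add_ps_sketch, _mm256_add_ps)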

Set ENABLE_AVX512_SIMD16 and USE_8x2_TILE_BACKEND to 1 in knobs.h to enable
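Assuming the two switches are plain preprocessor defines in knobs.h (the surrounding file is not reproduced in this commit view), enabling the new path would look roughly like this:

// knobs.h (illustrative excerpt only)
#define ENABLE_AVX512_SIMD16    1   // build the 16-wide (simd16) tile-store path
#define USE_8x2_TILE_BACKEND    1   // use the 8x2 tile layout in the backend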

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
tags/17.0-branchpoint
Tim Rowley 8 years ago
parent
commit
937b7d8e5a

+ 31
- 30
src/gallium/drivers/swr/rasterizer/common/simd16intrin.h

@@ -459,10 +459,10 @@ INLINE simd16scalar _simd16_cmp_ps(simd16scalar a, simd16scalar b)
#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps<_CMP_GE_OQ>(a, b)
#define _simd16_cmple_ps(a, b) _simd16_cmp_ps<_CMP_LE_OQ>(a, b)

SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _mm256_and_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _mm256_or_ps)
SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _mm256_rcp_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _mm256_div_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _simd_and_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _simd_or_ps)
SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _simd_rcp_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _simd_div_ps)

INLINE simd16scalar _simd16_castsi_ps(simd16scalari a)
{
@@ -509,21 +509,22 @@ INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a)

#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mul_epi32, _mm256_mul_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mullo_epi32, _mm256_mullo_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi32, _mm256_sub_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi64, _mm256_sub_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epi32, _mm256_min_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _mm256_max_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _mm256_min_epu32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _mm256_max_epu32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _mm256_add_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mul_epi32, _simd_mul_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mullo_epi32, _simd_mullo_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi32, _simd_sub_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi64, _simd_sub_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epi32, _simd_min_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _simd_max_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _simd_min_epu32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _simd_max_epu32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _simd_add_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_and_si, _simd_and_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_andnot_si, _simd_andnot_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_or_si, _simd_or_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_xor_si, _simd_xor_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _mm256_cmpeq_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _mm256_cmpgt_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _simd_cmpeq_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _simd_cmpgt_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmplt_epi32, _simd_cmplt_epi32)

INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b)
{
@@ -579,13 +580,13 @@ INLINE simd16scalari _simd16_srli_epi32_temp(simd16scalari a)

#define _simd16_srli_epi32(a, imm8) _simd16_srli_epi32_temp<imm8>(a)

SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _mm256_fmadd_ps)
SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _mm256_fmsub_ps)
SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _simd_fmadd_ps)
SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _simd_fmsub_ps)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _mm256_shuffle_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _mm256_adds_epu8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _mm256_subs_epu8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _mm256_add_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _simd_shuffle_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _simd_adds_epu8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _simd_subs_epu8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _simd_add_epi8)

template <int imm8>
INLINE simd16scalar _simd16_i32gather_ps_temp(float const *m, simd16scalari a)
@@ -600,13 +601,13 @@ INLINE simd16scalar _simd16_i32gather_ps_temp(float const *m, simd16scalari a)

#define _simd16_i32gather_ps(m, a, imm8) _simd16_i32gather_ps_temp<imm8>(m, a)

SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _mm256_abs_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _mm256_cmpeq_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _mm256_cmpgt_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi16, _mm256_cmpeq_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi16, _mm256_cmpgt_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi8, _mm256_cmpeq_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi8, _mm256_cmpgt_epi8)
SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _simd_abs_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _simd_cmpeq_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _simd_cmpgt_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi16, _simd_cmpeq_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi16, _simd_cmpgt_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi8, _simd_cmpeq_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi8, _simd_cmpgt_epi8)

INLINE simd16scalar _simd16_permute_ps(simd16scalar a, simd16scalari i)
{
@@ -631,8 +632,8 @@ INLINE simd16scalari _simd16_permute_epi32(simd16scalari a, simd16scalari i)
return _simd16_castps_si(_simd16_permute_ps(_simd16_castsi_ps(a), i));
}

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_srlv_epi32, _mm256_srlv_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sllv_epi32, _mm256_sllv_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_srlv_epi32, _simd_srlv_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sllv_epi32, _simd_sllv_epi32)

template <int imm8>
INLINE simd16scalar _simd16_permute2f128_ps_temp(simd16scalar a, simd16scalar b)

+ 37
- 1
src/gallium/drivers/swr/rasterizer/common/simdintrin.h

@@ -314,7 +314,15 @@ SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8)
SIMD_EMU_EPI(_simdemu_cmpeq_epi8, _mm_cmpeq_epi8)
SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16)
SIMD_EMU_EPI(_simdemu_cmpeq_epi16, _mm_cmpeq_epi16)

SIMD_EMU_EPI(_simdemu_unpacklo_epi8, _mm_unpacklo_epi8)
SIMD_EMU_EPI(_simdemu_unpackhi_epi8, _mm_unpackhi_epi8)
SIMD_EMU_EPI(_simdemu_unpacklo_epi16, _mm_unpacklo_epi16)
SIMD_EMU_EPI(_simdemu_unpackhi_epi16, _mm_unpackhi_epi16)

#define _simd_unpacklo_epi8 _simdemu_unpacklo_epi8
#define _simd_unpackhi_epi8 _simdemu_unpackhi_epi8
#define _simd_unpacklo_epi16 _simdemu_unpacklo_epi16
#define _simd_unpackhi_epi16 _simdemu_unpackhi_epi16
#define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
#define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
#define _simd_unpacklo_epi64(a, b) _mm256_castpd_si256(_mm256_unpacklo_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))
@@ -490,6 +498,10 @@ __m256i _simd_packs_epi32(__m256i a, __m256i b)
#define _simd_xor_si _mm256_xor_si256
#define _simd_castps_si _mm256_castps_si256

#define _simd_unpacklo_epi8 _mm256_unpacklo_epi8
#define _simd_unpackhi_epi8 _mm256_unpackhi_epi8
#define _simd_unpacklo_epi16 _mm256_unpacklo_epi16
#define _simd_unpackhi_epi16 _mm256_unpackhi_epi16
#define _simd_unpacklo_epi32 _mm256_unpacklo_epi32
#define _simd_unpackhi_epi32 _mm256_unpackhi_epi32
#define _simd_unpacklo_epi64 _mm256_unpacklo_epi64
@@ -529,6 +541,14 @@ __m256i _simd_packs_epi32(__m256i a, __m256i b)

#endif

#define _simd_unpacklo_ps _mm256_unpacklo_ps
#define _simd_unpacklo_pd _mm256_unpacklo_pd
#define _simd_insertf128_ps _mm256_insertf128_ps
#define _simd_insertf128_pd _mm256_insertf128_pd
#define _simd_insertf128_si _mm256_insertf128_si256
#define _simd_extractf128_ps _mm256_extractf128_ps
#define _simd_extractf128_pd _mm256_extractf128_pd
#define _simd_extractf128_si _mm256_extractf128_si256
#define _simd_permute2f128_ps _mm256_permute2f128_ps
#define _simd_permute2f128_pd _mm256_permute2f128_pd
#define _simd_permute2f128_si _mm256_permute2f128_si256
@@ -550,6 +570,22 @@ __m256i _simd_packs_epi32(__m256i a, __m256i b)
#define _simd_testz_ps _mm256_testz_ps
#define _simd_xor_ps _mm256_xor_ps

INLINE
simdscalari _simd_loadu2_si(const __m128i *hiaddr, const __m128i *loaddr)
{
__m128i lo = _mm_loadu_si128(loaddr);
__m128i hi = _mm_loadu_si128(hiaddr);

return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
}

INLINE
void _simd_storeu2_si(__m128i *hiaddr, __m128i *loaddr, simdscalari a)
{
_mm_storeu_si128(loaddr, _mm256_castsi256_si128(a));
_mm_storeu_si128(hiaddr, _mm256_extractf128_si256(a, 1));
}

INLINE
simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask)
{

+ 10
- 10
src/gallium/drivers/swr/rasterizer/core/format_types.h

@@ -166,12 +166,12 @@ struct PackTraits<8, false>
simd16scalari result = _simd16_setzero_si();
simdscalari resultlo = _simd_setzero_si();

__m128i templo = _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 0))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 1));
__m128i temphi = _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 1))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 1)), 1));
__m128i templo = _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 0))), _simd_extractf128_si(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 1));
__m128i temphi = _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 1))), _simd_extractf128_si(_mm256_castps_si256(_simd16_extract_ps(in, 1)), 1));

__m128i temp = _mm_packus_epi16(templo, temphi);

resultlo = _mm256_inserti128_si256(resultlo, temp, 0);
resultlo = _simd_insertf128_si(resultlo, temp, 0);
result = _simd16_insert_si(result, resultlo, 0);

return _simd16_castsi_ps(result);
@@ -278,12 +278,12 @@ struct PackTraits<8, true>
simd16scalari result = _simd16_setzero_si();
simdscalari resultlo = _simd_setzero_si();

__m128i templo = _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 0))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 1));
__m128i temphi = _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 1))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 1)), 1));
__m128i templo = _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 0))), _simd_extractf128_si(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 1));
__m128i temphi = _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 1))), _simd_extractf128_si(_mm256_castps_si256(_simd16_extract_ps(in, 1)), 1));

__m128i temp = _mm_packs_epi16(templo, temphi);

resultlo = _mm256_inserti128_si256(resultlo, temp, 0);
resultlo = _simd_insertf128_si(resultlo, temp, 0);
result = _simd16_insert_si(result, resultlo, 0);

return _simd16_castsi_ps(result);
@@ -1057,16 +1057,16 @@ template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16>
simdscalar simdlo = pack(_simd16_extract_ps(in, 0));
simdscalar simdhi = pack(_simd16_extract_ps(in, 1));

__m128i templo = _mm256_extractf128_si256(_simd_castps_si(simdlo), 0);
__m128i temphi = _mm256_extractf128_si256(_simd_castps_si(simdhi), 0);
__m128i templo = _simd_extractf128_si(_simd_castps_si(simdlo), 0);
__m128i temphi = _simd_extractf128_si(_simd_castps_si(simdhi), 0);

#else
__m128i templo = _mm256_cvtps_ph(_simd16_extract_ps(in, 0), _MM_FROUND_TRUNC);
__m128i temphi = _mm256_cvtps_ph(_simd16_extract_ps(in, 1), _MM_FROUND_TRUNC);

#endif
resultlo = _mm256_insertf128_si256(resultlo, templo, 0);
resultlo = _mm256_insertf128_si256(resultlo, temphi, 1);
resultlo = _simd_insertf128_si(resultlo, templo, 0);
resultlo = _simd_insertf128_si(resultlo, temphi, 1);

result = _simd16_insert_si(result, resultlo, 0);


+ 64
- 110
src/gallium/drivers/swr/rasterizer/core/utils.h

@@ -147,7 +147,7 @@ void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)

#if KNOB_SIMD_WIDTH == 8
INLINE
void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2)
void vTranspose3x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2)
{
__m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
__m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5
@@ -171,7 +171,7 @@ void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc
}

INLINE
void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3)
void vTranspose4x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2, const __m256 &vSrc3)
{
__m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
__m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5
@@ -357,15 +357,17 @@ struct Transpose8_8

INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
{
__m256i src = _mm256_load_si256(reinterpret_cast<const __m256i *>(pSrc)); // rrrrrrrrrrrrrrrrgggggggggggggggg

__m256i r = _mm256_permute4x64_epi64(src, 0x50); // 0x50 = 01010000b // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx

__m256i g = _mm256_permute4x64_epi64(src, 0xFA); // 0xFA = 11111010b // ggggggggxxxxxxxxggggggggxxxxxxxx

__m256i dst = _mm256_unpacklo_epi8(r, g); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg

_mm256_store_si256(reinterpret_cast<__m256i *>(pDst), dst);
simdscalari r = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc)); // rrrrrrrrrrrrrrrrgggggggggggggggg

simdscalari g = _simd_permute2f128_si(r, r, 1); // ggggggggggggggggxxxxxxxxxxxxxxxx

r = _simd_insertf128_si(r, _mm_srli_si128(_simd_extractf128_si(r, 0), 8), 1); // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx

g = _simd_insertf128_si(g, _mm_srli_si128(_simd_extractf128_si(g, 0), 8), 1); // ggggggggxxxxxxxxggggggggxxxxxxxx

simdscalari dst = _simd_unpacklo_epi8(r, g); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg

_simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);
}
#endif
};
@@ -414,35 +416,13 @@ struct Transpose32_32_32_32

vTranspose4x8(vDst, _simd16_extract_ps(src0, 0), _simd16_extract_ps(src1, 0), _simd16_extract_ps(src2, 0), _simd16_extract_ps(src3, 0));

#if 1
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, reinterpret_cast<simd16scalar *>(vDst)[0]);
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, reinterpret_cast<simd16scalar *>(vDst)[1]);
#else
_mm_store_ps(reinterpret_cast<float *>(pDst), vDst[0]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 4, vDst[1]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 8, vDst[2]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 12, vDst[3]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 16, vDst[4]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 20, vDst[5]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 24, vDst[6]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 28, vDst[7]);
#endif

vTranspose4x8(vDst, _simd16_extract_ps(src0, 1), _simd16_extract_ps(src1, 1), _simd16_extract_ps(src2, 1), _simd16_extract_ps(src3, 1));

#if 1
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, reinterpret_cast<simd16scalar *>(vDst)[2]);
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, reinterpret_cast<simd16scalar *>(vDst)[3]);
#else
_mm_store_ps(reinterpret_cast<float *>(pDst) + 32, vDst[0]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 36, vDst[1]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 40, vDst[2]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 44, vDst[3]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 48, vDst[4]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 52, vDst[5]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 56, vDst[6]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 60, vDst[7]);
#endif
}
#endif
};
@@ -489,35 +469,13 @@ struct Transpose32_32_32

vTranspose3x8(vDst, _simd16_extract_ps(src0, 0), _simd16_extract_ps(src1, 0), _simd16_extract_ps(src2, 0));

#if 1
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, reinterpret_cast<simd16scalar *>(vDst)[0]);
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, reinterpret_cast<simd16scalar *>(vDst)[1]);
#else
_mm_store_ps(reinterpret_cast<float *>(pDst), vDst[0]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 4, vDst[1]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 8, vDst[2]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 12, vDst[3]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 16, vDst[4]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 20, vDst[5]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 24, vDst[6]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 28, vDst[7]);
#endif

vTranspose3x8(vDst, _simd16_extract_ps(src0, 1), _simd16_extract_ps(src1, 1), _simd16_extract_ps(src2, 1));

#if 1
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, reinterpret_cast<simd16scalar *>(vDst)[2]);
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, reinterpret_cast<simd16scalar *>(vDst)[3]);
#else
_mm_store_ps(reinterpret_cast<float *>(pDst) + 32, vDst[0]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 36, vDst[1]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 40, vDst[2]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 44, vDst[3]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 48, vDst[4]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 52, vDst[5]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 56, vDst[6]);
_mm_store_ps(reinterpret_cast<float *>(pDst) + 60, vDst[7]);
#endif
}
#endif
};
@@ -558,24 +516,20 @@ struct Transpose32_32

INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
{
const float *pfSrc = reinterpret_cast<const float *>(pSrc);

__m256 src_r0 = _mm256_load_ps(pfSrc + 0);
__m256 src_r1 = _mm256_load_ps(pfSrc + 8);
__m256 src_g0 = _mm256_load_ps(pfSrc + 16);
__m256 src_g1 = _mm256_load_ps(pfSrc + 24);

__m256 dst0 = _mm256_unpacklo_ps(src_r0, src_g0);
__m256 dst1 = _mm256_unpackhi_ps(src_r0, src_g0);
__m256 dst2 = _mm256_unpacklo_ps(src_r1, src_g1);
__m256 dst3 = _mm256_unpackhi_ps(src_r1, src_g1);

float *pfDst = reinterpret_cast<float *>(pDst);

_mm256_store_ps(pfDst + 0, dst0);
_mm256_store_ps(pfDst + 8, dst1);
_mm256_store_ps(pfDst + 16, dst2);
_mm256_store_ps(pfDst + 24, dst3);
simdscalar src_r0 = _simd_load_ps(reinterpret_cast<const float *>(pSrc));
simdscalar src_r1 = _simd_load_ps(reinterpret_cast<const float *>(pSrc) + 8);
simdscalar src_g0 = _simd_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
simdscalar src_g1 = _simd_load_ps(reinterpret_cast<const float *>(pSrc) + 24);

simdscalar dst0 = _simd_unpacklo_ps(src_r0, src_g0);
simdscalar dst1 = _simd_unpackhi_ps(src_r0, src_g0);
simdscalar dst2 = _simd_unpacklo_ps(src_r1, src_g1);
simdscalar dst3 = _simd_unpackhi_ps(src_r1, src_g1);

_simd_store_ps(reinterpret_cast<float *>(pDst) + 0, dst0);
_simd_store_ps(reinterpret_cast<float *>(pDst) + 8, dst1);
_simd_store_ps(reinterpret_cast<float *>(pDst) + 16, dst2);
_simd_store_ps(reinterpret_cast<float *>(pDst) + 24, dst3);
}
#endif
};
@@ -625,25 +579,25 @@ struct Transpose16_16_16_16
simd16scalari src_rg = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));
simd16scalari src_ba = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc + sizeof(simd16scalari)));

__m256i src_r = _simd16_extract_si(src_rg, 0);
__m256i src_g = _simd16_extract_si(src_rg, 1);
__m256i src_b = _simd16_extract_si(src_ba, 0);
__m256i src_a = _simd16_extract_si(src_ba, 1);
__m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
__m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
__m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
__m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);
__m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
__m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
__m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
__m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);
_mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
_mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
_mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
_mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
simdscalari src_r = _simd16_extract_si(src_rg, 0);
simdscalari src_g = _simd16_extract_si(src_rg, 1);
simdscalari src_b = _simd16_extract_si(src_ba, 0);
simdscalari src_a = _simd16_extract_si(src_ba, 1);
simdscalari rg0 = _simd_unpacklo_epi16(src_r, src_g);
simdscalari rg1 = _simd_unpackhi_epi16(src_r, src_g);
simdscalari ba0 = _simd_unpacklo_epi16(src_b, src_a);
simdscalari ba1 = _simd_unpackhi_epi16(src_b, src_a);
simdscalari dst0 = _simd_unpacklo_epi32(rg0, ba0);
simdscalari dst1 = _simd_unpackhi_epi32(rg0, ba0);
simdscalari dst2 = _simd_unpacklo_epi32(rg1, ba1);
simdscalari dst3 = _simd_unpackhi_epi32(rg1, ba1);
_simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);
_simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);
_simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);
_simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);
}
#endif
};
@@ -691,25 +645,25 @@ struct Transpose16_16_16
{
simd16scalari src_rg = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));

__m256i src_r = _simd16_extract_si(src_rg, 0);
__m256i src_g = _simd16_extract_si(src_rg, 1);
__m256i src_b = _mm256_load_si256(reinterpret_cast<const __m256i *>(pSrc + sizeof(simd16scalari)));
__m256i src_a = _mm256_undefined_si256();
__m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
__m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
__m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
__m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);
__m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
__m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
__m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
__m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);
_mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
_mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
_mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
_mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
simdscalari src_r = _simd16_extract_si(src_rg, 0);
simdscalari src_g = _simd16_extract_si(src_rg, 1);
simdscalari src_b = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc + sizeof(simd16scalari)));
simdscalari src_a = _mm256_undefined_si256();
simdscalari rg0 = _simd_unpacklo_epi16(src_r, src_g);
simdscalari rg1 = _simd_unpackhi_epi16(src_r, src_g);
simdscalari ba0 = _simd_unpacklo_epi16(src_b, src_a);
simdscalari ba1 = _simd_unpackhi_epi16(src_b, src_a);
simdscalari dst0 = _simd_unpacklo_epi32(rg0, ba0);
simdscalari dst1 = _simd_unpackhi_epi32(rg0, ba0);
simdscalari dst2 = _simd_unpacklo_epi32(rg1, ba1);
simdscalari dst3 = _simd_unpackhi_epi32(rg1, ba1);
_simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);
_simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);
_simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);
_simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);
}
#endif
};
@@ -749,13 +703,13 @@ struct Transpose16_16
{
simd16scalari result = _simd16_setzero_si();

simd16scalari src = _simd16_castps_si(_simd16_load_ps(reinterpret_cast<const float *>(pSrc)));
simd16scalari src = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));

simdscalari srclo = _simd16_extract_si(src, 0);
simdscalari srchi = _simd16_extract_si(src, 1);

result = _simd16_insert_si(result, _mm256_unpacklo_epi16(srclo, srchi), 0);
result = _simd16_insert_si(result, _mm256_unpackhi_epi16(srclo, srchi), 1);
result = _simd16_insert_si(result, _simd_unpacklo_epi16(srclo, srchi), 0);
result = _simd16_insert_si(result, _simd_unpackhi_epi16(srclo, srchi), 1);

_simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), result);
}

+ 777
- 165
src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
File diff suppressed because it is too large

