According to a quick micro-benchmark, this new version is 20% faster on my
Haswell laptop.
v2: Removed the XXX note about x86_64 from the comment
v3: Use an intrinsic instead of an __asm__ block. This should give us MSVC
support for free.
v4: Enable it for all x86_64 builds, not just with USE_X86_64_ASM
Signed-off-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
tags/10.3-branchpoint
| @@ -274,10 +274,12 @@ static inline int IROUND_POS(float f) | |||
| return (int) (f + 0.5F); | |||
| } | |||
| #ifdef __x86_64__ | |||
| # include <xmmintrin.h> | |||
| #endif | |||
| /** | |||
| * Convert float to int using a fast method. The rounding mode may vary. | |||
| * On x86-64 builds the SSE conversion intrinsic _mm_cvt_ss2si() is used. | |||
| */ | |||
| static inline int F_TO_I(float f) | |||
| { | |||
| @@ -292,6 +294,8 @@ static inline int F_TO_I(float f) | |||
| fistp r | |||
| } | |||
| return r; | |||
| #elif defined(__x86_64__) | |||
| return _mm_cvt_ss2si(_mm_load_ss(&f)); | |||
| #else | |||
| return IROUND(f); | |||
| #endif | |||