/*
-----------------------------------------------------------------------------
This source file is part of OGRE
(Object-oriented Graphics Rendering Engine)
For the latest info, see http://www.ogre3d.org/

Copyright (c) 2000-2012 Torus Knot Software Ltd

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
-----------------------------------------------------------------------------
*/
#ifndef __SIMDHelper_H__
#define __SIMDHelper_H__

#include "OgrePrerequisites.h"
#include "OgrePlatformInformation.h"

// Stack-alignment hackery.
//
// When the macro __OGRE_SIMD_ALIGN_STACK is defined, it expands to code
// that forces the stack pointer to a 16-byte boundary (as required by
// aligned SSE loads/stores of stack variables).
//
// Note:
// This macro can only guarantee that the callee's stack pointer (esp) is
// aligned to a 16-byte boundary — NOT the frame pointer (ebp). Because
// most compilers address stack variables through the frame pointer, a
// function that needs the alignment must be wrapped behind an extra
// function call, so that its own frame is established from the already
// aligned stack pointer.
//
#if defined(__INTEL_COMPILER)
// For Intel's compiler, simply calling alloca seems to do the right
// thing. The size of the allocated block seems to be irrelevant.
#define __OGRE_SIMD_ALIGN_STACK()   _alloca(16)

#elif OGRE_CPU == OGRE_CPU_X86 && (OGRE_COMPILER == OGRE_COMPILER_GNUC || OGRE_COMPILER == OGRE_COMPILER_CLANG)
//
// Horrible hack to align the stack to a 16-byte boundary for gcc.
//
// We assume a gcc version >= 2.95 so that
// -mpreferred-stack-boundary works. Otherwise, all bets are
// off. However, -mpreferred-stack-boundary does not create a
// stack alignment; it only preserves one. Unfortunately,
// since Ogre is designed as a flexible library, users might
// compile their application with the wrong stack alignment —
// and even when the user has taken care, many versions of
// libc on linux call main() with the wrong initial stack
// alignment, with the result that the code is now pessimally
// aligned instead of having a 50% chance of being correct.
//
#if OGRE_ARCH_TYPE != OGRE_ARCHITECTURE_64

#define __OGRE_SIMD_ALIGN_STACK()                                   \
    {                                                               \
        /* Use alloca to allocate some memory on the stack.  */     \
        /* This alerts gcc that something funny is going on, */     \
        /* so that it does not omit the frame pointer etc.   */     \
        (void)__builtin_alloca(16);                                 \
        /* Now align the stack pointer */                           \
        __asm__ __volatile__ ("andl $-16, %esp");                   \
    }

#else // 64
#define __OGRE_SIMD_ALIGN_STACK()                                   \
    {                                                               \
        /* Use alloca to allocate some memory on the stack.  */     \
        /* This alerts gcc that something funny is going on, */     \
        /* so that it does not omit the frame pointer etc.   */     \
        (void)__builtin_alloca(16);                                 \
        /* Now align the stack pointer */                           \
        __asm__ __volatile__ ("andq $-16, %rsp");                   \
    }
#endif //64

#elif defined(_MSC_VER)
// Fortunately, MSVC will align the stack automatically

#endif


// Additional platform-dependent header files and declares.
//
// NOTE: Should be kept in sync with the __OGRE_HAVE_SSE macro.
//

#if OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86

// GCC version 4.0 upwards should be reliable for official SSE now,
// so no longer define SSE macros ourselves
// We don't support gcc 3.x anymore anyway, although that had SSE it was a bit flaky?
#include <xmmintrin.h>


#endif // OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86



//---------------------------------------------------------------------
// SIMD macros and helpers
//---------------------------------------------------------------------


namespace Ogre {

#if __OGRE_HAVE_SSE

/** Approximate reciprocal square root of four packed floats.
    The raw _mm_rsqrt_ps gives only a low-precision estimate; flip the
    "#if 1" to 0 to route through __mm_rsqrt_nr_ps (defined below), which
    adds one Newton-Raphson refinement step at the cost of extra multiplies.
*/
#if 1
#define __MM_RSQRT_PS(x)    _mm_rsqrt_ps(x)
#else
#define __MM_RSQRT_PS(x)    __mm_rsqrt_nr_ps(x)   // Implemented below
#endif

/** Transpose, in place, a 4x4 matrix held in four __m128 rows
    (r0..r3, four floats each).
*/
#define __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3)                                            \
    {                                                                                   \
        __m128 tmp3, tmp2, tmp1, tmp0;                                                  \
                                                                                        \
                                                    /* r00 r01 r02 r03 */               \
                                                    /* r10 r11 r12 r13 */               \
                                                    /* r20 r21 r22 r23 */               \
                                                    /* r30 r31 r32 r33 */               \
                                                                                        \
        tmp0 = _mm_unpacklo_ps(r0, r1);             /* r00 r10 r01 r11 */               \
        tmp2 = _mm_unpackhi_ps(r0, r1);             /* r02 r12 r03 r13 */               \
        tmp1 = _mm_unpacklo_ps(r2, r3);             /* r20 r30 r21 r31 */               \
        tmp3 = _mm_unpackhi_ps(r2, r3);             /* r22 r32 r23 r33 */               \
                                                                                        \
        r0 = _mm_movelh_ps(tmp0, tmp1);             /* r00 r10 r20 r30 */               \
        r1 = _mm_movehl_ps(tmp1, tmp0);             /* r01 r11 r21 r31 */               \
        r2 = _mm_movelh_ps(tmp2, tmp3);             /* r02 r12 r22 r32 */               \
        r3 = _mm_movehl_ps(tmp3, tmp2);             /* r03 r13 r23 r33 */               \
    }

/** Transpose, in place, a 4x3 matrix packed contiguously into three
    __m128 registers (v0..v2; 12 floats, row-major) into three column
    vectors of four elements each.
*/
#define __MM_TRANSPOSE4x3_PS(v0, v1, v2)                                                \
    {                                                                                   \
        __m128 tmp0, tmp1, tmp2;                                                        \
                                                                                        \
                                                            /* r00 r01 r02 r10 */       \
                                                            /* r11 r12 r20 r21 */       \
                                                            /* r22 r30 r31 r32 */       \
                                                                                        \
        tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(3,0,3,0));    /* r00 r10 r22 r32 */   \
        tmp1 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(1,0,2,1));    /* r01 r02 r11 r12 */   \
        tmp2 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(2,1,3,2));    /* r20 r21 r30 r31 */   \
                                                                                        \
        v0 = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2,0,1,0));  /* r00 r10 r20 r30 */   \
        v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0));  /* r01 r11 r21 r31 */   \
        v2 = _mm_shuffle_ps(tmp1, tmp0, _MM_SHUFFLE(3,2,3,1));  /* r02 r12 r22 r32 */   \
    }

/** Inverse of __MM_TRANSPOSE4x3_PS: turn three column vectors (v0..v2)
    back into a contiguously packed, row-major 4x3 matrix, in place.
*/
#define __MM_TRANSPOSE3x4_PS(v0, v1, v2)                                                \
    {                                                                                   \
        __m128 tmp0, tmp1, tmp2;                                                        \
                                                                                        \
                                                            /* r00 r10 r20 r30 */       \
                                                            /* r01 r11 r21 r31 */       \
                                                            /* r02 r12 r22 r32 */       \
                                                                                        \
        tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(2,0,3,1));    /* r10 r30 r02 r22 */   \
        tmp1 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3,1,3,1));    /* r11 r31 r12 r32 */   \
        tmp2 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2,0,2,0));    /* r00 r20 r01 r21 */   \
                                                                                        \
        v0 = _mm_shuffle_ps(tmp2, tmp0, _MM_SHUFFLE(0,2,2,0));  /* r00 r01 r02 r10 */   \
        v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0));  /* r11 r12 r20 r21 */   \
        v2 = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3,1,1,3));  /* r22 r30 r31 r32 */   \
    }

/** Broadcast one component of v (selected by the immediate fp, 0-3)
    into all four lanes of the result.
*/
#define __MM_SELECT(v, fp)                                                              \
    _mm_shuffle_ps((v), (v), _MM_SHUFFLE((fp),(fp),(fp),(fp)))

/// Accumulate four vectors of four floats: (a + b) + (c + d).
#define __MM_ACCUM4_PS(a, b, c, d)                                                      \
    _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d))

/** Lane-wise dot product of two 4x4 "matrices": each result lane i is
    a0[i]*b0[i] + a1[i]*b1[i] + a2[i]*b2[i] + a3[i]*b3[i].
*/
#define __MM_DOT4x4_PS(a0, a1, a2, a3, b0, b1, b2, b3)                                  \
    __MM_ACCUM4_PS(_mm_mul_ps(a0, b0), _mm_mul_ps(a1, b1), _mm_mul_ps(a2, b2), _mm_mul_ps(a3, b3))

/** Lane-wise 4x3 dot product: each result lane i is
    r0[i]*v0[i] + r1[i]*v1[i] + r2[i]*v2[i] + r3[i]
    (i.e. r3 acts as the translation / homogeneous term).
*/
#define __MM_DOT4x3_PS(r0, r1, r2, r3, v0, v1, v2)                                      \
    __MM_ACCUM4_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2), r3)

/// Accumulate three vectors of four floats: (a + b) + c.
#define __MM_ACCUM3_PS(a, b, c)                                                         \
    _mm_add_ps(_mm_add_ps(a, b), c)

/** Lane-wise 3x3 dot product: each result lane i is
    r0[i]*v0[i] + r1[i]*v1[i] + r2[i]*v2[i].
*/
#define __MM_DOT3x3_PS(r0, r1, r2, v0, v1, v2)                                          \
    __MM_ACCUM3_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2))

/// Multiply-add, packed: a * b + c.
#define __MM_MADD_PS(a, b, c)                                                           \
    _mm_add_ps(_mm_mul_ps(a, b), c)

/// Linear interpolation, packed: a + (b - a) * t.
#define __MM_LERP_PS(t, a, b)                                                           \
    __MM_MADD_PS(_mm_sub_ps(b, a), t, a)

/// Multiply-add, lowest lane only: a * b + c.
#define __MM_MADD_SS(a, b, c)                                                           \
    _mm_add_ss(_mm_mul_ss(a, b), c)

/// Linear interpolation, lowest lane only: a + (b - a) * t.
#define __MM_LERP_SS(t, a, b)                                                           \
    __MM_MADD_SS(_mm_sub_ss(b, a), t, a)

/** Load four floats from p as an aligned __m128 lvalue.
    p MUST be 16-byte aligned.
    NOTE(review): this is a raw pointer-cast dereference rather than
    _mm_load_ps; it yields an lvalue (see __MM_STORE_PS) and relies on
    __m128's compiler-specific aliasing leniency — verify on any new
    toolchain before relying on it.
*/
#define __MM_LOAD_PS(p)                                                                 \
    (*(__m128*)(p))

/** Store v to p (16-byte aligned) via plain assignment; the whole
    expression evaluates to the stored value, like any C assignment.
*/
#define __MM_STORE_PS(p, v)                                                             \
    (*(__m128*)(p) = (v))


/** Helper that selects aligned or unaligned SSE memory access at
    compile time via the 'aligned' template argument. The primary
    (unaligned, default) template uses movups-style intrinsics.
*/
template <bool aligned = false>
struct SSEMemoryAccessor
{
    // Unaligned load: p may have any alignment.
    static FORCEINLINE __m128 load(const float *p)
    {
        return _mm_loadu_ps(p);
    }
    // Unaligned store: p may have any alignment.
    static FORCEINLINE void store(float *p, const __m128& v)
    {
        _mm_storeu_ps(p, v);
    }
};
// Special aligned accessor: p must be 16-byte aligned.
// Note load() returns a reference directly into the caller's buffer.
template <>
struct SSEMemoryAccessor<true>
{
    static FORCEINLINE const __m128& load(const float *p)
    {
        return __MM_LOAD_PS(p);
    }
    static FORCEINLINE void store(float *p, const __m128& v)
    {
        __MM_STORE_PS(p, v);
    }
};

/** Check whether p is aligned for SSE (i.e. the low four address
    bits are zero — a 16-byte boundary).
*/
static FORCEINLINE bool _isAlignedForSSE(const void *p)
{
    return (((size_t)p) & 15) == 0;
}

/** Reciprocal square root refined with one Newton-Raphson iteration:
    t = rsqrt_estimate(x);  result = 0.5 * t * (3 - x * t * t).
    More accurate than the raw _mm_rsqrt_ps estimate, at the cost of
    four extra multiplies and a subtract.
    NOTE(review): brace-initialising static __m128 constants is
    non-standard but accepted by the compilers OGRE targets; a portable
    alternative would be _mm_set_ps1 — confirm before changing.
*/
static FORCEINLINE __m128 __mm_rsqrt_nr_ps(const __m128& x)
{
    static const __m128 v0pt5 = { 0.5f, 0.5f, 0.5f, 0.5f };
    static const __m128 v3pt0 = { 3.0f, 3.0f, 3.0f, 3.0f };
    __m128 t = _mm_rsqrt_ps(x);
    return _mm_mul_ps(_mm_mul_ps(v0pt5, t),
        _mm_sub_ps(v3pt0, _mm_mul_ps(_mm_mul_ps(x, t), t)));
}

// Macro to check the stack is aligned for SSE (debug builds only).
// Declaring a local __m128 makes the compiler place a 16-byte-aligned
// object on the stack; the assert fires if the frame itself was not
// properly aligned (see the stack-alignment notes at the top of this
// file). Expands to nothing in release builds.
#if OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()        \
    {                                               \
        __m128 test;                                \
        assert(_isAlignedForSSE(&test));            \
    }

#else   // !OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()

#endif  // OGRE_DEBUG_MODE


#endif  // __OGRE_HAVE_SSE

}

#endif // __SIMDHelper_H__
Copyright © 2012 Torus Knot Software Ltd
This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.
Last modified Fri May 25 23:36:27 2012