OgreSIMDHelper.h

Go to the documentation of this file.
00001 /*
00002 -----------------------------------------------------------------------------
00003 This source file is part of OGRE
00004     (Object-oriented Graphics Rendering Engine)
00005 For the latest info, see http://www.ogre3d.org/
00006 
00007 Copyright (c) 2000-2012 Torus Knot Software Ltd
00008 
00009 Permission is hereby granted, free of charge, to any person obtaining a copy
00010 of this software and associated documentation files (the "Software"), to deal
00011 in the Software without restriction, including without limitation the rights
00012 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
00013 copies of the Software, and to permit persons to whom the Software is
00014 furnished to do so, subject to the following conditions:
00015 
00016 The above copyright notice and this permission notice shall be included in
00017 all copies or substantial portions of the Software.
00018 
00019 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00020 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00021 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
00022 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
00023 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
00024 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
00025 THE SOFTWARE.
00026 -----------------------------------------------------------------------------
00027 */
00028 #ifndef __SIMDHelper_H__
00029 #define __SIMDHelper_H__
00030 
00031 #include "OgrePrerequisites.h"
00032 #include "OgrePlatformInformation.h"
00033 
00034 // Stack-alignment hackery.
00035 //
00036 // If macro __OGRE_SIMD_ALIGN_STACK defined, means there requests
00037 // special code to ensure stack align to a 16-bytes boundary.
00038 //
00039 // Note:
00040 //   This macro can only guarantee callee stack pointer (esp) align
00041 // to a 16-bytes boundary, but not that for frame pointer (ebp).
00042 // Because most compiler might use frame pointer to access to stack
00043 // variables, so you need to wrap those alignment required functions
00044 // with extra function call.
00045 //
00046 #if defined(__INTEL_COMPILER)
00047 // For intel's compiler, simply calling alloca seems to do the right
00048 // thing. The size of the allocated block seems to be irrelevant.
00049 #define __OGRE_SIMD_ALIGN_STACK()   _alloca(16)
00050 
00051 #elif OGRE_CPU == OGRE_CPU_X86 && (OGRE_COMPILER == OGRE_COMPILER_GNUC || OGRE_COMPILER == OGRE_COMPILER_CLANG)
00052 //
00053 // Horrible hack to align the stack to a 16-bytes boundary for gcc.
00054 //
00055 // We assume a gcc version >= 2.95 so that
00056 // -mpreferred-stack-boundary works.  Otherwise, all bets are
00057 // off.  However, -mpreferred-stack-boundary does not create a
00058 // stack alignment, it only preserves it.  Unfortunately,
00059 // since Ogre is designed as a flexible library, users might
00060 // compile their applications with the wrong stack alignment, and
00061 // even if a user takes care with stack alignment, many versions
00062 // of libc on Linux call main() with the wrong initial stack
00063 // alignment, with the result that the code is pessimally aligned
00064 // instead of having a 50% chance of being correct.
00065 //
00066 #if OGRE_ARCH_TYPE != OGRE_ARCHITECTURE_64
00067 
// 32-bit x86: round esp down to the next 16-byte boundary.  The bytes
// skipped by the 'andl' (and the alloca block) are reclaimed when the
// function returns, because alloca forces the compiler to keep a frame
// pointer and restore esp from it on exit.
00068 #define __OGRE_SIMD_ALIGN_STACK()                                   \
00069     {                                                               \
00070         /* Use alloca to allocate some memory on the stack.  */     \
00071         /* This alerts gcc that something funny is going on, */     \
00072         /* so that it does not omit the frame pointer etc.   */     \
00073         (void)__builtin_alloca(16);                                 \
00074         /* Now align the stack pointer */                           \
00075         __asm__ __volatile__ ("andl $-16, %esp");                   \
00076     }
00077 
00078 #else // 64
// 64-bit variant: same trick, but on rsp with the quad-word 'andq'.
// NOTE(review): the x86-64 System V ABI already guarantees 16-byte stack
// alignment at function entry, so this is belt-and-braces — confirm it is
// still needed for the targeted toolchains.
00079 #define __OGRE_SIMD_ALIGN_STACK()                                   \
00080     {                                                               \
00081         /* Use alloca to allocate some memory on the stack.  */     \
00082         /* This alerts gcc that something funny is going on, */     \
00083         /* so that it does not omit the frame pointer etc.   */     \
00084         (void)__builtin_alloca(16);                                 \
00085         /* Now align the stack pointer */                           \
00086         __asm__ __volatile__ ("andq $-16, %rsp");                   \
00087     }
00088 #endif //64
00089 
00090 #elif defined(_MSC_VER)
00091 // Fortunately, MSVC will align the stack automatically
00092 
00093 #endif
00094 
00095 
00096 // Additional platform-dependent header files and declares.
00097 //
00098 // NOTE: Should be sync with __OGRE_HAVE_SSE macro.
00099 //
00100 
00101 #if OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86
00102 
00103 // GCC version 4.0 upwards should be reliable for official SSE now,
00104 // so no longer define SSE macros ourselves
00105 // We don't support gcc 3.x anymore anyway, although that had SSE it was a bit flaky?
00106 #include <xmmintrin.h>
00107 
00108 
00109 #endif // OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86
00110 
00111 
00112 
00113 //---------------------------------------------------------------------
00114 // SIMD macros and helpers
00115 //---------------------------------------------------------------------
00116 
00117 
00118 namespace Ogre {
00126 #if __OGRE_HAVE_SSE
00127 
// Reciprocal square root of four packed floats.  The '#if 1' branch uses
// the raw hardware approximation (_mm_rsqrt_ps, roughly 12 bits of
// precision); the alternative routes through the Newton-Raphson refined
// version defined later in this file for higher precision at extra cost.
00138 #if 1
00139 #define __MM_RSQRT_PS(x)    _mm_rsqrt_ps(x)
00140 #else
00141 #define __MM_RSQRT_PS(x)    __mm_rsqrt_nr_ps(x) // Implemented below
00142 #endif
00143 
// Transpose a 4x4 matrix held in four __m128 rows, in place.  The inline
// /* rNM */ comments trace which source elements each register holds after
// every step (two unpack passes followed by a move-low/high pass).
00152 #define __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3)                                            \
00153     {                                                                                   \
00154         __m128 tmp3, tmp2, tmp1, tmp0;                                                  \
00155                                                                                         \
00156                                                             /* r00 r01 r02 r03 */       \
00157                                                             /* r10 r11 r12 r13 */       \
00158                                                             /* r20 r21 r22 r23 */       \
00159                                                             /* r30 r31 r32 r33 */       \
00160                                                                                         \
00161         tmp0 = _mm_unpacklo_ps(r0, r1);                       /* r00 r10 r01 r11 */     \
00162         tmp2 = _mm_unpackhi_ps(r0, r1);                       /* r02 r12 r03 r13 */     \
00163         tmp1 = _mm_unpacklo_ps(r2, r3);                       /* r20 r30 r21 r31 */     \
00164         tmp3 = _mm_unpackhi_ps(r2, r3);                       /* r22 r32 r23 r33 */     \
00165                                                                                         \
00166         r0 = _mm_movelh_ps(tmp0, tmp1);                         /* r00 r10 r20 r30 */   \
00167         r1 = _mm_movehl_ps(tmp1, tmp0);                         /* r01 r11 r21 r31 */   \
00168         r2 = _mm_movelh_ps(tmp2, tmp3);                         /* r02 r12 r22 r32 */   \
00169         r3 = _mm_movehl_ps(tmp3, tmp2);                         /* r03 r13 r23 r33 */   \
00170     }
00171 
// Transpose a 4x3 matrix that is packed contiguously into three __m128
// registers (12 floats, no padding) into four column vectors of three
// elements each — see the /* rNM */ element traces on every step.
00180 #define __MM_TRANSPOSE4x3_PS(v0, v1, v2)                                                \
00181     {                                                                                   \
00182         __m128 tmp0, tmp1, tmp2;                                                        \
00183                                                                                         \
00184                                                             /* r00 r01 r02 r10 */       \
00185                                                             /* r11 r12 r20 r21 */       \
00186                                                             /* r22 r30 r31 r32 */       \
00187                                                                                         \
00188         tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(3,0,3,0));  /* r00 r10 r22 r32 */     \
00189         tmp1 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(1,0,2,1));  /* r01 r02 r11 r12 */     \
00190         tmp2 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(2,1,3,2));  /* r20 r21 r30 r31 */     \
00191                                                                                         \
00192         v0 = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2,0,1,0));  /* r00 r10 r20 r30 */   \
00193         v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0));  /* r01 r11 r21 r31 */   \
00194         v2 = _mm_shuffle_ps(tmp1, tmp0, _MM_SHUFFLE(3,2,3,1));  /* r02 r12 r22 r32 */   \
00195     }
00196 
// Inverse of the above: turn four 3-element column vectors back into a
// densely packed 4x3 matrix spread across three __m128 registers.
00204 #define __MM_TRANSPOSE3x4_PS(v0, v1, v2)                                            \
00205     {                                                                               \
00206         __m128 tmp0, tmp1, tmp2;                                                    \
00207                                                                                     \
00208                                                             /* r00 r10 r20 r30 */   \
00209                                                             /* r01 r11 r21 r31 */   \
00210                                                             /* r02 r12 r22 r32 */   \
00211                                                                                     \
00212         tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(2,0,3,1));  /* r10 r30 r02 r22 */   \
00213         tmp1 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3,1,3,1));  /* r11 r31 r12 r32 */   \
00214         tmp2 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2,0,2,0));  /* r00 r20 r01 r21 */   \
00215                                                                                     \
00216         v0 = _mm_shuffle_ps(tmp2, tmp0, _MM_SHUFFLE(0,2,2,0));  /* r00 r01 r02 r10 */   \
00217         v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0));  /* r11 r12 r20 r21 */   \
00218         v2 = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3,1,1,3));  /* r22 r30 r31 r32 */   \
00219     }
00220 
// Broadcast the fp-th component (0..3) of v into all four lanes.
00224 #define __MM_SELECT(v, fp)                                                          \
00225     _mm_shuffle_ps((v), (v), _MM_SHUFFLE((fp),(fp),(fp),(fp)))
00226 
// Component-wise sum of four packed vectors: a + b + c + d.
00228 #define __MM_ACCUM4_PS(a, b, c, d)                                                  \
00229     _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d))
00230 
// Four simultaneous 4-component dot products: a0*b0 + a1*b1 + a2*b2 + a3*b3.
00234 #define __MM_DOT4x4_PS(a0, a1, a2, a3, b0, b1, b2, b3)                              \
00235     __MM_ACCUM4_PS(_mm_mul_ps(a0, b0), _mm_mul_ps(a1, b1), _mm_mul_ps(a2, b2), _mm_mul_ps(a3, b3))
00236 
// Dot of 4-component rows against a 3-component vector; r3 is added as-is
// (i.e. the vector's implicit w is 1).
00240 #define __MM_DOT4x3_PS(r0, r1, r2, r3, v0, v1, v2)                                  \
00241     __MM_ACCUM4_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2), r3)
00242 
// Component-wise sum of three packed vectors: a + b + c.
00244 #define __MM_ACCUM3_PS(a, b, c)                                                     \
00245     _mm_add_ps(_mm_add_ps(a, b), c)
00246 
// Three simultaneous 3-component dot products.
00250 #define __MM_DOT3x3_PS(r0, r1, r2, v0, v1, v2)                                      \
00251     __MM_ACCUM3_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2))
00252 
// Packed multiply-add: a * b + c.
00254 #define __MM_MADD_PS(a, b, c)                                                       \
00255     _mm_add_ps(_mm_mul_ps(a, b), c)
00256 
// Packed linear interpolation: a + t * (b - a).
00258 #define __MM_LERP_PS(t, a, b)                                                       \
00259     __MM_MADD_PS(_mm_sub_ps(b, a), t, a)
00260 
// Scalar (lowest lane only) multiply-add: a * b + c.
00262 #define __MM_MADD_SS(a, b, c)                                                       \
00263     _mm_add_ss(_mm_mul_ss(a, b), c)
00264 
// Scalar (lowest lane only) linear interpolation: a + t * (b - a).
00266 #define __MM_LERP_SS(t, a, b)                                                       \
00267     __MM_MADD_SS(_mm_sub_ss(b, a), t, a)
00268 
// Aligned load of four floats via direct dereference; p MUST be 16-byte
// aligned or this faults at runtime.
00270 #define __MM_LOAD_PS(p)                                                             \
00271     (*(__m128*)(p))
00272 
// Aligned store of four floats via direct dereference; same 16-byte
// alignment requirement on p.
00274 #define __MM_STORE_PS(p, v)                                                         \
00275     (*(__m128*)(p) = (v))
00276 
00277 
00280     template <bool aligned = false>
00281     struct SSEMemoryAccessor
00282     {
00283         static FORCEINLINE __m128 load(const float *p)
00284         {
00285             return _mm_loadu_ps(p);
00286         }
00287         static FORCEINLINE void store(float *p, const __m128& v)
00288         {
00289             _mm_storeu_ps(p, v);
00290         }
00291     };
00292     // Special aligned accessor
00293     template <>
00294     struct SSEMemoryAccessor<true>
00295     {
00296         static FORCEINLINE const __m128& load(const float *p)
00297         {
00298             return __MM_LOAD_PS(p);
00299         }
00300         static FORCEINLINE void store(float *p, const __m128& v)
00301         {
00302             __MM_STORE_PS(p, v);
00303         }
00304     };
00305 
00308     static FORCEINLINE bool _isAlignedForSSE(const void *p)
00309     {
00310         return (((size_t)p) & 15) == 0;
00311     }
00312 
00316     static FORCEINLINE __m128 __mm_rsqrt_nr_ps(const __m128& x)
00317     {
00318         static const __m128 v0pt5 = { 0.5f, 0.5f, 0.5f, 0.5f };
00319         static const __m128 v3pt0 = { 3.0f, 3.0f, 3.0f, 3.0f };
00320         __m128 t = _mm_rsqrt_ps(x);
00321         return _mm_mul_ps(_mm_mul_ps(v0pt5, t),
00322             _mm_sub_ps(v3pt0, _mm_mul_ps(_mm_mul_ps(x, t), t)));
00323     }
00324 
00325 // Macro to check the stack aligned for SSE
// Debug builds declare a local __m128 — which the compiler can only place
// at a 16-byte boundary if the stack itself is correctly aligned — and
// assert on its address.  Release builds expand to nothing.
00326 #if OGRE_DEBUG_MODE
00327 #define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()        \
00328     {                                               \
00329         __m128 test;                                \
00330         assert(_isAlignedForSSE(&test));            \
00331     }
00332 
00333 #else   // !OGRE_DEBUG_MODE
00334 #define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()
00335 
00336 #endif  // OGRE_DEBUG_MODE
00337 
00338 
00339 #endif  // __OGRE_HAVE_SSE
00340 
00343 }
00344 
00345 #endif // __SIMDHelper_H__

Copyright © 2012 Torus Knot Software Ltd
Creative Commons License
This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.
Last modified Fri May 25 23:36:27 2012