// Singular_Value_Decomposition_Kernel_Declarations.hpp
  //#####################################################################
  // Copyright (c) 2010-2011, Eftychios Sifakis.
  //
  // Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
  // * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
  // * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
  // other materials provided with the distribution.
  //
  // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
  // BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
  // SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  //#####################################################################
  //###########################################################
  // Local variable declarations
  //###########################################################
  // Plain-float variables declared only when PRINT_DEBUGGING_OUTPUT is defined
  // together with USE_SSE_IMPLEMENTATION or USE_AVX_IMPLEMENTATION.
  // NOTE(review): presumably the debug-print code (not visible here) stores a
  // vector register into buf and copies one lane into these floats -- confirm
  // against the kernel body.
  //
  // The SSE and AVX variants differ only in the length of buf, so the shared
  // declarations are written once. If both USE_* macros are defined, the AVX
  // length (8) is used instead of declaring every name twice.
  #ifdef PRINT_DEBUGGING_OUTPUT
  #if defined(USE_SSE_IMPLEMENTATION) || defined(USE_AVX_IMPLEMENTATION)
  #ifdef USE_AVX_IMPLEMENTATION
  float buf[8];
  #else
  float buf[4];
  #endif
  float A11,A21,A31,A12,A22,A32,A13,A23,A33;
  float S11,S21,S31,S22,S32,S33;
  #ifdef COMPUTE_V_AS_QUATERNION
  float QVS,QVVX,QVVY,QVVZ;
  #endif
  #ifdef COMPUTE_V_AS_MATRIX
  float V11,V21,V31,V12,V22,V32,V13,V23,V33;
  #endif
  #ifdef COMPUTE_U_AS_QUATERNION
  float QUS,QUVX,QUVY,QUVZ;
  #endif
  #ifdef COMPUTE_U_AS_MATRIX
  float U11,U21,U31,U12,U22,U32,U13,U23,U33;
  #endif
  #endif
  #endif
  // Numeric constants. Each right-hand side is evaluated in double precision
  // and narrowed to float by the initialisation.

  // 3 + 2*sqrt(2), written as sqrt(8) + 3.
  const float Four_Gamma_Squared=sqrt(8.)+3.;
  // sin(pi/8), from the half-angle form sqrt(2 - sqrt(2)) / 2.
  const float Sine_Pi_Over_Eight=.5*sqrt(2.-sqrt(2.));
  // cos(pi/8), from the half-angle form sqrt(2 + sqrt(2)) / 2.
  const float Cosine_Pi_Over_Eight=.5*sqrt(2.+sqrt(2.));
  58. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sfour_gamma_squared;) ENABLE_SSE_IMPLEMENTATION(__m128 Vfour_gamma_squared;) ENABLE_AVX_IMPLEMENTATION(__m256 Vfour_gamma_squared;)
  59. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ssine_pi_over_eight;) ENABLE_SSE_IMPLEMENTATION(__m128 Vsine_pi_over_eight;) ENABLE_AVX_IMPLEMENTATION(__m256 Vsine_pi_over_eight;)
  60. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Scosine_pi_over_eight;) ENABLE_SSE_IMPLEMENTATION(__m128 Vcosine_pi_over_eight;) ENABLE_AVX_IMPLEMENTATION(__m256 Vcosine_pi_over_eight;)
  61. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sone_half;) ENABLE_SSE_IMPLEMENTATION(__m128 Vone_half;) ENABLE_AVX_IMPLEMENTATION(__m256 Vone_half;)
  62. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sone;) ENABLE_SSE_IMPLEMENTATION(__m128 Vone;) ENABLE_AVX_IMPLEMENTATION(__m256 Vone;)
  63. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Stiny_number;) ENABLE_SSE_IMPLEMENTATION(__m128 Vtiny_number;) ENABLE_AVX_IMPLEMENTATION(__m256 Vtiny_number;)
  64. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ssmall_number;) ENABLE_SSE_IMPLEMENTATION(__m128 Vsmall_number;) ENABLE_AVX_IMPLEMENTATION(__m256 Vsmall_number;)
  65. ENABLE_SCALAR_IMPLEMENTATION(Sfour_gamma_squared.f=Four_Gamma_Squared;) ENABLE_SSE_IMPLEMENTATION(Vfour_gamma_squared=_mm_set1_ps(Four_Gamma_Squared);) ENABLE_AVX_IMPLEMENTATION(Vfour_gamma_squared=_mm256_set1_ps(Four_Gamma_Squared);)
  66. ENABLE_SCALAR_IMPLEMENTATION(Ssine_pi_over_eight.f=Sine_Pi_Over_Eight;) ENABLE_SSE_IMPLEMENTATION(Vsine_pi_over_eight=_mm_set1_ps(Sine_Pi_Over_Eight);) ENABLE_AVX_IMPLEMENTATION(Vsine_pi_over_eight=_mm256_set1_ps(Sine_Pi_Over_Eight);)
  67. ENABLE_SCALAR_IMPLEMENTATION(Scosine_pi_over_eight.f=Cosine_Pi_Over_Eight;) ENABLE_SSE_IMPLEMENTATION(Vcosine_pi_over_eight=_mm_set1_ps(Cosine_Pi_Over_Eight);) ENABLE_AVX_IMPLEMENTATION(Vcosine_pi_over_eight=_mm256_set1_ps(Cosine_Pi_Over_Eight);)
  68. ENABLE_SCALAR_IMPLEMENTATION(Sone_half.f=.5;) ENABLE_SSE_IMPLEMENTATION(Vone_half=_mm_set1_ps(.5);) ENABLE_AVX_IMPLEMENTATION(Vone_half=_mm256_set1_ps(.5);)
  69. ENABLE_SCALAR_IMPLEMENTATION(Sone.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vone=_mm_set1_ps(1.);) ENABLE_AVX_IMPLEMENTATION(Vone=_mm256_set1_ps(1.);)
  70. ENABLE_SCALAR_IMPLEMENTATION(Stiny_number.f=1.e-20;) ENABLE_SSE_IMPLEMENTATION(Vtiny_number=_mm_set1_ps(1.e-20);) ENABLE_AVX_IMPLEMENTATION(Vtiny_number=_mm256_set1_ps(1.e-20);)
  71. ENABLE_SCALAR_IMPLEMENTATION(Ssmall_number.f=1.e-12;) ENABLE_SSE_IMPLEMENTATION(Vsmall_number=_mm_set1_ps(1.e-12);) ENABLE_AVX_IMPLEMENTATION(Vsmall_number=_mm256_set1_ps(1.e-12);)
  72. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sa11;) ENABLE_SSE_IMPLEMENTATION(__m128 Va11;) ENABLE_AVX_IMPLEMENTATION(__m256 Va11;)
  73. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sa21;) ENABLE_SSE_IMPLEMENTATION(__m128 Va21;) ENABLE_AVX_IMPLEMENTATION(__m256 Va21;)
  74. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sa31;) ENABLE_SSE_IMPLEMENTATION(__m128 Va31;) ENABLE_AVX_IMPLEMENTATION(__m256 Va31;)
  75. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sa12;) ENABLE_SSE_IMPLEMENTATION(__m128 Va12;) ENABLE_AVX_IMPLEMENTATION(__m256 Va12;)
  76. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sa22;) ENABLE_SSE_IMPLEMENTATION(__m128 Va22;) ENABLE_AVX_IMPLEMENTATION(__m256 Va22;)
  77. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sa32;) ENABLE_SSE_IMPLEMENTATION(__m128 Va32;) ENABLE_AVX_IMPLEMENTATION(__m256 Va32;)
  78. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sa13;) ENABLE_SSE_IMPLEMENTATION(__m128 Va13;) ENABLE_AVX_IMPLEMENTATION(__m256 Va13;)
  79. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sa23;) ENABLE_SSE_IMPLEMENTATION(__m128 Va23;) ENABLE_AVX_IMPLEMENTATION(__m256 Va23;)
  80. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sa33;) ENABLE_SSE_IMPLEMENTATION(__m128 Va33;) ENABLE_AVX_IMPLEMENTATION(__m256 Va33;)
  // Working variables for V, in two optional representations. Each group is
  // compiled only when its COMPUTE_V_AS_* macro is defined; both may be
  // defined at once because the variable names do not overlap.

  // Matrix form: nine variables v11..v33.
  #ifdef COMPUTE_V_AS_MATRIX
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv11;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv11;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv11;)
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv21;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv21;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv21;)
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv31;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv31;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv31;)
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv12;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv12;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv12;)
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv22;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv22;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv22;)
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv32;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv32;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv32;)
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv13;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv13;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv13;)
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv23;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv23;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv23;)
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv33;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv33;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv33;)
  #endif
  // Quaternion form: four variables qvs, qvvx, qvvy, qvvz.
  // NOTE(review): the names suggest a scalar part followed by the x/y/z
  // vector part -- confirm against the kernel body.
  #ifdef COMPUTE_V_AS_QUATERNION
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvs;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvs;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvs;)
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvvx;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvvx;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvvx;)
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvvy;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvvy;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvvy;)
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvvz;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvvz;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvvz;)
  #endif
  // Working variables for U, in two optional representations. Each group is
  // compiled only when its COMPUTE_U_AS_* macro is defined; both may be
  // defined at once because the variable names do not overlap.

  // Matrix form: nine variables u11..u33.
  #ifdef COMPUTE_U_AS_MATRIX
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Su11;) ENABLE_SSE_IMPLEMENTATION(__m128 Vu11;) ENABLE_AVX_IMPLEMENTATION(__m256 Vu11;)
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Su21;) ENABLE_SSE_IMPLEMENTATION(__m128 Vu21;) ENABLE_AVX_IMPLEMENTATION(__m256 Vu21;)
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Su31;) ENABLE_SSE_IMPLEMENTATION(__m128 Vu31;) ENABLE_AVX_IMPLEMENTATION(__m256 Vu31;)
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Su12;) ENABLE_SSE_IMPLEMENTATION(__m128 Vu12;) ENABLE_AVX_IMPLEMENTATION(__m256 Vu12;)
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Su22;) ENABLE_SSE_IMPLEMENTATION(__m128 Vu22;) ENABLE_AVX_IMPLEMENTATION(__m256 Vu22;)
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Su32;) ENABLE_SSE_IMPLEMENTATION(__m128 Vu32;) ENABLE_AVX_IMPLEMENTATION(__m256 Vu32;)
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Su13;) ENABLE_SSE_IMPLEMENTATION(__m128 Vu13;) ENABLE_AVX_IMPLEMENTATION(__m256 Vu13;)
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Su23;) ENABLE_SSE_IMPLEMENTATION(__m128 Vu23;) ENABLE_AVX_IMPLEMENTATION(__m256 Vu23;)
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Su33;) ENABLE_SSE_IMPLEMENTATION(__m128 Vu33;) ENABLE_AVX_IMPLEMENTATION(__m256 Vu33;)
  #endif
  // Quaternion form: four variables qus, quvx, quvy, quvz.
  // NOTE(review): the names suggest a scalar part followed by the x/y/z
  // vector part -- confirm against the kernel body.
  #ifdef COMPUTE_U_AS_QUATERNION
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Squs;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqus;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqus;)
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Squvx;) ENABLE_SSE_IMPLEMENTATION(__m128 Vquvx;) ENABLE_AVX_IMPLEMENTATION(__m256 Vquvx;)
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Squvy;) ENABLE_SSE_IMPLEMENTATION(__m128 Vquvy;) ENABLE_AVX_IMPLEMENTATION(__m256 Vquvy;)
  ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Squvz;) ENABLE_SSE_IMPLEMENTATION(__m128 Vquvz;) ENABLE_AVX_IMPLEMENTATION(__m256 Vquvz;)
  #endif
  115. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sc;) ENABLE_SSE_IMPLEMENTATION(__m128 Vc;) ENABLE_AVX_IMPLEMENTATION(__m256 Vc;)
  116. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs;)
  117. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sch;) ENABLE_SSE_IMPLEMENTATION(__m128 Vch;) ENABLE_AVX_IMPLEMENTATION(__m256 Vch;)
  118. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ssh;) ENABLE_SSE_IMPLEMENTATION(__m128 Vsh;) ENABLE_AVX_IMPLEMENTATION(__m256 Vsh;)
  119. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Stmp1;) ENABLE_SSE_IMPLEMENTATION(__m128 Vtmp1;) ENABLE_AVX_IMPLEMENTATION(__m256 Vtmp1;)
  120. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Stmp2;) ENABLE_SSE_IMPLEMENTATION(__m128 Vtmp2;) ENABLE_AVX_IMPLEMENTATION(__m256 Vtmp2;)
  121. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Stmp3;) ENABLE_SSE_IMPLEMENTATION(__m128 Vtmp3;) ENABLE_AVX_IMPLEMENTATION(__m256 Vtmp3;)
  122. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Stmp4;) ENABLE_SSE_IMPLEMENTATION(__m128 Vtmp4;) ENABLE_AVX_IMPLEMENTATION(__m256 Vtmp4;)
  123. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Stmp5;) ENABLE_SSE_IMPLEMENTATION(__m128 Vtmp5;) ENABLE_AVX_IMPLEMENTATION(__m256 Vtmp5;)