// Singular_Value_Decomposition_Main_Kernel_Body.hpp

//#####################################################################
// Copyright (c) 2010-2011, Eftychios Sifakis.
//
// Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
// * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
// other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
// BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
// SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//#####################################################################
#ifdef __INTEL_COMPILER
#pragma warning( disable : 592 )
#endif
// #define USE_ACCURATE_RSQRT_IN_JACOBI_CONJUGATION
// #define PERFORM_STRICT_QUATERNION_RENORMALIZATION
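// The two switches above are optional accuracy knobs, both disabled by default:
// USE_ACCURATE_RSQRT_IN_JACOBI_CONJUGATION selects a more accurate reciprocal
// square root inside the Jacobi conjugation kernel, and
// PERFORM_STRICT_QUATERNION_RENORMALIZATION forces an explicit renormalization
// of the quaternion for V. This reading is inferred from the macro names and
// from the #if guard at the "Normalize quaternion" step below (which runs
// whenever the fast rsqrt was used, or whenever strict renormalization is
// requested), not from any statement in the original comments.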
{ // Begin block : Scope of qV (if not maintained)
#ifndef COMPUTE_V_AS_QUATERNION
ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvs;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvs;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvs;)
ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvvx;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvvx;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvvx;)
ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvvy;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvvy;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvvy;)
ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvvz;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvvz;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvvz;)
#endif
{ // Begin block : Symmetric eigenanalysis
ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss11;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs11;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs11;)
ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss21;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs21;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs21;)
ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss31;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs31;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs31;)
ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss22;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs22;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs22;)
ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss32;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs32;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs32;)
ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss33;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs33;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs33;)
ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vqvs=Vone;) ENABLE_AVX_IMPLEMENTATION(Vqvs=Vone;)
ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_xor_ps(Vqvvx,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_xor_ps(Vqvvx,Vqvvx);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_xor_ps(Vqvvy,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_xor_ps(Vqvvy,Vqvvy);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_xor_ps(Vqvvz,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_xor_ps(Vqvvz,Vqvvz);)
//###########################################################
// Compute normal equations matrix
//###########################################################
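// S = A^T A: Ss11..Ss33 below are the six unique entries of the symmetric
// normal equations matrix, accumulated column by column (e.g.
// s11 = a11^2+a21^2+a31^2 is column1 . column1, and s21 = column2 . column1).
// Its eigenvectors form V, and its eigenvalues are the squared singular
// values of A.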
ENABLE_SCALAR_IMPLEMENTATION(Ss11.f=Sa11.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Vs11=_mm_mul_ps(Va11,Va11);) ENABLE_AVX_IMPLEMENTATION(Vs11=_mm256_mul_ps(Va11,Va11);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa21.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va21,Va21);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va21,Va21);)
ENABLE_SCALAR_IMPLEMENTATION(Ss11.f=Stmp1.f+Ss11.f;) ENABLE_SSE_IMPLEMENTATION(Vs11=_mm_add_ps(Vtmp1,Vs11);) ENABLE_AVX_IMPLEMENTATION(Vs11=_mm256_add_ps(Vtmp1,Vs11);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa31.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va31,Va31);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va31,Va31);)
ENABLE_SCALAR_IMPLEMENTATION(Ss11.f=Stmp1.f+Ss11.f;) ENABLE_SSE_IMPLEMENTATION(Vs11=_mm_add_ps(Vtmp1,Vs11);) ENABLE_AVX_IMPLEMENTATION(Vs11=_mm256_add_ps(Vtmp1,Vs11);)
ENABLE_SCALAR_IMPLEMENTATION(Ss21.f=Sa12.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Vs21=_mm_mul_ps(Va12,Va11);) ENABLE_AVX_IMPLEMENTATION(Vs21=_mm256_mul_ps(Va12,Va11);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa22.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va22,Va21);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va22,Va21);)
ENABLE_SCALAR_IMPLEMENTATION(Ss21.f=Stmp1.f+Ss21.f;) ENABLE_SSE_IMPLEMENTATION(Vs21=_mm_add_ps(Vtmp1,Vs21);) ENABLE_AVX_IMPLEMENTATION(Vs21=_mm256_add_ps(Vtmp1,Vs21);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa32.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va32,Va31);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va32,Va31);)
ENABLE_SCALAR_IMPLEMENTATION(Ss21.f=Stmp1.f+Ss21.f;) ENABLE_SSE_IMPLEMENTATION(Vs21=_mm_add_ps(Vtmp1,Vs21);) ENABLE_AVX_IMPLEMENTATION(Vs21=_mm256_add_ps(Vtmp1,Vs21);)
ENABLE_SCALAR_IMPLEMENTATION(Ss31.f=Sa13.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Vs31=_mm_mul_ps(Va13,Va11);) ENABLE_AVX_IMPLEMENTATION(Vs31=_mm256_mul_ps(Va13,Va11);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa23.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va23,Va21);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va23,Va21);)
ENABLE_SCALAR_IMPLEMENTATION(Ss31.f=Stmp1.f+Ss31.f;) ENABLE_SSE_IMPLEMENTATION(Vs31=_mm_add_ps(Vtmp1,Vs31);) ENABLE_AVX_IMPLEMENTATION(Vs31=_mm256_add_ps(Vtmp1,Vs31);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa33.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va33,Va31);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va33,Va31);)
ENABLE_SCALAR_IMPLEMENTATION(Ss31.f=Stmp1.f+Ss31.f;) ENABLE_SSE_IMPLEMENTATION(Vs31=_mm_add_ps(Vtmp1,Vs31);) ENABLE_AVX_IMPLEMENTATION(Vs31=_mm256_add_ps(Vtmp1,Vs31);)
ENABLE_SCALAR_IMPLEMENTATION(Ss22.f=Sa12.f*Sa12.f;) ENABLE_SSE_IMPLEMENTATION(Vs22=_mm_mul_ps(Va12,Va12);) ENABLE_AVX_IMPLEMENTATION(Vs22=_mm256_mul_ps(Va12,Va12);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa22.f*Sa22.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va22,Va22);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va22,Va22);)
ENABLE_SCALAR_IMPLEMENTATION(Ss22.f=Stmp1.f+Ss22.f;) ENABLE_SSE_IMPLEMENTATION(Vs22=_mm_add_ps(Vtmp1,Vs22);) ENABLE_AVX_IMPLEMENTATION(Vs22=_mm256_add_ps(Vtmp1,Vs22);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa32.f*Sa32.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va32,Va32);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va32,Va32);)
ENABLE_SCALAR_IMPLEMENTATION(Ss22.f=Stmp1.f+Ss22.f;) ENABLE_SSE_IMPLEMENTATION(Vs22=_mm_add_ps(Vtmp1,Vs22);) ENABLE_AVX_IMPLEMENTATION(Vs22=_mm256_add_ps(Vtmp1,Vs22);)
ENABLE_SCALAR_IMPLEMENTATION(Ss32.f=Sa13.f*Sa12.f;) ENABLE_SSE_IMPLEMENTATION(Vs32=_mm_mul_ps(Va13,Va12);) ENABLE_AVX_IMPLEMENTATION(Vs32=_mm256_mul_ps(Va13,Va12);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa23.f*Sa22.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va23,Va22);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va23,Va22);)
ENABLE_SCALAR_IMPLEMENTATION(Ss32.f=Stmp1.f+Ss32.f;) ENABLE_SSE_IMPLEMENTATION(Vs32=_mm_add_ps(Vtmp1,Vs32);) ENABLE_AVX_IMPLEMENTATION(Vs32=_mm256_add_ps(Vtmp1,Vs32);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa33.f*Sa32.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va33,Va32);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va33,Va32);)
ENABLE_SCALAR_IMPLEMENTATION(Ss32.f=Stmp1.f+Ss32.f;) ENABLE_SSE_IMPLEMENTATION(Vs32=_mm_add_ps(Vtmp1,Vs32);) ENABLE_AVX_IMPLEMENTATION(Vs32=_mm256_add_ps(Vtmp1,Vs32);)
ENABLE_SCALAR_IMPLEMENTATION(Ss33.f=Sa13.f*Sa13.f;) ENABLE_SSE_IMPLEMENTATION(Vs33=_mm_mul_ps(Va13,Va13);) ENABLE_AVX_IMPLEMENTATION(Vs33=_mm256_mul_ps(Va13,Va13);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa23.f*Sa23.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va23,Va23);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va23,Va23);)
ENABLE_SCALAR_IMPLEMENTATION(Ss33.f=Stmp1.f+Ss33.f;) ENABLE_SSE_IMPLEMENTATION(Vs33=_mm_add_ps(Vtmp1,Vs33);) ENABLE_AVX_IMPLEMENTATION(Vs33=_mm256_add_ps(Vtmp1,Vs33);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa33.f*Sa33.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va33,Va33);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va33,Va33);)
ENABLE_SCALAR_IMPLEMENTATION(Ss33.f=Stmp1.f+Ss33.f;) ENABLE_SSE_IMPLEMENTATION(Vs33=_mm_add_ps(Vtmp1,Vs33);) ENABLE_AVX_IMPLEMENTATION(Vs33=_mm256_add_ps(Vtmp1,Vs33);)
//###########################################################
// Solve symmetric eigenproblem using Jacobi iteration
//###########################################################
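// Cyclic Jacobi iteration: each sweep applies three conjugations that rotate
// in the (1,2), (2,3) and (3,1) planes to annihilate the off-diagonal entries
// s21, s32 and s31 in turn, while accumulating the composite rotation in the
// quaternion (qvs,qvvx,qvvy,qvvz). A fixed count of four sweeps is used in
// place of a convergence test, which keeps the kernel entirely branch-free.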
for(int sweep=1;sweep<=4;sweep++){
// First Jacobi conjugation
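// The #define blocks below alias the canonical names used inside
// Singular_Value_Decomposition_Jacobi_Conjugation_Kernel.hpp (SS11, SQVVX,
// STMP1, ...) to a cyclic permutation of the actual variables, so that the
// same included kernel body implements all three plane rotations.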
#define SS11 Ss11
#define SS21 Ss21
#define SS31 Ss31
#define SS22 Ss22
#define SS32 Ss32
#define SS33 Ss33
#define SQVVX Sqvvx
#define SQVVY Sqvvy
#define SQVVZ Sqvvz
#define STMP1 Stmp1
#define STMP2 Stmp2
#define STMP3 Stmp3
#define VS11 Vs11
#define VS21 Vs21
#define VS31 Vs31
#define VS22 Vs22
#define VS32 Vs32
#define VS33 Vs33
#define VQVVX Vqvvx
#define VQVVY Vqvvy
#define VQVVZ Vqvvz
#define VTMP1 Vtmp1
#define VTMP2 Vtmp2
#define VTMP3 Vtmp3
#include "Singular_Value_Decomposition_Jacobi_Conjugation_Kernel.hpp"
#undef SS11
#undef SS21
#undef SS31
#undef SS22
#undef SS32
#undef SS33
#undef SQVVX
#undef SQVVY
#undef SQVVZ
#undef STMP1
#undef STMP2
#undef STMP3
#undef VS11
#undef VS21
#undef VS31
#undef VS22
#undef VS32
#undef VS33
#undef VQVVX
#undef VQVVY
#undef VQVVZ
#undef VTMP1
#undef VTMP2
#undef VTMP3
// Second Jacobi conjugation
#define SS11 Ss22
#define SS21 Ss32
#define SS31 Ss21
#define SS22 Ss33
#define SS32 Ss31
#define SS33 Ss11
#define SQVVX Sqvvy
#define SQVVY Sqvvz
#define SQVVZ Sqvvx
#define STMP1 Stmp2
#define STMP2 Stmp3
#define STMP3 Stmp1
#define VS11 Vs22
#define VS21 Vs32
#define VS31 Vs21
#define VS22 Vs33
#define VS32 Vs31
#define VS33 Vs11
#define VQVVX Vqvvy
#define VQVVY Vqvvz
#define VQVVZ Vqvvx
#define VTMP1 Vtmp2
#define VTMP2 Vtmp3
#define VTMP3 Vtmp1
#include "Singular_Value_Decomposition_Jacobi_Conjugation_Kernel.hpp"
#undef SS11
#undef SS21
#undef SS31
#undef SS22
#undef SS32
#undef SS33
#undef SQVVX
#undef SQVVY
#undef SQVVZ
#undef STMP1
#undef STMP2
#undef STMP3
#undef VS11
#undef VS21
#undef VS31
#undef VS22
#undef VS32
#undef VS33
#undef VQVVX
#undef VQVVY
#undef VQVVZ
#undef VTMP1
#undef VTMP2
#undef VTMP3
// Third Jacobi conjugation
#define SS11 Ss33
#define SS21 Ss31
#define SS31 Ss32
#define SS22 Ss11
#define SS32 Ss21
#define SS33 Ss22
#define SQVVX Sqvvz
#define SQVVY Sqvvx
#define SQVVZ Sqvvy
#define STMP1 Stmp3
#define STMP2 Stmp1
#define STMP3 Stmp2
#define VS11 Vs33
#define VS21 Vs31
#define VS31 Vs32
#define VS22 Vs11
#define VS32 Vs21
#define VS33 Vs22
#define VQVVX Vqvvz
#define VQVVY Vqvvx
#define VQVVZ Vqvvy
#define VTMP1 Vtmp3
#define VTMP2 Vtmp1
#define VTMP3 Vtmp2
#include "Singular_Value_Decomposition_Jacobi_Conjugation_Kernel.hpp"
#undef SS11
#undef SS21
#undef SS31
#undef SS22
#undef SS32
#undef SS33
#undef SQVVX
#undef SQVVY
#undef SQVVZ
#undef STMP1
#undef STMP2
#undef STMP3
#undef VS11
#undef VS21
#undef VS31
#undef VS22
#undef VS32
#undef VS33
#undef VQVVX
#undef VQVVY
#undef VQVVZ
#undef VTMP1
#undef VTMP2
#undef VTMP3
}
#ifdef PRINT_DEBUGGING_OUTPUT
#ifdef USE_SCALAR_IMPLEMENTATION
std::cout<<"Scalar S ="<<std::endl;
std::cout<<std::setw(12)<<Ss11.f<<std::endl;
std::cout<<std::setw(12)<<Ss21.f<<" "<<std::setw(12)<<Ss22.f<<std::endl;
std::cout<<std::setw(12)<<Ss31.f<<" "<<std::setw(12)<<Ss32.f<<" "<<std::setw(12)<<Ss33.f<<std::endl;
#endif
#ifdef USE_SSE_IMPLEMENTATION
_mm_storeu_ps(buf,Vs11);S11=buf[0];
_mm_storeu_ps(buf,Vs21);S21=buf[0];
_mm_storeu_ps(buf,Vs31);S31=buf[0];
_mm_storeu_ps(buf,Vs22);S22=buf[0];
_mm_storeu_ps(buf,Vs32);S32=buf[0];
_mm_storeu_ps(buf,Vs33);S33=buf[0];
std::cout<<"Vector S ="<<std::endl;
std::cout<<std::setw(12)<<S11<<std::endl;
std::cout<<std::setw(12)<<S21<<" "<<std::setw(12)<<S22<<std::endl;
std::cout<<std::setw(12)<<S31<<" "<<std::setw(12)<<S32<<" "<<std::setw(12)<<S33<<std::endl;
#endif
#ifdef USE_AVX_IMPLEMENTATION
_mm256_storeu_ps(buf,Vs11);S11=buf[0];
_mm256_storeu_ps(buf,Vs21);S21=buf[0];
_mm256_storeu_ps(buf,Vs31);S31=buf[0];
_mm256_storeu_ps(buf,Vs22);S22=buf[0];
_mm256_storeu_ps(buf,Vs32);S32=buf[0];
_mm256_storeu_ps(buf,Vs33);S33=buf[0];
std::cout<<"Vector S ="<<std::endl;
std::cout<<std::setw(12)<<S11<<std::endl;
std::cout<<std::setw(12)<<S21<<" "<<std::setw(12)<<S22<<std::endl;
std::cout<<std::setw(12)<<S31<<" "<<std::setw(12)<<S32<<" "<<std::setw(12)<<S33<<std::endl;
#endif
#endif
} // End block : Symmetric eigenanalysis
//###########################################################
// Normalize quaternion for matrix V
//###########################################################
#if !defined(USE_ACCURATE_RSQRT_IN_JACOBI_CONJUGATION) || defined(PERFORM_STRICT_QUATERNION_RENORMALIZATION)
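// Stmp2 below accumulates the squared magnitude
// |q|^2 = qvs^2 + qvvx^2 + qvvy^2 + qvvz^2, after which the quaternion is
// scaled by an approximation of 1/|q|.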
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sqvs.f*Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vqvs,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vqvs,Vqvs);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvx.f*Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvx,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvx,Vqvvx);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvy.f*Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvy,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvy,Vqvvy);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvz.f*Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvz,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvz,Vqvvz);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=rsqrt(Stmp2.f);) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_rsqrt_ps(Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_rsqrt_ps(Vtmp2);)
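// One Newton-Raphson refinement of the approximate reciprocal square root:
// with x = |q|^2 (Stmp2) and y ~= rsqrt(x) (Stmp1), the next six lines
// evaluate y' = (3/2)*y - (1/2)*x*y^3, roughly doubling the number of
// accurate bits of the hardware estimate before q is scaled by it.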
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp1.f*Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vtmp1,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vtmp1,Vone_half);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp1,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp1,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp1,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp1,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp2.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp2,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp2,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_add_ps(Vtmp1,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_add_ps(Vtmp1,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f-Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_sub_ps(Vtmp1,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_sub_ps(Vtmp1,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sqvs.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_mul_ps(Vqvs,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_mul_ps(Vqvs,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Sqvvx.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_mul_ps(Vqvvx,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_mul_ps(Vqvvx,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Sqvvy.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_mul_ps(Vqvvy,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_mul_ps(Vqvvy,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Sqvvz.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_mul_ps(Vqvvz,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_mul_ps(Vqvvz,Vtmp1);)
#ifdef PRINT_DEBUGGING_OUTPUT
#ifdef USE_SCALAR_IMPLEMENTATION
std::cout<<"Scalar qV ="<<std::endl;
std::cout<<std::setw(12)<<Sqvs.f<<" "<<std::setw(12)<<Sqvvx.f<<" "<<std::setw(12)<<Sqvvy.f<<" "<<std::setw(12)<<Sqvvz.f<<std::endl;
#endif
#ifdef USE_SSE_IMPLEMENTATION
_mm_storeu_ps(buf,Vqvs);QVS=buf[0];
_mm_storeu_ps(buf,Vqvvx);QVVX=buf[0];
_mm_storeu_ps(buf,Vqvvy);QVVY=buf[0];
_mm_storeu_ps(buf,Vqvvz);QVVZ=buf[0];
std::cout<<"Vector qV ="<<std::endl;
std::cout<<std::setw(12)<<QVS<<" "<<std::setw(12)<<QVVX<<" "<<std::setw(12)<<QVVY<<" "<<std::setw(12)<<QVVZ<<std::endl;
#endif
#ifdef USE_AVX_IMPLEMENTATION
_mm256_storeu_ps(buf,Vqvs);QVS=buf[0];
_mm256_storeu_ps(buf,Vqvvx);QVVX=buf[0];
_mm256_storeu_ps(buf,Vqvvy);QVVY=buf[0];
_mm256_storeu_ps(buf,Vqvvz);QVVZ=buf[0];
std::cout<<"Vector qV ="<<std::endl;
std::cout<<std::setw(12)<<QVS<<" "<<std::setw(12)<<QVVX<<" "<<std::setw(12)<<QVVY<<" "<<std::setw(12)<<QVVZ<<std::endl;
#endif
#endif
#endif
{ // Begin block : Conjugation with V
//###########################################################
// Transform quaternion to matrix V
//###########################################################
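// The lines below expand the quaternion q = (w,x,y,z) = (qvs,qvvx,qvvy,qvvz)
// into the standard rotation matrix
//     V = [ w^2+x^2-y^2-z^2   2(xy-wz)          2(xz+wy)        ]
//         [ 2(xy+wz)          w^2-x^2+y^2-z^2   2(yz-wx)        ]
//         [ 2(xz-wy)          2(yz+wx)          w^2-x^2-y^2+z^2 ]
// computing the diagonal first, then the off-diagonal pairs from the
// doubled components 2x, 2y, 2z.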
#ifndef COMPUTE_V_AS_MATRIX
ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv11;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv11;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv11;)
ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv21;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv21;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv21;)
ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv31;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv31;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv31;)
ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv12;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv12;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv12;)
ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv22;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv22;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv22;)
ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv32;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv32;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv32;)
ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv13;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv13;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv13;)
ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv23;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv23;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv23;)
ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv33;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv33;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv33;)
#endif
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvx.f*Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvx,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvx,Vqvvx);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sqvvy.f*Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vqvvy,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vqvvy,Vqvvy);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sqvvz.f*Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vqvvz,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vqvvz,Vqvvz);)
ENABLE_SCALAR_IMPLEMENTATION(Sv11.f=Sqvs.f*Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_mul_ps(Vqvs,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_mul_ps(Vqvs,Vqvs);)
ENABLE_SCALAR_IMPLEMENTATION(Sv22.f=Sv11.f-Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_sub_ps(Vv11,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_sub_ps(Vv11,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Sv33.f=Sv22.f-Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vv33=_mm_sub_ps(Vv22,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vv33=_mm256_sub_ps(Vv22,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Sv33.f=Sv33.f+Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vv33=_mm_add_ps(Vv33,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vv33=_mm256_add_ps(Vv33,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Sv22.f=Sv22.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_add_ps(Vv22,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_add_ps(Vv22,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Sv22.f=Sv22.f-Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_sub_ps(Vv22,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_sub_ps(Vv22,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Sv11.f=Sv11.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_add_ps(Vv11,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_add_ps(Vv11,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Sv11.f=Sv11.f-Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_sub_ps(Vv11,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_sub_ps(Vv11,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Sv11.f=Sv11.f-Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_sub_ps(Vv11,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_sub_ps(Vv11,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvx.f+Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_add_ps(Vqvvx,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_add_ps(Vqvvx,Vqvvx);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sqvvy.f+Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vqvvy,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vqvvy,Vqvvy);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sqvvz.f+Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_add_ps(Vqvvz,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_add_ps(Vqvvz,Vqvvz);)
ENABLE_SCALAR_IMPLEMENTATION(Sv32.f=Sqvs.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vv32=_mm_mul_ps(Vqvs,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vv32=_mm256_mul_ps(Vqvs,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Sv13.f=Sqvs.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vv13=_mm_mul_ps(Vqvs,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vv13=_mm256_mul_ps(Vqvs,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Sv21.f=Sqvs.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vv21=_mm_mul_ps(Vqvs,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vv21=_mm256_mul_ps(Vqvs,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvy.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvy,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvy,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sqvvz.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vqvvz,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vqvvz,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sqvvx.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vqvvx,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vqvvx,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Sv12.f=Stmp1.f-Sv21.f;) ENABLE_SSE_IMPLEMENTATION(Vv12=_mm_sub_ps(Vtmp1,Vv21);) ENABLE_AVX_IMPLEMENTATION(Vv12=_mm256_sub_ps(Vtmp1,Vv21);)
ENABLE_SCALAR_IMPLEMENTATION(Sv23.f=Stmp2.f-Sv32.f;) ENABLE_SSE_IMPLEMENTATION(Vv23=_mm_sub_ps(Vtmp2,Vv32);) ENABLE_AVX_IMPLEMENTATION(Vv23=_mm256_sub_ps(Vtmp2,Vv32);)
ENABLE_SCALAR_IMPLEMENTATION(Sv31.f=Stmp3.f-Sv13.f;) ENABLE_SSE_IMPLEMENTATION(Vv31=_mm_sub_ps(Vtmp3,Vv13);) ENABLE_AVX_IMPLEMENTATION(Vv31=_mm256_sub_ps(Vtmp3,Vv13);)
ENABLE_SCALAR_IMPLEMENTATION(Sv21.f=Stmp1.f+Sv21.f;) ENABLE_SSE_IMPLEMENTATION(Vv21=_mm_add_ps(Vtmp1,Vv21);) ENABLE_AVX_IMPLEMENTATION(Vv21=_mm256_add_ps(Vtmp1,Vv21);)
ENABLE_SCALAR_IMPLEMENTATION(Sv32.f=Stmp2.f+Sv32.f;) ENABLE_SSE_IMPLEMENTATION(Vv32=_mm_add_ps(Vtmp2,Vv32);) ENABLE_AVX_IMPLEMENTATION(Vv32=_mm256_add_ps(Vtmp2,Vv32);)
ENABLE_SCALAR_IMPLEMENTATION(Sv13.f=Stmp3.f+Sv13.f;) ENABLE_SSE_IMPLEMENTATION(Vv13=_mm_add_ps(Vtmp3,Vv13);) ENABLE_AVX_IMPLEMENTATION(Vv13=_mm256_add_ps(Vtmp3,Vv13);)
#ifdef COMPUTE_V_AS_MATRIX
#ifdef PRINT_DEBUGGING_OUTPUT
#ifdef USE_SCALAR_IMPLEMENTATION
std::cout<<"Scalar V ="<<std::endl;
std::cout<<std::setw(12)<<Sv11.f<<" "<<std::setw(12)<<Sv12.f<<" "<<std::setw(12)<<Sv13.f<<std::endl;
std::cout<<std::setw(12)<<Sv21.f<<" "<<std::setw(12)<<Sv22.f<<" "<<std::setw(12)<<Sv23.f<<std::endl;
std::cout<<std::setw(12)<<Sv31.f<<" "<<std::setw(12)<<Sv32.f<<" "<<std::setw(12)<<Sv33.f<<std::endl;
#endif
#ifdef USE_SSE_IMPLEMENTATION
_mm_storeu_ps(buf,Vv11);V11=buf[0];
_mm_storeu_ps(buf,Vv21);V21=buf[0];
_mm_storeu_ps(buf,Vv31);V31=buf[0];
_mm_storeu_ps(buf,Vv12);V12=buf[0];
_mm_storeu_ps(buf,Vv22);V22=buf[0];
_mm_storeu_ps(buf,Vv32);V32=buf[0];
_mm_storeu_ps(buf,Vv13);V13=buf[0];
_mm_storeu_ps(buf,Vv23);V23=buf[0];
_mm_storeu_ps(buf,Vv33);V33=buf[0];
std::cout<<"Vector V ="<<std::endl;
std::cout<<std::setw(12)<<V11<<" "<<std::setw(12)<<V12<<" "<<std::setw(12)<<V13<<std::endl;
std::cout<<std::setw(12)<<V21<<" "<<std::setw(12)<<V22<<" "<<std::setw(12)<<V23<<std::endl;
std::cout<<std::setw(12)<<V31<<" "<<std::setw(12)<<V32<<" "<<std::setw(12)<<V33<<std::endl;
#endif
#ifdef USE_AVX_IMPLEMENTATION
_mm256_storeu_ps(buf,Vv11);V11=buf[0];
_mm256_storeu_ps(buf,Vv21);V21=buf[0];
_mm256_storeu_ps(buf,Vv31);V31=buf[0];
_mm256_storeu_ps(buf,Vv12);V12=buf[0];
_mm256_storeu_ps(buf,Vv22);V22=buf[0];
_mm256_storeu_ps(buf,Vv32);V32=buf[0];
_mm256_storeu_ps(buf,Vv13);V13=buf[0];
_mm256_storeu_ps(buf,Vv23);V23=buf[0];
_mm256_storeu_ps(buf,Vv33);V33=buf[0];
std::cout<<"Vector V ="<<std::endl;
std::cout<<std::setw(12)<<V11<<" "<<std::setw(12)<<V12<<" "<<std::setw(12)<<V13<<std::endl;
std::cout<<std::setw(12)<<V21<<" "<<std::setw(12)<<V22<<" "<<std::setw(12)<<V23<<std::endl;
std::cout<<std::setw(12)<<V31<<" "<<std::setw(12)<<V32<<" "<<std::setw(12)<<V33<<std::endl;
#endif
#endif
#endif
//###########################################################
// Multiply (from the right) with V
//###########################################################
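// A <- A*V, processed one row of A at a time: Stmp2/Stmp3 cache the row's
// old a_i2/a_i3 (a_i1 is read before it is overwritten), so that each new
// entry a_ij = a_i1*v1j + a_i2*v2j + a_i3*v3j can be accumulated in place.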
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sa12.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=Va12;) ENABLE_AVX_IMPLEMENTATION(Vtmp2=Va12;)
ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sa13.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=Va13;) ENABLE_AVX_IMPLEMENTATION(Vtmp3=Va13;)
ENABLE_SCALAR_IMPLEMENTATION(Sa12.f=Sv12.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Va12=_mm_mul_ps(Vv12,Va11);) ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_mul_ps(Vv12,Va11);)
ENABLE_SCALAR_IMPLEMENTATION(Sa13.f=Sv13.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Va13=_mm_mul_ps(Vv13,Va11);) ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_mul_ps(Vv13,Va11);)
ENABLE_SCALAR_IMPLEMENTATION(Sa11.f=Sv11.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Va11=_mm_mul_ps(Vv11,Va11);) ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_mul_ps(Vv11,Va11);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv21.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv21,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv21,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Sa11.f=Sa11.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va11=_mm_add_ps(Va11,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_add_ps(Va11,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv31.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv31,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv31,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Sa11.f=Sa11.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va11=_mm_add_ps(Va11,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_add_ps(Va11,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv22.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv22,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv22,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Sa12.f=Sa12.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va12=_mm_add_ps(Va12,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_add_ps(Va12,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv32.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv32,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv32,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Sa12.f=Sa12.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va12=_mm_add_ps(Va12,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_add_ps(Va12,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv23.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv23,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv23,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Sa13.f=Sa13.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va13=_mm_add_ps(Va13,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_add_ps(Va13,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv33.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv33,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv33,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Sa13.f=Sa13.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va13=_mm_add_ps(Va13,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_add_ps(Va13,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sa22.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=Va22;) ENABLE_AVX_IMPLEMENTATION(Vtmp2=Va22;)
ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sa23.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=Va23;) ENABLE_AVX_IMPLEMENTATION(Vtmp3=Va23;)
ENABLE_SCALAR_IMPLEMENTATION(Sa22.f=Sv12.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Va22=_mm_mul_ps(Vv12,Va21);) ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_mul_ps(Vv12,Va21);)
ENABLE_SCALAR_IMPLEMENTATION(Sa23.f=Sv13.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Va23=_mm_mul_ps(Vv13,Va21);) ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_mul_ps(Vv13,Va21);)
ENABLE_SCALAR_IMPLEMENTATION(Sa21.f=Sv11.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Va21=_mm_mul_ps(Vv11,Va21);) ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_mul_ps(Vv11,Va21);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv21.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv21,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv21,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Sa21.f=Sa21.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va21=_mm_add_ps(Va21,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_add_ps(Va21,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv31.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv31,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv31,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Sa21.f=Sa21.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va21=_mm_add_ps(Va21,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_add_ps(Va21,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv22.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv22,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv22,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Sa22.f=Sa22.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va22=_mm_add_ps(Va22,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_add_ps(Va22,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv32.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv32,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv32,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Sa22.f=Sa22.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va22=_mm_add_ps(Va22,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_add_ps(Va22,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv23.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv23,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv23,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Sa23.f=Sa23.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va23=_mm_add_ps(Va23,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_add_ps(Va23,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv33.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv33,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv33,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Sa23.f=Sa23.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va23=_mm_add_ps(Va23,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_add_ps(Va23,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sa32.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=Va32;) ENABLE_AVX_IMPLEMENTATION(Vtmp2=Va32;)
ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sa33.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=Va33;) ENABLE_AVX_IMPLEMENTATION(Vtmp3=Va33;)
ENABLE_SCALAR_IMPLEMENTATION(Sa32.f=Sv12.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Va32=_mm_mul_ps(Vv12,Va31);) ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_mul_ps(Vv12,Va31);)
ENABLE_SCALAR_IMPLEMENTATION(Sa33.f=Sv13.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Va33=_mm_mul_ps(Vv13,Va31);) ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_mul_ps(Vv13,Va31);)
ENABLE_SCALAR_IMPLEMENTATION(Sa31.f=Sv11.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Va31=_mm_mul_ps(Vv11,Va31);) ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_mul_ps(Vv11,Va31);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv21.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv21,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv21,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Sa31.f=Sa31.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va31=_mm_add_ps(Va31,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_add_ps(Va31,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv31.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv31,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv31,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Sa31.f=Sa31.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va31=_mm_add_ps(Va31,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_add_ps(Va31,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv22.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv22,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv22,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Sa32.f=Sa32.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va32=_mm_add_ps(Va32,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_add_ps(Va32,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv32.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv32,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv32,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Sa32.f=Sa32.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va32=_mm_add_ps(Va32,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_add_ps(Va32,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv23.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv23,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv23,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Sa33.f=Sa33.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va33=_mm_add_ps(Va33,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_add_ps(Va33,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv33.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv33,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv33,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Sa33.f=Sa33.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va33=_mm_add_ps(Va33,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_add_ps(Va33,Vtmp1);)
#ifdef PRINT_DEBUGGING_OUTPUT
#ifdef USE_SCALAR_IMPLEMENTATION
std::cout<<"Scalar A (after multiplying with V) ="<<std::endl;
std::cout<<std::setw(12)<<Sa11.f<<" "<<std::setw(12)<<Sa12.f<<" "<<std::setw(12)<<Sa13.f<<std::endl;
std::cout<<std::setw(12)<<Sa21.f<<" "<<std::setw(12)<<Sa22.f<<" "<<std::setw(12)<<Sa23.f<<std::endl;
std::cout<<std::setw(12)<<Sa31.f<<" "<<std::setw(12)<<Sa32.f<<" "<<std::setw(12)<<Sa33.f<<std::endl;
#endif
#ifdef USE_SSE_IMPLEMENTATION
_mm_storeu_ps(buf,Va11);A11=buf[0];
_mm_storeu_ps(buf,Va21);A21=buf[0];
_mm_storeu_ps(buf,Va31);A31=buf[0];
_mm_storeu_ps(buf,Va12);A12=buf[0];
_mm_storeu_ps(buf,Va22);A22=buf[0];
_mm_storeu_ps(buf,Va32);A32=buf[0];
_mm_storeu_ps(buf,Va13);A13=buf[0];
_mm_storeu_ps(buf,Va23);A23=buf[0];
_mm_storeu_ps(buf,Va33);A33=buf[0];
std::cout<<"Vector A (after multiplying with V) ="<<std::endl;
std::cout<<std::setw(12)<<A11<<" "<<std::setw(12)<<A12<<" "<<std::setw(12)<<A13<<std::endl;
std::cout<<std::setw(12)<<A21<<" "<<std::setw(12)<<A22<<" "<<std::setw(12)<<A23<<std::endl;
std::cout<<std::setw(12)<<A31<<" "<<std::setw(12)<<A32<<" "<<std::setw(12)<<A33<<std::endl;
#endif
#ifdef USE_AVX_IMPLEMENTATION
_mm256_storeu_ps(buf,Va11);A11=buf[0];
_mm256_storeu_ps(buf,Va21);A21=buf[0];
_mm256_storeu_ps(buf,Va31);A31=buf[0];
_mm256_storeu_ps(buf,Va12);A12=buf[0];
_mm256_storeu_ps(buf,Va22);A22=buf[0];
_mm256_storeu_ps(buf,Va32);A32=buf[0];
_mm256_storeu_ps(buf,Va13);A13=buf[0];
_mm256_storeu_ps(buf,Va23);A23=buf[0];
_mm256_storeu_ps(buf,Va33);A33=buf[0];
std::cout<<"Vector A (after multiplying with V) ="<<std::endl;
std::cout<<std::setw(12)<<A11<<" "<<std::setw(12)<<A12<<" "<<std::setw(12)<<A13<<std::endl;
std::cout<<std::setw(12)<<A21<<" "<<std::setw(12)<<A22<<" "<<std::setw(12)<<A23<<std::endl;
std::cout<<std::setw(12)<<A31<<" "<<std::setw(12)<<A32<<" "<<std::setw(12)<<A33<<std::endl;
#endif
#endif
} // End block : Conjugation with V
} // End block : Scope of qV (if not maintained)
//###########################################################
// Permute columns such that the singular values are sorted
//###########################################################
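// Stmp1/Stmp2/Stmp3 below receive the squared norms of columns 1..3 of A
// (the squared singular value estimates); a branch-free compare-and-swap
// network then reorders the columns of A (and of V, in whichever form it is
// maintained) so the singular values come out in decreasing order.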
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa11.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va11,Va11);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va11,Va11);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa21.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va21,Va21);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va21,Va21);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_add_ps(Vtmp1,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_add_ps(Vtmp1,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa31.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va31,Va31);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va31,Va31);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_add_ps(Vtmp1,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_add_ps(Vtmp1,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sa12.f*Sa12.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Va12,Va12);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Va12,Va12);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa22.f*Sa22.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va22,Va22);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va22,Va22);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp2.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp2,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp2,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa32.f*Sa32.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va32,Va32);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va32,Va32);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp2.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp2,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp2,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sa13.f*Sa13.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Va13,Va13);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Va13,Va13);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa23.f*Sa23.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va23,Va23);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va23,Va23);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp3.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_add_ps(Vtmp3,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_add_ps(Vtmp3,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa33.f*Sa33.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va33,Va33);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va33,Va33);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp3.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_add_ps(Vtmp3,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_add_ps(Vtmp3,Vtmp4);)
// Swap columns 1-2 if necessary
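// Branch-free conditional swap: Stmp4 is an all-ones mask in lanes where the
// swap is needed and zero elsewhere. For each pair (a,b), the sequence
//     tmp5 = (a ^ b) & mask;  a ^= tmp5;  b ^= tmp5;
// exchanges a and b exactly in the masked lanes and leaves the rest intact.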
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.ui=(Stmp1.f<Stmp2.f)?0xffffffff:0;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_cmplt_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmp_ps(Vtmp1,Vtmp2,_CMP_LT_OS);) //ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmplt_ps(Vtmp1,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa11.ui^Sa12.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va11,Va12);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va11,Va12);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sa11.ui=Sa11.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va11=_mm_xor_ps(Va11,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_xor_ps(Va11,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Sa12.ui=Sa12.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va12=_mm_xor_ps(Va12,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_xor_ps(Va12,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa21.ui^Sa22.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va21,Va22);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va21,Va22);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sa21.ui=Sa21.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va21=_mm_xor_ps(Va21,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_xor_ps(Va21,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Sa22.ui=Sa22.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va22=_mm_xor_ps(Va22,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_xor_ps(Va22,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa31.ui^Sa32.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va31,Va32);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va31,Va32);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sa31.ui=Sa31.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va31=_mm_xor_ps(Va31,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_xor_ps(Va31,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Sa32.ui=Sa32.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va32=_mm_xor_ps(Va32,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_xor_ps(Va32,Vtmp5);)
#ifdef COMPUTE_V_AS_MATRIX
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv11.ui^Sv12.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv11,Vv12);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv11,Vv12);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sv11.ui=Sv11.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_xor_ps(Vv11,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_xor_ps(Vv11,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Sv12.ui=Sv12.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv12=_mm_xor_ps(Vv12,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv12=_mm256_xor_ps(Vv12,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv21.ui^Sv22.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv21,Vv22);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv21,Vv22);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sv21.ui=Sv21.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv21=_mm_xor_ps(Vv21,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv21=_mm256_xor_ps(Vv21,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Sv22.ui=Sv22.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_xor_ps(Vv22,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_xor_ps(Vv22,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv31.ui^Sv32.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv31,Vv32);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv31,Vv32);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sv31.ui=Sv31.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv31=_mm_xor_ps(Vv31,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv31=_mm256_xor_ps(Vv31,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Sv32.ui=Sv32.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv32=_mm_xor_ps(Vv32,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv32=_mm256_xor_ps(Vv32,Vtmp5);)
#endif
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp1.ui^Stmp2.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vtmp1,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.ui=Stmp1.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_xor_ps(Vtmp1,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_xor_ps(Vtmp1,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.ui=Stmp2.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_xor_ps(Vtmp2,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_xor_ps(Vtmp2,Vtmp5);)
// If columns 1-2 have been swapped, negate 2nd column of A and V so that V is still a rotation
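// Build a per-lane factor of -1 (swapped) or +1 (not swapped) without
// branching: Stmp5 = (-2.0) masked by Stmp4, then Stmp4 = 1 + Stmp5, and
// the second column is multiplied by that factor.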
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=-2.;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_set1_ps(-2.);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_set1_ps(-2.);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=Vone;) ENABLE_AVX_IMPLEMENTATION(Vtmp4=Vone;)
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f+Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_add_ps(Vtmp4,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_add_ps(Vtmp4,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Sa12.f=Sa12.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va12=_mm_mul_ps(Va12,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_mul_ps(Va12,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sa22.f=Sa22.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va22=_mm_mul_ps(Va22,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_mul_ps(Va22,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sa32.f=Sa32.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va32=_mm_mul_ps(Va32,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_mul_ps(Va32,Vtmp4);)
#ifdef COMPUTE_V_AS_MATRIX
ENABLE_SCALAR_IMPLEMENTATION(Sv12.f=Sv12.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv12=_mm_mul_ps(Vv12,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv12=_mm256_mul_ps(Vv12,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sv22.f=Sv22.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_mul_ps(Vv22,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_mul_ps(Vv22,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sv32.f=Sv32.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv32=_mm_mul_ps(Vv32,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv32=_mm256_mul_ps(Vv32,Vtmp4);)
#endif
// If columns 1-2 have been swapped, also update quaternion representation of V (the quaternion may become un-normalized after this)
#ifdef COMPUTE_V_AS_QUATERNION
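// The first two lines below remap Stmp4 from -1/+1 to -1/0. With that value,
// the remaining updates evaluate to q <- q*(1+k) in swapped lanes, i.e.
// (w,x,y,z) -> (w-z, x+y, y-x, z+w): the quaternion of a 90-degree rotation
// about the z axis, scaled by sqrt(2) (hence the note above about the
// quaternion becoming un-normalized), and to the identity elsewhere.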
  546. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f*Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vtmp4,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vtmp4,Vone_half);)
  547. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f-Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_sub_ps(Vtmp4,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_sub_ps(Vtmp4,Vone_half);)
  548. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvz);)
  549. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvs);)
  550. ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sqvs.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_mul_ps(Vqvs,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_mul_ps(Vqvs,Vtmp4);)
  551. ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Sqvvz.f-Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_sub_ps(Vqvvz,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_sub_ps(Vqvvz,Vqvs);)
  552. ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=Vtmp5;) ENABLE_AVX_IMPLEMENTATION(Vqvs=Vtmp5;)
  553. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvx);)
  554. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvvy);)
  555. ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Sqvvy.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_mul_ps(Vqvvy,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_mul_ps(Vqvvy,Vtmp4);)
  556. ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Sqvvx.f-Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_sub_ps(Vqvvx,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_sub_ps(Vqvvx,Vqvvy);)
  557. ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=Vtmp5;) ENABLE_AVX_IMPLEMENTATION(Vqvvy=Vtmp5;)
  558. #endif
  559. // Swap columns 1-3 if necessary
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.ui=(Stmp1.f<Stmp3.f)?0xffffffff:0;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_cmplt_ps(Vtmp1,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmp_ps(Vtmp1,Vtmp3, _CMP_LT_OS);) //ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmplt_ps(Vtmp1,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa11.ui^Sa13.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va11,Va13);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va11,Va13);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sa11.ui=Sa11.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va11=_mm_xor_ps(Va11,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_xor_ps(Va11,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Sa13.ui=Sa13.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va13=_mm_xor_ps(Va13,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_xor_ps(Va13,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa21.ui^Sa23.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va21,Va23);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va21,Va23);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sa21.ui=Sa21.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va21=_mm_xor_ps(Va21,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_xor_ps(Va21,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Sa23.ui=Sa23.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va23=_mm_xor_ps(Va23,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_xor_ps(Va23,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa31.ui^Sa33.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va31,Va33);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va31,Va33);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sa31.ui=Sa31.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va31=_mm_xor_ps(Va31,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_xor_ps(Va31,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Sa33.ui=Sa33.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va33=_mm_xor_ps(Va33,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_xor_ps(Va33,Vtmp5);)
#ifdef COMPUTE_V_AS_MATRIX
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv11.ui^Sv13.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv11,Vv13);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv11,Vv13);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sv11.ui=Sv11.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_xor_ps(Vv11,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_xor_ps(Vv11,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Sv13.ui=Sv13.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv13=_mm_xor_ps(Vv13,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv13=_mm256_xor_ps(Vv13,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv21.ui^Sv23.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv21,Vv23);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv21,Vv23);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sv21.ui=Sv21.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv21=_mm_xor_ps(Vv21,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv21=_mm256_xor_ps(Vv21,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Sv23.ui=Sv23.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv23=_mm_xor_ps(Vv23,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv23=_mm256_xor_ps(Vv23,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv31.ui^Sv33.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv31,Vv33);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv31,Vv33);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sv31.ui=Sv31.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv31=_mm_xor_ps(Vv31,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv31=_mm256_xor_ps(Vv31,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Sv33.ui=Sv33.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv33=_mm_xor_ps(Vv33,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv33=_mm256_xor_ps(Vv33,Vtmp5);)
#endif
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp1.ui^Stmp3.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vtmp1,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vtmp1,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.ui=Stmp1.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_xor_ps(Vtmp1,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_xor_ps(Vtmp1,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp3.ui=Stmp3.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_xor_ps(Vtmp3,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_xor_ps(Vtmp3,Vtmp5);)
// If columns 1-3 have been swapped, negate 1st column of A and V so that V is still a rotation
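// Build a per-lane multiplier of -1 (swapped lanes) or +1 (others): (-2 & mask)
// is -2 or 0, so adding 1 yields -1 or +1. Multiplying the first column by this
// flips its sign only where the swap actually happened.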
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=-2.;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_set1_ps(-2.);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_set1_ps(-2.);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=Vone;) ENABLE_AVX_IMPLEMENTATION(Vtmp4=Vone;)
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f+Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_add_ps(Vtmp4,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_add_ps(Vtmp4,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Sa11.f=Sa11.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va11=_mm_mul_ps(Va11,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_mul_ps(Va11,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sa21.f=Sa21.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va21=_mm_mul_ps(Va21,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_mul_ps(Va21,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sa31.f=Sa31.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va31=_mm_mul_ps(Va31,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_mul_ps(Va31,Vtmp4);)
#ifdef COMPUTE_V_AS_MATRIX
ENABLE_SCALAR_IMPLEMENTATION(Sv11.f=Sv11.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_mul_ps(Vv11,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_mul_ps(Vv11,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sv21.f=Sv21.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv21=_mm_mul_ps(Vv21,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv21=_mm256_mul_ps(Vv21,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sv31.f=Sv31.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv31=_mm_mul_ps(Vv31,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv31=_mm256_mul_ps(Vv31,Vtmp4);)
#endif
// If columns 1-3 have been swapped, also update quaternion representation of V (the quaternion may become un-normalized after this)
#ifdef COMPUTE_V_AS_QUATERNION
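// Stmp4 is +1 or -1 here, so Stmp4*Sone_half-Sone_half evaluates to 0 (no swap)
// or -1 (swap). With -1 the updates below amount to right-multiplying the
// quaternion by (1,0,1,0), a quarter-turn about y matching the column-1/3
// exchange plus the sign flip, without the 1/sqrt(2) factor -- hence the note
// above that the quaternion may become un-normalized.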
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f*Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vtmp4,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vtmp4,Vone_half);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f-Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_sub_ps(Vtmp4,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_sub_ps(Vtmp4,Vone_half);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvy);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvs);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sqvs.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_mul_ps(Vqvs,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_mul_ps(Vqvs,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Sqvvy.f-Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_sub_ps(Vqvvy,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_sub_ps(Vqvvy,Vqvs);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=Vtmp5;) ENABLE_AVX_IMPLEMENTATION(Vqvs=Vtmp5;)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvz);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvvx);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Sqvvx.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_mul_ps(Vqvvx,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_mul_ps(Vqvvx,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Sqvvz.f-Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_sub_ps(Vqvvz,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_sub_ps(Vqvvz,Vqvvx);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=Vtmp5;) ENABLE_AVX_IMPLEMENTATION(Vqvvx=Vtmp5;)
#endif
// Swap columns 2-3 if necessary
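// Same masked-XOR swap idiom as for columns 1-3 above, this time keyed on Stmp2<Stmp3.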
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.ui=(Stmp2.f<Stmp3.f)?0xffffffff:0;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_cmplt_ps(Vtmp2,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmp_ps(Vtmp2,Vtmp3, _CMP_LT_OS);) //ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmplt_ps(Vtmp2,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa12.ui^Sa13.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va12,Va13);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va12,Va13);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sa12.ui=Sa12.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va12=_mm_xor_ps(Va12,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_xor_ps(Va12,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Sa13.ui=Sa13.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va13=_mm_xor_ps(Va13,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_xor_ps(Va13,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa22.ui^Sa23.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va22,Va23);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va22,Va23);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sa22.ui=Sa22.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va22=_mm_xor_ps(Va22,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_xor_ps(Va22,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Sa23.ui=Sa23.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va23=_mm_xor_ps(Va23,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_xor_ps(Va23,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa32.ui^Sa33.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va32,Va33);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va32,Va33);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sa32.ui=Sa32.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va32=_mm_xor_ps(Va32,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_xor_ps(Va32,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Sa33.ui=Sa33.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va33=_mm_xor_ps(Va33,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_xor_ps(Va33,Vtmp5);)
#ifdef COMPUTE_V_AS_MATRIX
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv12.ui^Sv13.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv12,Vv13);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv12,Vv13);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sv12.ui=Sv12.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv12=_mm_xor_ps(Vv12,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv12=_mm256_xor_ps(Vv12,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Sv13.ui=Sv13.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv13=_mm_xor_ps(Vv13,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv13=_mm256_xor_ps(Vv13,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv22.ui^Sv23.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv22,Vv23);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv22,Vv23);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sv22.ui=Sv22.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_xor_ps(Vv22,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_xor_ps(Vv22,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Sv23.ui=Sv23.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv23=_mm_xor_ps(Vv23,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv23=_mm256_xor_ps(Vv23,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv32.ui^Sv33.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv32,Vv33);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv32,Vv33);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sv32.ui=Sv32.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv32=_mm_xor_ps(Vv32,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv32=_mm256_xor_ps(Vv32,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Sv33.ui=Sv33.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv33=_mm_xor_ps(Vv33,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv33=_mm256_xor_ps(Vv33,Vtmp5);)
#endif
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp2.ui^Stmp3.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vtmp2,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vtmp2,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.ui=Stmp2.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_xor_ps(Vtmp2,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_xor_ps(Vtmp2,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp3.ui=Stmp3.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_xor_ps(Vtmp3,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_xor_ps(Vtmp3,Vtmp5);)
// If columns 2-3 have been swapped, negate 3rd column of A and V so that V is still a rotation
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=-2.;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_set1_ps(-2.);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_set1_ps(-2.);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=Vone;) ENABLE_AVX_IMPLEMENTATION(Vtmp4=Vone;)
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f+Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_add_ps(Vtmp4,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_add_ps(Vtmp4,Vtmp5);)
ENABLE_SCALAR_IMPLEMENTATION(Sa13.f=Sa13.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va13=_mm_mul_ps(Va13,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_mul_ps(Va13,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sa23.f=Sa23.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va23=_mm_mul_ps(Va23,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_mul_ps(Va23,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sa33.f=Sa33.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va33=_mm_mul_ps(Va33,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_mul_ps(Va33,Vtmp4);)
#ifdef COMPUTE_V_AS_MATRIX
ENABLE_SCALAR_IMPLEMENTATION(Sv13.f=Sv13.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv13=_mm_mul_ps(Vv13,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv13=_mm256_mul_ps(Vv13,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sv23.f=Sv23.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv23=_mm_mul_ps(Vv23,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv23=_mm256_mul_ps(Vv23,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sv33.f=Sv33.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv33=_mm_mul_ps(Vv33,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv33=_mm256_mul_ps(Vv33,Vtmp4);)
#endif
// If columns 2-3 have been swapped, also update quaternion representation of V (the quaternion may become un-normalized after this)
#ifdef COMPUTE_V_AS_QUATERNION
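// Analogous quaternion fix-up for the column-2/3 exchange: an (un-normalized)
// quarter-turn folded in by right-multiplication, here mixing the (qvs,qvvx)
// and (qvvy,qvvz) pairs, i.e. a rotation about the x axis.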
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f*Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vtmp4,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vtmp4,Vone_half);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f-Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_sub_ps(Vtmp4,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_sub_ps(Vtmp4,Vone_half);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvx);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvs);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sqvs.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_mul_ps(Vqvs,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_mul_ps(Vqvs,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Sqvvx.f-Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_sub_ps(Vqvvx,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_sub_ps(Vqvvx,Vqvs);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=Vtmp5;) ENABLE_AVX_IMPLEMENTATION(Vqvs=Vtmp5;)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvy);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvvz);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Sqvvz.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_mul_ps(Vqvvz,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_mul_ps(Vqvvz,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Sqvvy.f-Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_sub_ps(Vqvvy,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_sub_ps(Vqvvy,Vqvvz);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=Vtmp5;) ENABLE_AVX_IMPLEMENTATION(Vqvvz=Vtmp5;)
#endif
#ifdef COMPUTE_V_AS_MATRIX
#ifdef PRINT_DEBUGGING_OUTPUT
#ifdef USE_SCALAR_IMPLEMENTATION
std::cout<<"Scalar V ="<<std::endl;
std::cout<<std::setw(12)<<Sv11.f<<" "<<std::setw(12)<<Sv12.f<<" "<<std::setw(12)<<Sv13.f<<std::endl;
std::cout<<std::setw(12)<<Sv21.f<<" "<<std::setw(12)<<Sv22.f<<" "<<std::setw(12)<<Sv23.f<<std::endl;
std::cout<<std::setw(12)<<Sv31.f<<" "<<std::setw(12)<<Sv32.f<<" "<<std::setw(12)<<Sv33.f<<std::endl;
#endif
#ifdef USE_SSE_IMPLEMENTATION
_mm_storeu_ps(buf,Vv11);V11=buf[0];
_mm_storeu_ps(buf,Vv21);V21=buf[0];
_mm_storeu_ps(buf,Vv31);V31=buf[0];
_mm_storeu_ps(buf,Vv12);V12=buf[0];
_mm_storeu_ps(buf,Vv22);V22=buf[0];
_mm_storeu_ps(buf,Vv32);V32=buf[0];
_mm_storeu_ps(buf,Vv13);V13=buf[0];
_mm_storeu_ps(buf,Vv23);V23=buf[0];
_mm_storeu_ps(buf,Vv33);V33=buf[0];
std::cout<<"Vector V ="<<std::endl;
std::cout<<std::setw(12)<<V11<<" "<<std::setw(12)<<V12<<" "<<std::setw(12)<<V13<<std::endl;
std::cout<<std::setw(12)<<V21<<" "<<std::setw(12)<<V22<<" "<<std::setw(12)<<V23<<std::endl;
std::cout<<std::setw(12)<<V31<<" "<<std::setw(12)<<V32<<" "<<std::setw(12)<<V33<<std::endl;
#endif
#ifdef USE_AVX_IMPLEMENTATION
_mm256_storeu_ps(buf,Vv11);V11=buf[0];
_mm256_storeu_ps(buf,Vv21);V21=buf[0];
_mm256_storeu_ps(buf,Vv31);V31=buf[0];
_mm256_storeu_ps(buf,Vv12);V12=buf[0];
_mm256_storeu_ps(buf,Vv22);V22=buf[0];
_mm256_storeu_ps(buf,Vv32);V32=buf[0];
_mm256_storeu_ps(buf,Vv13);V13=buf[0];
_mm256_storeu_ps(buf,Vv23);V23=buf[0];
_mm256_storeu_ps(buf,Vv33);V33=buf[0];
std::cout<<"Vector V ="<<std::endl;
std::cout<<std::setw(12)<<V11<<" "<<std::setw(12)<<V12<<" "<<std::setw(12)<<V13<<std::endl;
std::cout<<std::setw(12)<<V21<<" "<<std::setw(12)<<V22<<" "<<std::setw(12)<<V23<<std::endl;
std::cout<<std::setw(12)<<V31<<" "<<std::setw(12)<<V32<<" "<<std::setw(12)<<V33<<std::endl;
#endif
#endif
#endif
#ifdef PRINT_DEBUGGING_OUTPUT
#ifdef USE_SCALAR_IMPLEMENTATION
std::cout<<"Scalar A (after multiplying with V) ="<<std::endl;
std::cout<<std::setw(12)<<Sa11.f<<" "<<std::setw(12)<<Sa12.f<<" "<<std::setw(12)<<Sa13.f<<std::endl;
std::cout<<std::setw(12)<<Sa21.f<<" "<<std::setw(12)<<Sa22.f<<" "<<std::setw(12)<<Sa23.f<<std::endl;
std::cout<<std::setw(12)<<Sa31.f<<" "<<std::setw(12)<<Sa32.f<<" "<<std::setw(12)<<Sa33.f<<std::endl;
#endif
#ifdef USE_SSE_IMPLEMENTATION
_mm_storeu_ps(buf,Va11);A11=buf[0];
_mm_storeu_ps(buf,Va21);A21=buf[0];
_mm_storeu_ps(buf,Va31);A31=buf[0];
_mm_storeu_ps(buf,Va12);A12=buf[0];
_mm_storeu_ps(buf,Va22);A22=buf[0];
_mm_storeu_ps(buf,Va32);A32=buf[0];
_mm_storeu_ps(buf,Va13);A13=buf[0];
_mm_storeu_ps(buf,Va23);A23=buf[0];
_mm_storeu_ps(buf,Va33);A33=buf[0];
std::cout<<"Vector A (after multiplying with V) ="<<std::endl;
std::cout<<std::setw(12)<<A11<<" "<<std::setw(12)<<A12<<" "<<std::setw(12)<<A13<<std::endl;
std::cout<<std::setw(12)<<A21<<" "<<std::setw(12)<<A22<<" "<<std::setw(12)<<A23<<std::endl;
std::cout<<std::setw(12)<<A31<<" "<<std::setw(12)<<A32<<" "<<std::setw(12)<<A33<<std::endl;
#endif
#ifdef USE_AVX_IMPLEMENTATION
_mm256_storeu_ps(buf,Va11);A11=buf[0];
_mm256_storeu_ps(buf,Va21);A21=buf[0];
_mm256_storeu_ps(buf,Va31);A31=buf[0];
_mm256_storeu_ps(buf,Va12);A12=buf[0];
_mm256_storeu_ps(buf,Va22);A22=buf[0];
_mm256_storeu_ps(buf,Va32);A32=buf[0];
_mm256_storeu_ps(buf,Va13);A13=buf[0];
_mm256_storeu_ps(buf,Va23);A23=buf[0];
_mm256_storeu_ps(buf,Va33);A33=buf[0];
std::cout<<"Vector A (after multiplying with V) ="<<std::endl;
std::cout<<std::setw(12)<<A11<<" "<<std::setw(12)<<A12<<" "<<std::setw(12)<<A13<<std::endl;
std::cout<<std::setw(12)<<A21<<" "<<std::setw(12)<<A22<<" "<<std::setw(12)<<A23<<std::endl;
std::cout<<std::setw(12)<<A31<<" "<<std::setw(12)<<A32<<" "<<std::setw(12)<<A33<<std::endl;
#endif
#endif
//###########################################################
// Re-normalize quaternion for matrix V
//###########################################################
#ifdef COMPUTE_V_AS_QUATERNION
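// Compute the squared norm |q|^2 = qvs^2+qvvx^2+qvvy^2+qvvz^2, approximate its
// reciprocal square root, and scale all four components so V's quaternion has
// unit length again after the un-normalized swap updates above.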
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sqvs.f*Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vqvs,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vqvs,Vqvs);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvx.f*Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvx,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvx,Vqvvx);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvy.f*Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvy,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvy,Vqvvy);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvz.f*Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvz,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvz,Vqvvz);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=rsqrt(Stmp2.f);) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_rsqrt_ps(Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_rsqrt_ps(Vtmp2);)
#ifdef PERFORM_STRICT_QUATERNION_RENORMALIZATION
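// One Newton-Raphson refinement of the rsqrt estimate x for s=|q|^2:
// x <- x + x/2 - (s*x^3)/2 = (x/2)*(3 - s*x^2), roughly doubling the number of
// correct bits delivered by the hardware rsqrt approximation.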
ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp1.f*Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vtmp1,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vtmp1,Vone_half);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp1,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp1,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp1,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp1,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp2.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp2,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp2,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_add_ps(Vtmp1,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_add_ps(Vtmp1,Vtmp4);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f-Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_sub_ps(Vtmp1,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_sub_ps(Vtmp1,Vtmp3);)
#endif
ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sqvs.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_mul_ps(Vqvs,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_mul_ps(Vqvs,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Sqvvx.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_mul_ps(Vqvvx,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_mul_ps(Vqvvx,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Sqvvy.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_mul_ps(Vqvvy,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_mul_ps(Vqvvy,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Sqvvz.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_mul_ps(Vqvvz,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_mul_ps(Vqvvz,Vtmp1);)
#ifdef PRINT_DEBUGGING_OUTPUT
#ifdef USE_SCALAR_IMPLEMENTATION
std::cout<<"Scalar qV ="<<std::endl;
std::cout<<std::setw(12)<<Sqvs.f<<" "<<std::setw(12)<<Sqvvx.f<<" "<<std::setw(12)<<Sqvvy.f<<" "<<std::setw(12)<<Sqvvz.f<<std::endl;
#endif
#ifdef USE_SSE_IMPLEMENTATION
_mm_storeu_ps(buf,Vqvs);QVS=buf[0];
_mm_storeu_ps(buf,Vqvvx);QVVX=buf[0];
_mm_storeu_ps(buf,Vqvvy);QVVY=buf[0];
_mm_storeu_ps(buf,Vqvvz);QVVZ=buf[0];
std::cout<<"Vector qV ="<<std::endl;
std::cout<<std::setw(12)<<QVS<<" "<<std::setw(12)<<QVVX<<" "<<std::setw(12)<<QVVY<<" "<<std::setw(12)<<QVVZ<<std::endl;
#endif
#ifdef USE_AVX_IMPLEMENTATION
_mm256_storeu_ps(buf,Vqvs);QVS=buf[0];
_mm256_storeu_ps(buf,Vqvvx);QVVX=buf[0];
_mm256_storeu_ps(buf,Vqvvy);QVVY=buf[0];
_mm256_storeu_ps(buf,Vqvvz);QVVZ=buf[0];
std::cout<<"Vector qV ="<<std::endl;
std::cout<<std::setw(12)<<QVS<<" "<<std::setw(12)<<QVVX<<" "<<std::setw(12)<<QVVY<<" "<<std::setw(12)<<QVVZ<<std::endl;
#endif
#endif
#endif
//###########################################################
// Construct QR factorization of A*V (=U*D) using Givens rotations
//###########################################################
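// Three Givens rotations annihilate the subdiagonal entries (2,1), (3,1) and
// (3,2) of B = A*V in turn; the accumulated rotations form U, and since B = U*D
// the upper-triangular factor that remains is the diagonal D.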
#ifdef COMPUTE_U_AS_MATRIX
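// Start U as the 3x3 identity; the SSE/AVX lanes are zeroed via the x^x=0 idiom.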
ENABLE_SCALAR_IMPLEMENTATION(Su11.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vu11=Vone;) ENABLE_AVX_IMPLEMENTATION(Vu11=Vone;)
ENABLE_SCALAR_IMPLEMENTATION(Su21.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vu21=_mm_xor_ps(Vu21,Vu21);) ENABLE_AVX_IMPLEMENTATION(Vu21=_mm256_xor_ps(Vu21,Vu21);)
ENABLE_SCALAR_IMPLEMENTATION(Su31.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vu31=_mm_xor_ps(Vu31,Vu31);) ENABLE_AVX_IMPLEMENTATION(Vu31=_mm256_xor_ps(Vu31,Vu31);)
ENABLE_SCALAR_IMPLEMENTATION(Su12.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vu12=_mm_xor_ps(Vu12,Vu12);) ENABLE_AVX_IMPLEMENTATION(Vu12=_mm256_xor_ps(Vu12,Vu12);)
ENABLE_SCALAR_IMPLEMENTATION(Su22.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vu22=Vone;) ENABLE_AVX_IMPLEMENTATION(Vu22=Vone;)
ENABLE_SCALAR_IMPLEMENTATION(Su32.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vu32=_mm_xor_ps(Vu32,Vu32);) ENABLE_AVX_IMPLEMENTATION(Vu32=_mm256_xor_ps(Vu32,Vu32);)
ENABLE_SCALAR_IMPLEMENTATION(Su13.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vu13=_mm_xor_ps(Vu13,Vu13);) ENABLE_AVX_IMPLEMENTATION(Vu13=_mm256_xor_ps(Vu13,Vu13);)
ENABLE_SCALAR_IMPLEMENTATION(Su23.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vu23=_mm_xor_ps(Vu23,Vu23);) ENABLE_AVX_IMPLEMENTATION(Vu23=_mm256_xor_ps(Vu23,Vu23);)
ENABLE_SCALAR_IMPLEMENTATION(Su33.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vu33=Vone;) ENABLE_AVX_IMPLEMENTATION(Vu33=Vone;)
#endif
#ifdef COMPUTE_U_AS_QUATERNION
ENABLE_SCALAR_IMPLEMENTATION(Squs.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vqus=Vone;) ENABLE_AVX_IMPLEMENTATION(Vqus=Vone;)
ENABLE_SCALAR_IMPLEMENTATION(Squvx.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vquvx=_mm_xor_ps(Vquvx,Vquvx);) ENABLE_AVX_IMPLEMENTATION(Vquvx=_mm256_xor_ps(Vquvx,Vquvx);)
ENABLE_SCALAR_IMPLEMENTATION(Squvy.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vquvy=_mm_xor_ps(Vquvy,Vquvy);) ENABLE_AVX_IMPLEMENTATION(Vquvy=_mm256_xor_ps(Vquvy,Vquvy);)
ENABLE_SCALAR_IMPLEMENTATION(Squvz.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vquvz=_mm_xor_ps(Vquvz,Vquvz);) ENABLE_AVX_IMPLEMENTATION(Vquvz=_mm256_xor_ps(Vquvz,Vquvz);)
#endif
// First Givens rotation
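// The Givens kernel included below is written against generic names: SAPIVOT/
// SANPIVOT (VAPIVOT/VANPIVOT) select the entry pair that defines the rotation
// angle, SAxx/VAxx the two affected rows of A, and SUxx/VUxx the two affected
// columns of U. Each inclusion rebinds them to concrete registers with a
// #define block and cleans up with matching #undefs. This first instance
// rotates rows 1 and 2 to zero out a21.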
#define SAPIVOT Sa11
#define SANPIVOT Sa21
#define SA11 Sa11
#define SA21 Sa21
#define SA12 Sa12
#define SA22 Sa22
#define SA13 Sa13
#define SA23 Sa23
#define SU11 Su11
#define SU12 Su12
#define SU21 Su21
#define SU22 Su22
#define SU31 Su31
#define SU32 Su32
#define VAPIVOT Va11
#define VANPIVOT Va21
#define VA11 Va11
#define VA21 Va21
#define VA12 Va12
#define VA22 Va22
#define VA13 Va13
#define VA23 Va23
#define VU11 Vu11
#define VU12 Vu12
#define VU21 Vu21
#define VU22 Vu22
#define VU31 Vu31
#define VU32 Vu32
#include "Singular_Value_Decomposition_Givens_QR_Factorization_Kernel.hpp"
#undef SAPIVOT
#undef SANPIVOT
#undef SA11
#undef SA21
#undef SA12
#undef SA22
#undef SA13
#undef SA23
#undef SU11
#undef SU12
#undef SU21
#undef SU22
#undef SU31
#undef SU32
#undef VAPIVOT
#undef VANPIVOT
#undef VA11
#undef VA21
#undef VA12
#undef VA22
#undef VA13
#undef VA23
#undef VU11
#undef VU12
#undef VU21
#undef VU22
#undef VU31
#undef VU32
// Update quaternion representation of U
#ifdef COMPUTE_U_AS_QUATERNION
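// The first rotation acts in the (1,2) plane, i.e. about the z axis, so the
// quaternion for U is simply (ch,0,0,sh) in the half-angle form produced by the
// Givens kernel.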
ENABLE_SCALAR_IMPLEMENTATION(Squs.f=Sch.f;) ENABLE_SSE_IMPLEMENTATION(Vqus=Vch;) ENABLE_AVX_IMPLEMENTATION(Vqus=Vch;)
ENABLE_SCALAR_IMPLEMENTATION(Squvx.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vquvx=_mm_xor_ps(Vquvx,Vquvx);) ENABLE_AVX_IMPLEMENTATION(Vquvx=_mm256_xor_ps(Vquvx,Vquvx);)
ENABLE_SCALAR_IMPLEMENTATION(Squvy.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vquvy=_mm_xor_ps(Vquvy,Vquvy);) ENABLE_AVX_IMPLEMENTATION(Vquvy=_mm256_xor_ps(Vquvy,Vquvy);)
ENABLE_SCALAR_IMPLEMENTATION(Squvz.f=Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vquvz=Vsh;) ENABLE_AVX_IMPLEMENTATION(Vquvz=Vsh;)
#endif
// Second Givens rotation
#define SAPIVOT Sa11
#define SANPIVOT Sa31
#define SA11 Sa11
#define SA21 Sa31
#define SA12 Sa12
#define SA22 Sa32
#define SA13 Sa13
#define SA23 Sa33
#define SU11 Su11
#define SU12 Su13
#define SU21 Su21
#define SU22 Su23
#define SU31 Su31
#define SU32 Su33
#define VAPIVOT Va11
#define VANPIVOT Va31
#define VA11 Va11
#define VA21 Va31
#define VA12 Va12
#define VA22 Va32
#define VA13 Va13
#define VA23 Va33
#define VU11 Vu11
#define VU12 Vu13
#define VU21 Vu21
#define VU22 Vu23
#define VU31 Vu31
#define VU32 Vu33
#include "Singular_Value_Decomposition_Givens_QR_Factorization_Kernel.hpp"
#undef SAPIVOT
#undef SANPIVOT
#undef SA11
#undef SA21
#undef SA12
#undef SA22
#undef SA13
#undef SA23
#undef SU11
#undef SU12
#undef SU21
#undef SU22
#undef SU31
#undef SU32
#undef VAPIVOT
#undef VANPIVOT
#undef VA11
#undef VA21
#undef VA12
#undef VA22
#undef VA13
#undef VA23
#undef VU11
#undef VU12
#undef VU21
#undef VU22
#undef VU31
#undef VU32
// Update quaternion representation of U
#ifdef COMPUTE_U_AS_QUATERNION
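// Fold the second rotation (acting in the (1,3) plane, i.e. about y) into the
// quaternion by an explicit product; since q is still (qus,0,0,quvz) from the
// first rotation, only these four component terms survive.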
ENABLE_SCALAR_IMPLEMENTATION(Squvx.f=Ssh.f*Squvz.f;) ENABLE_SSE_IMPLEMENTATION(Vquvx=_mm_mul_ps(Vsh,Vquvz);) ENABLE_AVX_IMPLEMENTATION(Vquvx=_mm256_mul_ps(Vsh,Vquvz);)
ENABLE_SCALAR_IMPLEMENTATION(Ssh.f=Ssh.f*Squs.f;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_mul_ps(Vsh,Vqus);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_mul_ps(Vsh,Vqus);)
ENABLE_SCALAR_IMPLEMENTATION(Squvy.f=Squvy.f-Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vquvy=_mm_sub_ps(Vquvy,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vquvy=_mm256_sub_ps(Vquvy,Vsh);)
ENABLE_SCALAR_IMPLEMENTATION(Squs.f=Sch.f*Squs.f;) ENABLE_SSE_IMPLEMENTATION(Vqus=_mm_mul_ps(Vch,Vqus);) ENABLE_AVX_IMPLEMENTATION(Vqus=_mm256_mul_ps(Vch,Vqus);)
ENABLE_SCALAR_IMPLEMENTATION(Squvz.f=Sch.f*Squvz.f;) ENABLE_SSE_IMPLEMENTATION(Vquvz=_mm_mul_ps(Vch,Vquvz);) ENABLE_AVX_IMPLEMENTATION(Vquvz=_mm256_mul_ps(Vch,Vquvz);)
#endif
// Third Givens rotation
#define SAPIVOT Sa22
#define SANPIVOT Sa32
#define SA11 Sa21
#define SA21 Sa31
#define SA12 Sa22
#define SA22 Sa32
#define SA13 Sa23
#define SA23 Sa33
#define SU11 Su12
#define SU12 Su13
#define SU21 Su22
#define SU22 Su23
#define SU31 Su32
#define SU32 Su33
#define VAPIVOT Va22
#define VANPIVOT Va32
#define VA11 Va21
#define VA21 Va31
#define VA12 Va22
#define VA22 Va32
#define VA13 Va23
#define VA23 Va33
#define VU11 Vu12
#define VU12 Vu13
#define VU21 Vu22
#define VU22 Vu23
#define VU31 Vu32
#define VU32 Vu33
#include "Singular_Value_Decomposition_Givens_QR_Factorization_Kernel.hpp"
#undef SAPIVOT
#undef SANPIVOT
#undef SA11
#undef SA21
#undef SA12
#undef SA22
#undef SA13
#undef SA23
#undef SU11
#undef SU12
#undef SU21
#undef SU22
#undef SU31
#undef SU32
#undef VAPIVOT
#undef VANPIVOT
#undef VA11
#undef VA21
#undef VA12
#undef VA22
#undef VA13
#undef VA23
#undef VU11
#undef VU12
#undef VU21
#undef VU22
#undef VU31
#undef VU32
// Update quaternion representation of U
#ifdef COMPUTE_U_AS_QUATERNION
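// Fold the third rotation (acting in the (2,3) plane, i.e. about x) into the
// quaternion; q now has all four components populated, so this is the full
// quaternion product written out term by term.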
ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Ssh.f*Squvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vsh,Vquvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vsh,Vquvx);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ssh.f*Squvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vsh,Vquvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vsh,Vquvy);)
ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Ssh.f*Squvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vsh,Vquvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vsh,Vquvz);)
ENABLE_SCALAR_IMPLEMENTATION(Ssh.f=Ssh.f*Squs.f;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_mul_ps(Vsh,Vqus);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_mul_ps(Vsh,Vqus);)
ENABLE_SCALAR_IMPLEMENTATION(Squs.f=Sch.f*Squs.f;) ENABLE_SSE_IMPLEMENTATION(Vqus=_mm_mul_ps(Vch,Vqus);) ENABLE_AVX_IMPLEMENTATION(Vqus=_mm256_mul_ps(Vch,Vqus);)
ENABLE_SCALAR_IMPLEMENTATION(Squvx.f=Sch.f*Squvx.f;) ENABLE_SSE_IMPLEMENTATION(Vquvx=_mm_mul_ps(Vch,Vquvx);) ENABLE_AVX_IMPLEMENTATION(Vquvx=_mm256_mul_ps(Vch,Vquvx);)
ENABLE_SCALAR_IMPLEMENTATION(Squvy.f=Sch.f*Squvy.f;) ENABLE_SSE_IMPLEMENTATION(Vquvy=_mm_mul_ps(Vch,Vquvy);) ENABLE_AVX_IMPLEMENTATION(Vquvy=_mm256_mul_ps(Vch,Vquvy);)
ENABLE_SCALAR_IMPLEMENTATION(Squvz.f=Sch.f*Squvz.f;) ENABLE_SSE_IMPLEMENTATION(Vquvz=_mm_mul_ps(Vch,Vquvz);) ENABLE_AVX_IMPLEMENTATION(Vquvz=_mm256_mul_ps(Vch,Vquvz);)
ENABLE_SCALAR_IMPLEMENTATION(Squvx.f=Squvx.f+Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vquvx=_mm_add_ps(Vquvx,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vquvx=_mm256_add_ps(Vquvx,Vsh);)
ENABLE_SCALAR_IMPLEMENTATION(Squs.f=Squs.f-Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqus=_mm_sub_ps(Vqus,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqus=_mm256_sub_ps(Vqus,Vtmp1);)
ENABLE_SCALAR_IMPLEMENTATION(Squvy.f=Squvy.f+Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vquvy=_mm_add_ps(Vquvy,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vquvy=_mm256_add_ps(Vquvy,Vtmp3);)
ENABLE_SCALAR_IMPLEMENTATION(Squvz.f=Squvz.f-Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vquvz=_mm_sub_ps(Vquvz,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vquvz=_mm256_sub_ps(Vquvz,Vtmp2);)
#endif
#ifdef COMPUTE_U_AS_MATRIX
#ifdef PRINT_DEBUGGING_OUTPUT
#ifdef USE_SCALAR_IMPLEMENTATION
std::cout<<"Scalar U ="<<std::endl;
std::cout<<std::setw(12)<<Su11.f<<" "<<std::setw(12)<<Su12.f<<" "<<std::setw(12)<<Su13.f<<std::endl;
std::cout<<std::setw(12)<<Su21.f<<" "<<std::setw(12)<<Su22.f<<" "<<std::setw(12)<<Su23.f<<std::endl;
std::cout<<std::setw(12)<<Su31.f<<" "<<std::setw(12)<<Su32.f<<" "<<std::setw(12)<<Su33.f<<std::endl;
#endif
#ifdef USE_SSE_IMPLEMENTATION
_mm_storeu_ps(buf,Vu11);U11=buf[0];
_mm_storeu_ps(buf,Vu21);U21=buf[0];
_mm_storeu_ps(buf,Vu31);U31=buf[0];
_mm_storeu_ps(buf,Vu12);U12=buf[0];
_mm_storeu_ps(buf,Vu22);U22=buf[0];
_mm_storeu_ps(buf,Vu32);U32=buf[0];
_mm_storeu_ps(buf,Vu13);U13=buf[0];
_mm_storeu_ps(buf,Vu23);U23=buf[0];
_mm_storeu_ps(buf,Vu33);U33=buf[0];
std::cout<<"Vector U ="<<std::endl;
std::cout<<std::setw(12)<<U11<<" "<<std::setw(12)<<U12<<" "<<std::setw(12)<<U13<<std::endl;
std::cout<<std::setw(12)<<U21<<" "<<std::setw(12)<<U22<<" "<<std::setw(12)<<U23<<std::endl;
std::cout<<std::setw(12)<<U31<<" "<<std::setw(12)<<U32<<" "<<std::setw(12)<<U33<<std::endl;
#endif
#ifdef USE_AVX_IMPLEMENTATION
_mm256_storeu_ps(buf,Vu11);U11=buf[0];
_mm256_storeu_ps(buf,Vu21);U21=buf[0];
_mm256_storeu_ps(buf,Vu31);U31=buf[0];
_mm256_storeu_ps(buf,Vu12);U12=buf[0];
_mm256_storeu_ps(buf,Vu22);U22=buf[0];
_mm256_storeu_ps(buf,Vu32);U32=buf[0];
_mm256_storeu_ps(buf,Vu13);U13=buf[0];
_mm256_storeu_ps(buf,Vu23);U23=buf[0];
_mm256_storeu_ps(buf,Vu33);U33=buf[0];
std::cout<<"Vector U ="<<std::endl;
std::cout<<std::setw(12)<<U11<<" "<<std::setw(12)<<U12<<" "<<std::setw(12)<<U13<<std::endl;
std::cout<<std::setw(12)<<U21<<" "<<std::setw(12)<<U22<<" "<<std::setw(12)<<U23<<std::endl;
std::cout<<std::setw(12)<<U31<<" "<<std::setw(12)<<U32<<" "<<std::setw(12)<<U33<<std::endl;
#endif
#endif
#endif
#ifdef PRINT_DEBUGGING_OUTPUT
#ifdef USE_SCALAR_IMPLEMENTATION
std::cout<<"Scalar A (after multiplying with U-transpose and V) ="<<std::endl;
std::cout<<std::setw(12)<<Sa11.f<<" "<<std::setw(12)<<Sa12.f<<" "<<std::setw(12)<<Sa13.f<<std::endl;
std::cout<<std::setw(12)<<Sa21.f<<" "<<std::setw(12)<<Sa22.f<<" "<<std::setw(12)<<Sa23.f<<std::endl;
std::cout<<std::setw(12)<<Sa31.f<<" "<<std::setw(12)<<Sa32.f<<" "<<std::setw(12)<<Sa33.f<<std::endl;
#endif
#ifdef USE_SSE_IMPLEMENTATION
_mm_storeu_ps(buf,Va11);A11=buf[0];
_mm_storeu_ps(buf,Va21);A21=buf[0];
_mm_storeu_ps(buf,Va31);A31=buf[0];
_mm_storeu_ps(buf,Va12);A12=buf[0];
_mm_storeu_ps(buf,Va22);A22=buf[0];
_mm_storeu_ps(buf,Va32);A32=buf[0];
_mm_storeu_ps(buf,Va13);A13=buf[0];
_mm_storeu_ps(buf,Va23);A23=buf[0];
_mm_storeu_ps(buf,Va33);A33=buf[0];
std::cout<<"Vector A (after multiplying with U-transpose and V) ="<<std::endl;
std::cout<<std::setw(12)<<A11<<" "<<std::setw(12)<<A12<<" "<<std::setw(12)<<A13<<std::endl;
std::cout<<std::setw(12)<<A21<<" "<<std::setw(12)<<A22<<" "<<std::setw(12)<<A23<<std::endl;
std::cout<<std::setw(12)<<A31<<" "<<std::setw(12)<<A32<<" "<<std::setw(12)<<A33<<std::endl;
#endif
#ifdef USE_AVX_IMPLEMENTATION
_mm256_storeu_ps(buf,Va11);A11=buf[0];
_mm256_storeu_ps(buf,Va21);A21=buf[0];
_mm256_storeu_ps(buf,Va31);A31=buf[0];
_mm256_storeu_ps(buf,Va12);A12=buf[0];
_mm256_storeu_ps(buf,Va22);A22=buf[0];
_mm256_storeu_ps(buf,Va32);A32=buf[0];
_mm256_storeu_ps(buf,Va13);A13=buf[0];
_mm256_storeu_ps(buf,Va23);A23=buf[0];
_mm256_storeu_ps(buf,Va33);A33=buf[0];
std::cout<<"Vector A (after multiplying with U-transpose and V) ="<<std::endl;
std::cout<<std::setw(12)<<A11<<" "<<std::setw(12)<<A12<<" "<<std::setw(12)<<A13<<std::endl;
std::cout<<std::setw(12)<<A21<<" "<<std::setw(12)<<A22<<" "<<std::setw(12)<<A23<<std::endl;
std::cout<<std::setw(12)<<A31<<" "<<std::setw(12)<<A32<<" "<<std::setw(12)<<A33<<std::endl;
#endif
#endif
#ifdef COMPUTE_U_AS_QUATERNION
#ifdef PRINT_DEBUGGING_OUTPUT
#ifdef USE_SCALAR_IMPLEMENTATION
std::cout<<"Scalar qU ="<<std::endl;
std::cout<<std::setw(12)<<Squs.f<<" "<<std::setw(12)<<Squvx.f<<" "<<std::setw(12)<<Squvy.f<<" "<<std::setw(12)<<Squvz.f<<std::endl;
#endif
#ifdef USE_SSE_IMPLEMENTATION
_mm_storeu_ps(buf,Vqus);QUS=buf[0];
_mm_storeu_ps(buf,Vquvx);QUVX=buf[0];
_mm_storeu_ps(buf,Vquvy);QUVY=buf[0];
_mm_storeu_ps(buf,Vquvz);QUVZ=buf[0];
std::cout<<"Vector qU ="<<std::endl;
std::cout<<std::setw(12)<<QUS<<" "<<std::setw(12)<<QUVX<<" "<<std::setw(12)<<QUVY<<" "<<std::setw(12)<<QUVZ<<std::endl;
#endif
#ifdef USE_AVX_IMPLEMENTATION
_mm256_storeu_ps(buf,Vqus);QUS=buf[0];
_mm256_storeu_ps(buf,Vquvx);QUVX=buf[0];
_mm256_storeu_ps(buf,Vquvy);QUVY=buf[0];
_mm256_storeu_ps(buf,Vquvz);QUVZ=buf[0];
std::cout<<"Vector qU ="<<std::endl;
std::cout<<std::setw(12)<<QUS<<" "<<std::setw(12)<<QUVX<<" "<<std::setw(12)<<QUVY<<" "<<std::setw(12)<<QUVZ<<std::endl;
#endif
#endif
#endif
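// Restore Intel compiler warning 592 ("variable is used before its value is
// set") to its default state; it is presumably disabled at the top of this
// kernel body to silence reports from the macro-generated code.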
#ifdef __INTEL_COMPILER
#pragma warning( default : 592 )
#endif