svd3x3_avx.cpp 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
  1. // This file is part of libigl, a simple c++ geometry processing library.
  2. //
  3. // Copyright (C) 2013 Alec Jacobson <alecjacobson@gmail.com>
  4. //
  5. // This Source Code Form is subject to the terms of the Mozilla Public License
  6. // v. 2.0. If a copy of the MPL was not distributed with this file, You can
  7. // obtain one at http://mozilla.org/MPL/2.0/.
  8. #ifdef __AVX__
  9. #include "svd3x3_avx.h"
  10. #include <cmath>
  11. #include <algorithm>
  12. #undef USE_SCALAR_IMPLEMENTATION
  13. #undef USE_SSE_IMPLEMENTATION
  14. #define USE_AVX_IMPLEMENTATION
  15. #define COMPUTE_U_AS_MATRIX
  16. #define COMPUTE_V_AS_MATRIX
  17. #include "Singular_Value_Decomposition_Preamble.hpp"
  18. #pragma runtime_checks( "u", off ) // disable runtime asserts on xor eax,eax type of stuff (doesn't always work, disable explicitly in compiler settings)
  19. template<typename T>
  20. IGL_INLINE void igl::svd3x3_avx(
  21. const Eigen::Matrix<T, 3*8, 3>& A,
  22. Eigen::Matrix<T, 3*8, 3> &U,
  23. Eigen::Matrix<T, 3*8, 1> &S,
  24. Eigen::Matrix<T, 3*8, 3>&V)
  25. {
  26. // this code assumes USE_AVX_IMPLEMENTATION is defined
  27. float Ashuffle[9][8], Ushuffle[9][8], Vshuffle[9][8], Sshuffle[3][8];
  28. for (int i=0; i<3; i++)
  29. {
  30. for (int j=0; j<3; j++)
  31. {
  32. for (int k=0; k<8; k++)
  33. {
  34. Ashuffle[i + j*3][k] = A(i + 3*k, j);
  35. }
  36. }
  37. }
  38. #include "Singular_Value_Decomposition_Kernel_Declarations.hpp"
  39. ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_loadu_ps(Ashuffle[0]);)
  40. ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_loadu_ps(Ashuffle[1]);)
  41. ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_loadu_ps(Ashuffle[2]);)
  42. ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_loadu_ps(Ashuffle[3]);)
  43. ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_loadu_ps(Ashuffle[4]);)
  44. ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_loadu_ps(Ashuffle[5]);)
  45. ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_loadu_ps(Ashuffle[6]);)
  46. ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_loadu_ps(Ashuffle[7]);)
  47. ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_loadu_ps(Ashuffle[8]);)
  48. #include "Singular_Value_Decomposition_Main_Kernel_Body.hpp"
  49. ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Ushuffle[0],Vu11);)
  50. ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Ushuffle[1],Vu21);)
  51. ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Ushuffle[2],Vu31);)
  52. ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Ushuffle[3],Vu12);)
  53. ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Ushuffle[4],Vu22);)
  54. ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Ushuffle[5],Vu32);)
  55. ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Ushuffle[6],Vu13);)
  56. ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Ushuffle[7],Vu23);)
  57. ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Ushuffle[8],Vu33);)
  58. ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Vshuffle[0],Vv11);)
  59. ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Vshuffle[1],Vv21);)
  60. ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Vshuffle[2],Vv31);)
  61. ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Vshuffle[3],Vv12);)
  62. ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Vshuffle[4],Vv22);)
  63. ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Vshuffle[5],Vv32);)
  64. ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Vshuffle[6],Vv13);)
  65. ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Vshuffle[7],Vv23);)
  66. ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Vshuffle[8],Vv33);)
  67. ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Sshuffle[0],Va11);)
  68. ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Sshuffle[1],Va22);)
  69. ENABLE_AVX_IMPLEMENTATION(_mm256_storeu_ps(Sshuffle[2],Va33);)
  70. for (int i=0; i<3; i++)
  71. {
  72. for (int j=0; j<3; j++)
  73. {
  74. for (int k=0; k<8; k++)
  75. {
  76. U(i + 3*k, j) = Ushuffle[i + j*3][k];
  77. V(i + 3*k, j) = Vshuffle[i + j*3][k];
  78. }
  79. }
  80. }
  81. for (int i=0; i<3; i++)
  82. {
  83. for (int k=0; k<8; k++)
  84. {
  85. S(i + 3*k, 0) = Sshuffle[i][k];
  86. }
  87. }
  88. }
  89. #pragma runtime_checks( "u", restore )
  #ifdef IGL_STATIC_LIBRARY
  // Explicit instantiation for static-library builds.
  // Only `float` is instantiated: svd3x3_avx stages all of its data in float
  // buffers, so a double instantiation would not gain any precision.
  // The Eigen template arguments are written out in full (24 rows = 3*8).
  template void igl::svd3x3_avx<float>(
    const Eigen::Matrix<float, 24, 3, 0, 24, 3>&,
    Eigen::Matrix<float, 24, 3, 0, 24, 3>&,
    Eigen::Matrix<float, 24, 1, 0, 24, 1>&,
    Eigen::Matrix<float, 24, 3, 0, 24, 3>&);
  #endif
  96. #endif