@@ -0,0 +1,180 @@
+// This file is part of libigl, a simple c++ geometry processing library.
+//
+// Copyright (C) 2016 Alec Jacobson <alecjacobson@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla Public License
+// v. 2.0. If a copy of the MPL was not distributed with this file, You can
+// obtain one at http://mozilla.org/MPL/2.0/.
+#ifndef IGL_PARALLEL_FOR_H
+#define IGL_PARALLEL_FOR_H
+#include "igl_inline.h"
+#include <functional>
+
+namespace igl
+{
+  // PARALLEL_FOR Functional implementation of a basic, open-mp style, parallel
+  // for loop. If the inner block of a for-loop can be rewritten/encapsulated in
+  // a single (anonymous/lambda) function call `func` so that the serial code
+  // looks like:
+  //
+  //     for(int i = 0;i<loop_size;i++)
+  //     {
+  //       func(i);
+  //     }
+  //
+  // then `parallel_for(loop_size,func,min_parallel)` will use as many threads
+  // as are available on the current hardware to parallelize this for loop, so
+  // long as loop_size>=min_parallel; otherwise it will just use a serial for
+  // loop.
+  //
+  // Inputs:
+  //   loop_size  number of iterations. I.e. for(int i = 0;i<loop_size;i++) ...
+  //   func  function handle taking iteration index as only argument to compute
+  //     inner block of for loop I.e. for(int i ...){ func(i); }
+  //   min_parallel  min size of loop_size such that parallel (non-serial)
+  //     thread pooling should be attempted {0}
+  // Returns true iff thread pool was invoked
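+  //
+  // Example (a minimal usage sketch; `data` and the threshold of 1000 are
+  // hypothetical, not part of this header):
+  //
+  //     std::vector<double> data(10000,1.0);
+  //     igl::parallel_for(
+  //       (int)data.size(),
+  //       [&data](const int i){ data[i] *= 2.0; },
+  //       1000);
+  //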
+  template<typename Index, typename FunctionType >
+  inline bool parallel_for(
+    const Index loop_size,
+    const FunctionType & func,
+    const Index min_parallel=0);
+  // PARALLEL_FOR Functional implementation of an open-mp style, parallel for
+  // loop with accumulation. For example, serial code separated into n chunks
+  // (each to be parallelized with a thread) might look like:
+  //
+  //     Eigen::VectorXd S;
+  //     double sum = 0;
+  //     const auto & prep_func = [&S](int n){ S = Eigen::VectorXd::Zero(n); };
+  //     const auto & func = [&X,&S](int i, int t){ S(t) += X(i); };
+  //     const auto & accum_func = [&S,&sum](int t){ sum += S(t); };
+  //     prep_func(n);
+  //     for(int i = 0;i<loop_size;i++)
+  //     {
+  //       func(i,i%n);
+  //     }
+  //     for(int t = 0;t<n;t++)
+  //     {
+  //       accum_func(t);
+  //     }
+  //
+  // Inputs:
+  //   loop_size  number of iterations. I.e. for(int i = 0;i<loop_size;i++) ...
+  //   prep_func  function handle taking n >= number of threads as only
+  //     argument
+  //   func  function handle taking iteration index i and thread id t as only
+  //     arguments to compute inner block of for loop I.e.
+  //     for(int i ...){ func(i,t); }
+  //   accum_func  function handle taking thread index as only argument, to be
+  //     called after all calls of func, e.g., for serial accumulation across
+  //     all n (potential) threads, see n in description of prep_func.
+  //   min_parallel  min size of loop_size such that parallel (non-serial)
+  //     thread pooling should be attempted {0}
+  // Returns true iff thread pool was invoked
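+  //
+  // Example (a minimal usage sketch; it reuses the illustrative `X`,
+  // `prep_func`, `func`, and `accum_func` from above, and the threshold of
+  // 1000 is arbitrary):
+  //
+  //     igl::parallel_for(
+  //       (int)X.size(),
+  //       prep_func,
+  //       func,
+  //       accum_func,
+  //       1000);
+  //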
+  template<
+    typename Index,
+    typename PrepFunctionType,
+    typename FunctionType,
+    typename AccumFunctionType
+    >
+  inline bool parallel_for(
+    const Index loop_size,
+    const PrepFunctionType & prep_func,
+    const FunctionType & func,
+    const AccumFunctionType & accum_func,
+    const Index min_parallel=0);
+}
+
+// Implementation
+
+#include <cmath>
+#include <cassert>
+#include <thread>
+#include <vector>
+#include <algorithm>
+
+template<typename Index, typename FunctionType >
+inline bool igl::parallel_for(
+  const Index loop_size,
+  const FunctionType & func,
+  const Index min_parallel)
+{
+  using namespace std;
+  // no op preparation/accumulation
+  const auto & no_op = [](const size_t /*n/t*/){};
+  // two-parameter wrapper ignoring thread id
+  const auto & wrapper = [&func](Index i,size_t /*t*/){ func(i); };
+  return parallel_for(loop_size,no_op,wrapper,no_op,min_parallel);
+}
+
+template<
+  typename Index,
+  typename PrepFunctionType,
+  typename FunctionType,
+  typename AccumFunctionType>
+inline bool igl::parallel_for(
+  const Index loop_size,
+  const PrepFunctionType & prep_func,
+  const FunctionType & func,
+  const AccumFunctionType & accum_func,
+  const Index min_parallel)
+{
+  assert(loop_size>=0);
+  if(loop_size==0) return false;
+  // Estimate number of threads in the pool
+  // http://ideone.com/Z7zldb
+  const static size_t sthc = std::thread::hardware_concurrency();
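+  // 0 threads means "run serially"; if hardware_concurrency() is unknown
+  // (reports 0), fall back to 8 threads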
+  const size_t nthreads = loop_size<min_parallel?0:(sthc==0?8:sthc);
+  if(nthreads==0)
+  {
+    // serial
+    prep_func(1);
+    for(Index i = 0;i<loop_size;i++) func(i,0);
+    accum_func(0);
+    return false;
+  }else
+  {
+    // Size of a slice for the range functions
+    Index slice =
+      std::max(
+        (Index)std::round((loop_size+1)/static_cast<double>(nthreads)),(Index)1);
+
+    // [Helper] Inner loop
+    const auto & range = [&func](const Index k1, const Index k2, const size_t t)
+    {
+      for(Index k = k1; k < k2; k++) func(k,t);
+    };
+    prep_func(nthreads);
+    // Create pool and launch jobs
+    std::vector<std::thread> pool;
+    pool.reserve(nthreads);
+    // Inner range extents
+    Index i1 = 0;
+    Index i2 = std::min(slice, loop_size);
+    {
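+      // Launch up to nthreads-1 workers, each on a contiguous slice of indices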
+      size_t t = 0;
+      for (; t+1 < nthreads && i1 < loop_size; ++t)
+      {
+        pool.emplace_back(range, i1, i2, t);
+        i1 = i2;
+        i2 = std::min(i2 + slice, loop_size);
+      }
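+      // The last thread takes whatever remains (possibly more than one slice)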
+      if (i1 < loop_size)
+      {
+        pool.emplace_back(range, i1, loop_size, t);
+      }
+    }
+    // Wait for jobs to finish
+    for (std::thread &t : pool) if (t.joinable()) t.join();
+    // Accumulate across threads
+    for(size_t t = 0;t<nthreads;t++)
+    {
+      accum_func(t);
+    }
+    return true;
+  }
+}
+
+//#ifndef IGL_STATIC_LIBRARY
+//#include "parallel_for.cpp"
+//#endif
+#endif