/************************************************************************/
/*                                                                      */
/*    vspline - a set of generic tools for creation and evaluation      */
/*              of uniform b-splines                                    */
/*                                                                      */
/*            Copyright 2015 - 2018 by Kay F. Jahnke                    */
/*                                                                      */
/*    The git repository for this software is at                        */
/*                                                                      */
/*    https://bitbucket.org/kfj/vspline                                 */
/*                                                                      */
/*    Please direct questions, bug reports, and contributions to        */
/*                                                                      */
/*    kfjahnke+vspline@gmail.com                                        */
/*                                                                      */
/*    Permission is hereby granted, free of charge, to any person       */
/*    obtaining a copy of this software and associated documentation    */
/*    files (the "Software"), to deal in the Software without           */
/*    restriction, including without limitation the rights to use,      */
/*    copy, modify, merge, publish, distribute, sublicense, and/or      */
/*    sell copies of the Software, and to permit persons to whom the    */
/*    Software is furnished to do so, subject to the following          */
/*    conditions:                                                       */
/*                                                                      */
/*    The above copyright notice and this permission notice shall be    */
/*    included in all copies or substantial portions of the             */
/*    Software.                                                         */
/*                                                                      */
/*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND    */
/*    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES   */
/*    OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND          */
/*    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT       */
/*    HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,      */
/*    WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING      */
/*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR     */
/*    OTHER DEALINGS IN THE SOFTWARE.                                   */
/*                                                                      */
/************************************************************************/

/// \file multithread.h
///
/// \brief code to distribute the processing of bulk data to several threads
/// 
/// The code in this header provides a reasonably general method to perform
/// processing of manifolds of data with several threads in parallel. In vspline,
/// there are several areas where potentially large numbers of individual values
/// have to be processed independently of each other or in a dependence which
/// can be preserved in partitioning. To process such 'bulk' data effectively,
/// vspline employs two strategies: multithreading and vectorization.
/// This file handles the multithreading.
///
/// To produce generic code for the purpose, we first introduce a model of what
/// we intend to do. This model looks at the data as occupying a 'range' having
/// a defined starting point and end point. We keep with the convention of defining
/// ranges so that the start point is inside and the end point outside the data
/// set described by the range, just like iterators obtained by begin() and end().
/// This range is made explicit, even if it is implicit in the data which we want to
/// submit to multithreading, and there is a type for the purpose: struct range_type.
/// range_type merely captures the concept of a range, taking 'limit_type' as its
/// template parameter, so that any type of range can be accommodated. A range is
/// defined by its lower and upper limit.
///
/// Next we define an object holding a set of ranges, modeling a partitioning of
/// an original/whole range into subranges, which, within the context of this code,
/// are disparate and in sequence. This object is modeled as struct partition_type,
/// taking a range_type as its template argument.
///
/// With these types, we model concrete ranges and partitionings. The most important
/// one is dealing with multidimensional shapes, where a range extends from a 'lower'
/// coordinate to just below a 'higher' coordinate. These two coordinates can be
/// used directly to call vigra's 'subarray' function.
///
/// Next we provide code to partition ranges into partitionings (sets of subranges).
///
/// Finally we can express a generalized multithreading routine. This routine takes
/// a functor capable of processing a range specification and a parameter pack of
/// arbitrary further parameters, some of which will usually be referring to manifolds
/// of data for which the given range makes sense. We call this routine with a
/// partitioning of the original range and the same parameter pack that is to be passed
/// on to the functor. The multithreading routine proceeds to set up 'tasks' as needed,
/// providing each with the functor as its functional, a subrange from
/// the partitioning, and the parameter pack as arguments. The routine to be used
/// to partition the 'whole' range is passed in. So, to reiterate: we don't partition
/// the data, but only the information about the extent of the data. The multithreading
/// routine itself merely operates on this extent information, the 'meaning' of the
/// extent information is only known by the functor which is invoked in the worker
/// threads and it is only put to use in the worker thread when the functor gets
/// invoked, receiving its 'range' as the first argument. With this strategy we can
/// handle the process uniformly for a wide range of situations.
///
/// The tasks, once prepared, are handed over to a 'joint_task' object which handles
/// the interaction with the thread pool (in thread_pool.h). While my initial code
/// used one thread per task, this turned out inefficient, because it was not granular
/// enough: the slowest thread became the limiting factor. Now the job at hand is split
/// into more individual tasks (something like 8 times the number of cores), resulting
/// in a fair compromise concerning granularity. multithread() waits for all tasks to
/// terminate and returns when it's certain that the job is complete. While it's feasible
/// to code so that the multithreading routine does not block, I have chosen not to
/// implement such behaviour for now. For many tasks, this is necessary: Consider
/// nD filtering: Filtering along axis 1 can only start once filtering along axis 0
/// is complete. For transforms, launching a transform would be safe while another
/// transform is running, provided the invocations don't get into each other's way.
/// But waiting for the first transform to terminate is certainly safe. So unless I
/// see a good reason for doing it differently, I'll stick with the blocking multithreading
/// routine. Of course it's possible to launch several threads which call multithread
/// synchronously, and these calls won't block each other. But then the user has to
/// make sure the threads don't step on each other's toes.
///
/// the use of multithreading can be suppressed by defining VSPLINE_SINGLETHREAD. While
/// this is less efficient than avoiding calling multithread() altogether, it's easy and
/// doesn't cost much performance. Since defining VSPLINE_SINGLETHREAD excludes all
/// multithreading-related code, linking can omit -pthread. This option may be helpful
/// in debugging.

#ifndef VSPLINE_MULTITHREAD_H
#define VSPLINE_MULTITHREAD_H

#include <assert.h>

#ifndef VSPLINE_SINGLETHREAD

// only include multithreading-related headers if VSPLINE_SINGLETHREAD is not defined

#include <thread>
#include <mutex>
#include <queue>
#include <condition_variable>
#include "thread_pool.h"

#endif // #ifndef VSPLINE_SINGLETHREAD

#include <vigra/tinyvector.hxx>
#include <vigra/multi_array.hxx>
#include "common.h"

namespace vspline
{
/// number of CPU cores in the system. This value is never less than 1:
/// std::thread::hardware_concurrency() is permitted to return 0 if the
/// number of hardware threads cannot be determined, and a core count of
/// zero would propagate into default_njobs and into the fallback path of
/// partition_to_stripes. In that case we assume a single core.

#ifdef VSPLINE_SINGLETHREAD
const int ncores = 1 ;
#else
const int ncores
  = std::thread::hardware_concurrency() > 0
    ? static_cast < int > ( std::thread::hardware_concurrency() )
    : 1 ;
#endif

/// when multithreading, use this number of jobs per default. This is
/// an attempt at a compromise: too many jobs will produce too much overhead,
/// too few will not distribute the load well and make the system vulnerable
/// to 'straggling' threads. Since ncores is at least 1, default_njobs is
/// at least 8 when multithreading is enabled.

#ifdef VSPLINE_SINGLETHREAD
const int default_njobs = 1 ;
#else
const int default_njobs = 8 * ncores ;
#endif

// next we have partitioning code. this is used for multithreading, but not
// multithreading-specific, so we don't exclude it even if VSPLINE_SINGLETHREAD
// is defined

/// A range is a pair of limits: element 0 is the first position inside
/// the range, element 1 is the first position past its end (the same
/// half-open convention as begin()/end()). The limit type is a template
/// argument, so any kind of limit can be used.

template < typename limit_t >
using range_type = vigra::TinyVector < limit_t , 2 > ;

/// A partitioning is a std::vector of ranges. It holds the subranges
/// resulting from splitting a 'whole' range.

template < typename range_t >
using partition_type = std::vector < range_t > ;

// concrete range and partition types for plain 1D indexes:

/// index_range_type is a range from one (1D) index to another

using index_range_type = range_type < std::ptrdiff_t > ;

/// index_partition_type is the partition type matching index_range_type

using index_partition_type = partition_type < index_range_type > ;

/// index_range_splitter partitions an index range. The split is performed
/// so that, if possible, the partitions contain a multiple of 'vsize' indexes.
/// We use the construct of 'bundles' containing vsize indexes each. The
/// jobs are set up to contain at least one bundle each, if possible.
/// If the partitioning produces chunks of different sizes, they are set up
/// so that larger chunks are first in the partitioning, which produces
/// slight 'tapering' - having smaller jobs towards the end. The tapering
/// might be made more pronounced to increase efficiency.

struct index_range_splitter
{
  /// split 'range' into at most 'njobs' consecutive subranges.
  ///
  /// @param range  half-open index range [ range[0] , range[1] )
  /// @param njobs  desired number of subranges. Values below 1 are
  ///               treated as 1, so the routine never divides by zero.
  /// @param vsize  number of indexes per bundle. Values below 1 are
  ///               treated as 1.
  /// @return       a partitioning whose subranges are adjacent, start at
  ///               range[0] and end at range[1]. If not even one bundle
  ///               can be formed (this includes empty and inverted
  ///               ranges), the result holds 'range' as its only entry.

  static index_partition_type part ( const index_range_type & range ,
                                     std::ptrdiff_t njobs ,
                                     const std::ptrdiff_t & vsize )
  {
    // sanitize the arguments: both are used as divisors below

    const std::ptrdiff_t bundle_size = ( vsize < 1 ) ? 1 : vsize ;

    if ( njobs < 1 )
      njobs = 1 ;

    // total number of indexes

    const std::ptrdiff_t ni = range[1] - range[0] ;

    // we want to group the indexes in 'bundles' of bundle_size indexes.
    // total number of bundles:

    const std::ptrdiff_t bundles = ni / bundle_size ;

    // indexes left over after forming bundles

    const std::ptrdiff_t remainder = ni % bundle_size ;

    if ( bundles <= 0 )
    {
      // we can't even form one bundle (or the range is empty/inverted),
      // so it's just one job with the whole range

      index_partition_type res ( 1 ) ;
      res[0] = range ;
      return res ;
    }

    // we have at least one bundle. all jobs get a basic allotment of
    // 'socket_size' (possibly 0) bundles

    const std::ptrdiff_t socket_size = bundles / njobs ;

    // we may have 'leftover' bundles which have not been shared out
    // to the basic allotments

    const std::ptrdiff_t leftover = bundles % njobs ;

    // if socket_size is 0, we adjust *njobs* so that each job
    // gets one of the 'leftover' bundles. Since we have taken
    // care of the case that we can't even form a single bundle,
    // we can be certain that njobs won't decrease to zero.

    if ( socket_size == 0 )
      njobs = leftover ;

    // now we set up the index partitioning to hold 'njobs' subranges

    index_partition_type res ( njobs ) ;

    // and fill in the ranges for the jobs one by one. All arithmetic is
    // done in std::ptrdiff_t, so subrange sizes beyond the range of int
    // are not truncated.

    for ( std::ptrdiff_t j = 0 ; j < njobs ; j++ )
    {
      // calculate the number of indexes going into this subrange

      std::ptrdiff_t sz = socket_size * bundle_size ;
      if ( j < leftover )
        sz += bundle_size ;

      // the very first subrange starts at range[0] and gets the remainder
      // on top. all other subranges start where the previous subrange ends

      if ( j == 0 )
      {
        res[j][0] = range[0] ;
        sz += remainder ;
      }
      else
      {
        res[j][0] = res[j-1][1] ;
      }

      // all subranges end 'sz' after their beginning

      res[j][1] = res[j][0] + sz ;
    }

    // TODO: being paranoid here, may remove assertions for production code

    assert ( res[0][0] == range[0] ) ;       // doublecheck
    assert ( res[njobs-1][1] == range[1] ) ; // doublecheck

    return res ;
  }

} ;

/// shape_type is a TinyVector holding one vigra::MultiArrayIndex per
/// axis, for the number of axes given as template argument.
/// This is equivalent to vigra's shape type.

// TODO: might instead define as: vigra::MultiArrayShape<N>

template < int N >
using shape_type = vigra::TinyVector < vigra::MultiArrayIndex , N > ;

/// shape_range_type is a range whose two limits are shapes with the
/// same number of axes. The two limits can be passed directly as the
/// arguments of subarray() on a MultiArrayView with that many axes.
/// Note the subarray semantics: if the range is [2,2] to [4,4], it
/// refers to elements [2,2], [3,2], [2,3], [3,3].

template < int N >
using shape_range_type = range_type < shape_type < N > > ;

/// shape_partition_type is the partition type holding shape ranges

template < int N >
using shape_partition_type = partition_type < shape_range_type < N > > ;

/// partition_to_stripes splits an nD region (defined by its beginning and
/// end nD index, passed in 'range') into nparts (or less) parts. The split
/// is performed along the highest possible axis, to make the resulting
/// chunks span as little memory as possible.
///
/// @param range   half-open nD range: range[0] is the first coordinate
///                inside, range[1] the first coordinate outside
/// @param nparts  desired number of parts. If nparts <= 1, the range is
///                returned unsplit.
/// @return        a partitioning into stripes which differ only along the
///                split axis. It may hold fewer than nparts entries.

template < int D >
partition_type < shape_range_type < D > >
partition_to_stripes ( shape_range_type < D > range , int nparts )
{
  // shortcut if nparts <= 1

  if ( nparts <= 1 )
  {
    partition_type < shape_range_type < D > > res ( 1 ) ;
    res[0] = range ;
    return res ;
  }

  // get the shape of the range (range[0] may not be the origin)

  auto shape = range[1] - range[0] ;

  // find the highest dimension that is at least nparts large

  int dmax ;
  for ( dmax = D - 1 ; dmax >= 0 ; dmax-- )
  {
    if ( shape[dmax] >= nparts )
      break ;
  }

  // if dmax is -1, there was no such dimension.
  // Try again with lowered nparts if nparts is greater than ncores,
  // otherwise just split the largest dimension into as many parts
  // as its extent is large.

  if ( dmax == -1 )
  {
    if ( ncores < nparts )
      return partition_to_stripes ( range , ncores ) ;

    // look for the axis with the largest extent. Note that this uses
    // a separate variable: declaring a local 'nparts' here would shadow
    // the parameter, and the part count would not reach the code below.

    std::ptrdiff_t largest = -1 ;
    for ( int d = D - 1 ; d >= 0 ; d-- )
    {
      if ( shape[d] > largest )
      {
        largest = shape[d] ;
        dmax = d ;
      }
    }

    if ( dmax == -1 )
    {
      // every extent is negative, the range is malformed. There is
      // no axis to split, and dmax must not be used as an index, so
      // we hand the range back as it is.

      partition_type < shape_range_type < D > > res ( 1 ) ;
      res[0] = range ;
      return res ;
    }

    // no extent reached the previous nparts, so 'largest' is smaller
    // than an int value and the conversion is safe

    nparts = static_cast < int > ( largest ) ;
  }

  // now we have dmax, the dimension to split, and nparts, the number of
  // parts to split it into. We delegate this task to index_range_splitter,
  // passing a range from the lower to the upper index along dmax and
  // vsize == 1, since we're not bundling.

  index_range_type index_range ;
  index_range [ 0 ] = range [ 0 ] [ dmax ] ;
  index_range [ 1 ] = range [ 1 ] [ dmax ] ;

  index_partition_type index_partition
    = index_range_splitter::part ( index_range , nparts , 1 ) ;

  // we note how many parts index_range_splitter has produced and set up
  // the shape partition we want to return to have just as many entries

  int size = index_partition.size() ;
  shape_partition_type < D > shape_partition ( size ) ;

  // now we fill in these entries: initially each entry gets the whole range,
  // then the index along dmax, the splitting axis, is adjusted to the
  // entries in the index partition

  for ( int r = 0 ; r < size ; r++ )
  {
    shape_partition [ r ] [ 0 ] = range [ 0 ] ;
    shape_partition [ r ] [ 1 ] = range [ 1 ] ;

    shape_partition [ r ] [ 0 ] [ dmax ] = index_partition [ r ] [ 0 ] ;
    shape_partition [ r ] [ 1 ] [ dmax ] = index_partition [ r ] [ 1 ] ;
  }

  // TODO: being paranoid here, may remove assertions for production code

  assert ( shape_partition [ 0 ] [ 0 ] == range [ 0 ] ) ;        // doublecheck
  assert ( shape_partition [ size - 1 ] [ 1 ] == range [ 1 ] ) ; // doublecheck

  return shape_partition ;
}

/// alternative partitioning into tiles. For the optimal situation, where
/// the view isn't rotated or pitched much, the partitioning into bunches
/// of lines (partition_to_stripes) seems to perform slightly better, but
/// with more difficult transformations (like 90 degree rotation),
/// performance suffers (like, -20%), whereas with this tiled partitioning
/// it is roughly the same, supposedly due to identical locality in both
/// cases.
///
/// The tile size does not depend on 'nparts': tiles have a fixed edge
/// length of 160. 'nparts' only decides whether tiling is used at all:
/// for nparts <= 1 the range is returned unsplit, and if tiling yields
/// fewer than nparts tiles, or the range is not 2D, the routine delegates
/// to partition_to_stripes().
/// Note that this routine is not currently used in vspline

// TODO code is a bit clumsy...

// TODO it may be a good idea to have smaller portions towards the end
// of the partitioning ('tapering'), since they will be processed last:
// short final tasks reduce the time the other cores spend idling while
// the last task is still running.

// TODO this is quite specific to pv, might be moved out or made more general

template < int d >
partition_type < shape_range_type<d> >
partition_to_tiles ( shape_range_type<d> range ,
                     int nparts = default_njobs )
{
  typedef partition_type < shape_range_type<d> > result_type ;

  // nothing to split if at most one part is requested

  if ( nparts <= 1 )
    return result_type ( 1 , range ) ;

  // this function is really quite specific for images, so for the time
  // being everything which is not 2D goes to partition_to_stripes()

  if ( d != 2 )
    return partition_to_stripes ( range , nparts ) ;

  auto shape = range[1] - range[0] ;

  // TODO fixing this size is system-specific!

  const int nedge = 160 ; // heuristic, fixed size tiles

  auto tiled_shape = shape / nedge ;

  // for every axis, collect the positions where tiles begin/end. The
  // first stop is 0, the last one is the extent of the axis, and the
  // stops in between are multiples of nedge. Once the stops of an axis
  // are known, tiled_shape is set to the number of tiles along it.

  std::vector < int > stops [ d ] ;

  for ( int a = 0 ; a < d ; a++ )
  {
    std::vector < int > & axis_stops = stops[a] ;

    axis_stops.push_back ( 0 ) ;
    for ( int k = 1 ; k < tiled_shape[a] ; k++ )
      axis_stops.push_back ( k * nedge ) ;
    axis_stops.push_back ( shape[a] ) ;

    tiled_shape[a] = axis_stops.size() - 1 ;
  }

  const int ntiles = prod ( tiled_shape ) ;

  // If this partitioning scheme fails to produce a partitioning with
  // at least nparts components, fall back to using partition_to_stripes()

  if ( ntiles < nparts )
    return partition_to_stripes ( range , nparts ) ;

  nparts = ntiles ;
  result_type res ( nparts ) ;

  // tiles are enumerated with axis 0 running fastest: along axis a the
  // tile index advances once every 'stride' tiles, where stride is the
  // product of the tile counts of all lower axes, and wraps around after
  // 'count' steps.

  int stride = 1 ;

  for ( int a = 0 ; a < d ; a++ )
  {
    const int count = static_cast < int > ( stops[a].size() ) - 1 ;

    for ( int t = 0 ; t < nparts ; t++ )
    {
      const int i = ( t / stride ) % count ;
      res[t][0][a] = stops[a][i] ;
      res[t][1][a] = stops[a][i+1] ;
    }

    stride *= count ;
  }

  // so far the tiles are relative to the origin, now shift them to
  // where the incoming range begins

  for ( int t = 0 ; t < nparts ; t++ )
  {
    res[t][0] += range[0] ;
    res[t][1] += range[0] ;
  }

  return res ;
}

#ifdef VSPLINE_SINGLETHREAD

// if multithreading is suppressed by VSPLINE_SINGLETHREAD,
// we use fallback code. multithreading-specific code will not be
// referenced at all, the relevant headers aren't included.

// TODO while here is a central location where multithreading can be
// switched off easily, it would be more efficient to modify the code
// calling multithread to avoid it altogether. The performance gain
// should be minimal, though, since this is not inner-loop code.

/// fallback routine for multithread() if VSPLINE_SINGLETHREAD
/// is defined to deactivate multithreading. This overload simply passes
/// the 'whole' range to the single-thread function, and 'partition'
/// is not called at all.

template < class range_type , class ...Types >
int multithread ( void (*pfunc) ( range_type , Types... ) ,
                  partition_type < range_type > (*partition) ( range_type , int ) ,
                  int nparts ,
                  range_type range ,
                  Types ...args )
{
  (*pfunc) ( range , args... ) ;
  return 1 ;
}

/// fallback routine for multithread() if VSPLINE_SINGLETHREAD
/// is defined to deactivate multithreading. Here, partitioning has
/// already occurred, but it's not used: a range is formed spanning
/// all the ranges in the partitioning (from the lower limit of the first
/// subrange to the upper limit of the last one) and the single-thread
/// function is invoked once with this range.
///
/// @return the number of subranges in the partitioning. For an empty
///         partitioning, the function is not invoked and 0 is returned,
///         just like in the multithreaded variant.

// TODO might still honour the partitioning and call pfunc for all
// parts in turn

template < class range_type , class ...Types >
int multithread ( void (*pfunc) ( range_type , Types... ) ,
                  partition_type < range_type > partitioning ,
                  Types ...args )
{
  int nparts = partitioning.size() ;

  // guard against an empty partitioning: there is no first or last
  // subrange to take the limits from

  if ( nparts <= 0 )
    return 0 ;

  range_type range ( partitioning[0][0] , partitioning[nparts-1][1] ) ;
  (*pfunc) ( range , args... ) ;

  return nparts ;
}

#else // ifdef VSPLINE_SINGLETHREAD

// we start out with some collateral code:

/// action_wrapper is what a task in the queue actually runs: it calls the
/// 'payload' and afterwards does the bookkeeping for the bunch of tasks
/// the payload belongs to. Under the mutex p_pool_mutex points to, the
/// counter p_done points to is incremented. If it has then reached
/// 'nparts', this was the last task of the bunch, and one waiter on the
/// condition variable p_pool_cv points to is notified.
///
/// The notification is issued while the mutex is still held. The counter,
/// mutex and condition variable are local to the multithread() call which
/// waits for completion; once that call can see done == nparts with the
/// mutex released, it may return and destroy them. Notifying after
/// unlocking could therefore touch a condition variable which no longer
/// exists. (With the notification outside the lock, sporadic crashes and
/// failures to join were observed in stress tests, 2017-10-12.)

static void action_wrapper ( std::function < void() > payload ,
                             int nparts ,
                             std::mutex * p_pool_mutex ,
                             std::condition_variable * p_pool_cv ,
                             int * p_done )
{
  // do the actual work first, without holding any lock

  payload() ;

  // the remainder of the function runs under the caller's mutex

  std::lock_guard<std::mutex> guard ( * p_pool_mutex ) ;

  // count this task as completed

  int & done = * p_done ;
  ++ done ;

  // as long as other tasks of the bunch are still outstanding,
  // there is nobody to notify

  if ( done != nparts )
    return ;

  // this was the last task of the bunch: tell the waiting caller
  // that the joint task is now complete

  p_pool_cv->notify_one() ;
}

// with this collateral code at hand, we can now implement multithread().

/// multithread uses a thread pool of worker threads to perform
/// a multithreaded operation. It receives a functor (a single-threaded
/// function used for all individual tasks), a partitioning, which contains
/// information about which part of the data each task should work on, and
/// a set of additional parameters to pass on to the functor.
/// The individual 'payload' tasks are created by binding the functor with
///
/// - a range from the partitioning, describing its share of the data
///
/// - the remaining parameters
///
/// These tasks are bound to a wrapper routine (action_wrapper) which takes
/// care of signalling when the last task has completed.
///
/// The call blocks until all tasks have completed.
///
/// @param pfunc         function to call once per subrange
/// @param partitioning  the subranges; passed by value
/// @param args          further arguments, handed to every call of pfunc
/// @return              the number of subranges processed: 0 for an empty
///                      partitioning (pfunc is not called), 1 if the single
///                      subrange was processed directly in the calling
///                      thread, otherwise the size of the partitioning.

// TODO may write an equivalent function taking an iterator yielding ranges
// instead of a container with ranges.

// NOTE(review): this object is declared 'static' at namespace scope in a
// header, so every translation unit including this header has its own
// common_thread_pool object - confirm that this is intended.

static thread_pool common_thread_pool ; // keep a thread pool only for multithread()

template < class range_type , class ... Types >
int multithread ( void (*pfunc) ( range_type , Types ... ) ,
                  partition_type < range_type > partitioning ,
                  Types ... args )
{
  // get the number of ranges in the partitioning

  int nparts = partitioning.size() ;
  
  // guard against empty or wrong partitioning

  if ( nparts <= 0 )
  {
    return 0 ;
  }

  if ( nparts == 1 )
  {
    // if only one part is in the partitioning, we take a shortcut
    // and execute the function right here, in the calling thread,
    // without involving the thread pool:
    (*pfunc) ( partitioning[0] , args ... ) ;
    return 1 ;
  }

  // alternatively, 'done' can be coded as std::atomic<int>. I tried
  // but couldn't detect any performance benefit, even though allegedly
  // atomics are faster than using mutexes... so I'm leaving the code
  // as it was, using an int and a mutex.
  
  // these three objects live on this function's stack. The tasks receive
  // pointers to them, so this function must not return before the last
  // task has finished using them - this is what the wait below ensures.
  
  int done = 0 ;                    // number of completed tasks
  std::mutex pool_mutex ;           // mutex to guard access to done and pool_cv
  std::condition_variable pool_cv ; // for signalling completion
  
  {
    // under the thread pool's task_mutex, fill tasks into task queue.
    // The lock is held for the whole loop, so all tasks of this call
    // are enqueued in one go.
    std::lock_guard<std::mutex> lk ( common_thread_pool.task_mutex ) ;
    for ( int i = 0 ; i < nparts ; i++ )
    {
      // first create the 'payload' function. std::bind stores copies
      // of the subrange and of args in the resulting function object.
      
      std::function < void() > payload
        = std::bind ( pfunc , partitioning[i] , args ... ) ;

      // now bind it to the action wrapper and enqueue it. The wrapper
      // gets the addresses of the completion counter, its mutex and
      // the condition variable declared above.

      std::function < void() > action
        = std::bind ( action_wrapper ,
                      payload ,
                      nparts ,
                      &pool_mutex ,
                      &pool_cv ,
                      &done
                    ) ;

      common_thread_pool.task_queue.push ( action ) ;
    }
  }

  // alert all worker threads. This is done after the task_mutex has
  // been released.
   
  common_thread_pool.task_cv.notify_all() ;

  {
    // now wait for the last task to complete. This is signalled by
    // action_wrapper by notifying on pool_cv and doublechecked
    // by testing for done == nparts

    std::unique_lock<std::mutex> lk ( pool_mutex ) ;
    
    // the predicate done == nparts rejects spurious wakes. It is also
    // tested before waiting, so if all tasks have already finished by
    // the time we get here, the call returns without blocking.
    
    pool_cv.wait ( lk , [&] { return done == nparts ; } ) ;
  }
  
  // all jobs are done

  return nparts ;
}

/// This variant of multithread() takes a pointer to a function performing
/// the partitioning of the incoming range. The partitioning function is
/// invoked on the incoming range (provided nparts is greater than 1) and
/// the resulting partitioning is used as an argument to the first variant
/// of multithread().

// TODO It might be better to code this using std::function objects.

// TODO may use move semantics for forwarding instead of relying on the
// optimizer to figure this out

template < class range_type , class ... Types >
int multithread ( void (*pfunc) ( range_type , Types ... ) ,
                  partition_type < range_type > (*partition) ( range_type , int ) ,
                  int nparts ,
                  range_type range ,
                  Types ... args )
{
  if ( nparts <= 1 )
  {
    // if only one part is requested, we take a shortcut and execute
    // the function right here:
    (*pfunc) ( range , args ... ) ;
    return 1 ;
  }

  // partition the range using the function pointed to by 'partition'

  auto partitioning = (*partition) ( range , nparts ) ;
  
  // then pass pfunc, the partitioning and the remaining arguments
  // to the variant of multithread() above accepting a partitioning
  
  return multithread ( pfunc , partitioning , args ... ) ;
}

#endif

} ; // end if namespace vspline

#endif // #ifndef VSPLINE_MULTITHREAD_H
