Parallel.h
1 /*
2  * Medical Image Registration ToolKit (MIRTK)
3  *
4  * Copyright 2013-2017 Imperial College London
5  * Copyright 2013-2017 Andreas Schuh
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 
20 #ifndef MIRTK_Parallel_H
21 #define MIRTK_Parallel_H
22 
23 #include "mirtk/CommonExport.h"
24 
25 #include "mirtk/Stream.h"
26 #include "mirtk/Memory.h"
27 
28 #ifndef MIRTK_COMMON_WITH_TBB_MALLOC
29 # define MIRTK_COMMON_WITH_TBB_MALLOC 0
30 #endif
31 
32 #ifdef HAVE_TBB
33 // TBB includes windows header which defines min/max macros otherwise
34 # ifndef NOMINMAX
35 # define NOMINMAX
36 # define MIRTK_UNDEF_NOMINMAX
37 # endif
38 # include <tbb/task_scheduler_init.h>
39 # include <tbb/blocked_range.h>
40 # include <tbb/blocked_range2d.h>
41 # include <tbb/blocked_range3d.h>
42 # include <tbb/parallel_for.h>
43 # include <tbb/parallel_reduce.h>
44 # include <tbb/concurrent_queue.h>
45 # if MIRTK_COMMON_WITH_TBB_MALLOC
46 # include <tbb/scalable_allocator.h>
47 # include <tbb/cache_aligned_allocator.h>
48 # endif
49 # include <tbb/mutex.h>
50 # ifdef MIRTK_UNDEF_NOMINMAX
51 # undef MIRTK_UNDEF_NOMINMAX
52 # undef NOMINMAX
53 # endif
54 #endif
55 
56 
57 namespace mirtk {
58 
59 
60 // =============================================================================
61 // Global parallelization options
62 // =============================================================================
63 
/// Enable/disable GPU acceleration
///
/// Only declared here (extern); the definition lives outside this header.
MIRTK_Common_EXPORT extern bool use_gpu;

/// Debugging level of GPU code
///
/// Only declared here (extern); the definition lives outside this header.
/// NOTE(review): presumably larger values mean more verbose output -- confirm
/// where the variable is read.
MIRTK_Common_EXPORT extern int debug_gpu;

/// Debugging level of TBB code
///
/// Only declared here (extern); the definition lives outside this header.
/// NOTE(review): presumably larger values mean more verbose output -- confirm
/// where the variable is read.
MIRTK_Common_EXPORT extern int tbb_debug;
72 
73 // =============================================================================
74 // Command help
75 // =============================================================================
76 
/// Check if given option is a parallelization option
///
/// The parameter is unnamed in this declaration.
/// NOTE(review): presumably it is the command-line argument to test and the
/// result is true when it names a parallelization option -- confirm against
/// the definition.
bool IsParallelOption(const char *);

/// Parse parallelization option
///
/// All three parameters are unnamed in this declaration.
/// NOTE(review): they look like the current argument index, the argument count
/// and the argument vector of the command line -- confirm against the definition.
/// According to the note on tbb_scheduler in this header, this function passes
/// the -threads argument on to the global task scheduler when TBB is used.
void ParseParallelOption(int &, int &, char *[]);

/// Print parallelization command-line options
///
/// NOTE(review): presumably the help text is written to the given output
/// stream -- confirm against the definition.
void PrintParallelOptions(ostream &);
85 
86 // =============================================================================
87 // Multi-threading support using Intel's TBB
88 // =============================================================================
89 
90 // -----------------------------------------------------------------------------
91 // If TBB is available and WITH_TBB is set to ON, use TBB to execute
92 // any parallelizable code concurrently
93 //
94 // Attention: DO NOT define TBB_DEPRECATED by default or before including the
95 // other TBB header files, in particular parallel_for. The deprecated
96 // behavior of parallel_for is to not choose the chunk size (grainsize)
97 // automatically!
98 //
99 // http://software.intel.com/sites/products/documentation/doclib/tbb_sa/help/tbb_userguide/Automatic_Chunking.htm
100 #ifdef HAVE_TBB
101 
102 
103 // Import used TBB types into mirtk namespace
104 using tbb::task_scheduler_init;
105 using tbb::blocked_range;
106 using tbb::blocked_range2d;
107 using tbb::blocked_range3d;
108 using tbb::parallel_for;
109 using tbb::parallel_reduce;
110 using tbb::concurrent_queue;
111 using tbb::mutex;
112 using tbb::split;
113 
#if MIRTK_COMMON_WITH_TBB_MALLOC
// Use the allocators provided by TBB
using tbb::scalable_allocator;
using tbb::cache_aligned_allocator;
#else
// TBB allocators are disabled: fall back to the standard allocator so that the
// names scalable_allocator and cache_aligned_allocator are available in every
// build configuration, just like in the branch used when TBB is missing.
template <class T>
using scalable_allocator = std::allocator<T>;

template <class T>
using cache_aligned_allocator = std::allocator<T>;
#endif
118 
// A task scheduler is created/terminated automatically by TBB since
// version 2.2. It is recommended by Intel not to instantiate any task
// scheduler manually. However, in order to support the -threads option
// which can be used to limit the number of threads, a global task scheduler
// instance is created and the -threads argument passed on to its initialize
// method by ParseParallelOption. There should be no task scheduler created/
// terminated in any of the MIRTK library functions and classes.
//
// The pointer is only declared here (extern); its definition lives outside
// this header.
MIRTK_Common_EXPORT extern UniquePtr<task_scheduler_init> tbb_scheduler;
127 
128 
129 // -----------------------------------------------------------------------------
130 // Otherwise, use dummy implementations of TBB classes/functions which allows
131 // developers to write parallelizable code as if TBB was available and yet
132 // executes the code serially due to the lack of TBB (or WITH_TBB set to OFF).
133 // This avoids code duplication and unnecessary conditional code compilation.
134 #else // HAVE_TBB
135 
/// Stand-in for the TBB allocator of the same name: plain standard allocator
template <typename Type> using scalable_allocator = std::allocator<Type>;

/// Stand-in for the TBB allocator of the same name: plain standard allocator
template <typename Type> using cache_aligned_allocator = std::allocator<Type>;
141 
/// Empty tag type which tells a split constructor apart from a copy constructor
///
/// Takes the place of tbb::split when TBB is not used, so that body classes
/// can declare a constructor with a second argument of this type in both
/// build configurations.
struct split
{
};
144 
/// Helper for initialization of task scheduler
///
/// Serial stand-in for tbb::task_scheduler_init which is used when TBB is not
/// available. Both members are no-ops; the type only exists so that code which
/// creates or terminates a scheduler compiles unchanged.
class task_scheduler_init
{
public:
  /// Accept an integer argument (the thread count in the TBB API) and ignore it
  task_scheduler_init(int) {}

  /// Do nothing; there is no scheduler to shut down
  void terminate() {}
};
152 
/// One-dimensional range
///
/// Serial stand-in for tbb::blocked_range which is used when TBB is not
/// available. It merely stores the two bounds given at construction and
/// hands them back through begin() and end().
template <typename T>
class blocked_range
{
  T _lbound; ///< Value returned by begin()
  T _ubound; ///< Value returned by end()

public:

  /// Construct range from its lower bound \p l and upper bound \p u
  blocked_range(T l, T u) : _lbound(l), _ubound(u) {}

  /// Construct range from its bounds; the third argument (the grain size in
  /// the TBB API) is accepted for source compatibility and ignored
  blocked_range(T l, T u, size_t) : _lbound(l), _ubound(u) {}

  /// Lower bound given at construction
  T begin() const { return _lbound; }

  /// Upper bound given at construction
  T end() const { return _ubound; }
};
165 
166 /// Two-dimensional range
167 template <typename T>
169 {
170  blocked_range<T> _rows;
171  blocked_range<T> _cols;
172 
173 public:
174 
175  blocked_range2d(T rl, T ru,
176  T cl, T cu)
177  :
178  _rows (rl, ru),
179  _cols (cl, cu)
180  {
181  }
182 
183  blocked_range2d(T rl, T ru, size_t,
184  T cl, T cu, size_t)
185  :
186  _rows (rl, ru),
187  _cols (cl, cu)
188  {
189  }
190 
191  const blocked_range<T> &rows() const { return _rows; }
192  const blocked_range<T> &cols() const { return _cols; }
193 };
194 
195 /// Three-dimensional range
196 template <typename T>
198 {
199  blocked_range<T> _pages;
200  blocked_range<T> _rows;
201  blocked_range<T> _cols;
202 
203 public:
204 
205  blocked_range3d(T pl, T pu,
206  T rl, T ru,
207  T cl, T cu)
208  :
209  _pages(pl, pu),
210  _rows (rl, ru),
211  _cols (cl, cu)
212  {
213  }
214 
215  blocked_range3d(T pl, T pu, size_t,
216  T rl, T ru, size_t,
217  T cl, T cu, size_t)
218  :
219  _pages(pl, pu),
220  _rows (rl, ru),
221  _cols (cl, cu)
222  {
223  }
224 
225  const blocked_range<T> &pages() const { return _pages; }
226  const blocked_range<T> &rows() const { return _rows; }
227  const blocked_range<T> &cols() const { return _cols; }
228 };
229 
/// Serial replacement of parallel_for for builds without TBB
///
/// Invokes \p body exactly once, on the calling thread, with the entire
/// \p range as its argument; the range is never subdivided.
template <typename Range, typename Body>
void parallel_for(const Range &range, const Body &body)
{
  body(range);
}
235 
/// Serial replacement of parallel_reduce for builds without TBB
///
/// Invokes \p body exactly once, on the calling thread, with the entire
/// \p range as its argument. The body is neither split nor joined, hence
/// whatever it accumulates remains in the object passed by the caller.
template <typename Range, typename Body>
void parallel_reduce(const Range &range, Body &body)
{
  body(range);
}
241 
242 
243 #endif // HAVE_TBB
244 
245 
246 } // namespace mirtk
247 
248 #endif // MIRTK_Parallel_H
Dummy type used to distinguish split constructor from copy constructor.
Definition: Parallel.h:143
MIRTK_Common_EXPORT int tbb_debug
Debugging level of TBB code.
void ParseParallelOption(int &, int &, char *[])
Parse parallelization option.
Two-dimensional range.
Definition: Parallel.h:168
Helper for initialization of task scheduler.
Definition: Parallel.h:146
bool IsParallelOption(const char *)
Check if given option is a parallelization option.
One-dimensional range.
Definition: Parallel.h:155
Definition: IOConfig.h:41
Three-dimensional range.
Definition: Parallel.h:197
MIRTK_Common_EXPORT int debug_gpu
Debugging level of GPU code.
MIRTK_Common_EXPORT bool use_gpu
Enable/disable GPU acceleration.
void PrintParallelOptions(ostream &)
Print parallelization command-line options.
void parallel_reduce(const Range &range, Body &body)
parallel_reduce dummy template function which executes the body serially
Definition: Parallel.h:238
void parallel_for(const Range &range, const Body &body)
parallel_for dummy template function which executes the body serially
Definition: Parallel.h:232