ie_parallel.hpp
// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

/**
 * @brief Contains declarations and definitions for sequential and multi-threading implementations.
 * Multi-threading support is implemented in two variants: using the Intel® Threading Building Blocks (TBB) library or OpenMP*.
 * To build a particular implementation, define IE_THREAD as the corresponding identifier: IE_THREAD_TBB, IE_THREAD_TBB_AUTO, IE_THREAD_OMP, or IE_THREAD_SEQ.
 * @file ie_parallel.hpp
 */

#pragma once

#include <cstddef>
#include <type_traits>  // std::make_signed is used by the OpenMP/sequential reduction fallbacks below

#define IE_THREAD_TBB 0
#define IE_THREAD_OMP 1
#define IE_THREAD_SEQ 2
#define IE_THREAD_TBB_AUTO 3

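// Example (illustrative): the active backend is selected at compile time by defining IE_THREAD
// as one of the identifiers above before this header is processed, typically via a compiler
// flag (the exact build wiring depends on the project), e.g.:
//
//   c++ -DIE_THREAD=IE_THREAD_TBB ...   // TBB implementation with static partitioning
//   c++ -DIE_THREAD=IE_THREAD_SEQ ...   // purely sequential implementation
//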
#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
#define TBB_PREVIEW_LOCAL_OBSERVER 1
#include "tbb/task_scheduler_observer.h"
#include "tbb/parallel_for.h"
#include "tbb/task_arena.h"

#include "tbb/parallel_reduce.h"
#include "tbb/blocked_range.h"
#include "tbb/blocked_range2d.h"
#include "tbb/blocked_range3d.h"

inline int parallel_get_max_threads() { return tbb::this_task_arena::max_concurrency(); }
inline int parallel_get_num_threads() { return parallel_get_max_threads(); }
inline int parallel_get_thread_num() { return tbb::this_task_arena::current_thread_index(); }
inline void parallel_set_num_threads(int n) { return; }
inline int parallel_get_env_threads() { return 0; }
// PARTITIONING appends a static partitioner to the parallel_reduce calls below for IE_THREAD_TBB;
// for IE_THREAD_TBB_AUTO it expands to nothing, so TBB chooses the partitioning itself.
#if IE_THREAD == IE_THREAD_TBB
    #define PARTITIONING , tbb::static_partitioner()
#else
    #define PARTITIONING
#endif
#elif IE_THREAD == IE_THREAD_OMP
#include <cstdlib>
#include <string>
#include <omp.h>


/* MSVC still supports only OpenMP 2.0, which lacks the collapse clause, so define it away */
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
# define collapse(x)
#endif // defined(_MSC_VER) && !defined(__INTEL_COMPILER)
inline int parallel_get_max_threads() { return omp_get_max_threads(); }
inline int parallel_get_num_threads() { return omp_get_num_threads(); }
inline int parallel_get_thread_num() { return omp_get_thread_num(); }
inline void parallel_set_num_threads(int n) { omp_set_num_threads(n); }
inline int parallel_get_env_threads() {
    int env_cores = 0;
    if (getenv("OMP_NUM_THREADS") != nullptr) {
        try {
            env_cores = std::stoi(getenv("OMP_NUM_THREADS"));
        } catch (const std::exception&) {
            env_cores = 0;
        }
    }
    return env_cores;
}

#elif IE_THREAD == IE_THREAD_SEQ
inline int parallel_get_env_threads() { return 1; }
inline int parallel_get_max_threads() { return 1; }
inline int parallel_get_num_threads() { return 1; }
inline int parallel_get_thread_num() { return 0; }
inline void parallel_set_num_threads(int n) { return; }
#endif

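// Example (illustrative): regardless of the backend chosen above, callers see the same helpers:
//
//   int team = parallel_get_max_threads();  // threads available to the parallel loops below
//   int tid  = parallel_get_thread_num();   // index of the calling worker thread
//
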
namespace InferenceEngine {

template <typename F>
void parallel_nt(int nthr, const F &func) {
#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
    if (nthr == 0) nthr = parallel_get_max_threads();
    if (nthr == 1) {
        func(0, 1);
        return;
    }

    tbb::parallel_for(0, nthr, [&](int ithr) {
        func(ithr, nthr);
    });
#elif IE_THREAD == IE_THREAD_OMP
    if (nthr == 1) {
        func(0, 1);
        return;
    }

# pragma omp parallel num_threads(nthr)
    func(parallel_get_thread_num(), parallel_get_num_threads());
#elif IE_THREAD == IE_THREAD_SEQ
    func(0, 1);
#endif
}
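
// Example (illustrative): parallel_nt() invokes func(ithr, nthr) once per thread, so the callee
// usually partitions its work by thread index; the splitter() helper below is a natural fit.
// total_work and process() are hypothetical placeholders:
//
//   InferenceEngine::parallel_nt(0, [&](int ithr, int nthr) {
//       size_t begin = 0, end = 0;
//       InferenceEngine::splitter(total_work, nthr, ithr, begin, end);
//       for (size_t i = begin; i < end; ++i) process(i);
//   });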

// Same as parallel_nt(), but under TBB the thread indices are distributed with a static
// partitioner, and the sequential build always runs func(0, 1) directly.
template <typename F>
void parallel_nt_static(int nthr, const F &func) {
#if IE_THREAD == IE_THREAD_SEQ
    const bool serial = true;
#else
    const bool serial = false;
#endif

    if (serial || nthr == 1) {
        func(0, 1);
        return;
    }

    if (nthr == 0) nthr = parallel_get_max_threads();
#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
    tbb::parallel_for(0, nthr, [&](int ithr) {
        func(ithr, nthr);
    }, tbb::static_partitioner{});

#elif IE_THREAD == IE_THREAD_OMP

# pragma omp parallel num_threads(nthr)
    {
        func(parallel_get_thread_num(), parallel_get_num_threads());
    }
#endif
}

template <typename T0, typename R, typename F>
R parallel_sum(const T0 &D0, const R &input, const F &func) {
#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
    return tbb::parallel_reduce(
        tbb::blocked_range<T0>(0, D0), input,
        [&](const tbb::blocked_range<T0>& r, R init)->R {
            R sum = init;
            for (T0 dim1 = r.begin(); dim1 < r.end(); ++dim1)
                sum += func(dim1);
            return sum;
        },
        [](R x, R y)->R {
            return x + y;
        } PARTITIONING);
#else
    R sum = input;

#ifdef _MSC_VER
    using T0_IT = typename std::make_signed<T0>::type;
#else
    using T0_IT = T0;
#endif

#if IE_THREAD == IE_THREAD_OMP
    #pragma omp parallel for reduction(+ : sum) schedule(static)
#endif
    for (T0_IT dim1 = 0; dim1 < static_cast<T0_IT>(D0); dim1++) {
        sum += static_cast<R>(func(dim1));
    }
    return sum;
#endif
}
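
// Example (illustrative): sum a per-index contribution over [0, n); func returns the value to
// accumulate for each index. data and n are hypothetical placeholders:
//
//   float total = InferenceEngine::parallel_sum(n, 0.f, [&](size_t i) -> float {
//       return data[i] * data[i];
//   });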

template <typename T0, typename T1, typename R, typename F>
R parallel_sum2d(const T0 &D0, const T1 &D1, const R &input, const F &func) {
#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
    return tbb::parallel_reduce(
        tbb::blocked_range2d<T0, T1>(0, D0, 0, D1), input,
        [&](const tbb::blocked_range2d<T0, T1>& r, R init)->R {
            R sum = init;
            for (T0 dim2 = r.rows().begin(); dim2 < r.rows().end(); dim2++) {
                for (T1 dim1 = r.cols().begin(); dim1 < r.cols().end(); dim1++) {
                    sum += func(dim2, dim1);
                }
            }
            return sum;
        },
        [](R x, R y)->R {
            return x + y;
        } PARTITIONING);
#else
    R sum = input;

#ifdef _MSC_VER
    using T0_IT = typename std::make_signed<T0>::type;
    using T1_IT = typename std::make_signed<T1>::type;
#else
    using T0_IT = T0;
    using T1_IT = T1;
#endif

#if IE_THREAD == IE_THREAD_OMP
    #pragma omp parallel for collapse(2) reduction(+ : sum) schedule(static)
#endif
    for (T0_IT dim2 = 0; dim2 < static_cast<T0_IT>(D0); dim2++) {
        for (T1_IT dim1 = 0; dim1 < static_cast<T1_IT>(D1); dim1++) {
            sum += func(dim2, dim1);
        }
    }
    return sum;
#endif
}
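
// Example (illustrative): 2D reduction; the callback receives (row, column) indices.
// values, rows and cols are hypothetical placeholders:
//
//   double total = InferenceEngine::parallel_sum2d(rows, cols, 0.0, [&](size_t r, size_t c) -> double {
//       return values[r * cols + c];
//   });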

template <typename T0, typename T1, typename T2, typename R, typename F>
R parallel_sum3d(const T0 &D0, const T1 &D1, const T2 &D2, const R &input, const F &func) {
#if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO)
    return tbb::parallel_reduce(
        tbb::blocked_range3d<T0, T1, T2>(0, D0, 0, D1, 0, D2), input,
        [&](const tbb::blocked_range3d<T0, T1, T2>& r, R init)->R {
            R sum = init;
            for (T0 dim1 = r.pages().begin(); dim1 < r.pages().end(); dim1++) {
                for (T1 dim2 = r.rows().begin(); dim2 < r.rows().end(); dim2++) {
                    for (T2 dim3 = r.cols().begin(); dim3 < r.cols().end(); dim3++) {
                        sum += func(dim1, dim2, dim3);
                    }
                }
            }
            return sum;
        },
        [](R x, R y)->R {
            return x + y;
        } PARTITIONING);
#else
    R sum = input;

#ifdef _MSC_VER
    using T0_IT = typename std::make_signed<T0>::type;
    using T1_IT = typename std::make_signed<T1>::type;
    using T2_IT = typename std::make_signed<T2>::type;
#else
    using T0_IT = T0;
    using T1_IT = T1;
    using T2_IT = T2;
#endif

#if IE_THREAD == IE_THREAD_OMP
    #pragma omp parallel for collapse(3) reduction(+ : sum) schedule(static)
#endif
    for (T0_IT dim1 = 0; dim1 < static_cast<T0_IT>(D0); dim1++) {
        for (T1_IT dim2 = 0; dim2 < static_cast<T1_IT>(D1); dim2++) {
            for (T2_IT dim3 = 0; dim3 < static_cast<T2_IT>(D2); dim3++) {
                sum += func(dim1, dim2, dim3);
            }
        }
    }
    return sum;
#endif
}

// Converts a flat work-item index `start` into multi-dimensional counters: each (x, X) pair is a
// counter and its extent, listed from the outermost to the innermost (fastest-varying) dimension.
// Returns the quotient that remains after peeling off all listed dimensions.
template<typename T>
inline T parallel_it_init(T start) { return start; }
template<typename T, typename Q, typename R, typename... Args>
inline T parallel_it_init(T start, Q &x, const R &X, Args &&... tuple) {
    start = parallel_it_init(start, static_cast<Args>(tuple)...);
    x = start % X;
    return start / X;
}

// Advances the multi-dimensional counters by one flat work item, carrying from the innermost
// dimension outwards; returns true only when every counter has wrapped back to zero.
inline bool parallel_it_step() { return true; }
template<typename Q, typename R, typename... Args>
inline bool parallel_it_step(Q &x, const R &X, Args &&... tuple) {
    if (parallel_it_step(static_cast<Args>(tuple)...)) {
        x = (x + 1) % X;
        return x == 0;
    }
    return false;
}
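
// Worked example (illustrative) for a 2 x 3 iteration space:
//
//   size_t D0 = 2, D1 = 3, d0 = 0, d1 = 0;
//   parallel_it_init(size_t{4}, d0, D0, d1, D1);  // flat index 4 -> (d0, d1) = (1, 1)
//   parallel_it_step(d0, D0, d1, D1);             // next flat index 5 -> (d0, d1) = (1, 2)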

// Splits `n` work items as evenly as possible across a team of `team` threads and returns the
// half-open range [n_start, n_end) assigned to thread `tid`; the first threads get one extra
// item when n does not divide evenly.
template <typename T, typename Q>
inline void splitter(const T &n, const Q &team, const Q &tid, T &n_start, T &n_end) {
    if (team <= 1 || n == 0) {
        n_start = 0;
        n_end = n;
    } else {
        T n1 = (n + (T)team - 1) / (T)team;
        T n2 = n1 - 1;
        T T1 = n - n2 * (T)team;
        n_end = (T)tid < T1 ? n1 : n2;
        n_start = (T)tid <= T1 ? tid * n1 : T1 * n1 + ((T)tid - T1) * n2;
    }

    n_end += n_start;
}
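
// Worked example (illustrative): splitter(10, 4, tid, start, end) assigns
//   tid 0 -> [0, 3),  tid 1 -> [3, 6),  tid 2 -> [6, 8),  tid 3 -> [8, 10)
// i.e. ceil(10 / 4) = 3 items for the first two threads and 2 items for the rest.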

template <typename T0, typename F>
void for_1d(const int &ithr, const int &nthr, const T0 &D0, const F &func) {
    T0 d0{ 0 }, end{ 0 };
    splitter(D0, nthr, ithr, d0, end);
    for (; d0 < end; ++d0) func(d0);
}

template <typename T0, typename F>
void parallel_for(const T0 &D0, const F &func) {
#if IE_THREAD == IE_THREAD_TBB
    auto work_amount = static_cast<size_t>(D0);
    int nthr = parallel_get_max_threads();
    if (static_cast<size_t>(nthr) > work_amount)
        nthr = static_cast<int>(work_amount);
    if (nthr == 1) {
        for_1d(0, 1, D0, func);
    } else {
        tbb::parallel_for(0, nthr, [&](int ithr) {
            for_1d(ithr, nthr, D0, func);
        }, tbb::static_partitioner());
    }
#elif IE_THREAD == IE_THREAD_TBB_AUTO
    const int nthr = parallel_get_max_threads();
    tbb::parallel_for(0, nthr, [&](int ithr) {
        for_1d(ithr, nthr, D0, func);
    });
#elif IE_THREAD == IE_THREAD_OMP
# pragma omp parallel
    for_1d(parallel_get_thread_num(), parallel_get_num_threads(), D0, func);
#elif IE_THREAD == IE_THREAD_SEQ
    for_1d(0, 1, D0, func);
#endif
}
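
// Example (illustrative): apply a function to every index of a 1D range in parallel.
// n, dst and src are hypothetical placeholders:
//
//   InferenceEngine::parallel_for(n, [&](size_t i) {
//       dst[i] = src[i] * 2.0f;
//   });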

template <typename T0, typename T1, typename F>
void for_2d(const int &ithr, const int &nthr, const T0 &D0, const T1 &D1, const F &func) {
    const size_t work_amount = (size_t)D0 * D1;
    if (work_amount == 0) return;
    size_t start{ 0 }, end{ 0 };
    splitter(work_amount, nthr, ithr, start, end);

    T0 d0{ 0 }; T1 d1{ 0 };
    parallel_it_init(start, d0, D0, d1, D1);
    for (size_t iwork = start; iwork < end; ++iwork) {
        func(d0, d1);
        parallel_it_step(d0, D0, d1, D1);
    }
}

template <typename T0, typename T1, typename F>
void parallel_for2d(const T0 &D0, const T1 &D1, const F &func) {
#if IE_THREAD == IE_THREAD_TBB
    auto work_amount = static_cast<size_t>(D0 * D1);
    int nthr = parallel_get_max_threads();
    if (static_cast<size_t>(nthr) > work_amount)
        nthr = static_cast<int>(work_amount);
    if (nthr == 1) {
        for_2d(0, 1, D0, D1, func);
    } else {
        tbb::parallel_for(0, nthr, [&](int ithr) {
            for_2d(ithr, nthr, D0, D1, func);
        }, tbb::static_partitioner());
    }
#elif IE_THREAD == IE_THREAD_TBB_AUTO
    const int nthr = parallel_get_max_threads();
    tbb::parallel_for(0, nthr, [&](int ithr) {
        for_2d(ithr, nthr, D0, D1, func);
    });
#elif IE_THREAD == IE_THREAD_OMP
# pragma omp parallel
    for_2d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, func);
#elif IE_THREAD == IE_THREAD_SEQ
    for_2d(0, 1, D0, D1, func);
#endif
}

template <typename T0, typename T1, typename T2, typename F>
void for_3d(const int &ithr, const int &nthr, const T0 &D0, const T1 &D1,
            const T2 &D2, const F &func) {
    const size_t work_amount = (size_t)D0 * D1 * D2;
    if (work_amount == 0) return;
    size_t start{ 0 }, end{ 0 };
    splitter(work_amount, nthr, ithr, start, end);

    T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 };
    parallel_it_init(start, d0, D0, d1, D1, d2, D2);
    for (size_t iwork = start; iwork < end; ++iwork) {
        func(d0, d1, d2);
        parallel_it_step(d0, D0, d1, D1, d2, D2);
    }
}

template <typename T0, typename T1, typename T2, typename F>
void parallel_for3d(const T0 &D0, const T1 &D1, const T2 &D2, const F &func) {
#if IE_THREAD == IE_THREAD_TBB
    auto work_amount = static_cast<size_t>(D0 * D1 * D2);
    int nthr = parallel_get_max_threads();
    if (static_cast<size_t>(nthr) > work_amount)
        nthr = static_cast<int>(work_amount);
    if (nthr == 1) {
        for_3d(0, 1, D0, D1, D2, func);
    } else {
        tbb::parallel_for(0, nthr, [&](int ithr) {
            for_3d(ithr, nthr, D0, D1, D2, func);
        }, tbb::static_partitioner());
    }
#elif IE_THREAD == IE_THREAD_TBB_AUTO
    const int nthr = parallel_get_max_threads();
    tbb::parallel_for(0, nthr, [&](int ithr) {
        for_3d(ithr, nthr, D0, D1, D2, func);
    });
#elif IE_THREAD == IE_THREAD_OMP
# pragma omp parallel
    for_3d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, func);
#elif IE_THREAD == IE_THREAD_SEQ
    for_3d(0, 1, D0, D1, D2, func);
#endif
}

template <typename T0, typename T1, typename T2, typename T3, typename F>
void for_4d(const int &ithr, const int &nthr, const T0 &D0, const T1 &D1,
            const T2 &D2, const T3 &D3, const F &func) {
    const size_t work_amount = (size_t)D0 * D1 * D2 * D3;
    if (work_amount == 0) return;
    size_t start{ 0 }, end{ 0 };
    splitter(work_amount, nthr, ithr, start, end);

    T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 }; T3 d3{ 0 };
    parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3);
    for (size_t iwork = start; iwork < end; ++iwork) {
        func(d0, d1, d2, d3);
        parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3);
    }
}

template <typename T0, typename T1, typename T2, typename T3, typename F>
void parallel_for4d(const T0 &D0, const T1 &D1, const T2 &D2, const T3 &D3, const F &func) {
#if IE_THREAD == IE_THREAD_TBB
    auto work_amount = static_cast<size_t>(D0 * D1 * D2 * D3);
    int nthr = parallel_get_max_threads();
    if (static_cast<size_t>(nthr) > work_amount)
        nthr = static_cast<int>(work_amount);
    if (nthr == 1) {
        for_4d(0, 1, D0, D1, D2, D3, func);
    } else {
        tbb::parallel_for(0, nthr, [&](int ithr) {
            for_4d(ithr, nthr, D0, D1, D2, D3, func);
        }, tbb::static_partitioner());
    }
#elif IE_THREAD == IE_THREAD_TBB_AUTO
    const int nthr = parallel_get_max_threads();
    tbb::parallel_for(0, nthr, [&](int ithr) {
        for_4d(ithr, nthr, D0, D1, D2, D3, func);
    });
#elif IE_THREAD == IE_THREAD_OMP
# pragma omp parallel
    for_4d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, func);
#elif IE_THREAD == IE_THREAD_SEQ
    for_4d(0, 1, D0, D1, D2, D3, func);
#endif
}
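
// Example (illustrative): iterate a 4D activation tensor in NCHW order; one callback invocation
// per (n, c, h, w) coordinate. N, C, H, W and blob are hypothetical placeholders:
//
//   InferenceEngine::parallel_for4d(N, C, H, W, [&](size_t n, size_t c, size_t h, size_t w) {
//       blob[((n * C + c) * H + h) * W + w] = 0.0f;
//   });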

template <typename T0, typename T1, typename T2, typename T3, typename T4, typename F>
void for_5d(const int &ithr, const int &nthr, const T0 &D0, const T1 &D1,
            const T2 &D2, const T3 &D3, const T4 &D4, const F &func) {
    const size_t work_amount = (size_t)D0 * D1 * D2 * D3 * D4;
    if (work_amount == 0) return;
    size_t start{ 0 }, end{ 0 };
    splitter(work_amount, nthr, ithr, start, end);

    T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 }; T3 d3{ 0 }; T4 d4{ 0 };
    parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3, d4, D4);
    for (size_t iwork = start; iwork < end; ++iwork) {
        func(d0, d1, d2, d3, d4);
        parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3, d4, D4);
    }
}

template <typename T0, typename T1, typename T2, typename T3, typename T4, typename F>
void parallel_for5d(const T0 &D0, const T1 &D1, const T2 &D2, const T3 &D3,
                    const T4 &D4, const F &func) {
#if IE_THREAD == IE_THREAD_TBB
    auto work_amount = static_cast<size_t>(D0 * D1 * D2 * D3 * D4);
    int nthr = parallel_get_max_threads();
    if (static_cast<size_t>(nthr) > work_amount)
        nthr = static_cast<int>(work_amount);
    if (nthr == 1) {
        for_5d(0, 1, D0, D1, D2, D3, D4, func);
    } else {
        tbb::parallel_for(0, nthr, [&](int ithr) {
            for_5d(ithr, nthr, D0, D1, D2, D3, D4, func);
        }, tbb::static_partitioner());
    }
#elif IE_THREAD == IE_THREAD_TBB_AUTO
    const int nthr = parallel_get_max_threads();
    tbb::parallel_for(0, nthr, [&](int ithr) {
        for_5d(ithr, nthr, D0, D1, D2, D3, D4, func);
    });
#elif IE_THREAD == IE_THREAD_OMP
# pragma omp parallel
    for_5d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, func);
#elif IE_THREAD == IE_THREAD_SEQ
    for_5d(0, 1, D0, D1, D2, D3, D4, func);
#endif
}

}  // namespace InferenceEngine