ie_parallel.hpp
Go to the documentation of this file.
1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 
5 /**
6  * @brief Contains declarations and definitions for sequential and multi-threading implementations.
7  * Multi-threading support is implemented in two variants: using the Threading Building Blocks library and OpenMP* product.
8  * To build a particular implementation, use the corresponding identifier: IE_THREAD_TBB, IE_THREAD_OMP or IE_THREAD_SEQ.
9  * @file ie_parallel.hpp
10  */
11 
12 #pragma once
13 
14 #include <cstddef>
15 
16 #define IE_THREAD_TBB 0
17 #define IE_THREAD_OMP 1
18 #define IE_THREAD_SEQ 2
19 
20 #if IE_THREAD == IE_THREAD_TBB
21 #define TBB_PREVIEW_LOCAL_OBSERVER 1
22 #include "tbb/task_scheduler_observer.h"
23 #include "tbb/parallel_for.h"
24 #include "tbb/task_arena.h"
25 
26 #include "tbb/parallel_reduce.h"
27 #include "tbb/blocked_range.h"
28 #include "tbb/blocked_range2d.h"
29 #include "tbb/blocked_range3d.h"
30 
31 inline int parallel_get_max_threads() { return tbb::this_task_arena::max_concurrency(); }
32 inline int parallel_get_num_threads() { return parallel_get_max_threads(); }
33 inline int parallel_get_thread_num() { return tbb::this_task_arena::current_thread_index(); }
34 inline void parallel_set_num_threads(int n) { return; }
35 inline int parallel_get_env_threads() { return 0; }
36 
37 #elif IE_THREAD == IE_THREAD_OMP
38 #include <cstdlib>
39 #include <string>
40 #include <omp.h>
41 
42 
43 /* MSVC still supports omp 2.0 only */
44 #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
45 # define collapse(x)
46 #endif // defined(_MSC_VER) && !defined(__INTEL_COMPILER)
47 inline int parallel_get_max_threads() { return omp_get_max_threads(); }
48 inline int parallel_get_num_threads() { return omp_get_num_threads(); }
49 inline int parallel_get_thread_num() { return omp_get_thread_num(); }
50 inline void parallel_set_num_threads(int n) { omp_set_num_threads(n); }
/**
 * @brief Reads the thread count requested through the OMP_NUM_THREADS environment variable.
 * @return The integer parsed from OMP_NUM_THREADS, or 0 when the variable is not set
 *         or std::stoi cannot parse it.
 */
inline int parallel_get_env_threads() {
    const char *requested = getenv("OMP_NUM_THREADS");
    if (requested == nullptr) {
        return 0;
    }

    try {
        return std::stoi(requested);
    } catch (const std::exception&) {
        // std::stoi rejected the value (not a number or out of range): report "not specified"
        return 0;
    }
}
62 
63 #elif IE_THREAD == IE_THREAD_SEQ
/** @brief Sequential build: always returns 1. */
inline int parallel_get_env_threads() {
    return 1;
}

/** @brief Sequential build: there is exactly one thread, so the maximum is 1. */
inline int parallel_get_max_threads() {
    return 1;
}

/** @brief Sequential build: the team size is always 1. */
inline int parallel_get_num_threads() {
    return 1;
}

/** @brief Sequential build: the only thread has index 0. */
inline int parallel_get_thread_num() {
    return 0;
}

/** @brief Sequential build: no-op, the requested thread count @p n is ignored. */
inline void parallel_set_num_threads(int n) {
    (void)n;
}
69 #endif
70 
71 
72 namespace InferenceEngine {
73 
74 template <typename F>
75 void parallel_nt(int nthr, const F &func) {
76 #if IE_THREAD == IE_THREAD_TBB
77  if (nthr == 0) nthr = parallel_get_max_threads();
78  if (nthr == 1) {
79  func(0, 1);
80  return;
81  }
82 
83  tbb::parallel_for(0, nthr, [&](int ithr) {
84  func(ithr, nthr);
85  });
86 #elif IE_THREAD == IE_THREAD_OMP
87  if (nthr == 1) {
88  func(0, 1);
89  return;
90  }
91 
92 # pragma omp parallel num_threads(nthr)
93  func(parallel_get_thread_num(), parallel_get_num_threads());
94 #elif IE_THREAD == IE_THREAD_SEQ
95  func(0, 1);
96 #endif
97 }
98 
99 template <typename F>
100 void parallel_nt_static(int nthr, const F &func) {
101 #if IE_THREAD == IE_THREAD_SEQ
102  const bool serial = true;
103 #else
104  const bool serial = false;
105 #endif
106 
107  if (serial || nthr == 1) {
108  func(0, 1);
109  return;
110  }
111 
112  if (nthr == 0) nthr = parallel_get_max_threads();
113 #if IE_THREAD == IE_THREAD_TBB
114  tbb::parallel_for(0, nthr, [&](int ithr) {
115  func(ithr, nthr);
116  }
117  , tbb::static_partitioner{});
118 
119 #elif IE_THREAD == IE_THREAD_OMP
120 
121 # pragma omp parallel num_threads(nthr)
122  {
123  func(parallel_get_thread_num(), parallel_get_num_threads());
124  }
125 #endif
126 }
127 
128 template <typename T0, typename R, typename F>
129 R parallel_sum(const T0 &D0, const R &input, const F &func) {
130 #if IE_THREAD == IE_THREAD_TBB
131  return tbb::parallel_reduce(
132  tbb::blocked_range<T0>(0, D0), input,
133  [&](const tbb::blocked_range<T0>& r, R init)->R {
134  R sum = init;
135  for (T0 dim1 = r.begin(); dim1 < r.end(); ++dim1)
136  sum += func(dim1);
137  return sum;
138  },
139  [](R x, R y)->R {
140  return x + y;
141  });
142 #else
143  R sum = input;
144 
145 #ifdef _MSC_VER
146  using T0_IT = typename std::make_signed<T0>::type;
147 #else
148  using T0_IT = T0;
149 #endif
150 
151 #if IE_THREAD == IE_THREAD_OMP
152  #pragma omp parallel for reduction(+ : sum) schedule(static)
153 #endif
154  for (T0_IT dim1 = 0; dim1 < static_cast<T0_IT>(D0); dim1++) {
155  sum += static_cast<R>(func(dim1));
156  }
157  return sum;
158 #endif
159 }
160 
161 template <typename T0, typename T1, typename R, typename F>
162 R parallel_sum2d(const T0 &D0, const T1 &D1, const R &input, const F &func) {
163 #if IE_THREAD == IE_THREAD_TBB
164  return tbb::parallel_reduce(
165  tbb::blocked_range2d<T0, T1>(0, D0, 0, D1), input,
166  [&](const tbb::blocked_range2d<T0, T1>& r, R init)->R {
167  R sum = init;
168  for (T0 dim2 = r.rows().begin(); dim2 < r.rows().end(); dim2++) {
169  for (T1 dim1 = r.cols().begin(); dim1 < r.cols().end(); dim1++) {
170  sum += func(dim2, dim1);
171  }
172  }
173  return sum;
174  },
175  [](R x, R y)->R {
176  return x + y;
177  });
178 #else
179  R sum = input;
180 
181 #ifdef _MSC_VER
182  using T0_IT = typename std::make_signed<T0>::type;
183  using T1_IT = typename std::make_signed<T1>::type;
184 #else
185  using T0_IT = T0;
186  using T1_IT = T1;
187 #endif
188 
189 #if IE_THREAD == IE_THREAD_OMP
190  #pragma omp parallel for collapse(2) reduction(+ : sum) schedule(static)
191 #endif
192  for (T0_IT dim2 = 0; dim2 < D0; dim2++) {
193  for (T1_IT dim1 = 0; dim1 < D1; dim1++) {
194  sum += func(dim2, dim1);
195  }
196  }
197  return sum;
198 #endif
199 }
200 template <typename T0, typename T1, typename T2, typename R, typename F>
201 R parallel_sum3d(const T0 &D0, const T1 &D1, const T2 &D2, const R &input, const F &func) {
202 #if IE_THREAD == IE_THREAD_TBB
203  return tbb::parallel_reduce(
204  tbb::blocked_range3d<T0, T1, T2>(0, D0, 0, D1, 0, D2), input,
205  [&](const tbb::blocked_range3d<T0, T1, T2>& r, R init)->R {
206  R sum = init;
207  for (T0 dim1 = r.pages().begin(); dim1 < r.pages().end(); dim1++) {
208  for (T1 dim2 = r.rows().begin(); dim2 < r.rows().end(); dim2++) {
209  for (T2 dim3 = r.cols().begin(); dim3 < r.cols().end(); dim3++) {
210  sum += func(dim1, dim2, dim3);
211  }
212  }
213  }
214  return sum;
215  },
216  [](R x, R y)->R {
217  return x + y;
218  });
219 #else
220  R sum = input;
221 
222 #ifdef _MSC_VER
223  using T0_IT = typename std::make_signed<T0>::type;
224  using T1_IT = typename std::make_signed<T1>::type;
225  using T2_IT = typename std::make_signed<T2>::type;
226 #else
227  using T0_IT = T0;
228  using T1_IT = T1;
229  using T2_IT = T2;
230 #endif
231 
232 #if IE_THREAD == IE_THREAD_OMP
233  #pragma omp parallel for collapse(3) reduction(+ : sum) schedule(static)
234 #endif
235  for (T0_IT dim1 = 0; dim1 < static_cast<T0_IT>(D0); dim1++) {
236  for (T1_IT dim2 = 0; dim2 < static_cast<T1_IT>(D1); dim2++) {
237  for (T2_IT dim3 = 0; dim3 < static_cast<T2_IT>(D2); dim3++) {
238  sum += func(dim1, dim2, dim3);
239  }
240  }
241  }
242  return sum;
243 #endif
244 }
245 
/** @brief Recursion terminator: no dimensions left, the offset is returned unchanged. */
template<typename T>
inline T parallel_it_init(T start) {
    return start;
}

/**
 * @brief Decomposes a flat offset into per-dimension indices.
 *
 * Arguments after @p start come in (index, extent) pairs. The trailing pairs are
 * processed first, so the last pair receives the fastest-varying index.
 *
 * @param start Flat offset to decompose.
 * @param x     Receives this dimension's index: (remaining offset) % X.
 * @param X     Extent of this dimension.
 * @param tuple Remaining (index, extent) pairs.
 * @return The offset left after dividing by the extents of this and all trailing dimensions.
 */
template<typename T, typename Q, typename R, typename... Args>
inline T parallel_it_init(T start, Q &x, const R &X, Args &&... tuple) {
    // Let the trailing dimensions consume their share of the offset first.
    const T remaining = parallel_it_init(start, static_cast<Args>(tuple)...);
    x = remaining % X;
    return remaining / X;
}
254 
/** @brief Recursion terminator: with no dimensions left, always signals a carry. */
inline bool parallel_it_step() {
    return true;
}

/**
 * @brief Advances a multi-dimensional index by one position.
 *
 * Arguments come in (index, extent) pairs. The last pair is incremented first;
 * a dimension is incremented only when every dimension after it wrapped to 0.
 *
 * @param x     Index of this dimension, updated in place.
 * @param X     Extent of this dimension.
 * @param tuple Remaining (index, extent) pairs.
 * @return true when this dimension wrapped around to 0, false otherwise.
 */
template<typename Q, typename R, typename... Args>
inline bool parallel_it_step(Q &x, const R &X, Args &&... tuple) {
    const bool carry = parallel_it_step(static_cast<Args>(tuple)...);
    if (!carry) {
        return false;
    }

    x = (x + 1) % X;
    return x == 0;
}
264 
/**
 * @brief Computes the contiguous sub-range [n_start, n_end) of n items assigned to one worker.
 *
 * When team <= 1 or n == 0 the whole range [0, n) is returned. Otherwise the
 * leading workers receive ceil(n / team) items each and the remaining workers
 * receive one item fewer.
 *
 * @param n       Total number of items.
 * @param team    Number of workers.
 * @param tid     Index of the worker whose range is computed.
 * @param n_start Receives the first item index of the range.
 * @param n_end   Receives the one-past-last item index of the range.
 */
template <typename T, typename Q>
inline void splitter(const T &n, const Q &team, const Q &tid, T &n_start, T &n_end) {
    if (team <= 1 || n == 0) {
        n_start = 0;
        n_end = n;
    } else {
        const T workers = static_cast<T>(team);
        const T self = static_cast<T>(tid);

        const T big_chunk = (n + workers - 1) / workers;    // ceil(n / team)
        const T small_chunk = big_chunk - 1;
        const T big_count = n - small_chunk * workers;      // how many workers get big_chunk

        // n_end temporarily holds the chunk length; it becomes an end index below.
        if (self < big_count) {
            n_end = big_chunk;
        } else {
            n_end = small_chunk;
        }

        if (self <= big_count) {
            n_start = tid * big_chunk;
        } else {
            n_start = big_count * big_chunk + (self - big_count) * small_chunk;
        }
    }

    n_end += n_start;
}
280 
281 
/**
 * @brief Calls func(d0) for every d0 in the share of [0, D0) that splitter()
 *        assigns to worker @p ithr out of @p nthr workers.
 *
 * @param ithr Index of the calling worker.
 * @param nthr Total number of workers.
 * @param D0   Extent of the iteration space.
 * @param func Callable invoked with the current index.
 */
template <typename T0, typename F>
void for_1d(const int &ithr, const int &nthr, const T0 &D0, const F &func) {
    T0 cursor{ 0 };
    T0 stop{ 0 };
    splitter(D0, nthr, ithr, cursor, stop);

    while (cursor < stop) {
        func(cursor);
        ++cursor;
    }
}
288 
289 template <typename T0, typename F>
290 void parallel_for(const T0 &D0, const F &func) {
291 #if IE_THREAD == IE_THREAD_TBB
292  const int nthr = parallel_get_max_threads();
293  tbb::parallel_for(0, nthr, [&](int ithr) {
294  for_1d(ithr, nthr, D0, func);
295  });
296 #elif IE_THREAD == IE_THREAD_OMP
297  # pragma omp parallel
298  for_1d(parallel_get_thread_num(), parallel_get_num_threads(), D0, func);
299 #elif IE_THREAD == IE_THREAD_SEQ
300  for_1d(0, 1, D0, func);
301 #endif
302 }
303 
304 
305 template <typename T0, typename T1, typename F>
306 void for_2d(const int &ithr, const int &nthr, const T0 &D0, const T1 &D1, const F &func) {
307  const size_t work_amount = (size_t)D0 * D1;
308  if (work_amount == 0) return;
309  size_t start{ 0 }, end{ 0 };
310  splitter(work_amount, nthr, ithr, start, end);
311 
312  T0 d0{ 0 }; T1 d1{ 0 };
313  parallel_it_init(start, d0, D0, d1, D1);
314  for (size_t iwork = start; iwork < end; ++iwork) {
315  func(d0, d1);
316  parallel_it_step(d0, D0, d1, D1);
317  }
318 }
319 
320 template <typename T0, typename T1, typename F>
321 void parallel_for2d(const T0 &D0, const T1 &D1, const F &func) {
322 #if IE_THREAD == IE_THREAD_TBB
323  const int nthr = parallel_get_max_threads();
324  tbb::parallel_for(0, nthr, [&](int ithr) {
325  for_2d(ithr, nthr, D0, D1, func);
326  });
327 #elif IE_THREAD == IE_THREAD_OMP
328  # pragma omp parallel
329  for_2d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, func);
330 #elif IE_THREAD == IE_THREAD_SEQ
331  for_2d(0, 1, D0, D1, func);
332 #endif
333 }
334 
335 
336 template <typename T0, typename T1, typename T2, typename F>
337 void for_3d(const int &ithr, const int &nthr, const T0 &D0, const T1 &D1,
338  const T2 &D2, const F &func) {
339  const size_t work_amount = (size_t)D0 * D1 * D2;
340  if (work_amount == 0) return;
341  size_t start{ 0 }, end{ 0 };
342  splitter(work_amount, nthr, ithr, start, end);
343 
344  T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 };
345  parallel_it_init(start, d0, D0, d1, D1, d2, D2);
346  for (size_t iwork = start; iwork < end; ++iwork) {
347  func(d0, d1, d2);
348  parallel_it_step(d0, D0, d1, D1, d2, D2);
349  }
350 }
351 
352 template <typename T0, typename T1, typename T2, typename F>
353 void parallel_for3d(const T0 &D0, const T1 &D1, const T2 &D2, const F &func) {
354 #if IE_THREAD == IE_THREAD_TBB
355  const int nthr = parallel_get_max_threads();
356  tbb::parallel_for(0, nthr, [&](int ithr) {
357  for_3d(ithr, nthr, D0, D1, D2, func);
358  });
359 #elif IE_THREAD == IE_THREAD_OMP
360  # pragma omp parallel
361  for_3d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, func);
362 #elif IE_THREAD == IE_THREAD_SEQ
363  for_3d(0, 1, D0, D1, D2, func);
364 #endif
365 }
366 
367 template <typename T0, typename T1, typename T2, typename T3, typename F>
368 void for_4d(const int &ithr, const int &nthr, const T0 &D0, const T1 &D1,
369  const T2 &D2, const T3 &D3, const F &func) {
370  const size_t work_amount = (size_t)D0 * D1 * D2 * D3;
371  if (work_amount == 0) return;
372  size_t start{ 0 }, end{ 0 };
373  splitter(work_amount, nthr, ithr, start, end);
374 
375  T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 }; T3 d3{ 0 };
376  parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3);
377  for (size_t iwork = start; iwork < end; ++iwork) {
378  func(d0, d1, d2, d3);
379  parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3);
380  }
381 }
382 
383 template <typename T0, typename T1, typename T2, typename T3, typename F>
384 void parallel_for4d(const T0 &D0, const T1 &D1, const T2 &D2, const T3 &D3, const F &func) {
385 #if IE_THREAD == IE_THREAD_TBB
386  const int nthr = parallel_get_max_threads();
387  tbb::parallel_for(0, nthr, [&](int ithr) {
388  for_4d(ithr, nthr, D0, D1, D2, D3, func);
389  });
390 #elif IE_THREAD == IE_THREAD_OMP
391  # pragma omp parallel
392  for_4d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, func);
393 #elif IE_THREAD == IE_THREAD_SEQ
394  for_4d(0, 1, D0, D1, D2, D3, func);
395 #endif
396 }
397 
398 template <typename T0, typename T1, typename T2, typename T3, typename T4, typename F>
399 void for_5d(const int &ithr, const int &nthr, const T0 &D0, const T1 &D1,
400  const T2 &D2, const T3 &D3, const T4 &D4, const F &func) {
401  const size_t work_amount = (size_t)D0 * D1 * D2 * D3 * D4;
402  if (work_amount == 0) return;
403  size_t start{ 0 }, end{ 0 };
404  splitter(work_amount, nthr, ithr, start, end);
405 
406  T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 }; T3 d3{ 0 }; T4 d4{ 0 };
407  parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3, d4, D4);
408  for (size_t iwork = start; iwork < end; ++iwork) {
409  func(d0, d1, d2, d3, d4);
410  parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3, d4, D4);
411  }
412 }
413 
414 template <typename T0, typename T1, typename T2, typename T3, typename T4, typename F>
415 void parallel_for5d(const T0 &D0, const T1 &D1, const T2 &D2, const T3 &D3,
416  const T4 &D4, const F &func) {
417 #if IE_THREAD == IE_THREAD_TBB
418  const int nthr = parallel_get_max_threads();
419  tbb::parallel_for(0, nthr, [&](int ithr) {
420  for_5d(ithr, nthr, D0, D1, D2, D3, D4, func);
421  });
422 #elif IE_THREAD == IE_THREAD_OMP
423  # pragma omp parallel
424  for_5d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, func);
425 #elif IE_THREAD == IE_THREAD_SEQ
426  for_5d(0, 1, D0, D1, D2, D3, D4, func);
427 #endif
428 }
429 
430 
431 template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename F>
432 void for_6d(const int &ithr, const int &nthr, const T0 &D0, const T1 &D1,
433  const T2 &D2, const T3 &D3, const T4 &D4, const T5 &D5, F func) {
434  const size_t work_amount = (size_t)D0 * D1 * D2 * D3 * D4 * D5;
435  if (work_amount == 0) return;
436  size_t start{ 0 }, end{ 0 };
437  splitter(work_amount, nthr, ithr, start, end);
438 
439  T0 d0{ 0 }; T1 d1{ 0 }; T2 d2{ 0 }; T3 d3{ 0 }; T4 d4{ 0 }; T5 d5{ 0 };
440  parallel_it_init(start, d0, D0, d1, D1, d2, D2, d3, D3, d4, D4,
441  d5, D5);
442  for (size_t iwork = start; iwork < end; ++iwork) {
443  func(d0, d1, d2, d3, d4, d5);
444  parallel_it_step(d0, D0, d1, D1, d2, D2, d3, D3, d4, D4, d5, D5);
445  }
446 }
447 
448 template <typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename F>
449 void parallel_for6d(const T0 &D0, const T1 &D1, const T2 &D2, const T3 &D3,
450  const T4 &D4, const T5 &D5, F func) {
451 #if IE_THREAD == IE_THREAD_TBB
452  const int nthr = parallel_get_max_threads();
453  tbb::parallel_for(0, nthr, [&](int ithr) {
454  for_6d(ithr, nthr, D0, D1, D2, D3, D4, D5, func);
455  });
456 #elif IE_THREAD == IE_THREAD_OMP
457 # pragma omp parallel
458  for_6d(parallel_get_thread_num(), parallel_get_num_threads(), D0, D1, D2, D3, D4, D5, func);
459 #elif IE_THREAD == IE_THREAD_SEQ
460  for_6d(0, 1, D0, D1, D2, D3, D4, D5, func);
461 #endif
462 }
463 
464 } // namespace InferenceEngine
465 
Definition: ie_argmax_layer.hpp:11