Kokkos Core Kernels Package Version of the Day
Loading...
Searching...
No Matches
Kokkos_MemoryPool.hpp
1//@HEADER
2// ************************************************************************
3//
4// Kokkos v. 4.0
5// Copyright (2022) National Technology & Engineering
6// Solutions of Sandia, LLC (NTESS).
7//
8// Under the terms of Contract DE-NA0003525 with NTESS,
9// the U.S. Government retains certain rights in this software.
10//
11// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
12// See https://kokkos.org/LICENSE for license information.
13// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
14//
15//@HEADER
16
17#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
18#include <Kokkos_Macros.hpp>
19static_assert(false,
20 "Including non-public Kokkos header files is not allowed.");
21#endif
22#ifndef KOKKOS_MEMORYPOOL_HPP
23#define KOKKOS_MEMORYPOOL_HPP
24
25#include <Kokkos_Core_fwd.hpp>
26#include <Kokkos_Parallel.hpp>
27#include <Kokkos_Atomic.hpp>
28#include <impl/Kokkos_ConcurrentBitset.hpp>
29#include <impl/Kokkos_Error.hpp>
30#include <impl/Kokkos_SharedAlloc.hpp>
31
32namespace Kokkos {
33namespace Impl {
34/* Report violation of size constraints:
35 * min_block_alloc_size <= max_block_alloc_size
36 * max_block_alloc_size <= min_superblock_size
37 * min_superblock_size <= max_superblock_size
38 * min_superblock_size <= min_total_alloc_size
39 * min_superblock_size <= min_block_alloc_size *
40 * max_block_per_superblock
41 */
42void memory_pool_bounds_verification(size_t min_block_alloc_size,
43 size_t max_block_alloc_size,
44 size_t min_superblock_size,
45 size_t max_superblock_size,
46 size_t max_block_per_superblock,
47 size_t min_total_alloc_size);
48} // namespace Impl
49} // namespace Kokkos
50
51namespace Kokkos {
52
53namespace Impl {
54
55void _print_memory_pool_state(std::ostream &s, uint32_t const *sb_state_ptr,
56 int32_t sb_count, uint32_t sb_size_lg2,
57 uint32_t sb_state_size, uint32_t state_shift,
58 uint32_t state_used_mask);
59
60} // end namespace Impl
61
62template <typename DeviceType>
63class MemoryPool {
64 private:
65 using CB = Kokkos::Impl::concurrent_bitset;
66
67 enum : uint32_t { bits_per_int_lg2 = CB::bits_per_int_lg2 };
68 enum : uint32_t { state_shift = CB::state_shift };
69 enum : uint32_t { state_used_mask = CB::state_used_mask };
70 enum : uint32_t { state_header_mask = CB::state_header_mask };
71 enum : uint32_t { max_bit_count_lg2 = CB::max_bit_count_lg2 };
72 enum : uint32_t { max_bit_count = CB::max_bit_count };
73
74 enum : uint32_t { HINT_PER_BLOCK_SIZE = 2 };
75
76 /* Each superblock has a concurrent bitset state
77 * which is an array of uint32_t integers.
78 * [ { block_count_lg2 : state_shift bits
79 * , used_block_count : ( 32 - state_shift ) bits
80 * }
81 * , { block allocation bit set }* ]
82 *
83 * As superblocks are assigned (allocated) to a block size
84 * and released (deallocated) back to empty the superblock state
85 * is concurrently updated.
86 */
87
88 /* Mapping between block_size <-> block_state
89 *
90 * block_state = ( m_sb_size_lg2 - block_size_lg2 ) << state_shift
91 * block_size = m_sb_size_lg2 - ( block_state >> state_shift )
92 *
93 * Thus A_block_size < B_block_size <=> A_block_state > B_block_state
94 */
95
96 using base_memory_space = typename DeviceType::memory_space;
97
98 enum {
99 accessible = Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace,
100 base_memory_space>::accessible
101 };
102
103 using Tracker = Kokkos::Impl::SharedAllocationTracker;
104 using Record = Kokkos::Impl::SharedAllocationRecord<base_memory_space>;
105
106 Tracker m_tracker;
107 uint32_t *m_sb_state_array;
108 uint32_t m_sb_state_size;
109 uint32_t m_sb_size_lg2;
110 uint32_t m_max_block_size_lg2;
111 uint32_t m_min_block_size_lg2;
112 int32_t m_sb_count;
113 int32_t m_hint_offset; // Offset to K * #block_size array of hints
114 int32_t m_data_offset; // Offset to 0th superblock data
115 int32_t m_unused_padding;
116
117 public:
118 using memory_space = typename DeviceType::memory_space;
119
121 enum : uint32_t { max_superblock_size = 1LU << 31 /* 2 gigabytes */ };
122 enum : uint32_t { max_block_per_superblock = max_bit_count };
123
124 //--------------------------------------------------------------------------
125
126 KOKKOS_INLINE_FUNCTION
127 bool operator==(MemoryPool const &other) const {
128 return m_sb_state_array == other.m_sb_state_array;
129 }
130
131 KOKKOS_INLINE_FUNCTION
132 size_t capacity() const noexcept {
133 return size_t(m_sb_count) << m_sb_size_lg2;
134 }
135
136 KOKKOS_INLINE_FUNCTION
137 size_t min_block_size() const noexcept {
138 return (1LU << m_min_block_size_lg2);
139 }
140
141 KOKKOS_INLINE_FUNCTION
142 size_t max_block_size() const noexcept {
143 return (1LU << m_max_block_size_lg2);
144 }
145
146 struct usage_statistics {
147 size_t capacity_bytes;
148 size_t superblock_bytes;
149 size_t max_block_bytes;
150 size_t min_block_bytes;
151 size_t capacity_superblocks;
152 size_t consumed_superblocks;
153 size_t consumed_blocks;
154 size_t consumed_bytes;
155 size_t reserved_blocks;
156 size_t reserved_bytes;
157 };
158
159 void get_usage_statistics(usage_statistics &stats) const {
160 Kokkos::HostSpace host;
161
162 const size_t alloc_size = m_hint_offset * sizeof(uint32_t);
163
164 uint32_t *const sb_state_array =
165 accessible ? m_sb_state_array : (uint32_t *)host.allocate(alloc_size);
166
167 if (!accessible) {
168 Kokkos::Impl::DeepCopy<Kokkos::HostSpace, base_memory_space>(
169 sb_state_array, m_sb_state_array, alloc_size);
170 Kokkos::fence(
171 "MemoryPool::get_usage_statistics(): fence after copying state "
172 "array to HostSpace");
173 }
174
175 stats.superblock_bytes = (1LU << m_sb_size_lg2);
176 stats.max_block_bytes = (1LU << m_max_block_size_lg2);
177 stats.min_block_bytes = (1LU << m_min_block_size_lg2);
178 stats.capacity_bytes = stats.superblock_bytes * m_sb_count;
179 stats.capacity_superblocks = m_sb_count;
180 stats.consumed_superblocks = 0;
181 stats.consumed_blocks = 0;
182 stats.consumed_bytes = 0;
183 stats.reserved_blocks = 0;
184 stats.reserved_bytes = 0;
185
186 const uint32_t *sb_state_ptr = sb_state_array;
187
188 for (int32_t i = 0; i < m_sb_count; ++i, sb_state_ptr += m_sb_state_size) {
189 const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift;
190
191 if (block_count_lg2) {
192 const uint32_t block_count = 1u << block_count_lg2;
193 const uint32_t block_size_lg2 = m_sb_size_lg2 - block_count_lg2;
194 const uint32_t block_size = 1u << block_size_lg2;
195 const uint32_t block_used = (*sb_state_ptr) & state_used_mask;
196
197 stats.consumed_superblocks++;
198 stats.consumed_blocks += block_used;
199 stats.consumed_bytes += static_cast<size_t>(block_used) * block_size;
200 stats.reserved_blocks += block_count - block_used;
201 stats.reserved_bytes +=
202 static_cast<size_t>(block_count - block_used) * block_size;
203 }
204 }
205
206 if (!accessible) {
207 host.deallocate(sb_state_array, alloc_size);
208 }
209 }
210
211 void print_state(std::ostream &s) const {
212 Kokkos::HostSpace host;
213
214 const size_t alloc_size = m_hint_offset * sizeof(uint32_t);
215
216 uint32_t *const sb_state_array =
217 accessible ? m_sb_state_array : (uint32_t *)host.allocate(alloc_size);
218
219 if (!accessible) {
220 Kokkos::Impl::DeepCopy<Kokkos::HostSpace, base_memory_space>(
221 sb_state_array, m_sb_state_array, alloc_size);
222 Kokkos::fence(
223 "MemoryPool::print_state(): fence after copying state array to "
224 "HostSpace");
225 }
226
227 Impl::_print_memory_pool_state(s, sb_state_array, m_sb_count, m_sb_size_lg2,
228 m_sb_state_size, state_shift,
229 state_used_mask);
230
231 if (!accessible) {
232 host.deallocate(sb_state_array, alloc_size);
233 }
234 }
235
236 //--------------------------------------------------------------------------
237
238 KOKKOS_DEFAULTED_FUNCTION MemoryPool(MemoryPool &&) = default;
239 KOKKOS_DEFAULTED_FUNCTION MemoryPool(const MemoryPool &) = default;
240 KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(MemoryPool &&) = default;
241 KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(const MemoryPool &) = default;
242
243 KOKKOS_INLINE_FUNCTION MemoryPool()
244 : m_tracker(),
245 m_sb_state_array(nullptr),
246 m_sb_state_size(0),
247 m_sb_size_lg2(0),
248 m_max_block_size_lg2(0),
249 m_min_block_size_lg2(0),
250 m_sb_count(0),
251 m_hint_offset(0),
252 m_data_offset(0),
253 m_unused_padding(0) {}
254
269 MemoryPool(const base_memory_space &memspace,
270 const size_t min_total_alloc_size, size_t min_block_alloc_size = 0,
271 size_t max_block_alloc_size = 0, size_t min_superblock_size = 0)
272 : m_tracker(),
273 m_sb_state_array(nullptr),
274 m_sb_state_size(0),
275 m_sb_size_lg2(0),
276 m_max_block_size_lg2(0),
277 m_min_block_size_lg2(0),
278 m_sb_count(0),
279 m_hint_offset(0),
280 m_data_offset(0),
281 m_unused_padding(0) {
282 const uint32_t int_align_lg2 = 3; /* align as int[8] */
283 const uint32_t int_align_mask = (1u << int_align_lg2) - 1;
284 const uint32_t default_min_block_size = 1u << 6; /* 64 bytes */
285 const uint32_t default_max_block_size = 1u << 12; /* 4k bytes */
286 const uint32_t default_min_superblock_size = 1u << 20; /* 1M bytes */
287
288 //--------------------------------------------------
289 // Default block and superblock sizes:
290
291 if (0 == min_block_alloc_size) {
292 // Default all sizes:
293
294 min_superblock_size =
295 std::min(size_t(default_min_superblock_size), min_total_alloc_size);
296
297 min_block_alloc_size =
298 std::min(size_t(default_min_block_size), min_superblock_size);
299
300 max_block_alloc_size =
301 std::min(size_t(default_max_block_size), min_superblock_size);
302 } else if (0 == min_superblock_size) {
303 // Choose superblock size as minimum of:
304 // max_block_per_superblock * min_block_size
305 // max_superblock_size
306 // min_total_alloc_size
307
308 const size_t max_superblock =
309 min_block_alloc_size * max_block_per_superblock;
310
311 min_superblock_size =
312 std::min(max_superblock,
313 std::min(size_t(max_superblock_size), min_total_alloc_size));
314 }
315
316 if (0 == max_block_alloc_size) {
317 max_block_alloc_size = min_superblock_size;
318 }
319
320 //--------------------------------------------------
321
322 /* Enforce size constraints:
323 * min_block_alloc_size <= max_block_alloc_size
324 * max_block_alloc_size <= min_superblock_size
325 * min_superblock_size <= max_superblock_size
326 * min_superblock_size <= min_total_alloc_size
327 * min_superblock_size <= min_block_alloc_size *
328 * max_block_per_superblock
329 */
330
331 Kokkos::Impl::memory_pool_bounds_verification(
332 min_block_alloc_size, max_block_alloc_size, min_superblock_size,
333 max_superblock_size, max_block_per_superblock, min_total_alloc_size);
334
335 //--------------------------------------------------
336 // Block and superblock size is power of two:
337 // Maximum value is 'max_superblock_size'
338
339 m_min_block_size_lg2 =
340 Kokkos::Impl::integral_power_of_two_that_contains(min_block_alloc_size);
341
342 m_max_block_size_lg2 =
343 Kokkos::Impl::integral_power_of_two_that_contains(max_block_alloc_size);
344
345 m_sb_size_lg2 =
346 Kokkos::Impl::integral_power_of_two_that_contains(min_superblock_size);
347
348 {
349 // number of superblocks is multiple of superblock size that
350 // can hold min_total_alloc_size.
351
352 const uint64_t sb_size_mask = (1LU << m_sb_size_lg2) - 1;
353
354 m_sb_count = (min_total_alloc_size + sb_size_mask) >> m_sb_size_lg2;
355 }
356
357 {
358 // Any superblock can be assigned to the smallest size block
359 // Size the block bitset to maximum number of blocks
360
361 const uint32_t max_block_count_lg2 = m_sb_size_lg2 - m_min_block_size_lg2;
362
363 m_sb_state_size =
364 (CB::buffer_bound_lg2(max_block_count_lg2) + int_align_mask) &
365 ~int_align_mask;
366 }
367
368 // Array of all superblock states
369
370 const size_t all_sb_state_size =
371 (m_sb_count * m_sb_state_size + int_align_mask) & ~int_align_mask;
372
373 // Number of block sizes
374
375 const int32_t number_block_sizes =
376 1 + m_max_block_size_lg2 - m_min_block_size_lg2;
377
378 // Array length for possible block sizes
379 // Hint array is one uint32_t per block size
380
381 const int32_t block_size_array_size =
382 (number_block_sizes + int_align_mask) & ~int_align_mask;
383
384 m_hint_offset = all_sb_state_size;
385 m_data_offset = m_hint_offset + block_size_array_size * HINT_PER_BLOCK_SIZE;
386
387 // Allocation:
388
389 const size_t header_size = m_data_offset * sizeof(uint32_t);
390 const size_t alloc_size =
391 header_size + (size_t(m_sb_count) << m_sb_size_lg2);
392
393 Record *rec = Record::allocate(memspace, "Kokkos::MemoryPool", alloc_size);
394
395 m_tracker.assign_allocated_record_to_uninitialized(rec);
396
397 m_sb_state_array = (uint32_t *)rec->data();
398
399 Kokkos::HostSpace host;
400
401 uint32_t *const sb_state_array =
402 accessible ? m_sb_state_array : (uint32_t *)host.allocate(header_size);
403
404 for (int32_t i = 0; i < m_data_offset; ++i) sb_state_array[i] = 0;
405
406 // Initial assignment of empty superblocks to block sizes:
407
408 for (int32_t i = 0; i < number_block_sizes; ++i) {
409 const uint32_t block_size_lg2 = i + m_min_block_size_lg2;
410 const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2;
411 const uint32_t block_state = block_count_lg2 << state_shift;
412 const uint32_t hint_begin = m_hint_offset + i * HINT_PER_BLOCK_SIZE;
413
414 // for block size index 'i':
415 // sb_id_hint = sb_state_array[ hint_begin ];
416 // sb_id_begin = sb_state_array[ hint_begin + 1 ];
417
418 const int32_t jbeg = (i * m_sb_count) / number_block_sizes;
419 const int32_t jend = ((i + 1) * m_sb_count) / number_block_sizes;
420
421 sb_state_array[hint_begin] = uint32_t(jbeg);
422 sb_state_array[hint_begin + 1] = uint32_t(jbeg);
423
424 for (int32_t j = jbeg; j < jend; ++j) {
425 sb_state_array[j * m_sb_state_size] = block_state;
426 }
427 }
428
429 // Write out initialized state:
430
431 if (!accessible) {
432 Kokkos::Impl::DeepCopy<base_memory_space, Kokkos::HostSpace>(
433 m_sb_state_array, sb_state_array, header_size);
434 Kokkos::fence(
435 "MemoryPool::MemoryPool(): fence after copying state array from "
436 "HostSpace");
437
438 host.deallocate(sb_state_array, header_size);
439 } else {
440 Kokkos::memory_fence();
441 }
442 }
443
444 //--------------------------------------------------------------------------
445
446 private:
447 /* Given a size 'n' get the block size in which it can be allocated.
448 * Restrict lower bound to minimum block size.
449 */
450 KOKKOS_FORCEINLINE_FUNCTION
451 uint32_t get_block_size_lg2(uint32_t n) const noexcept {
452 const unsigned i = Kokkos::Impl::integral_power_of_two_that_contains(n);
453
454 return i < m_min_block_size_lg2 ? m_min_block_size_lg2 : i;
455 }
456
457 public:
458 /* Return 0 for invalid block size */
459 KOKKOS_INLINE_FUNCTION
460 uint32_t allocate_block_size(uint64_t alloc_size) const noexcept {
461 return alloc_size <= (1UL << m_max_block_size_lg2)
462 ? (1UL << get_block_size_lg2(uint32_t(alloc_size)))
463 : 0;
464 }
465
466 //--------------------------------------------------------------------------
476 KOKKOS_FUNCTION
477 void *allocate(size_t alloc_size, int32_t attempt_limit = 1) const noexcept {
478 if (size_t(1LU << m_max_block_size_lg2) < alloc_size) {
479 Kokkos::abort(
480 "Kokkos MemoryPool allocation request exceeded specified maximum "
481 "allocation size");
482 }
483
484 if (0 == alloc_size) return nullptr;
485
486 void *p = nullptr;
487
488 const uint32_t block_size_lg2 = get_block_size_lg2(alloc_size);
489
490 // Allocation will fit within a superblock
491 // that has block sizes ( 1 << block_size_lg2 )
492
493 const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2;
494 const uint32_t block_state = block_count_lg2 << state_shift;
495 const uint32_t block_count = 1u << block_count_lg2;
496
497 // Superblock hints for this block size:
498 // hint_sb_id_ptr[0] is the dynamically changing hint
499 // hint_sb_id_ptr[1] is the static start point
500
501 volatile uint32_t *const hint_sb_id_ptr =
502 m_sb_state_array /* memory pool state array */
503 + m_hint_offset /* offset to hint portion of array */
504 + HINT_PER_BLOCK_SIZE /* number of hints per block size */
505 * (block_size_lg2 - m_min_block_size_lg2); /* block size id */
506
507 const int32_t sb_id_begin = int32_t(hint_sb_id_ptr[1]);
508
509 // Fast query clock register 'tic' to pseudo-randomize
510 // the guess for which block within a superblock should
511 // be claimed. If not available then a search occurs.
512#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ARCH_INTEL_GPU)
513 const uint32_t block_id_hint = alloc_size;
514#else
515 const uint32_t block_id_hint =
516 (uint32_t)(Kokkos::Impl::clock_tic()
517#ifdef __CUDA_ARCH__ // FIXME_CUDA
518 // Spread out potentially concurrent access
519 // by threads within a warp or thread block.
520 + (threadIdx.x + blockDim.x * threadIdx.y)
521#endif
522 );
523#endif
524
525 // expected state of superblock for allocation
526 uint32_t sb_state = block_state;
527
528 int32_t sb_id = -1;
529
530 volatile uint32_t *sb_state_array = nullptr;
531
532 while (attempt_limit) {
533 int32_t hint_sb_id = -1;
534
535 if (sb_id < 0) {
536 // No superblock specified, try the hint for this block size
537
538 sb_id = hint_sb_id = int32_t(*hint_sb_id_ptr);
539
540 sb_state_array = m_sb_state_array + (sb_id * m_sb_state_size);
541 }
542
543 // Require:
544 // 0 <= sb_id
545 // sb_state_array == m_sb_state_array + m_sb_state_size * sb_id
546
547 if (sb_state == (state_header_mask & *sb_state_array)) {
548 // This superblock state is as expected, for the moment.
549 // Attempt to claim a bit. The attempt updates the state
550 // so have already made sure the state header is as expected.
551
552 const uint32_t count_lg2 = sb_state >> state_shift;
553 const uint32_t mask = (1u << count_lg2) - 1;
554
555 const Kokkos::pair<int, int> result = CB::acquire_bounded_lg2(
556 sb_state_array, count_lg2, block_id_hint & mask, sb_state);
557
558 // If result.first < 0 then failed to acquire
559 // due to either full or buffer was wrong state.
560 // Could be wrong state if a deallocation raced the
561 // superblock to empty before the acquire could succeed.
562
563 if (0 <= result.first) { // acquired a bit
564
565 const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2;
566
567 // Set the allocated block pointer
568
569 p = ((char *)(m_sb_state_array + m_data_offset)) +
570 (uint64_t(sb_id) << m_sb_size_lg2) // superblock memory
571 + (uint64_t(result.first) << size_lg2); // block memory
572
573 break; // Success
574 }
575 }
576 //------------------------------------------------------------------
577 // Arrive here if failed to acquire a block.
578 // Must find a new superblock.
579
580 // Start searching at designated index for this block size.
581 // Look for superblock that, in preferential order,
582 // 1) part-full superblock of this block size
583 // 2) empty superblock to claim for this block size
584 // 3) part-full superblock of the next larger block size
585
586 sb_state = block_state; // Expect to find the desired state
587 sb_id = -1;
588
589 bool update_hint = false;
590 int32_t sb_id_empty = -1;
591 int32_t sb_id_large = -1;
592 uint32_t sb_state_large = 0;
593
594 sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size;
595
596 for (int32_t i = 0, id = sb_id_begin; i < m_sb_count; ++i) {
597 // Query state of the candidate superblock.
598 // Note that the state may change at any moment
599 // as concurrent allocations and deallocations occur.
600
601 const uint32_t full_state = *sb_state_array;
602 const uint32_t used = full_state & state_used_mask;
603 const uint32_t state = full_state & state_header_mask;
604
605 if (state == block_state) {
606 // Superblock is assigned to this block size
607
608 if (used < block_count) {
609 // There is room to allocate one block
610
611 sb_id = id;
612
613 // Is there room to allocate more than one block?
614
615 update_hint = used + 1 < block_count;
616
617 break;
618 }
619 } else if (0 == used) {
620 // Superblock is empty
621
622 if (-1 == sb_id_empty) {
623 // Superblock is not assigned to this block size
624 // and is the first empty superblock encountered.
625 // Save this id to use if a partfull superblock is not found.
626
627 sb_id_empty = id;
628 }
629 } else if ((-1 == sb_id_empty /* have not found an empty */) &&
630 (-1 == sb_id_large /* have not found a larger */) &&
631 (state < block_state /* a larger block */) &&
632 // is not full:
633 (used < (1u << (state >> state_shift)))) {
634 // First superblock encountered that is
635 // larger than this block size and
636 // has room for an allocation.
637 // Save this id to use of partfull or empty superblock not found
638 sb_id_large = id;
639 sb_state_large = state;
640 }
641
642 // Iterate around the superblock array:
643
644 if (++id < m_sb_count) {
645 sb_state_array += m_sb_state_size;
646 } else {
647 id = 0;
648 sb_state_array = m_sb_state_array;
649 }
650 }
651
652 // printf(" search m_sb_count(%d) sb_id(%d) sb_id_empty(%d)
653 // sb_id_large(%d)\n" , m_sb_count , sb_id , sb_id_empty , sb_id_large);
654
655 if (sb_id < 0) {
656 // Did not find a partfull superblock for this block size.
657
658 if (0 <= sb_id_empty) {
659 // Found first empty superblock following designated superblock
660 // Attempt to claim it for this block size.
661 // If the claim fails assume that another thread claimed it
662 // for this block size and try to use it anyway,
663 // but do not update hint.
664
665 sb_id = sb_id_empty;
666
667 sb_state_array = m_sb_state_array + (sb_id * m_sb_state_size);
668
669 // If successfully changed assignment of empty superblock 'sb_id'
670 // to this block_size then update the hint.
671
672 const uint32_t state_empty = state_header_mask & *sb_state_array;
673
674 // If this thread claims the empty block then update the hint
675 update_hint =
676 state_empty == Kokkos::atomic_compare_exchange(
677 sb_state_array, state_empty, block_state);
678 } else if (0 <= sb_id_large) {
679 // Found a larger superblock with space available
680
681 sb_id = sb_id_large;
682 sb_state = sb_state_large;
683
684 sb_state_array = m_sb_state_array + (sb_id * m_sb_state_size);
685 } else {
686 // Did not find a potentially usable superblock
687 --attempt_limit;
688 }
689 }
690
691 if (update_hint) {
692 Kokkos::atomic_compare_exchange(hint_sb_id_ptr, uint32_t(hint_sb_id),
693 uint32_t(sb_id));
694 }
695 } // end allocation attempt loop
696 //--------------------------------------------------------------------
697
698 return p;
699 }
700 // end allocate
701 //--------------------------------------------------------------------------
702
709 KOKKOS_INLINE_FUNCTION
710 void deallocate(void *p, size_t /* alloc_size */) const noexcept {
711 if (nullptr == p) return;
712
713 // Determine which superblock and block
714 const ptrdiff_t d =
715 static_cast<char *>(p) -
716 reinterpret_cast<char *>(m_sb_state_array + m_data_offset);
717
718 // Verify contained within the memory pool's superblocks:
719 const int ok_contains =
720 (0 <= d) && (size_t(d) < (size_t(m_sb_count) << m_sb_size_lg2));
721
722 int ok_block_aligned = 0;
723 int ok_dealloc_once = 0;
724
725 if (ok_contains) {
726 const int sb_id = d >> m_sb_size_lg2;
727
728 // State array for the superblock.
729 volatile uint32_t *const sb_state_array =
730 m_sb_state_array + (sb_id * m_sb_state_size);
731
732 const uint32_t block_state = (*sb_state_array) & state_header_mask;
733 const uint32_t block_size_lg2 =
734 m_sb_size_lg2 - (block_state >> state_shift);
735
736 ok_block_aligned = 0 == (d & ((1UL << block_size_lg2) - 1));
737
738 if (ok_block_aligned) {
739 // Map address to block's bit
740 // mask into superblock and then shift down for block index
741
742 const uint32_t bit =
743 (d & (ptrdiff_t(1LU << m_sb_size_lg2) - 1)) >> block_size_lg2;
744
745 const int result = CB::release(sb_state_array, bit, block_state);
746
747 ok_dealloc_once = 0 <= result;
748 }
749 }
750
751 if (!ok_contains || !ok_block_aligned || !ok_dealloc_once) {
752 Kokkos::abort("Kokkos MemoryPool::deallocate given erroneous pointer");
753 }
754 }
755 // end deallocate
756 //--------------------------------------------------------------------------
757
758 KOKKOS_INLINE_FUNCTION
759 int number_of_superblocks() const noexcept { return m_sb_count; }
760
761 KOKKOS_INLINE_FUNCTION
762 void superblock_state(int sb_id, int &block_size, int &block_count_capacity,
763 int &block_count_used) const noexcept {
764 block_size = 0;
765 block_count_capacity = 0;
766 block_count_used = 0;
767
768 bool can_access_state_array = []() {
769 KOKKOS_IF_ON_HOST(
770 (return SpaceAccessibility<DefaultHostExecutionSpace,
771 base_memory_space>::accessible;))
772 KOKKOS_IF_ON_DEVICE(
773 (return SpaceAccessibility<DefaultExecutionSpace,
774 base_memory_space>::accessible;))
775 }();
776
777 if (can_access_state_array) {
778 // Can access the state array
779
780 const uint32_t state =
781 ((uint32_t volatile *)m_sb_state_array)[sb_id * m_sb_state_size];
782
783 const uint32_t block_count_lg2 = state >> state_shift;
784 const uint32_t block_used = state & state_used_mask;
785
786 block_size = 1LU << (m_sb_size_lg2 - block_count_lg2);
787 block_count_capacity = 1LU << block_count_lg2;
788 block_count_used = block_used;
789 }
790 }
791};
792
793} // namespace Kokkos
794
795#endif /* #ifndef KOKKOS_MEMORYPOOL_HPP */
Atomic functions.
Declaration of parallel operators.
void * allocate(const ExecutionSpace &, const size_t arg_alloc_size) const
Allocate untracked memory in the space.
void deallocate(void *const arg_alloc_ptr, const size_t arg_alloc_size) const
Deallocate untracked memory in the space.
ScopeGuard Some user scope issues have been identified with some Kokkos::finalize calls; ScopeGuard a...
first_type first
The first element of the pair.