Skip to content
This repository was archived by the owner on Mar 21, 2024. It is now read-only.

Commit 892e13a

Browse files
authored
Merge pull request #1671 from senior-zero/fix-main/github/reduce_by_int_overflow
Fix thrust::reduce_by_key for 2^31 elements
2 parents 70c24e2 + 9ef3a8e commit 892e13a

File tree

2 files changed

+173
-21
lines changed

2 files changed

+173
-21
lines changed

testing/cuda/reduce_by_key.cu

Lines changed: 110 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
1-
#include <unittest/unittest.h>
2-
#include <thrust/reduce.h>
1+
#include <thrust/equal.h>
32
#include <thrust/execution_policy.h>
3+
#include <thrust/iterator/counting_iterator.h>
4+
#include <thrust/iterator/transform_iterator.h>
5+
#include <thrust/reduce.h>
6+
#include <unittest/unittest.h>
7+
8+
#include <cstdint>
49

510

611
template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Iterator5>
@@ -286,3 +291,106 @@ void TestReduceByKeyCudaStreamsNoSync()
286291
}
287292
DECLARE_UNITTEST(TestReduceByKeyCudaStreamsNoSync);
288293

294+
295+
// Maps indices to key ids
296+
class div_op : public thrust::unary_function<std::int64_t, std::int64_t>
297+
{
298+
std::int64_t m_divisor;
299+
300+
public:
301+
__host__ div_op(std::int64_t divisor)
302+
: m_divisor(divisor)
303+
{}
304+
305+
__host__ __device__
306+
std::int64_t operator()(std::int64_t x) const
307+
{
308+
return x / m_divisor;
309+
}
310+
};
311+
312+
// Produces unique sequence for key
313+
class mod_op : public thrust::unary_function<std::int64_t, std::int64_t>
314+
{
315+
std::int64_t m_divisor;
316+
317+
public:
318+
__host__ mod_op(std::int64_t divisor)
319+
: m_divisor(divisor)
320+
{}
321+
322+
__host__ __device__
323+
std::int64_t operator()(std::int64_t x) const
324+
{
325+
// div: 2
326+
// idx: 0 1 2 3 4 5
327+
// key: 0 0 | 1 1 | 2 2
328+
// mod: 0 1 | 0 1 | 0 1
329+
// ret: 0 1 1 2 2 3
330+
return (x % m_divisor) + (x / m_divisor);
331+
}
332+
};
333+
334+
335+
void TestReduceByKeyWithBigIndexesHelper(int magnitude)
336+
{
337+
const std::int64_t key_size_magnitude = 8;
338+
ASSERT_EQUAL(true, key_size_magnitude < magnitude);
339+
340+
const std::int64_t num_items = 1ll << magnitude;
341+
const std::int64_t num_unique_keys = 1ll << key_size_magnitude;
342+
343+
// Size of each key group
344+
const std::int64_t key_size = num_items / num_unique_keys;
345+
346+
using counting_it = thrust::counting_iterator<std::int64_t>;
347+
using transform_key_it = thrust::transform_iterator<div_op, counting_it>;
348+
using transform_val_it = thrust::transform_iterator<mod_op, counting_it>;
349+
350+
counting_it count_begin(0ll);
351+
counting_it count_end = count_begin + num_items;
352+
ASSERT_EQUAL(static_cast<std::int64_t>(thrust::distance(count_begin, count_end)),
353+
num_items);
354+
355+
transform_key_it keys_begin(count_begin, div_op{key_size});
356+
transform_key_it keys_end(count_end, div_op{key_size});
357+
358+
transform_val_it values_begin(count_begin, mod_op{key_size});
359+
360+
thrust::device_vector<std::int64_t> output_keys(num_unique_keys);
361+
thrust::device_vector<std::int64_t> output_values(num_unique_keys);
362+
363+
// example:
364+
// items: 6
365+
// unique_keys: 2
366+
// key_size: 3
367+
// keys: 0 0 0 | 1 1 1
368+
// values: 0 1 2 | 1 2 3
369+
// result: 3 6 = sum(range(key_size)) + key_size * key_id
370+
thrust::reduce_by_key(keys_begin,
371+
keys_end,
372+
values_begin,
373+
output_keys.begin(),
374+
output_values.begin());
375+
376+
ASSERT_EQUAL(
377+
true,
378+
thrust::equal(output_keys.begin(), output_keys.end(), count_begin));
379+
380+
thrust::host_vector<std::int64_t> result = output_values;
381+
382+
const std::int64_t sum = (key_size - 1) * key_size / 2;
383+
for (std::int64_t key_id = 0; key_id < num_unique_keys; key_id++)
384+
{
385+
ASSERT_EQUAL(result[key_id], sum + key_id * key_size);
386+
}
387+
}
388+
389+
void TestReduceByKeyWithBigIndexes()
390+
{
391+
TestReduceByKeyWithBigIndexesHelper(30);
392+
TestReduceByKeyWithBigIndexesHelper(31);
393+
TestReduceByKeyWithBigIndexesHelper(32);
394+
TestReduceByKeyWithBigIndexesHelper(33);
395+
}
396+
DECLARE_UNITTEST(TestReduceByKeyWithBigIndexes);

thrust/system/cuda/detail/reduce_by_key.h

Lines changed: 63 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -445,8 +445,9 @@ namespace __reduce_by_key {
445445
{
446446
if (segment_flags[ITEM])
447447
{
448-
storage.raw_exchange[segment_indices[ITEM] -
449-
num_tile_segments_prefix] = scatter_items[ITEM];
448+
int idx = static_cast<int>(segment_indices[ITEM] -
449+
num_tile_segments_prefix);
450+
storage.raw_exchange[idx] = scatter_items[ITEM];
450451
}
451452
}
452453

@@ -786,7 +787,7 @@ namespace __reduce_by_key {
786787
// so just assign one tile per block
787788
//
788789
int tile_idx = blockIdx.x;
789-
Size tile_offset = tile_idx * ITEMS_PER_TILE;
790+
Size tile_offset = static_cast<Size>(tile_idx) * ITEMS_PER_TILE;
790791
Size num_remaining = num_items - tile_offset;
791792

792793
if (num_remaining > ITEMS_PER_TILE)
@@ -962,7 +963,8 @@ namespace __reduce_by_key {
962963
return status;
963964
}
964965

965-
template <typename Derived,
966+
template <typename Size,
967+
typename Derived,
966968
typename KeysInputIt,
967969
typename ValuesInputIt,
968970
typename KeysOutputIt,
@@ -971,24 +973,23 @@ namespace __reduce_by_key {
971973
typename ReductionOp>
972974
THRUST_RUNTIME_FUNCTION
973975
pair<KeysOutputIt, ValuesOutputIt>
974-
reduce_by_key(execution_policy<Derived>& policy,
975-
KeysInputIt keys_first,
976-
KeysInputIt keys_last,
977-
ValuesInputIt values_first,
978-
KeysOutputIt keys_output,
979-
ValuesOutputIt values_output,
980-
EqualityOp equality_op,
981-
ReductionOp reduction_op)
976+
reduce_by_key_dispatch(execution_policy<Derived>& policy,
977+
KeysInputIt keys_first,
978+
Size num_items,
979+
ValuesInputIt values_first,
980+
KeysOutputIt keys_output,
981+
ValuesOutputIt values_output,
982+
EqualityOp equality_op,
983+
ReductionOp reduction_op)
982984
{
983-
typedef int size_type;
984-
985-
size_type num_items = static_cast<size_type>(thrust::distance(keys_first, keys_last));
986985
size_t temp_storage_bytes = 0;
987986
cudaStream_t stream = cuda_cub::stream(policy);
988987
bool debug_sync = THRUST_DEBUG_SYNC_FLAG;
989988

990989
if (num_items == 0)
990+
{
991991
return thrust::make_pair(keys_output, values_output);
992+
}
992993

993994
cudaError_t status;
994995
status = doit_step(NULL,
@@ -997,15 +998,15 @@ namespace __reduce_by_key {
997998
values_first,
998999
keys_output,
9991000
values_output,
1000-
reinterpret_cast<size_type*>(NULL),
1001+
reinterpret_cast<Size*>(NULL),
10011002
equality_op,
10021003
reduction_op,
10031004
num_items,
10041005
stream,
10051006
debug_sync);
10061007
cuda_cub::throw_on_error(status, "reduce_by_key failed on 1st step");
10071008

1008-
size_t allocation_sizes[2] = {sizeof(size_type), temp_storage_bytes};
1009+
size_t allocation_sizes[2] = {sizeof(Size), temp_storage_bytes};
10091010
void * allocations[2] = {NULL, NULL};
10101011

10111012
size_t storage_size = 0;
@@ -1026,8 +1027,8 @@ namespace __reduce_by_key {
10261027
allocation_sizes);
10271028
cuda_cub::throw_on_error(status, "reduce failed on 2nd alias_storage");
10281029

1029-
size_type* d_num_runs_out
1030-
= thrust::detail::aligned_reinterpret_cast<size_type*>(allocations[0]);
1030+
Size* d_num_runs_out
1031+
= thrust::detail::aligned_reinterpret_cast<Size*>(allocations[0]);
10311032

10321033
status = doit_step(allocations[1],
10331034
temp_storage_bytes,
@@ -1054,6 +1055,49 @@ namespace __reduce_by_key {
10541055
);
10551056
}
10561057

1058+
template <typename Derived,
1059+
typename KeysInputIt,
1060+
typename ValuesInputIt,
1061+
typename KeysOutputIt,
1062+
typename ValuesOutputIt,
1063+
typename EqualityOp,
1064+
typename ReductionOp>
1065+
THRUST_RUNTIME_FUNCTION
1066+
pair<KeysOutputIt, ValuesOutputIt>
1067+
reduce_by_key(execution_policy<Derived>& policy,
1068+
KeysInputIt keys_first,
1069+
KeysInputIt keys_last,
1070+
ValuesInputIt values_first,
1071+
KeysOutputIt keys_output,
1072+
ValuesOutputIt values_output,
1073+
EqualityOp equality_op,
1074+
ReductionOp reduction_op)
1075+
{
1076+
using size_type = typename iterator_traits<KeysInputIt>::difference_type;
1077+
1078+
size_type num_items = thrust::distance(keys_first, keys_last);
1079+
1080+
if (num_items == 0)
1081+
{
1082+
return thrust::make_pair(keys_output, values_output);
1083+
}
1084+
1085+
pair<KeysOutputIt, ValuesOutputIt> result{};
1086+
THRUST_INDEX_TYPE_DISPATCH(result,
1087+
reduce_by_key_dispatch,
1088+
num_items,
1089+
(policy,
1090+
keys_first,
1091+
num_items_fixed,
1092+
values_first,
1093+
keys_output,
1094+
values_output,
1095+
equality_op,
1096+
reduction_op));
1097+
1098+
return result;
1099+
}
1100+
10571101
} // namespace __reduce_by_key
10581102

10591103
//-------------------------

0 commit comments

Comments
 (0)