Skip to content

Commit 4481142

Browse files
authored
Change cudf::test::make_null_mask to also return null-count (#13081)
Change the `cudf::test::make_null_mask` to return both the null-mask and the null-count. Callers can then use this null-count instead of `UNKNOWN_NULL_COUNT`. These changes include removing `UNKNOWN_NULL_COUNT` usage from the libcudf C++ test source code. One side-effect found that strings column with all nulls can technically have no children but using `UNKNOWN_NULL_COUNT` allowed the check for this to be bypassed. Therefore many utilities started to fail when `UNKNOWN_NULL_COUNT` was removed. The factory was modified to remove the check which results in an offsets column and an empty chars column as children. More code will likely need to be change when the `UNKNOWN_NULL_COUNT` is no longer used as a default parameter for factories and other column functions. No behavior is changed. Since the `cudf::test::make_null_mask` is technically a public API, this PR could be marked as a breaking change as well. Contributes to: #11968 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - MithunR (https://github.com/mythrocks) - Vyas Ramasubramani (https://github.com/vyasr) URL: #13081
1 parent 5c93b44 commit 4481142

40 files changed

+626
-618
lines changed

cpp/include/cudf/strings/detail/strings_column_factories.cuh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,6 @@ std::unique_ptr<column> make_strings_column(CharIterator chars_begin,
170170
size_type bytes = std::distance(chars_begin, chars_end) * sizeof(char);
171171
if (strings_count == 0) return make_empty_column(type_id::STRING);
172172

173-
CUDF_EXPECTS(null_count < strings_count, "null strings column not yet supported");
174173
CUDF_EXPECTS(bytes >= 0, "invalid offsets data");
175174

176175
// build offsets column -- this is the number of strings + 1

cpp/include/cudf_test/column_utilities.hpp

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -254,26 +254,26 @@ std::pair<thrust::host_vector<T>, std::vector<bitmask_type>> to_host(column_view
254254
template <>
255255
inline std::pair<thrust::host_vector<std::string>, std::vector<bitmask_type>> to_host(column_view c)
256256
{
257-
auto const scv = strings_column_view(c);
258-
auto const h_chars = cudf::detail::make_std_vector_sync<char>(
259-
cudf::device_span<char const>(scv.chars().data<char>(), scv.chars().size()),
260-
cudf::get_default_stream());
261-
auto const h_offsets = cudf::detail::make_std_vector_sync(
262-
cudf::device_span<cudf::offset_type const>(
263-
scv.offsets().data<cudf::offset_type>() + scv.offset(), scv.size() + 1),
264-
cudf::get_default_stream());
265-
266-
// build std::string vector from chars and offsets
267-
std::vector<std::string> host_data;
268-
host_data.reserve(c.size());
269-
std::transform(
270-
std::begin(h_offsets),
271-
std::end(h_offsets) - 1,
272-
std::begin(h_offsets) + 1,
273-
std::back_inserter(host_data),
274-
[&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); });
275-
276-
return {host_data, bitmask_to_host(c)};
257+
thrust::host_vector<std::string> host_data(c.size());
258+
if (c.size() > c.null_count()) {
259+
auto const scv = strings_column_view(c);
260+
auto const h_chars = cudf::detail::make_std_vector_sync<char>(
261+
cudf::device_span<char const>(scv.chars().data<char>(), scv.chars().size()),
262+
cudf::get_default_stream());
263+
auto const h_offsets = cudf::detail::make_std_vector_sync(
264+
cudf::device_span<cudf::offset_type const>(
265+
scv.offsets().data<cudf::offset_type>() + scv.offset(), scv.size() + 1),
266+
cudf::get_default_stream());
267+
268+
// build std::string vector from chars and offsets
269+
std::transform(
270+
std::begin(h_offsets),
271+
std::end(h_offsets) - 1,
272+
std::begin(h_offsets) + 1,
273+
host_data.begin(),
274+
[&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); });
275+
}
276+
return {std::move(host_data), bitmask_to_host(c)};
277277
}
278278

279279
} // namespace cudf::test

cpp/include/cudf_test/column_wrapper.hpp

Lines changed: 62 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -240,16 +240,23 @@ rmm::device_buffer make_elements(InputIterator begin, InputIterator end)
240240
* element in `[begin,end)` that evaluated to `true`.
241241
*/
242242
template <typename ValidityIterator>
243-
std::vector<bitmask_type> make_null_mask_vector(ValidityIterator begin, ValidityIterator end)
243+
std::pair<std::vector<bitmask_type>, cudf::size_type> make_null_mask_vector(ValidityIterator begin,
244+
ValidityIterator end)
244245
{
245246
auto const size = cudf::distance(begin, end);
246247
auto const num_words = cudf::bitmask_allocation_size_bytes(size) / sizeof(bitmask_type);
247248

248-
auto null_mask = std::vector<bitmask_type>(num_words, 0);
249-
for (auto i = 0; i < size; ++i)
250-
if (*(begin + i)) set_bit_unsafe(null_mask.data(), i);
249+
auto null_mask = std::vector<bitmask_type>(num_words, 0);
250+
auto null_count = cudf::size_type{0};
251+
for (auto i = 0; i < size; ++i) {
252+
if (*(begin + i)) {
253+
set_bit_unsafe(null_mask.data(), i);
254+
} else {
255+
++null_count;
256+
}
257+
}
251258

252-
return null_mask;
259+
return {std::move(null_mask), null_count};
253260
}
254261

255262
/**
@@ -266,12 +273,14 @@ std::vector<bitmask_type> make_null_mask_vector(ValidityIterator begin, Validity
266273
* element in `[begin,end)` that evaluated to `true`.
267274
*/
268275
template <typename ValidityIterator>
269-
rmm::device_buffer make_null_mask(ValidityIterator begin, ValidityIterator end)
276+
std::pair<rmm::device_buffer, cudf::size_type> make_null_mask(ValidityIterator begin,
277+
ValidityIterator end)
270278
{
271-
auto null_mask = make_null_mask_vector(begin, end);
272-
return rmm::device_buffer{null_mask.data(),
273-
null_mask.size() * sizeof(decltype(null_mask.front())),
274-
cudf::get_default_stream()};
279+
auto [null_mask, null_count] = make_null_mask_vector(begin, end);
280+
auto d_mask = rmm::device_buffer{null_mask.data(),
281+
cudf::bitmask_allocation_size_bytes(cudf::distance(begin, end)),
282+
cudf::get_default_stream()};
283+
return {std::move(d_mask), null_count};
275284
}
276285

277286
/**
@@ -319,10 +328,12 @@ class fixed_width_column_wrapper : public detail::column_wrapper {
319328
fixed_width_column_wrapper() : column_wrapper{}
320329
{
321330
std::vector<ElementTo> empty;
322-
wrapped.reset(new cudf::column{
323-
cudf::data_type{cudf::type_to_id<ElementTo>()},
324-
0,
325-
detail::make_elements<ElementTo, SourceElementT>(empty.begin(), empty.end())});
331+
wrapped.reset(
332+
new cudf::column{cudf::data_type{cudf::type_to_id<ElementTo>()},
333+
0,
334+
detail::make_elements<ElementTo, SourceElementT>(empty.begin(), empty.end()),
335+
rmm::device_buffer{},
336+
0});
326337
}
327338

328339
/**
@@ -349,7 +360,9 @@ class fixed_width_column_wrapper : public detail::column_wrapper {
349360
auto const size = cudf::distance(begin, end);
350361
wrapped.reset(new cudf::column{cudf::data_type{cudf::type_to_id<ElementTo>()},
351362
size,
352-
detail::make_elements<ElementTo, SourceElementT>(begin, end)});
363+
detail::make_elements<ElementTo, SourceElementT>(begin, end),
364+
rmm::device_buffer{},
365+
0});
353366
}
354367

355368
/**
@@ -379,12 +392,13 @@ class fixed_width_column_wrapper : public detail::column_wrapper {
379392
fixed_width_column_wrapper(InputIterator begin, InputIterator end, ValidityIterator v)
380393
: column_wrapper{}
381394
{
382-
auto const size = cudf::distance(begin, end);
395+
auto const size = cudf::distance(begin, end);
396+
auto [null_mask, null_count] = detail::make_null_mask(v, v + size);
383397
wrapped.reset(new cudf::column{cudf::data_type{cudf::type_to_id<ElementTo>()},
384398
size,
385399
detail::make_elements<ElementTo, SourceElementT>(begin, end),
386-
detail::make_null_mask(v, v + size),
387-
cudf::UNKNOWN_NULL_COUNT});
400+
std::move(null_mask),
401+
null_count});
388402
}
389403

390404
/**
@@ -547,7 +561,9 @@ class fixed_point_column_wrapper : public detail::column_wrapper {
547561
wrapped.reset(new cudf::column{
548562
data_type,
549563
size,
550-
rmm::device_buffer{elements.data(), size * sizeof(Rep), cudf::get_default_stream()}});
564+
rmm::device_buffer{elements.data(), size * sizeof(Rep), cudf::get_default_stream()},
565+
rmm::device_buffer{},
566+
0});
551567
}
552568

553569
/**
@@ -603,17 +619,17 @@ class fixed_point_column_wrapper : public detail::column_wrapper {
603619
{
604620
CUDF_EXPECTS(numeric::is_supported_representation_type<Rep>(), "not valid representation type");
605621

606-
auto const size = cudf::distance(begin, end);
607-
auto const elements = thrust::host_vector<Rep>(begin, end);
608-
auto const id = type_to_id<numeric::fixed_point<Rep, numeric::Radix::BASE_10>>();
609-
auto const data_type = cudf::data_type{id, static_cast<int32_t>(scale)};
610-
622+
auto const size = cudf::distance(begin, end);
623+
auto const elements = thrust::host_vector<Rep>(begin, end);
624+
auto const id = type_to_id<numeric::fixed_point<Rep, numeric::Radix::BASE_10>>();
625+
auto const data_type = cudf::data_type{id, static_cast<int32_t>(scale)};
626+
auto [null_mask, null_count] = detail::make_null_mask(v, v + size);
611627
wrapped.reset(new cudf::column{
612628
data_type,
613629
size,
614630
rmm::device_buffer{elements.data(), size * sizeof(Rep), cudf::get_default_stream()},
615-
detail::make_null_mask(v, v + size),
616-
cudf::UNKNOWN_NULL_COUNT});
631+
std::move(null_mask),
632+
null_count});
617633
}
618634

619635
/**
@@ -736,7 +752,7 @@ class strings_column_wrapper : public detail::column_wrapper {
736752
chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
737753
auto d_offsets = cudf::detail::make_device_uvector_sync(
738754
offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
739-
wrapped = cudf::make_strings_column(d_chars, d_offsets);
755+
wrapped = cudf::make_strings_column(d_chars, d_offsets, {}, 0);
740756
}
741757

742758
/**
@@ -771,16 +787,16 @@ class strings_column_wrapper : public detail::column_wrapper {
771787
strings_column_wrapper(StringsIterator begin, StringsIterator end, ValidityIterator v)
772788
: column_wrapper{}
773789
{
774-
size_type num_strings = std::distance(begin, end);
775-
auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, v);
776-
auto null_mask = detail::make_null_mask_vector(v, v + num_strings);
777-
auto d_chars = cudf::detail::make_device_uvector_sync(
790+
size_type num_strings = std::distance(begin, end);
791+
auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, v);
792+
auto [null_mask, null_count] = detail::make_null_mask_vector(v, v + num_strings);
793+
auto d_chars = cudf::detail::make_device_uvector_sync(
778794
chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
779795
auto d_offsets = cudf::detail::make_device_uvector_sync(
780796
offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
781797
auto d_bitmask = cudf::detail::make_device_uvector_sync(
782798
null_mask, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
783-
wrapped = cudf::make_strings_column(d_chars, d_offsets, d_bitmask);
799+
wrapped = cudf::make_strings_column(d_chars, d_offsets, d_bitmask, null_count);
784800
}
785801

786802
/**
@@ -1579,14 +1595,14 @@ class lists_column_wrapper : public detail::column_wrapper {
15791595
// increment depth
15801596
depth = expected_depth + 1;
15811597

1598+
auto [null_mask, null_count] = [&] {
1599+
if (v.size() <= 0) return std::make_pair(rmm::device_buffer{}, cudf::size_type{0});
1600+
return cudf::test::detail::make_null_mask(v.begin(), v.end());
1601+
}();
1602+
15821603
// construct the list column
1583-
wrapped =
1584-
make_lists_column(cols.size(),
1585-
std::move(offsets),
1586-
std::move(data),
1587-
v.size() <= 0 ? 0 : cudf::UNKNOWN_NULL_COUNT,
1588-
v.size() <= 0 ? rmm::device_buffer{}
1589-
: cudf::test::detail::make_null_mask(v.begin(), v.end()));
1604+
wrapped = make_lists_column(
1605+
cols.size(), std::move(offsets), std::move(data), null_count, std::move(null_mask));
15901606
}
15911607

15921608
/**
@@ -1668,7 +1684,7 @@ class lists_column_wrapper : public detail::column_wrapper {
16681684
std::make_unique<column>(lcv.offsets()),
16691685
normalize_column(lists_column_view(col).child(),
16701686
lists_column_view(expected_hierarchy).child()),
1671-
UNKNOWN_NULL_COUNT,
1687+
col.null_count(),
16721688
copy_bitmask(col));
16731689
}
16741690

@@ -1843,12 +1859,13 @@ class structs_column_wrapper : public detail::column_wrapper {
18431859
CUDF_EXPECTS(validity.size() <= 0 || static_cast<size_type>(validity.size()) == num_rows,
18441860
"Validity buffer must have as many elements as rows in the struct column.");
18451861

1862+
auto [null_mask, null_count] = [&] {
1863+
if (validity.size() <= 0) return std::make_pair(rmm::device_buffer{}, cudf::size_type{0});
1864+
return cudf::test::detail::make_null_mask(validity.begin(), validity.end());
1865+
}();
1866+
18461867
wrapped = cudf::make_structs_column(
1847-
num_rows,
1848-
std::move(child_columns),
1849-
validity.size() <= 0 ? 0 : cudf::UNKNOWN_NULL_COUNT,
1850-
validity.size() <= 0 ? rmm::device_buffer{}
1851-
: detail::make_null_mask(validity.begin(), validity.end()));
1868+
num_rows, std::move(child_columns), null_count, std::move(null_mask));
18521869
}
18531870

18541871
template <typename V>

cpp/tests/bitmask/bitmask_tests.cpp

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@ struct BitmaskUtilitiesTest : public cudf::test::BaseFixture {
3636
TEST_F(BitmaskUtilitiesTest, StateNullCount)
3737
{
3838
EXPECT_EQ(0, cudf::state_null_count(cudf::mask_state::UNALLOCATED, 42));
39-
EXPECT_EQ(cudf::UNKNOWN_NULL_COUNT, cudf::state_null_count(cudf::mask_state::UNINITIALIZED, 42));
4039
EXPECT_EQ(42, cudf::state_null_count(cudf::mask_state::ALL_NULL, 42));
4140
EXPECT_EQ(0, cudf::state_null_count(cudf::mask_state::ALL_VALID, 42));
4241
}
@@ -575,12 +574,13 @@ TEST_F(CopyBitmaskTest, TestZeroOffset)
575574
for (auto& m : validity_bit) {
576575
m = this->generate();
577576
}
578-
auto input_mask = cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end());
577+
auto input_mask =
578+
std::get<0>(cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end()));
579579

580580
int begin_bit = 0;
581581
int end_bit = 800;
582-
auto gold_splice_mask = cudf::test::detail::make_null_mask(validity_bit.begin() + begin_bit,
583-
validity_bit.begin() + end_bit);
582+
auto gold_splice_mask = std::get<0>(cudf::test::detail::make_null_mask(
583+
validity_bit.begin() + begin_bit, validity_bit.begin() + end_bit));
584584

585585
auto splice_mask = cudf::copy_bitmask(
586586
static_cast<const cudf::bitmask_type*>(input_mask.data()), begin_bit, end_bit);
@@ -597,12 +597,13 @@ TEST_F(CopyBitmaskTest, TestNonZeroOffset)
597597
for (auto& m : validity_bit) {
598598
m = this->generate();
599599
}
600-
auto input_mask = cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end());
600+
auto input_mask =
601+
std::get<0>(cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end()));
601602

602603
int begin_bit = 321;
603604
int end_bit = 998;
604-
auto gold_splice_mask = cudf::test::detail::make_null_mask(validity_bit.begin() + begin_bit,
605-
validity_bit.begin() + end_bit);
605+
auto gold_splice_mask = std::get<0>(cudf::test::detail::make_null_mask(
606+
validity_bit.begin() + begin_bit, validity_bit.begin() + end_bit));
606607

607608
auto splice_mask = cudf::copy_bitmask(
608609
static_cast<const cudf::bitmask_type*>(input_mask.data()), begin_bit, end_bit);
@@ -621,7 +622,8 @@ TEST_F(CopyBitmaskTest, TestCopyColumnViewVectorContiguous)
621622
for (auto& m : validity_bit) {
622623
m = this->generate();
623624
}
624-
auto gold_mask = cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end());
625+
auto gold_mask =
626+
std::get<0>(cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end()));
625627

626628
rmm::device_buffer copy_mask{gold_mask, cudf::get_default_stream()};
627629
cudf::column original{t,
@@ -661,18 +663,21 @@ TEST_F(CopyBitmaskTest, TestCopyColumnViewVectorDiscontiguous)
661663
for (auto& m : validity_bit) {
662664
m = this->generate();
663665
}
664-
auto gold_mask = cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end());
666+
auto gold_mask =
667+
std::get<0>(cudf::test::detail::make_null_mask(validity_bit.begin(), validity_bit.end()));
665668
std::vector<cudf::size_type> split{0, 104, 128, 152, 311, 491, 583, 734, 760, num_elements};
666669

667670
std::vector<cudf::column> cols;
668671
std::vector<cudf::column_view> views;
669672
for (unsigned i = 0; i < split.size() - 1; i++) {
673+
auto [null_mask, null_count] = cudf::test::detail::make_null_mask(
674+
validity_bit.begin() + split[i], validity_bit.begin() + split[i + 1]);
670675
cols.emplace_back(
671676
t,
672677
split[i + 1] - split[i],
673678
rmm::device_buffer{sizeof(int) * (split[i + 1] - split[i]), cudf::get_default_stream()},
674-
cudf::test::detail::make_null_mask(validity_bit.begin() + split[i],
675-
validity_bit.begin() + split[i + 1]));
679+
std::move(null_mask),
680+
null_count);
676681
views.push_back(cols.back());
677682
}
678683
rmm::device_buffer concatenated_bitmask = cudf::concatenate_masks(views);
@@ -706,7 +711,8 @@ TEST_F(MergeBitmaskTest, TestBitmaskAnd)
706711

707712
auto odd_indices =
708713
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; });
709-
auto odd = cudf::test::detail::make_null_mask(odd_indices, odd_indices + input2.num_rows());
714+
auto odd =
715+
std::get<0>(cudf::test::detail::make_null_mask(odd_indices, odd_indices + input2.num_rows()));
710716

711717
EXPECT_EQ(nullptr, result1_mask.data());
712718
CUDF_TEST_EXPECT_EQUAL_BUFFERS(
@@ -735,8 +741,8 @@ TEST_F(MergeBitmaskTest, TestBitmaskOr)
735741

736742
auto all_but_index3 =
737743
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; });
738-
auto null3 =
739-
cudf::test::detail::make_null_mask(all_but_index3, all_but_index3 + input2.num_rows());
744+
auto null3 = std::get<0>(
745+
cudf::test::detail::make_null_mask(all_but_index3, all_but_index3 + input2.num_rows()));
740746

741747
EXPECT_EQ(nullptr, result1_mask.data());
742748
CUDF_TEST_EXPECT_EQUAL_BUFFERS(

cpp/tests/bitmask/valid_if_tests.cu

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,9 @@ TEST_F(ValidIfTest, OddsValid)
7070
odds_valid{},
7171
cudf::get_default_stream(),
7272
rmm::mr::get_current_device_resource());
73-
CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.data(), actual.first.data(), expected.size());
73+
CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.first.data(), actual.first.data(), expected.first.size());
7474
EXPECT_EQ(5000, actual.second);
75+
EXPECT_EQ(expected.second, actual.second);
7576
}
7677

7778
TEST_F(ValidIfTest, AllValid)
@@ -83,8 +84,9 @@ TEST_F(ValidIfTest, AllValid)
8384
all_valid{},
8485
cudf::get_default_stream(),
8586
rmm::mr::get_current_device_resource());
86-
CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.data(), actual.first.data(), expected.size());
87+
CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.first.data(), actual.first.data(), expected.first.size());
8788
EXPECT_EQ(0, actual.second);
89+
EXPECT_EQ(expected.second, actual.second);
8890
}
8991

9092
TEST_F(ValidIfTest, AllNull)
@@ -96,6 +98,7 @@ TEST_F(ValidIfTest, AllNull)
9698
all_null{},
9799
cudf::get_default_stream(),
98100
rmm::mr::get_current_device_resource());
99-
CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.data(), actual.first.data(), expected.size());
101+
CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.first.data(), actual.first.data(), expected.first.size());
100102
EXPECT_EQ(10000, actual.second);
103+
EXPECT_EQ(expected.second, actual.second);
101104
}

0 commit comments

Comments
 (0)