column_wrapper.hpp
1 /*
2  * Copyright (c) 2019-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf_test/column_utilities.hpp>
20 #include <cudf_test/default_stream.hpp>
21 
22 #include <cudf/column/column.hpp>
24 #include <cudf/copying.hpp>
25 #include <cudf/detail/concatenate.hpp>
26 #include <cudf/detail/iterator.cuh>
27 #include <cudf/detail/utilities/vector_factories.hpp>
31 #include <cudf/null_mask.hpp>
32 #include <cudf/types.hpp>
33 #include <cudf/utilities/bit.hpp>
35 #include <cudf/utilities/export.hpp>
39 
40 #include <rmm/device_buffer.hpp>
41 
42 #include <thrust/copy.h>
43 #include <thrust/functional.h>
44 #include <thrust/host_vector.h>
45 #include <thrust/iterator/constant_iterator.h>
46 #include <thrust/iterator/counting_iterator.h>
47 #include <thrust/iterator/transform_iterator.h>
48 
49 #include <algorithm>
50 #include <iterator>
51 #include <memory>
52 #include <numeric>
53 
54 namespace CUDF_EXPORT cudf {
55 namespace test {
56 namespace detail {
66  public:
74  operator column_view() const { return wrapped->view(); }
75 
83  operator mutable_column_view() { return wrapped->mutable_view(); }
84 
90  std::unique_ptr<cudf::column> release() { return std::move(wrapped); }
91 
92  protected:
93  std::unique_ptr<cudf::column> wrapped{};
94 };
95 
99 template <typename From, typename To>
109  template <typename FromT = From,
110  typename ToT = To,
111  std::enable_if_t<std::is_same_v<FromT, ToT>, void>* = nullptr>
112  constexpr ToT operator()(FromT element) const
113  {
114  return element;
115  }
116 
125  template <
126  typename FromT = From,
127  typename ToT = To,
128  std::enable_if_t<!std::is_same_v<FromT, ToT> && (cudf::is_convertible<FromT, ToT>::value ||
129  std::is_constructible_v<ToT, FromT>),
130  void>* = nullptr>
131  constexpr ToT operator()(FromT element) const
132  {
133  return static_cast<ToT>(element);
134  }
135 
144  template <
145  typename FromT = From,
146  typename ToT = To,
147  std::enable_if_t<std::is_integral_v<FromT> && cudf::is_timestamp<ToT>(), void>* = nullptr>
148  constexpr ToT operator()(FromT element) const
149  {
150  return ToT{typename ToT::duration{element}};
151  }
152 };
153 
164 template <typename ElementTo,
165  typename ElementFrom,
166  typename InputIterator,
167  std::enable_if_t<not cudf::is_fixed_point<ElementTo>()>* = nullptr>
168 rmm::device_buffer make_elements(InputIterator begin, InputIterator end)
169 {
170  static_assert(cudf::is_fixed_width<ElementTo>(), "Unexpected non-fixed width type.");
171  auto transformer = fixed_width_type_converter<ElementFrom, ElementTo>{};
172  auto transform_begin = thrust::make_transform_iterator(begin, transformer);
173  auto const size = cudf::distance(begin, end);
174  auto const elements = thrust::host_vector<ElementTo>(transform_begin, transform_begin + size);
175  return rmm::device_buffer{
176  elements.data(), size * sizeof(ElementTo), cudf::test::get_default_stream()};
177 }
178 
179 // The two signatures below are identical to the above overload apart from
180 // SFINAE so doxygen sees it as a duplicate.
182 
193 template <typename ElementTo,
194  typename ElementFrom,
195  typename InputIterator,
196  std::enable_if_t<not cudf::is_fixed_point<ElementFrom>() and
197  cudf::is_fixed_point<ElementTo>()>* = nullptr>
198 rmm::device_buffer make_elements(InputIterator begin, InputIterator end)
199 {
200  using RepType = typename ElementTo::rep;
201  auto transformer = fixed_width_type_converter<ElementFrom, RepType>{};
202  auto transform_begin = thrust::make_transform_iterator(begin, transformer);
203  auto const size = cudf::distance(begin, end);
204  auto const elements = thrust::host_vector<RepType>(transform_begin, transform_begin + size);
205  return rmm::device_buffer{
206  elements.data(), size * sizeof(RepType), cudf::test::get_default_stream()};
207 }
208 
219 template <typename ElementTo,
220  typename ElementFrom,
221  typename InputIterator,
222  std::enable_if_t<cudf::is_fixed_point<ElementFrom>() and
223  cudf::is_fixed_point<ElementTo>()>* = nullptr>
224 rmm::device_buffer make_elements(InputIterator begin, InputIterator end)
225 {
226  using namespace numeric;
227  using RepType = typename ElementTo::rep;
228 
229  CUDF_EXPECTS(std::all_of(begin, end, [](ElementFrom v) { return v.scale() == 0; }),
230  "Only zero-scale fixed-point values are supported");
231 
232  auto to_rep = [](ElementTo fp) { return fp.value(); };
233  auto transformer_begin = thrust::make_transform_iterator(begin, to_rep);
234  auto const size = cudf::distance(begin, end);
235  auto const elements = thrust::host_vector<RepType>(transformer_begin, transformer_begin + size);
236  return rmm::device_buffer{
237  elements.data(), size * sizeof(RepType), cudf::test::get_default_stream()};
238 }
240 
254 template <typename ValidityIterator>
255 std::pair<std::vector<bitmask_type>, cudf::size_type> make_null_mask_vector(ValidityIterator begin,
256  ValidityIterator end)
257 {
258  auto const size = cudf::distance(begin, end);
259  auto const num_words = cudf::bitmask_allocation_size_bytes(size) / sizeof(bitmask_type);
260 
261  auto null_mask = std::vector<bitmask_type>(num_words, 0);
262  auto null_count = cudf::size_type{0};
263  for (auto i = 0; i < size; ++i) {
264  if (*(begin + i)) {
265  set_bit_unsafe(null_mask.data(), i);
266  } else {
267  ++null_count;
268  }
269  }
270 
271  return {std::move(null_mask), null_count};
272 }
273 
287 template <typename ValidityIterator>
288 std::pair<rmm::device_buffer, cudf::size_type> make_null_mask(ValidityIterator begin,
289  ValidityIterator end)
290 {
291  auto [null_mask, null_count] = make_null_mask_vector(begin, end);
292  auto d_mask = rmm::device_buffer{null_mask.data(),
295  return {std::move(d_mask), null_count};
296 }
297 
312 template <typename StringsIterator, typename ValidityIterator>
313 auto make_chars_and_offsets(StringsIterator begin, StringsIterator end, ValidityIterator v)
314 {
315  std::vector<char> chars{};
316  std::vector<cudf::size_type> offsets(1, 0);
317  for (auto str = begin; str < end; ++str) {
318  std::string tmp = (*v++) ? std::string(*str) : std::string{};
319  chars.insert(chars.end(), std::cbegin(tmp), std::cend(tmp));
320  auto const last_offset = static_cast<std::size_t>(offsets.back());
321  auto const next_offset = last_offset + tmp.length();
322  CUDF_EXPECTS(
323  next_offset < static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
324  "Cannot use strings_column_wrapper to build a large strings column");
325  offsets.push_back(static_cast<cudf::size_type>(next_offset));
326  }
327  return std::pair(std::move(chars), std::move(offsets));
328 };
329 } // namespace detail
330 
339 template <typename ElementTo, typename SourceElementT = ElementTo>
341  public:
345  fixed_width_column_wrapper() : column_wrapper{}
346  {
347  std::vector<ElementTo> empty;
348  wrapped.reset(
349  new cudf::column{cudf::data_type{cudf::type_to_id<ElementTo>()},
350  0,
351  detail::make_elements<ElementTo, SourceElementT>(empty.begin(), empty.end()),
353  0});
354  }
355 
374  template <typename InputIterator>
375  fixed_width_column_wrapper(InputIterator begin, InputIterator end) : column_wrapper{}
376  {
377  auto const size = cudf::distance(begin, end);
378  wrapped.reset(new cudf::column{cudf::data_type{cudf::type_to_id<ElementTo>()},
379  size,
380  detail::make_elements<ElementTo, SourceElementT>(begin, end),
382  0});
383  }
384 
408  template <typename InputIterator, typename ValidityIterator>
409  fixed_width_column_wrapper(InputIterator begin, InputIterator end, ValidityIterator v)
410  : column_wrapper{}
411  {
412  auto const size = cudf::distance(begin, end);
413  auto [null_mask, null_count] = detail::make_null_mask(v, v + size);
414  wrapped.reset(new cudf::column{cudf::data_type{cudf::type_to_id<ElementTo>()},
415  size,
416  detail::make_elements<ElementTo, SourceElementT>(begin, end),
417  std::move(null_mask),
418  null_count});
419  }
420 
433  template <typename ElementFrom>
434  fixed_width_column_wrapper(std::initializer_list<ElementFrom> elements)
435  : fixed_width_column_wrapper(std::cbegin(elements), std::cend(elements))
436  {
437  }
438 
456  template <typename ElementFrom>
457  fixed_width_column_wrapper(std::initializer_list<ElementFrom> elements,
458  std::initializer_list<bool> validity)
459  : fixed_width_column_wrapper(std::cbegin(elements), std::cend(elements), std::cbegin(validity))
460  {
461  }
462 
480  template <typename ValidityIterator, typename ElementFrom>
481  fixed_width_column_wrapper(std::initializer_list<ElementFrom> element_list, ValidityIterator v)
482  : fixed_width_column_wrapper(std::cbegin(element_list), std::cend(element_list), v)
483  {
484  }
485 
504  template <typename InputIterator>
505  fixed_width_column_wrapper(InputIterator begin,
506  InputIterator end,
507  std::initializer_list<bool> const& validity)
508  : fixed_width_column_wrapper(begin, end, std::cbegin(validity))
509  {
510  }
511 
529  template <typename ElementFrom>
530  fixed_width_column_wrapper(std::initializer_list<std::pair<ElementFrom, bool>> elements)
531  {
532  auto begin =
533  thrust::make_transform_iterator(elements.begin(), [](auto const& e) { return e.first; });
534  auto end = begin + elements.size();
535  auto v =
536  thrust::make_transform_iterator(elements.begin(), [](auto const& e) { return e.second; });
538  }
539 };
540 
546 template <typename Rep>
548  public:
565  template <typename FixedPointRepIterator>
566  fixed_point_column_wrapper(FixedPointRepIterator begin,
567  FixedPointRepIterator end,
568  numeric::scale_type scale)
569  : column_wrapper{}
570  {
571  CUDF_EXPECTS(numeric::is_supported_representation_type<Rep>(), "not valid representation type");
572 
573  auto const size = cudf::distance(begin, end);
574  auto const elements = thrust::host_vector<Rep>(begin, end);
575  auto const id = type_to_id<numeric::fixed_point<Rep, numeric::Radix::BASE_10>>();
576  auto const data_type = cudf::data_type{id, static_cast<int32_t>(scale)};
577 
578  wrapped.reset(new cudf::column{
579  data_type,
580  size,
581  rmm::device_buffer{elements.data(), size * sizeof(Rep), cudf::test::get_default_stream()},
583  0});
584  }
585 
598  fixed_point_column_wrapper(std::initializer_list<Rep> values, numeric::scale_type scale)
599  : fixed_point_column_wrapper(std::cbegin(values), std::cend(values), scale)
600  {
601  }
602 
630  template <typename FixedPointRepIterator, typename ValidityIterator>
631  fixed_point_column_wrapper(FixedPointRepIterator begin,
632  FixedPointRepIterator end,
633  ValidityIterator v,
634  numeric::scale_type scale)
635  : column_wrapper{}
636  {
637  CUDF_EXPECTS(numeric::is_supported_representation_type<Rep>(), "not valid representation type");
638 
639  auto const size = cudf::distance(begin, end);
640  auto const elements = thrust::host_vector<Rep>(begin, end);
641  auto const id = type_to_id<numeric::fixed_point<Rep, numeric::Radix::BASE_10>>();
642  auto const data_type = cudf::data_type{id, static_cast<int32_t>(scale)};
643  auto [null_mask, null_count] = detail::make_null_mask(v, v + size);
644  wrapped.reset(new cudf::column{
645  data_type,
646  size,
647  rmm::device_buffer{elements.data(), size * sizeof(Rep), cudf::test::get_default_stream()},
648  std::move(null_mask),
649  null_count});
650  }
651 
669  fixed_point_column_wrapper(std::initializer_list<Rep> elements,
670  std::initializer_list<bool> validity,
671  numeric::scale_type scale)
673  std::cbegin(elements), std::cend(elements), std::cbegin(validity), scale)
674  {
675  }
676 
695  template <typename ValidityIterator>
696  fixed_point_column_wrapper(std::initializer_list<Rep> element_list,
697  ValidityIterator v,
698  numeric::scale_type scale)
699  : fixed_point_column_wrapper(std::cbegin(element_list), std::cend(element_list), v, scale)
700  {
701  }
702 
723  template <typename FixedPointRepIterator>
724  fixed_point_column_wrapper(FixedPointRepIterator begin,
725  FixedPointRepIterator end,
726  std::initializer_list<bool> const& validity,
727  numeric::scale_type scale)
728  : fixed_point_column_wrapper(begin, end, std::cbegin(validity), scale)
729  {
730  }
731 };
732 
737  public:
741  strings_column_wrapper() : strings_column_wrapper(std::initializer_list<std::string>{}) {}
742 
763  template <typename StringsIterator>
764  strings_column_wrapper(StringsIterator begin, StringsIterator end) : column_wrapper{}
765  {
766  size_type num_strings = std::distance(begin, end);
767  if (num_strings == 0) {
769  return;
770  }
771  auto all_valid = thrust::make_constant_iterator(true);
772  auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, all_valid);
773  auto d_chars = cudf::detail::make_device_uvector_async(
775  auto d_offsets = std::make_unique<cudf::column>(
776  cudf::detail::make_device_uvector_sync(
779  0);
780  wrapped =
781  cudf::make_strings_column(num_strings, std::move(d_offsets), d_chars.release(), 0, {});
782  }
783 
812  template <typename StringsIterator, typename ValidityIterator>
813  strings_column_wrapper(StringsIterator begin, StringsIterator end, ValidityIterator v)
814  : column_wrapper{}
815  {
816  size_type num_strings = std::distance(begin, end);
817  if (num_strings == 0) {
819  return;
820  }
821  auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, v);
822  auto [null_mask, null_count] = detail::make_null_mask_vector(v, v + num_strings);
823  auto d_chars = cudf::detail::make_device_uvector_async(
825  auto d_offsets = std::make_unique<cudf::column>(
826  cudf::detail::make_device_uvector_async(
829  0);
830  auto d_bitmask = cudf::detail::make_device_uvector_sync(
832  wrapped = cudf::make_strings_column(
833  num_strings, std::move(d_offsets), d_chars.release(), null_count, d_bitmask.release());
834  }
835 
848  strings_column_wrapper(std::initializer_list<std::string> strings)
849  : strings_column_wrapper(std::cbegin(strings), std::cend(strings))
850  {
851  }
852 
871  template <typename ValidityIterator>
872  strings_column_wrapper(std::initializer_list<std::string> strings, ValidityIterator v)
873  : strings_column_wrapper(std::cbegin(strings), std::cend(strings), v)
874  {
875  }
876 
892  strings_column_wrapper(std::initializer_list<std::string> strings,
893  std::initializer_list<bool> validity)
894  : strings_column_wrapper(std::cbegin(strings), std::cend(strings), std::cbegin(validity))
895  {
896  }
897 
918  strings_column_wrapper(std::initializer_list<std::pair<std::string, bool>> strings)
919  {
920  auto begin =
921  thrust::make_transform_iterator(strings.begin(), [](auto const& s) { return s.first; });
922  auto end = begin + strings.size();
923  auto v =
924  thrust::make_transform_iterator(strings.begin(), [](auto const& s) { return s.second; });
925  wrapped = strings_column_wrapper(begin, end, v).release();
926  }
927 };
928 
937 template <typename KeyElementTo, typename SourceElementT = KeyElementTo>
939  public:
943  operator dictionary_column_view() const { return cudf::dictionary_column_view{wrapped->view()}; }
944 
948  dictionary_column_wrapper() : column_wrapper{}
949  {
951  }
952 
972  template <typename InputIterator>
973  dictionary_column_wrapper(InputIterator begin, InputIterator end) : column_wrapper{}
974  {
975  wrapped =
977  cudf::data_type{type_id::UINT32},
979  }
980 
1006  template <typename InputIterator, typename ValidityIterator>
1007  dictionary_column_wrapper(InputIterator begin, InputIterator end, ValidityIterator v)
1008  : column_wrapper{}
1009  {
1010  wrapped = cudf::dictionary::encode(
1012  cudf::data_type{type_id::UINT32},
1014  }
1015 
1029  template <typename ElementFrom>
1030  dictionary_column_wrapper(std::initializer_list<ElementFrom> elements)
1031  : dictionary_column_wrapper(std::cbegin(elements), std::cend(elements))
1032  {
1033  }
1034 
1053  template <typename ElementFrom>
1054  dictionary_column_wrapper(std::initializer_list<ElementFrom> elements,
1055  std::initializer_list<bool> validity)
1056  : dictionary_column_wrapper(std::cbegin(elements), std::cend(elements), std::cbegin(validity))
1057  {
1058  }
1059 
1078  template <typename ValidityIterator, typename ElementFrom>
1079  dictionary_column_wrapper(std::initializer_list<ElementFrom> element_list, ValidityIterator v)
1080  : dictionary_column_wrapper(std::cbegin(element_list), std::cend(element_list), v)
1081  {
1082  }
1083 
1104  template <typename InputIterator>
1105  dictionary_column_wrapper(InputIterator begin,
1106  InputIterator end,
1107  std::initializer_list<bool> const& validity)
1108  : dictionary_column_wrapper(begin, end, std::cbegin(validity))
1109  {
1110  }
1111 };
1112 
1118 template <>
1120  public:
1125  operator dictionary_column_view() const { return cudf::dictionary_column_view{wrapped->view()}; }
1126 
1132  [[nodiscard]] column_view keys() const
1133  {
1134  return cudf::dictionary_column_view{wrapped->view()}.keys();
1135  }
1136 
1142  [[nodiscard]] column_view indices() const
1143  {
1144  return cudf::dictionary_column_view{wrapped->view()}.indices();
1145  }
1146 
1150  dictionary_column_wrapper() : dictionary_column_wrapper(std::initializer_list<std::string>{}) {}
1151 
1172  template <typename StringsIterator>
1173  dictionary_column_wrapper(StringsIterator begin, StringsIterator end) : column_wrapper{}
1174  {
1175  wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end),
1176  cudf::data_type{type_id::UINT32},
1178  }
1179 
1208  template <typename StringsIterator, typename ValidityIterator>
1209  dictionary_column_wrapper(StringsIterator begin, StringsIterator end, ValidityIterator v)
1210  : column_wrapper{}
1211  {
1212  wrapped = cudf::dictionary::encode(strings_column_wrapper(begin, end, v),
1213  cudf::data_type{type_id::UINT32},
1215  }
1216 
1229  dictionary_column_wrapper(std::initializer_list<std::string> strings)
1230  : dictionary_column_wrapper(std::cbegin(strings), std::cend(strings))
1231  {
1232  }
1233 
1252  template <typename ValidityIterator>
1253  dictionary_column_wrapper(std::initializer_list<std::string> strings, ValidityIterator v)
1254  : dictionary_column_wrapper(std::cbegin(strings), std::cend(strings), v)
1255  {
1256  }
1257 
1273  dictionary_column_wrapper(std::initializer_list<std::string> strings,
1274  std::initializer_list<bool> validity)
1275  : dictionary_column_wrapper(std::cbegin(strings), std::cend(strings), std::cbegin(validity))
1276  {
1277  }
1278 };
1279 
1315 template <typename T, typename SourceElementT = T>
1317  public:
1321  operator lists_column_view() const { return cudf::lists_column_view{wrapped->view()}; }
1322 
1336  template <typename Element = T, std::enable_if_t<cudf::is_fixed_width<Element>()>* = nullptr>
1337  lists_column_wrapper(std::initializer_list<SourceElementT> elements) : column_wrapper{}
1338  {
1339  build_from_non_nested(
1341  }
1342 
1358  template <typename Element = T,
1359  typename InputIterator,
1360  std::enable_if_t<cudf::is_fixed_width<Element>()>* = nullptr>
1361  lists_column_wrapper(InputIterator begin, InputIterator end) : column_wrapper{}
1362  {
1363  build_from_non_nested(
1365  }
1366 
1382  template <typename Element = T,
1383  typename ValidityIterator,
1384  std::enable_if_t<cudf::is_fixed_width<Element>()>* = nullptr>
1385  lists_column_wrapper(std::initializer_list<SourceElementT> elements, ValidityIterator v)
1386  : column_wrapper{}
1387  {
1388  build_from_non_nested(
1390  }
1391 
1409  template <typename Element = T,
1410  typename InputIterator,
1411  typename ValidityIterator,
1412  std::enable_if_t<cudf::is_fixed_width<Element>()>* = nullptr>
1413  lists_column_wrapper(InputIterator begin, InputIterator end, ValidityIterator v)
1414  : column_wrapper{}
1415  {
1416  build_from_non_nested(
1418  }
1419 
1433  template <typename Element = T,
1434  std::enable_if_t<std::is_same_v<Element, cudf::string_view>>* = nullptr>
1435  lists_column_wrapper(std::initializer_list<std::string> elements) : column_wrapper{}
1436  {
1437  build_from_non_nested(
1438  cudf::test::strings_column_wrapper(elements.begin(), elements.end()).release());
1439  }
1440 
1456  template <typename Element = T,
1457  typename ValidityIterator,
1458  std::enable_if_t<std::is_same_v<Element, cudf::string_view>>* = nullptr>
1459  lists_column_wrapper(std::initializer_list<std::string> elements, ValidityIterator v)
1460  : column_wrapper{}
1461  {
1462  build_from_non_nested(
1463  cudf::test::strings_column_wrapper(elements.begin(), elements.end(), v).release());
1464  }
1465 
1488  : column_wrapper{}
1489  {
1490  std::vector<bool> valids;
1491  build_from_nested(elements, valids);
1492  }
1493 
1505  lists_column_wrapper() : column_wrapper{}
1506  {
1507  build_from_non_nested(make_empty_column(cudf::type_to_id<T>()));
1508  }
1509 
1535  template <typename ValidityIterator>
1537  ValidityIterator v)
1538  : column_wrapper{}
1539  {
1540  std::vector<bool> validity;
1541  std::transform(elements.begin(),
1542  elements.end(),
1543  v,
1544  std::back_inserter(validity),
1545  [](lists_column_wrapper const& l, bool valid) { return valid; });
1546  build_from_nested(elements, validity);
1547  }
1548 
1556  {
1559  return lists_column_wrapper<T>(
1560  1,
1561  offsets.release(),
1562  values.release(),
1563  valid ? 0 : 1,
1565  }
1566 
1567  private:
1578  std::unique_ptr<cudf::column>&& offsets,
1579  std::unique_ptr<cudf::column>&& values,
1581  rmm::device_buffer&& null_mask)
1582  {
1583  // construct the list column
1584  wrapped = make_lists_column(num_rows,
1585  std::move(offsets),
1586  std::move(values),
1587  null_count,
1588  std::move(null_mask),
1590  }
1591 
1608  void build_from_nested(std::initializer_list<lists_column_wrapper<T, SourceElementT>> elements,
1609  std::vector<bool> const& v)
1610  {
1611  auto valids = cudf::detail::make_counting_transform_iterator(
1612  0, [&v](auto i) { return v.empty() ? true : v[i]; });
1613 
1614  // compute the expected hierarchy and depth
1615  auto const hierarchy_and_depth =
1616  std::accumulate(elements.begin(),
1617  elements.end(),
1618  std::pair<column_view, int32_t>{{}, -1},
1619  [](auto acc, lists_column_wrapper const& lcw) {
1620  return lcw.depth > acc.second ? std::pair(lcw.get_view(), lcw.depth) : acc;
1621  });
1622  column_view expected_hierarchy = hierarchy_and_depth.first;
1623  int32_t const expected_depth = hierarchy_and_depth.second;
1624 
1625  // preprocess columns so that every column_view in 'cols' is an equivalent hierarchy
1626  auto [cols, stubs] = preprocess_columns(elements, expected_hierarchy, expected_depth);
1627 
1628  // generate offsets
1629  size_type count = 0;
1630  std::vector<size_type> offsetv;
1631  std::transform(cols.cbegin(),
1632  cols.cend(),
1633  valids,
1634  std::back_inserter(offsetv),
1635  [&](cudf::column_view const& col, bool valid) {
1636  // nulls are represented as a repeated offset
1637  size_type ret = count;
1638  if (valid) { count += col.size(); }
1639  return ret;
1640  });
1641  // add the final offset
1642  offsetv.push_back(count);
1643  auto offsets =
1644  cudf::test::fixed_width_column_wrapper<size_type>(offsetv.begin(), offsetv.end()).release();
1645 
1646  // concatenate them together, skipping children that are null.
1647  std::vector<column_view> children;
1648  thrust::copy_if(
1649  std::cbegin(cols), std::cend(cols), valids, std::back_inserter(children), thrust::identity{});
1650 
1651  auto data = children.empty() ? cudf::empty_like(expected_hierarchy)
1652  : cudf::concatenate(children,
1653  cudf::test::get_default_stream(),
1655 
1656  // increment depth
1657  depth = expected_depth + 1;
1658 
1659  auto [null_mask, null_count] = [&] {
1660  if (v.size() <= 0) return std::make_pair(rmm::device_buffer{}, cudf::size_type{0});
1661  return cudf::test::detail::make_null_mask(v.begin(), v.end());
1662  }();
1663 
1664  // construct the list column
1665  wrapped = make_lists_column(cols.size(),
1666  std::move(offsets),
1667  std::move(data),
1668  null_count,
1669  std::move(null_mask),
1671  }
1672 
1680  void build_from_non_nested(std::unique_ptr<column> c)
1681  {
1682  CUDF_EXPECTS(c->type().id() == type_id::EMPTY || !cudf::is_nested(c->type()),
1683  "Unexpected type");
1684 
1685  std::vector<size_type> offsetv;
1686  if (c->size() > 0) {
1687  offsetv.push_back(0);
1688  offsetv.push_back(c->size());
1689  }
1690  auto offsets =
1691  cudf::test::fixed_width_column_wrapper<size_type>(offsetv.begin(), offsetv.end()).release();
1692 
1693  // construct the list column. mark this as a root
1694  root = true;
1695  depth = 0;
1696 
1697  size_type num_elements = offsets->size() == 0 ? 0 : offsets->size() - 1;
1698  wrapped = make_lists_column(num_elements,
1699  std::move(offsets),
1700  std::move(c),
1701  0,
1704  }
1705 
1740  std::unique_ptr<column> normalize_column(column_view const& col,
1741  column_view const& expected_hierarchy)
1742  {
1743  // if are at the bottom of the short column, it must be empty
1744  if (col.type().id() != type_id::LIST) {
1745  CUDF_EXPECTS(col.is_empty(), "Encountered mismatched column!");
1746 
1747  auto remainder = empty_like(expected_hierarchy);
1748  return remainder;
1749  }
1750 
1751  lists_column_view lcv(col);
1752  return make_lists_column(
1753  col.size(),
1754  std::make_unique<column>(lcv.offsets()),
1755  normalize_column(lists_column_view(col).child(),
1756  lists_column_view(expected_hierarchy).child()),
1757  col.null_count(),
1761  }
1762 
1763  std::pair<std::vector<column_view>, std::vector<std::unique_ptr<column>>> preprocess_columns(
1764  std::initializer_list<lists_column_wrapper<T, SourceElementT>> const& elements,
1765  column_view& expected_hierarchy,
1766  int expected_depth)
1767  {
1768  std::vector<std::unique_ptr<column>> stubs;
1769  std::vector<column_view> cols;
1770 
1771  // preprocess the incoming lists.
1772  // - unwrap any "root" lists
1773  // - handle incomplete hierarchies
1774  std::transform(elements.begin(),
1775  elements.end(),
1776  std::back_inserter(cols),
1777  [&](lists_column_wrapper const& l) -> column_view {
1778  // depth mismatch. attempt to normalize the short column.
1779  // this function will also catch if this is a legitimately broken
1780  // set of input
1781  if (l.depth < expected_depth) {
1782  if (l.root) {
1783  // this exception distinguishes between the following two cases:
1784  //
1785  // { {{{1, 2, 3}}}, {} }
1786  // In this case, row 0 is a List<List<List<int>>>, whereas row 1 is
1787  // just a List<> which is an apparent mismatch. However, because row 1
1788  // is empty we will allow that to semantically mean
1789  // "a List<List<List<int>>> that's empty at the top level"
1790  //
1791  // { {{{1, 2, 3}}}, {4, 5, 6} }
1792  // In this case, row 1 is a concrete List<int> with actual values.
1793  // There is no way to rectify the differences so we will treat it as a
1794  // true column mismatch.
1795  CUDF_EXPECTS(l.wrapped->size() == 0, "Mismatch in column types!");
1796  stubs.push_back(empty_like(expected_hierarchy));
1797  } else {
1798  stubs.push_back(normalize_column(l.get_view(), expected_hierarchy));
1799  }
1800  return *(stubs.back());
1801  }
1802  // the empty hierarchy case
1803  return l.get_view();
1804  });
1805 
1806  return {std::move(cols), std::move(stubs)};
1807  }
1808 
1809  [[nodiscard]] column_view get_view() const
1810  {
1811  return root ? lists_column_view(*wrapped).child() : *wrapped;
1812  }
1813 
1814  int depth = 0;
1815  bool root = false;
1816 };
1817 
1822  public:
1850  structs_column_wrapper(std::vector<std::unique_ptr<cudf::column>>&& child_columns,
1851  std::vector<bool> const& validity = {})
1852  {
1853  init(std::move(child_columns), validity);
1854  }
1855 
1877  std::initializer_list<std::reference_wrapper<detail::column_wrapper>> child_column_wrappers,
1878  std::vector<bool> const& validity = {})
1879  {
1880  std::vector<std::unique_ptr<cudf::column>> child_columns;
1881  child_columns.reserve(child_column_wrappers.size());
1882  std::transform(child_column_wrappers.begin(),
1883  child_column_wrappers.end(),
1884  std::back_inserter(child_columns),
1885  [&](auto const& column_wrapper) {
1886  return std::make_unique<cudf::column>(column_wrapper.get(),
1887  cudf::test::get_default_stream());
1888  });
1889  init(std::move(child_columns), validity);
1890  }
1891 
1912  template <typename V>
1914  std::initializer_list<std::reference_wrapper<detail::column_wrapper>> child_column_wrappers,
1915  V validity_iter)
1916  {
1917  std::vector<std::unique_ptr<cudf::column>> child_columns;
1918  child_columns.reserve(child_column_wrappers.size());
1919  std::transform(child_column_wrappers.begin(),
1920  child_column_wrappers.end(),
1921  std::back_inserter(child_columns),
1922  [&](auto const& column_wrapper) {
1923  return std::make_unique<cudf::column>(column_wrapper.get(),
1924  cudf::test::get_default_stream());
1925  });
1926  init(std::move(child_columns), validity_iter);
1927  }
1928 
1929  private:
1930  void init(std::vector<std::unique_ptr<cudf::column>>&& child_columns,
1931  std::vector<bool> const& validity)
1932  {
1933  size_type num_rows = child_columns.empty() ? 0 : child_columns[0]->size();
1934 
1935  CUDF_EXPECTS(std::all_of(child_columns.begin(),
1936  child_columns.end(),
1937  [&](auto const& p_column) { return p_column->size() == num_rows; }),
1938  "All struct member columns must have the same row count.");
1939 
1940  CUDF_EXPECTS(validity.size() <= 0 || static_cast<size_type>(validity.size()) == num_rows,
1941  "Validity buffer must have as many elements as rows in the struct column.");
1942 
1943  auto [null_mask, null_count] = [&] {
1944  if (validity.size() <= 0) return std::make_pair(rmm::device_buffer{}, cudf::size_type{0});
1945  return cudf::test::detail::make_null_mask(validity.begin(), validity.end());
1946  }();
1947 
1948  wrapped = cudf::make_structs_column(num_rows,
1949  std::move(child_columns),
1950  null_count,
1951  std::move(null_mask),
1953  }
1954 
1955  template <typename V>
1956  void init(std::vector<std::unique_ptr<cudf::column>>&& child_columns, V validity_iterator)
1957  {
1958  size_type const num_rows = child_columns.empty() ? 0 : child_columns[0]->size();
1959 
1960  CUDF_EXPECTS(std::all_of(child_columns.begin(),
1961  child_columns.end(),
1962  [&](auto const& p_column) { return p_column->size() == num_rows; }),
1963  "All struct member columns must have the same row count.");
1964 
1965  std::vector<bool> validity(num_rows);
1966  std::copy(validity_iterator, validity_iterator + num_rows, validity.begin());
1967 
1968  init(std::move(child_columns), validity);
1969  }
1970 };
1971 
1972 } // namespace test
1973 } // namespace CUDF_EXPORT cudf
Utilities for bit and bitmask operations.
A non-owning, immutable view of device data as a column of elements, some of which may be null as ind...
A container of nullable device data as a column of elements.
Definition: column.hpp:47
Indicator for the logical data type of an element in a column.
Definition: types.hpp:243
A wrapper class for operations on a dictionary column.
column_view indices() const noexcept
Returns the column of indices.
column_view keys() const noexcept
Returns the column of keys.
Given a column-view of lists type, an instance of this class provides a wrapper on this compound column for list operations.
A non-owning, mutable view of device data as a column of elements, some of which may be null as indicated by a bitmask.
Base class for a wrapper around a cudf::column.
std::unique_ptr< cudf::column > release()
Releases internal unique_ptr to wrapped column.
dictionary_column_wrapper(StringsIterator begin, StringsIterator end, ValidityIterator v)
Construct a nullable dictionary column of strings from the range [begin,end) using the range [v,...
dictionary_column_wrapper(std::initializer_list< std::string > strings, std::initializer_list< bool > validity)
Construct a nullable dictionary column of strings from a list of strings and a list of booleans to indicate the validity of each string.
column_view indices() const
Access indices column view.
dictionary_column_wrapper(std::initializer_list< std::string > strings)
Construct a non-nullable dictionary column of strings from a list of strings.
dictionary_column_wrapper(StringsIterator begin, StringsIterator end)
Construct a non-nullable dictionary column of strings from the range [begin,end).
dictionary_column_wrapper(std::initializer_list< std::string > strings, ValidityIterator v)
Construct a nullable dictionary column of strings from a list of strings and the range [v,...
column_view keys() const
Access keys column view.
dictionary_column_wrapper()
Default constructor initializes an empty dictionary column of strings.
column_wrapper derived class for wrapping dictionary columns.
dictionary_column_wrapper(std::initializer_list< ElementFrom > elements)
Construct a non-nullable dictionary column of fixed-width elements from an initializer list.
dictionary_column_wrapper(std::initializer_list< ElementFrom > elements, std::initializer_list< bool > validity)
Construct a nullable dictionary column from a list of fixed-width elements using another list to indicate the validity of each element.
dictionary_column_wrapper(std::initializer_list< ElementFrom > element_list, ValidityIterator v)
Construct a nullable dictionary column from a list of fixed-width elements and the range [v,...
dictionary_column_wrapper()
Default constructor initializes an empty column with dictionary type.
dictionary_column_wrapper(InputIterator begin, InputIterator end)
Construct a non-nullable dictionary column of the fixed-width elements in the range [begin,...
dictionary_column_wrapper(InputIterator begin, InputIterator end, ValidityIterator v)
Construct a nullable dictionary column of the fixed-width elements in the range [begin,...
dictionary_column_wrapper(InputIterator begin, InputIterator end, std::initializer_list< bool > const &validity)
Construct a nullable dictionary column of the fixed-width elements in the range [begin,...
A wrapper for a column of fixed-width elements.
fixed_point_column_wrapper(FixedPointRepIterator begin, FixedPointRepIterator end, std::initializer_list< bool > const &validity, numeric::scale_type scale)
Construct a nullable column of the decimal elements in the range [begin,end) using a validity initializer list.
fixed_point_column_wrapper(std::initializer_list< Rep > elements, std::initializer_list< bool > validity, numeric::scale_type scale)
Construct a nullable column from an initializer list of decimal elements using another list to indicate the validity of each element.
fixed_point_column_wrapper(FixedPointRepIterator begin, FixedPointRepIterator end, numeric::scale_type scale)
Construct a non-nullable column of the decimal elements in the range [begin,end).
fixed_point_column_wrapper(std::initializer_list< Rep > element_list, ValidityIterator v, numeric::scale_type scale)
Construct a nullable column from an initializer list of decimal elements and the range [v,...
fixed_point_column_wrapper(std::initializer_list< Rep > values, numeric::scale_type scale)
Construct a non-nullable column of decimal elements from an initializer list.
fixed_point_column_wrapper(FixedPointRepIterator begin, FixedPointRepIterator end, ValidityIterator v, numeric::scale_type scale)
Construct a nullable column of the fixed-point elements from a range.
column_wrapper derived class for wrapping columns of fixed-width elements.
fixed_width_column_wrapper(InputIterator begin, InputIterator end, std::initializer_list< bool > const &validity)
Construct a nullable column of the fixed-width elements in the range [begin,end) using a validity initializer list.
fixed_width_column_wrapper(std::initializer_list< std::pair< ElementFrom, bool >> elements)
Construct a nullable column from a list of pairs of fixed-width elements and validity booleans of each element.
fixed_width_column_wrapper(std::initializer_list< ElementFrom > elements, std::initializer_list< bool > validity)
Construct a nullable column from a list of fixed-width elements using another list to indicate the validity of each element.
fixed_width_column_wrapper(std::initializer_list< ElementFrom > elements)
Construct a non-nullable column of fixed-width elements from an initializer list.
fixed_width_column_wrapper(InputIterator begin, InputIterator end, ValidityIterator v)
Construct a nullable column of the fixed-width elements in the range [begin,end) using the range [v,...
fixed_width_column_wrapper(std::initializer_list< ElementFrom > element_list, ValidityIterator v)
Construct a nullable column from a list of fixed-width elements and the range [v, v + element_list.size()) interpreted as booleans to indicate the validity of each element.
fixed_width_column_wrapper(InputIterator begin, InputIterator end)
Construct a non-nullable column of the fixed-width elements in the range [begin,end).
fixed_width_column_wrapper()
Default constructor initializes an empty column with proper dtype.
column_wrapper derived class for wrapping columns of lists.
lists_column_wrapper(InputIterator begin, InputIterator end)
Construct a lists column containing a single list of fixed-width type from an iterator range.
lists_column_wrapper(std::initializer_list< lists_column_wrapper< T, SourceElementT >> elements, ValidityIterator v)
Construct a lists column of nested lists from an initializer list of values and a validity iterator.
static lists_column_wrapper< T > make_one_empty_row_column(bool valid=true)
Construct a list column containing a single empty, optionally null row.
lists_column_wrapper(InputIterator begin, InputIterator end, ValidityIterator v)
Construct a lists column containing a single list of fixed-width type from an iterator range and a va...
lists_column_wrapper()
Construct an empty lists column.
lists_column_wrapper(std::initializer_list< lists_column_wrapper< T, SourceElementT >> elements)
Construct a lists column of nested lists from an initializer list of values.
lists_column_wrapper(std::initializer_list< SourceElementT > elements)
Construct a lists column containing a single list of fixed-width type from an initializer list of val...
lists_column_wrapper(std::initializer_list< std::string > elements, ValidityIterator v)
Construct a lists column containing a single list of strings from an initializer list of values and a...
lists_column_wrapper(std::initializer_list< std::string > elements)
Construct a lists column containing a single list of strings from an initializer list of values.
lists_column_wrapper(std::initializer_list< SourceElementT > elements, ValidityIterator v)
Construct a lists column containing a single list of fixed-width type from an initializer list of val...
column_wrapper derived class for wrapping columns of strings.
strings_column_wrapper(std::initializer_list< std::pair< std::string, bool >> strings)
Construct a nullable column from a list of pairs of strings and validity booleans of each string.
strings_column_wrapper()
Default constructor initializes an empty column of strings.
strings_column_wrapper(std::initializer_list< std::string > strings)
Construct a non-nullable column of strings from a list of strings.
strings_column_wrapper(std::initializer_list< std::string > strings, std::initializer_list< bool > validity)
Construct a nullable column of strings from a list of strings and a list of booleans to indicate the validity of each string.
strings_column_wrapper(std::initializer_list< std::string > strings, ValidityIterator v)
Construct a nullable column of strings from a list of strings and the range [v, v + strings.size()) interpreted as booleans to indicate the validity of each string.
strings_column_wrapper(StringsIterator begin, StringsIterator end)
Construct a non-nullable column of strings from the range [begin,end).
strings_column_wrapper(StringsIterator begin, StringsIterator end, ValidityIterator v)
Construct a nullable column of strings from the range [begin,end) using the range [v,...
column_wrapper derived class for wrapping columns of structs.
structs_column_wrapper(std::initializer_list< std::reference_wrapper< detail::column_wrapper >> child_column_wrappers, V validity_iter)
Constructs a struct column from the list of column wrappers for child columns.
structs_column_wrapper(std::initializer_list< std::reference_wrapper< detail::column_wrapper >> child_column_wrappers, std::vector< bool > const &validity={})
Constructs a struct column from the list of column wrappers for child columns.
structs_column_wrapper(std::vector< std::unique_ptr< cudf::column >> &&child_columns, std::vector< bool > const &validity={})
Constructs a struct column from the specified list of pre-constructed child columns.
void const * data() const noexcept
Class definition for cudf::column.
Column factory APIs.
Column APIs for gather, scatter, split, slice, etc.
Dictionary column encode and decode APIs.
Class definition for fixed point data type.
std::unique_ptr< column > empty_like(column_view const &input)
Initializes and returns an empty column of the same type as the input.
std::unique_ptr< column > make_strings_column(cudf::device_span< thrust::pair< char const *, size_type > const > strings, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Construct a STRING type column given a device span of pointer/size pairs.
std::unique_ptr< cudf::column > make_structs_column(size_type num_rows, std::vector< std::unique_ptr< column >> &&child_columns, size_type null_count, rmm::device_buffer &&null_mask, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Construct a STRUCT column using specified child columns as members.
std::unique_ptr< column > make_empty_column(data_type type)
Creates an empty column of the specified type.
std::unique_ptr< cudf::column > make_lists_column(size_type num_rows, std::unique_ptr< column > offsets_column, std::unique_ptr< column > child_column, size_type null_count, rmm::device_buffer &&null_mask, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Construct a LIST type column given offsets column, child column, null mask and null count.
cudf::size_type null_count(bitmask_type const *bitmask, size_type start, size_type stop, rmm::cuda_stream_view stream=cudf::get_default_stream())
Given a validity bitmask, counts the number of null elements (unset bits) in the range [start, stop).
std::size_t bitmask_allocation_size_bytes(size_type number_of_bits, std::size_t padding_boundary=64)
Computes the required bytes necessary to represent the specified number of bits with a given padding boundary.
rmm::device_buffer copy_bitmask(bitmask_type const *mask, size_type begin_bit, size_type end_bit, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Creates a device_buffer from a slice of bitmask defined by a range of indices [begin_bit, end_bit).
rmm::device_buffer create_null_mask(size_type size, mask_state state, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Creates a device_buffer for use as a null value indicator bitmask of a column.
std::unique_ptr< column > concatenate(host_span< column_view const > columns_to_concat, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Concatenates multiple columns into a single column.
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
std::unique_ptr< column > encode(column_view const &column, data_type indices_type=data_type{type_id::UINT32}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Construct a dictionary column by dictionary encoding an existing column.
scale_type
The scale type for fixed_point.
Definition: fixed_point.hpp:43
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
device_async_resource_ref get_current_device_resource_ref()
std::unique_ptr< column > is_fixed_point(strings_column_view const &input, data_type decimal_type=data_type{type_id::DECIMAL64}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns a boolean column identifying strings in which all characters are valid for conversion to fixed-point.
std::unique_ptr< column > transform(column_view const &input, std::string const &unary_udf, data_type output_type, bool is_ptx, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Creates a new column by applying a unary function against every element of an input column.
CUDF_HOST_DEVICE void set_bit_unsafe(bitmask_type *bitmask, size_type bit_index)
Sets the specified bit to 1
Definition: bit.hpp:99
#define CUDF_EXPECTS(...)
Macro for checking (pre-)conditions that throws an exception when a condition is violated.
Definition: error.hpp:178
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:95
uint32_t bitmask_type
Bitmask type stored as 32-bit unsigned integer.
Definition: types.hpp:96
size_type distance(T f, T l)
Similar to std::distance but returns cudf::size_type and performs static_cast
Definition: types.hpp:110
constexpr bool is_nested()
Indicates whether T is a nested type.
Definition: traits.hpp:580
@ ALL_NULL
Null mask allocated, initialized to all elements NULL.
@ STRING
String elements.
@ DICTIONARY32
Dictionary type using int32 indices.
Class definition for cudf::lists_column_view.
cuDF interfaces
Definition: aggregation.hpp:35
fixed_point and supporting types
Definition: fixed_point.hpp:33
APIs for managing validity bitmasks.
Convert between source and target types when they differ and where possible.
constexpr ToT operator()(FromT element) const
No conversion necessary: Same type, simply copy element to output.
Defines the mapping between cudf::type_id runtime type information and concrete C++ types.
Type declarations for libcudf.