sort_merge_join.hpp
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
3  * SPDX-License-Identifier: Apache-2.0
4  */
5 
6 #pragma once
7 
8 #include <cudf/column/column.hpp>
10 #include <cudf/join/join.hpp>
12 #include <cudf/types.hpp>
13 #include <cudf/utilities/export.hpp>
15 
16 #include <rmm/cuda_stream_view.hpp>
17 
18 #include <thrust/iterator/counting_iterator.h>
19 
20 #include <optional>
21 #include <variant>
22 
23 namespace CUDF_EXPORT cudf {
24 
35  public:
// All five compiler-generated ways of creating or reassigning a
// sort_merge_join are explicitly deleted: default construction, copy
// construction, move construction, copy assignment and move assignment.
// An instance therefore cannot be default-constructed, copied or moved.
36  sort_merge_join() = delete;
37  sort_merge_join(sort_merge_join const&) = delete;
38  sort_merge_join(sort_merge_join&&) = delete;
39  sort_merge_join& operator=(sort_merge_join const&) = delete;
40  sort_merge_join& operator=(sort_merge_join&&) = delete;
41 
56  sorted is_right_sorted,
57  null_equality compare_nulls = null_equality::EQUAL,
59 
75  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
76  std::unique_ptr<rmm::device_uvector<size_type>>>
77  inner_join(table_view const& left,
78  sorted is_left_sorted,
81 
105  table_view const& left,
106  sorted is_left_sorted,
109 
156  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
157  std::unique_ptr<rmm::device_uvector<size_type>>>
159  cudf::join_partition_context const& context,
162 
163  private:
// Private helper aggregate bundling a table view with the optional,
// lazily-filled artifacts derived from it. Only declarations are visible in
// this listing; the notes on member semantics below are inferred from names
// and must be confirmed against the implementation file.
167  struct preprocessed_table {
// View of the table as supplied (presumably the unprocessed input -- confirm).
168  table_view _table_view;
169 
// Second view; the name suggests it refers to the table after null handling
// has been applied -- TODO confirm.
170  table_view
171  _null_processed_table_view;
174 
// Every optional member below is initialised to std::nullopt, i.e. it holds
// no value until something assigns one.
// NOTE(review): looks like a row-validity bitmask -- confirm.
175  std::optional<rmm::device_buffer> _validity_mask =
176  std::nullopt;
// NOTE(review): presumably the number of null rows found -- confirm.
177  std::optional<size_type> _num_nulls =
178  std::nullopt;
// Optional owning pointer to a table; presumably the storage backing
// _null_processed_table_view when filtering was required -- confirm.
179  std::optional<std::unique_ptr<table>> _null_processed_table =
180  std::nullopt;
181 
// Optional owning pointer to a column; the name suggests a sort-order
// (gather map) for the null-processed table -- confirm.
182  std::optional<std::unique_ptr<column>> _null_processed_table_sorted_order =
183  std::nullopt;
184 
// The member functions below each take the CUDA stream view to operate with
// and, except for the last one, return nothing -- so any result must be
// stored in the members above. Their definitions are not part of this file.

// NOTE(review): presumably fills _validity_mask / _num_nulls -- confirm.
191  void populate_nonnull_filter(rmm::cuda_stream_view stream);
192 
// NOTE(review): presumably materialises the null-filtered table -- confirm.
198  void apply_nonnull_filter(rmm::cuda_stream_view stream);
199 
// NOTE(review): presumably the entry point that runs the preprocessing
// steps for a table that has not been processed yet -- confirm.
205  void preprocess_unprocessed_table(rmm::cuda_stream_view stream);
206 
// Returns void despite the "get_" prefix, so it cannot hand the order back
// to the caller; presumably it populates _null_processed_table_sorted_order
// -- confirm.
212  void get_sorted_order(rmm::cuda_stream_view stream);
213 
// Returns a device vector of size_type by value. The name suggests a mapping
// from rows of the processed table back to rows of the unprocessed table --
// TODO confirm direction and contents.
221  rmm::device_uvector<size_type> map_table_to_unprocessed(rmm::cuda_stream_view stream);
222  };
// Per-object state: one preprocessed_table for each side of the join, plus
// the null_equality setting. The constructor takes the right table and a
// compare_nulls argument; presumably they are stored here, while
// preprocessed_left is filled per join call -- confirm in the implementation.
223  preprocessed_table preprocessed_left;
224  preprocessed_table preprocessed_right;
225  null_equality compare_nulls;
226 
// Private helper operating on two mutable device spans of row indices
// (device_span<size_type>, not const), so it can rewrite them in place; it
// returns nothing.
// NOTE(review): presumably translates indices computed on the null-processed
// tables back to positions in the original tables -- confirm.
//   smaller_indices / larger_indices: index spans for the two join sides;
//     the names suggest they are ordered by table size rather than by
//     left/right -- confirm.
//   stream: CUDA stream view to operate with.
235  void postprocess_indices(device_span<size_type> smaller_indices,
236  device_span<size_type> larger_indices,
237  rmm::cuda_stream_view stream);
238 
// Private member template taking two table views by value and a callable
// `op` as a forwarding reference. The return type is deduced (`auto`), so the
// definition must be visible wherever this is called.
// Note the parameter order: right_view comes first, then left_view.
// NOTE(review): presumably runs `op` on the two views, possibly ordering
// them by size first -- confirm against the definition.
262  template <typename MergeOperation>
263  auto invoke_merge(table_view right_view, table_view left_view, MergeOperation&& op);
264 };
265 
299 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
300  std::unique_ptr<rmm::device_uvector<size_type>>>
302  cudf::table_view const& right_keys,
303  null_equality compare_nulls = null_equality::EQUAL,
306 
341 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
342  std::unique_ptr<rmm::device_uvector<size_type>>>
344  cudf::table_view const& right_keys,
345  null_equality compare_nulls = null_equality::EQUAL,
348  // end of group
350 } // namespace CUDF_EXPORT cudf
Class that implements the sort-merge algorithm for table joins.
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > inner_join(table_view const &left, sorted is_left_sorted, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns the row indices that can be used to construct the result of performing an inner join between ...
cudf::join_match_context inner_join_match_context(table_view const &left, sorted is_left_sorted, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns context information about matches between the left and right tables.
sort_merge_join(table_view const &right, sorted is_right_sorted, null_equality compare_nulls=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream())
Construct a sort-merge join object that pre-processes the right table on creation,...
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > partitioned_inner_join(cudf::join_partition_context const &context, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Performs an inner join between a partition of the left table and the right table.
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:189
Class definition for cudf::column.
Column view class definitions.
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > merge_inner_join(cudf::table_view const &left_keys, cudf::table_view const &right_keys, null_equality compare_nulls=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns a pair of row index vectors corresponding to an inner join between the specified tables.
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > sort_merge_inner_join(cudf::table_view const &left_keys, cudf::table_view const &right_keys, null_equality compare_nulls=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns a pair of row index vectors corresponding to an inner join between the specified tables.
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
detail::cccl_async_resource_ref< cuda::mr::resource_ref< cuda::mr::device_accessible > > device_async_resource_ref
null_equality
Enum to consider two nulls as equal or unequal.
Definition: types.hpp:140
sorted
Indicates whether a collection of values is known to be sorted.
Definition: types.hpp:156
cuDF interfaces
Definition: host_udf.hpp:26
Device version of C++20 std::span with reduced feature set.
Definition: span.hpp:323
Holds context information about matches between tables during a join operation.
Definition: join.hpp:46
Stores context information for partitioned join operations.
Definition: join.hpp:63
Class definitions for (mutable)_table_view
Type declarations for libcudf.