sort_merge_join.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2025, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/column/column.hpp>
22 #include <cudf/types.hpp>
23 #include <cudf/utilities/export.hpp>
25 
26 #include <rmm/cuda_stream_view.hpp>
27 
28 #include <thrust/iterator/counting_iterator.h>
29 
30 #include <optional>
31 #include <variant>
32 
33 namespace CUDF_EXPORT cudf {
34 
45  public:
53  struct match_context {
55  std::unique_ptr<rmm::device_uvector<size_type>>
58  };
59 
74  size_type
76  };
77 
92  sorted is_right_sorted,
93  null_equality compare_nulls = null_equality::EQUAL,
95 
111  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
112  std::unique_ptr<rmm::device_uvector<size_type>>>
113  inner_join(table_view const& left,
114  sorted is_left_sorted,
117 
141  table_view const& left,
142  sorted is_left_sorted,
145 
192  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
193  std::unique_ptr<rmm::device_uvector<size_type>>>
195  partition_context const& context,
198 
199  private:
203  struct preprocessed_table {
204  table_view _table_view;
205 
206  table_view
207  _null_processed_table_view;
210 
211  std::optional<rmm::device_buffer> _validity_mask =
212  std::nullopt;
213  std::optional<size_type> _num_nulls =
214  std::nullopt;
215  std::optional<std::unique_ptr<table>> _null_processed_table =
216  std::nullopt;
217 
218  std::optional<std::unique_ptr<column>> _null_processed_table_sorted_order =
219  std::nullopt;
220 
227  void populate_nonnull_filter(rmm::cuda_stream_view stream);
228 
234  void apply_nonnull_filter(rmm::cuda_stream_view stream);
235 
241  void preprocess_unprocessed_table(rmm::cuda_stream_view stream);
242 
248  void get_sorted_order(rmm::cuda_stream_view stream);
249 
257  rmm::device_uvector<size_type> map_table_to_unprocessed(rmm::cuda_stream_view stream);
258  };
259  preprocessed_table preprocessed_left;
260  preprocessed_table preprocessed_right;
261  null_equality compare_nulls;
262 
271  void postprocess_indices(device_span<size_type> smaller_indices,
272  device_span<size_type> larger_indices,
273  rmm::cuda_stream_view stream);
274 
298  template <typename MergeOperation>
299  auto invoke_merge(table_view right_view, table_view left_view, MergeOperation&& op);
300 };
301 
335 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
336  std::unique_ptr<rmm::device_uvector<size_type>>>
338  cudf::table_view const& right_keys,
339  null_equality compare_nulls = null_equality::EQUAL,
342 
377 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
378  std::unique_ptr<rmm::device_uvector<size_type>>>
380  cudf::table_view const& right_keys,
381  null_equality compare_nulls = null_equality::EQUAL,
384  // end of group
386 } // namespace CUDF_EXPORT cudf
Class that implements sort-merge algorithm for table joins.
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > inner_join(table_view const &left, sorted is_left_sorted, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns the row indices that can be used to construct the result of performing an inner join between ...
match_context inner_join_match_context(table_view const &left, sorted is_left_sorted, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns context information about matches between the left and right tables.
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > partitioned_inner_join(partition_context const &context, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Performs an inner join between a partition of the left table and the right table.
sort_merge_join(table_view const &right, sorted is_right_sorted, null_equality compare_nulls=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream())
Construct a sort-merge join object that pre-processes the right table on creation,...
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:200
Class definition for cudf::column.
column view class definitions
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > merge_inner_join(cudf::table_view const &left_keys, cudf::table_view const &right_keys, null_equality compare_nulls=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns a pair of row index vectors corresponding to an inner join between the specified tables.
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > sort_merge_inner_join(cudf::table_view const &left_keys, cudf::table_view const &right_keys, null_equality compare_nulls=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns a pair of row index vectors corresponding to an inner join between the specified tables.
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
cuda::mr::async_resource_ref< cuda::mr::device_accessible > device_async_resource_ref
null_equality
Enum to consider two nulls as equal or unequal.
Definition: types.hpp:151
int32_t size_type
Row index type for columns and tables.
Definition: types.hpp:95
sorted
Indicates whether a collection of values is known to be sorted.
Definition: types.hpp:167
cuDF interfaces
Definition: host_udf.hpp:37
Device version of C++20 std::span with reduced feature set.
Definition: span.hpp:355
Holds context information about matches between tables during a join operation.
std::unique_ptr< rmm::device_uvector< size_type > > _match_counts
table_view _left_table
View of the left table involved in the join operation.
Stores context information for partitioned join operations.
match_context left_table_context
The match context from a previous inner_join_match_context call.
size_type left_end_idx
The ending row index (exclusive) of the current left table partition.
size_type left_start_idx
The starting row index of the current left table partition.
Class definitions for (mutable)_table_view
Type declarations for libcudf.