sort_merge_join.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2025, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/column/column.hpp>
22 #include <cudf/types.hpp>
23 #include <cudf/utilities/export.hpp>
25 
26 #include <rmm/cuda_stream_view.hpp>
27 
28 #include <thrust/iterator/counting_iterator.h>
29 
30 #include <optional>
31 #include <variant>
32 
33 namespace CUDF_EXPORT cudf {
34 
45  public:
60  sorted is_right_sorted,
61  null_equality compare_nulls = null_equality::EQUAL,
63 
79  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
80  std::unique_ptr<rmm::device_uvector<size_type>>>
81  inner_join(table_view const& left,
82  sorted is_left_sorted,
85 
86  private:
// Groups a table view with the preprocessing state associated with it.
// Every std::optional member below is default-initialized to std::nullopt.
// NOTE(review): presumably the member functions declared at the bottom fill
// these in — confirm against the definitions, which are not in this listing.
90  struct preprocessed_table {
// View of the table this state belongs to.
91  table_view _table_view;
92 
// NOTE(review): the type of this member is on a line absent from this listing
// (numbering jumps from 92 to 94); consult the actual header for it.
94  _null_processed_table_view;
97 
// Optional device buffer; by its name a validity (null) mask — TODO confirm how it is derived.
98  std::optional<rmm::device_buffer> _validity_mask =
99  std::nullopt;
// Optional null count associated with the table (name-based reading — confirm).
100  std::optional<size_type> _num_nulls =
101  std::nullopt;
// Optional owning table; presumably backs _null_processed_table_view — confirm.
102  std::optional<std::unique_ptr<table>> _null_processed_table =
103  std::nullopt;
104 
// Optional owning column; by its name the sorted order of the null-processed table — confirm.
105  std::optional<std::unique_ptr<column>> _null_processed_table_sorted_order =
106  std::nullopt;
107 
// The following are declarations only; each takes the CUDA stream view to use.
// Their exact effects are not visible here, so the notes below are name-based.

// NOTE(review): presumably computes the non-null row filter state — confirm.
114  void populate_nonnull_filter(rmm::cuda_stream_view stream);
115 
// NOTE(review): presumably applies that filter to produce the null-processed table — confirm.
121  void apply_nonnull_filter(rmm::cuda_stream_view stream);
122 
// NOTE(review): presumably the entry point for preprocessing a table that is not yet processed — confirm.
128  void preprocess_unprocessed_table(rmm::cuda_stream_view stream);
129 
// NOTE(review): presumably fills _null_processed_table_sorted_order — confirm.
135  void get_sorted_order(rmm::cuda_stream_view stream);
136 
// Returns a device vector of size_type values.
// NOTE(review): presumably a mapping from processed-table rows back to rows of the original table — confirm.
144  rmm::device_uvector<size_type> map_table_to_unprocessed(rmm::cuda_stream_view stream);
145  };
// Per-side preprocessing state (one preprocessed_table for each join input)
// and the null-equality mode used when comparing nulls.
146  preprocessed_table preprocessed_left;
147  preprocessed_table preprocessed_right;
148  null_equality compare_nulls;
149 
// Declaration only: takes two mutable device spans of size_type indices and
// the CUDA stream view to use.
// NOTE(review): presumably rewrites the join output indices in place after the
// merge step ("smaller"/"larger" likely refer to the two tables by size) —
// confirm against the definition.
158  void postprocess_indices(device_span<size_type> smaller_indices,
159  device_span<size_type> larger_indices,
160  rmm::cuda_stream_view stream);
161 };
162 
196 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
197  std::unique_ptr<rmm::device_uvector<size_type>>>
199  cudf::table_view const& right_keys,
200  null_equality compare_nulls = null_equality::EQUAL,
203 
238 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
239  std::unique_ptr<rmm::device_uvector<size_type>>>
241  cudf::table_view const& right_keys,
242  null_equality compare_nulls = null_equality::EQUAL,
245  // end of group
247 } // namespace CUDF_EXPORT cudf
Class that implements sort-merge algorithm for table joins.
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > inner_join(table_view const &left, sorted is_left_sorted, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns the row indices that can be used to construct the result of performing an inner join between the right table passed to the constructor and the given left table.
sort_merge_join(table_view const &right, sorted is_right_sorted, null_equality compare_nulls=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream())
Construct a sort-merge join object that pre-processes the right table on creation, for use in subsequent join operations against left tables.
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:200
Class definition for cudf::column.
column view class definitions
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > merge_inner_join(cudf::table_view const &left_keys, cudf::table_view const &right_keys, null_equality compare_nulls=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns a pair of row index vectors corresponding to an inner join between the specified tables.
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > sort_merge_inner_join(cudf::table_view const &left_keys, cudf::table_view const &right_keys, null_equality compare_nulls=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns a pair of row index vectors corresponding to an inner join between the specified tables.
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
cuda::mr::async_resource_ref< cuda::mr::device_accessible > device_async_resource_ref
null_equality
Enum to consider two nulls as equal or unequal.
Definition: types.hpp:151
sorted
Indicates whether a collection of values is known to be sorted.
Definition: types.hpp:167
cuDF interfaces
Definition: host_udf.hpp:37
Device version of C++20 std::span with reduced feature set.
Definition: span.hpp:355
Class definitions for (mutable)_table_view
Type declarations for libcudf.