join.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2019-2025, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/ast/expressions.hpp>
20 #include <cudf/hashing.hpp>
22 #include <cudf/types.hpp>
24 #include <cudf/utilities/export.hpp>
26 #include <cudf/utilities/span.hpp>
27 
28 #include <rmm/cuda_stream_view.hpp>
29 #include <rmm/device_uvector.hpp>
30 
31 #include <optional>
32 #include <utility>
33 #include <vector>
34 
35 namespace CUDF_EXPORT cudf {
36 
37 // forward declaration
38 namespace hashing::detail {
39 
43 template <typename T>
45 } // namespace hashing::detail
46 namespace detail {
47 
51 template <typename T>
52 class hash_join;
53 
57 class distinct_hash_join;
58 } // namespace detail
59 
99 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
100  std::unique_ptr<rmm::device_uvector<size_type>>>
101 inner_join(cudf::table_view const& left_keys,
102  cudf::table_view const& right_keys,
103  null_equality compare_nulls = null_equality::EQUAL,
106 
141 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
142  std::unique_ptr<rmm::device_uvector<size_type>>>
143 left_join(cudf::table_view const& left_keys,
144  cudf::table_view const& right_keys,
145  null_equality compare_nulls = null_equality::EQUAL,
148 
182 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
183  std::unique_ptr<rmm::device_uvector<size_type>>>
184 full_join(cudf::table_view const& left_keys,
185  cudf::table_view const& right_keys,
186  null_equality compare_nulls = null_equality::EQUAL,
189 
213 std::unique_ptr<rmm::device_uvector<size_type>> left_semi_join(
214  cudf::table_view const& left_keys,
215  cudf::table_view const& right_keys,
216  null_equality compare_nulls = null_equality::EQUAL,
219 
246 std::unique_ptr<rmm::device_uvector<size_type>> left_anti_join(
247  cudf::table_view const& left_keys,
248  cudf::table_view const& right_keys,
249  null_equality compare_nulls = null_equality::EQUAL,
252 
276 std::unique_ptr<cudf::table> cross_join(
277  cudf::table_view const& left,
278  cudf::table_view const& right,
281 
290 enum class nullable_join : bool { YES, NO };
291 
299 class hash_join {
300  public:
303 
304  hash_join() = delete;
305  ~hash_join();
306  hash_join(hash_join const&) = delete;
307  hash_join(hash_join&&) = delete;
308  hash_join& operator=(hash_join const&) = delete;
309  hash_join& operator=(hash_join&&) = delete;
310 
322  null_equality compare_nulls,
324 
333  null_equality compare_nulls,
335 
354  [[nodiscard]] std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
355  std::unique_ptr<rmm::device_uvector<size_type>>>
357  std::optional<std::size_t> output_size = {},
360 
379  [[nodiscard]] std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
380  std::unique_ptr<rmm::device_uvector<size_type>>>
382  std::optional<std::size_t> output_size = {},
385 
404  [[nodiscard]] std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
405  std::unique_ptr<rmm::device_uvector<size_type>>>
407  std::optional<std::size_t> output_size = {},
410 
424  [[nodiscard]] std::size_t inner_join_size(
425  cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::get_default_stream()) const;
426 
440  [[nodiscard]] std::size_t left_join_size(
441  cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::get_default_stream()) const;
442 
458  [[nodiscard]] std::size_t full_join_size(
459  cudf::table_view const& probe,
462 
463  private:
464  std::unique_ptr<impl_type const> _impl;
465 };
466 
478  public:
479  distinct_hash_join() = delete;
481  distinct_hash_join(distinct_hash_join const&) = delete;
483  distinct_hash_join& operator=(distinct_hash_join const&) = delete;
484  distinct_hash_join& operator=(distinct_hash_join&&) = delete;
485 
494  null_equality compare_nulls = null_equality::EQUAL,
496 
509  [[nodiscard]] std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
510  std::unique_ptr<rmm::device_uvector<size_type>>>
514 
532  [[nodiscard]] std::unique_ptr<rmm::device_uvector<size_type>> left_join(
533  cudf::table_view const& probe,
536 
537  private:
538  using impl_type = cudf::detail::distinct_hash_join;
539 
540  std::unique_ptr<impl_type> _impl;
541 };
542 
579 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
580  std::unique_ptr<rmm::device_uvector<size_type>>>
582  table_view const& right,
583  ast::expression const& binary_predicate,
584  std::optional<std::size_t> output_size = {},
587 
626 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
627  std::unique_ptr<rmm::device_uvector<size_type>>>
629  table_view const& right,
630  ast::expression const& binary_predicate,
631  std::optional<std::size_t> output_size = {},
634 
671 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
672  std::unique_ptr<rmm::device_uvector<size_type>>>
674  table_view const& right,
675  ast::expression const& binary_predicate,
678 
712 std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_semi_join(
713  table_view const& left,
714  table_view const& right,
715  ast::expression const& binary_predicate,
716  std::optional<std::size_t> output_size = {},
719 
753 std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_anti_join(
754  table_view const& left,
755  table_view const& right,
756  ast::expression const& binary_predicate,
757  std::optional<std::size_t> output_size = {},
760 
808 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
809  std::unique_ptr<rmm::device_uvector<size_type>>>
811  table_view const& left_equality,
812  table_view const& right_equality,
813  table_view const& left_conditional,
814  table_view const& right_conditional,
815  ast::expression const& binary_predicate,
816  null_equality compare_nulls = null_equality::EQUAL,
817  std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
820 
870 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
871  std::unique_ptr<rmm::device_uvector<size_type>>>
873  table_view const& left_equality,
874  table_view const& right_equality,
875  table_view const& left_conditional,
876  table_view const& right_conditional,
877  ast::expression const& binary_predicate,
878  null_equality compare_nulls = null_equality::EQUAL,
879  std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
882 
932 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
933  std::unique_ptr<rmm::device_uvector<size_type>>>
935  table_view const& left_equality,
936  table_view const& right_equality,
937  table_view const& left_conditional,
938  table_view const& right_conditional,
939  ast::expression const& binary_predicate,
940  null_equality compare_nulls = null_equality::EQUAL,
941  std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
944 
984 std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_semi_join(
985  table_view const& left_equality,
986  table_view const& right_equality,
987  table_view const& left_conditional,
988  table_view const& right_conditional,
989  ast::expression const& binary_predicate,
990  null_equality compare_nulls = null_equality::EQUAL,
993 
1034 std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_anti_join(
1035  table_view const& left_equality,
1036  table_view const& right_equality,
1037  table_view const& left_conditional,
1038  table_view const& right_conditional,
1039  ast::expression const& binary_predicate,
1040  null_equality compare_nulls = null_equality::EQUAL,
1043 
1076 std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_inner_join_size(
1077  table_view const& left_equality,
1078  table_view const& right_equality,
1079  table_view const& left_conditional,
1080  table_view const& right_conditional,
1081  ast::expression const& binary_predicate,
1082  null_equality compare_nulls = null_equality::EQUAL,
1085 
1118 std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_left_join_size(
1119  table_view const& left_equality,
1120  table_view const& right_equality,
1121  table_view const& left_conditional,
1122  table_view const& right_conditional,
1123  ast::expression const& binary_predicate,
1124  null_equality compare_nulls = null_equality::EQUAL,
1127 
1147  table_view const& left,
1148  table_view const& right,
1149  ast::expression const& binary_predicate,
1152 
1172  table_view const& left,
1173  table_view const& right,
1174  ast::expression const& binary_predicate,
1177 
1197  table_view const& left,
1198  table_view const& right,
1199  ast::expression const& binary_predicate,
1202 
1222  table_view const& left,
1223  table_view const& right,
1224  ast::expression const& binary_predicate,
1228 } // namespace CUDF_EXPORT cudf
Forward declaration for our hash join.
Definition: join.hpp:52
Distinct hash join that builds hash table in creation and probes results in subsequent *_join member ...
Definition: join.hpp:477
std::unique_ptr< rmm::device_uvector< size_type > > left_join(cudf::table_view const &probe, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) const
Returns the build table indices that can be used to construct the result of performing a left join be...
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > inner_join(cudf::table_view const &probe, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) const
Returns the row indices that can be used to construct the result of performing an inner join between ...
distinct_hash_join(cudf::table_view const &build, null_equality compare_nulls=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream())
Constructs a distinct hash join object for subsequent probe calls.
Hash join that builds hash table in creation and probes results in subsequent *_join member functions...
Definition: join.hpp:299
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > left_join(cudf::table_view const &probe, std::optional< std::size_t > output_size={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) const
hash_join(cudf::table_view const &build, nullable_join has_nulls, null_equality compare_nulls, rmm::cuda_stream_view stream=cudf::get_default_stream())
Construct a hash join object for subsequent probe calls.
typename cudf::detail::hash_join< cudf::hashing::detail::MurmurHash3_x86_32< cudf::hash_value_type > > impl_type
Implementation type.
Definition: join.hpp:302
std::size_t left_join_size(cudf::table_view const &probe, rmm::cuda_stream_view stream=cudf::get_default_stream()) const
std::size_t inner_join_size(cudf::table_view const &probe, rmm::cuda_stream_view stream=cudf::get_default_stream()) const
hash_join(cudf::table_view const &build, null_equality compare_nulls, rmm::cuda_stream_view stream=cudf::get_default_stream())
Construct a hash join object for subsequent probe calls.
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > inner_join(cudf::table_view const &probe, std::optional< std::size_t > output_size={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) const
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > full_join(cudf::table_view const &probe, std::optional< std::size_t > output_size={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) const
std::size_t full_join_size(cudf::table_view const &probe, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) const
Forward declaration for our Murmur Hash 3 implementation.
Definition: join.hpp:44
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:200
std::unique_ptr< rmm::device_uvector< size_type > > mixed_left_anti_join(table_view const &left_equality, table_view const &right_equality, table_view const &left_conditional, table_view const &right_conditional, ast::expression const &binary_predicate, null_equality compare_nulls=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns an index vector corresponding to all rows in the left tables for which there is no row in the...
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > conditional_full_join(table_view const &left, table_view const &right, ast::expression const &binary_predicate, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns a pair of row index vectors corresponding to all pairs of rows between the specified tables w...
std::size_t conditional_inner_join_size(table_view const &left, table_view const &right, ast::expression const &binary_predicate, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns the exact number of matches (rows) when performing a conditional inner join between the speci...
std::size_t conditional_left_semi_join_size(table_view const &left, table_view const &right, ast::expression const &binary_predicate, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns the exact number of matches (rows) when performing a conditional left semi join between the s...
std::pair< std::size_t, std::unique_ptr< rmm::device_uvector< size_type > > > mixed_inner_join_size(table_view const &left_equality, table_view const &right_equality, table_view const &left_conditional, table_view const &right_conditional, ast::expression const &binary_predicate, null_equality compare_nulls=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns the exact number of matches (rows) when performing a mixed inner join between the specified t...
std::unique_ptr< rmm::device_uvector< size_type > > mixed_left_semi_join(table_view const &left_equality, table_view const &right_equality, table_view const &left_conditional, table_view const &right_conditional, ast::expression const &binary_predicate, null_equality compare_nulls=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns an index vector corresponding to all rows in the left tables where the columns of the equalit...
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > left_join(cudf::table_view const &left_keys, cudf::table_view const &right_keys, null_equality compare_nulls=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns a pair of row index vectors corresponding to a left join between the specified tables.
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > full_join(cudf::table_view const &left_keys, cudf::table_view const &right_keys, null_equality compare_nulls=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns a pair of row index vectors corresponding to a full join between the specified tables.
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > mixed_full_join(table_view const &left_equality, table_view const &right_equality, table_view const &left_conditional, table_view const &right_conditional, ast::expression const &binary_predicate, null_equality compare_nulls=null_equality::EQUAL, std::optional< std::pair< std::size_t, device_span< size_type const >>> output_size_data={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns a pair of row index vectors corresponding to all pairs of rows between the specified tables w...
std::unique_ptr< rmm::device_uvector< size_type > > left_anti_join(cudf::table_view const &left_keys, cudf::table_view const &right_keys, null_equality compare_nulls=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns a vector of row indices corresponding to a left anti join between the specified tables.
std::unique_ptr< rmm::device_uvector< size_type > > left_semi_join(cudf::table_view const &left_keys, cudf::table_view const &right_keys, null_equality compare_nulls=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns a vector of row indices corresponding to a left semi-join between the specified tables.
std::size_t conditional_left_anti_join_size(table_view const &left, table_view const &right, ast::expression const &binary_predicate, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns the exact number of matches (rows) when performing a conditional left anti join between the s...
std::unique_ptr< rmm::device_uvector< size_type > > conditional_left_semi_join(table_view const &left, table_view const &right, ast::expression const &binary_predicate, std::optional< std::size_t > output_size={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns an index vector corresponding to all rows in the left table for which there exists some row i...
std::pair< std::size_t, std::unique_ptr< rmm::device_uvector< size_type > > > mixed_left_join_size(table_view const &left_equality, table_view const &right_equality, table_view const &left_conditional, table_view const &right_conditional, ast::expression const &binary_predicate, null_equality compare_nulls=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns the exact number of matches (rows) when performing a mixed left join between the specified ta...
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > mixed_inner_join(table_view const &left_equality, table_view const &right_equality, table_view const &left_conditional, table_view const &right_conditional, ast::expression const &binary_predicate, null_equality compare_nulls=null_equality::EQUAL, std::optional< std::pair< std::size_t, device_span< size_type const >>> output_size_data={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns a pair of row index vectors corresponding to all pairs of rows between the specified tables w...
nullable_join
The enum class to specify if any of the input join tables (build table and any later probe table) has...
Definition: join.hpp:290
std::size_t conditional_left_join_size(table_view const &left, table_view const &right, ast::expression const &binary_predicate, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns the exact number of matches (rows) when performing a conditional left join between the specif...
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > inner_join(cudf::table_view const &left_keys, cudf::table_view const &right_keys, null_equality compare_nulls=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns a pair of row index vectors corresponding to an inner join between the specified tables.
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > conditional_inner_join(table_view const &left, table_view const &right, ast::expression const &binary_predicate, std::optional< std::size_t > output_size={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns a pair of row index vectors corresponding to all pairs of rows between the specified tables w...
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > mixed_left_join(table_view const &left_equality, table_view const &right_equality, table_view const &left_conditional, table_view const &right_conditional, ast::expression const &binary_predicate, null_equality compare_nulls=null_equality::EQUAL, std::optional< std::pair< std::size_t, device_span< size_type const >>> output_size_data={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns a pair of row index vectors corresponding to all pairs of rows between the specified tables w...
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > conditional_left_join(table_view const &left, table_view const &right, ast::expression const &binary_predicate, std::optional< std::size_t > output_size={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns a pair of row index vectors corresponding to all pairs of rows between the specified tables w...
std::unique_ptr< cudf::table > cross_join(cudf::table_view const &left, cudf::table_view const &right, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Performs a cross join on two tables (left, right)
std::unique_ptr< rmm::device_uvector< size_type > > conditional_left_anti_join(table_view const &left, table_view const &right, ast::expression const &binary_predicate, std::optional< std::size_t > output_size={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Returns an index vector corresponding to all rows in the left table for which there does not exist an...
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
rmm::device_async_resource_ref get_current_device_resource_ref()
Get the current device memory resource reference.
cuda::mr::async_resource_ref< cuda::mr::device_accessible > device_async_resource_ref
null_equality
Enum to consider two nulls as equal or unequal.
Definition: types.hpp:151
cuDF interfaces
Definition: host_udf.hpp:37
bool has_nulls(table_view const &view)
Returns True if the table has nulls in any of its columns.
APIs for spans.
A generic expression that can be evaluated to return a value.
Definition: expressions.hpp:48
Device version of C++20 std::span with reduced feature set.
Definition: span.hpp:351
Class definitions for (mutable)_table_view
Type declarations for libcudf.