join.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2019-2024, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/ast/expressions.hpp>
20 #include <cudf/hashing.hpp>
22 #include <cudf/types.hpp>
24 #include <cudf/utilities/export.hpp>
25 #include <cudf/utilities/span.hpp>
26 
27 #include <rmm/cuda_stream_view.hpp>
28 #include <rmm/device_uvector.hpp>
30 #include <rmm/resource_ref.hpp>
31 
32 #include <optional>
33 #include <utility>
34 #include <vector>
35 
36 namespace CUDF_EXPORT cudf {
37 
43 enum class has_nested : bool { YES, NO };
44 
45 // forward declaration
46 namespace hashing::detail {
47 
51 template <typename T>
53 } // namespace hashing::detail
54 namespace detail {
55 
59 template <typename T>
60 class hash_join;
61 
65 template <cudf::has_nested HasNested>
67 } // namespace detail
68 
107 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
108  std::unique_ptr<rmm::device_uvector<size_type>>>
109 inner_join(cudf::table_view const& left_keys,
110  cudf::table_view const& right_keys,
111  null_equality compare_nulls = null_equality::EQUAL,
113 
147 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
148  std::unique_ptr<rmm::device_uvector<size_type>>>
149 left_join(cudf::table_view const& left_keys,
150  cudf::table_view const& right_keys,
151  null_equality compare_nulls = null_equality::EQUAL,
153 
186 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
187  std::unique_ptr<rmm::device_uvector<size_type>>>
188 full_join(cudf::table_view const& left_keys,
189  cudf::table_view const& right_keys,
190  null_equality compare_nulls = null_equality::EQUAL,
192 
215 std::unique_ptr<rmm::device_uvector<size_type>> left_semi_join(
216  cudf::table_view const& left_keys,
217  cudf::table_view const& right_keys,
218  null_equality compare_nulls = null_equality::EQUAL,
220 
246 std::unique_ptr<rmm::device_uvector<size_type>> left_anti_join(
247  cudf::table_view const& left_keys,
248  cudf::table_view const& right_keys,
249  null_equality compare_nulls = null_equality::EQUAL,
251 
274 std::unique_ptr<cudf::table> cross_join(
275  cudf::table_view const& left,
276  cudf::table_view const& right,
278 
287 enum class nullable_join : bool { YES, NO };
288 
296 class hash_join {
297  public:
300 
301  hash_join() = delete;
302  ~hash_join();
303  hash_join(hash_join const&) = delete;
304  hash_join(hash_join&&) = delete;
305  hash_join& operator=(hash_join const&) = delete;
306  hash_join& operator=(hash_join&&) = delete;
307 
319  null_equality compare_nulls,
321 
330  null_equality compare_nulls,
332 
351  [[nodiscard]] std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
352  std::unique_ptr<rmm::device_uvector<size_type>>>
354  std::optional<std::size_t> output_size = {},
357 
376  [[nodiscard]] std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
377  std::unique_ptr<rmm::device_uvector<size_type>>>
379  std::optional<std::size_t> output_size = {},
382 
401  [[nodiscard]] std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
402  std::unique_ptr<rmm::device_uvector<size_type>>>
404  std::optional<std::size_t> output_size = {},
407 
421  [[nodiscard]] std::size_t inner_join_size(
422  cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::get_default_stream()) const;
423 
437  [[nodiscard]] std::size_t left_join_size(
438  cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::get_default_stream()) const;
439 
455  [[nodiscard]] std::size_t full_join_size(
456  cudf::table_view const& probe,
459 
460  private:
461  const std::unique_ptr<impl_type const> _impl;
462 };
463 
473 // TODO: `HasNested` to be removed via dispatching
474 template <cudf::has_nested HasNested>
476  public:
477  distinct_hash_join() = delete;
479  distinct_hash_join(distinct_hash_join const&) = delete;
481  distinct_hash_join& operator=(distinct_hash_join const&) = delete;
482  distinct_hash_join& operator=(distinct_hash_join&&) = delete;
483 
495  cudf::table_view const& probe,
496  nullable_join has_nulls = nullable_join::YES,
497  null_equality compare_nulls = null_equality::EQUAL,
499 
511  [[nodiscard]] std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
512  std::unique_ptr<rmm::device_uvector<size_type>>>
515 
531  [[nodiscard]] std::unique_ptr<rmm::device_uvector<size_type>> left_join(
534 
535  private:
536  using impl_type = typename cudf::detail::distinct_hash_join<HasNested>;
537 
538  std::unique_ptr<impl_type> _impl;
539 };
540 
576 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
577  std::unique_ptr<rmm::device_uvector<size_type>>>
579  table_view const& right,
580  ast::expression const& binary_predicate,
581  std::optional<std::size_t> output_size = {},
583 
621 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
622  std::unique_ptr<rmm::device_uvector<size_type>>>
624  table_view const& right,
625  ast::expression const& binary_predicate,
626  std::optional<std::size_t> output_size = {},
628 
664 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
665  std::unique_ptr<rmm::device_uvector<size_type>>>
667  table_view const& right,
668  ast::expression const& binary_predicate,
670 
703 std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_semi_join(
704  table_view const& left,
705  table_view const& right,
706  ast::expression const& binary_predicate,
707  std::optional<std::size_t> output_size = {},
709 
742 std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_anti_join(
743  table_view const& left,
744  table_view const& right,
745  ast::expression const& binary_predicate,
746  std::optional<std::size_t> output_size = {},
748 
795 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
796  std::unique_ptr<rmm::device_uvector<size_type>>>
798  table_view const& left_equality,
799  table_view const& right_equality,
800  table_view const& left_conditional,
801  table_view const& right_conditional,
802  ast::expression const& binary_predicate,
803  null_equality compare_nulls = null_equality::EQUAL,
804  std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
806 
855 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
856  std::unique_ptr<rmm::device_uvector<size_type>>>
858  table_view const& left_equality,
859  table_view const& right_equality,
860  table_view const& left_conditional,
861  table_view const& right_conditional,
862  ast::expression const& binary_predicate,
863  null_equality compare_nulls = null_equality::EQUAL,
864  std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
866 
915 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
916  std::unique_ptr<rmm::device_uvector<size_type>>>
918  table_view const& left_equality,
919  table_view const& right_equality,
920  table_view const& left_conditional,
921  table_view const& right_conditional,
922  ast::expression const& binary_predicate,
923  null_equality compare_nulls = null_equality::EQUAL,
924  std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
926 
965 std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_semi_join(
966  table_view const& left_equality,
967  table_view const& right_equality,
968  table_view const& left_conditional,
969  table_view const& right_conditional,
970  ast::expression const& binary_predicate,
971  null_equality compare_nulls = null_equality::EQUAL,
973 
1013 std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_anti_join(
1014  table_view const& left_equality,
1015  table_view const& right_equality,
1016  table_view const& left_conditional,
1017  table_view const& right_conditional,
1018  ast::expression const& binary_predicate,
1019  null_equality compare_nulls = null_equality::EQUAL,
1021 
1053 std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_inner_join_size(
1054  table_view const& left_equality,
1055  table_view const& right_equality,
1056  table_view const& left_conditional,
1057  table_view const& right_conditional,
1058  ast::expression const& binary_predicate,
1059  null_equality compare_nulls = null_equality::EQUAL,
1061 
1093 std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_left_join_size(
1094  table_view const& left_equality,
1095  table_view const& right_equality,
1096  table_view const& left_conditional,
1097  table_view const& right_conditional,
1098  ast::expression const& binary_predicate,
1099  null_equality compare_nulls = null_equality::EQUAL,
1101 
1120  table_view const& left,
1121  table_view const& right,
1122  ast::expression const& binary_predicate,
1124 
1143  table_view const& left,
1144  table_view const& right,
1145  ast::expression const& binary_predicate,
1147 
1166  table_view const& left,
1167  table_view const& right,
1168  ast::expression const& binary_predicate,
1170 
1189  table_view const& left,
1190  table_view const& right,
1191  ast::expression const& binary_predicate,
1194 } // namespace CUDF_EXPORT cudf
Forward declaration for our distinct hash join.
Definition: join.hpp:66
Forward declaration for our hash join.
Definition: join.hpp:60
Distinct hash join that builds hash table in creation and probes results in subsequent *_join member ...
Definition: join.hpp:475
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > inner_join(rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) const
Returns the row indices that can be used to construct the result of performing an inner join between ...
std::unique_ptr< rmm::device_uvector< size_type > > left_join(rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) const
Returns the build table indices that can be used to construct the result of performing a left join be...
distinct_hash_join(cudf::table_view const &build, cudf::table_view const &probe, nullable_join has_nulls=nullable_join::YES, null_equality compare_nulls=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream())
Constructs a distinct hash join object for subsequent probe calls.
Hash join that builds hash table in creation and probes results in subsequent *_join member functions...
Definition: join.hpp:296
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > left_join(cudf::table_view const &probe, std::optional< std::size_t > output_size={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) const
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > full_join(cudf::table_view const &probe, std::optional< std::size_t > output_size={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) const
std::size_t full_join_size(cudf::table_view const &probe, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) const
hash_join(cudf::table_view const &build, nullable_join has_nulls, null_equality compare_nulls, rmm::cuda_stream_view stream=cudf::get_default_stream())
Construct a hash join object for subsequent probe calls.
typename cudf::detail::hash_join< cudf::hashing::detail::MurmurHash3_x86_32< cudf::hash_value_type > > impl_type
Implementation type.
Definition: join.hpp:299
std::size_t left_join_size(cudf::table_view const &probe, rmm::cuda_stream_view stream=cudf::get_default_stream()) const
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > inner_join(cudf::table_view const &probe, std::optional< std::size_t > output_size={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) const
std::size_t inner_join_size(cudf::table_view const &probe, rmm::cuda_stream_view stream=cudf::get_default_stream()) const
hash_join(cudf::table_view const &build, null_equality compare_nulls, rmm::cuda_stream_view stream=cudf::get_default_stream())
Construct a hash join object for subsequent probe calls.
Forward declaration for our Murmur Hash 3 implementation.
Definition: join.hpp:52
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:200
std::size_t conditional_left_semi_join_size(table_view const &left, table_view const &right, ast::expression const &binary_predicate, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Returns the exact number of matches (rows) when performing a conditional left semi join between the s...
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > conditional_full_join(table_view const &left, table_view const &right, ast::expression const &binary_predicate, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Returns a pair of row index vectors corresponding to all pairs of rows between the specified tables w...
std::unique_ptr< rmm::device_uvector< size_type > > mixed_left_anti_join(table_view const &left_equality, table_view const &right_equality, table_view const &left_conditional, table_view const &right_conditional, ast::expression const &binary_predicate, null_equality compare_nulls=null_equality::EQUAL, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Returns an index vector corresponding to all rows in the left tables for which there is no row in the...
std::unique_ptr< rmm::device_uvector< size_type > > left_semi_join(cudf::table_view const &left_keys, cudf::table_view const &right_keys, null_equality compare_nulls=null_equality::EQUAL, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Returns a vector of row indices corresponding to a left semi-join between the specified tables.
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > conditional_inner_join(table_view const &left, table_view const &right, ast::expression const &binary_predicate, std::optional< std::size_t > output_size={}, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Returns a pair of row index vectors corresponding to all pairs of rows between the specified tables w...
std::size_t conditional_left_anti_join_size(table_view const &left, table_view const &right, ast::expression const &binary_predicate, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Returns the exact number of matches (rows) when performing a conditional left anti join between the s...
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > conditional_left_join(table_view const &left, table_view const &right, ast::expression const &binary_predicate, std::optional< std::size_t > output_size={}, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Returns a pair of row index vectors corresponding to all pairs of rows between the specified tables w...
std::pair< std::size_t, std::unique_ptr< rmm::device_uvector< size_type > > > mixed_inner_join_size(table_view const &left_equality, table_view const &right_equality, table_view const &left_conditional, table_view const &right_conditional, ast::expression const &binary_predicate, null_equality compare_nulls=null_equality::EQUAL, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Returns the exact number of matches (rows) when performing a mixed inner join between the specified t...
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > full_join(cudf::table_view const &left_keys, cudf::table_view const &right_keys, null_equality compare_nulls=null_equality::EQUAL, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Returns a pair of row index vectors corresponding to a full join between the specified tables.
std::unique_ptr< rmm::device_uvector< size_type > > conditional_left_anti_join(table_view const &left, table_view const &right, ast::expression const &binary_predicate, std::optional< std::size_t > output_size={}, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Returns an index vector corresponding to all rows in the left table for which there does not exist an...
std::size_t conditional_inner_join_size(table_view const &left, table_view const &right, ast::expression const &binary_predicate, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Returns the exact number of matches (rows) when performing a conditional inner join between the speci...
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > mixed_left_join(table_view const &left_equality, table_view const &right_equality, table_view const &left_conditional, table_view const &right_conditional, ast::expression const &binary_predicate, null_equality compare_nulls=null_equality::EQUAL, std::optional< std::pair< std::size_t, device_span< size_type const >>> output_size_data={}, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Returns a pair of row index vectors corresponding to all pairs of rows between the specified tables w...
std::unique_ptr< rmm::device_uvector< size_type > > left_anti_join(cudf::table_view const &left_keys, cudf::table_view const &right_keys, null_equality compare_nulls=null_equality::EQUAL, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Returns a vector of row indices corresponding to a left anti join between the specified tables.
std::unique_ptr< rmm::device_uvector< size_type > > mixed_left_semi_join(table_view const &left_equality, table_view const &right_equality, table_view const &left_conditional, table_view const &right_conditional, ast::expression const &binary_predicate, null_equality compare_nulls=null_equality::EQUAL, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Returns an index vector corresponding to all rows in the left tables where the columns of the equalit...
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > mixed_inner_join(table_view const &left_equality, table_view const &right_equality, table_view const &left_conditional, table_view const &right_conditional, ast::expression const &binary_predicate, null_equality compare_nulls=null_equality::EQUAL, std::optional< std::pair< std::size_t, device_span< size_type const >>> output_size_data={}, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Returns a pair of row index vectors corresponding to all pairs of rows between the specified tables w...
has_nested
Enum to indicate whether the distinct join table has nested columns or not.
Definition: join.hpp:43
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > mixed_full_join(table_view const &left_equality, table_view const &right_equality, table_view const &left_conditional, table_view const &right_conditional, ast::expression const &binary_predicate, null_equality compare_nulls=null_equality::EQUAL, std::optional< std::pair< std::size_t, device_span< size_type const >>> output_size_data={}, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Returns a pair of row index vectors corresponding to all pairs of rows between the specified tables w...
std::pair< std::size_t, std::unique_ptr< rmm::device_uvector< size_type > > > mixed_left_join_size(table_view const &left_equality, table_view const &right_equality, table_view const &left_conditional, table_view const &right_conditional, ast::expression const &binary_predicate, null_equality compare_nulls=null_equality::EQUAL, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Returns the exact number of matches (rows) when performing a mixed left join between the specified ta...
nullable_join
The enum class to specify if any of the input join tables (build table and any later probe table) has...
Definition: join.hpp:287
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > left_join(cudf::table_view const &left_keys, cudf::table_view const &right_keys, null_equality compare_nulls=null_equality::EQUAL, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Returns a pair of row index vectors corresponding to a left join between the specified tables.
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > inner_join(cudf::table_view const &left_keys, cudf::table_view const &right_keys, null_equality compare_nulls=null_equality::EQUAL, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Returns a pair of row index vectors corresponding to an inner join between the specified tables.
std::unique_ptr< cudf::table > cross_join(cudf::table_view const &left, cudf::table_view const &right, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Performs a cross join on two tables (left, right)
std::unique_ptr< rmm::device_uvector< size_type > > conditional_left_semi_join(table_view const &left, table_view const &right, ast::expression const &binary_predicate, std::optional< std::size_t > output_size={}, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Returns an index vector corresponding to all rows in the left table for which there exists some row i...
std::size_t conditional_left_join_size(table_view const &left, table_view const &right, ast::expression const &binary_predicate, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource())
Returns the exact number of matches (rows) when performing a conditional left join between the specif...
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
cuda::mr::async_resource_ref< cuda::mr::device_accessible > device_async_resource_ref
device_memory_resource * get_current_device_resource()
null_equality
Enum to consider two nulls as equal or unequal.
Definition: types.hpp:151
cuDF interfaces
Definition: aggregation.hpp:35
bool has_nulls(table_view const &view)
Returns True if the table has nulls in any of its columns.
APIs for spans.
A generic expression that can be evaluated to return a value.
Definition: expressions.hpp:46
Device version of C++20 std::span with reduced feature set.
Definition: span.hpp:328
Class definitions for (mutable)_table_view
Type declarations for libcudf.