join.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2019-2023, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/ast/expressions.hpp>
20 #include <cudf/hashing.hpp>
22 #include <cudf/types.hpp>
23 #include <cudf/utilities/default_stream.hpp>
24 #include <cudf/utilities/span.hpp>
25 
26 #include <rmm/cuda_stream_view.hpp>
27 #include <rmm/device_uvector.hpp>
29 
30 #include <optional>
31 #include <utility>
32 #include <vector>
33 
34 namespace cudf {
35 
36 // forward declaration
37 namespace hashing::detail {
38 template <typename T>
40 } // namespace hashing::detail
41 namespace detail {
42 template <typename T>
43 class hash_join;
44 } // namespace detail
45 
/**
 * @brief Returns a pair of row index vectors corresponding to an inner join between the
 * specified tables.
 *
 * Takes the key columns of the left and right tables and a null-comparison policy that
 * defaults to null_equality::EQUAL.
 *
 * NOTE(review): this listing drops the final parameter line; per the signature index further
 * down the page it is `rmm::mr::device_memory_resource* mr` defaulting to
 * `rmm::mr::get_current_device_resource()`.
 */
84 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
85  std::unique_ptr<rmm::device_uvector<size_type>>>
86 inner_join(cudf::table_view const& left_keys,
87  cudf::table_view const& right_keys,
88  null_equality compare_nulls = null_equality::EQUAL,
90 
/**
 * @brief Returns a pair of row index vectors corresponding to a left join between the
 * specified tables.
 *
 * Takes the key columns of the left and right tables and a null-comparison policy that
 * defaults to null_equality::EQUAL.
 *
 * NOTE(review): this listing drops the final parameter line; per the signature index further
 * down the page it is `rmm::mr::device_memory_resource* mr` defaulting to
 * `rmm::mr::get_current_device_resource()`.
 */
124 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
125  std::unique_ptr<rmm::device_uvector<size_type>>>
126 left_join(cudf::table_view const& left_keys,
127  cudf::table_view const& right_keys,
128  null_equality compare_nulls = null_equality::EQUAL,
130 
/**
 * @brief Returns a pair of row index vectors corresponding to a full join between the
 * specified tables.
 *
 * Takes the key columns of the left and right tables and a null-comparison policy that
 * defaults to null_equality::EQUAL.
 *
 * NOTE(review): this listing drops the final parameter line; per the signature index further
 * down the page it is `rmm::mr::device_memory_resource* mr` defaulting to
 * `rmm::mr::get_current_device_resource()`.
 */
163 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
164  std::unique_ptr<rmm::device_uvector<size_type>>>
165 full_join(cudf::table_view const& left_keys,
166  cudf::table_view const& right_keys,
167  null_equality compare_nulls = null_equality::EQUAL,
169 
/**
 * @brief Returns a vector of row indices corresponding to a left semi-join between the
 * specified tables.
 *
 * Unlike the inner/left/full joins above, a single index vector is returned rather than a
 * pair.
 *
 * NOTE(review): this listing drops the final parameter line; per the signature index further
 * down the page it is `rmm::mr::device_memory_resource* mr` defaulting to
 * `rmm::mr::get_current_device_resource()`.
 */
192 std::unique_ptr<rmm::device_uvector<size_type>> left_semi_join(
193  cudf::table_view const& left_keys,
194  cudf::table_view const& right_keys,
195  null_equality compare_nulls = null_equality::EQUAL,
197 
/**
 * @brief Returns a vector of row indices corresponding to a left anti join between the
 * specified tables.
 *
 * Unlike the inner/left/full joins above, a single index vector is returned rather than a
 * pair.
 *
 * NOTE(review): this listing drops the final parameter line; per the signature index further
 * down the page it is `rmm::mr::device_memory_resource* mr` defaulting to
 * `rmm::mr::get_current_device_resource()`.
 */
223 std::unique_ptr<rmm::device_uvector<size_type>> left_anti_join(
224  cudf::table_view const& left_keys,
225  cudf::table_view const& right_keys,
226  null_equality compare_nulls = null_equality::EQUAL,
228 
/**
 * @brief Performs a cross join on two tables (left, right).
 *
 * Returns an owning pointer to a cudf::table, not index vectors like the other joins in this
 * header.
 *
 * NOTE(review): this listing drops the final parameter line; per the signature index further
 * down the page it is `rmm::mr::device_memory_resource* mr` defaulting to
 * `rmm::mr::get_current_device_resource()`.
 */
251 std::unique_ptr<cudf::table> cross_join(
252  cudf::table_view const& left,
253  cudf::table_view const& right,
255 
/**
 * @brief The enum class to specify if any of the input join tables (build table and any later
 * probe table) has nulls.
 *
 * The underlying type is bool and the enumerators use default values, so YES converts to
 * false (0) and NO converts to true (1). Compare against the enumerators by name and do not
 * cast to bool expecting YES to be truthy.
 */
264 enum class nullable_join : bool { YES, NO };
265 
/**
 * @brief Hash join that builds hash table in creation and probes results in subsequent
 * `*_join` member functions.
 *
 * NOTE(review): this listing omits several source lines of the class (the `impl_type` alias,
 * the opening lines of both constructors, and the name lines of the three probe members).
 * The comments on those fragments are taken from the signature index further down the page.
 */
273 class hash_join {
274  public:
277 
  // Default construction is disabled; both constructors in the signature index take a build
  // table.
278  hash_join() = delete;
279  ~hash_join();
  // Copying and moving are both disabled.
280  hash_join(hash_join const&) = delete;
281  hash_join(hash_join&&) = delete;
282  hash_join& operator=(hash_join const&) = delete;
283  hash_join& operator=(hash_join&&) = delete;
284 
  // Constructor fragments: only the `compare_nulls` parameter line of each survives here.
  // The signature index lists the two constructors as
  //   hash_join(build, compare_nulls, stream = cudf::get_default_stream())
  //   hash_join(build, has_nulls, compare_nulls, stream = cudf::get_default_stream())
  // both described as "Construct a hash join object for subsequent probe calls."
  // Which fragment belongs to which overload is not visible from these lines.
296  null_equality compare_nulls,
298 
307  null_equality compare_nulls,
309 
  // Probe member fragments: each returns a pair of row-index vectors and takes an optional
  // precomputed `output_size`. The name lines are missing; the signature index lists
  // inner_join, left_join and full_join with this shape (probe, output_size, stream, mr),
  // all const. Presumably they appear in that order -- TODO confirm against the header.
328  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
329  std::unique_ptr<rmm::device_uvector<size_type>>>
331  std::optional<std::size_t> output_size = {},
334 
353  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
354  std::unique_ptr<rmm::device_uvector<size_type>>>
356  std::optional<std::size_t> output_size = {},
359 
378  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
379  std::unique_ptr<rmm::device_uvector<size_type>>>
381  std::optional<std::size_t> output_size = {},
384 
  // Size query for a probe table; the result must not be ignored ([[nodiscard]]).
398  [[nodiscard]] std::size_t inner_join_size(
399  cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::get_default_stream()) const;
400 
  // Size query for a probe table; the result must not be ignored ([[nodiscard]]).
414  [[nodiscard]] std::size_t left_join_size(
415  cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::get_default_stream()) const;
416 
  // Size query for a full join. Per the signature index this overload also takes `stream`
  // and `mr` parameters (lines omitted here) and is const.
  // NOTE(review): unlike inner_join_size and left_join_size, this one is not marked
  // [[nodiscard]] -- looks like an inconsistency worth confirming.
432  std::size_t full_join_size(
433  cudf::table_view const& probe,
436 
437  private:
  // Owning, const pointer to the const implementation object. Per the signature index,
  // impl_type is cudf::detail::hash_join<cudf::hashing::detail::MurmurHash3_x86_32<
  // cudf::hash_value_type>>.
438  const std::unique_ptr<impl_type const> _impl;
439 };
440 
476 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
477  std::unique_ptr<rmm::device_uvector<size_type>>>
479  table_view const& left,
480  table_view const& right,
481  ast::expression const& binary_predicate,
482  std::optional<std::size_t> output_size = {},
484 
522 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
523  std::unique_ptr<rmm::device_uvector<size_type>>>
525  table_view const& right,
526  ast::expression const& binary_predicate,
527  std::optional<std::size_t> output_size = {},
529 
565 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
566  std::unique_ptr<rmm::device_uvector<size_type>>>
568  table_view const& right,
569  ast::expression const& binary_predicate,
571 
/**
 * @brief Returns an index vector corresponding to all rows in the left table for which there
 * exists some row in the right table satisfying the condition.
 *
 * NOTE(review): the description in this listing is cut off after "there exists some row";
 * the condition is presumably that `binary_predicate` evaluates to true -- confirm against
 * the header.
 *
 * `output_size` is optional and defaults to empty. The final parameter line is omitted here;
 * per the signature index it is `rmm::mr::device_memory_resource* mr` defaulting to
 * `rmm::mr::get_current_device_resource()`.
 */
604 std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_semi_join(
605  table_view const& left,
606  table_view const& right,
607  ast::expression const& binary_predicate,
608  std::optional<std::size_t> output_size = {},
610 
/**
 * @brief Returns an index vector corresponding to all rows in the left table for which there
 * does not exist any row in the right table satisfying the condition.
 *
 * NOTE(review): the description in this listing is cut off after "there does not exist";
 * the condition is presumably that `binary_predicate` evaluates to true -- confirm against
 * the header.
 *
 * `output_size` is optional and defaults to empty. The final parameter line is omitted here;
 * per the signature index it is `rmm::mr::device_memory_resource* mr` defaulting to
 * `rmm::mr::get_current_device_resource()`.
 */
643 std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_anti_join(
644  table_view const& left,
645  table_view const& right,
646  ast::expression const& binary_predicate,
647  std::optional<std::size_t> output_size = {},
649 
696 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
697  std::unique_ptr<rmm::device_uvector<size_type>>>
699  table_view const& left_equality,
700  table_view const& right_equality,
701  table_view const& left_conditional,
702  table_view const& right_conditional,
703  ast::expression const& binary_predicate,
704  null_equality compare_nulls = null_equality::EQUAL,
705  std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
707 
756 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
757  std::unique_ptr<rmm::device_uvector<size_type>>>
759  table_view const& left_equality,
760  table_view const& right_equality,
761  table_view const& left_conditional,
762  table_view const& right_conditional,
763  ast::expression const& binary_predicate,
764  null_equality compare_nulls = null_equality::EQUAL,
765  std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
767 
816 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
817  std::unique_ptr<rmm::device_uvector<size_type>>>
819  table_view const& left_equality,
820  table_view const& right_equality,
821  table_view const& left_conditional,
822  table_view const& right_conditional,
823  ast::expression const& binary_predicate,
824  null_equality compare_nulls = null_equality::EQUAL,
825  std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
827 
/**
 * @brief Returns an index vector corresponding to all rows in the left tables where the
 * columns of the equality tables match (description truncated in this listing).
 *
 * Takes separate equality and conditional views for each side, a predicate expression, and a
 * null-comparison policy defaulting to null_equality::EQUAL.
 *
 * NOTE(review): `output_size_data` is an optional (size, device span) pair; presumably it
 * carries a precomputed result from the matching `*_size` function -- TODO confirm.
 * The final parameter line is omitted here; per the signature index it is
 * `rmm::mr::device_memory_resource* mr` defaulting to
 * `rmm::mr::get_current_device_resource()`.
 */
869 std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_semi_join(
870  table_view const& left_equality,
871  table_view const& right_equality,
872  table_view const& left_conditional,
873  table_view const& right_conditional,
874  ast::expression const& binary_predicate,
875  null_equality compare_nulls = null_equality::EQUAL,
876  std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
878 
/**
 * @brief Returns an index vector corresponding to all rows in the left tables for which there
 * is no matching row in the right tables (description truncated in this listing).
 *
 * Takes separate equality and conditional views for each side, a predicate expression, and a
 * null-comparison policy defaulting to null_equality::EQUAL.
 *
 * NOTE(review): `output_size_data` is an optional (size, device span) pair; presumably it
 * carries a precomputed result from the matching `*_size` function -- TODO confirm.
 * The final parameter line is omitted here; per the signature index it is
 * `rmm::mr::device_memory_resource* mr` defaulting to
 * `rmm::mr::get_current_device_resource()`.
 */
921 std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_anti_join(
922  table_view const& left_equality,
923  table_view const& right_equality,
924  table_view const& left_conditional,
925  table_view const& right_conditional,
926  ast::expression const& binary_predicate,
927  null_equality compare_nulls = null_equality::EQUAL,
928  std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
930 
/**
 * @brief Returns the exact number of matches (rows) when performing a mixed inner join
 * between the specified tables.
 *
 * The result is a pair: a std::size_t plus an owning device vector of size_type.
 * NOTE(review): the meaning of the vector is not visible in this listing; presumably it is
 * the data later passed as `output_size_data` to the join itself -- TODO confirm.
 *
 * The final parameter line is omitted here; per the signature index it is
 * `rmm::mr::device_memory_resource* mr` defaulting to
 * `rmm::mr::get_current_device_resource()`.
 */
962 std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_inner_join_size(
963  table_view const& left_equality,
964  table_view const& right_equality,
965  table_view const& left_conditional,
966  table_view const& right_conditional,
967  ast::expression const& binary_predicate,
968  null_equality compare_nulls = null_equality::EQUAL,
970 
/**
 * @brief Returns the exact number of matches (rows) when performing a mixed left join between
 * the specified tables.
 *
 * The result is a pair: a std::size_t plus an owning device vector of size_type.
 * NOTE(review): the meaning of the vector is not visible in this listing; presumably it is
 * the data later passed as `output_size_data` to the join itself -- TODO confirm.
 *
 * The final parameter line is omitted here; per the signature index it is
 * `rmm::mr::device_memory_resource* mr` defaulting to
 * `rmm::mr::get_current_device_resource()`.
 */
1002 std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_left_join_size(
1003  table_view const& left_equality,
1004  table_view const& right_equality,
1005  table_view const& left_conditional,
1006  table_view const& right_conditional,
1007  ast::expression const& binary_predicate,
1008  null_equality compare_nulls = null_equality::EQUAL,
1010 
/**
 * @brief Returns the exact number of matches (rows) when performing a mixed left semi join
 * between the specified tables.
 *
 * The result is a pair: a std::size_t plus an owning device vector of size_type.
 * NOTE(review): the meaning of the vector is not visible in this listing; presumably it is
 * the data later passed as `output_size_data` to the join itself -- TODO confirm.
 *
 * The final parameter line is omitted here; per the signature index it is
 * `rmm::mr::device_memory_resource* mr` defaulting to
 * `rmm::mr::get_current_device_resource()`.
 */
1042 std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_left_semi_join_size(
1043  table_view const& left_equality,
1044  table_view const& right_equality,
1045  table_view const& left_conditional,
1046  table_view const& right_conditional,
1047  ast::expression const& binary_predicate,
1048  null_equality compare_nulls = null_equality::EQUAL,
1050 
/**
 * @brief Returns the exact number of matches (rows) when performing a mixed left anti join
 * between the specified tables.
 *
 * The result is a pair: a std::size_t plus an owning device vector of size_type.
 * NOTE(review): the meaning of the vector is not visible in this listing; presumably it is
 * the data later passed as `output_size_data` to the join itself -- TODO confirm.
 *
 * The final parameter line is omitted here; per the signature index it is
 * `rmm::mr::device_memory_resource* mr` defaulting to
 * `rmm::mr::get_current_device_resource()`.
 */
1080 std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_left_anti_join_size(
1081  table_view const& left_equality,
1082  table_view const& right_equality,
1083  table_view const& left_conditional,
1084  table_view const& right_conditional,
1085  ast::expression const& binary_predicate,
1086  null_equality compare_nulls = null_equality::EQUAL,
1088 
1107  table_view const& left,
1108  table_view const& right,
1109  ast::expression const& binary_predicate,
1111 
1130  table_view const& left,
1131  table_view const& right,
1132  ast::expression const& binary_predicate,
1134 
1153  table_view const& left,
1154  table_view const& right,
1155  ast::expression const& binary_predicate,
1157 
1176  table_view const& left,
1177  table_view const& right,
1178  ast::expression const& binary_predicate,
1181 } // namespace cudf
Hash join that builds hash table in creation and probes results in subsequent *_join member functions.
Definition: join.hpp:273
hash_join(cudf::table_view const &build, nullable_join has_nulls, null_equality compare_nulls, rmm::cuda_stream_view stream=cudf::get_default_stream())
Construct a hash join object for subsequent probe calls.
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > inner_join(cudf::table_view const &probe, std::optional< std::size_t > output_size={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource()) const
typename cudf::detail::hash_join< cudf::hashing::detail::MurmurHash3_x86_32< cudf::hash_value_type > > impl_type
Implementation type.
Definition: join.hpp:276
std::size_t left_join_size(cudf::table_view const &probe, rmm::cuda_stream_view stream=cudf::get_default_stream()) const
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > full_join(cudf::table_view const &probe, std::optional< std::size_t > output_size={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource()) const
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > left_join(cudf::table_view const &probe, std::optional< std::size_t > output_size={}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource()) const
std::size_t full_join_size(cudf::table_view const &probe, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource()) const
std::size_t inner_join_size(cudf::table_view const &probe, rmm::cuda_stream_view stream=cudf::get_default_stream()) const
hash_join(cudf::table_view const &build, null_equality compare_nulls, rmm::cuda_stream_view stream=cudf::get_default_stream())
Construct a hash join object for subsequent probe calls.
A set of cudf::column_view's of the same size.
Definition: table_view.hpp:187
std::size_t conditional_left_anti_join_size(table_view const &left, table_view const &right, ast::expression const &binary_predicate, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns the exact number of matches (rows) when performing a conditional left anti join between the specified tables where the predicate evaluates to true.
std::pair< std::size_t, std::unique_ptr< rmm::device_uvector< size_type > > > mixed_left_anti_join_size(table_view const &left_equality, table_view const &right_equality, table_view const &left_conditional, table_view const &right_conditional, ast::expression const &binary_predicate, null_equality compare_nulls=null_equality::EQUAL, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns the exact number of matches (rows) when performing a mixed left anti join between the specifi...
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > mixed_full_join(table_view const &left_equality, table_view const &right_equality, table_view const &left_conditional, table_view const &right_conditional, ast::expression const &binary_predicate, null_equality compare_nulls=null_equality::EQUAL, std::optional< std::pair< std::size_t, device_span< size_type const >>> output_size_data={}, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns a pair of row index vectors corresponding to all pairs of rows between the specified tables w...
std::unique_ptr< rmm::device_uvector< size_type > > conditional_left_anti_join(table_view const &left, table_view const &right, ast::expression const &binary_predicate, std::optional< std::size_t > output_size={}, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns an index vector corresponding to all rows in the left table for which there does not exist an...
std::unique_ptr< rmm::device_uvector< size_type > > mixed_left_semi_join(table_view const &left_equality, table_view const &right_equality, table_view const &left_conditional, table_view const &right_conditional, ast::expression const &binary_predicate, null_equality compare_nulls=null_equality::EQUAL, std::optional< std::pair< std::size_t, device_span< size_type const >>> output_size_data={}, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns an index vector corresponding to all rows in the left tables where the columns of the equalit...
std::unique_ptr< cudf::table > cross_join(cudf::table_view const &left, cudf::table_view const &right, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Performs a cross join on two tables (left, right)
std::unique_ptr< rmm::device_uvector< size_type > > left_anti_join(cudf::table_view const &left_keys, cudf::table_view const &right_keys, null_equality compare_nulls=null_equality::EQUAL, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns a vector of row indices corresponding to a left anti join between the specified tables.
std::unique_ptr< rmm::device_uvector< size_type > > left_semi_join(cudf::table_view const &left_keys, cudf::table_view const &right_keys, null_equality compare_nulls=null_equality::EQUAL, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns a vector of row indices corresponding to a left semi-join between the specified tables.
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > mixed_left_join(table_view const &left_equality, table_view const &right_equality, table_view const &left_conditional, table_view const &right_conditional, ast::expression const &binary_predicate, null_equality compare_nulls=null_equality::EQUAL, std::optional< std::pair< std::size_t, device_span< size_type const >>> output_size_data={}, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns a pair of row index vectors corresponding to all pairs of rows between the specified tables w...
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > full_join(cudf::table_view const &left_keys, cudf::table_view const &right_keys, null_equality compare_nulls=null_equality::EQUAL, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns a pair of row index vectors corresponding to a full join between the specified tables.
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > conditional_full_join(table_view const &left, table_view const &right, ast::expression const &binary_predicate, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns a pair of row index vectors corresponding to all pairs of rows between the specified tables w...
std::pair< std::size_t, std::unique_ptr< rmm::device_uvector< size_type > > > mixed_left_semi_join_size(table_view const &left_equality, table_view const &right_equality, table_view const &left_conditional, table_view const &right_conditional, ast::expression const &binary_predicate, null_equality compare_nulls=null_equality::EQUAL, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns the exact number of matches (rows) when performing a mixed left semi join between the specifi...
std::unique_ptr< rmm::device_uvector< size_type > > mixed_left_anti_join(table_view const &left_equality, table_view const &right_equality, table_view const &left_conditional, table_view const &right_conditional, ast::expression const &binary_predicate, null_equality compare_nulls=null_equality::EQUAL, std::optional< std::pair< std::size_t, device_span< size_type const >>> output_size_data={}, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns an index vector corresponding to all rows in the left tables for which there is no row in the...
nullable_join
The enum class to specify if any of the input join tables (build table and any later probe table) has nulls.
Definition: join.hpp:264
std::size_t conditional_left_join_size(table_view const &left, table_view const &right, ast::expression const &binary_predicate, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns the exact number of matches (rows) when performing a conditional left join between the specified tables where the predicate evaluates to true.
std::pair< std::size_t, std::unique_ptr< rmm::device_uvector< size_type > > > mixed_left_join_size(table_view const &left_equality, table_view const &right_equality, table_view const &left_conditional, table_view const &right_conditional, ast::expression const &binary_predicate, null_equality compare_nulls=null_equality::EQUAL, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns the exact number of matches (rows) when performing a mixed left join between the specified ta...
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > left_join(cudf::table_view const &left_keys, cudf::table_view const &right_keys, null_equality compare_nulls=null_equality::EQUAL, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns a pair of row index vectors corresponding to a left join between the specified tables.
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > conditional_inner_join(table_view const &left, table_view const &right, ast::expression const &binary_predicate, std::optional< std::size_t > output_size={}, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns a pair of row index vectors corresponding to all pairs of rows between the specified tables w...
std::size_t conditional_inner_join_size(table_view const &left, table_view const &right, ast::expression const &binary_predicate, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns the exact number of matches (rows) when performing a conditional inner join between the specified tables where the predicate evaluates to true.
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > conditional_left_join(table_view const &left, table_view const &right, ast::expression const &binary_predicate, std::optional< std::size_t > output_size={}, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns a pair of row index vectors corresponding to all pairs of rows between the specified tables w...
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > mixed_inner_join(table_view const &left_equality, table_view const &right_equality, table_view const &left_conditional, table_view const &right_conditional, ast::expression const &binary_predicate, null_equality compare_nulls=null_equality::EQUAL, std::optional< std::pair< std::size_t, device_span< size_type const >>> output_size_data={}, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns a pair of row index vectors corresponding to all pairs of rows between the specified tables w...
std::pair< std::size_t, std::unique_ptr< rmm::device_uvector< size_type > > > mixed_inner_join_size(table_view const &left_equality, table_view const &right_equality, table_view const &left_conditional, table_view const &right_conditional, ast::expression const &binary_predicate, null_equality compare_nulls=null_equality::EQUAL, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns the exact number of matches (rows) when performing a mixed inner join between the specified t...
std::pair< std::unique_ptr< rmm::device_uvector< size_type > >, std::unique_ptr< rmm::device_uvector< size_type > > > inner_join(cudf::table_view const &left_keys, cudf::table_view const &right_keys, null_equality compare_nulls=null_equality::EQUAL, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns a pair of row index vectors corresponding to an inner join between the specified tables.
std::size_t conditional_left_semi_join_size(table_view const &left, table_view const &right, ast::expression const &binary_predicate, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns the exact number of matches (rows) when performing a conditional left semi join between the specified tables where the predicate evaluates to true.
std::unique_ptr< rmm::device_uvector< size_type > > conditional_left_semi_join(table_view const &left, table_view const &right, ast::expression const &binary_predicate, std::optional< std::size_t > output_size={}, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
Returns an index vector corresponding to all rows in the left table for which there exists some row i...
device_memory_resource * get_current_device_resource()
null_equality
Enum to consider two nulls as equal or unequal.
Definition: types.hpp:135
@ EQUAL
nulls compare equal
cuDF interfaces
Definition: aggregation.hpp:34
rmm::cuda_stream_view const get_default_stream()
Get the current default stream.
bool has_nulls(table_view const &view)
Returns True if the table has nulls in any of its columns.
Definition: table_view.hpp:318
A generic expression that can be evaluated to return a value.
Definition: expressions.hpp:41
Device version of C++20 std::span with reduced feature set.
Definition: span.hpp:277
Class definitions for (mutable)_table_view
Type declarations for libcudf.