Files
file	stream_compaction.hpp
	Column APIs for filtering rows.

Enumerations
enum class	cudf::duplicate_keep_option { cudf::KEEP_ANY = 0 , cudf::KEEP_FIRST , cudf::KEEP_LAST , cudf::KEEP_NONE }
	Choices for drop_duplicates API for retainment of duplicate rows. More...

Functions
std::unique_ptr< table >	cudf::drop_nulls (table_view const &input, std::vector< size_type > const &keys, cudf::size_type keep_threshold, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Filters a table to remove null elements with threshold count. More...

std::unique_ptr< table >	cudf::drop_nulls (table_view const &input, std::vector< size_type > const &keys, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Filters a table to remove null elements. More...

std::unique_ptr< table >	cudf::drop_nans (table_view const &input, std::vector< size_type > const &keys, cudf::size_type keep_threshold, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Filters a table to remove NANs with threshold count. More...

std::unique_ptr< table >	cudf::drop_nans (table_view const &input, std::vector< size_type > const &keys, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Filters a table to remove NANs. More...

std::unique_ptr< table >	cudf::apply_boolean_mask (table_view const &input, column_view const &boolean_mask, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Filters `input` using `boolean_mask` of boolean values as a mask. More...

std::unique_ptr< table >	cudf::unique (table_view const &input, std::vector< size_type > const &keys, duplicate_keep_option keep, null_equality nulls_equal=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Create a new table with consecutive duplicate rows removed. More...

std::unique_ptr< table >	cudf::distinct (table_view const &input, std::vector< size_type > const &keys, duplicate_keep_option keep=duplicate_keep_option::KEEP_ANY, null_equality nulls_equal=null_equality::EQUAL, nan_equality nans_equal=nan_equality::ALL_EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Create a new table without duplicate rows. More...

std::unique_ptr< column >	cudf::distinct_indices (table_view const &input, duplicate_keep_option keep=duplicate_keep_option::KEEP_ANY, null_equality nulls_equal=null_equality::EQUAL, nan_equality nans_equal=nan_equality::ALL_EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Create a column of indices of all distinct rows in the input table. More...

std::unique_ptr< table >	cudf::stable_distinct (table_view const &input, std::vector< size_type > const &keys, duplicate_keep_option keep=duplicate_keep_option::KEEP_ANY, null_equality nulls_equal=null_equality::EQUAL, nan_equality nans_equal=nan_equality::ALL_EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Create a new table without duplicate rows, preserving input order. More...

cudf::size_type	cudf::unique_count (column_view const &input, null_policy null_handling, nan_policy nan_handling, rmm::cuda_stream_view stream=cudf::get_default_stream())
	Count the number of consecutive groups of equivalent rows in a column. More...

cudf::size_type	cudf::unique_count (table_view const &input, null_equality nulls_equal=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream())
	Count the number of consecutive groups of equivalent rows in a table. More...

cudf::size_type	cudf::distinct_count (column_view const &input, null_policy null_handling, nan_policy nan_handling, rmm::cuda_stream_view stream=cudf::get_default_stream())
	Count the distinct elements in the column_view. More...

cudf::size_type	cudf::distinct_count (table_view const &input, null_equality nulls_equal=null_equality::EQUAL, rmm::cuda_stream_view stream=cudf::get_default_stream())
	Count the distinct rows in a table. More...

std::vector< std::unique_ptr< column > >	cudf::filter (std::vector< column_view > const &predicate_columns, std::string const &predicate_udf, std::vector< column_view > const &filter_columns, bool is_ptx, std::optional< void * > user_data=std::nullopt, null_aware is_null_aware=null_aware::NO, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Creates a new column by applying a filter function against every element of the input columns. More...

std::unique_ptr< table >	cudf::filter (table_view const &predicate_table, ast::expression const &predicate_expr, table_view const &filter_table, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Creates a new table by applying a filter function against every element of the input columns. More...

Detailed Description

Enumeration Type Documentation

◆ duplicate_keep_option

enum cudf::duplicate_keep_option

strong

Choices for drop_duplicates API for retainment of duplicate rows.

Enumerator
KEEP_ANY	Keep an unspecified occurrence.
KEEP_FIRST	Keep first occurrence.
KEEP_LAST	Keep last occurrence.
KEEP_NONE	Keep no (remove all) occurrences of duplicates.

Definition at line 217 of file stream_compaction.hpp.

Function Documentation

◆ apply_boolean_mask()

std::unique_ptr<table> cudf::apply_boolean_mask	(	table_view const &	input,
		column_view const &	boolean_mask,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Filters input using boolean_mask of boolean values as a mask.

Given an input table_view and a mask column_view, an element i from each column_view of the input is copied to the corresponding output column if the corresponding element i in the mask is non-null and true. This operation is stable: the input order is preserved.

Note: if input.num_rows() is zero, there is no error, and an empty table is returned.

Exceptions

cudf::logic_error	if `input.num_rows() != boolean_mask.size()`.
cudf::logic_error	if `boolean_mask` is not `type_id::BOOL8` type.

Parameters

[in]	input	The input table_view to filter
[in]	boolean_mask	A nullable column_view of type type_id::BOOL8 used as a mask to filter the `input`.
[in]	stream	CUDA stream used for device memory operations and kernel launches
[in]	mr	Device memory resource used to allocate the returned table's device memory

Returns: Table containing copy of all rows of input passing the filter defined by boolean_mask.

◆ distinct()

std::unique_ptr<table> cudf::distinct	(	table_view const &	input,
		std::vector< size_type > const &	keys,
		duplicate_keep_option	keep = `duplicate_keep_option::KEEP_ANY`,
		null_equality	nulls_equal = `null_equality::EQUAL`,
		nan_equality	nans_equal = `nan_equality::ALL_EQUAL`,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Create a new table without duplicate rows.

Given an input table_view, each row is copied to the output table to create a set of distinct rows. If there are duplicate rows, which row is copied depends on the keep parameter.

The order of rows in the output table is not specified.

Performance hint: if the input is pre-sorted, cudf::unique can produce an equivalent result (i.e., same set of output rows) but with less running time than cudf::distinct.

Parameters

input	The input table
keys	Vector of indices indicating key columns in the `input` table
keep	Copy any, first, last, or none of the found duplicates
nulls_equal	Flag to specify whether null elements should be considered as equal
nans_equal	Flag to specify whether NaN elements should be considered as equal
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned table

Returns: Table with distinct rows in an unspecified order

◆ distinct_count() [1/2]

cudf::size_type cudf::distinct_count	(	column_view const &	input,
		null_policy	null_handling,
		nan_policy	nan_handling,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`
	)

Count the distinct elements in the column_view.

If nulls_equal == nulls_equal::UNEQUAL, all nulls are distinct.

Given an input column_view, number of distinct elements in this column_view is returned.

If null_handling is null_policy::EXCLUDE and nan_handling is nan_policy::NAN_IS_NULL, both NaN and null values are ignored. If null_handling is null_policy::EXCLUDE and nan_handling is nan_policy::NAN_IS_VALID, only null is ignored, NaN is considered in distinct count.

nulls are handled as equal.

Parameters

[in]	input	The column_view whose distinct elements will be counted
[in]	null_handling	flag to include or ignore `null` while counting
[in]	nan_handling	flag to consider `NaN==null` or not
[in]	stream	CUDA stream used for device memory operations and kernel launches

Returns: number of distinct elements in the column

◆ distinct_count() [2/2]

cudf::size_type cudf::distinct_count	(	table_view const &	input,
		null_equality	nulls_equal = `null_equality::EQUAL`,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`
	)

Count the distinct rows in a table.

Parameters

[in]	input	Table whose distinct rows will be counted
[in]	nulls_equal	flag to denote if null elements should be considered equal. nulls are not equal if null_equality::UNEQUAL.
[in]	stream	CUDA stream used for device memory operations and kernel launches

Returns: number of distinct rows in the table

◆ distinct_indices()

std::unique_ptr<column> cudf::distinct_indices	(	table_view const &	input,
		duplicate_keep_option	keep = `duplicate_keep_option::KEEP_ANY`,
		null_equality	nulls_equal = `null_equality::EQUAL`,
		nan_equality	nans_equal = `nan_equality::ALL_EQUAL`,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Create a column of indices of all distinct rows in the input table.

Given an input table_view, an output vector of all row indices of the distinct rows is generated. If there are duplicate rows, which index is kept depends on the keep parameter.

Parameters

input	The input table
keep	Get index of any, first, last, or none of the found duplicates
nulls_equal	Flag to specify whether null elements should be considered as equal
nans_equal	Flag to specify whether NaN elements should be considered as equal
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned column

Returns: Column containing the result indices

◆ drop_nans() [1/2]

std::unique_ptr<table> cudf::drop_nans	(	table_view const &	input,
		std::vector< size_type > const &	keys,
		cudf::size_type	keep_threshold,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Filters a table to remove NANs with threshold count.

Filters the rows of the input considering specified columns indicated in keys for NANs. These key columns must be of floating-point type.

Given an input table_view, row i from the input columns is copied to the output if the same row i of keys has at least keep_threshold non-NAN elements.

This operation is stable: the input order is preserved in the output.

input   {col1: {1.0, 2.0, 3.0, NAN},
         col2: {4.0, null, NAN, NAN},
         col3: {7.0, NAN, NAN, NAN}}
keys = {0, 1, 2} // All columns
keep_threshold = 2
 
output {col1: {1.0, 2.0}
        col2: {4.0, null}
        col3: {7.0, NAN}}

Note: if input.num_rows() is zero, or keys is empty, there is no error, and an empty table is returned

Exceptions

cudf::logic_error if the keys columns are not of floating-point type.

Parameters

[in]	input	The input `table_view` to filter
[in]	keys	vector of indices representing key columns from `input`
[in]	keep_threshold	The minimum number of non-NAN elements in a row required to keep the row.
[in]	stream	CUDA stream used for device memory operations and kernel launches
[in]	mr	Device memory resource used to allocate the returned table's device memory

Returns: Table containing all rows of the input with at least keep_threshold non-NAN elements in keys.

◆ drop_nans() [2/2]

std::unique_ptr<table> cudf::drop_nans	(	table_view const &	input,
		std::vector< size_type > const &	keys,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Filters a table to remove NANs.

Filters the rows of the input considering specified columns indicated in keys for NANs. These key columns must be of floating-point type.

input   {col1: {1.0, 2.0, 3.0, NAN},
         col2: {4.0, null, NAN, NAN},
         col3: {null, NAN, NAN, NAN}}
keys = {0, 1, 2} // All columns
// keep_threshold defaults to 3, the number of columns in keys
 
output {col1: {1.0}
        col2: {4.0}
        col3: {null}}

Same as drop_nans but defaults keep_threshold to the number of columns in keys.

Parameters

[in]	input	The input `table_view` to filter
[in]	keys	vector of indices representing key columns from `input`
[in]	stream	CUDA stream used for device memory operations and kernel launches
[in]	mr	Device memory resource used to allocate the returned table's device memory

Returns: Table containing all rows of the input without NANs in the columns of keys.

◆ drop_nulls() [1/2]

std::unique_ptr<table> cudf::drop_nulls	(	table_view const &	input,
		std::vector< size_type > const &	keys,
		cudf::size_type	keep_threshold,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Filters a table to remove null elements with threshold count.

Filters the rows of the input considering specified columns indicated in keys for validity / null values.

Given an input table_view, row i from the input columns is copied to the output if the same row i of keys has at least keep_threshold non-null fields.

This operation is stable: the input order is preserved in the output.

Any non-nullable column in the input is treated as all non-null.

input   {col1: {1, 2,    3,    null},
         col2: {4, 5,    null, null},
         col3: {7, null, null, null}}
keys = {0, 1, 2} // All columns
keep_threshold = 2
 
output {col1: {1, 2}
        col2: {4, 5}
        col3: {7, null}}

Note: if input.num_rows() is zero, or keys is empty or has no nulls, there is no error, and an empty table is returned

Parameters

[in]	input	The input `table_view` to filter
[in]	keys	vector of indices representing key columns from `input`
[in]	keep_threshold	The minimum number of non-null fields in a row required to keep the row.
[in]	stream	CUDA stream used for device memory operations and kernel launches
[in]	mr	Device memory resource used to allocate the returned table's device memory

Returns: Table containing all rows of the input with at least keep_threshold non-null fields in keys.

◆ drop_nulls() [2/2]

std::unique_ptr<table> cudf::drop_nulls	(	table_view const &	input,
		std::vector< size_type > const &	keys,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Filters a table to remove null elements.

Filters the rows of the input considering specified columns indicated in keys for validity / null values.

input   {col1: {1, 2,    3,    null},
         col2: {4, 5,    null, null},
         col3: {7, null, null, null}}
keys = {0, 1, 2} //All columns
 
output {col1: {1}
        col2: {4}
        col3: {7}}

Same as drop_nulls but defaults keep_threshold to the number of columns in keys.

Parameters

[in]	input	The input `table_view` to filter
[in]	keys	vector of indices representing key columns from `input`
[in]	stream	CUDA stream used for device memory operations and kernel launches
[in]	mr	Device memory resource used to allocate the returned table's device memory

Returns: Table containing all rows of the input without nulls in the columns of keys.

◆ filter() [1/2]

std::vector<std::unique_ptr<column> > cudf::filter	(	std::vector< column_view > const &	predicate_columns,
		std::string const &	predicate_udf,
		std::vector< column_view > const &	filter_columns,
		bool	is_ptx,
		std::optional< void * >	user_data = `std::nullopt`,
		null_aware	is_null_aware = `null_aware::NO`,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Creates a new column by applying a filter function against every element of the input columns.

Null values in the input columns are considered as not matching the filter.

Computes: out[i]... = predicate(columns[i]... ) ? (columns[i]...): not-applied.

Note that for every scalar in columns (columns of size 1), columns[i] == input[0]

Exceptions

std::invalid_argument	if any of the input columns have different sizes (except scalars of size 1)
std::invalid_argument	if the output or any of the inputs are not fixed-width or string types
cudf::logic_error	if JIT is not supported by the runtime
std::invalid_argument	if the size of `copy_mask` does not match the number of input columns

The size of the resulting column is the size of the largest column.

Parameters

predicate_columns	Immutable views of the predicate columns
predicate_udf	The PTX/CUDA string of the predicate function to apply
filter_columns	Immutable view of the columns to be filtered
is_ptx	true: the UDF is treated as PTX code; false: the UDF is treated as CUDA code
user_data	User-defined device data to pass to the UDF.
is_null_aware	Signifies the UDF will receive row inputs as optional values
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned column's device memory

Returns: The filtered target columns

◆ filter() [2/2]

std::unique_ptr<table> cudf::filter	(	table_view const &	predicate_table,
		ast::expression const &	predicate_expr,
		table_view const &	filter_table,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Creates a new table by applying a filter function against every element of the input columns.

Null values in the input columns are considered as not matching the filter.

Computes: out[i]... = predicate(columns[i]... ) ? (columns[i]...): not-applied.

Exceptions

std::invalid_argument	if the output or any of the inputs are not fixed-width or string types
cudf::logic_error	if JIT is not supported by the runtime

Parameters

predicate_table	The table used for predicate expression evaluation
predicate_expr	The predicate filter expression
filter_table	The table to be filtered
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned table's device memory

Returns: The filtered table

◆ stable_distinct()

std::unique_ptr<table> cudf::stable_distinct	(	table_view const &	input,
		std::vector< size_type > const &	keys,
		duplicate_keep_option	keep = `duplicate_keep_option::KEEP_ANY`,
		null_equality	nulls_equal = `null_equality::EQUAL`,
		nan_equality	nans_equal = `nan_equality::ALL_EQUAL`,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Create a new table without duplicate rows, preserving input order.

Given an input table_view, each row is copied to the output table to create a set of distinct rows. The input row order is preserved. If there are duplicate rows, which row is copied depends on the keep parameter.

This API produces the same output rows as cudf::distinct, but with input order preserved.

Note that when keep is KEEP_ANY, the choice of which duplicate row to keep is arbitrary, but the returned table will retain the input order. That is, if the key column contains 1, 2, 1 and a second column contains the values 3, 4, 5, the result could contain values 3, 4 or 4, 5 but not 4, 3 or 5, 4.

Parameters

input	The input table
keys	Vector of indices indicating key columns in the `input` table
keep	Copy any, first, last, or none of the found duplicates
nulls_equal	Flag to specify whether null elements should be considered as equal
nans_equal	Flag to specify whether NaN elements should be considered as equal
stream	CUDA stream used for device memory operations and kernel launches.
mr	Device memory resource used to allocate the returned table

Returns: Table with distinct rows, preserving input order

◆ unique()

std::unique_ptr<table> cudf::unique	(	table_view const &	input,
		std::vector< size_type > const &	keys,
		duplicate_keep_option	keep,
		null_equality	nulls_equal = `null_equality::EQUAL`,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Create a new table with consecutive duplicate rows removed.

Given an input table_view, each row is copied to the output table to create a set of distinct rows. If there are duplicate rows, which row is copied depends on the keep parameter.

The order of rows in the output table remains the same as in the input.

A row is distinct if there are no equivalent rows in the table. A row is unique if there is no adjacent equivalent row. That is, keeping distinct rows removes all duplicates in the table/column, while keeping unique rows only removes duplicates from consecutive groupings.

Performance hint: if the input is pre-sorted, cudf::unique can produce an equivalent result (i.e., same set of output rows) but with less running time than cudf::distinct.

Exceptions

cudf::logic_error if the keys column indices are out of bounds in the input table.

Parameters

[in]	input	input table_view to copy only unique rows
[in]	keys	vector of indices representing key columns from `input`
[in]	keep	keep any, first, last, or none of the found duplicates
[in]	nulls_equal	flag to denote nulls are equal if null_equality::EQUAL, nulls are not equal if null_equality::UNEQUAL
[in]	stream	CUDA stream used for device memory operations and kernel launches
[in]	mr	Device memory resource used to allocate the returned table's device memory

Returns: Table with unique rows from each sequence of equivalent rows as specified by keep

◆ unique_count() [1/2]

cudf::size_type cudf::unique_count	(	column_view const &	input,
		null_policy	null_handling,
		nan_policy	nan_handling,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`
	)

Count the number of consecutive groups of equivalent rows in a column.

If null_handling is null_policy::EXCLUDE and nan_handling is nan_policy::NAN_IS_NULL, both NaN and null values are ignored. If null_handling is null_policy::EXCLUDE and nan_handling is nan_policy::NAN_IS_VALID, only null is ignored, NaN is considered in count.

nulls are handled as equal.

Parameters

[in]	input	The column_view whose consecutive groups of equivalent rows will be counted
[in]	null_handling	flag to include or ignore `null` while counting
[in]	nan_handling	flag to consider `NaN==null` or not
[in]	stream	CUDA stream used for device memory operations and kernel launches

Returns: number of consecutive groups of equivalent rows in the column

◆ unique_count() [2/2]

cudf::size_type cudf::unique_count	(	table_view const &	input,
		null_equality	nulls_equal = `null_equality::EQUAL`,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`
	)

Count the number of consecutive groups of equivalent rows in a table.

Parameters

[in]	input	Table whose consecutive groups of equivalent rows will be counted
[in]	nulls_equal	flag to denote if null elements should be considered equal. nulls are not equal if null_equality::UNEQUAL.
[in]	stream	CUDA stream used for device memory operations and kernel launches

Returns: number of consecutive groups of equivalent rows in the table

Files

Enumerations

Functions

Detailed Description

Enumeration Type Documentation

◆ duplicate_keep_option

Function Documentation

◆ apply_boolean_mask()

◆ distinct()

◆ distinct_count() [1/2]

◆ distinct_count() [2/2]

◆ distinct_indices()

◆ drop_nans() [1/2]

◆ drop_nans() [2/2]

◆ drop_nulls() [1/2]

◆ drop_nulls() [2/2]

◆ filter() [1/2]

◆ filter() [2/2]

◆ stable_distinct()

◆ unique()

◆ unique_count() [1/2]

◆ unique_count() [2/2]