Files | |
file | minhash.hpp |
std::unique_ptr<cudf::column> nvtext::minhash(
  cudf::strings_column_view const& input,
  uint32_t seed,
  cudf::device_span<uint32_t const> parameter_a,
  cudf::device_span<uint32_t const> parameter_b,
  cudf::size_type width,
  rmm::cuda_stream_view stream = cudf::get_default_stream(),
  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
Returns the minhash values for each string.
This function uses MurmurHash3_x86_32 for the hash algorithm.
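The input strings are first hashed using the given `seed` over substrings of `width` characters. These hash values are then combined with the `a` and `b` parameter values (`parameter_a` and `parameter_b`) using the following formula, where $hv_s$ is the hash value of substring $s$, $\mathrm{max\_hash} = 2^{32} - 1$ (the maximum `uint32_t` value), and $mp = 2^{61} - 1$:

$$pv_{s,i} = \big((hv_s \cdot a_i + b_i) \bmod mp\big) \mathbin{\&} \mathrm{max\_hash}$$

This calculation is performed on each substring and the minimum value is computed as follows:

$$mh_{j,i} = \min_{s \,\in\, \text{row } j} pv_{s,i}, \qquad i \in [0, \texttt{parameter\_a.size()})$$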
Any null row entries result in corresponding null output rows.
std::invalid_argument | if the width < 2 |
std::invalid_argument | if parameter_a is empty |
std::invalid_argument | if parameter_b.size() != parameter_a.size() |
std::overflow_error | if parameter_a.size() * input.size() exceeds the column size limit |
input | Strings column to compute minhash |
seed | Seed value used for the hash algorithm |
parameter_a | Values used for the permuted calculation |
parameter_b | Values used for the permuted calculation |
width | The character width of substrings to hash for each row |
stream | CUDA stream used for device memory operations and kernel launches |
mr | Device memory resource used to allocate the returned column's device memory |
std::unique_ptr<cudf::column> nvtext::minhash64(
  cudf::strings_column_view const& input,
  uint64_t seed,
  cudf::device_span<uint64_t const> parameter_a,
  cudf::device_span<uint64_t const> parameter_b,
  cudf::size_type width,
  rmm::cuda_stream_view stream = cudf::get_default_stream(),
  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
Returns the minhash values for each string.
This function uses MurmurHash3_x64_128 for the hash algorithm.
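The input strings are first hashed using the given `seed` over substrings of `width` characters. These hash values are then combined with the `a` and `b` parameter values (`parameter_a` and `parameter_b`) using the following formula, where $hv_s$ is the hash value of substring $s$, $\mathrm{max\_hash} = 2^{64} - 1$ (the maximum `uint64_t` value), and $mp = 2^{61} - 1$:

$$pv_{s,i} = \big((hv_s \cdot a_i + b_i) \bmod mp\big) \mathbin{\&} \mathrm{max\_hash}$$

This calculation is performed on each substring and the minimum value is computed as follows:

$$mh_{j,i} = \min_{s \,\in\, \text{row } j} pv_{s,i}, \qquad i \in [0, \texttt{parameter\_a.size()})$$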
Any null row entries result in corresponding null output rows.
std::invalid_argument | if the width < 2 |
std::invalid_argument | if parameter_a is empty |
std::invalid_argument | if parameter_b.size() != parameter_a.size() |
std::overflow_error | if parameter_a.size() * input.size() exceeds the column size limit |
input | Strings column to compute minhash |
seed | Seed value used for the hash algorithm |
parameter_a | Values used for the permuted calculation |
parameter_b | Values used for the permuted calculation |
width | The character width of substrings to hash for each row |
stream | CUDA stream used for device memory operations and kernel launches |
mr | Device memory resource used to allocate the returned column's device memory |
std::unique_ptr<cudf::column> nvtext::minhash64_permuted(
  cudf::strings_column_view const& input,
  uint64_t seed,
  cudf::device_span<uint64_t const> parameter_a,
  cudf::device_span<uint64_t const> parameter_b,
  cudf::size_type width,
  rmm::cuda_stream_view stream = cudf::get_default_stream(),
  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
Returns the minhash values for each string.
This function uses MurmurHash3_x64_128 for the hash algorithm.
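The input strings are first hashed using the given `seed` over substrings of `width` characters. These hash values are then combined with the `a` and `b` parameter values (`parameter_a` and `parameter_b`) using the following formula, where $hv_s$ is the hash value of substring $s$, $\mathrm{max\_hash} = 2^{64} - 1$ (the maximum `uint64_t` value), and $mp = 2^{61} - 1$:

$$pv_{s,i} = \big((hv_s \cdot a_i + b_i) \bmod mp\big) \mathbin{\&} \mathrm{max\_hash}$$

This calculation is performed on each substring and the minimum value is computed as follows:

$$mh_{j,i} = \min_{s \,\in\, \text{row } j} pv_{s,i}, \qquad i \in [0, \texttt{parameter\_a.size()})$$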
Any null row entries result in corresponding null output rows.
std::invalid_argument | if the width < 2 |
std::invalid_argument | if parameter_a is empty |
std::invalid_argument | if parameter_b.size() != parameter_a.size() |
std::overflow_error | if parameter_a.size() * input.size() exceeds the column size limit |
input | Strings column to compute minhash |
seed | Seed value used for the hash algorithm |
parameter_a | Values used for the permuted calculation |
parameter_b | Values used for the permuted calculation |
width | The character width of substrings to hash for each row |
stream | CUDA stream used for device memory operations and kernel launches |
mr | Device memory resource used to allocate the returned column's device memory |
std::unique_ptr<cudf::column> nvtext::minhash_permuted(
  cudf::strings_column_view const& input,
  uint32_t seed,
  cudf::device_span<uint32_t const> parameter_a,
  cudf::device_span<uint32_t const> parameter_b,
  cudf::size_type width,
  rmm::cuda_stream_view stream = cudf::get_default_stream(),
  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
Returns the minhash values for each string.
This function uses MurmurHash3_x86_32 for the hash algorithm.
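The input strings are first hashed using the given `seed` over substrings of `width` characters. These hash values are then combined with the `a` and `b` parameter values (`parameter_a` and `parameter_b`) using the following formula, where $hv_s$ is the hash value of substring $s$, $\mathrm{max\_hash} = 2^{32} - 1$ (the maximum `uint32_t` value), and $mp = 2^{61} - 1$:

$$pv_{s,i} = \big((hv_s \cdot a_i + b_i) \bmod mp\big) \mathbin{\&} \mathrm{max\_hash}$$

This calculation is performed on each substring and the minimum value is computed as follows:

$$mh_{j,i} = \min_{s \,\in\, \text{row } j} pv_{s,i}, \qquad i \in [0, \texttt{parameter\_a.size()})$$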
Any null row entries result in corresponding null output rows.
std::invalid_argument | if the width < 2 |
std::invalid_argument | if parameter_a is empty |
std::invalid_argument | if parameter_b.size() != parameter_a.size() |
std::overflow_error | if parameter_a.size() * input.size() exceeds the column size limit |
input | Strings column to compute minhash |
seed | Seed value used for the hash algorithm |
parameter_a | Values used for the permuted calculation |
parameter_b | Values used for the permuted calculation |
width | The character width of substrings to hash for each row |
stream | CUDA stream used for device memory operations and kernel launches |
mr | Device memory resource used to allocate the returned column's device memory |