|
std::unique_ptr< bpe_merge_pairs > | load_merge_pairs (cudf::strings_column_view const &merge_pairs, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Create an nvtext::bpe_merge_pairs from a strings column. More...
|
|
std::unique_ptr< cudf::column > | byte_pair_encoding (cudf::strings_column_view const &input, bpe_merge_pairs const &merges_pairs, cudf::string_scalar const &separator=cudf::string_scalar(" "), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Byte pair encode the input strings. More...
|
|
std::unique_ptr< cudf::column > | edit_distance (cudf::strings_column_view const &input, cudf::strings_column_view const &targets, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Compute the edit distance between individual strings in two strings columns. More...
|
|
std::unique_ptr< cudf::column > | edit_distance_matrix (cudf::strings_column_view const &input, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Compute the edit distance between all the strings in the input column. More...
|
|
std::unique_ptr< cudf::column > | generate_ngrams (cudf::strings_column_view const &input, cudf::size_type ngrams, cudf::string_scalar const &separator, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Returns a single column of strings by generating ngrams from a strings column. More...
|
|
std::unique_ptr< cudf::column > | generate_character_ngrams (cudf::strings_column_view const &input, cudf::size_type ngrams=2, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Generates ngrams of characters within each string. More...
|
|
std::unique_ptr< cudf::column > | hash_character_ngrams (cudf::strings_column_view const &input, cudf::size_type ngrams=5, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Hashes ngrams of characters within each string. More...
|
|
std::unique_ptr< cudf::column > | jaccard_index (cudf::strings_column_view const &input1, cudf::strings_column_view const &input2, cudf::size_type width, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Computes the Jaccard similarity between individual rows in two strings columns. More...
|
|
std::unique_ptr< cudf::column > | minhash (cudf::strings_column_view const &input, cudf::numeric_scalar< uint32_t > seed=0, cudf::size_type width=4, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Returns the minhash value for each string. More...
|
|
std::unique_ptr< cudf::column > | minhash (cudf::strings_column_view const &input, cudf::device_span< uint32_t const > seeds, cudf::size_type width=4, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Returns the minhash values for each string per seed. More...
|
|
std::unique_ptr< cudf::column > | minhash64 (cudf::strings_column_view const &input, cudf::numeric_scalar< uint64_t > seed=0, cudf::size_type width=4, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Returns the minhash value for each string. More...
|
|
std::unique_ptr< cudf::column > | minhash64 (cudf::strings_column_view const &input, cudf::device_span< uint64_t const > seeds, cudf::size_type width=4, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Returns the minhash values for each string per seed. More...
|
|
std::unique_ptr< cudf::column > | word_minhash (cudf::lists_column_view const &input, cudf::device_span< uint32_t const > seeds, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Returns the minhash values for each row of strings per seed. More...
|
|
std::unique_ptr< cudf::column > | word_minhash64 (cudf::lists_column_view const &input, cudf::device_span< uint64_t const > seeds, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Returns the minhash values for each row of strings per seed. More...
|
|
std::unique_ptr< cudf::column > | ngrams_tokenize (cudf::strings_column_view const &input, cudf::size_type ngrams, cudf::string_scalar const &delimiter, cudf::string_scalar const &separator, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Returns a single column of strings by tokenizing the input strings column and then producing ngrams of each string. More...
|
|
std::unique_ptr< cudf::column > | normalize_spaces (cudf::strings_column_view const &input, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Returns a new strings column by normalizing the whitespace in each string in the input column. More...
|
|
std::unique_ptr< cudf::column > | normalize_characters (cudf::strings_column_view const &input, bool do_lower_case, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Normalizes the characters in each string for tokenizing. More...
|
|
std::unique_ptr< cudf::column > | replace_tokens (cudf::strings_column_view const &input, cudf::strings_column_view const &targets, cudf::strings_column_view const &replacements, cudf::string_scalar const &delimiter=cudf::string_scalar{""}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Replaces specified tokens with corresponding replacement strings. More...
|
|
std::unique_ptr< cudf::column > | filter_tokens (cudf::strings_column_view const &input, cudf::size_type min_token_length, cudf::string_scalar const &replacement=cudf::string_scalar{""}, cudf::string_scalar const &delimiter=cudf::string_scalar{""}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Removes tokens whose lengths are less than a specified number of characters. More...
|
|
std::unique_ptr< cudf::column > | is_letter (cudf::strings_column_view const &input, letter_type ltype, cudf::size_type character_index, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Returns a boolean column indicating whether the character at character_index of each input string is a consonant or vowel. More...
|
|
std::unique_ptr< cudf::column > | is_letter (cudf::strings_column_view const &input, letter_type ltype, cudf::column_view const &indices, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Returns a boolean column indicating whether the character at indices[i] of input[i] is a consonant or vowel. More...
|
|
std::unique_ptr< cudf::column > | porter_stemmer_measure (cudf::strings_column_view const &input, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Returns the Porter Stemmer measurements of a strings column. More...
|
|
std::unique_ptr< hashed_vocabulary > | load_vocabulary_file (std::string const &filename_hashed_vocabulary, rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Load the hashed vocabulary file into device memory. More...
|
|
tokenizer_result | subword_tokenize (cudf::strings_column_view const &strings, hashed_vocabulary const &vocabulary_table, uint32_t max_sequence_length, uint32_t stride, bool do_lower_case, bool do_truncate, rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Creates a tokenizer that cleans the text, splits it into tokens and returns token-ids from an input vocabulary. More...
|
|
std::unique_ptr< cudf::column > | tokenize (cudf::strings_column_view const &input, cudf::string_scalar const &delimiter=cudf::string_scalar{""}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Returns a single column of strings by tokenizing the input strings column using the provided characters as delimiters. More...
|
|
std::unique_ptr< cudf::column > | tokenize (cudf::strings_column_view const &input, cudf::strings_column_view const &delimiters, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Returns a single column of strings by tokenizing the input strings column using multiple strings as delimiters. More...
|
|
std::unique_ptr< cudf::column > | count_tokens (cudf::strings_column_view const &input, cudf::string_scalar const &delimiter=cudf::string_scalar{""}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Returns the number of tokens in each string of a strings column. More...
|
|
std::unique_ptr< cudf::column > | count_tokens (cudf::strings_column_view const &input, cudf::strings_column_view const &delimiters, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Returns the number of tokens in each string of a strings column by using multiple string delimiters to identify tokens in each string. More...
|
|
std::unique_ptr< cudf::column > | character_tokenize (cudf::strings_column_view const &input, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Returns a single column of strings by converting each character to a string. More...
|
|
std::unique_ptr< cudf::column > | detokenize (cudf::strings_column_view const &input, cudf::column_view const &row_indices, cudf::string_scalar const &separator=cudf::string_scalar(" "), rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Creates a strings column from a strings column of tokens and an associated column of row ids. More...
|
|
std::unique_ptr< tokenize_vocabulary > | load_vocabulary (cudf::strings_column_view const &input, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Create a tokenize_vocabulary object from a strings column. More...
|
|
std::unique_ptr< cudf::column > | tokenize_with_vocabulary (cudf::strings_column_view const &input, tokenize_vocabulary const &vocabulary, cudf::string_scalar const &delimiter, cudf::size_type default_id=-1, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref()) |
| Returns the token ids for the input string by looking up each delimited token in the given vocabulary. More...
|
|