Strings column APIs. More...
Classes | |
struct | regex_program |
Regex program class. More... | |
Enumerations | |
enum | string_character_types : uint32_t { DECIMAL = 1 << 0 , NUMERIC = 1 << 1 , DIGIT = 1 << 2 , ALPHA = 1 << 3 , SPACE = 1 << 4 , UPPER = 1 << 5 , LOWER = 1 << 6 , ALPHANUM = DECIMAL | NUMERIC | DIGIT | ALPHA , CASE_TYPES = UPPER | LOWER , ALL_TYPES = ALPHANUM | CASE_TYPES | SPACE } |
Character type values. These types can be or'd to check for any combination of types. More... | |
enum class | separator_on_nulls { YES , NO } |
Setting for specifying how separators are added with null strings elements. More... | |
enum class | output_if_empty_list { EMPTY_STRING , NULL_ELEMENT } |
Setting for specifying what will be output from join_list_elements when an input list is empty. More... | |
enum | regex_flags : uint32_t { DEFAULT = 0 , MULTILINE = 8 , DOTALL = 16 , ASCII = 256 } |
Regex flags. More... | |
enum class | capture_groups : uint32_t { EXTRACT , NON_CAPTURE } |
Capture groups setting. More... | |
enum class | side_type { LEFT , RIGHT , BOTH } |
Direction identifier for cudf::strings::strip and cudf::strings::pad functions. More... | |
enum class | filter_type : bool { KEEP , REMOVE } |
Removes or keeps the specified character ranges in cudf::strings::filter_characters. More... | |
Functions | |
std::unique_ptr< column > | count_characters (strings_column_view const &input, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a column containing character lengths of each string in the given column. More... | |
std::unique_ptr< column > | count_bytes (strings_column_view const &input, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a column containing byte lengths of each string in the given column. More... | |
std::unique_ptr< column > | code_points (strings_column_view const &input, rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Creates a numeric column with code point values (integers) for each character of each string. More... | |
std::unique_ptr< column > | capitalize (strings_column_view const &input, string_scalar const &delimiters=string_scalar("", true, cudf::get_default_stream()), rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a column of capitalized strings. More... | |
std::unique_ptr< column > | title (strings_column_view const &input, string_character_types sequence_type=string_character_types::ALPHA, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Modifies first character of each word to upper-case and lower-cases the rest. More... | |
std::unique_ptr< column > | is_title (strings_column_view const &input, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Checks if the strings in the input column are title formatted. More... | |
std::unique_ptr< column > | to_lower (strings_column_view const &strings, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Converts a column of strings to lower case. More... | |
std::unique_ptr< column > | to_upper (strings_column_view const &strings, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Converts a column of strings to upper case. More... | |
std::unique_ptr< column > | swapcase (strings_column_view const &strings, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a column of strings converting lower case characters to upper case and vice versa. More... | |
std::unique_ptr< column > | all_characters_of_type (strings_column_view const &input, string_character_types types, string_character_types verify_types=string_character_types::ALL_TYPES, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a boolean column identifying strings entries in which all characters are of the type specified. More... | |
std::unique_ptr< column > | filter_characters_of_type (strings_column_view const &input, string_character_types types_to_remove, string_scalar const &replacement=string_scalar(""), string_character_types types_to_keep=string_character_types::ALL_TYPES, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Filter specific character types from a column of strings. More... | |
constexpr string_character_types | operator| (string_character_types lhs, string_character_types rhs) |
OR operator for combining string_character_types. More... | |
constexpr string_character_types & | operator|= (string_character_types &lhs, string_character_types rhs) |
Compound assignment OR operator for combining string_character_types. More... | |
std::unique_ptr< column > | join_strings (strings_column_view const &input, string_scalar const &separator=string_scalar(""), string_scalar const &narep=string_scalar("", false), rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Concatenates all strings in the column into one new string delimited by an optional separator string. More... | |
std::unique_ptr< column > | concatenate (table_view const &strings_columns, strings_column_view const &separators, string_scalar const &separator_narep=string_scalar("", false), string_scalar const &col_narep=string_scalar("", false), separator_on_nulls separate_nulls=separator_on_nulls::YES, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Concatenates a list of strings columns using separators for each row and returns the result as a strings column. More... | |
std::unique_ptr< column > | concatenate (table_view const &strings_columns, string_scalar const &separator=string_scalar(""), string_scalar const &narep=string_scalar("", false), separator_on_nulls separate_nulls=separator_on_nulls::YES, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Row-wise concatenates the given list of strings columns and returns a single strings column result. More... | |
std::unique_ptr< column > | join_list_elements (lists_column_view const &lists_strings_column, strings_column_view const &separators, string_scalar const &separator_narep=string_scalar("", false), string_scalar const &string_narep=string_scalar("", false), separator_on_nulls separate_nulls=separator_on_nulls::YES, output_if_empty_list empty_list_policy=output_if_empty_list::EMPTY_STRING, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Given a lists column of strings (each row is a list of strings), concatenates the strings within each row and returns a single strings column result. More... | |
std::unique_ptr< column > | join_list_elements (lists_column_view const &lists_strings_column, string_scalar const &separator=string_scalar(""), string_scalar const &narep=string_scalar("", false), separator_on_nulls separate_nulls=separator_on_nulls::YES, output_if_empty_list empty_list_policy=output_if_empty_list::EMPTY_STRING, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Given a lists column of strings (each row is a list of strings), concatenates the strings within each row and returns a single strings column result. More... | |
std::unique_ptr< column > | contains_re (strings_column_view const &input, regex_program const &prog, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a boolean column identifying rows which match the given regex_program object. More... | |
std::unique_ptr< column > | matches_re (strings_column_view const &input, regex_program const &prog, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a boolean column identifying rows which match the given regex_program object but only at the beginning of the string. More... | |
std::unique_ptr< column > | count_re (strings_column_view const &input, regex_program const &prog, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns the number of times the given regex_program's pattern matches in each string. More... | |
std::unique_ptr< column > | like (strings_column_view const &input, string_scalar const &pattern, string_scalar const &escape_character=string_scalar(""), rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a boolean column identifying rows which match the given like pattern. More... | |
std::unique_ptr< column > | like (strings_column_view const &input, strings_column_view const &patterns, string_scalar const &escape_character=string_scalar(""), rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a boolean column identifying rows which match the corresponding like pattern in the given patterns. More... | |
std::unique_ptr< column > | to_booleans (strings_column_view const &input, string_scalar const &true_string, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a new BOOL8 column by parsing boolean values from the strings in the provided strings column. More... | |
std::unique_ptr< column > | from_booleans (column_view const &booleans, string_scalar const &true_string, string_scalar const &false_string, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a new strings column converting the boolean values from the provided column into strings. More... | |
std::unique_ptr< column > | to_timestamps (strings_column_view const &input, data_type timestamp_type, std::string_view format, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a new timestamp column converting a strings column into timestamps using the provided format pattern. More... | |
std::unique_ptr< column > | is_timestamp (strings_column_view const &input, std::string_view format, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Verifies the given strings column can be parsed to timestamps using the provided format pattern. More... | |
std::unique_ptr< column > | from_timestamps (column_view const &timestamps, std::string_view format="%Y-%m-%dT%H:%M:%SZ", strings_column_view const &names=strings_column_view(column_view{ data_type{type_id::STRING}, 0, nullptr, nullptr, 0}), rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a new strings column converting a timestamp column into strings using the provided format pattern. More... | |
std::unique_ptr< column > | to_durations (strings_column_view const &input, data_type duration_type, std::string_view format, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a new duration column converting a strings column into durations using the provided format pattern. More... | |
std::unique_ptr< column > | from_durations (column_view const &durations, std::string_view format="%D days %H:%M:%S", rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a new strings column converting a duration column into strings using the provided format pattern. More... | |
std::unique_ptr< column > | to_fixed_point (strings_column_view const &input, data_type output_type, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a new fixed-point column parsing decimal values from the provided strings column. More... | |
std::unique_ptr< column > | from_fixed_point (column_view const &input, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a new strings column converting the fixed-point values into a strings column. More... | |
std::unique_ptr< column > | is_fixed_point (strings_column_view const &input, data_type decimal_type=data_type{type_id::DECIMAL64}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a boolean column identifying strings in which all characters are valid for conversion to fixed-point. More... | |
std::unique_ptr< column > | to_floats (strings_column_view const &strings, data_type output_type, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a new numeric column by parsing float values from each string in the provided strings column. More... | |
std::unique_ptr< column > | from_floats (column_view const &floats, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a new strings column converting the float values from the provided column into strings. More... | |
std::unique_ptr< column > | is_float (strings_column_view const &input, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a boolean column identifying strings in which all characters are valid for conversion to floats. More... | |
std::unique_ptr< column > | to_integers (strings_column_view const &input, data_type output_type, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a new integer numeric column parsing integer values from the provided strings column. More... | |
std::unique_ptr< column > | from_integers (column_view const &integers, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a new strings column converting the integer values from the provided column into strings. More... | |
std::unique_ptr< column > | is_integer (strings_column_view const &input, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a boolean column identifying strings in which all characters are valid for conversion to integers. More... | |
std::unique_ptr< column > | is_integer (strings_column_view const &input, data_type int_type, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a boolean column identifying strings in which all characters are valid for conversion to integers. More... | |
std::unique_ptr< column > | hex_to_integers (strings_column_view const &input, data_type output_type, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a new integer numeric column parsing hexadecimal values from the provided strings column. More... | |
std::unique_ptr< column > | is_hex (strings_column_view const &input, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a boolean column identifying strings in which all characters are valid for conversion to integers from hex. More... | |
std::unique_ptr< column > | integers_to_hex (column_view const &input, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a new strings column converting integer columns to hexadecimal characters. More... | |
std::unique_ptr< column > | ipv4_to_integers (strings_column_view const &input, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Converts IPv4 addresses into integers. More... | |
std::unique_ptr< column > | integers_to_ipv4 (column_view const &integers, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Converts integers into IPv4 addresses as strings. More... | |
std::unique_ptr< column > | is_ipv4 (strings_column_view const &input, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a boolean column identifying strings in which all characters are valid for conversion to integers from IPv4 format. More... | |
std::unique_ptr< column > | format_list_column (lists_column_view const &input, string_scalar const &na_rep=string_scalar(""), strings_column_view const &separators=strings_column_view(column_view{ data_type{type_id::STRING}, 0, nullptr, nullptr, 0}), rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Convert a list column of strings into a formatted strings column. More... | |
std::unique_ptr< column > | url_encode (strings_column_view const &input, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Encodes each string using URL encoding. More... | |
std::unique_ptr< column > | url_decode (strings_column_view const &input, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Decodes each string using URL encoding. More... | |
std::unique_ptr< table > | extract (strings_column_view const &input, regex_program const &prog, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a table of strings columns where each column corresponds to the matching group specified in the given regex_program object. More... | |
std::unique_ptr< column > | extract_all_record (strings_column_view const &input, regex_program const &prog, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a lists column of strings where each string column row corresponds to the matching group specified in the given regex_program object. More... | |
std::unique_ptr< column > | find (strings_column_view const &input, string_scalar const &target, size_type start=0, size_type stop=-1, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a column of character position values where the target string is first found in each string of the provided column. More... | |
std::unique_ptr< column > | rfind (strings_column_view const &input, string_scalar const &target, size_type start=0, size_type stop=-1, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a column of character position values where the target string is first found searching from the end of each string. More... | |
std::unique_ptr< column > | find (strings_column_view const &input, strings_column_view const &target, size_type start=0, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a column of character position values where the target string is first found in the corresponding string of the provided column. More... | |
std::unique_ptr< column > | contains (strings_column_view const &input, string_scalar const &target, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a column of boolean values for each string where true indicates the target string was found within that string in the provided column. More... | |
std::unique_ptr< column > | contains (strings_column_view const &input, strings_column_view const &targets, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a column of boolean values for each string where true indicates the corresponding target string was found within that string in the provided column. More... | |
std::unique_ptr< column > | starts_with (strings_column_view const &input, string_scalar const &target, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a column of boolean values for each string where true indicates the target string was found at the beginning of that string in the provided column. More... | |
std::unique_ptr< column > | starts_with (strings_column_view const &input, strings_column_view const &targets, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a column of boolean values for each string where true indicates corresponding string in target column was found at the beginning of that string in the provided column. More... | |
std::unique_ptr< column > | ends_with (strings_column_view const &input, string_scalar const &target, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a column of boolean values for each string where true indicates the target string was found at the end of that string in the provided column. More... | |
std::unique_ptr< column > | ends_with (strings_column_view const &input, strings_column_view const &targets, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a column of boolean values for each string where true indicates corresponding string in target column was found at the end of that string in the provided column. More... | |
std::unique_ptr< column > | find_multiple (strings_column_view const &input, strings_column_view const &targets, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a lists column with character position values where each of the target strings are found in each string. More... | |
std::unique_ptr< column > | findall (strings_column_view const &input, regex_program const &prog, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a lists column of strings for each matching occurrence using the regex_program pattern within each string. More... | |
std::unique_ptr< column > | pad (strings_column_view const &input, size_type width, side_type side=side_type::RIGHT, std::string_view fill_char=" ", rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Add padding to each string using a provided character. More... | |
std::unique_ptr< column > | zfill (strings_column_view const &input, size_type width, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Add '0' as padding to the left of each string. More... | |
constexpr bool | is_multiline (regex_flags const f) |
Returns true if the given flags contain MULTILINE. More... | |
constexpr bool | is_dotall (regex_flags const f) |
Returns true if the given flags contain DOTALL. More... | |
constexpr bool | is_ascii (regex_flags const f) |
Returns true if the given flags contain ASCII. More... | |
std::unique_ptr< string_scalar > | repeat_string (string_scalar const &input, size_type repeat_times, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Repeat the given string scalar a given number of times. More... | |
std::unique_ptr< column > | repeat_strings (strings_column_view const &input, size_type repeat_times, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Repeat each string in the given strings column a given number of times. More... | |
std::unique_ptr< column > | repeat_strings (strings_column_view const &input, column_view const &repeat_times, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Repeat each string in the given strings column by the numbers of times given in another numeric column. More... | |
std::unique_ptr< column > | replace (strings_column_view const &input, string_scalar const &target, string_scalar const &repl, cudf::size_type maxrepl=-1, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Replaces target string within each string with the specified replacement string. More... | |
std::unique_ptr< column > | replace_slice (strings_column_view const &input, string_scalar const &repl=string_scalar(""), size_type start=0, size_type stop=-1, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
This function replaces each string in the column with the provided repl string within the [start,stop) character position range. More... | |
std::unique_ptr< column > | replace_multiple (strings_column_view const &input, strings_column_view const &targets, strings_column_view const &repls, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Replaces substrings matching a list of targets with the corresponding replacement strings. More... | |
std::unique_ptr< column > | replace (strings_column_view const &input, strings_column_view const &targets, strings_column_view const &repls, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Replaces substrings matching a list of targets with the corresponding replacement strings. More... | |
std::unique_ptr< column > | replace_re (strings_column_view const &input, regex_program const &prog, string_scalar const &replacement=string_scalar(""), std::optional< size_type > max_replace_count=std::nullopt, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
For each string, replaces any character sequence matching the given regex with the provided replacement string. More... | |
std::unique_ptr< column > | replace_re (strings_column_view const &input, std::vector< std::string > const &patterns, strings_column_view const &replacements, regex_flags const flags=regex_flags::DEFAULT, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
For each string, replaces any character sequence matching the given patterns with the corresponding string in the replacements column. More... | |
std::unique_ptr< column > | replace_with_backrefs (strings_column_view const &input, regex_program const &prog, std::string_view replacement, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
For each string, replaces any character sequence matching the given regex using the replacement template for back-references. More... | |
std::unique_ptr< column > | reverse (strings_column_view const &input, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Reverses the characters within each string. More... | |
std::unique_ptr< column > | slice_strings (strings_column_view const &input, numeric_scalar< size_type > const &start=numeric_scalar< size_type >(0, false), numeric_scalar< size_type > const &stop=numeric_scalar< size_type >(0, false), numeric_scalar< size_type > const &step=numeric_scalar< size_type >(1), rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a new strings column that contains substrings of the strings in the provided column. More... | |
std::unique_ptr< column > | slice_strings (strings_column_view const &input, column_view const &starts, column_view const &stops, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a new strings column that contains substrings of the strings in the provided column using unique ranges for each string. More... | |
std::unique_ptr< table > | partition (strings_column_view const &input, string_scalar const &delimiter=string_scalar(""), rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a set of 3 columns by splitting each string using the specified delimiter. More... | |
std::unique_ptr< table > | rpartition (strings_column_view const &input, string_scalar const &delimiter=string_scalar(""), rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a set of 3 columns by splitting each string using the specified delimiter starting from the end of each string. More... | |
std::unique_ptr< table > | split (strings_column_view const &strings_column, string_scalar const &delimiter=string_scalar(""), size_type maxsplit=-1, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a list of columns by splitting each string using the specified delimiter. More... | |
std::unique_ptr< table > | rsplit (strings_column_view const &strings_column, string_scalar const &delimiter=string_scalar(""), size_type maxsplit=-1, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Returns a list of columns by splitting each string using the specified delimiter starting from the end of each string. More... | |
std::unique_ptr< column > | split_record (strings_column_view const &strings, string_scalar const &delimiter=string_scalar(""), size_type maxsplit=-1, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Splits individual strings elements into a list of strings. More... | |
std::unique_ptr< column > | rsplit_record (strings_column_view const &strings, string_scalar const &delimiter=string_scalar(""), size_type maxsplit=-1, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Splits individual strings elements into a list of strings starting from the end of each string. More... | |
std::unique_ptr< table > | split_re (strings_column_view const &input, regex_program const &prog, size_type maxsplit=-1, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Splits strings elements into a table of strings columns using a regex_program's pattern to delimit each string. More... | |
std::unique_ptr< table > | rsplit_re (strings_column_view const &input, regex_program const &prog, size_type maxsplit=-1, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Splits strings elements into a table of strings columns using a regex_program's pattern to delimit each string starting from the end of the string. More... | |
std::unique_ptr< column > | split_record_re (strings_column_view const &input, regex_program const &prog, size_type maxsplit=-1, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Splits strings elements into a list column of strings using the given regex_program to delimit each string. More... | |
std::unique_ptr< column > | rsplit_record_re (strings_column_view const &input, regex_program const &prog, size_type maxsplit=-1, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Splits strings elements into a list column of strings using the given regex_program to delimit each string starting from the end of the string. More... | |
std::unique_ptr< column > | strip (strings_column_view const &input, side_type side=side_type::BOTH, string_scalar const &to_strip=string_scalar(""), rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Removes the specified characters from the beginning or end (or both) of each string. More... | |
std::unique_ptr< column > | translate (strings_column_view const &input, std::vector< std::pair< char_utf8, char_utf8 >> const &chars_table, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Translates individual characters within each string. More... | |
std::unique_ptr< column > | filter_characters (strings_column_view const &input, std::vector< std::pair< cudf::char_utf8, cudf::char_utf8 >> characters_to_filter, filter_type keep_characters=filter_type::KEEP, string_scalar const &replacement=string_scalar(""), rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Removes ranges of characters from each string in a strings column. More... | |
rmm::device_uvector< string_view > | create_string_vector_from_column (cudf::strings_column_view const strings, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Creates a string_view vector from a strings column. More... | |
int64_t | get_offset64_threshold () |
Return the threshold size for a strings column to use int64 offsets. More... | |
bool | is_large_strings_enabled () |
Checks if large strings is enabled. More... | |
std::unique_ptr< column > | wrap (strings_column_view const &input, size_type width, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=rmm::mr::get_current_device_resource()) |
Wraps strings onto multiple lines shorter than width by replacing appropriate white space with new-line characters (ASCII 0x0A). More... | |
Strings column APIs.
rmm::device_uvector<string_view> cudf::strings::create_string_vector_from_column | ( | cudf::strings_column_view const | strings, |
rmm::cuda_stream_view | stream = cudf::get_default_stream() , |
||
rmm::device_async_resource_ref | mr = rmm::mr::get_current_device_resource() |
||
) |
Creates a string_view vector from a strings column.
strings | Strings column instance. |
stream | CUDA stream used for device memory operations and kernel launches. |
mr | Device memory resource used to allocate the returned vector's device memory. |
int64_t cudf::strings::get_offset64_threshold | ( | ) |
Return the threshold size for a strings column to use int64 offsets.
A computed size above this threshold should use int64 offsets; otherwise int32 offsets are used. By default this function will return std::numeric_limits<int32_t>::max(). This value can be overridden at runtime using the environment variable LIBCUDF_LARGE_STRINGS_THRESHOLD.
bool cudf::strings::is_large_strings_enabled | ( | ) |
Checks if large strings is enabled.
This checks the setting in the environment variable LIBCUDF_LARGE_STRINGS_ENABLED.