Files
file	partition.hpp
	Strings partition APIs.

file	split.hpp

file	split_re.hpp

Functions
std::unique_ptr< table >	cudf::strings::partition (strings_column_view const &input, string_scalar const &delimiter=string_scalar(""), rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Returns a set of 3 columns by splitting each string using the specified delimiter. More...

std::unique_ptr< table >	cudf::strings::rpartition (strings_column_view const &input, string_scalar const &delimiter=string_scalar(""), rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Returns a set of 3 columns by splitting each string using the specified delimiter starting from the end of each string. More...

std::unique_ptr< table >	cudf::strings::split (strings_column_view const &strings_column, string_scalar const &delimiter=string_scalar(""), size_type maxsplit=-1, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Returns a list of columns by splitting each string using the specified delimiter. More...

std::unique_ptr< table >	cudf::strings::rsplit (strings_column_view const &strings_column, string_scalar const &delimiter=string_scalar(""), size_type maxsplit=-1, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Returns a list of columns by splitting each string using the specified delimiter starting from the end of each string. More...

std::unique_ptr< column >	cudf::strings::split_record (strings_column_view const &strings, string_scalar const &delimiter=string_scalar(""), size_type maxsplit=-1, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Splits individual strings elements into a list of strings. More...

std::unique_ptr< column >	cudf::strings::rsplit_record (strings_column_view const &strings, string_scalar const &delimiter=string_scalar(""), size_type maxsplit=-1, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Splits individual strings elements into a list of strings starting from the end of each string. More...

std::unique_ptr< column >	cudf::strings::split_part (strings_column_view const &input, string_scalar const &delimiter=string_scalar(""), size_type index=0, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Returns a column of strings by splitting each input string using the specified delimiter and returning the string at the specified index. More...

std::unique_ptr< table >	cudf::strings::split_re (strings_column_view const &input, regex_program const &prog, size_type maxsplit=-1, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Splits strings elements into a table of strings columns using a regex_program's pattern to delimit each string. More...

std::unique_ptr< table >	cudf::strings::rsplit_re (strings_column_view const &input, regex_program const &prog, size_type maxsplit=-1, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Splits strings elements into a table of strings columns using a regex_program's pattern to delimit each string starting from the end of the string. More...

std::unique_ptr< column >	cudf::strings::split_record_re (strings_column_view const &input, regex_program const &prog, size_type maxsplit=-1, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Splits strings elements into a list column of strings using the given regex_program to delimit each string. More...

std::unique_ptr< column >	cudf::strings::rsplit_record_re (strings_column_view const &input, regex_program const &prog, size_type maxsplit=-1, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
	Splits strings elements into a list column of strings using the given regex_program to delimit each string starting from the end of the string. More...

Detailed Description

Function Documentation

◆ partition()

std::unique_ptr<table> cudf::strings::partition	(	strings_column_view const &	input,
		string_scalar const &	delimiter = `string_scalar("")`,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Returns a set of 3 columns by splitting each string using the specified delimiter.

The number of rows in the output columns will be the same as the input column. The first column will contain the first tokens of each string as a result of the split. The second column will contain the delimiter. The third column will contain the remaining characters of each string after the delimiter.

Any null string entries produce corresponding null entries in each of the output columns.

Example:
s = ["ab_cd","def_g_h"]
r = partition(s,"_")
r[0] is ["ab","def"]
r[1] is ["_","_"]
r[2] is ["cd","g_h"]

Parameters

input	Strings instance for this operation
delimiter	UTF-8 encoded string indicating where to split each string. Default of empty string indicates split on whitespace.
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned table's device memory

Returns: New table of strings columns

◆ rpartition()

std::unique_ptr<table> cudf::strings::rpartition	(	strings_column_view const &	input,
		string_scalar const &	delimiter = `string_scalar("")`,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Returns a set of 3 columns by splitting each string using the specified delimiter starting from the end of each string.

The number of rows in the output columns will be the same as the input column. The first column will contain the characters of each string before the last delimiter found. The second column will contain the delimiter. The third column will contain the remaining characters of each string after the delimiter.

Any null string entries produce corresponding null entries in each of the output columns.

Example:
s = ["ab_cd","def_g_h"]
r = rpartition(s,"_")
r[0] is ["ab","def_g"]
r[1] is ["_","_"]
r[2] is ["cd","h"]

Parameters

input	Strings instance for this operation
delimiter	UTF-8 encoded string indicating where to split each string. Default of empty string indicates split on whitespace.
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned table's device memory

Returns: New table of strings columns

◆ rsplit()

std::unique_ptr<table> cudf::strings::rsplit	(	strings_column_view const &	strings_column,
		string_scalar const &	delimiter = `string_scalar("")`,
		size_type	maxsplit = `-1`,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Returns a list of columns by splitting each string using the specified delimiter starting from the end of each string.

The number of rows in the output columns will be the same as the input column. The first column will contain the first tokens encountered in each string as a result of the split. Subsequent columns contain the next token strings. Null entries are added for a row where split results have been exhausted. The total number of columns will equal the maximum number of tokens produced from any string in the input column.

Any null string entries produce corresponding null entries in each of the output columns.

Parameters

strings_column	Strings instance for this operation
delimiter	UTF-8 encoded string indicating the split points in each string; Default of empty string indicates split on whitespace.
maxsplit	Maximum number of splits to perform; Default of -1 indicates all possible splits on each string.
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned table's device memory

Returns: New table of strings columns

◆ rsplit_re()

std::unique_ptr<table> cudf::strings::rsplit_re	(	strings_column_view const &	input,
		regex_program const &	prog,
		size_type	maxsplit = `-1`,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Splits strings elements into a table of strings columns using a regex_program's pattern to delimit each string starting from the end of the string.

Each element generates a vector of strings that are stored in corresponding rows in the output table – table[col,row] = token[col] of strings[row] where token is a substring between delimiters.

The number of rows in the output table will be the same as the number of elements in the input column. The resulting number of columns will be the maximum number of tokens found in any input row.

Splitting occurs by traversing starting from the end of the input string. The pattern is used to identify the delimiters within a string and splitting stops when either maxsplit or the beginning of the string is reached.

An empty input string will produce a corresponding empty string in the corresponding row of the first column. A null row will produce corresponding null rows in the output table.

The regex_program's regex_flags are ignored.

s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
p1 = regex_program::create("[_ ]")
s1 = rsplit_re(s, p1)
s1 is a table of strings columns:
    [ ["a", "a", "", "ab"],
      ["bc", "", "ab", "cd"],
      ["def", "bc", "cd", ""],
      ["g", null, null, null] ]
p2 = regex_program::create("[ _]")
s2 = rsplit_re(s, p2, 1)
s2 is a table of strings columns:
    [ ["a_bc def", "a_", "_ab", "ab_cd"],
      ["g", "bc", "cd", ""] ]

Exceptions

cudf::logic_error if pattern is empty.

Parameters

input	A column of string elements to be split
prog	Regex program instance
maxsplit	Maximum number of splits to perform. Default of -1 indicates all possible splits on each string.
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned result's device memory

Returns: A table of columns of strings

◆ rsplit_record()

std::unique_ptr<column> cudf::strings::rsplit_record	(	strings_column_view const &	strings,
		string_scalar const &	delimiter = `string_scalar("")`,
		size_type	maxsplit = `-1`,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Splits individual strings elements into a list of strings starting from the end of each string.

Each element generates an array of strings that are stored in an output lists column.

The number of elements in the output column will be the same as the number of elements in the input column. Each individual list item will contain the new strings for that row. The resulting number of strings in each row can vary from 0 to maxsplit + 1.

The delimiter is searched from end to beginning within each string and splitting stops when either maxsplit or the beginning of the string is reached.

If a delimiter is not whitespace and occurs adjacent to another delimiter, an empty string is produced for that split occurrence. Likewise, a non-whitespace delimiter produces an empty string if it appears at the beginning or the end of a string.

Note that rsplit_record and split_record produce equivalent results for the default maxsplit value.

s = ["a_bc_def_g", "a__bc", "_ab_cd", "ab_cd_"]
s1 = rsplit_record(s, "_")
s1 is a lists column of strings:
    [ ["a", "bc", "def", "g"],
      ["a", "", "bc"],
      ["", "ab", "cd"],
      ["ab", "cd", ""] ]
s2 = rsplit_record(s, "_", 1)
s2 is a lists column of strings:
    [ ["a_bc_def", "g"],
      ["a_", "bc"],
      ["_ab", "cd"],
      ["ab_cd", ""] ]

A whitespace delimiter produces no empty strings.

s = ["a bc def", "a  bc", " ab cd", "ab cd "]
s1 = rsplit_record(s, "")
s1 is a lists column of strings:
    [ ["a", "bc", "def"],
      ["a", "bc"],
      ["ab", "cd"],
      ["ab", "cd"] ]
s2 = rsplit_record(s, "", 1)
s2 is a lists column of strings:
    [ ["a bc", "def"],
      ["a", "bc"],
      [" ab", "cd"],
      ["ab", "cd"] ]

A null string element will result in a null list item for that row.

Exceptions

cudf::logic_error if delimiter is invalid.

Parameters

strings	A column of string elements to be split
delimiter	The string to identify split points in each string; Default of empty string indicates split on whitespace.
maxsplit	Maximum number of splits to perform; Default of -1 indicates all possible splits on each string
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned result's device memory

Returns: Lists column of strings; Each row of the lists column holds splits from a single row element of the input column.

◆ rsplit_record_re()

std::unique_ptr<column> cudf::strings::rsplit_record_re	(	strings_column_view const &	input,
		regex_program const &	prog,
		size_type	maxsplit = `-1`,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Splits strings elements into a list column of strings using the given regex_program to delimit each string starting from the end of the string.

Each element generates a vector of strings that are stored in an output lists column – list[row] = [token1, token2, ...] found in input[row] where token is a substring between delimiters.

The number of elements in the output column will be the same as the number of elements in the input column. Each individual list item will contain the new strings for that row. The resulting number of strings in each row can vary from 0 to maxsplit + 1.

Splitting occurs by traversing starting from the end of the input string. The pattern is used to identify the separation points within a string and splitting stops when either maxsplit or the beginning of the string is reached.

An empty input string will produce a corresponding empty list item output row. A null row will produce a corresponding null output row.

The regex_program's regex_flags are ignored.

s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
p1 = regex_program::create("[_ ]")
s1 = rsplit_record_re(s, p1)
s1 is a lists column of strings:
    [ ["a", "bc", "def", "g"],
      ["a", "", "bc"],
      ["", "ab", "cd"],
      ["ab", "cd", ""] ]
p2 = regex_program::create("[ _]")
s2 = rsplit_record_re(s, p2, 1)
s2 is a lists column of strings:
    [ ["a_bc def", "g"],
      ["a_", "bc"],
      ["_ab", "cd"],
      ["ab_cd", ""] ]

See the Regex Features page for details on patterns supported by this API.

Exceptions

cudf::logic_error if pattern is empty.

Parameters

input	A column of string elements to be split
prog	Regex program instance
maxsplit	Maximum number of splits to perform. Default of -1 indicates all possible splits on each string.
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned result's device memory

Returns: Lists column of strings

◆ split()

std::unique_ptr<table> cudf::strings::split	(	strings_column_view const &	strings_column,
		string_scalar const &	delimiter = `string_scalar("")`,
		size_type	maxsplit = `-1`,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Returns a list of columns by splitting each string using the specified delimiter.

The number of rows in the output columns will be the same as the input column. The first column will contain the first tokens of each string as a result of the split. Subsequent columns contain the next token strings. Null entries are added for a row where split results have been exhausted. The total number of columns will equal the maximum number of tokens produced from any string in the input column.

Any null string entries produce corresponding null entries in each of the output columns.

Parameters

strings_column	Strings instance for this operation
delimiter	UTF-8 encoded string indicating the split points in each string; Default of empty string indicates split on whitespace.
maxsplit	Maximum number of splits to perform; Default of -1 indicates all possible splits on each string.
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned table's device memory

Returns: New table of strings columns

◆ split_part()

std::unique_ptr<column> cudf::strings::split_part	(	strings_column_view const &	input,
		string_scalar const &	delimiter = `string_scalar("")`,
		size_type	index = `0`,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Returns a column of strings by splitting each input string using the specified delimiter and returning the string at the specified index.

Any null rows in the input return corresponding null output rows. A null row is also returned if the 0-based index is out of range for that row, i.e. the number of tokens computed by splitting the string is less than or equal to the index.

Parameters

input	Strings instance for this operation
delimiter	UTF-8 encoded string indicating the split points in each string; Default of empty string indicates split on whitespace
index	The 0-based index of the string to return from the split
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned column's device memory

Returns: New column of strings

◆ split_re()

std::unique_ptr<table> cudf::strings::split_re	(	strings_column_view const &	input,
		regex_program const &	prog,
		size_type	maxsplit = `-1`,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Splits strings elements into a table of strings columns using a regex_program's pattern to delimit each string.

Each element generates a vector of strings that are stored in corresponding rows in the output table – table[col,row] = token[col] of strings[row] where token is a substring between delimiters.

The number of rows in the output table will be the same as the number of elements in the input column. The resulting number of columns will be the maximum number of tokens found in any input row.

The pattern is used to identify the delimiters within a string and splitting stops when either maxsplit or the end of the string is reached.

An empty input string will produce a corresponding empty string in the corresponding row of the first column. A null row will produce corresponding null rows in the output table.

The regex_program's regex_flags are ignored.

s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
p1 = regex_program::create("[_ ]")
s1 = split_re(s, p1)
s1 is a table of strings columns:
    [ ["a", "a", "", "ab"],
      ["bc", "", "ab", "cd"],
      ["def", "bc", "cd", ""],
      ["g", null, null, null] ]
p2 = regex_program::create("[ _]")
s2 = split_re(s, p2, 1)
s2 is a table of strings columns:
    [ ["a", "a", "", "ab"],
      ["bc def_g", "_bc", "ab cd", "cd "] ]

Exceptions

cudf::logic_error if pattern is empty.

Parameters

input	A column of string elements to be split
prog	Regex program instance
maxsplit	Maximum number of splits to perform. Default of -1 indicates all possible splits on each string.
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned result's device memory

Returns: A table of columns of strings

◆ split_record()

std::unique_ptr<column> cudf::strings::split_record	(	strings_column_view const &	strings,
		string_scalar const &	delimiter = `string_scalar("")`,
		size_type	maxsplit = `-1`,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Splits individual strings elements into a list of strings.

Each element generates an array of strings that are stored in an output lists column.

The number of elements in the output column will be the same as the number of elements in the input column. Each individual list item will contain the new strings for that row. The resulting number of strings in each row can vary from 0 to maxsplit + 1.

The delimiter is searched within each string from beginning to end and splitting stops when either maxsplit or the end of the string is reached.

If a delimiter is not whitespace and occurs adjacent to another delimiter, an empty string is produced for that split occurrence. Likewise, a non-whitespace delimiter produces an empty string if it appears at the beginning or the end of a string.

s = ["a_bc_def_g", "a__bc", "_ab_cd", "ab_cd_"]
s1 = split_record(s, "_")
s1 is a lists column of strings:
    [ ["a", "bc", "def", "g"],
      ["a", "", "bc"],
      ["", "ab", "cd"],
      ["ab", "cd", ""] ]
s2 = split_record(s, "_", 1)
s2 is a lists column of strings:
    [ ["a", "bc_def_g"],
      ["a", "_bc"],
      ["", "ab_cd"],
      ["ab", "cd_"] ]

A whitespace delimiter produces no empty strings.

s = ["a bc def", "a  bc", " ab cd", "ab cd "]
s1 = split_record(s, "")
s1 is a lists column of strings:
    [ ["a", "bc", "def"],
      ["a", "bc"],
      ["ab", "cd"],
      ["ab", "cd"] ]
s2 = split_record(s, "", 1)
s2 is a lists column of strings:
    [ ["a", "bc def"],
      ["a", "bc"],
      ["ab", "cd"],
      ["ab", "cd "] ]

A null string element will result in a null list item for that row.

Exceptions

cudf::logic_error if delimiter is invalid.

Parameters

strings	A column of string elements to be split
delimiter	The string to identify split points in each string; Default of empty string indicates split on whitespace.
maxsplit	Maximum number of splits to perform; Default of -1 indicates all possible splits on each string
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned result's device memory

Returns: Lists column of strings; Each row of the lists column holds splits from a single row element of the input column.

◆ split_record_re()

std::unique_ptr<column> cudf::strings::split_record_re	(	strings_column_view const &	input,
		regex_program const &	prog,
		size_type	maxsplit = `-1`,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::device_async_resource_ref	mr = `cudf::get_current_device_resource_ref()`
	)

Splits strings elements into a list column of strings using the given regex_program to delimit each string.

Each element generates an array of strings that are stored in an output lists column – list[row] = [token1, token2, ...] found in input[row] where token is a substring between delimiters.

The number of elements in the output column will be the same as the number of elements in the input column. Each individual list item will contain the new strings for that row. The resulting number of strings in each row can vary from 0 to maxsplit + 1.

The pattern is used to identify the delimiters within a string and splitting stops when either maxsplit or the end of the string is reached.

An empty input string will produce a corresponding empty list item output row. A null row will produce a corresponding null output row.

The regex_program's regex_flags are ignored.

s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
p1 = regex_program::create("[_ ]")
s1 = split_record_re(s, p1)
s1 is a lists column of strings:
    [ ["a", "bc", "def", "g"],
      ["a", "", "bc"],
      ["", "ab", "cd"],
      ["ab", "cd", ""] ]
p2 = regex_program::create("[ _]")
s2 = split_record_re(s, p2, 1)
s2 is a lists column of strings:
    [ ["a", "bc def_g"],
      ["a", "_bc"],
      ["", "ab cd"],
      ["ab", "cd "] ]

Exceptions

cudf::logic_error if pattern is empty.

See the Regex Features page for details on patterns supported by this API.

Parameters

input	A column of string elements to be split
prog	Regex program instance
maxsplit	Maximum number of splits to perform. Default of -1 indicates all possible splits on each string.
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned result's device memory

Returns: Lists column of strings

Files

Functions

Detailed Description

Function Documentation

◆ partition()

◆ rpartition()

◆ rsplit()

◆ rsplit_re()

◆ rsplit_record()

◆ rsplit_record_re()

◆ split()

◆ split_part()

◆ split_re()

◆ split_record()

◆ split_record_re()