Strings Classes#

group strings_classes
class string_view#
#include <string_view.hpp>

A non-owning, immutable view of device data that is a variable length char array representing a UTF-8 string.

The caller must maintain the device memory for the lifetime of this instance.

This may be used to wrap a device pointer and size but any member function that requires accessing the device memory must be called from a kernel.

Public Functions

inline size_type size_bytes() const#

Return the number of bytes in this string.

Returns:

The number of bytes in this string

inline size_type length() const#

Return the number of characters in this string.

Returns:

The number of characters in this string

inline char const *data() const#

Return a pointer to the internal device array.

Returns:

A pointer to the internal device array

inline bool empty() const#

Return true if string has no characters.

Returns:

true if string has no characters

inline const_iterator begin() const#

Return new iterator pointing to the beginning of this string.

Returns:

new iterator pointing to the beginning of this string

inline const_iterator end() const#

Return new iterator pointing past the end of this string.

Returns:

new iterator pointing past the end of this string

inline char_utf8 operator[](size_type pos) const#

Return single UTF-8 character at the given character position.

Parameters:

pos – Character position

Returns:

UTF-8 character at the given character position

inline size_type byte_offset(size_type pos) const#

Return the byte offset from data() for a given character position.

Parameters:

pos – Character position

Returns:

Byte offset from data() for a given character position

inline int compare(string_view const &str) const#

Compares the target string with this string. Each character is compared as a UTF-8 code-point value.

Parameters:

str – Target string to compare with this string.

Returns:

0 if the strings compare equal. <0 if either the first character of this string that does not match is lower than the corresponding character in str, or all compared characters match but this string is shorter. >0 if either the first character of this string that does not match is greater than the corresponding character in str, or all compared characters match but this string is longer.

inline int compare(char const *str, size_type bytes) const#

Compares the target string with this string. Each character is compared as a UTF-8 code-point value.

Parameters:
  • str – Target string to compare with this string.

  • bytes – Number of bytes in str.

Returns:

0 if the strings compare equal. <0 if either the first character of this string that does not match is lower than the corresponding character in str, or all compared characters match but this string is shorter. >0 if either the first character of this string that does not match is greater than the corresponding character in str, or all compared characters match but this string is longer.

inline bool operator==(string_view const &rhs) const#

Returns true if rhs matches this string exactly.

Parameters:

rhs – Target string to compare with this string.

Returns:

true if rhs matches this string exactly

inline bool operator!=(string_view const &rhs) const#

Returns true if rhs does not match this string.

Parameters:

rhs – Target string to compare with this string.

Returns:

true if rhs does not match this string

inline bool operator<(string_view const &rhs) const#

Returns true if this string is ordered before rhs.

Parameters:

rhs – Target string to compare with this string.

Returns:

true if this string is ordered before rhs

inline bool operator>(string_view const &rhs) const#

Returns true if rhs is ordered before this string.

Parameters:

rhs – Target string to compare with this string.

Returns:

true if rhs is ordered before this string

inline bool operator<=(string_view const &rhs) const#

Returns true if this string matches or is ordered before rhs.

Parameters:

rhs – Target string to compare with this string.

Returns:

true if this string matches or is ordered before rhs

inline bool operator>=(string_view const &rhs) const#

Returns true if rhs matches or is ordered before this string.

Parameters:

rhs – Target string to compare with this string.

Returns:

true if rhs matches or is ordered before this string

inline size_type find(string_view const &str, size_type pos = 0, size_type count = -1) const#

Returns the character position of the first occurrence where the argument str is found in this string within the character range [pos, pos+count).

Parameters:
  • str – Target string to search within this string.

  • pos – Character position to start search within this string.

  • count – Number of characters from pos to include in the search. Specify -1 to indicate to the end of the string.

Returns:

npos if str is not found in this string.

inline size_type find(char const *str, size_type bytes, size_type pos = 0, size_type count = -1) const#

Returns the character position of the first occurrence where the array str is found in this string within the character range [pos, pos+count).

Parameters:
  • str – Target array to search within this string.

  • bytes – Number of bytes in str.

  • pos – Character position to start search within this string.

  • count – Number of characters from pos to include in the search. Specify -1 to indicate to the end of the string.

Returns:

npos if arg string is not found in this string.

inline size_type find(char_utf8 character, size_type pos = 0, size_type count = -1) const#

Returns the character position of the first occurrence where character is found in this string within the character range [pos, pos+count).

Parameters:
  • character – Single encoded character.

  • pos – Character position to start search within this string.

  • count – Number of characters from pos to include in the search. Specify -1 to indicate to the end of the string.

Returns:

npos if character is not found in this string.

inline size_type rfind(string_view const &str, size_type pos = 0, size_type count = -1) const#

Returns the character position of the last occurrence where the argument str is found in this string within the character range [pos, pos+count).

Parameters:
  • str – Target string to search within this string.

  • pos – Character position to start search within this string.

  • count – Number of characters from pos to include in the search. Specify -1 to indicate to the end of the string.

Returns:

npos if arg string is not found in this string.

inline size_type rfind(char const *str, size_type bytes, size_type pos = 0, size_type count = -1) const#

Returns the character position of the last occurrence where the array str is found in this string within the character range [pos, pos+count).

Parameters:
  • str – Target array to search within this string.

  • bytes – Number of bytes in str.

  • pos – Character position to start search within this string.

  • count – Number of characters from pos to include in the search. Specify -1 to indicate to the end of the string.

Returns:

npos if arg string is not found in this string.

inline size_type rfind(char_utf8 character, size_type pos = 0, size_type count = -1) const#

Returns the character position of the last occurrence where character is found in this string within the character range [pos, pos+count).

Parameters:
  • character – Single encoded character.

  • pos – Character position to start search within this string.

  • count – Number of characters from pos to include in the search. Specify -1 to indicate to the end of the string.

Returns:

npos if character is not found in this string.

inline string_view substr(size_type start, size_type length) const#

Return a sub-string of this string. The original string and device memory must still be maintained for the lifetime of the returned instance.

Parameters:
  • start – Character position to start the sub-string.

  • length – Number of characters from start to include in the sub-string.

Returns:

New instance pointing to a subset of the characters within this instance.

inline string_view()#

Default constructor represents an empty string.

inline string_view(char const *data, size_type bytes)#

Create instance from existing device char array.

Parameters:
  • data – Device char array encoded in UTF8.

  • bytes – Number of bytes in data array.

string_view(string_view const&) = default#

Copy constructor.

string_view(string_view&&) = default#

Move constructor.

string_view &operator=(string_view const&) = default#

Copy assignment operator.

Returns:

Reference to this instance

string_view &operator=(string_view&&) = default#

Move assignment operator.

Returns:

Reference to this instance (the view is non-owning, so only the wrapped pointer and size are transferred)

Public Static Functions

static inline string_view min()#

Return minimum value associated with the string type.

This function needs to be host callable because it is called by the host-callable function DeviceMax::identity<string_view>().

Returns:

An empty string

static inline string_view max()#

Return maximum value associated with the string type.

This function needs to be host callable because it is called by the host-callable function DeviceMin::identity<string_view>().

Returns:

A string value which represents the highest possible valid UTF-8 encoded character.

Public Static Attributes

static cudf::size_type const npos = {-1}#

No-position value.

Used when specifying or returning an invalid or unknown character position value.

class const_iterator#
#include <string_view.hpp>

Handy iterator for navigating through encoded characters.

class strings_column_view : private cudf::column_view#
#include <strings_column_view.hpp>

Given a column-view of strings type, an instance of this class provides a wrapper on this compound column for strings operations.

Public Types

using offset_iterator = size_type const*#

offsets iterator type

using chars_iterator = char const*#

character iterator type

Public Functions

strings_column_view(column_view strings_column)#

Construct a new strings column view object from a column view.

Parameters:

strings_column – The column view to wrap.

strings_column_view(strings_column_view&&) = default#

Move constructor.

strings_column_view(strings_column_view const&) = default#

Copy constructor.

strings_column_view &operator=(strings_column_view const&) = default#

Copy assignment operator.

Returns:

Reference to this instance

strings_column_view &operator=(strings_column_view&&) = default#

Move assignment operator.

Returns:

Reference to this instance (after transferring ownership)

column_view parent() const#

Returns the parent column.

Returns:

The parent column

column_view offsets() const#

Returns the internal column of offsets.

Throws:

cudf::logic_error – if this is an empty column

Returns:

The offsets column

offset_iterator offsets_begin() const#

Return an iterator for the offsets child column.

Deprecated:

Since 24.04

This automatically applies the offset of the parent.

Returns:

Iterator pointing to the first offset value.

offset_iterator offsets_end() const#

Return an end iterator for the offsets child column.

Deprecated:

Since 24.04

This automatically applies the offset of the parent.

Returns:

Iterator pointing 1 past the last offset value.

int64_t chars_size(rmm::cuda_stream_view stream) const noexcept#

Returns the number of bytes in the chars child column.

This accounts for empty columns but does not reflect a sliced parent column view (i.e.: non-zero offset or reduced row count).

Parameters:

stream – CUDA stream used for device memory operations and kernel launches

Returns:

Number of bytes in the chars child column

chars_iterator chars_begin(rmm::cuda_stream_view) const#

Return an iterator for the chars child column.

This does not apply the offset of the parent. The offsets child must be used to properly address the char bytes.

For example, to access the first character of string i (accounting for a sliced column offset) use: chars_begin(stream)[offsets_begin()[i]].

Returns:

Iterator pointing to the first char byte.

chars_iterator chars_end(rmm::cuda_stream_view stream) const#

Return an end iterator for the chars child column.

This does not apply the offset of the parent. The offsets child must be used to properly address the char bytes.

Parameters:

stream – CUDA stream used for device memory operations and kernel launches

Returns:

Iterator pointing 1 past the last char byte.

Public Static Attributes

static constexpr size_type offsets_column_index = {0}#

Child index of the offsets column.