libwholegraph API doc#

Doxygen WholeGraph C API documentation#

For doxygen documentation, please refer to the Doxygen Documentation

WholeGraph C API documentation#

Library Level APIs#

enum wholememory_error_code_t#

WholeMemory Error Code definition.

Defines error code of WholeMemory library.

Values:

enumerator WHOLEMEMORY_SUCCESS#

success

enumerator WHOLEMEMORY_UNKNOW_ERROR#

unknown error

enumerator WHOLEMEMORY_NOT_IMPLEMENTED#

method is not implemented

enumerator WHOLEMEMORY_LOGIC_ERROR#

logic error

enumerator WHOLEMEMORY_CUDA_ERROR#

CUDA error

enumerator WHOLEMEMORY_COMMUNICATION_ERROR#

communication error

enumerator WHOLEMEMORY_INVALID_INPUT#

input is invalid, e.g. nullptr

enumerator WHOLEMEMORY_INVALID_VALUE#

input value is invalid

enumerator WHOLEMEMORY_OUT_OF_MEMORY#

out of memory

enumerator WHOLEMEMORY_NOT_SUPPORTED#

not supported

enumerator WHOLEMEMORY_SYSTEM_ERROR#

system error

wholememory_error_code_t wholememory_init(unsigned int flags, LogLevel log_level = LEVEL_INFO)#

Initialize WholeMemory library

Parameters:
  • flags – : reserved, should be 0

  • log_level – : wholememory log level, the default level is “info”

Returns:

: wholememory_error_code_t

wholememory_error_code_t wholememory_finalize()#

Finalize WholeMemory library

Returns:

: wholememory_error_code_t

int fork_get_device_count()#

Fork a new process and get the device count. Should be called before any other CUDA calls

Returns:

: CUDA device count, -1 on error
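
A minimal end-to-end sketch of library setup and teardown, assuming the conventional wholememory/wholememory.h header and eliding error handling:

```c
/* Sketch: probe devices before any CUDA call, then init/finalize the library. */
#include <wholememory/wholememory.h>

int main(void) {
  int dev_count = fork_get_device_count(); /* must run before other CUDA calls */
  if (dev_count <= 0) return 1;

  if (wholememory_init(0 /* flags, reserved */, LEVEL_INFO) != WHOLEMEMORY_SUCCESS)
    return 1;

  /* ... create communicators, allocate WholeMemory, run ops ... */

  return wholememory_finalize() == WHOLEMEMORY_SUCCESS ? 0 : 1;
}
```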

WholeMemory Communicator APIs#

typedef struct wholememory_comm_ *wholememory_comm_t#

Opaque handle to communicator.

An opaque handle to a communicator.

struct wholememory_unique_id_t#

Unique ID for WholeMemory Communicators.

An opaque handle for WholeMemory Communicators, exposed as a char array. The underlying implementation may be ncclUniqueId_t

wholememory_error_code_t wholememory_create_unique_id(wholememory_unique_id_t *unique_id)#

Create UniqueID for WholeMemory Communicator

Parameters:

unique_id – : returned UniqueID

Returns:

: wholememory_error_code_t

wholememory_error_code_t wholememory_create_communicator(wholememory_comm_t *comm, wholememory_unique_id_t unique_id, int rank, int size)#

Create WholeMemory Communicator

Parameters:
  • comm – : returned WholeMemory Communicator

  • unique_id – : UniqueID

  • rank – : rank of this process.

  • size – : number of processes in this Communicator

Returns:

: wholememory_error_code_t

wholememory_error_code_t wholememory_destroy_communicator(wholememory_comm_t comm)#

Destroy WholeMemory Communicator

Parameters:

comm – : WholeMemory Communicator to destroy

Returns:

: wholememory_error_code_t

wholememory_error_code_t wholememory_communicator_get_rank(int *rank, wholememory_comm_t comm)#

Get the rank of current process in the WholeMemory Communicator

Parameters:
  • rank – : returned rank

  • comm – : WholeMemory Communicator

Returns:

: wholememory_error_code_t

wholememory_error_code_t wholememory_communicator_get_size(int *size, wholememory_comm_t comm)#

Get the size of WholeMemory Communicator

Parameters:
  • size – : returned size

  • comm – : WholeMemory Communicator

Returns:

: wholememory_error_code_t

wholememory_error_code_t wholememory_communicator_barrier(wholememory_comm_t comm)#

Barrier on WholeMemory Communicator

Parameters:

comm – : WholeMemory Communicator

Returns:

: wholememory_error_code_t
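
Putting the communicator APIs together: rank 0 creates the unique ID, distributes it to all other ranks, and every rank then creates the communicator. How the ID travels between processes is application-defined; bcast_unique_id below is a hypothetical placeholder for that exchange (e.g. an MPI_Bcast):

```c
/* Sketch: communicator setup across `size` ranks. Error checks elided. */
wholememory_unique_id_t unique_id;
wholememory_comm_t comm;

if (rank == 0) {
  wholememory_create_unique_id(&unique_id);
}
bcast_unique_id(&unique_id, /*root=*/0); /* hypothetical transport helper */

wholememory_create_communicator(&comm, unique_id, rank, size);

int comm_rank = -1, comm_size = -1;
wholememory_communicator_get_rank(&comm_rank, comm);
wholememory_communicator_get_size(&comm_size, comm);

wholememory_communicator_barrier(comm); /* synchronize all ranks */
/* ... use the communicator ... */
wholememory_destroy_communicator(comm);
```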

WholeMemoryHandle APIs#

enum wholememory_memory_type_t#

Memory Type of WholeMemory.

Memory Type is the Memory Address Mapping Type of WholeMemory

Values:

enumerator WHOLEMEMORY_MT_NONE#

Not defined.

enumerator WHOLEMEMORY_MT_CONTINUOUS#

Memory from all ranks is mapped into a continuous address space

enumerator WHOLEMEMORY_MT_CHUNKED#

Memory from all ranks is mapped into a chunked address space

enumerator WHOLEMEMORY_MT_DISTRIBUTED#

Memory from other ranks is not mapped.

enumerator WHOLEMEMORY_MT_HIERARCHY#

Memory from other ranks is mapped into a hierarchical address space

enum wholememory_memory_location_t#

Memory Location of WholeMemory.

Memory Location of WholeMemory can be host or device.

Values:

enumerator WHOLEMEMORY_ML_NONE#

Not defined

enumerator WHOLEMEMORY_ML_DEVICE#

Device Memory

enumerator WHOLEMEMORY_ML_HOST#

Host Memory

typedef struct wholememory_handle_ *wholememory_handle_t#

Opaque handle to WholeMemory.

An opaque handle to WholeMemory.

struct wholememory_gref_t#

Global reference of a WholeMemory object.

A global reference is used with the Continuous and Chunked WholeMemory types; in these types each rank can directly access the memory of all ranks, and the global reference enables that direct access.

wholememory_error_code_t wholememory_malloc(wholememory_handle_t *wholememory_handle_ptr, size_t total_size, wholememory_comm_t comm, wholememory_memory_type_t memory_type, wholememory_memory_location_t memory_location, size_t data_granularity, size_t *rank_entry_partition = nullptr)#

Malloc WholeMemory

Parameters:
  • wholememory_handle_ptr – : returned WholeMemory Handle

  • total_size – : total allocated size in bytes.

  • comm – : WholeMemory Communicator

  • memory_type – : WholeMemory type

  • memory_location – : memory location, host or device

  • data_granularity – : granularity size of data, which is guaranteed not to be partitioned.

  • rank_entry_partition – : entry count of each rank (the size of each entry equals data_granularity)

Returns:

: wholememory_error_code_t
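
For example, the following sketch (reusing comm from the previous section) allocates a continuous, host-located region with float granularity and queries the slice owned by the calling rank:

```c
/* Sketch: allocate, inspect this rank's local slice, and free. */
wholememory_handle_t handle;
size_t total_size = 1024ull * 1024ull * sizeof(float);

wholememory_malloc(&handle, total_size, comm,
                   WHOLEMEMORY_MT_CONTINUOUS, WHOLEMEMORY_ML_HOST,
                   sizeof(float) /* data_granularity */,
                   NULL /* rank_entry_partition: default even split */);

void*  local_ptr    = NULL;
size_t local_size   = 0;
size_t local_offset = 0;
wholememory_get_local_memory(&local_ptr, &local_size, &local_offset, handle);
/* This rank owns bytes [local_offset, local_offset + local_size) of the
   allocation and can read/write them through local_ptr. */

wholememory_free(handle);
```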

wholememory_error_code_t wholememory_free(wholememory_handle_t wholememory_handle)#

Free allocated WholeMemory Handle

Parameters:

wholememory_handle – : WholeMemory Handle to free

Returns:

: wholememory_error_code_t

wholememory_error_code_t wholememory_get_communicator(wholememory_comm_t *comm, wholememory_handle_t wholememory_handle)#

Get underlying WholeMemory Communicator from WholeMemory Handle

Parameters:
  • comm – : returned WholeMemory Communicator

  • wholememory_handle – : WholeMemory Handle

Returns:

: wholememory_error_code_t

wholememory_memory_type_t wholememory_get_memory_type(wholememory_handle_t wholememory_handle)#

Get WholeMemory Type

Parameters:

wholememory_handle – : WholeMemory Handle

Returns:

: WholeMemory Type

wholememory_memory_location_t wholememory_get_memory_location(wholememory_handle_t wholememory_handle)#

Get WholeMemory Location

Parameters:

wholememory_handle – : WholeMemory Handle

Returns:

: WholeMemory Location

size_t wholememory_get_total_size(wholememory_handle_t wholememory_handle)#

Get total size of WholeMemory

Parameters:

wholememory_handle – : WholeMemory Handle

Returns:

: total size

size_t wholememory_get_data_granularity(wholememory_handle_t wholememory_handle)#

Get data granularity of WholeMemory Handle

Parameters:

wholememory_handle – : WholeMemory Handle

Returns:

: data granularity size

wholememory_error_code_t wholememory_get_local_memory(void **local_ptr, size_t *local_size, size_t *local_offset, wholememory_handle_t wholememory_handle)#

Get the current rank's local memory from a WholeMemory Handle. The local memory is directly accessible, but it does not have to reside on the local GPU.

Parameters:
  • local_ptr – : returned local memory pointer

  • local_size – : returned local memory size

  • local_offset – : returned local memory offset from WholeMemory

  • wholememory_handle – : WholeMemory Handle

Returns:

: wholememory_error_code_t

wholememory_error_code_t wholememory_get_rank_memory(void **rank_memory_ptr, size_t *rank_memory_size, size_t *rank_memory_offset, int rank, wholememory_handle_t wholememory_handle)#

Get local memory of specified rank from WholeMemory Handle

Parameters:
  • rank_memory_ptr – : returned local memory pointer of specified rank

  • rank_memory_size – : returned local memory size of specified rank

  • rank_memory_offset – : returned local memory offset of specified rank from WholeMemory

  • rank – : rank specified

  • wholememory_handle – : WholeMemory Handle

Returns:

: wholememory_error_code_t

wholememory_error_code_t wholememory_get_global_pointer(void **global_ptr, wholememory_handle_t wholememory_handle)#

Get global memory pointer from WholeMemory Handle. Only the Continuous memory type or Chunked Host memory has a global pointer.

Parameters:
  • global_ptr – : returned pointer of WholeMemory

  • wholememory_handle – : WholeMemory Handle

Returns:

: wholememory_error_code_t

wholememory_error_code_t wholememory_get_global_reference(wholememory_gref_t *wholememory_gref, wholememory_handle_t wholememory_handle)#

Get global reference from WholeMemory Handle. The WholeMemory global reference is a common data structure for the Continuous and Chunked memory types.

Parameters:
  • wholememory_gref – : returned WholeMemory global reference

  • wholememory_handle – : WholeMemory Handle

Returns:

: wholememory_error_code_t

wholememory_error_code_t wholememory_load_from_file(wholememory_handle_t wholememory_handle, size_t memory_offset, size_t memory_entry_size, size_t file_entry_size, const char **file_names, int file_count, int round_robin_size)#

Load WholeMemory from binary files; all ranks should call this together

Parameters:
  • wholememory_handle – : WholeMemory Handle

  • memory_offset – : load to memory offset

  • memory_entry_size – : entry size of WholeMemory

  • file_entry_size – : entry size in file, should be less than or equal to memory_entry_size

  • file_names – : file names, all binary files will be logically concatenated and loaded.

  • file_count – : number of files.

  • round_robin_size – : number of contiguous entries per rank under round-robin shard mode

Returns:

: wholememory_error_code_t

wholememory_error_code_t wholememory_store_to_file(wholememory_handle_t wholememory_handle, size_t memory_offset, size_t memory_entry_stride, size_t file_entry_size, const char *local_file_name)#

Store local WholeMemory to a file. This should be called by all ranks, each with a different local_file_name.

Parameters:
  • wholememory_handle – : WholeMemory Handle

  • memory_offset – : memory offset to store

  • memory_entry_stride – : entry stride of WholeMemory

  • file_entry_size – : entry size in file, should be less than or equal to memory_entry_stride

  • local_file_name – : local file to store to

Returns:

: wholememory_error_code_t
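
A store/load round trip might look like the sketch below, where each rank writes its slice to its own file and the ordered file list is later reloaded by all ranks together. handle and rank come from the earlier examples; file names and entry sizes are illustrative:

```c
/* Sketch: persist each rank's slice, then reload the concatenated files. */
#include <stdio.h>

size_t entry_size = 128 * sizeof(float); /* illustrative: one 128-float row */
char local_name[64];
snprintf(local_name, sizeof(local_name), "emb_part_%d.bin", rank);

wholememory_store_to_file(handle, 0 /* memory_offset */,
                          entry_size /* memory_entry_stride */,
                          entry_size /* file_entry_size */, local_name);

/* Later: every rank passes the same ordered file list. */
const char* file_names[] = {"emb_part_0.bin", "emb_part_1.bin"};
wholememory_load_from_file(handle, 0 /* memory_offset */,
                           entry_size /* memory_entry_size */,
                           entry_size /* file_entry_size */,
                           file_names, 2 /* file_count */,
                           0 /* round_robin_size: disabled */);
```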

WholeMemoryTensor APIs#

enum wholememory_dtype_t#

defines WholeMemory data type for tensors

Values:

enumerator WHOLEMEMORY_DT_UNKNOWN#

Unknown type

enumerator WHOLEMEMORY_DT_FLOAT#

32-bit float type

enumerator WHOLEMEMORY_DT_HALF#

16-bit half float type

enumerator WHOLEMEMORY_DT_DOUBLE#

64-bit double type

enumerator WHOLEMEMORY_DT_BF16#

16-bit bfloat type

enumerator WHOLEMEMORY_DT_INT#

32-bit signed integer type

enumerator WHOLEMEMORY_DT_INT64#

64-bit signed integer type

enumerator WHOLEMEMORY_DT_INT16#

16-bit signed integer type

enumerator WHOLEMEMORY_DT_INT8#

8-bit signed integer type

enumerator WHOLEMEMORY_DT_COUNT#

total count of types

struct wholememory_array_description_t#

wrapper for array in WholeMemory

struct wholememory_matrix_description_t#

wrapper for matrix in WholeMemory

struct wholememory_tensor_description_t#

Tensor description in WholeMemory; dimension 0 is the slowest-changing dimension.

typedef struct wholememory_tensor_ *wholememory_tensor_t#

Opaque handle to WholeMemoryTensor.

An opaque handle to a WholeMemoryTensor.

size_t wholememory_dtype_get_element_size(wholememory_dtype_t dtype)#

Get element size of wholememory_dtype_t

Parameters:

dtype – : wholememory_dtype_t

Returns:

: element size of dtype, -1 on invalid dtype.

bool wholememory_dtype_is_floating_number(wholememory_dtype_t dtype)#

Check if dtype is floating number

Parameters:

dtype – : wholememory_dtype_t

Returns:

: True if dtype is WHOLEMEMORY_DT_FLOAT, WHOLEMEMORY_DT_HALF, WHOLEMEMORY_DT_DOUBLE or WHOLEMEMORY_DT_BF16. False otherwise.

bool wholememory_dtype_is_integer_number(wholememory_dtype_t dtype)#

Check if dtype is integer number

Parameters:

dtype – : wholememory_dtype_t

Returns:

: True if dtype is WHOLEMEMORY_DT_INT, WHOLEMEMORY_DT_INT64, WHOLEMEMORY_DT_INT16 or WHOLEMEMORY_DT_INT8, False otherwise.

wholememory_array_description_t wholememory_create_array_desc(int64_t size, int64_t storage_offset, wholememory_dtype_t dtype)#

Create wholememory_array_description_t object

Parameters:
  • size – : array size in number of elements

  • storage_offset – : storage offset in number of elements

  • dtype – : data type of array elements

Returns:

created wholememory_array_description_t

wholememory_matrix_description_t wholememory_create_matrix_desc(int64_t sizes[2], int64_t stride, int64_t storage_offset, wholememory_dtype_t dtype)#

Create wholememory_matrix_description_t object

Parameters:
  • sizes – : matrix sizes array, counted in number of elements, sizes[1] changes fastest.

  • stride – : stride of the first dimension (the slower-changing dimension), counted in number of elements

  • storage_offset – : storage offset in number of elements

  • dtype – : data type of matrix elements

Returns:

created wholememory_matrix_description_t
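
As a small sketch, describing a 100000 x 128 float matrix with contiguously stored rows, plus an int64 index array, looks like this (values are illustrative):

```c
/* Sketch: build matrix and array descriptions. */
int64_t sizes[2] = {100000, 128}; /* sizes[1] changes fastest */
wholememory_matrix_description_t mat_desc =
    wholememory_create_matrix_desc(sizes, 128 /* stride = row width */,
                                   0 /* storage_offset */, WHOLEMEMORY_DT_FLOAT);

wholememory_array_description_t idx_desc =
    wholememory_create_array_desc(100000, 0, WHOLEMEMORY_DT_INT64);

size_t elem = wholememory_dtype_get_element_size(WHOLEMEMORY_DT_FLOAT); /* 4 */
```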

void wholememory_initialize_tensor_desc(wholememory_tensor_description_t *p_tensor_description)#

Initialize wholememory_tensor_description_t: set all sizes and strides to one, storage_offset to 0, dtype to WHOLEMEMORY_DT_UNKNOWN, and dim to 0.

Parameters:

p_tensor_description – : pointer to wholememory_tensor_description_t.

void wholememory_copy_array_desc_to_matrix(wholememory_matrix_description_t *p_matrix_description, wholememory_array_description_t *p_array_description)#

Copy array description to matrix description

Parameters:
  • p_matrix_description – : pointer to the output wholememory_matrix_description_t.

  • p_array_description – : pointer to the input wholememory_array_description_t.

void wholememory_copy_array_desc_to_tensor(wholememory_tensor_description_t *p_tensor_description, wholememory_array_description_t *p_array_description)#

Copy array description to tensor description

Parameters:
  • p_tensor_description – : pointer to the output wholememory_tensor_description_t.

  • p_array_description – : pointer to the input wholememory_array_description_t.

void wholememory_copy_matrix_desc_to_tensor(wholememory_tensor_description_t *p_tensor_description, wholememory_matrix_description_t *p_matrix_description)#

Copy matrix description to tensor description

Parameters:
  • p_tensor_description – : pointer to the output wholememory_tensor_description_t.

  • p_matrix_description – : pointer to the input wholememory_matrix_description_t.
bool wholememory_convert_tensor_desc_to_array(wholememory_array_description_t *p_array_description, wholememory_tensor_description_t *p_tensor_description)#

Convert tensor description to array description

Parameters:
  • p_array_description – : pointer to the output wholememory_array_description_t.

  • p_tensor_description – : pointer to the input wholememory_tensor_description_t.

Returns:

: Return true if convertible, else false.

bool wholememory_convert_tensor_desc_to_matrix(wholememory_matrix_description_t *p_matrix_description, wholememory_tensor_description_t *p_tensor_description)#

Convert tensor description to matrix description

Parameters:
  • p_matrix_description – : pointer to the output wholememory_matrix_description_t.

  • p_tensor_description – : pointer to the input wholememory_tensor_description_t.

Returns:

: Return true if convertible, else false.

int64_t wholememory_get_memory_element_count_from_array(wholememory_array_description_t *p_array_description)#

Get total element count from array description.

Parameters:

p_array_description – : pointer to wholememory_array_description_t.

Returns:

: Return element count.

int64_t wholememory_get_memory_size_from_array(wholememory_array_description_t *p_array_description)#

Get total memory size from array description.

Parameters:

p_array_description – : pointer to wholememory_array_description_t.

Returns:

: Return memory size.

int64_t wholememory_get_memory_element_count_from_matrix(wholememory_matrix_description_t *p_matrix_description)#

Get total element count from matrix description.

Parameters:

p_matrix_description – : pointer to wholememory_matrix_description_t.

Returns:

: Return element count.

int64_t wholememory_get_memory_size_from_matrix(wholememory_matrix_description_t *p_matrix_description)#

Get total memory size from matrix description.

Parameters:

p_matrix_description – : pointer to wholememory_matrix_description_t.

Returns:

: Return memory size.

int64_t wholememory_get_memory_element_count_from_tensor(wholememory_tensor_description_t *p_tensor_description)#

Get total element count from tensor description.

Parameters:

p_tensor_description – : pointer to wholememory_tensor_description_t.

Returns:

: Return element count.

int64_t wholememory_get_memory_size_from_tensor(wholememory_tensor_description_t *p_tensor_description)#

Get total memory size from tensor description.

Parameters:

p_tensor_description – : pointer to wholememory_tensor_description_t.

Returns:

: Return memory size.

bool wholememory_unsqueeze_tensor(wholememory_tensor_description_t *p_tensor_description, int dim)#

Unsqueeze tensor, inserting a dimension of size one at the given position.

Parameters:
  • p_tensor_description – : pointer to the wholememory_tensor_description_t to unsqueeze.

  • dim – : dimension at which to insert the new axis.

Returns:

: true on success, else false

wholememory_error_code_t wholememory_create_tensor(wholememory_tensor_t *wholememory_tensor, wholememory_tensor_description_t *tensor_description, wholememory_comm_t comm, wholememory_memory_type_t memory_type, wholememory_memory_location_t memory_location, size_t *tensor_entry_partition = nullptr)#

Create WholeMemory Tensor

Parameters:
  • wholememory_tensor – : returned WholeMemory Tensor handle

  • tensor_description – : description of the WholeMemory Tensor; should be a 1-D or 2-D continuous tensor without offset.

  • comm – : WholeMemory Communicator

  • memory_type – : Memory Type of the underlying WholeMemory

  • memory_location – : Memory Location of the underlying WholeMemory

  • tensor_entry_partition – : Tensor entry count of each rank, the length must be world_size.

Returns:

: wholememory_error_code_t
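
A sketch of creating a 2-D embedding-table tensor, distributed across the communicator and stored on device. The description field names (dim, sizes, strides, dtype) are assumed from the struct's documented role; comm comes from the earlier examples:

```c
/* Sketch: a 100000 x 128 float WholeMemory Tensor. Error checks elided. */
wholememory_tensor_description_t desc;
wholememory_initialize_tensor_desc(&desc);
desc.dim        = 2;
desc.sizes[0]   = 100000; /* rows, partitioned across ranks */
desc.sizes[1]   = 128;    /* embedding width */
desc.strides[0] = 128;    /* continuous rows: strides[0] == sizes[1] */
desc.strides[1] = 1;
desc.dtype      = WHOLEMEMORY_DT_FLOAT;

wholememory_tensor_t tensor;
wholememory_create_tensor(&tensor, &desc, comm,
                          WHOLEMEMORY_MT_DISTRIBUTED, WHOLEMEMORY_ML_DEVICE,
                          NULL /* tensor_entry_partition: even split */);
/* ... */
wholememory_destroy_tensor(tensor);
```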

wholememory_error_code_t wholememory_destroy_tensor(wholememory_tensor_t wholememory_tensor)#

Destroy WholeMemory Tensor

Parameters:

wholememory_tensor – : WholeMemory Tensor to destroy

Returns:

: wholememory_error_code_t

wholememory_error_code_t wholememory_make_tensor_from_pointer(wholememory_tensor_t *wholememory_tensor, void *storage_ptr, wholememory_tensor_description_t *tensor_description)#

Make WholeMemory Tensor from local memory

Parameters:
  • wholememory_tensor – : returned WholeMemory Tensor handle

  • storage_ptr – : pointer to the underlying storage memory. Note: the storage pointer may not be the same as the data pointer.

  • tensor_description – : description of the WholeMemory Tensor, should be 1-D or 2-D

Returns:

: wholememory_error_code_t

wholememory_error_code_t wholememory_make_tensor_from_handle(wholememory_tensor_t *wholememory_tensor, wholememory_handle_t wholememory_handle, wholememory_tensor_description_t *tensor_description)#

Make WholeMemory Tensor from WholeMemory Handle

Parameters:
  • wholememory_tensor – : returned WholeMemory Tensor handle

  • wholememory_handle – : WholeMemory Handle

  • tensor_description – : description of the WholeMemory Tensor, should be 1-D or 2-D

Returns:

: wholememory_error_code_t

bool wholememory_tensor_has_handle(wholememory_tensor_t wholememory_tensor)#

Check whether the tensor has a WholeMemory Handle; a WholeMemory Tensor created by wholememory_make_tensor_from_pointer has no Handle

Parameters:

wholememory_tensor – : WholeMemory Tensor

Returns:

: true if the tensor has a WholeMemory Handle

wholememory_handle_t wholememory_tensor_get_memory_handle(wholememory_tensor_t wholememory_tensor)#

Get WholeMemory handle from WholeMemory Tensor

Parameters:

wholememory_tensor – : WholeMemory Tensor

Returns:

: WholeMemory handle

wholememory_tensor_description_t *wholememory_tensor_get_tensor_description(wholememory_tensor_t wholememory_tensor)#

Get tensor description from WholeMemory Tensor

Parameters:

wholememory_tensor – : WholeMemory Tensor

Returns:

: pointer to the underlying wholememory_tensor_description_t

wholememory_error_code_t wholememory_tensor_get_global_reference(wholememory_tensor_t wholememory_tensor, wholememory_gref_t *wholememory_gref)#

Get global reference from WholeMemory Tensor

Parameters:
  • wholememory_tensor – : WholeMemory Tensor

  • wholememory_gref – : global reference

Returns:

: wholememory_error_code_t

wholememory_error_code_t wholememory_tensor_map_local_tensor(wholememory_tensor_t wholememory_tensor, wholememory_tensor_t *local_tensor)#

Map the local tensor of a WholeMemory Tensor. Only 1-D and 2-D tensors with a WholeMemory Handle are supported. For a 1-D tensor, storage_offset should be 0; for a 2-D tensor, storage_offset + sizes[1] should be <= stride[0].

Parameters:
  • wholememory_tensor – : WholeMemory Tensor.

  • local_tensor – : returned local tensor, which needs to be destroyed by the caller.

Returns:

: wholememory_error_code_t

void *wholememory_tensor_get_data_pointer(wholememory_tensor_t wholememory_tensor)#

Get data pointer from WholeMemory Tensor

Parameters:

wholememory_tensor – : WholeMemory Tensor

Returns:

: pointer to the first data element, for CONTINUOUS WholeMemory or tensors not backed by WholeMemory.

wholememory_error_code_t wholememory_tensor_get_subtensor(wholememory_tensor_t wholememory_tensor, int64_t *starts, int64_t *ends, wholememory_tensor_t *sub_wholememory_tensor)#

Get sub tensor of a WholeMemory Tensor

Parameters:
  • wholememory_tensor – : WholeMemory Tensor

  • starts – : starts of each dim, length should be the dim of wholememory_tensor.

  • ends – : ends of each dim, length should be the dim of wholememory_tensor

  • sub_wholememory_tensor – : pointer to returned sub tensor

Returns:

: wholememory_error_code_t

wholememory_tensor_t wholememory_tensor_get_root(wholememory_tensor_t wholememory_tensor)#

Get the root tensor of a WholeMemory Tensor; the root is not a sub tensor of any other WholeMemory Tensor.

Parameters:

wholememory_tensor – : WholeMemory Tensor

Returns:

: the root of the current WholeMemory Tensor; may be the same as wholememory_tensor.

Ops on WholeMemory Tensors#

wholememory_error_code_t wholememory_gather(wholememory_tensor_t wholememory_tensor, wholememory_tensor_t indices_tensor, wholememory_tensor_t output_tensor, wholememory_env_func_t *p_env_fns, void *stream, int gather_sms = -1)#

Gather Op

Parameters:
  • wholememory_tensor – : WholeMemory Tensor of embedding table.

  • indices_tensor – : indices to gather from, should NOT be a WholeMemory Tensor

  • output_tensor – : output tensor to gather to, should NOT be a WholeMemory Tensor

  • p_env_fns – : pointers to environment functions.

  • stream – : cudaStream_t to use.

  • gather_sms – : the number of streaming multiprocessors (SMs) to use in the gather kernel

Returns:

: wholememory_error_code_t
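
For instance, device buffers holding the indices and the gather output can be wrapped as non-WholeMemory tensors with wholememory_make_tensor_from_pointer and passed to the op. In this sketch, d_indices, d_output, n_indices, wm_embedding_tensor, env_fns, and stream are assumed to be provided by the surrounding application:

```c
/* Sketch: gather n_indices rows from a WholeMemory embedding tensor. */
wholememory_tensor_description_t idx_desc;
wholememory_initialize_tensor_desc(&idx_desc);
idx_desc.dim = 1;
idx_desc.sizes[0] = n_indices;
idx_desc.strides[0] = 1;
idx_desc.dtype = WHOLEMEMORY_DT_INT64;

wholememory_tensor_description_t out_desc;
wholememory_initialize_tensor_desc(&out_desc);
out_desc.dim = 2;
out_desc.sizes[0] = n_indices;
out_desc.sizes[1] = 128; /* embedding width, illustrative */
out_desc.strides[0] = 128;
out_desc.strides[1] = 1;
out_desc.dtype = WHOLEMEMORY_DT_FLOAT;

wholememory_tensor_t indices_tensor, output_tensor;
wholememory_make_tensor_from_pointer(&indices_tensor, d_indices, &idx_desc);
wholememory_make_tensor_from_pointer(&output_tensor, d_output, &out_desc);

wholememory_gather(wm_embedding_tensor, indices_tensor, output_tensor,
                   env_fns, stream, -1 /* gather_sms: library default */);
```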

wholememory_error_code_t wholememory_scatter(wholememory_tensor_t input_tensor, wholememory_tensor_t indices_tensor, wholememory_tensor_t wholememory_tensor, wholememory_env_func_t *p_env_fns, void *stream, int scatter_sms = -1)#

Scatter Op

Parameters:
  • input_tensor – : input tensor to scatter from, should NOT be a WholeMemory Tensor

  • indices_tensor – : indices to scatter to, should NOT be a WholeMemory Tensor

  • wholememory_tensor – : WholeMemory Tensor of embedding table.

  • p_env_fns – : pointers to environment functions.

  • stream – : cudaStream_t to use.

  • scatter_sms – : the number of streaming multiprocessors (SMs) to use in the scatter kernel

Returns:

: wholememory_error_code_t

WholeTensorEmbedding APIs#

typedef struct wholememory_embedding_cache_policy_ *wholememory_embedding_cache_policy_t#

Opaque handle to WholeMemory Embedding Cache Policy.

An opaque handle to a WholeMemory Embedding Cache Policy.

typedef struct wholememory_embedding_optimizer_ *wholememory_embedding_optimizer_t#

Opaque handle to WholeMemory Embedding Optimizer.

An opaque handle to a WholeMemory Embedding Optimizer.

typedef struct wholememory_embedding_ *wholememory_embedding_t#

Opaque handle to WholeMemory Embedding.

An opaque handle to a WholeMemory Embedding.

enum wholememory_access_type_t#

defines access type of WholeMemory Embedding

Values:

enumerator WHOLEMEMORY_AT_NONE#

Not defined

enumerator WHOLEMEMORY_AT_READONLY#

Only have readonly access to the WholeMemory

enumerator WHOLEMEMORY_AT_READWRITE#

May have write access to the WholeMemory

enum wholememory_optimizer_type_t#

defines optimizer type for WholeMemory Embedding

Values:

enumerator WHOLEMEMORY_OPT_NONE#

No optimizer needed

enumerator WHOLEMEMORY_OPT_SGD#

Use SGD optimizer

enumerator WHOLEMEMORY_OPT_LAZY_ADAM#

Use Lazy Adam optimizer

enumerator WHOLEMEMORY_OPT_RMSPROP#

Use RMSProp optimizer

enumerator WHOLEMEMORY_OPT_ADAGRAD#

Use AdaGrad optimizer

wholememory_error_code_t wholememory_create_embedding_optimizer(wholememory_embedding_optimizer_t *optimizer, wholememory_optimizer_type_t optimizer_type)#

Create Optimizer

Parameters:
  • optimizer – : Returned wholememory_embedding_optimizer_t

  • optimizer_type – : Optimizer type

Returns:

: wholememory_error_code_t

wholememory_error_code_t wholememory_optimizer_set_parameter(wholememory_embedding_optimizer_t optimizer, const char *parameter_name, void *value)#

Set parameter for optimizer.

Parameters:
  • optimizer – : Optimizer to set parameter

  • parameter_name – : parameter name

  • value – : parameter value

Returns:

: wholememory_error_code_t

void wholememory_destroy_embedding_optimizer(wholememory_embedding_optimizer_t optimizer)#

Destroy optimizer

Parameters:

optimizer – : optimizer to destroy.
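
A sketch of the optimizer lifecycle follows. The parameter name "beta1" is an assumption for illustration; valid names depend on the optimizer type:

```c
/* Sketch: create, configure, and destroy an embedding optimizer. */
wholememory_embedding_optimizer_t optimizer;
wholememory_create_embedding_optimizer(&optimizer, WHOLEMEMORY_OPT_LAZY_ADAM);

float beta1 = 0.9f; /* "beta1" is a hypothetical parameter name */
wholememory_optimizer_set_parameter(optimizer, "beta1", &beta1);

/* ... attach to an embedding and train ... */
wholememory_destroy_embedding_optimizer(optimizer);
```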

wholememory_error_code_t wholememory_create_embedding_cache_policy(wholememory_embedding_cache_policy_t *cache_policy, wholememory_comm_t cache_level_comm, wholememory_memory_type_t memory_type, wholememory_memory_location_t memory_location, wholememory_access_type_t access_type, float cache_ratio)#

Create WholeMemory Embedding Cache Policy

Parameters:
  • cache_policy – : Returned wholememory_embedding_cache_policy_t

  • cache_level_comm – : the level at which to cache the full embedding. In most cases it should be the same as the wholememory_embedding_t's comm. If access_type is WHOLEMEMORY_AT_READONLY, it can differ to allow multiple readonly caches. E.g. a multi-node WHOLEMEMORY_MT_DISTRIBUTED WHOLEMEMORY_AT_READONLY embedding can have an intra-node WHOLEMEMORY_MT_CHUNKED cache, or a multi-node WHOLEMEMORY_MT_DISTRIBUTED cache.

  • memory_type – : Memory Type of the underlying WholeMemory for cache

  • memory_location – : Memory Location of the underlying WholeMemory for cache

  • access_type – : ReadOnly or ReadWrite

  • cache_ratio – : suggested cache ratio, values should be in range [1.0 / 512, 1.0]

Returns:

: wholememory_error_code_t

wholememory_error_code_t wholememory_destroy_embedding_cache_policy(wholememory_embedding_cache_policy_t cache_policy)#

Destroy WholeMemory Embedding Cache Policy

Parameters:

cache_policy – : WholeMemory Embedding Cache Policy to destroy.

Returns:

: wholememory_error_code_t

wholememory_error_code_t wholememory_create_embedding(wholememory_embedding_t *wholememory_embedding, wholememory_tensor_description_t *embedding_tensor_description, wholememory_comm_t comm, wholememory_memory_type_t memory_type, wholememory_memory_location_t memory_location, wholememory_embedding_cache_policy_t cache_policy, size_t *embedding_entry_partition = nullptr, int user_defined_sms = -1, int round_robin_size = 0)#

Create WholeMemory Embedding

Parameters:
  • wholememory_embedding – : Returned wholememory_embedding_t

  • embedding_tensor_description – : description of the embedding; sizes and dtype are used, stride and storage_offset are ignored. Must be a matrix.

  • comm – : WholeMemory Communicator

  • memory_type – : Memory Type of the underlying WholeMemory

  • memory_location – : Memory Location of the underlying WholeMemory

  • cache_policy – : cache policy for this embedding; pass nullptr to disable caching

  • embedding_entry_partition – Embedding entry count of each rank, the length must be world_size

  • user_defined_sms – : user-defined SM count for raw embedding gather/scatter

  • round_robin_size – : number of contiguous embeddings per rank under round-robin shard mode

Returns:

: wholememory_error_code_t
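
For example, the sketch below pairs a multi-node distributed embedding with a read-only, device-resident, intra-node chunked cache. node_comm and global_comm are communicators assumed to have been created by the application, and emb_desc is a matrix-shaped tensor description as built earlier:

```c
/* Sketch: distributed embedding with an intra-node read-only cache. */
wholememory_embedding_cache_policy_t cache_policy;
wholememory_create_embedding_cache_policy(&cache_policy, node_comm,
                                          WHOLEMEMORY_MT_CHUNKED,
                                          WHOLEMEMORY_ML_DEVICE,
                                          WHOLEMEMORY_AT_READONLY,
                                          0.1f /* cache_ratio */);

wholememory_embedding_t embedding;
wholememory_create_embedding(&embedding, &emb_desc, global_comm,
                             WHOLEMEMORY_MT_DISTRIBUTED, WHOLEMEMORY_ML_HOST,
                             cache_policy,
                             NULL /* embedding_entry_partition: even split */,
                             -1 /* user_defined_sms: library default */,
                             0 /* round_robin_size: disabled */);
/* ... */
wholememory_destroy_embedding(embedding);
wholememory_destroy_embedding_cache_policy(cache_policy);
```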

wholememory_error_code_t wholememory_destroy_embedding(wholememory_embedding_t wholememory_embedding)#

Destroy WholeMemory Embedding

Parameters:

wholememory_embedding – : WholeMemory Embedding to destroy

Returns:

: wholememory_error_code_t

wholememory_tensor_t wholememory_embedding_get_embedding_tensor(wholememory_embedding_t wholememory_embedding)#

Get WholeMemory Tensor from WholeMemory Embedding.

Parameters:

wholememory_embedding – : WholeMemory Embedding

Returns:

: WholeMemory Tensor

wholememory_error_code_t wholememory_embedding_gather(wholememory_embedding_t wholememory_embedding, wholememory_tensor_t indices, wholememory_tensor_t output, bool adjust_cache, wholememory_env_func_t *p_env_fns, int64_t stream_int)#

Gather from WholeMemory Embedding

Parameters:
  • wholememory_embedding – : WholeMemory Embedding

  • indices – : indices to gather

  • output – : output tensor

  • adjust_cache – : whether to adjust the cache during this gather

  • p_env_fns – : env fns

  • stream_int – : CUDA stream to use

Returns:

: wholememory_error_code_t

wholememory_error_code_t wholememory_embedding_gather_gradient_apply(wholememory_embedding_t wholememory_embedding, wholememory_tensor_t indices, wholememory_tensor_t grads, bool adjust_cache, float lr, wholememory_env_func_t *p_env_fns, int64_t stream_int)#

Gather backward for WholeMemory Embedding

Parameters:
  • wholememory_embedding – : WholeMemory Embedding

  • indices – : indices to gather

  • grads – : gradient of output tensor

  • adjust_cache – : whether to adjust the cache during this gather

  • lr – : learning rate of current step.

  • p_env_fns – : env fns

  • stream_int – : CUDA stream to use

Returns:

: wholememory_error_code_t
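
One training step pairs the forward gather with the backward gradient apply. In this sketch, embedding, indices, output, grads, env_fns, and stream_int are assumed given:

```c
/* Sketch: cached forward gather, then apply gradients at learning rate lr. */
wholememory_embedding_gather(embedding, indices, output,
                             true /* adjust_cache */, env_fns, stream_int);

/* ... compute the loss and gradients w.r.t. the gathered rows ... */

wholememory_embedding_gather_gradient_apply(embedding, indices, grads,
                                            true /* adjust_cache */,
                                            1e-3f /* lr */,
                                            env_fns, stream_int);
```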

const char *const *wholememory_embedding_get_optimizer_state_names(wholememory_embedding_t wholememory_embedding)#

Get optimizer internal state names

Parameters:

wholememory_embedding – : WholeMemory Embedding

Returns:

: nullptr-terminated array of names.

wholememory_tensor_t wholememory_embedding_get_optimizer_state(wholememory_embedding_t wholememory_embedding, const char *name)#

Get optimizer internal state

Parameters:
  • wholememory_embedding – : WholeMemory Embedding

  • name – : state name

Returns:

: internal state tensor; nullptr if the name does not exist.

wholememory_error_code_t wholememory_embedding_writeback_cache(wholememory_embedding_t wholememory_embedding, int64_t stream_int)#

Write back all cached entries of a WholeMemory Embedding

Parameters:
  • wholememory_embedding – : WholeMemory Embedding

  • stream_int – : CUDA stream to use.

Returns:

: wholememory_error_code_t

wholememory_error_code_t wholememory_embedding_drop_all_cache(wholememory_embedding_t wholememory_embedding, int64_t stream_int)#

Drop all cache in WholeMemory Embedding

Parameters:
  • wholememory_embedding – : WholeMemory Embedding

  • stream_int – : CUDA stream to use.

Returns:

: wholememory_error_code_t

Ops on graphs stored in WholeMemory#

wholememory_error_code_t wholegraph_csr_unweighted_sample_without_replacement(wholememory_tensor_t wm_csr_row_ptr_tensor, wholememory_tensor_t wm_csr_col_ptr_tensor, wholememory_tensor_t center_nodes_tensor, int max_sample_count, wholememory_tensor_t output_sample_offset_tensor, void *output_dest_memory_context, void *output_center_localid_memory_context, void *output_edge_gid_memory_context, unsigned long long random_seed, wholememory_env_func_t *p_env_fns, void *stream)#

Unweighted sample without replacement kernel op

Parameters:
  • wm_csr_row_ptr_tensor – : WholeMemory Tensor of graph csr_row_ptr

  • wm_csr_col_ptr_tensor – : WholeMemory Tensor of graph csr_col_ptr

  • center_nodes_tensor – : non-WholeMemory Tensor of center nodes to sample

  • max_sample_count – : maximum sample count

  • output_sample_offset_tensor – : pointer to output sample offset

  • output_dest_memory_context – : memory context to output dest nodes

  • output_center_localid_memory_context – : memory context to output center local id

  • output_edge_gid_memory_context – : memory context to output edge global id

  • random_seed – : random number generator seed

  • p_env_fns – : pointers to environment functions.

  • stream – : CUDA stream to use

Returns:

: wholememory_error_code_t

wholememory_error_code_t wholegraph_csr_weighted_sample_without_replacement(wholememory_tensor_t wm_csr_row_ptr_tensor, wholememory_tensor_t wm_csr_col_ptr_tensor, wholememory_tensor_t wm_csr_weight_ptr_tensor, wholememory_tensor_t center_nodes_tensor, int max_sample_count, wholememory_tensor_t output_sample_offset_tensor, void *output_dest_memory_context, void *output_center_localid_memory_context, void *output_edge_gid_memory_context, unsigned long long random_seed, wholememory_env_func_t *p_env_fns, void *stream)#

Weighted sample without replacement kernel op

Parameters:
  • wm_csr_row_ptr_tensor – : WholeMemory Tensor of graph csr_row_ptr

  • wm_csr_col_ptr_tensor – : WholeMemory Tensor of graph csr_col_ptr

  • wm_csr_weight_ptr_tensor – : WholeMemory Tensor of graph edge weights

  • center_nodes_tensor – : non-WholeMemory Tensor of center nodes to sample

  • max_sample_count – : maximum sample count

  • output_sample_offset_tensor – : pointer to output sample offset

  • output_dest_memory_context – : memory context to output dest nodes

  • output_center_localid_memory_context – : memory context to output center local id

  • output_edge_gid_memory_context – : memory context to output edge global id

  • random_seed – : random number generator seed

  • p_env_fns – : pointers to environment functions.

  • stream – : CUDA stream to use

Returns:

: wholememory_error_code_t

Miscellaneous Ops for graph#

wholememory_error_code_t graph_append_unique(wholememory_tensor_t target_nodes_tensor, wholememory_tensor_t neighbor_nodes_tensor, void *output_unique_node_memory_context, wholememory_tensor_t output_neighbor_raw_to_unique_mapping_tensor, wholememory_env_func_t *p_env_fns, void *stream)#

Append Unique op

Parameters:
  • target_nodes_tensor – : tensor of target nodes

  • neighbor_nodes_tensor – : tensor of neighbor nodes

  • output_unique_node_memory_context – : memory context to output unique nodes

  • output_neighbor_raw_to_unique_mapping_tensor – : mapping from each raw neighbor node to its index in the unique node output, optional output

  • p_env_fns – : pointers to environment functions.

  • stream – : CUDA stream to use

Returns:

: wholememory_error_code_t

wholememory_error_code_t csr_add_self_loop(wholememory_tensor_t csr_row_ptr_tensor, wholememory_tensor_t csr_col_ptr_tensor, wholememory_tensor_t output_csr_row_ptr_tensor, wholememory_tensor_t output_csr_col_ptr_tensor, void *stream)#

CSR Add Self Loop Op

Parameters:
  • csr_row_ptr_tensor – : WholeMemory Tensor of local graph csr_row_ptr

  • csr_col_ptr_tensor – : WholeMemory Tensor of csr_col_ptr

  • output_csr_row_ptr_tensor – : WholeMemory Tensor of output_csr_row_ptr

  • output_csr_col_ptr_tensor – : WholeMemory Tensor of output_csr_col_ptr

  • stream – : CUDA stream to use

Returns:

: wholememory_error_code_t