remote_handle.hpp
1 /*
2  * Copyright (c) 2024-2025, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #pragma once
17 
18 #include <cassert>
19 #include <cstddef>
20 #include <cstring>
21 #include <memory>
22 #include <optional>
23 #include <string>
24 
25 #include <kvikio/defaults.hpp>
26 #include <kvikio/error.hpp>
27 #include <kvikio/parallel_operation.hpp>
28 #include <kvikio/posix_io.hpp>
29 #include <kvikio/utils.hpp>
30 
// Forward declaration: this header only stores a pointer to it
// (S3Endpoint::_curl_header_list), so the full definition is not needed here.
31 struct curl_slist;
32 
33 namespace kvikio {
34 
// Forward declaration. CurlHandle is the representation of a curl easy handle pointer
// and its operations; it is only used by reference in this header.
35 class CurlHandle;
36 
/// Types of remote file endpoints supported by KvikIO.
// NOTE(review): the underlying type `uint8_t` needs <cstdint>, which is not among the
// includes visible in this listing; presumably it arrives transitively. Confirm.
43 enum class RemoteEndpointType : uint8_t {
    // Initial value of RemoteEndpoint::_remote_endpoint_type; presumably asks the library
    // to detect the endpoint type from the URL. TODO confirm.
44  AUTO,
46  S3,
49  S3_PUBLIC,
    // NOTE(review): no matching endpoint class is declared in this listing.
54  WEBHDFS,
56  HTTP,
58 };
59 
// Body of RemoteEndpoint, the abstract base class for remote endpoints.
// NOTE(review): the `class RemoteEndpoint {` opener (source line 68) is absent from this listing.
69  protected:
    // Endpoint type reported by remote_endpoint_type(); starts as AUTO.
70  RemoteEndpointType _remote_endpoint_type{RemoteEndpointType::AUTO};
72 
73  public:
    // Virtual so that derived endpoints can be destroyed through a base pointer
    // (RemoteHandle holds a std::unique_ptr<RemoteEndpoint>).
74  virtual ~RemoteEndpoint() = default;
75 
    /// Set needed connection options on a curl handle.
83  virtual void setopt(CurlHandle& curl) = 0;
84 
    /// Get a description of this remote endpoint instance.
90  virtual std::string str() const = 0;
91 
    /// Get the size of the remote file.
97  virtual std::size_t get_file_size() = 0;
98 
    /// Set up the range request in order to read part of a file given the file offset
    /// and read size.
103  virtual void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) = 0;
104 
    /// Get the type of the remote file.
110  [[nodiscard]] RemoteEndpointType remote_endpoint_type() const noexcept;
111 };
112 
/// A remote endpoint for HTTP/HTTPS resources.
119 class HttpEndpoint : public RemoteEndpoint {
120  private:
121  std::string _url;
122 
123  public:
    /// Create an HTTP endpoint from a URL.
    // NOTE(review): this single-argument constructor is not `explicit`, unlike the
    // constructors of S3PublicEndpoint and S3EndpointWithPresignedUrl. Confirm whether
    // implicit conversion from std::string is intended.
129  HttpEndpoint(std::string url);
130 
131  ~HttpEndpoint() override = default;
    /// Set needed connection options on a curl handle.
132  void setopt(CurlHandle& curl) override;
    /// Get a description of this remote endpoint instance.
133  std::string str() const override;
    /// Get the size of the remote file.
134  std::size_t get_file_size() override;
    /// Set up the range request in order to read part of a file given the file offset
    /// and read size.
135  void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override;
136 
    /// Whether the given URL is valid for HTTP/HTTPS endpoints.
143  static bool is_url_valid(std::string const& url) noexcept;
144 };
145 
/// A remote endpoint for AWS S3 storage requiring credentials.
// NOTE(review): the class holds a raw `curl_slist*` and declares a non-defaulted
// destructor, but declares no copy or move operations. If the destructor frees the list,
// an implicitly generated copy would free it twice. Confirm against the implementation
// and consider deleting the copy operations.
// NOTE(review): std::pair is used below, but <utility> is not among the includes visible
// in this listing; presumably it arrives transitively. Confirm.
152 class S3Endpoint : public RemoteEndpoint {
153  private:
154  std::string _url;
155  std::string _aws_sigv4;
156  std::string _aws_userpwd;
    // Value-initialized to nullptr; ownership is not visible from this header.
157  curl_slist* _curl_header_list{};
158 
    // Presumably returns `aws_arg` when it holds a value, otherwise the value of the
    // environment variable named `env_var`, and reports `err_msg` when neither is
    // available. TODO confirm in the implementation.
172  static std::string unwrap_or_default(std::optional<std::string> aws_arg,
173  std::string const& env_var,
174  std::string const& err_msg = "");
175 
176  public:
    /// Get the URL from an AWS S3 bucket and object name.
192  static std::string url_from_bucket_and_object(std::string bucket_name,
193  std::string object_name,
194  std::optional<std::string> aws_region,
195  std::optional<std::string> aws_endpoint_url);
196 
    /// Given a URL like "s3://<bucket>/<object>", return the name of the bucket and object.
205  [[nodiscard]] static std::pair<std::string, std::string> parse_s3_url(std::string const& s3_url);
206 
    /// Create an S3 endpoint from a URL.
    // All AWS arguments are optional and default to std::nullopt.
222  S3Endpoint(std::string url,
223  std::optional<std::string> aws_region = std::nullopt,
224  std::optional<std::string> aws_access_key = std::nullopt,
225  std::optional<std::string> aws_secret_access_key = std::nullopt,
226  std::optional<std::string> aws_session_token = std::nullopt);
227 
    /// Create an S3 endpoint from a bucket and object name.
    // Unlike the URL overload, this one also accepts `aws_endpoint_url`, placed before
    // `aws_session_token`.
245  S3Endpoint(std::pair<std::string, std::string> bucket_and_object_names,
246  std::optional<std::string> aws_region = std::nullopt,
247  std::optional<std::string> aws_access_key = std::nullopt,
248  std::optional<std::string> aws_secret_access_key = std::nullopt,
249  std::optional<std::string> aws_endpoint_url = std::nullopt,
250  std::optional<std::string> aws_session_token = std::nullopt);
251 
    // Defined out of line; presumably releases `_curl_header_list`. TODO confirm.
252  ~S3Endpoint() override;
    /// Set needed connection options on a curl handle.
253  void setopt(CurlHandle& curl) override;
    /// Get a description of this remote endpoint instance.
254  std::string str() const override;
    /// Get the size of the remote file.
255  std::size_t get_file_size() override;
    /// Set up the range request in order to read part of a file given the file offset
    /// and read size.
256  void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override;
257 
    /// Whether the given URL is valid for S3 endpoints (excluding presigned URL).
264  static bool is_url_valid(std::string const& url) noexcept;
265 };
266 
// Body of S3PublicEndpoint, a remote endpoint for publicly accessible S3 objects
// without authentication.
// NOTE(review): the class opener (source line 273) is absent from this listing. The
// `override` specifiers below imply that it derives from RemoteEndpoint.
274  private:
275  std::string _url;
276 
277  public:
278  explicit S3PublicEndpoint(std::string url);
279 
280  ~S3PublicEndpoint() override = default;
    /// Set needed connection options on a curl handle.
281  void setopt(CurlHandle& curl) override;
    /// Get a description of this remote endpoint instance.
282  std::string str() const override;
    /// Get the size of the remote file.
283  std::size_t get_file_size() override;
    /// Set up the range request in order to read part of a file given the file offset
    /// and read size.
284  void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override;
285 
    /// Whether the given URL is valid for S3 public endpoints.
292  static bool is_url_valid(std::string const& url) noexcept;
293 };
294 
// Body of S3EndpointWithPresignedUrl, a remote endpoint for AWS S3 storage using
// presigned URLs.
// NOTE(review): the class opener (source line 301) is absent from this listing. The
// `override` specifiers below imply that it derives from RemoteEndpoint.
302  private:
303  std::string _url;
304 
305  public:
306  explicit S3EndpointWithPresignedUrl(std::string presigned_url);
307 
308  ~S3EndpointWithPresignedUrl() override = default;
    /// Set needed connection options on a curl handle.
309  void setopt(CurlHandle& curl) override;
    /// Get a description of this remote endpoint instance.
310  std::string str() const override;
    /// Get the size of the remote file.
311  std::size_t get_file_size() override;
    /// Set up the range request in order to read part of a file given the file offset
    /// and read size.
312  void setup_range_request(CurlHandle& curl, std::size_t file_offset, std::size_t size) override;
313 
    /// Whether the given URL is valid for S3 endpoints with presigned URL.
320  static bool is_url_valid(std::string const& url) noexcept;
321 };
322 
// Body of RemoteHandle, the handle of a remote file.
// NOTE(review): the `class RemoteHandle {` opener (source line 326) is absent from this listing.
// NOTE(review): std::vector and std::future are used below, but <vector> and <future> are
// not among the includes visible in this listing; presumably they arrive transitively. Confirm.
327  private:
328  std::unique_ptr<RemoteEndpoint> _endpoint;
    // Presumably the size of the remote file in bytes. TODO confirm.
329  std::size_t _nbytes;
330 
331  public:
    /// Create a remote file handle from a URL.
    // NOTE(review): source line 410 is absent from this listing. The signature summary
    // elsewhere in the listing shows a second parameter
    // `RemoteEndpointType remote_endpoint_type = RemoteEndpointType::AUTO`
    // between `url` and `allow_list`. Confirm against the real header.
409  static RemoteHandle open(std::string url,
411  std::optional<std::vector<RemoteEndpointType>> allow_list = std::nullopt,
412  std::optional<std::size_t> nbytes = std::nullopt);
413 
    /// Create a new remote handle from an endpoint and a file size.
420  RemoteHandle(std::unique_ptr<RemoteEndpoint> endpoint, std::size_t nbytes);
421 
    /// Create a new remote handle from an endpoint (infers the file size).
    // NOTE(review): this single-argument constructor is not `explicit`. Confirm whether
    // implicit conversion from std::unique_ptr<RemoteEndpoint> is intended.
429  RemoteHandle(std::unique_ptr<RemoteEndpoint> endpoint);
430 
431  // A remote handle is moveable but not copyable.
432  RemoteHandle(RemoteHandle&& o) = default;
433  RemoteHandle& operator=(RemoteHandle&& o) = default;
434  RemoteHandle(RemoteHandle const&) = delete;
435  RemoteHandle& operator=(RemoteHandle const&) = delete;
436 
    /// Get the type of the remote file.
442  [[nodiscard]] RemoteEndpointType remote_endpoint_type() const noexcept;
443 
    // Presumably returns `_nbytes`, the file size in bytes. TODO confirm.
452  [[nodiscard]] std::size_t nbytes() const noexcept;
453 
    // Presumably returns a const reference to the endpoint held in `_endpoint`. TODO confirm.
459  [[nodiscard]] RemoteEndpoint const& endpoint() const noexcept;
460 
    // Presumably reads `size` bytes starting at `file_offset` into `buf` and returns the
    // number of bytes read. TODO confirm.
473  std::size_t read(void* buf, std::size_t size, std::size_t file_offset = 0);
474 
    // Asynchronous counterpart of read(): the byte count is delivered through a std::future.
    // `task_size` defaults to defaults::task_size(); presumably the read is split into
    // tasks of that size. TODO confirm.
487  std::future<std::size_t> pread(void* buf,
488  std::size_t size,
489  std::size_t file_offset = 0,
490  std::size_t task_size = defaults::task_size());
491 };
492 
493 } // namespace kvikio
Representation of a curl easy handle pointer and its operations.
Definition: libcurl.hpp:91
A remote endpoint for HTTP/HTTPS resources.
void setup_range_request(CurlHandle &curl, std::size_t file_offset, std::size_t size) override
Set up the range request in order to read part of a file given the file offset and read size.
std::size_t get_file_size() override
Get the size of the remote file.
std::string str() const override
Get a description of this remote endpoint instance.
HttpEndpoint(std::string url)
Create an HTTP endpoint from a URL.
void setopt(CurlHandle &curl) override
Set needed connection options on a curl handle.
static bool is_url_valid(std::string const &url) noexcept
Whether the given URL is valid for HTTP/HTTPS endpoints.
Abstract base class for remote endpoints.
RemoteEndpointType remote_endpoint_type() const noexcept
Get the type of the remote file.
virtual std::size_t get_file_size()=0
Get the size of the remote file.
virtual void setup_range_request(CurlHandle &curl, std::size_t file_offset, std::size_t size)=0
Set up the range request in order to read part of a file given the file offset and read size.
virtual void setopt(CurlHandle &curl)=0
Set needed connection options on a curl handle.
virtual std::string str() const =0
Get a description of this remote endpoint instance.
Handle of remote file.
RemoteEndpointType remote_endpoint_type() const noexcept
Get the type of the remote file.
static RemoteHandle open(std::string url, RemoteEndpointType remote_endpoint_type=RemoteEndpointType::AUTO, std::optional< std::vector< RemoteEndpointType >> allow_list=std::nullopt, std::optional< std::size_t > nbytes=std::nullopt)
Create a remote file handle from a URL.
RemoteHandle(std::unique_ptr< RemoteEndpoint > endpoint)
Create a new remote handle from an endpoint (infers the file size).
RemoteHandle(std::unique_ptr< RemoteEndpoint > endpoint, std::size_t nbytes)
Create a new remote handle from an endpoint and a file size.
A remote endpoint for AWS S3 storage using presigned URLs.
std::size_t get_file_size() override
Get the size of the remote file.
std::string str() const override
Get a description of this remote endpoint instance.
void setup_range_request(CurlHandle &curl, std::size_t file_offset, std::size_t size) override
Set up the range request in order to read part of a file given the file offset and read size.
static bool is_url_valid(std::string const &url) noexcept
Whether the given URL is valid for S3 endpoints with presigned URL.
void setopt(CurlHandle &curl) override
Set needed connection options on a curl handle.
A remote endpoint for AWS S3 storage requiring credentials.
static std::string url_from_bucket_and_object(std::string bucket_name, std::string object_name, std::optional< std::string > aws_region, std::optional< std::string > aws_endpoint_url)
Get the URL from an AWS S3 bucket and object name.
static bool is_url_valid(std::string const &url) noexcept
Whether the given URL is valid for S3 endpoints (excluding presigned URL).
S3Endpoint(std::pair< std::string, std::string > bucket_and_object_names, std::optional< std::string > aws_region=std::nullopt, std::optional< std::string > aws_access_key=std::nullopt, std::optional< std::string > aws_secret_access_key=std::nullopt, std::optional< std::string > aws_endpoint_url=std::nullopt, std::optional< std::string > aws_session_token=std::nullopt)
Create an S3 endpoint from a bucket and object name.
S3Endpoint(std::string url, std::optional< std::string > aws_region=std::nullopt, std::optional< std::string > aws_access_key=std::nullopt, std::optional< std::string > aws_secret_access_key=std::nullopt, std::optional< std::string > aws_session_token=std::nullopt)
Create an S3 endpoint from a URL.
void setopt(CurlHandle &curl) override
Set needed connection options on a curl handle.
void setup_range_request(CurlHandle &curl, std::size_t file_offset, std::size_t size) override
Set up the range request in order to read part of a file given the file offset and read size.
static std::pair< std::string, std::string > parse_s3_url(std::string const &s3_url)
Given a URL like "s3://<bucket>/<object>", return the name of the bucket and object.
std::string str() const override
Get a description of this remote endpoint instance.
std::size_t get_file_size() override
Get the size of the remote file.
A remote endpoint for publicly accessible S3 objects without authentication.
static bool is_url_valid(std::string const &url) noexcept
Whether the given URL is valid for S3 public endpoints.
std::size_t get_file_size() override
Get the size of the remote file.
void setup_range_request(CurlHandle &curl, std::size_t file_offset, std::size_t size) override
Set up the range request in order to read part of a file given the file offset and read size.
void setopt(CurlHandle &curl) override
Set needed connection options on a curl handle.
std::string str() const override
Get a description of this remote endpoint instance.
Singleton class of default values used throughout KvikIO.
Definition: defaults.hpp:123
KvikIO namespace.
Definition: batch.hpp:27
RemoteEndpointType
Types of remote file endpoints supported by KvikIO.