Module Values_0.BatchRebootClusterNodesResponse

Reboots specific nodes within a SageMaker HyperPod cluster using a soft recovery mechanism. BatchRebootClusterNodes performs a graceful reboot of the specified nodes by calling the Amazon Elastic Compute Cloud RebootInstances API, which attempts to cleanly shut down the operating system before restarting the instance. This operation is useful for recovering from transient issues or applying certain configuration changes that require a restart. Rebooting a node may cause temporary service interruption for workloads running on that node. Ensure your workloads can handle node restarts or use appropriate scheduling to minimize impact. You can reboot up to 25 nodes in a single request. For SageMaker HyperPod clusters using the Slurm workload manager, ensure rebooting nodes will not disrupt critical cluster operations.

type nonrec t = {
  successful : ClusterNodeIds.t option;
    (* A list of EC2 instance IDs for which the reboot operation was successfully initiated. *)
  failed : BatchRebootClusterNodesErrors.t option;
    (* A list of errors encountered for EC2 instance IDs that could not be rebooted. Each error includes the instance ID, an error code, and a descriptive message. *)
  failedNodeLogicalIds : BatchRebootClusterNodeLogicalIdsErrors.t option;
    (* A list of errors encountered for logical node IDs that could not be rebooted. Each error includes the logical node ID, an error code, and a descriptive message. This field is only present when NodeLogicalIds were provided in the request. *)
  successfulNodeLogicalIds : ClusterNodeLogicalIdList.t option;
    (* A list of logical node IDs for which the reboot operation was successfully initiated. This field is only present when NodeLogicalIds were provided in the request. *)
}
type nonrec error = [
  | `ResourceNotFound of ResourceNotFound.t
  | `Unknown_operation_error of string * string option
]
val make : ?successful:??? -> ?failed:??? -> ?failedNodeLogicalIds:??? -> ?successfulNodeLogicalIds:??? -> unit -> t
val error_of_json : string -> Yojson.Safe.t -> [> `ResourceNotFound of ResourceNotFound.t | `Unknown_operation_error of string * string option ]
val error_of_xml : string -> Awso.Xml.t -> [> `ResourceNotFound of ResourceNotFound.t | `Unknown_operation_error of string * string option ]
val error_to_json : error -> Yojson.Safe.t
val to_value : t -> [> `Structure of (string * [> `List of [> `String of ClusterNodeId.t | `Structure of (string * [> `Enum of string | `String of ClusterNodeId.t ]) list ] list ]) list ]
val to_query : t -> Awso.Client.Query.t
val of_xml : Awso.Xml.t -> t
val of_string : string -> t
val of_json : Yojson.Safe.t -> t
val to_json : t -> Yojson.Safe.t