diff --git a/yaml/native/native_functions.yaml b/yaml/native/native_functions.yaml
index a97dce594..f3812e99e 100644
--- a/yaml/native/native_functions.yaml
+++ b/yaml/native/native_functions.yaml
@@ -8,6 +8,7 @@
   device_guard: False
   dispatch:
     SparseXPU: copy_sparse_wrapper_
+    NestedTensorXPU: copy_nested_
   autogen: copy.out
 
 - func: _copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor
@@ -29,11 +30,14 @@
   variants: function, method
   dispatch:
     SparseXPU: add_sparse
+    NestedTensorXPU: NestedTensor_add_Tensor
   tags: [core, pointwise]
 
 - func: _to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor
   device_check: NoCheck
   device_guard: False
+  dispatch:
+    NestedTensorXPU: _to_copy_nested
   autogen: _to_copy.out
   tags: core
 
@@ -66,6 +70,7 @@
   structured_delegate: add.out
   dispatch:
     SparseXPU: add_sparse_
+    NestedTensorXPU: NestedTensor_add__Tensor
   tags: pointwise
 
 - func: add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
@@ -214,6 +219,7 @@
   structured_delegate: sub.out
   dispatch:
     SparseXPU: sub_sparse
+    NestedTensorXPU: NestedTensor_sub_Tensor
   tags: [core, pointwise]
 
 - func: sub_.Tensor(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)
@@ -243,6 +249,7 @@
   variants: function, method
   dispatch:
     SparseXPU: mul_sparse
+    NestedTensorXPU: NestedTensor_mul_Tensor
   tags: [core, pointwise]
 
 - func: mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
@@ -251,6 +258,7 @@
   variants: method
   dispatch:
     SparseXPU: mul_sparse_
+    NestedTensorXPU: NestedTensor_mul__Tensor
   tags: pointwise
 
 - func: mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -266,11 +274,15 @@
 - func: mul.Scalar(Tensor self, Scalar other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
+  dispatch:
+    NestedTensorXPU: NestedTensor_mul_Scalar
   tags: [core, pointwise]
 
 - func: mul_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
   device_check: NoCheck   # TensorIterator
   variants: method
+  dispatch:
+    NestedTensorXPU: NestedTensor_mul__Scalar
   autogen: mul.Scalar_out
   tags: pointwise
 # multiply, alias for mul
@@ -312,6 +324,7 @@
   structured_delegate: div.out
   dispatch:
     SparseXPU: div_sparse
+    NestedTensorXPU: NestedTensor_div_Tensor
   tags: [core, pointwise]
 
 - func: div_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
@@ -362,7 +375,7 @@
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: div
-    NestedTensorCPU, NestedTensorCUDA: NestedTensor_div_Scalar
+    NestedTensorXPU: NestedTensor_div_Scalar
   tags: [core, pointwise]
 
 - func: div_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)
@@ -521,6 +534,8 @@
   structured_delegate: eq.Scalar_out
   device_check: NoCheck   # TensorIterator
   variants: method, function
+  dispatch:
+    NestedTensorXPU: eq_scalar_nested
   tags: [core, pointwise]
 
 - func: eq.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -535,6 +550,8 @@
   structured_delegate: eq.Tensor_out
   device_check: NoCheck   # TensorIterator
   variants: method, function
+  dispatch:
+    NestedTensorXPU: eq_tensor_nested
   tags: [core, pointwise]
 
 - func: eq_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
@@ -668,6 +685,8 @@
   structured_delegate: gt.Scalar_out
   device_check: NoCheck   # TensorIterator
   variants: method, function
+  dispatch:
+    NestedTensorXPU: gt_scalar_nested
   tags: [core, pointwise]
 
 - func: gt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -706,6 +725,8 @@
   structured_delegate: ge.Scalar_out
   device_check: NoCheck   # TensorIterator
   variants: method, function
+  dispatch:
+    NestedTensorXPU: ge_scalar_nested
   tags: [core, pointwise]
 
 - func: ge.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@@ -739,6 +760,7 @@
   dispatch:
     XPU: isnan
     SparseXPU: isnan_sparse
+    NestedTensorXPU: NestedTensor_isnan
   autogen: isnan.out
   tags: [core, pointwise]
 
@@ -752,6 +774,8 @@
 - func: masked_fill.Scalar(Tensor self, Tensor mask, Scalar value) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
+  dispatch:
+    NestedTensorXPU: NestedTensor_masked_fill
   tags: pointwise
 
 - func: masked_fill_.Tensor(Tensor(a!) self, Tensor mask, Tensor value) -> Tensor(a!)
@@ -877,6 +901,7 @@
   structured_delegate: threshold_backward.grad_input
   dispatch:
     SparseXPU: threshold_backward_sparse
+    NestedTensorXPU: threshold_backwards_nested
   tags: pointwise
 
 - func: gelu.out(Tensor self, *, str approximate='none', Tensor(a!) out) -> Tensor(a!)
@@ -891,11 +916,15 @@
   structured_delegate: gelu.out
   device_check: NoCheck   # TensorIterator
   python_module: nn
+  dispatch:
+    NestedTensorXPU: NestedTensor_gelu_
 
 - func: gelu(Tensor self, *, str approximate='none') -> Tensor
   structured_delegate: gelu.out
   device_check: NoCheck   # TensorIterator
   python_module: nn
+  dispatch:
+    NestedTensorXPU: NestedTensor_gelu
   tags: [core, pointwise]
 
 - func: gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!)
@@ -908,6 +937,8 @@
 - func: gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor
   structured_delegate: gelu_backward.grad_input
   python_module: nn
+  dispatch:
+    NestedTensorXPU: gelu_backwards_nested
   tags: pointwise
 
 - func: arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
@@ -941,6 +972,7 @@
   variants: function, method
   dispatch:
     SparseXPU: abs_sparse
+    NestedTensorXPU: NestedTensor_abs
   tags: [core, pointwise]
 
 - func: abs_(Tensor(a!) self) -> Tensor(a!)
@@ -948,6 +980,7 @@
   variants: function, method
   dispatch:
     SparseXPU: abs_sparse_
+    NestedTensorXPU: NestedTensor_abs_
 
 - func: sin(Tensor self) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -955,6 +988,7 @@
   variants: function, method
   dispatch:
     SparseXPU: sin_sparse
+    NestedTensorXPU: sin_nested
   tags: [core, pointwise]
 
 - func: sin_(Tensor(a!) self) -> Tensor(a!)
@@ -996,7 +1030,7 @@
   variants: function, method
   structured_delegate: cos.out
   dispatch:
-    NestedTensorCPU, NestedTensorCUDA: cos_nested
+    NestedTensorXPU: cos_nested
   tags: [core, pointwise]
 
 - func: cos_(Tensor(a!) self) -> Tensor(a!)
@@ -1039,6 +1073,7 @@
   variants: function, method
   dispatch:
     SparseXPU: sqrt_sparse
+    NestedTensorXPU: NestedTensor_sqrt
   tags: [core, pointwise]
 
 - func: sqrt_(Tensor(a!) self) -> Tensor(a!)
@@ -1084,6 +1119,7 @@
   variants: function, method
   dispatch:
     SparseXPU: tanh_sparse
+    NestedTensorXPU: NestedTensor_tanh
   tags: [core, pointwise]
 
 - func: tanh_(Tensor(a!) self) -> Tensor(a!)
@@ -1092,6 +1128,7 @@
   variants: function, method
   dispatch:
     SparseXPU: tanh_sparse_
+    NestedTensorXPU: NestedTensor_tanh_
   tags: pointwise
 
 - func: tanh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -1109,6 +1146,7 @@
   variants: function, method
   dispatch:
     SparseXPU: neg_sparse
+    NestedTensorXPU: NestedTensor_neg
   tags: [core, pointwise]
 
 - func: neg_(Tensor(a!) self) -> Tensor(a!)
@@ -1117,6 +1155,7 @@
   variants: function, method
   dispatch:
     SparseXPU: neg_sparse_
+    NestedTensorXPU: NestedTensor_neg_
   tags: pointwise
 
 - func: neg.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -1215,6 +1254,7 @@
   dispatch:
     CompositeExplicitAutograd: empty_like
     SparseXPU: empty_like_sparse_coo
+    NestedTensorXPU: empty_like_nested
   autogen: empty_like.out
 
 - func: new_empty_strided(Tensor self, SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
@@ -1241,6 +1281,7 @@
   variants: function, method
   dispatch:
     XPU: fill_
+    NestedTensorXPU: fill_nested_
   autogen: fill.Scalar_out
 
 - func: fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!)
@@ -1248,6 +1289,7 @@
   variants: function, method
   dispatch:
     XPU: fill_
+    NestedTensorXPU: fill_nested_
   autogen: fill.Tensor_out
 
 - func: resize_as_sparse_(Tensor(a!) self, Tensor the_template) -> Tensor(a!)
@@ -1263,6 +1305,7 @@
   dispatch:
     XPU: zero_
     SparseXPU: zero_sparse_
+    NestedTensorXPU: zero_nested_
   autogen: zero, zero.out
 
 - func: random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!)
@@ -1295,6 +1338,7 @@
   variants: method
   dispatch:
     XPU: normal_
+    NestedTensorXPU: normal_nested_
   autogen: normal.out
 
 # Only used by the functionalization pass.
@@ -1395,12 +1439,14 @@
   variants: function
   dispatch:
     XPU: native_dropout_xpu
+    NestedTensorXPU: native_dropout_nested
   tags: [nondeterministic_seeded, core]
   autogen: native_dropout.out
 
 - func: native_dropout_backward(Tensor grad_output, Tensor mask, float scale) -> Tensor
   dispatch:
     XPU: native_dropout_backward_xpu
+    NestedTensorXPU: native_dropout_backward
   autogen: native_dropout_backward.out
   tags: pointwise
 
@@ -1410,6 +1456,7 @@
   device_guard: False
   dispatch:
     XPU: view
+    NestedTensorXPU: view_nested
   tags: core
 
 - func: view_as_real(Tensor(a) self) -> Tensor(a)
@@ -1685,12 +1732,14 @@
   device_check: NoCheck   # TensorIterator
   dispatch:
     XPU: where_self_out
+    NestedTensorXPU: NestedTensor_where_out
 
 - func: where.self(Tensor condition, Tensor self, Tensor other) -> Tensor
   device_check: NoCheck   # TensorIterator
   variants: function, method
   dispatch:
     XPU: where
+    NestedTensorXPU: NestedTensor_where
   tags: [core, pointwise]
 
 - func: clamp(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor
@@ -2030,6 +2079,8 @@
 
 - func: _softmax(Tensor self, int dim, bool half_to_float) -> Tensor
   structured_delegate: _softmax.out
+  dispatch:
+    NestedTensorXPU: softmax_nested
   tags: core
 
 - func: _softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!)
@@ -2043,6 +2094,8 @@
 
 - func: _softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor
   structured_delegate: _softmax_backward_data.out
+  dispatch:
+    NestedTensorXPU: nested_softmax_backward
 
 - func: _softmax_backward_data.out(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype, *, Tensor(a!) grad_input) -> Tensor(a!)
   structured: True
@@ -2124,6 +2177,7 @@
   structured_delegate: sgn.out
   dispatch:
     SparseXPU: sgn_sparse
+    NestedTensorXPU: NestedTensor_sgn
   tags: pointwise
 
 - func: sgn_(Tensor(a!) self) -> Tensor(a!)
@@ -2131,6 +2185,7 @@
   structured_delegate: sgn.out
   dispatch:
     SparseXPU: sgn_sparse_
+    NestedTensorXPU: NestedTensor_sgn_
   tags: pointwise
 
 - func: sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -3158,12 +3213,14 @@
 - func: native_layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps) -> (Tensor, Tensor, Tensor)
   dispatch:
     XPU: layer_norm_xpu
+    NestedTensorXPU: nested_layer_norm
   autogen: native_layer_norm.out
   tags: core
 
 - func: native_layer_norm_backward(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
   dispatch:
     XPU: layer_norm_backward_xpu
+    NestedTensorXPU: layer_norm_backward_nested
   autogen: native_layer_norm_backward.out
   tags: core
 
@@ -3274,6 +3331,7 @@
   structured_delegate: cat.out
   dispatch:
     SparseXPU: cat_sparse
+    NestedTensorXPU: cat_nested
   tags: core
 
 - func: cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
@@ -3516,11 +3574,15 @@
 - func: silu(Tensor self) -> Tensor
   structured_delegate: silu.out
   python_module: nn
+  dispatch:
+    NestedTensorXPU: NestedTensor_silu
   tags: pointwise
 
 - func: silu_(Tensor(a!) self) -> Tensor(a!)
   structured_delegate: silu.out
   python_module: nn
+  dispatch:
+    NestedTensorXPU: NestedTensor_silu_
   tags: pointwise
 
 - func: silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -3544,6 +3606,7 @@
   python_module: nn
   dispatch:
     CompositeImplicitAutograd: math_silu_backward
+    NestedTensorXPU: silu_backward_nested
   tags: pointwise
 
 - func: hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -3606,6 +3669,7 @@
   dispatch:
     XPU: relu
     SparseXPU: relu_sparse
+    NestedTensorXPU: NestedTensor_relu
   tags: [core, pointwise]
 
 - func: relu_(Tensor(a!) self) -> Tensor(a!)
@@ -3614,6 +3678,7 @@
   dispatch:
     XPU: relu_
     SparseXPU: relu_sparse_
+    NestedTensorXPU: NestedTensor_relu_
   autogen: relu.out
   tags: pointwise
 
@@ -3621,6 +3686,8 @@
   device_check: NoCheck   # TensorIterator
   structured_delegate: all.out
   variants: function, method
+  dispatch:
+    NestedTensorXPU: NestedTensor_all
 
 - func: all.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor
   device_check: NoCheck   # TensorIterator
@@ -4333,6 +4400,7 @@
   dispatch:
     CompositeExplicitAutograd: clone
     SparseXPU: clone_sparse
+    NestedTensorXPU: clone_nested
   autogen: clone.out
   tags: [core, pointwise]
 
@@ -5032,7 +5100,7 @@
   variants: function, method
   dispatch:
     CompositeExplicitAutograd: logical_not
-    NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not
+    NestedTensorXPU: NestedTensor_logical_not
   tags: [core, pointwise]
 
 - func: logical_not_(Tensor(a!) self) -> Tensor(a!)
@@ -5040,7 +5108,7 @@
   variants: method
   dispatch:
     CompositeExplicitAutograd: logical_not_
-    NestedTensorCPU, NestedTensorCUDA: NestedTensor_logical_not_
+    NestedTensorXPU: NestedTensor_logical_not_
   tags: pointwise
 
 - func: logical_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -5989,13 +6057,39 @@
 # Fused implementation detail for transformers. Adds in-projection bias to QKV and divides Q by sqrt(D/num_heads).
 - func: _transform_bias_rescale_qkv(Tensor qkv, Tensor qkv_bias, int num_heads) -> (Tensor, Tensor, Tensor)
   dispatch:
-    XPU: transform_bias_rescale_qkv_xpu
+    XPU, NestedTensorXPU: transform_bias_rescale_qkv_xpu
   autogen: _transform_bias_rescale_qkv.out
 
+# These private functions are temporary. They will be updated/deleted when nested tensors switch to using SymInts for their metadata representation
+- func: _nested_tensor_size(Tensor self) -> Tensor
+  variants: method
+  dispatch:
+    NestedTensorXPU: _nested_tensor_size
+  autogen: _nested_tensor_size.out
+
+- func: _nested_tensor_strides(Tensor self) -> Tensor
+  variants: method
+  dispatch:
+    NestedTensorXPU: _nested_tensor_strides
+  autogen: _nested_tensor_strides.out
+
+- func: _nested_tensor_storage_offsets(Tensor self) -> Tensor
+  variants: method
+  dispatch:
+    NestedTensorXPU: _nested_tensor_storage_offsets
+  autogen: _nested_tensor_storage_offsets.out
+
+# _nested_from_padded is not usable from Python, so
+# _nested_from_padded_and_nested_example is available for testing.
+- func: _nested_from_padded_and_nested_example(Tensor padded, Tensor nt_example) -> Tensor
+  dispatch:
+    NestedTensorXPU: NestedTensor_from_padded_and_nested_example
+  autogen: _nested_from_padded_and_nested_example.out
+
 - func: _native_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, bool need_weights=True, bool average_attn_weights=True, int? mask_type=None) -> (Tensor, Tensor)
   variants: function
   dispatch:
-    XPU: native_multi_head_attention_xpu
+    XPU, NestedTensorXPU: native_multi_head_attention_xpu
   autogen: _native_multi_head_attention.out
 
 - func: argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor
@@ -6721,6 +6815,7 @@
   dispatch:
     CompositeExplicitAutograd: unsqueeze
     SparseXPU: unsqueeze_sparse
+    NestedTensorXPU: unsqueeze_nested
   tags: core
 
 - func: zeros(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
@@ -6899,6 +6994,7 @@
   structured_delegate: isposinf.out
   dispatch:
     SparseXPU: isposinf_sparse
+    NestedTensorXPU: NestedTensor_isposinf
   tags: pointwise
 
 - func: isposinf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -6914,6 +7010,7 @@
   structured_delegate: isneginf.out
   dispatch:
     SparseXPU: isneginf_sparse
+    NestedTensorXPU: NestedTensor_isneginf
   tags: pointwise
 
 - func: isneginf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@@ -7966,6 +8063,7 @@
   dispatch:
     CompositeExplicitAutograd: isinf
     SparseXPU: isinf_sparse
+    NestedTensorXPU: NestedTensor_isinf
   autogen: isinf.out
   tags: [core, pointwise]
 
@@ -8594,6 +8692,7 @@
   variants: method
   dispatch:
     SparseXPU: values_sparse
+    NestedTensorXPU: values_nested
   device_check: NoCheck
   device_guard: False
 
@@ -8623,13 +8722,30 @@
     SparseXPU: copy_sparse_
   autogen: copy_sparse_to_sparse, copy_sparse_to_sparse.out
 
+- func: unbind_copy.int(Tensor self, int dim=0) -> Tensor[]
+  variants: function
+  dispatch:
+    CompositeExplicitAutogradNonFunctional: unbind_copy_int
+  tags: view_copy
+
+- func: unbind_copy.int_out(Tensor self, int dim=0, *, Tensor(a!)[] out) -> ()
+  variants: function
+  dispatch:
+    CompositeExplicitAutograd: unbind_copy_int_out
+
+- func: unbind.int(Tensor(a -> *) self, int dim=0) -> Tensor(a)[]
+  variants: function, method
+  dispatch:
+    CompositeExplicitAutograd: unbind
+    NestedTensorXPU: NestedTensor_unbind
+
 - func: _weight_int4pack_mm(Tensor self, Tensor mat2, int qGroupSize, Tensor qScaleAndZeros) -> Tensor
   dispatch:
     XPU: _weight_int4pack_mm_xpu
   # autogen: _weight_int4pack_mm.out
   # tags: core
 
-  - func: _nested_compute_contiguous_strides_offsets(Tensor nested_size) -> (Tensor, Tensor)
+- func: _nested_compute_contiguous_strides_offsets(Tensor nested_size) -> (Tensor, Tensor)
   variants: function
   device_check: NoCheck
   dispatch:
@@ -8656,4 +8772,4 @@
   tags: view_copy
   dispatch:
     CompositeExplicitAutogradNonFunctional: _nested_view_from_buffer_copy
-  autogen: _nested_view_from_buffer_copy.out
\ No newline at end of file
+  autogen: _nested_view_from_buffer_copy.out