diff --git a/ext_lib/eigen b/ext_lib/eigen index 18018ed..6f9ad7d 160000 --- a/ext_lib/eigen +++ b/ext_lib/eigen @@ -1 +1 @@ -Subproject commit 18018ed013029ca3f28f52a62360999b5a659eac +Subproject commit 6f9ad7da6122fdb4197c0b43dfec09ec3525305e diff --git a/ext_lib/googletest b/ext_lib/googletest index 8a6feab..5b7fd63 160000 --- a/ext_lib/googletest +++ b/ext_lib/googletest @@ -1 +1 @@ -Subproject commit 8a6feabf04bec8fb125e0df0ad1195c42350725f +Subproject commit 5b7fd63d6d69f1754d5f3be956949484ebac06d5 diff --git a/ext_lib/json b/ext_lib/json index 788e546..58d6aa5 160000 --- a/ext_lib/json +++ b/ext_lib/json @@ -1 +1 @@ -Subproject commit 788e5468e4cb4573941600d5c39ccd552dc8a429 +Subproject commit 58d6aa58316a84fc770bdd0409934cdfdf0697dc diff --git a/ext_lib/mdspan b/ext_lib/mdspan index f840358..b186529 160000 --- a/ext_lib/mdspan +++ b/ext_lib/mdspan @@ -1 +1 @@ -Subproject commit f84035865a92241a5163d8d0e5100aea037892ca +Subproject commit b186529a12b39d0ecb97e6ae9d4a06031aef2f88 diff --git a/ext_lib/stdexec b/ext_lib/stdexec index dc90c30..5d96d38 160000 --- a/ext_lib/stdexec +++ b/ext_lib/stdexec @@ -1 +1 @@ -Subproject commit dc90c30ff4968d5043299f97a8294d3f4a6c62e4 +Subproject commit 5d96d38881d611aade22eeea7c6242ba56a6c6cb diff --git a/lib/executors/numpy_like.hpp b/lib/executors/numpy_like.hpp index 6eb8c22..43b4ac6 100644 --- a/lib/executors/numpy_like.hpp +++ b/lib/executors/numpy_like.hpp @@ -33,13 +33,13 @@ namespace Impl { for(int ir=0; ir < reduce_size; ir++) { if(reduce_dim == 0) { - auto sub_in = stdex::submdspan(in, ir, std::full_extent, std::full_extent); + auto sub_in = std::submdspan(in, ir, std::full_extent, std::full_extent); sum += sub_in(i0, i1); } else if(reduce_dim == 1) { - auto sub_in = stdex::submdspan(in, std::full_extent, ir, std::full_extent); + auto sub_in = std::submdspan(in, std::full_extent, ir, std::full_extent); sum += sub_in(i0, i1); } else { - auto sub_in = stdex::submdspan(in, std::full_extent, std::full_extent, ir); + auto sub_in = std::submdspan(in, std::full_extent, std::full_extent, ir); sum += sub_in(i0, i1); } } @@ -56,13 +56,13 @@ namespace Impl { assert(out.extent(reduce_dim) == 1); if(reduce_dim == 0) { - auto sub_out = stdex::submdspan(out, 0, std::full_extent, std::full_extent); + auto sub_out = std::submdspan(out, 0, std::full_extent, std::full_extent); mean_(in, sub_out, axis); } else if(reduce_dim == 1) { - auto sub_out = stdex::submdspan(out, std::full_extent, 0, std::full_extent); + auto sub_out = std::submdspan(out, std::full_extent, 0, std::full_extent); mean_(in, sub_out, axis); } else { - auto sub_out = stdex::submdspan(out, std::full_extent, std::full_extent, 0); + auto sub_out = std::submdspan(out, std::full_extent, std::full_extent, 0); mean_(in, sub_out, axis); } } @@ -85,10 +85,10 @@ namespace Impl { for(int ir=0; ir < reduce_size; ir++) { if(reduce_dim == 0) { - auto sub_in = stdex::submdspan(in, ir, std::full_extent); + auto sub_in = std::submdspan(in, ir, std::full_extent); sum += sub_in(idx); } else { - auto sub_in = stdex::submdspan(in, std::full_extent, ir); + auto sub_in = std::submdspan(in, std::full_extent, ir); sum += sub_in(idx); } } @@ -105,10 +105,10 @@ namespace Impl { assert(out.extent(reduce_dim) == 1); if(reduce_dim == 0) { - auto sub_out = stdex::submdspan(out, 0, std::full_extent); + auto sub_out = std::submdspan(out, 0, std::full_extent); mean_(in, sub_out, axis); } else { - auto sub_out = stdex::submdspan(out, std::full_extent, 0); + auto sub_out = std::submdspan(out, std::full_extent, 0); mean_(in, 
sub_out, axis); } } @@ -340,10 +340,10 @@ namespace Impl { x(i0, i1) = alpha * x(i0, i1) + beta * y(i0, i1); }); } else if( ny0 == 1 && ny0 < nx0 && ny1 == nx1 ) { - auto sub_y = stdex::submdspan(y, 0, std::full_extent); + auto sub_y = std::submdspan(y, 0, std::full_extent); axpy_(x, sub_y, beta, alpha, 1); } else if( ny0 == nx0 && ny1 == 1 && ny1 < nx1 ) { - auto sub_y = stdex::submdspan(y, std::full_extent, 0); + auto sub_y = std::submdspan(y, std::full_extent, 0); axpy_(x, sub_y, beta, alpha, 0); } else if( ny0 == 1 && ny0 < nx0 && ny1 == 1 && ny1 < nx1 ) { IteratePolicy policy2d({0, 0}, {nx0, nx1}); @@ -380,10 +380,10 @@ namespace Impl { z(i0, i1) = alpha * x(i0, i1) + beta * y(i0, i1); }); } else if( ny0 == 1 && ny0 < nx0 && ny1 == nx1 ) { - auto sub_y = stdex::submdspan(y, 0, std::full_extent); + auto sub_y = std::submdspan(y, 0, std::full_extent); axpy_(x, sub_y, z, beta, alpha, 1); } else if( ny0 == nx0 && ny1 == 1 && ny1 < nx1 ) { - auto sub_y = stdex::submdspan(y, std::full_extent, 0); + auto sub_y = std::submdspan(y, std::full_extent, 0); axpy_(x, sub_y, z, beta, alpha, 0); } else if( ny0 == 1 && ny0 < nx0 && ny1 == 1 && ny1 < nx1 ) { IteratePolicy policy2d({0, 0}, {nx0, nx1}); @@ -572,22 +572,22 @@ namespace Impl { x(i0, i1, i2) = alpha * x(i0, i1, i2) + beta * y(i0, i1, i2); }); } else if( ny0 == 1 && ny0 < nx0 && ny1 == nx1 && ny2 == nx2 ) { - auto sub_y = stdex::submdspan(y, 0, std::full_extent, std::full_extent); + auto sub_y = std::submdspan(y, 0, std::full_extent, std::full_extent); axpy_(x, sub_y, beta, alpha, 0); } else if( ny0 == 1 && ny0 < nx0 && ny1 == 1 && ny1 < nx1 && ny2 == nx2 ) { - auto sub_y = stdex::submdspan(y, 0, 0, std::full_extent); + auto sub_y = std::submdspan(y, 0, 0, std::full_extent); axpy_(x, sub_y, beta, alpha, 2); } else if( ny0 == 1 && ny0 < nx0 && ny1 == nx1 && ny2 == 1 && ny2 < nx2 ) { - auto sub_y = stdex::submdspan(y, 0, std::full_extent, 0); + auto sub_y = std::submdspan(y, 0, std::full_extent, 0); axpy_(x, sub_y, beta, alpha, 1); } else if( ny0 == nx0 && ny1 == 1 && ny1 < nx1 && ny2 == nx2 ) { - auto sub_y = stdex::submdspan(y, std::full_extent, 0, std::full_extent); + auto sub_y = std::submdspan(y, std::full_extent, 0, std::full_extent); axpy_(x, sub_y, beta, alpha, 1); } else if( ny0 == nx0 && ny1 == 1 && ny1 < nx1 && ny2 == 1 && ny2 < nx2 ) { - auto sub_y = stdex::submdspan(y, std::full_extent, 0, 0); + auto sub_y = std::submdspan(y, std::full_extent, 0, 0); axpy_(x, sub_y, beta, alpha, 0); } else if( ny0 == nx0 && ny1 == nx1 && ny2 == 1 && ny2 < nx2 ) { - auto sub_y = stdex::submdspan(y, std::full_extent, std::full_extent, 0); + auto sub_y = std::submdspan(y, std::full_extent, std::full_extent, 0); axpy_(x, sub_y, beta, alpha, 2); } else if( ny0 == 1 && ny0 < nx0 && ny1 == 1 && ny1 < nx1 && ny2 == 1 && ny2 < nx2 ) { IteratePolicy policy3d({0, 0, 0}, {nx0, nx1, nx2}); @@ -625,22 +625,22 @@ namespace Impl { z(i0, i1, i2) = alpha * x(i0, i1, i2) + beta * y(i0, i1, i2); }); } else if( ny0 == 1 && ny0 < nx0 && ny1 == nx1 && ny2 == nx2 ) { - auto sub_y = stdex::submdspan(y, 0, std::full_extent, std::full_extent); + auto sub_y = std::submdspan(y, 0, std::full_extent, std::full_extent); axpy_(x, sub_y, z, beta, alpha, 0); } else if( ny0 == 1 && ny0 < nx0 && ny1 == 1 && ny1 < nx1 && ny2 == nx2 ) { - auto sub_y = stdex::submdspan(y, 0, 0, std::full_extent); + auto sub_y = std::submdspan(y, 0, 0, std::full_extent); axpy_(x, sub_y, z, beta, alpha, 2); } else if( ny0 == 1 && ny0 < nx0 && ny1 == nx1 && ny2 == 1 && ny2 < nx2 ) { - auto sub_y = 
stdex::submdspan(y, 0, std::full_extent, 0); + auto sub_y = std::submdspan(y, 0, std::full_extent, 0); axpy_(x, sub_y, z, beta, alpha, 1); } else if( ny0 == nx0 && ny1 == 1 && ny1 < nx1 && ny2 == nx2 ) { - auto sub_y = stdex::submdspan(y, std::full_extent, 0, std::full_extent); + auto sub_y = std::submdspan(y, std::full_extent, 0, std::full_extent); axpy_(x, sub_y, z, beta, alpha, 1); } else if( ny0 == nx0 && ny1 == 1 && ny1 < nx1 && ny2 == 1 && ny2 < nx2 ) { - auto sub_y = stdex::submdspan(y, std::full_extent, 0, 0); + auto sub_y = std::submdspan(y, std::full_extent, 0, 0); axpy_(x, sub_y, z, beta, alpha, 0); } else if( ny0 == nx0 && ny1 == nx1 && ny2 == 1 && ny2 < nx2 ) { - auto sub_y = stdex::submdspan(y, std::full_extent, std::full_extent, 0); + auto sub_y = std::submdspan(y, std::full_extent, std::full_extent, 0); axpy_(x, sub_y, z, beta, alpha, 2); } else if( ny0 == 1 && ny0 < nx0 && ny1 == 1 && ny1 < nx1 && ny2 == 1 && ny2 < nx2 ) { IteratePolicy policy3d({0, 0, 0}, {nx0, nx1, nx2}); diff --git a/lib/stdpar/numpy_like.hpp b/lib/stdpar/numpy_like.hpp index 2bb4c0b..f129016 100644 --- a/lib/stdpar/numpy_like.hpp +++ b/lib/stdpar/numpy_like.hpp @@ -76,13 +76,13 @@ namespace Impl { for(int ir=0; ir < reduce_size; ir++) { if(reduce_dim == 0) { - auto sub_in = stdex::submdspan(in, ir, std::full_extent, std::full_extent); + auto sub_in = std::submdspan(in, ir, std::full_extent, std::full_extent); sum += sub_in(i0, i1); } else if(reduce_dim == 1) { - auto sub_in = stdex::submdspan(in, std::full_extent, ir, std::full_extent); + auto sub_in = std::submdspan(in, std::full_extent, ir, std::full_extent); sum += sub_in(i0, i1); } else { - auto sub_in = stdex::submdspan(in, std::full_extent, std::full_extent, ir); + auto sub_in = std::submdspan(in, std::full_extent, std::full_extent, ir); sum += sub_in(i0, i1); } } @@ -99,13 +99,13 @@ namespace Impl { assert(out.extent(reduce_dim) == 1); if(reduce_dim == 0) { - auto sub_out = stdex::submdspan(out, 0, std::full_extent, std::full_extent); + auto sub_out = std::submdspan(out, 0, std::full_extent, std::full_extent); mean_(in, sub_out, axis); } else if(reduce_dim == 1) { - auto sub_out = stdex::submdspan(out, std::full_extent, 0, std::full_extent); + auto sub_out = std::submdspan(out, std::full_extent, 0, std::full_extent); mean_(in, sub_out, axis); } else { - auto sub_out = stdex::submdspan(out, std::full_extent, std::full_extent, 0); + auto sub_out = std::submdspan(out, std::full_extent, std::full_extent, 0); mean_(in, sub_out, axis); } } @@ -127,10 +127,10 @@ namespace Impl { for(int ir=0; ir < reduce_size; ir++) { if(reduce_dim == 0) { - auto sub_in = stdex::submdspan(in, ir, std::full_extent); + auto sub_in = std::submdspan(in, ir, std::full_extent); sum += sub_in(idx); } else { - auto sub_in = stdex::submdspan(in, std::full_extent, ir); + auto sub_in = std::submdspan(in, std::full_extent, ir); sum += sub_in(idx); } } @@ -147,10 +147,10 @@ namespace Impl { assert(out.extent(reduce_dim) == 1); if(reduce_dim == 0) { - auto sub_out = stdex::submdspan(out, 0, std::full_extent); + auto sub_out = std::submdspan(out, 0, std::full_extent); mean_(in, sub_out, axis); } else { - auto sub_out = stdex::submdspan(out, std::full_extent, 0); + auto sub_out = std::submdspan(out, std::full_extent, 0); mean_(in, sub_out, axis); } } @@ -368,10 +368,10 @@ namespace Impl { x(i0, i1) = alpha * x(i0, i1) + beta * y(i0, i1); }); } else if( ny0 == 1 && ny0 < nx0 && ny1 == nx1 ) { - auto sub_y = stdex::submdspan(y, 0, std::full_extent); + auto sub_y = std::submdspan(y, 0, 
std::full_extent); axpy_(x, sub_y, beta, alpha, 1); } else if( ny0 == nx0 && ny1 == 1 && ny1 < nx1 ) { - auto sub_y = stdex::submdspan(y, std::full_extent, 0); + auto sub_y = std::submdspan(y, std::full_extent, 0); axpy_(x, sub_y, beta, alpha, 0); } else if( ny0 == 1 && ny0 < nx0 && ny1 == 1 && ny1 < nx1 ) { IteratePolicy policy2d({0, 0}, {nx0, nx1}); @@ -404,10 +404,10 @@ namespace Impl { z(i0, i1) = alpha * x(i0, i1) + beta * y(i0, i1); }); } else if( ny0 == 1 && ny0 < nx0 && ny1 == nx1 ) { - auto sub_y = stdex::submdspan(y, 0, std::full_extent); + auto sub_y = std::submdspan(y, 0, std::full_extent); axpy_(x, sub_y, z, beta, alpha, 1); } else if( ny0 == nx0 && ny1 == 1 && ny1 < nx1 ) { - auto sub_y = stdex::submdspan(y, std::full_extent, 0); + auto sub_y = std::submdspan(y, std::full_extent, 0); axpy_(x, sub_y, z, beta, alpha, 0); } else if( ny0 == 1 && ny0 < nx0 && ny1 == 1 && ny1 < nx1 ) { IteratePolicy policy2d({0, 0}, {nx0, nx1}); @@ -573,22 +573,22 @@ namespace Impl { }); } else if( ny0 == 1 && ny0 < nx0 && ny1 == nx1 && ny2 == nx2 ) { - auto sub_y = stdex::submdspan(y, 0, std::full_extent, std::full_extent); + auto sub_y = std::submdspan(y, 0, std::full_extent, std::full_extent); axpy_(x, sub_y, beta, alpha, 0); } else if( ny0 == 1 && ny0 < nx0 && ny1 == 1 && ny1 < nx1 && ny2 == nx2 ) { - auto sub_y = stdex::submdspan(y, 0, 0, std::full_extent); + auto sub_y = std::submdspan(y, 0, 0, std::full_extent); axpy_(x, sub_y, beta, alpha, 2); } else if( ny0 == 1 && ny0 < nx0 && ny1 == nx1 && ny2 == 1 && ny2 < nx2 ) { - auto sub_y = stdex::submdspan(y, 0, std::full_extent, 0); + auto sub_y = std::submdspan(y, 0, std::full_extent, 0); axpy_(x, sub_y, beta, alpha, 1); } else if( ny0 == nx0 && ny1 == 1 && ny1 < nx1 && ny2 == nx2 ) { - auto sub_y = stdex::submdspan(y, std::full_extent, 0, std::full_extent); + auto sub_y = std::submdspan(y, std::full_extent, 0, std::full_extent); axpy_(x, sub_y, beta, alpha, 1); } else if( ny0 == nx0 && ny1 == 1 && ny1 < nx1 && ny2 == 1 && ny2 < nx2 ) { - auto sub_y = stdex::submdspan(y, std::full_extent, 0, 0); + auto sub_y = std::submdspan(y, std::full_extent, 0, 0); axpy_(x, sub_y, beta, alpha, 0); } else if( ny0 == nx0 && ny1 == nx1 && ny2 == 1 && ny2 < nx2 ) { - auto sub_y = stdex::submdspan(y, std::full_extent, std::full_extent, 0); + auto sub_y = std::submdspan(y, std::full_extent, std::full_extent, 0); axpy_(x, sub_y, beta, alpha, 2); } else if( ny0 == 1 && ny0 < nx0 && ny1 == 1 && ny1 < nx1 && ny2 == 1 && ny2 < nx2 ) { IteratePolicy policy3d({0, 0, 0}, {nx0, nx1, nx2}); @@ -620,22 +620,22 @@ namespace Impl { z(i0, i1, i2) = alpha * x(i0, i1, i2) + beta * y(i0, i1, i2); }); } else if( ny0 == 1 && ny0 < nx0 && ny1 == nx1 && ny2 == nx2 ) { - auto sub_y = stdex::submdspan(y, 0, std::full_extent, std::full_extent); + auto sub_y = std::submdspan(y, 0, std::full_extent, std::full_extent); axpy_(x, sub_y, z, beta, alpha, 0); } else if( ny0 == 1 && ny0 < nx0 && ny1 == 1 && ny1 < nx1 && ny2 == nx2 ) { - auto sub_y = stdex::submdspan(y, 0, 0, std::full_extent); + auto sub_y = std::submdspan(y, 0, 0, std::full_extent); axpy_(x, sub_y, z, beta, alpha, 2); } else if( ny0 == 1 && ny0 < nx0 && ny1 == nx1 && ny2 == 1 && ny2 < nx2 ) { - auto sub_y = stdex::submdspan(y, 0, std::full_extent, 0); + auto sub_y = std::submdspan(y, 0, std::full_extent, 0); axpy_(x, sub_y, z, beta, alpha, 1); } else if( ny0 == nx0 && ny1 == 1 && ny1 < nx1 && ny2 == nx2 ) { - auto sub_y = stdex::submdspan(y, std::full_extent, 0, std::full_extent); + auto sub_y = std::submdspan(y, 
std::full_extent, 0, std::full_extent); axpy_(x, sub_y, z, beta, alpha, 1); } else if( ny0 == nx0 && ny1 == 1 && ny1 < nx1 && ny2 == 1 && ny2 < nx2 ) { - auto sub_y = stdex::submdspan(y, std::full_extent, 0, 0); + auto sub_y = std::submdspan(y, std::full_extent, 0, 0); axpy_(x, sub_y, z, beta, alpha, 0); } else if( ny0 == nx0 && ny1 == nx1 && ny2 == 1 && ny2 < nx2 ) { - auto sub_y = stdex::submdspan(y, std::full_extent, std::full_extent, 0); + auto sub_y = std::submdspan(y, std::full_extent, std::full_extent, 0); axpy_(x, sub_y, z, beta, alpha, 2); } else if( ny0 == 1 && ny0 < nx0 && ny1 == 1 && ny1 < nx1 && ny2 == 1 && ny2 < nx2 ) { IteratePolicy policy3d({0, 0, 0}, {nx0, nx1, nx2}); diff --git a/mini-apps/heat3d-mpi/executors/mpi_comm.hpp b/mini-apps/heat3d-mpi/executors/mpi_comm.hpp index 8f4538d..903489e 100644 --- a/mini-apps/heat3d-mpi/executors/mpi_comm.hpp +++ b/mini-apps/heat3d-mpi/executors/mpi_comm.hpp @@ -305,18 +305,18 @@ stdexec::sender auto pack_all_sender(Sender&& sender, Scheduler&& scheduler, Com const std::pair inner_z(1, u.extent(2) - 1); int i = 0; - auto ux_send_left = stdex::submdspan(u, 1, inner_y, inner_z); - auto ux_send_right = stdex::submdspan(u, u.extent(0) - 2, inner_y, inner_z); + auto ux_send_left = std::submdspan(u, 1, inner_y, inner_z); + auto ux_send_right = std::submdspan(u, u.extent(0) - 2, inner_y, inner_z); auto _pack_x_sender = pack_sender(sender, scheduler, comm.send_buffer(i), ux_send_left, ux_send_right); i = 1; - auto uy_send_left = stdex::submdspan(u, inner_x, 1, inner_z); - auto uy_send_right = stdex::submdspan(u, inner_x, u.extent(1) - 2, inner_z); + auto uy_send_left = std::submdspan(u, inner_x, 1, inner_z); + auto uy_send_right = std::submdspan(u, inner_x, u.extent(1) - 2, inner_z); auto _pack_y_sender = pack_sender(sender, scheduler, comm.send_buffer(i), uy_send_left, uy_send_right); i = 2; - auto uz_send_left = stdex::submdspan(u, inner_x, inner_y, 1); - auto uz_send_right = stdex::submdspan(u, inner_x, inner_y, u.extent(2) - 2); + auto uz_send_left = std::submdspan(u, inner_x, inner_y, 1); + auto uz_send_right = std::submdspan(u, inner_x, inner_y, u.extent(2) - 2); auto _pack_z_sender = pack_sender(sender, scheduler, comm.send_buffer(i), uz_send_left, uz_send_right); return stdexec::when_all( @@ -334,18 +334,18 @@ stdexec::sender auto unpack_all_sender(Sender&& sender, Scheduler&& scheduler, C const std::pair inner_z(1, u.extent(2) - 1); int i = 0; - auto ux_recv_left = stdex::submdspan(u, 0, inner_y, inner_z); - auto ux_recv_right = stdex::submdspan(u, u.extent(0) - 1, inner_y, inner_z); + auto ux_recv_left = std::submdspan(u, 0, inner_y, inner_z); + auto ux_recv_right = std::submdspan(u, u.extent(0) - 1, inner_y, inner_z); auto _unpack_x_sender = unpack_sender(sender, scheduler, ux_recv_left, ux_recv_right, comm.recv_buffer(i)); i = 1; - auto uy_recv_left = stdex::submdspan(u, inner_x, 0, inner_z); - auto uy_recv_right = stdex::submdspan(u, inner_x, u.extent(1) - 1, inner_z); + auto uy_recv_left = std::submdspan(u, inner_x, 0, inner_z); + auto uy_recv_right = std::submdspan(u, inner_x, u.extent(1) - 1, inner_z); auto _unpack_y_sender = unpack_sender(sender, scheduler, uy_recv_left, uy_recv_right, comm.recv_buffer(i)); i = 2; - auto uz_recv_left = stdex::submdspan(u, inner_x, inner_y, 0); - auto uz_recv_right = stdex::submdspan(u, inner_x, inner_y, u.extent(2) - 1); + auto uz_recv_left = std::submdspan(u, inner_x, inner_y, 0); + auto uz_recv_right = std::submdspan(u, inner_x, inner_y, u.extent(2) - 1); auto _unpack_z_sender = 
unpack_sender(sender, scheduler, uz_recv_left, uz_recv_right, comm.recv_buffer(i)); return stdexec::when_all( @@ -364,20 +364,20 @@ stdexec::sender auto boundaryUpdate_all_sender(Sender&& sender, Scheduler&& schd const std::pair inner_z(1, u.extent(2) - 1); int i = 0; - auto ux_recv_left = stdex::submdspan(u, 1, inner_y, inner_z); - auto ux_recv_right = stdex::submdspan(u, u.extent(0) - 2, inner_y, inner_z); + auto ux_recv_left = std::submdspan(u, 1, inner_y, inner_z); + auto ux_recv_right = std::submdspan(u, u.extent(0) - 2, inner_y, inner_z); auto _boundary_update_x_sender = boundaryUpdate_sender(sender, schdeuler, conf, ux_recv_left, ux_recv_right, comm.recv_buffer(i)); // Exchange in y direction i = 1; - auto uy_recv_left = stdex::submdspan(u, inner_x, 1, inner_z); - auto uy_recv_right = stdex::submdspan(u, inner_x, u.extent(1) - 2, inner_z); + auto uy_recv_left = std::submdspan(u, inner_x, 1, inner_z); + auto uy_recv_right = std::submdspan(u, inner_x, u.extent(1) - 2, inner_z); auto _boundary_update_y_sender = boundaryUpdate_sender(_boundary_update_x_sender, schdeuler, conf, uy_recv_left, uy_recv_right, comm.recv_buffer(i)); // Exchange in z direction i = 2; - auto uz_recv_left = stdex::submdspan(u, inner_x, inner_y, 1); - auto uz_recv_right = stdex::submdspan(u, inner_x, inner_y, u.extent(2) - 2); + auto uz_recv_left = std::submdspan(u, inner_x, inner_y, 1); + auto uz_recv_right = std::submdspan(u, inner_x, inner_y, u.extent(2) - 2); auto _boundary_update_z_sender = boundaryUpdate_sender(_boundary_update_y_sender, schdeuler, conf, uz_recv_left, uz_recv_right, comm.recv_buffer(i)); return _boundary_update_z_sender; diff --git a/mini-apps/heat3d-mpi/stdpar/mpi_comm.hpp b/mini-apps/heat3d-mpi/stdpar/mpi_comm.hpp index a36107b..fe25059 100644 --- a/mini-apps/heat3d-mpi/stdpar/mpi_comm.hpp +++ b/mini-apps/heat3d-mpi/stdpar/mpi_comm.hpp @@ -234,10 +234,10 @@ class Comm { // Exchange in x direction { int i = 0; - auto ux_send_left = stdex::submdspan(u, 1, inner_y, inner_z); - auto ux_send_right = stdex::submdspan(u, u.extent(0) - 2, inner_y, inner_z); - auto ux_recv_left = stdex::submdspan(u, 0, inner_y, inner_z); - auto ux_recv_right = stdex::submdspan(u, u.extent(0) - 1, inner_y, inner_z); + auto ux_send_left = std::submdspan(u, 1, inner_y, inner_z); + auto ux_send_right = std::submdspan(u, u.extent(0) - 2, inner_y, inner_z); + auto ux_recv_left = std::submdspan(u, 0, inner_y, inner_z); + auto ux_recv_right = std::submdspan(u, u.extent(0) - 1, inner_y, inner_z); if(use_timer) timers[HaloPack]->begin(); pack_(send_buffer(i), ux_send_left, ux_send_right); @@ -255,10 +255,10 @@ class Comm { // Exchange in y direction { int i = 1; - auto uy_send_left = stdex::submdspan(u, inner_x, 1, inner_z); - auto uy_send_right = stdex::submdspan(u, inner_x, u.extent(1) - 2, inner_z); - auto uy_recv_left = stdex::submdspan(u, inner_x, 0, inner_z); - auto uy_recv_right = stdex::submdspan(u, inner_x, u.extent(1) - 1, inner_z); + auto uy_send_left = std::submdspan(u, inner_x, 1, inner_z); + auto uy_send_right = std::submdspan(u, inner_x, u.extent(1) - 2, inner_z); + auto uy_recv_left = std::submdspan(u, inner_x, 0, inner_z); + auto uy_recv_right = std::submdspan(u, inner_x, u.extent(1) - 1, inner_z); if(use_timer) timers[HaloPack]->begin(); pack_(send_buffer(i), uy_send_left, uy_send_right); @@ -276,10 +276,10 @@ class Comm { // Exchange in z direction { int i = 2; - auto uz_send_left = stdex::submdspan(u, inner_x, inner_y, 1); - auto uz_send_right = stdex::submdspan(u, inner_x, inner_y, u.extent(2) - 2); - 
auto uz_recv_left = stdex::submdspan(u, inner_x, inner_y, 0); - auto uz_recv_right = stdex::submdspan(u, inner_x, inner_y, u.extent(2) - 1); + auto uz_send_left = std::submdspan(u, inner_x, inner_y, 1); + auto uz_send_right = std::submdspan(u, inner_x, inner_y, u.extent(2) - 2); + auto uz_recv_left = std::submdspan(u, inner_x, inner_y, 0); + auto uz_recv_right = std::submdspan(u, inner_x, inner_y, u.extent(2) - 1); if(use_timer) timers[HaloPack]->begin(); pack_(send_buffer(i), uz_send_left, uz_send_right); diff --git a/mini-apps/heat3d-mpi/thrust/mpi_comm.hpp b/mini-apps/heat3d-mpi/thrust/mpi_comm.hpp index 56846b6..7eb0419 100644 --- a/mini-apps/heat3d-mpi/thrust/mpi_comm.hpp +++ b/mini-apps/heat3d-mpi/thrust/mpi_comm.hpp @@ -236,8 +236,8 @@ class Comm { // Exchange in x direction { int i = 0; - auto ux_send_left = stdex::submdspan(u, 1, inner_y, inner_z); - auto ux_send_right = stdex::submdspan(u, u.extent(0) - 2, inner_y, inner_z); + auto ux_send_left = std::submdspan(u, 1, inner_y, inner_z); + auto ux_send_right = std::submdspan(u, u.extent(0) - 2, inner_y, inner_z); pack_(send_buffer(i), ux_send_left, ux_send_right); } @@ -245,8 +245,8 @@ class Comm { // Exchange in y direction { int i = 1; - auto uy_send_left = stdex::submdspan(u, inner_x, 1, inner_z); - auto uy_send_right = stdex::submdspan(u, inner_x, u.extent(1) - 2, inner_z); + auto uy_send_left = std::submdspan(u, inner_x, 1, inner_z); + auto uy_send_right = std::submdspan(u, inner_x, u.extent(1) - 2, inner_z); pack_(send_buffer(i), uy_send_left, uy_send_right); } @@ -254,8 +254,8 @@ class Comm { // Exchange in z direction { int i = 2; - auto uz_send_left = stdex::submdspan(u, inner_x, inner_y, 1); - auto uz_send_right = stdex::submdspan(u, inner_x, inner_y, u.extent(2) - 2); + auto uz_send_left = std::submdspan(u, inner_x, inner_y, 1); + auto uz_send_right = std::submdspan(u, inner_x, inner_y, u.extent(2) - 2); pack_(send_buffer(i), uz_send_left, uz_send_right); } @@ -271,8 +271,8 @@ class Comm { // Exchange in x direction { int i = 0; - auto ux_recv_left = stdex::submdspan(u, 0, inner_y, inner_z); - auto ux_recv_right = stdex::submdspan(u, u.extent(0) - 1, inner_y, inner_z); + auto ux_recv_left = std::submdspan(u, 0, inner_y, inner_z); + auto ux_recv_right = std::submdspan(u, u.extent(0) - 1, inner_y, inner_z); unpack_(ux_recv_left, ux_recv_right, recv_buffer(i)); } @@ -280,8 +280,8 @@ class Comm { // Exchange in y direction { int i = 1; - auto uy_recv_left = stdex::submdspan(u, inner_x, 0, inner_z); - auto uy_recv_right = stdex::submdspan(u, inner_x, u.extent(1) - 1, inner_z); + auto uy_recv_left = std::submdspan(u, inner_x, 0, inner_z); + auto uy_recv_right = std::submdspan(u, inner_x, u.extent(1) - 1, inner_z); unpack_(uy_recv_left, uy_recv_right, recv_buffer(i)); } @@ -289,8 +289,8 @@ class Comm { // Exchange in z direction { int i = 2; - auto uz_recv_left = stdex::submdspan(u, inner_x, inner_y, 0); - auto uz_recv_right = stdex::submdspan(u, inner_x, inner_y, u.extent(2) - 1); + auto uz_recv_left = std::submdspan(u, inner_x, inner_y, 0); + auto uz_recv_right = std::submdspan(u, inner_x, inner_y, u.extent(2) - 1); unpack_(uz_recv_left, uz_recv_right, recv_buffer(i)); } @@ -306,8 +306,8 @@ class Comm { // Exchange in x direction { int i = 0; - auto ux_recv_left = stdex::submdspan(u, 1, inner_y, inner_z); - auto ux_recv_right = stdex::submdspan(u, u.extent(0) - 2, inner_y, inner_z); + auto ux_recv_left = std::submdspan(u, 1, inner_y, inner_z); + auto ux_recv_right = std::submdspan(u, u.extent(0) - 2, inner_y, inner_z); 
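
Reviewer sketch (not part of the patch): the edits above and below are a pure namespace migration from stdex::submdspan to std::submdspan, tracking the bumped ext_lib/mdspan submodule, which now exposes submdspan in namespace std; the slicing and halo-exchange logic itself is unchanged. For reference, the halo-packing pattern being touched looks roughly like the following, assuming a toolchain (or the reference mdspan implementation) that provides std::mdspan and std::submdspan; the function and buffer names here are illustrative only and not taken from the patch.

    #include <mdspan>     // std::mdspan, std::submdspan (C++26 / mdspan reference implementation)
    #include <cstddef>
    #include <utility>
    #include <vector>

    using field_t = std::mdspan<double, std::dextents<std::size_t, 3>>;

    // Copy the x = 1 plane of u (interior points only in y and z) into a
    // contiguous send buffer, mirroring pack_(send_buffer(i), ux_send_left, ...).
    inline void pack_left_x(field_t u, std::vector<double>& buf) {
      const std::pair inner_y(1, u.extent(1) - 1);          // skip halo cells in y
      const std::pair inner_z(1, u.extent(2) - 1);          // skip halo cells in z
      auto plane = std::submdspan(u, 1, inner_y, inner_z);  // rank-2 view of the plane
      buf.resize(plane.extent(0) * plane.extent(1));
      std::size_t idx = 0;
      for (std::size_t iy = 0; iy < plane.extent(0); ++iy)
        for (std::size_t iz = 0; iz < plane.extent(1); ++iz)
          buf[idx++] = plane(iy, iz);
    }
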
boundaryUpdate_(conf, ux_recv_left, ux_recv_right, recv_buffer(i)); } @@ -315,8 +315,8 @@ class Comm { // Exchange in y direction { int i = 1; - auto uy_recv_left = stdex::submdspan(u, inner_x, 1, inner_z); - auto uy_recv_right = stdex::submdspan(u, inner_x, u.extent(1) - 2, inner_z); + auto uy_recv_left = std::submdspan(u, inner_x, 1, inner_z); + auto uy_recv_right = std::submdspan(u, inner_x, u.extent(1) - 2, inner_z); boundaryUpdate_(conf, uy_recv_left, uy_recv_right, recv_buffer(i)); } @@ -324,8 +324,8 @@ class Comm { // Exchange in z direction { int i = 2; - auto uz_recv_left = stdex::submdspan(u, inner_x, inner_y, 1); - auto uz_recv_right = stdex::submdspan(u, inner_x, inner_y, u.extent(2) - 2); + auto uz_recv_left = std::submdspan(u, inner_x, inner_y, 1); + auto uz_recv_right = std::submdspan(u, inner_x, inner_y, u.extent(2) - 2); boundaryUpdate_(conf, uz_recv_left, uz_recv_right, recv_buffer(i)); } @@ -360,10 +360,10 @@ class Comm { // Exchange in y direction { int i = 1; - auto uy_send_left = stdex::submdspan(u, inner_x, 1, inner_z); - auto uy_send_right = stdex::submdspan(u, inner_x, u.extent(1) - 2, inner_z); - auto uy_recv_left = stdex::submdspan(u, inner_x, 0, inner_z); - auto uy_recv_right = stdex::submdspan(u, inner_x, u.extent(1) - 1, inner_z); + auto uy_send_left = std::submdspan(u, inner_x, 1, inner_z); + auto uy_send_right = std::submdspan(u, inner_x, u.extent(1) - 2, inner_z); + auto uy_recv_left = std::submdspan(u, inner_x, 0, inner_z); + auto uy_recv_right = std::submdspan(u, inner_x, u.extent(1) - 1, inner_z); pack_(send_buffer(i), uy_send_left, uy_send_right); commP2P_(recv_buffer(i), send_buffer(i)); @@ -373,10 +373,10 @@ class Comm { // Exchange in z direction { int i = 2; - auto uz_send_left = stdex::submdspan(u, inner_x, inner_y, 1); - auto uz_send_right = stdex::submdspan(u, inner_x, inner_y, u.extent(2) - 2); - auto uz_recv_left = stdex::submdspan(u, inner_x, inner_y, 0); - auto uz_recv_right = stdex::submdspan(u, inner_x, inner_y, u.extent(2) - 1); + auto uz_send_left = std::submdspan(u, inner_x, inner_y, 1); + auto uz_send_right = std::submdspan(u, inner_x, inner_y, u.extent(2) - 2); + auto uz_recv_left = std::submdspan(u, inner_x, inner_y, 0); + auto uz_recv_right = std::submdspan(u, inner_x, inner_y, u.extent(2) - 1); pack_(send_buffer(i), uz_send_left, uz_send_right); commP2P_(recv_buffer(i), send_buffer(i)); diff --git a/mini-apps/lbm2d-letkf/executors/force.hpp b/mini-apps/lbm2d-letkf/executors/force.hpp index 549808e..7e17140 100644 --- a/mini-apps/lbm2d-letkf/executors/force.hpp +++ b/mini-apps/lbm2d-letkf/executors/force.hpp @@ -55,7 +55,7 @@ struct Force { const auto x = x_.mdspan(); const auto y = y_.mdspan(); const auto rand_pool = rand_pool_.mdspan(); - const auto sub_rand_pool = stdex::submdspan(rand_pool, std::full_extent, std::full_extent, shift); + const auto sub_rand_pool = std::submdspan(rand_pool, std::full_extent, std::full_extent, shift); auto fx = fx_.mdspan(); auto fy = fy_.mdspan(); diff --git a/mini-apps/lbm2d-letkf/executors/letkf_solver.hpp b/mini-apps/lbm2d-letkf/executors/letkf_solver.hpp index b863d10..635e15d 100644 --- a/mini-apps/lbm2d-letkf/executors/letkf_solver.hpp +++ b/mini-apps/lbm2d-letkf/executors/letkf_solver.hpp @@ -32,13 +32,13 @@ stdexec::sender auto mean_sender(Sender&& sender, Scheduler&& scheduler, const I const int i1 = idx/n0; for(int ir=0; ir < reduce_size; ir++) { if(reduce_dim == 0) { - auto sub_in = stdex::submdspan(in, ir, std::full_extent, std::full_extent); + auto sub_in = std::submdspan(in, ir, 
std::full_extent, std::full_extent); sum += sub_in(i0, i1); } else if(reduce_dim == 1) { - auto sub_in = stdex::submdspan(in, std::full_extent, ir, std::full_extent); + auto sub_in = std::submdspan(in, std::full_extent, ir, std::full_extent); sum += sub_in(i0, i1); } else { - auto sub_in = stdex::submdspan(in, std::full_extent, std::full_extent, ir); + auto sub_in = std::submdspan(in, std::full_extent, std::full_extent, ir); sum += sub_in(i0, i1); } } diff --git a/mini-apps/lbm2d-letkf/thrust/force.hpp b/mini-apps/lbm2d-letkf/thrust/force.hpp index 8a1b45e..1da8b64 100644 --- a/mini-apps/lbm2d-letkf/thrust/force.hpp +++ b/mini-apps/lbm2d-letkf/thrust/force.hpp @@ -54,7 +54,7 @@ struct Force { const auto x = x_.mdspan(); const auto y = y_.mdspan(); const auto rand_pool = rand_pool_.mdspan(); - const auto sub_rand_pool = stdex::submdspan(rand_pool, std::full_extent, std::full_extent, shift); + const auto sub_rand_pool = std::submdspan(rand_pool, std::full_extent, std::full_extent, shift); auto fx = fx_.mdspan(); auto fy = fy_.mdspan(); diff --git a/tutorial/07_heat2d_repeat_n/repeat_n.hpp b/tutorial/07_heat2d_repeat_n/repeat_n.hpp index ce92bee..a28a6cf 100644 --- a/tutorial/07_heat2d_repeat_n/repeat_n.hpp +++ b/tutorial/07_heat2d_repeat_n/repeat_n.hpp @@ -53,69 +53,34 @@ namespace nvexec { namespace ex = stdexec; #if defined(_NVHPC_CUDA) || defined(__CUDACC__) -namespace nvexec::STDEXEC_STREAM_DETAIL_NS { namespace repeat_n { - template - class receiver_2_t : public stream_receiver_base { - using Sender = typename OpT::PredSender; - using Receiver = typename OpT::Receiver; - - OpT& op_state_; +namespace nvexec::STDEXEC_STREAM_DETAIL_NS { // - public: - template _Tag, class... _Args> - friend void tag_invoke(_Tag __tag, receiver_2_t&& __self, _Args&&... __args) noexcept { - OpT& op_state = __self.op_state_; - op_state.propagate_completion_signal(_Tag{}, (_Args&&) __args...); - } + namespace repeat_n { + template + class receiver_2_t : public stream_receiver_base { + using Sender = typename OpT::PredSender; + using Receiver = typename OpT::Receiver; - friend void tag_invoke(ex::set_value_t, receiver_2_t&& __self) noexcept { - using inner_op_state_t = typename OpT::inner_op_state_t; + OpT& op_state_; - OpT& op_state = __self.op_state_; - op_state.i_++; - - if (op_state.i_ == op_state.n_) { - op_state.propagate_completion_signal(stdexec::set_value); - return; + public: + template _Tag, class... _Args> + friend void tag_invoke(_Tag __tag, receiver_2_t&& __self, _Args&&... __args) noexcept { + OpT& op_state = __self.op_state_; + op_state.propagate_completion_signal(_Tag{}, (_Args&&) __args...); } - auto sch = stdexec::get_scheduler(stdexec::get_env(op_state.rcvr_)); - inner_op_state_t& inner_op_state = op_state.inner_op_state_.emplace( - stdexec::__conv{[&]() noexcept { - return ex::connect(ex::schedule(sch) | op_state.closure_, receiver_2_t{op_state}); - }}); - - ex::start(inner_op_state); - } - - friend typename OpT::env_t tag_invoke(ex::get_env_t, const receiver_2_t& self) noexcept { - return self.op_state_.make_env(); - } - - explicit receiver_2_t(OpT& op_state) - : op_state_(op_state) { - } - }; - - template - class receiver_1_t : public stream_receiver_base { - using Receiver = typename OpT::Receiver; - - OpT& op_state_; - - public: - template _Tag, class... _Args> - friend void tag_invoke(_Tag __tag, receiver_1_t&& __self, _Args&&... 
__args) noexcept { - OpT& op_state = __self.op_state_; - op_state.propagate_completion_signal(_Tag{}, (_Args&&) __args...); - } + friend void tag_invoke(ex::set_value_t, receiver_2_t&& __self) noexcept { + using inner_op_state_t = typename OpT::inner_op_state_t; - friend void tag_invoke(ex::set_value_t, receiver_1_t&& __self) noexcept { - using inner_op_state_t = typename OpT::inner_op_state_t; + OpT& op_state = __self.op_state_; + op_state.i_++; - OpT& op_state = __self.op_state_; + if (op_state.i_ == op_state.n_) { + op_state.propagate_completion_signal(stdexec::set_value); + return; + } - if (op_state.n_) { auto sch = stdexec::get_scheduler(stdexec::get_env(op_state.rcvr_)); inner_op_state_t& inner_op_state = op_state.inner_op_state_.emplace( stdexec::__conv{[&]() noexcept { @@ -123,208 +88,247 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS { namespace repeat_n { }}); ex::start(inner_op_state); - } else { - op_state.propagate_completion_signal(stdexec::set_value); } - } - friend typename OpT::env_t tag_invoke(ex::get_env_t, const receiver_1_t& self) noexcept { - return self.op_state_.make_env(); - } + friend typename OpT::env_t tag_invoke(ex::get_env_t, const receiver_2_t& self) noexcept { + return self.op_state_.make_env(); + } - explicit receiver_1_t(OpT& op_state) - : op_state_(op_state) { - } - }; + explicit receiver_2_t(OpT& op_state) + : op_state_(op_state) { + } + }; - template - struct operation_state_t : operation_state_base_t { - using PredSender = stdexec::__t; - using Closure = stdexec::__t; - using Receiver = stdexec::__t; - using Scheduler = - stdexec::tag_invoke_result_t>; - using InnerSender = - std::invoke_result_t>; + template + class receiver_1_t : public stream_receiver_base { + using Receiver = typename OpT::Receiver; - using predecessor_op_state_t = - ex::connect_result_t>; - using inner_op_state_t = ex::connect_result_t>; + OpT& op_state_; - PredSender pred_sender_; - Closure closure_; - std::optional pred_op_state_; - std::optional inner_op_state_; - std::size_t n_{}; - std::size_t i_{}; - - friend void tag_invoke(stdexec::start_t, operation_state_t& op) noexcept { - if (op.stream_provider_.status_ != cudaSuccess) { - // Couldn't allocate memory for operation state, complete with error - op.propagate_completion_signal(stdexec::set_error, std::move(op.stream_provider_.status_)); - } else { - if (op.n_) { - stdexec::start(*op.pred_op_state_); + public: + template _Tag, class... _Args> + friend void tag_invoke(_Tag __tag, receiver_1_t&& __self, _Args&&... 
__args) noexcept { + OpT& op_state = __self.op_state_; + op_state.propagate_completion_signal(_Tag{}, (_Args&&) __args...); + } + + friend void tag_invoke(ex::set_value_t, receiver_1_t&& __self) noexcept { + using inner_op_state_t = typename OpT::inner_op_state_t; + + OpT& op_state = __self.op_state_; + + if (op_state.n_) { + auto sch = stdexec::get_scheduler(stdexec::get_env(op_state.rcvr_)); + inner_op_state_t& inner_op_state = op_state.inner_op_state_.emplace( + stdexec::__conv{[&]() noexcept { + return ex::connect( + ex::schedule(sch) | op_state.closure_, receiver_2_t{op_state}); + }}); + + ex::start(inner_op_state); } else { - op.propagate_completion_signal(stdexec::set_value); + op_state.propagate_completion_signal(stdexec::set_value); } } - } - operation_state_t(PredSender&& pred_sender, Closure closure, Receiver&& rcvr, std::size_t n) - : operation_state_base_t( - (Receiver&&) rcvr, - stdexec::get_completion_scheduler(stdexec::get_env(pred_sender)) - .context_state_) - , pred_sender_{(PredSender&&) pred_sender} - , closure_(closure) - , n_(n) { - pred_op_state_.emplace(stdexec::__conv{[&]() noexcept { - return ex::connect((PredSender&&) pred_sender_, receiver_1_t{*this}); - }}); - } - }; + friend typename OpT::env_t tag_invoke(ex::get_env_t, const receiver_1_t& self) noexcept { + return self.op_state_.make_env(); + } + + explicit receiver_1_t(OpT& op_state) + : op_state_(op_state) { + } + }; + + template + struct operation_state_t : operation_state_base_t { + using PredSender = stdexec::__t; + using Receiver = stdexec::__t; + using Scheduler = + stdexec::tag_invoke_result_t>; + using InnerSender = + std::invoke_result_t>; + + using predecessor_op_state_t = + ex::connect_result_t>; + using inner_op_state_t = ex::connect_result_t>; + + PredSender pred_sender_; + Closure closure_; + std::optional pred_op_state_; + std::optional inner_op_state_; + std::size_t n_{}; + std::size_t i_{}; + + friend void tag_invoke(stdexec::start_t, operation_state_t& op) noexcept { + if (op.stream_provider_.status_ != cudaSuccess) { + // Couldn't allocate memory for operation state, complete with error + op.propagate_completion_signal( + stdexec::set_error, std::move(op.stream_provider_.status_)); + } else { + if (op.n_) { + stdexec::start(*op.pred_op_state_); + } else { + op.propagate_completion_signal(stdexec::set_value); + } + } + } + + operation_state_t(PredSender&& pred_sender, Closure closure, Receiver&& rcvr, std::size_t n) + : operation_state_base_t( + (Receiver&&) rcvr, + stdexec::get_completion_scheduler(stdexec::get_env(pred_sender)) + .context_state_) + , pred_sender_{(PredSender&&) pred_sender} + , closure_(closure) + , n_(n) { + pred_op_state_.emplace(stdexec::__conv{[&]() noexcept { + return ex::connect((PredSender&&) pred_sender_, receiver_1_t{*this}); + }}); + } + }; }} #endif namespace repeat_n_detail { -template -class receiver_t { - using Receiver = typename OpT::Receiver; + template + class receiver_t { + using Receiver = typename OpT::Receiver; - OpT& op_state_; + OpT& op_state_; - public: - using is_receiver = void; + public: + using __t = receiver_t; + using __id = receiver_t; + using is_receiver = void; - template _Tag, class... _Args> - STDEXEC_DETAIL_CUDACC_HOST_DEVICE // - friend void - tag_invoke(_Tag __tag, receiver_t&& __self, _Args&&... __args) noexcept { - __tag(std::move(__self.op_state_.rcvr_), (_Args&&) __args...); - } + template _Tag, class... _Args> + STDEXEC_ATTRIBUTE((host, device)) + friend void tag_invoke(_Tag __tag, receiver_t&& __self, _Args&&... 
__args) noexcept { + __tag(std::move(__self.op_state_.rcvr_), (_Args&&) __args...); + } - friend void tag_invoke(ex::set_value_t, receiver_t&& __self) noexcept { - OpT& op_state = __self.op_state_; + friend void tag_invoke(ex::set_value_t, receiver_t&& __self) noexcept { + OpT& op_state = __self.op_state_; - for (std::size_t i = 0; i < op_state.n_; i++) { - stdexec::sync_wait(ex::schedule(exec::inline_scheduler{}) | op_state.closure_); - } + for (std::size_t i = 0; i < op_state.n_; i++) { + stdexec::sync_wait(ex::schedule(exec::inline_scheduler{}) | op_state.closure_); + } - stdexec::set_value(std::move(op_state.rcvr_)); - } + stdexec::set_value(std::move(op_state.rcvr_)); + } - friend auto tag_invoke(ex::get_env_t, const receiver_t& self) noexcept - -> stdexec::env_of_t { - return stdexec::get_env(self.op_state_.rcvr_); - } + friend auto tag_invoke(ex::get_env_t, const receiver_t& self) noexcept + -> stdexec::env_of_t { + return stdexec::get_env(self.op_state_.rcvr_); + } - explicit receiver_t(OpT& op_state) - : op_state_(op_state) { - } -}; + explicit receiver_t(OpT& op_state) + : op_state_(op_state) { + } + }; -template -struct operation_state_t { - using Sender = stdexec::__t; - using Closure = stdexec::__t; - using Receiver = stdexec::__t; + template + struct operation_state_t { + using Sender = stdexec::__t; + using Receiver = stdexec::__t; - using inner_op_state_t = stdexec::connect_result_t>; + using inner_op_state_t = stdexec::connect_result_t>; - inner_op_state_t op_state_; - Closure closure_; - Receiver rcvr_; - std::size_t n_{}; + inner_op_state_t op_state_; + Closure closure_; + Receiver rcvr_; + std::size_t n_{}; - friend void tag_invoke(stdexec::start_t, operation_state_t& self) noexcept { - stdexec::start(self.op_state_); - } + friend void tag_invoke(stdexec::start_t, operation_state_t& self) noexcept { + stdexec::start(self.op_state_); + } - operation_state_t(Sender&& sender, Closure closure, Receiver&& rcvr, std::size_t n) - : op_state_{stdexec::connect((Sender&&) sender, receiver_t{*this})} - , closure_{closure} - , rcvr_{(Receiver&&) rcvr} - , n_(n) { - } -}; -} + operation_state_t(Sender&& sender, Closure closure, Receiver&& rcvr, std::size_t n) + : op_state_{stdexec::connect((Sender&&) sender, receiver_t{*this})} + , closure_{closure} + , rcvr_{(Receiver&&) rcvr} + , n_(n) { + } + }; -struct repeat_n_t { -template -struct repeat_n_sender_t { - using Sender = stdexec::__t; - using Closure = stdexec::__t; - using is_sender = void; - - using completion_signatures = // - stdexec::completion_signatures< - stdexec::set_value_t(), - stdexec::set_stopped_t(), - stdexec::set_error_t(std::exception_ptr) + template + struct repeat_n_sender_t { + using __t = repeat_n_sender_t; + using __id = repeat_n_sender_t; + using Sender = stdexec::__t; + using is_sender = void; + + using completion_signatures = // + stdexec::completion_signatures< + stdexec::set_value_t(), + stdexec::set_stopped_t(), + stdexec::set_error_t(std::exception_ptr) #if defined(_NVHPC_CUDA) || defined(__CUDACC__) - , - stdexec::set_error_t(cudaError_t) + , + stdexec::set_error_t(cudaError_t) #endif - >; + >; - Sender sender_; - Closure closure_; - std::size_t n_{}; + Sender sender_; + Closure closure_; + std::size_t n_{}; #if defined(_NVHPC_CUDA) || defined(__CUDACC__) - template Self, stdexec::receiver Receiver> - requires(stdexec::tag_invocable) - && (!nvexec::STDEXEC_STREAM_DETAIL_NS::receiver_with_stream_env) - friend auto tag_invoke(stdexec::connect_t, Self&& self, Receiver r) - -> 
repeat_n_detail::operation_state_t> { - return repeat_n_detail::operation_state_t>( - (Sender&&) self.sender_, self.closure_, (Receiver&&) r, self.n_); - } - - template Self, stdexec::receiver Receiver> - requires(stdexec::tag_invocable) - && (nvexec::STDEXEC_STREAM_DETAIL_NS::receiver_with_stream_env) - friend auto tag_invoke(stdexec::connect_t, Self&& self, Receiver r) - -> nvexec::STDEXEC_STREAM_DETAIL_NS::repeat_n:: - operation_state_t> { - return nvexec::STDEXEC_STREAM_DETAIL_NS::repeat_n:: - operation_state_t>( + template Self, stdexec::receiver Receiver> + requires(stdexec::sender_to) + && (!nvexec::STDEXEC_STREAM_DETAIL_NS::receiver_with_stream_env) + friend auto tag_invoke(stdexec::connect_t, Self&& self, Receiver r) + -> repeat_n_detail::operation_state_t> { + return repeat_n_detail::operation_state_t>( (Sender&&) self.sender_, self.closure_, (Receiver&&) r, self.n_); - } + } + + template Self, stdexec::receiver Receiver> + requires(stdexec::sender_to) + && (nvexec::STDEXEC_STREAM_DETAIL_NS::receiver_with_stream_env) + friend auto tag_invoke(stdexec::connect_t, Self&& self, Receiver r) + -> nvexec::STDEXEC_STREAM_DETAIL_NS::repeat_n:: + operation_state_t> { + return nvexec::STDEXEC_STREAM_DETAIL_NS::repeat_n:: + operation_state_t>( + (Sender&&) self.sender_, self.closure_, (Receiver&&) r, self.n_); + } #else template Self, stdexec::receiver Receiver> - requires stdexec::tag_invocable + requires stdexec::sender_to friend auto tag_invoke(stdexec::connect_t, Self&& self, Receiver r) - -> repeat_n_detail::operation_state_t> { - return repeat_n_detail::operation_state_t>( + -> repeat_n_detail::operation_state_t> { + return repeat_n_detail::operation_state_t>( (Sender&&) self.sender_, self.closure_, (Receiver&&) r, self.n_); } #endif - friend auto tag_invoke(stdexec::get_env_t, const repeat_n_sender_t& s) // - noexcept(stdexec::__nothrow_callable) - -> stdexec::env_of_t { - return stdexec::get_env(s.sender_); - } -}; - -template -auto operator()(Sender&& __sndr, std::size_t n, Closure closure) const noexcept - -> repeat_n_sender_t, stdexec::__x> { - return repeat_n_sender_t, stdexec::__x>{ - std::forward(__sndr), closure, n}; -} - -template -auto operator()(std::size_t n, Closure closure) const - -> stdexec::__binder_back { - return { - {}, - {}, - {n, (Closure&&) closure} + friend auto tag_invoke(stdexec::get_env_t, const repeat_n_sender_t& s) // + noexcept(stdexec::__nothrow_callable) + -> stdexec::env_of_t { + return stdexec::get_env(s.sender_); + } }; } + +struct repeat_n_t { + template + auto operator()(Sender&& __sndr, std::size_t n, Closure closure) const noexcept + -> repeat_n_detail::repeat_n_sender_t, Closure> { + return repeat_n_detail::repeat_n_sender_t, Closure>{ + std::forward(__sndr), closure, n}; + } + + template + auto operator()(std::size_t n, Closure closure) const + -> stdexec::__binder_back { + return { + {}, + {}, + {n, (Closure&&) closure} + }; + } }; inline constexpr repeat_n_t repeat_n{}; diff --git a/wk/letkf_768.json b/wk/letkf_768.json new file mode 100644 index 0000000..2c2ddb6 --- /dev/null +++ b/wk/letkf_768.json @@ -0,0 +1,34 @@ +{ + "Physics": { + "rho_ref": 1.0, + "u_ref": 1.0, + "nu": 1.0e-4, + "friction_rate": 5.0e-4, + "kf": 4.0, + "fkf": 5.6, + "dk": 10, + "sigma": 5, + "p_amp": 0.01, + "obs_error_rho": 0.01, + "obs_error_u": 0.1 + }, + "Settings": { + "base_dir": "./", + "sim_type": "letkf", + "case_name": "letkf768", + "in_case_name": "nature768", + "nx": 768, + "ny": 768, + "spinup": 200000, + "nbiter": 40000, + "io_interval": 200, + 
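
Reviewer sketch (not part of the patch): the tutorial/07_heat2d_repeat_n/repeat_n.hpp rework above is mostly a re-indentation of the stream-scheduler specialization plus two real changes: the connect constraints switch from stdexec::tag_invocable to stdexec::sender_to, and repeat_n_sender_t moves into repeat_n_detail with the closure type carried as a plain template parameter. Usage is unchanged: repeat_n(n, closure) is pipeable and repeatedly reconnects and starts schedule(scheduler) | closure until n iterations complete (or runs the closure n times inline on the CPU fallback path). A minimal composition sketch, where the scheduler, problem size, and kernel are placeholders rather than identifiers from this patch:

    // Illustrative only; 'sched', 'nx', 'ny', 'n_iters' and 'update' are placeholders.
    auto update = [=](std::size_t idx) { /* one heat2d stencil update */ };
    auto work   = stdexec::schedule(sched)
                | repeat_n(n_iters, stdexec::bulk(nx * ny, update));
    stdexec::sync_wait(std::move(work));
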
"da_interval": 200, + "obs_interval": 1, + "lyapnov": false, + "les": true, + "disable_output": true, + "da_nud_rate": 0.1, + "beta": 1.07, + "rloc_len": 1 + } +} diff --git a/wk/letkf_768_time.json b/wk/letkf_768_time.json new file mode 100644 index 0000000..3be758b --- /dev/null +++ b/wk/letkf_768_time.json @@ -0,0 +1,35 @@ +{ + "Physics": { + "rho_ref": 1.0, + "u_ref": 1.0, + "nu": 1.0e-4, + "friction_rate": 5.0e-4, + "kf": 4.0, + "fkf": 5.6, + "dk": 10, + "sigma": 5, + "p_amp": 0.01, + "obs_error_rho": 0.01, + "obs_error_u": 0.1 + }, + "Settings": { + "base_dir": "./", + "sim_type": "letkf", + "case_name": "letkf768", + "in_case_name": "nature768", + "nx": 768, + "ny": 768, + "spinup": 200000, + "nbiter": 40000, + "io_interval": 200, + "da_interval": 200, + "obs_interval": 1, + "lyapnov": false, + "les": true, + "use_time_stamps": true, + "disable_output": true, + "da_nud_rate": 0.1, + "beta": 1.07, + "rloc_len": 1 + } +} diff --git a/wk/letkf_async_768.json b/wk/letkf_async_768.json new file mode 100644 index 0000000..307e5d5 --- /dev/null +++ b/wk/letkf_async_768.json @@ -0,0 +1,35 @@ +{ + "Physics": { + "rho_ref": 1.0, + "u_ref": 1.0, + "nu": 1.0e-4, + "friction_rate": 5.0e-4, + "kf": 4.0, + "fkf": 5.6, + "dk": 10, + "sigma": 5, + "p_amp": 0.01, + "obs_error_rho": 0.01, + "obs_error_u": 0.1 + }, + "Settings": { + "base_dir": "./", + "sim_type": "letkf", + "case_name": "letkf768", + "in_case_name": "nature768", + "nx": 768, + "ny": 768, + "spinup": 200000, + "nbiter": 40000, + "io_interval": 200, + "da_interval": 200, + "obs_interval": 1, + "lyapnov": false, + "les": true, + "is_async": true, + "disable_output": true, + "da_nud_rate": 0.1, + "beta": 1.07, + "rloc_len": 1 + } +} diff --git a/wk/letkf_async_768_time.json b/wk/letkf_async_768_time.json new file mode 100644 index 0000000..d6bda62 --- /dev/null +++ b/wk/letkf_async_768_time.json @@ -0,0 +1,36 @@ +{ + "Physics": { + "rho_ref": 1.0, + "u_ref": 1.0, + "nu": 1.0e-4, + "friction_rate": 5.0e-4, + "kf": 4.0, + "fkf": 5.6, + "dk": 10, + "sigma": 5, + "p_amp": 0.01, + "obs_error_rho": 0.01, + "obs_error_u": 0.1 + }, + "Settings": { + "base_dir": "./", + "sim_type": "letkf", + "case_name": "letkf_async768", + "in_case_name": "nature768", + "nx": 768, + "ny": 768, + "spinup": 200000, + "nbiter": 40000, + "io_interval": 200, + "da_interval": 200, + "obs_interval": 1, + "lyapnov": false, + "les": true, + "is_async": true, + "use_time_stamps": true, + "disable_output": true, + "da_nud_rate": 0.1, + "beta": 1.07, + "rloc_len": 1 + } +} diff --git a/wk/nature_768.json b/wk/nature_768.json new file mode 100644 index 0000000..b5d501e --- /dev/null +++ b/wk/nature_768.json @@ -0,0 +1,30 @@ +{ + "Physics": { + "rho_ref": 1.0, + "u_ref": 1.0, + "nu": 1.0e-4, + "friction_rate": 5.0e-4, + "kf": 4.0, + "fkf": 5.6, + "dk": 10, + "sigma": 5, + "p_amp": 0.01, + "obs_error_rho": 0.01, + "obs_error_u": 0.1 + }, + "Settings": { + "base_dir": "./", + "sim_type": "nature", + "case_name": "nature768", + "nx": 768, + "ny": 768, + "spinup": 200000, + "nbiter": 40000, + "io_interval": 200, + "da_interval": 200, + "obs_interval": 1, + "lyapnov": false, + "les": true, + "da_nud_rate": 0.1 + } +} diff --git a/wk/sub_executors_heat3d_mpi_time_stamps_A100_16GPUs.sh b/wk/sub_executors_heat3d_mpi_time_stamps_A100_16GPUs.sh new file mode 100644 index 0000000..c0c0a6b --- /dev/null +++ b/wk/sub_executors_heat3d_mpi_time_stamps_A100_16GPUs.sh @@ -0,0 +1,46 @@ +#!/bin/bash +#PJM -L "node=2" +#PJM -L "rscgrp=regular-a" +#PJM -L "elapse=10:00" +#PJM -s +#PJM -g 
jh220031a +#PJM --mpi proc=16 + +. /etc/profile.d/modules.sh # Initialize module command + +module purge + +# Load spack +export HOME=/work/jh220031a/i18048 +. $HOME/spack/share/spack/setup-env.sh + +spack load gcc@11.3.0 +spack load cmake@3.24.3%gcc@8.3.1 +module load /work/04/jh220031a/i18048/lib/nvidia/hpc_sdk23.3/modulefiles/nvhpc/23.3 +module list + +# Need GPUs to build the code appropriately +# So compile inside a batch job, wherein GPUs are visible +if [ ! -d "../build" ] +then + cd ../ + rm -rf build + mkdir build && cd build + cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=CUDA .. + cmake --build . -j 8 + cd ../wk/ +fi + +echo "HOME: ${HOME}" +echo "PATH: ${PATH}" +echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" + +export UCX_MEMTYPE_CACHE=n +export UCX_IB_GPU_DIRECT_RDMA=no +export UCX_RNDV_FRAG_MEM_TYPE=cuda +export OMPI_MCA_plm_rsh_agent=/bin/pjrsh + +mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 8 \ + ./wrapper.sh ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 16 --nx 1536 --ny 1536 --nz 96 --nbiter 1000 --freq_diag 0 --use_time_stamps 1 +mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 8 \ + ./wrapper.sh ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 16 --nx 1536 --ny 1536 --nz 96 --nbiter 1000 --freq_diag 0 --use_time_stamps 1 --is_async 1 diff --git a/wk/sub_executors_heat3d_mpi_time_stamps_A100_32GPUs.sh b/wk/sub_executors_heat3d_mpi_time_stamps_A100_32GPUs.sh new file mode 100644 index 0000000..fb989aa --- /dev/null +++ b/wk/sub_executors_heat3d_mpi_time_stamps_A100_32GPUs.sh @@ -0,0 +1,46 @@ +#!/bin/bash +#PJM -L "node=4" +#PJM -L "rscgrp=regular-a" +#PJM -L "elapse=10:00" +#PJM -s +#PJM -g jh220031a +#PJM --mpi proc=32 + +. /etc/profile.d/modules.sh # Initialize module command + +module purge + +# Load spack +export HOME=/work/jh220031a/i18048 +. $HOME/spack/share/spack/setup-env.sh + +spack load gcc@11.3.0 +spack load cmake@3.24.3%gcc@8.3.1 +module load /work/04/jh220031a/i18048/lib/nvidia/hpc_sdk23.3/modulefiles/nvhpc/23.3 +module list + +# Need GPUs to build the code appropriately +# So compile inside a batch job, wherein GPUs are visible +if [ ! -d "../build" ] +then + cd ../ + rm -rf build + mkdir build && cd build + cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=CUDA .. + cmake --build . -j 8 + cd ../wk/ +fi + +echo "HOME: ${HOME}" +echo "PATH: ${PATH}" +echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}" + +export UCX_MEMTYPE_CACHE=n +export UCX_IB_GPU_DIRECT_RDMA=no +export UCX_RNDV_FRAG_MEM_TYPE=cuda +export OMPI_MCA_plm_rsh_agent=/bin/pjrsh + +mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 8 \ + ./wrapper.sh ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 32 --nx 1536 --ny 1536 --nz 48 --nbiter 1000 --freq_diag 0 --use_time_stamps 1 +mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 8 \ + ./wrapper.sh ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 32 --nx 1536 --ny 1536 --nz 48 --nbiter 1000 --freq_diag 0 --use_time_stamps 1 --is_async 1 diff --git a/wk/sub_executors_heat3d_mpi_time_stamps_A100_48GPUs.sh b/wk/sub_executors_heat3d_mpi_time_stamps_A100_48GPUs.sh new file mode 100644 index 0000000..4fdd093 --- /dev/null +++ b/wk/sub_executors_heat3d_mpi_time_stamps_A100_48GPUs.sh @@ -0,0 +1,46 @@ +#!/bin/bash +#PJM -L "node=6" +#PJM -L "rscgrp=regular-a" +#PJM -L "elapse=10:00" +#PJM -s +#PJM -g jh220031a +#PJM --mpi proc=48 + +. 
/etc/profile.d/modules.sh # Initialize module command
+
+module purge
+
+# Load spack
+export HOME=/work/jh220031a/i18048
+. $HOME/spack/share/spack/setup-env.sh
+
+spack load gcc@11.3.0
+spack load cmake@3.24.3%gcc@8.3.1
+module load /work/04/jh220031a/i18048/lib/nvidia/hpc_sdk23.3/modulefiles/nvhpc/23.3
+module list
+
+# Need GPUs to build the code appropriately
+# So compile inside a batch job, wherein GPUs are visible
+if [ ! -d "../build" ]
+then
+  cd ../
+  rm -rf build
+  mkdir build && cd build
+  cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=CUDA ..
+  cmake --build . -j 8
+  cd ../wk/
+fi
+
+echo "HOME: ${HOME}"
+echo "PATH: ${PATH}"
+echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}"
+
+export UCX_MEMTYPE_CACHE=n
+export UCX_IB_GPU_DIRECT_RDMA=no
+export UCX_RNDV_FRAG_MEM_TYPE=cuda
+export OMPI_MCA_plm_rsh_agent=/bin/pjrsh
+
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 8 \
+  ./wrapper.sh ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 48 --nx 1536 --ny 1536 --nz 32 --nbiter 1000 --freq_diag 0 --use_time_stamps 1
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 8 \
+  ./wrapper.sh ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 48 --nx 1536 --ny 1536 --nz 32 --nbiter 1000 --freq_diag 0 --use_time_stamps 1 --is_async 1
diff --git a/wk/sub_executors_heat3d_mpi_time_stamps_A100_64GPUs.sh b/wk/sub_executors_heat3d_mpi_time_stamps_A100_64GPUs.sh
new file mode 100644
index 0000000..df6fa4f
--- /dev/null
+++ b/wk/sub_executors_heat3d_mpi_time_stamps_A100_64GPUs.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+#PJM -L "node=8"
+#PJM -L "rscgrp=regular-a"
+#PJM -L "elapse=10:00"
+#PJM -s
+#PJM -g jh220031a
+#PJM --mpi proc=64
+
+. /etc/profile.d/modules.sh # Initialize module command
+
+module purge
+
+# Load spack
+export HOME=/work/jh220031a/i18048
+. $HOME/spack/share/spack/setup-env.sh
+
+spack load gcc@11.3.0
+spack load cmake@3.24.3%gcc@8.3.1
+module load /work/04/jh220031a/i18048/lib/nvidia/hpc_sdk23.3/modulefiles/nvhpc/23.3
+module list
+
+# Need GPUs to build the code appropriately
+# So compile inside a batch job, wherein GPUs are visible
+if [ ! -d "../build" ]
+then
+  cd ../
+  rm -rf build
+  mkdir build && cd build
+  cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=CUDA ..
+  cmake --build . -j 8
+  cd ../wk/
+fi
+
+echo "HOME: ${HOME}"
+echo "PATH: ${PATH}"
+echo "LD_LIBRARY_PATH: ${LD_LIBRARY_PATH}"
+
+export UCX_MEMTYPE_CACHE=n
+export UCX_IB_GPU_DIRECT_RDMA=no
+export UCX_RNDV_FRAG_MEM_TYPE=cuda
+export OMPI_MCA_plm_rsh_agent=/bin/pjrsh
+
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 8 \
+  ./wrapper.sh ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 64 --nx 1536 --ny 1536 --nz 24 --nbiter 1000 --freq_diag 0 --use_time_stamps 1
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 8 \
+  ./wrapper.sh ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 64 --nx 1536 --ny 1536 --nz 24 --nbiter 1000 --freq_diag 0 --use_time_stamps 1 --is_async 1
diff --git a/wk/sub_executors_heat3d_mpi_time_stamps_Icelake.sh b/wk/sub_executors_heat3d_mpi_time_stamps_Icelake.sh
index cdcc2cf..f1b33e6 100644
--- a/wk/sub_executors_heat3d_mpi_time_stamps_Icelake.sh
+++ b/wk/sub_executors_heat3d_mpi_time_stamps_Icelake.sh
@@ -14,10 +14,8 @@ module purge
 export HOME=/work/jh220031a/i18048
 . $HOME/spack/share/spack/setup-env.sh
 
-spack load gcc@11.3.0
-spack load cmake@3.24.3%gcc@8.3.1
-module load /work/04/jh220031a/i18048/lib/nvidia/hpc_sdk23.3/modulefiles/nvhpc/23.3
-module list
+module load nvidia/23.3 cmake/3.24.0 nvmpi/23.3
+export NVLOCALRC=/work/opt/local/x86_64/cores/nvidia/23.3/Linux_x86_64/23.3/compilers/bin/localrc_gcc12.2.0
 
 # Need GPUs to build the code appropriately
 # So compile inside a batch job, wherein GPUs are visible
@@ -26,7 +24,7 @@ then
   cd ../
   rm -rf build
   mkdir build && cd build
-  cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=OPENMP ..
+  cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=OPENMP -DCMAKE_CXX_FLAGS="-std=c++20" ..
   cmake --build . -j 8
   cd ../wk/
 fi
@@ -34,18 +32,8 @@ fi
 
 export UCX_MEMTYPE_CACHE=n
 export UCX_IB_GPU_DIRECT_RDMA=no
 export OMP_NUM_THREADS=36
-export OMP_PROC_BIND=true
-mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC \
-  ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 2 --nx 512 --ny 512 --nz 256 --nbiter 100 --freq_diag 0 --use_time_stamps 1
-mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC \
-  ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 2 --nx 512 --ny 512 --nz 256 --nbiter 100 --freq_diag 0 --use_time_stamps 1 --is_async 1
-##mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC --map-by ppr:1:socket:PE=${OMP_NUM_THREADS} \
-## ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 2 --nx 512 --ny 512 --nz 256 --nbiter 100 --freq_diag 0 --use_time_stamps 1
-##mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC --map-by ppr:1:socket:PE=${OMP_NUM_THREADS} \
-## ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 2 --nx 512 --ny 512 --nz 256 --nbiter 100 --freq_diag 0 --use_time_stamps 1 --is_async 1
-
-#mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 4 \
-# ./wrapper.sh ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 4 --nx 1024 --ny 1024 --nz 256 --nbiter 1000 --freq_diag 0 --use_time_stamps 1
-#mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 4 \
-# ./wrapper.sh ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 4 --nx 1024 --ny 1024 --nz 256 --nbiter 1000 --freq_diag 0 --use_time_stamps 1 --is_async 1
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC --bind-to none \
+  ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 2 --nx 512 --ny 512 --nz 256 --nbiter 1000 --freq_diag 0 --use_time_stamps 1
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC --bind-to none \
+  ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 2 --nx 512 --ny 512 --nz 256 --nbiter 1000 --freq_diag 0 --use_time_stamps 1 --is_async 1
diff --git a/wk/sub_executors_heat3d_mpi_time_stamps_Icelake_12CPUs.sh b/wk/sub_executors_heat3d_mpi_time_stamps_Icelake_12CPUs.sh
new file mode 100644
index 0000000..35a046c
--- /dev/null
+++ b/wk/sub_executors_heat3d_mpi_time_stamps_Icelake_12CPUs.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+#PJM -L "node=6"
+#PJM -L "rscgrp=regular-a"
+#PJM -L "elapse=60:00"
+#PJM -s
+#PJM -g jh220031a
+#PJM --mpi proc=12
+
+. /etc/profile.d/modules.sh # Initialize module command
+
+module purge
+
+# Load spack
+export HOME=/work/jh220031a/i18048
+. $HOME/spack/share/spack/setup-env.sh
+
+module load nvidia/23.3 cmake/3.24.0 nvmpi/23.3
+export NVLOCALRC=/work/opt/local/x86_64/cores/nvidia/23.3/Linux_x86_64/23.3/compilers/bin/localrc_gcc12.2.0
+
+# Need GPUs to build the code appropriately
+# So compile inside a batch job, wherein GPUs are visible
+if [ ! -d "../build" ]
+then
+  cd ../
+  rm -rf build
+  mkdir build && cd build
+  cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=OPENMP -DCMAKE_CXX_FLAGS="-std=c++20" ..
+  cmake --build . -j 8
+  cd ../wk/
+fi
+
+export UCX_MEMTYPE_CACHE=n
+export UCX_IB_GPU_DIRECT_RDMA=no
+export OMPI_MCA_plm_rsh_agent=/bin/pjrsh
+export OMP_NUM_THREADS=36
+
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC --bind-to none \
+  ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 12 --nx 1536 --ny 1536 --nz 128 --nbiter 100 --freq_diag 0 --use_time_stamps 1
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC --bind-to none \
+  ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 12 --nx 1536 --ny 1536 --nz 128 --nbiter 100 --freq_diag 0 --use_time_stamps 1 --is_async 1
diff --git a/wk/sub_executors_heat3d_mpi_time_stamps_Icelake_16CPUs.sh b/wk/sub_executors_heat3d_mpi_time_stamps_Icelake_16CPUs.sh
new file mode 100644
index 0000000..a720b3e
--- /dev/null
+++ b/wk/sub_executors_heat3d_mpi_time_stamps_Icelake_16CPUs.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+#PJM -L "node=8"
+#PJM -L "rscgrp=regular-a"
+#PJM -L "elapse=60:00"
+#PJM -s
+#PJM -g jh220031a
+#PJM --mpi proc=16
+
+. /etc/profile.d/modules.sh # Initialize module command
+
+module purge
+
+# Load spack
+export HOME=/work/jh220031a/i18048
+. $HOME/spack/share/spack/setup-env.sh
+
+module load nvidia/23.3 cmake/3.24.0 nvmpi/23.3
+export NVLOCALRC=/work/opt/local/x86_64/cores/nvidia/23.3/Linux_x86_64/23.3/compilers/bin/localrc_gcc12.2.0
+
+# Need GPUs to build the code appropriately
+# So compile inside a batch job, wherein GPUs are visible
+if [ ! -d "../build" ]
+then
+  cd ../
+  rm -rf build
+  mkdir build && cd build
+  cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=OPENMP -DCMAKE_CXX_FLAGS="-std=c++20" ..
+  cmake --build . -j 8
+  cd ../wk/
+fi
+
+export UCX_MEMTYPE_CACHE=n
+export UCX_IB_GPU_DIRECT_RDMA=no
+export OMPI_MCA_plm_rsh_agent=/bin/pjrsh
+export OMP_NUM_THREADS=36
+
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC --bind-to none \
+  ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 16 --nx 1536 --ny 1536 --nz 96 --nbiter 100 --freq_diag 0 --use_time_stamps 1
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC --bind-to none \
+  ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 16 --nx 1536 --ny 1536 --nz 96 --nbiter 100 --freq_diag 0 --use_time_stamps 1 --is_async 1
diff --git a/wk/sub_executors_heat3d_mpi_time_stamps_Icelake_4CPUs.sh b/wk/sub_executors_heat3d_mpi_time_stamps_Icelake_4CPUs.sh
new file mode 100644
index 0000000..4bdf6e2
--- /dev/null
+++ b/wk/sub_executors_heat3d_mpi_time_stamps_Icelake_4CPUs.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+#PJM -L "node=2"
+#PJM -L "rscgrp=regular-a"
+#PJM -L "elapse=60:00"
+#PJM -s
+#PJM -g jh220031a
+#PJM --mpi proc=4
+
+. /etc/profile.d/modules.sh # Initialize module command
+
+module purge
+
+# Load spack
+export HOME=/work/jh220031a/i18048
+. $HOME/spack/share/spack/setup-env.sh
+
+module load nvidia/23.3 cmake/3.24.0 nvmpi/23.3
+export NVLOCALRC=/work/opt/local/x86_64/cores/nvidia/23.3/Linux_x86_64/23.3/compilers/bin/localrc_gcc12.2.0
+
+# Need GPUs to build the code appropriately
+# So compile inside a batch job, wherein GPUs are visible
+if [ ! -d "../build" ]
+then
+  cd ../
+  rm -rf build
+  mkdir build && cd build
+  cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=OPENMP -DCMAKE_CXX_FLAGS="-std=c++20" ..
+  cmake --build . -j 8
+  cd ../wk/
+fi
+
+export UCX_MEMTYPE_CACHE=n
+export UCX_IB_GPU_DIRECT_RDMA=no
+export OMPI_MCA_plm_rsh_agent=/bin/pjrsh
+export OMP_NUM_THREADS=36
+
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC --bind-to none \
+  ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 4 --nx 1536 --ny 1536 --nz 384 --nbiter 100 --freq_diag 0 --use_time_stamps 1
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC --bind-to none \
+  ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 4 --nx 1536 --ny 1536 --nz 384 --nbiter 100 --freq_diag 0 --use_time_stamps 1 --is_async 1
diff --git a/wk/sub_executors_heat3d_mpi_time_stamps_Icelake_8CPUs.sh b/wk/sub_executors_heat3d_mpi_time_stamps_Icelake_8CPUs.sh
new file mode 100644
index 0000000..c3c30a1
--- /dev/null
+++ b/wk/sub_executors_heat3d_mpi_time_stamps_Icelake_8CPUs.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+#PJM -L "node=4"
+#PJM -L "rscgrp=regular-a"
+#PJM -L "elapse=60:00"
+#PJM -s
+#PJM -g jh220031a
+#PJM --mpi proc=8
+
+. /etc/profile.d/modules.sh # Initialize module command
+
+module purge
+
+# Load spack
+export HOME=/work/jh220031a/i18048
+. $HOME/spack/share/spack/setup-env.sh
+
+module load nvidia/23.3 cmake/3.24.0 nvmpi/23.3
+export NVLOCALRC=/work/opt/local/x86_64/cores/nvidia/23.3/Linux_x86_64/23.3/compilers/bin/localrc_gcc12.2.0
+
+# Need GPUs to build the code appropriately
+# So compile inside a batch job, wherein GPUs are visible
+if [ ! -d "../build" ]
+then
+  cd ../
+  rm -rf build
+  mkdir build && cd build
+  cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=OPENMP -DCMAKE_CXX_FLAGS="-std=c++20" ..
+  cmake --build . -j 8
+  cd ../wk/
+fi
+
+export UCX_MEMTYPE_CACHE=n
+export UCX_IB_GPU_DIRECT_RDMA=no
+export OMPI_MCA_plm_rsh_agent=/bin/pjrsh
+export OMP_NUM_THREADS=36
+
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC --bind-to none \
+  ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 8 --nx 1536 --ny 1536 --nz 192 --nbiter 100 --freq_diag 0 --use_time_stamps 1
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC --bind-to none \
+  ../build/mini-apps/heat3d-mpi/executors/heat3d-mpi-executors --px 1 --py 1 --pz 8 --nx 1536 --ny 1536 --nz 192 --nbiter 100 --freq_diag 0 --use_time_stamps 1 --is_async 1
diff --git a/wk/sub_executors_lbm2d_letkf_time_stamps_A100_16GPUs.sh b/wk/sub_executors_lbm2d_letkf_time_stamps_A100_16GPUs.sh
new file mode 100644
index 0000000..c96235c
--- /dev/null
+++ b/wk/sub_executors_lbm2d_letkf_time_stamps_A100_16GPUs.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+#PJM -L "node=2"
+#PJM -L "rscgrp=regular-a"
+#PJM -L "elapse=60:00"
+#PJM -s
+#PJM -g jh220031a
+#PJM --mpi proc=16
+
+. /etc/profile.d/modules.sh # Initialize module command
+
+module purge
+
+# Load spack
+export HOME=/work/jh220031a/i18048
+. $HOME/spack/share/spack/setup-env.sh
+
+spack load gcc@11.3.0
+spack load cmake@3.24.3%gcc@8.3.1
+module load /work/04/jh220031a/i18048/lib/nvidia/hpc_sdk23.3/modulefiles/nvhpc/23.3
+module list
+
+# Need GPUs to build the code appropriately
+# So compile inside a batch job, wherein GPUs are visible
+if [ ! -d "../build" ]
+then
+  cd ../
+  rm -rf build
+  mkdir build && cd build
+  cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=CUDA ..
+  cmake --build . -j 8
+  cd ../wk/
+fi
+
+export UCX_MEMTYPE_CACHE=n
+export UCX_IB_GPU_DIRECT_RDMA=no
+export UCX_RNDV_FRAG_MEM_TYPE=cuda
+export OMPI_MCA_plm_rsh_agent=/bin/pjrsh
+
+mpiexec -machinefile $PJM_O_NODEINF -np 1 -npernode 1 \
+  ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename nature_768.json
+
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 8 \
+  ./wrapper.sh ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf_768.json
+
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 8 \
+  ./wrapper.sh ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf_768_time.json
+
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 8 \
+  ./wrapper.sh ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf_async_768_time.json
diff --git a/wk/sub_executors_lbm2d_letkf_time_stamps_A100_24GPUs.sh b/wk/sub_executors_lbm2d_letkf_time_stamps_A100_24GPUs.sh
new file mode 100644
index 0000000..1887eeb
--- /dev/null
+++ b/wk/sub_executors_lbm2d_letkf_time_stamps_A100_24GPUs.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+#PJM -L "node=3"
+#PJM -L "rscgrp=regular-a"
+#PJM -L "elapse=60:00"
+#PJM -s
+#PJM -g jh220031a
+#PJM --mpi proc=24
+
+. /etc/profile.d/modules.sh # Initialize module command
+
+module purge
+
+# Load spack
+export HOME=/work/jh220031a/i18048
+. $HOME/spack/share/spack/setup-env.sh
+
+spack load gcc@11.3.0
+spack load cmake@3.24.3%gcc@8.3.1
+module load /work/04/jh220031a/i18048/lib/nvidia/hpc_sdk23.3/modulefiles/nvhpc/23.3
+module list
+
+# Need GPUs to build the code appropriately
+# So compile inside a batch job, wherein GPUs are visible
+if [ ! -d "../build" ]
+then
+  cd ../
+  rm -rf build
+  mkdir build && cd build
+  cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=CUDA ..
+  cmake --build . -j 8
+  cd ../wk/
+fi
+
+export UCX_MEMTYPE_CACHE=n
+export UCX_IB_GPU_DIRECT_RDMA=no
+export UCX_RNDV_FRAG_MEM_TYPE=cuda
+export OMPI_MCA_plm_rsh_agent=/bin/pjrsh
+
+#mpiexec -machinefile $PJM_O_NODEINF -np 1 -npernode 1 \
+# ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename nature_768.json
+
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 8 \
+  ./wrapper.sh ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf_768.json
+
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 8 \
+  ./wrapper.sh ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf_768_time.json
+
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 8 \
+  ./wrapper.sh ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf_async_768_time.json
diff --git a/wk/sub_executors_lbm2d_letkf_time_stamps_A100_32GPUs.sh b/wk/sub_executors_lbm2d_letkf_time_stamps_A100_32GPUs.sh
new file mode 100644
index 0000000..8a2dd82
--- /dev/null
+++ b/wk/sub_executors_lbm2d_letkf_time_stamps_A100_32GPUs.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+#PJM -L "node=4"
+#PJM -L "rscgrp=regular-a"
+#PJM -L "elapse=60:00"
+#PJM -s
+#PJM -g jh220031a
+#PJM --mpi proc=32
+
+. /etc/profile.d/modules.sh # Initialize module command
+
+module purge
+
+# Load spack
+export HOME=/work/jh220031a/i18048
+. $HOME/spack/share/spack/setup-env.sh
+
+spack load gcc@11.3.0
+spack load cmake@3.24.3%gcc@8.3.1
+module load /work/04/jh220031a/i18048/lib/nvidia/hpc_sdk23.3/modulefiles/nvhpc/23.3
+module list
+
+# Need GPUs to build the code appropriately
+# So compile inside a batch job, wherein GPUs are visible
+if [ ! -d "../build" ]
+then
+  cd ../
+  rm -rf build
+  mkdir build && cd build
+  cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=CUDA ..
+  cmake --build . -j 8
+  cd ../wk/
+fi
+
+export UCX_MEMTYPE_CACHE=n
+export UCX_IB_GPU_DIRECT_RDMA=no
+export UCX_RNDV_FRAG_MEM_TYPE=cuda
+export OMPI_MCA_plm_rsh_agent=/bin/pjrsh
+
+#mpiexec -machinefile $PJM_O_NODEINF -np 1 -npernode 1 \
+# ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename nature_768.json
+
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 8 \
+  ./wrapper.sh ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf_768.json
+
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 8 \
+  ./wrapper.sh ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf_768_time.json
+
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 8 \
+  ./wrapper.sh ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf_async_768_time.json
diff --git a/wk/sub_executors_lbm2d_letkf_time_stamps_A100_8GPUs.sh b/wk/sub_executors_lbm2d_letkf_time_stamps_A100_8GPUs.sh
new file mode 100644
index 0000000..5671585
--- /dev/null
+++ b/wk/sub_executors_lbm2d_letkf_time_stamps_A100_8GPUs.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+#PJM -L "node=1"
+#PJM -L "rscgrp=regular-a"
+#PJM -L "elapse=60:00"
+#PJM -s
+#PJM -g jh220031a
+#PJM --mpi proc=8
+
+. /etc/profile.d/modules.sh # Initialize module command
+
+module purge
+
+# Load spack
+export HOME=/work/jh220031a/i18048
+. $HOME/spack/share/spack/setup-env.sh
+
+spack load gcc@11.3.0
+spack load cmake@3.24.3%gcc@8.3.1
+module load /work/04/jh220031a/i18048/lib/nvidia/hpc_sdk23.3/modulefiles/nvhpc/23.3
+module list
+
+# Need GPUs to build the code appropriately
+# So compile inside a batch job, wherein GPUs are visible
+if [ ! -d "../build" ]
+then
+  cd ../
+  rm -rf build
+  mkdir build && cd build
+  cmake -DCMAKE_CXX_COMPILER=nvc++ -DBACKEND=CUDA ..
+  cmake --build . -j 8
+  cd ../wk/
+fi
+
+export UCX_MEMTYPE_CACHE=n
+export UCX_IB_GPU_DIRECT_RDMA=no
+export UCX_RNDV_FRAG_MEM_TYPE=cuda
+export OMPI_MCA_plm_rsh_agent=/bin/pjrsh
+
+mpiexec -machinefile $PJM_O_NODEINF -np 1 -npernode 1 \
+  ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename nature_768.json
+
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 8 \
+  ./wrapper.sh ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf_768.json
+
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 8 \
+  ./wrapper.sh ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf_768_time.json
+
+mpiexec -machinefile $PJM_O_NODEINF -np $PJM_MPI_PROC -npernode 8 \
+  ./wrapper.sh ../build/mini-apps/lbm2d-letkf/executors/lbm2d-letkf-executors --filename letkf_async_768_time.json
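
The GPU job scripts above launch the executables through ./wrapper.sh, which is referenced but not included in this diff. A minimal sketch of such a per-rank GPU-binding wrapper is shown below, assuming Open MPI exposes OMPI_COMM_WORLD_LOCAL_RANK and that one of the node's 8 A100s is assigned per local rank; the actual wrapper.sh in the repository may differ.

#!/bin/bash
# Hypothetical wrapper: pin each local MPI rank to one GPU on the node,
# then exec the real command line passed through by mpiexec.
export CUDA_VISIBLE_DEVICES=$((OMPI_COMM_WORLD_LOCAL_RANK % 8))
exec "$@"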
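For the heat3d-mpi runs that use --nx 1536 --ny 1536, the scripts keep the global grid at 1536 x 1536 x 1536 and split it along z only, so each launch sets --nz to 1536 / pz (pz 48 -> nz 32, pz 64 -> 24, pz 4 -> 384, pz 8 -> 192, pz 12 -> 128, pz 16 -> 96). The helper below is a sketch that makes this arithmetic explicit; it is not part of the repository and assumes the same 1D z-decomposition of a 1536^3 box.

#!/bin/bash
# Usage: ./heat3d_args.sh <pz>   (hypothetical helper, prints the matching CLI arguments)
N=1536                          # global extent in each direction (assumption)
PZ=$1
if [ $((N % PZ)) -ne 0 ]; then
  echo "error: pz=$PZ does not divide $N" >&2
  exit 1
fi
echo "--px 1 --py 1 --pz $PZ --nx $N --ny $N --nz $((N / PZ))"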