Skip to content

Commit

Permalink
Fix may_alias::yes in partition tunings, offset::size selection and p…
Browse files Browse the repository at this point in the history
…ass template parameter to Nominal4BItemsToItems call
  • Loading branch information
gonidelis authored and bernhardmgruber committed Jan 31, 2025
1 parent f7b4b1e commit f2ecc5c
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 35 deletions.
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/dispatch_select_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ template <typename InputIteratorT,
typename PolicyHub =
detail::select::policy_hub<detail::value_t<InputIteratorT>,
detail::value_t<FlagsInputIteratorT>,
detail::select::per_partition_offset_t,
OffsetT,
detail::select::is_partition_distinct_output_t<SelectedOutputIteratorT>::value,
MayAlias,
KeepRejects>>
Expand Down
87 changes: 53 additions & 34 deletions cub/cub/device/dispatch/tuning/tuning_select_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -876,7 +876,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_1,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_0.ld_0.ipt_15.tpb_608.ns_676.dcid_7.l2w_500 1.171303 1.042818 1.175890 1.456731
Expand All @@ -894,7 +894,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_2,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_0.ld_0.ipt_22.tpb_320.ns_1756.dcid_6.l2w_615 1.206387 1.079118 1.202408 1.307692
Expand All @@ -912,7 +912,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_4,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_1.ld_0.ipt_19.tpb_320.ns_716.dcid_5.l2w_570 1.177521 1.123348 1.177703 1.307692
Expand All @@ -930,7 +930,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_8,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_1.ld_0.ipt_20.tpb_416.ns_1672.dcid_7.l2w_1050 1.086221 0.977775 1.090731 1.257618
Expand All @@ -948,7 +948,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_1,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_0.ld_0.ipt_22.tpb_576.ns_368.dcid_7.l2w_680 1.191750 0.990521 1.175654 1.433174
Expand All @@ -966,7 +966,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_2,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_1.ld_0.ipt_20.tpb_608.ns_516.dcid_7.l2w_635 1.244961 0.848558 1.212567 1.461538
Expand All @@ -984,7 +984,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_4,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_1.ld_0.ipt_18.tpb_608.ns_1712.dcid_5.l2w_825 1.255078 0.990588 1.231055 1.421176
Expand All @@ -1002,7 +1002,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_8,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_1.ld_0.ipt_14.tpb_512.ns_1468.dcid_7.l2w_820 1.111830 1.011070 1.119481 1.245868
Expand All @@ -1020,7 +1020,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_1,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_0.ld_0.ipt_22.tpb_224.ns_68.dcid_2.l2w_990 1.151989 1.064433 1.146707 1.305288
Expand All @@ -1038,7 +1038,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_2,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_0.ld_0.ipt_22.tpb_320.ns_560.dcid_5.l2w_640 1.205538 1.080520 1.201709 1.307692
Expand All @@ -1056,7 +1056,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_4,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_1.ld_0.ipt_19.tpb_608.ns_724.dcid_5.l2w_970 1.196592 0.982227 1.177984 1.310843
Expand All @@ -1074,7 +1074,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_8,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_1.ld_0.ipt_23.tpb_416.ns_1608.dcid_2.l2w_560 1.099752 0.977393 1.106477 1.259336
Expand All @@ -1092,7 +1092,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_1,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_0.ld_0.ipt_20.tpb_608.ns_1016.dcid_6.l2w_545 1.239144 1.002404 1.225460 1.444711
Expand All @@ -1110,7 +1110,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_2,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_1.ld_0.ipt_22.tpb_288.ns_124.dcid_2.l2w_690 1.202783 1.000000 1.183737 1.311755
Expand All @@ -1128,7 +1128,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_4,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_1.ld_0.ipt_19.tpb_608.ns_1884.dcid_6.l2w_950 1.250302 0.988124 1.225191 1.392931
Expand All @@ -1146,7 +1146,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_8,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_1.ld_0.ipt_23.tpb_416.ns_0.dcid_2.l2w_1200 1.156864 1.011990 1.152368 1.266667
Expand All @@ -1157,6 +1157,21 @@ struct sm100_tuning<Input,
using delay_constructor = exponential_backoff_constructor_t<0, 1200>;
};

// Explicit default for partition (flagged::no, keep_rejects::yes) on __int128_t inputs with
// 8-byte offsets, for either value of DistinctPartitions. The specialization defines no members
// of its own; everything is inherited from the sm90_tuning instantiation named in the base clause.
// Rationale from the original note: once tunings keyed on 64-bit offsets were introduced, this
// case regresses unless it is defaulted explicitly.
template <distinct_partitions DistinctPartitions>
struct sm100_tuning<__int128_t,
flagged::no,
keep_rejects::yes,
offset_size::_8,
primitive::no,
input_size::_16,
may_alias::no,
DistinctPartitions>
: sm90_tuning<__int128_t, flagged::no, keep_rejects::yes, offset_size::_4, primitive::no, input_size::_16>
// NOTE(review): the base above is keyed on offset_size::_4 although this specialization matches
// offset_size::_8. The original annotation at this spot flags the base as wrong and as a cause of
// regressions. sm90_tuning is not visible here -- confirm which instantiation is intended.
{};

// partition::flagged
template <class Input>
struct sm100_tuning<Input,
Expand All @@ -1165,7 +1180,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_1,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_0.ld_0.ipt_20.tpb_448.ns_964.dcid_7.l2w_385 1.111204 1.036205 1.111986 1.275210
Expand All @@ -1183,7 +1198,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_2,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_0.ld_0.ipt_18.tpb_256.ns_300.dcid_6.l2w_820 1.107466 0.923750 1.126995 1.346591
Expand All @@ -1201,7 +1216,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_4,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_0.ld_0.ipt_19.tpb_256.ns_1608.dcid_7.l2w_675 1.097548 0.964114 1.109189 1.283333
Expand All @@ -1219,7 +1234,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_8,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_0.ld_0.ipt_21.tpb_384.ns_300.dcid_7.l2w_580 1.239128 1.019324 1.238373 1.347458
Expand All @@ -1237,7 +1252,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_1,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_0.ld_1.ipt_20.tpb_448.ns_240.dcid_6.l2w_845 1.097180 0.990453 1.091667 1.452153
Expand All @@ -1255,7 +1270,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_2,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_0.ld_0.ipt_14.tpb_320.ns_1428.dcid_7.l2w_830 1.380164 1.133333 1.367514 1.628793
Expand All @@ -1273,7 +1288,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_4,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_0.ld_0.ipt_14.tpb_640.ns_1204.dcid_5.l2w_635 1.155209 1.000000 1.143742 1.380659
Expand All @@ -1291,7 +1306,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_8,
may_alias::yes,
may_alias::no,
distinct_partitions::yes>
{
// trp_0.ld_0.ipt_19.tpb_384.ns_1016.dcid_7.l2w_875 1.227540 1.181818 1.223936 1.261954
Expand All @@ -1309,7 +1324,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_1,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_0.ld_0.ipt_24.tpb_256.ns_2024.dcid_5.l2w_835 1.146782 1.001841 1.149438 1.439904
Expand All @@ -1327,7 +1342,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_2,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_0.ld_0.ipt_18.tpb_256.ns_1832.dcid_5.l2w_590 1.128674 0.984403 1.150806 1.355932
Expand All @@ -1345,7 +1360,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_4,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_0.ld_0.ipt_11.tpb_448.ns_476.dcid_7.l2w_665 1.173664 1.035556 1.186114 1.393153
Expand All @@ -1363,7 +1378,7 @@ struct sm100_tuning<Input,
offset_size::_4,
primitive::yes,
input_size::_8,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_0.ld_0.ipt_20.tpb_384.ns_1420.dcid_5.l2w_525 (39_new/2.db) 1.157326 1.110920 1.162458 1.259336
Expand All @@ -1381,7 +1396,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_1,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_0.ld_0.ipt_12.tpb_256.ns_0.dcid_5.l2w_850 1.150864 1.005760 1.157687 1.395833
Expand All @@ -1399,7 +1414,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_2,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_0.ld_0.ipt_12.tpb_256.ns_1552.dcid_7.l2w_730 1.374892 1.171831 1.360076 1.513390
Expand All @@ -1417,7 +1432,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_4,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_0.ld_0.ipt_14.tpb_352.ns_1444.dcid_5.l2w_655 1.183452 1.000000 1.177224 1.402083
Expand All @@ -1435,7 +1450,7 @@ struct sm100_tuning<Input,
offset_size::_8,
primitive::yes,
input_size::_8,
may_alias::yes,
may_alias::no,
distinct_partitions::no>
{
// trp_0.ld_0.ipt_11.tpb_512.ns_536.dcid_2.l2w_845 1.248969 1.184659 1.251631 1.360795
Expand Down Expand Up @@ -1563,19 +1578,23 @@ struct policy_hub
template <typename Tuning>
static auto select_agent_policy100(int)
-> AgentSelectIfPolicy<Tuning::threads,
Nominal4BItemsToItems(Tuning::nominal_4b_items),
Nominal4BItemsToItems<InputT>(Tuning::nominal_4b_items),
Tuning::load_algorithm,
Tuning::load_modifier,
BLOCK_SCAN_WARP_SCANS,
typename Tuning::delay_constructor>;
// Declaration-only overload taking `long`; its return type is Policy900::SelectIfPolicyT.
// NOTE(review): presumably the lower-ranked fallback chosen when the return type of the `int`
// overload cannot be formed for a given Tuning -- confirm against the call site.
template <typename Tuning>
static auto select_agent_policy100(long) -> typename Policy900::SelectIfPolicyT;

// KeepRejects distinguishes partition from select in the tunings: when it is true we tune for
// partition, otherwise we tune for select.
// Offset-size key passed to sm100_tuning: for partition it is classified from the actual OffsetT;
// for select it is always offset_size::_4, whatever OffsetT is.
// Despite the `_t` suffix this is a value of type offset_size, not a type alias.
static constexpr offset_size offset_t = KeepRejects ? classify_offset_size<OffsetT>() : offset_size::_4;

using SelectIfPolicyT =
decltype(select_agent_policy100<sm100_tuning<InputT,
is_flagged<FlagT>(),
are_rejects_kept<KeepRejects>(),
classify_offset_size<OffsetT>(),
offset_t,
is_primitive<InputT>(),
classify_input_size<InputT>(),
should_alias<MayAlias>(),
Expand Down

0 comments on commit f2ecc5c

Please sign in to comment.