Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
Merge branch 'staging/ctk_12.0' into 2.0.X
Browse files Browse the repository at this point in the history
  • Loading branch information
alliepiper committed Dec 13, 2022
2 parents 243423e + f31d712 commit 9be5b3b
Show file tree
Hide file tree
Showing 19 changed files with 75 additions and 37 deletions.
2 changes: 1 addition & 1 deletion cmake/CubCudaConfig.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ enable_language(CUDA)
# Architecture options:
#

set(all_archs 35 37 50 52 53 60 61 62 70 72 75 80 86)
set(all_archs 35 37 50 52 53 60 61 62 70 72 75 80 86 90)
set(arch_message "CUB: Explicitly enabled compute architectures:")

# Thrust sets up the architecture flags in CMAKE_CUDA_FLAGS already. Just
Expand Down
9 changes: 9 additions & 0 deletions cub/detail/detect_cuda_runtime.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,15 @@ namespace detail

#endif // CUB_RUNTIME_FUNCTION predefined

#ifdef CUB_RDC_ENABLED
// Detect available version of CDP:
#if __CUDACC_VER_MAJOR__ < 12 || defined(CUDA_FORCE_CDP1_IF_SUPPORTED)
#define CUB_DETAIL_CDPv1
#else
#define CUB_DETAIL_CDPv2
#endif
#endif

#endif // Do not document

} // namespace detail
Expand Down
18 changes: 13 additions & 5 deletions cub/detail/device_synchronize.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -37,18 +37,26 @@ namespace detail
CUB_EXEC_CHECK_DISABLE
CUB_RUNTIME_FUNCTION inline cudaError_t device_synchronize()
{
cudaError_t result = cudaErrorUnknown;
cudaError_t result = cudaErrorNotSupported;

#if defined(__CUDACC__) && \
((__CUDACC_VER_MAJOR__ > 11) || \
((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 6)))
// Device-side sync is only available under CDPv1:
#if defined(CUB_DETAIL_CDPv1)

#if ((__CUDACC_VER_MAJOR__ > 11) || \
((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 6)))
// CUDA >= 11.6
#define CUB_TMP_DEVICE_SYNC_IMPL \
result = __cudaDeviceSynchronizeDeprecationAvoidance();
#else // CUDA < 11.6
#else // CUDA < 11.6:
#define CUB_TMP_DEVICE_SYNC_IMPL result = cudaDeviceSynchronize();
#endif

#else // CDPv2 or no CDP:

#define CUB_TMP_DEVICE_SYNC_IMPL /* unavailable */

#endif // CDP version

NV_IF_TARGET(NV_IS_HOST,
(result = cudaDeviceSynchronize();),
(CUB_TMP_DEVICE_SYNC_IMPL));
Expand Down
47 changes: 34 additions & 13 deletions cub/util_device.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -502,7 +502,7 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int &sm_version,
*/
CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream(cudaStream_t stream)
{
cudaError_t result = cudaErrorUnknown;
cudaError_t result = cudaErrorNotSupported;

NV_IF_TARGET(NV_IS_HOST,
(result = CubDebug(cudaStreamSynchronize(stream));),
Expand Down Expand Up @@ -532,21 +532,42 @@ namespace detail
CUB_RUNTIME_FUNCTION inline cudaError_t DebugSyncStream(cudaStream_t stream)
{
#ifndef CUB_DETAIL_DEBUG_ENABLE_SYNC
(void)stream;

return cudaSuccess;
#else
#if 1 // All valid targets currently support device-side synchronization
_CubLog("%s\n", "Synchronizing...");
return SyncStream(stream);
#else
(void)stream;
_CubLog("%s\n",
"WARNING: Skipping CUB `debug_synchronous` synchronization "
"(unsupported target).");
return cudaSuccess;
#endif
#endif

#else // CUB_DETAIL_DEBUG_ENABLE_SYNC:

#define CUB_TMP_SYNC_AVAILABLE \
_CubLog("%s\n", "Synchronizing..."); \
return SyncStream(stream)

#define CUB_TMP_DEVICE_SYNC_UNAVAILABLE \
(void)stream; \
_CubLog("WARNING: Skipping CUB `debug_synchronous` synchronization (%s).\n", \
"device-side sync requires <sm_90, RDC, and CDPv1"); \
return cudaSuccess

#ifdef CUB_DETAIL_CDPv1

// Can sync everywhere but SM_90+
NV_IF_TARGET(NV_PROVIDES_SM_90,
(CUB_TMP_DEVICE_SYNC_UNAVAILABLE;),
(CUB_TMP_SYNC_AVAILABLE;));

#else // CDPv2 or no CDP:

// Can only sync on host
NV_IF_TARGET(NV_IS_HOST,
(CUB_TMP_SYNC_AVAILABLE;),
(CUB_TMP_DEVICE_SYNC_UNAVAILABLE;));

#endif // CDP version

#undef CUB_TMP_DEVICE_SYNC_UNAVAILABLE
#undef CUB_TMP_SYNC_AVAILABLE

#endif // CUB_DETAIL_DEBUG_ENABLE_SYNC
}

/** \brief Gets whether the current device supports unified addressing */
Expand Down
2 changes: 1 addition & 1 deletion cub/version.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
* <tt>CUB_VERSION / 100 % 1000</tt> is the minor version.
* <tt>CUB_VERSION / 100000</tt> is the major version.
*/
#define CUB_VERSION 200000
#define CUB_VERSION 200001

/*! \def CUB_MAJOR_VERSION
* \brief The preprocessor macro \p CUB_MAJOR_VERSION encodes the
Expand Down
2 changes: 1 addition & 1 deletion test/test_block_load_store.cu
Original file line number Diff line number Diff line change
Expand Up @@ -517,7 +517,7 @@ int main(int argc, char** argv)

// Compile/run thorough tests
#if TEST_VALUE_TYPES == 0
TestThreads<char>(2, 0.8f);
TestThreads<signed char>(2, 0.8f);
TestThreads<int>(2, 0.8f);
TestThreads<long>(2, 0.8f);
#elif TEST_VALUE_TYPES == 1
Expand Down
4 changes: 2 additions & 2 deletions test/test_block_radix_sort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -531,7 +531,7 @@ template <
void TestKeysAndPairs()
{
// Test pairs sorting with only 4-byte configs
Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, char>(); // With small-values
Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, signed char>(); // With small-values
Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, Key>(); // With same-values
Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, TestFoo>(); // With large values
}
Expand Down Expand Up @@ -560,7 +560,7 @@ void Test()

#elif TEST_VALUE_TYPES == 2
// Test signed and fp types with paired values
TestKeysAndPairs<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, char>();
TestKeysAndPairs<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, signed char>();
TestKeysAndPairs<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, short>();
TestKeysAndPairs<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, int>();
#elif TEST_VALUE_TYPES == 3
Expand Down
2 changes: 1 addition & 1 deletion test/test_block_reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -734,7 +734,7 @@ int main(int argc, char** argv)

// primitives
#if TEST_VALUE_TYPES == 0
Test<char>();
Test<signed char>();
Test<short>();
Test<int>();
Test<long long>();
Expand Down
2 changes: 1 addition & 1 deletion test/test_device_radix_sort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1946,7 +1946,7 @@ int main(int argc, char** argv)
// Compile/run thorough tests
#if TEST_KEY_BYTES == 1

TestGen<char, true> (num_items, num_segments);
TestGen<signed char, true> (num_items, num_segments);

#ifdef TEST_EXTENDED_KEY_TYPES
TestGen<bool, false> (num_items, num_segments);
Expand Down
4 changes: 2 additions & 2 deletions test/test_device_reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1473,9 +1473,9 @@ int main(int argc, char** argv)
// %PARAM% TEST_TYPES types 0:1:2:3

#if TEST_TYPES == 0
TestType<char, char>(max_items, max_segments);
TestType<signed char, signed char>(max_items, max_segments);
TestType<unsigned char, unsigned char>(max_items, max_segments);
TestType<char, int>(max_items, max_segments);
TestType<signed char, int>(max_items, max_segments);
#elif TEST_TYPES == 1
TestType<short, short>(max_items, max_segments);
TestType<int, int>(max_items, max_segments);
Expand Down
4 changes: 2 additions & 2 deletions test/test_device_reduce_by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -719,7 +719,7 @@ int main(int argc, char** argv)
// %PARAM% TEST_CDP cdp 0:1

// Test different input types
TestOp<int, char>(num_items);
TestOp<int, signed char>(num_items);
TestOp<int, short>(num_items);
TestOp<int, int>(num_items);
TestOp<int, long>(num_items);
Expand All @@ -735,7 +735,7 @@ int main(int argc, char** argv)
TestOp<int, TestFoo>(num_items);
TestOp<int, TestBar>(num_items);

TestOp<char, int>(num_items);
TestOp<signed char, int>(num_items);
TestOp<long long, int>(num_items);
TestOp<TestFoo, int>(num_items);
TestOp<TestBar, int>(num_items);
Expand Down
2 changes: 1 addition & 1 deletion test/test_device_run_length_encode.cu
Original file line number Diff line number Diff line change
Expand Up @@ -749,7 +749,7 @@ int main(int argc, char** argv)
// %PARAM% TEST_CDP cdp 0:1

// Test different input types
TestSize<char, int, int>(num_items);
TestSize<signed char, int, int>(num_items);
TestSize<short, int, int>(num_items);
TestSize<int, int, int>(num_items);
TestSize<long, int, int>(num_items);
Expand Down
2 changes: 1 addition & 1 deletion test/test_device_scan.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1221,7 +1221,7 @@ int main(int argc, char** argv)

// Test same input+output data types
TestSize<unsigned char>(num_items, (unsigned char)0, (unsigned char)99);
TestSize<char>(num_items, (char)0, (char)99);
TestSize<signed char>(num_items, (char)0, (char)99);
TestSize<unsigned short>(num_items, (unsigned short)0, (unsigned short)99);
TestSize<unsigned int>(num_items, (unsigned int)0, (unsigned int)99);
TestSize<unsigned long long>(num_items,
Expand Down
2 changes: 1 addition & 1 deletion test/test_device_scan_by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1036,7 +1036,7 @@ int main(int argc, char** argv)

// Test same input+output data types
TestSize<unsigned char>(num_items, (unsigned char)0, (unsigned char)99);
TestSize<char>(num_items, (char)0, (char)99);
TestSize<signed char>(num_items, (char)0, (char)99);

#elif TEST_VALUE_TYPES == 1

Expand Down
2 changes: 1 addition & 1 deletion test/test_device_spmv.cu
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,7 @@ void test_types()
{
test_type<float>();
test_type<double>();
test_type<char>();
test_type<signed char>();
test_type<int>();
test_type<unsigned long long>();
}
Expand Down
2 changes: 1 addition & 1 deletion test/test_iterator.cu
Original file line number Diff line number Diff line change
Expand Up @@ -500,7 +500,7 @@ int main(int argc, char** argv)
CubDebugExit(PtxVersion(ptx_version));

// Evaluate different data types
Test<char>();
Test<signed char>();
Test<short>();
Test<int>();
Test<long>();
Expand Down
2 changes: 1 addition & 1 deletion test/test_iterator_deprecated.cu
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ int main(int argc, char** argv)
CubDebugExit(args.DeviceInit());

// Evaluate different data types
Test<char>();
Test<signed char>();
Test<short>();
Test<int>();
Test<long>();
Expand Down
2 changes: 1 addition & 1 deletion test/test_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -1134,7 +1134,7 @@ std::ostream& operator<<(std::ostream& os, const CUB_NS_QUALIFIER::KeyValuePair<
/**
* Define for types
*/
CUB_VEC_OVERLOAD(char, char)
CUB_VEC_OVERLOAD(char, signed char)
CUB_VEC_OVERLOAD(short, short)
CUB_VEC_OVERLOAD(int, int)
CUB_VEC_OVERLOAD(long, long)
Expand Down
2 changes: 1 addition & 1 deletion test/test_warp_reduce.cu
Original file line number Diff line number Diff line change
Expand Up @@ -721,7 +721,7 @@ template <
void Test(GenMode gen_mode)
{
// primitive
Test<WARPS, LOGICAL_WARP_THREADS, char>( gen_mode, Sum());
Test<WARPS, LOGICAL_WARP_THREADS, signed char>( gen_mode, Sum());
Test<WARPS, LOGICAL_WARP_THREADS, short>( gen_mode, Sum());
Test<WARPS, LOGICAL_WARP_THREADS, int>( gen_mode, Sum());
Test<WARPS, LOGICAL_WARP_THREADS, long long>( gen_mode, Sum());
Expand Down

0 comments on commit 9be5b3b

Please sign in to comment.