This repository has been archived by the owner on Apr 28, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 212
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This commit starts support for strided tensors. I made changes to percolate a vector in TensorInfo down to emitCudaKernel, allowing codegen to cast strided tensors. This required changes to a unit test to expect the correct cast.
- Loading branch information
Protonu Basu
committed
Jun 8, 2018
1 parent
cc4b1eb
commit ff1ed36
Showing
8 changed files
with
260 additions
and
15 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,226 @@ | ||
Note: Google Test filter = *Strided* | ||
[==========] Running 1 test from 1 test case. | ||
[----------] Global test environment set-up. | ||
[----------] 1 test from TcCudaMapperTest | ||
[ RUN ] TcCudaMapperTest.TensorAddStrided | ||
WARNING: | ||
Reduction without initialization. If O is not pre-initialized before calling the TC function, consider using the !-suffixed reduction operator +=! instead of +=: | ||
|
||
def tensoraddstrided(float(N, M) I0_view, float(N, M) I1_view) -> (O) { | ||
O(n, m) += I0_view(n, m) + I1_view(n, m) | ||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~... <--- HERE | ||
} | ||
|
||
WARNING: | ||
Reduction without initialization. If O is not pre-initialized before calling the TC function, consider using the !-suffixed reduction operator +=! instead of +=: | ||
|
||
def tensoraddstrided(float(N, M) I0_view, float(N, M) I1_view) -> (O) { | ||
O(n, m) += I0_view(n, m) + I1_view(n, m) | ||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~... <--- HERE | ||
} | ||
|
||
I0607 13:02:54.070823 21973 cuda_tc_executor.cc:82] tc::CudaMappingOptions::makeNaiveMappingOptions() | ||
.outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident) | ||
.outerScheduleAllowSkewing(false) | ||
.outerSchedulePositiveOrthant(true) | ||
.intraTileScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident) | ||
.intraTileScheduleAllowSkewing(false) | ||
.intraTileSchedulePositiveOrthant(true) | ||
.fixParametersBeforeScheduling(false) | ||
.tile(32, 32, 32) | ||
.unroll(1) | ||
.tileImperfectlyNested(false) | ||
.matchLibraryCalls(false) | ||
.mapToThreads(32, 8) | ||
.mapToBlocks(256, 256) | ||
.useSharedMemory(false) | ||
.usePrivateMemory(false) | ||
.unrollCopyShared(false) | ||
.useReadOnlyCache(false); | ||
I0607 13:02:54.072165 21973 cuda_tc_executor.cc:83] original schedule: | ||
domain( | ||
[M, N] -> { S_0[O_s1_n, O_s1_m] : 0 <= O_s1_n < N and 0 <= O_s1_m < M }) | ||
band(n(1) permutable(0) coincident(0) unroll(0) | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n)] } | ||
----------------------------------------------------------------------- | ||
band(n(1) permutable(0) coincident(0) unroll(0) | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m)] } | ||
----------------------------------------------------------------------- | ||
I0607 13:02:54.075304 21973 scop.cc:400] After scheduling: | ||
domain( | ||
[M, N] -> { S_0[O_s1_n, O_s1_m] : 0 <= O_s1_n < N and 0 <= O_s1_m < M }) | ||
band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n)] } | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m)] } | ||
----------------------------------------------------------------------- | ||
I0607 13:02:54.075870 21973 scop.cc:454] After tiling outer: | ||
domain( | ||
[M, N] -> { S_0[O_s1_n, O_s1_m] : 0 <= O_s1_n < N and 0 <= O_s1_m < M }) | ||
band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(floor((O_s1_n)/32))] } | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(floor((O_s1_m)/32))] } | ||
----------------------------------------------------------------------- | ||
band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n - 32*floor((O_s1_n)/32))] } | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m - 32*floor((O_s1_m)/32))] } | ||
----------------------------------------------------------------------- | ||
I0607 13:02:54.078128 21973 mapped_scop.cc:1021] After mapping to threads: | ||
domain( | ||
[M, N] -> { S_0[O_s1_n, O_s1_m] : M = 64 and N = 64 and 0 <= O_s1_n <= 63 and 0 <= O_s1_m <= 63 }) | ||
context([M, N, t1, t0, t2, b2, b1, b0] -> { [] : t2 = 0 and b2 = 0 and 0 <= t1 <= 7 and 0 <= t0 <= 31 and 0 <= b1 <= 255 and 0 <= b0 <= 255 }) | ||
band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(floor((O_s1_n)/32))] } | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(floor((O_s1_m)/32))] } | ||
----------------------------------------------------------------------- | ||
mapping_filter(ids(t1, t0, ) | ||
[M, N, t0, t1] -> { S_0[O_s1_n, O_s1_m] : (-t1 + O_s1_n) mod 8 = 0 and (-t0 + O_s1_m) mod 32 = 0 and 0 <= t0 <= 31 and 0 <= t1 <= 7 }) | ||
band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n - 32*floor((O_s1_n)/32))] } | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m - 32*floor((O_s1_m)/32))] } | ||
----------------------------------------------------------------------- | ||
thread_specific() | ||
I0607 13:02:54.079393 21973 schedule_transforms.cc:391] Resizing scales to 2 entries: 32 32 32 | ||
I0607 13:02:54.079439 21973 mapped_scop.cc:1029] After mapping to blocks: | ||
domain( | ||
[M, N] -> { S_0[O_s1_n, O_s1_m] : M = 64 and N = 64 and 0 <= O_s1_n <= 63 and 0 <= O_s1_m <= 63 }) | ||
context([M, N, t1, t0, t2, b2, b1, b0] -> { [] : t2 = 0 and b2 = 0 and 0 <= t1 <= 7 and 0 <= t0 <= 31 and 0 <= b1 <= 255 and 0 <= b0 <= 255 }) | ||
mapping_filter(ids(b1, b0, ) | ||
[M, N, b0, b1] -> { S_0[O_s1_n, O_s1_m] : -31 - 32b1 + O_s1_m <= 8192*floor((O_s1_m)/8192) <= -32b1 + O_s1_m and -31 - 32b0 + O_s1_n <= 8192*floor((O_s1_n)/8192) <= -32b0 + O_s1_n }) | ||
band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_n)/32))] } | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_m)/32))] } | ||
----------------------------------------------------------------------- | ||
mapping_filter(ids(t1, t0, ) | ||
[M, N, t0, t1] -> { S_0[O_s1_n, O_s1_m] : (-t1 + O_s1_n) mod 8 = 0 and (-t0 + O_s1_m) mod 32 = 0 and 0 <= t0 <= 31 and 0 <= t1 <= 7 }) | ||
band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n - 32*floor((O_s1_n)/32))] } | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m - 32*floor((O_s1_m)/32))] } | ||
----------------------------------------------------------------------- | ||
thread_specific() | ||
I0607 13:02:54.079643 21973 mapped_scop.cc:1083] After outerBlockInnerThread strategy: | ||
domain( | ||
[M, N] -> { S_0[O_s1_n, O_s1_m] : M = 64 and N = 64 and 0 <= O_s1_n <= 63 and 0 <= O_s1_m <= 63 }) | ||
context([M, N, t1, t0, t2, b2, b1, b0] -> { [] : t2 = 0 and b2 = 0 and 0 <= t1 <= 7 and 0 <= t0 <= 31 and 0 <= b1 <= 255 and 0 <= b0 <= 255 }) | ||
mapping_filter(ids(b1, b0, ) | ||
[M, N, b0, b1] -> { S_0[O_s1_n, O_s1_m] : -31 - 32b1 + O_s1_m <= 8192*floor((O_s1_m)/8192) <= -32b1 + O_s1_m and -31 - 32b0 + O_s1_n <= 8192*floor((O_s1_n)/8192) <= -32b0 + O_s1_n }) | ||
band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_n)/32))] } | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_m)/32))] } | ||
----------------------------------------------------------------------- | ||
mapping_filter(ids(t1, t0, ) | ||
[M, N, t0, t1] -> { S_0[O_s1_n, O_s1_m] : (-t1 + O_s1_n) mod 8 = 0 and (-t0 + O_s1_m) mod 32 = 0 and 0 <= t0 <= 31 and 0 <= t1 <= 7 }) | ||
band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n - 32*floor((O_s1_n)/32))] } | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m - 32*floor((O_s1_m)/32))] } | ||
----------------------------------------------------------------------- | ||
thread_specific() | ||
I0607 13:02:54.079829 21973 cuda_tc_executor.cc:90] Mapped schedule: | ||
domain( | ||
[M, N] -> { S_0[O_s1_n, O_s1_m] : M = 64 and N = 64 and 0 <= O_s1_n <= 63 and 0 <= O_s1_m <= 63 }) | ||
context([M, N, t1, t0, t2, b2, b1, b0] -> { [] : t2 = 0 and b2 = 0 and 0 <= t1 <= 7 and 0 <= t0 <= 31 and 0 <= b1 <= 255 and 0 <= b0 <= 255 }) | ||
mapping_filter(ids(b1, b0, ) | ||
[M, N, b0, b1] -> { S_0[O_s1_n, O_s1_m] : -31 - 32b1 + O_s1_m <= 8192*floor((O_s1_m)/8192) <= -32b1 + O_s1_m and -31 - 32b0 + O_s1_n <= 8192*floor((O_s1_n)/8192) <= -32b0 + O_s1_n }) | ||
band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_n)/32))] } | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_m)/32))] } | ||
----------------------------------------------------------------------- | ||
mapping_filter(ids(t1, t0, ) | ||
[M, N, t0, t1] -> { S_0[O_s1_n, O_s1_m] : (-t1 + O_s1_n) mod 8 = 0 and (-t0 + O_s1_m) mod 32 = 0 and 0 <= t0 <= 31 and 0 <= t1 <= 7 }) | ||
band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n - 32*floor((O_s1_n)/32))] } | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m - 32*floor((O_s1_m)/32))] } | ||
----------------------------------------------------------------------- | ||
thread_specific() | ||
I0607 13:02:54.091660 21973 mapped_scop.cc:900] Codegen with tightened bounds [blocks:CudaDim(2, 2, 1) @0x7ffefab63f90, threads:CudaDim(32, 8, 1) @0x7ffefab63fd0] for tree: | ||
domain( | ||
[M, N] -> { S_0[O_s1_n, O_s1_m] : M = 64 and N = 64 and 0 <= O_s1_n <= 63 and 0 <= O_s1_m <= 63 }) | ||
context([M, N, t1, t0, t2, b2, b1, b0] -> { [] : M = 64 and N = 64 and t2 = 0 and b2 = 0 and 0 <= t1 <= 7 and 0 <= t0 <= 31 and 0 <= b1 <= 1 and 0 <= b0 <= 1 }) | ||
mapping_filter(ids(b1, b0, ) | ||
[M, N, b0, b1] -> { S_0[O_s1_n, O_s1_m] : -31 - 32b1 + O_s1_m <= 8192*floor((O_s1_m)/8192) <= -32b1 + O_s1_m and -31 - 32b0 + O_s1_n <= 8192*floor((O_s1_n)/8192) <= -32b0 + O_s1_n }) | ||
band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_n)/32))] } | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(32*floor((O_s1_m)/32))] } | ||
----------------------------------------------------------------------- | ||
mapping_filter(ids(t1, t0, ) | ||
[M, N, t0, t1] -> { S_0[O_s1_n, O_s1_m] : (-t1 + O_s1_n) mod 8 = 0 and (-t0 + O_s1_m) mod 32 = 0 and 0 <= t0 <= 31 and 0 <= t1 <= 7 }) | ||
band(n(2) permutable(1) coincident(1, 1) unroll(0, 0) | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_n - 32*floor((O_s1_n)/32))] } | ||
----------------------------------------------------------------------- | ||
| [M, N] -> { S_0[O_s1_n, O_s1_m] -> [(O_s1_m - 32*floor((O_s1_m)/32))] } | ||
----------------------------------------------------------------------- | ||
thread_specific() | ||
I0607 13:02:54.130249 21973 cuda_rtc.cc:58] NVRTC function source: | ||
|
||
template<typename T> inline __device__ T floord(T n, T d) { | ||
return n < 0 ? - (-n + d - 1)/d : n / d; | ||
} | ||
#define if_then_else(cond,a,b) ((cond) ? (a) : (b)) | ||
|
||
// Halide type handling | ||
typedef int int32; | ||
typedef long int64; | ||
typedef float float32; | ||
typedef double float64; | ||
|
||
#define inff __int_as_float(0x7f800000) | ||
#define inf __longlong_as_double(0x7ff0000000000000LL) | ||
|
||
// Before CUDA 9, syncwarp is a noop since warps are always synchronized. | ||
#if __CUDACC_VER_MAJOR__ < 9 | ||
__device__ void __syncwarp(unsigned mask = 0xFFFFFFFF) {} | ||
#endif | ||
|
||
extern "C" { | ||
__global__ void tensoraddstrided_64_64(int32 M, int32 N, float32* pO, const float32* pI0_view, const float32* pI1_view) { | ||
int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; | ||
int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; | ||
float32 (*O)[64] = reinterpret_cast<float32 (*)[64]>(pO); | ||
const float32 (*I0_view)[128] = reinterpret_cast<const float32 (*)[128]>(pI0_view); | ||
const float32 (*I1_view)[128] = reinterpret_cast<const float32 (*)[128]>(pI1_view); | ||
for (int c2 = t1; c2 <= 31; c2 += 8) { | ||
O[(32 * b0 + c2)][(t0 + 32 * b1)] = (O[(32 * b0 + c2)][(t0 + 32 * b1)] + (I0_view[(32 * b0 + c2)][(t0 + 32 * b1)] + I1_view[(32 * b0 + c2)][(t0 + 32 * b1)])); | ||
} | ||
} | ||
} | ||
I0607 13:02:54.348301 21973 cuda_tc_executor.cc:64] [COMPILE] Compiling with host JIT compiler took: 218ms | ||
WARNING: | ||
Reduction without initialization. If O is not pre-initialized before calling the TC function, consider using the !-suffixed reduction operator +=! instead of +=: | ||
|
||
def tensoraddstrided(float(N, M) I0_view, float(N, M) I1_view) -> (O) { | ||
O(n, m) += I0_view(n, m) + I1_view(n, m) | ||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~... <--- HERE | ||
} | ||
|
||
[ OK ] TcCudaMapperTest.TensorAddStrided (297 ms) | ||
[----------] 1 test from TcCudaMapperTest (297 ms total) | ||
|
||
[----------] Global test environment tear-down | ||
[==========] 1 test from 1 test case ran. (298 ms total) | ||
[ PASSED ] 1 test. |