From d28523d4f404e83554dab3f93e9fdb805118ce8a Mon Sep 17 00:00:00 2001 From: bulatz Date: Sat, 18 Jun 2016 10:22:27 +0300 Subject: [PATCH] BSLab: new profile.txt with NUM_THREADS = 4*WARP_SIZE for mtf_cuda_thread[by4] --- app_bslab/bslab.cpp | 2 +- app_bslab/profile.txt | 9887 ++++++++++++---------------------- app_bslab/resource-usage.txt | 172 +- 3 files changed, 3669 insertions(+), 6392 deletions(-) diff --git a/app_bslab/bslab.cpp b/app_bslab/bslab.cpp index eff2d4b..833c566 100644 --- a/app_bslab/bslab.cpp +++ b/app_bslab/bslab.cpp @@ -350,7 +350,7 @@ int main (int argc, char **argv) time_run ("mtf_cuda_thread ", [&] {mtf_cuda_thread <<<(inbytes-1)/(CHUNK*WARP_SIZE)+1, WARP_SIZE>>> (d_inbuf, d_outbuf, inbytes, CHUNK);}); time_run ("mtf_cuda_thread_by4 ", [&] {mtf_cuda_thread_by4 <<<(inbytes-1)/(CHUNK*WARP_SIZE)+1, WARP_SIZE>>> (d_inbuf, d_outbuf, inbytes, CHUNK);}); - const int NUM_THREADS = 1*WARP_SIZE; + const int NUM_THREADS = 4*WARP_SIZE; time_run ("mtf_cuda_thread<8> ", [&] {mtf_cuda_thread <<<(inbytes-1)/(CHUNK*NUM_THREADS)+1, NUM_THREADS>>> (d_inbuf, d_outbuf, inbytes, CHUNK);}); time_run ("mtf_cuda_thread<16> ", [&] {mtf_cuda_thread <<<(inbytes-1)/(CHUNK*NUM_THREADS)+1, NUM_THREADS>>> (d_inbuf, d_outbuf, inbytes, CHUNK);}); time_run ("mtf_cuda_thread<32> ", [&] {mtf_cuda_thread <<<(inbytes-1)/(CHUNK*NUM_THREADS)+1, NUM_THREADS>>> (d_inbuf, d_outbuf, inbytes, CHUNK);}); diff --git a/app_bslab/profile.txt b/app_bslab/profile.txt index f628993..8247c89 100644 --- a/app_bslab/profile.txt +++ b/app_bslab/profile.txt @@ -1,6345 +1,3554 @@ -==16952== NVPROF is profiling process 16952, command: mtf -b34m z:\e8 -==15752== Warning: The following aggregate event values were extrapolated from limited profile data and may therefore be inaccurate. To see the non-aggregate event values, use "--aggregate-mode off". -l1_global_load_hit,tex1_cache_sector_queries,l1_local_store_miss -==18708== Warning: The following aggregate event values were extrapolated from limited profile data and may therefore be inaccurate. To see the non-aggregate event values, use "--aggregate-mode off". -l1_local_load_hit,l1_local_load_miss,l1_local_store_hit,l1_local_store_miss -==20488== Warning: The following aggregate event values were extrapolated from limited profile data and may therefore be inaccurate. To see the non-aggregate event values, use "--aggregate-mode off". -tex0_cache_sector_misses,l1_local_load_miss,l1_local_store_hit -==21848== Warning: The following aggregate event values were extrapolated from limited profile data and may therefore be inaccurate. To see the non-aggregate event values, use "--aggregate-mode off". -event_name,tex0_cache_sector_misses -==20944== Warning: The following aggregate event values were extrapolated from limited profile data and may therefore be inaccurate. To see the non-aggregate event values, use "--aggregate-mode off". -l1_local_load_hit,l1_local_load_miss,l1_local_store_hit,l1_local_store_miss -==19696== Warning: The following aggregate event values were extrapolated from limited profile data and may therefore be inaccurate. To see the non-aggregate event values, use "--aggregate-mode off". -l1_local_store_hit,l1_local_store_miss -==20820== Warning: The following aggregate event values were extrapolated from limited profile data and may therefore be inaccurate. To see the non-aggregate event values, use "--aggregate-mode off". -event_name -==20556== Warning: The following aggregate event values were extrapolated from limited profile data and may therefore be inaccurate. To see the non-aggregate event values, use "--aggregate-mode off". -tex1_cache_sector_misses,uncached_global_load_transaction,global_store_transaction -==19624== Warning: The following aggregate event values were extrapolated from limited profile data and may therefore be inaccurate. To see the non-aggregate event values, use "--aggregate-mode off". -tex0_cache_sector_queries,l1_local_load_hit,elapsed_cycles_sm,sm_cta_launched -==19760== Warning: The following aggregate event values were extrapolated from limited profile data and may therefore be inaccurate. To see the non-aggregate event values, use "--aggregate-mode off". -tex0_cache_sector_queries,event_name,l1_shared_bank_conflict -==18912== Warning: The following aggregate event values were extrapolated from limited profile data and may therefore be inaccurate. To see the non-aggregate event values, use "--aggregate-mode off". -uncached_global_load_transaction,global_store_transaction -==19500== Warning: The following aggregate event values were extrapolated from limited profile data and may therefore be inaccurate. To see the non-aggregate event values, use "--aggregate-mode off". -l1_shared_bank_conflict -==21996== Warning: The following aggregate event values were extrapolated from limited profile data and may therefore be inaccurate. To see the non-aggregate event values, use "--aggregate-mode off". -l1_local_load_hit,elapsed_cycles_sm,l1_global_load_miss,l1_global_load_hit,l1_local_store_hit -==21960== Warning: The following aggregate event values were extrapolated from limited profile data and may therefore be inaccurate. To see the non-aggregate event values, use "--aggregate-mode off". -event_name,l1_shared_bank_conflict,tex1_cache_sector_queries -==21748== Warning: The following aggregate event values were extrapolated from limited profile data and may therefore be inaccurate. To see the non-aggregate event values, use "--aggregate-mode off". -tex1_cache_sector_misses,l1_global_load_miss -==5996== Warning: The following aggregate event values were extrapolated from limited profile data and may therefore be inaccurate. To see the non-aggregate event values, use "--aggregate-mode off". -l1_local_load_hit,l1_global_load_miss,l1_local_load_miss,l1_local_store_miss ======== Profiling result: ======== Event result: -Invocations Event Name Min Max Avg +Invocations Event Name Min Max Avg Total Device "GeForce GTX 560 Ti (0)" - Kernel: void mtf_2symbols(unsigned char const *, unsigned char*, int, int) - 3 local_load 0 0 0 - 3 local_store 0 0 0 - 3 gld_request 12126708 14962238 13994416 - 3 gst_request 10973184 13606912 12707157 - 3 shared_load 11907541 14704888 13750268 - 3 shared_store 22570315 27947650 26111446 - 3 branch 12881276 15937806 14893996 - 3 divergent_branch 0 0 0 - 3 active_cycles 163980982 202824185 189554903 - 3 inst_issued1_0 92102739 113659704 106261211 - 3 inst_issued2_0 30972757 38305760 35799878 - 3 inst_issued1_1 92058462 113698764 106356631 - 3 inst_issued2_1 30952844 38315356 35801699 - 3 inst_executed 308007708 380595408 355815243 - 3 warps_launched 2680 3324 3104 - 3 threads_launched 85760 106368 99328 - 3 thread_inst_executed_0 3938345248 4862815552 4545866496 - 3 thread_inst_executed_2 991124128 1225780224 1145592021 - 3 thread_inst_executed_1 3936290368 4864369984 4548978986 - 3 thread_inst_executed_3 990486912 1226087296 1145650293 - 3 active_warps 4818190880 6006833078 5575826405 - 3 tex0_cache_sector_queries 0 0 0 - 3 tex0_cache_sector_misses 0 0 0 - 3 tex1_cache_sector_queries 0 0 0 - 3 tex1_cache_sector_misses 0 0 0 - 3 sm_cta_launched 672 828 774 - 3 l1_local_load_hit 0 0 0 - 3 l1_local_load_miss 0 0 0 - 3 l1_local_store_hit 0 0 0 - 3 l1_local_store_miss 0 0 0 - 3 l1_global_load_hit 11720084 14198068 13351946 - 3 l1_global_load_miss 512960 808520 643265 - 3 uncached_global_load_transaction 0 0 0 - 3 global_store_transaction 10944512 13533184 12659370 - 3 l1_shared_bank_conflict 0 0 0 - 3 prof_trigger_00 0 0 0 - 3 prof_trigger_01 0 0 0 - 3 prof_trigger_02 0 0 0 - 3 prof_trigger_03 0 0 0 - 3 prof_trigger_04 0 0 0 - 3 prof_trigger_05 0 0 0 - 3 prof_trigger_06 0 0 0 - 3 prof_trigger_07 0 0 0 - 3 fb_subp0_read_sectors 360073 441453 413532 - 3 fb_subp1_read_sectors 359569 441709 413498 - 3 fb_subp0_write_sectors 4968862 6188474 5761645 - 3 fb_subp1_write_sectors 4975583 6194375 5773493 - 3 l2_subp0_write_sector_misses 4968862 6188474 5761645 - 3 l2_subp1_write_sector_misses 4975583 6194375 5773493 - 3 l2_subp0_read_sector_misses 360168 441225 413663 - 3 l2_subp1_read_sector_misses 358903 440561 412900 - 3 l2_subp0_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp1_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp0_read_sector_queries 933548 1291008 1146488 - 3 l2_subp1_read_sector_queries 999336 1362796 1228621 - 3 l2_subp0_read_tex_sector_queries 0 0 0 - 3 l2_subp1_read_tex_sector_queries 0 0 0 - 3 l2_subp0_read_hit_sectors 849484 1243444 1093281 - 3 l2_subp1_read_hit_sectors 1264 1832 1593 - 3 l2_subp0_read_tex_hit_sectors 0 0 0 - 3 l2_subp1_read_tex_hit_sectors 0 0 0 - 3 elapsed_cycles_sm 167449488 206792804 193297157 - 3 l2_subp0_read_sysmem_sector_queries 0 0 0 - 3 l2_subp1_read_sysmem_sector_queries 0 0 0 - 3 l2_subp0_write_sysmem_sector_queries 0 0 0 - 3 l2_subp1_write_sysmem_sector_queries 0 0 0 - 3 l2_subp0_total_read_sector_queries 1047029 1388627 1270366 - 3 l2_subp1_total_read_sector_queries 1110370 1383937 1292648 - 3 l2_subp0_total_write_sector_queries 5486595 6803456 6353580 - 3 l2_subp1_total_write_sector_queries 5486604 6803458 6353587 - 3 atom_count 0 0 0 - 3 gred_count 0 0 0 - 3 gld_inst_8bit 388054656 478791616 447821312 - 3 gld_inst_16bit 0 0 0 - 3 gld_inst_32bit 0 0 0 - 3 gld_inst_64bit 0 0 0 - 3 gld_inst_128bit 0 0 0 - 3 gst_inst_8bit 10973184 13606912 12707157 - 3 gst_inst_16bit 0 0 0 - 3 gst_inst_32bit 0 0 0 - 3 gst_inst_64bit 0 0 0 - 3 gst_inst_128bit 0 0 0 - Kernel: void mtf_scalar(unsigned char const *, unsigned char*, int, int) - 3 local_load 0 0 0 - 3 local_store 0 0 0 - 3 gld_request 10975802 13610159 12710186 - 3 gst_request 10973184 13606912 12707157 - 3 shared_load 11934422 14736053 13780269 - 3 shared_store 22570315 27947650 26111446 - 3 branch 24921895 30886785 28849414 - 3 divergent_branch 0 0 0 - 3 active_cycles 152908240 187363102 175666199 - 3 inst_issued1_0 90587306 112171430 104770690 - 3 inst_issued2_0 25067501 30979460 28964916 - 3 inst_issued1_1 90571633 112162723 104810308 - 3 inst_issued2_1 25035333 30999117 28962859 - 3 inst_executed 281299518 348152505 325328182 - 3 warps_launched 2680 3324 3104 - 3 threads_launched 85760 106368 99328 - 3 thread_inst_executed_0 3699932480 4578604000 4277813525 - 3 thread_inst_executed_2 802155936 991338624 926873237 - 3 thread_inst_executed_1 3698369600 4578969888 4279007680 - 3 thread_inst_executed_3 801126560 991967648 926807392 - 3 active_warps 4655783751 5776776995 5394005357 - 3 tex0_cache_sector_queries 0 0 0 - 3 tex0_cache_sector_misses 0 0 0 - 3 tex1_cache_sector_queries 0 0 0 - 3 tex1_cache_sector_misses 0 0 0 - 3 sm_cta_launched 668 832 774 - 3 l1_local_load_hit 0 0 0 - 3 l1_local_load_miss 0 0 0 - 3 l1_local_store_hit 0 0 0 - 3 l1_local_store_miss 0 0 0 - 3 l1_global_load_hit 9917916 12408528 11529246 - 3 l1_global_load_miss 790168 1149940 997718 - 3 uncached_global_load_transaction 0 0 0 - 3 global_store_transaction 10993664 13598720 12708522 - 3 l1_shared_bank_conflict 0 0 0 - 3 prof_trigger_00 0 0 0 - 3 prof_trigger_01 0 0 0 - 3 prof_trigger_02 0 0 0 - 3 prof_trigger_03 0 0 0 - 3 prof_trigger_04 0 0 0 - 3 prof_trigger_05 0 0 0 - 3 prof_trigger_06 0 0 0 - 3 prof_trigger_07 0 0 0 - 3 fb_subp0_read_sectors 341147 426509 396812 - 3 fb_subp1_read_sectors 346942 429333 399066 - 3 fb_subp0_write_sectors 5002907 6231777 5817175 - 3 fb_subp1_write_sectors 4999040 6245617 5820614 - 3 l2_subp0_write_sector_misses 5002907 6231777 5817175 - 3 l2_subp1_write_sector_misses 4999040 6245617 5820614 - 3 l2_subp0_read_sector_misses 343542 423795 396149 - 3 l2_subp1_read_sector_misses 346661 429129 398824 - 3 l2_subp0_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp1_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp0_read_sector_queries 1663332 2399736 2145569 - 3 l2_subp1_read_sector_queries 1683796 2546124 2161985 - 3 l2_subp0_read_tex_sector_queries 0 0 0 - 3 l2_subp1_read_tex_sector_queries 0 0 0 - 3 l2_subp0_read_hit_sectors 1571924 2180492 1941441 - 3 l2_subp1_read_hit_sectors 2764 3936 3477 - 3 l2_subp0_read_tex_hit_sectors 0 0 0 - 3 l2_subp1_read_tex_hit_sectors 0 0 0 - 3 elapsed_cycles_sm 155588792 190410236 178622808 - 3 l2_subp0_read_sysmem_sector_queries 0 0 0 - 3 l2_subp1_read_sysmem_sector_queries 0 0 0 - 3 l2_subp0_write_sysmem_sector_queries 0 0 0 - 3 l2_subp1_write_sysmem_sector_queries 0 0 0 - 3 l2_subp0_total_read_sector_queries 1711019 2449286 2165966 - 3 l2_subp1_total_read_sector_queries 1675986 2367221 2100049 - 3 l2_subp0_total_write_sector_queries 5486595 6803458 6353581 - 3 l2_subp1_total_write_sector_queries 5486594 6803468 6353587 - 3 atom_count 0 0 0 - 3 gred_count 0 0 0 - 3 gld_inst_8bit 351225664 435525088 406725962 - 3 gld_inst_16bit 0 0 0 - 3 gld_inst_32bit 0 0 0 - 3 gld_inst_64bit 0 0 0 - 3 gld_inst_128bit 0 0 0 - 3 gst_inst_8bit 351141888 435421184 406629034 - 3 gst_inst_16bit 0 0 0 - 3 gst_inst_32bit 0 0 0 - 3 gst_inst_64bit 0 0 0 - 3 gst_inst_128bit 0 0 0 - Kernel: void mtf_4by8(unsigned char const *, unsigned char*, int, int) - 3 local_load 0 0 0 - 3 local_store 0 0 0 - 3 gld_request 4344096 5340311 5001744 - 3 gst_request 4343775 5339880 5001352 - 3 shared_load 7805482 9541039 8953243 - 3 shared_store 12111125 14834984 13911488 - 3 branch 13736183 16849706 15794851 - 3 divergent_branch 3110596 3787797 3558041 - 3 active_cycles 120985991 141159758 134242929 - 3 inst_issued1_0 60201738 73020714 68733574 - 3 inst_issued2_0 28144877 34641854 32476102 - 3 inst_issued1_1 60334722 72926652 68636980 - 3 inst_issued2_1 28336203 34592004 32435778 - 3 inst_executed 192689564 236296641 221514924 - 3 warps_launched 672 832 778 - 3 threads_launched 21504 26624 24917 - 3 thread_inst_executed_0 1567158636 1931104963 1807908772 - 3 thread_inst_executed_2 654255832 805553760 754382901 - 3 thread_inst_executed_1 1571843276 1933137228 1809033485 - 3 thread_inst_executed_3 656427440 806606248 754994842 - 3 active_warps 3993548325 4779447009 4512826000 - 3 tex0_cache_sector_queries 0 0 0 - 3 tex0_cache_sector_misses 0 0 0 - 3 tex1_cache_sector_queries 0 0 0 - 3 tex1_cache_sector_misses 0 0 0 - 3 sm_cta_launched 80 112 98 - 3 l1_local_load_hit 0 0 0 - 3 l1_local_load_miss 0 0 0 - 3 l1_local_store_hit 0 0 0 - 3 l1_local_store_miss 0 0 0 - 3 l1_global_load_hit 3469732 5240492 4380169 - 3 l1_global_load_miss 7032652 9267444 8264085 - 3 uncached_global_load_transaction 0 0 0 - 3 global_store_transaction 10485760 14155776 12757674 - 3 l1_shared_bank_conflict 19108248 26048824 23467198 - 3 prof_trigger_00 0 0 0 - 3 prof_trigger_01 0 0 0 - 3 prof_trigger_02 0 0 0 - 3 prof_trigger_03 0 0 0 - 3 prof_trigger_04 0 0 0 - 3 prof_trigger_05 0 0 0 - 3 prof_trigger_06 0 0 0 - 3 prof_trigger_07 0 0 0 - 3 fb_subp0_read_sectors 383912 479890 443195 - 3 fb_subp1_read_sectors 454807 500397 484210 - 3 fb_subp0_write_sectors 5029538 6137057 5763803 - 3 fb_subp1_write_sectors 5007352 6119013 5747421 - 3 l2_subp0_write_sector_misses 5029538 6137057 5763803 - 3 l2_subp1_write_sector_misses 5007352 6119013 5747421 - 3 l2_subp0_read_sector_misses 377689 481757 444839 - 3 l2_subp1_read_sector_misses 457802 512297 489628 - 3 l2_subp0_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp1_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp0_read_sector_queries 14469736 17886196 16522373 - 3 l2_subp1_read_sector_queries 14551332 17857052 16669490 - 3 l2_subp0_read_tex_sector_queries 0 0 0 - 3 l2_subp1_read_tex_sector_queries 0 0 0 - 3 l2_subp0_read_hit_sectors 14419060 17770768 16451900 - 3 l2_subp1_read_hit_sectors 62032 66192 63729 - 3 l2_subp0_read_tex_hit_sectors 0 0 0 - 3 l2_subp1_read_tex_hit_sectors 0 0 0 - 3 elapsed_cycles_sm 138640720 164231968 154802008 - 3 l2_subp0_read_sysmem_sector_queries 0 0 0 - 3 l2_subp1_read_sysmem_sector_queries 0 0 0 - 3 l2_subp0_write_sysmem_sector_queries 0 0 0 - 3 l2_subp1_write_sysmem_sector_queries 0 0 0 - 3 l2_subp0_total_read_sector_queries 14430230 17350986 16238053 - 3 l2_subp1_total_read_sector_queries 14588025 17698162 16500204 - 3 l2_subp0_total_write_sector_queries 5486595 6803458 6353580 - 3 l2_subp1_total_write_sector_queries 5486605 6803468 6353587 - 3 atom_count 0 0 0 - 3 gred_count 0 0 0 - 3 gld_inst_8bit 87806904 108881872 101682077 - 3 gld_inst_16bit 0 0 0 - 3 gld_inst_32bit 0 0 0 - 3 gld_inst_64bit 0 0 0 - 3 gld_inst_128bit 0 0 0 - 3 gst_inst_8bit 87785472 108855296 101657258 - 3 gst_inst_16bit 0 0 0 - 3 gst_inst_32bit 0 0 0 - 3 gst_inst_64bit 0 0 0 - 3 gst_inst_128bit 0 0 0 - Kernel: void mtf_thread_by4(unsigned char const *, unsigned char*, int, int) - 3 local_load 0 0 0 - 3 local_store 0 0 0 - 3 gld_request 3376292 4127417 3870655 - 3 gst_request 3376926 4128206 3871383 - 3 shared_load 4313001 5302263 4948768 - 3 shared_store 4314345 5303927 4950326 - 3 branch 13216110 16198916 15156884 - 3 divergent_branch 3350722 4086023 3837796 - 3 active_cycles 203829426 232263029 221811899 - 3 inst_issued1_0 24311297 27946623 26367306 - 3 inst_issued2_0 17910900 20003859 18981417 - 3 inst_issued1_1 19856877 27095091 24464339 - 3 inst_issued2_1 14154221 19939436 17784141 - 3 inst_executed 92611037 113477939 106203349 - 3 warps_launched 84 104 97 - 3 threads_launched 2688 3328 3114 - 3 thread_inst_executed_0 372811204 417134531 397428905 - 3 thread_inst_executed_2 152414009 170971536 162579417 - 3 thread_inst_executed_1 352810474 413196134 389511346 - 3 thread_inst_executed_3 121791223 168590546 151705954 - 3 active_warps 1228257181 1497870342 1405641588 - 3 tex0_cache_sector_queries 0 0 0 - 3 tex0_cache_sector_misses 0 0 0 - 3 tex1_cache_sector_queries 0 0 0 - 3 tex1_cache_sector_misses 0 0 0 - 3 sm_cta_launched 88 112 102 - 3 l1_local_load_hit 0 0 0 - 3 l1_local_load_miss 0 0 0 - 3 l1_local_store_hit 0 0 0 - 3 l1_local_store_miss 0 0 0 - 3 l1_global_load_hit 3507216 4873880 4236744 - 3 l1_global_load_miss 7941784 9610712 8999732 - 3 uncached_global_load_transaction 0 0 0 - 3 global_store_transaction 11534336 14319616 13336576 - 3 l1_shared_bank_conflict 0 0 0 - 3 prof_trigger_00 0 0 0 - 3 prof_trigger_01 0 0 0 - 3 prof_trigger_02 0 0 0 - 3 prof_trigger_03 0 0 0 - 3 prof_trigger_04 0 0 0 - 3 prof_trigger_05 0 0 0 - 3 prof_trigger_06 0 0 0 - 3 prof_trigger_07 0 0 0 - 3 fb_subp0_read_sectors 886161 1001389 952654 - 3 fb_subp1_read_sectors 889293 1015071 965892 - 3 fb_subp0_write_sectors 5427342 6742665 6293354 - 3 fb_subp1_write_sectors 5425425 6739264 6290608 - 3 l2_subp0_write_sector_misses 5427342 6742665 6293354 - 3 l2_subp1_write_sector_misses 5425425 6739264 6290608 - 3 l2_subp0_read_sector_misses 884978 1001009 950147 - 3 l2_subp1_read_sector_misses 887822 1013701 963051 - 3 l2_subp0_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp1_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp0_read_sector_queries 15269620 18709732 17473609 - 3 l2_subp1_read_sector_queries 15181268 18712280 17468657 - 3 l2_subp0_read_tex_sector_queries 0 0 0 - 3 l2_subp1_read_tex_sector_queries 0 0 0 - 3 l2_subp0_read_hit_sectors 14659992 18026292 16804326 - 3 l2_subp1_read_hit_sectors 74892 84880 80446 - 3 l2_subp0_read_tex_hit_sectors 0 0 0 - 3 l2_subp1_read_tex_hit_sectors 0 0 0 - 3 elapsed_cycles_sm 247681136 273328424 263639630 - 3 l2_subp0_read_sysmem_sector_queries 0 0 0 - 3 l2_subp1_read_sysmem_sector_queries 0 0 0 - 3 l2_subp0_write_sysmem_sector_queries 0 0 0 - 3 l2_subp1_write_sysmem_sector_queries 0 0 0 - 3 l2_subp0_total_read_sector_queries 15179071 18490175 17381275 - 3 l2_subp1_total_read_sector_queries 15228205 18772118 17483130 - 3 l2_subp0_total_write_sector_queries 5486592 6803458 6353580 - 3 l2_subp1_total_write_sector_queries 5486594 6803469 6353588 - 3 atom_count 0 0 0 - 3 gred_count 0 0 0 - 3 gld_inst_8bit 10975863 13610234 12710259 - 3 gld_inst_16bit 0 0 0 - 3 gld_inst_32bit 0 0 0 - 3 gld_inst_64bit 0 0 0 - 3 gld_inst_128bit 0 0 0 - 3 gst_inst_8bit 10973184 13606912 12707157 - 3 gst_inst_16bit 0 0 0 - 3 gst_inst_32bit 0 0 0 - 3 gst_inst_64bit 0 0 0 - 3 gst_inst_128bit 0 0 0 - Kernel: void mtf_thread(unsigned char const *, unsigned char*, int, int) - 3 local_load 0 0 0 - 3 local_store 0 0 0 - 3 gld_request 2962351 3618057 3393864 - 3 gst_request 2962877 3618693 3394466 - 3 shared_load 3819173 4693902 4377087 - 3 shared_store 6783310 8314155 7773014 - 3 branch 10651241 13093263 12215445 - 3 divergent_branch 2939808 3577734 3363320 - 3 active_cycles 224290841 249205795 240621612 - 3 inst_issued1_0 31961661 39448780 36713344 - 3 inst_issued2_0 17456852 21070868 19705899 - 3 inst_issued1_1 30793482 38543119 35428248 - 3 inst_issued2_1 16526847 21101237 19250266 - 3 inst_executed 114291159 140275494 131017233 - 3 warps_launched 84 104 97 - 3 threads_launched 2688 3328 3114 - 3 thread_inst_executed_0 599235135 745364894 687667980 - 3 thread_inst_executed_2 254004255 282682765 268117690 - 3 thread_inst_executed_1 511048632 728825690 645478960 - 3 thread_inst_executed_3 194203908 280309576 245920959 - 3 active_warps 1292665879 1585175491 1482552539 - 3 tex0_cache_sector_queries 0 0 0 - 3 tex0_cache_sector_misses 0 0 0 - 3 tex1_cache_sector_queries 0 0 0 - 3 tex1_cache_sector_misses 0 0 0 - 3 sm_cta_launched 88 108 100 - 3 l1_local_load_hit 0 0 0 - 3 l1_local_load_miss 0 0 0 - 3 l1_local_store_hit 0 0 0 - 3 l1_local_store_miss 0 0 0 - 3 l1_global_load_hit 4213512 5912872 5203921 - 3 l1_global_load_miss 7173540 8311192 7735426 - 3 uncached_global_load_transaction 0 0 0 - 3 global_store_transaction 11386880 13795328 12937898 - 3 l1_shared_bank_conflict 0 0 0 - 3 prof_trigger_00 0 0 0 - 3 prof_trigger_01 0 0 0 - 3 prof_trigger_02 0 0 0 - 3 prof_trigger_03 0 0 0 - 3 prof_trigger_04 0 0 0 - 3 prof_trigger_05 0 0 0 - 3 prof_trigger_06 0 0 0 - 3 prof_trigger_07 0 0 0 - 3 fb_subp0_read_sectors 768605 879365 831761 - 3 fb_subp1_read_sectors 758080 872204 828567 - 3 fb_subp0_write_sectors 5393665 6715713 6265053 - 3 fb_subp1_write_sectors 5393693 6715548 6264447 - 3 l2_subp0_write_sector_misses 5393665 6715713 6265053 - 3 l2_subp1_write_sector_misses 5393693 6715548 6264447 - 3 l2_subp0_read_sector_misses 766602 875474 827227 - 3 l2_subp1_read_sector_misses 764179 876380 829913 - 3 l2_subp0_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp1_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp0_read_sector_queries 13605304 16521304 15479834 - 3 l2_subp1_read_sector_queries 13574740 16519680 15467657 - 3 l2_subp0_read_tex_sector_queries 0 0 0 - 3 l2_subp1_read_tex_sector_queries 0 0 0 - 3 l2_subp0_read_hit_sectors 13061268 16007740 14949461 - 3 l2_subp1_read_hit_sectors 48860 56484 53408 - 3 l2_subp0_read_tex_hit_sectors 0 0 0 - 3 l2_subp1_read_tex_hit_sectors 0 0 0 - 3 elapsed_cycles_sm 282970064 318577656 301951346 - 3 l2_subp0_read_sysmem_sector_queries 0 0 0 - 3 l2_subp1_read_sysmem_sector_queries 0 0 0 - 3 l2_subp0_write_sysmem_sector_queries 0 0 0 - 3 l2_subp1_write_sysmem_sector_queries 0 0 0 - 3 l2_subp0_total_read_sector_queries 13651431 16497631 15389058 - 3 l2_subp1_total_read_sector_queries 13500378 16524382 15400771 - 3 l2_subp0_total_write_sector_queries 5486595 6803458 6353581 - 3 l2_subp1_total_write_sector_queries 5486605 6803459 6353588 - 3 atom_count 0 0 0 - 3 gred_count 0 0 0 - 3 gld_inst_8bit 10975863 13610234 12710259 - 3 gld_inst_16bit 0 0 0 - 3 gld_inst_32bit 0 0 0 - 3 gld_inst_64bit 0 0 0 - 3 gld_inst_128bit 0 0 0 - 3 gst_inst_8bit 10973184 13606912 12707157 - 3 gst_inst_16bit 0 0 0 - 3 gst_inst_32bit 0 0 0 - 3 gst_inst_64bit 0 0 0 - 3 gst_inst_128bit 0 0 0 - Kernel: void mtf_thread(unsigned char const *, unsigned char*, int, int) - 3 local_load 0 0 0 - 3 local_store 0 0 0 - 3 gld_request 1856116 2282037 2138632 - 3 gst_request 1856393 2282363 2138936 - 3 shared_load 1975945 2430080 2277617 - 3 shared_store 3832422 4712547 4416651 - 3 branch 5832943 7173659 6722322 - 3 divergent_branch 1846247 2269521 2127406 - 3 active_cycles 195302632 214908590 206913045 - 3 inst_issued1_0 21725215 26702864 24725105 - 3 inst_issued2_0 10979024 13414933 12412057 - 3 inst_issued1_1 20489796 25802848 23774961 - 3 inst_issued2_1 10206467 13164637 12002992 - 3 inst_executed 63652399 78276294 73356875 - 3 warps_launched 84 104 97 - 3 threads_launched 2688 3328 3114 - 3 thread_inst_executed_0 361390428 442846933 410899404 - 3 thread_inst_executed_2 178497192 219041262 203731673 - 3 thread_inst_executed_1 338040165 434172749 397381233 - 3 thread_inst_executed_3 167099334 214816392 198136495 - 3 active_warps 1182938260 1436908170 1349700174 - 3 tex0_cache_sector_queries 0 0 0 - 3 tex0_cache_sector_misses 0 0 0 - 3 tex1_cache_sector_queries 0 0 0 - 3 tex1_cache_sector_misses 0 0 0 - 3 sm_cta_launched 92 112 104 - 3 l1_local_load_hit 0 0 0 - 3 l1_local_load_miss 0 0 0 - 3 l1_local_store_hit 0 0 0 - 3 l1_local_store_miss 0 0 0 - 3 l1_global_load_hit 3132764 3709516 3496645 - 3 l1_global_load_miss 8910172 10404464 9888673 - 3 uncached_global_load_transaction 0 0 0 - 3 global_store_transaction 11911168 14319616 13462186 - 3 l1_shared_bank_conflict 0 0 0 - 3 prof_trigger_00 0 0 0 - 3 prof_trigger_01 0 0 0 - 3 prof_trigger_02 0 0 0 - 3 prof_trigger_03 0 0 0 - 3 prof_trigger_04 0 0 0 - 3 prof_trigger_05 0 0 0 - 3 prof_trigger_06 0 0 0 - 3 prof_trigger_07 0 0 0 - 3 fb_subp0_read_sectors 2234714 2693038 2471593 - 3 fb_subp1_read_sectors 2190101 2659454 2488342 - 3 fb_subp0_write_sectors 5403316 6714098 6264651 - 3 fb_subp1_write_sectors 5398265 6710934 6262459 - 3 l2_subp0_write_sector_misses 5403316 6714098 6264651 - 3 l2_subp1_write_sector_misses 5398265 6710934 6262459 - 3 l2_subp0_read_sector_misses 2239658 2682557 2468478 - 3 l2_subp1_read_sector_misses 2199242 2638955 2483598 - 3 l2_subp0_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp1_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp0_read_sector_queries 16854592 20538812 19227092 - 3 l2_subp1_read_sector_queries 16904100 20538628 19324298 - 3 l2_subp0_read_tex_sector_queries 0 0 0 - 3 l2_subp1_read_tex_sector_queries 0 0 0 - 3 l2_subp0_read_hit_sectors 14880504 18201340 17027712 - 3 l2_subp1_read_hit_sectors 264208 326948 298881 - 3 l2_subp0_read_tex_hit_sectors 0 0 0 - 3 l2_subp1_read_tex_hit_sectors 0 0 0 - 3 elapsed_cycles_sm 206202664 223637752 217124517 - 3 l2_subp0_read_sysmem_sector_queries 0 0 0 - 3 l2_subp1_read_sysmem_sector_queries 0 0 0 - 3 l2_subp0_write_sysmem_sector_queries 0 0 0 - 3 l2_subp1_write_sysmem_sector_queries 0 0 0 - 3 l2_subp0_total_read_sector_queries 16607747 20529074 19049672 - 3 l2_subp1_total_read_sector_queries 16909608 20702820 19430294 - 3 l2_subp0_total_write_sector_queries 5486595 6803458 6353581 - 3 l2_subp1_total_write_sector_queries 5486605 6803469 6353588 - 3 atom_count 0 0 0 - 3 gred_count 0 0 0 - 3 gld_inst_8bit 10975863 13610234 12710259 - 3 gld_inst_16bit 0 0 0 - 3 gld_inst_32bit 0 0 0 - 3 gld_inst_64bit 0 0 0 - 3 gld_inst_128bit 0 0 0 - 3 gst_inst_8bit 10973184 13606912 12707157 - 3 gst_inst_16bit 0 0 0 - 3 gst_inst_32bit 0 0 0 - 3 gst_inst_64bit 0 0 0 - 3 gst_inst_128bit 0 0 0 - Kernel: void mtf_thread_by4(unsigned char const *, unsigned char*, int, int) - 3 local_load 0 0 0 - 3 local_store 0 0 0 - 3 gld_request 2507398 3091900 2894436 - 3 gst_request 2507849 3092449 2894955 - 3 shared_load 2266384 2793582 2617263 - 3 shared_store 2266552 2793790 2617457 - 3 branch 8414585 10374323 9714988 - 3 divergent_branch 2496590 3077575 2881726 - 3 active_cycles 213158565 231968440 225511757 - 3 inst_issued1_0 22613607 27826875 25754550 - 3 inst_issued2_0 9604106 11794129 10897824 - 3 inst_issued1_1 20563934 25942062 23881059 - 3 inst_issued2_1 8589675 11093503 10106837 - 3 inst_executed 59680363 73579622 68902285 - 3 warps_launched 84 104 97 - 3 threads_launched 2688 3328 3114 - 3 thread_inst_executed_0 249056800 304278737 283747806 - 3 thread_inst_executed_2 92859061 113913291 105919392 - 3 thread_inst_executed_1 221219899 288197403 262018027 - 3 thread_inst_executed_3 82724486 106819269 97708339 - 3 active_warps 1321386620 1577796030 1489214672 - 3 tex0_cache_sector_queries 0 0 0 - 3 tex0_cache_sector_misses 0 0 0 - 3 tex1_cache_sector_queries 0 0 0 - 3 tex1_cache_sector_misses 0 0 0 - 3 sm_cta_launched 92 116 106 - 3 l1_local_load_hit 0 0 0 - 3 l1_local_load_miss 0 0 0 - 3 l1_local_store_hit 0 0 0 - 3 l1_local_store_miss 0 0 0 - 3 l1_global_load_hit 2704024 3063372 2903425 - 3 l1_global_load_miss 9821720 12572612 11328892 - 3 uncached_global_load_transaction 0 0 0 - 3 global_store_transaction 12058624 14680064 13686101 - 3 l1_shared_bank_conflict 0 0 0 - 3 prof_trigger_00 0 0 0 - 3 prof_trigger_01 0 0 0 - 3 prof_trigger_02 0 0 0 - 3 prof_trigger_03 0 0 0 - 3 prof_trigger_04 0 0 0 - 3 prof_trigger_05 0 0 0 - 3 prof_trigger_06 0 0 0 - 3 prof_trigger_07 0 0 0 - 3 fb_subp0_read_sectors 3219526 3765779 3496669 - 3 fb_subp1_read_sectors 3167769 3754165 3526042 - 3 fb_subp0_write_sectors 5412249 6719856 6269930 - 3 fb_subp1_write_sectors 5405456 6709651 6264910 - 3 l2_subp0_write_sector_misses 5412249 6719856 6269930 - 3 l2_subp1_write_sector_misses 5405456 6709651 6264910 - 3 l2_subp0_read_sector_misses 3230680 3792245 3512725 - 3 l2_subp1_read_sector_misses 3160202 3757802 3527230 - 3 l2_subp0_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp1_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp0_read_sector_queries 17702912 22331116 20648760 - 3 l2_subp1_read_sector_queries 17952544 21878148 20532408 - 3 l2_subp0_read_tex_sector_queries 0 0 0 - 3 l2_subp1_read_tex_sector_queries 0 0 0 - 3 l2_subp0_read_hit_sectors 14648052 18676660 17062098 - 3 l2_subp1_read_hit_sectors 338644 423088 374326 - 3 l2_subp0_read_tex_hit_sectors 0 0 0 - 3 l2_subp1_read_tex_hit_sectors 0 0 0 - 3 elapsed_cycles_sm 222496104 238209948 232698548 - 3 l2_subp0_read_sysmem_sector_queries 0 0 0 - 3 l2_subp1_read_sysmem_sector_queries 0 0 0 - 3 l2_subp0_write_sysmem_sector_queries 0 0 0 - 3 l2_subp1_write_sysmem_sector_queries 0 0 0 - 3 l2_subp0_total_read_sector_queries 17678822 21576459 20276726 - 3 l2_subp1_total_read_sector_queries 17767588 21758716 20407125 - 3 l2_subp0_total_write_sector_queries 5486595 6803456 6353580 - 3 l2_subp1_total_write_sector_queries 5486594 6803469 6353584 - 3 atom_count 0 0 0 - 3 gred_count 0 0 0 - 3 gld_inst_8bit 10975863 13610234 12710259 - 3 gld_inst_16bit 0 0 0 - 3 gld_inst_32bit 0 0 0 - 3 gld_inst_64bit 0 0 0 - 3 gld_inst_128bit 0 0 0 - 3 gst_inst_8bit 10973184 13606912 12707157 - 3 gst_inst_16bit 0 0 0 - 3 gst_inst_32bit 0 0 0 - 3 gst_inst_64bit 0 0 0 - 3 gst_inst_128bit 0 0 0 - Kernel: void mtf_2buffers_depth32(unsigned char const *, unsigned char*, int, int) - 3 local_load 0 0 0 - 3 local_store 0 0 0 - 3 gld_request 10979960 13610234 12711625 - 3 gst_request 10977280 13606912 12708522 - 3 shared_load 10977280 13606912 12708522 - 3 shared_store 21957240 27215300 25418839 - 3 branch 345720 428541 400246 - 3 divergent_branch 0 0 0 - 3 active_cycles 100996256 124624322 116559607 - 3 inst_issued1_0 50295323 62381564 58249807 - 3 inst_issued2_0 34309488 42553978 39737537 - 3 inst_issued1_1 50295926 62306108 58199803 - 3 inst_issued2_1 34309488 42502772 39703400 - 3 inst_executed 237762900 294719562 275260848 - 3 warps_launched 1340 1664 1553 - 3 threads_launched 42880 53248 49706 - 3 thread_inst_executed_0 2706306880 3356628608 3134469013 - 3 thread_inst_executed_2 1097899520 1361723200 1271597098 - 3 thread_inst_executed_1 2706306880 3352589568 3131776320 - 3 thread_inst_executed_3 1097899520 1360084608 1270504704 - 3 active_warps 3020115582 3772280659 3514790754 - 3 tex0_cache_sector_queries 0 0 0 - 3 tex0_cache_sector_misses 0 0 0 - 3 tex1_cache_sector_queries 0 0 0 - 3 tex1_cache_sector_misses 0 0 0 - 3 sm_cta_launched 336 416 388 - 3 l1_local_load_hit 0 0 0 - 3 l1_local_load_miss 0 0 0 - 3 l1_local_store_hit 0 0 0 - 3 l1_local_store_miss 0 0 0 - 3 l1_global_load_hit 8615616 10587884 9826764 - 3 l1_global_load_miss 2650548 2891888 2800570 - 3 uncached_global_load_transaction 0 0 0 - 3 global_store_transaction 11010048 13631488 12724906 - 3 l1_shared_bank_conflict 0 0 0 - 3 prof_trigger_00 0 0 0 - 3 prof_trigger_01 0 0 0 - 3 prof_trigger_02 0 0 0 - 3 prof_trigger_03 0 0 0 - 3 prof_trigger_04 0 0 0 - 3 prof_trigger_05 0 0 0 - 3 prof_trigger_06 0 0 0 - 3 prof_trigger_07 0 0 0 - 3 fb_subp0_read_sectors 285448 354456 331039 - 3 fb_subp1_read_sectors 286744 354669 331711 - 3 fb_subp0_write_sectors 4263028 5307367 4946433 - 3 fb_subp1_write_sectors 4262734 5302856 4949259 - 3 l2_subp0_write_sector_misses 4263028 5307367 4946433 - 3 l2_subp1_write_sector_misses 4262734 5302856 4949259 - 3 l2_subp0_read_sector_misses 291256 353624 332572 - 3 l2_subp1_read_sector_misses 285593 353963 330956 - 3 l2_subp0_write_sector_queries 5488640 6803456 6354261 - 3 l2_subp1_write_sector_queries 5488640 6803456 6354261 - 3 l2_subp0_read_sector_queries 4997612 6546240 5740097 - 3 l2_subp1_read_sector_queries 4939716 5961968 5574276 - 3 l2_subp0_read_tex_sector_queries 0 0 0 - 3 l2_subp1_read_tex_sector_queries 0 0 0 - 3 l2_subp0_read_hit_sectors 4835616 6021512 5580856 - 3 l2_subp1_read_hit_sectors 10044 12024 11330 - 3 l2_subp0_read_tex_hit_sectors 0 0 0 - 3 l2_subp1_read_tex_hit_sectors 0 0 0 - 3 elapsed_cycles_sm 101122908 124778112 116844828 - 3 l2_subp0_read_sysmem_sector_queries 0 0 0 - 3 l2_subp1_read_sysmem_sector_queries 0 0 0 - 3 l2_subp0_write_sysmem_sector_queries 0 0 0 - 3 l2_subp1_write_sysmem_sector_queries 0 0 0 - 3 l2_subp0_total_read_sector_queries 5345956 6013773 5579732 - 3 l2_subp1_total_read_sector_queries 4959317 6053861 5632684 - 3 l2_subp0_total_write_sector_queries 5488645 6803456 6354263 - 3 l2_subp1_total_write_sector_queries 5488651 6803457 6354269 - 3 atom_count 0 0 0 - 3 gred_count 0 0 0 - 3 gld_inst_8bit 351358720 435527488 406772010 - 3 gld_inst_16bit 0 0 0 - 3 gld_inst_32bit 0 0 0 - 3 gld_inst_64bit 0 0 0 - 3 gld_inst_128bit 0 0 0 - 3 gst_inst_8bit 351272960 435421184 406672725 - 3 gst_inst_16bit 0 0 0 - 3 gst_inst_32bit 0 0 0 - 3 gst_inst_64bit 0 0 0 - 3 gst_inst_128bit 0 0 0 - Kernel: void mtf_2buffers_depth32(unsigned char const *, unsigned char*, int, int) - 3 local_load 0 0 0 - 3 local_store 0 0 0 - 3 gld_request 10975863 13618428 12712991 - 3 gst_request 10973184 13615104 12709888 - 3 shared_load 10973184 13615104 12709888 - 3 shared_store 21949047 27223494 25418840 - 3 branch 230397 285864 266859 - 3 divergent_branch 0 0 0 - 3 active_cycles 103232224 126165236 118309840 - 3 inst_issued1_0 58750246 72806674 67991367 - 3 inst_issued2_0 27811129 34468346 32187057 - 3 inst_issued1_1 58638899 72834951 67973194 - 3 inst_issued2_1 27748914 34468346 32166319 - 3 inst_executed 228408888 283400916 264558692 - 3 warps_launched 896 1108 1036 - 3 threads_launched 28672 35456 33152 - 3 thread_inst_executed_0 2768682464 3431431680 3204321429 - 3 thread_inst_executed_2 889952032 1102982976 1029981738 - 3 thread_inst_executed_1 2762488768 3431431680 3202256864 - 3 thread_inst_executed_3 887961152 1102982976 1029318112 - 3 active_warps 3003260626 3752572587 3495053626 - 3 tex0_cache_sector_queries 0 0 0 - 3 tex0_cache_sector_misses 0 0 0 - 3 tex1_cache_sector_queries 0 0 0 - 3 tex1_cache_sector_misses 0 0 0 - 3 sm_cta_launched 224 276 257 - 3 l1_local_load_hit 0 0 0 - 3 l1_local_load_miss 0 0 0 - 3 l1_local_store_hit 0 0 0 - 3 l1_local_store_miss 0 0 0 - 3 l1_global_load_hit 8040104 9463760 8971945 - 3 l1_global_load_miss 2967040 4030736 3544804 - 3 uncached_global_load_transaction 0 0 0 - 3 global_store_transaction 10862592 13762560 12730368 - 3 l1_shared_bank_conflict 0 0 0 - 3 prof_trigger_00 0 0 0 - 3 prof_trigger_01 0 0 0 - 3 prof_trigger_02 0 0 0 - 3 prof_trigger_03 0 0 0 - 3 prof_trigger_04 0 0 0 - 3 prof_trigger_05 0 0 0 - 3 prof_trigger_06 0 0 0 - 3 prof_trigger_07 0 0 0 - 3 fb_subp0_read_sectors 288505 358217 334564 - 3 fb_subp1_read_sectors 293977 358525 336536 - 3 fb_subp0_write_sectors 4722490 5910595 5506246 - 3 fb_subp1_write_sectors 4731101 5916645 5510816 - 3 l2_subp0_write_sector_misses 4722490 5910595 5506246 - 3 l2_subp1_write_sector_misses 4731101 5916645 5510816 - 3 l2_subp0_read_sector_misses 294039 358572 336644 - 3 l2_subp1_read_sector_misses 288357 358425 334583 - 3 l2_subp0_write_sector_queries 5486592 6807552 6354944 - 3 l2_subp1_write_sector_queries 5486592 6807552 6354944 - 3 l2_subp0_read_sector_queries 6015040 7798332 7164610 - 3 l2_subp1_read_sector_queries 5837008 8304360 7301926 - 3 l2_subp0_read_tex_sector_queries 0 0 0 - 3 l2_subp1_read_tex_sector_queries 0 0 0 - 3 l2_subp0_read_hit_sectors 5858828 7826292 7062769 - 3 l2_subp1_read_hit_sectors 12120 15152 14046 - 3 l2_subp0_read_tex_hit_sectors 0 0 0 - 3 l2_subp1_read_tex_hit_sectors 0 0 0 - 3 elapsed_cycles_sm 103627140 127724560 119658908 - 3 l2_subp0_read_sysmem_sector_queries 0 0 0 - 3 l2_subp1_read_sysmem_sector_queries 0 0 0 - 3 l2_subp0_write_sysmem_sector_queries 0 0 0 - 3 l2_subp1_write_sysmem_sector_queries 0 0 0 - 3 l2_subp0_total_read_sector_queries 5872981 8131737 7205998 - 3 l2_subp1_total_read_sector_queries 5919417 8360809 7528221 - 3 l2_subp0_total_write_sector_queries 5486597 6807554 6354946 - 3 l2_subp1_total_write_sector_queries 5486593 6807564 6354948 - 3 atom_count 0 0 0 - 3 gred_count 0 0 0 - 3 gld_inst_8bit 351227616 435789696 406815712 - 3 gld_inst_16bit 0 0 0 - 3 gld_inst_32bit 0 0 0 - 3 gld_inst_64bit 0 0 0 - 3 gld_inst_128bit 0 0 0 - 3 gst_inst_8bit 351141888 435683328 406716416 - 3 gst_inst_16bit 0 0 0 - 3 gst_inst_32bit 0 0 0 - 3 gst_inst_64bit 0 0 0 - 3 gst_inst_128bit 0 0 0 - Kernel: void mtf_2buffers_depth32(unsigned char const *, unsigned char*, int, int) - 3 local_load 0 0 0 - 3 local_store 0 0 0 - 3 gld_request 10979960 13618428 12717088 - 3 gst_request 10977280 13615104 12713984 - 3 shared_load 10977280 13615104 12713984 - 3 shared_store 21957240 27223494 25424302 - 3 branch 172862 214399 200209 - 3 divergent_branch 0 0 0 - 3 active_cycles 110498667 133213952 125641290 - 3 inst_issued1_0 45333406 55821585 52275467 - 3 inst_issued2_0 34907800 43348160 40465238 - 3 inst_issued1_1 45341136 55700865 52181745 - 3 inst_issued2_1 34907800 43243960 40395772 - 3 inst_executed 226257008 280626216 262052884 - 3 warps_launched 672 832 777 - 3 threads_launched 21504 26624 24874 - 3 thread_inst_executed_0 2503066624 3108285440 2901564608 - 3 thread_inst_executed_2 1117045504 1387137024 1294883541 - 3 thread_inst_executed_1 2503066624 3100813824 2896583530 - 3 thread_inst_executed_3 1117045504 1383802624 1292660608 - 3 active_warps 3142705067 3891079197 3631407707 - 3 tex0_cache_sector_queries 0 0 0 - 3 tex0_cache_sector_misses 0 0 0 - 3 tex1_cache_sector_queries 0 0 0 - 3 tex1_cache_sector_misses 0 0 0 - 3 sm_cta_launched 168 212 196 - 3 l1_local_load_hit 0 0 0 - 3 l1_local_load_miss 0 0 0 - 3 l1_local_store_hit 0 0 0 - 3 l1_local_store_miss 0 0 0 - 3 l1_global_load_hit 5542412 7016988 6318906 - 3 l1_global_load_miss 5780204 6993844 6477089 - 3 uncached_global_load_transaction 0 0 0 - 3 global_store_transaction 11010048 13631488 12757674 - 3 l1_shared_bank_conflict 0 0 0 - 3 prof_trigger_00 0 0 0 - 3 prof_trigger_01 0 0 0 - 3 prof_trigger_02 0 0 0 - 3 prof_trigger_03 0 0 0 - 3 prof_trigger_04 0 0 0 - 3 prof_trigger_05 0 0 0 - 3 prof_trigger_06 0 0 0 - 3 prof_trigger_07 0 0 0 - 3 fb_subp0_read_sectors 454765 579157 528884 - 3 fb_subp1_read_sectors 450063 550054 504798 - 3 fb_subp0_write_sectors 5085192 6315352 5895001 - 3 fb_subp1_write_sectors 5092098 6294277 5884792 - 3 l2_subp0_write_sector_misses 5085192 6315352 5895001 - 3 l2_subp1_write_sector_misses 5092098 6294277 5884792 - 3 l2_subp0_read_sector_misses 469913 562145 525639 - 3 l2_subp1_read_sector_misses 477816 546642 504235 - 3 l2_subp0_write_sector_queries 5488640 6807552 6356992 - 3 l2_subp1_write_sector_queries 5488640 6807552 6356992 - 3 l2_subp0_read_sector_queries 11609188 13886288 13084114 - 3 l2_subp1_read_sector_queries 11427624 14112340 13162773 - 3 l2_subp0_read_tex_sector_queries 0 0 0 - 3 l2_subp1_read_tex_sector_queries 0 0 0 - 3 l2_subp0_read_hit_sectors 11226276 13935340 12776013 - 3 l2_subp1_read_hit_sectors 63296 75264 69598 - 3 l2_subp0_read_tex_hit_sectors 0 0 0 - 3 l2_subp1_read_tex_hit_sectors 0 0 0 - 3 elapsed_cycles_sm 111501992 137156088 128489306 - 3 l2_subp0_read_sysmem_sector_queries 0 0 0 - 3 l2_subp1_read_sysmem_sector_queries 0 0 0 - 3 l2_subp0_write_sysmem_sector_queries 0 0 0 - 3 l2_subp1_write_sysmem_sector_queries 0 0 0 - 3 l2_subp0_total_read_sector_queries 11637022 13329801 12748966 - 3 l2_subp1_total_read_sector_queries 11313586 14122075 13140038 - 3 l2_subp0_total_write_sector_queries 5488644 6807552 6356994 - 3 l2_subp1_total_write_sector_queries 5488641 6807553 6356993 - 3 atom_count 0 0 0 - 3 gred_count 0 0 0 - 3 gld_inst_8bit 351358720 435789696 406946816 - 3 gld_inst_16bit 0 0 0 - 3 gld_inst_32bit 0 0 0 - 3 gld_inst_64bit 0 0 0 - 3 gld_inst_128bit 0 0 0 - 3 gst_inst_8bit 351272960 435683328 406847488 - 3 gst_inst_16bit 0 0 0 - 3 gst_inst_32bit 0 0 0 - 3 gst_inst_64bit 0 0 0 - 3 gst_inst_128bit 0 0 0 - Kernel: void mtf_2buffers(unsigned char const *, unsigned char*, int, int) - 3 local_load 0 0 0 - 3 local_store 0 0 0 - 3 gld_request 10979844 13610088 12711484 - 3 gst_request 10977280 13606912 12708522 - 3 shared_load 12245280 15095415 14123142 - 3 shared_store 22578700 27947650 26114241 - 3 branch 14177757 17451776 16333853 - 3 divergent_branch 0 0 0 - 3 active_cycles 131465081 162008592 151774223 - 3 inst_issued1_0 72366253 89094718 83326416 - 3 inst_issued2_0 26758923 33089371 30917188 - 3 inst_issued1_1 72123430 88675825 83082273 - 3 inst_issued2_1 26723050 33011070 30871655 - 3 inst_executed 250833865 309251253 289310656 - 3 warps_launched 1340 1664 1553 - 3 threads_launched 42880 53248 49706 - 3 thread_inst_executed_0 3162142944 3898258592 3644965674 - 3 thread_inst_executed_2 856282464 1058856800 989346954 - 3 thread_inst_executed_1 3153123744 3882573536 3635738485 - 3 thread_inst_executed_3 855134528 1056351168 987889898 - 3 active_warps 2826896053 3494732963 3267146025 - 3 tex0_cache_sector_queries 0 0 0 - 3 tex0_cache_sector_misses 0 0 0 - 3 tex1_cache_sector_queries 0 0 0 - 3 tex1_cache_sector_misses 0 0 0 - 3 sm_cta_launched 336 416 388 - 3 l1_local_load_hit 0 0 0 - 3 l1_local_load_miss 0 0 0 - 3 l1_local_store_hit 0 0 0 - 3 l1_local_store_miss 0 0 0 - 3 l1_global_load_hit 8974080 10811412 10145086 - 3 l1_global_load_miss 2518740 3010768 2742472 - 3 uncached_global_load_transaction 0 0 0 - 3 global_store_transaction 10878976 13533184 12648448 - 3 l1_shared_bank_conflict 0 0 0 - 3 prof_trigger_00 0 0 0 - 3 prof_trigger_01 0 0 0 - 3 prof_trigger_02 0 0 0 - 3 prof_trigger_03 0 0 0 - 3 prof_trigger_04 0 0 0 - 3 prof_trigger_05 0 0 0 - 3 prof_trigger_06 0 0 0 - 3 prof_trigger_07 0 0 0 - 3 fb_subp0_read_sectors 329389 404860 379345 - 3 fb_subp1_read_sectors 324328 403265 376053 - 3 fb_subp0_write_sectors 4785842 5931881 5544451 - 3 fb_subp1_write_sectors 4792820 5932539 5549035 - 3 l2_subp0_write_sector_misses 4785842 5931881 5544451 - 3 l2_subp1_write_sector_misses 4792820 5932539 5549035 - 3 l2_subp0_read_sector_misses 323834 406049 377842 - 3 l2_subp1_read_sector_misses 329758 405793 379354 - 3 l2_subp0_write_sector_queries 5488640 6803456 6354261 - 3 l2_subp1_write_sector_queries 5488640 6803456 6354261 - 3 l2_subp0_read_sector_queries 4805632 5907728 5498740 - 3 l2_subp1_read_sector_queries 4596464 5735476 5353820 - 3 l2_subp0_read_tex_sector_queries 0 0 0 - 3 l2_subp1_read_tex_sector_queries 0 0 0 - 3 l2_subp0_read_hit_sectors 4404192 5728068 5203058 - 3 l2_subp1_read_hit_sectors 8452 10488 9628 - 3 l2_subp0_read_tex_hit_sectors 0 0 0 - 3 l2_subp1_read_tex_hit_sectors 0 0 0 - 3 elapsed_cycles_sm 140325288 169774620 159946849 - 3 l2_subp0_read_sysmem_sector_queries 0 0 0 - 3 l2_subp1_read_sysmem_sector_queries 0 0 0 - 3 l2_subp0_write_sysmem_sector_queries 0 0 0 - 3 l2_subp1_write_sysmem_sector_queries 0 0 0 - 3 l2_subp0_total_read_sector_queries 4495787 5956934 5357334 - 3 l2_subp1_total_read_sector_queries 4422886 5890630 5351790 - 3 l2_subp0_total_write_sector_queries 5488645 6803456 6354264 - 3 l2_subp1_total_write_sector_queries 5488641 6803468 6354266 - 3 atom_count 0 0 0 - 3 gred_count 0 0 0 - 3 gld_inst_8bit 351355008 435522816 406767509 - 3 gld_inst_16bit 0 0 0 - 3 gld_inst_32bit 0 0 0 - 3 gld_inst_64bit 0 0 0 - 3 gld_inst_128bit 0 0 0 - 3 gst_inst_8bit 10977280 13606912 12708522 - 3 gst_inst_16bit 0 0 0 - 3 gst_inst_32bit 0 0 0 - 3 gst_inst_64bit 0 0 0 - 3 gst_inst_128bit 0 0 0 - Kernel: bsc_st567_encode_cuda_postsort(unsigned char*, __int64*, int, __int64, int*) - 12 local_load 0 0 0 - 12 local_store 0 0 0 - 12 gld_request 852126 1059228 989864 - 12 gst_request 852126 1059228 989864 - 12 shared_load 0 0 0 - 12 shared_store 0 0 0 - 12 branch 852894 1059996 990632 - 12 divergent_branch 0 0 0 - 12 active_cycles 19619819 24847753 23086016 - 12 inst_issued1_0 4452453 5595996 5209445 - 12 inst_issued2_0 1706686 2120952 1982167 - 12 inst_issued1_1 4451180 5588565 5211864 - 12 inst_issued2_1 1706687 2120952 1982169 - 12 inst_executed 13645536 16959168 15849344 - 12 warps_launched 384 384 384 - 12 threads_launched 12288 12288 12288 - 12 thread_inst_executed_0 163731072 203494656 190176768 - 12 thread_inst_executed_2 54597504 67852032 63412736 - 12 thread_inst_executed_1 163731072 203494656 190176768 - 12 thread_inst_executed_3 54597504 67852032 63412736 - 12 active_warps 899952316 1133744397 1055296356 - 12 tex0_cache_sector_queries 0 0 0 - 12 tex0_cache_sector_misses 0 0 0 - 12 tex1_cache_sector_queries 0 0 0 - 12 tex1_cache_sector_misses 0 0 0 - 12 sm_cta_launched 64 64 64 - 12 l1_local_load_hit 0 0 0 - 12 l1_local_load_miss 0 0 0 - 12 l1_local_store_hit 0 0 0 - 12 l1_local_store_miss 0 0 0 - 12 l1_global_load_hit 0 0 0 - 12 l1_global_load_miss 1704288 2118480 1979760 - 12 uncached_global_load_transaction 0 0 0 - 12 global_store_transaction 852144 1059240 989880 - 12 l1_shared_bank_conflict 0 0 0 - 12 prof_trigger_00 0 0 0 - 12 prof_trigger_01 0 0 0 - 12 prof_trigger_02 0 0 0 - 12 prof_trigger_03 0 0 0 - 12 prof_trigger_04 0 0 0 - 12 prof_trigger_05 0 0 0 - 12 prof_trigger_06 0 0 0 - 12 prof_trigger_07 0 0 0 - 12 fb_subp0_read_sectors 3425038 4265076 3984670 - 12 fb_subp1_read_sectors 3425529 4265194 3984335 - 12 fb_subp0_write_sectors 426063 529613 494931 - 12 fb_subp1_write_sectors 426063 529623 494934 - 12 l2_subp0_write_sector_misses 426063 529613 494931 - 12 l2_subp1_write_sector_misses 426063 529624 494934 - 12 l2_subp0_read_sector_misses 3426488 4265260 3984236 - 12 l2_subp1_read_sector_misses 3425506 4265207 3984628 - 12 l2_subp0_write_sector_queries 426064 529612 494931 - 12 l2_subp1_write_sector_queries 426064 529612 494931 - 12 l2_subp0_read_sector_queries 3408504 4236912 3959456 - 12 l2_subp1_read_sector_queries 3408504 4236912 3959456 - 12 l2_subp0_read_tex_sector_queries 0 0 0 - 12 l2_subp1_read_tex_sector_queries 0 0 0 - 12 l2_subp0_read_hit_sectors 0 1 0 - 12 l2_subp1_read_hit_sectors 0 0 0 - 12 l2_subp0_read_tex_hit_sectors 0 0 0 - 12 l2_subp1_read_tex_hit_sectors 0 0 0 - 12 elapsed_cycles_sm 19813936 25024344 23239038 - 12 l2_subp0_read_sysmem_sector_queries 0 0 0 - 12 l2_subp1_read_sysmem_sector_queries 0 0 0 - 12 l2_subp0_write_sysmem_sector_queries 0 0 0 - 12 l2_subp1_write_sysmem_sector_queries 0 0 0 - 12 l2_subp0_total_read_sector_queries 3408552 4236961 3959504 - 12 l2_subp1_total_read_sector_queries 3408553 4240003 3959942 - 12 l2_subp0_total_write_sector_queries 426063 529616 494933 - 12 l2_subp1_total_write_sector_queries 426064 529617 494935 - 12 atom_count 0 0 0 - 12 gred_count 1 1 1 - 12 gld_inst_8bit 0 0 0 - 12 gld_inst_16bit 0 0 0 - 12 gld_inst_32bit 0 0 0 - 12 gld_inst_64bit 27268032 33895296 31675648 - 12 gld_inst_128bit 0 0 0 - 12 gst_inst_8bit 27268032 33895296 31675648 - 12 gst_inst_16bit 0 0 0 - 12 gst_inst_32bit 0 0 0 - 12 gst_inst_64bit 0 0 0 - 12 gst_inst_128bit 0 0 0 - Kernel: void cub::DeviceRadixSortDownsweepKernel::Policy520, bool=0, bool=0, __int64, unsigned char, int>(cub::DeviceRadixSortPolicy<__int64, unsigned char, int>::Policy520*, cub::DeviceRadixSortPolicy<__int64, unsigned char, int>::Policy520, bool=0*, bool=0, bool=0*, bool=0*, int, int, cub::GridEvenShare) - 36 local_load 0 0 0 - 36 local_store 0 0 0 - 36 gld_request 1704506 2118706 1979982 - 36 gst_request 1704244 2118452 1979724 - 36 shared_load 11007338 13682330 12786429 - 36 shared_store 10190342 12666865 11837435 - 36 branch 144904 179420 167860 - 36 divergent_branch 0 0 0 - 36 active_cycles 61515346 79527643 72695563 - 36 inst_issued1_0 25841759 33355006 30489711 - 36 inst_issued2_0 8168372 10153064 9488384 - 36 inst_issued1_1 24460091 31515963 28873332 - 36 inst_issued2_1 8168148 10152855 9488168 - 36 inst_executed 73406613 91242976 85269546 - 36 warps_launched 480 480 480 - 36 threads_launched 15360 15360 15360 - 36 thread_inst_executed_0 917130912 1139969024 1065340224 - 36 thread_inst_executed_2 261384192 324894336 303624576 - 36 thread_inst_executed_1 909120000 1130025216 1056043776 - 36 thread_inst_executed_3 261376512 324886656 303616896 - 36 active_warps 1151346545 1497553734 1364525898 - 36 tex0_cache_sector_queries 0 0 0 - 36 tex0_cache_sector_misses 0 0 0 - 36 tex1_cache_sector_queries 0 0 0 - 36 tex1_cache_sector_misses 0 0 0 - 36 sm_cta_launched 120 120 120 - 36 l1_local_load_hit 0 0 0 - 36 l1_local_load_miss 0 0 0 - 36 l1_local_store_hit 0 0 0 - 36 l1_local_store_miss 0 0 0 - 36 l1_global_load_hit 641260 796688 744579 - 36 l1_global_load_miss 1922512 2388996 2232716 - 36 uncached_global_load_transaction 0 0 0 - 36 global_store_transaction 4658408 6798660 5730031 - 36 l1_shared_bank_conflict 6216512 9117332 7891107 - 36 prof_trigger_00 0 0 0 - 36 prof_trigger_01 0 0 0 - 36 prof_trigger_02 0 0 0 - 36 prof_trigger_03 0 0 0 - 36 prof_trigger_04 0 0 0 - 36 prof_trigger_05 0 0 0 - 36 prof_trigger_06 0 0 0 - 36 prof_trigger_07 0 0 0 - 36 fb_subp0_read_sectors 3902361 4861316 4538534 - 36 fb_subp1_read_sectors 3902550 4862236 4538710 - 36 fb_subp0_write_sectors 4085935 5514008 4885890 - 36 fb_subp1_write_sectors 4087203 5511267 4886014 - 36 l2_subp0_write_sector_misses 4085935 5514008 4885890 - 36 l2_subp1_write_sector_misses 4087203 5511267 4886014 - 36 l2_subp0_read_sector_misses 3902680 4861892 4537259 - 36 l2_subp1_read_sector_misses 3903666 4862340 4539468 - 36 l2_subp0_write_sector_queries 5004731 6686639 5970592 - 36 l2_subp1_write_sector_queries 5004777 6685230 5970543 - 36 l2_subp0_read_sector_queries 3844732 4777196 4464425 - 36 l2_subp1_read_sector_queries 3846012 4778204 4466336 - 36 l2_subp0_read_tex_sector_queries 0 0 0 - 36 l2_subp1_read_tex_sector_queries 0 0 0 - 36 l2_subp0_read_hit_sectors 4920 8068 6115 - 36 l2_subp1_read_hit_sectors 132 312 208 - 36 l2_subp0_read_tex_hit_sectors 0 0 0 - 36 l2_subp1_read_tex_hit_sectors 0 0 0 - 36 elapsed_cycles_sm 61620352 79866208 73011723 - 36 l2_subp0_read_sysmem_sector_queries 0 0 0 - 36 l2_subp1_read_sysmem_sector_queries 0 3 0 - 36 l2_subp0_write_sysmem_sector_queries 0 0 0 - 36 l2_subp1_write_sysmem_sector_queries 0 0 0 - 36 l2_subp0_total_read_sector_queries 3845769 4778370 4465272 - 36 l2_subp1_total_read_sector_queries 3846228 4778413 4466689 - 36 l2_subp0_total_write_sector_queries 5005269 6685196 5970646 - 36 l2_subp1_total_write_sector_queries 5005156 6685281 5970903 - 36 atom_count 0 0 0 - 36 gred_count 0 0 0 - 36 gld_inst_8bit 27268608 33895680 31676160 - 36 gld_inst_16bit 0 0 0 - 36 gld_inst_32bit 7680 7680 7680 - 36 gld_inst_64bit 27267887 33895203 31675564 - 36 gld_inst_128bit 0 0 0 - 36 gst_inst_8bit 27267887 33895203 31675564 - 36 gst_inst_16bit 0 0 0 - 36 gst_inst_32bit 0 0 0 - 36 gst_inst_64bit 27267887 33895203 31675564 - 36 gst_inst_128bit 0 0 0 - Kernel: void cub::DeviceRadixSortUpsweepKernel::Policy520, bool=0, bool=0, __int64, int>(cub::DeviceRadixSortPolicy<__int64, unsigned char, int>::Policy520*, bool=0*, cub::DeviceRadixSortPolicy<__int64, unsigned char, int>::Policy520*, int, int, cub::GridEvenShare::Policy520*>) - 36 local_load 0 0 0 - 36 local_store 0 0 0 - 36 gld_request 852122 1059226 989862 - 36 gst_request 120 120 120 - 36 shared_load 978842 1201306 1126822 - 36 shared_store 886682 1097626 1026982 - 36 branch 86394 105130 98884 - 36 divergent_branch 1 2 1 - 36 active_cycles 14020880 17459716 16298723 - 36 inst_issued1_0 3304223 4099083 3826317 - 36 inst_issued2_0 2633611 3289496 3069578 - 36 inst_issued1_1 3291353 4087461 3813667 - 36 inst_issued2_1 2633251 3289139 3069222 - 36 inst_executed 15670197 19455177 18187922 - 36 warps_launched 480 480 480 - 36 threads_launched 15360 15360 15360 - 36 thread_inst_executed_0 166584672 206156032 192917184 - 36 thread_inst_executed_2 84270944 105259264 98221909 - 36 thread_inst_executed_1 166330807 205902128 192663396 - 36 thread_inst_executed_3 84259203 105247512 98210215 - 36 active_warps 305687761 381124769 355492240 - 36 tex0_cache_sector_queries 0 0 0 - 36 tex0_cache_sector_misses 0 0 0 - 36 tex1_cache_sector_queries 0 0 0 - 36 tex1_cache_sector_misses 0 0 0 - 36 sm_cta_launched 120 120 120 - 36 l1_local_load_hit 0 0 0 - 36 l1_local_load_miss 0 0 0 - 36 l1_local_store_hit 0 0 0 - 36 l1_local_store_miss 0 0 0 - 36 l1_global_load_hit 0 0 0 - 36 l1_global_load_miss 1704192 2118528 1979777 - 36 uncached_global_load_transaction 0 0 0 - 36 global_store_transaction 3840 3840 3840 - 36 l1_shared_bank_conflict 0 0 0 - 36 prof_trigger_00 0 0 0 - 36 prof_trigger_01 0 0 0 - 36 prof_trigger_02 0 0 0 - 36 prof_trigger_03 0 0 0 - 36 prof_trigger_04 0 0 0 - 36 prof_trigger_05 0 0 0 - 36 prof_trigger_06 0 0 0 - 36 prof_trigger_07 0 0 0 - 36 fb_subp0_read_sectors 3419129 4256856 3976817 - 36 fb_subp1_read_sectors 3419163 4256873 3977166 - 36 fb_subp0_write_sectors 686 954 816 - 36 fb_subp1_write_sectors 814 1157 944 - 36 l2_subp0_write_sector_misses 686 954 816 - 36 l2_subp1_write_sector_misses 814 1157 944 - 36 l2_subp0_read_sector_misses 3420481 4256820 3977387 - 36 l2_subp1_read_sector_misses 3419211 4256837 3977067 - 36 l2_subp0_write_sector_queries 1920 1920 1920 - 36 l2_subp1_write_sector_queries 1920 1920 1920 - 36 l2_subp0_read_sector_queries 3408484 4236900 3959446 - 36 l2_subp1_read_sector_queries 3408484 4236904 3959447 - 36 l2_subp0_read_tex_sector_queries 0 0 0 - 36 l2_subp1_read_tex_sector_queries 0 0 0 - 36 l2_subp0_read_hit_sectors 0 37 1 - 36 l2_subp1_read_hit_sectors 0 16 0 - 36 l2_subp0_read_tex_hit_sectors 0 0 0 - 36 l2_subp1_read_tex_hit_sectors 0 0 0 - 36 elapsed_cycles_sm 14098432 17593456 16396381 - 36 l2_subp0_read_sysmem_sector_queries 0 0 0 - 36 l2_subp1_read_sysmem_sector_queries 0 0 0 - 36 l2_subp0_write_sysmem_sector_queries 0 0 0 - 36 l2_subp1_write_sysmem_sector_queries 0 0 0 - 36 l2_subp0_total_read_sector_queries 3408756 4237185 3959718 - 36 l2_subp1_total_read_sector_queries 3408756 4237176 3959719 - 36 l2_subp0_total_write_sector_queries 1920 1922 1920 - 36 l2_subp1_total_write_sector_queries 1920 1932 1921 - 36 atom_count 0 0 0 - 36 gred_count 0 0 0 - 36 gld_inst_8bit 0 0 0 - 36 gld_inst_16bit 0 0 0 - 36 gld_inst_32bit 0 0 0 - 36 gld_inst_64bit 27267887 33895203 31675564 - 36 gld_inst_128bit 0 0 0 - 36 gst_inst_8bit 0 0 0 - 36 gst_inst_16bit 0 0 0 - 36 gst_inst_32bit 3840 3840 3840 - 36 gst_inst_64bit 0 0 0 - 36 gst_inst_128bit 0 0 0 - Kernel: void cub::DeviceRadixSortUpsweepKernel::Policy520, bool=0, bool=0, __int64, int>(cub::DeviceRadixSortPolicy<__int64, cub::NullType, int>::Policy520*, bool=0*, cub::DeviceRadixSortPolicy<__int64, cub::NullType, int>::Policy520*, int, int, cub::GridEvenShare::Policy520*>) - 96 local_load 0 0 0 - 96 local_store 0 0 0 - 96 gld_request 852122 1059227 989862 - 96 gst_request 192 192 192 - 96 shared_load 968858 1200539 1122982 - 96 shared_store 885914 1099163 1027750 - 96 branch 67808 83145 77996 - 96 divergent_branch 1 2 1 - 96 active_cycles 14166324 17665908 16479555 - 96 inst_issued1_0 2917394 3646393 3397285 - 96 inst_issued2_0 2738323 3402255 3179935 - 96 inst_issued1_1 2897731 3622842 3377257 - 96 inst_issued2_1 2737939 3401879 3179554 - 96 inst_executed 15671929 19463814 18194213 - 96 warps_launched 384 384 384 - 96 threads_launched 12288 12288 12288 - 96 thread_inst_executed_0 163340576 202764512 189564704 - 96 thread_inst_executed_2 87622240 108868064 101754528 - 96 thread_inst_executed_1 162928503 202352574 189152743 - 96 thread_inst_executed_3 87609731 108855689 101742100 - 96 active_warps 215149526 268024475 250151165 - 96 tex0_cache_sector_queries 0 0 0 - 96 tex0_cache_sector_misses 0 0 0 - 96 tex1_cache_sector_queries 0 0 0 - 96 tex1_cache_sector_misses 0 0 0 - 96 sm_cta_launched 192 192 192 - 96 l1_local_load_hit 0 0 0 - 96 l1_local_load_miss 0 0 0 - 96 l1_local_store_hit 0 0 0 - 96 l1_local_store_miss 0 0 0 - 96 l1_global_load_hit 0 4 0 - 96 l1_global_load_miss 1704240 2118528 1979757 - 96 uncached_global_load_transaction 0 0 0 - 96 global_store_transaction 6144 6144 6144 - 96 l1_shared_bank_conflict 0 0 0 - 96 prof_trigger_00 0 0 0 - 96 prof_trigger_01 0 0 0 - 96 prof_trigger_02 0 0 0 - 96 prof_trigger_03 0 0 0 - 96 prof_trigger_04 0 0 0 - 96 prof_trigger_05 0 0 0 - 96 prof_trigger_06 0 0 0 - 96 prof_trigger_07 0 0 0 - 96 fb_subp0_read_sectors 3419481 4257211 3977651 - 96 fb_subp1_read_sectors 3419367 4257401 3977571 - 96 fb_subp0_write_sectors 1269 1804 1581 - 96 fb_subp1_write_sectors 1290 1787 1554 - 96 l2_subp0_write_sector_misses 1269 1804 1581 - 96 l2_subp1_write_sector_misses 1290 1787 1554 - 96 l2_subp0_read_sector_misses 3419153 4257428 3977527 - 96 l2_subp1_read_sector_misses 3419351 4257273 3977641 - 96 l2_subp0_write_sector_queries 3072 3072 3072 - 96 l2_subp1_write_sector_queries 3072 3072 3072 - 96 l2_subp0_read_sector_queries 3408484 4236900 3959446 - 96 l2_subp1_read_sector_queries 3408484 4236904 3959447 - 96 l2_subp0_read_tex_sector_queries 0 0 0 - 96 l2_subp1_read_tex_sector_queries 0 0 0 - 96 l2_subp0_read_hit_sectors 0 85 5 - 96 l2_subp1_read_hit_sectors 0 28 1 - 96 l2_subp0_read_tex_hit_sectors 0 0 0 - 96 l2_subp1_read_tex_hit_sectors 0 0 0 - 96 elapsed_cycles_sm 14245736 17810552 16589653 - 96 l2_subp0_read_sysmem_sector_queries 0 0 0 - 96 l2_subp1_read_sysmem_sector_queries 0 0 0 - 96 l2_subp0_write_sysmem_sector_queries 0 0 0 - 96 l2_subp1_write_sysmem_sector_queries 0 0 0 - 96 l2_subp0_total_read_sector_queries 3408828 4237242 3959780 - 96 l2_subp1_total_read_sector_queries 3408812 4237241 3959776 - 96 l2_subp0_total_write_sector_queries 3072 3074 3072 - 96 l2_subp1_total_write_sector_queries 3072 3083 3072 - 96 atom_count 0 0 0 - 96 gred_count 0 0 0 - 96 gld_inst_8bit 0 0 0 - 96 gld_inst_16bit 0 0 0 - 96 gld_inst_32bit 0 0 0 - 96 gld_inst_64bit 27267887 33895203 31675564 - 96 gld_inst_128bit 0 0 0 - 96 gst_inst_8bit 0 0 0 - 96 gst_inst_16bit 0 0 0 - 96 gst_inst_32bit 6144 6144 6144 - 96 gst_inst_64bit 0 0 0 - 96 gst_inst_128bit 0 0 0 - Kernel: void cub::DeviceRadixSortDownsweepKernel::Policy520, bool=1, bool=0, __int64, unsigned char, int>(cub::DeviceRadixSortPolicy<__int64, unsigned char, int>::Policy520*, cub::DeviceRadixSortPolicy<__int64, unsigned char, int>::Policy520, bool=1*, bool=1, bool=0*, bool=1*, int, int, cub::GridEvenShare) - 3 local_load 0 0 0 - 3 local_store 0 0 0 - 3 gld_request 1704506 2118706 1979982 - 3 gst_request 1704244 2118452 1979724 - 3 shared_load 8734954 10857690 10146749 - 3 shared_store 7917958 9842225 9197755 - 3 branch 144904 179420 167860 - 3 divergent_branch 0 0 0 - 3 active_cycles 58530020 72777449 67998541 - 3 inst_issued1_0 22567790 28058080 26220507 - 3 inst_issued2_0 8522920 10593902 9900322 - 3 inst_issued1_1 21433154 26646231 24904637 - 3 inst_issued2_1 8522710 10593691 9900111 - 3 inst_executed 66872533 83121160 77679490 - 3 warps_launched 480 480 480 - 3 threads_launched 15360 15360 15360 - 3 thread_inst_executed_0 797263440 990970048 926097888 - 3 thread_inst_executed_2 272729728 339001152 316806592 - 3 thread_inst_executed_1 793229184 985969344 921420864 - 3 thread_inst_executed_3 272722048 338993472 316798912 - 3 active_warps 1100639602 1368695693 1278761961 - 3 tex0_cache_sector_queries 0 0 0 - 3 tex0_cache_sector_misses 0 0 0 - 3 tex1_cache_sector_queries 0 0 0 - 3 tex1_cache_sector_misses 0 0 0 - 3 sm_cta_launched 120 120 120 - 3 l1_local_load_hit 0 0 0 - 3 l1_local_load_miss 0 0 0 - 3 l1_local_store_hit 0 0 0 - 3 l1_local_store_miss 0 0 0 - 3 l1_global_load_hit 640196 795380 743408 - 3 l1_global_load_miss 1920372 2386232 2230198 - 3 uncached_global_load_transaction 0 0 0 - 3 global_store_transaction 5340268 6637180 6202846 - 3 l1_shared_bank_conflict 7217788 8982588 8390976 - 3 prof_trigger_00 0 0 0 - 3 prof_trigger_01 0 0 0 - 3 prof_trigger_02 0 0 0 - 3 prof_trigger_03 0 0 0 - 3 prof_trigger_04 0 0 0 - 3 prof_trigger_05 0 0 0 - 3 prof_trigger_06 0 0 0 - 3 prof_trigger_07 0 0 0 - 3 fb_subp0_read_sectors 3896244 4844813 4528523 - 3 fb_subp1_read_sectors 3901764 4849808 4530596 - 3 fb_subp0_write_sectors 4343476 5398337 5045030 - 3 fb_subp1_write_sectors 4343127 5397841 5044837 - 3 l2_subp0_write_sector_misses 4343476 5398337 5045030 - 3 l2_subp1_write_sector_misses 4343127 5397841 5044837 - 3 l2_subp0_read_sector_misses 3898880 4849937 4531310 - 3 l2_subp1_read_sector_misses 3901928 4849912 4530632 - 3 l2_subp0_write_sector_queries 5323230 6616931 6183571 - 3 l2_subp1_write_sector_queries 5323687 6616041 6183564 - 3 l2_subp0_read_sector_queries 3839696 4772380 4459641 - 3 l2_subp1_read_sector_queries 3840880 4772488 4460781 - 3 l2_subp0_read_tex_sector_queries 0 0 0 - 3 l2_subp1_read_tex_sector_queries 0 0 0 - 3 l2_subp0_read_hit_sectors 4236 5120 4566 - 3 l2_subp1_read_hit_sectors 140 192 158 - 3 l2_subp0_read_tex_hit_sectors 0 0 0 - 3 l2_subp1_read_tex_hit_sectors 0 0 0 - 3 elapsed_cycles_sm 58685016 72919132 68133876 - 3 l2_subp0_read_sysmem_sector_queries 0 0 0 - 3 l2_subp1_read_sysmem_sector_queries 0 0 0 - 3 l2_subp0_write_sysmem_sector_queries 0 0 0 - 3 l2_subp1_write_sysmem_sector_queries 0 0 0 - 3 l2_subp0_total_read_sector_queries 3840792 4772819 4460288 - 3 l2_subp1_total_read_sector_queries 3841216 4772593 4461030 - 3 l2_subp0_total_write_sector_queries 5323353 6617052 6183952 - 3 l2_subp1_total_write_sector_queries 5323818 6616405 6183950 - 3 atom_count 0 0 0 - 3 gred_count 0 0 0 - 3 gld_inst_8bit 27268608 33895680 31676160 - 3 gld_inst_16bit 0 0 0 - 3 gld_inst_32bit 3840 3840 3840 - 3 gld_inst_64bit 27267887 33895203 31675564 - 3 gld_inst_128bit 0 0 0 - 3 gst_inst_8bit 27267887 33895203 31675564 - 3 gst_inst_16bit 0 0 0 - 3 gst_inst_32bit 0 0 0 - 3 gst_inst_64bit 27267887 33895203 31675564 - 3 gst_inst_128bit 0 0 0 - Kernel: void cub::DeviceRadixSortDownsweepKernel::Policy520, bool=0, bool=0, __int64, cub::NullType, int>(cub::DeviceRadixSortPolicy<__int64, cub::NullType, int>::Policy520*, cub::DeviceRadixSortPolicy<__int64, cub::NullType, int>::Policy520, bool=0*, bool=0, bool=0*, bool=0*, int, int, cub::GridEvenShare) - 96 local_load 0 0 0 - 96 local_store 0 0 0 - 96 gld_request 852506 1059610 990246 - 96 gst_request 852122 1059226 989862 - 96 shared_load 8048354 10004204 9349137 - 96 shared_store 6580591 8179786 7644172 - 96 branch 96986 119996 112289 - 96 divergent_branch 0 0 0 - 96 active_cycles 40461127 54022787 48281294 - 96 inst_issued1_0 12272196 16112070 14616240 - 96 inst_issued2_0 8711986 10828906 10119892 - 96 inst_issued1_1 11199982 14838464 13376533 - 96 inst_issued2_1 8711646 10828573 10119556 - 96 inst_executed 55071390 68451855 63970561 - 96 warps_launched 384 384 384 - 96 threads_launched 12288 12288 12288 - 96 thread_inst_executed_0 607715136 755349696 705905856 - 96 thread_inst_executed_2 278781504 346522944 323834517 - 96 thread_inst_executed_1 597018624 742076064 693495370 - 96 thread_inst_executed_3 278769216 346510656 323822229 - 96 active_warps 627461942 844915500 752200104 - 96 tex0_cache_sector_queries 0 0 0 - 96 tex0_cache_sector_misses 0 0 0 - 96 tex1_cache_sector_queries 0 0 0 - 96 tex1_cache_sector_misses 0 0 0 - 96 sm_cta_launched 192 192 192 - 96 l1_local_load_hit 0 0 0 - 96 l1_local_load_miss 0 0 0 - 96 l1_local_store_hit 0 0 0 - 96 l1_local_store_miss 0 0 0 - 96 l1_global_load_hit 3584 4520 3871 - 96 l1_global_load_miss 1712064 2127308 1988205 - 96 uncached_global_load_transaction 0 0 0 - 96 global_store_transaction 3444888 5532976 4294255 - 96 l1_shared_bank_conflict 2882372 5082208 4280918 - 96 prof_trigger_00 0 0 0 - 96 prof_trigger_01 0 0 0 - 96 prof_trigger_02 0 0 0 - 96 prof_trigger_03 0 0 0 - 96 prof_trigger_04 0 0 0 - 96 prof_trigger_05 0 0 0 - 96 prof_trigger_06 0 0 0 - 96 prof_trigger_07 0 0 0 - 96 fb_subp0_read_sectors 3452669 4301192 4016587 - 96 fb_subp1_read_sectors 3453278 4301540 4016568 - 96 fb_subp0_write_sectors 3507598 4871922 4188914 - 96 fb_subp1_write_sectors 3506674 4871257 4188844 - 96 l2_subp0_write_sector_misses 3507598 4871922 4188914 - 96 l2_subp1_write_sector_misses 3506674 4871257 4188844 - 96 l2_subp0_read_sector_misses 3453349 4301392 4016470 - 96 l2_subp1_read_sector_misses 3453634 4302119 4016399 - 96 l2_subp0_write_sector_queries 4111587 5602911 4885525 - 96 l2_subp1_write_sector_queries 4111598 5601914 4885413 - 96 l2_subp0_read_sector_queries 3424224 4254808 3976196 - 96 l2_subp1_read_sector_queries 3424440 4255044 3976354 - 96 l2_subp0_read_tex_sector_queries 0 0 0 - 96 l2_subp1_read_tex_sector_queries 0 0 0 - 96 l2_subp0_read_hit_sectors 9416 15584 12493 - 96 l2_subp1_read_hit_sectors 372 736 488 - 96 l2_subp0_read_tex_hit_sectors 0 0 0 - 96 l2_subp1_read_tex_hit_sectors 0 0 0 - 96 elapsed_cycles_sm 40709424 54102160 48488763 - 96 l2_subp0_read_sysmem_sector_queries 0 0 0 - 96 l2_subp1_read_sysmem_sector_queries 0 0 0 - 96 l2_subp0_write_sysmem_sector_queries 0 1 0 - 96 l2_subp1_write_sysmem_sector_queries 0 0 0 - 96 l2_subp0_total_read_sector_queries 3424296 4254873 3976676 - 96 l2_subp1_total_read_sector_queries 3425081 4258162 3976862 - 96 l2_subp0_total_write_sector_queries 4111589 5602913 4885526 - 96 l2_subp1_total_write_sector_queries 4111599 5601915 4885415 - 96 atom_count 0 0 0 - 96 gred_count 0 0 0 - 96 gld_inst_8bit 0 0 0 - 96 gld_inst_16bit 0 0 0 - 96 gld_inst_32bit 12288 12288 12288 - 96 gld_inst_64bit 27267887 33895203 31675564 - 96 gld_inst_128bit 0 0 0 - 96 gst_inst_8bit 0 0 0 - 96 gst_inst_16bit 0 0 0 - 96 gst_inst_32bit 0 0 0 - 96 gst_inst_64bit 27267887 33895203 31675564 - 96 gst_inst_128bit 0 0 0 - Kernel: void cub::DeviceRadixSortDownsweepKernel::Policy520, bool=1, bool=0, __int64, cub::NullType, int>(cub::DeviceRadixSortPolicy<__int64, cub::NullType, int>::Policy520*, cub::DeviceRadixSortPolicy<__int64, cub::NullType, int>::Policy520, bool=1*, bool=1, bool=0*, bool=1*, int, int, cub::GridEvenShare) - 18 local_load 0 0 0 - 18 local_store 0 0 0 - 18 gld_request 852506 1059610 990246 - 18 gst_request 852122 1059226 989862 - 18 shared_load 6533442 8121132 7589372 - 18 shared_store 5065679 6296714 5884407 - 18 branch 96986 119996 112289 - 18 divergent_branch 0 0 0 - 18 active_cycles 36568156 47616171 43683258 - 18 inst_issued1_0 10802346 13883254 12833670 - 18 inst_issued2_0 8191236 10181601 9514974 - 18 inst_issued1_1 10109983 13013729 12042171 - 18 inst_issued2_1 8190898 10181266 9514638 - 18 inst_executed 50526654 62802639 58691265 - 18 warps_launched 384 384 384 - 18 threads_launched 12288 12288 12288 - 18 thread_inst_executed_0 546361184 679085264 634635344 - 18 thread_inst_executed_2 262117504 325809184 304477130 - 18 thread_inst_executed_1 540966848 672402368 628384021 - 18 thread_inst_executed_3 262105216 325796896 304464842 - 18 active_warps 566483286 742419694 680633242 - 18 tex0_cache_sector_queries 0 0 0 - 18 tex0_cache_sector_misses 0 0 0 - 18 tex1_cache_sector_queries 0 0 0 - 18 tex1_cache_sector_misses 0 0 0 - 18 sm_cta_launched 192 192 192 - 18 l1_local_load_hit 0 0 0 - 18 l1_local_load_miss 0 0 0 - 18 l1_local_store_hit 0 0 0 - 18 l1_local_store_miss 0 0 0 - 18 l1_global_load_hit 1792 2120 1944 - 18 l1_global_load_miss 1708344 2122700 1983933 - 18 uncached_global_load_transaction 0 0 0 - 18 global_store_transaction 3425408 4878228 4296088 - 18 l1_shared_bank_conflict 3056496 4736816 4180173 - 18 prof_trigger_00 0 0 0 - 18 prof_trigger_01 0 0 0 - 18 prof_trigger_02 0 0 0 - 18 prof_trigger_03 0 0 0 - 18 prof_trigger_04 0 0 0 - 18 prof_trigger_05 0 0 0 - 18 prof_trigger_06 0 0 0 - 18 prof_trigger_07 0 0 0 - 18 fb_subp0_read_sectors 3446549 4292191 4007655 - 18 fb_subp1_read_sectors 3446694 4292780 4008045 - 18 fb_subp0_write_sectors 3489582 4593339 4184005 - 18 fb_subp1_write_sectors 3489587 4593644 4184056 - 18 l2_subp0_write_sector_misses 3489582 4593339 4184005 - 18 l2_subp1_write_sector_misses 3489587 4593644 4184056 - 18 l2_subp0_read_sector_misses 3446681 4292637 4008444 - 18 l2_subp1_read_sector_misses 3449887 4292269 4009093 - 18 l2_subp0_write_sector_queries 4100005 5341691 4886940 - 18 l2_subp1_write_sector_queries 4100230 5342430 4887215 - 18 l2_subp0_read_sector_queries 3416232 4245156 3967740 - 18 l2_subp1_read_sector_queries 3416780 4245772 3967926 - 18 l2_subp0_read_tex_sector_queries 0 0 0 - 18 l2_subp1_read_tex_sector_queries 0 0 0 - 18 l2_subp0_read_hit_sectors 4092 7856 6621 - 18 l2_subp1_read_hit_sectors 192 396 309 - 18 l2_subp0_read_tex_hit_sectors 0 0 0 - 18 l2_subp1_read_tex_hit_sectors 0 0 0 - 18 elapsed_cycles_sm 36856304 47615336 43880457 - 18 l2_subp0_read_sysmem_sector_queries 0 0 0 - 18 l2_subp1_read_sysmem_sector_queries 0 0 0 - 18 l2_subp0_write_sysmem_sector_queries 0 0 0 - 18 l2_subp1_write_sysmem_sector_queries 0 2 0 - 18 l2_subp0_total_read_sector_queries 3416560 4245713 3968116 - 18 l2_subp1_total_read_sector_queries 3416864 4246088 3968184 - 18 l2_subp0_total_write_sector_queries 4100005 5341691 4886940 - 18 l2_subp1_total_write_sector_queries 4100230 5342430 4887217 - 18 atom_count 0 0 0 - 18 gred_count 0 0 0 - 18 gld_inst_8bit 0 0 0 - 18 gld_inst_16bit 0 0 0 - 18 gld_inst_32bit 6144 6144 6144 - 18 gld_inst_64bit 27267887 33895203 31675564 - 18 gld_inst_128bit 0 0 0 - 18 gst_inst_8bit 0 0 0 - 18 gst_inst_16bit 0 0 0 - 18 gst_inst_32bit 0 0 0 - 18 gst_inst_64bit 27267887 33895203 31675564 - 18 gst_inst_128bit 0 0 0 - Kernel: void mtf_thread(unsigned char const *, unsigned char*, int, int) - 3 local_load 0 0 0 - 3 local_store 0 0 0 - 3 gld_request 3163348 3856860 3620514 - 3 gst_request 3163902 3857501 3621133 - 3 shared_load 4197584 5174566 4827494 - 3 shared_store 7366778 9038619 8454759 - 3 branch 11614078 14302531 13351295 - 3 divergent_branch 3138309 3811972 3585785 - 3 active_cycles 260784591 325147956 302643598 - 3 inst_issued1_0 31528214 34993596 33825597 - 3 inst_issued2_0 23514935 25446166 24770704 - 3 inst_issued1_1 23148367 32548195 29217562 - 3 inst_issued2_1 16539925 23812141 21207613 - 3 inst_executed 119068686 146266743 136727219 - 3 warps_launched 84 104 97 - 3 threads_launched 2688 3328 3114 - 3 thread_inst_executed_0 636669844 695552844 673040601 - 3 thread_inst_executed_2 281783435 314380878 300599479 - 3 thread_inst_executed_1 473164508 652199536 588702613 - 3 thread_inst_executed_3 210869777 289738788 263349080 - 3 active_warps 1286426404 1584254747 1480546217 - 3 tex0_cache_sector_queries 0 0 0 - 3 tex0_cache_sector_misses 0 0 0 - 3 tex1_cache_sector_queries 0 0 0 - 3 tex1_cache_sector_misses 0 0 0 - 3 sm_cta_launched 88 100 96 - 3 l1_local_load_hit 0 0 0 - 3 l1_local_load_miss 0 0 0 - 3 l1_local_store_hit 0 0 0 - 3 l1_local_store_miss 0 0 0 - 3 l1_global_load_hit 5990388 7511972 6841394 - 3 l1_global_load_miss 5399900 6126528 5838669 - 3 uncached_global_load_transaction 0 0 0 - 3 global_store_transaction 11386880 13631488 12675754 - 3 l1_shared_bank_conflict 0 0 0 - 3 prof_trigger_00 0 0 0 - 3 prof_trigger_01 0 0 0 - 3 prof_trigger_02 0 0 0 - 3 prof_trigger_03 0 0 0 - 3 prof_trigger_04 0 0 0 - 3 prof_trigger_05 0 0 0 - 3 prof_trigger_06 0 0 0 - 3 prof_trigger_07 0 0 0 - 3 fb_subp0_read_sectors 570784 692362 650357 - 3 fb_subp1_read_sectors 572328 689965 650451 - 3 fb_subp0_write_sectors 5474711 6790990 6341026 - 3 fb_subp1_write_sectors 5471220 6790500 6339990 - 3 l2_subp0_write_sector_misses 5474711 6790990 6341026 - 3 l2_subp1_write_sector_misses 5471220 6790500 6339990 - 3 l2_subp0_read_sector_misses 570970 694634 651029 - 3 l2_subp1_read_sector_misses 571603 694993 653343 - 3 l2_subp0_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp1_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp0_read_sector_queries 10349040 12507356 11769162 - 3 l2_subp1_read_sector_queries 10303980 12546040 11753152 - 3 l2_subp0_read_tex_sector_queries 0 0 0 - 3 l2_subp1_read_tex_sector_queries 0 0 0 - 3 l2_subp0_read_hit_sectors 10141164 12390808 11568788 - 3 l2_subp1_read_hit_sectors 17124 21152 19562 - 3 l2_subp0_read_tex_hit_sectors 0 0 0 - 3 l2_subp1_read_tex_hit_sectors 0 0 0 - 3 elapsed_cycles_sm 337280676 410606648 385707406 - 3 l2_subp0_read_sysmem_sector_queries 0 0 0 - 3 l2_subp1_read_sysmem_sector_queries 0 0 0 - 3 l2_subp0_write_sysmem_sector_queries 0 0 0 - 3 l2_subp1_write_sysmem_sector_queries 0 0 0 - 3 l2_subp0_total_read_sector_queries 10455096 12580033 11818079 - 3 l2_subp1_total_read_sector_queries 10301716 12547626 11754664 - 3 l2_subp0_total_write_sector_queries 5486595 6803458 6353581 - 3 l2_subp1_total_write_sector_queries 5486606 6803471 6353593 - 3 atom_count 0 0 0 - 3 gred_count 0 0 0 - 3 gld_inst_8bit 10975863 13610234 12710259 - 3 gld_inst_16bit 0 0 0 - 3 gld_inst_32bit 0 0 0 - 3 gld_inst_64bit 0 0 0 - 3 gld_inst_128bit 0 0 0 - 3 gst_inst_8bit 10973184 13606912 12707157 - 3 gst_inst_16bit 0 0 0 - 3 gst_inst_32bit 0 0 0 - 3 gst_inst_64bit 0 0 0 - 3 gst_inst_128bit 0 0 0 - Kernel: void cub::RadixSortScanBinsKernel::Policy520, int>(cub::NullType*, int) - 114 local_load 0 0 0 - 114 local_store 0 0 0 - 114 gld_request 64 64 64 - 114 gst_request 64 64 64 - 114 shared_load 156 156 156 - 114 shared_store 160 160 160 - 114 branch 96 96 96 - 114 divergent_branch 0 0 0 - 114 active_cycles 8211 8449 8277 - 114 inst_issued1_0 898 903 899 - 114 inst_issued2_0 265 268 267 - 114 inst_issued1_1 708 711 708 - 114 inst_issued2_1 166 168 167 - 114 inst_executed 2152 2152 2152 - 114 warps_launched 16 16 16 - 114 threads_launched 512 512 512 - 114 thread_inst_executed_0 32392 32392 32392 - 114 thread_inst_executed_2 8064 8064 8064 - 114 thread_inst_executed_1 23296 23296 23296 - 114 thread_inst_executed_3 4864 4864 4864 - 114 active_warps 129634 132988 130529 - 114 tex0_cache_sector_queries 0 0 0 - 114 tex0_cache_sector_misses 0 0 0 - 114 tex1_cache_sector_queries 0 0 0 - 114 tex1_cache_sector_misses 0 0 0 - 114 sm_cta_launched 4 4 4 - 114 l1_local_load_hit 0 0 0 - 114 l1_local_load_miss 0 0 0 - 114 l1_local_store_hit 0 0 0 - 114 l1_local_store_miss 0 0 0 - 114 l1_global_load_hit 0 0 0 - 114 l1_global_load_miss 1024 1024 1024 - 114 uncached_global_load_transaction 0 0 0 - 114 global_store_transaction 1024 1024 1024 - 114 l1_shared_bank_conflict 512 512 512 - 114 prof_trigger_00 0 0 0 - 114 prof_trigger_01 0 0 0 - 114 prof_trigger_02 0 0 0 - 114 prof_trigger_03 0 0 0 - 114 prof_trigger_04 0 0 0 - 114 prof_trigger_05 0 0 0 - 114 prof_trigger_06 0 0 0 - 114 prof_trigger_07 0 0 0 - 114 fb_subp0_read_sectors 552 676 643 - 114 fb_subp1_read_sectors 543 679 638 - 114 fb_subp0_write_sectors 512 512 512 - 114 fb_subp1_write_sectors 512 512 512 - 114 l2_subp0_write_sector_misses 512 512 512 - 114 l2_subp1_write_sector_misses 512 512 512 - 114 l2_subp0_read_sector_misses 552 684 650 - 114 l2_subp1_read_sector_misses 543 667 634 - 114 l2_subp0_write_sector_queries 512 512 512 - 114 l2_subp1_write_sector_queries 512 512 512 - 114 l2_subp0_read_sector_queries 512 512 512 - 114 l2_subp1_read_sector_queries 512 512 512 - 114 l2_subp0_read_tex_sector_queries 0 0 0 - 114 l2_subp1_read_tex_sector_queries 0 0 0 - 114 l2_subp0_read_hit_sectors 0 0 0 - 114 l2_subp1_read_hit_sectors 0 0 0 - 114 l2_subp0_read_tex_hit_sectors 0 0 0 - 114 l2_subp1_read_tex_hit_sectors 0 0 0 - 114 elapsed_cycles_sm 83516 108168 85571 - 114 l2_subp0_read_sysmem_sector_queries 0 0 0 - 114 l2_subp1_read_sysmem_sector_queries 0 0 0 - 114 l2_subp0_write_sysmem_sector_queries 0 0 0 - 114 l2_subp1_write_sysmem_sector_queries 0 0 0 - 114 l2_subp0_total_read_sector_queries 544 571 546 - 114 l2_subp1_total_read_sector_queries 544 544 544 - 114 l2_subp0_total_write_sector_queries 512 512 512 - 114 l2_subp1_total_write_sector_queries 512 512 512 - 114 atom_count 0 0 0 - 114 gred_count 0 0 0 - 114 gld_inst_8bit 0 0 0 - 114 gld_inst_16bit 0 0 0 - 114 gld_inst_32bit 0 0 0 - 114 gld_inst_64bit 0 0 0 - 114 gld_inst_128bit 2048 2048 2048 - 114 gst_inst_8bit 0 0 0 - 114 gst_inst_16bit 0 0 0 - 114 gst_inst_32bit 0 0 0 - 114 gst_inst_64bit 0 0 0 - 114 gst_inst_128bit 2048 2048 2048 - Kernel: bsc_st567_encode_cuda_presort(unsigned char*, __int64*, int) - 12 local_load 0 0 0 - 12 local_store 0 0 0 - 12 gld_request 1136168 1412304 1319818 - 12 gst_request 852126 1059228 989864 - 12 shared_load 6817008 8473824 7918912 - 12 shared_store 1136168 1412304 1319818 - 12 branch 1705020 2119224 1980496 - 12 divergent_branch 142021 176538 164977 - 12 active_cycles 25013669 31336958 29175895 - 12 inst_issued1_0 13272323 16536335 15430408 - 12 inst_issued2_0 2131275 2649030 2475620 - 12 inst_issued1_1 12236832 15258540 14229794 - 12 inst_issued2_1 2131275 2649030 2475620 - 12 inst_executed 30113060 37430664 34979802 - 12 warps_launched 384 384 384 - 12 threads_launched 12288 12288 12288 - 12 thread_inst_executed_0 407081338 506007060 472874189 - 12 thread_inst_executed_2 68194656 84762816 79213696 - 12 thread_inst_executed_1 409069632 508478592 475183872 - 12 thread_inst_executed_3 68194656 84762816 79213696 - 12 active_warps 998911065 1245521579 1162278072 - 12 tex0_cache_sector_queries 0 0 0 - 12 tex0_cache_sector_misses 0 0 0 - 12 tex1_cache_sector_queries 0 0 0 - 12 tex1_cache_sector_misses 0 0 0 - 12 sm_cta_launched 64 64 64 - 12 l1_local_load_hit 0 0 0 - 12 l1_local_load_miss 0 0 0 - 12 l1_local_store_hit 0 0 0 - 12 l1_local_store_miss 0 0 0 - 12 l1_global_load_hit 709276 881924 824115 - 12 l1_global_load_miss 426576 530544 495710 - 12 uncached_global_load_transaction 0 0 0 - 12 global_store_transaction 1704288 2118480 1979760 - 12 l1_shared_bank_conflict 0 0 0 - 12 prof_trigger_00 0 0 0 - 12 prof_trigger_01 0 0 0 - 12 prof_trigger_02 0 0 0 - 12 prof_trigger_03 0 0 0 - 12 prof_trigger_04 0 0 0 - 12 prof_trigger_05 0 0 0 - 12 prof_trigger_06 0 0 0 - 12 prof_trigger_07 0 0 0 - 12 fb_subp0_read_sectors 511500 653093 604758 - 12 fb_subp1_read_sectors 528324 657864 610117 - 12 fb_subp0_write_sectors 3408504 4236913 3959456 - 12 fb_subp1_write_sectors 3408504 4236913 3959458 - 12 l2_subp0_write_sector_misses 3408504 4236913 3959456 - 12 l2_subp1_write_sector_misses 3408504 4236913 3959458 - 12 l2_subp0_read_sector_misses 508276 660133 606428 - 12 l2_subp1_read_sector_misses 506118 646528 599696 - 12 l2_subp0_write_sector_queries 3408504 4236912 3959456 - 12 l2_subp1_write_sector_queries 3408504 4236912 3959456 - 12 l2_subp0_read_sector_queries 852824 1060252 990758 - 12 l2_subp1_read_sector_queries 852816 1060180 990725 - 12 l2_subp0_read_tex_sector_queries 0 0 0 - 12 l2_subp1_read_tex_sector_queries 0 0 0 - 12 l2_subp0_read_hit_sectors 352780 449384 411893 - 12 l2_subp1_read_hit_sectors 6064 8668 7561 - 12 l2_subp0_read_tex_hit_sectors 0 0 0 - 12 l2_subp1_read_tex_hit_sectors 0 0 0 - 12 elapsed_cycles_sm 25734508 32879488 30363941 - 12 l2_subp0_read_sysmem_sector_queries 0 0 0 - 12 l2_subp1_read_sysmem_sector_queries 0 0 0 - 12 l2_subp0_write_sysmem_sector_queries 0 0 0 - 12 l2_subp1_write_sysmem_sector_queries 0 0 0 - 12 l2_subp0_total_read_sector_queries 852784 1061099 990875 - 12 l2_subp1_total_read_sector_queries 852909 1060252 990777 - 12 l2_subp0_total_write_sector_queries 3408504 4236912 3959456 - 12 l2_subp1_total_write_sector_queries 3408504 4236923 3959457 - 12 atom_count 0 0 0 - 12 gred_count 0 0 0 - 12 gld_inst_8bit 28404200 35307600 32995466 - 12 gld_inst_16bit 0 0 0 - 12 gld_inst_32bit 0 0 0 - 12 gld_inst_64bit 0 0 0 - 12 gld_inst_128bit 0 0 0 - 12 gst_inst_8bit 0 0 0 - 12 gst_inst_16bit 0 0 0 - 12 gst_inst_32bit 0 0 0 - 12 gst_inst_64bit 27268032 33895296 31675648 - 12 gst_inst_128bit 0 0 0 - Kernel: void mtf_thread_by4(unsigned char const *, unsigned char*, int, int) - 3 local_load 0 0 0 - 3 local_store 0 0 0 - 3 gld_request 3286592 4017478 3767660 - 3 gst_request 3287182 4018200 3768346 - 3 shared_load 3848182 4706132 4403629 - 3 shared_store 3848854 4706964 4404408 - 3 branch 12340258 15086883 14134725 - 3 divergent_branch 3262880 3983490 3739625 - 3 active_cycles 207661858 226483708 219268644 - 3 inst_issued1_0 21839106 25715857 24400298 - 3 inst_issued2_0 15716275 18712590 17622192 - 3 inst_issued1_1 20457082 26286186 24192810 - 3 inst_issued2_1 14537950 18541960 17037524 - 3 inst_executed 86740102 106038282 99358379 - 3 warps_launched 84 104 97 - 3 threads_launched 2688 3328 3114 - 3 thread_inst_executed_0 320627258 382392392 360984097 - 3 thread_inst_executed_2 141375656 155889967 150836983 - 3 thread_inst_executed_1 276598928 388890864 347114228 - 3 thread_inst_executed_3 113903946 158167980 142758748 - 3 active_warps 1220644199 1481321599 1391027412 - 3 tex0_cache_sector_queries 0 0 0 - 3 tex0_cache_sector_misses 0 0 0 - 3 tex1_cache_sector_queries 0 0 0 - 3 tex1_cache_sector_misses 0 0 0 - 3 sm_cta_launched 88 112 104 - 3 l1_local_load_hit 0 0 0 - 3 l1_local_load_miss 0 0 0 - 3 l1_local_store_hit 0 0 0 - 3 l1_local_store_miss 0 0 0 - 3 l1_global_load_hit 3440396 4398616 3998397 - 3 l1_global_load_miss 8161248 10529896 9641389 - 3 uncached_global_load_transaction 0 0 0 - 3 global_store_transaction 11534336 15106048 13773482 - 3 l1_shared_bank_conflict 0 0 0 - 3 prof_trigger_00 0 0 0 - 3 prof_trigger_01 0 0 0 - 3 prof_trigger_02 0 0 0 - 3 prof_trigger_03 0 0 0 - 3 prof_trigger_04 0 0 0 - 3 prof_trigger_05 0 0 0 - 3 prof_trigger_06 0 0 0 - 3 prof_trigger_07 0 0 0 - 3 fb_subp0_read_sectors 1136101 1243690 1201366 - 3 fb_subp1_read_sectors 1147073 1282862 1216269 - 3 fb_subp0_write_sectors 5428437 6742226 6292989 - 3 fb_subp1_write_sectors 5426379 6737187 6290397 - 3 l2_subp0_write_sector_misses 5428437 6742226 6292989 - 3 l2_subp1_write_sector_misses 5426379 6737187 6290397 - 3 l2_subp0_read_sector_misses 1139141 1250977 1205712 - 3 l2_subp1_read_sector_misses 1140435 1280157 1215305 - 3 l2_subp0_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp1_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp0_read_sector_queries 15587336 19522824 18036365 - 3 l2_subp1_read_sector_queries 15659560 19331336 17987694 - 3 l2_subp0_read_tex_sector_queries 0 0 0 - 3 l2_subp1_read_tex_sector_queries 0 0 0 - 3 l2_subp0_read_hit_sectors 14786940 18540676 17108110 - 3 l2_subp1_read_hit_sectors 112228 118960 114618 - 3 l2_subp0_read_tex_hit_sectors 0 0 0 - 3 l2_subp1_read_tex_hit_sectors 0 0 0 - 3 elapsed_cycles_sm 231590004 255616376 245837466 - 3 l2_subp0_read_sysmem_sector_queries 0 0 0 - 3 l2_subp1_read_sysmem_sector_queries 0 0 0 - 3 l2_subp0_write_sysmem_sector_queries 0 0 0 - 3 l2_subp1_write_sysmem_sector_queries 0 0 0 - 3 l2_subp0_total_read_sector_queries 15836850 19117327 18005558 - 3 l2_subp1_total_read_sector_queries 15587090 19626149 18054997 - 3 l2_subp0_total_write_sector_queries 5486594 6803458 6353580 - 3 l2_subp1_total_write_sector_queries 5486605 6803469 6353592 - 3 atom_count 0 0 0 - 3 gred_count 0 0 0 - 3 gld_inst_8bit 10975863 13610234 12710259 - 3 gld_inst_16bit 0 0 0 - 3 gld_inst_32bit 0 0 0 - 3 gld_inst_64bit 0 0 0 - 3 gld_inst_128bit 0 0 0 - 3 gst_inst_8bit 10973184 13606912 12707157 - 3 gst_inst_16bit 0 0 0 - 3 gst_inst_32bit 0 0 0 - 3 gst_inst_64bit 0 0 0 - 3 gst_inst_128bit 0 0 0 - Kernel: void mtf_thread_by4(unsigned char const *, unsigned char*, int, int) - 3 local_load 0 0 0 - 3 local_store 0 0 0 - 3 gld_request 3450675 4216056 3955785 - 3 gst_request 3451275 4216793 3956484 - 3 shared_load 4703204 5795821 5412194 - 3 shared_store 4708580 5802477 5418423 - 3 branch 12791261 15694821 14691113 - 3 divergent_branch 3422340 4169731 3918718 - 3 active_cycles 197415881 249127084 230804865 - 3 inst_issued1_0 27908150 30319530 29511555 - 3 inst_issued2_0 16136082 17110768 16746366 - 3 inst_issued1_1 21094818 30130919 26984452 - 3 inst_issued2_1 11707211 17140255 15219628 - 3 inst_executed 89455397 109726736 102728411 - 3 warps_launched 84 104 97 - 3 threads_launched 2688 3328 3114 - 3 thread_inst_executed_0 387884866 411443772 403245608 - 3 thread_inst_executed_2 147847898 161171486 155636675 - 3 thread_inst_executed_1 288575772 414805145 372058145 - 3 thread_inst_executed_3 111199309 158879226 142705959 - 3 active_warps 997600405 1226094216 1148035071 - 3 tex0_cache_sector_queries 0 0 0 - 3 tex0_cache_sector_misses 0 0 0 - 3 tex1_cache_sector_queries 0 0 0 - 3 tex1_cache_sector_misses 0 0 0 - 3 sm_cta_launched 88 104 96 - 3 l1_local_load_hit 0 0 0 - 3 l1_local_load_miss 0 0 0 - 3 l1_local_store_hit 0 0 0 - 3 l1_local_store_miss 0 0 0 - 3 l1_global_load_hit 4794720 5266108 5004606 - 3 l1_global_load_miss 6592220 7846488 7358925 - 3 uncached_global_load_transaction 0 0 0 - 3 global_store_transaction 11386880 13107200 12358997 - 3 l1_shared_bank_conflict 0 0 0 - 3 prof_trigger_00 0 0 0 - 3 prof_trigger_01 0 0 0 - 3 prof_trigger_02 0 0 0 - 3 prof_trigger_03 0 0 0 - 3 prof_trigger_04 0 0 0 - 3 prof_trigger_05 0 0 0 - 3 prof_trigger_06 0 0 0 - 3 prof_trigger_07 0 0 0 - 3 fb_subp0_read_sectors 491913 608094 568423 - 3 fb_subp1_read_sectors 492473 615976 572680 - 3 fb_subp0_write_sectors 5433032 6721574 6278165 - 3 fb_subp1_write_sectors 5428085 6720123 6276140 - 3 l2_subp0_write_sector_misses 5433032 6721574 6278165 - 3 l2_subp1_write_sector_misses 5428085 6720123 6276140 - 3 l2_subp0_read_sector_misses 492789 611070 568904 - 3 l2_subp1_read_sector_misses 492745 614912 570714 - 3 l2_subp0_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp1_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp0_read_sector_queries 12700444 15655312 14571432 - 3 l2_subp1_read_sector_queries 12657900 15507348 14501629 - 3 l2_subp0_read_tex_sector_queries 0 0 0 - 3 l2_subp1_read_tex_sector_queries 0 0 0 - 3 l2_subp0_read_hit_sectors 12469592 15371248 14309552 - 3 l2_subp1_read_hit_sectors 25308 30004 28285 - 3 l2_subp0_read_tex_hit_sectors 0 0 0 - 3 l2_subp1_read_tex_hit_sectors 0 0 0 - 3 elapsed_cycles_sm 237496448 307397488 283228661 - 3 l2_subp0_read_sysmem_sector_queries 0 0 0 - 3 l2_subp1_read_sysmem_sector_queries 0 0 0 - 3 l2_subp0_write_sysmem_sector_queries 0 0 0 - 3 l2_subp1_write_sysmem_sector_queries 0 0 0 - 3 l2_subp0_total_read_sector_queries 12645163 15608191 14619371 - 3 l2_subp1_total_read_sector_queries 12651330 15675434 14538181 - 3 l2_subp0_total_write_sector_queries 5486594 6803460 6353581 - 3 l2_subp1_total_write_sector_queries 5486605 6803458 6353588 - 3 atom_count 0 0 0 - 3 gred_count 0 0 0 - 3 gld_inst_8bit 10975863 13610234 12710259 - 3 gld_inst_16bit 0 0 0 - 3 gld_inst_32bit 0 0 0 - 3 gld_inst_64bit 0 0 0 - 3 gld_inst_128bit 0 0 0 - 3 gst_inst_8bit 10973184 13606912 12707157 - 3 gst_inst_16bit 0 0 0 - 3 gst_inst_32bit 0 0 0 - 3 gst_inst_64bit 0 0 0 - 3 gst_inst_128bit 0 0 0 - Kernel: void cub::RadixSortScanBinsKernel::Policy520, int>(unsigned char*, int) - 39 local_load 0 0 0 - 39 local_store 0 0 0 - 39 gld_request 32 32 32 - 39 gst_request 32 32 32 - 39 shared_load 78 78 78 - 39 shared_store 80 80 80 - 39 branch 64 64 64 - 39 divergent_branch 0 0 0 - 39 active_cycles 4555 4687 4596 - 39 inst_issued1_0 486 489 486 - 39 inst_issued2_0 154 154 154 - 39 inst_issued1_1 390 392 390 - 39 inst_issued2_1 104 104 104 - 39 inst_executed 1188 1188 1188 - 39 warps_launched 16 16 16 - 39 threads_launched 512 512 512 - 39 thread_inst_executed_0 17604 17604 17604 - 39 thread_inst_executed_2 4416 4416 4416 - 39 thread_inst_executed_1 13056 13056 13056 - 39 thread_inst_executed_3 2816 2816 2816 - 39 active_warps 71116 73038 71771 - 39 tex0_cache_sector_queries 0 0 0 - 39 tex0_cache_sector_misses 0 0 0 - 39 tex1_cache_sector_queries 0 0 0 - 39 tex1_cache_sector_misses 0 0 0 - 39 sm_cta_launched 4 4 4 - 39 l1_local_load_hit 0 0 0 - 39 l1_local_load_miss 0 0 0 - 39 l1_local_store_hit 0 0 0 - 39 l1_local_store_miss 0 0 0 - 39 l1_global_load_hit 0 0 0 - 39 l1_global_load_miss 512 512 512 - 39 uncached_global_load_transaction 0 0 0 - 39 global_store_transaction 512 512 512 - 39 l1_shared_bank_conflict 256 256 256 - 39 prof_trigger_00 0 0 0 - 39 prof_trigger_01 0 0 0 - 39 prof_trigger_02 0 0 0 - 39 prof_trigger_03 0 0 0 - 39 prof_trigger_04 0 0 0 - 39 prof_trigger_05 0 0 0 - 39 prof_trigger_06 0 0 0 - 39 prof_trigger_07 0 0 0 - 39 fb_subp0_read_sectors 288 404 349 - 39 fb_subp1_read_sectors 295 407 355 - 39 fb_subp0_write_sectors 256 256 256 - 39 fb_subp1_write_sectors 256 256 256 - 39 l2_subp0_write_sector_misses 256 256 256 - 39 l2_subp1_write_sector_misses 256 256 256 - 39 l2_subp0_read_sector_misses 336 360 347 - 39 l2_subp1_read_sector_misses 295 407 353 - 39 l2_subp0_write_sector_queries 256 256 256 - 39 l2_subp1_write_sector_queries 256 256 256 - 39 l2_subp0_read_sector_queries 256 256 256 - 39 l2_subp1_read_sector_queries 256 256 256 - 39 l2_subp0_read_tex_sector_queries 0 0 0 - 39 l2_subp1_read_tex_sector_queries 0 0 0 - 39 l2_subp0_read_hit_sectors 0 0 0 - 39 l2_subp1_read_hit_sectors 0 0 0 - 39 l2_subp0_read_tex_hit_sectors 0 0 0 - 39 l2_subp1_read_tex_hit_sectors 0 0 0 - 39 elapsed_cycles_sm 54332 100688 57117 - 39 l2_subp0_read_sysmem_sector_queries 0 0 0 - 39 l2_subp1_read_sysmem_sector_queries 0 0 0 - 39 l2_subp0_write_sysmem_sector_queries 0 0 0 - 39 l2_subp1_write_sysmem_sector_queries 0 0 0 - 39 l2_subp0_total_read_sector_queries 288 296 290 - 39 l2_subp1_total_read_sector_queries 296 296 296 - 39 l2_subp0_total_write_sector_queries 256 256 256 - 39 l2_subp1_total_write_sector_queries 256 256 256 - 39 atom_count 0 0 0 - 39 gred_count 0 0 0 - 39 gld_inst_8bit 0 0 0 - 39 gld_inst_16bit 0 0 0 - 39 gld_inst_32bit 0 0 0 - 39 gld_inst_64bit 0 0 0 - 39 gld_inst_128bit 1024 1024 1024 - 39 gst_inst_8bit 0 0 0 - 39 gst_inst_16bit 0 0 0 - 39 gst_inst_32bit 0 0 0 - 39 gst_inst_64bit 0 0 0 - 39 gst_inst_128bit 1024 1024 1024 - Kernel: void mtf_thread(unsigned char const *, unsigned char*, int, int) - 3 local_load 0 0 0 - 3 local_store 0 0 0 - 3 gld_request 2745253 3357981 3148117 - 3 gst_request 2745746 3358557 3148675 - 3 shared_load 3374497 4120668 3853614 - 3 shared_store 6120831 7479953 7002971 - 3 branch 9541512 11671495 10912963 - 3 divergent_branch 2724328 3325235 3122441 - 3 active_cycles 214678639 236722208 227988262 - 3 inst_issued1_0 39373416 45345511 42791443 - 3 inst_issued2_0 13232178 15063799 14250387 - 3 inst_issued1_1 32732739 43745001 39820845 - 3 inst_issued2_1 10665075 14501120 13108918 - 3 inst_executed 102785237 125665260 117580331 - 3 warps_launched 84 104 97 - 3 threads_launched 2688 3328 3114 - 3 thread_inst_executed_0 675851874 773376012 728311007 - 3 thread_inst_executed_2 148574955 169482975 160831529 - 3 thread_inst_executed_1 552012736 762874620 686498370 - 3 thread_inst_executed_3 123496308 168187591 151929593 - 3 active_warps 1248733324 1520250398 1425434561 - 3 tex0_cache_sector_queries 0 0 0 - 3 tex0_cache_sector_misses 0 0 0 - 3 tex1_cache_sector_queries 0 0 0 - 3 tex1_cache_sector_misses 0 0 0 - 3 sm_cta_launched 84 108 100 - 3 l1_local_load_hit 0 0 0 - 3 l1_local_load_miss 0 0 0 - 3 l1_local_store_hit 0 0 0 - 3 l1_local_store_miss 0 0 0 - 3 l1_global_load_hit 3535072 5471540 4766501 - 3 l1_global_load_miss 7476064 8589400 8190693 - 3 uncached_global_load_transaction 0 0 0 - 3 global_store_transaction 11010048 14155776 13074432 - 3 l1_shared_bank_conflict 0 0 0 - 3 prof_trigger_00 0 0 0 - 3 prof_trigger_01 0 0 0 - 3 prof_trigger_02 0 0 0 - 3 prof_trigger_03 0 0 0 - 3 prof_trigger_04 0 0 0 - 3 prof_trigger_05 0 0 0 - 3 prof_trigger_06 0 0 0 - 3 prof_trigger_07 0 0 0 - 3 fb_subp0_read_sectors 940757 1069674 1018569 - 3 fb_subp1_read_sectors 955747 1097538 1031331 - 3 fb_subp0_write_sectors 5396082 6707546 6261544 - 3 fb_subp1_write_sectors 5392492 6704665 6258660 - 3 l2_subp0_write_sector_misses 5396082 6707546 6261544 - 3 l2_subp1_write_sector_misses 5392492 6704665 6258660 - 3 l2_subp0_read_sector_misses 939621 1069470 1020519 - 3 l2_subp1_read_sector_misses 958997 1089922 1031326 - 3 l2_subp0_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp1_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp0_read_sector_queries 14279280 17169604 16152626 - 3 l2_subp1_read_sector_queries 14289104 17169472 16158028 - 3 l2_subp0_read_tex_sector_queries 0 0 0 - 3 l2_subp1_read_tex_sector_queries 0 0 0 - 3 l2_subp0_read_hit_sectors 13619296 16414452 15430398 - 3 l2_subp1_read_hit_sectors 88900 92108 89985 - 3 l2_subp0_read_tex_hit_sectors 0 0 0 - 3 l2_subp1_read_tex_hit_sectors 0 0 0 - 3 elapsed_cycles_sm 256882872 287977256 273661401 - 3 l2_subp0_read_sysmem_sector_queries 0 0 0 - 3 l2_subp1_read_sysmem_sector_queries 0 0 0 - 3 l2_subp0_write_sysmem_sector_queries 0 0 0 - 3 l2_subp1_write_sysmem_sector_queries 0 0 0 - 3 l2_subp0_total_read_sector_queries 14163882 17034357 16068822 - 3 l2_subp1_total_read_sector_queries 14284982 17168666 16155365 - 3 l2_subp0_total_write_sector_queries 5486595 6803459 6353580 - 3 l2_subp1_total_write_sector_queries 5486605 6803470 6353592 - 3 atom_count 0 0 0 - 3 gred_count 0 0 0 - 3 gld_inst_8bit 10975863 13610234 12710259 - 3 gld_inst_16bit 0 0 0 - 3 gld_inst_32bit 0 0 0 - 3 gld_inst_64bit 0 0 0 - 3 gld_inst_128bit 0 0 0 - 3 gst_inst_8bit 10973184 13606912 12707157 - 3 gst_inst_16bit 0 0 0 - 3 gst_inst_32bit 0 0 0 - 3 gst_inst_64bit 0 0 0 - 3 gst_inst_128bit 0 0 0 - Kernel: void mtf_4by8(unsigned char const *, unsigned char*, int, int) - 3 local_load 0 0 0 - 3 local_store 0 0 0 - 3 gld_request 3545808 4339760 4065850 - 3 gst_request 3545940 4339906 4065989 - 3 shared_load 7418051 9067252 8488979 - 3 shared_store 10836259 13253760 12411228 - 3 branch 11666648 14280608 13356758 - 3 divergent_branch 3359406 4099032 3845584 - 3 active_cycles 132877813 166245843 154502609 - 3 inst_issued1_0 56457928 68940766 64601085 - 3 inst_issued2_0 26702646 32738676 30574471 - 3 inst_issued1_1 56176527 68609585 64329082 - 3 inst_issued2_1 26693072 32594638 30555695 - 3 inst_executed 174408302 213420539 199706357 - 3 warps_launched 336 416 389 - 3 threads_launched 10752 13312 12458 - 3 thread_inst_executed_0 1132799570 1382333234 1299128608 - 3 thread_inst_executed_2 518372504 631940448 593976477 - 3 thread_inst_executed_1 1114929714 1373726593 1283891616 - 3 thread_inst_executed_3 509684768 627571248 586587722 - 3 active_warps 2509985316 3036696794 2854128244 - 3 tex0_cache_sector_queries 0 0 0 - 3 tex0_cache_sector_misses 0 0 0 - 3 tex1_cache_sector_queries 0 0 0 - 3 tex1_cache_sector_misses 0 0 0 - 3 sm_cta_launched 88 104 97 - 3 l1_local_load_hit 0 0 0 - 3 l1_local_load_miss 0 0 0 - 3 l1_local_store_hit 0 0 0 - 3 l1_local_store_miss 0 0 0 - 3 l1_global_load_hit 4688644 6107748 5354933 - 3 l1_global_load_miss 7030932 8230608 7706634 - 3 uncached_global_load_transaction 0 0 0 - 3 global_store_transaction 11534336 13533184 12724906 - 3 l1_shared_bank_conflict 28231128 34872176 31985245 - 3 prof_trigger_00 0 0 0 - 3 prof_trigger_01 0 0 0 - 3 prof_trigger_02 0 0 0 - 3 prof_trigger_03 0 0 0 - 3 prof_trigger_04 0 0 0 - 3 prof_trigger_05 0 0 0 - 3 prof_trigger_06 0 0 0 - 3 prof_trigger_07 0 0 0 - 3 fb_subp0_read_sectors 451581 578342 527789 - 3 fb_subp1_read_sectors 515097 579718 550701 - 3 fb_subp0_write_sectors 4992927 6184815 5774593 - 3 fb_subp1_write_sectors 4968444 6171878 5763212 - 3 l2_subp0_write_sector_misses 4992927 6184815 5774593 - 3 l2_subp1_write_sector_misses 4968444 6171878 5763212 - 3 l2_subp0_read_sector_misses 453340 579681 529563 - 3 l2_subp1_read_sector_misses 507145 572640 546036 - 3 l2_subp0_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp1_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp0_read_sector_queries 13587964 16706292 15499404 - 3 l2_subp1_read_sector_queries 13726848 16591348 15594976 - 3 l2_subp0_read_tex_sector_queries 0 0 0 - 3 l2_subp1_read_tex_sector_queries 0 0 0 - 3 l2_subp0_read_hit_sectors 13434304 16066068 15069873 - 3 l2_subp1_read_hit_sectors 50604 64696 58389 - 3 l2_subp0_read_tex_hit_sectors 0 0 0 - 3 l2_subp1_read_tex_hit_sectors 0 0 0 - 3 elapsed_cycles_sm 158874200 206002332 189773162 - 3 l2_subp0_read_sysmem_sector_queries 0 0 0 - 3 l2_subp1_read_sysmem_sector_queries 0 0 0 - 3 l2_subp0_write_sysmem_sector_queries 0 0 0 - 3 l2_subp1_write_sysmem_sector_queries 0 0 0 - 3 l2_subp0_total_read_sector_queries 13922210 16820166 15585946 - 3 l2_subp1_total_read_sector_queries 13451067 16601658 15377502 - 3 l2_subp0_total_write_sector_queries 5486595 6803456 6353580 - 3 l2_subp1_total_write_sector_queries 5486593 6803458 6353583 - 3 atom_count 0 0 0 - 3 gred_count 0 0 0 - 3 gld_inst_8bit 43903452 54440936 50841038 - 3 gld_inst_16bit 0 0 0 - 3 gld_inst_32bit 0 0 0 - 3 gld_inst_64bit 0 0 0 - 3 gld_inst_128bit 0 0 0 - 3 gst_inst_8bit 43892736 54427648 50828629 - 3 gst_inst_16bit 0 0 0 - 3 gst_inst_32bit 0 0 0 - 3 gst_inst_64bit 0 0 0 - 3 gst_inst_128bit 0 0 0 - Kernel: bsc_st8_encode_cuda_postsort(__int64*, int, __int64, int*) - 3 local_load 0 0 0 - 3 local_store 0 0 0 - 3 gld_request 852126 1059228 989864 - 3 gst_request 0 0 0 - 3 shared_load 0 0 0 - 3 shared_store 0 0 0 - 3 branch 852894 1059996 990632 - 3 divergent_branch 0 0 0 - 3 active_cycles 14374903 17825362 16659297 - 3 inst_issued1_0 2868829 3558818 3323890 - 3 inst_issued2_0 1279725 1590378 1486332 - 3 inst_issued1_1 2865303 3558735 3324577 - 3 inst_issued2_1 1279725 1590378 1486332 - 3 inst_executed 9381834 11659956 10896952 - 3 warps_launched 384 384 384 - 3 threads_launched 12288 12288 12288 - 3 thread_inst_executed_0 109170432 135679488 126800896 - 3 thread_inst_executed_2 40938912 50879808 47550336 - 3 thread_inst_executed_1 109170432 135679488 126800896 - 3 thread_inst_executed_3 40938912 50879808 47550336 - 3 active_warps 660313552 820532353 766858911 - 3 tex0_cache_sector_queries 0 0 0 - 3 tex0_cache_sector_misses 0 0 0 - 3 tex1_cache_sector_queries 0 0 0 - 3 tex1_cache_sector_misses 0 0 0 - 3 sm_cta_launched 64 64 64 - 3 l1_local_load_hit 0 0 0 - 3 l1_local_load_miss 0 0 0 - 3 l1_local_store_hit 0 0 0 - 3 l1_local_store_miss 0 0 0 - 3 l1_global_load_hit 0 0 0 - 3 l1_global_load_miss 1704288 2118480 1979760 - 3 uncached_global_load_transaction 0 0 0 - 3 global_store_transaction 0 0 0 - 3 l1_shared_bank_conflict 0 0 0 - 3 prof_trigger_00 0 0 0 - 3 prof_trigger_01 0 0 0 - 3 prof_trigger_02 0 0 0 - 3 prof_trigger_03 0 0 0 - 3 prof_trigger_04 0 0 0 - 3 prof_trigger_05 0 0 0 - 3 prof_trigger_06 0 0 0 - 3 prof_trigger_07 0 0 0 - 3 fb_subp0_read_sectors 3424784 4253228 3976587 - 3 fb_subp1_read_sectors 3424804 4257052 3976515 - 3 fb_subp0_write_sectors 0 1 0 - 3 fb_subp1_write_sectors 2 15 8 - 3 l2_subp0_write_sector_misses 0 1 0 - 3 l2_subp1_write_sector_misses 2 15 8 - 3 l2_subp0_read_sector_misses 3419405 4257168 3976564 - 3 l2_subp1_read_sector_misses 3424838 4257102 3978363 - 3 l2_subp0_write_sector_queries 0 0 0 - 3 l2_subp1_write_sector_queries 0 0 0 - 3 l2_subp0_read_sector_queries 3408504 4236912 3959456 - 3 l2_subp1_read_sector_queries 3408504 4236912 3959456 - 3 l2_subp0_read_tex_sector_queries 0 0 0 - 3 l2_subp1_read_tex_sector_queries 0 0 0 - 3 l2_subp0_read_hit_sectors 0 18 6 - 3 l2_subp1_read_hit_sectors 0 0 0 - 3 l2_subp0_read_tex_hit_sectors 0 0 0 - 3 l2_subp1_read_tex_hit_sectors 0 0 0 - 3 elapsed_cycles_sm 14461584 18011928 16807286 - 3 l2_subp0_read_sysmem_sector_queries 0 0 0 - 3 l2_subp1_read_sysmem_sector_queries 0 0 0 - 3 l2_subp0_write_sysmem_sector_queries 0 0 0 - 3 l2_subp1_write_sysmem_sector_queries 0 0 0 - 3 l2_subp0_total_read_sector_queries 3408536 4236946 3959488 - 3 l2_subp1_total_read_sector_queries 3408538 4236946 3959490 - 3 l2_subp0_total_write_sector_queries 1 239 139 - 3 l2_subp1_total_write_sector_queries 1 239 139 - 3 atom_count 0 0 0 - 3 gred_count 1 8 5 - 3 gld_inst_8bit 0 0 0 - 3 gld_inst_16bit 0 0 0 - 3 gld_inst_32bit 0 0 0 - 3 gld_inst_64bit 27268032 33895296 31675648 - 3 gld_inst_128bit 0 0 0 - 3 gst_inst_8bit 0 0 0 - 3 gst_inst_16bit 0 0 0 - 3 gst_inst_32bit 0 0 0 - 3 gst_inst_64bit 0 0 0 - 3 gst_inst_128bit 0 0 0 - Kernel: void mtf_thread_by4(unsigned char const *, unsigned char*, int, int) - 3 local_load 0 0 0 - 3 local_store 0 0 0 - 3 gld_request 3157040 3875998 3630842 - 3 gst_request 3157649 3876695 3631518 - 3 shared_load 3178032 3896077 3650893 - 3 shared_store 3178368 3896493 3651283 - 3 branch 11078272 13592134 12734661 - 3 divergent_branch 3138011 3851690 3609426 - 3 active_cycles 202268417 223451789 215509779 - 3 inst_issued1_0 20463266 25521263 23759080 - 3 inst_issued2_0 14418837 17627600 16447025 - 3 inst_issued1_1 19168482 23101281 21789376 - 3 inst_issued2_1 13229068 16490250 15339282 - 3 inst_executed 78283255 96051196 89992122 - 3 warps_launched 84 104 97 - 3 threads_launched 2688 3328 3114 - 3 thread_inst_executed_0 288997524 354027590 331151888 - 3 thread_inst_executed_2 113131538 142351608 132299562 - 3 thread_inst_executed_1 282253081 337156061 316058282 - 3 thread_inst_executed_3 117246182 137312601 130447865 - 3 active_warps 1216662116 1471597801 1383736242 - 3 tex0_cache_sector_queries 0 0 0 - 3 tex0_cache_sector_misses 0 0 0 - 3 tex1_cache_sector_queries 0 0 0 - 3 tex1_cache_sector_misses 0 0 0 - 3 sm_cta_launched 96 112 105 - 3 l1_local_load_hit 0 0 0 - 3 l1_local_load_miss 0 0 0 - 3 l1_local_store_hit 0 0 0 - 3 l1_local_store_miss 0 0 0 - 3 l1_global_load_hit 3301296 4055364 3685912 - 3 l1_global_load_miss 9486024 11015848 10271641 - 3 uncached_global_load_transaction 0 0 0 - 3 global_store_transaction 12435456 14680064 13757098 - 3 l1_shared_bank_conflict 0 0 0 - 3 prof_trigger_00 0 0 0 - 3 prof_trigger_01 0 0 0 - 3 prof_trigger_02 0 0 0 - 3 prof_trigger_03 0 0 0 - 3 prof_trigger_04 0 0 0 - 3 prof_trigger_05 0 0 0 - 3 prof_trigger_06 0 0 0 - 3 prof_trigger_07 0 0 0 - 3 fb_subp0_read_sectors 1629801 1809682 1731054 - 3 fb_subp1_read_sectors 1595006 1860343 1733044 - 3 fb_subp0_write_sectors 5419098 6734307 6282518 - 3 fb_subp1_write_sectors 5419130 6725698 6281141 - 3 l2_subp0_write_sector_misses 5419098 6734307 6282518 - 3 l2_subp1_write_sector_misses 5419130 6725698 6281141 - 3 l2_subp0_read_sector_misses 1624252 1811397 1727886 - 3 l2_subp1_read_sector_misses 1571320 1857843 1721419 - 3 l2_subp0_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp1_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp0_read_sector_queries 16716444 20163356 18903240 - 3 l2_subp1_read_sector_queries 16757148 20156368 18995820 - 3 l2_subp0_read_tex_sector_queries 0 0 0 - 3 l2_subp1_read_tex_sector_queries 0 0 0 - 3 l2_subp0_read_hit_sectors 15384644 18605136 17494582 - 3 l2_subp1_read_hit_sectors 159544 192020 172800 - 3 l2_subp0_read_tex_hit_sectors 0 0 0 - 3 l2_subp1_read_tex_hit_sectors 0 0 0 - 3 elapsed_cycles_sm 216951200 239003544 230841934 - 3 l2_subp0_read_sysmem_sector_queries 0 0 0 - 3 l2_subp1_read_sysmem_sector_queries 0 0 0 - 3 l2_subp0_write_sysmem_sector_queries 0 0 0 - 3 l2_subp1_write_sysmem_sector_queries 0 0 0 - 3 l2_subp0_total_read_sector_queries 16488555 20219887 18945042 - 3 l2_subp1_total_read_sector_queries 16897476 20462527 19190049 - 3 l2_subp0_total_write_sector_queries 5486594 6803458 6353580 - 3 l2_subp1_total_write_sector_queries 5486605 6803469 6353591 - 3 atom_count 0 0 0 - 3 gred_count 0 0 0 - 3 gld_inst_8bit 10975863 13610234 12710259 - 3 gld_inst_16bit 0 0 0 - 3 gld_inst_32bit 0 0 0 - 3 gld_inst_64bit 0 0 0 - 3 gld_inst_128bit 0 0 0 - 3 gst_inst_8bit 10973184 13606912 12707157 - 3 gst_inst_16bit 0 0 0 - 3 gst_inst_32bit 0 0 0 - 3 gst_inst_64bit 0 0 0 - 3 gst_inst_128bit 0 0 0 - Kernel: void mtf_thread(unsigned char const *, unsigned char*, int, int) - 3 local_load 0 0 0 - 3 local_store 0 0 0 - 3 gld_request 2413849 2959554 2773806 - 3 gst_request 2414236 2960069 2774269 - 3 shared_load 2764281 3379925 3167319 - 3 shared_store 5178769 6340306 5941881 - 3 branch 7981876 9770971 9153172 - 3 divergent_branch 2396774 2937085 2754528 - 3 active_cycles 203004749 222249700 214609991 - 3 inst_issued1_0 27615627 33369527 31275415 - 3 inst_issued2_0 13732961 16291134 15298401 - 3 inst_issued1_1 25253892 31522092 29382869 - 3 inst_issued2_1 12216547 15722016 14473907 - 3 inst_executed 86511911 105909339 99235239 - 3 warps_launched 84 104 97 - 3 threads_launched 2688 3328 3114 - 3 thread_inst_executed_0 505231326 596034510 564289407 - 3 thread_inst_executed_2 191927955 229609417 214830321 - 3 thread_inst_executed_1 453009888 590282054 541419333 - 3 thread_inst_executed_3 174072096 222491560 205527682 - 3 active_warps 1185370801 1434180516 1349304595 - 3 tex0_cache_sector_queries 0 0 0 - 3 tex0_cache_sector_misses 0 0 0 - 3 tex1_cache_sector_queries 0 0 0 - 3 tex1_cache_sector_misses 0 0 0 - 3 sm_cta_launched 92 108 102 - 3 l1_local_load_hit 0 0 0 - 3 l1_local_load_miss 0 0 0 - 3 l1_local_store_hit 0 0 0 - 3 l1_local_store_miss 0 0 0 - 3 l1_global_load_hit 4005768 4973000 4571750 - 3 l1_global_load_miss 8054976 9608256 8920454 - 3 uncached_global_load_transaction 0 0 0 - 3 global_store_transaction 12058624 14155776 13423957 - 3 l1_shared_bank_conflict 0 0 0 - 3 prof_trigger_00 0 0 0 - 3 prof_trigger_01 0 0 0 - 3 prof_trigger_02 0 0 0 - 3 prof_trigger_03 0 0 0 - 3 prof_trigger_04 0 0 0 - 3 prof_trigger_05 0 0 0 - 3 prof_trigger_06 0 0 0 - 3 prof_trigger_07 0 0 0 - 3 fb_subp0_read_sectors 1274533 1436125 1376167 - 3 fb_subp1_read_sectors 1270468 1477016 1381356 - 3 fb_subp0_write_sectors 5392520 6698177 6251124 - 3 fb_subp1_write_sectors 5389381 6692178 6250939 - 3 l2_subp0_write_sector_misses 5392520 6698177 6251124 - 3 l2_subp1_write_sector_misses 5389381 6692178 6250939 - 3 l2_subp0_read_sector_misses 1270201 1440065 1376098 - 3 l2_subp1_read_sector_misses 1273507 1483333 1383461 - 3 l2_subp0_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp1_write_sector_queries 5486592 6803456 6353578 - 3 l2_subp0_read_sector_queries 15087436 18286284 17108812 - 3 l2_subp1_read_sector_queries 15173500 18208384 17144340 - 3 l2_subp0_read_tex_sector_queries 0 0 0 - 3 l2_subp1_read_tex_sector_queries 0 0 0 - 3 l2_subp0_read_hit_sectors 14056228 17201640 16034081 - 3 l2_subp1_read_hit_sectors 141124 149732 144249 - 3 l2_subp0_read_tex_hit_sectors 0 0 0 - 3 l2_subp1_read_tex_hit_sectors 0 0 0 - 3 elapsed_cycles_sm 226399200 244269280 236851592 - 3 l2_subp0_read_sysmem_sector_queries 0 0 0 - 3 l2_subp1_read_sysmem_sector_queries 0 0 0 - 3 l2_subp0_write_sysmem_sector_queries 0 0 0 - 3 l2_subp1_write_sysmem_sector_queries 0 0 0 - 3 l2_subp0_total_read_sector_queries 15232179 18613975 17290405 - 3 l2_subp1_total_read_sector_queries 15098715 18169234 17119027 - 3 l2_subp0_total_write_sector_queries 5486595 6803459 6353581 - 3 l2_subp1_total_write_sector_queries 5486594 6803469 6353588 - 3 atom_count 0 0 0 - 3 gred_count 0 0 0 - 3 gld_inst_8bit 10975863 13610234 12710259 - 3 gld_inst_16bit 0 0 0 - 3 gld_inst_32bit 0 0 0 - 3 gld_inst_64bit 0 0 0 - 3 gld_inst_128bit 0 0 0 - 3 gst_inst_8bit 10973184 13606912 12707157 - 3 gst_inst_16bit 0 0 0 - 3 gst_inst_32bit 0 0 0 - 3 gst_inst_64bit 0 0 0 - 3 gst_inst_128bit 0 0 0 - Kernel: bsc_st8_encode_cuda_presort(unsigned char*, __int64*, unsigned char*, int) - 3 local_load 0 0 0 - 3 local_store 0 0 0 - 3 gld_request 1136168 1412304 1319818 - 3 gst_request 1704252 2118456 1979728 - 3 shared_load 7669134 9533052 8908776 - 3 shared_store 1136168 1412304 1319818 - 3 branch 1705020 2119224 1980496 - 3 divergent_branch 142021 176538 164977 - 3 active_cycles 27432122 34000001 31798625 - 3 inst_issued1_0 14196087 17692686 16515002 - 3 inst_issued2_0 2557338 3178644 2970552 - 3 inst_issued1_1 13150603 16352250 15266996 - 3 inst_issued2_1 2557338 3178644 2970552 - 3 inst_executed 33521564 41667576 38939258 - 3 warps_launched 384 384 384 - 3 threads_launched 12288 12288 12288 - 3 thread_inst_executed_0 447699344 556496928 520057706 - 3 thread_inst_executed_2 81828672 101710464 95051520 - 3 thread_inst_executed_1 449971680 559321536 522697344 - 3 thread_inst_executed_3 81828672 101710464 95051520 - 3 active_warps 1077797519 1340176434 1252246773 - 3 tex0_cache_sector_queries 0 0 0 - 3 tex0_cache_sector_misses 0 0 0 - 3 tex1_cache_sector_queries 0 0 0 - 3 tex1_cache_sector_misses 0 0 0 - 3 sm_cta_launched 64 64 64 - 3 l1_local_load_hit 0 0 0 - 3 l1_local_load_miss 0 0 0 - 3 l1_local_store_hit 0 0 0 - 3 l1_local_store_miss 0 0 0 - 3 l1_global_load_hit 709628 882024 824262 - 3 l1_global_load_miss 426660 530184 495520 - 3 uncached_global_load_transaction 0 0 0 - 3 global_store_transaction 2556432 3177720 2969640 - 3 l1_shared_bank_conflict 0 0 0 - 3 prof_trigger_00 0 0 0 - 3 prof_trigger_01 0 0 0 - 3 prof_trigger_02 0 0 0 - 3 prof_trigger_03 0 0 0 - 3 prof_trigger_04 0 0 0 - 3 prof_trigger_05 0 0 0 - 3 prof_trigger_06 0 0 0 - 3 prof_trigger_07 0 0 0 - 3 fb_subp0_read_sectors 531220 651653 608738 - 3 fb_subp1_read_sectors 537256 667423 620837 - 3 fb_subp0_write_sectors 3834566 4766528 4454388 - 3 fb_subp1_write_sectors 3834566 4766524 4454386 - 3 l2_subp0_write_sector_misses 3834566 4766528 4454388 - 3 l2_subp1_write_sector_misses 3834566 4766524 4454386 - 3 l2_subp0_read_sector_misses 518644 664073 613486 - 3 l2_subp1_read_sector_misses 533253 650032 611025 - 3 l2_subp0_write_sector_queries 3834568 4766528 4454389 - 3 l2_subp1_write_sector_queries 3834568 4766524 4454387 - 3 l2_subp0_read_sector_queries 852908 1059980 990665 - 3 l2_subp1_read_sector_queries 852824 1059960 990614 - 3 l2_subp0_read_tex_sector_queries 0 0 0 - 3 l2_subp1_read_tex_sector_queries 0 0 0 - 3 l2_subp0_read_hit_sectors 353680 428528 401558 - 3 l2_subp1_read_hit_sectors 5344 6776 6237 - 3 l2_subp0_read_tex_hit_sectors 0 0 0 - 3 l2_subp1_read_tex_hit_sectors 0 0 0 - 3 elapsed_cycles_sm 28449980 34792276 32584610 - 3 l2_subp0_read_sysmem_sector_queries 0 0 0 - 3 l2_subp1_read_sysmem_sector_queries 0 0 0 - 3 l2_subp0_write_sysmem_sector_queries 0 0 0 - 3 l2_subp1_write_sysmem_sector_queries 0 0 0 - 3 l2_subp0_total_read_sector_queries 852818 1060040 990622 - 3 l2_subp1_total_read_sector_queries 852796 1061329 991384 - 3 l2_subp0_total_write_sector_queries 3834566 4766528 4454388 - 3 l2_subp1_total_write_sector_queries 3834569 4766525 4454388 - 3 atom_count 0 0 0 - 3 gred_count 0 0 0 - 3 gld_inst_8bit 28546221 35484138 33160444 - 3 gld_inst_16bit 0 0 0 - 3 gld_inst_32bit 0 0 0 - 3 gld_inst_64bit 0 0 0 - 3 gld_inst_128bit 0 0 0 - 3 gst_inst_8bit 27268032 33895296 31675648 - 3 gst_inst_16bit 0 0 0 - 3 gst_inst_32bit 0 0 0 - 3 gst_inst_64bit 27268032 33895296 31675648 - 3 gst_inst_128bit 0 0 0 - Kernel: void cub::DeviceRadixSortUpsweepKernel::Policy520, bool=1, bool=0, __int64, int>(cub::DeviceRadixSortPolicy<__int64, cub::NullType, int>::Policy520*, bool=1*, cub::DeviceRadixSortPolicy<__int64, cub::NullType, int>::Policy520*, int, int, cub::GridEvenShare::Policy520*>) - 18 local_load 0 0 0 - 18 local_store 0 0 0 - 18 gld_request 852122 1059227 989862 - 18 gst_request 192 192 192 - 18 shared_load 913562 1132955 1059494 - 18 shared_store 869018 1079195 1008806 - 18 branch 60896 74697 70060 - 18 divergent_branch 1 2 1 - 18 active_cycles 14125824 17588395 16431055 - 18 inst_issued1_0 2826028 3510617 3280742 - 18 inst_issued2_0 2736033 3400817 3178309 - 18 inst_issued1_1 2810388 3493815 3264487 - 18 inst_issued2_1 2735457 3400248 3177736 - 18 inst_executed 15499733 19255484 17997891 - 18 warps_launched 384 384 384 - 18 threads_launched 12288 12288 12288 - 18 thread_inst_executed_0 160658272 199476800 186475285 - 18 thread_inst_executed_2 87549408 108822496 101702794 - 18 thread_inst_executed_1 160252326 199071122 186069496 - 18 thread_inst_executed_3 87530772 108803890 101684204 - 18 active_warps 214630732 267066119 249365827 - 18 tex0_cache_sector_queries 0 0 0 - 18 tex0_cache_sector_misses 0 0 0 - 18 tex1_cache_sector_queries 0 0 0 - 18 tex1_cache_sector_misses 0 0 0 - 18 sm_cta_launched 192 192 192 - 18 l1_local_load_hit 0 0 0 - 18 l1_local_load_miss 0 0 0 - 18 l1_local_store_hit 0 0 0 - 18 l1_local_store_miss 0 0 0 - 18 l1_global_load_hit 0 4 0 - 18 l1_global_load_miss 1704240 2118528 1979760 - 18 uncached_global_load_transaction 0 0 0 - 18 global_store_transaction 3072 3072 3072 - 18 l1_shared_bank_conflict 0 0 0 - 18 prof_trigger_00 0 0 0 - 18 prof_trigger_01 0 0 0 - 18 prof_trigger_02 0 0 0 - 18 prof_trigger_03 0 0 0 - 18 prof_trigger_04 0 0 0 - 18 prof_trigger_05 0 0 0 - 18 prof_trigger_06 0 0 0 - 18 prof_trigger_07 0 0 0 - 18 fb_subp0_read_sectors 3422441 4257092 3977413 - 18 fb_subp1_read_sectors 3424597 4257237 3978265 - 18 fb_subp0_write_sectors 715 853 783 - 18 fb_subp1_write_sectors 623 814 733 - 18 l2_subp0_write_sector_misses 715 853 783 - 18 l2_subp1_write_sector_misses 623 814 733 - 18 l2_subp0_read_sector_misses 3424193 4257276 3977658 - 18 l2_subp1_read_sector_misses 3424470 4257045 3977725 - 18 l2_subp0_write_sector_queries 1536 1536 1536 - 18 l2_subp1_write_sector_queries 1536 1536 1536 - 18 l2_subp0_read_sector_queries 3408484 4236900 3959446 - 18 l2_subp1_read_sector_queries 3408484 4236904 3959447 - 18 l2_subp0_read_tex_sector_queries 0 0 0 - 18 l2_subp1_read_tex_sector_queries 0 0 0 - 18 l2_subp0_read_hit_sectors 0 4 0 - 18 l2_subp1_read_hit_sectors 0 0 0 - 18 l2_subp0_read_tex_hit_sectors 0 0 0 - 18 l2_subp1_read_tex_hit_sectors 0 0 0 - 18 elapsed_cycles_sm 14226072 17720952 16541115 - 18 l2_subp0_read_sysmem_sector_queries 0 0 0 - 18 l2_subp1_read_sysmem_sector_queries 0 0 0 - 18 l2_subp0_write_sysmem_sector_queries 0 0 0 - 18 l2_subp1_write_sysmem_sector_queries 0 0 0 - 18 l2_subp0_total_read_sector_queries 3408796 4237204 3959752 - 18 l2_subp1_total_read_sector_queries 3408784 4237209 3959751 - 18 l2_subp0_total_write_sector_queries 1536 1538 1536 - 18 l2_subp1_total_write_sector_queries 1536 1547 1536 - 18 atom_count 0 0 0 - 18 gred_count 0 0 0 - 18 gld_inst_8bit 0 0 0 - 18 gld_inst_16bit 0 0 0 - 18 gld_inst_32bit 0 0 0 - 18 gld_inst_64bit 27267887 33895203 31675564 - 18 gld_inst_128bit 0 0 0 - 18 gst_inst_8bit 0 0 0 - 18 gst_inst_16bit 0 0 0 - 18 gst_inst_32bit 3072 3072 3072 - 18 gst_inst_64bit 0 0 0 - 18 gst_inst_128bit 0 0 0 - Kernel: void cub::DeviceRadixSortUpsweepKernel::Policy520, bool=1, bool=0, __int64, int>(cub::DeviceRadixSortPolicy<__int64, unsigned char, int>::Policy520*, bool=1*, cub::DeviceRadixSortPolicy<__int64, unsigned char, int>::Policy520*, int, int, cub::GridEvenShare::Policy520*>) - 3 local_load 0 0 0 - 3 local_store 0 0 0 - 3 gld_request 852122 1059226 989862 - 3 gst_request 120 120 120 - 3 shared_load 917402 1132186 1060262 - 3 shared_store 869402 1078426 1008422 - 3 branch 82554 100810 94724 - 3 divergent_branch 1 2 1 - 3 active_cycles 14181654 17625754 16472744 - 3 inst_issued1_0 3221216 4002876 3742057 - 3 inst_issued2_0 2551956 3186030 2973677 - 3 inst_issued1_1 3204517 3992211 3729586 - 3 inst_issued2_1 2551705 3185778 2973430 - 3 inst_executed 14955529 18642153 17407738 - 3 warps_launched 480 480 480 - 3 threads_launched 15360 15360 15360 - 3 thread_inst_executed_0 157764256 196459904 183504202 - 3 thread_inst_executed_2 81656672 101947008 95151946 - 3 thread_inst_executed_1 157506551 196202160 183246570 - 3 thread_inst_executed_3 81648771 101939096 95144097 - 3 active_warps 358378035 446289420 416748569 - 3 tex0_cache_sector_queries 0 0 0 - 3 tex0_cache_sector_misses 0 0 0 - 3 tex1_cache_sector_queries 0 0 0 - 3 tex1_cache_sector_misses 0 0 0 - 3 sm_cta_launched 120 120 120 - 3 l1_local_load_hit 0 0 0 - 3 l1_local_load_miss 0 0 0 - 3 l1_local_store_hit 0 0 0 - 3 l1_local_store_miss 0 0 0 - 3 l1_global_load_hit 0 0 0 - 3 l1_global_load_miss 1704012 2118336 1979652 - 3 uncached_global_load_transaction 0 0 0 - 3 global_store_transaction 1920 1920 1920 - 3 l1_shared_bank_conflict 0 0 0 - 3 prof_trigger_00 0 0 0 - 3 prof_trigger_01 0 0 0 - 3 prof_trigger_02 0 0 0 - 3 prof_trigger_03 0 0 0 - 3 prof_trigger_04 0 0 0 - 3 prof_trigger_05 0 0 0 - 3 prof_trigger_06 0 0 0 - 3 prof_trigger_07 0 0 0 - 3 fb_subp0_read_sectors 3424697 4256984 3976570 - 3 fb_subp1_read_sectors 3424725 4253009 3976435 - 3 fb_subp0_write_sectors 370 431 405 - 3 fb_subp1_write_sectors 442 516 471 - 3 l2_subp0_write_sector_misses 370 431 405 - 3 l2_subp1_write_sector_misses 442 516 471 - 3 l2_subp0_read_sector_misses 3424877 4253068 3976940 - 3 l2_subp1_read_sector_misses 3419356 4257005 3976479 - 3 l2_subp0_write_sector_queries 960 960 960 - 3 l2_subp1_write_sector_queries 960 960 960 - 3 l2_subp0_read_sector_queries 3408488 4236900 3959446 - 3 l2_subp1_read_sector_queries 3408484 4236904 3959446 - 3 l2_subp0_read_tex_sector_queries 0 0 0 - 3 l2_subp1_read_tex_sector_queries 0 0 0 - 3 l2_subp0_read_hit_sectors 0 0 0 - 3 l2_subp1_read_hit_sectors 0 0 0 - 3 l2_subp0_read_tex_hit_sectors 0 0 0 - 3 l2_subp1_read_tex_hit_sectors 0 0 0 - 3 elapsed_cycles_sm 14235288 17690704 16525357 - 3 l2_subp0_read_sysmem_sector_queries 0 0 0 - 3 l2_subp1_read_sysmem_sector_queries 0 0 0 - 3 l2_subp0_write_sysmem_sector_queries 0 0 0 - 3 l2_subp1_write_sysmem_sector_queries 0 0 0 - 3 l2_subp0_total_read_sector_queries 3408766 4237156 3959699 - 3 l2_subp1_total_read_sector_queries 3408725 4237153 3959690 - 3 l2_subp0_total_write_sector_queries 960 962 961 - 3 l2_subp1_total_write_sector_queries 960 960 960 - 3 atom_count 0 0 0 - 3 gred_count 0 0 0 - 3 gld_inst_8bit 0 0 0 - 3 gld_inst_16bit 0 0 0 - 3 gld_inst_32bit 0 0 0 - 3 gld_inst_64bit 27267887 33895203 31675564 - 3 gld_inst_128bit 0 0 0 - 3 gst_inst_8bit 0 0 0 - 3 gst_inst_16bit 0 0 0 - 3 gst_inst_32bit 1920 1920 1920 - 3 gst_inst_64bit 0 0 0 - 3 gst_inst_128bit 0 0 0 + Kernel: void mtf_cuda_2buffers(unsigned char const *, unsigned char*, int, int) + 1 local_load 0 0 0 0 + 1 local_store 0 0 0 0 + 1 gld_request 36716918 36716918 36716918 36716918 + 1 gst_request 36708352 36708352 36708352 36708352 + 1 shared_load 40518034 40518034 40518034 40518034 + 1 shared_store 75304969 75304969 75304969 75304969 + 1 branch 44871115 44871115 44871115 44871115 + 1 divergent_branch 0 0 0 0 + 1 active_cycles 424345680 424345680 424345680 424345680 + 1 inst_issued1_0 231568042 231568042 231568042 231568042 + 1 inst_issued2_0 89791939 89791939 89791939 89791939 + 1 inst_issued1_1 231418937 231418937 231418937 231418937 + 1 inst_issued2_1 89748211 89748211 89748211 89748211 + 1 inst_executed 820467872 820467872 820467872 820467872 + 1 warps_launched 4484 4484 4484 4484 + 1 threads_launched 143488 143488 143488 143488 + 1 thread_inst_executed_0 1.0258e+10 1.0258e+10 1.0258e+10 1.0258e+10 + 1 thread_inst_executed_2 2873338976 2873338976 2873338976 2873338976 + 1 thread_inst_executed_1 1.0252e+10 1.0252e+10 1.0252e+10 1.0252e+10 + 1 thread_inst_executed_3 2871939680 2871939680 2871939680 2871939680 + 1 active_warps 9517303812 9517303812 9517303812 9517303812 + 1 tex0_cache_sector_queries 0 0 0 0 + 1 tex0_cache_sector_misses 0 0 0 0 + 1 tex1_cache_sector_queries 0 0 0 0 + 1 tex1_cache_sector_misses 0 0 0 0 + 1 sm_cta_launched 1124 1124 1124 1124 + 1 l1_local_load_hit 0 0 0 0 + 1 l1_local_load_miss 0 0 0 0 + 1 l1_local_store_hit 0 0 0 0 + 1 l1_local_store_miss 0 0 0 0 + 1 l1_global_load_hit 29597200 29597200 29597200 29597200 + 1 l1_global_load_miss 6980464 6980464 6980464 6980464 + 1 uncached_global_load_transaction 0 0 0 0 + 1 global_store_transaction 36700160 36700160 36700160 36700160 + 1 l1_shared_bank_conflict 0 0 0 0 + 1 prof_trigger_00 0 0 0 0 + 1 prof_trigger_01 0 0 0 0 + 1 prof_trigger_02 0 0 0 0 + 1 prof_trigger_03 0 0 0 0 + 1 prof_trigger_04 0 0 0 0 + 1 prof_trigger_05 0 0 0 0 + 1 prof_trigger_06 0 0 0 0 + 1 prof_trigger_07 0 0 0 0 + 1 fb_subp0_read_sectors 1062163 1062163 1062163 1062163 + 1 fb_subp1_read_sectors 1057100 1057100 1057100 1057100 + 1 fb_subp0_write_sectors 16233711 16233711 16233711 16233711 + 1 fb_subp1_write_sectors 16247710 16247710 16247710 16247710 + 1 l2_subp0_write_sector_misses 16267773 16267773 16267773 16267773 + 1 l2_subp1_write_sector_misses 16247710 16247710 16247710 16247710 + 1 l2_subp0_read_sector_misses 1061054 1061054 1061054 1061054 + 1 l2_subp1_read_sector_misses 1055652 1055652 1055652 1055652 + 1 l2_subp0_write_sector_queries 18354176 18354176 18354176 18354176 + 1 l2_subp1_write_sector_queries 18354176 18354176 18354176 18354176 + 1 l2_subp0_read_sector_queries 13813944 13813944 13813944 13813944 + 1 l2_subp1_read_sector_queries 14351728 14351728 14351728 14351728 + 1 l2_subp0_read_tex_sector_queries 0 0 0 0 + 1 l2_subp1_read_tex_sector_queries 0 0 0 0 + 1 l2_subp0_read_hit_sectors 13652172 13652172 13652172 13652172 + 1 l2_subp1_read_hit_sectors 14024560 14024560 14024560 14024560 + 1 l2_subp0_read_tex_hit_sectors 0 0 0 0 + 1 l2_subp1_read_tex_hit_sectors 0 0 0 0 + 1 elapsed_cycles_sm 434255784 434255784 434255784 434255784 + 1 l2_subp0_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_write_sysmem_sector_queries 2 2 2 2 + 1 l2_subp0_total_read_sector_queries 13663059 13663059 13663059 13663059 + 1 l2_subp1_total_read_sector_queries 15158872 15158872 15158872 15158872 + 1 l2_subp0_total_write_sector_queries 18354178 18354178 18354178 18354178 + 1 l2_subp1_total_write_sector_queries 18354182 18354182 18354182 18354182 + 1 atom_count 0 0 0 0 + 1 gred_count 0 0 0 0 + 1 gld_inst_8bit 1174941376 1174941376 1174941376 1174941376 + 1 gld_inst_16bit 0 0 0 0 + 1 gld_inst_32bit 0 0 0 0 + 1 gld_inst_64bit 0 0 0 0 + 1 gld_inst_128bit 0 0 0 0 + 1 gst_inst_8bit 36708352 36708352 36708352 36708352 + 1 gst_inst_16bit 0 0 0 0 + 1 gst_inst_32bit 0 0 0 0 + 1 gst_inst_64bit 0 0 0 0 + 1 gst_inst_128bit 0 0 0 0 + Kernel: void mtf_cuda_thread_by4(unsigned char const *, unsigned char*, int, int) + 1 local_load 0 0 0 0 + 1 local_store 0 0 0 0 + 1 gld_request 10882318 10882318 10882318 10882318 + 1 gst_request 10884318 10884318 10884318 10884318 + 1 shared_load 13817286 13817286 13817286 13817286 + 1 shared_store 13821782 13821782 13821782 13821782 + 1 branch 42466425 42466425 42466425 42466425 + 1 divergent_branch 10780055 10780055 10780055 10780055 + 1 active_cycles 608254000 608254000 608254000 608254000 + 1 inst_issued1_0 109902820 109902820 109902820 109902820 + 1 inst_issued2_0 51239718 51239718 51239718 51239718 + 1 inst_issued1_1 85309160 85309160 85309160 85309160 + 1 inst_issued2_1 51893196 51893196 51893196 51893196 + 1 inst_executed 297620701 297620701 297620701 297620701 + 1 warps_launched 284 284 284 284 + 1 threads_launched 9088 9088 9088 9088 + 1 thread_inst_executed_0 1055924522 1055924522 1055924522 1055924522 + 1 thread_inst_executed_2 447695597 447695597 447695597 447695597 + 1 thread_inst_executed_1 1095090723 1095090723 1095090723 1095090723 + 1 thread_inst_executed_3 462920834 462920834 462920834 462920834 + 1 active_warps 1.2269e+10 1.2269e+10 1.2269e+10 1.2269e+10 + 1 tex0_cache_sector_queries 0 0 0 0 + 1 tex0_cache_sector_misses 0 0 0 0 + 1 tex1_cache_sector_queries 0 0 0 0 + 1 tex1_cache_sector_misses 0 0 0 0 + 1 sm_cta_launched 72 72 72 72 + 1 l1_local_load_hit 0 0 0 0 + 1 l1_local_load_miss 0 0 0 0 + 1 l1_local_store_hit 0 0 0 0 + 1 l1_local_store_miss 0 0 0 0 + 1 l1_global_load_hit 5115744 5115744 5115744 5115744 + 1 l1_global_load_miss 32658596 32658596 32658596 32658596 + 1 uncached_global_load_transaction 0 0 0 0 + 1 global_store_transaction 37765120 37765120 37765120 37765120 + 1 l1_shared_bank_conflict 0 0 0 0 + 1 prof_trigger_00 0 0 0 0 + 1 prof_trigger_01 0 0 0 0 + 1 prof_trigger_02 0 0 0 0 + 1 prof_trigger_03 0 0 0 0 + 1 prof_trigger_04 0 0 0 0 + 1 prof_trigger_05 0 0 0 0 + 1 prof_trigger_06 0 0 0 0 + 1 prof_trigger_07 0 0 0 0 + 1 fb_subp0_read_sectors 26026479 26026479 26026479 26026479 + 1 fb_subp1_read_sectors 26326042 26326042 26326042 26326042 + 1 fb_subp0_write_sectors 18019297 18019297 18019297 18019297 + 1 fb_subp1_write_sectors 18008587 18008587 18008587 18008587 + 1 l2_subp0_write_sector_misses 18022663 18022663 18022663 18022663 + 1 l2_subp1_write_sector_misses 18008587 18008587 18008587 18008587 + 1 l2_subp0_read_sector_misses 25931301 25931301 25931301 25931301 + 1 l2_subp1_read_sector_misses 26520474 26520474 26520474 26520474 + 1 l2_subp0_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp0_read_sector_queries 63273300 63273300 63273300 63273300 + 1 l2_subp1_read_sector_queries 63387536 63387536 63387536 63387536 + 1 l2_subp0_read_tex_sector_queries 0 0 0 0 + 1 l2_subp1_read_tex_sector_queries 0 0 0 0 + 1 l2_subp0_read_hit_sectors 37987356 37987356 37987356 37987356 + 1 l2_subp1_read_hit_sectors 37389604 37389604 37389604 37389604 + 1 l2_subp0_read_tex_hit_sectors 0 0 0 0 + 1 l2_subp1_read_tex_hit_sectors 0 0 0 0 + 1 elapsed_cycles_sm 674324216 674324216 674324216 674324216 + 1 l2_subp0_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_total_read_sector_queries 63315814 63315814 63315814 63315814 + 1 l2_subp1_total_read_sector_queries 63142456 63142456 63142456 63142456 + 1 l2_subp0_total_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_total_write_sector_queries 18352133 18352133 18352133 18352133 + 1 atom_count 0 0 0 0 + 1 gred_count 0 0 0 0 + 1 gld_inst_8bit 36713217 36713217 36713217 36713217 + 1 gld_inst_16bit 0 0 0 0 + 1 gld_inst_32bit 0 0 0 0 + 1 gld_inst_64bit 0 0 0 0 + 1 gld_inst_128bit 0 0 0 0 + 1 gst_inst_8bit 36704256 36704256 36704256 36704256 + 1 gst_inst_16bit 0 0 0 0 + 1 gst_inst_32bit 0 0 0 0 + 1 gst_inst_64bit 0 0 0 0 + 1 gst_inst_128bit 0 0 0 0 + Kernel: void mtf_cuda_4by8(unsigned char const *, unsigned char*, int, int) + 1 local_load 0 0 0 0 + 1 local_store 0 0 0 0 + 1 gld_request 14145630 14145630 14145630 14145630 + 1 gst_request 14144469 14144469 14144469 14144469 + 1 shared_load 24983597 24983597 24983597 24983597 + 1 shared_store 39016540 39016540 39016540 39016540 + 1 branch 44503051 44503051 44503051 44503051 + 1 divergent_branch 9808814 9808814 9808814 9808814 + 1 active_cycles 359238555 359238555 359238555 359238555 + 1 inst_issued1_0 191148076 191148076 191148076 191148076 + 1 inst_issued2_0 91163507 91163507 91163507 91163507 + 1 inst_issued1_1 191310786 191310786 191310786 191310786 + 1 inst_issued2_1 91328912 91328912 91328912 91328912 + 1 inst_executed 623380211 623380211 623380211 623380211 + 1 warps_launched 2248 2248 2248 2248 + 1 threads_launched 71936 71936 71936 71936 + 1 thread_inst_executed_0 5163813268 5163813268 5163813268 5163813268 + 1 thread_inst_executed_2 2152163824 2152163824 2152163824 2152163824 + 1 thread_inst_executed_1 5167624317 5167624317 5167624317 5167624317 + 1 thread_inst_executed_3 2154005856 2154005856 2154005856 2154005856 + 1 active_warps 1.2634e+10 1.2634e+10 1.2634e+10 1.2634e+10 + 1 tex0_cache_sector_queries 0 0 0 0 + 1 tex0_cache_sector_misses 0 0 0 0 + 1 tex1_cache_sector_queries 0 0 0 0 + 1 tex1_cache_sector_misses 0 0 0 0 + 1 sm_cta_launched 284 284 284 284 + 1 l1_local_load_hit 0 0 0 0 + 1 l1_local_load_miss 0 0 0 0 + 1 l1_local_store_hit 0 0 0 0 + 1 l1_local_store_miss 0 0 0 0 + 1 l1_global_load_hit 15722852 15722852 15722852 15722852 + 1 l1_global_load_miss 22035100 22035100 22035100 22035100 + 1 uncached_global_load_transaction 0 0 0 0 + 1 global_store_transaction 36716544 36716544 36716544 36716544 + 1 l1_shared_bank_conflict 71463908 71463908 71463908 71463908 + 1 prof_trigger_00 0 0 0 0 + 1 prof_trigger_01 0 0 0 0 + 1 prof_trigger_02 0 0 0 0 + 1 prof_trigger_03 0 0 0 0 + 1 prof_trigger_04 0 0 0 0 + 1 prof_trigger_05 0 0 0 0 + 1 prof_trigger_06 0 0 0 0 + 1 prof_trigger_07 0 0 0 0 + 1 fb_subp0_read_sectors 1152710 1152710 1152710 1152710 + 1 fb_subp1_read_sectors 1179475 1179475 1179475 1179475 + 1 fb_subp0_write_sectors 16530585 16530585 16530585 16530585 + 1 fb_subp1_write_sectors 16534703 16534703 16534703 16534703 + 1 l2_subp0_write_sector_misses 16539348 16539348 16539348 16539348 + 1 l2_subp1_write_sector_misses 16534703 16534703 16534703 16534703 + 1 l2_subp0_read_sector_misses 1151148 1151148 1151148 1151148 + 1 l2_subp1_read_sector_misses 1173543 1173543 1173543 1173543 + 1 l2_subp0_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp0_read_sector_queries 42834884 42834884 42834884 42834884 + 1 l2_subp1_read_sector_queries 42463900 42463900 42463900 42463900 + 1 l2_subp0_read_tex_sector_queries 0 0 0 0 + 1 l2_subp1_read_tex_sector_queries 0 0 0 0 + 1 l2_subp0_read_hit_sectors 41641276 41641276 41641276 41641276 + 1 l2_subp1_read_hit_sectors 42071748 42071748 42071748 42071748 + 1 l2_subp0_read_tex_hit_sectors 0 0 0 0 + 1 l2_subp1_read_tex_hit_sectors 0 0 0 0 + 1 elapsed_cycles_sm 377753956 377753956 377753956 377753956 + 1 l2_subp0_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_write_sysmem_sector_queries 2 2 2 2 + 1 l2_subp1_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_total_read_sector_queries 41950439 41950439 41950439 41950439 + 1 l2_subp1_total_read_sector_queries 42919800 42919800 42919800 42919800 + 1 l2_subp0_total_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_total_write_sector_queries 18352131 18352131 18352131 18352131 + 1 atom_count 0 0 0 0 + 1 gred_count 0 0 0 0 + 1 gld_inst_8bit 293705736 293705736 293705736 293705736 + 1 gld_inst_16bit 0 0 0 0 + 1 gld_inst_32bit 0 0 0 0 + 1 gld_inst_64bit 0 0 0 0 + 1 gld_inst_128bit 0 0 0 0 + 1 gst_inst_8bit 293634048 293634048 293634048 293634048 + 1 gst_inst_16bit 0 0 0 0 + 1 gst_inst_32bit 0 0 0 0 + 1 gst_inst_64bit 0 0 0 0 + 1 gst_inst_128bit 0 0 0 0 + Kernel: void mtf_cuda_thread(unsigned char const *, unsigned char*, int, int) + 1 local_load 0 0 0 0 + 1 local_store 0 0 0 0 + 1 gld_request 9510273 9510273 9510273 9510273 + 1 gst_request 9511965 9511965 9511965 9511965 + 1 shared_load 12159416 12159416 12159416 12159416 + 1 shared_store 21675596 21675596 21675596 21675596 + 1 branch 34035679 34035679 34035679 34035679 + 1 divergent_branch 9416081 9416081 9416081 9416081 + 1 active_cycles 563733131 563733131 563733131 563733131 + 1 inst_issued1_0 138815900 138815900 138815900 138815900 + 1 inst_issued2_0 55561689 55561689 55561689 55561689 + 1 inst_issued1_1 114044830 114044830 114044830 114044830 + 1 inst_issued2_1 55846976 55846976 55846976 55846976 + 1 inst_executed 365209249 365209249 365209249 365209249 + 1 warps_launched 284 284 284 284 + 1 threads_launched 9088 9088 9088 9088 + 1 thread_inst_executed_0 1760232850 1760232850 1760232850 1760232850 + 1 thread_inst_executed_2 845356452 845356452 845356452 845356452 + 1 thread_inst_executed_1 1833239341 1833239341 1833239341 1833239341 + 1 thread_inst_executed_3 857394438 857394438 857394438 857394438 + 1 active_warps 1.1154e+10 1.1154e+10 1.1154e+10 1.1154e+10 + 1 tex0_cache_sector_queries 0 0 0 0 + 1 tex0_cache_sector_misses 0 0 0 0 + 1 tex1_cache_sector_queries 0 0 0 0 + 1 tex1_cache_sector_misses 0 0 0 0 + 1 sm_cta_launched 68 68 68 68 + 1 l1_local_load_hit 0 0 0 0 + 1 l1_local_load_miss 0 0 0 0 + 1 l1_local_store_hit 0 0 0 0 + 1 l1_local_store_miss 0 0 0 0 + 1 l1_global_load_hit 7015784 7015784 7015784 7015784 + 1 l1_global_load_miss 30742168 30742168 30742168 30742168 + 1 uncached_global_load_transaction 0 0 0 0 + 1 global_store_transaction 35651584 35651584 35651584 35651584 + 1 l1_shared_bank_conflict 0 0 0 0 + 1 prof_trigger_00 0 0 0 0 + 1 prof_trigger_01 0 0 0 0 + 1 prof_trigger_02 0 0 0 0 + 1 prof_trigger_03 0 0 0 0 + 1 prof_trigger_04 0 0 0 0 + 1 prof_trigger_05 0 0 0 0 + 1 prof_trigger_06 0 0 0 0 + 1 prof_trigger_07 0 0 0 0 + 1 fb_subp0_read_sectors 20909176 20909176 20909176 20909176 + 1 fb_subp1_read_sectors 21231838 21231838 21231838 21231838 + 1 fb_subp0_write_sectors 17838501 17838501 17838501 17838501 + 1 fb_subp1_write_sectors 17808741 17808741 17808741 17808741 + 1 l2_subp0_write_sector_misses 17839988 17839988 17839988 17839988 + 1 l2_subp1_write_sector_misses 17808741 17808741 17808741 17808741 + 1 l2_subp0_read_sector_misses 20736692 20736692 20736692 20736692 + 1 l2_subp1_read_sector_misses 21131985 21131985 21131985 21131985 + 1 l2_subp0_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp0_read_sector_queries 60526588 60526588 60526588 60526588 + 1 l2_subp1_read_sector_queries 60852052 60852052 60852052 60852052 + 1 l2_subp0_read_tex_sector_queries 0 0 0 0 + 1 l2_subp1_read_tex_sector_queries 0 0 0 0 + 1 l2_subp0_read_hit_sectors 40362196 40362196 40362196 40362196 + 1 l2_subp1_read_hit_sectors 40133048 40133048 40133048 40133048 + 1 l2_subp0_read_tex_hit_sectors 0 0 0 0 + 1 l2_subp1_read_tex_hit_sectors 0 0 0 0 + 1 elapsed_cycles_sm 662381400 662381400 662381400 662381400 + 1 l2_subp0_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_write_sysmem_sector_queries 2 2 2 2 + 1 l2_subp0_total_read_sector_queries 59966205 59966205 59966205 59966205 + 1 l2_subp1_total_read_sector_queries 60204652 60204652 60204652 60204652 + 1 l2_subp0_total_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_total_write_sector_queries 18352144 18352144 18352144 18352144 + 1 atom_count 0 0 0 0 + 1 gred_count 0 0 0 0 + 1 gld_inst_8bit 36713217 36713217 36713217 36713217 + 1 gld_inst_16bit 0 0 0 0 + 1 gld_inst_32bit 0 0 0 0 + 1 gld_inst_64bit 0 0 0 0 + 1 gld_inst_128bit 0 0 0 0 + 1 gst_inst_8bit 36704256 36704256 36704256 36704256 + 1 gst_inst_16bit 0 0 0 0 + 1 gst_inst_32bit 0 0 0 0 + 1 gst_inst_64bit 0 0 0 0 + 1 gst_inst_128bit 0 0 0 0 + Kernel: void mtf_cuda_thread_by4(unsigned char const *, unsigned char*, int, int) + 1 local_load 0 0 0 0 + 1 local_store 0 0 0 0 + 1 gld_request 11111097 11111097 11111097 11111097 + 1 gst_request 11113106 11113106 11113106 11113106 + 1 shared_load 15022860 15022860 15022860 15022860 + 1 shared_store 15040844 15040844 15040844 15040844 + 1 branch 41044638 41044638 41044638 41044638 + 1 divergent_branch 10996503 10996503 10996503 10996503 + 1 active_cycles 573961111 573961111 573961111 573961111 + 1 inst_issued1_0 80756530 80756530 80756530 80756530 + 1 inst_issued2_0 45325170 45325170 45325170 45325170 + 1 inst_issued1_1 79138202 79138202 79138202 79138202 + 1 inst_issued2_1 43982243 43982243 43982243 43982243 + 1 inst_executed 286973422 286973422 286973422 286973422 + 1 warps_launched 281 281 281 281 + 1 threads_launched 8992 8992 8992 8992 + 1 thread_inst_executed_0 1098954160 1098954160 1098954160 1098954160 + 1 thread_inst_executed_2 413728720 413728720 413728720 413728720 + 1 thread_inst_executed_1 1111677993 1111677993 1111677993 1111677993 + 1 thread_inst_executed_3 407820788 407820788 407820788 407820788 + 1 active_warps 3220940193 3220940193 3220940193 3220940193 + 1 tex0_cache_sector_queries 0 0 0 0 + 1 tex0_cache_sector_misses 0 0 0 0 + 1 tex1_cache_sector_queries 0 0 0 0 + 1 tex1_cache_sector_misses 0 0 0 0 + 1 sm_cta_launched 284 284 284 284 + 1 l1_local_load_hit 0 0 0 0 + 1 l1_local_load_miss 0 0 0 0 + 1 l1_local_store_hit 0 0 0 0 + 1 l1_local_store_miss 0 0 0 0 + 1 l1_global_load_hit 18199072 18199072 18199072 18199072 + 1 l1_global_load_miss 19034464 19034464 19034464 19034464 + 1 uncached_global_load_transaction 0 0 0 0 + 1 global_store_transaction 37224448 37224448 37224448 37224448 + 1 l1_shared_bank_conflict 0 0 0 0 + 1 prof_trigger_00 0 0 0 0 + 1 prof_trigger_01 0 0 0 0 + 1 prof_trigger_02 0 0 0 0 + 1 prof_trigger_03 0 0 0 0 + 1 prof_trigger_04 0 0 0 0 + 1 prof_trigger_05 0 0 0 0 + 1 prof_trigger_06 0 0 0 0 + 1 prof_trigger_07 0 0 0 0 + 1 fb_subp0_read_sectors 1387587 1387587 1387587 1387587 + 1 fb_subp1_read_sectors 1395697 1395697 1395697 1395697 + 1 fb_subp0_write_sectors 18158878 18158878 18158878 18158878 + 1 fb_subp1_write_sectors 18158284 18158284 18158284 18158284 + 1 l2_subp0_write_sector_misses 18159734 18159734 18159734 18159734 + 1 l2_subp1_write_sector_misses 18158284 18158284 18158284 18158284 + 1 l2_subp0_read_sector_misses 1390277 1390277 1390277 1390277 + 1 l2_subp1_read_sector_misses 1392649 1392649 1392649 1392649 + 1 l2_subp0_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp0_read_sector_queries 38106904 38106904 38106904 38106904 + 1 l2_subp1_read_sector_queries 37920228 37920228 37920228 37920228 + 1 l2_subp0_read_tex_sector_queries 0 0 0 0 + 1 l2_subp1_read_tex_sector_queries 0 0 0 0 + 1 l2_subp0_read_hit_sectors 37320760 37320760 37320760 37320760 + 1 l2_subp1_read_hit_sectors 37435744 37435744 37435744 37435744 + 1 l2_subp0_read_tex_hit_sectors 0 0 0 0 + 1 l2_subp1_read_tex_hit_sectors 0 0 0 0 + 1 elapsed_cycles_sm 628071896 628071896 628071896 628071896 + 1 l2_subp0_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_write_sysmem_sector_queries 2 2 2 2 + 1 l2_subp0_total_read_sector_queries 38355589 38355589 38355589 38355589 + 1 l2_subp1_total_read_sector_queries 38473928 38473928 38473928 38473928 + 1 l2_subp0_total_write_sector_queries 18352130 18352130 18352130 18352130 + 1 l2_subp1_total_write_sector_queries 18352133 18352133 18352133 18352133 + 1 atom_count 0 0 0 0 + 1 gred_count 0 0 0 0 + 1 gld_inst_8bit 36713217 36713217 36713217 36713217 + 1 gld_inst_16bit 0 0 0 0 + 1 gld_inst_32bit 0 0 0 0 + 1 gld_inst_64bit 0 0 0 0 + 1 gld_inst_128bit 0 0 0 0 + 1 gst_inst_8bit 36704256 36704256 36704256 36704256 + 1 gst_inst_16bit 0 0 0 0 + 1 gst_inst_32bit 0 0 0 0 + 1 gst_inst_64bit 0 0 0 0 + 1 gst_inst_128bit 0 0 0 0 + Kernel: void mtf_cuda_2buffers_depth32(unsigned char const *, unsigned char*, int, int) + 1 local_load 0 0 0 0 + 1 local_store 0 0 0 0 + 1 gld_request 36717314 36717314 36717314 36717314 + 1 gst_request 36708352 36708352 36708352 36708352 + 1 shared_load 36708352 36708352 36708352 36708352 + 1 shared_store 73421110 73421110 73421110 73421110 + 1 branch 1156101 1156101 1156101 1156101 + 1 divergent_branch 0 0 0 0 + 1 active_cycles 330732874 330732874 330732874 330732874 + 1 inst_issued1_0 168221301 168221301 168221301 168221301 + 1 inst_issued2_0 114757258 114757258 114757258 114757258 + 1 inst_issued1_1 168141460 168141460 168141460 168141460 + 1 inst_issued2_1 114706052 114706052 114706052 114706052 + 1 inst_executed 795086262 795086262 795086262 795086262 + 1 warps_launched 4484 4484 4484 4484 + 1 threads_launched 143488 143488 143488 143488 + 1 thread_inst_executed_0 9051990848 9051990848 9051990848 9051990848 + 1 thread_inst_executed_2 3672228160 3672228160 3672228160 3672228160 + 1 thread_inst_executed_1 9047951808 9047951808 9047951808 9047951808 + 1 thread_inst_executed_3 3670589568 3670589568 3670589568 3670589568 + 1 active_warps 1.0236e+10 1.0236e+10 1.0236e+10 1.0236e+10 + 1 tex0_cache_sector_queries 0 0 0 0 + 1 tex0_cache_sector_misses 0 0 0 0 + 1 tex1_cache_sector_queries 0 0 0 0 + 1 tex1_cache_sector_misses 0 0 0 0 + 1 sm_cta_launched 1120 1120 1120 1120 + 1 l1_local_load_hit 0 0 0 0 + 1 l1_local_load_miss 0 0 0 0 + 1 l1_local_store_hit 0 0 0 0 + 1 l1_local_store_miss 0 0 0 0 + 1 l1_global_load_hit 29618560 29618560 29618560 29618560 + 1 l1_global_load_miss 7090560 7090560 7090560 7090560 + 1 uncached_global_load_transaction 0 0 0 0 + 1 global_store_transaction 36700160 36700160 36700160 36700160 + 1 l1_shared_bank_conflict 0 0 0 0 + 1 prof_trigger_00 0 0 0 0 + 1 prof_trigger_01 0 0 0 0 + 1 prof_trigger_02 0 0 0 0 + 1 prof_trigger_03 0 0 0 0 + 1 prof_trigger_04 0 0 0 0 + 1 prof_trigger_05 0 0 0 0 + 1 prof_trigger_06 0 0 0 0 + 1 prof_trigger_07 0 0 0 0 + 1 fb_subp0_read_sectors 949715 949715 949715 949715 + 1 fb_subp1_read_sectors 956054 956054 956054 956054 + 1 fb_subp0_write_sectors 14307549 14307549 14307549 14307549 + 1 fb_subp1_write_sectors 14267155 14267155 14267155 14267155 + 1 l2_subp0_write_sector_misses 14354513 14354513 14354513 14354513 + 1 l2_subp1_write_sector_misses 14267155 14267155 14267155 14267155 + 1 l2_subp0_read_sector_misses 949894 949894 949894 949894 + 1 l2_subp1_read_sector_misses 955270 955270 955270 955270 + 1 l2_subp0_write_sector_queries 18354176 18354176 18354176 18354176 + 1 l2_subp1_write_sector_queries 18354176 18354176 18354176 18354176 + 1 l2_subp0_read_sector_queries 15075172 15075172 15075172 15075172 + 1 l2_subp1_read_sector_queries 16333216 16333216 16333216 16333216 + 1 l2_subp0_read_tex_sector_queries 0 0 0 0 + 1 l2_subp1_read_tex_sector_queries 0 0 0 0 + 1 l2_subp0_read_hit_sectors 15910736 15910736 15910736 15910736 + 1 l2_subp1_read_hit_sectors 15508240 15508240 15508240 15508240 + 1 l2_subp0_read_tex_hit_sectors 0 0 0 0 + 1 l2_subp1_read_tex_hit_sectors 0 0 0 0 + 1 elapsed_cycles_sm 332355872 332355872 332355872 332355872 + 1 l2_subp0_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_write_sysmem_sector_queries 1 1 1 1 + 1 l2_subp0_total_read_sector_queries 15173419 15173419 15173419 15173419 + 1 l2_subp1_total_read_sector_queries 15437044 15437044 15437044 15437044 + 1 l2_subp0_total_write_sector_queries 18354176 18354176 18354176 18354176 + 1 l2_subp1_total_write_sector_queries 18354179 18354179 18354179 18354179 + 1 atom_count 0 0 0 0 + 1 gred_count 0 0 0 0 + 1 gld_inst_8bit 1174954048 1174954048 1174954048 1174954048 + 1 gld_inst_16bit 0 0 0 0 + 1 gld_inst_32bit 0 0 0 0 + 1 gld_inst_64bit 0 0 0 0 + 1 gld_inst_128bit 0 0 0 0 + 1 gst_inst_8bit 1174667264 1174667264 1174667264 1174667264 + 1 gst_inst_16bit 0 0 0 0 + 1 gst_inst_32bit 0 0 0 0 + 1 gst_inst_64bit 0 0 0 0 + 1 gst_inst_128bit 0 0 0 0 + Kernel: void mtf_cuda_2buffers_depth32(unsigned char const *, unsigned char*, int, int) + 1 local_load 0 0 0 0 + 1 local_store 0 0 0 0 + 1 gld_request 36713217 36713217 36713217 36713217 + 1 gst_request 36704256 36704256 36704256 36704256 + 1 shared_load 36704256 36704256 36704256 36704256 + 1 shared_store 73417013 73417013 73417013 73417013 + 1 branch 770647 770647 770647 770647 + 1 divergent_branch 0 0 0 0 + 1 active_cycles 333638568 333638568 333638568 333638568 + 1 inst_issued1_0 196361417 196361417 196361417 196361417 + 1 inst_issued2_0 92952326 92952326 92952326 92952326 + 1 inst_issued1_1 196332088 196332088 196332088 196332088 + 1 inst_issued2_1 92890111 92890111 92890111 92890111 + 1 inst_executed 764005908 764005908 764005908 764005908 + 1 warps_launched 2988 2988 2988 2988 + 1 threads_launched 95616 95616 95616 95616 + 1 thread_inst_executed_0 9253716480 9253716480 9253716480 9253716480 + 1 thread_inst_executed_2 2974470336 2974470336 2974470336 2974470336 + 1 thread_inst_executed_1 9247522784 9247522784 9247522784 9247522784 + 1 thread_inst_executed_3 2972479456 2972479456 2972479456 2972479456 + 1 active_warps 1.0274e+10 1.0274e+10 1.0274e+10 1.0274e+10 + 1 tex0_cache_sector_queries 0 0 0 0 + 1 tex0_cache_sector_misses 0 0 0 0 + 1 tex1_cache_sector_queries 0 0 0 0 + 1 tex1_cache_sector_misses 0 0 0 0 + 1 sm_cta_launched 744 744 744 744 + 1 l1_local_load_hit 0 0 0 0 + 1 l1_local_load_miss 0 0 0 0 + 1 l1_local_store_hit 0 0 0 0 + 1 l1_local_store_miss 0 0 0 0 + 1 l1_global_load_hit 25393216 25393216 25393216 25393216 + 1 l1_global_load_miss 11332292 11332292 11332292 11332292 + 1 uncached_global_load_transaction 0 0 0 0 + 1 global_store_transaction 36569088 36569088 36569088 36569088 + 1 l1_shared_bank_conflict 0 0 0 0 + 1 prof_trigger_00 0 0 0 0 + 1 prof_trigger_01 0 0 0 0 + 1 prof_trigger_02 0 0 0 0 + 1 prof_trigger_03 0 0 0 0 + 1 prof_trigger_04 0 0 0 0 + 1 prof_trigger_05 0 0 0 0 + 1 prof_trigger_06 0 0 0 0 + 1 prof_trigger_07 0 0 0 0 + 1 fb_subp0_read_sectors 955033 955033 955033 955033 + 1 fb_subp1_read_sectors 955607 955607 955607 955607 + 1 fb_subp0_write_sectors 15908757 15908757 15908757 15908757 + 1 fb_subp1_write_sectors 15876889 15876889 15876889 15876889 + 1 l2_subp0_write_sector_misses 15906888 15906888 15906888 15906888 + 1 l2_subp1_write_sector_misses 15876889 15876889 15876889 15876889 + 1 l2_subp0_read_sector_misses 955596 955596 955596 955596 + 1 l2_subp1_read_sector_misses 955403 955403 955403 955403 + 1 l2_subp0_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp0_read_sector_queries 23485164 23485164 23485164 23485164 + 1 l2_subp1_read_sector_queries 23803760 23803760 23803760 23803760 + 1 l2_subp0_read_tex_sector_queries 0 0 0 0 + 1 l2_subp1_read_tex_sector_queries 0 0 0 0 + 1 l2_subp0_read_hit_sectors 24151532 24151532 24151532 24151532 + 1 l2_subp1_read_hit_sectors 23335716 23335716 23335716 23335716 + 1 l2_subp0_read_tex_hit_sectors 0 0 0 0 + 1 l2_subp1_read_tex_hit_sectors 0 0 0 0 + 1 elapsed_cycles_sm 336008340 336008340 336008340 336008340 + 1 l2_subp0_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_total_read_sector_queries 23874555 23874555 23874555 23874555 + 1 l2_subp1_total_read_sector_queries 24045168 24045168 24045168 24045168 + 1 l2_subp0_total_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_total_write_sector_queries 18352131 18352131 18352131 18352131 + 1 atom_count 0 0 0 0 + 1 gred_count 0 0 0 0 + 1 gld_inst_8bit 1174822944 1174822944 1174822944 1174822944 + 1 gld_inst_16bit 0 0 0 0 + 1 gld_inst_32bit 0 0 0 0 + 1 gld_inst_64bit 0 0 0 0 + 1 gld_inst_128bit 0 0 0 0 + 1 gst_inst_8bit 1174536192 1174536192 1174536192 1174536192 + 1 gst_inst_16bit 0 0 0 0 + 1 gst_inst_32bit 0 0 0 0 + 1 gst_inst_64bit 0 0 0 0 + 1 gst_inst_128bit 0 0 0 0 + Kernel: void mtf_cuda_thread(unsigned char const *, unsigned char*, int, int) + 1 local_load 0 0 0 0 + 1 local_store 0 0 0 0 + 1 gld_request 10125684 10125684 10125684 10125684 + 1 gst_request 10127446 10127446 10127446 10127446 + 1 shared_load 13330443 13330443 13330443 13330443 + 1 shared_store 23475592 23475592 23475592 23475592 + 1 branch 37020223 37020223 37020223 37020223 + 1 divergent_branch 10017982 10017982 10017982 10017982 + 1 active_cycles 766257512 766257512 766257512 766257512 + 1 inst_issued1_0 104547971 104547971 104547971 104547971 + 1 inst_issued2_0 53897466 53897466 53897466 53897466 + 1 inst_issued1_1 103226785 103226785 103226785 103226785 + 1 inst_issued2_1 53619355 53619355 53619355 53619355 + 1 inst_executed 369460401 369460401 369460401 369460401 + 1 warps_launched 281 281 281 281 + 1 threads_launched 8992 8992 8992 8992 + 1 thread_inst_executed_0 1798685922 1798685922 1798685922 1798685922 + 1 thread_inst_executed_2 754655126 754655126 754655126 754655126 + 1 thread_inst_executed_1 1803027662 1803027662 1803027662 1803027662 + 1 thread_inst_executed_3 764706082 764706082 764706082 764706082 + 1 active_warps 4172498498 4172498498 4172498498 4172498498 + 1 tex0_cache_sector_queries 0 0 0 0 + 1 tex0_cache_sector_misses 0 0 0 0 + 1 tex1_cache_sector_queries 0 0 0 0 + 1 tex1_cache_sector_misses 0 0 0 0 + 1 sm_cta_launched 284 284 284 284 + 1 l1_local_load_hit 0 0 0 0 + 1 l1_local_load_miss 0 0 0 0 + 1 l1_local_store_hit 0 0 0 0 + 1 l1_local_store_miss 0 0 0 0 + 1 l1_global_load_hit 22555772 22555772 22555772 22555772 + 1 l1_global_load_miss 14694152 14694152 14694152 14694152 + 1 uncached_global_load_transaction 0 0 0 0 + 1 global_store_transaction 37240832 37240832 37240832 37240832 + 1 l1_shared_bank_conflict 0 0 0 0 + 1 prof_trigger_00 0 0 0 0 + 1 prof_trigger_01 0 0 0 0 + 1 prof_trigger_02 0 0 0 0 + 1 prof_trigger_03 0 0 0 0 + 1 prof_trigger_04 0 0 0 0 + 1 prof_trigger_05 0 0 0 0 + 1 prof_trigger_06 0 0 0 0 + 1 prof_trigger_07 0 0 0 0 + 1 fb_subp0_read_sectors 1574556 1574556 1574556 1574556 + 1 fb_subp1_read_sectors 1574619 1574619 1574619 1574619 + 1 fb_subp0_write_sectors 18325423 18325423 18325423 18325423 + 1 fb_subp1_write_sectors 18325084 18325084 18325084 18325084 + 1 l2_subp0_write_sector_misses 18325346 18325346 18325346 18325346 + 1 l2_subp1_write_sector_misses 18325084 18325084 18325084 18325084 + 1 l2_subp0_read_sector_misses 1574309 1574309 1574309 1574309 + 1 l2_subp1_read_sector_misses 1574715 1574715 1574715 1574715 + 1 l2_subp0_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp0_read_sector_queries 30441300 30441300 30441300 30441300 + 1 l2_subp1_read_sector_queries 30373072 30373072 30373072 30373072 + 1 l2_subp0_read_tex_sector_queries 0 0 0 0 + 1 l2_subp1_read_tex_sector_queries 0 0 0 0 + 1 l2_subp0_read_hit_sectors 29973548 29973548 29973548 29973548 + 1 l2_subp1_read_hit_sectors 29679624 29679624 29679624 29679624 + 1 l2_subp0_read_tex_hit_sectors 0 0 0 0 + 1 l2_subp1_read_tex_hit_sectors 0 0 0 0 + 1 elapsed_cycles_sm 853306928 853306928 853306928 853306928 + 1 l2_subp0_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_total_read_sector_queries 30613729 30613729 30613729 30613729 + 1 l2_subp1_total_read_sector_queries 30425844 30425844 30425844 30425844 + 1 l2_subp0_total_write_sector_queries 18352130 18352130 18352130 18352130 + 1 l2_subp1_total_write_sector_queries 18352146 18352146 18352146 18352146 + 1 atom_count 0 0 0 0 + 1 gred_count 0 0 0 0 + 1 gld_inst_8bit 36713217 36713217 36713217 36713217 + 1 gld_inst_16bit 0 0 0 0 + 1 gld_inst_32bit 0 0 0 0 + 1 gld_inst_64bit 0 0 0 0 + 1 gld_inst_128bit 0 0 0 0 + 1 gst_inst_8bit 36704256 36704256 36704256 36704256 + 1 gst_inst_16bit 0 0 0 0 + 1 gst_inst_32bit 0 0 0 0 + 1 gst_inst_64bit 0 0 0 0 + 1 gst_inst_128bit 0 0 0 0 + Kernel: void mtf_cuda_2buffers_depth32(unsigned char const *, unsigned char*, int, int) + 1 local_load 0 0 0 0 + 1 local_store 0 0 0 0 + 1 gld_request 36725508 36725508 36725508 36725508 + 1 gst_request 36716544 36716544 36716544 36716544 + 1 shared_load 36716544 36716544 36716544 36716544 + 1 shared_store 73429304 73429304 73429304 73429304 + 1 branch 578181 578181 578181 578181 + 1 divergent_branch 0 0 0 0 + 1 active_cycles 332523777 332523777 332523777 332523777 + 1 inst_issued1_0 148007794 148007794 148007794 148007794 + 1 inst_issued2_0 116810572 116810572 116810572 116810572 + 1 inst_issued1_1 147874790 147874790 147874790 147874790 + 1 inst_issued2_1 116706372 116706372 116706372 116706372 + 1 inst_executed 756779004 756779004 756779004 756779004 + 1 warps_launched 2244 2244 2244 2244 + 1 threads_launched 71808 71808 71808 71808 + 1 thread_inst_executed_0 8375932864 8375932864 8375932864 8375932864 + 1 thread_inst_executed_2 3737934208 3737934208 3737934208 3737934208 + 1 thread_inst_executed_1 8368461248 8368461248 8368461248 8368461248 + 1 thread_inst_executed_3 3734599808 3734599808 3734599808 3734599808 + 1 active_warps 1.0132e+10 1.0132e+10 1.0132e+10 1.0132e+10 + 1 tex0_cache_sector_queries 0 0 0 0 + 1 tex0_cache_sector_misses 0 0 0 0 + 1 tex1_cache_sector_queries 0 0 0 0 + 1 tex1_cache_sector_misses 0 0 0 0 + 1 sm_cta_launched 564 564 564 564 + 1 l1_local_load_hit 0 0 0 0 + 1 l1_local_load_miss 0 0 0 0 + 1 l1_local_store_hit 0 0 0 0 + 1 l1_local_store_miss 0 0 0 0 + 1 l1_global_load_hit 19691560 19691560 19691560 19691560 + 1 l1_global_load_miss 16820904 16820904 16820904 16820904 + 1 uncached_global_load_transaction 0 0 0 0 + 1 global_store_transaction 36700160 36700160 36700160 36700160 + 1 l1_shared_bank_conflict 0 0 0 0 + 1 prof_trigger_00 0 0 0 0 + 1 prof_trigger_01 0 0 0 0 + 1 prof_trigger_02 0 0 0 0 + 1 prof_trigger_03 0 0 0 0 + 1 prof_trigger_04 0 0 0 0 + 1 prof_trigger_05 0 0 0 0 + 1 prof_trigger_06 0 0 0 0 + 1 prof_trigger_07 0 0 0 0 + 1 fb_subp0_read_sectors 1209883 1209883 1209883 1209883 + 1 fb_subp1_read_sectors 1185474 1185474 1185474 1185474 + 1 fb_subp0_write_sectors 16830428 16830428 16830428 16830428 + 1 fb_subp1_write_sectors 17054969 17054969 17054969 17054969 + 1 l2_subp0_write_sector_misses 16901584 16901584 16901584 16901584 + 1 l2_subp1_write_sector_misses 17054969 17054969 17054969 17054969 + 1 l2_subp0_read_sector_misses 1207751 1207751 1207751 1207751 + 1 l2_subp1_read_sector_misses 1120467 1120467 1120467 1120467 + 1 l2_subp0_write_sector_queries 18358272 18358272 18358272 18358272 + 1 l2_subp1_write_sector_queries 18358272 18358272 18358272 18358272 + 1 l2_subp0_read_sector_queries 34492596 34492596 34492596 34492596 + 1 l2_subp1_read_sector_queries 35253748 35253748 35253748 35253748 + 1 l2_subp0_read_tex_sector_queries 0 0 0 0 + 1 l2_subp1_read_tex_sector_queries 0 0 0 0 + 1 l2_subp0_read_hit_sectors 33652936 33652936 33652936 33652936 + 1 l2_subp1_read_hit_sectors 33963284 33963284 33963284 33963284 + 1 l2_subp0_read_tex_hit_sectors 0 0 0 0 + 1 l2_subp1_read_tex_hit_sectors 0 0 0 0 + 1 elapsed_cycles_sm 334941312 334941312 334941312 334941312 + 1 l2_subp0_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_total_read_sector_queries 34966167 34966167 34966167 34966167 + 1 l2_subp1_total_read_sector_queries 35490112 35490112 35490112 35490112 + 1 l2_subp0_total_write_sector_queries 18358272 18358272 18358272 18358272 + 1 l2_subp1_total_write_sector_queries 18358275 18358275 18358275 18358275 + 1 atom_count 0 0 0 0 + 1 gred_count 0 0 0 0 + 1 gld_inst_8bit 1175216256 1175216256 1175216256 1175216256 + 1 gld_inst_16bit 0 0 0 0 + 1 gld_inst_32bit 0 0 0 0 + 1 gld_inst_64bit 0 0 0 0 + 1 gld_inst_128bit 0 0 0 0 + 1 gst_inst_8bit 1174929408 1174929408 1174929408 1174929408 + 1 gst_inst_16bit 0 0 0 0 + 1 gst_inst_32bit 0 0 0 0 + 1 gst_inst_64bit 0 0 0 0 + 1 gst_inst_128bit 0 0 0 0 + Kernel: void mtf_cuda_thread(unsigned char const *, unsigned char*, int, int) + 1 local_load 0 0 0 0 + 1 local_store 0 0 0 0 + 1 gld_request 6082355 6082355 6082355 6082355 + 1 gst_request 6083279 6083279 6083279 6083279 + 1 shared_load 6462783 6462783 6462783 6462783 + 1 shared_store 12546343 12546343 12546343 12546343 + 1 branch 19091057 19091057 19091057 19091057 + 1 divergent_branch 6049485 6049485 6049485 6049485 + 1 active_cycles 709424446 709424446 709424446 709424446 + 1 inst_issued1_0 337984001 337984001 337984001 337984001 + 1 inst_issued2_0 34616091 34616091 34616091 34616091 + 1 inst_issued1_1 203542149 203542149 203542149 203542149 + 1 inst_issued2_1 34736297 34736297 34736297 34736297 + 1 inst_executed 208358080 208358080 208358080 208358080 + 1 warps_launched 284 284 284 284 + 1 threads_launched 9088 9088 9088 9088 + 1 thread_inst_executed_0 1144956810 1144956810 1144956810 1144956810 + 1 thread_inst_executed_2 570766441 570766441 570766441 570766441 + 1 thread_inst_executed_1 1168028019 1168028019 1168028019 1168028019 + 1 thread_inst_executed_3 577185594 577185594 577185594 577185594 + 1 active_warps 1.9862e+10 1.9862e+10 1.9862e+10 1.9862e+10 + 1 tex0_cache_sector_queries 0 0 0 0 + 1 tex0_cache_sector_misses 0 0 0 0 + 1 tex1_cache_sector_queries 0 0 0 0 + 1 tex1_cache_sector_misses 0 0 0 0 + 1 sm_cta_launched 72 72 72 72 + 1 l1_local_load_hit 0 0 0 0 + 1 l1_local_load_miss 0 0 0 0 + 1 l1_local_store_hit 0 0 0 0 + 1 l1_local_store_miss 0 0 0 0 + 1 l1_global_load_hit 2406332 2406332 2406332 2406332 + 1 l1_global_load_miss 35351620 35351620 35351620 35351620 + 1 uncached_global_load_transaction 0 0 0 0 + 1 global_store_transaction 37765120 37765120 37765120 37765120 + 1 l1_shared_bank_conflict 0 0 0 0 + 1 prof_trigger_00 0 0 0 0 + 1 prof_trigger_01 0 0 0 0 + 1 prof_trigger_02 0 0 0 0 + 1 prof_trigger_03 0 0 0 0 + 1 prof_trigger_04 0 0 0 0 + 1 prof_trigger_05 0 0 0 0 + 1 prof_trigger_06 0 0 0 0 + 1 prof_trigger_07 0 0 0 0 + 1 fb_subp0_read_sectors 47497641 47497641 47497641 47497641 + 1 fb_subp1_read_sectors 47505518 47505518 47505518 47505518 + 1 fb_subp0_write_sectors 18254007 18254007 18254007 18254007 + 1 fb_subp1_write_sectors 18258787 18258787 18258787 18258787 + 1 l2_subp0_write_sector_misses 18252258 18252258 18252258 18252258 + 1 l2_subp1_write_sector_misses 18258787 18258787 18258787 18258787 + 1 l2_subp0_read_sector_misses 47657052 47657052 47657052 47657052 + 1 l2_subp1_read_sector_misses 47448171 47448171 47448171 47448171 + 1 l2_subp0_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp0_read_sector_queries 68460584 68460584 68460584 68460584 + 1 l2_subp1_read_sector_queries 68485836 68485836 68485836 68485836 + 1 l2_subp0_read_tex_sector_queries 0 0 0 0 + 1 l2_subp1_read_tex_sector_queries 0 0 0 0 + 1 l2_subp0_read_hit_sectors 21521092 21521092 21521092 21521092 + 1 l2_subp1_read_hit_sectors 21670764 21670764 21670764 21670764 + 1 l2_subp0_read_tex_hit_sectors 0 0 0 0 + 1 l2_subp1_read_tex_hit_sectors 0 0 0 0 + 1 elapsed_cycles_sm 737459536 737459536 737459536 737459536 + 1 l2_subp0_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_total_read_sector_queries 68768346 68768346 68768346 68768346 + 1 l2_subp1_total_read_sector_queries 68437792 68437792 68437792 68437792 + 1 l2_subp0_total_write_sector_queries 18352130 18352130 18352130 18352130 + 1 l2_subp1_total_write_sector_queries 18352134 18352134 18352134 18352134 + 1 atom_count 0 0 0 0 + 1 gred_count 0 0 0 0 + 1 gld_inst_8bit 36713217 36713217 36713217 36713217 + 1 gld_inst_16bit 0 0 0 0 + 1 gld_inst_32bit 0 0 0 0 + 1 gld_inst_64bit 0 0 0 0 + 1 gld_inst_128bit 0 0 0 0 + 1 gst_inst_8bit 36704256 36704256 36704256 36704256 + 1 gst_inst_16bit 0 0 0 0 + 1 gst_inst_32bit 0 0 0 0 + 1 gst_inst_64bit 0 0 0 0 + 1 gst_inst_128bit 0 0 0 0 + Kernel: void mtf_cuda_thread_by4(unsigned char const *, unsigned char*, int, int) + 1 local_load 0 0 0 0 + 1 local_store 0 0 0 0 + 1 gld_request 8251932 8251932 8251932 8251932 + 1 gst_request 8253385 8253385 8253385 8253385 + 1 shared_load 7475750 7475750 7475750 7475750 + 1 shared_store 7476312 7476312 7476312 7476312 + 1 branch 27715507 27715507 27715507 27715507 + 1 divergent_branch 8210887 8210887 8210887 8210887 + 1 active_cycles 740649528 740649528 740649528 740649528 + 1 inst_issued1_0 334835859 334835859 334835859 334835859 + 1 inst_issued2_0 29921701 29921701 29921701 29921701 + 1 inst_issued1_1 194539366 194539366 194539366 194539366 + 1 inst_issued2_1 29990521 29990521 29990521 29990521 + 1 inst_executed 196552394 196552394 196552394 196552394 + 1 warps_launched 284 284 284 284 + 1 threads_launched 9088 9088 9088 9088 + 1 thread_inst_executed_0 772770521 772770521 772770521 772770521 + 1 thread_inst_executed_2 288572786 288572786 288572786 288572786 + 1 thread_inst_executed_1 785397047 785397047 785397047 785397047 + 1 thread_inst_executed_3 293234095 293234095 293234095 293234095 + 1 active_warps 2.0847e+10 2.0847e+10 2.0847e+10 2.0847e+10 + 1 tex0_cache_sector_queries 0 0 0 0 + 1 tex0_cache_sector_misses 0 0 0 0 + 1 tex1_cache_sector_queries 0 0 0 0 + 1 tex1_cache_sector_misses 0 0 0 0 + 1 sm_cta_launched 72 72 72 72 + 1 l1_local_load_hit 0 0 0 0 + 1 l1_local_load_miss 0 0 0 0 + 1 l1_local_store_hit 0 0 0 0 + 1 l1_local_store_miss 0 0 0 0 + 1 l1_global_load_hit 2259108 2259108 2259108 2259108 + 1 l1_global_load_miss 35498844 35498844 35498844 35498844 + 1 uncached_global_load_transaction 0 0 0 0 + 1 global_store_transaction 37765120 37765120 37765120 37765120 + 1 l1_shared_bank_conflict 0 0 0 0 + 1 prof_trigger_00 0 0 0 0 + 1 prof_trigger_01 0 0 0 0 + 1 prof_trigger_02 0 0 0 0 + 1 prof_trigger_03 0 0 0 0 + 1 prof_trigger_04 0 0 0 0 + 1 prof_trigger_05 0 0 0 0 + 1 prof_trigger_06 0 0 0 0 + 1 prof_trigger_07 0 0 0 0 + 1 fb_subp0_read_sectors 49289849 49289849 49289849 49289849 + 1 fb_subp1_read_sectors 49555803 49555803 49555803 49555803 + 1 fb_subp0_write_sectors 18250831 18250831 18250831 18250831 + 1 fb_subp1_write_sectors 18259571 18259571 18259571 18259571 + 1 l2_subp0_write_sector_misses 18240457 18240457 18240457 18240457 + 1 l2_subp1_write_sector_misses 18259571 18259571 18259571 18259571 + 1 l2_subp0_read_sector_misses 49438437 49438437 49438437 49438437 + 1 l2_subp1_read_sector_misses 49165955 49165955 49165955 49165955 + 1 l2_subp0_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp0_read_sector_queries 68897244 68897244 68897244 68897244 + 1 l2_subp1_read_sector_queries 68985036 68985036 68985036 68985036 + 1 l2_subp0_read_tex_sector_queries 0 0 0 0 + 1 l2_subp1_read_tex_sector_queries 0 0 0 0 + 1 l2_subp0_read_hit_sectors 19813700 19813700 19813700 19813700 + 1 l2_subp1_read_hit_sectors 20185552 20185552 20185552 20185552 + 1 l2_subp0_read_tex_hit_sectors 0 0 0 0 + 1 l2_subp1_read_tex_hit_sectors 0 0 0 0 + 1 elapsed_cycles_sm 765572800 765572800 765572800 765572800 + 1 l2_subp0_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_write_sysmem_sector_queries 1 1 1 1 + 1 l2_subp1_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_total_read_sector_queries 68901346 68901346 68901346 68901346 + 1 l2_subp1_total_read_sector_queries 69041052 69041052 69041052 69041052 + 1 l2_subp0_total_write_sector_queries 18352130 18352130 18352130 18352130 + 1 l2_subp1_total_write_sector_queries 18352134 18352134 18352134 18352134 + 1 atom_count 0 0 0 0 + 1 gred_count 0 0 0 0 + 1 gld_inst_8bit 36713217 36713217 36713217 36713217 + 1 gld_inst_16bit 0 0 0 0 + 1 gld_inst_32bit 0 0 0 0 + 1 gld_inst_64bit 0 0 0 0 + 1 gld_inst_128bit 0 0 0 0 + 1 gst_inst_8bit 36704256 36704256 36704256 36704256 + 1 gst_inst_16bit 0 0 0 0 + 1 gst_inst_32bit 0 0 0 0 + 1 gst_inst_64bit 0 0 0 0 + 1 gst_inst_128bit 0 0 0 0 + Kernel: void mtf_cuda_thread_by4(unsigned char const *, unsigned char*, int, int) + 1 local_load 0 0 0 0 + 1 local_store 0 0 0 0 + 1 gld_request 10591710 10591710 10591710 10591710 + 1 gst_request 10593628 10593628 10593628 10593628 + 1 shared_load 12376912 12376912 12376912 12376912 + 1 shared_store 12379160 12379160 12379160 12379160 + 1 branch 39729054 39729054 39729054 39729054 + 1 divergent_branch 10502375 10502375 10502375 10502375 + 1 active_cycles 688921507 688921507 688921507 688921507 + 1 inst_issued1_0 245369699 245369699 245369699 245369699 + 1 inst_issued2_0 48405847 48405847 48405847 48405847 + 1 inst_issued1_1 142077952 142077952 142077952 142077952 + 1 inst_issued2_1 49045081 49045081 49045081 49045081 + 1 inst_executed 279259990 279259990 279259990 279259990 + 1 warps_launched 284 284 284 284 + 1 threads_launched 9088 9088 9088 9088 + 1 thread_inst_executed_0 985195443 985195443 985195443 985195443 + 1 thread_inst_executed_2 420192662 420192662 420192662 420192662 + 1 thread_inst_executed_1 1017513690 1017513690 1017513690 1017513690 + 1 thread_inst_executed_3 432773641 432773641 432773641 432773641 + 1 active_warps 1.9640e+10 1.9640e+10 1.9640e+10 1.9640e+10 + 1 tex0_cache_sector_queries 0 0 0 0 + 1 tex0_cache_sector_misses 0 0 0 0 + 1 tex1_cache_sector_queries 0 0 0 0 + 1 tex1_cache_sector_misses 0 0 0 0 + 1 sm_cta_launched 72 72 72 72 + 1 l1_local_load_hit 0 0 0 0 + 1 l1_local_load_miss 0 0 0 0 + 1 l1_local_store_hit 0 0 0 0 + 1 l1_local_store_miss 0 0 0 0 + 1 l1_global_load_hit 2677716 2677716 2677716 2677716 + 1 l1_global_load_miss 35096624 35096624 35096624 35096624 + 1 uncached_global_load_transaction 0 0 0 0 + 1 global_store_transaction 37748736 37748736 37748736 37748736 + 1 l1_shared_bank_conflict 0 0 0 0 + 1 prof_trigger_00 0 0 0 0 + 1 prof_trigger_01 0 0 0 0 + 1 prof_trigger_02 0 0 0 0 + 1 prof_trigger_03 0 0 0 0 + 1 prof_trigger_04 0 0 0 0 + 1 prof_trigger_05 0 0 0 0 + 1 prof_trigger_06 0 0 0 0 + 1 prof_trigger_07 0 0 0 0 + 1 fb_subp0_read_sectors 44975299 44975299 44975299 44975299 + 1 fb_subp1_read_sectors 44611682 44611682 44611682 44611682 + 1 fb_subp0_write_sectors 18158225 18158225 18158225 18158225 + 1 fb_subp1_write_sectors 18166094 18166094 18166094 18166094 + 1 l2_subp0_write_sector_misses 18160293 18160293 18160293 18160293 + 1 l2_subp1_write_sector_misses 18166094 18166094 18166094 18166094 + 1 l2_subp0_read_sector_misses 44919470 44919470 44919470 44919470 + 1 l2_subp1_read_sector_misses 44760890 44760890 44760890 44760890 + 1 l2_subp0_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp0_read_sector_queries 67696980 67696980 67696980 67696980 + 1 l2_subp1_read_sector_queries 67196644 67196644 67196644 67196644 + 1 l2_subp0_read_tex_sector_queries 0 0 0 0 + 1 l2_subp1_read_tex_sector_queries 0 0 0 0 + 1 l2_subp0_read_hit_sectors 23592168 23592168 23592168 23592168 + 1 l2_subp1_read_hit_sectors 23557808 23557808 23557808 23557808 + 1 l2_subp0_read_tex_hit_sectors 0 0 0 0 + 1 l2_subp1_read_tex_hit_sectors 0 0 0 0 + 1 elapsed_cycles_sm 753454288 753454288 753454288 753454288 + 1 l2_subp0_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_write_sysmem_sector_queries 1 1 1 1 + 1 l2_subp0_total_read_sector_queries 67182382 67182382 67182382 67182382 + 1 l2_subp1_total_read_sector_queries 67553104 67553104 67553104 67553104 + 1 l2_subp0_total_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_total_write_sector_queries 18352159 18352159 18352159 18352159 + 1 atom_count 0 0 0 0 + 1 gred_count 0 0 0 0 + 1 gld_inst_8bit 36713217 36713217 36713217 36713217 + 1 gld_inst_16bit 0 0 0 0 + 1 gld_inst_32bit 0 0 0 0 + 1 gld_inst_64bit 0 0 0 0 + 1 gld_inst_128bit 0 0 0 0 + 1 gst_inst_8bit 36704256 36704256 36704256 36704256 + 1 gst_inst_16bit 0 0 0 0 + 1 gst_inst_32bit 0 0 0 0 + 1 gst_inst_64bit 0 0 0 0 + 1 gst_inst_128bit 0 0 0 0 + Kernel: void mtf_cuda_4by8(unsigned char const *, unsigned char*, int, int) + 1 local_load 0 0 0 0 + 1 local_store 0 0 0 0 + 1 gld_request 11411004 11411004 11411004 11411004 + 1 gst_request 11411393 11411393 11411393 11411393 + 1 shared_load 23675950 23675950 23675950 23675950 + 1 shared_store 34699899 34699899 34699899 34699899 + 1 branch 37276879 37276879 37276879 37276879 + 1 divergent_branch 10748963 10748963 10748963 10748963 + 1 active_cycles 382927642 382927642 382927642 382927642 + 1 inst_issued1_0 182234759 182234759 182234759 182234759 + 1 inst_issued2_0 85564042 85564042 85564042 85564042 + 1 inst_issued1_1 181744394 181744394 181744394 181744394 + 1 inst_issued2_1 85075600 85075600 85075600 85075600 + 1 inst_executed 557769056 557769056 557769056 557769056 + 1 warps_launched 1124 1124 1124 1124 + 1 threads_launched 35968 35968 35968 35968 + 1 thread_inst_executed_0 3659929074 3659929074 3659929074 3659929074 + 1 thread_inst_executed_2 1669723944 1669723944 1669723944 1669723944 + 1 thread_inst_executed_1 3669181043 3669181043 3669181043 3669181043 + 1 thread_inst_executed_3 1674407008 1674407008 1674407008 1674407008 + 1 active_warps 8002404096 8002404096 8002404096 8002404096 + 1 tex0_cache_sector_queries 0 0 0 0 + 1 tex0_cache_sector_misses 0 0 0 0 + 1 tex1_cache_sector_queries 0 0 0 0 + 1 tex1_cache_sector_misses 0 0 0 0 + 1 sm_cta_launched 272 272 272 272 + 1 l1_local_load_hit 0 0 0 0 + 1 l1_local_load_miss 0 0 0 0 + 1 l1_local_store_hit 0 0 0 0 + 1 l1_local_store_miss 0 0 0 0 + 1 l1_global_load_hit 16633180 16633180 16633180 16633180 + 1 l1_global_load_miss 19043496 19043496 19043496 19043496 + 1 uncached_global_load_transaction 0 0 0 0 + 1 global_store_transaction 35651584 35651584 35651584 35651584 + 1 l1_shared_bank_conflict 91088040 91088040 91088040 91088040 + 1 prof_trigger_00 0 0 0 0 + 1 prof_trigger_01 0 0 0 0 + 1 prof_trigger_02 0 0 0 0 + 1 prof_trigger_03 0 0 0 0 + 1 prof_trigger_04 0 0 0 0 + 1 prof_trigger_05 0 0 0 0 + 1 prof_trigger_06 0 0 0 0 + 1 prof_trigger_07 0 0 0 0 + 1 fb_subp0_read_sectors 1288788 1288788 1288788 1288788 + 1 fb_subp1_read_sectors 1312283 1312283 1312283 1312283 + 1 fb_subp0_write_sectors 16386060 16386060 16386060 16386060 + 1 fb_subp1_write_sectors 16375296 16375296 16375296 16375296 + 1 l2_subp0_write_sector_misses 16378897 16378897 16378897 16378897 + 1 l2_subp1_write_sector_misses 16375296 16375296 16375296 16375296 + 1 l2_subp0_read_sector_misses 1276341 1276341 1276341 1276341 + 1 l2_subp1_read_sector_misses 1319263 1319263 1319263 1319263 + 1 l2_subp0_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp0_read_sector_queries 40291296 40291296 40291296 40291296 + 1 l2_subp1_read_sector_queries 40035120 40035120 40035120 40035120 + 1 l2_subp0_read_tex_sector_queries 0 0 0 0 + 1 l2_subp1_read_tex_sector_queries 0 0 0 0 + 1 l2_subp0_read_hit_sectors 39306500 39306500 39306500 39306500 + 1 l2_subp1_read_hit_sectors 39294860 39294860 39294860 39294860 + 1 l2_subp0_read_tex_hit_sectors 0 0 0 0 + 1 l2_subp1_read_tex_hit_sectors 0 0 0 0 + 1 elapsed_cycles_sm 421723444 421723444 421723444 421723444 + 1 l2_subp0_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_total_read_sector_queries 40418281 40418281 40418281 40418281 + 1 l2_subp1_total_read_sector_queries 40339084 40339084 40339084 40339084 + 1 l2_subp0_total_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_total_write_sector_queries 18352151 18352151 18352151 18352151 + 1 atom_count 0 0 0 0 + 1 gred_count 0 0 0 0 + 1 gld_inst_8bit 146852868 146852868 146852868 146852868 + 1 gld_inst_16bit 0 0 0 0 + 1 gld_inst_32bit 0 0 0 0 + 1 gld_inst_64bit 0 0 0 0 + 1 gld_inst_128bit 0 0 0 0 + 1 gst_inst_8bit 146817024 146817024 146817024 146817024 + 1 gst_inst_16bit 0 0 0 0 + 1 gst_inst_32bit 0 0 0 0 + 1 gst_inst_64bit 0 0 0 0 + 1 gst_inst_128bit 0 0 0 0 + Kernel: void mtf_cuda_2symbols(unsigned char const *, unsigned char*, int, int) + 1 local_load 0 0 0 0 + 1 local_store 0 0 0 0 + 1 gld_request 40170970 40170970 40170970 40170970 + 1 gst_request 36704256 36704256 36704256 36704256 + 1 shared_load 39522009 39522009 39522009 39522009 + 1 shared_store 75300865 75300865 75300865 75300865 + 1 branch 42903444 42903444 42903444 42903444 + 1 divergent_branch 0 0 0 0 + 1 active_cycles 537230353 537230353 537230353 537230353 + 1 inst_issued1_0 305535308 305535308 305535308 305535308 + 1 inst_issued2_0 103134627 103134627 103134627 103134627 + 1 inst_issued1_1 305274087 305274087 305274087 305274087 + 1 inst_issued2_1 103108796 103108796 103108796 103108796 + 1 inst_executed 1023285210 1023285210 1023285210 1023285210 + 1 warps_launched 8964 8964 8964 8964 + 1 threads_launched 286848 286848 286848 286848 + 1 thread_inst_executed_0 1.3077e+10 1.3077e+10 1.3077e+10 1.3077e+10 + 1 thread_inst_executed_2 3300303968 3300303968 3300303968 3300303968 + 1 thread_inst_executed_1 1.3068e+10 1.3068e+10 1.3068e+10 1.3068e+10 + 1 thread_inst_executed_3 3299477376 3299477376 3299477376 3299477376 + 1 active_warps 1.5858e+10 1.5858e+10 1.5858e+10 1.5858e+10 + 1 tex0_cache_sector_queries 0 0 0 0 + 1 tex0_cache_sector_misses 0 0 0 0 + 1 tex1_cache_sector_queries 0 0 0 0 + 1 tex1_cache_sector_misses 0 0 0 0 + 1 sm_cta_launched 2248 2248 2248 2248 + 1 l1_local_load_hit 0 0 0 0 + 1 l1_local_load_miss 0 0 0 0 + 1 l1_local_store_hit 0 0 0 0 + 1 l1_local_store_miss 0 0 0 0 + 1 l1_global_load_hit 38984316 38984316 38984316 38984316 + 1 l1_global_load_miss 1343540 1343540 1343540 1343540 + 1 uncached_global_load_transaction 0 0 0 0 + 1 global_store_transaction 36831232 36831232 36831232 36831232 + 1 l1_shared_bank_conflict 0 0 0 0 + 1 prof_trigger_00 0 0 0 0 + 1 prof_trigger_01 0 0 0 0 + 1 prof_trigger_02 0 0 0 0 + 1 prof_trigger_03 0 0 0 0 + 1 prof_trigger_04 0 0 0 0 + 1 prof_trigger_05 0 0 0 0 + 1 prof_trigger_06 0 0 0 0 + 1 prof_trigger_07 0 0 0 0 + 1 fb_subp0_read_sectors 1180640 1180640 1180640 1180640 + 1 fb_subp1_read_sectors 1173353 1173353 1173353 1173353 + 1 fb_subp0_write_sectors 16623084 16623084 16623084 16623084 + 1 fb_subp1_write_sectors 16619993 16619993 16619993 16619993 + 1 l2_subp0_write_sector_misses 16579998 16579998 16579998 16579998 + 1 l2_subp1_write_sector_misses 16619993 16619993 16619993 16619993 + 1 l2_subp0_read_sector_misses 1172456 1172456 1172456 1172456 + 1 l2_subp1_read_sector_misses 1172133 1172133 1172133 1172133 + 1 l2_subp0_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp0_read_sector_queries 3341332 3341332 3341332 3341332 + 1 l2_subp1_read_sector_queries 3244672 3244672 3244672 3244672 + 1 l2_subp0_read_tex_sector_queries 0 0 0 0 + 1 l2_subp1_read_tex_sector_queries 0 0 0 0 + 1 l2_subp0_read_hit_sectors 3115728 3115728 3115728 3115728 + 1 l2_subp1_read_hit_sectors 2738712 2738712 2738712 2738712 + 1 l2_subp0_read_tex_hit_sectors 0 0 0 0 + 1 l2_subp1_read_tex_hit_sectors 0 0 0 0 + 1 elapsed_cycles_sm 542137496 542137496 542137496 542137496 + 1 l2_subp0_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_total_read_sector_queries 3259496 3259496 3259496 3259496 + 1 l2_subp1_total_read_sector_queries 3853620 3853620 3853620 3853620 + 1 l2_subp0_total_write_sector_queries 18352130 18352130 18352130 18352130 + 1 l2_subp1_total_write_sector_queries 18352132 18352132 18352132 18352132 + 1 atom_count 0 0 0 0 + 1 gred_count 0 0 0 0 + 1 gld_inst_8bit 1285471040 1285471040 1285471040 1285471040 + 1 gld_inst_16bit 0 0 0 0 + 1 gld_inst_32bit 0 0 0 0 + 1 gld_inst_64bit 0 0 0 0 + 1 gld_inst_128bit 0 0 0 0 + 1 gst_inst_8bit 36704256 36704256 36704256 36704256 + 1 gst_inst_16bit 0 0 0 0 + 1 gst_inst_32bit 0 0 0 0 + 1 gst_inst_64bit 0 0 0 0 + 1 gst_inst_128bit 0 0 0 0 + Kernel: void mtf_cuda_thread(unsigned char const *, unsigned char*, int, int) + 1 local_load 0 0 0 0 + 1 local_store 0 0 0 0 + 1 gld_request 8842860 8842860 8842860 8842860 + 1 gst_request 8844386 8844386 8844386 8844386 + 1 shared_load 10779518 10779518 10779518 10779518 + 1 shared_store 19625871 19625871 19625871 19625871 + 1 branch 30582334 30582334 30582334 30582334 + 1 divergent_branch 8761655 8761655 8761655 8761655 + 1 active_cycles 675698832 675698832 675698832 675698832 + 1 inst_issued1_0 295242483 295242483 295242483 295242483 + 1 inst_issued2_0 39228377 39228377 39228377 39228377 + 1 inst_issued1_1 197131135 197131135 197131135 197131135 + 1 inst_issued2_1 39529898 39529898 39529898 39529898 + 1 inst_executed 329512395 329512395 329512395 329512395 + 1 warps_launched 284 284 284 284 + 1 threads_launched 9088 9088 9088 9088 + 1 thread_inst_executed_0 1852478139 1852478139 1852478139 1852478139 + 1 thread_inst_executed_2 544591920 544591920 544591920 544591920 + 1 thread_inst_executed_1 1922013054 1922013054 1922013054 1922013054 + 1 thread_inst_executed_3 557808968 557808968 557808968 557808968 + 1 active_warps 1.8717e+10 1.8717e+10 1.8717e+10 1.8717e+10 + 1 tex0_cache_sector_queries 0 0 0 0 + 1 tex0_cache_sector_misses 0 0 0 0 + 1 tex1_cache_sector_queries 0 0 0 0 + 1 tex1_cache_sector_misses 0 0 0 0 + 1 sm_cta_launched 72 72 72 72 + 1 l1_local_load_hit 0 0 0 0 + 1 l1_local_load_miss 0 0 0 0 + 1 l1_local_store_hit 0 0 0 0 + 1 l1_local_store_miss 0 0 0 0 + 1 l1_global_load_hit 3411696 3411696 3411696 3411696 + 1 l1_global_load_miss 34346256 34346256 34346256 34346256 + 1 uncached_global_load_transaction 0 0 0 0 + 1 global_store_transaction 37748736 37748736 37748736 37748736 + 1 l1_shared_bank_conflict 0 0 0 0 + 1 prof_trigger_00 0 0 0 0 + 1 prof_trigger_01 0 0 0 0 + 1 prof_trigger_02 0 0 0 0 + 1 prof_trigger_03 0 0 0 0 + 1 prof_trigger_04 0 0 0 0 + 1 prof_trigger_05 0 0 0 0 + 1 prof_trigger_06 0 0 0 0 + 1 prof_trigger_07 0 0 0 0 + 1 fb_subp0_read_sectors 39150884 39150884 39150884 39150884 + 1 fb_subp1_read_sectors 38974566 38974566 38974566 38974566 + 1 fb_subp0_write_sectors 18124544 18124544 18124544 18124544 + 1 fb_subp1_write_sectors 18113027 18113027 18113027 18113027 + 1 l2_subp0_write_sector_misses 18121835 18121835 18121835 18121835 + 1 l2_subp1_write_sector_misses 18113027 18113027 18113027 18113027 + 1 l2_subp0_read_sector_misses 39326473 39326473 39326473 39326473 + 1 l2_subp1_read_sector_misses 38724706 38724706 38724706 38724706 + 1 l2_subp0_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp0_read_sector_queries 66104036 66104036 66104036 66104036 + 1 l2_subp1_read_sector_queries 66003408 66003408 66003408 66003408 + 1 l2_subp0_read_tex_sector_queries 0 0 0 0 + 1 l2_subp1_read_tex_sector_queries 0 0 0 0 + 1 l2_subp0_read_hit_sectors 27559372 27559372 27559372 27559372 + 1 l2_subp1_read_hit_sectors 27899128 27899128 27899128 27899128 + 1 l2_subp0_read_tex_hit_sectors 0 0 0 0 + 1 l2_subp1_read_tex_hit_sectors 0 0 0 0 + 1 elapsed_cycles_sm 746001488 746001488 746001488 746001488 + 1 l2_subp0_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_total_read_sector_queries 66003821 66003821 66003821 66003821 + 1 l2_subp1_total_read_sector_queries 65894528 65894528 65894528 65894528 + 1 l2_subp0_total_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_total_write_sector_queries 18352367 18352367 18352367 18352367 + 1 atom_count 0 0 0 0 + 1 gred_count 0 0 0 0 + 1 gld_inst_8bit 36713217 36713217 36713217 36713217 + 1 gld_inst_16bit 0 0 0 0 + 1 gld_inst_32bit 0 0 0 0 + 1 gld_inst_64bit 0 0 0 0 + 1 gld_inst_128bit 0 0 0 0 + 1 gst_inst_8bit 36704256 36704256 36704256 36704256 + 1 gst_inst_16bit 0 0 0 0 + 1 gst_inst_32bit 0 0 0 0 + 1 gst_inst_64bit 0 0 0 0 + 1 gst_inst_128bit 0 0 0 0 + Kernel: void mtf_cuda_thread_by4(unsigned char const *, unsigned char*, int, int) + 1 local_load 0 0 0 0 + 1 local_store 0 0 0 0 + 1 gld_request 10233160 10233160 10233160 10233160 + 1 gst_request 10235017 10235017 10235017 10235017 + 1 shared_load 10312387 10312387 10312387 10312387 + 1 shared_store 10313511 10313511 10313511 10313511 + 1 branch 35923552 35923552 35923552 35923552 + 1 divergent_branch 10169995 10169995 10169995 10169995 + 1 active_cycles 718563959 718563959 718563959 718563959 + 1 inst_issued1_0 261078534 261078534 261078534 261078534 + 1 inst_issued2_0 44641803 44641803 44641803 44641803 + 1 inst_issued1_1 143406469 143406469 143406469 143406469 + 1 inst_issued2_1 45024465 45024465 45024465 45024465 + 1 inst_executed 253839379 253839379 253839379 253839379 + 1 warps_launched 284 284 284 284 + 1 threads_launched 9088 9088 9088 9088 + 1 thread_inst_executed_0 890229234 890229234 890229234 890229234 + 1 thread_inst_executed_2 383982608 383982608 383982608 383982608 + 1 thread_inst_executed_1 912722232 912722232 912722232 912722232 + 1 thread_inst_executed_3 392792005 392792005 392792005 392792005 + 1 active_warps 2.0321e+10 2.0321e+10 2.0321e+10 2.0321e+10 + 1 tex0_cache_sector_queries 0 0 0 0 + 1 tex0_cache_sector_misses 0 0 0 0 + 1 tex1_cache_sector_queries 0 0 0 0 + 1 tex1_cache_sector_misses 0 0 0 0 + 1 sm_cta_launched 72 72 72 72 + 1 l1_local_load_hit 0 0 0 0 + 1 l1_local_load_miss 0 0 0 0 + 1 l1_local_store_hit 0 0 0 0 + 1 l1_local_store_miss 0 0 0 0 + 1 l1_global_load_hit 2373344 2373344 2373344 2373344 + 1 l1_global_load_miss 35384608 35384608 35384608 35384608 + 1 uncached_global_load_transaction 0 0 0 0 + 1 global_store_transaction 37748736 37748736 37748736 37748736 + 1 l1_shared_bank_conflict 0 0 0 0 + 1 prof_trigger_00 0 0 0 0 + 1 prof_trigger_01 0 0 0 0 + 1 prof_trigger_02 0 0 0 0 + 1 prof_trigger_03 0 0 0 0 + 1 prof_trigger_04 0 0 0 0 + 1 prof_trigger_05 0 0 0 0 + 1 prof_trigger_06 0 0 0 0 + 1 prof_trigger_07 0 0 0 0 + 1 fb_subp0_read_sectors 47263349 47263349 47263349 47263349 + 1 fb_subp1_read_sectors 47304246 47304246 47304246 47304246 + 1 fb_subp0_write_sectors 18218051 18218051 18218051 18218051 + 1 fb_subp1_write_sectors 18210821 18210821 18210821 18210821 + 1 l2_subp0_write_sector_misses 18219936 18219936 18219936 18219936 + 1 l2_subp1_write_sector_misses 18210821 18210821 18210821 18210821 + 1 l2_subp0_read_sector_misses 47480379 47480379 47480379 47480379 + 1 l2_subp1_read_sector_misses 47311114 47311114 47311114 47311114 + 1 l2_subp0_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp0_read_sector_queries 68072004 68072004 68072004 68072004 + 1 l2_subp1_read_sector_queries 68124340 68124340 68124340 68124340 + 1 l2_subp0_read_tex_sector_queries 0 0 0 0 + 1 l2_subp1_read_tex_sector_queries 0 0 0 0 + 1 l2_subp0_read_hit_sectors 21508292 21508292 21508292 21508292 + 1 l2_subp1_read_hit_sectors 21480324 21480324 21480324 21480324 + 1 l2_subp0_read_tex_hit_sectors 0 0 0 0 + 1 l2_subp1_read_tex_hit_sectors 0 0 0 0 + 1 elapsed_cycles_sm 755240248 755240248 755240248 755240248 + 1 l2_subp0_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_write_sysmem_sector_queries 2 2 2 2 + 1 l2_subp0_total_read_sector_queries 68070671 68070671 68070671 68070671 + 1 l2_subp1_total_read_sector_queries 68009444 68009444 68009444 68009444 + 1 l2_subp0_total_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_total_write_sector_queries 18352134 18352134 18352134 18352134 + 1 atom_count 0 0 0 0 + 1 gred_count 0 0 0 0 + 1 gld_inst_8bit 36713217 36713217 36713217 36713217 + 1 gld_inst_16bit 0 0 0 0 + 1 gld_inst_32bit 0 0 0 0 + 1 gld_inst_64bit 0 0 0 0 + 1 gld_inst_128bit 0 0 0 0 + 1 gst_inst_8bit 36704256 36704256 36704256 36704256 + 1 gst_inst_16bit 0 0 0 0 + 1 gst_inst_32bit 0 0 0 0 + 1 gst_inst_64bit 0 0 0 0 + 1 gst_inst_128bit 0 0 0 0 + Kernel: void mtf_cuda_scalar(unsigned char const *, unsigned char*, int, int) + 1 local_load 0 0 0 0 + 1 local_store 0 0 0 0 + 1 gld_request 36713012 36713012 36713012 36713012 + 1 gst_request 36704256 36704256 36704256 36704256 + 1 shared_load 39600271 39600271 39600271 39600271 + 1 shared_store 75300865 75300865 75300865 75300865 + 1 branch 83279396 83279396 83279396 83279396 + 1 divergent_branch 0 0 0 0 + 1 active_cycles 493762986 493762986 493762986 493762986 + 1 inst_issued1_0 300154225 300154225 300154225 300154225 + 1 inst_issued2_0 85648597 85648597 85648597 85648597 + 1 inst_issued1_1 300113070 300113070 300113070 300113070 + 1 inst_issued2_1 85568778 85568778 85568778 85568778 + 1 inst_executed 942303984 942303984 942303984 942303984 + 1 warps_launched 8964 8964 8964 8964 + 1 threads_launched 286848 286848 286848 286848 + 1 thread_inst_executed_0 1.2339e+10 1.2339e+10 1.2339e+10 1.2339e+10 + 1 thread_inst_executed_2 2740751008 2740751008 2740751008 2740751008 + 1 thread_inst_executed_1 1.2335e+10 1.2335e+10 1.2335e+10 1.2335e+10 + 1 thread_inst_executed_3 2738196800 2738196800 2738196800 2738196800 + 1 active_warps 1.5482e+10 1.5482e+10 1.5482e+10 1.5482e+10 + 1 tex0_cache_sector_queries 0 0 0 0 + 1 tex0_cache_sector_misses 0 0 0 0 + 1 tex1_cache_sector_queries 0 0 0 0 + 1 tex1_cache_sector_misses 0 0 0 0 + 1 sm_cta_launched 2248 2248 2248 2248 + 1 l1_local_load_hit 0 0 0 0 + 1 l1_local_load_miss 0 0 0 0 + 1 l1_local_store_hit 0 0 0 0 + 1 l1_local_store_miss 0 0 0 0 + 1 l1_global_load_hit 33464268 33464268 33464268 33464268 + 1 l1_global_load_miss 3261052 3261052 3261052 3261052 + 1 uncached_global_load_transaction 0 0 0 0 + 1 global_store_transaction 36765696 36765696 36765696 36765696 + 1 l1_shared_bank_conflict 0 0 0 0 + 1 prof_trigger_00 0 0 0 0 + 1 prof_trigger_01 0 0 0 0 + 1 prof_trigger_02 0 0 0 0 + 1 prof_trigger_03 0 0 0 0 + 1 prof_trigger_04 0 0 0 0 + 1 prof_trigger_05 0 0 0 0 + 1 prof_trigger_06 0 0 0 0 + 1 prof_trigger_07 0 0 0 0 + 1 fb_subp0_read_sectors 1127616 1127616 1127616 1127616 + 1 fb_subp1_read_sectors 1128996 1128996 1128996 1128996 + 1 fb_subp0_write_sectors 16819063 16819063 16819063 16819063 + 1 fb_subp1_write_sectors 16831194 16831194 16831194 16831194 + 1 l2_subp0_write_sector_misses 16810676 16810676 16810676 16810676 + 1 l2_subp1_write_sector_misses 16831194 16831194 16831194 16831194 + 1 l2_subp0_read_sector_misses 1126232 1126232 1126232 1126232 + 1 l2_subp1_read_sector_misses 1124512 1124512 1124512 1124512 + 1 l2_subp0_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp0_read_sector_queries 6654196 6654196 6654196 6654196 + 1 l2_subp1_read_sector_queries 7021288 7021288 7021288 7021288 + 1 l2_subp0_read_tex_sector_queries 0 0 0 0 + 1 l2_subp1_read_tex_sector_queries 0 0 0 0 + 1 l2_subp0_read_hit_sectors 5677544 5677544 5677544 5677544 + 1 l2_subp1_read_hit_sectors 6350124 6350124 6350124 6350124 + 1 l2_subp0_read_tex_hit_sectors 0 0 0 0 + 1 l2_subp1_read_tex_hit_sectors 0 0 0 0 + 1 elapsed_cycles_sm 495452032 495452032 495452032 495452032 + 1 l2_subp0_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_total_read_sector_queries 7045006 7045006 7045006 7045006 + 1 l2_subp1_total_read_sector_queries 7008556 7008556 7008556 7008556 + 1 l2_subp0_total_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_total_write_sector_queries 18352143 18352143 18352143 18352143 + 1 atom_count 0 0 0 0 + 1 gred_count 0 0 0 0 + 1 gld_inst_8bit 1174816384 1174816384 1174816384 1174816384 + 1 gld_inst_16bit 0 0 0 0 + 1 gld_inst_32bit 0 0 0 0 + 1 gld_inst_64bit 0 0 0 0 + 1 gld_inst_128bit 0 0 0 0 + 1 gst_inst_8bit 1174536192 1174536192 1174536192 1174536192 + 1 gst_inst_16bit 0 0 0 0 + 1 gst_inst_32bit 0 0 0 0 + 1 gst_inst_64bit 0 0 0 0 + 1 gst_inst_128bit 0 0 0 0 + Kernel: void mtf_cuda_thread(unsigned char const *, unsigned char*, int, int) + 1 local_load 0 0 0 0 + 1 local_store 0 0 0 0 + 1 gld_request 7829487 7829487 7829487 7829487 + 1 gst_request 7830824 7830824 7830824 7830824 + 1 shared_load 8896454 8896454 8896454 8896454 + 1 shared_store 16728121 16728121 16728121 16728121 + 1 branch 25748836 25748836 25748836 25748836 + 1 divergent_branch 7775181 7775181 7775181 7775181 + 1 active_cycles 681037669 681037669 681037669 681037669 + 1 inst_issued1_0 298065862 298065862 298065862 298065862 + 1 inst_issued2_0 42405732 42405732 42405732 42405732 + 1 inst_issued1_1 182094720 182094720 182094720 182094720 + 1 inst_issued2_1 42540558 42540558 42540558 42540558 + 1 inst_executed 279274913 279274913 279274913 279274913 + 1 warps_launched 284 284 284 284 + 1 threads_launched 9088 9088 9088 9088 + 1 thread_inst_executed_0 1451996223 1451996223 1451996223 1451996223 + 1 thread_inst_executed_2 688568546 688568546 688568546 688568546 + 1 thread_inst_executed_1 1493956235 1493956235 1493956235 1493956235 + 1 thread_inst_executed_3 700806026 700806026 700806026 700806026 + 1 active_warps 1.9356e+10 1.9356e+10 1.9356e+10 1.9356e+10 + 1 tex0_cache_sector_queries 0 0 0 0 + 1 tex0_cache_sector_misses 0 0 0 0 + 1 tex1_cache_sector_queries 0 0 0 0 + 1 tex1_cache_sector_misses 0 0 0 0 + 1 sm_cta_launched 72 72 72 72 + 1 l1_local_load_hit 0 0 0 0 + 1 l1_local_load_miss 0 0 0 0 + 1 l1_local_store_hit 0 0 0 0 + 1 l1_local_store_miss 0 0 0 0 + 1 l1_global_load_hit 2980836 2980836 2980836 2980836 + 1 l1_global_load_miss 34777116 34777116 34777116 34777116 + 1 uncached_global_load_transaction 0 0 0 0 + 1 global_store_transaction 37748736 37748736 37748736 37748736 + 1 l1_shared_bank_conflict 0 0 0 0 + 1 prof_trigger_00 0 0 0 0 + 1 prof_trigger_01 0 0 0 0 + 1 prof_trigger_02 0 0 0 0 + 1 prof_trigger_03 0 0 0 0 + 1 prof_trigger_04 0 0 0 0 + 1 prof_trigger_05 0 0 0 0 + 1 prof_trigger_06 0 0 0 0 + 1 prof_trigger_07 0 0 0 0 + 1 fb_subp0_read_sectors 42841130 42841130 42841130 42841130 + 1 fb_subp1_read_sectors 42485391 42485391 42485391 42485391 + 1 fb_subp0_write_sectors 18182249 18182249 18182249 18182249 + 1 fb_subp1_write_sectors 18191673 18191673 18191673 18191673 + 1 l2_subp0_write_sector_misses 18182354 18182354 18182354 18182354 + 1 l2_subp1_write_sector_misses 18191673 18191673 18191673 18191673 + 1 l2_subp0_read_sector_misses 42837049 42837049 42837049 42837049 + 1 l2_subp1_read_sector_misses 42416322 42416322 42416322 42416322 + 1 l2_subp0_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp0_read_sector_queries 67135676 67135676 67135676 67135676 + 1 l2_subp1_read_sector_queries 67153480 67153480 67153480 67153480 + 1 l2_subp0_read_tex_sector_queries 0 0 0 0 + 1 l2_subp1_read_tex_sector_queries 0 0 0 0 + 1 l2_subp0_read_hit_sectors 25083832 25083832 25083832 25083832 + 1 l2_subp1_read_hit_sectors 25373152 25373152 25373152 25373152 + 1 l2_subp0_read_tex_hit_sectors 0 0 0 0 + 1 l2_subp1_read_tex_hit_sectors 0 0 0 0 + 1 elapsed_cycles_sm 728464628 728464628 728464628 728464628 + 1 l2_subp0_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp1_read_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_write_sysmem_sector_queries 2 2 2 2 + 1 l2_subp1_write_sysmem_sector_queries 0 0 0 0 + 1 l2_subp0_total_read_sector_queries 67020842 67020842 67020842 67020842 + 1 l2_subp1_total_read_sector_queries 67034192 67034192 67034192 67034192 + 1 l2_subp0_total_write_sector_queries 18352128 18352128 18352128 18352128 + 1 l2_subp1_total_write_sector_queries 18352134 18352134 18352134 18352134 + 1 atom_count 0 0 0 0 + 1 gred_count 0 0 0 0 + 1 gld_inst_8bit 36713217 36713217 36713217 36713217 + 1 gld_inst_16bit 0 0 0 0 + 1 gld_inst_32bit 0 0 0 0 + 1 gld_inst_64bit 0 0 0 0 + 1 gld_inst_128bit 0 0 0 0 + 1 gst_inst_8bit 36704256 36704256 36704256 36704256 + 1 gst_inst_16bit 0 0 0 0 + 1 gst_inst_32bit 0 0 0 0 + 1 gst_inst_64bit 0 0 0 0 + 1 gst_inst_128bit 0 0 0 0 ======== Metric result: -Invocations Metric Name Metric Description Min Max Avg +Invocations Metric Name Metric Description Min Max Avg Device "GeForce GTX 560 Ti (0)" - Kernel: void mtf_2symbols(unsigned char const *, unsigned char*, int, int) - 3 l1_cache_global_hit_rate L1 Global Hit Rate 94.57% 95.94% 95.44% - 3 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% - 3 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 3 sm_efficiency Multiprocessor Activity 97.99% 98.46% 98.26% - 3 achieved_occupancy Achieved Occupancy 0.610048 0.619671 0.613341 - 3 gld_requested_throughput Requested Global Load Throughput 15.520GB/s 15.611GB/s 15.553GB/s - 3 gst_requested_throughput Requested Global Store Throughput 449.39MB/s 454.17MB/s 451.81MB/s - 3 ipc Executed IPC 1.870561 1.875110 1.872464 - 3 sm_efficiency_instance Multiprocessor Activity 97.99% 98.46% 98.26% - 3 ipc_instance Executed IPC 1.870561 1.875110 1.872464 - 3 inst_per_warp Instructions per warp 1.1450e+05 1.1493e+05 1.1465e+05 - 3 gld_transactions Global Load Transactions 12233044 14946208 13995212 - 3 gst_transactions Global Store Transactions 10944512 13533184 12659370 - 3 local_load_transactions Local Load Transactions 0 0 0 - 3 local_store_transactions Local Store Transactions 0 0 0 - 3 shared_load_transactions Shared Load Transactions 11828832 14828016 13776066 - 3 shared_store_transactions Shared Store Transactions 22480948 28190172 26195802 - 3 gld_transactions_per_request Global Load Transactions Per Request 0.989584 1.008769 1.000612 - 3 gst_transactions_per_request Global Store Transactions Per Request 0.992173 0.999395 0.996319 - 3 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 3 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 3 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 0.993390 1.008373 1.001339 - 3 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 0.996041 1.008678 1.002770 - 3 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_load_throughput Shared Memory Load Throughput 60.554GB/s 61.553GB/s 61.205GB/s - 3 shared_store_throughput Shared Memory Store Throughput 115.08GB/s 117.04GB/s 116.38GB/s - 3 shared_efficiency Shared Memory Efficiency 60.22% 61.23% 60.67% - 3 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 3 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 3 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 3 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 73.11% 73.32% 73.23% - 3 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 20.61% 20.81% 20.69% - 3 stall_memory_dependency Issue Stall Reasons (Data Request) 0.25% 0.26% 0.26% - 3 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% - 3 inst_executed Instructions Executed 308007708 380595408 355815243 - 3 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 3 stall_other Issue Stall Reasons (Other) 5.63% 5.64% 5.64% - 3 inst_fp_32 FP Instructions(Single) 0 0 0 - 3 inst_fp_64 FP Instructions(Double) 0 0 0 - 3 inst_integer Integer Instructions 4296466378 5309069774 4963313090 - 3 inst_bit_convert Bit-Convert Instructions 1050234432 1302466816 1216307498 - 3 inst_control Control-Flow Instructions 435747790 537723404 502934757 - 3 inst_compute_ld_st Load/Store Instructions 1442401364 1780285438 1665001122 - 3 inst_misc Misc Instructions 1010288800 1247129024 1166305888 - 3 inst_inter_thread_communication Inter-Thread Instructions 391057600 482365984 451211434 - 3 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 3 atomic_transactions Atomic Transactions 0 0 0 - 3 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 3 inst_replay_overhead Instruction Replay Overhead 0.000014 0.000018 0.000016 - 3 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 3 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.001633 0.002193 0.001989 - 3 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 3 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 dram_read_throughput Device Memory Read Throughput 938.52MB/s 943.10MB/s 941.41MB/s - 3 dram_write_throughput Device Memory Write Throughput 12.727GB/s 12.869GB/s 12.815GB/s - 3 gst_throughput Global Store Throughput 14.043GB/s 14.193GB/s 14.119GB/s - 3 gld_throughput Global Load Throughput 61.463GB/s 62.660GB/s 62.249GB/s - 3 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00% - 3 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 3 gld_efficiency Global Memory Load Efficiency 24.78% 25.26% 24.99% - 3 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% - 3 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 44.01% 46.92% 45.92% - 3 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 3 l2_l1_read_throughput L2 Throughput (L1 Reads) 2.4737GB/s 2.7541GB/s 2.6295GB/s - 3 local_memory_overhead Local Memory Overhead 0.00% 1.36% 0.45% - 3 issued_ipc Issued IPC 1.876506 1.878342 1.877216 - 3 issue_slot_utilization Issue Slot Utilization 74.94% 75.04% 74.97% - 3 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 3 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 3 l2_read_transactions L2 Read Transactions 2157399 2772266 2563014 - 3 l2_write_transactions L2 Write Transactions 10973199 13606914 12707168 - 3 tex_cache_transactions Texture Cache Transactions 0 0 0 - 3 dram_read_transactions Device Memory Read Transactions 719642 883162 827030 - 3 dram_write_transactions Device Memory Write Transactions 9944445 12382849 11535138 - 3 l2_read_throughput L2 Throughput (Reads) 2.7610GB/s 2.8921GB/s 2.8434GB/s - 3 l2_write_throughput L2 Throughput (Writes) 14.043GB/s 14.193GB/s 14.119GB/s - 3 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 cf_issued Issued Control-Flow Instructions 37949082 46823095 43795246 - 3 cf_executed Executed Control-Flow Instructions 37949082 46823095 43795246 - 3 ldst_issued Issued Load/Store Instructions 57580777 71229851 66569504 - 3 ldst_executed Executed Load/Store Instructions 57577748 71224454 66565249 - 3 l1_shared_utilization L1/Shared Memory Utilization Low (3) Low (3) Low (3) - 3 l2_utilization L2 Cache Utilization Low (1) Low (1) Low (1) - 3 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 3 dram_utilization Device Memory Utilization Low (2) Low (2) Low (2) - 3 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 3 ldst_fu_utilization Load/Store Function Unit Utilization Low (2) Low (2) Low (2) - 3 alu_fu_utilization Arithmetic Function Unit Utilization Mid (6) Mid (6) Mid (6) - 3 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 3 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 3 inst_issued Instructions Issued 308012403 380600700 355820999 - 3 issue_slots Issue Slots 246086802 303979584 284219420 - 3 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 1932884 2653804 2375109 - 3 l2_l1_write_transactions L2 Write Transactions (L1 write requests 10973184 13606912 12707157 - 3 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 3 l2_l1_write_throughput L2 Throughput (L1 Writes) 14.043GB/s 14.193GB/s 14.119GB/s - 3 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 3 eligible_warps_per_cycle Eligible Warps Per Active Cycle 2.424991 2.431796 2.427909 - 3 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 3 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 3 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.18% 0.18% 0.18% - 3 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% - Kernel: void mtf_scalar(unsigned char const *, unsigned char*, int, int) - 3 l1_cache_global_hit_rate L1 Global Hit Rate 91.57% 92.87% 92.22% - 3 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% - 3 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 3 sm_efficiency Multiprocessor Activity 98.27% 98.70% 98.49% - 3 achieved_occupancy Achieved Occupancy 0.637133 0.641072 0.639421 - 3 gld_requested_throughput Requested Global Load Throughput 15.160GB/s 15.406GB/s 15.275GB/s - 3 gst_requested_throughput Requested Global Store Throughput 15.156GB/s 15.402GB/s 15.271GB/s - 3 ipc Executed IPC 1.844299 1.858876 1.851619 - 3 sm_efficiency_instance Multiprocessor Activity 98.27% 98.70% 98.49% - 3 ipc_instance Executed IPC 1.844299 1.858876 1.851619 - 3 inst_per_warp Instructions per warp 1.0474e+05 1.0496e+05 1.0482e+05 - 3 gld_transactions Global Load Transactions 10708084 13461576 12526965 - 3 gst_transactions Global Store Transactions 10993664 13598720 12708522 - 3 local_load_transactions Local Load Transactions 0 0 0 - 3 local_store_transactions Local Store Transactions 0 0 0 - 3 shared_load_transactions Shared Load Transactions 11960460 14654208 13749297 - 3 shared_store_transactions Shared Store Transactions 22636912 27830492 26085346 - 3 gld_transactions_per_request Global Load Transactions Per Request 0.975608 0.993870 0.984954 - 3 gst_transactions_per_request Global Store Transactions Per Request 0.999395 1.001866 1.000220 - 3 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 3 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 3 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 0.993022 1.002182 0.998035 - 3 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 0.995808 1.002951 0.999254 - 3 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_load_throughput Shared Memory Load Throughput 65.619GB/s 66.672GB/s 66.123GB/s - 3 shared_store_throughput Shared Memory Store Throughput 124.80GB/s 126.43GB/s 125.43GB/s - 3 shared_efficiency Shared Memory Efficiency 73.39% 73.85% 73.60% - 3 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 3 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 3 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 3 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 42.45% 42.98% 42.80% - 3 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 27.48% 27.84% 27.60% - 3 stall_memory_dependency Issue Stall Reasons (Data Request) 0.44% 0.45% 0.44% - 3 stall_sync Issue Stall Reasons (Synchronization) 23.17% 23.28% 23.21% - 3 inst_executed Instructions Executed 281299518 348152505 325328182 - 3 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 3 stall_other Issue Stall Reasons (Other) 5.79% 5.87% 5.82% - 3 inst_fp_32 FP Instructions(Single) 0 0 0 - 3 inst_fp_64 FP Instructions(Double) 0 0 0 - 3 inst_integer Integer Instructions 4206789776 5206774984 4865310322 - 3 inst_bit_convert Bit-Convert Instructions 432729312 536981568 501363040 - 3 inst_control Control-Flow Instructions 101768556 125192734 117211603 - 3 inst_compute_ld_st Load/Store Instructions 1514847280 1874470191 1751696478 - 3 inst_misc Misc Instructions 1343341408 1661814592 1553146357 - 3 inst_inter_thread_communication Inter-Thread Instructions 371025024 458805856 428847594 - 3 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 3 atomic_transactions Atomic Transactions 0 0 0 - 3 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 3 inst_replay_overhead Instruction Replay Overhead 0.000255 0.000384 0.000331 - 3 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 3 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.003213 0.003672 0.003380 - 3 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 3 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 dram_read_throughput Device Memory Read Throughput 973.20MB/s 987.03MB/s 979.47MB/s - 3 dram_write_throughput Device Memory Write Throughput 13.815GB/s 14.143GB/s 13.982GB/s - 3 gst_throughput Global Store Throughput 15.156GB/s 15.402GB/s 15.271GB/s - 3 gld_throughput Global Load Throughput 59.160GB/s 61.246GB/s 60.182GB/s - 3 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00% - 3 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 3 gld_efficiency Global Memory Load Efficiency 25.15% 25.63% 25.38% - 3 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00% - 3 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 44.16% 47.05% 45.35% - 3 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 3 l2_l1_read_throughput L2 Throughput (L1 Reads) 4.6231GB/s 5.5446GB/s 5.1445GB/s - 3 local_memory_overhead Local Memory Overhead 1.30% 2.30% 1.82% - 3 issued_ipc Issued IPC 1.840088 1.858911 1.851823 - 3 issue_slot_utilization Issue Slot Utilization 75.62% 76.41% 76.11% - 3 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 3 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 3 l2_read_transactions L2 Read Transactions 3387005 4816507 4266016 - 3 l2_write_transactions L2 Write Transactions 10973189 13606926 12707169 - 3 tex_cache_transactions Texture Cache Transactions 0 0 0 - 3 dram_read_transactions Device Memory Read Transactions 688089 852115 795879 - 3 dram_write_transactions Device Memory Write Transactions 10001947 12477394 11637790 - 3 l2_read_throughput L2 Throughput (Reads) 4.6781GB/s 5.3996GB/s 5.1012GB/s - 3 l2_write_throughput L2 Throughput (Writes) 15.156GB/s 15.402GB/s 15.271GB/s - 3 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 cf_issued Issued Control-Flow Instructions 25883133 32015926 29922526 - 3 cf_executed Executed Control-Flow Instructions 25883133 32015926 29922526 - 3 ldst_issued Issued Load/Store Instructions 67778254 83967515 78432926 - 3 ldst_executed Executed Load/Store Instructions 67711044 83845255 78336175 - 3 l1_shared_utilization L1/Shared Memory Utilization Mid (4) Mid (4) Mid (4) - 3 l2_utilization L2 Cache Utilization Low (1) Low (1) Low (1) - 3 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 3 dram_utilization Device Memory Utilization Low (2) Low (2) Low (2) - 3 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 3 ldst_fu_utilization Load/Store Function Unit Utilization Low (3) Low (3) Low (3) - 3 alu_fu_utilization Arithmetic Function Unit Utilization Mid (5) Mid (5) Mid (5) - 3 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 3 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 3 inst_issued Instructions Issued 281364607 348291307 325436550 - 3 issue_slots Issue Slots 231261773 286312730 267508774 - 3 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 3347128 4945860 4307554 - 3 l2_l1_write_transactions L2 Write Transactions (L1 write requests 10973184 13606912 12707157 - 3 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 3 l2_l1_write_throughput L2 Throughput (L1 Writes) 15.156GB/s 15.402GB/s 15.271GB/s - 3 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 3 eligible_warps_per_cycle Eligible Warps Per Active Cycle 2.887040 2.914665 2.904336 - 3 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 3 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 3 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.12% 0.13% 0.13% - 3 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% - Kernel: void mtf_4by8(unsigned char const *, unsigned char*, int, int) - 3 l1_cache_global_hit_rate L1 Global Hit Rate 32.95% 37.72% 35.85% - 3 branch_efficiency Branch Efficiency 77.35% 77.52% 77.47% - 3 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 3 sm_efficiency Multiprocessor Activity 87.15% 89.96% 88.47% - 3 achieved_occupancy Achieved Occupancy 0.689703 0.699788 0.696245 - 3 gld_requested_throughput Requested Global Load Throughput 4.1905GB/s 4.4628GB/s 4.3449GB/s - 3 gst_requested_throughput Requested Global Store Throughput 4.1895GB/s 4.4617GB/s 4.3439GB/s - 3 ipc Executed IPC 1.595750 1.665624 1.637432 - 3 sm_efficiency_instance Multiprocessor Activity 87.15% 89.96% 88.47% - 3 ipc_instance Executed IPC 1.595750 1.665624 1.637432 - 3 inst_per_warp Instructions per warp 2.8312e+05 2.8674e+05 2.8462e+05 - 3 gld_transactions Global Load Transactions 10502384 14507936 12644254 - 3 gst_transactions Global Store Transactions 10485760 14155776 12757674 - 3 local_load_transactions Local Load Transactions 0 0 0 - 3 local_store_transactions Local Store Transactions 0 0 0 - 3 shared_load_transactions Shared Load Transactions 14728900 19507600 17678784 - 3 shared_store_transactions Shared Store Transactions 23585988 31593344 28608148 - 3 gld_transactions_per_request Global Load Transactions Per Request 2.417622 2.726633 2.521349 - 3 gst_transactions_per_request Global Store Transactions Per Request 2.413974 2.660659 2.542468 - 3 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 3 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 3 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 1.886994 2.050581 1.969332 - 3 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 1.947465 2.136366 2.049854 - 3 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_load_throughput Shared Memory Load Throughput 89.974GB/s 102.84GB/s 96.550GB/s - 3 shared_store_throughput Shared Memory Store Throughput 144.08GB/s 166.55GB/s 156.16GB/s - 3 shared_efficiency Shared Memory Efficiency 5.69% 6.20% 5.93% - 3 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 3 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 3 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 3 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 61.97% 65.57% 63.97% - 3 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 23.96% 25.88% 24.92% - 3 stall_memory_dependency Issue Stall Reasons (Data Request) 3.43% 4.59% 3.86% - 3 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% - 3 inst_executed Instructions Executed 192689564 236296641 221514924 - 3 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 3 stall_other Issue Stall Reasons (Other) 4.98% 5.27% 5.13% - 3 inst_fp_32 FP Instructions(Single) 0 0 0 - 3 inst_fp_64 FP Instructions(Double) 0 0 0 - 3 inst_integer Integer Instructions 1886926992 2324838304 2175476400 - 3 inst_bit_convert Bit-Convert Instructions 958941216 1175994648 1102053688 - 3 inst_control Control-Flow Instructions 186225360 227177392 213247232 - 3 inst_compute_ld_st Load/Store Instructions 479830280 591171111 553205017 - 3 inst_misc Misc Instructions 370874272 458097921 428319222 - 3 inst_inter_thread_communication Inter-Thread Instructions 136994664 168003032 157439786 - 3 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 3 atomic_transactions Atomic Transactions 0 0 0 - 3 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 3 inst_replay_overhead Instruction Replay Overhead 0.204238 0.211640 0.206792 - 3 shared_replay_overhead Shared Memory Replay Overhead 0.099166 0.113107 0.106375 - 3 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.036395 0.039734 0.038510 - 3 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 3 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 dram_read_throughput Device Memory Read Throughput 1.2585GB/s 1.2809GB/s 1.2709GB/s - 3 dram_write_throughput Device Memory Write Throughput 15.328GB/s 16.153GB/s 15.748GB/s - 3 gst_throughput Global Store Throughput 16.758GB/s 17.847GB/s 17.376GB/s - 3 gld_throughput Global Load Throughput 64.156GB/s 76.483GB/s 69.067GB/s - 3 warp_execution_efficiency Warp Execution Efficiency 72.16% 72.42% 72.31% - 3 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 3 gld_efficiency Global Memory Load Efficiency 5.84% 6.58% 6.32% - 3 gst_efficiency Global Memory Store Efficiency 25.00% 25.00% 25.00% - 3 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 49.49% 49.90% 49.76% - 3 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 3 l2_l1_read_throughput L2 Throughput (L1 Reads) 44.320GB/s 46.027GB/s 45.409GB/s - 3 local_memory_overhead Local Memory Overhead 0.00% 3.60% 1.94% - 3 issued_ipc Issued IPC 1.929964 2.022805 1.987545 - 3 issue_slot_utilization Issue Slot Utilization 73.16% 76.52% 75.24% - 3 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 3 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 3 l2_read_transactions L2 Read Transactions 29018255 35049148 32738258 - 3 l2_write_transactions L2 Write Transactions 10973200 13606926 12707167 - 3 tex_cache_transactions Texture Cache Transactions 0 0 0 - 3 dram_read_transactions Device Memory Read Transactions 838719 977316 927405 - 3 dram_write_transactions Device Memory Write Transactions 10036890 12256070 11511224 - 3 l2_read_throughput L2 Throughput (Reads) 44.316GB/s 45.133GB/s 44.818GB/s - 3 l2_write_throughput L2 Throughput (Writes) 16.758GB/s 17.847GB/s 17.376GB/s - 3 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 cf_issued Issued Control-Flow Instructions 27120271 33284794 31194691 - 3 cf_executed Executed Control-Flow Instructions 27120271 33284794 31194691 - 3 ldst_issued Issued Load/Store Instructions 69450407 83390607 78608356 - 3 ldst_executed Executed Load/Store Instructions 28604478 35056214 32867828 - 3 l1_shared_utilization L1/Shared Memory Utilization Mid (4) Mid (5) Mid (4) - 3 l2_utilization L2 Cache Utilization Low (3) Low (3) Low (3) - 3 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 3 dram_utilization Device Memory Utilization Low (2) Low (2) Low (2) - 3 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 3 ldst_fu_utilization Load/Store Function Unit Utilization Low (3) Low (3) Low (3) - 3 alu_fu_utilization Arithmetic Function Unit Utilization Mid (5) Mid (6) Mid (5) - 3 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 3 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 3 inst_issued Instructions Issued 233498620 284372080 267194316 - 3 issue_slots Issue Slots 177017540 215138501 202282435 - 3 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 29021068 35743248 33191864 - 3 l2_l1_write_transactions L2 Write Transactions (L1 write requests 10973184 13606912 12707157 - 3 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 3 l2_l1_write_throughput L2 Throughput (L1 Writes) 16.758GB/s 17.847GB/s 17.376GB/s - 3 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 3 eligible_warps_per_cycle Eligible Warps Per Active Cycle 3.383598 3.446767 3.408359 - 3 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 3 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 3 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 2.00% 2.29% 2.12% - 3 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% - Kernel: void mtf_thread_by4(unsigned char const *, unsigned char*, int, int) - 3 l1_cache_global_hit_rate L1 Global Hit Rate 31.16% 34.05% 32.44% - 3 branch_efficiency Branch Efficiency 74.61% 74.78% 74.68% - 3 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 3 sm_efficiency Multiprocessor Activity 81.96% 85.01% 83.87% - 3 achieved_occupancy Achieved Occupancy 0.126059 0.135121 0.131813 - 3 gld_requested_throughput Requested Global Load Throughput 304.31MB/s 349.62MB/s 331.44MB/s - 3 gst_requested_throughput Requested Global Store Throughput 304.23MB/s 349.54MB/s 331.36MB/s - 3 ipc Executed IPC 0.454444 0.490594 0.477948 - 3 sm_efficiency_instance Multiprocessor Activity 81.96% 85.01% 83.87% - 3 ipc_instance Executed IPC 0.454444 0.490594 0.477948 - 3 inst_per_warp Instructions per warp 1.0819e+06 1.1025e+06 1.0919e+06 - 3 gld_transactions Global Load Transactions 11449000 14320580 13236476 - 3 gst_transactions Global Store Transactions 11534336 14319616 13336576 - 3 local_load_transactions Local Load Transactions 0 0 0 - 3 local_store_transactions Local Store Transactions 0 0 0 - 3 shared_load_transactions Shared Load Transactions 4300620 5977620 5208418 - 3 shared_store_transactions Shared Store Transactions 4302028 5979412 5210061 - 3 gld_transactions_per_request Global Load Transactions Per Request 3.377378 3.485803 3.418060 - 3 gst_transactions_per_request Global Store Transactions Per Request 3.415632 3.484923 3.443198 - 3 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 3 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 3 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 0.997129 1.142721 1.049430 - 3 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 0.997145 1.142700 1.049432 - 3 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_load_throughput Shared Memory Load Throughput 14.904GB/s 19.287GB/s 16.969GB/s - 3 shared_store_throughput Shared Memory Store Throughput 14.909GB/s 19.293GB/s 16.974GB/s - 3 shared_efficiency Shared Memory Efficiency 13.22% 15.10% 14.38% - 3 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 3 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 3 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 3 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 24.52% 26.74% 25.62% - 3 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 46.86% 48.55% 47.79% - 3 stall_memory_dependency Issue Stall Reasons (Data Request) 16.68% 17.09% 16.85% - 3 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% - 3 inst_executed Instructions Executed 92611037 113477939 106203349 - 3 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 3 stall_other Issue Stall Reasons (Other) 9.61% 9.84% 9.74% - 3 inst_fp_32 FP Instructions(Single) 0 0 0 - 3 inst_fp_64 FP Instructions(Double) 0 0 0 - 3 inst_integer Integer Instructions 249289519 305198998 286173476 - 3 inst_bit_convert Bit-Convert Instructions 240943458 294479586 276271707 - 3 inst_control Control-Flow Instructions 57813860 71302810 66702168 - 3 inst_compute_ld_st Load/Store Instructions 188093421 230114978 215817788 - 3 inst_misc Misc Instructions 96836565 118908544 111391166 - 3 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 - 3 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 3 atomic_transactions Atomic Transactions 0 0 0 - 3 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 3 inst_replay_overhead Instruction Replay Overhead 0.169358 0.171956 0.170855 - 3 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 3 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.083971 0.086260 0.084973 - 3 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 3 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 dram_read_throughput Device Memory Read Throughput 1.5383GB/s 1.5840GB/s 1.5661GB/s - 3 dram_write_throughput Device Memory Write Throughput 9.4030GB/s 10.823GB/s 10.254GB/s - 3 gst_throughput Global Store Throughput 9.5073GB/s 10.923GB/s 10.355GB/s - 3 gld_throughput Global Load Throughput 39.678GB/s 46.206GB/s 43.154GB/s - 3 warp_execution_efficiency Warp Execution Efficiency 31.82% 33.74% 32.48% - 3 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 3 gld_efficiency Global Memory Load Efficiency 0.74% 0.76% 0.75% - 3 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% - 3 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 48.20% 48.39% 48.33% - 3 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 3 l2_l1_read_throughput L2 Throughput (L1 Reads) 26.383GB/s 30.186GB/s 28.483GB/s - 3 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% - 3 issued_ipc Issued IPC 0.531319 0.575020 0.559530 - 3 issue_slot_utilization Issue Slot Utilization 18.70% 20.26% 19.71% - 3 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 3 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 3 l2_read_transactions L2 Read Transactions 30407276 37246697 34864405 - 3 l2_write_transactions L2 Write Transactions 10973186 13606927 12707168 - 3 tex_cache_transactions Texture Cache Transactions 0 0 0 - 3 dram_read_transactions Device Memory Read Transactions 1775454 2016460 1918547 - 3 dram_write_transactions Device Memory Write Transactions 10852767 13481929 12583963 - 3 l2_read_throughput L2 Throughput (Reads) 26.345GB/s 30.045GB/s 28.420GB/s - 3 l2_write_throughput L2 Throughput (Writes) 9.5073GB/s 10.923GB/s 10.355GB/s - 3 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 cf_issued Issued Control-Flow Instructions 18738966 22962898 21489624 - 3 cf_executed Executed Control-Flow Instructions 18738966 22962898 21489624 - 3 ldst_issued Issued Load/Store Instructions 31061918 38297470 35799153 - 3 ldst_executed Executed Load/Store Instructions 15380564 18861813 17641134 - 3 l1_shared_utilization L1/Shared Memory Utilization Low (1) Low (2) Low (1) - 3 l2_utilization L2 Cache Utilization Low (2) Low (2) Low (2) - 3 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 3 dram_utilization Device Memory Utilization Low (1) Low (2) Low (1) - 3 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 3 ldst_fu_utilization Load/Store Function Unit Utilization Low (1) Low (1) Low (1) - 3 alu_fu_utilization Arithmetic Function Unit Utilization Low (1) Low (2) Low (1) - 3 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 3 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 3 inst_issued Instructions Issued 108298416 132912926 124362763 - 3 issue_slots Issue Slots 76233295 93650300 87597205 - 3 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 30450888 37422012 34942266 - 3 l2_l1_write_transactions L2 Write Transactions (L1 write requests 10973184 13606912 12707157 - 3 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 3 l2_l1_write_throughput L2 Throughput (L1 Writes) 9.5073GB/s 10.923GB/s 10.355GB/s - 3 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 3 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.404899 0.436961 0.425479 - 3 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 3 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 3 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.00% 0.00% 0.00% - 3 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% - Kernel: void mtf_thread(unsigned char const *, unsigned char*, int, int) - 3 l1_cache_global_hit_rate L1 Global Hit Rate 37.02% 43.37% 40.05% - 3 branch_efficiency Branch Efficiency 72.31% 72.68% 72.46% - 3 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 3 sm_efficiency Multiprocessor Activity 78.09% 81.62% 79.69% - 3 achieved_occupancy Achieved Occupancy 0.119964 0.132531 0.128054 - 3 gld_requested_throughput Requested Global Load Throughput 267.33MB/s 302.66MB/s 287.66MB/s - 3 gst_requested_throughput Requested Global Store Throughput 267.27MB/s 302.59MB/s 287.59MB/s - 3 ipc Executed IPC 0.509550 0.563715 0.543593 - 3 sm_efficiency_instance Multiprocessor Activity 78.09% 81.62% 79.69% - 3 ipc_instance Executed IPC 0.509550 0.563715 0.543593 - 3 inst_per_warp Instructions per warp 1.3316e+06 1.3606e+06 1.3470e+06 - 3 gld_transactions Global Load Transactions 11387052 13796572 12939348 - 3 gst_transactions Global Store Transactions 11386880 13795328 12937898 - 3 local_load_transactions Local Load Transactions 0 0 0 - 3 local_store_transactions Local Store Transactions 0 0 0 - 3 shared_load_transactions Shared Load Transactions 3964840 5218580 4605105 - 3 shared_store_transactions Shared Store Transactions 7072676 9058724 8116066 - 3 gld_transactions_per_request Global Load Transactions Per Request 3.768437 3.843924 3.814494 - 3 gst_transactions_per_request Global Store Transactions Per Request 3.766964 3.843184 3.813412 - 3 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 3 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 3 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 0.986790 1.130006 1.051646 - 3 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 0.988290 1.101823 1.044257 - 3 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_load_throughput Shared Memory Load Throughput 12.071GB/s 14.577GB/s 13.037GB/s - 3 shared_store_throughput Shared Memory Store Throughput 21.533GB/s 25.303GB/s 22.982GB/s - 3 shared_efficiency Shared Memory Efficiency 10.33% 11.52% 10.95% - 3 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 3 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 3 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 3 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 25.67% 26.08% 25.85% - 3 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 55.28% 55.76% 55.45% - 3 stall_memory_dependency Issue Stall Reasons (Data Request) 6.16% 6.52% 6.29% - 3 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% - 3 inst_executed Instructions Executed 114291159 140275494 131017233 - 3 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 3 stall_other Issue Stall Reasons (Other) 12.39% 12.43% 12.41% - 3 inst_fp_32 FP Instructions(Single) 0 0 0 - 3 inst_fp_64 FP Instructions(Double) 0 0 0 - 3 inst_integer Integer Instructions 592349928 723589886 678959582 - 3 inst_bit_convert Bit-Convert Instructions 581349927 709949736 666221364 - 3 inst_control Control-Flow Instructions 155131014 189241096 177646691 - 3 inst_compute_ld_st Load/Store Instructions 177122916 216511388 203113733 - 3 inst_misc Misc Instructions 21997296 27276960 25473296 - 3 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 - 3 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 3 atomic_transactions Atomic Transactions 0 0 0 - 3 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 3 inst_replay_overhead Instruction Replay Overhead 0.143781 0.146546 0.145201 - 3 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 3 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.055065 0.062782 0.059291 - 3 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 3 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 dram_read_throughput Device Memory Read Throughput 1.1620GB/s 1.1890GB/s 1.1765GB/s - 3 dram_write_throughput Device Memory Write Throughput 8.2107GB/s 9.3362GB/s 8.8608GB/s - 3 gst_throughput Global Store Throughput 8.3521GB/s 9.4560GB/s 8.9873GB/s - 3 gld_throughput Global Load Throughput 34.669GB/s 38.537GB/s 36.631GB/s - 3 warp_execution_efficiency Warp Execution Efficiency 42.61% 44.86% 43.97% - 3 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 3 gld_efficiency Global Memory Load Efficiency 0.75% 0.78% 0.77% - 3 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% - 3 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 48.23% 48.61% 48.46% - 3 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 3 l2_l1_read_throughput L2 Throughput (L1 Reads) 20.688GB/s 23.073GB/s 21.902GB/s - 3 local_memory_overhead Local Memory Overhead 0.00% 3.75% 1.25% - 3 issued_ipc Issued IPC 0.582826 0.644669 0.622268 - 3 issue_slot_utilization Issue Slot Utilization 21.57% 23.87% 23.04% - 3 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 3 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 3 l2_read_transactions L2 Read Transactions 27151809 33022013 30789829 - 3 l2_write_transactions L2 Write Transactions 10973200 13606917 12707169 - 3 tex_cache_transactions Texture Cache Transactions 0 0 0 - 3 dram_read_transactions Device Memory Read Transactions 1526685 1751569 1660328 - 3 dram_write_transactions Device Memory Write Transactions 10787358 13431261 12529501 - 3 l2_read_throughput L2 Throughput (Reads) 20.666GB/s 23.059GB/s 21.795GB/s - 3 l2_write_throughput L2 Throughput (Writes) 8.3521GB/s 9.4560GB/s 8.9873GB/s - 3 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 cf_issued Issued Control-Flow Instructions 14495465 17830600 16625983 - 3 cf_executed Executed Control-Flow Instructions 14495465 17830600 16625983 - 3 ldst_issued Issued Load/Store Instructions 32958003 40624551 37974637 - 3 ldst_executed Executed Load/Store Instructions 16527711 20244807 18938432 - 3 l1_shared_utilization L1/Shared Memory Utilization Low (1) Low (2) Low (1) - 3 l2_utilization L2 Cache Utilization Low (2) Low (2) Low (2) - 3 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 3 dram_utilization Device Memory Utilization Low (1) Low (1) Low (1) - 3 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 3 ldst_fu_utilization Load/Store Function Unit Utilization Low (1) Low (1) Low (1) - 3 alu_fu_utilization Arithmetic Function Unit Utilization Low (2) Low (2) Low (2) - 3 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 3 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 3 inst_issued Instructions Issued 130722541 160655144 150053925 - 3 issue_slots Issue Slots 96738842 118963928 111097759 - 3 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 27180044 33040984 30947492 - 3 l2_l1_write_transactions L2 Write Transactions (L1 write requests 10973184 13606912 12707157 - 3 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 3 l2_l1_write_throughput L2 Throughput (L1 Writes) 8.3521GB/s 9.4560GB/s 8.9873GB/s - 3 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 3 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.475739 0.529159 0.509464 - 3 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 3 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 3 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.00% 0.00% 0.00% - 3 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% - Kernel: void mtf_thread(unsigned char const *, unsigned char*, int, int) - 3 l1_cache_global_hit_rate L1 Global Hit Rate 25.21% 26.89% 26.21% - 3 branch_efficiency Branch Efficiency 68.35% 68.36% 68.35% - 3 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 3 sm_efficiency Multiprocessor Activity 94.20% 94.75% 94.51% - 3 achieved_occupancy Achieved Occupancy 0.126176 0.141794 0.136185 - 3 gld_requested_throughput Requested Global Load Throughput 365.03MB/s 425.03MB/s 401.99MB/s - 3 gst_requested_throughput Requested Global Store Throughput 364.94MB/s 424.93MB/s 401.89MB/s - 3 ipc Executed IPC 0.325919 0.371263 0.354979 - 3 sm_efficiency_instance Multiprocessor Activity 94.20% 94.75% 94.51% - 3 ipc_instance Executed IPC 0.325919 0.371263 0.354979 - 3 inst_per_warp Instructions per warp 7.5136e+05 7.5777e+05 7.5393e+05 - 3 gld_transactions Global Load Transactions 12042936 14113980 13385318 - 3 gst_transactions Global Store Transactions 11911168 14319616 13462186 - 3 local_load_transactions Local Load Transactions 0 0 0 - 3 local_store_transactions Local Store Transactions 0 0 0 - 3 shared_load_transactions Shared Load Transactions 2163156 2607108 2458645 - 3 shared_store_transactions Shared Store Transactions 4171500 5040920 4749593 - 3 gld_transactions_per_request Global Load Transactions Per Request 6.146011 6.488245 6.273024 - 3 gst_transactions_per_request Global Store Transactions Per Request 6.202246 6.416297 6.301482 - 3 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 3 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 3 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 1.072849 1.094745 1.080429 - 3 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 1.069681 1.088476 1.076196 - 3 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_load_throughput Shared Memory Load Throughput 8.9927GB/s 10.177GB/s 9.7236GB/s - 3 shared_store_throughput Shared Memory Store Throughput 17.342GB/s 19.678GB/s 18.783GB/s - 3 shared_efficiency Shared Memory Efficiency 11.50% 11.76% 11.67% - 3 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 3 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 3 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 3 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 14.36% 14.44% 14.40% - 3 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 51.42% 51.56% 51.49% - 3 stall_memory_dependency Issue Stall Reasons (Data Request) 24.44% 24.69% 24.55% - 3 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% - 3 inst_executed Instructions Executed 63652399 78276294 73356875 - 3 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 3 stall_other Issue Stall Reasons (Other) 9.53% 9.58% 9.56% - 3 inst_fp_32 FP Instructions(Single) 0 0 0 - 3 inst_fp_64 FP Instructions(Double) 0 0 0 - 3 inst_integer Integer Instructions 375775059 464215813 434152119 - 3 inst_bit_convert Bit-Convert Instructions 364775058 450575663 421413901 - 3 inst_control Control-Flow Instructions 93252480 115134218 107701702 - 3 inst_compute_ld_st Load/Store Instructions 115206876 142358002 133125311 - 3 inst_misc Misc Instructions 21959790 27230452 25429863 - 3 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 - 3 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 3 atomic_transactions Atomic Transactions 0 0 0 - 3 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 3 inst_replay_overhead Instruction Replay Overhead 0.325969 0.328137 0.326831 - 3 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 3 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.132995 0.138002 0.135320 - 3 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 3 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 dram_read_throughput Device Memory Read Throughput 4.5987GB/s 5.0938GB/s 4.9050GB/s - 3 dram_write_throughput Device Memory Write Throughput 11.226GB/s 13.102GB/s 12.381GB/s - 3 gst_throughput Global Store Throughput 11.404GB/s 13.279GB/s 12.559GB/s - 3 gld_throughput Global Load Throughput 50.065GB/s 55.096GB/s 52.964GB/s - 3 warp_execution_efficiency Warp Execution Efficiency 51.31% 51.77% 51.54% - 3 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 3 gld_efficiency Global Memory Load Efficiency 0.71% 0.76% 0.74% - 3 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% - 3 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 44.58% 45.38% 44.94% - 3 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 3 l2_l1_read_throughput L2 Throughput (L1 Reads) 35.085GB/s 39.843GB/s 38.112GB/s - 3 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% - 3 issued_ipc Issued IPC 0.433102 0.491924 0.469404 - 3 issue_slot_utilization Issue Slot Utilization 16.23% 18.42% 17.58% - 3 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 3 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 3 l2_read_transactions L2 Read Transactions 33517355 41207528 38479966 - 3 l2_write_transactions L2 Write Transactions 10973200 13606927 12707169 - 3 tex_cache_transactions Texture Cache Transactions 0 0 0 - 3 dram_read_transactions Device Memory Read Transactions 4424815 5308510 4959935 - 3 dram_write_transactions Device Memory Write Transactions 10801581 13425032 12527110 - 3 l2_read_throughput L2 Throughput (Reads) 34.835GB/s 40.215GB/s 38.039GB/s - 3 l2_write_throughput L2 Throughput (Writes) 11.404GB/s 13.279GB/s 12.559GB/s - 3 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 cf_issued Issued Control-Flow Instructions 7821260 9619359 9014063 - 3 cf_executed Executed Control-Flow Instructions 7821260 9619359 9014063 - 3 ldst_issued Issued Load/Store Instructions 30464662 37244744 34939915 - 3 ldst_executed Executed Load/Store Instructions 9520876 11707027 10971837 - 3 l1_shared_utilization L1/Shared Memory Utilization Low (2) Low (2) Low (2) - 3 l2_utilization L2 Cache Utilization Low (3) Low (3) Low (3) - 3 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 3 dram_utilization Device Memory Utilization Low (2) Low (2) Low (2) - 3 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 3 ldst_fu_utilization Load/Store Function Unit Utilization Low (1) Low (1) Low (1) - 3 alu_fu_utilization Arithmetic Function Unit Utilization Low (2) Low (2) Low (2) - 3 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 3 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 3 inst_issued Instructions Issued 84585993 103840717 97330166 - 3 issue_slots Issue Slots 63400502 77787910 72915116 - 3 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 33758692 41068980 38551390 - 3 l2_l1_write_transactions L2 Write Transactions (L1 write requests 10973184 13606912 12707157 - 3 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 3 l2_l1_write_throughput L2 Throughput (L1 Writes) 11.404GB/s 13.279GB/s 12.559GB/s - 3 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 3 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.347562 0.395546 0.378099 - 3 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 3 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 3 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.00% 0.00% 0.00% - 3 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% - Kernel: void mtf_thread_by4(unsigned char const *, unsigned char*, int, int) - 3 l1_cache_global_hit_rate L1 Global Hit Rate 17.33% 19.07% 18.32% - 3 branch_efficiency Branch Efficiency 70.33% 70.35% 70.34% - 3 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 3 sm_efficiency Multiprocessor Activity 95.73% 98.46% 97.24% - 3 achieved_occupancy Achieved Occupancy 0.129062 0.141012 0.136857 - 3 gld_requested_throughput Requested Global Load Throughput 344.58MB/s 396.27MB/s 378.26MB/s - 3 gst_requested_throughput Requested Global Store Throughput 344.50MB/s 396.17MB/s 378.17MB/s - 3 ipc Executed IPC 0.281391 0.319062 0.305770 - 3 sm_efficiency_instance Multiprocessor Activity 95.73% 98.46% 97.24% - 3 ipc_instance Executed IPC 0.281391 0.319062 0.305770 - 3 inst_per_warp Instructions per warp 7.0622e+05 7.1048e+05 7.0807e+05 - 3 gld_transactions Global Load Transactions 12525744 15515492 14232317 - 3 gst_transactions Global Store Transactions 12058624 14680064 13686101 - 3 local_load_transactions Local Load Transactions 0 0 0 - 3 local_store_transactions Local Store Transactions 0 0 0 - 3 shared_load_transactions Shared Load Transactions 2484412 3098144 2865820 - 3 shared_store_transactions Shared Store Transactions 2484596 3098376 2866033 - 3 gld_transactions_per_request Global Load Transactions Per Request 4.752162 5.018109 4.921929 - 3 gst_transactions_per_request Global Store Transactions Per Request 4.642341 4.808353 4.732587 - 3 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 3 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 3 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 1.079905 1.109022 1.095043 - 3 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 1.079905 1.109023 1.095043 - 3 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_load_throughput Shared Memory Load Throughput 9.7497GB/s 11.275GB/s 10.662GB/s - 3 shared_store_throughput Shared Memory Store Throughput 9.7504GB/s 11.276GB/s 10.663GB/s - 3 shared_efficiency Shared Memory Efficiency 16.23% 16.62% 16.41% - 3 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 3 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 3 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 3 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 15.76% 16.34% 16.08% - 3 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 40.14% 41.57% 40.86% - 3 stall_memory_dependency Issue Stall Reasons (Data Request) 34.92% 36.95% 35.72% - 3 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% - 3 inst_executed Instructions Executed 59680363 73579622 68902285 - 3 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 3 stall_other Issue Stall Reasons (Other) 7.14% 7.48% 7.33% - 3 inst_fp_32 FP Instructions(Single) 0 0 0 - 3 inst_fp_64 FP Instructions(Double) 0 0 0 - 3 inst_integer Integer Instructions 177553106 219439632 205186798 - 3 inst_bit_convert Bit-Convert Instructions 155177080 191740652 179308306 - 3 inst_control Control-Flow Instructions 48984481 60696570 56700233 - 3 inst_compute_ld_st Load/Store Instructions 126177381 155961592 145829366 - 3 inst_misc Misc Instructions 72911215 90229980 84329322 - 3 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 - 3 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 3 atomic_transactions Atomic Transactions 0 0 0 - 3 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 3 inst_replay_overhead Instruction Replay Overhead 0.328014 0.333754 0.330382 - 3 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 3 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.155101 0.159613 0.156828 - 3 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 3 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 dram_read_throughput Device Memory Read Throughput 6.2665GB/s 6.7456GB/s 6.5389GB/s - 3 dram_write_throughput Device Memory Write Throughput 10.613GB/s 12.219GB/s 11.657GB/s - 3 gst_throughput Global Store Throughput 10.766GB/s 12.380GB/s 11.818GB/s - 3 gld_throughput Global Load Throughput 49.155GB/s 56.467GB/s 52.968GB/s - 3 warp_execution_efficiency Warp Execution Efficiency 33.82% 34.20% 33.98% - 3 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 3 gld_efficiency Global Memory Load Efficiency 0.68% 0.72% 0.70% - 3 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% - 3 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 41.72% 43.20% 42.32% - 3 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 3 l2_l1_read_throughput L2 Throughput (L1 Reads) 34.981GB/s 40.224GB/s 38.301GB/s - 3 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% - 3 issued_ipc Issued IPC 0.373267 0.421725 0.405503 - 3 issue_slot_utilization Issue Slot Utilization 14.40% 16.25% 15.63% - 3 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 3 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 3 l2_read_transactions L2 Read Transactions 35446410 43333615 40683852 - 3 l2_write_transactions L2 Write Transactions 10973189 13606925 12707164 - 3 tex_cache_transactions Texture Cache Transactions 0 0 0 - 3 dram_read_transactions Device Memory Read Transactions 6387295 7421971 7022711 - 3 dram_write_transactions Device Memory Write Transactions 10817705 13429507 12534840 - 3 l2_read_throughput L2 Throughput (Reads) 34.776GB/s 39.385GB/s 37.844GB/s - 3 l2_write_throughput L2 Throughput (Writes) 10.766GB/s 12.380GB/s 11.818GB/s - 3 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 cf_issued Issued Control-Flow Instructions 12045729 14850782 13906919 - 3 cf_executed Executed Control-Flow Instructions 12045729 14850782 13906919 - 3 ldst_issued Issued Load/Store Instructions 29475549 36034685 33785931 - 3 ldst_executed Executed Load/Store Instructions 9548183 11771721 11024112 - 3 l1_shared_utilization L1/Shared Memory Utilization Low (1) Low (1) Low (1) - 3 l2_utilization L2 Cache Utilization Low (3) Low (3) Low (3) - 3 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 3 dram_utilization Device Memory Utilization Low (2) Low (2) Low (2) - 3 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 3 ldst_fu_utilization Load/Store Function Unit Utilization Low (1) Low (1) Low (1) - 3 alu_fu_utilization Arithmetic Function Unit Utilization Low (1) Low (1) Low (1) - 3 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 3 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 3 inst_issued Instructions Issued 79565103 97826985 91644934 - 3 issue_slots Issue Slots 61371322 75395521 70640272 - 3 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 35655456 44209264 41181168 - 3 l2_l1_write_transactions L2 Write Transactions (L1 write requests 10973184 13606912 12707157 - 3 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 3 l2_l1_write_throughput L2 Throughput (L1 Writes) 10.766GB/s 12.380GB/s 11.818GB/s - 3 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 3 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.304730 0.345132 0.331511 - 3 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 3 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 3 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.00% 0.00% 0.00% - 3 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% - Kernel: void mtf_2buffers_depth32(unsigned char const *, unsigned char*, int, int) - 3 l1_cache_global_hit_rate L1 Global Hit Rate 75.64% 78.83% 77.70% - 3 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% - 3 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 3 sm_efficiency Multiprocessor Activity 99.32% 99.85% 99.57% - 3 achieved_occupancy Achieved Occupancy 0.623472 0.630879 0.628314 - 3 gld_requested_throughput Requested Global Load Throughput 23.264GB/s 23.410GB/s 23.326GB/s - 3 gst_requested_throughput Requested Global Store Throughput 23.259GB/s 23.405GB/s 23.320GB/s - 3 ipc Executed IPC 2.356864 2.365804 2.362715 - 3 sm_efficiency_instance Multiprocessor Activity 99.32% 99.85% 99.57% - 3 ipc_instance Executed IPC 2.356864 2.365804 2.362715 - 3 inst_per_warp Instructions per warp 1.7711e+05 1.7744e+05 1.7722e+05 - 3 gld_transactions Global Load Transactions 11266164 13479772 12627334 - 3 gst_transactions Global Store Transactions 11010048 13631488 12724906 - 3 local_load_transactions Local Load Transactions 0 0 0 - 3 local_store_transactions Local Store Transactions 0 0 0 - 3 shared_load_transactions Shared Load Transactions 10878976 13631488 12713984 - 3 shared_store_transactions Shared Store Transactions 21760608 27266304 25431072 - 3 gld_transactions_per_request Global Load Transactions Per Request 0.969832 1.026066 0.995438 - 3 gst_transactions_per_request Global Store Transactions Per Request 0.994582 1.006655 1.001407 - 3 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 3 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 3 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 0.991045 1.006655 0.999835 - 3 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 0.991045 1.006732 0.999884 - 3 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_load_throughput Shared Memory Load Throughput 92.202GB/s 93.808GB/s 93.266GB/s - 3 shared_store_throughput Shared Memory Store Throughput 184.43GB/s 187.64GB/s 186.55GB/s - 3 shared_efficiency Shared Memory Efficiency 72.32% 73.54% 72.84% - 3 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 3 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 3 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 3 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 42.31% 43.42% 43.04% - 3 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 39.63% 40.78% 40.03% - 3 stall_memory_dependency Issue Stall Reasons (Data Request) 0.09% 0.11% 0.09% - 3 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% - 3 inst_executed Instructions Executed 237762900 294719562 275260848 - 3 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 3 stall_other Issue Stall Reasons (Other) 7.47% 7.68% 7.54% - 3 inst_fp_32 FP Instructions(Single) 0 0 0 - 3 inst_fp_64 FP Instructions(Double) 0 0 0 - 3 inst_integer Integer Instructions 3732918400 4627147840 4321642666 - 3 inst_bit_convert Bit-Convert Instructions 702545920 870842368 813345450 - 3 inst_control Control-Flow Instructions 186656640 231370752 216094592 - 3 inst_compute_ld_st Load/Store Instructions 1646377853 2039633005 1905300317 - 3 inst_misc Misc Instructions 702717440 871055264 813544213 - 3 inst_inter_thread_communication Inter-Thread Instructions 351272960 435421184 406672725 - 3 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 3 atomic_transactions Atomic Transactions 0 0 0 - 3 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 3 inst_replay_overhead Instruction Replay Overhead 0.000217 0.000310 0.000262 - 3 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 3 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.009880 0.012843 0.010961 - 3 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 3 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 dram_read_throughput Device Memory Read Throughput 1.2124GB/s 1.2197GB/s 1.2161GB/s - 3 dram_write_throughput Device Memory Write Throughput 18.064GB/s 18.250GB/s 18.156GB/s - 3 gst_throughput Global Store Throughput 23.259GB/s 23.405GB/s 23.320GB/s - 3 gld_throughput Global Load Throughput 90.399GB/s 95.483GB/s 92.875GB/s - 3 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00% - 3 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 3 gld_efficiency Global Memory Load Efficiency 24.36% 25.78% 25.13% - 3 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00% - 3 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 48.24% 51.29% 49.43% - 3 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 3 l2_l1_read_throughput L2 Throughput (L1 Reads) 19.781GB/s 21.515GB/s 20.784GB/s - 3 local_memory_overhead Local Memory Overhead 0.00% 3.60% 1.28% - 3 issued_ipc Issued IPC 2.354832 2.365519 2.361693 - 3 issue_slot_utilization Issue Slot Utilization 83.77% 84.15% 84.01% - 3 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 3 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 3 l2_read_transactions L2 Read Transactions 10305273 12067634 11212417 - 3 l2_write_transactions L2 Write Transactions 10977296 13606913 12708532 - 3 tex_cache_transactions Texture Cache Transactions 0 0 0 - 3 dram_read_transactions Device Memory Read Transactions 572192 709125 662750 - 3 dram_write_transactions Device Memory Write Transactions 8525762 10610223 9895693 - 3 l2_read_throughput L2 Throughput (Reads) 19.375GB/s 21.835GB/s 20.657GB/s - 3 l2_write_throughput L2 Throughput (Writes) 23.259GB/s 23.405GB/s 23.320GB/s - 3 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 cf_issued Issued Control-Flow Instructions 11325680 14038775 13111871 - 3 cf_executed Executed Control-Flow Instructions 11325680 14038775 13111871 - 3 ldst_issued Issued Load/Store Instructions 54979064 68109487 63629432 - 3 ldst_executed Executed Load/Store Instructions 54891760 68041204 63548818 - 3 l1_shared_utilization L1/Shared Memory Utilization Mid (5) Mid (5) Mid (5) - 3 l2_utilization L2 Cache Utilization Low (2) Low (3) Low (2) - 3 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 3 dram_utilization Device Memory Utilization Low (2) Low (2) Low (2) - 3 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 3 ldst_fu_utilization Load/Store Function Unit Utilization Low (3) Low (3) Low (3) - 3 alu_fu_utilization Arithmetic Function Unit Utilization High (7) High (7) High (7) - 3 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 3 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 3 inst_issued Instructions Issued 237829201 294801172 275331485 - 3 issue_slots Issue Slots 169210225 209744422 195890548 - 3 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 9937328 12508208 11314373 - 3 l2_l1_write_transactions L2 Write Transactions (L1 write requests 10977280 13606912 12708522 - 3 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 3 l2_l1_write_throughput L2 Throughput (L1 Writes) 23.259GB/s 23.405GB/s 23.320GB/s - 3 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 3 eligible_warps_per_cycle Eligible Warps Per Active Cycle 4.852871 4.917958 4.891943 - 3 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 3 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 3 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 9.12% 9.40% 9.31% - 3 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% - Kernel: void mtf_2buffers_depth32(unsigned char const *, unsigned char*, int, int) - 3 l1_cache_global_hit_rate L1 Global Hit Rate 70.30% 73.58% 72.19% - 3 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% - 3 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 3 sm_efficiency Multiprocessor Activity 98.35% 99.71% 98.95% - 3 achieved_occupancy Achieved Occupancy 0.606315 0.619858 0.615097 - 3 gld_requested_throughput Requested Global Load Throughput 22.725GB/s 22.873GB/s 22.781GB/s - 3 gst_requested_throughput Requested Global Store Throughput 22.719GB/s 22.867GB/s 22.775GB/s - 3 ipc Executed IPC 2.211524 2.246215 2.234125 - 3 sm_efficiency_instance Multiprocessor Activity 98.35% 99.71% 98.95% - 3 ipc_instance Executed IPC 2.211524 2.246215 2.234125 - 3 inst_per_warp Instructions per warp 2.5492e+05 2.5578e+05 2.5534e+05 - 3 gld_transactions Global Load Transactions 11007144 13442708 12516749 - 3 gst_transactions Global Store Transactions 10862592 13762560 12730368 - 3 local_load_transactions Local Load Transactions 0 0 0 - 3 local_store_transactions Local Store Transactions 0 0 0 - 3 shared_load_transactions Shared Load Transactions 11010048 13762560 12812288 - 3 shared_store_transactions Shared Store Transactions 22022784 27488332 25611550 - 3 gld_transactions_per_request Global Load Transactions Per Request 0.967198 1.002850 0.985715 - 3 gst_transactions_per_request Global Store Transactions Per Request 0.989922 1.010830 1.000856 - 3 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 3 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 3 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 1.003359 1.010830 1.007755 - 3 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 1.003359 1.009728 1.007311 - 3 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_load_throughput Shared Memory Load Throughput 91.181GB/s 92.459GB/s 91.807GB/s - 3 shared_store_throughput Shared Memory Store Throughput 182.39GB/s 184.67GB/s 183.53GB/s - 3 shared_efficiency Shared Memory Efficiency 72.08% 72.63% 72.29% - 3 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 3 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 3 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 3 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 43.41% 44.67% 44.22% - 3 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 38.33% 39.54% 38.78% - 3 stall_memory_dependency Issue Stall Reasons (Data Request) 0.44% 0.48% 0.46% - 3 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% - 3 inst_executed Instructions Executed 228408888 283400916 264558692 - 3 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 3 stall_other Issue Stall Reasons (Other) 8.14% 8.47% 8.26% - 3 inst_fp_32 FP Instructions(Single) 0 0 0 - 3 inst_fp_64 FP Instructions(Double) 0 0 0 - 3 inst_integer Integer Instructions 3555797888 4411896448 4118566656 - 3 inst_bit_convert Bit-Convert Instructions 702283776 871366656 813432832 - 3 inst_control Control-Flow Instructions 124391424 154339968 144078549 - 3 inst_compute_ld_st Load/Store Instructions 1587233948 1968181197 1837688978 - 3 inst_misc Misc Instructions 702455520 871579392 813631584 - 3 inst_inter_thread_communication Inter-Thread Instructions 351141888 435683328 406716416 - 3 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 3 atomic_transactions Atomic Transactions 0 0 0 - 3 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 3 inst_replay_overhead Instruction Replay Overhead 0.000377 0.000525 0.000468 - 3 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 3 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.013408 0.014715 0.013947 - 3 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 3 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 dram_read_throughput Device Memory Read Throughput 1.1991GB/s 1.2060GB/s 1.2030GB/s - 3 dram_write_throughput Device Memory Write Throughput 19.573GB/s 19.864GB/s 19.734GB/s - 3 gst_throughput Global Store Throughput 22.719GB/s 22.867GB/s 22.775GB/s - 3 gld_throughput Global Load Throughput 87.995GB/s 91.157GB/s 89.821GB/s - 3 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00% - 3 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 3 gld_efficiency Global Memory Load Efficiency 24.93% 25.85% 25.37% - 3 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00% - 3 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 48.68% 49.54% 48.97% - 3 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 3 l2_l1_read_throughput L2 Throughput (L1 Reads) 24.539GB/s 27.040GB/s 25.840GB/s - 3 local_memory_overhead Local Memory Overhead 0.00% 5.25% 1.75% - 3 issued_ipc Issued IPC 2.213546 2.247172 2.235691 - 3 issue_slot_utilization Issue Slot Utilization 83.77% 85.04% 84.60% - 3 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 3 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 3 l2_read_transactions L2 Read Transactions 11792398 16436174 14734219 - 3 l2_write_transactions L2 Write Transactions 10973190 13615118 12709895 - 3 tex_cache_transactions Texture Cache Transactions 0 0 0 - 3 dram_read_transactions Device Memory Read Transactions 582482 716742 671100 - 3 dram_write_transactions Device Memory Write Transactions 9453591 11827240 11017062 - 3 l2_read_throughput L2 Throughput (Reads) 24.415GB/s 27.605GB/s 26.282GB/s - 3 l2_write_throughput L2 Throughput (Writes) 22.719GB/s 22.867GB/s 22.775GB/s - 3 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 cf_issued Issued Control-Flow Instructions 7549425 9367032 8744255 - 3 cf_executed Executed Control-Flow Instructions 7549425 9367032 8744255 - 3 ldst_issued Issued Load/Store Instructions 54970726 68195923 63662974 - 3 ldst_executed Executed Load/Store Instructions 54871278 68082168 63555646 - 3 l1_shared_utilization L1/Shared Memory Utilization Mid (5) Mid (5) Mid (5) - 3 l2_utilization L2 Cache Utilization Low (3) Low (3) Low (3) - 3 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 3 dram_utilization Device Memory Utilization Low (2) Low (2) Low (2) - 3 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 3 ldst_fu_utilization Load/Store Function Unit Utilization Low (3) Low (3) Low (3) - 3 alu_fu_utilization Arithmetic Function Unit Utilization High (7) High (7) High (7) - 3 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 3 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 3 inst_issued Instructions Issued 228509231 283515009 264671313 - 3 issue_slots Issue Slots 172949188 214578317 200317937 - 3 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 11852048 16102692 14466537 - 3 l2_l1_write_transactions L2 Write Transactions (L1 write requests 10973184 13615104 12709888 - 3 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 3 l2_l1_write_throughput L2 Throughput (L1 Writes) 22.719GB/s 22.867GB/s 22.775GB/s - 3 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 3 eligible_warps_per_cycle Eligible Warps Per Active Cycle 4.797141 4.911416 4.868216 - 3 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 3 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 3 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 8.14% 8.38% 8.29% - 3 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.01% 0.01% 0.01% - Kernel: void mtf_2buffers_depth32(unsigned char const *, unsigned char*, int, int) - 3 l1_cache_global_hit_rate L1 Global Hit Rate 47.51% 51.17% 49.13% - 3 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% - 3 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 3 sm_efficiency Multiprocessor Activity 98.09% 99.34% 98.88% - 3 achieved_occupancy Achieved Occupancy 0.591798 0.601140 0.597715 - 3 gld_requested_throughput Requested Global Load Throughput 20.777GB/s 21.232GB/s 21.078GB/s - 3 gst_requested_throughput Requested Global Store Throughput 20.772GB/s 21.227GB/s 21.073GB/s - 3 ipc Executed IPC 2.022524 2.086186 2.064255 - 3 sm_efficiency_instance Multiprocessor Activity 98.09% 99.34% 98.88% - 3 ipc_instance Executed IPC 2.022524 2.086186 2.064255 - 3 inst_per_warp Instructions per warp 3.3669e+05 3.3729e+05 3.3709e+05 - 3 gld_transactions Global Load Transactions 11322616 13674208 12795996 - 3 gst_transactions Global Store Transactions 11010048 13631488 12757674 - 3 local_load_transactions Local Load Transactions 0 0 0 - 3 local_store_transactions Local Store Transactions 0 0 0 - 3 shared_load_transactions Shared Load Transactions 10878976 13565952 12670293 - 3 shared_store_transactions Shared Store Transactions 21760608 27095072 25316605 - 3 gld_transactions_per_request Global Load Transactions Per Request 0.988068 1.031207 1.007790 - 3 gst_transactions_per_request Global Store Transactions Per Request 1.001203 1.006046 1.003411 - 3 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 3 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 3 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 0.991045 1.001209 0.996215 - 3 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 0.991045 1.000072 0.995467 - 3 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_load_throughput Shared Memory Load Throughput 82.342GB/s 84.989GB/s 83.977GB/s - 3 shared_store_throughput Shared Memory Store Throughput 164.70GB/s 169.74GB/s 167.81GB/s - 3 shared_efficiency Shared Memory Efficiency 72.78% 73.54% 73.15% - 3 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 3 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 3 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 3 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 32.67% 33.59% 33.04% - 3 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 44.79% 45.43% 45.14% - 3 stall_memory_dependency Issue Stall Reasons (Data Request) 6.73% 8.10% 7.21% - 3 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% - 3 inst_executed Instructions Executed 226257008 280626216 262052884 - 3 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 3 stall_other Issue Stall Reasons (Other) 7.50% 7.82% 7.66% - 3 inst_fp_32 FP Instructions(Single) 0 0 0 - 3 inst_fp_64 FP Instructions(Double) 0 0 0 - 3 inst_integer Integer Instructions 3546026240 4398130816 4107039189 - 3 inst_bit_convert Bit-Convert Instructions 702545920 871366656 813694976 - 3 inst_control Control-Flow Instructions 93328384 115755008 108093738 - 3 inst_compute_ld_st Load/Store Instructions 1558538173 1931865389 1804350151 - 3 inst_misc Misc Instructions 702631872 871473120 813794432 - 3 inst_inter_thread_communication Inter-Thread Instructions 351272960 435683328 406847488 - 3 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 3 atomic_transactions Atomic Transactions 0 0 0 - 3 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 3 inst_replay_overhead Instruction Replay Overhead 0.014970 0.017760 0.016651 - 3 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 3 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.024815 0.026917 0.025709 - 3 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 3 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 dram_read_throughput Device Memory Read Throughput 1.6635GB/s 1.7686GB/s 1.7148GB/s - 3 dram_write_throughput Device Memory Write Throughput 19.258GB/s 19.660GB/s 19.526GB/s - 3 gst_throughput Global Store Throughput 20.772GB/s 21.227GB/s 21.073GB/s - 3 gld_throughput Global Load Throughput 83.894GB/s 85.700GB/s 84.956GB/s - 3 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00% - 3 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 3 gld_efficiency Global Memory Load Efficiency 24.24% 25.30% 24.81% - 3 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00% - 3 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 47.55% 50.27% 48.94% - 3 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 3 l2_l1_read_throughput L2 Throughput (L1 Reads) 43.450GB/s 43.595GB/s 43.545GB/s - 3 local_memory_overhead Local Memory Overhead 0.00% 2.99% 1.00% - 3 issued_ipc Issued IPC 2.084240 2.137214 2.116614 - 3 issue_slot_utilization Issue Slot Utilization 72.62% 74.36% 73.68% - 3 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 3 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 3 l2_read_transactions L2 Read Transactions 22950608 27451876 25889005 - 3 l2_write_transactions L2 Write Transactions 10977285 13615105 12713987 - 3 tex_cache_transactions Texture Cache Transactions 0 0 0 - 3 dram_read_transactions Device Memory Read Transactions 904828 1129211 1033682 - 3 dram_write_transactions Device Memory Write Transactions 10177290 12609629 11779794 - 3 l2_read_throughput L2 Throughput (Reads) 42.507GB/s 43.428GB/s 42.977GB/s - 3 l2_write_throughput L2 Throughput (Writes) 20.772GB/s 21.227GB/s 21.073GB/s - 3 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 cf_issued Issued Control-Flow Instructions 5662842 7023613 6558753 - 3 cf_executed Executed Control-Flow Instructions 5662842 7023613 6558753 - 3 ldst_issued Issued Load/Store Instructions 59250297 72971736 68064601 - 3 ldst_executed Executed Load/Store Instructions 54891760 68082168 63576128 - 3 l1_shared_utilization L1/Shared Memory Utilization Mid (5) Mid (5) Mid (5) - 3 l2_utilization L2 Cache Utilization Low (3) Mid (4) Low (3) - 3 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 3 dram_utilization Device Memory Utilization Low (2) Low (2) Low (2) - 3 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 3 ldst_fu_utilization Load/Store Function Unit Utilization Low (3) Low (3) Low (3) - 3 alu_fu_utilization Arithmetic Function Unit Utilization Mid (6) High (7) Mid (6) - 3 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 3 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 3 inst_issued Instructions Issued 230305742 284706690 266179234 - 3 issue_slots Issue Slots 160490142 198114570 185318223 - 3 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 23036812 27869208 26246888 - 3 l2_l1_write_transactions L2 Write Transactions (L1 write requests 10977280 13615104 12713984 - 3 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 3 l2_l1_write_throughput L2 Throughput (L1 Writes) 20.772GB/s 21.227GB/s 21.073GB/s - 3 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 3 eligible_warps_per_cycle Eligible Warps Per Active Cycle 3.932044 4.397506 4.212024 - 3 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 3 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 3 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 6.02% 7.44% 6.94% - 3 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% - Kernel: void mtf_2buffers(unsigned char const *, unsigned char*, int, int) - 3 l1_cache_global_hit_rate L1 Global Hit Rate 75.79% 81.35% 78.35% - 3 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% - 3 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 3 sm_efficiency Multiprocessor Activity 94.14% 95.59% 95.02% - 3 achieved_occupancy Achieved Occupancy 0.444506 0.452466 0.448095 - 3 gld_requested_throughput Requested Global Load Throughput 16.839GB/s 17.181GB/s 17.048GB/s - 3 gst_requested_throughput Requested Global Store Throughput 538.73MB/s 549.68MB/s 545.42MB/s - 3 ipc Executed IPC 1.891736 1.935820 1.909130 - 3 sm_efficiency_instance Multiprocessor Activity 94.14% 95.59% 95.02% - 3 ipc_instance Executed IPC 1.891736 1.935820 1.909130 - 3 inst_per_warp Instructions per warp 1.8585e+05 1.8719e+05 1.8631e+05 - 3 gld_transactions Global Load Transactions 11671988 13660536 12887558 - 3 gst_transactions Global Store Transactions 10878976 13533184 12648448 - 3 local_load_transactions Local Load Transactions 0 0 0 - 3 local_store_transactions Local Store Transactions 0 0 0 - 3 shared_load_transactions Shared Load Transactions 12526156 15208100 14247585 - 3 shared_store_transactions Shared Store Transactions 22973660 28045852 26269268 - 3 gld_transactions_per_request Global Load Transactions Per Request 0.979432 1.063038 1.017012 - 3 gst_transactions_per_request Global Store Transactions Per Request 0.991045 0.999395 0.995007 - 3 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 3 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 3 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 0.994242 1.022937 1.009705 - 3 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 0.994298 1.017493 1.006680 - 3 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_load_throughput Shared Memory Load Throughput 75.535GB/s 77.167GB/s 76.515GB/s - 3 shared_store_throughput Shared Memory Store Throughput 139.85GB/s 142.31GB/s 141.03GB/s - 3 shared_efficiency Shared Memory Efficiency 72.41% 74.12% 73.18% - 3 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 3 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 3 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 3 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 44.62% 45.44% 45.15% - 3 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 44.36% 45.03% 44.61% - 3 stall_memory_dependency Issue Stall Reasons (Data Request) 0.36% 0.38% 0.37% - 3 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% - 3 inst_executed Instructions Executed 250833865 309251253 289310656 - 3 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 3 stall_other Issue Stall Reasons (Other) 9.53% 9.72% 9.60% - 3 inst_fp_32 FP Instructions(Single) 0 0 0 - 3 inst_fp_64 FP Instructions(Double) 0 0 0 - 3 inst_integer Integer Instructions 3670650836 4529026060 4235789612 - 3 inst_bit_convert Bit-Convert Instructions 427656256 530834688 495579797 - 3 inst_control Control-Flow Instructions 290582376 355546695 333447117 - 3 inst_compute_ld_st Load/Store Instructions 1380479890 1705301647 1594446655 - 3 inst_misc Misc Instructions 678696480 834224992 781208490 - 3 inst_inter_thread_communication Inter-Thread Instructions 381505632 470929568 440416960 - 3 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 3 atomic_transactions Atomic Transactions 0 0 0 - 3 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 3 inst_replay_overhead Instruction Replay Overhead 0.002026 0.002515 0.002239 - 3 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 3 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.008308 0.009745 0.008938 - 3 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 3 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 dram_read_throughput Device Memory Read Throughput 1.0026GB/s 1.0238GB/s 1.0132GB/s - 3 dram_write_throughput Device Memory Write Throughput 14.690GB/s 15.016GB/s 14.878GB/s - 3 gst_throughput Global Store Throughput 16.835GB/s 17.177GB/s 17.044GB/s - 3 gld_throughput Global Load Throughput 67.088GB/s 71.603GB/s 69.335GB/s - 3 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00% - 3 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 3 gld_efficiency Global Memory Load Efficiency 23.52% 25.53% 24.61% - 3 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% - 3 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 46.93% 49.31% 47.96% - 3 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 3 l2_l1_read_throughput L2 Throughput (L1 Reads) 14.419GB/s 14.762GB/s 14.558GB/s - 3 local_memory_overhead Local Memory Overhead 0.00% 5.74% 1.91% - 3 issued_ipc Issued IPC 1.906308 1.913302 1.910771 - 3 issue_slot_utilization Issue Slot Utilization 74.99% 75.29% 75.18% - 3 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 3 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 3 l2_read_transactions L2 Read Transactions 8918673 11847564 10709124 - 3 l2_write_transactions L2 Write Transactions 10977286 13606924 12708530 - 3 tex_cache_transactions Texture Cache Transactions 0 0 0 - 3 dram_read_transactions Device Memory Read Transactions 653717 807053 755399 - 3 dram_write_transactions Device Memory Write Transactions 9578662 11864420 11093486 - 3 l2_read_throughput L2 Throughput (Reads) 13.678GB/s 15.029GB/s 14.334GB/s - 3 l2_write_throughput L2 Throughput (Writes) 16.835GB/s 17.177GB/s 17.044GB/s - 3 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 cf_issued Issued Control-Flow Instructions 27682508 34026574 31864036 - 3 cf_executed Executed Control-Flow Instructions 27682508 34026574 31864036 - 3 ldst_issued Issued Load/Store Instructions 57378705 70953916 66321726 - 3 ldst_executed Executed Load/Store Instructions 56781104 70261278 65658218 - 3 l1_shared_utilization L1/Shared Memory Utilization Mid (4) Mid (4) Mid (4) - 3 l2_utilization L2 Cache Utilization Low (2) Low (2) Low (2) - 3 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 3 dram_utilization Device Memory Utilization Low (2) Low (2) Low (2) - 3 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 3 ldst_fu_utilization Load/Store Function Unit Utilization Low (3) Low (3) Low (3) - 3 alu_fu_utilization Arithmetic Function Unit Utilization Mid (5) Mid (5) Mid (5) - 3 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 3 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 3 inst_issued Instructions Issued 251453629 309971425 289986377 - 3 issue_slots Issue Slots 197971656 243870984 228197533 - 3 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 9402096 11637248 10852560 - 3 l2_l1_write_transactions L2 Write Transactions (L1 write requests 10977280 13606912 12708522 - 3 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 3 l2_l1_write_throughput L2 Throughput (L1 Writes) 16.835GB/s 17.177GB/s 17.044GB/s - 3 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 3 eligible_warps_per_cycle Eligible Warps Per Active Cycle 2.704417 2.740957 2.726400 - 3 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 3 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 3 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.25% 0.27% 0.27% - 3 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% - Kernel: bsc_st567_encode_cuda_postsort(unsigned char*, __int64*, int, __int64, int*) - 12 l1_cache_global_hit_rate L1 Global Hit Rate 0.00% 0.00% 0.00% - 12 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% - 12 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 12 sm_efficiency Multiprocessor Activity 99.06% 99.80% 99.38% - 12 achieved_occupancy Achieved Occupancy 0.950043 0.953221 0.951549 - 12 gld_requested_throughput Requested Global Load Throughput 72.314GB/s 73.991GB/s 73.078GB/s - 12 gst_requested_throughput Requested Global Store Throughput 9.0392GB/s 9.2488GB/s 9.1348GB/s - 12 ipc Executed IPC 0.681616 0.690114 0.686196 - 12 sm_efficiency_instance Multiprocessor Activity 99.06% 99.80% 99.38% - 12 ipc_instance Executed IPC 0.681616 0.690114 0.686196 - 12 inst_per_warp Instructions per warp 3.5535e+04 4.4165e+04 4.1274e+04 - 12 gld_transactions Global Load Transactions 1704288 2118480 1979760 - 12 gst_transactions Global Store Transactions 852144 1059240 989880 - 12 local_load_transactions Local Load Transactions 0 0 0 - 12 local_store_transactions Local Store Transactions 0 0 0 - 12 shared_load_transactions Shared Load Transactions 0 0 0 - 12 shared_store_transactions Shared Store Transactions 0 0 0 - 12 gld_transactions_per_request Global Load Transactions Per Request 2.000023 2.000042 2.000033 - 12 gst_transactions_per_request Global Store Transactions Per Request 1.000011 1.000021 1.000016 - 12 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 12 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 12 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 0.000000 0.000000 0.000000 - 12 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 0.000000 0.000000 0.000000 - 12 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 12 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 12 shared_load_throughput Shared Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 12 shared_store_throughput Shared Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 12 shared_efficiency Shared Memory Efficiency 0.00% 0.00% 0.00% - 12 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 12 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 12 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 12 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 12 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 12 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 12 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 12 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 12 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 12 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 1.62% 1.64% 1.63% - 12 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 11.48% 11.62% 11.57% - 12 stall_memory_dependency Issue Stall Reasons (Data Request) 82.89% 83.29% 83.10% - 12 stall_sync Issue Stall Reasons (Synchronization) 0.93% 1.24% 1.08% - 12 inst_executed Instructions Executed 13645536 16959168 15849344 - 12 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 12 stall_other Issue Stall Reasons (Other) 2.32% 2.36% 2.34% - 12 inst_fp_32 FP Instructions(Single) 0 0 0 - 12 inst_fp_64 FP Instructions(Double) 0 0 0 - 12 inst_integer Integer Instructions 327314688 406841856 380206080 - 12 inst_bit_convert Bit-Convert Instructions 0 0 0 - 12 inst_control Control-Flow Instructions 27268032 33895296 31675648 - 12 inst_compute_ld_st Load/Store Instructions 54536065 67790593 63351297 - 12 inst_misc Misc Instructions 27489216 34116480 31896832 - 12 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 - 12 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 12 atomic_transactions Atomic Transactions 1 1 1 - 12 atomic_transactions_per_request Atomic Transactions Per Request 1.000000 1.000000 1.000000 - 12 inst_replay_overhead Instruction Replay Overhead 0.155332 0.159965 0.157318 - 12 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 12 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.124897 0.124917 0.124910 - 12 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 12 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 12 dram_read_throughput Device Memory Read Throughput 72.747GB/s 74.413GB/s 73.540GB/s - 12 dram_write_throughput Device Memory Write Throughput 9.0393GB/s 9.2488GB/s 9.1348GB/s - 12 gst_throughput Global Store Throughput 9.0392GB/s 9.2488GB/s 9.1348GB/s - 12 gld_throughput Global Load Throughput 72.315GB/s 73.992GB/s 73.079GB/s - 12 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00% - 12 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 12 gld_efficiency Global Memory Load Efficiency 100.00% 100.00% 100.00% - 12 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00% - 12 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 12 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 0.00% 0.00% 0.00% - 12 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 12 l2_l1_read_throughput L2 Throughput (L1 Reads) 72.314GB/s 73.991GB/s 73.078GB/s - 12 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% - 12 issued_ipc Issued IPC 0.790412 0.803234 0.795151 - 12 issue_slot_utilization Issue Slot Utilization 30.99% 31.46% 31.17% - 12 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 12 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 12 l2_read_transactions L2 Read Transactions 6817106 8476964 7919446 - 12 l2_write_transactions L2 Write Transactions 852127 1059233 989868 - 12 tex_cache_transactions Texture Cache Transactions 0 0 0 - 12 dram_read_transactions Device Memory Read Transactions 6855909 8529959 7969006 - 12 dram_write_transactions Device Memory Write Transactions 852126 1059236 989866 - 12 l2_read_throughput L2 Throughput (Reads) 72.341GB/s 73.992GB/s 73.083GB/s - 12 l2_write_throughput L2 Throughput (Writes) 9.0393GB/s 9.2490GB/s 9.1348GB/s - 12 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 12 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 12 cf_issued Issued Control-Flow Instructions 854814 1061916 992552 - 12 cf_executed Executed Control-Flow Instructions 854814 1061916 992552 - 12 ldst_issued Issued Load/Store Instructions 3801406 4829393 4481194 - 12 ldst_executed Executed Load/Store Instructions 1705020 2119224 1980496 - 12 l1_shared_utilization L1/Shared Memory Utilization Low (1) Low (1) Low (1) - 12 l2_utilization L2 Cache Utilization Mid (4) Mid (4) Mid (4) - 12 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 12 dram_utilization Device Memory Utilization High (7) High (8) High (7) - 12 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 12 ldst_fu_utilization Load/Store Function Unit Utilization Low (1) Low (1) Low (1) - 12 alu_fu_utilization Arithmetic Function Unit Utilization Low (2) Low (2) Low (2) - 12 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 12 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 12 inst_issued Instructions Issued 15730381 19657806 18349984 - 12 issue_slots Issue Slots 12317007 15419982 14385647 - 12 l2_atomic_throughput L2 Throughput (Atomic requests) 8.9482KB/s 22.762KB/s 13.566KB/s - 12 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 6817008 8473824 7918912 - 12 l2_l1_write_transactions L2 Write Transactions (L1 write requests 852128 1059224 989862 - 12 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 12 l2_l1_write_throughput L2 Throughput (L1 Writes) 9.0392GB/s 9.2489GB/s 9.1348GB/s - 12 l2_atomic_transactions L2 Transactions (Atomic requests) 2 4 2 - 12 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.886632 0.900834 0.894494 - 12 atomic_throughput Atomic Throughput 35.793KB/s 45.523KB/s 39.140KB/s - 12 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 12 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 12 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.27% 0.28% 0.28% - 12 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% - Kernel: void cub::DeviceRadixSortDownsweepKernel::Policy520, bool=0, bool=0, __int64, unsigned char, int>(cub::DeviceRadixSortPolicy<__int64, unsigned char, int>::Policy520*, cub::DeviceRadixSortPolicy<__int64, unsigned char, int>::Policy520, bool=0*, bool=0, bool=0*, bool=0*, int, int, cub::GridEvenShare) - 36 l1_cache_global_hit_rate L1 Global Hit Rate 25.00% 25.02% 25.01% - 36 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% - 36 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 36 sm_efficiency Multiprocessor Activity 99.02% 99.92% 99.60% - 36 achieved_occupancy Achieved Occupancy 0.389789 0.392459 0.391064 - 36 gld_requested_throughput Requested Global Load Throughput 25.608GB/s 26.687GB/s 26.198GB/s - 36 gst_requested_throughput Requested Global Store Throughput 25.605GB/s 26.684GB/s 26.195GB/s - 36 ipc Executed IPC 1.147626 1.193852 1.173077 - 36 sm_efficiency_instance Multiprocessor Activity 99.02% 99.92% 99.60% - 36 ipc_instance Executed IPC 1.147626 1.193852 1.173077 - 36 inst_per_warp Instructions per warp 1.5293e+05 1.9009e+05 1.7764e+05 - 36 gld_transactions Global Load Transactions 2563828 3185684 2977295 - 36 gst_transactions Global Store Transactions 4658408 6798660 5730031 - 36 local_load_transactions Local Load Transactions 0 0 0 - 36 local_store_transactions Local Store Transactions 0 0 0 - 36 shared_load_transactions Shared Load Transactions 15941768 19817544 18519284 - 36 shared_store_transactions Shared Store Transactions 15858196 21307112 19203280 - 36 gld_transactions_per_request Global Load Transactions Per Request 1.503310 1.504431 1.503738 - 36 gst_transactions_per_request Global Store Transactions Per Request 2.709184 3.209258 2.894468 - 36 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 36 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 36 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 1.448273 1.448449 1.448353 - 36 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 1.547034 1.682114 1.622302 - 36 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 36 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 36 shared_load_throughput Shared Memory Load Throughput 212.90GB/s 221.89GB/s 217.81GB/s - 36 shared_store_throughput Shared Memory Store Throughput 219.41GB/s 231.21GB/s 225.81GB/s - 36 shared_efficiency Shared Memory Efficiency 65.32% 68.15% 66.55% - 36 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 36 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 36 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 36 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 36 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 36 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 36 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 36 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 36 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 36 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 6.48% 6.82% 6.63% - 36 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 44.62% 46.32% 45.46% - 36 stall_memory_dependency Issue Stall Reasons (Data Request) 10.86% 11.23% 11.05% - 36 stall_sync Issue Stall Reasons (Synchronization) 11.16% 11.54% 11.38% - 36 inst_executed Instructions Executed 73406613 91242976 85269546 - 36 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 36 stall_other Issue Stall Reasons (Other) 8.74% 8.86% 8.80% - 36 inst_fp_32 FP Instructions(Single) 0 0 0 - 36 inst_fp_64 FP Instructions(Double) 0 0 0 - 36 inst_integer Integer Instructions 1209311857 1582241079 1441698079 - 36 inst_bit_convert Bit-Convert Instructions 27268608 33895680 31676160 - 36 inst_control Control-Flow Instructions 13676416 16989952 15880192 - 36 inst_compute_ld_st Load/Store Instructions 792092484 984589029 920118525 - 36 inst_misc Misc Instructions 153466735 247241283 204664204 - 36 inst_inter_thread_communication Inter-Thread Instructions 3840 3840 3840 - 36 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 36 atomic_transactions Atomic Transactions 0 0 0 - 36 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 36 inst_replay_overhead Instruction Replay Overhead 0.130036 0.156094 0.141232 - 36 shared_replay_overhead Shared Memory Replay Overhead 0.083614 0.099976 0.092550 - 36 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.026177 0.026196 0.026184 - 36 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 36 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 36 dram_read_throughput Device Memory Read Throughput 26.100GB/s 27.179GB/s 26.690GB/s - 36 dram_write_throughput Device Memory Write Throughput 28.161GB/s 29.688GB/s 28.725GB/s - 36 gst_throughput Global Store Throughput 34.514GB/s 36.005GB/s 35.103GB/s - 36 gld_throughput Global Load Throughput 34.221GB/s 35.663GB/s 35.018GB/s - 36 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00% - 36 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 36 gld_efficiency Global Memory Load Efficiency 74.78% 74.83% 74.81% - 36 gst_efficiency Global Memory Store Efficiency 71.28% 76.64% 74.64% - 36 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 36 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 0.05% 0.11% 0.07% - 36 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 36 l2_l1_read_throughput L2 Throughput (L1 Reads) 25.665GB/s 26.746GB/s 26.260GB/s - 36 local_memory_overhead Local Memory Overhead 0.00% 0.01% 0.00% - 36 issued_ipc Issued IPC 1.326363 1.350580 1.338790 - 36 issue_slot_utilization Issue Slot Utilization 53.55% 54.28% 53.89% - 36 sysmem_read_transactions System Memory Read Transactions 0 3 0 - 36 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 36 l2_read_transactions L2 Read Transactions 7692165 9556746 8931961 - 36 l2_write_transactions L2 Write Transactions 10010425 13370477 11941550 - 36 tex_cache_transactions Texture Cache Transactions 0 0 0 - 36 dram_read_transactions Device Memory Read Transactions 7809150 9722453 9077244 - 36 dram_write_transactions Device Memory Write Transactions 8173138 11025275 9771904 - 36 l2_read_throughput L2 Throughput (Reads) 25.671GB/s 26.750GB/s 26.264GB/s - 36 l2_write_throughput L2 Throughput (Writes) 34.514GB/s 36.005GB/s 35.104GB/s - 36 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 8.8047KB/s 250.000B/s - 36 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 36 cf_issued Issued Control-Flow Instructions 998008 1239620 1158700 - 36 cf_executed Executed Control-Flow Instructions 998008 1239620 1158700 - 36 ldst_issued Issued Load/Store Instructions 35694462 46718614 42406810 - 36 ldst_executed Executed Load/Store Instructions 26133732 32484676 30357636 - 36 l1_shared_utilization L1/Shared Memory Utilization Mid (6) High (7) Mid (6) - 36 l2_utilization L2 Cache Utilization Low (3) Low (3) Low (3) - 36 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 36 dram_utilization Device Memory Utilization Mid (5) Mid (5) Mid (5) - 36 sysmem_utilization System Memory Utilization Idle (0) Low (1) Idle (0) - 36 ldst_fu_utilization Load/Store Function Unit Utilization Low (3) Low (3) Low (3) - 36 alu_fu_utilization Arithmetic Function Unit Utilization Low (3) Low (3) Low (3) - 36 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 36 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 36 inst_issued Instructions Issued 82974894 105482799 97316150 - 36 issue_slots Issue Slots 66638372 85176884 78339597 - 36 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 36 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 7690812 9555200 8930761 - 36 l2_l1_write_transactions L2 Write Transactions (L1 write requests 10009508 13371869 11941136 - 36 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 36 l2_l1_write_throughput L2 Throughput (L1 Writes) 34.513GB/s 36.005GB/s 35.103GB/s - 36 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 36 eligible_warps_per_cycle Eligible Warps Per Active Cycle 2.245825 2.356118 2.304731 - 36 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 36 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 36 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 36 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 15.89% 17.34% 16.69% - 36 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.41% 0.47% 0.44% - Kernel: void cub::DeviceRadixSortUpsweepKernel::Policy520, bool=0, bool=0, __int64, int>(cub::DeviceRadixSortPolicy<__int64, cub::NullType, int>::Policy520*, bool=0*, cub::DeviceRadixSortPolicy<__int64, cub::NullType, int>::Policy520*, int, int, cub::GridEvenShare::Policy520*>) - 96 l1_cache_global_hit_rate L1 Global Hit Rate 0.00% 0.00% 0.00% - 96 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% - 96 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 96 sm_efficiency Multiprocessor Activity 98.87% 99.65% 99.30% - 96 achieved_occupancy Achieved Occupancy 0.315500 0.316720 0.316294 - 96 gld_requested_throughput Requested Global Load Throughput 102.00GB/s 103.06GB/s 102.46GB/s - 96 gst_requested_throughput Requested Global Store Throughput 9.4664MB/s 11.889MB/s 10.282MB/s - 96 ipc Executed IPC 1.101096 1.107131 1.103880 - 96 sm_efficiency_instance Multiprocessor Activity 98.87% 99.65% 99.30% - 96 ipc_instance Executed IPC 1.101096 1.107131 1.103880 - 96 inst_per_warp Instructions per warp 4.0812e+04 5.0687e+04 4.7381e+04 - 96 gld_transactions Global Load Transactions 1704240 2118528 1979757 - 96 gst_transactions Global Store Transactions 6144 6144 6144 - 96 local_load_transactions Local Load Transactions 0 0 0 - 96 local_store_transactions Local Store Transactions 0 0 0 - 96 shared_load_transactions Shared Load Transactions 968792 1200576 1123001 - 96 shared_store_transactions Shared Store Transactions 885848 1099200 1027769 - 96 gld_transactions_per_request Global Load Transactions Per Request 1.999915 2.000164 2.000036 - 96 gst_transactions_per_request Global Store Transactions Per Request 32.000000 32.000000 32.000000 - 96 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 96 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 96 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 0.999932 1.000072 1.000017 - 96 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 0.999926 1.000079 1.000019 - 96 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 96 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 96 shared_load_throughput Shared Memory Load Throughput 57.806GB/s 58.587GB/s 58.128GB/s - 96 shared_store_throughput Shared Memory Store Throughput 52.925GB/s 53.571GB/s 53.195GB/s - 96 shared_efficiency Shared Memory Efficiency 26.50% 26.62% 26.54% - 96 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 96 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 96 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 96 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 96 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 96 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 96 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 96 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 96 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 96 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 9.17% 9.28% 9.22% - 96 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 24.23% 25.15% 24.69% - 96 stall_memory_dependency Issue Stall Reasons (Data Request) 38.62% 39.89% 39.26% - 96 stall_sync Issue Stall Reasons (Synchronization) 10.97% 11.60% 11.32% - 96 inst_executed Instructions Executed 15671929 19463814 18194213 - 96 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 96 stall_other Issue Stall Reasons (Other) 8.07% 8.25% 8.16% - 96 inst_fp_32 FP Instructions(Single) 0 0 0 - 96 inst_fp_64 FP Instructions(Double) 0 0 0 - 96 inst_integer Integer Instructions 321088902 436591384 390464671 - 96 inst_bit_convert Bit-Convert Instructions 0 0 0 - 96 inst_control Control-Flow Instructions 1535040 1908302 1782705 - 96 inst_compute_ld_st Load/Store Instructions 86626701 107491689 100505092 - 96 inst_misc Misc Instructions 36883277 76035654 56935574 - 96 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 - 96 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 96 atomic_transactions Atomic Transactions 0 0 0 - 96 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 96 inst_replay_overhead Instruction Replay Overhead 0.070227 0.072423 0.071399 - 96 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 96 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.108736 0.108844 0.108808 - 96 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 96 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 96 dram_read_throughput Device Memory Read Throughput 102.46GB/s 103.50GB/s 102.93GB/s - 96 dram_write_throughput Device Memory Write Throughput 35.985MB/s 50.539MB/s 41.912MB/s - 96 gst_throughput Global Store Throughput 75.731MB/s 95.111MB/s 82.256MB/s - 96 gld_throughput Global Load Throughput 102.00GB/s 103.06GB/s 102.46GB/s - 96 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00% - 96 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 96 gld_efficiency Global Memory Load Efficiency 99.99% 100.00% 100.00% - 96 gst_efficiency Global Memory Store Efficiency 12.50% 12.50% 12.50% - 96 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 96 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 0.00% 0.00% 0.00% - 96 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 96 l2_l1_read_throughput L2 Throughput (L1 Reads) 102.00GB/s 103.06GB/s 102.46GB/s - 96 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% - 96 issued_ipc Issued IPC 1.179496 1.188010 1.182882 - 96 issue_slot_utilization Issue Slot Utilization 39.72% 40.04% 39.85% - 96 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 96 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 96 l2_read_transactions L2 Read Transactions 6817644 8474474 7919556 - 96 l2_write_transactions L2 Write Transactions 6144 6155 6144 - 96 tex_cache_transactions Texture Cache Transactions 0 0 0 - 96 dram_read_transactions Device Memory Read Transactions 6844193 8514453 7955223 - 96 dram_write_transactions Device Memory Write Transactions 2776 3390 3135 - 96 l2_read_throughput L2 Throughput (Reads) 102.01GB/s 103.07GB/s 102.47GB/s - 96 l2_write_throughput L2 Throughput (Writes) 75.731MB/s 95.111MB/s 82.268MB/s - 96 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 96 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 96 cf_issued Issued Control-Flow Instructions 69730 85067 79918 - 96 cf_executed Executed Control-Flow Instructions 69730 85067 79918 - 96 ldst_issued Issued Load/Store Instructions 3910451 4892950 4557777 - 96 ldst_executed Executed Load/Store Instructions 2809062 3485643 3259090 - 96 l1_shared_utilization L1/Shared Memory Utilization Low (3) Low (3) Low (3) - 96 l2_utilization L2 Cache Utilization Mid (5) Mid (5) Mid (5) - 96 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 96 dram_utilization Device Memory Utilization High (9) High (9) High (9) - 96 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 96 ldst_fu_utilization Load/Store Function Unit Utilization Low (2) Low (2) Low (2) - 96 alu_fu_utilization Arithmetic Function Unit Utilization Low (3) Low (3) Low (3) - 96 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 96 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 96 inst_issued Instructions Issued 16773586 20870723 19493522 - 96 issue_slots Issue Slots 11297324 14066589 13134032 - 96 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 96 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 6816972 8473804 7918893 - 96 l2_l1_write_transactions L2 Write Transactions (L1 write requests 6144 6144 6144 - 96 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 96 l2_l1_write_throughput L2 Throughput (L1 Writes) 75.731MB/s 95.111MB/s 82.256MB/s - 96 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 96 eligible_warps_per_cycle Eligible Warps Per Active Cycle 1.197778 1.209346 1.204318 - 96 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 96 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 96 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 96 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 7.11% 7.57% 7.35% - 96 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 19.61% 20.50% 20.03% - Kernel: void cub::DeviceRadixSortUpsweepKernel::Policy520, bool=0, bool=0, __int64, int>(cub::DeviceRadixSortPolicy<__int64, unsigned char, int>::Policy520*, bool=0*, cub::DeviceRadixSortPolicy<__int64, unsigned char, int>::Policy520*, int, int, cub::GridEvenShare::Policy520*>) - 36 l1_cache_global_hit_rate L1 Global Hit Rate 0.00% 0.00% 0.00% - 36 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% - 36 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 36 sm_efficiency Multiprocessor Activity 98.99% 99.56% 99.37% - 36 achieved_occupancy Achieved Occupancy 0.453636 0.454770 0.454288 - 36 gld_requested_throughput Requested Global Load Throughput 103.38GB/s 103.96GB/s 103.67GB/s - 36 gst_requested_throughput Requested Global Store Throughput 5.9965MB/s 7.4911MB/s 6.5018MB/s - 36 ipc Executed IPC 1.113990 1.117494 1.115829 - 36 sm_efficiency_instance Multiprocessor Activity 98.99% 99.56% 99.37% - 36 ipc_instance Executed IPC 1.113990 1.117494 1.115829 - 36 inst_per_warp Instructions per warp 3.2646e+04 4.0532e+04 3.7892e+04 - 36 gld_transactions Global Load Transactions 1704192 2118528 1979777 - 36 gst_transactions Global Store Transactions 3840 3840 3840 - 36 local_load_transactions Local Load Transactions 0 0 0 - 36 local_store_transactions Local Store Transactions 0 0 0 - 36 shared_load_transactions Shared Load Transactions 978816 1201344 1126861 - 36 shared_store_transactions Shared Store Transactions 886656 1097664 1027021 - 36 gld_transactions_per_request Global Load Transactions Per Request 1.999805 2.000164 2.000056 - 36 gst_transactions_per_request Global Store Transactions Per Request 32.000000 32.000000 32.000000 - 36 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 36 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 36 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 0.999973 1.000072 1.000036 - 36 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 0.999971 1.000079 1.000040 - 36 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 36 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 36 shared_load_throughput Shared Memory Load Throughput 58.625GB/s 59.677GB/s 59.041GB/s - 36 shared_store_throughput Shared Memory Store Throughput 53.565GB/s 54.059GB/s 53.789GB/s - 36 shared_efficiency Shared Memory Efficiency 26.38% 26.54% 26.43% - 36 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 36 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 36 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 36 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 36 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 36 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 36 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 36 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 36 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 36 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 4.99% 5.31% 5.14% - 36 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 20.43% 21.41% 20.92% - 36 stall_memory_dependency Issue Stall Reasons (Data Request) 36.51% 38.40% 37.47% - 36 stall_sync Issue Stall Reasons (Synchronization) 14.87% 16.00% 15.39% - 36 inst_executed Instructions Executed 15670197 19455177 18187922 - 36 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 36 stall_other Issue Stall Reasons (Other) 5.26% 5.48% 5.37% - 36 inst_fp_32 FP Instructions(Single) 0 0 0 - 36 inst_fp_64 FP Instructions(Double) 0 0 0 - 36 inst_integer Integer Instructions 317784774 434657945 387697000 - 36 inst_bit_convert Bit-Convert Instructions 0 0 0 - 36 inst_control Control-Flow Instructions 2315520 2862173 2676468 - 36 inst_compute_ld_st Load/Store Instructions 86968461 107464809 100601092 - 36 inst_misc Misc Instructions 37043853 77001123 57479209 - 36 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 - 36 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 36 atomic_transactions Atomic Transactions 0 0 0 - 36 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 36 inst_replay_overhead Instruction Replay Overhead 0.093211 0.096657 0.095093 - 36 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 36 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.108754 0.108893 0.108847 - 36 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 36 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 36 dram_read_throughput Device Memory Read Throughput 103.80GB/s 104.39GB/s 104.13GB/s - 36 dram_write_throughput Device Memory Write Throughput 21.384MB/s 27.870MB/s 23.803MB/s - 36 gst_throughput Global Store Throughput 47.972MB/s 59.929MB/s 52.015MB/s - 36 gld_throughput Global Load Throughput 103.38GB/s 103.96GB/s 103.67GB/s - 36 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00% - 36 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 36 gld_efficiency Global Memory Load Efficiency 99.99% 100.01% 100.00% - 36 gst_efficiency Global Memory Store Efficiency 12.50% 12.50% 12.50% - 36 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 36 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 0.00% 0.00% 0.00% - 36 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 36 l2_l1_read_throughput L2 Throughput (L1 Reads) 103.38GB/s 103.96GB/s 103.67GB/s - 36 local_memory_overhead Local Memory Overhead 0.00% 0.01% 0.00% - 36 issued_ipc Issued IPC 1.218837 1.225523 1.222048 - 36 issue_slot_utilization Issue Slot Utilization 42.10% 42.49% 42.27% - 36 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 36 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 36 l2_read_transactions L2 Read Transactions 6817516 8474361 7919438 - 36 l2_write_transactions L2 Write Transactions 3840 3852 3841 - 36 tex_cache_transactions Texture Cache Transactions 0 0 0 - 36 dram_read_transactions Device Memory Read Transactions 6843662 8513713 7953984 - 36 dram_write_transactions Device Memory Write Transactions 1643 1869 1761 - 36 l2_read_throughput L2 Throughput (Reads) 103.39GB/s 103.97GB/s 103.67GB/s - 36 l2_write_throughput L2 Throughput (Writes) 47.972MB/s 59.997MB/s 52.036MB/s - 36 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 36 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 36 cf_issued Issued Control-Flow Instructions 88792 107534 101286 - 36 cf_executed Executed Control-Flow Instructions 88792 107534 101286 - 36 ldst_issued Issued Load/Store Instructions 4330886 5424782 5046904 - 36 ldst_executed Executed Load/Store Instructions 2867946 3543934 3317563 - 36 l1_shared_utilization L1/Shared Memory Utilization Low (3) Low (3) Low (3) - 36 l2_utilization L2 Cache Utilization Mid (5) Mid (5) Mid (5) - 36 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 36 dram_utilization Device Memory Utilization High (9) High (9) High (9) - 36 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 36 ldst_fu_utilization Load/Store Function Unit Utilization Low (2) Low (2) Low (2) - 36 alu_fu_utilization Arithmetic Function Unit Utilization Low (3) Low (3) Low (3) - 36 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 36 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 36 inst_issued Instructions Issued 17131849 21341017 19917587 - 36 issue_slots Issue Slots 11864987 14762385 13778786 - 36 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 36 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 6816972 8473804 7918893 - 36 l2_l1_write_transactions L2 Write Transactions (L1 write requests 3840 3840 3840 - 36 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 36 l2_l1_write_throughput L2 Throughput (L1 Writes) 47.972MB/s 59.929MB/s 52.015MB/s - 36 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 36 eligible_warps_per_cycle Eligible Warps Per Active Cycle 1.767743 1.848495 1.805747 - 36 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 36 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 36 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 36 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 14.95% 16.44% 15.73% - 36 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 14.31% 15.18% 14.76% - Kernel: void cub::DeviceRadixSortDownsweepKernel::Policy520, bool=1, bool=0, __int64, unsigned char, int>(cub::DeviceRadixSortPolicy<__int64, unsigned char, int>::Policy520*, cub::DeviceRadixSortPolicy<__int64, unsigned char, int>::Policy520, bool=1*, bool=1, bool=0*, bool=1*, int, int, cub::GridEvenShare) - 3 l1_cache_global_hit_rate L1 Global Hit Rate 25.00% 25.00% 25.00% - 3 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% - 3 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 3 sm_efficiency Multiprocessor Activity 99.72% 99.88% 99.79% - 3 achieved_occupancy Achieved Occupancy 0.391854 0.391928 0.391883 - 3 gld_requested_throughput Requested Global Load Throughput 28.058GB/s 28.073GB/s 28.065GB/s - 3 gst_requested_throughput Requested Global Store Throughput 28.056GB/s 28.071GB/s 28.063GB/s - 3 ipc Executed IPC 1.141278 1.143545 1.142154 - 3 sm_efficiency_instance Multiprocessor Activity 99.72% 99.88% 99.79% - 3 ipc_instance Executed IPC 1.141278 1.143545 1.142154 - 3 inst_per_warp Instructions per warp 1.3932e+05 1.7317e+05 1.6183e+05 - 3 gld_transactions Global Load Transactions 2560568 3181612 2973606 - 3 gst_transactions Global Store Transactions 5340268 6637180 6202846 - 3 local_load_transactions Local Load Transactions 0 0 0 - 3 local_store_transactions Local Store Transactions 0 0 0 - 3 shared_load_transactions Shared Load Transactions 13386996 16639752 15549970 - 3 shared_store_transactions Shared Store Transactions 15052836 18711228 17491578 - 3 gld_transactions_per_request Global Load Transactions Per Request 1.501672 1.502235 1.501861 - 3 gst_transactions_per_request Global Store Transactions Per Request 3.133033 3.133511 3.133209 - 3 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 3 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 3 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 1.532427 1.532578 1.532512 - 3 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 1.901101 1.902829 1.901682 - 3 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_load_throughput Shared Memory Load Throughput 195.89GB/s 195.98GB/s 195.93GB/s - 3 shared_store_throughput Shared Memory Store Throughput 220.27GB/s 220.59GB/s 220.39GB/s - 3 shared_efficiency Shared Memory Efficiency 59.70% 59.72% 59.72% - 3 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 3 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 3 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 3 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 6.93% 6.94% 6.94% - 3 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 45.57% 45.63% 45.60% - 3 stall_memory_dependency Issue Stall Reasons (Data Request) 12.64% 12.70% 12.66% - 3 stall_sync Issue Stall Reasons (Synchronization) 10.16% 10.19% 10.18% - 3 inst_executed Instructions Executed 66872533 83121160 77679490 - 3 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 3 stall_other Issue Stall Reasons (Other) 8.51% 8.51% 8.51% - 3 inst_fp_32 FP Instructions(Single) 0 0 0 - 3 inst_fp_64 FP Instructions(Double) 0 0 0 - 3 inst_integer Integer Instructions 1141090049 1418361671 1325501871 - 3 inst_bit_convert Bit-Convert Instructions 27268608 33895680 31676160 - 3 inst_control Control-Flow Instructions 13678336 16991872 15882112 - 3 inst_compute_ld_st Load/Store Instructions 643813668 800275509 747873645 - 3 inst_misc Misc Instructions 154028943 191448755 178921436 - 3 inst_inter_thread_communication Inter-Thread Instructions 1920 1920 1920 - 3 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 3 atomic_transactions Atomic Transactions 0 0 0 - 3 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 3 inst_replay_overhead Instruction Replay Overhead 0.167570 0.168154 0.167835 - 3 shared_replay_overhead Shared Memory Replay Overhead 0.107898 0.108146 0.107999 - 3 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.028708 0.028716 0.028711 - 3 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 3 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 dram_read_throughput Device Memory Read Throughput 28.531GB/s 28.546GB/s 28.537GB/s - 3 dram_write_throughput Device Memory Write Throughput 31.774GB/s 31.793GB/s 31.784GB/s - 3 gst_throughput Global Store Throughput 38.946GB/s 38.972GB/s 38.958GB/s - 3 gld_throughput Global Load Throughput 37.454GB/s 37.477GB/s 37.469GB/s - 3 warp_execution_efficiency Warp Execution Efficiency 99.81% 99.81% 99.81% - 3 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 3 gld_efficiency Global Memory Load Efficiency 74.88% 74.91% 74.90% - 3 gst_efficiency Global Memory Store Efficiency 72.03% 72.04% 72.03% - 3 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 0.05% 0.06% 0.05% - 3 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 3 l2_l1_read_throughput L2 Throughput (L1 Reads) 28.091GB/s 28.106GB/s 28.100GB/s - 3 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% - 3 issued_ipc Issued IPC 1.333923 1.334555 1.334234 - 3 issue_slot_utilization Issue Slot Utilization 52.14% 52.17% 52.15% - 3 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 3 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 3 l2_read_transactions L2 Read Transactions 7682008 9545412 8921318 - 3 l2_write_transactions L2 Write Transactions 10647171 13233457 12367902 - 3 tex_cache_transactions Texture Cache Transactions 0 0 0 - 3 dram_read_transactions Device Memory Read Transactions 7798008 9694322 9059119 - 3 dram_write_transactions Device Memory Write Transactions 8686603 10796178 10089868 - 3 l2_read_throughput L2 Throughput (Reads) 28.093GB/s 28.109GB/s 28.103GB/s - 3 l2_write_throughput L2 Throughput (Writes) 38.947GB/s 38.974GB/s 38.960GB/s - 3 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 cf_issued Issued Control-Flow Instructions 998008 1239620 1158700 - 3 cf_executed Executed Control-Flow Instructions 998008 1239620 1158700 - 3 ldst_issued Issued Load/Store Instructions 32800169 40787384 38113025 - 3 ldst_executed Executed Load/Store Instructions 21588964 26835396 25078276 - 3 l1_shared_utilization L1/Shared Memory Utilization Mid (6) Mid (6) Mid (6) - 3 l2_utilization L2 Cache Utilization Mid (4) Mid (4) Mid (4) - 3 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 3 dram_utilization Device Memory Utilization Mid (6) Mid (6) Mid (6) - 3 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 3 ldst_fu_utilization Load/Store Function Unit Utilization Low (3) Low (3) Low (3) - 3 alu_fu_utilization Arithmetic Function Unit Utilization Low (3) Low (3) Low (3) - 3 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 3 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 3 inst_issued Instructions Issued 78092204 97079497 90726011 - 3 issue_slots Issue Slots 61046574 75891904 70925578 - 3 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 7680576 9544868 8920422 - 3 l2_l1_write_transactions L2 Write Transactions (L1 write requests 10646917 13232972 12367135 - 3 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 3 l2_l1_write_throughput L2 Throughput (L1 Writes) 38.945GB/s 38.969GB/s 38.957GB/s - 3 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 3 eligible_warps_per_cycle Eligible Warps Per Active Cycle 2.227098 2.227878 2.227515 - 3 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 3 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 3 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 16.13% 16.16% 16.14% - 3 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.57% 0.59% 0.58% - Kernel: void cub::DeviceRadixSortDownsweepKernel::Policy520, bool=0, bool=0, __int64, cub::NullType, int>(cub::DeviceRadixSortPolicy<__int64, cub::NullType, int>::Policy520*, cub::DeviceRadixSortPolicy<__int64, cub::NullType, int>::Policy520, bool=0*, bool=0, bool=0*, bool=0*, int, int, cub::GridEvenShare) - 96 l1_cache_global_hit_rate L1 Global Hit Rate 0.17% 0.26% 0.20% - 96 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% - 96 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 96 sm_efficiency Multiprocessor Activity 99.11% 99.91% 99.58% - 96 achieved_occupancy Achieved Occupancy 0.322786 0.325930 0.324553 - 96 gld_requested_throughput Requested Global Load Throughput 33.616GB/s 35.996GB/s 35.069GB/s - 96 gst_requested_throughput Requested Global Store Throughput 33.610GB/s 35.990GB/s 35.063GB/s - 96 ipc Executed IPC 1.266958 1.362384 1.325387 - 96 sm_efficiency_instance Multiprocessor Activity 99.11% 99.91% 99.58% - 96 ipc_instance Executed IPC 1.266958 1.362384 1.325387 - 96 inst_per_warp Instructions per warp 1.4342e+05 1.7826e+05 1.6659e+05 - 96 gld_transactions Global Load Transactions 1715812 2131452 1992077 - 96 gst_transactions Global Store Transactions 3444888 5532976 4294255 - 96 local_load_transactions Local Load Transactions 0 0 0 - 96 local_store_transactions Local Store Transactions 0 0 0 - 96 shared_load_transactions Shared Load Transactions 10556312 13125272 12264018 - 96 shared_store_transactions Shared Store Transactions 10848928 15257272 13761571 - 96 gld_transactions_per_request Global Load Transactions Per Request 2.010574 2.014395 2.011824 - 96 gst_transactions_per_request Global Store Transactions Per Request 4.042293 5.227622 4.338154 - 96 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 96 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 96 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 1.311611 1.311976 1.311780 - 96 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 1.648625 1.865241 1.800238 - 96 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 96 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 96 shared_load_throughput Shared Memory Load Throughput 208.20GB/s 222.95GB/s 217.21GB/s - 96 shared_store_throughput Shared Memory Store Throughput 229.02GB/s 252.34GB/s 243.64GB/s - 96 shared_efficiency Shared Memory Efficiency 73.31% 78.19% 74.73% - 96 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 96 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 96 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 96 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 96 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 96 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 96 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 96 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 96 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 96 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 11.61% 12.32% 11.99% - 96 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 53.83% 56.01% 55.03% - 96 stall_memory_dependency Issue Stall Reasons (Data Request) 7.90% 9.01% 8.19% - 96 stall_sync Issue Stall Reasons (Synchronization) 7.90% 8.67% 8.33% - 96 inst_executed Instructions Executed 55071390 68451855 63970561 - 96 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 96 stall_other Issue Stall Reasons (Other) 10.50% 10.97% 10.78% - 96 inst_fp_32 FP Instructions(Single) 0 0 0 - 96 inst_fp_64 FP Instructions(Double) 0 0 0 - 96 inst_integer Integer Instructions 994096566 1310959016 1189935814 - 96 inst_bit_convert Bit-Convert Instructions 0 0 0 - 96 inst_control Control-Flow Instructions 6090304 7562944 7069717 - 96 inst_compute_ld_st Load/Store Instructions 522768967 649808066 607259315 - 96 inst_misc Misc Instructions 104613391 190278051 149670273 - 96 inst_inter_thread_communication Inter-Thread Instructions 6144 6144 6144 - 96 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 96 atomic_transactions Atomic Transactions 0 0 0 - 96 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 96 inst_replay_overhead Instruction Replay Overhead 0.058773 0.084957 0.070361 - 96 shared_replay_overhead Shared Memory Replay Overhead 0.051420 0.074290 0.066885 - 96 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.031065 0.031106 0.031081 - 96 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 96 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 96 dram_read_throughput Device Memory Read Throughput 34.093GB/s 36.507GB/s 35.569GB/s - 96 dram_write_throughput Device Memory Write Throughput 36.415GB/s 38.683GB/s 37.078GB/s - 96 gst_throughput Global Store Throughput 42.636GB/s 44.490GB/s 43.248GB/s - 96 gld_throughput Global Load Throughput 33.807GB/s 36.233GB/s 35.284GB/s - 96 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00% - 96 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 96 gld_efficiency Global Memory Load Efficiency 99.26% 99.46% 99.39% - 96 gst_efficiency Global Memory Store Efficiency 75.62% 83.02% 81.09% - 96 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 96 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 0.12% 0.23% 0.17% - 96 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 96 l2_l1_read_throughput L2 Throughput (L1 Reads) 33.745GB/s 36.156GB/s 35.213GB/s - 96 local_memory_overhead Local Memory Overhead 0.00% 0.01% 0.00% - 96 issued_ipc Issued IPC 1.373833 1.443002 1.418505 - 96 issue_slot_utilization Issue Slot Utilization 48.66% 50.59% 49.96% - 96 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 96 sysmem_write_transactions System Memory Write Transactions 0 1 0 - 96 l2_read_transactions L2 Read Transactions 6850106 8510105 7953538 - 96 l2_write_transactions L2 Write Transactions 8223188 11204827 9770941 - 96 tex_cache_transactions Texture Cache Transactions 0 0 0 - 96 dram_read_transactions Device Memory Read Transactions 6907282 8602220 8033155 - 96 dram_write_transactions Device Memory Write Transactions 7014272 9743153 8377758 - 96 l2_read_throughput L2 Throughput (Reads) 33.747GB/s 36.158GB/s 35.217GB/s - 96 l2_write_throughput L2 Throughput (Writes) 42.636GB/s 44.490GB/s 43.248GB/s - 96 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 96 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 5.3633KB/s 57.0000B/s - 96 cf_issued Issued Control-Flow Instructions 476482 591532 552998 - 96 cf_executed Executed Control-Flow Instructions 476482 591532 552998 - 96 ldst_issued Issued Load/Store Instructions 20291510 26998312 24299630 - 96 ldst_executed Executed Load/Store Instructions 17044102 21185902 19798702 - 96 l1_shared_utilization L1/Shared Memory Utilization High (7) High (7) High (7) - 96 l2_utilization L2 Cache Utilization Mid (4) Mid (4) Mid (4) - 96 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 96 dram_utilization Device Memory Utilization High (7) High (7) High (7) - 96 sysmem_utilization System Memory Utilization Idle (0) Low (1) Idle (0) - 96 ldst_fu_utilization Load/Store Function Unit Utilization Low (3) Low (3) Low (3) - 96 alu_fu_utilization Arithmetic Function Unit Utilization Low (3) Low (3) Low (3) - 96 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 96 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 96 inst_issued Instructions Issued 58319452 74265484 68471672 - 96 issue_slots Issue Slots 40895815 52608009 48232223 - 96 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 96 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 6849024 8508208 7952551 - 96 l2_l1_write_transactions L2 Write Transactions (L1 write requests 8223185 11204825 9770939 - 96 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 96 l2_l1_write_throughput L2 Throughput (L1 Writes) 42.636GB/s 44.490GB/s 43.248GB/s - 96 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 96 eligible_warps_per_cycle Eligible Warps Per Active Cycle 1.414903 1.527476 1.482936 - 96 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 96 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 96 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 96 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 5.07% 6.30% 5.69% - 96 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.83% 1.01% 0.93% - Kernel: void cub::DeviceRadixSortDownsweepKernel::Policy520, bool=1, bool=0, __int64, cub::NullType, int>(cub::DeviceRadixSortPolicy<__int64, cub::NullType, int>::Policy520*, cub::DeviceRadixSortPolicy<__int64, cub::NullType, int>::Policy520, bool=1*, bool=1, bool=0*, bool=1*, int, int, cub::GridEvenShare) - 18 l1_cache_global_hit_rate L1 Global Hit Rate 0.08% 0.12% 0.10% - 18 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% - 18 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 18 sm_efficiency Multiprocessor Activity 99.13% 99.81% 99.53% - 18 achieved_occupancy Achieved Occupancy 0.322503 0.325363 0.324649 - 18 gld_requested_throughput Requested Global Load Throughput 38.123GB/s 39.786GB/s 38.751GB/s - 18 gst_requested_throughput Requested Global Store Throughput 38.120GB/s 39.782GB/s 38.747GB/s - 18 ipc Executed IPC 1.320072 1.381647 1.343977 - 18 sm_efficiency_instance Multiprocessor Activity 99.13% 99.81% 99.53% - 18 ipc_instance Executed IPC 1.320072 1.381647 1.343977 - 18 inst_per_warp Instructions per warp 1.3158e+05 1.6355e+05 1.5284e+05 - 18 gld_transactions Global Load Transactions 1710296 2124648 1985878 - 18 gst_transactions Global Store Transactions 3425408 4878228 4296088 - 18 local_load_transactions Local Load Transactions 0 0 0 - 18 local_store_transactions Local Store Transactions 0 0 0 - 18 shared_load_transactions Shared Load Transactions 8663604 10769568 10064001 - 18 shared_store_transactions Shared Store Transactions 10165244 13569952 12376623 - 18 gld_transactions_per_request Global Load Transactions Per Request 2.004881 2.006696 2.005501 - 18 gst_transactions_per_request Global Store Transactions Per Request 4.019856 4.607224 4.340058 - 18 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 18 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 18 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 1.325936 1.326152 1.326066 - 18 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 1.982462 2.157101 2.103412 - 18 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 18 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 18 shared_load_throughput Shared Memory Load Throughput 193.78GB/s 202.24GB/s 196.97GB/s - 18 shared_store_throughput Shared Memory Store Throughput 234.42GB/s 249.62GB/s 242.20GB/s - 18 shared_efficiency Shared Memory Efficiency 69.43% 72.71% 70.42% - 18 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 18 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 18 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 18 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 18 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 18 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 18 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 18 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 18 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 18 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 12.51% 13.12% 12.76% - 18 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 52.09% 53.47% 52.97% - 18 stall_memory_dependency Issue Stall Reasons (Data Request) 9.47% 10.38% 9.78% - 18 stall_sync Issue Stall Reasons (Synchronization) 7.20% 7.83% 7.48% - 18 inst_executed Instructions Executed 50526654 62802639 58691265 - 18 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 18 stall_other Issue Stall Reasons (Other) 10.49% 10.78% 10.62% - 18 inst_fp_32 FP Instructions(Single) 0 0 0 - 18 inst_fp_64 FP Instructions(Double) 0 0 0 - 18 inst_integer Integer Instructions 939529030 1167814840 1091358252 - 18 inst_bit_convert Bit-Convert Instructions 0 0 0 - 18 inst_control Control-Flow Instructions 6093376 7566016 7072789 - 18 inst_compute_ld_st Load/Store Instructions 422018103 524574562 490225705 - 18 inst_misc Misc Instructions 105361631 130952067 122384694 - 18 inst_inter_thread_communication Inter-Thread Instructions 3072 3072 3072 - 18 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 18 atomic_transactions Atomic Transactions 0 0 0 - 18 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 18 inst_replay_overhead Instruction Replay Overhead 0.062283 0.076761 0.072305 - 18 shared_replay_overhead Shared Memory Replay Overhead 0.059408 0.074890 0.071095 - 18 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.033795 0.033817 0.033803 - 18 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 18 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 18 dram_read_throughput Device Memory Read Throughput 38.567GB/s 40.303GB/s 39.221GB/s - 18 dram_write_throughput Device Memory Write Throughput 40.306GB/s 41.491GB/s 40.932GB/s - 18 gst_throughput Global Store Throughput 47.239GB/s 48.318GB/s 47.812GB/s - 18 gld_throughput Global Load Throughput 38.231GB/s 39.900GB/s 38.869GB/s - 18 warp_execution_efficiency Warp Execution Efficiency 99.67% 99.67% 99.67% - 18 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 18 gld_efficiency Global Memory Load Efficiency 99.63% 99.73% 99.70% - 18 gst_efficiency Global Memory Store Efficiency 79.31% 83.13% 81.05% - 18 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 18 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 0.05% 0.11% 0.09% - 18 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 18 l2_l1_read_throughput L2 Throughput (L1 Reads) 38.196GB/s 39.863GB/s 38.830GB/s - 18 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% - 18 issued_ipc Issued IPC 1.418839 1.469003 1.441042 - 18 issue_slot_utilization Issue Slot Utilization 49.58% 51.03% 50.26% - 18 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 18 sysmem_write_transactions System Memory Write Transactions 0 2 0 - 18 l2_read_transactions L2 Read Transactions 6833816 8491761 7936301 - 18 l2_write_transactions L2 Write Transactions 8200235 10684121 9774157 - 18 tex_cache_transactions Texture Cache Transactions 0 0 0 - 18 dram_read_transactions Device Memory Read Transactions 6898501 8584971 8015700 - 18 dram_write_transactions Device Memory Write Transactions 6979169 9186921 8368062 - 18 l2_read_throughput L2 Throughput (Reads) 38.198GB/s 39.865GB/s 38.833GB/s - 18 l2_write_throughput L2 Throughput (Writes) 47.239GB/s 48.318GB/s 47.812GB/s - 18 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 18 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 9.6387KB/s 548.000B/s - 18 cf_issued Issued Control-Flow Instructions 476482 591532 552998 - 18 cf_executed Executed Control-Flow Instructions 476482 591532 552998 - 18 ldst_issued Issued Load/Store Instructions 17166159 22239594 20522434 - 18 ldst_executed Executed Load/Store Instructions 14014278 17419758 16279171 - 18 l1_shared_utilization L1/Shared Memory Utilization High (7) High (7) High (7) - 18 l2_utilization L2 Cache Utilization Mid (5) Mid (5) Mid (5) - 18 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 18 dram_utilization Device Memory Utilization High (7) High (7) High (7) - 18 sysmem_utilization System Memory Utilization Idle (0) Low (1) Idle (0) - 18 ldst_fu_utilization Load/Store Function Unit Utilization Low (3) Low (3) Low (3) - 18 alu_fu_utilization Arithmetic Function Unit Utilization Mid (4) Mid (4) Mid (4) - 18 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 18 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 18 inst_issued Instructions Issued 53676601 67621915 62935066 - 18 issue_slots Issue Slots 37294465 47259051 43905454 - 18 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 18 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 6833524 8490772 7935666 - 18 l2_l1_write_transactions L2 Write Transactions (L1 write requests 8200235 10684121 9774155 - 18 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 18 l2_l1_write_throughput L2 Throughput (L1 Writes) 47.239GB/s 48.318GB/s 47.812GB/s - 18 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 18 eligible_warps_per_cycle Eligible Warps Per Active Cycle 1.481715 1.551210 1.511917 - 18 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 18 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 18 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 18 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 6.21% 6.70% 6.40% - 18 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 1.03% 1.21% 1.09% - Kernel: void mtf_thread(unsigned char const *, unsigned char*, int, int) - 3 l1_cache_global_hit_rate L1 Global Hit Rate 52.59% 55.07% 53.88% - 3 branch_efficiency Branch Efficiency 72.98% 73.35% 73.13% - 3 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 3 sm_efficiency Multiprocessor Activity 77.32% 79.44% 78.39% - 3 achieved_occupancy Achieved Occupancy 0.100694 0.102777 0.101658 - 3 gld_requested_throughput Requested Global Load Throughput 223.14MB/s 228.20MB/s 226.18MB/s - 3 gst_requested_throughput Requested Global Store Throughput 223.08MB/s 228.14MB/s 226.12MB/s - 3 ipc Executed IPC 0.437893 0.456574 0.448179 - 3 sm_efficiency_instance Multiprocessor Activity 77.32% 79.44% 78.39% - 3 ipc_instance Executed IPC 0.437893 0.456574 0.448179 - 3 inst_per_warp Instructions per warp 1.3928e+06 1.4175e+06 1.4055e+06 - 3 gld_transactions Global Load Transactions 11390288 13638500 12680064 - 3 gst_transactions Global Store Transactions 11386880 13631488 12675754 - 3 local_load_transactions Local Load Transactions 0 0 0 - 3 local_store_transactions Local Store Transactions 0 0 0 - 3 shared_load_transactions Shared Load Transactions 4314856 5093580 4750556 - 3 shared_store_transactions Shared Store Transactions 7609976 8912500 8393533 - 3 gld_transactions_per_request Global Load Transactions Per Request 3.373574 3.600707 3.508246 - 3 gst_transactions_per_request Global Store Transactions Per Request 3.372364 3.598999 3.506462 - 3 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 3 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 3 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 0.947733 1.027938 0.986673 - 3 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 0.966429 1.033013 0.995163 - 3 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_load_throughput Shared Memory Load Throughput 10.155GB/s 10.965GB/s 10.598GB/s - 3 shared_store_throughput Shared Memory Store Throughput 18.154GB/s 19.339GB/s 18.724GB/s - 3 shared_efficiency Shared Memory Efficiency 11.08% 11.87% 11.48% - 3 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 3 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 3 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 3 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 26.96% 27.25% 27.13% - 3 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 54.12% 54.92% 54.46% - 3 stall_memory_dependency Issue Stall Reasons (Data Request) 6.17% 6.33% 6.24% - 3 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% - 3 inst_executed Instructions Executed 119068686 146266743 136727219 - 3 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 3 stall_other Issue Stall Reasons (Other) 11.90% 12.47% 12.17% - 3 inst_fp_32 FP Instructions(Single) 0 0 0 - 3 inst_fp_64 FP Instructions(Double) 0 0 0 - 3 inst_integer Integer Instructions 549336819 669466620 628672677 - 3 inst_bit_convert Bit-Convert Instructions 549310002 669433382 628641616 - 3 inst_control Control-Flow Instructions 168476896 205007454 192609512 - 3 inst_compute_ld_st Load/Store Instructions 190597390 232437202 218225466 - 3 inst_misc Misc Instructions 33096393 41040006 38326263 - 3 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 - 3 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 3 atomic_transactions Atomic Transactions 0 0 0 - 3 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 3 inst_replay_overhead Instruction Replay Overhead 0.132005 0.134630 0.133554 - 3 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 3 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.040868 0.045371 0.042784 - 3 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 3 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 dram_read_throughput Device Memory Read Throughput 739.12MB/s 743.65MB/s 741.48MB/s - 3 dram_write_throughput Device Memory Write Throughput 6.9540GB/s 7.1162GB/s 7.0516GB/s - 3 gst_throughput Global Store Throughput 6.9713GB/s 7.1295GB/s 7.0663GB/s - 3 gld_throughput Global Load Throughput 27.270GB/s 28.945GB/s 28.270GB/s - 3 warp_execution_efficiency Warp Execution Efficiency 41.32% 42.06% 41.75% - 3 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 3 gld_efficiency Global Memory Load Efficiency 0.75% 0.82% 0.78% - 3 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% - 3 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 49.06% 49.54% 49.26% - 3 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 3 l2_l1_read_throughput L2 Throughput (L1 Reads) 13.026GB/s 13.133GB/s 13.093GB/s - 3 local_memory_overhead Local Memory Overhead 0.00% 2.35% 1.25% - 3 issued_ipc Issued IPC 0.510131 0.516849 0.512457 - 3 issue_slot_utilization Issue Slot Utilization 17.95% 18.16% 18.02% - 3 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 3 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 3 l2_read_transactions L2 Read Transactions 20756812 24994683 23572743 - 3 l2_write_transactions L2 Write Transactions 10973201 13606929 12707174 - 3 tex_cache_transactions Texture Cache Transactions 0 0 0 - 3 dram_read_transactions Device Memory Read Transactions 1143112 1382327 1300808 - 3 dram_write_transactions Device Memory Write Transactions 10945931 13581490 12681016 - 3 l2_read_throughput L2 Throughput (Reads) 13.087GB/s 13.187GB/s 13.123GB/s - 3 l2_write_throughput L2 Throughput (Writes) 6.9713GB/s 7.1295GB/s 7.0663GB/s - 3 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 cf_issued Issued Control-Flow Instructions 15839208 19525098 18216424 - 3 cf_executed Executed Control-Flow Instructions 15839208 19525098 18216424 - 3 ldst_issued Issued Load/Store Instructions 33608987 41526098 38796338 - 3 ldst_executed Executed Load/Store Instructions 17891612 21927546 20523902 - 3 l1_shared_utilization L1/Shared Memory Utilization Low (1) Low (1) Low (1) - 3 l2_utilization L2 Cache Utilization Low (1) Low (1) Low (1) - 3 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 3 dram_utilization Device Memory Utilization Low (1) Low (1) Low (1) - 3 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 3 ldst_fu_utilization Load/Store Function Unit Utilization Low (1) Low (1) Low (1) - 3 alu_fu_utilization Arithmetic Function Unit Utilization Low (2) Low (2) Low (2) - 3 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 3 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 3 inst_issued Instructions Issued 134786301 165868099 154999795 - 3 issue_slots Issue Slots 94731441 116704945 109021477 - 3 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 20653020 25053396 23522314 - 3 l2_l1_write_transactions L2 Write Transactions (L1 write requests 10973184 13606912 12707157 - 3 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 3 l2_l1_write_throughput L2 Throughput (L1 Writes) 6.9713GB/s 7.1295GB/s 7.0663GB/s - 3 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 3 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.382486 0.388008 0.384428 - 3 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 3 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 3 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.00% 0.00% 0.00% - 3 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% - Kernel: void cub::RadixSortScanBinsKernel::Policy520, int>(cub::NullType*, int) - 114 l1_cache_global_hit_rate L1 Global Hit Rate 0.00% 0.00% 0.00% - 114 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% - 114 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 114 sm_efficiency Multiprocessor Activity 6.87% 9.87% 9.68% - 114 achieved_occupancy Achieved Occupancy 0.329016 0.329199 0.329089 - 114 gld_requested_throughput Requested Global Load Throughput 1.9663GB/s 3.0275GB/s 2.9178GB/s - 114 gst_requested_throughput Requested Global Store Throughput 1.9663GB/s 3.0275GB/s 2.9178GB/s - 114 ipc Executed IPC 0.252612 0.262215 0.260029 - 114 sm_efficiency_instance Multiprocessor Activity 6.87% 9.87% 9.68% - 114 ipc_instance Executed IPC 0.252612 0.262215 0.260029 - 114 inst_per_warp Instructions per warp 134.500000 134.500000 134.500000 - 114 gld_transactions Global Load Transactions 1024 1024 1024 - 114 gst_transactions Global Store Transactions 1024 1024 1024 - 114 local_load_transactions Local Load Transactions 0 0 0 - 114 local_store_transactions Local Store Transactions 0 0 0 - 114 shared_load_transactions Shared Load Transactions 880 880 880 - 114 shared_store_transactions Shared Store Transactions 896 896 896 - 114 gld_transactions_per_request Global Load Transactions Per Request 16.000000 16.000000 16.000000 - 114 gst_transactions_per_request Global Store Transactions Per Request 16.000000 16.000000 16.000000 - 114 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 114 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 114 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 5.641026 5.641026 5.641026 - 114 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 5.600000 5.600000 5.600000 - 114 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 114 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 114 shared_load_throughput Shared Memory Load Throughput 6.7593GB/s 10.407GB/s 10.030GB/s - 114 shared_store_throughput Shared Memory Store Throughput 6.8822GB/s 10.596GB/s 10.212GB/s - 114 shared_efficiency Shared Memory Efficiency 17.36% 17.36% 17.36% - 114 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 114 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 114 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 114 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 114 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 114 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 114 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 114 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 114 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 114 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 9.45% 10.76% 9.71% - 114 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 14.15% 14.84% 14.52% - 114 stall_memory_dependency Issue Stall Reasons (Data Request) 27.12% 28.23% 27.58% - 114 stall_sync Issue Stall Reasons (Synchronization) 45.35% 47.00% 46.21% - 114 inst_executed Instructions Executed 2152 2152 2152 - 114 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 114 stall_other Issue Stall Reasons (Other) 1.84% 1.93% 1.89% - 114 inst_fp_32 FP Instructions(Single) 0 0 0 - 114 inst_fp_64 FP Instructions(Double) 0 0 0 - 114 inst_integer Integer Instructions 34944 34944 34944 - 114 inst_bit_convert Bit-Convert Instructions 0 0 0 - 114 inst_control Control-Flow Instructions 2048 2048 2048 - 114 inst_compute_ld_st Load/Store Instructions 13960 13960 13960 - 114 inst_misc Misc Instructions 14200 14200 14200 - 114 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 - 114 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 114 atomic_transactions Atomic Transactions 0 0 0 - 114 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 114 inst_replay_overhead Instruction Replay Overhead 0.151952 0.154740 0.152396 - 114 shared_replay_overhead Shared Memory Replay Overhead 0.237918 0.237918 0.237918 - 114 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.475836 0.475836 0.475836 - 114 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 114 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 114 dram_read_throughput Device Memory Read Throughput 2.4637GB/s 3.8501GB/s 3.6535GB/s - 114 dram_write_throughput Device Memory Write Throughput 1.9663GB/s 3.0275GB/s 2.9178GB/s - 114 gst_throughput Global Store Throughput 1.9663GB/s 3.0275GB/s 2.9178GB/s - 114 gld_throughput Global Load Throughput 7.8654GB/s 12.110GB/s 11.671GB/s - 114 warp_execution_efficiency Warp Execution Efficiency 99.64% 99.64% 99.64% - 114 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 114 gld_efficiency Global Memory Load Efficiency 25.00% 25.00% 25.00% - 114 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00% - 114 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 114 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 0.00% 0.00% 0.00% - 114 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 114 l2_l1_read_throughput L2 Throughput (L1 Reads) 1.9663GB/s 3.0275GB/s 2.9178GB/s - 114 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% - 114 issued_ipc Issued IPC 0.293408 0.302277 0.299569 - 114 issue_slot_utilization Issue Slot Utilization 12.09% 12.46% 12.35% - 114 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 114 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 114 l2_read_transactions L2 Read Transactions 1088 1115 1090 - 114 l2_write_transactions L2 Write Transactions 1024 1024 1024 - 114 tex_cache_transactions Texture Cache Transactions 0 0 0 - 114 dram_read_transactions Device Memory Read Transactions 1095 1335 1282 - 114 dram_write_transactions Device Memory Write Transactions 1024 1024 1024 - 114 l2_read_throughput L2 Throughput (Reads) 2.0892GB/s 3.2249GB/s 3.1084GB/s - 114 l2_write_throughput L2 Throughput (Writes) 1.9663GB/s 3.0275GB/s 2.9178GB/s - 114 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 114 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 114 cf_issued Issued Control-Flow Instructions 240 240 240 - 114 cf_executed Executed Control-Flow Instructions 240 240 240 - 114 ldst_issued Issued Load/Store Instructions 947 953 947 - 114 ldst_executed Executed Load/Store Instructions 700 700 700 - 114 l1_shared_utilization L1/Shared Memory Utilization Low (1) Low (1) Low (1) - 114 l2_utilization L2 Cache Utilization Low (1) Low (1) Low (1) - 114 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 114 dram_utilization Device Memory Utilization Low (1) Low (1) Low (1) - 114 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 114 ldst_fu_utilization Load/Store Function Unit Utilization Low (1) Low (1) Low (1) - 114 alu_fu_utilization Arithmetic Function Unit Utilization Low (1) Low (1) Low (1) - 114 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 114 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 114 inst_issued Instructions Issued 2473 2485 2479 - 114 issue_slots Issue Slots 2040 2049 2043 - 114 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 114 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 1024 1024 1024 - 114 l2_l1_write_transactions L2 Write Transactions (L1 write requests 1024 1024 1024 - 114 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 114 l2_l1_write_throughput L2 Throughput (L1 Writes) 1.9663GB/s 3.0275GB/s 2.9178GB/s - 114 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 114 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.418510 0.436337 0.427997 - 114 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 114 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 114 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 114 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.04% 0.16% 0.09% - 114 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% - Kernel: bsc_st567_encode_cuda_presort(unsigned char*, __int64*, int) - 12 l1_cache_global_hit_rate L1 Global Hit Rate 62.43% 62.46% 62.44% - 12 branch_efficiency Branch Efficiency 91.67% 91.67% 91.67% - 12 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 12 sm_efficiency Multiprocessor Activity 94.87% 97.32% 95.52% - 12 achieved_occupancy Achieved Occupancy 0.826462 0.833351 0.828500 - 12 gld_requested_throughput Requested Global Load Throughput 7.1789GB/s 7.4329GB/s 7.2468GB/s - 12 gst_requested_throughput Requested Global Store Throughput 55.134GB/s 57.085GB/s 55.656GB/s - 12 ipc Executed IPC 1.192946 1.209010 1.198058 - 12 sm_efficiency_instance Multiprocessor Activity 94.87% 97.32% 95.52% - 12 ipc_instance Executed IPC 1.192946 1.209010 1.198058 - 12 inst_per_warp Instructions per warp 7.8419e+04 9.7476e+04 9.1093e+04 - 12 gld_transactions Global Load Transactions 1136020 1412432 1319826 - 12 gst_transactions Global Store Transactions 1704288 2118480 1979760 - 12 local_load_transactions Local Load Transactions 0 0 0 - 12 local_store_transactions Local Store Transactions 0 0 0 - 12 shared_load_transactions Shared Load Transactions 6817152 8473920 7919040 - 12 shared_store_transactions Shared Store Transactions 1136192 1412320 1319840 - 12 gld_transactions_per_request Global Load Transactions Per Request 0.999870 1.000158 1.000003 - 12 gst_transactions_per_request Global Store Transactions Per Request 2.000023 2.000042 2.000033 - 12 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 12 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 12 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 1.000011 1.000021 1.000016 - 12 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 1.000011 1.000021 1.000016 - 12 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 12 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 12 shared_load_throughput Shared Memory Load Throughput 220.54GB/s 228.34GB/s 222.63GB/s - 12 shared_store_throughput Shared Memory Store Throughput 36.757GB/s 38.057GB/s 37.104GB/s - 12 shared_efficiency Shared Memory Efficiency 96.87% 96.87% 96.87% - 12 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 12 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 12 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 12 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 12 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 12 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 12 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 12 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 12 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 12 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 10.31% 10.40% 10.36% - 12 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 17.56% 17.77% 17.66% - 12 stall_memory_dependency Issue Stall Reasons (Data Request) 23.74% 24.31% 23.90% - 12 stall_sync Issue Stall Reasons (Synchronization) 23.54% 24.19% 23.98% - 12 inst_executed Instructions Executed 30113060 37430664 34979802 - 12 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 12 stall_other Issue Stall Reasons (Other) 3.99% 4.03% 4.01% - 12 inst_fp_32 FP Instructions(Single) 0 0 0 - 12 inst_fp_64 FP Instructions(Double) 0 0 0 - 12 inst_integer Integer Instructions 326283677 405569226 379014374 - 12 inst_bit_convert Bit-Convert Instructions 0 0 0 - 12 inst_control Control-Flow Instructions 53541917 66554826 62196454 - 12 inst_compute_ld_st Load/Store Instructions 302220688 375672864 351071765 - 12 inst_misc Misc Instructions 189943517 236092746 220636134 - 12 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 - 12 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 12 atomic_transactions Atomic Transactions 0 0 0 - 12 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 12 inst_replay_overhead Instruction Replay Overhead 0.130367 0.131460 0.130912 - 12 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 12 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.014165 0.014175 0.014170 - 12 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 12 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 12 dram_read_throughput Device Memory Read Throughput 8.4255GB/s 8.8107GB/s 8.5429GB/s - 12 dram_write_throughput Device Memory Write Throughput 55.134GB/s 57.085GB/s 55.656GB/s - 12 gst_throughput Global Store Throughput 55.134GB/s 57.085GB/s 55.656GB/s - 12 gld_throughput Global Load Throughput 36.752GB/s 38.058GB/s 37.104GB/s - 12 warp_execution_efficiency Warp Execution Efficiency 98.85% 98.85% 98.85% - 12 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 12 gld_efficiency Global Memory Load Efficiency 19.53% 19.53% 19.53% - 12 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00% - 12 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 12 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 20.94% 21.62% 21.18% - 12 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 12 l2_l1_read_throughput L2 Throughput (L1 Reads) 13.796GB/s 14.284GB/s 13.926GB/s - 12 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% - 12 issued_ipc Issued IPC 1.351181 1.368566 1.356106 - 12 issue_slot_utilization Issue Slot Utilization 59.11% 59.86% 59.32% - 12 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 12 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 12 l2_read_transactions L2 Read Transactions 1705728 2121263 1981652 - 12 l2_write_transactions L2 Write Transactions 6817008 8473835 7918913 - 12 tex_cache_transactions Texture Cache Transactions 0 0 0 - 12 dram_read_transactions Device Memory Read Transactions 1039824 1307889 1214876 - 12 dram_write_transactions Device Memory Write Transactions 6817008 8473826 7918915 - 12 l2_read_throughput L2 Throughput (Reads) 13.797GB/s 14.284GB/s 13.928GB/s - 12 l2_write_throughput L2 Throughput (Writes) 55.134GB/s 57.085GB/s 55.656GB/s - 12 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 12 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 12 cf_issued Issued Control-Flow Instructions 3409272 4237680 3960224 - 12 cf_executed Executed Control-Flow Instructions 3409272 4237680 3960224 - 12 ldst_issued Issued Load/Store Instructions 16982291 21162340 19756781 - 12 ldst_executed Executed Load/Store Instructions 13065932 16241496 15177914 - 12 l1_shared_utilization L1/Shared Memory Utilization Mid (5) Mid (5) Mid (5) - 12 l2_utilization L2 Cache Utilization Mid (4) Mid (4) Mid (4) - 12 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 12 dram_utilization Device Memory Utilization Mid (6) Mid (6) Mid (6) - 12 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 12 ldst_fu_utilization Load/Store Function Unit Utilization Mid (4) Mid (4) Mid (4) - 12 alu_fu_utilization Arithmetic Function Unit Utilization Low (2) Low (2) Low (2) - 12 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 12 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 12 inst_issued Instructions Issued 34057560 42360955 39562682 - 12 issue_slots Issue Slots 29795010 37062895 34611442 - 12 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 12 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 1705656 2120304 1981483 - 12 l2_l1_write_transactions L2 Write Transactions (L1 write requests 6817008 8473824 7918912 - 12 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 12 l2_l1_write_throughput L2 Throughput (L1 Writes) 55.134GB/s 57.085GB/s 55.656GB/s - 12 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 12 eligible_warps_per_cycle Eligible Warps Per Active Cycle 5.174226 5.237236 5.202852 - 12 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 12 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 12 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 12 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 20.16% 20.44% 20.34% - 12 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% - Kernel: void mtf_thread_by4(unsigned char const *, unsigned char*, int, int) - 3 l1_cache_global_hit_rate L1 Global Hit Rate 28.29% 30.31% 29.29% - 3 branch_efficiency Branch Efficiency 73.48% 73.60% 73.54% - 3 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 3 sm_efficiency Multiprocessor Activity 88.59% 89.60% 88.99% - 3 achieved_occupancy Achieved Occupancy 0.122602 0.136935 0.131952 - 3 gld_requested_throughput Requested Global Load Throughput 328.13MB/s 375.45MB/s 353.70MB/s - 3 gst_requested_throughput Requested Global Store Throughput 328.05MB/s 375.35MB/s 353.61MB/s - 3 ipc Executed IPC 0.417571 0.471507 0.452457 - 3 sm_efficiency_instance Multiprocessor Activity 88.59% 89.60% 88.99% - 3 ipc_instance Executed IPC 0.417571 0.471507 0.452457 - 3 inst_per_warp Instructions per warp 1.0125e+06 1.0326e+06 1.0216e+06 - 3 gld_transactions Global Load Transactions 11601644 14686076 13639786 - 3 gst_transactions Global Store Transactions 11534336 15106048 13773482 - 3 local_load_transactions Local Load Transactions 0 0 0 - 3 local_store_transactions Local Store Transactions 0 0 0 - 3 shared_load_transactions Shared Load Transactions 3845624 5079364 4644701 - 3 shared_store_transactions Shared Store Transactions 3846328 5080260 4645533 - 3 gld_transactions_per_request Global Load Transactions Per Request 3.529992 3.658906 3.614815 - 3 gst_transactions_per_request Global Store Transactions Per Request 3.508883 3.759407 3.646206 - 3 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 3 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 3 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 0.999335 1.079308 1.051450 - 3 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 0.999344 1.079307 1.051453 - 3 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_load_throughput Shared Memory Load Throughput 14.371GB/s 17.356GB/s 16.135GB/s - 3 shared_store_throughput Shared Memory Store Throughput 14.373GB/s 17.359GB/s 16.138GB/s - 3 shared_efficiency Shared Memory Efficiency 14.34% 15.46% 14.76% - 3 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 3 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 3 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 3 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 22.86% 24.01% 23.54% - 3 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 47.01% 47.54% 47.34% - 3 stall_memory_dependency Issue Stall Reasons (Data Request) 19.31% 20.53% 19.74% - 3 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% - 3 inst_executed Instructions Executed 86740102 106038282 99358379 - 3 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 3 stall_other Issue Stall Reasons (Other) 9.12% 9.59% 9.37% - 3 inst_fp_32 FP Instructions(Single) 0 0 0 - 3 inst_fp_64 FP Instructions(Double) 0 0 0 - 3 inst_integer Integer Instructions 232210001 285078880 267066231 - 3 inst_bit_convert Bit-Convert Instructions 221661508 271772571 254706257 - 3 inst_control Control-Flow Instructions 55989180 69155181 64662300 - 3 inst_compute_ld_st Load/Store Instructions 174075203 213604640 200137955 - 3 inst_misc Misc Instructions 91529969 112658715 105455913 - 3 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 - 3 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 3 atomic_transactions Atomic Transactions 0 0 0 - 3 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 3 inst_replay_overhead Instruction Replay Overhead 0.185231 0.187518 0.186680 - 3 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 3 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.094068 0.099221 0.096816 - 3 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 3 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 dram_read_throughput Device Memory Read Throughput 2.0740GB/s 2.1330GB/s 2.1078GB/s - 3 dram_write_throughput Device Memory Write Throughput 10.141GB/s 11.621GB/s 10.942GB/s - 3 gst_throughput Global Store Throughput 10.251GB/s 11.730GB/s 11.050GB/s - 3 gld_throughput Global Load Throughput 43.354GB/s 50.697GB/s 47.425GB/s - 3 warp_execution_efficiency Warp Execution Efficiency 30.71% 31.89% 31.46% - 3 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 3 gld_efficiency Global Memory Load Efficiency 0.72% 0.74% 0.73% - 3 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% - 3 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 47.68% 48.02% 47.80% - 3 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 3 l2_l1_read_throughput L2 Throughput (L1 Reads) 29.192GB/s 32.891GB/s 31.326GB/s - 3 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% - 3 issued_ipc Issued IPC 0.495058 0.559069 0.536660 - 3 issue_slot_utilization Issue Slot Utilization 17.47% 19.74% 18.95% - 3 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 3 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 3 l2_read_transactions L2 Read Transactions 31423940 38743476 36060555 - 3 l2_write_transactions L2 Write Transactions 10973199 13606927 12707172 - 3 tex_cache_transactions Texture Cache Transactions 0 0 0 - 3 dram_read_transactions Device Memory Read Transactions 2283174 2526552 2417635 - 3 dram_write_transactions Device Memory Write Transactions 10854816 13479413 12583386 - 3 l2_read_throughput L2 Throughput (Reads) 29.357GB/s 32.929GB/s 31.363GB/s - 3 l2_write_throughput L2 Throughput (Writes) 10.251GB/s 11.730GB/s 11.050GB/s - 3 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 cf_issued Issued Control-Flow Instructions 17540307 21443175 20091825 - 3 cf_executed Executed Control-Flow Instructions 17540307 21443175 20091825 - 3 ldst_issued Issued Load/Store Instructions 30329915 37304843 34898422 - 3 ldst_executed Executed Load/Store Instructions 14270810 17448774 16344045 - 3 l1_shared_utilization L1/Shared Memory Utilization Low (1) Low (2) Low (1) - 3 l2_utilization L2 Cache Utilization Low (2) Low (3) Low (2) - 3 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 3 dram_utilization Device Memory Utilization Low (2) Low (2) Low (2) - 3 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 3 ldst_fu_utilization Load/Store Function Unit Utilization Low (1) Low (1) Low (1) - 3 alu_fu_utilization Arithmetic Function Unit Utilization Low (1) Low (1) Low (1) - 3 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 3 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 3 inst_issued Instructions Issued 102804638 125891465 117912543 - 3 issue_slots Issue Slots 72550413 88911792 83252826 - 3 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 31246896 38854160 36024060 - 3 l2_l1_write_transactions L2 Write Transactions (L1 write requests 10973184 13606912 12707157 - 3 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 3 l2_l1_write_throughput L2 Throughput (L1 Writes) 10.251GB/s 11.730GB/s 11.050GB/s - 3 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 3 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.376025 0.423953 0.407575 - 3 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 3 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 3 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.00% 0.00% 0.00% - 3 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% - Kernel: void mtf_thread_by4(unsigned char const *, unsigned char*, int, int) - 3 l1_cache_global_hit_rate L1 Global Hit Rate 39.31% 42.12% 40.53% - 3 branch_efficiency Branch Efficiency 73.24% 73.43% 73.32% - 3 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 3 sm_efficiency Multiprocessor Activity 81.06% 83.14% 82.07% - 3 achieved_occupancy Achieved Occupancy 0.101600 0.105269 0.103168 - 3 gld_requested_throughput Requested Global Load Throughput 305.28MB/s 316.56MB/s 309.61MB/s - 3 gst_requested_throughput Requested Global Store Throughput 305.20MB/s 316.48MB/s 309.53MB/s - 3 ipc Executed IPC 0.432499 0.453078 0.439754 - 3 sm_efficiency_instance Multiprocessor Activity 81.06% 83.14% 82.07% - 3 ipc_instance Executed IPC 0.432499 0.453078 0.439754 - 3 inst_per_warp Instructions per warp 1.0481e+06 1.0649e+06 1.0560e+06 - 3 gld_transactions Global Load Transactions 11386940 13112596 12363532 - 3 gst_transactions Global Store Transactions 11386880 13107200 12358997 - 3 local_load_transactions Local Load Transactions 0 0 0 - 3 local_store_transactions Local Store Transactions 0 0 0 - 3 shared_load_transactions Shared Load Transactions 4811848 5344716 5159276 - 3 shared_store_transactions Shared Store Transactions 4817480 5350860 5165334 - 3 gld_transactions_per_request Global Load Transactions Per Request 2.986455 3.299917 3.135984 - 3 gst_transactions_per_request Global Store Transactions Per Request 2.984000 3.299326 3.134353 - 3 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 3 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 3 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 0.922167 1.023100 0.957570 - 3 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 0.922168 1.023128 0.957593 - 3 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_load_throughput Shared Memory Load Throughput 14.992GB/s 17.347GB/s 15.803GB/s - 3 shared_store_throughput Shared Memory Store Throughput 15.010GB/s 17.368GB/s 15.821GB/s - 3 shared_efficiency Shared Memory Efficiency 14.61% 16.05% 15.56% - 3 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 3 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 3 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 3 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 28.48% 28.83% 28.69% - 3 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 53.06% 53.50% 53.29% - 3 stall_memory_dependency Issue Stall Reasons (Data Request) 5.11% 5.28% 5.18% - 3 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% - 3 inst_executed Instructions Executed 89455397 109726736 102728411 - 3 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 3 stall_other Issue Stall Reasons (Other) 12.79% 12.91% 12.84% - 3 inst_fp_32 FP Instructions(Single) 0 0 0 - 3 inst_fp_64 FP Instructions(Double) 0 0 0 - 3 inst_integer Integer Instructions 249984705 305590262 286686313 - 3 inst_bit_convert Bit-Convert Instructions 259316746 316185984 296871375 - 3 inst_control Control-Flow Instructions 62180173 76133956 71387755 - 3 inst_compute_ld_st Load/Store Instructions 201567895 246040792 230929521 - 3 inst_misc Misc Instructions 59722190 73568310 68848639 - 3 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 - 3 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 3 atomic_transactions Atomic Transactions 0 0 0 - 3 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 3 inst_replay_overhead Instruction Replay Overhead 0.170303 0.173240 0.172166 - 3 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 3 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.069572 0.073683 0.071749 - 3 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 3 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 dram_read_throughput Device Memory Read Throughput 878.23MB/s 908.50MB/s 889.41MB/s - 3 dram_write_throughput Device Memory Write Throughput 9.4099GB/s 9.7889GB/s 9.5579GB/s - 3 gst_throughput Global Store Throughput 9.5376GB/s 9.8899GB/s 9.6729GB/s - 3 gld_throughput Global Load Throughput 35.501GB/s 41.051GB/s 37.832GB/s - 3 warp_execution_efficiency Warp Execution Efficiency 32.55% 32.75% 32.66% - 3 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 3 gld_efficiency Global Memory Load Efficiency 0.75% 0.84% 0.80% - 3 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% - 3 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 49.25% 49.42% 49.31% - 3 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 3 l2_l1_read_throughput L2 Throughput (L1 Reads) 21.639GB/s 22.855GB/s 22.148GB/s - 3 local_memory_overhead Local Memory Overhead 0.00% 0.33% 0.11% - 3 issued_ipc Issued IPC 0.516615 0.530300 0.522357 - 3 issue_slot_utilization Issue Slot Utilization 18.98% 19.46% 19.18% - 3 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 3 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 3 l2_read_transactions L2 Read Transactions 25296493 31280193 29157552 - 3 l2_write_transactions L2 Write Transactions 10973199 13606918 12707169 - 3 tex_cache_transactions Texture Cache Transactions 0 0 0 - 3 dram_read_transactions Device Memory Read Transactions 984386 1221239 1141103 - 3 dram_write_transactions Device Memory Write Transactions 10861117 13441697 12554305 - 3 l2_read_throughput L2 Throughput (Reads) 21.778GB/s 22.799GB/s 22.203GB/s - 3 l2_write_throughput L2 Throughput (Writes) 9.5376GB/s 9.8899GB/s 9.6730GB/s - 3 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 cf_issued Issued Control-Flow Instructions 13977536 17161180 16057957 - 3 cf_executed Executed Control-Flow Instructions 13977536 17161180 16057957 - 3 ldst_issued Issued Load/Store Instructions 31547686 39003537 36438565 - 3 ldst_executed Executed Load/Store Instructions 16313734 20031147 18742887 - 3 l1_shared_utilization L1/Shared Memory Utilization Low (1) Low (1) Low (1) - 3 l2_utilization L2 Cache Utilization Low (2) Low (2) Low (2) - 3 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 3 dram_utilization Device Memory Utilization Low (1) Low (1) Low (1) - 3 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 3 ldst_fu_utilization Load/Store Function Unit Utilization Low (1) Low (1) Low (1) - 3 alu_fu_utilization Arithmetic Function Unit Utilization Low (1) Low (1) Low (1) - 3 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 3 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 3 inst_issued Instructions Issued 104689554 128702912 120427996 - 3 issue_slots Issue Slots 76846261 94570409 88462002 - 3 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 25358344 31162660 29073061 - 3 l2_l1_write_transactions L2 Write Transactions (L1 write requests 10973184 13606912 12707157 - 3 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 3 l2_l1_write_throughput L2 Throughput (L1 Writes) 9.5376GB/s 9.8899GB/s 9.6729GB/s - 3 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 3 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.397502 0.414427 0.405206 - 3 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 3 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 3 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.00% 0.00% 0.00% - 3 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% - Kernel: void cub::RadixSortScanBinsKernel::Policy520, int>(unsigned char*, int) - 39 l1_cache_global_hit_rate L1 Global Hit Rate 0.00% 0.00% 0.00% - 39 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% - 39 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 39 sm_efficiency Multiprocessor Activity 8.37% 8.48% 8.41% - 39 achieved_occupancy Achieved Occupancy 0.325600 0.325900 0.325707 - 39 gld_requested_throughput Requested Global Load Throughput 2.2928GB/s 15.911GB/s 2.7631GB/s - 39 gst_requested_throughput Requested Global Store Throughput 2.2928GB/s 15.911GB/s 2.7631GB/s - 39 ipc Executed IPC 0.251322 0.260583 0.258129 - 39 sm_efficiency_instance Multiprocessor Activity 8.37% 8.48% 8.41% - 39 ipc_instance Executed IPC 0.251322 0.260583 0.258129 - 39 inst_per_warp Instructions per warp 74.250000 74.250000 74.250000 - 39 gld_transactions Global Load Transactions 512 512 512 - 39 gst_transactions Global Store Transactions 512 512 512 - 39 local_load_transactions Local Load Transactions 0 0 0 - 39 local_store_transactions Local Store Transactions 0 0 0 - 39 shared_load_transactions Shared Load Transactions 440 440 440 - 39 shared_store_transactions Shared Store Transactions 448 448 448 - 39 gld_transactions_per_request Global Load Transactions Per Request 16.000000 16.000000 16.000000 - 39 gst_transactions_per_request Global Store Transactions Per Request 16.000000 16.000000 16.000000 - 39 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 39 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 39 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 5.641026 5.641026 5.641026 - 39 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 5.600000 5.600000 5.600000 - 39 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 39 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 39 shared_load_throughput Shared Memory Load Throughput 7.8816GB/s 54.695GB/s 9.4981GB/s - 39 shared_store_throughput Shared Memory Store Throughput 8.0249GB/s 55.689GB/s 9.6708GB/s - 39 shared_efficiency Shared Memory Efficiency 17.36% 17.36% 17.36% - 39 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 39 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 39 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 39 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 39 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 39 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 39 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 39 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 39 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 39 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 15.06% 16.60% 15.42% - 39 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 13.63% 14.28% 13.90% - 39 stall_memory_dependency Issue Stall Reasons (Data Request) 29.59% 31.65% 30.43% - 39 stall_sync Issue Stall Reasons (Synchronization) 37.70% 39.68% 38.41% - 39 inst_executed Instructions Executed 1188 1188 1188 - 39 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 39 stall_other Issue Stall Reasons (Other) 1.61% 1.72% 1.68% - 39 inst_fp_32 FP Instructions(Single) 0 0 0 - 39 inst_fp_64 FP Instructions(Double) 0 0 0 - 39 inst_integer Integer Instructions 18752 18752 18752 - 39 inst_bit_convert Bit-Convert Instructions 0 0 0 - 39 inst_control Control-Flow Instructions 1024 1024 1024 - 39 inst_compute_ld_st Load/Store Instructions 6980 6980 6980 - 39 inst_misc Misc Instructions 8892 8892 8892 - 39 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 - 39 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 39 atomic_transactions Atomic Transactions 0 0 0 - 39 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 39 inst_replay_overhead Instruction Replay Overhead 0.167508 0.175084 0.172688 - 39 shared_replay_overhead Shared Memory Replay Overhead 0.215488 0.215488 0.215488 - 39 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.430976 0.430976 0.430976 - 39 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 39 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 39 dram_read_throughput Device Memory Read Throughput 2.9531GB/s 21.722GB/s 3.8022GB/s - 39 dram_write_throughput Device Memory Write Throughput 2.2928GB/s 15.911GB/s 2.7631GB/s - 39 gst_throughput Global Store Throughput 2.2928GB/s 15.911GB/s 2.7631GB/s - 39 gld_throughput Global Load Throughput 9.1713GB/s 63.645GB/s 11.052GB/s - 39 warp_execution_efficiency Warp Execution Efficiency 99.67% 99.67% 99.67% - 39 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 39 gld_efficiency Global Memory Load Efficiency 25.00% 25.00% 25.00% - 39 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00% - 39 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 39 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 0.00% 0.00% 0.00% - 39 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 39 l2_l1_read_throughput L2 Throughput (L1 Reads) 2.2928GB/s 15.911GB/s 2.7631GB/s - 39 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% - 39 issued_ipc Issued IPC 0.297332 0.305818 0.303132 - 39 issue_slot_utilization Issue Slot Utilization 12.11% 12.46% 12.35% - 39 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 39 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 39 l2_read_transactions L2 Read Transactions 584 592 586 - 39 l2_write_transactions L2 Write Transactions 512 512 512 - 39 tex_cache_transactions Texture Cache Transactions 0 0 0 - 39 dram_read_transactions Device Memory Read Transactions 631 767 705 - 39 dram_write_transactions Device Memory Write Transactions 512 512 512 - 39 l2_read_throughput L2 Throughput (Reads) 2.6403GB/s 18.397GB/s 3.1696GB/s - 39 l2_write_throughput L2 Throughput (Writes) 2.2928GB/s 15.911GB/s 2.7631GB/s - 39 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 39 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 39 cf_issued Issued Control-Flow Instructions 136 136 136 - 39 cf_executed Executed Control-Flow Instructions 136 136 136 - 39 ldst_issued Issued Load/Store Instructions 475 478 475 - 39 ldst_executed Executed Load/Store Instructions 350 350 350 - 39 l1_shared_utilization L1/Shared Memory Utilization Low (1) Low (1) Low (1) - 39 l2_utilization L2 Cache Utilization Low (1) Low (1) Low (1) - 39 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 39 dram_utilization Device Memory Utilization Low (1) Mid (4) Low (1) - 39 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 39 ldst_fu_utilization Load/Store Function Unit Utilization Low (1) Low (1) Low (1) - 39 alu_fu_utilization Arithmetic Function Unit Utilization Low (1) Low (1) Low (1) - 39 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 39 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 39 inst_issued Instructions Issued 1393 1396 1393 - 39 issue_slots Issue Slots 1135 1138 1135 - 39 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 39 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 512 512 512 - 39 l2_l1_write_transactions L2 Write Transactions (L1 write requests 512 512 512 - 39 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 39 l2_l1_write_throughput L2 Throughput (L1 Writes) 2.2928GB/s 15.911GB/s 2.7631GB/s - 39 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 39 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.424910 0.449465 0.436826 - 39 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 39 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 39 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 39 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.06% 0.27% 0.17% - 39 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% - Kernel: void mtf_thread(unsigned char const *, unsigned char*, int, int) - 3 l1_cache_global_hit_rate L1 Global Hit Rate 32.11% 38.91% 36.46% - 3 branch_efficiency Branch Efficiency 71.21% 71.51% 71.39% - 3 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 3 sm_efficiency Multiprocessor Activity 82.20% 84.15% 83.29% - 3 achieved_occupancy Achieved Occupancy 0.121224 0.135052 0.130023 - 3 gld_requested_throughput Requested Global Load Throughput 294.08MB/s 339.18MB/s 319.67MB/s - 3 gst_requested_throughput Requested Global Store Throughput 294.00MB/s 339.10MB/s 319.59MB/s - 3 ipc Executed IPC 0.479023 0.534700 0.514863 - 3 sm_efficiency_instance Multiprocessor Activity 82.20% 84.15% 83.29% - 3 ipc_instance Executed IPC 0.479023 0.534700 0.514863 - 3 inst_per_warp Instructions per warp 1.1951e+06 1.2236e+06 1.2090e+06 - 3 gld_transactions Global Load Transactions 11011136 14060940 12957194 - 3 gst_transactions Global Store Transactions 11010048 14155776 13074432 - 3 local_load_transactions Local Load Transactions 0 0 0 - 3 local_store_transactions Local Store Transactions 0 0 0 - 3 shared_load_transactions Shared Load Transactions 3240216 4387804 3992226 - 3 shared_store_transactions Shared Store Transactions 5930128 7896356 7239426 - 3 gld_transactions_per_request Global Load Transactions Per Request 4.010973 4.187320 4.109500 - 3 gst_transactions_per_request Global Store Transactions Per Request 4.009857 4.236070 4.143832 - 3 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 3 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 3 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 0.960207 1.079231 1.031589 - 3 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 0.968844 1.065904 1.029936 - 3 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_load_throughput Shared Memory Load Throughput 10.852GB/s 13.735GB/s 12.532GB/s - 3 shared_store_throughput Shared Memory Store Throughput 19.861GB/s 24.717GB/s 22.729GB/s - 3 shared_efficiency Shared Memory Efficiency 10.95% 12.03% 11.34% - 3 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 3 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 3 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 3 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 25.15% 25.37% 25.23% - 3 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 52.75% 53.66% 53.35% - 3 stall_memory_dependency Issue Stall Reasons (Data Request) 8.58% 9.43% 8.87% - 3 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% - 3 inst_executed Instructions Executed 102785237 125665260 117580331 - 3 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 3 stall_other Issue Stall Reasons (Other) 12.42% 12.66% 12.56% - 3 inst_fp_32 FP Instructions(Single) 0 0 0 - 3 inst_fp_64 FP Instructions(Double) 0 0 0 - 3 inst_integer Integer Instructions 543361177 665896719 624167030 - 3 inst_bit_convert Bit-Convert Instructions 532361176 652256569 611428813 - 3 inst_control Control-Flow Instructions 141134228 172757334 161991676 - 3 inst_compute_ld_st Load/Store Instructions 163104698 200001050 187433900 - 3 inst_misc Misc Instructions 21975864 27250384 25448477 - 3 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 - 3 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 3 atomic_transactions Atomic Transactions 0 0 0 - 3 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 3 inst_replay_overhead Instruction Replay Overhead 0.166507 0.169134 0.167896 - 3 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 3 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.068349 0.072749 0.069861 - 3 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 3 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 dram_read_throughput Device Memory Read Throughput 1.5879GB/s 1.6324GB/s 1.6137GB/s - 3 dram_write_throughput Device Memory Write Throughput 9.0330GB/s 10.455GB/s 9.8397GB/s - 3 gst_throughput Global Store Throughput 9.1876GB/s 10.597GB/s 9.9871GB/s - 3 gld_throughput Global Load Throughput 36.878GB/s 43.195GB/s 40.713GB/s - 3 warp_execution_efficiency Warp Execution Efficiency 45.60% 46.04% 45.90% - 3 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 3 gld_efficiency Global Memory Load Efficiency 0.76% 0.78% 0.77% - 3 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% - 3 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 47.98% 48.07% 48.03% - 3 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 3 l2_l1_read_throughput L2 Throughput (L1 Reads) 23.920GB/s 26.626GB/s 25.410GB/s - 3 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% - 3 issued_ipc Issued IPC 0.558512 0.624830 0.601130 - 3 issue_slot_utilization Issue Slot Utilization 22.36% 25.02% 24.07% - 3 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 3 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 3 l2_read_transactions L2 Read Transactions 28448864 34176893 32224187 - 3 l2_write_transactions L2 Write Transactions 10973200 13606929 12707173 - 3 tex_cache_transactions Texture Cache Transactions 0 0 0 - 3 dram_read_transactions Device Memory Read Transactions 1896504 2167212 2049900 - 3 dram_write_transactions Device Memory Write Transactions 10788574 13412211 12520205 - 3 l2_read_throughput L2 Throughput (Reads) 23.820GB/s 26.643GB/s 25.342GB/s - 3 l2_write_throughput L2 Throughput (Writes) 9.1876GB/s 10.597GB/s 9.9871GB/s - 3 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 cf_issued Issued Control-Flow Instructions 12939437 15828016 14795156 - 3 cf_executed Executed Control-Flow Instructions 12939437 15828016 14795156 - 3 ldst_issued Issued Load/Store Instructions 32100664 39434062 36905069 - 3 ldst_executed Executed Load/Store Instructions 14986327 18317159 17153378 - 3 l1_shared_utilization L1/Shared Memory Utilization Low (1) Low (2) Low (1) - 3 l2_utilization L2 Cache Utilization Low (2) Low (2) Low (2) - 3 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 3 dram_utilization Device Memory Utilization Low (1) Low (2) Low (1) - 3 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 3 ldst_fu_utilization Load/Store Function Unit Utilization Low (1) Low (1) Low (1) - 3 alu_fu_utilization Arithmetic Function Unit Utilization Low (2) Low (2) Low (2) - 3 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 3 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 3 inst_issued Instructions Issued 119900661 146779026 137330901 - 3 issue_slots Issue Slots 96003408 117554667 109971595 - 3 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 28568384 34339076 32310654 - 3 l2_l1_write_transactions L2 Write Transactions (L1 write requests 10973184 13606912 12707157 - 3 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 3 l2_l1_write_throughput L2 Throughput (L1 Writes) 9.1876GB/s 10.597GB/s 9.9871GB/s - 3 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 3 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.491930 0.551252 0.530005 - 3 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 3 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 3 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.00% 0.00% 0.00% - 3 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% - Kernel: void mtf_4by8(unsigned char const *, unsigned char*, int, int) - 3 l1_cache_global_hit_rate L1 Global Hit Rate 37.22% 40.06% 38.78% - 3 branch_efficiency Branch Efficiency 71.12% 71.30% 71.21% - 3 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 3 sm_efficiency Multiprocessor Activity 78.97% 83.32% 81.00% - 3 achieved_occupancy Achieved Occupancy 0.377573 0.394957 0.386960 - 3 gld_requested_throughput Requested Global Load Throughput 1.7349GB/s 1.8470GB/s 1.7828GB/s - 3 gst_requested_throughput Requested Global Store Throughput 1.7345GB/s 1.8466GB/s 1.7824GB/s - 3 ipc Executed IPC 1.271188 1.319584 1.300284 - 3 sm_efficiency_instance Multiprocessor Activity 78.97% 83.32% 81.00% - 3 ipc_instance Executed IPC 1.271188 1.319584 1.300284 - 3 inst_per_warp Instructions per warp 5.0791e+05 5.1907e+05 5.1334e+05 - 3 gld_transactions Global Load Transactions 11719576 13966112 13061568 - 3 gst_transactions Global Store Transactions 11534336 13533184 12724906 - 3 local_load_transactions Local Load Transactions 0 0 0 - 3 local_store_transactions Local Store Transactions 0 0 0 - 3 shared_load_transactions Shared Load Transactions 18668452 22269980 20734924 - 3 shared_store_transactions Shared Store Transactions 28343060 33696952 31424128 - 3 gld_transactions_per_request Global Load Transactions Per Request 3.130583 3.305192 3.217984 - 3 gst_transactions_per_request Global Store Transactions Per Request 3.039617 3.252829 3.136920 - 3 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 3 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 3 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 2.367758 2.516625 2.446824 - 3 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 2.452312 2.615576 2.536778 - 3 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_load_throughput Shared Memory Load Throughput 87.166GB/s 100.53GB/s 93.395GB/s - 3 shared_store_throughput Shared Memory Store Throughput 132.11GB/s 152.62GB/s 141.56GB/s - 3 shared_efficiency Shared Memory Efficiency 3.82% 4.09% 3.94% - 3 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 3 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 3 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 3 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 25.43% 26.65% 26.24% - 3 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 59.26% 60.06% 59.78% - 3 stall_memory_dependency Issue Stall Reasons (Data Request) 2.46% 4.77% 3.24% - 3 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% - 3 inst_executed Instructions Executed 174408302 213420539 199706357 - 3 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 3 stall_other Issue Stall Reasons (Other) 10.29% 10.57% 10.46% - 3 inst_fp_32 FP Instructions(Single) 0 0 0 - 3 inst_fp_64 FP Instructions(Double) 0 0 0 - 3 inst_integer Integer Instructions 1460752472 1791831520 1679147098 - 3 inst_bit_convert Bit-Convert Instructions 745436588 910686208 854527253 - 3 inst_control Control-Flow Instructions 169102960 205785520 193338018 - 3 inst_compute_ld_st Load/Store Instructions 317628448 389958487 365331618 - 3 inst_misc Misc Instructions 222052160 273396481 255899204 - 3 inst_inter_thread_communication Inter-Thread Instructions 106492472 130099928 122077094 - 3 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 3 atomic_transactions Atomic Transactions 0 0 0 - 3 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 3 inst_replay_overhead Instruction Replay Overhead 0.256859 0.258410 0.257734 - 3 shared_replay_overhead Shared Memory Replay Overhead 0.155214 0.161677 0.158341 - 3 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.037119 0.039317 0.037907 - 3 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 3 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 dram_read_throughput Device Memory Read Throughput 1.1637GB/s 1.3014GB/s 1.2139GB/s - 3 dram_write_throughput Device Memory Write Throughput 12.599GB/s 13.410GB/s 12.946GB/s - 3 gst_throughput Global Store Throughput 13.876GB/s 14.772GB/s 14.259GB/s - 3 gld_throughput Global Load Throughput 55.330GB/s 63.109GB/s 58.814GB/s - 3 warp_execution_efficiency Warp Execution Efficiency 58.69% 59.16% 58.88% - 3 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 3 gld_efficiency Global Memory Load Efficiency 2.93% 3.14% 3.04% - 3 gst_efficiency Global Memory Store Efficiency 12.50% 12.50% 12.50% - 3 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 48.24% 49.42% 48.70% - 3 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 3 l2_l1_read_throughput L2 Throughput (L1 Reads) 33.478GB/s 36.772GB/s 34.941GB/s - 3 local_memory_overhead Local Memory Overhead 0.00% 3.97% 1.32% - 3 issued_ipc Issued IPC 1.599611 1.651336 1.627531 - 3 issue_slot_utilization Issue Slot Utilization 60.53% 62.47% 61.57% - 3 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 3 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 3 l2_read_transactions L2 Read Transactions 27373277 33421824 30963448 - 3 l2_write_transactions L2 Write Transactions 10973188 13606914 12707164 - 3 tex_cache_transactions Texture Cache Transactions 0 0 0 - 3 dram_read_transactions Device Memory Read Transactions 966678 1135630 1078490 - 3 dram_write_transactions Device Memory Write Transactions 9961371 12356693 11537806 - 3 l2_read_throughput L2 Throughput (Reads) 32.888GB/s 36.851GB/s 34.814GB/s - 3 l2_write_throughput L2 Throughput (Writes) 13.876GB/s 14.772GB/s 14.259GB/s - 3 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 cf_issued Issued Control-Flow Instructions 22819049 27930979 26134315 - 3 cf_executed Executed Control-Flow Instructions 22819049 27930979 26134315 - 3 ldst_issued Issued Load/Store Instructions 70366786 85842204 80509351 - 3 ldst_executed Executed Load/Store Instructions 25346058 31000678 29032047 - 3 l1_shared_utilization L1/Shared Memory Utilization Mid (4) Mid (4) Mid (4) - 3 l2_utilization L2 Cache Utilization Low (3) Low (3) Low (3) - 3 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 3 dram_utilization Device Memory Utilization Low (2) Low (2) Low (2) - 3 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 3 ldst_fu_utilization Load/Store Function Unit Utilization Low (3) Low (3) Low (3) - 3 alu_fu_utilization Arithmetic Function Unit Utilization Mid (4) Mid (4) Mid (4) - 3 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 3 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 3 inst_issued Instructions Issued 219425891 268216979 251190501 - 3 issue_slots Issue Slots 166030173 202883665 190060334 - 3 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 27314812 33297640 31094380 - 3 l2_l1_write_transactions L2 Write Transactions (L1 write requests 10973184 13606912 12707157 - 3 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 3 l2_l1_write_throughput L2 Throughput (L1 Writes) 13.876GB/s 14.772GB/s 14.259GB/s - 3 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 3 eligible_warps_per_cycle Eligible Warps Per Active Cycle 1.979835 2.034533 2.009140 - 3 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 3 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 3 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.24% 0.30% 0.28% - 3 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% - Kernel: bsc_st8_encode_cuda_postsort(__int64*, int, __int64, int*) - 3 l1_cache_global_hit_rate L1 Global Hit Rate 0.00% 0.00% 0.00% - 3 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% - 3 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 3 sm_efficiency Multiprocessor Activity 99.21% 99.31% 99.27% - 3 achieved_occupancy Achieved Occupancy 0.957763 0.959282 0.958626 - 3 gld_requested_throughput Requested Global Load Throughput 101.05GB/s 101.48GB/s 101.33GB/s - 3 gst_requested_throughput Requested Global Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 ipc Executed IPC 0.653159 0.653804 0.653495 - 3 sm_efficiency_instance Multiprocessor Activity 99.21% 99.31% 99.27% - 3 ipc_instance Executed IPC 0.653159 0.653804 0.653495 - 3 inst_per_warp Instructions per warp 2.4432e+04 3.0364e+04 2.8377e+04 - 3 gld_transactions Global Load Transactions 1704288 2118480 1979760 - 3 gst_transactions Global Store Transactions 0 0 0 - 3 local_load_transactions Local Load Transactions 0 0 0 - 3 local_store_transactions Local Store Transactions 0 0 0 - 3 shared_load_transactions Shared Load Transactions 0 0 0 - 3 shared_store_transactions Shared Store Transactions 0 0 0 - 3 gld_transactions_per_request Global Load Transactions Per Request 2.000023 2.000042 2.000033 - 3 gst_transactions_per_request Global Store Transactions Per Request 0.000000 0.000000 0.000000 - 3 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 3 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 3 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 0.000000 0.000000 0.000000 - 3 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 0.000000 0.000000 0.000000 - 3 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_load_throughput Shared Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_store_throughput Shared Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_efficiency Shared Memory Efficiency 0.00% 0.00% 0.00% - 3 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 3 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 3 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 3 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 2.16% 2.16% 2.16% - 3 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 14.75% 14.82% 14.78% - 3 stall_memory_dependency Issue Stall Reasons (Data Request) 80.06% 80.11% 80.09% - 3 stall_sync Issue Stall Reasons (Synchronization) 0.81% 0.86% 0.83% - 3 inst_executed Instructions Executed 9381834 11659956 10896952 - 3 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 3 stall_other Issue Stall Reasons (Other) 2.01% 2.02% 2.01% - 3 inst_fp_32 FP Instructions(Single) 0 0 0 - 3 inst_fp_64 FP Instructions(Double) 0 0 0 - 3 inst_integer Integer Instructions 218242560 271260672 253503488 - 3 inst_bit_convert Bit-Convert Instructions 0 0 0 - 3 inst_control Control-Flow Instructions 27268032 33895296 31675648 - 3 inst_compute_ld_st Load/Store Instructions 27268271 33895475 31675787 - 3 inst_misc Misc Instructions 27390912 34018176 31798528 - 3 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 - 3 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000052 0.000027 - 3 atomic_transactions Atomic Transactions 1 500 275 - 3 atomic_transactions_per_request Atomic Transactions Per Request 1.000000 62.500000 39.277778 - 3 inst_replay_overhead Instruction Replay Overhead 0.155582 0.156861 0.156126 - 3 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 3 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.181658 0.181689 0.181679 - 3 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 3 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 dram_read_throughput Device Memory Read Throughput 101.47GB/s 101.94GB/s 101.77GB/s - 3 dram_write_throughput Device Memory Write Throughput 37.547KB/s 188.36KB/s 116.92KB/s - 3 gst_throughput Global Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 gld_throughput Global Load Throughput 101.05GB/s 101.48GB/s 101.33GB/s - 3 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00% - 3 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 3 gld_efficiency Global Memory Load Efficiency 100.00% 100.00% 100.00% - 3 gst_efficiency Global Memory Store Efficiency 0.00% 0.00% 0.00% - 3 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 0.00% 0.00% 0.00% - 3 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 3 l2_l1_read_throughput L2 Throughput (L1 Reads) 101.05GB/s 101.48GB/s 101.33GB/s - 3 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% - 3 issued_ipc Issued IPC 0.754999 0.756529 0.755900 - 3 issue_slot_utilization Issue Slot Utilization 28.85% 28.89% 28.87% - 3 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 3 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 3 l2_read_transactions L2 Read Transactions 6817074 8473892 7918978 - 3 l2_write_transactions L2 Write Transactions 2 478 279 - 3 tex_cache_transactions Texture Cache Transactions 0 0 0 - 3 dram_read_transactions Device Memory Read Transactions 6849588 8508801 7953102 - 3 dram_write_transactions Device Memory Write Transactions 3 15 8 - 3 l2_read_throughput L2 Throughput (Reads) 101.05GB/s 101.48GB/s 101.33GB/s - 3 l2_write_throughput L2 Throughput (Writes) 25.031KB/s 7.2846MB/s 3.8997MB/s - 3 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 cf_issued Issued Control-Flow Instructions 853278 1060380 991016 - 3 cf_executed Executed Control-Flow Instructions 853278 1060380 991016 - 3 ldst_issued Issued Load/Store Instructions 2321936 2892586 2695369 - 3 ldst_executed Executed Load/Store Instructions 852894 1059996 990632 - 3 l1_shared_utilization L1/Shared Memory Utilization Low (2) Low (2) Low (2) - 3 l2_utilization L2 Cache Utilization Mid (5) Mid (5) Mid (5) - 3 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 3 dram_utilization Device Memory Utilization High (9) High (9) High (9) - 3 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 3 ldst_fu_utilization Load/Store Function Unit Utilization Low (1) Low (1) Low (1) - 3 alu_fu_utilization Arithmetic Function Unit Utilization Low (2) Low (2) Low (2) - 3 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 3 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 3 inst_issued Instructions Issued 10853032 13479065 12593795 - 3 issue_slots Issue Slots 8293582 10298309 9621131 - 3 l2_atomic_throughput L2 Throughput (Atomic requests) 25.031KB/s 3.6423MB/s 1.9539MB/s - 3 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 6817008 8473824 7918912 - 3 l2_l1_write_transactions L2 Write Transactions (L1 write requests 0 0 0 - 3 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 3 l2_l1_write_throughput L2 Throughput (L1 Writes) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_atomic_transactions L2 Transactions (Atomic requests) 4 478 280 - 3 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.799173 0.801847 0.800348 - 3 atomic_throughput Atomic Throughput 50.063KB/s 30.479MB/s 15.506MB/s - 3 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 3 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 3 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.13% 0.13% 0.13% - 3 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% - Kernel: void mtf_thread_by4(unsigned char const *, unsigned char*, int, int) - 3 l1_cache_global_hit_rate L1 Global Hit Rate 24.63% 27.16% 25.59% - 3 branch_efficiency Branch Efficiency 71.64% 71.67% 71.66% - 3 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 3 sm_efficiency Multiprocessor Activity 90.42% 94.49% 92.49% - 3 achieved_occupancy Achieved Occupancy 0.125623 0.138328 0.133715 - 3 gld_requested_throughput Requested Global Load Throughput 336.21MB/s 395.32MB/s 373.64MB/s - 3 gst_requested_throughput Requested Global Store Throughput 336.13MB/s 395.22MB/s 373.55MB/s - 3 ipc Executed IPC 0.388475 0.433035 0.416777 - 3 sm_efficiency_instance Multiprocessor Activity 90.42% 94.49% 92.49% - 3 ipc_instance Executed IPC 0.388475 0.433035 0.416777 - 3 inst_per_warp Instructions per warp 9.1963e+05 9.3194e+05 9.2505e+05 - 3 gld_transactions Global Load Transactions 12787320 14716924 13957553 - 3 gst_transactions Global Store Transactions 12435456 14680064 13757098 - 3 local_load_transactions Local Load Transactions 0 0 0 - 3 local_store_transactions Local Store Transactions 0 0 0 - 3 shared_load_transactions Shared Load Transactions 3597560 4256984 4018392 - 3 shared_store_transactions Shared Store Transactions 3597944 4257432 4018818 - 3 gld_transactions_per_request Global Load Transactions Per Request 3.707024 4.050414 3.856872 - 3 gst_transactions_per_request Global Store Transactions Per Request 3.651506 3.938201 3.797541 - 3 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 3 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 3 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 1.078170 1.132009 1.102581 - 3 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 1.078170 1.132010 1.102581 - 3 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_load_throughput Shared Memory Load Throughput 13.775GB/s 15.298GB/s 14.775GB/s - 3 shared_store_throughput Shared Memory Store Throughput 13.777GB/s 15.300GB/s 14.776GB/s - 3 shared_efficiency Shared Memory Efficiency 14.47% 15.25% 14.91% - 3 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 3 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 3 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 3 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 21.32% 21.94% 21.58% - 3 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 44.33% 46.45% 45.35% - 3 stall_memory_dependency Issue Stall Reasons (Data Request) 23.36% 25.45% 24.21% - 3 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% - 3 inst_executed Instructions Executed 78283255 96051196 89992122 - 3 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 3 stall_other Issue Stall Reasons (Other) 8.70% 8.95% 8.85% - 3 inst_fp_32 FP Instructions(Single) 0 0 0 - 3 inst_fp_64 FP Instructions(Double) 0 0 0 - 3 inst_integer Integer Instructions 210069468 258891564 242260850 - 3 inst_bit_convert Bit-Convert Instructions 195535800 240815804 225402145 - 3 inst_control Control-Flow Instructions 53349063 66018410 61695896 - 3 inst_compute_ld_st Load/Store Instructions 155186601 191228558 178954483 - 3 inst_misc Misc Instructions 84271431 104051454 97310454 - 3 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 - 3 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 3 atomic_transactions Atomic Transactions 0 0 0 - 3 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 3 inst_replay_overhead Instruction Replay Overhead 0.212459 0.212615 0.212518 - 3 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 3 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.107409 0.116689 0.112375 - 3 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 3 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 dram_read_throughput Device Memory Read Throughput 3.0870GB/s 3.2804GB/s 3.1866GB/s - 3 dram_write_throughput Device Memory Write Throughput 10.375GB/s 12.217GB/s 11.542GB/s - 3 gst_throughput Global Store Throughput 10.504GB/s 12.351GB/s 11.674GB/s - 3 gld_throughput Global Load Throughput 48.963GB/s 52.887GB/s 51.339GB/s - 3 warp_execution_efficiency Warp Execution Efficiency 31.33% 32.00% 31.62% - 3 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 3 gld_efficiency Global Memory Load Efficiency 0.67% 0.74% 0.71% - 3 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% - 3 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 46.44% 46.76% 46.61% - 3 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 3 l2_l1_read_throughput L2 Throughput (L1 Reads) 32.043GB/s 36.597GB/s 34.830GB/s - 3 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% - 3 issued_ipc Issued IPC 0.469315 0.525177 0.505244 - 3 issue_slot_utilization Issue Slot Utilization 16.63% 18.61% 17.90% - 3 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 3 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 3 l2_read_transactions L2 Read Transactions 33386031 40682414 38135092 - 3 l2_write_transactions L2 Write Transactions 10973199 13606927 12707172 - 3 tex_cache_transactions Texture Cache Transactions 0 0 0 - 3 dram_read_transactions Device Memory Read Transactions 3224807 3614024 3464098 - 3 dram_write_transactions Device Memory Write Transactions 10838228 13460005 12563659 - 3 l2_read_throughput L2 Throughput (Reads) 31.959GB/s 36.613GB/s 35.040GB/s - 3 l2_write_throughput L2 Throughput (Writes) 10.504GB/s 12.351GB/s 11.674GB/s - 3 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 cf_issued Issued Control-Flow Instructions 15813597 19402692 18178687 - 3 cf_executed Executed Control-Flow Instructions 15813597 19402692 18178687 - 3 ldst_issued Issued Load/Store Instructions 29308348 35954677 33684886 - 3 ldst_executed Executed Load/Store Instructions 12671089 15545263 14564537 - 3 l1_shared_utilization L1/Shared Memory Utilization Low (1) Low (2) Low (1) - 3 l2_utilization L2 Cache Utilization Low (3) Low (3) Low (3) - 3 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 3 dram_utilization Device Memory Utilization Low (2) Low (2) Low (2) - 3 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 3 ldst_fu_utilization Load/Store Function Unit Utilization Low (1) Low (1) Low (1) - 3 alu_fu_utilization Arithmetic Function Unit Utilization Low (1) Low (1) Low (1) - 3 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 3 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 3 inst_issued Instructions Issued 94927558 116471886 109121070 - 3 issue_slots Issue Slots 67279653 82545757 77334763 - 3 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 33473592 40319724 37899060 - 3 l2_l1_write_transactions L2 Write Transactions (L1 write requests 10973184 13606912 12707157 - 3 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 3 l2_l1_write_throughput L2 Throughput (L1 Writes) 10.504GB/s 12.351GB/s 11.674GB/s - 3 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 3 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.356921 0.399817 0.384029 - 3 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 3 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 3 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.00% 0.00% 0.00% - 3 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% - Kernel: void mtf_thread(unsigned char const *, unsigned char*, int, int) - 3 l1_cache_global_hit_rate L1 Global Hit Rate 32.14% 35.29% 33.55% - 3 branch_efficiency Branch Efficiency 69.82% 69.97% 69.91% - 3 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 3 sm_efficiency Multiprocessor Activity 89.90% 90.88% 90.54% - 3 achieved_occupancy Achieved Occupancy 0.121381 0.137052 0.130961 - 3 gld_requested_throughput Requested Global Load Throughput 333.14MB/s 387.74MB/s 366.64MB/s - 3 gst_requested_throughput Requested Global Store Throughput 333.06MB/s 387.65MB/s 366.55MB/s - 3 ipc Executed IPC 0.425332 0.484167 0.461340 - 3 sm_efficiency_instance Multiprocessor Activity 89.90% 90.88% 90.54% - 3 ipc_instance Executed IPC 0.425332 0.484167 0.461340 - 3 inst_per_warp Instructions per warp 1.0124e+06 1.0299e+06 1.0202e+06 - 3 gld_transactions Global Load Transactions 12060744 14344740 13492205 - 3 gst_transactions Global Store Transactions 12058624 14155776 13423957 - 3 local_load_transactions Local Load Transactions 0 0 0 - 3 local_store_transactions Local Store Transactions 0 0 0 - 3 shared_load_transactions Shared Load Transactions 3007940 3670928 3401485 - 3 shared_store_transactions Shared Store Transactions 5582740 6834572 6341140 - 3 gld_transactions_per_request Global Load Transactions Per Request 4.754477 4.996478 4.872284 - 3 gst_transactions_per_request Global Store Transactions Per Request 4.749035 4.994799 4.848280 - 3 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 3 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 3 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 1.049984 1.088146 1.074742 - 3 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 1.047497 1.078005 1.067819 - 3 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_load_throughput Shared Memory Load Throughput 11.412GB/s 12.779GB/s 12.269GB/s - 3 shared_store_throughput Shared Memory Store Throughput 21.181GB/s 23.792GB/s 22.871GB/s - 3 shared_efficiency Shared Memory Efficiency 11.12% 11.56% 11.29% - 3 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 3 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 3 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 3 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 21.08% 21.28% 21.19% - 3 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 53.85% 54.97% 54.55% - 3 stall_memory_dependency Issue Stall Reasons (Data Request) 12.56% 13.65% 12.93% - 3 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% - 3 inst_executed Instructions Executed 86511911 105909339 99235239 - 3 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 3 stall_other Issue Stall Reasons (Other) 11.31% 11.35% 11.33% - 3 inst_fp_32 FP Instructions(Single) 0 0 0 - 3 inst_fp_64 FP Instructions(Double) 0 0 0 - 3 inst_integer Integer Instructions 477288576 587626940 550068311 - 3 inst_bit_convert Bit-Convert Instructions 466288575 573986790 537330093 - 3 inst_control Control-Flow Instructions 122256342 150394540 140820614 - 3 inst_compute_ld_st Load/Store Instructions 144216096 177624968 166250428 - 3 inst_misc Misc Instructions 21965148 27237096 25436068 - 3 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 - 3 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 3 atomic_transactions Atomic Transactions 0 0 0 - 3 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 3 inst_replay_overhead Instruction Replay Overhead 0.211079 0.211535 0.211329 - 3 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 3 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.086245 0.093096 0.089554 - 3 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 3 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 dram_read_throughput Device Memory Read Throughput 2.4139GB/s 2.5341GB/s 2.4891GB/s - 3 dram_write_throughput Device Memory Write Throughput 10.227GB/s 11.928GB/s 11.269GB/s - 3 gst_throughput Global Store Throughput 10.408GB/s 12.114GB/s 11.455GB/s - 3 gld_throughput Global Load Throughput 45.759GB/s 51.331GB/s 48.691GB/s - 3 warp_execution_efficiency Warp Execution Efficiency 47.83% 48.34% 48.04% - 3 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 3 gld_efficiency Global Memory Load Efficiency 0.71% 0.76% 0.73% - 3 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% - 3 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 46.92% 47.52% 47.21% - 3 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 3 l2_l1_read_throughput L2 Throughput (L1 Reads) 28.703GB/s 32.648GB/s 30.895GB/s - 3 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% - 3 issued_ipc Issued IPC 0.516089 0.583549 0.558958 - 3 issue_slot_utilization Issue Slot Utilization 19.41% 21.95% 21.03% - 3 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 3 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 3 l2_read_transactions L2 Read Transactions 30330894 36703107 34409432 - 3 l2_write_transactions L2 Write Transactions 10973189 13606928 12707169 - 3 tex_cache_transactions Texture Cache Transactions 0 0 0 - 3 dram_read_transactions Device Memory Read Transactions 2545001 2894861 2757523 - 3 dram_write_transactions Device Memory Write Transactions 10781901 13390355 12502064 - 3 l2_read_throughput L2 Throughput (Reads) 28.769GB/s 32.379GB/s 31.030GB/s - 3 l2_write_throughput L2 Throughput (Writes) 10.408GB/s 12.114GB/s 11.455GB/s - 3 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 cf_issued Issued Control-Flow Instructions 10765738 13176474 12342672 - 3 cf_executed Executed Control-Flow Instructions 10765738 13176474 12342672 - 3 ldst_issued Issued Load/Store Instructions 31029479 38029401 35636740 - 3 ldst_executed Executed Load/Store Instructions 12771135 15639854 14657276 - 3 l1_shared_utilization L1/Shared Memory Utilization Low (2) Low (2) Low (2) - 3 l2_utilization L2 Cache Utilization Low (2) Low (3) Low (2) - 3 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 3 dram_utilization Device Memory Utilization Low (2) Low (2) Low (2) - 3 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 3 ldst_fu_utilization Load/Store Function Unit Utilization Low (1) Low (1) Low (1) - 3 alu_fu_utilization Arithmetic Function Unit Utilization Low (2) Low (2) Low (2) - 3 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 3 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 3 inst_issued Instructions Issued 104768535 128290734 120202902 - 3 issue_slots Issue Slots 78819027 96516442 90430593 - 3 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 30260936 36494668 34253152 - 3 l2_l1_write_transactions L2 Write Transactions (L1 write requests 10973184 13606912 12707157 - 3 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 3 l2_l1_write_throughput L2 Throughput (L1 Writes) 10.408GB/s 12.114GB/s 11.455GB/s - 3 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 3 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.422506 0.478932 0.458505 - 3 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 3 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 3 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.00% 0.00% 0.00% - 3 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% - Kernel: bsc_st8_encode_cuda_presort(unsigned char*, __int64*, unsigned char*, int) - 3 l1_cache_global_hit_rate L1 Global Hit Rate 62.45% 62.46% 62.46% - 3 branch_efficiency Branch Efficiency 91.67% 91.67% 91.67% - 3 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 3 sm_efficiency Multiprocessor Activity 96.17% 97.53% 96.98% - 3 achieved_occupancy Achieved Occupancy 0.816193 0.822183 0.819625 - 3 gld_requested_throughput Requested Global Load Throughput 6.7434GB/s 6.7825GB/s 6.7604GB/s - 3 gst_requested_throughput Requested Global Store Throughput 57.973GB/s 58.309GB/s 58.120GB/s - 3 ipc Executed IPC 1.219457 1.226427 1.223126 - 3 sm_efficiency_instance Multiprocessor Activity 96.17% 97.53% 96.98% - 3 ipc_instance Executed IPC 1.219457 1.226427 1.223126 - 3 inst_per_warp Instructions per warp 8.7296e+04 1.0851e+05 1.0140e+05 - 3 gld_transactions Global Load Transactions 1136288 1412208 1319782 - 3 gst_transactions Global Store Transactions 2556432 3177720 2969640 - 3 local_load_transactions Local Load Transactions 0 0 0 - 3 local_store_transactions Local Store Transactions 0 0 0 - 3 shared_load_transactions Shared Load Transactions 7669296 9533160 8908920 - 3 shared_store_transactions Shared Store Transactions 1136192 1412320 1319840 - 3 gld_transactions_per_request Global Load Transactions Per Request 0.999906 1.000106 0.999981 - 3 gst_transactions_per_request Global Store Transactions Per Request 1.500017 1.500032 1.500025 - 3 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 3 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 3 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 1.000011 1.000021 1.000016 - 3 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 1.000011 1.000021 1.000016 - 3 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_load_throughput Shared Memory Load Throughput 231.90GB/s 233.24GB/s 232.48GB/s - 3 shared_store_throughput Shared Memory Store Throughput 34.355GB/s 34.554GB/s 34.442GB/s - 3 shared_efficiency Shared Memory Efficiency 97.23% 97.23% 97.23% - 3 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 3 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 3 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 3 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 10.37% 10.45% 10.41% - 3 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 19.07% 19.21% 19.15% - 3 stall_memory_dependency Issue Stall Reasons (Data Request) 22.97% 23.41% 23.22% - 3 stall_sync Issue Stall Reasons (Synchronization) 22.95% 23.18% 23.07% - 3 inst_executed Instructions Executed 33521564 41667576 38939258 - 3 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 3 stall_other Issue Stall Reasons (Other) 4.01% 4.03% 4.02% - 3 inst_fp_32 FP Instructions(Single) 0 0 0 - 3 inst_fp_64 FP Instructions(Double) 0 0 0 - 3 inst_integer Integer Instructions 380677720 473183280 442200693 - 3 inst_bit_convert Bit-Convert Instructions 0 0 0 - 3 inst_control Control-Flow Instructions 53399896 66378288 62031477 - 3 inst_compute_ld_st Load/Store Instructions 357040794 443816532 414753016 - 3 inst_misc Misc Instructions 189801496 235916208 220471157 - 3 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 - 3 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 3 atomic_transactions Atomic Transactions 0 0 0 - 3 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 3 inst_replay_overhead Instruction Replay Overhead 0.120835 0.121283 0.121071 - 3 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 3 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.012727 0.012729 0.012728 - 3 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 3 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 dram_read_throughput Device Memory Read Throughput 7.9587GB/s 8.0913GB/s 8.0264GB/s - 3 dram_write_throughput Device Memory Write Throughput 57.973GB/s 58.309GB/s 58.120GB/s - 3 gst_throughput Global Store Throughput 57.973GB/s 58.309GB/s 58.120GB/s - 3 gld_throughput Global Load Throughput 34.351GB/s 34.551GB/s 34.441GB/s - 3 warp_execution_efficiency Warp Execution Efficiency 98.94% 98.94% 98.94% - 3 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 3 gld_efficiency Global Memory Load Efficiency 19.63% 19.63% 19.63% - 3 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00% - 3 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 20.26% 21.05% 20.61% - 3 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 3 l2_l1_read_throughput L2 Throughput (L1 Reads) 12.893GB/s 12.967GB/s 12.926GB/s - 3 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% - 3 issued_ipc Issued IPC 1.369783 1.374804 1.372932 - 3 issue_slot_utilization Issue Slot Utilization 59.17% 59.39% 59.31% - 3 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 3 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 3 l2_read_transactions L2 Read Transactions 1705614 2120339 1982007 - 3 l2_write_transactions L2 Write Transactions 7669135 9533053 8908777 - 3 tex_cache_transactions Texture Cache Transactions 0 0 0 - 3 dram_read_transactions Device Memory Read Transactions 1068476 1319076 1229575 - 3 dram_write_transactions Device Memory Write Transactions 7669132 9533052 8908775 - 3 l2_read_throughput L2 Throughput (Reads) 12.906GB/s 12.967GB/s 12.930GB/s - 3 l2_write_throughput L2 Throughput (Writes) 57.973GB/s 58.309GB/s 58.120GB/s - 3 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 cf_issued Issued Control-Flow Instructions 3409272 4237680 3960224 - 3 cf_executed Executed Control-Flow Instructions 3409272 4237680 3960224 - 3 ldst_issued Issued Load/Store Instructions 18818198 23405818 21870743 - 3 ldst_executed Executed Load/Store Instructions 14770184 18359952 17157642 - 3 l1_shared_utilization L1/Shared Memory Utilization Mid (5) Mid (5) Mid (5) - 3 l2_utilization L2 Cache Utilization Mid (4) Mid (4) Mid (4) - 3 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 3 dram_utilization Device Memory Utilization Mid (6) Mid (6) Mid (6) - 3 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 3 ldst_fu_utilization Load/Store Function Unit Utilization Mid (4) Mid (4) Mid (4) - 3 alu_fu_utilization Arithmetic Function Unit Utilization Low (2) Low (2) Low (2) - 3 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 3 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 3 inst_issued Instructions Issued 37576042 46723060 43664206 - 3 issue_slots Issue Slots 32461366 40365772 37723102 - 3 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 1705732 2119940 1981280 - 3 l2_l1_write_transactions L2 Write Transactions (L1 write requests 7669136 9533052 8908776 - 3 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 3 l2_l1_write_throughput L2 Throughput (L1 Writes) 57.973GB/s 58.309GB/s 58.120GB/s - 3 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 3 eligible_warps_per_cycle Eligible Warps Per Active Cycle 5.294206 5.325456 5.307624 - 3 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 3 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 3 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 20.27% 20.40% 20.35% - 3 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% - Kernel: void cub::DeviceRadixSortUpsweepKernel::Policy520, bool=1, bool=0, __int64, int>(cub::DeviceRadixSortPolicy<__int64, unsigned char, int>::Policy520*, bool=1*, cub::DeviceRadixSortPolicy<__int64, unsigned char, int>::Policy520*, int, int, cub::GridEvenShare::Policy520*>) - 3 l1_cache_global_hit_rate L1 Global Hit Rate 0.00% 0.00% 0.00% - 3 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% - 3 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 3 sm_efficiency Multiprocessor Activity 99.40% 99.60% 99.52% - 3 achieved_occupancy Achieved Occupancy 0.526894 0.527292 0.527149 - 3 gld_requested_throughput Requested Global Load Throughput 102.57GB/s 102.69GB/s 102.64GB/s - 3 gst_requested_throughput Requested Global Store Throughput 2.9771MB/s 3.6978MB/s 3.2186MB/s - 3 ipc Executed IPC 1.055192 1.059076 1.057306 - 3 sm_efficiency_instance Multiprocessor Activity 99.40% 99.60% 99.52% - 3 ipc_instance Executed IPC 1.055192 1.059076 1.057306 - 3 inst_per_warp Instructions per warp 3.1157e+04 3.8838e+04 3.6266e+04 - 3 gld_transactions Global Load Transactions 1704012 2118336 1979652 - 3 gst_transactions Global Store Transactions 1920 1920 1920 - 3 local_load_transactions Local Load Transactions 0 0 0 - 3 local_store_transactions Local Store Transactions 0 0 0 - 3 shared_load_transactions Shared Load Transactions 917376 1132224 1060288 - 3 shared_store_transactions Shared Store Transactions 869376 1078464 1008448 - 3 gld_transactions_per_request Global Load Transactions Per Request 1.999728 2.000123 1.999914 - 3 gst_transactions_per_request Global Store Transactions Per Request 16.000000 16.000000 16.000000 - 3 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 3 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 3 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 0.999972 1.000057 1.000021 - 3 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 0.999970 1.000060 1.000022 - 3 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 shared_load_throughput Shared Memory Load Throughput 54.862GB/s 55.212GB/s 54.988GB/s - 3 shared_store_throughput Shared Memory Store Throughput 52.257GB/s 52.323GB/s 52.287GB/s - 3 shared_efficiency Shared Memory Efficiency 25.69% 25.78% 25.72% - 3 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 3 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 3 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 3 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 3 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 3 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 3.26% 3.36% 3.33% - 3 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 17.93% 18.09% 18.00% - 3 stall_memory_dependency Issue Stall Reasons (Data Request) 39.61% 39.73% 39.66% - 3 stall_sync Issue Stall Reasons (Synchronization) 17.65% 17.79% 17.70% - 3 inst_executed Instructions Executed 14955529 18642153 17407738 - 3 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 3 stall_other Issue Stall Reasons (Other) 4.27% 4.28% 4.27% - 3 inst_fp_32 FP Instructions(Single) 0 0 0 - 3 inst_fp_64 FP Instructions(Double) 0 0 0 - 3 inst_integer Integer Instructions 299731014 374540627 349486425 - 3 inst_bit_convert Bit-Convert Instructions 0 0 0 - 3 inst_control Control-Flow Instructions 2317440 2864093 2678388 - 3 inst_compute_ld_st Load/Store Instructions 84386061 104575209 97813892 - 3 inst_misc Misc Instructions 34799373 43380515 40513099 - 3 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 - 3 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 3 atomic_transactions Atomic Transactions 0 0 0 - 3 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 3 inst_replay_overhead Instruction Replay Overhead 0.112144 0.112849 0.112439 - 3 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 3 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.113632 0.113963 0.113746 - 3 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 3 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 dram_read_throughput Device Memory Read Throughput 103.06GB/s 103.12GB/s 103.08GB/s - 3 dram_write_throughput Device Memory Write Throughput 11.003MB/s 12.511MB/s 11.697MB/s - 3 gst_throughput Global Store Throughput 23.817MB/s 29.582MB/s 25.749MB/s - 3 gld_throughput Global Load Throughput 102.56GB/s 102.70GB/s 102.63GB/s - 3 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00% - 3 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 3 gld_efficiency Global Memory Load Efficiency 99.99% 100.01% 100.00% - 3 gst_efficiency Global Memory Store Efficiency 12.50% 12.50% 12.50% - 3 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 0.00% 0.00% 0.00% - 3 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 3 l2_l1_read_throughput L2 Throughput (L1 Reads) 102.57GB/s 102.69GB/s 102.64GB/s - 3 local_memory_overhead Local Memory Overhead 0.00% 0.01% 0.01% - 3 issued_ipc Issued IPC 1.172857 1.177566 1.175450 - 3 issue_slot_utilization Issue Slot Utilization 40.65% 40.79% 40.72% - 3 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 3 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 3 l2_read_transactions L2 Read Transactions 6817491 8474309 7919389 - 3 l2_write_transactions L2 Write Transactions 1920 1922 1921 - 3 tex_cache_transactions Texture Cache Transactions 0 0 0 - 3 dram_read_transactions Device Memory Read Transactions 6849422 8508555 7953005 - 3 dram_write_transactions Device Memory Write Transactions 812 932 877 - 3 l2_read_throughput L2 Throughput (Reads) 102.58GB/s 102.70GB/s 102.64GB/s - 3 l2_write_throughput L2 Throughput (Writes) 23.842MB/s 29.613MB/s 25.768MB/s - 3 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 cf_issued Issued Control-Flow Instructions 84952 103214 97126 - 3 cf_executed Executed Control-Flow Instructions 84952 103214 97126 - 3 ldst_issued Issued Load/Store Instructions 4470854 5550353 5188478 - 3 ldst_executed Executed Load/Store Instructions 2789226 3455614 3232443 - 3 l1_shared_utilization L1/Shared Memory Utilization Low (3) Low (3) Low (3) - 3 l2_utilization L2 Cache Utilization Mid (5) Mid (5) Mid (5) - 3 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 3 dram_utilization Device Memory Utilization High (9) High (9) High (9) - 3 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 3 ldst_fu_utilization Load/Store Function Unit Utilization Low (2) Low (2) Low (2) - 3 alu_fu_utilization Arithmetic Function Unit Utilization Low (3) Low (3) Low (3) - 3 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 3 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 3 inst_issued Instructions Issued 16633055 20737907 19365857 - 3 issue_slots Issue Slots 11529394 14366099 13418750 - 3 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 3 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 6816972 8473804 7918893 - 3 l2_l1_write_transactions L2 Write Transactions (L1 write requests 1920 1920 1920 - 3 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 3 l2_l1_write_throughput L2 Throughput (L1 Writes) 23.817MB/s 29.582MB/s 25.749MB/s - 3 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 3 eligible_warps_per_cycle Eligible Warps Per Active Cycle 1.984479 2.007802 1.995895 - 3 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 3 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 3 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 3 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 16.86% 17.16% 17.06% - 3 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 11.78% 11.98% 11.91% - Kernel: void cub::DeviceRadixSortUpsweepKernel::Policy520, bool=1, bool=0, __int64, int>(cub::DeviceRadixSortPolicy<__int64, cub::NullType, int>::Policy520*, bool=1*, cub::DeviceRadixSortPolicy<__int64, cub::NullType, int>::Policy520*, int, int, cub::GridEvenShare::Policy520*>) - 18 l1_cache_global_hit_rate L1 Global Hit Rate 0.00% 0.00% 0.00% - 18 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% - 18 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% - 18 sm_efficiency Multiprocessor Activity 98.93% 99.55% 99.39% - 18 achieved_occupancy Achieved Occupancy 0.315515 0.316622 0.316322 - 18 gld_requested_throughput Requested Global Load Throughput 102.55GB/s 103.03GB/s 102.82GB/s - 18 gst_requested_throughput Requested Global Store Throughput 4.7656MB/s 5.9407MB/s 5.1587MB/s - 18 ipc Executed IPC 1.094188 1.097763 1.095606 - 18 sm_efficiency_instance Multiprocessor Activity 98.93% 99.55% 99.39% - 18 ipc_instance Executed IPC 1.094188 1.097763 1.095606 - 18 inst_per_warp Instructions per warp 4.0364e+04 5.0144e+04 4.6870e+04 - 18 gld_transactions Global Load Transactions 1704240 2118532 1979760 - 18 gst_transactions Global Store Transactions 3072 3072 3072 - 18 local_load_transactions Local Load Transactions 0 0 0 - 18 local_store_transactions Local Store Transactions 0 0 0 - 18 shared_load_transactions Shared Load Transactions 913560 1132992 1059520 - 18 shared_store_transactions Shared Store Transactions 869016 1079232 1008832 - 18 gld_transactions_per_request Global Load Transactions Per Request 1.999934 2.000164 2.000038 - 18 gst_transactions_per_request Global Store Transactions Per Request 16.000000 16.000000 16.000000 - 18 local_load_transactions_per_request Local Memory Load Transactions Per Reque 0.000000 0.000000 0.000000 - 18 local_store_transactions_per_request Local Memory Store Transactions Per Requ 0.000000 0.000000 0.000000 - 18 shared_load_transactions_per_request Shared Memory Load Transactions Per Requ 0.999969 1.000077 1.000025 - 18 shared_store_transactions_per_request Shared Memory Store Transactions Per Req 0.999968 1.000081 1.000026 - 18 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 18 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 18 shared_load_throughput Shared Memory Load Throughput 54.876GB/s 55.208GB/s 55.031GB/s - 18 shared_store_throughput Shared Memory Store Throughput 52.270GB/s 52.516GB/s 52.395GB/s - 18 shared_efficiency Shared Memory Efficiency 25.75% 25.80% 25.76% - 18 flop_count_sp Floating Point Operations(Single Precisi 0 0 0 - 18 flop_count_sp_add Floating Point Operations(Single Precisi 0 0 0 - 18 flop_count_sp_mul Floating Point Operation(Single Precisio 0 0 0 - 18 flop_count_sp_fma Floating Point Operations(Single Precisi 0 0 0 - 18 flop_count_dp Floating Point Operations(Double Precisi 0 0 0 - 18 flop_count_dp_add Floating Point Operations(Double Precisi 0 0 0 - 18 flop_count_dp_mul Floating Point Operations(Double Precisi 0 0 0 - 18 flop_count_dp_fma Floating Point Operations(Double Preciso 0 0 0 - 18 flop_count_sp_special Floating Point Operations(Single Precisi 0 0 0 - 18 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 8.60% 8.67% 8.64% - 18 stall_exec_dependency Issue Stall Reasons (Execution Dependenc 25.79% 25.95% 25.87% - 18 stall_memory_dependency Issue Stall Reasons (Data Request) 39.27% 39.52% 39.40% - 18 stall_sync Issue Stall Reasons (Synchronization) 12.93% 13.35% 13.16% - 18 inst_executed Instructions Executed 15499733 19255484 17997891 - 18 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% - 18 stall_other Issue Stall Reasons (Other) 7.85% 7.90% 7.87% - 18 inst_fp_32 FP Instructions(Single) 0 0 0 - 18 inst_fp_64 FP Instructions(Double) 0 0 0 - 18 inst_integer Integer Instructions 318199046 395424629 369565785 - 18 inst_bit_convert Bit-Convert Instructions 0 0 0 - 18 inst_control Control-Flow Instructions 1538112 1911374 1785777 - 18 inst_compute_ld_st Load/Store Instructions 84215181 104588649 97765892 - 18 inst_misc Misc Instructions 36683597 45547200 42582122 - 18 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 - 18 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 - 18 atomic_transactions Atomic Transactions 0 0 0 - 18 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 - 18 inst_replay_overhead Instruction Replay Overhead 0.069675 0.070333 0.069988 - 18 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 - 18 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.109953 0.110022 0.109997 - 18 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% - 18 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 18 dram_read_throughput Device Memory Read Throughput 103.04GB/s 103.52GB/s 103.29GB/s - 18 dram_write_throughput Device Memory Write Throughput 18.458MB/s 24.043MB/s 20.345MB/s - 18 gst_throughput Global Store Throughput 38.125MB/s 47.526MB/s 41.270MB/s - 18 gld_throughput Global Load Throughput 102.56GB/s 103.03GB/s 102.82GB/s - 18 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00% - 18 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 - 18 gld_efficiency Global Memory Load Efficiency 99.99% 100.00% 100.00% - 18 gst_efficiency Global Memory Store Efficiency 12.50% 12.50% 12.50% - 18 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s - 18 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 0.00% 0.00% 0.00% - 18 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% - 18 l2_l1_read_throughput L2 Throughput (L1 Reads) 102.55GB/s 103.03GB/s 102.82GB/s - 18 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% - 18 issued_ipc Issued IPC 1.170319 1.173696 1.172007 - 18 issue_slot_utilization Issue Slot Utilization 39.20% 39.32% 39.26% - 18 sysmem_read_transactions System Memory Read Transactions 0 0 0 - 18 sysmem_write_transactions System Memory Write Transactions 0 0 0 - 18 l2_read_transactions L2 Read Transactions 6817580 8474413 7919504 - 18 l2_write_transactions L2 Write Transactions 3072 3083 3072 - 18 tex_cache_transactions Texture Cache Transactions 0 0 0 - 18 dram_read_transactions Device Memory Read Transactions 6847098 8514209 7955679 - 18 dram_write_transactions Device Memory Write Transactions 1376 1578 1516 - 18 l2_read_throughput L2 Throughput (Reads) 102.56GB/s 103.04GB/s 102.82GB/s - 18 l2_write_throughput L2 Throughput (Writes) 38.125MB/s 47.526MB/s 41.282MB/s - 18 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 18 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 18 cf_issued Issued Control-Flow Instructions 62818 76619 71982 - 18 cf_executed Executed Control-Flow Instructions 62818 76619 71982 - 18 ldst_issued Issued Load/Store Instructions 3816657 4747016 4435055 - 18 ldst_executed Executed Load/Store Instructions 2736870 3398091 3176658 - 18 l1_shared_utilization L1/Shared Memory Utilization Low (3) Low (3) Low (3) - 18 l2_utilization L2 Cache Utilization Mid (5) Mid (5) Mid (5) - 18 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) - 18 dram_utilization Device Memory Utilization High (9) High (9) High (9) - 18 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) - 18 ldst_fu_utilization Load/Store Function Unit Utilization Low (2) Low (2) Low (2) - 18 alu_fu_utilization Arithmetic Function Unit Utilization Low (3) Low (3) Low (3) - 18 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) - 18 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) - 18 inst_issued Instructions Issued 16579423 20603837 19257321 - 18 issue_slots Issue Slots 11107933 13802772 12901275 - 18 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s - 18 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 6816972 8473804 7918893 - 18 l2_l1_write_transactions L2 Write Transactions (L1 write requests 3072 3072 3072 - 18 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 - 18 l2_l1_write_throughput L2 Throughput (L1 Writes) 38.125MB/s 47.526MB/s 41.270MB/s - 18 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 - 18 eligible_warps_per_cycle Eligible Warps Per Active Cycle 1.153524 1.157241 1.155432 - 18 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s - 18 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% - 18 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% - 18 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 5.02% 5.11% 5.06% - 18 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 16.64% 16.83% 16.76% + Kernel: void mtf_cuda_2buffers(unsigned char const *, unsigned char*, int, int) + 1 l1_cache_global_hit_rate L1 Global Hit Rate 80.92% 80.92% 80.92% + 1 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% + 1 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% + 1 sm_efficiency Multiprocessor Activity 97.72% 97.72% 97.72% + 1 achieved_occupancy Achieved Occupancy 0.466438 0.466438 0.466438 + 1 gld_requested_throughput Requested Global Load Throughput 18.231GB/s 18.231GB/s 18.231GB/s + 1 gst_requested_throughput Requested Global Store Throughput 583.25MB/s 583.25MB/s 582.70MB/s + 1 ipc Executed IPC 1.927710 1.927710 1.927710 + 1 sm_efficiency_instance Multiprocessor Activity 97.72% 97.72% 97.72% + 1 ipc_instance Executed IPC 1.927710 1.927710 1.927710 + 1 inst_per_warp Instructions per warp 1.8298e+05 1.8298e+05 1.8298e+05 + 1 gld_transactions Global Load Transactions 36577664 36577664 36577664 + 1 gst_transactions Global Store Transactions 36700160 36700160 36700160 + 1 local_load_transactions Local Load Transactions 0 0 0 + 1 local_store_transactions Local Store Transactions 0 0 0 + 1 shared_load_transactions Shared Load Transactions 40903172 40903172 40903172 + 1 shared_store_transactions Shared Store Transactions 75873184 75873184 75873184 + 1 gld_transactions_per_request Global Load Transactions Per Request 0.996207 0.996207 0.996207 + 1 gst_transactions_per_request Global Store Transactions Per Request 0.999777 0.999777 0.999777 + 1 local_load_transactions_per_request Local Memory Load Transactions Per Request 0.000000 0.000000 0.000000 + 1 local_store_transactions_per_request Local Memory Store Transactions Per Request 0.000000 0.000000 0.000000 + 1 shared_load_transactions_per_request Shared Memory Load Transactions Per Request 1.009505 1.009505 1.009505 + 1 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 1.007546 1.007546 1.007546 + 1 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 shared_load_throughput Shared Memory Load Throughput 81.238GB/s 81.238GB/s 81.237GB/s + 1 shared_store_throughput Shared Memory Store Throughput 150.69GB/s 150.69GB/s 150.69GB/s + 1 shared_efficiency Shared Memory Efficiency 72.92% 72.92% 72.92% + 1 flop_count_sp Floating Point Operations(Single Precision) 0 0 0 + 1 flop_count_sp_add Floating Point Operations(Single Precision Add) 0 0 0 + 1 flop_count_sp_mul Floating Point Operation(Single Precision Mul) 0 0 0 + 1 flop_count_sp_fma Floating Point Operations(Single Precision FMA) 0 0 0 + 1 flop_count_dp Floating Point Operations(Double Precision) 0 0 0 + 1 flop_count_dp_add Floating Point Operations(Double Precision Add) 0 0 0 + 1 flop_count_dp_mul Floating Point Operations(Double Precision Mul) 0 0 0 + 1 flop_count_dp_fma Floating Point Operations(Double Precision FMA) 0 0 0 + 1 flop_count_sp_special Floating Point Operations(Single Precision Special) 0 0 0 + 1 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 50.09% 50.09% 50.09% + 1 stall_exec_dependency Issue Stall Reasons (Execution Dependency) 40.75% 40.75% 40.75% + 1 stall_memory_dependency Issue Stall Reasons (Data Request) 0.32% 0.32% 0.32% + 1 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% + 1 inst_executed Instructions Executed 820467872 820467872 820467872 + 1 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% + 1 stall_other Issue Stall Reasons (Other) 8.68% 8.68% 8.68% + 1 inst_fp_32 FP Instructions(Single) 0 0 0 + 1 inst_fp_64 FP Instructions(Double) 0 0 0 + 1 inst_integer Integer Instructions 1.1897e+10 1.1897e+10 1.1897e+10 + 1 inst_bit_convert Bit-Convert Instructions 1434060224 1434060224 1434060224 + 1 inst_control Control-Flow Instructions 919028158 919028158 919028158 + 1 inst_compute_ld_st Load/Store Instructions 4584213945 4584213945 4584213945 + 1 inst_misc Misc Instructions 2224891840 2224891840 2224891840 + 1 inst_inter_thread_communication Inter-Thread Instructions 1265646336 1265646336 1265646336 + 1 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 + 1 atomic_transactions Atomic Transactions 0 0 0 + 1 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 + 1 inst_replay_overhead Instruction Replay Overhead 0.002125 0.002125 0.002125 + 1 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 + 1 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.008411 0.008411 0.008411 + 1 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% + 1 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 dram_read_throughput Device Memory Read Throughput 1.0523GB/s 1.0523GB/s 1.0515GB/s + 1 dram_write_throughput Device Memory Write Throughput 16.128GB/s 16.128GB/s 16.128GB/s + 1 gst_throughput Global Store Throughput 18.227GB/s 18.227GB/s 18.226GB/s + 1 gld_throughput Global Load Throughput 72.647GB/s 72.647GB/s 72.647GB/s + 1 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00% + 1 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 + 1 gld_efficiency Global Memory Load Efficiency 25.10% 25.10% 25.10% + 1 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% + 1 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 98.26% 98.26% 98.26% + 1 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% + 1 l2_l1_read_throughput L2 Throughput (L1 Reads) 13.985GB/s 13.985GB/s 13.985GB/s + 1 local_memory_overhead Local Memory Overhead 0.38% 0.38% 0.38% + 1 issued_ipc Issued IPC 1.928152 1.928152 1.928152 + 1 issue_slot_utilization Issue Slot Utilization 75.35% 75.35% 75.35% + 1 sysmem_read_transactions System Memory Read Transactions 0 0 0 + 1 sysmem_write_transactions System Memory Write Transactions 2 2 2 + 1 l2_read_transactions L2 Read Transactions 28821931 28821931 28821931 + 1 l2_write_transactions L2 Write Transactions 36708360 36708360 36708360 + 1 tex_cache_transactions Texture Cache Transactions 0 0 0 + 1 dram_read_transactions Device Memory Read Transactions 2119263 2119263 2119263 + 1 dram_write_transactions Device Memory Write Transactions 32481421 32481421 32481421 + 1 l2_read_throughput L2 Throughput (Reads) 14.311GB/s 14.311GB/s 14.311GB/s + 1 l2_write_throughput L2 Throughput (Writes) 18.227GB/s 18.227GB/s 18.226GB/s + 1 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 sysmem_write_throughput System Memory Write Throughput 1.0410KB/s 1.0410KB/s 0.00000B/s + 1 cf_issued Issued Control-Flow Instructions 91083323 91083323 91083323 + 1 cf_executed Executed Control-Flow Instructions 91083323 91083323 91083323 + 1 ldst_issued Issued Load/Store Instructions 190796277 190796277 190796277 + 1 ldst_executed Executed Load/Store Instructions 189251540 189251540 189251540 + 1 l1_shared_utilization L1/Shared Memory Utilization Mid (4) Mid (4) Mid (4) + 1 l2_utilization L2 Cache Utilization Low (2) Low (2) Low (2) + 1 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) + 1 dram_utilization Device Memory Utilization Low (2) Low (2) Low (2) + 1 sysmem_utilization System Memory Utilization Low (1) Low (1) Low (1) + 1 ldst_fu_utilization Load/Store Function Unit Utilization Low (3) Low (3) Low (3) + 1 alu_fu_utilization Arithmetic Function Unit Utilization Mid (5) Mid (5) Mid (5) + 1 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) + 1 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) + 1 inst_issued Instructions Issued 822067279 822067279 822067279 + 1 issue_slots Issue Slots 642527129 642527129 642527129 + 1 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 28165672 28165672 28165672 + 1 l2_l1_write_transactions L2 Write Transactions (L1 write requests) 36708352 36708352 36708352 + 1 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 + 1 l2_l1_write_throughput L2 Throughput (L1 Writes) 18.227GB/s 18.227GB/s 18.226GB/s + 1 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 + 1 eligible_warps_per_cycle Eligible Warps Per Active Cycle 2.638510 2.638510 2.638510 + 1 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% + 1 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% + 1 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.16% 0.16% 0.16% + 1 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% + Kernel: void mtf_cuda_thread_by4(unsigned char const *, unsigned char*, int, int) + 1 l1_cache_global_hit_rate L1 Global Hit Rate 13.54% 13.54% 13.54% + 1 branch_efficiency Branch Efficiency 74.62% 74.62% 74.62% + 1 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% + 1 sm_efficiency Multiprocessor Activity 90.20% 90.20% 90.20% + 1 achieved_occupancy Achieved Occupancy 0.416923 0.416923 0.416923 + 1 gld_requested_throughput Requested Global Load Throughput 383.05MB/s 383.05MB/s 382.42MB/s + 1 gst_requested_throughput Requested Global Store Throughput 382.96MB/s 382.96MB/s 382.42MB/s + 1 ipc Executed IPC 0.487053 0.487053 0.487053 + 1 sm_efficiency_instance Multiprocessor Activity 90.20% 90.20% 90.20% + 1 ipc_instance Executed IPC 0.487053 0.487053 0.487053 + 1 inst_per_warp Instructions per warp 1.0480e+06 1.0480e+06 1.0480e+06 + 1 gld_transactions Global Load Transactions 37774340 37774340 37774340 + 1 gst_transactions Global Store Transactions 37765120 37765120 37765120 + 1 local_load_transactions Local Load Transactions 0 0 0 + 1 local_store_transactions Local Store Transactions 0 0 0 + 1 shared_load_transactions Shared Load Transactions 13503852 13503852 13503852 + 1 shared_store_transactions Shared Store Transactions 13508460 13508460 13508460 + 1 gld_transactions_per_request Global Load Transactions Per Request 3.471167 3.471167 3.471167 + 1 gst_transactions_per_request Global Store Transactions Per Request 3.469682 3.469682 3.469682 + 1 local_load_transactions_per_request Local Memory Load Transactions Per Request 0.000000 0.000000 0.000000 + 1 local_store_transactions_per_request Local Memory Store Transactions Per Request 0.000000 0.000000 0.000000 + 1 shared_load_transactions_per_request Shared Memory Load Transactions Per Request 0.977316 0.977316 0.977316 + 1 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 0.977331 0.977331 0.977331 + 1 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 shared_load_throughput Shared Memory Load Throughput 17.612GB/s 17.612GB/s 17.611GB/s + 1 shared_store_throughput Shared Memory Store Throughput 17.618GB/s 17.618GB/s 17.618GB/s + 1 shared_efficiency Shared Memory Efficiency 15.51% 15.51% 15.51% + 1 flop_count_sp Floating Point Operations(Single Precision) 0 0 0 + 1 flop_count_sp_add Floating Point Operations(Single Precision Add) 0 0 0 + 1 flop_count_sp_mul Floating Point Operation(Single Precision Mul) 0 0 0 + 1 flop_count_sp_fma Floating Point Operations(Single Precision FMA) 0 0 0 + 1 flop_count_dp Floating Point Operations(Double Precision) 0 0 0 + 1 flop_count_dp_add Floating Point Operations(Double Precision Add) 0 0 0 + 1 flop_count_dp_mul Floating Point Operations(Double Precision Mul) 0 0 0 + 1 flop_count_dp_fma Floating Point Operations(Double Precision FMA) 0 0 0 + 1 flop_count_sp_special Floating Point Operations(Single Precision Special) 0 0 0 + 1 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 7.67% 7.67% 7.67% + 1 stall_exec_dependency Issue Stall Reasons (Execution Dependency) 21.67% 21.67% 21.67% + 1 stall_memory_dependency Issue Stall Reasons (Data Request) 67.03% 67.03% 67.03% + 1 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% + 1 inst_executed Instructions Executed 297620701 297620701 297620701 + 1 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% + 1 stall_other Issue Stall Reasons (Other) 3.63% 3.63% 3.63% + 1 inst_fp_32 FP Instructions(Single) 0 0 0 + 1 inst_fp_64 FP Instructions(Double) 0 0 0 + 1 inst_integer Integer Instructions 809111344 809111344 809111344 + 1 inst_bit_convert Bit-Convert Instructions 778868645 778868645 778868645 + 1 inst_control Control-Flow Instructions 190952728 190952728 190952728 + 1 inst_compute_ld_st Load/Store Instructions 609420823 609420823 609420823 + 1 inst_misc Misc Instructions 316578862 316578862 316578862 + 1 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 + 1 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 + 1 atomic_transactions Atomic Transactions 0 0 0 + 1 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 + 1 inst_replay_overhead Instruction Replay Overhead 0.348337 0.348337 0.348337 + 1 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 + 1 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.110731 0.110731 0.110731 + 1 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% + 1 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 dram_read_throughput Device Memory Read Throughput 17.070GB/s 17.070GB/s 17.069GB/s + 1 dram_write_throughput Device Memory Write Throughput 11.747GB/s 11.747GB/s 11.747GB/s + 1 gst_throughput Global Store Throughput 11.968GB/s 11.968GB/s 11.967GB/s + 1 gld_throughput Global Load Throughput 49.266GB/s 49.266GB/s 49.265GB/s + 1 warp_execution_efficiency Warp Execution Efficiency 32.15% 32.15% 32.15% + 1 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 + 1 gld_efficiency Global Memory Load Efficiency 0.76% 0.76% 0.76% + 1 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% + 1 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 59.51% 59.51% 59.51% + 1 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% + 1 l2_l1_read_throughput L2 Throughput (L1 Reads) 41.298GB/s 41.298GB/s 41.298GB/s + 1 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% + 1 issued_ipc Issued IPC 0.656715 0.656715 0.656715 + 1 issue_slot_utilization Issue Slot Utilization 24.40% 24.40% 24.40% + 1 sysmem_read_transactions System Memory Read Transactions 0 0 0 + 1 sysmem_write_transactions System Memory Write Transactions 0 0 0 + 1 l2_read_transactions L2 Read Transactions 126458270 126458270 126458270 + 1 l2_write_transactions L2 Write Transactions 36704261 36704261 36704261 + 1 tex_cache_transactions Texture Cache Transactions 0 0 0 + 1 dram_read_transactions Device Memory Read Transactions 52352521 52352521 52352521 + 1 dram_write_transactions Device Memory Write Transactions 36027884 36027884 36027884 + 1 l2_read_throughput L2 Throughput (Reads) 41.232GB/s 41.232GB/s 41.232GB/s + 1 l2_write_throughput L2 Throughput (Writes) 11.968GB/s 11.968GB/s 11.967GB/s + 1 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 cf_issued Issued Control-Flow Instructions 60219487 60219487 60219487 + 1 cf_executed Executed Control-Flow Instructions 60219487 60219487 60219487 + 1 ldst_issued Issued Load/Store Instructions 152650147 152650147 152650147 + 1 ldst_executed Executed Load/Store Instructions 49405704 49405704 49405704 + 1 l1_shared_utilization L1/Shared Memory Utilization Low (2) Low (2) Low (2) + 1 l2_utilization L2 Cache Utilization Low (3) Low (3) Low (3) + 1 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) + 1 dram_utilization Device Memory Utilization Low (3) Low (3) Low (3) + 1 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) + 1 ldst_fu_utilization Load/Store Function Unit Utilization Low (2) Low (2) Low (2) + 1 alu_fu_utilization Arithmetic Function Unit Utilization Low (2) Low (2) Low (2) + 1 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) + 1 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) + 1 inst_issued Instructions Issued 401477808 401477808 401477808 + 1 issue_slots Issue Slots 298344894 298344894 298344894 + 1 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 126660836 126660836 126660836 + 1 l2_l1_write_transactions L2 Write Transactions (L1 write requests) 36704256 36704256 36704256 + 1 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 + 1 l2_l1_write_throughput L2 Throughput (L1 Writes) 11.968GB/s 11.968GB/s 11.967GB/s + 1 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 + 1 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.549387 0.549387 0.549387 + 1 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% + 1 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% + 1 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.00% 0.00% 0.00% + 1 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% + Kernel: void mtf_cuda_4by8(unsigned char const *, unsigned char*, int, int) + 1 l1_cache_global_hit_rate L1 Global Hit Rate 41.64% 41.64% 41.64% + 1 branch_efficiency Branch Efficiency 77.96% 77.96% 77.96% + 1 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% + 1 sm_efficiency Multiprocessor Activity 95.10% 95.10% 95.10% + 1 achieved_occupancy Achieved Occupancy 0.729576 0.729576 0.729576 + 1 gld_requested_throughput Requested Global Load Throughput 5.0889GB/s 5.0889GB/s 5.0887GB/s + 1 gst_requested_throughput Requested Global Store Throughput 5.0876GB/s 5.0876GB/s 5.0869GB/s + 1 ipc Executed IPC 1.729037 1.729037 1.729037 + 1 sm_efficiency_instance Multiprocessor Activity 95.10% 95.10% 95.10% + 1 ipc_instance Executed IPC 1.729037 1.729037 1.729037 + 1 inst_per_warp Instructions per warp 2.7730e+05 2.7730e+05 2.7730e+05 + 1 gld_transactions Global Load Transactions 37757952 37757952 37757952 + 1 gst_transactions Global Store Transactions 36716544 36716544 36716544 + 1 local_load_transactions Local Load Transactions 0 0 0 + 1 local_store_transactions Local Store Transactions 0 0 0 + 1 shared_load_transactions Shared Load Transactions 51280572 51280572 51280572 + 1 shared_store_transactions Shared Store Transactions 83927896 83927896 83927896 + 1 gld_transactions_per_request Global Load Transactions Per Request 2.669231 2.669231 2.669231 + 1 gst_transactions_per_request Global Store Transactions Per Request 2.595823 2.595823 2.595823 + 1 local_load_transactions_per_request Local Memory Load Transactions Per Request 0.000000 0.000000 0.000000 + 1 local_store_transactions_per_request Local Memory Store Transactions Per Request 0.000000 0.000000 0.000000 + 1 shared_load_transactions_per_request Shared Memory Load Transactions Per Request 2.052570 2.052570 2.052570 + 1 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 2.151085 2.151085 2.151085 + 1 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 shared_load_throughput Shared Memory Load Throughput 113.73GB/s 113.73GB/s 113.73GB/s + 1 shared_store_throughput Shared Memory Store Throughput 186.13GB/s 186.13GB/s 186.13GB/s + 1 shared_efficiency Shared Memory Efficiency 5.75% 5.75% 5.75% + 1 flop_count_sp Floating Point Operations(Single Precision) 0 0 0 + 1 flop_count_sp_add Floating Point Operations(Single Precision Add) 0 0 0 + 1 flop_count_sp_mul Floating Point Operation(Single Precision Mul) 0 0 0 + 1 flop_count_sp_fma Floating Point Operations(Single Precision FMA) 0 0 0 + 1 flop_count_dp Floating Point Operations(Double Precision) 0 0 0 + 1 flop_count_dp_add Floating Point Operations(Double Precision Add) 0 0 0 + 1 flop_count_dp_mul Floating Point Operations(Double Precision Mul) 0 0 0 + 1 flop_count_dp_fma Floating Point Operations(Double Precision FMA) 0 0 0 + 1 flop_count_sp_special Floating Point Operations(Single Precision Special) 0 0 0 + 1 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 69.54% 69.54% 69.54% + 1 stall_exec_dependency Issue Stall Reasons (Execution Dependency) 21.35% 21.35% 21.35% + 1 stall_memory_dependency Issue Stall Reasons (Data Request) 2.06% 2.06% 2.06% + 1 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% + 1 inst_executed Instructions Executed 623380211 623380211 623380211 + 1 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% + 1 stall_other Issue Stall Reasons (Other) 4.71% 4.71% 4.71% + 1 inst_fp_32 FP Instructions(Single) 0 0 0 + 1 inst_fp_64 FP Instructions(Double) 0 0 0 + 1 inst_integer Integer Instructions 6222351792 6222351792 6222351792 + 1 inst_bit_convert Bit-Convert Instructions 3129482600 3129482600 3129482600 + 1 inst_control Control-Flow Instructions 600596256 600596256 600596256 + 1 inst_compute_ld_st Load/Store Instructions 1581918969 1581918969 1581918969 + 1 inst_misc Misc Instructions 1230142023 1230142023 1230142023 + 1 inst_inter_thread_communication Inter-Thread Instructions 447079184 447079184 447079184 + 1 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 + 1 atomic_transactions Atomic Transactions 0 0 0 + 1 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 + 1 inst_replay_overhead Instruction Replay Overhead 0.198936 0.198936 0.198936 + 1 shared_replay_overhead Shared Memory Replay Overhead 0.112665 0.112665 0.112665 + 1 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.033682 0.033682 0.033682 + 1 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% + 1 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 dram_read_throughput Device Memory Read Throughput 1.2931GB/s 1.2931GB/s 1.2927GB/s + 1 dram_write_throughput Device Memory Write Throughput 18.333GB/s 18.333GB/s 18.332GB/s + 1 gst_throughput Global Store Throughput 20.351GB/s 20.351GB/s 20.350GB/s + 1 gld_throughput Global Load Throughput 83.739GB/s 83.739GB/s 83.739GB/s + 1 warp_execution_efficiency Warp Execution Efficiency 73.38% 73.38% 73.38% + 1 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 + 1 gld_efficiency Global Memory Load Efficiency 6.08% 6.08% 6.08% + 1 gst_efficiency Global Memory Store Efficiency 25.00% 25.00% 25.00% + 1 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 98.14% 98.14% 98.14% + 1 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% + 1 l2_l1_read_throughput L2 Throughput (L1 Reads) 47.294GB/s 47.294GB/s 47.293GB/s + 1 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% + 1 issued_ipc Issued IPC 2.073341 2.073341 2.073341 + 1 issue_slot_utilization Issue Slot Utilization 78.36% 78.36% 78.36% + 1 sysmem_read_transactions System Memory Read Transactions 0 0 0 + 1 sysmem_write_transactions System Memory Write Transactions 2 2 2 + 1 l2_read_transactions L2 Read Transactions 84870239 84870239 84870239 + 1 l2_write_transactions L2 Write Transactions 36704259 36704259 36704259 + 1 tex_cache_transactions Texture Cache Transactions 0 0 0 + 1 dram_read_transactions Device Memory Read Transactions 2332185 2332185 2332185 + 1 dram_write_transactions Device Memory Write Transactions 33065288 33065288 33065288 + 1 l2_read_throughput L2 Throughput (Reads) 47.056GB/s 47.056GB/s 47.056GB/s + 1 l2_write_throughput L2 Throughput (Writes) 20.351GB/s 20.351GB/s 20.350GB/s + 1 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 sysmem_write_throughput System Memory Write Throughput 1.1621KB/s 1.1621KB/s 0.00000B/s + 1 cf_issued Issued Control-Flow Instructions 87972397 87972397 87972397 + 1 cf_executed Executed Control-Flow Instructions 87972397 87972397 87972397 + 1 ldst_issued Issued Load/Store Instructions 216534031 216534031 216534031 + 1 ldst_executed Executed Load/Store Instructions 92290236 92290236 92290236 + 1 l1_shared_utilization L1/Shared Memory Utilization Mid (5) Mid (5) Mid (5) + 1 l2_utilization L2 Cache Utilization Mid (4) Mid (4) Mid (4) + 1 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) + 1 dram_utilization Device Memory Utilization Low (2) Low (2) Low (2) + 1 sysmem_utilization System Memory Utilization Low (1) Low (1) Low (1) + 1 ldst_fu_utilization Load/Store Function Unit Utilization Mid (4) Mid (4) Mid (4) + 1 alu_fu_utilization Arithmetic Function Unit Utilization Mid (6) Mid (6) Mid (6) + 1 cf_fu_utilization Control-Flow Function Unit Utilization Low (2) Low (2) Low (2) + 1 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) + 1 inst_issued Instructions Issued 747443700 747443700 747443700 + 1 issue_slots Issue Slots 564951281 564951281 564951281 + 1 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 85298784 85298784 85298784 + 1 l2_l1_write_transactions L2 Write Transactions (L1 write requests) 36704256 36704256 36704256 + 1 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 + 1 l2_l1_write_throughput L2 Throughput (L1 Writes) 20.351GB/s 20.351GB/s 20.350GB/s + 1 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 + 1 eligible_warps_per_cycle Eligible Warps Per Active Cycle 3.690186 3.690186 3.690186 + 1 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% + 1 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% + 1 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 2.34% 2.34% 2.34% + 1 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% + Kernel: void mtf_cuda_thread(unsigned char const *, unsigned char*, int, int) + 1 l1_cache_global_hit_rate L1 Global Hit Rate 18.58% 18.58% 18.58% + 1 branch_efficiency Branch Efficiency 72.33% 72.33% 72.33% + 1 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% + 1 sm_efficiency Multiprocessor Activity 85.11% 85.11% 85.11% + 1 achieved_occupancy Achieved Occupancy 0.407642 0.407642 0.407642 + 1 gld_requested_throughput Requested Global Load Throughput 380.03MB/s 380.03MB/s 379.56MB/s + 1 gst_requested_throughput Requested Global Store Throughput 379.94MB/s 379.94MB/s 379.56MB/s + 1 ipc Executed IPC 0.646515 0.646515 0.646515 + 1 sm_efficiency_instance Multiprocessor Activity 85.11% 85.11% 85.11% + 1 ipc_instance Executed IPC 0.646515 0.646515 0.646515 + 1 inst_per_warp Instructions per warp 1.2859e+06 1.2859e+06 1.2859e+06 + 1 gld_transactions Global Load Transactions 37757952 37757952 37757952 + 1 gst_transactions Global Store Transactions 35651584 35651584 35651584 + 1 local_load_transactions Local Load Transactions 0 0 0 + 1 local_store_transactions Local Store Transactions 0 0 0 + 1 shared_load_transactions Shared Load Transactions 10874500 10874500 10874500 + 1 shared_store_transactions Shared Store Transactions 19564656 19564656 19564656 + 1 gld_transactions_per_request Global Load Transactions Per Request 3.970228 3.970228 3.970228 + 1 gst_transactions_per_request Global Store Transactions Per Request 3.748078 3.748078 3.748078 + 1 local_load_transactions_per_request Local Memory Load Transactions Per Request 0.000000 0.000000 0.000000 + 1 local_store_transactions_per_request Local Memory Store Transactions Per Request 0.000000 0.000000 0.000000 + 1 shared_load_transactions_per_request Shared Memory Load Transactions Per Request 0.894327 0.894327 0.894327 + 1 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 0.902612 0.902612 0.902612 + 1 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 shared_load_throughput Shared Memory Load Throughput 14.071GB/s 14.071GB/s 14.070GB/s + 1 shared_store_throughput Shared Memory Store Throughput 25.315GB/s 25.315GB/s 25.314GB/s + 1 shared_efficiency Shared Memory Efficiency 12.83% 12.83% 12.83% + 1 flop_count_sp Floating Point Operations(Single Precision) 0 0 0 + 1 flop_count_sp_add Floating Point Operations(Single Precision Add) 0 0 0 + 1 flop_count_sp_mul Floating Point Operation(Single Precision Mul) 0 0 0 + 1 flop_count_sp_fma Floating Point Operations(Single Precision FMA) 0 0 0 + 1 flop_count_dp Floating Point Operations(Double Precision) 0 0 0 + 1 flop_count_dp_add Floating Point Operations(Double Precision Add) 0 0 0 + 1 flop_count_dp_mul Floating Point Operations(Double Precision Mul) 0 0 0 + 1 flop_count_dp_fma Floating Point Operations(Double Precision FMA) 0 0 0 + 1 flop_count_sp_special Floating Point Operations(Single Precision Special) 0 0 0 + 1 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 9.24% 9.24% 9.24% + 1 stall_exec_dependency Issue Stall Reasons (Execution Dependency) 27.65% 27.65% 27.65% + 1 stall_memory_dependency Issue Stall Reasons (Data Request) 58.30% 58.30% 58.30% + 1 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% + 1 inst_executed Instructions Executed 365209249 365209249 365209249 + 1 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% + 1 stall_other Issue Stall Reasons (Other) 4.80% 4.80% 4.80% + 1 inst_fp_32 FP Instructions(Single) 0 0 0 + 1 inst_fp_64 FP Instructions(Double) 0 0 0 + 1 inst_integer Integer Instructions 1912268312 1912268312 1912268312 + 1 inst_bit_convert Bit-Convert Instructions 1875491987 1875491987 1875491987 + 1 inst_control Control-Flow Instructions 499164806 499164806 499164806 + 1 inst_compute_ld_st Load/Store Instructions 572725528 572725528 572725528 + 1 inst_misc Misc Instructions 73579152 73579152 73579152 + 1 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 + 1 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 + 1 atomic_transactions Atomic Transactions 0 0 0 + 1 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 + 1 inst_replay_overhead Instruction Replay Overhead 0.303443 0.303443 0.303443 + 1 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 + 1 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.080036 0.080036 0.080036 + 1 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% + 1 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 dram_read_throughput Device Memory Read Throughput 13.632GB/s 13.632GB/s 13.631GB/s + 1 dram_write_throughput Device Memory Write Throughput 11.531GB/s 11.531GB/s 11.531GB/s + 1 gst_throughput Global Store Throughput 11.873GB/s 11.873GB/s 11.873GB/s + 1 gld_throughput Global Load Throughput 48.855GB/s 48.855GB/s 48.855GB/s + 1 warp_execution_efficiency Warp Execution Efficiency 45.32% 45.32% 45.32% + 1 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 + 1 gld_efficiency Global Memory Load Efficiency 0.76% 0.76% 0.76% + 1 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% + 1 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 66.32% 66.32% 66.32% + 1 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% + 1 l2_l1_read_throughput L2 Throughput (L1 Reads) 39.263GB/s 39.263GB/s 39.263GB/s + 1 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% + 1 issued_ipc Issued IPC 0.835217 0.835217 0.835217 + 1 issue_slot_utilization Issue Slot Utilization 31.98% 31.98% 31.98% + 1 sysmem_read_transactions System Memory Read Transactions 0 0 0 + 1 sysmem_write_transactions System Memory Write Transactions 2 2 2 + 1 l2_read_transactions L2 Read Transactions 120170857 120170857 120170857 + 1 l2_write_transactions L2 Write Transactions 36704272 36704272 36704272 + 1 tex_cache_transactions Texture Cache Transactions 0 0 0 + 1 dram_read_transactions Device Memory Read Transactions 42141014 42141014 42141014 + 1 dram_write_transactions Device Memory Write Transactions 35647242 35647242 35647242 + 1 l2_read_throughput L2 Throughput (Reads) 38.873GB/s 38.873GB/s 38.872GB/s + 1 l2_write_throughput L2 Throughput (Writes) 11.873GB/s 11.873GB/s 11.873GB/s + 1 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 sysmem_write_throughput System Memory Write Throughput 694.000B/s 694.000B/s 0.00000B/s + 1 cf_issued Issued Control-Flow Instructions 46297675 46297675 46297675 + 1 cf_executed Executed Control-Flow Instructions 46297675 46297675 46297675 + 1 ldst_issued Issued Load/Store Instructions 163003546 163003546 163003546 + 1 ldst_executed Executed Load/Store Instructions 52857250 52857250 52857250 + 1 l1_shared_utilization L1/Shared Memory Utilization Low (2) Low (2) Low (2) + 1 l2_utilization L2 Cache Utilization Low (3) Low (3) Low (3) + 1 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) + 1 dram_utilization Device Memory Utilization Low (3) Low (3) Low (3) + 1 sysmem_utilization System Memory Utilization Low (1) Low (1) Low (1) + 1 ldst_fu_utilization Load/Store Function Unit Utilization Low (2) Low (2) Low (2) + 1 alu_fu_utilization Arithmetic Function Unit Utilization Low (3) Low (3) Low (3) + 1 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) + 1 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) + 1 inst_issued Instructions Issued 475678060 475678060 475678060 + 1 issue_slots Issue Slots 364269395 364269395 364269395 + 1 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 121378640 121378640 121378640 + 1 l2_l1_write_transactions L2 Write Transactions (L1 write requests) 36704256 36704256 36704256 + 1 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 + 1 l2_l1_write_throughput L2 Throughput (L1 Writes) 11.873GB/s 11.873GB/s 11.873GB/s + 1 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 + 1 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.777966 0.777966 0.777966 + 1 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% + 1 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% + 1 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.00% 0.00% 0.00% + 1 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% + Kernel: void mtf_cuda_thread_by4(unsigned char const *, unsigned char*, int, int) + 1 l1_cache_global_hit_rate L1 Global Hit Rate 48.88% 48.88% 48.88% + 1 branch_efficiency Branch Efficiency 73.21% 73.21% 73.21% + 1 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% + 1 sm_efficiency Multiprocessor Activity 91.38% 91.38% 91.38% + 1 achieved_occupancy Achieved Occupancy 0.116559 0.116559 0.116559 + 1 gld_requested_throughput Requested Global Load Throughput 400.84MB/s 400.84MB/s 400.54MB/s + 1 gst_requested_throughput Requested Global Store Throughput 400.74MB/s 400.74MB/s 400.54MB/s + 1 ipc Executed IPC 0.495656 0.495656 0.495656 + 1 sm_efficiency_instance Multiprocessor Activity 91.38% 91.38% 91.38% + 1 ipc_instance Executed IPC 0.495656 0.495656 0.495656 + 1 inst_per_warp Instructions per warp 1.0213e+06 1.0213e+06 1.0213e+06 + 1 gld_transactions Global Load Transactions 37233536 37233536 37233536 + 1 gst_transactions Global Store Transactions 37224448 37224448 37224448 + 1 local_load_transactions Local Load Transactions 0 0 0 + 1 local_store_transactions Local Store Transactions 0 0 0 + 1 shared_load_transactions Shared Load Transactions 15597040 15597040 15597040 + 1 shared_store_transactions Shared Store Transactions 15615216 15615216 15615216 + 1 gld_transactions_per_request Global Load Transactions Per Request 3.351022 3.351022 3.351022 + 1 gst_transactions_per_request Global Store Transactions Per Request 3.349599 3.349599 3.349599 + 1 local_load_transactions_per_request Local Memory Load Transactions Per Request 0.000000 0.000000 0.000000 + 1 local_store_transactions_per_request Local Memory Store Transactions Per Request 0.000000 0.000000 0.000000 + 1 shared_load_transactions_per_request Shared Memory Load Transactions Per Request 1.038220 1.038220 1.038220 + 1 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 1.038187 1.038187 1.038187 + 1 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 shared_load_throughput Shared Memory Load Throughput 21.286GB/s 21.286GB/s 21.286GB/s + 1 shared_store_throughput Shared Memory Store Throughput 21.311GB/s 21.311GB/s 21.311GB/s + 1 shared_efficiency Shared Memory Efficiency 14.49% 14.49% 14.49% + 1 flop_count_sp Floating Point Operations(Single Precision) 0 0 0 + 1 flop_count_sp_add Floating Point Operations(Single Precision Add) 0 0 0 + 1 flop_count_sp_mul Floating Point Operation(Single Precision Mul) 0 0 0 + 1 flop_count_sp_fma Floating Point Operations(Single Precision FMA) 0 0 0 + 1 flop_count_dp Floating Point Operations(Double Precision) 0 0 0 + 1 flop_count_dp_add Floating Point Operations(Double Precision Add) 0 0 0 + 1 flop_count_dp_mul Floating Point Operations(Double Precision Mul) 0 0 0 + 1 flop_count_dp_fma Floating Point Operations(Double Precision FMA) 0 0 0 + 1 flop_count_sp_special Floating Point Operations(Single Precision Special) 0 0 0 + 1 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 31.02% 31.02% 31.02% + 1 stall_exec_dependency Issue Stall Reasons (Execution Dependency) 52.40% 52.40% 52.40% + 1 stall_memory_dependency Issue Stall Reasons (Data Request) 4.77% 4.77% 4.77% + 1 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% + 1 inst_executed Instructions Executed 286973422 286973422 286973422 + 1 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% + 1 stall_other Issue Stall Reasons (Other) 11.80% 11.80% 11.80% + 1 inst_fp_32 FP Instructions(Single) 0 0 0 + 1 inst_fp_64 FP Instructions(Double) 0 0 0 + 1 inst_integer Integer Instructions 809374727 809374727 809374727 + 1 inst_bit_convert Bit-Convert Instructions 834693512 834693512 834693512 + 1 inst_control Control-Flow Instructions 202146652 202146652 202146652 + 1 inst_compute_ld_st Load/Store Instructions 650399987 650399987 650399987 + 1 inst_misc Misc Instructions 196808082 196808082 196808082 + 1 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 + 1 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 + 1 atomic_transactions Atomic Transactions 0 0 0 + 1 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 + 1 inst_replay_overhead Instruction Replay Overhead 0.179612 0.179612 0.179612 + 1 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 + 1 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.066226 0.066226 0.066226 + 1 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% + 1 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 dram_read_throughput Device Memory Read Throughput 972.43MB/s 972.43MB/s 971.79MB/s + 1 dram_write_throughput Device Memory Write Throughput 12.391GB/s 12.391GB/s 12.390GB/s + 1 gst_throughput Global Store Throughput 12.523GB/s 12.523GB/s 12.523GB/s + 1 gld_throughput Global Load Throughput 50.815GB/s 50.815GB/s 50.815GB/s + 1 warp_execution_efficiency Warp Execution Efficiency 33.02% 33.02% 33.02% + 1 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 + 1 gld_efficiency Global Memory Load Efficiency 0.77% 0.77% 0.77% + 1 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% + 1 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 98.33% 98.33% 98.33% + 1 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% + 1 l2_l1_read_throughput L2 Throughput (L1 Reads) 25.940GB/s 25.940GB/s 25.939GB/s + 1 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% + 1 issued_ipc Issued IPC 0.588104 0.588104 0.588104 + 1 issue_slot_utilization Issue Slot Utilization 21.65% 21.65% 21.65% + 1 sysmem_read_transactions System Memory Read Transactions 0 0 0 + 1 sysmem_write_transactions System Memory Write Transactions 2 2 2 + 1 l2_read_transactions L2 Read Transactions 76829517 76829517 76829517 + 1 l2_write_transactions L2 Write Transactions 36704263 36704263 36704263 + 1 tex_cache_transactions Texture Cache Transactions 0 0 0 + 1 dram_read_transactions Device Memory Read Transactions 2783284 2783284 2783284 + 1 dram_write_transactions Device Memory Write Transactions 36317162 36317162 36317162 + 1 l2_read_throughput L2 Throughput (Reads) 26.214GB/s 26.214GB/s 26.213GB/s + 1 l2_write_throughput L2 Throughput (Writes) 12.523GB/s 12.523GB/s 12.523GB/s + 1 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 sysmem_write_throughput System Memory Write Throughput 732.000B/s 732.000B/s 0.00000B/s + 1 cf_issued Issued Control-Flow Instructions 44842775 44842775 44842775 + 1 cf_executed Executed Control-Flow Instructions 44842775 44842775 44842775 + 1 ldst_issued Issued Load/Store Instructions 103826366 103826366 103826366 + 1 ldst_executed Executed Load/Store Instructions 52287907 52287907 52287907 + 1 l1_shared_utilization L1/Shared Memory Utilization Low (2) Low (2) Low (2) + 1 l2_utilization L2 Cache Utilization Low (2) Low (2) Low (2) + 1 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) + 1 dram_utilization Device Memory Utilization Low (2) Low (2) Low (2) + 1 sysmem_utilization System Memory Utilization Low (1) Low (1) Low (1) + 1 ldst_fu_utilization Load/Store Function Unit Utilization Low (1) Low (1) Low (1) + 1 alu_fu_utilization Arithmetic Function Unit Utilization Low (2) Low (2) Low (2) + 1 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) + 1 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) + 1 inst_issued Instructions Issued 338509558 338509558 338509558 + 1 issue_slots Issue Slots 249202145 249202145 249202145 + 1 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 76027132 76027132 76027132 + 1 l2_l1_write_transactions L2 Write Transactions (L1 write requests) 36704256 36704256 36704256 + 1 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 + 1 l2_l1_write_throughput L2 Throughput (L1 Writes) 12.523GB/s 12.523GB/s 12.523GB/s + 1 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 + 1 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.464800 0.464800 0.464800 + 1 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% + 1 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% + 1 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.00% 0.00% 0.00% + 1 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% + Kernel: void mtf_cuda_2buffers_depth32(unsigned char const *, unsigned char*, int, int) + 1 l1_cache_global_hit_rate L1 Global Hit Rate 80.68% 80.68% 80.68% + 1 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% + 1 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% + 1 sm_efficiency Multiprocessor Activity 99.51% 99.51% 99.51% + 1 achieved_occupancy Achieved Occupancy 0.647691 0.647691 0.647691 + 1 gld_requested_throughput Requested Global Load Throughput 23.706GB/s 23.706GB/s 23.706GB/s + 1 gst_requested_throughput Requested Global Store Throughput 23.700GB/s 23.700GB/s 23.700GB/s + 1 ipc Executed IPC 2.402347 2.402347 2.402347 + 1 sm_efficiency_instance Multiprocessor Activity 99.51% 99.51% 99.51% + 1 ipc_instance Executed IPC 2.402347 2.402347 2.402347 + 1 inst_per_warp Instructions per warp 1.7732e+05 1.7732e+05 1.7732e+05 + 1 gld_transactions Global Load Transactions 36709120 36709120 36709120 + 1 gst_transactions Global Store Transactions 36700160 36700160 36700160 + 1 local_load_transactions Local Load Transactions 0 0 0 + 1 local_store_transactions Local Store Transactions 0 0 0 + 1 shared_load_transactions Shared Load Transactions 36831232 36831232 36831232 + 1 shared_store_transactions Shared Store Transactions 73671452 73671452 73671452 + 1 gld_transactions_per_request Global Load Transactions Per Request 0.999777 0.999777 0.999777 + 1 gst_transactions_per_request Global Store Transactions Per Request 0.999777 0.999777 0.999777 + 1 local_load_transactions_per_request Local Memory Load Transactions Per Request 0.000000 0.000000 0.000000 + 1 local_store_transactions_per_request Local Memory Store Transactions Per Request 0.000000 0.000000 0.000000 + 1 shared_load_transactions_per_request Shared Memory Load Transactions Per Request 1.003347 1.003347 1.003347 + 1 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 1.003410 1.003410 1.003410 + 1 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 shared_load_throughput Shared Memory Load Throughput 95.119GB/s 95.119GB/s 95.119GB/s + 1 shared_store_throughput Shared Memory Store Throughput 190.26GB/s 190.26GB/s 190.26GB/s + 1 shared_efficiency Shared Memory Efficiency 72.42% 72.42% 72.42% + 1 flop_count_sp Floating Point Operations(Single Precision) 0 0 0 + 1 flop_count_sp_add Floating Point Operations(Single Precision Add) 0 0 0 + 1 flop_count_sp_mul Floating Point Operation(Single Precision Mul) 0 0 0 + 1 flop_count_sp_fma Floating Point Operations(Single Precision FMA) 0 0 0 + 1 flop_count_dp Floating Point Operations(Double Precision) 0 0 0 + 1 flop_count_dp_add Floating Point Operations(Double Precision Add) 0 0 0 + 1 flop_count_dp_mul Floating Point Operations(Double Precision Mul) 0 0 0 + 1 flop_count_dp_fma Floating Point Operations(Double Precision FMA) 0 0 0 + 1 flop_count_sp_special Floating Point Operations(Single Precision Special) 0 0 0 + 1 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 45.35% 45.35% 45.35% + 1 stall_exec_dependency Issue Stall Reasons (Execution Dependency) 37.86% 37.86% 37.86% + 1 stall_memory_dependency Issue Stall Reasons (Data Request) 0.06% 0.06% 0.06% + 1 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% + 1 inst_executed Instructions Executed 795086262 795086262 795086262 + 1 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% + 1 stall_other Issue Stall Reasons (Other) 7.05% 7.05% 7.05% + 1 inst_fp_32 FP Instructions(Single) 0 0 0 + 1 inst_fp_64 FP Instructions(Double) 0 0 0 + 1 inst_integer Integer Instructions 1.2483e+10 1.2483e+10 1.2483e+10 + 1 inst_bit_convert Bit-Convert Instructions 2349334528 2349334528 2349334528 + 1 inst_control Control-Flow Instructions 624185472 624185472 624185472 + 1 inst_compute_ld_st Load/Store Instructions 5497762341 5497762341 5497762341 + 1 inst_misc Misc Instructions 2349908384 2349908384 2349908384 + 1 inst_inter_thread_communication Inter-Thread Instructions 1174667264 1174667264 1174667264 + 1 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 + 1 atomic_transactions Atomic Transactions 0 0 0 + 1 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 + 1 inst_replay_overhead Instruction Replay Overhead 0.000242 0.000242 0.000242 + 1 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 + 1 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.009719 0.009719 0.009719 + 1 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% + 1 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 dram_read_throughput Device Memory Read Throughput 1.2304GB/s 1.2304GB/s 1.2303GB/s + 1 dram_write_throughput Device Memory Write Throughput 18.449GB/s 18.449GB/s 18.449GB/s + 1 gst_throughput Global Store Throughput 23.700GB/s 23.700GB/s 23.700GB/s + 1 gld_throughput Global Load Throughput 94.803GB/s 94.803GB/s 94.803GB/s + 1 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00% + 1 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 + 1 gld_efficiency Global Memory Load Efficiency 25.01% 25.01% 25.01% + 1 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00% + 1 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 100.00% 100.00% 100.00% + 1 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% + 1 l2_l1_read_throughput L2 Throughput (L1 Reads) 20.278GB/s 20.278GB/s 20.278GB/s + 1 local_memory_overhead Local Memory Overhead 4.47% 4.47% 4.47% + 1 issued_ipc Issued IPC 2.403957 2.403957 2.403957 + 1 issue_slot_utilization Issue Slot Utilization 85.52% 85.52% 85.52% + 1 sysmem_read_transactions System Memory Read Transactions 0 0 0 + 1 sysmem_write_transactions System Memory Write Transactions 1 1 1 + 1 l2_read_transactions L2 Read Transactions 30610463 30610463 30610463 + 1 l2_write_transactions L2 Write Transactions 36708355 36708355 36708355 + 1 tex_cache_transactions Texture Cache Transactions 0 0 0 + 1 dram_read_transactions Device Memory Read Transactions 1905769 1905769 1905769 + 1 dram_write_transactions Device Memory Write Transactions 28574704 28574704 28574704 + 1 l2_read_throughput L2 Throughput (Reads) 19.763GB/s 19.763GB/s 19.763GB/s + 1 l2_write_throughput L2 Throughput (Writes) 23.700GB/s 23.700GB/s 23.700GB/s + 1 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 sysmem_write_throughput System Memory Write Throughput 693.000B/s 693.000B/s 0.00000B/s + 1 cf_issued Issued Control-Flow Instructions 37873415 37873415 37873415 + 1 cf_executed Executed Control-Flow Instructions 37873415 37873415 37873415 + 1 ldst_issued Issued Load/Store Instructions 183711864 183711864 183711864 + 1 ldst_executed Executed Load/Store Instructions 183559684 183559684 183559684 + 1 l1_shared_utilization L1/Shared Memory Utilization Mid (5) Mid (5) Mid (5) + 1 l2_utilization L2 Cache Utilization Low (3) Low (3) Low (3) + 1 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) + 1 dram_utilization Device Memory Utilization Low (2) Low (2) Low (2) + 1 sysmem_utilization System Memory Utilization Low (1) Low (1) Low (1) + 1 ldst_fu_utilization Load/Store Function Unit Utilization Low (3) Low (3) Low (3) + 1 alu_fu_utilization Arithmetic Function Unit Utilization High (7) High (7) High (7) + 1 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) + 1 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) + 1 inst_issued Instructions Issued 795289381 795289381 795289381 + 1 issue_slots Issue Slots 565826071 565826071 565826071 + 1 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 31408388 31408388 31408388 + 1 l2_l1_write_transactions L2 Write Transactions (L1 write requests) 36708352 36708352 36708352 + 1 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 + 1 l2_l1_write_throughput L2 Throughput (L1 Writes) 23.700GB/s 23.700GB/s 23.700GB/s + 1 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 + 1 eligible_warps_per_cycle Eligible Warps Per Active Cycle 5.022511 5.022511 5.022511 + 1 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% + 1 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% + 1 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 9.68% 9.68% 9.68% + 1 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% + Kernel: void mtf_cuda_2buffers_depth32(unsigned char const *, unsigned char*, int, int) + 1 l1_cache_global_hit_rate L1 Global Hit Rate 69.14% 69.14% 69.14% + 1 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% + 1 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% + 1 sm_efficiency Multiprocessor Activity 99.29% 99.29% 99.29% + 1 achieved_occupancy Achieved Occupancy 0.640721 0.640721 0.640721 + 1 gld_requested_throughput Requested Global Load Throughput 23.435GB/s 23.435GB/s 23.435GB/s + 1 gst_requested_throughput Requested Global Store Throughput 23.429GB/s 23.429GB/s 23.428GB/s + 1 ipc Executed IPC 2.290456 2.290456 2.290456 + 1 sm_efficiency_instance Multiprocessor Activity 99.29% 99.29% 99.29% + 1 ipc_instance Executed IPC 2.290456 2.290456 2.290456 + 1 inst_per_warp Instructions per warp 2.5569e+05 2.5569e+05 2.5569e+05 + 1 gld_transactions Global Load Transactions 36725508 36725508 36725508 + 1 gst_transactions Global Store Transactions 36569088 36569088 36569088 + 1 local_load_transactions Local Load Transactions 0 0 0 + 1 local_store_transactions Local Store Transactions 0 0 0 + 1 shared_load_transactions Shared Load Transactions 36569088 36569088 36569088 + 1 shared_store_transactions Shared Store Transactions 73147104 73147104 73147104 + 1 gld_transactions_per_request Global Load Transactions Per Request 1.000335 1.000335 1.000335 + 1 gst_transactions_per_request Global Store Transactions Per Request 0.996317 0.996317 0.996317 + 1 local_load_transactions_per_request Local Memory Load Transactions Per Request 0.000000 0.000000 0.000000 + 1 local_store_transactions_per_request Local Memory Store Transactions Per Request 0.000000 0.000000 0.000000 + 1 shared_load_transactions_per_request Shared Memory Load Transactions Per Request 0.996317 0.996317 0.996317 + 1 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 0.996324 0.996324 0.996324 + 1 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 shared_load_throughput Shared Memory Load Throughput 93.372GB/s 93.372GB/s 93.372GB/s + 1 shared_store_throughput Shared Memory Store Throughput 186.77GB/s 186.77GB/s 186.77GB/s + 1 shared_efficiency Shared Memory Efficiency 72.93% 72.93% 72.93% + 1 flop_count_sp Floating Point Operations(Single Precision) 0 0 0 + 1 flop_count_sp_add Floating Point Operations(Single Precision Add) 0 0 0 + 1 flop_count_sp_mul Floating Point Operation(Single Precision Mul) 0 0 0 + 1 flop_count_sp_fma Floating Point Operations(Single Precision FMA) 0 0 0 + 1 flop_count_dp Floating Point Operations(Double Precision) 0 0 0 + 1 flop_count_dp_add Floating Point Operations(Double Precision Add) 0 0 0 + 1 flop_count_dp_mul Floating Point Operations(Double Precision Mul) 0 0 0 + 1 flop_count_dp_fma Floating Point Operations(Double Precision FMA) 0 0 0 + 1 flop_count_sp_special Floating Point Operations(Single Precision Special) 0 0 0 + 1 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 47.54% 47.54% 47.54% + 1 stall_exec_dependency Issue Stall Reasons (Execution Dependency) 35.70% 35.70% 35.70% + 1 stall_memory_dependency Issue Stall Reasons (Data Request) 0.19% 0.19% 0.19% + 1 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% + 1 inst_executed Instructions Executed 764005908 764005908 764005908 + 1 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% + 1 stall_other Issue Stall Reasons (Other) 7.60% 7.60% 7.60% + 1 inst_fp_32 FP Instructions(Single) 0 0 0 + 1 inst_fp_64 FP Instructions(Double) 0 0 0 + 1 inst_integer Integer Instructions 1.1894e+10 1.1894e+10 1.1894e+10 + 1 inst_bit_convert Bit-Convert Instructions 2349072384 2349072384 2349072384 + 1 inst_control Control-Flow Instructions 416077184 416077184 416077184 + 1 inst_compute_ld_st Load/Store Instructions 5301368613 5301368613 5301368613 + 1 inst_misc Misc Instructions 2349645984 2349645984 2349645984 + 1 inst_inter_thread_communication Inter-Thread Instructions 1174536192 1174536192 1174536192 + 1 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 + 1 atomic_transactions Atomic Transactions 0 0 0 + 1 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 + 1 inst_replay_overhead Instruction Replay Overhead 0.000475 0.000475 0.000475 + 1 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 + 1 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.015250 0.015250 0.015250 + 1 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% + 1 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 dram_read_throughput Device Memory Read Throughput 1.2196GB/s 1.2196GB/s 1.2191GB/s + 1 dram_write_throughput Device Memory Write Throughput 20.290GB/s 20.290GB/s 20.289GB/s + 1 gst_throughput Global Store Throughput 23.429GB/s 23.429GB/s 23.428GB/s + 1 gld_throughput Global Load Throughput 93.771GB/s 93.771GB/s 93.770GB/s + 1 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00% + 1 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 + 1 gld_efficiency Global Memory Load Efficiency 24.99% 24.99% 24.99% + 1 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00% + 1 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 100.00% 100.00% 100.00% + 1 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% + 1 l2_l1_read_throughput L2 Throughput (L1 Reads) 30.186GB/s 30.186GB/s 30.185GB/s + 1 local_memory_overhead Local Memory Overhead 2.33% 2.33% 2.33% + 1 issued_ipc Issued IPC 2.288490 2.288490 2.288490 + 1 issue_slot_utilization Issue Slot Utilization 86.60% 86.60% 86.60% + 1 sysmem_read_transactions System Memory Read Transactions 0 0 0 + 1 sysmem_write_transactions System Memory Write Transactions 0 0 0 + 1 l2_read_transactions L2 Read Transactions 47919723 47919723 47919723 + 1 l2_write_transactions L2 Write Transactions 36704259 36704259 36704259 + 1 tex_cache_transactions Texture Cache Transactions 0 0 0 + 1 dram_read_transactions Device Memory Read Transactions 1910640 1910640 1910640 + 1 dram_write_transactions Device Memory Write Transactions 31785646 31785646 31785646 + 1 l2_read_throughput L2 Throughput (Reads) 30.588GB/s 30.588GB/s 30.587GB/s + 1 l2_write_throughput L2 Throughput (Writes) 23.429GB/s 23.429GB/s 23.428GB/s + 1 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 cf_issued Issued Control-Flow Instructions 25252099 25252099 25252099 + 1 cf_executed Executed Control-Flow Instructions 25252099 25252099 25252099 + 1 ldst_issued Issued Load/Store Instructions 183880004 183880004 183880004 + 1 ldst_executed Executed Load/Store Instructions 183539202 183539202 183539202 + 1 l1_shared_utilization L1/Shared Memory Utilization Mid (5) Mid (5) Mid (5) + 1 l2_utilization L2 Cache Utilization Low (3) Low (3) Low (3) + 1 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) + 1 dram_utilization Device Memory Utilization Low (2) Low (2) Low (2) + 1 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) + 1 ldst_fu_utilization Load/Store Function Unit Utilization Low (3) Low (3) Low (3) + 1 alu_fu_utilization Arithmetic Function Unit Utilization High (7) High (7) High (7) + 1 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) + 1 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) + 1 inst_issued Instructions Issued 764378379 764378379 764378379 + 1 issue_slots Issue Slots 578535942 578535942 578535942 + 1 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 47288924 47288924 47288924 + 1 l2_l1_write_transactions L2 Write Transactions (L1 write requests) 36704256 36704256 36704256 + 1 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 + 1 l2_l1_write_throughput L2 Throughput (L1 Writes) 23.429GB/s 23.429GB/s 23.428GB/s + 1 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 + 1 eligible_warps_per_cycle Eligible Warps Per Active Cycle 5.074431 5.074431 5.074431 + 1 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% + 1 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% + 1 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 8.97% 8.97% 8.97% + 1 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% + Kernel: void mtf_cuda_thread(unsigned char const *, unsigned char*, int, int) + 1 l1_cache_global_hit_rate L1 Global Hit Rate 60.55% 60.55% 60.55% + 1 branch_efficiency Branch Efficiency 72.94% 72.94% 72.94% + 1 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% + 1 sm_efficiency Multiprocessor Activity 89.80% 89.80% 89.80% + 1 achieved_occupancy Achieved Occupancy 0.113808 0.113808 0.113808 + 1 gld_requested_throughput Requested Global Load Throughput 294.70MB/s 294.70MB/s 294.69MB/s + 1 gst_requested_throughput Requested Global Store Throughput 294.63MB/s 294.63MB/s 293.73MB/s + 1 ipc Executed IPC 0.480226 0.480226 0.480226 + 1 sm_efficiency_instance Multiprocessor Activity 89.80% 89.80% 89.80% + 1 ipc_instance Executed IPC 0.480226 0.480226 0.480226 + 1 inst_per_warp Instructions per warp 1.3148e+06 1.3148e+06 1.3148e+06 + 1 gld_transactions Global Load Transactions 37249924 37249924 37249924 + 1 gst_transactions Global Store Transactions 37240832 37240832 37240832 + 1 local_load_transactions Local Load Transactions 0 0 0 + 1 local_store_transactions Local Store Transactions 0 0 0 + 1 shared_load_transactions Shared Load Transactions 13646164 13646164 13646164 + 1 shared_store_transactions Shared Store Transactions 24275828 24275828 24275828 + 1 gld_transactions_per_request Global Load Transactions Per Request 3.678756 3.678756 3.678756 + 1 gst_transactions_per_request Global Store Transactions Per Request 3.677219 3.677219 3.677219 + 1 local_load_transactions_per_request Local Memory Load Transactions Per Request 0.000000 0.000000 0.000000 + 1 local_store_transactions_per_request Local Memory Store Transactions Per Request 0.000000 0.000000 0.000000 + 1 shared_load_transactions_per_request Shared Memory Load Transactions Per Request 1.023684 1.023684 1.023684 + 1 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 1.034088 1.034088 1.034088 + 1 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 shared_load_throughput Shared Memory Load Throughput 13.692GB/s 13.692GB/s 13.692GB/s + 1 shared_store_throughput Shared Memory Store Throughput 24.358GB/s 24.358GB/s 24.358GB/s + 1 shared_efficiency Shared Memory Efficiency 11.17% 11.17% 11.17% + 1 flop_count_sp Floating Point Operations(Single Precision) 0 0 0 + 1 flop_count_sp_add Floating Point Operations(Single Precision Add) 0 0 0 + 1 flop_count_sp_mul Floating Point Operation(Single Precision Mul) 0 0 0 + 1 flop_count_sp_fma Floating Point Operations(Single Precision FMA) 0 0 0 + 1 flop_count_dp Floating Point Operations(Double Precision) 0 0 0 + 1 flop_count_dp_add Floating Point Operations(Double Precision Add) 0 0 0 + 1 flop_count_dp_mul Floating Point Operations(Double Precision Mul) 0 0 0 + 1 flop_count_dp_fma Floating Point Operations(Double Precision FMA) 0 0 0 + 1 flop_count_sp_special Floating Point Operations(Single Precision Special) 0 0 0 + 1 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 28.17% 28.17% 28.17% + 1 stall_exec_dependency Issue Stall Reasons (Execution Dependency) 55.77% 55.77% 55.77% + 1 stall_memory_dependency Issue Stall Reasons (Data Request) 3.36% 3.36% 3.36% + 1 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% + 1 inst_executed Instructions Executed 369460401 369460401 369460401 + 1 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% + 1 stall_other Issue Stall Reasons (Other) 12.70% 12.70% 12.70% + 1 inst_fp_32 FP Instructions(Single) 0 0 0 + 1 inst_fp_64 FP Instructions(Double) 0 0 0 + 1 inst_integer Integer Instructions 1765985145 1765985145 1765985145 + 1 inst_bit_convert Bit-Convert Instructions 1765913364 1765913364 1765913364 + 1 inst_control Control-Flow Instructions 539713746 539713746 539713746 + 1 inst_compute_ld_st Load/Store Instructions 613704692 613704692 613704692 + 1 inst_misc Misc Instructions 74008992 74008992 74008992 + 1 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 + 1 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 + 1 atomic_transactions Atomic Transactions 0 0 0 + 1 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 + 1 inst_replay_overhead Instruction Replay Overhead 0.144397 0.144397 0.144397 + 1 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 + 1 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.039923 0.039923 0.039923 + 1 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% + 1 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 dram_read_throughput Device Memory Read Throughput 808.92MB/s 808.92MB/s 808.72MB/s + 1 dram_write_throughput Device Memory Write Throughput 9.1937GB/s 9.1937GB/s 9.1931GB/s + 1 gst_throughput Global Store Throughput 9.2071GB/s 9.2071GB/s 9.2071GB/s + 1 gld_throughput Global Load Throughput 37.376GB/s 37.376GB/s 37.376GB/s + 1 warp_execution_efficiency Warp Execution Efficiency 43.32% 43.32% 43.32% + 1 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 + 1 gld_efficiency Global Memory Load Efficiency 0.77% 0.77% 0.77% + 1 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% + 1 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 98.09% 98.09% 98.09% + 1 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% + 1 l2_l1_read_throughput L2 Throughput (L1 Reads) 15.255GB/s 15.255GB/s 15.255GB/s + 1 local_memory_overhead Local Memory Overhead 2.09% 2.09% 2.09% + 1 issued_ipc Issued IPC 0.556291 0.556291 0.556291 + 1 issue_slot_utilization Issue Slot Utilization 20.74% 20.74% 20.74% + 1 sysmem_read_transactions System Memory Read Transactions 0 0 0 + 1 sysmem_write_transactions System Memory Write Transactions 0 0 0 + 1 l2_read_transactions L2 Read Transactions 61039573 61039573 61039573 + 1 l2_write_transactions L2 Write Transactions 36704276 36704276 36704276 + 1 tex_cache_transactions Texture Cache Transactions 0 0 0 + 1 dram_read_transactions Device Memory Read Transactions 3149175 3149175 3149175 + 1 dram_write_transactions Device Memory Write Transactions 36650507 36650507 36650507 + 1 l2_read_throughput L2 Throughput (Reads) 15.312GB/s 15.312GB/s 15.311GB/s + 1 l2_write_throughput L2 Throughput (Writes) 9.2072GB/s 9.2072GB/s 9.2071GB/s + 1 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 cf_issued Issued Control-Flow Instructions 50466752 50466752 50466752 + 1 cf_executed Executed Control-Flow Instructions 50466752 50466752 50466752 + 1 ldst_issued Issued Load/Store Instructions 110402797 110402797 110402797 + 1 ldst_executed Executed Load/Store Instructions 57059165 57059165 57059165 + 1 l1_shared_utilization L1/Shared Memory Utilization Low (1) Low (1) Low (1) + 1 l2_utilization L2 Cache Utilization Low (2) Low (2) Low (2) + 1 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) + 1 dram_utilization Device Memory Utilization Low (1) Low (1) Low (1) + 1 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) + 1 ldst_fu_utilization Load/Store Function Unit Utilization Low (1) Low (1) Low (1) + 1 alu_fu_utilization Arithmetic Function Unit Utilization Low (2) Low (2) Low (2) + 1 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) + 1 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) + 1 inst_issued Instructions Issued 422808398 422808398 422808398 + 1 issue_slots Issue Slots 315291577 315291577 315291577 + 1 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 60814372 60814372 60814372 + 1 l2_l1_write_transactions L2 Write Transactions (L1 write requests) 36704256 36704256 36704256 + 1 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 + 1 l2_l1_write_throughput L2 Throughput (L1 Writes) 9.2071GB/s 9.2071GB/s 9.2071GB/s + 1 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 + 1 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.441823 0.441823 0.441823 + 1 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% + 1 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% + 1 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.00% 0.00% 0.00% + 1 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% + Kernel: void mtf_cuda_2buffers_depth32(unsigned char const *, unsigned char*, int, int) + 1 l1_cache_global_hit_rate L1 Global Hit Rate 53.93% 53.93% 53.93% + 1 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% + 1 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% + 1 sm_efficiency Multiprocessor Activity 99.28% 99.28% 99.28% + 1 achieved_occupancy Achieved Occupancy 0.636195 0.636195 0.636195 + 1 gld_requested_throughput Requested Global Load Throughput 23.560GB/s 23.560GB/s 23.560GB/s + 1 gst_requested_throughput Requested Global Store Throughput 23.554GB/s 23.554GB/s 23.554GB/s + 1 ipc Executed IPC 2.278861 2.278861 2.278861 + 1 sm_efficiency_instance Multiprocessor Activity 99.28% 99.28% 99.28% + 1 ipc_instance Executed IPC 2.278861 2.278861 2.278861 + 1 inst_per_warp Instructions per warp 3.3725e+05 3.3725e+05 3.3725e+05 + 1 gld_transactions Global Load Transactions 36512464 36512464 36512464 + 1 gst_transactions Global Store Transactions 36700160 36700160 36700160 + 1 local_load_transactions Local Load Transactions 0 0 0 + 1 local_store_transactions Local Store Transactions 0 0 0 + 1 shared_load_transactions Shared Load Transactions 36765696 36765696 36765696 + 1 shared_store_transactions Shared Store Transactions 73489380 73489380 73489380 + 1 gld_transactions_per_request Global Load Transactions Per Request 0.994199 0.994199 0.994199 + 1 gst_transactions_per_request Global Store Transactions Per Request 0.999554 0.999554 0.999554 + 1 local_load_transactions_per_request Local Memory Load Transactions Per Request 0.000000 0.000000 0.000000 + 1 local_store_transactions_per_request Local Memory Store Transactions Per Request 0.000000 0.000000 0.000000 + 1 shared_load_transactions_per_request Shared Memory Load Transactions Per Request 1.001339 1.001339 1.001339 + 1 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 1.000818 1.000818 1.000818 + 1 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 shared_load_throughput Shared Memory Load Throughput 94.344GB/s 94.344GB/s 94.343GB/s + 1 shared_store_throughput Shared Memory Store Throughput 188.58GB/s 188.58GB/s 188.58GB/s + 1 shared_efficiency Shared Memory Efficiency 72.59% 72.59% 72.59% + 1 flop_count_sp Floating Point Operations(Single Precision) 0 0 0 + 1 flop_count_sp_add Floating Point Operations(Single Precision Add) 0 0 0 + 1 flop_count_sp_mul Floating Point Operation(Single Precision Mul) 0 0 0 + 1 flop_count_sp_fma Floating Point Operations(Single Precision FMA) 0 0 0 + 1 flop_count_dp Floating Point Operations(Double Precision) 0 0 0 + 1 flop_count_dp_add Floating Point Operations(Double Precision Add) 0 0 0 + 1 flop_count_dp_mul Floating Point Operations(Double Precision Mul) 0 0 0 + 1 flop_count_dp_fma Floating Point Operations(Double Precision FMA) 0 0 0 + 1 flop_count_sp_special Floating Point Operations(Single Precision Special) 0 0 0 + 1 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 31.40% 31.40% 31.40% + 1 stall_exec_dependency Issue Stall Reasons (Execution Dependency) 45.24% 45.24% 45.24% + 1 stall_memory_dependency Issue Stall Reasons (Data Request) 4.25% 4.25% 4.25% + 1 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% + 1 inst_executed Instructions Executed 756779004 756779004 756779004 + 1 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% + 1 stall_other Issue Stall Reasons (Other) 7.54% 7.54% 7.54% + 1 inst_fp_32 FP Instructions(Single) 0 0 0 + 1 inst_fp_64 FP Instructions(Double) 0 0 0 + 1 inst_integer Integer Instructions 1.1861e+10 1.1861e+10 1.1861e+10 + 1 inst_bit_convert Bit-Convert Instructions 2349858816 2349858816 2349858816 + 1 inst_control Control-Flow Instructions 312162432 312162432 312162432 + 1 inst_compute_ld_st Load/Store Instructions 5205138085 5205138085 5205138085 + 1 inst_misc Misc Instructions 2350145952 2350145952 2350145952 + 1 inst_inter_thread_communication Inter-Thread Instructions 1174929408 1174929408 1174929408 + 1 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 + 1 atomic_transactions Atomic Transactions 0 0 0 + 1 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 + 1 inst_replay_overhead Instruction Replay Overhead 0.008258 0.008258 0.008258 + 1 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 + 1 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.023553 0.023553 0.023553 + 1 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% + 1 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 dram_read_throughput Device Memory Read Throughput 1.5367GB/s 1.5367GB/s 1.5358GB/s + 1 dram_write_throughput Device Memory Write Throughput 21.738GB/s 21.738GB/s 21.738GB/s + 1 gst_throughput Global Store Throughput 23.554GB/s 23.554GB/s 23.554GB/s + 1 gld_throughput Global Load Throughput 93.694GB/s 93.694GB/s 93.694GB/s + 1 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00% + 1 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 + 1 gld_efficiency Global Memory Load Efficiency 25.15% 25.15% 25.15% + 1 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00% + 1 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 96.95% 96.95% 96.95% + 1 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% + 1 l2_l1_read_throughput L2 Throughput (L1 Reads) 44.744GB/s 44.744GB/s 44.744GB/s + 1 local_memory_overhead Local Memory Overhead 2.31% 2.31% 2.31% + 1 issued_ipc Issued IPC 2.290065 2.290065 2.290065 + 1 issue_slot_utilization Issue Slot Utilization 79.46% 79.46% 79.46% + 1 sysmem_read_transactions System Memory Read Transactions 0 0 0 + 1 sysmem_write_transactions System Memory Write Transactions 0 0 0 + 1 l2_read_transactions L2 Read Transactions 70456279 70456279 70456279 + 1 l2_write_transactions L2 Write Transactions 36716547 36716547 36716547 + 1 tex_cache_transactions Texture Cache Transactions 0 0 0 + 1 dram_read_transactions Device Memory Read Transactions 2395357 2395357 2395357 + 1 dram_write_transactions Device Memory Write Transactions 33885397 33885397 33885397 + 1 l2_read_throughput L2 Throughput (Reads) 45.199GB/s 45.199GB/s 45.199GB/s + 1 l2_write_throughput L2 Throughput (Writes) 23.554GB/s 23.554GB/s 23.554GB/s + 1 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 cf_issued Issued Control-Flow Instructions 18940935 18940935 18940935 + 1 cf_executed Executed Control-Flow Instructions 18940935 18940935 18940935 + 1 ldst_issued Issued Load/Store Instructions 189768605 189768605 189768605 + 1 ldst_executed Executed Load/Store Instructions 183600648 183600648 183600648 + 1 l1_shared_utilization L1/Shared Memory Utilization Mid (5) Mid (5) Mid (5) + 1 l2_utilization L2 Cache Utilization Mid (4) Mid (4) Mid (4) + 1 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) + 1 dram_utilization Device Memory Utilization Low (2) Low (2) Low (2) + 1 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) + 1 ldst_fu_utilization Load/Store Function Unit Utilization Low (3) Low (3) Low (3) + 1 alu_fu_utilization Arithmetic Function Unit Utilization High (7) High (7) High (7) + 1 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) + 1 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) + 1 inst_issued Instructions Issued 762916472 762916472 762916472 + 1 issue_slots Issue Slots 529399528 529399528 529399528 + 1 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 69746344 69746344 69746344 + 1 l2_l1_write_transactions L2 Write Transactions (L1 write requests) 36716544 36716544 36716544 + 1 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 + 1 l2_l1_write_throughput L2 Throughput (L1 Writes) 23.554GB/s 23.554GB/s 23.554GB/s + 1 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 + 1 eligible_warps_per_cycle Eligible Warps Per Active Cycle 5.158843 5.158843 5.158843 + 1 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% + 1 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% + 1 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 11.58% 11.58% 11.58% + 1 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% + Kernel: void mtf_cuda_thread(unsigned char const *, unsigned char*, int, int) + 1 l1_cache_global_hit_rate L1 Global Hit Rate 6.37% 6.37% 6.37% + 1 branch_efficiency Branch Efficiency 68.31% 68.31% 68.31% + 1 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% + 1 sm_efficiency Multiprocessor Activity 96.20% 96.20% 96.20% + 1 achieved_occupancy Achieved Occupancy 0.590574 0.590574 0.590574 + 1 gld_requested_throughput Requested Global Load Throughput 334.92MB/s 334.92MB/s 334.74MB/s + 1 gst_requested_throughput Requested Global Store Throughput 334.84MB/s 334.84MB/s 334.74MB/s + 1 ipc Executed IPC 0.290120 0.290120 0.290120 + 1 sm_efficiency_instance Multiprocessor Activity 96.20% 96.20% 96.20% + 1 ipc_instance Executed IPC 0.290120 0.290120 0.290120 + 1 inst_per_warp Instructions per warp 7.3366e+05 7.3366e+05 7.3366e+05 + 1 gld_transactions Global Load Transactions 37757952 37757952 37757952 + 1 gst_transactions Global Store Transactions 37765120 37765120 37765120 + 1 local_load_transactions Local Load Transactions 0 0 0 + 1 local_store_transactions Local Store Transactions 0 0 0 + 1 shared_load_transactions Shared Load Transactions 6523276 6523276 6523276 + 1 shared_store_transactions Shared Store Transactions 12638436 12638436 12638436 + 1 gld_transactions_per_request Global Load Transactions Per Request 6.207785 6.207785 6.207785 + 1 gst_transactions_per_request Global Store Transactions Per Request 6.208020 6.208020 6.208020 + 1 local_load_transactions_per_request Local Memory Load Transactions Per Request 0.000000 0.000000 0.000000 + 1 local_store_transactions_per_request Local Memory Store Transactions Per Request 0.000000 0.000000 0.000000 + 1 shared_load_transactions_per_request Shared Memory Load Transactions Per Request 1.009360 1.009360 1.009360 + 1 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 1.007340 1.007340 1.007340 + 1 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 shared_load_throughput Shared Memory Load Throughput 7.4386GB/s 7.4386GB/s 7.4385GB/s + 1 shared_store_throughput Shared Memory Store Throughput 14.412GB/s 14.412GB/s 14.411GB/s + 1 shared_efficiency Shared Memory Efficiency 12.55% 12.55% 12.55% + 1 flop_count_sp Floating Point Operations(Single Precision) 0 0 0 + 1 flop_count_sp_add Floating Point Operations(Single Precision Add) 0 0 0 + 1 flop_count_sp_mul Floating Point Operation(Single Precision Mul) 0 0 0 + 1 flop_count_sp_fma Floating Point Operations(Single Precision FMA) 0 0 0 + 1 flop_count_dp Floating Point Operations(Double Precision) 0 0 0 + 1 flop_count_dp_add Floating Point Operations(Double Precision Add) 0 0 0 + 1 flop_count_dp_mul Floating Point Operations(Double Precision Mul) 0 0 0 + 1 flop_count_dp_fma Floating Point Operations(Double Precision FMA) 0 0 0 + 1 flop_count_sp_special Floating Point Operations(Single Precision Special) 0 0 0 + 1 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 2.86% 2.86% 2.86% + 1 stall_exec_dependency Issue Stall Reasons (Execution Dependency) 43.56% 43.56% 43.56% + 1 stall_memory_dependency Issue Stall Reasons (Data Request) 48.31% 48.31% 48.31% + 1 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% + 1 inst_executed Instructions Executed 208358080 208358080 208358080 + 1 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% + 1 stall_other Issue Stall Reasons (Other) 5.25% 5.25% 5.25% + 1 inst_fp_32 FP Instructions(Single) 0 0 0 + 1 inst_fp_64 FP Instructions(Double) 0 0 0 + 1 inst_integer Integer Instructions 1242625800 1242625800 1242625800 + 1 inst_bit_convert Bit-Convert Instructions 1205849475 1205849475 1205849475 + 1 inst_control Control-Flow Instructions 307838374 307838374 307838374 + 1 inst_compute_ld_st Load/Store Instructions 381273642 381273642 381273642 + 1 inst_misc Misc Instructions 73453698 73453698 73453698 + 1 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 + 1 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 + 1 atomic_transactions Atomic Transactions 0 0 0 + 1 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 + 1 inst_replay_overhead Instruction Replay Overhead 2.279377 2.279377 2.279377 + 1 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 + 1 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.169283 0.169283 0.169283 + 1 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% + 1 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 dram_read_throughput Device Memory Read Throughput 27.083GB/s 27.083GB/s 27.083GB/s + 1 dram_write_throughput Device Memory Write Throughput 10.409GB/s 10.409GB/s 10.408GB/s + 1 gst_throughput Global Store Throughput 10.464GB/s 10.464GB/s 10.463GB/s + 1 gld_throughput Global Load Throughput 43.056GB/s 43.056GB/s 43.056GB/s + 1 warp_execution_efficiency Warp Execution Efficiency 51.91% 51.91% 51.91% + 1 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 + 1 gld_efficiency Global Memory Load Efficiency 0.76% 0.76% 0.76% + 1 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% + 1 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 31.54% 31.54% 31.54% + 1 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% + 1 l2_l1_read_throughput L2 Throughput (L1 Reads) 39.041GB/s 39.041GB/s 39.040GB/s + 1 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% + 1 issued_ipc Issued IPC 0.949456 0.949456 0.949456 + 1 issue_slot_utilization Issue Slot Utilization 42.63% 42.63% 42.63% + 1 sysmem_read_transactions System Memory Read Transactions 0 0 0 + 1 sysmem_write_transactions System Memory Write Transactions 0 0 0 + 1 l2_read_transactions L2 Read Transactions 137206138 137206138 137206138 + 1 l2_write_transactions L2 Write Transactions 36704264 36704264 36704264 + 1 tex_cache_transactions Texture Cache Transactions 0 0 0 + 1 dram_read_transactions Device Memory Read Transactions 95003159 95003159 95003159 + 1 dram_write_transactions Device Memory Write Transactions 36512794 36512794 36512794 + 1 l2_read_throughput L2 Throughput (Reads) 39.115GB/s 39.115GB/s 39.115GB/s + 1 l2_write_throughput L2 Throughput (Writes) 10.464GB/s 10.464GB/s 10.463GB/s + 1 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 cf_issued Issued Control-Flow Instructions 25595085 25595085 25595085 + 1 cf_executed Executed Control-Flow Instructions 25595085 25595085 25595085 + 1 ldst_issued Issued Load/Store Instructions 496833314 496833314 496833314 + 1 ldst_executed Executed Load/Store Instructions 31174760 31174760 31174760 + 1 l1_shared_utilization L1/Shared Memory Utilization Low (1) Low (1) Low (1) + 1 l2_utilization L2 Cache Utilization Low (3) Low (3) Low (3) + 1 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) + 1 dram_utilization Device Memory Utilization Mid (4) Mid (4) Mid (4) + 1 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) + 1 ldst_fu_utilization Load/Store Function Unit Utilization Mid (4) Mid (4) Mid (4) + 1 alu_fu_utilization Arithmetic Function Unit Utilization Low (1) Low (1) Low (1) + 1 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) + 1 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) + 1 inst_issued Instructions Issued 680230926 680230926 680230926 + 1 issue_slots Issue Slots 610878538 610878538 610878538 + 1 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 136946420 136946420 136946420 + 1 l2_l1_write_transactions L2 Write Transactions (L1 write requests) 36704256 36704256 36704256 + 1 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 + 1 l2_l1_write_throughput L2 Throughput (L1 Writes) 10.464GB/s 10.464GB/s 10.463GB/s + 1 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 + 1 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.966622 0.966622 0.966622 + 1 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% + 1 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% + 1 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.02% 0.02% 0.02% + 1 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% + Kernel: void mtf_cuda_thread_by4(unsigned char const *, unsigned char*, int, int) + 1 l1_cache_global_hit_rate L1 Global Hit Rate 5.98% 5.98% 5.98% + 1 branch_efficiency Branch Efficiency 70.37% 70.37% 70.37% + 1 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% + 1 sm_efficiency Multiprocessor Activity 96.74% 96.74% 96.74% + 1 achieved_occupancy Achieved Occupancy 0.587422 0.587422 0.587422 + 1 gld_requested_throughput Requested Global Load Throughput 325.42MB/s 325.42MB/s 325.20MB/s + 1 gst_requested_throughput Requested Global Store Throughput 325.35MB/s 325.35MB/s 325.20MB/s + 1 ipc Executed IPC 0.265937 0.265937 0.265937 + 1 sm_efficiency_instance Multiprocessor Activity 96.74% 96.74% 96.74% + 1 ipc_instance Executed IPC 0.265937 0.265937 0.265937 + 1 inst_per_warp Instructions per warp 6.9209e+05 6.9209e+05 6.9209e+05 + 1 gld_transactions Global Load Transactions 37757952 37757952 37757952 + 1 gst_transactions Global Store Transactions 37765120 37765120 37765120 + 1 local_load_transactions Local Load Transactions 0 0 0 + 1 local_store_transactions Local Store Transactions 0 0 0 + 1 shared_load_transactions Shared Load Transactions 8070680 8070680 8070680 + 1 shared_store_transactions Shared Store Transactions 8071288 8071288 8071288 + 1 gld_transactions_per_request Global Load Transactions Per Request 4.575650 4.575650 4.575650 + 1 gst_transactions_per_request Global Store Transactions Per Request 4.575713 4.575713 4.575713 + 1 local_load_transactions_per_request Local Memory Load Transactions Per Request 0.000000 0.000000 0.000000 + 1 local_store_transactions_per_request Local Memory Store Transactions Per Request 0.000000 0.000000 0.000000 + 1 shared_load_transactions_per_request Shared Memory Load Transactions Per Request 1.079581 1.079581 1.079581 + 1 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 1.079581 1.079581 1.079581 + 1 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 shared_load_throughput Shared Memory Load Throughput 8.9423GB/s 8.9423GB/s 8.9416GB/s + 1 shared_store_throughput Shared Memory Store Throughput 8.9430GB/s 8.9430GB/s 8.9426GB/s + 1 shared_efficiency Shared Memory Efficiency 16.68% 16.68% 16.68% + 1 flop_count_sp Floating Point Operations(Single Precision) 0 0 0 + 1 flop_count_sp_add Floating Point Operations(Single Precision Add) 0 0 0 + 1 flop_count_sp_mul Floating Point Operation(Single Precision Mul) 0 0 0 + 1 flop_count_sp_fma Floating Point Operations(Single Precision FMA) 0 0 0 + 1 flop_count_dp Floating Point Operations(Double Precision) 0 0 0 + 1 flop_count_dp_add Floating Point Operations(Double Precision Add) 0 0 0 + 1 flop_count_dp_mul Floating Point Operations(Double Precision Mul) 0 0 0 + 1 flop_count_dp_fma Floating Point Operations(Double Precision FMA) 0 0 0 + 1 flop_count_sp_special Floating Point Operations(Single Precision Special) 0 0 0 + 1 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 3.18% 3.18% 3.18% + 1 stall_exec_dependency Issue Stall Reasons (Execution Dependency) 37.59% 37.59% 37.59% + 1 stall_memory_dependency Issue Stall Reasons (Data Request) 54.49% 54.49% 54.49% + 1 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% + 1 inst_executed Instructions Executed 196552394 196552394 196552394 + 1 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% + 1 stall_other Issue Stall Reasons (Other) 4.73% 4.73% 4.73% + 1 inst_fp_32 FP Instructions(Single) 0 0 0 + 1 inst_fp_64 FP Instructions(Double) 0 0 0 + 1 inst_integer Integer Instructions 588004206 588004206 588004206 + 1 inst_bit_convert Bit-Convert Instructions 513547846 513547846 513547846 + 1 inst_control Control-Flow Instructions 163495243 163495243 163495243 + 1 inst_compute_ld_st Load/Store Instructions 417968937 417968937 417968937 + 1 inst_misc Misc Instructions 242459041 242459041 242459041 + 1 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 + 1 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 + 1 atomic_transactions Atomic Transactions 0 0 0 + 1 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 + 1 inst_replay_overhead Instruction Replay Overhead 2.282965 2.282965 2.282965 + 1 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 + 1 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.188248 0.188248 0.188248 + 1 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% + 1 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 dram_read_throughput Device Memory Read Throughput 27.380GB/s 27.380GB/s 27.380GB/s + 1 dram_write_throughput Device Memory Write Throughput 10.113GB/s 10.113GB/s 10.113GB/s + 1 gst_throughput Global Store Throughput 10.167GB/s 10.167GB/s 10.166GB/s + 1 gld_throughput Global Load Throughput 41.836GB/s 41.836GB/s 41.835GB/s + 1 warp_execution_efficiency Warp Execution Efficiency 34.02% 34.02% 34.02% + 1 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 + 1 gld_efficiency Global Memory Load Efficiency 0.76% 0.76% 0.76% + 1 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% + 1 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 29.01% 29.01% 29.01% + 1 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% + 1 l2_l1_read_throughput L2 Throughput (L1 Reads) 38.193GB/s 38.193GB/s 38.193GB/s + 1 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% + 1 issued_ipc Issued IPC 0.870610 0.870610 0.870610 + 1 issue_slot_utilization Issue Slot Utilization 39.51% 39.51% 39.51% + 1 sysmem_read_transactions System Memory Read Transactions 0 0 0 + 1 sysmem_write_transactions System Memory Write Transactions 1 1 1 + 1 l2_read_transactions L2 Read Transactions 137942398 137942398 137942398 + 1 l2_write_transactions L2 Write Transactions 36704264 36704264 36704264 + 1 tex_cache_transactions Texture Cache Transactions 0 0 0 + 1 dram_read_transactions Device Memory Read Transactions 98845652 98845652 98845652 + 1 dram_write_transactions Device Memory Write Transactions 36510402 36510402 36510402 + 1 l2_read_throughput L2 Throughput (Reads) 38.210GB/s 38.210GB/s 38.209GB/s + 1 l2_write_throughput L2 Throughput (Writes) 10.167GB/s 10.167GB/s 10.166GB/s + 1 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 sysmem_write_throughput System Memory Write Throughput 297.000B/s 297.000B/s 0.00000B/s + 1 cf_issued Issued Control-Flow Instructions 39670061 39670061 39670061 + 1 cf_executed Executed Control-Flow Instructions 39670061 39670061 39670061 + 1 ldst_issued Issued Load/Store Instructions 474302357 474302357 474302357 + 1 ldst_executed Executed Load/Store Instructions 31457379 31457379 31457379 + 1 l1_shared_utilization L1/Shared Memory Utilization Low (1) Low (1) Low (1) + 1 l2_utilization L2 Cache Utilization Low (3) Low (3) Low (3) + 1 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) + 1 dram_utilization Device Memory Utilization Mid (4) Mid (4) Mid (4) + 1 sysmem_utilization System Memory Utilization Low (1) Low (1) Low (1) + 1 ldst_fu_utilization Load/Store Function Unit Utilization Mid (4) Mid (4) Mid (4) + 1 alu_fu_utilization Arithmetic Function Unit Utilization Low (1) Low (1) Low (1) + 1 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) + 1 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) + 1 inst_issued Instructions Issued 649199669 649199669 649199669 + 1 issue_slots Issue Slots 589287447 589287447 589287447 + 1 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 137882280 137882280 137882280 + 1 l2_l1_write_transactions L2 Write Transactions (L1 write requests) 36704256 36704256 36704256 + 1 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 + 1 l2_l1_write_throughput L2 Throughput (L1 Writes) 10.167GB/s 10.167GB/s 10.166GB/s + 1 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 + 1 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.882725 0.882725 0.882725 + 1 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% + 1 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% + 1 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.01% 0.01% 0.01% + 1 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% + Kernel: void mtf_cuda_thread_by4(unsigned char const *, unsigned char*, int, int) + 1 l1_cache_global_hit_rate L1 Global Hit Rate 7.09% 7.09% 7.09% + 1 branch_efficiency Branch Efficiency 73.57% 73.57% 73.57% + 1 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% + 1 sm_efficiency Multiprocessor Activity 91.44% 91.44% 91.44% + 1 achieved_occupancy Achieved Occupancy 0.585803 0.585803 0.585803 + 1 gld_requested_throughput Requested Global Load Throughput 328.29MB/s 328.29MB/s 328.06MB/s + 1 gst_requested_throughput Requested Global Store Throughput 328.21MB/s 328.21MB/s 328.06MB/s + 1 ipc Executed IPC 0.402875 0.402875 0.402875 + 1 sm_efficiency_instance Multiprocessor Activity 91.44% 91.44% 91.44% + 1 ipc_instance Executed IPC 0.402875 0.402875 0.402875 + 1 inst_per_warp Instructions per warp 9.8331e+05 9.8331e+05 9.8331e+05 + 1 gld_transactions Global Load Transactions 37774340 37774340 37774340 + 1 gst_transactions Global Store Transactions 37748736 37748736 37748736 + 1 local_load_transactions Local Load Transactions 0 0 0 + 1 local_store_transactions Local Store Transactions 0 0 0 + 1 shared_load_transactions Shared Load Transactions 12243160 12243160 12243160 + 1 shared_store_transactions Shared Store Transactions 12245464 12245464 12245464 + 1 gld_transactions_per_request Global Load Transactions Per Request 3.566406 3.566406 3.566406 + 1 gst_transactions_per_request Global Store Transactions Per Request 3.563344 3.563344 3.563344 + 1 local_load_transactions_per_request Local Memory Load Transactions Per Request 0.000000 0.000000 0.000000 + 1 local_store_transactions_per_request Local Memory Store Transactions Per Request 0.000000 0.000000 0.000000 + 1 shared_load_transactions_per_request Shared Memory Load Transactions Per Request 0.989193 0.989193 0.989193 + 1 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 0.989200 0.989200 0.989200 + 1 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 shared_load_throughput Shared Memory Load Throughput 13.685GB/s 13.685GB/s 13.684GB/s + 1 shared_store_throughput Shared Memory Store Throughput 13.687GB/s 13.687GB/s 13.687GB/s + 1 shared_efficiency Shared Memory Efficiency 15.75% 15.75% 15.75% + 1 flop_count_sp Floating Point Operations(Single Precision) 0 0 0 + 1 flop_count_sp_add Floating Point Operations(Single Precision Add) 0 0 0 + 1 flop_count_sp_mul Floating Point Operation(Single Precision Mul) 0 0 0 + 1 flop_count_sp_fma Floating Point Operations(Single Precision FMA) 0 0 0 + 1 flop_count_dp Floating Point Operations(Double Precision) 0 0 0 + 1 flop_count_dp_add Floating Point Operations(Double Precision Add) 0 0 0 + 1 flop_count_dp_mul Floating Point Operations(Double Precision Mul) 0 0 0 + 1 flop_count_dp_fma Floating Point Operations(Double Precision FMA) 0 0 0 + 1 flop_count_sp_special Floating Point Operations(Single Precision Special) 0 0 0 + 1 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 4.70% 4.70% 4.70% + 1 stall_exec_dependency Issue Stall Reasons (Execution Dependency) 29.19% 29.19% 29.19% + 1 stall_memory_dependency Issue Stall Reasons (Data Request) 62.08% 62.08% 62.08% + 1 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% + 1 inst_executed Instructions Executed 279259990 279259990 279259990 + 1 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% + 1 stall_other Issue Stall Reasons (Other) 4.03% 4.03% 4.03% + 1 inst_fp_32 FP Instructions(Single) 0 0 0 + 1 inst_fp_64 FP Instructions(Double) 0 0 0 + 1 inst_integer Integer Instructions 757334568 757334568 757334568 + 1 inst_bit_convert Bit-Convert Instructions 720455768 720455768 720455768 + 1 inst_control Control-Flow Instructions 185431059 185431059 185431059 + 1 inst_compute_ld_st Load/Store Instructions 566944049 566944049 566944049 + 1 inst_misc Misc Instructions 300499383 300499383 300499383 + 1 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 + 1 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 + 1 atomic_transactions Atomic Transactions 0 0 0 + 1 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 + 1 inst_replay_overhead Instruction Replay Overhead 1.085396 1.085396 1.085396 + 1 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 + 1 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.125243 0.125243 0.125243 + 1 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% + 1 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 dram_read_throughput Device Memory Read Throughput 25.034GB/s 25.034GB/s 25.033GB/s + 1 dram_write_throughput Device Memory Write Throughput 10.150GB/s 10.150GB/s 10.150GB/s + 1 gst_throughput Global Store Throughput 10.256GB/s 10.256GB/s 10.256GB/s + 1 gld_throughput Global Load Throughput 42.222GB/s 42.222GB/s 42.222GB/s + 1 warp_execution_efficiency Warp Execution Efficiency 31.96% 31.96% 31.96% + 1 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 + 1 gld_efficiency Global Memory Load Efficiency 0.76% 0.76% 0.76% + 1 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% + 1 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 34.95% 34.95% 34.95% + 1 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% + 1 l2_l1_read_throughput L2 Throughput (L1 Reads) 37.694GB/s 37.694GB/s 37.693GB/s + 1 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% + 1 issued_ipc Issued IPC 0.841036 0.841036 0.841036 + 1 issue_slot_utilization Issue Slot Utilization 35.01% 35.01% 35.01% + 1 sysmem_read_transactions System Memory Read Transactions 0 0 0 + 1 sysmem_write_transactions System Memory Write Transactions 1 1 1 + 1 l2_read_transactions L2 Read Transactions 134735486 134735486 134735486 + 1 l2_write_transactions L2 Write Transactions 36704287 36704287 36704287 + 1 tex_cache_transactions Texture Cache Transactions 0 0 0 + 1 dram_read_transactions Device Memory Read Transactions 89586981 89586981 89586981 + 1 dram_write_transactions Device Memory Write Transactions 36324319 36324319 36324319 + 1 l2_read_throughput L2 Throughput (Reads) 37.650GB/s 37.650GB/s 37.650GB/s + 1 l2_write_throughput L2 Throughput (Writes) 10.257GB/s 10.257GB/s 10.256GB/s + 1 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 sysmem_write_throughput System Memory Write Throughput 300.000B/s 300.000B/s 0.00000B/s + 1 cf_issued Issued Control-Flow Instructions 56470353 56470353 56470353 + 1 cf_executed Executed Control-Flow Instructions 56470353 56470353 56470353 + 1 ldst_issued Issued Load/Store Instructions 349311796 349311796 349311796 + 1 ldst_executed Executed Load/Store Instructions 45941410 45941410 45941410 + 1 l1_shared_utilization L1/Shared Memory Utilization Low (1) Low (1) Low (1) + 1 l2_utilization L2 Cache Utilization Low (3) Low (3) Low (3) + 1 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) + 1 dram_utilization Device Memory Utilization Low (3) Low (3) Low (3) + 1 sysmem_utilization System Memory Utilization Low (1) Low (1) Low (1) + 1 ldst_fu_utilization Load/Store Function Unit Utilization Low (3) Low (3) Low (3) + 1 alu_fu_utilization Arithmetic Function Unit Utilization Low (1) Low (1) Low (1) + 1 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) + 1 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) + 1 inst_issued Instructions Issued 582349507 582349507 582349507 + 1 issue_slots Issue Slots 484898579 484898579 484898579 + 1 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 134893624 134893624 134893624 + 1 l2_l1_write_transactions L2 Write Transactions (L1 write requests) 36704256 36704256 36704256 + 1 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 + 1 l2_l1_write_throughput L2 Throughput (L1 Writes) 10.256GB/s 10.256GB/s 10.256GB/s + 1 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 + 1 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.798229 0.798229 0.798229 + 1 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% + 1 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% + 1 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.00% 0.00% 0.00% + 1 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% + Kernel: void mtf_cuda_4by8(unsigned char const *, unsigned char*, int, int) + 1 l1_cache_global_hit_rate L1 Global Hit Rate 46.62% 46.62% 46.62% + 1 branch_efficiency Branch Efficiency 71.16% 71.16% 71.16% + 1 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% + 1 sm_efficiency Multiprocessor Activity 90.80% 90.80% 90.80% + 1 achieved_occupancy Achieved Occupancy 0.436712 0.436712 0.436712 + 1 gld_requested_throughput Requested Global Load Throughput 2.3500GB/s 2.3500GB/s 2.3497GB/s + 1 gst_requested_throughput Requested Global Store Throughput 2.3494GB/s 2.3494GB/s 2.3488GB/s + 1 ipc Executed IPC 1.455590 1.455590 1.455590 + 1 sm_efficiency_instance Multiprocessor Activity 90.80% 90.80% 90.80% + 1 ipc_instance Executed IPC 1.455590 1.455590 1.455590 + 1 inst_per_warp Instructions per warp 4.9624e+05 4.9624e+05 4.9624e+05 + 1 gld_transactions Global Load Transactions 35676676 35676676 35676676 + 1 gst_transactions Global Store Transactions 35651584 35651584 35651584 + 1 local_load_transactions Local Load Transactions 0 0 0 + 1 local_store_transactions Local Store Transactions 0 0 0 + 1 shared_load_transactions Shared Load Transactions 58249812 58249812 58249812 + 1 shared_store_transactions Shared Store Transactions 88333224 88333224 88333224 + 1 gld_transactions_per_request Global Load Transactions Per Request 3.126515 3.126515 3.126515 + 1 gst_transactions_per_request Global Store Transactions Per Request 3.124210 3.124210 3.124210 + 1 local_load_transactions_per_request Local Memory Load Transactions Per Request 0.000000 0.000000 0.000000 + 1 local_store_transactions_per_request Local Memory Store Transactions Per Request 0.000000 0.000000 0.000000 + 1 shared_load_transactions_per_request Shared Memory Load Transactions Per Request 2.460295 2.460295 2.460295 + 1 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 2.545633 2.545633 2.545633 + 1 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 shared_load_throughput Shared Memory Load Throughput 119.31GB/s 119.31GB/s 119.31GB/s + 1 shared_store_throughput Shared Memory Store Throughput 180.93GB/s 180.93GB/s 180.93GB/s + 1 shared_efficiency Shared Memory Efficiency 3.97% 3.97% 3.97% + 1 flop_count_sp Floating Point Operations(Single Precision) 0 0 0 + 1 flop_count_sp_add Floating Point Operations(Single Precision Add) 0 0 0 + 1 flop_count_sp_mul Floating Point Operation(Single Precision Mul) 0 0 0 + 1 flop_count_sp_fma Floating Point Operations(Single Precision FMA) 0 0 0 + 1 flop_count_dp Floating Point Operations(Double Precision) 0 0 0 + 1 flop_count_dp_add Floating Point Operations(Double Precision Add) 0 0 0 + 1 flop_count_dp_mul Floating Point Operations(Double Precision Mul) 0 0 0 + 1 flop_count_dp_fma Floating Point Operations(Double Precision FMA) 0 0 0 + 1 flop_count_sp_special Floating Point Operations(Single Precision Special) 0 0 0 + 1 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 26.39% 26.39% 26.39% + 1 stall_exec_dependency Issue Stall Reasons (Execution Dependency) 60.40% 60.40% 60.40% + 1 stall_memory_dependency Issue Stall Reasons (Data Request) 2.70% 2.70% 2.70% + 1 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% + 1 inst_executed Instructions Executed 557769056 557769056 557769056 + 1 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% + 1 stall_other Issue Stall Reasons (Other) 10.24% 10.24% 10.24% + 1 inst_fp_32 FP Instructions(Single) 0 0 0 + 1 inst_fp_64 FP Instructions(Double) 0 0 0 + 1 inst_integer Integer Instructions 4767799528 4767799528 4767799528 + 1 inst_bit_convert Bit-Convert Instructions 2410619508 2410619508 2410619508 + 1 inst_control Control-Flow Instructions 541977616 541977616 541977616 + 1 inst_compute_ld_st Load/Store Instructions 1038696361 1038696361 1038696361 + 1 inst_misc Misc Instructions 731000375 731000375 731000375 + 1 inst_inter_thread_communication Inter-Thread Instructions 344379336 344379336 344379336 + 1 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 + 1 atomic_transactions Atomic Transactions 0 0 0 + 1 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 + 1 inst_replay_overhead Instruction Replay Overhead 0.264168 0.264168 0.264168 + 1 shared_replay_overhead Shared Memory Replay Overhead 0.162445 0.162445 0.162445 + 1 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.035867 0.035867 0.035867 + 1 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% + 1 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 dram_read_throughput Device Memory Read Throughput 1.3319GB/s 1.3319GB/s 1.3318GB/s + 1 dram_write_throughput Device Memory Write Throughput 16.776GB/s 16.776GB/s 16.776GB/s + 1 gst_throughput Global Store Throughput 18.795GB/s 18.795GB/s 18.795GB/s + 1 gld_throughput Global Load Throughput 73.076GB/s 73.076GB/s 73.075GB/s + 1 warp_execution_efficiency Warp Execution Efficiency 59.80% 59.80% 59.80% + 1 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 + 1 gld_efficiency Global Memory Load Efficiency 3.22% 3.22% 3.22% + 1 gst_efficiency Global Memory Store Efficiency 12.50% 12.50% 12.50% + 1 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 97.85% 97.85% 97.85% + 1 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% + 1 l2_l1_read_throughput L2 Throughput (L1 Reads) 41.133GB/s 41.133GB/s 41.132GB/s + 1 local_memory_overhead Local Memory Overhead 3.55% 3.55% 3.55% + 1 issued_ipc Issued IPC 1.816981 1.816981 1.816981 + 1 issue_slot_utilization Issue Slot Utilization 68.87% 68.87% 68.87% + 1 sysmem_read_transactions System Memory Read Transactions 0 0 0 + 1 sysmem_write_transactions System Memory Write Transactions 0 0 0 + 1 l2_read_transactions L2 Read Transactions 80757365 80757365 80757365 + 1 l2_write_transactions L2 Write Transactions 36704279 36704279 36704279 + 1 tex_cache_transactions Texture Cache Transactions 0 0 0 + 1 dram_read_transactions Device Memory Read Transactions 2601071 2601071 2601071 + 1 dram_write_transactions Device Memory Write Transactions 32761356 32761356 32761356 + 1 l2_read_throughput L2 Throughput (Reads) 41.353GB/s 41.353GB/s 41.353GB/s + 1 l2_write_throughput L2 Throughput (Writes) 18.795GB/s 18.795GB/s 18.795GB/s + 1 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 cf_issued Issued Control-Flow Instructions 73032967 73032967 73032967 + 1 cf_executed Executed Control-Flow Instructions 73032967 73032967 73032967 + 1 ldst_issued Issued Load/Store Instructions 228598523 228598523 228598523 + 1 ldst_executed Executed Load/Store Instructions 81198246 81198246 81198246 + 1 l1_shared_utilization L1/Shared Memory Utilization Mid (5) Mid (5) Mid (5) + 1 l2_utilization L2 Cache Utilization Low (3) Low (3) Low (3) + 1 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) + 1 dram_utilization Device Memory Utilization Low (2) Low (2) Low (2) + 1 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) + 1 ldst_fu_utilization Load/Store Function Unit Utilization Low (3) Low (3) Low (3) + 1 alu_fu_utilization Arithmetic Function Unit Utilization Mid (5) Mid (5) Mid (5) + 1 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) + 1 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) + 1 inst_issued Instructions Issued 705258437 705258437 705258437 + 1 issue_slots Issue Slots 534618795 534618795 534618795 + 1 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 80326416 80326416 80326416 + 1 l2_l1_write_transactions L2 Write Transactions (L1 write requests) 36704256 36704256 36704256 + 1 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 + 1 l2_l1_write_throughput L2 Throughput (L1 Writes) 18.795GB/s 18.795GB/s 18.795GB/s + 1 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 + 1 eligible_warps_per_cycle Eligible Warps Per Active Cycle 2.319359 2.319359 2.319359 + 1 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% + 1 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% + 1 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.26% 0.26% 0.26% + 1 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% + Kernel: void mtf_cuda_2symbols(unsigned char const *, unsigned char*, int, int) + 1 l1_cache_global_hit_rate L1 Global Hit Rate 96.67% 96.67% 96.67% + 1 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% + 1 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% + 1 sm_efficiency Multiprocessor Activity 99.09% 99.09% 99.09% + 1 achieved_occupancy Achieved Occupancy 0.617368 0.617368 0.617368 + 1 gld_requested_throughput Requested Global Load Throughput 15.939GB/s 15.939GB/s 15.939GB/s + 1 gst_requested_throughput Requested Global Store Throughput 466.02MB/s 466.02MB/s 465.39MB/s + 1 ipc Executed IPC 1.905197 1.905197 1.905197 + 1 sm_efficiency_instance Multiprocessor Activity 99.09% 99.09% 99.09% + 1 ipc_instance Executed IPC 1.905197 1.905197 1.905197 + 1 inst_per_warp Instructions per warp 1.1415e+05 1.1415e+05 1.1415e+05 + 1 gld_transactions Global Load Transactions 40327856 40327856 40327856 + 1 gst_transactions Global Store Transactions 36831232 36831232 36831232 + 1 local_load_transactions Local Load Transactions 0 0 0 + 1 local_store_transactions Local Store Transactions 0 0 0 + 1 shared_load_transactions Shared Load Transactions 39682128 39682128 39682128 + 1 shared_store_transactions Shared Store Transactions 75494344 75494344 75494344 + 1 gld_transactions_per_request Global Load Transactions Per Request 1.003905 1.003905 1.003905 + 1 gst_transactions_per_request Global Store Transactions Per Request 1.003459 1.003459 1.003459 + 1 local_load_transactions_per_request Local Memory Load Transactions Per Request 0.000000 0.000000 0.000000 + 1 local_store_transactions_per_request Local Memory Store Transactions Per Request 0.000000 0.000000 0.000000 + 1 shared_load_transactions_per_request Shared Memory Load Transactions Per Request 1.004051 1.004051 1.004051 + 1 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 1.002569 1.002569 1.002569 + 1 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 shared_load_throughput Shared Memory Load Throughput 62.979GB/s 62.979GB/s 62.979GB/s + 1 shared_store_throughput Shared Memory Store Throughput 119.82GB/s 119.82GB/s 119.82GB/s + 1 shared_efficiency Shared Memory Efficiency 60.29% 60.29% 60.29% + 1 flop_count_sp Floating Point Operations(Single Precision) 0 0 0 + 1 flop_count_sp_add Floating Point Operations(Single Precision Add) 0 0 0 + 1 flop_count_sp_mul Floating Point Operation(Single Precision Mul) 0 0 0 + 1 flop_count_sp_fma Floating Point Operations(Single Precision FMA) 0 0 0 + 1 flop_count_dp Floating Point Operations(Double Precision) 0 0 0 + 1 flop_count_dp_add Floating Point Operations(Double Precision Add) 0 0 0 + 1 flop_count_dp_mul Floating Point Operations(Double Precision Mul) 0 0 0 + 1 flop_count_dp_fma Floating Point Operations(Double Precision FMA) 0 0 0 + 1 flop_count_sp_special Floating Point Operations(Single Precision Special) 0 0 0 + 1 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 72.53% 72.53% 72.53% + 1 stall_exec_dependency Issue Stall Reasons (Execution Dependency) 21.36% 21.36% 21.36% + 1 stall_memory_dependency Issue Stall Reasons (Data Request) 0.25% 0.25% 0.25% + 1 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% + 1 inst_executed Instructions Executed 1023285210 1023285210 1023285210 + 1 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% + 1 stall_other Issue Stall Reasons (Other) 5.67% 5.67% 5.67% + 1 inst_fp_32 FP Instructions(Single) 0 0 0 + 1 inst_fp_64 FP Instructions(Double) 0 0 0 + 1 inst_integer Integer Instructions 1.4274e+10 1.4274e+10 1.4274e+10 + 1 inst_bit_convert Bit-Convert Instructions 3513906624 3513906624 3513906624 + 1 inst_control Control-Flow Instructions 1444059457 1444059457 1444059457 + 1 inst_compute_ld_st Load/Store Instructions 4778964203 4778964203 4778964203 + 1 inst_misc Misc Instructions 3350163008 3350163008 3350163008 + 1 inst_inter_thread_communication Inter-Thread Instructions 1294838720 1294838720 1294838720 + 1 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 + 1 atomic_transactions Atomic Transactions 0 0 0 + 1 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 + 1 inst_replay_overhead Instruction Replay Overhead 0.000015 0.000015 0.000015 + 1 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 + 1 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.001598 0.001598 0.001598 + 1 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% + 1 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 dram_read_throughput Device Memory Read Throughput 956.41MB/s 956.41MB/s 955.58MB/s + 1 dram_write_throughput Device Memory Write Throughput 13.190GB/s 13.190GB/s 13.189GB/s + 1 gst_throughput Global Store Throughput 14.563GB/s 14.563GB/s 14.563GB/s + 1 gld_throughput Global Load Throughput 64.004GB/s 64.004GB/s 64.003GB/s + 1 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00% + 1 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 + 1 gld_efficiency Global Memory Load Efficiency 24.90% 24.90% 24.90% + 1 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% + 1 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 88.89% 88.89% 88.89% + 1 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% + 1 l2_l1_read_throughput L2 Throughput (L1 Reads) 2.6131GB/s 2.6131GB/s 2.6124GB/s + 1 local_memory_overhead Local Memory Overhead 2.80% 2.80% 2.80% + 1 issued_ipc Issued IPC 1.901964 1.901964 1.901964 + 1 issue_slot_utilization Issue Slot Utilization 75.93% 75.93% 75.93% + 1 sysmem_read_transactions System Memory Read Transactions 0 0 0 + 1 sysmem_write_transactions System Memory Write Transactions 0 0 0 + 1 l2_read_transactions L2 Read Transactions 7113116 7113116 7113116 + 1 l2_write_transactions L2 Write Transactions 36704262 36704262 36704262 + 1 tex_cache_transactions Texture Cache Transactions 0 0 0 + 1 dram_read_transactions Device Memory Read Transactions 2353993 2353993 2353993 + 1 dram_write_transactions Device Memory Write Transactions 33243077 33243077 33243077 + 1 l2_read_throughput L2 Throughput (Reads) 2.8223GB/s 2.8223GB/s 2.8219GB/s + 1 l2_write_throughput L2 Throughput (Writes) 14.563GB/s 14.563GB/s 14.563GB/s + 1 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 cf_issued Issued Control-Flow Instructions 125725296 125725296 125725296 + 1 cf_executed Executed Control-Flow Instructions 125725296 125725296 125725296 + 1 ldst_issued Issued Load/Store Instructions 191710339 191710339 191710339 + 1 ldst_executed Executed Load/Store Instructions 191698788 191698788 191698788 + 1 l1_shared_utilization L1/Shared Memory Utilization Mid (4) Mid (4) Mid (4) + 1 l2_utilization L2 Cache Utilization Low (1) Low (1) Low (1) + 1 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) + 1 dram_utilization Device Memory Utilization Low (2) Low (2) Low (2) + 1 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) + 1 ldst_fu_utilization Load/Store Function Unit Utilization Low (2) Low (2) Low (2) + 1 alu_fu_utilization Arithmetic Function Unit Utilization Mid (6) Mid (6) Mid (6) + 1 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) + 1 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) + 1 inst_issued Instructions Issued 1023296241 1023296241 1023296241 + 1 issue_slots Issue Slots 817052818 817052818 817052818 + 1 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 6586004 6586004 6586004 + 1 l2_l1_write_transactions L2 Write Transactions (L1 write requests) 36704256 36704256 36704256 + 1 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 + 1 l2_l1_write_throughput L2 Throughput (L1 Writes) 14.563GB/s 14.563GB/s 14.563GB/s + 1 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 + 1 eligible_warps_per_cycle Eligible Warps Per Active Cycle 2.484451 2.484451 2.484451 + 1 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% + 1 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% + 1 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.19% 0.19% 0.19% + 1 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% + Kernel: void mtf_cuda_thread_by4(unsigned char const *, unsigned char*, int, int) + 1 l1_cache_global_hit_rate L1 Global Hit Rate 6.29% 6.29% 6.29% + 1 branch_efficiency Branch Efficiency 71.69% 71.69% 71.69% + 1 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% + 1 sm_efficiency Multiprocessor Activity 95.14% 95.14% 95.14% + 1 achieved_occupancy Achieved Occupancy 0.589123 0.589123 0.589123 + 1 gld_requested_throughput Requested Global Load Throughput 333.76MB/s 333.76MB/s 332.83MB/s + 1 gst_requested_throughput Requested Global Store Throughput 333.67MB/s 333.67MB/s 332.83MB/s + 1 ipc Executed IPC 0.353973 0.353973 0.353973 + 1 sm_efficiency_instance Multiprocessor Activity 95.14% 95.14% 95.14% + 1 ipc_instance Executed IPC 0.353973 0.353973 0.353973 + 1 inst_per_warp Instructions per warp 8.9380e+05 8.9380e+05 8.9380e+05 + 1 gld_transactions Global Load Transactions 37757952 37757952 37757952 + 1 gst_transactions Global Store Transactions 37748736 37748736 37748736 + 1 local_load_transactions Local Load Transactions 0 0 0 + 1 local_store_transactions Local Store Transactions 0 0 0 + 1 shared_load_transactions Shared Load Transactions 10279388 10279388 10279388 + 1 shared_store_transactions Shared Store Transactions 10280540 10280540 10280540 + 1 gld_transactions_per_request Global Load Transactions Per Request 3.689765 3.689765 3.689765 + 1 gst_transactions_per_request Global Store Transactions Per Request 3.688195 3.688195 3.688195 + 1 local_load_transactions_per_request Local Memory Load Transactions Per Request 0.000000 0.000000 0.000000 + 1 local_store_transactions_per_request Local Memory Store Transactions Per Request 0.000000 0.000000 0.000000 + 1 shared_load_transactions_per_request Shared Memory Load Transactions Per Request 0.996800 0.996800 0.996800 + 1 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 0.996803 0.996803 0.996803 + 1 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 shared_load_throughput Shared Memory Load Throughput 11.681GB/s 11.681GB/s 11.681GB/s + 1 shared_store_throughput Shared Memory Store Throughput 11.682GB/s 11.682GB/s 11.682GB/s + 1 shared_efficiency Shared Memory Efficiency 16.58% 16.58% 16.58% + 1 flop_count_sp Floating Point Operations(Single Precision) 0 0 0 + 1 flop_count_sp_add Floating Point Operations(Single Precision Add) 0 0 0 + 1 flop_count_sp_mul Floating Point Operation(Single Precision Mul) 0 0 0 + 1 flop_count_sp_fma Floating Point Operations(Single Precision FMA) 0 0 0 + 1 flop_count_dp Floating Point Operations(Double Precision) 0 0 0 + 1 flop_count_dp_add Floating Point Operations(Double Precision Add) 0 0 0 + 1 flop_count_dp_mul Floating Point Operations(Double Precision Mul) 0 0 0 + 1 flop_count_dp_fma Floating Point Operations(Double Precision FMA) 0 0 0 + 1 flop_count_sp_special Floating Point Operations(Single Precision Special) 0 0 0 + 1 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 4.12% 4.12% 4.12% + 1 stall_exec_dependency Issue Stall Reasons (Execution Dependency) 29.61% 29.61% 29.61% + 1 stall_memory_dependency Issue Stall Reasons (Data Request) 62.27% 62.27% 62.27% + 1 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% + 1 inst_executed Instructions Executed 253839379 253839379 253839379 + 1 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% + 1 stall_other Issue Stall Reasons (Other) 3.99% 3.99% 3.99% + 1 inst_fp_32 FP Instructions(Single) 0 0 0 + 1 inst_fp_64 FP Instructions(Double) 0 0 0 + 1 inst_integer Integer Instructions 690384106 690384106 690384106 + 1 inst_bit_convert Bit-Convert Instructions 641281716 641281716 641281716 + 1 inst_control Control-Flow Instructions 177403645 177403645 177403645 + 1 inst_compute_ld_st Load/Store Instructions 509718307 509718307 509718307 + 1 inst_misc Misc Instructions 278479385 278479385 278479385 + 1 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 + 1 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 + 1 atomic_transactions Atomic Transactions 0 0 0 + 1 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 + 1 inst_replay_overhead Instruction Replay Overhead 1.294875 1.294875 1.294875 + 1 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 + 1 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.138905 0.138905 0.138905 + 1 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% + 1 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 dram_read_throughput Device Memory Read Throughput 26.866GB/s 26.866GB/s 26.865GB/s + 1 dram_write_throughput Device Memory Write Throughput 10.349GB/s 10.349GB/s 10.349GB/s + 1 gst_throughput Global Store Throughput 10.427GB/s 10.427GB/s 10.427GB/s + 1 gld_throughput Global Load Throughput 42.907GB/s 42.907GB/s 42.906GB/s + 1 warp_execution_efficiency Warp Execution Efficiency 31.76% 31.76% 31.76% + 1 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 + 1 gld_efficiency Global Memory Load Efficiency 0.76% 0.76% 0.76% + 1 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% + 1 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 31.56% 31.56% 31.56% + 1 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% + 1 l2_l1_read_throughput L2 Throughput (L1 Reads) 38.692GB/s 38.692GB/s 38.692GB/s + 1 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% + 1 issued_ipc Issued IPC 0.813608 0.813608 0.813608 + 1 issue_slot_utilization Issue Slot Utilization 34.43% 34.43% 34.43% + 1 sysmem_read_transactions System Memory Read Transactions 0 0 0 + 1 sysmem_write_transactions System Memory Write Transactions 2 2 2 + 1 l2_read_transactions L2 Read Transactions 136080115 136080115 136080115 + 1 l2_write_transactions L2 Write Transactions 36704262 36704262 36704262 + 1 tex_cache_transactions Texture Cache Transactions 0 0 0 + 1 dram_read_transactions Device Memory Read Transactions 94567595 94567595 94567595 + 1 dram_write_transactions Device Memory Write Transactions 36428872 36428872 36428872 + 1 l2_read_throughput L2 Throughput (Reads) 38.659GB/s 38.659GB/s 38.658GB/s + 1 l2_write_throughput L2 Throughput (Writes) 10.427GB/s 10.427GB/s 10.427GB/s + 1 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 sysmem_write_throughput System Memory Write Throughput 610.000B/s 610.000B/s 0.00000B/s + 1 cf_issued Issued Control-Flow Instructions 51275685 51275685 51275685 + 1 cf_executed Executed Control-Flow Instructions 51275685 51275685 51275685 + 1 ldst_issued Issued Load/Store Instructions 370321967 370321967 370321967 + 1 ldst_executed Executed Load/Store Instructions 41094075 41094075 41094075 + 1 l1_shared_utilization L1/Shared Memory Utilization Low (1) Low (1) Low (1) + 1 l2_utilization L2 Cache Utilization Low (3) Low (3) Low (3) + 1 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) + 1 dram_utilization Device Memory Utilization Mid (4) Mid (4) Mid (4) + 1 sysmem_utilization System Memory Utilization Low (1) Low (1) Low (1) + 1 ldst_fu_utilization Load/Store Function Unit Utilization Low (3) Low (3) Low (3) + 1 alu_fu_utilization Arithmetic Function Unit Utilization Low (1) Low (1) Low (1) + 1 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) + 1 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) + 1 inst_issued Instructions Issued 583817539 583817539 583817539 + 1 issue_slots Issue Slots 494151271 494151271 494151271 + 1 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 136196344 136196344 136196344 + 1 l2_l1_write_transactions L2 Write Transactions (L1 write requests) 36704256 36704256 36704256 + 1 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 + 1 l2_l1_write_throughput L2 Throughput (L1 Writes) 10.427GB/s 10.427GB/s 10.427GB/s + 1 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 + 1 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.771487 0.771487 0.771487 + 1 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% + 1 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% + 1 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.00% 0.00% 0.00% + 1 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% + Kernel: void mtf_cuda_thread(unsigned char const *, unsigned char*, int, int) + 1 l1_cache_global_hit_rate L1 Global Hit Rate 9.04% 9.04% 9.04% + 1 branch_efficiency Branch Efficiency 71.35% 71.35% 71.35% + 1 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% + 1 sm_efficiency Multiprocessor Activity 90.58% 90.58% 90.58% + 1 achieved_occupancy Achieved Occupancy 0.575594 0.575594 0.575594 + 1 gld_requested_throughput Requested Global Load Throughput 332.23MB/s 332.23MB/s 331.88MB/s + 1 gst_requested_throughput Requested Global Store Throughput 332.15MB/s 332.15MB/s 331.88MB/s + 1 ipc Executed IPC 0.483597 0.483597 0.483597 + 1 sm_efficiency_instance Multiprocessor Activity 90.58% 90.58% 90.58% + 1 ipc_instance Executed IPC 0.483597 0.483597 0.483597 + 1 inst_per_warp Instructions per warp 1.1603e+06 1.1603e+06 1.1603e+06 + 1 gld_transactions Global Load Transactions 37757952 37757952 37757952 + 1 gst_transactions Global Store Transactions 37748736 37748736 37748736 + 1 local_load_transactions Local Load Transactions 0 0 0 + 1 local_store_transactions Local Store Transactions 0 0 0 + 1 shared_load_transactions Shared Load Transactions 10855168 10855168 10855168 + 1 shared_store_transactions Shared Store Transactions 19596620 19596620 19596620 + 1 gld_transactions_per_request Global Load Transactions Per Request 4.269880 4.269880 4.269880 + 1 gst_transactions_per_request Global Store Transactions Per Request 4.268101 4.268101 4.268101 + 1 local_load_transactions_per_request Local Memory Load Transactions Per Request 0.000000 0.000000 0.000000 + 1 local_store_transactions_per_request Local Memory Store Transactions Per Request 0.000000 0.000000 0.000000 + 1 shared_load_transactions_per_request Shared Memory Load Transactions Per Request 1.007018 1.007018 1.007018 + 1 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 0.998510 0.998510 0.998510 + 1 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 shared_load_throughput Shared Memory Load Throughput 12.279GB/s 12.279GB/s 12.279GB/s + 1 shared_store_throughput Shared Memory Store Throughput 22.167GB/s 22.167GB/s 22.166GB/s + 1 shared_efficiency Shared Memory Efficiency 11.73% 11.73% 11.73% + 1 flop_count_sp Floating Point Operations(Single Precision) 0 0 0 + 1 flop_count_sp_add Floating Point Operations(Single Precision Add) 0 0 0 + 1 flop_count_sp_mul Floating Point Operation(Single Precision Mul) 0 0 0 + 1 flop_count_sp_fma Floating Point Operations(Single Precision FMA) 0 0 0 + 1 flop_count_dp Floating Point Operations(Double Precision) 0 0 0 + 1 flop_count_dp_add Floating Point Operations(Double Precision Add) 0 0 0 + 1 flop_count_dp_mul Floating Point Operations(Double Precision Mul) 0 0 0 + 1 flop_count_dp_fma Floating Point Operations(Double Precision FMA) 0 0 0 + 1 flop_count_sp_special Floating Point Operations(Single Precision Special) 0 0 0 + 1 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 5.28% 5.28% 5.28% + 1 stall_exec_dependency Issue Stall Reasons (Execution Dependency) 32.61% 32.61% 32.61% + 1 stall_memory_dependency Issue Stall Reasons (Data Request) 57.15% 57.15% 57.15% + 1 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% + 1 inst_executed Instructions Executed 329512395 329512395 329512395 + 1 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% + 1 stall_other Issue Stall Reasons (Other) 4.94% 4.94% 4.94% + 1 inst_fp_32 FP Instructions(Single) 0 0 0 + 1 inst_fp_64 FP Instructions(Double) 0 0 0 + 1 inst_integer Integer Instructions 1763850511 1763850511 1763850511 + 1 inst_bit_convert Bit-Convert Instructions 1727074186 1727074186 1727074186 + 1 inst_control Control-Flow Instructions 456759720 456759720 456759720 + 1 inst_compute_ld_st Load/Store Instructions 530248754 530248754 530248754 + 1 inst_misc Misc Instructions 73507464 73507464 73507464 + 1 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 + 1 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 + 1 atomic_transactions Atomic Transactions 0 0 0 + 1 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 + 1 inst_replay_overhead Instruction Replay Overhead 0.972768 0.972768 0.972768 + 1 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 + 1 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.103707 0.103707 0.103707 + 1 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% + 1 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 dram_read_throughput Device Memory Read Throughput 22.093GB/s 22.093GB/s 22.093GB/s + 1 dram_write_throughput Device Memory Write Throughput 10.248GB/s 10.248GB/s 10.247GB/s + 1 gst_throughput Global Store Throughput 10.380GB/s 10.380GB/s 10.380GB/s + 1 gld_throughput Global Load Throughput 42.710GB/s 42.710GB/s 42.710GB/s + 1 warp_execution_efficiency Warp Execution Efficiency 46.25% 46.25% 46.25% + 1 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 + 1 gld_efficiency Global Memory Load Efficiency 0.76% 0.76% 0.76% + 1 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% + 1 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 41.98% 41.98% 41.98% + 1 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% + 1 l2_l1_read_throughput L2 Throughput (L1 Reads) 37.359GB/s 37.359GB/s 37.358GB/s + 1 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% + 1 issued_ipc Issued IPC 0.962714 0.962714 0.962714 + 1 issue_slot_utilization Issue Slot Utilization 42.30% 42.30% 42.30% + 1 sysmem_read_transactions System Memory Read Transactions 0 0 0 + 1 sysmem_write_transactions System Memory Write Transactions 0 0 0 + 1 l2_read_transactions L2 Read Transactions 131898349 131898349 131898349 + 1 l2_write_transactions L2 Write Transactions 36704495 36704495 36704495 + 1 tex_cache_transactions Texture Cache Transactions 0 0 0 + 1 dram_read_transactions Device Memory Read Transactions 78125450 78125450 78125450 + 1 dram_write_transactions Device Memory Write Transactions 36237571 36237571 36237571 + 1 l2_read_throughput L2 Throughput (Reads) 37.300GB/s 37.300GB/s 37.299GB/s + 1 l2_write_throughput L2 Throughput (Writes) 10.380GB/s 10.380GB/s 10.380GB/s + 1 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 cf_issued Issued Control-Flow Instructions 41451447 41451447 41451447 + 1 cf_executed Executed Control-Flow Instructions 41451447 41451447 41451447 + 1 ldst_issued Issued Load/Store Instructions 368603652 368603652 368603652 + 1 ldst_executed Executed Load/Store Instructions 48092635 48092635 48092635 + 1 l1_shared_utilization L1/Shared Memory Utilization Low (2) Low (2) Low (2) + 1 l2_utilization L2 Cache Utilization Low (3) Low (3) Low (3) + 1 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) + 1 dram_utilization Device Memory Utilization Low (3) Low (3) Low (3) + 1 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) + 1 ldst_fu_utilization Load/Store Function Unit Utilization Low (3) Low (3) Low (3) + 1 alu_fu_utilization Arithmetic Function Unit Utilization Low (2) Low (2) Low (2) + 1 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) + 1 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) + 1 inst_issued Instructions Issued 649890168 649890168 649890168 + 1 issue_slots Issue Slots 571131893 571131893 571131893 + 1 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 132107444 132107444 132107444 + 1 l2_l1_write_transactions L2 Write Transactions (L1 write requests) 36704256 36704256 36704256 + 1 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 + 1 l2_l1_write_throughput L2 Throughput (L1 Writes) 10.380GB/s 10.380GB/s 10.380GB/s + 1 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 + 1 eligible_warps_per_cycle Eligible Warps Per Active Cycle 1.030854 1.030854 1.030854 + 1 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% + 1 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% + 1 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.01% 0.01% 0.01% + 1 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% + Kernel: void mtf_cuda_scalar(unsigned char const *, unsigned char*, int, int) + 1 l1_cache_global_hit_rate L1 Global Hit Rate 91.12% 91.12% 91.12% + 1 branch_efficiency Branch Efficiency 100.00% 100.00% 100.00% + 1 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% + 1 sm_efficiency Multiprocessor Activity 99.66% 99.66% 99.66% + 1 achieved_occupancy Achieved Occupancy 0.654251 0.654251 0.654251 + 1 gld_requested_throughput Requested Global Load Throughput 15.845GB/s 15.845GB/s 15.845GB/s + 1 gst_requested_throughput Requested Global Store Throughput 15.841GB/s 15.841GB/s 15.841GB/s + 1 ipc Executed IPC 1.907166 1.907166 1.907166 + 1 sm_efficiency_instance Multiprocessor Activity 99.66% 99.66% 99.66% + 1 ipc_instance Executed IPC 1.907166 1.907166 1.907166 + 1 inst_per_warp Instructions per warp 1.0512e+05 1.0512e+05 1.0512e+05 + 1 gld_transactions Global Load Transactions 36725320 36725320 36725320 + 1 gst_transactions Global Store Transactions 36765696 36765696 36765696 + 1 local_load_transactions Local Load Transactions 0 0 0 + 1 local_store_transactions Local Store Transactions 0 0 0 + 1 shared_load_transactions Shared Load Transactions 39717232 39717232 39717232 + 1 shared_store_transactions Shared Store Transactions 75391552 75391552 75391552 + 1 gld_transactions_per_request Global Load Transactions Per Request 1.000335 1.000335 1.000335 + 1 gst_transactions_per_request Global Store Transactions Per Request 1.001674 1.001674 1.001674 + 1 local_load_transactions_per_request Local Memory Load Transactions Per Request 0.000000 0.000000 0.000000 + 1 local_store_transactions_per_request Local Memory Store Transactions Per Request 0.000000 0.000000 0.000000 + 1 shared_load_transactions_per_request Shared Memory Load Transactions Per Request 1.002954 1.002954 1.002954 + 1 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 1.001204 1.001204 1.001204 + 1 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 shared_load_throughput Shared Memory Load Throughput 68.566GB/s 68.566GB/s 68.565GB/s + 1 shared_store_throughput Shared Memory Store Throughput 130.15GB/s 130.15GB/s 130.15GB/s + 1 shared_efficiency Shared Memory Efficiency 73.18% 73.18% 73.18% + 1 flop_count_sp Floating Point Operations(Single Precision) 0 0 0 + 1 flop_count_sp_add Floating Point Operations(Single Precision Add) 0 0 0 + 1 flop_count_sp_mul Floating Point Operation(Single Precision Mul) 0 0 0 + 1 flop_count_sp_fma Floating Point Operations(Single Precision FMA) 0 0 0 + 1 flop_count_dp Floating Point Operations(Double Precision) 0 0 0 + 1 flop_count_dp_add Floating Point Operations(Double Precision Add) 0 0 0 + 1 flop_count_dp_mul Floating Point Operations(Double Precision Mul) 0 0 0 + 1 flop_count_dp_fma Floating Point Operations(Double Precision FMA) 0 0 0 + 1 flop_count_sp_special Floating Point Operations(Single Precision Special) 0 0 0 + 1 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 42.07% 42.07% 42.07% + 1 stall_exec_dependency Issue Stall Reasons (Execution Dependency) 28.09% 28.09% 28.09% + 1 stall_memory_dependency Issue Stall Reasons (Data Request) 0.43% 0.43% 0.43% + 1 stall_sync Issue Stall Reasons (Synchronization) 23.39% 23.39% 23.39% + 1 inst_executed Instructions Executed 942303984 942303984 942303984 + 1 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% + 1 stall_other Issue Stall Reasons (Other) 5.86% 5.86% 5.86% + 1 inst_fp_32 FP Instructions(Single) 0 0 0 + 1 inst_fp_64 FP Instructions(Double) 0 0 0 + 1 inst_integer Integer Instructions 1.4022e+10 1.4022e+10 1.4022e+10 + 1 inst_bit_convert Bit-Convert Instructions 1449513632 1449513632 1449513632 + 1 inst_control Control-Flow Instructions 335459052 335459052 335459052 + 1 inst_compute_ld_st Load/Store Instructions 5044891705 5044891705 5044891705 + 1 inst_misc Misc Instructions 4625302368 4625302368 4625302368 + 1 inst_inter_thread_communication Inter-Thread Instructions 1234584512 1234584512 1234584512 + 1 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 + 1 atomic_transactions Atomic Transactions 0 0 0 + 1 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 + 1 inst_replay_overhead Instruction Replay Overhead 0.000407 0.000407 0.000407 + 1 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 + 1 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.003704 0.003704 0.003704 + 1 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% + 1 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 dram_read_throughput Device Memory Read Throughput 997.30MB/s 997.30MB/s 996.59MB/s + 1 dram_write_throughput Device Memory Write Throughput 14.523GB/s 14.523GB/s 14.522GB/s + 1 gst_throughput Global Store Throughput 15.841GB/s 15.841GB/s 15.841GB/s + 1 gld_throughput Global Load Throughput 63.401GB/s 63.401GB/s 63.400GB/s + 1 warp_execution_efficiency Warp Execution Efficiency 100.00% 100.00% 100.00% + 1 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 + 1 gld_efficiency Global Memory Load Efficiency 24.99% 24.99% 24.99% + 1 gst_efficiency Global Memory Store Efficiency 100.00% 100.00% 100.00% + 1 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 87.95% 87.95% 87.95% + 1 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% + 1 l2_l1_read_throughput L2 Throughput (L1 Reads) 5.9021GB/s 5.9021GB/s 5.9018GB/s + 1 local_memory_overhead Local Memory Overhead 1.25% 1.25% 1.25% + 1 issued_ipc Issued IPC 1.910859 1.910859 1.910859 + 1 issue_slot_utilization Issue Slot Utilization 78.19% 78.19% 78.19% + 1 sysmem_read_transactions System Memory Read Transactions 0 0 0 + 1 sysmem_write_transactions System Memory Write Transactions 0 0 0 + 1 l2_read_transactions L2 Read Transactions 14053562 14053562 14053562 + 1 l2_write_transactions L2 Write Transactions 36704271 36704271 36704271 + 1 tex_cache_transactions Texture Cache Transactions 0 0 0 + 1 dram_read_transactions Device Memory Read Transactions 2256612 2256612 2256612 + 1 dram_write_transactions Device Memory Write Transactions 33650257 33650257 33650257 + 1 l2_read_throughput L2 Throughput (Reads) 6.0653GB/s 6.0653GB/s 6.0648GB/s + 1 l2_write_throughput L2 Throughput (Writes) 15.841GB/s 15.841GB/s 15.841GB/s + 1 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 sysmem_write_throughput System Memory Write Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 cf_issued Issued Control-Flow Instructions 86175411 86175411 86175411 + 1 cf_executed Executed Control-Flow Instructions 86175411 86175411 86175411 + 1 ldst_issued Issued Load/Store Instructions 226260725 226260725 226260725 + 1 ldst_executed Executed Load/Store Instructions 225889086 225889086 225889086 + 1 l1_shared_utilization L1/Shared Memory Utilization Mid (4) Mid (4) Mid (4) + 1 l2_utilization L2 Cache Utilization Low (2) Low (2) Low (2) + 1 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) + 1 dram_utilization Device Memory Utilization Low (2) Low (2) Low (2) + 1 sysmem_utilization System Memory Utilization Idle (0) Idle (0) Idle (0) + 1 ldst_fu_utilization Load/Store Function Unit Utilization Low (3) Low (3) Low (3) + 1 alu_fu_utilization Arithmetic Function Unit Utilization Mid (5) Mid (5) Mid (5) + 1 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) + 1 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) + 1 inst_issued Instructions Issued 942702045 942702045 942702045 + 1 issue_slots Issue Slots 771484670 771484670 771484670 + 1 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 13675484 13675484 13675484 + 1 l2_l1_write_transactions L2 Write Transactions (L1 write requests) 36704256 36704256 36704256 + 1 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 + 1 l2_l1_write_throughput L2 Throughput (L1 Writes) 15.841GB/s 15.841GB/s 15.841GB/s + 1 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 + 1 eligible_warps_per_cycle Eligible Warps Per Active Cycle 3.041417 3.041417 3.041417 + 1 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% + 1 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% + 1 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.16% 0.16% 0.16% + 1 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% + Kernel: void mtf_cuda_thread(unsigned char const *, unsigned char*, int, int) + 1 l1_cache_global_hit_rate L1 Global Hit Rate 7.89% 7.89% 7.89% + 1 branch_efficiency Branch Efficiency 69.80% 69.80% 69.80% + 1 l1_cache_local_hit_rate L1 Local Hit Rate 0.00% 0.00% 0.00% + 1 sm_efficiency Multiprocessor Activity 93.49% 93.49% 93.49% + 1 achieved_occupancy Achieved Occupancy 0.589260 0.589260 0.589260 + 1 gld_requested_throughput Requested Global Load Throughput 338.56MB/s 338.56MB/s 338.55MB/s + 1 gst_requested_throughput Requested Global Store Throughput 338.47MB/s 338.47MB/s 337.60MB/s + 1 ipc Executed IPC 0.407709 0.407709 0.407709 + 1 sm_efficiency_instance Multiprocessor Activity 93.49% 93.49% 93.49% + 1 ipc_instance Executed IPC 0.407709 0.407709 0.407709 + 1 inst_per_warp Instructions per warp 9.8336e+05 9.8336e+05 9.8336e+05 + 1 gld_transactions Global Load Transactions 37757952 37757952 37757952 + 1 gst_transactions Global Store Transactions 37748736 37748736 37748736 + 1 local_load_transactions Local Load Transactions 0 0 0 + 1 local_store_transactions Local Store Transactions 0 0 0 + 1 shared_load_transactions Shared Load Transactions 9057364 9057364 9057364 + 1 shared_store_transactions Shared Store Transactions 17019016 17019016 17019016 + 1 gld_transactions_per_request Global Load Transactions Per Request 4.822532 4.822532 4.822532 + 1 gst_transactions_per_request Global Store Transactions Per Request 4.820532 4.820532 4.820532 + 1 local_load_transactions_per_request Local Memory Load Transactions Per Request 0.000000 0.000000 0.000000 + 1 local_store_transactions_per_request Local Memory Store Transactions Per Request 0.000000 0.000000 0.000000 + 1 shared_load_transactions_per_request Shared Memory Load Transactions Per Request 1.018087 1.018087 1.018087 + 1 shared_store_transactions_per_request Shared Memory Store Transactions Per Request 1.017390 1.017390 1.017390 + 1 local_load_throughput Local Memory Load Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 local_store_throughput Local Memory Store Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 shared_load_throughput Shared Memory Load Throughput 10.440GB/s 10.440GB/s 10.440GB/s + 1 shared_store_throughput Shared Memory Store Throughput 19.618GB/s 19.618GB/s 19.617GB/s + 1 shared_efficiency Shared Memory Efficiency 11.98% 11.98% 11.98% + 1 flop_count_sp Floating Point Operations(Single Precision) 0 0 0 + 1 flop_count_sp_add Floating Point Operations(Single Precision Add) 0 0 0 + 1 flop_count_sp_mul Floating Point Operation(Single Precision Mul) 0 0 0 + 1 flop_count_sp_fma Floating Point Operations(Single Precision FMA) 0 0 0 + 1 flop_count_dp Floating Point Operations(Double Precision) 0 0 0 + 1 flop_count_dp_add Floating Point Operations(Double Precision Add) 0 0 0 + 1 flop_count_dp_mul Floating Point Operations(Double Precision Mul) 0 0 0 + 1 flop_count_dp_fma Floating Point Operations(Double Precision FMA) 0 0 0 + 1 flop_count_sp_special Floating Point Operations(Single Precision Special) 0 0 0 + 1 stall_inst_fetch Issue Stall Reasons (Instructions Fetch) 4.42% 4.42% 4.42% + 1 stall_exec_dependency Issue Stall Reasons (Execution Dependency) 36.75% 36.75% 36.75% + 1 stall_memory_dependency Issue Stall Reasons (Data Request) 54.08% 54.08% 54.08% + 1 stall_sync Issue Stall Reasons (Synchronization) 0.00% 0.00% 0.00% + 1 inst_executed Instructions Executed 279274913 279274913 279274913 + 1 stall_texture Issue Stall Reasons (Texture) 0.00% 0.00% 0.00% + 1 stall_other Issue Stall Reasons (Other) 4.74% 4.74% 4.74% + 1 inst_fp_32 FP Instructions(Single) 0 0 0 + 1 inst_fp_64 FP Instructions(Double) 0 0 0 + 1 inst_integer Integer Instructions 1563685868 1563685868 1563685868 + 1 inst_bit_convert Bit-Convert Instructions 1526909543 1526909543 1526909543 + 1 inst_control Control-Flow Instructions 399569822 399569822 399569822 + 1 inst_compute_ld_st Load/Store Instructions 473023012 473023012 473023012 + 1 inst_misc Misc Instructions 73471620 73471620 73471620 + 1 inst_inter_thread_communication Inter-Thread Instructions 0 0 0 + 1 atomic_replay_overhead Atomic Replay Overhead 0.000000 0.000000 0.000000 + 1 atomic_transactions Atomic Transactions 0 0 0 + 1 atomic_transactions_per_request Atomic Transactions Per Request 0.000000 0.000000 0.000000 + 1 inst_replay_overhead Instruction Replay Overhead 1.321066 1.321066 1.321066 + 1 shared_replay_overhead Shared Memory Replay Overhead 0.000000 0.000000 0.000000 + 1 global_cache_replay_overhead Global Memory Cache Replay Overhead 0.124758 0.124758 0.124758 + 1 tex_cache_hit_rate Texture Cache Hit Rate 0.00% 0.00% 0.00% + 1 tex_cache_throughput Texture Cache Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 dram_read_throughput Device Memory Read Throughput 24.589GB/s 24.589GB/s 24.589GB/s + 1 dram_write_throughput Device Memory Write Throughput 10.482GB/s 10.482GB/s 10.482GB/s + 1 gst_throughput Global Store Throughput 10.577GB/s 10.577GB/s 10.577GB/s + 1 gld_throughput Global Load Throughput 43.524GB/s 43.524GB/s 43.523GB/s + 1 warp_execution_efficiency Warp Execution Efficiency 48.51% 48.51% 48.51% + 1 local_replay_overhead Local Memory Cache Replay Overhead 0.000000 0.000000 0.000000 + 1 gld_efficiency Global Memory Load Efficiency 0.76% 0.76% 0.76% + 1 gst_efficiency Global Memory Store Efficiency 3.13% 3.13% 3.13% + 1 l2_texture_read_throughput L2 Throughput (Texture Reads) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_hit_rate L2 Hit Rate (L1 Reads) 37.57% 37.57% 37.57% + 1 l2_texture_read_hit_rate L2 Hit Rate (Texture Reads) 0.00% 0.00% 0.00% + 1 l2_l1_read_throughput L2 Throughput (L1 Reads) 38.699GB/s 38.699GB/s 38.698GB/s + 1 local_memory_overhead Local Memory Overhead 0.00% 0.00% 0.00% + 1 issued_ipc Issued IPC 0.946531 0.946531 0.946531 + 1 issue_slot_utilization Issue Slot Utilization 41.14% 41.14% 41.14% + 1 sysmem_read_transactions System Memory Read Transactions 0 0 0 + 1 sysmem_write_transactions System Memory Write Transactions 2 2 2 + 1 l2_read_transactions L2 Read Transactions 134055034 134055034 134055034 + 1 l2_write_transactions L2 Write Transactions 36704262 36704262 36704262 + 1 tex_cache_transactions Texture Cache Transactions 0 0 0 + 1 dram_read_transactions Device Memory Read Transactions 85326521 85326521 85326521 + 1 dram_write_transactions Device Memory Write Transactions 36373922 36373922 36373922 + 1 l2_read_throughput L2 Throughput (Reads) 38.631GB/s 38.631GB/s 38.631GB/s + 1 l2_write_throughput L2 Throughput (Writes) 10.577GB/s 10.577GB/s 10.577GB/s + 1 sysmem_read_throughput System Memory Read Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 sysmem_write_throughput System Memory Write Throughput 618.000B/s 618.000B/s 0.00000B/s + 1 cf_issued Issued Control-Flow Instructions 34707981 34707981 34707981 + 1 cf_executed Executed Control-Flow Instructions 34707981 34707981 34707981 + 1 ldst_issued Issued Load/Store Instructions 411428587 411428587 411428587 + 1 ldst_executed Executed Load/Store Instructions 41284886 41284886 41284886 + 1 l1_shared_utilization L1/Shared Memory Utilization Low (1) Low (1) Low (1) + 1 l2_utilization L2 Cache Utilization Low (3) Low (3) Low (3) + 1 tex_utilization Texture Cache Utilization Idle (0) Idle (0) Idle (0) + 1 dram_utilization Device Memory Utilization Low (3) Low (3) Low (3) + 1 sysmem_utilization System Memory Utilization Low (1) Low (1) Low (1) + 1 ldst_fu_utilization Load/Store Function Unit Utilization Mid (4) Mid (4) Mid (4) + 1 alu_fu_utilization Arithmetic Function Unit Utilization Low (2) Low (2) Low (2) + 1 cf_fu_utilization Control-Flow Function Unit Utilization Low (1) Low (1) Low (1) + 1 tex_fu_utilization Texture Function Unit Utilization Idle (0) Idle (0) Idle (0) + 1 inst_issued Instructions Issued 650053162 650053162 650053162 + 1 issue_slots Issue Slots 565106872 565106872 565106872 + 1 l2_atomic_throughput L2 Throughput (Atomic requests) 0.00000B/s 0.00000B/s 0.00000B/s + 1 l2_l1_read_transactions L2 Read Transactions (L1 read requests) 134289156 134289156 134289156 + 1 l2_l1_write_transactions L2 Write Transactions (L1 write requests) 36704256 36704256 36704256 + 1 l2_tex_read_transactions L2 Transactions (Texture Reads) 0 0 0 + 1 l2_l1_write_throughput L2 Throughput (L1 Writes) 10.577GB/s 10.577GB/s 10.577GB/s + 1 l2_atomic_transactions L2 Transactions (Atomic requests) 0 0 0 + 1 eligible_warps_per_cycle Eligible Warps Per Active Cycle 0.972841 0.972841 0.972841 + 1 atomic_throughput Atomic Throughput 0.00000B/s 0.00000B/s 0.00000B/s + 1 flop_sp_efficiency FLOP Efficiency(Peak Single) 0.00% 0.00% 0.00% + 1 flop_dp_efficiency FLOP Efficiency(Peak Double) 0.00% 0.00% 0.00% + 1 stall_pipe_busy Issue Stall Reasons (Pipe Busy) 0.01% 0.01% 0.01% + 1 stall_memory_throttle Issue Stall Reasons (Memory Throttle) 0.00% 0.00% 0.00% diff --git a/app_bslab/resource-usage.txt b/app_bslab/resource-usage.txt index 6a9d998..a29df8b 100644 --- a/app_bslab/resource-usage.txt +++ b/app_bslab/resource-usage.txt @@ -1,76 +1,144 @@ -ptxas info : 1808 bytes gmem, 184 bytes cmem[14] -ptxas info : Compiling entry function '_Z8mtf_4by8ILi4096ELi128ELi32EhLi256EEvPKhPhii' for 'sm_20' -ptxas info : Function properties for _Z8mtf_4by8ILi4096ELi128ELi32EhLi256EEvPKhPhii +ptxas info : 2113 bytes gmem, 272 bytes cmem[14] +ptxas info : Compiling entry function '_ZN3cub31DeviceRadixSortSingleTileKernelINS_21DeviceRadixSortPolicyIyNS_8NullTypeEiE9Policy520ELb0EyS2_iEEvPT1_S6_PT2_S8_T3_ii' for 'sm_20' +ptxas info : Function properties for _ZN3cub31DeviceRadixSortSingleTileKernelINS_21DeviceRadixSortPolicyIyNS_8NullTypeEiE9Policy520ELb0EyS2_iEEvPT1_S6_PT2_S8_T3_ii + 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +ptxas info : Used 56 registers, 4752 bytes smem, 76 bytes cmem[0] +ptxas info : Compiling entry function '_Z25mtf_cuda_2buffers_depth32ILi4096ELi4ELi2EjLi32EEvPKhPhii' for 'sm_20' +ptxas info : Function properties for _Z25mtf_cuda_2buffers_depth32ILi4096ELi4ELi2EjLi32EEvPKhPhii + 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +ptxas info : Used 19 registers, 1056 bytes smem, 56 bytes cmem[0] +ptxas info : Compiling entry function '_ZN3cub28DeviceRadixSortUpsweepKernelINS_21DeviceRadixSortPolicyIyhiE9Policy520ELb1ELb0EyiEEvPT2_PT3_S6_iiNS_13GridEvenShareIS6_EE' for 'sm_20' +ptxas info : Function properties for _ZN3cub28DeviceRadixSortUpsweepKernelINS_21DeviceRadixSortPolicyIyhiE9Policy520ELb1ELb0EyiEEvPT2_PT3_S6_iiNS_13GridEvenShareIS6_EE + 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +ptxas info : Used 33 registers, 2112 bytes smem, 96 bytes cmem[0] +ptxas info : Compiling entry function '_Z15mtf_cuda_threadILi4096ELi128ELi16EEvPKhPhii' for 'sm_20' +ptxas info : Function properties for _Z15mtf_cuda_threadILi4096ELi128ELi16EEvPKhPhii + 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +ptxas info : Used 15 registers, 2048 bytes smem, 56 bytes cmem[0] +ptxas info : Compiling entry function '_Z19mtf_cuda_thread_by4ILi4096ELi128ELi16EEvPKhPhii' for 'sm_20' +ptxas info : Function properties for _Z19mtf_cuda_thread_by4ILi4096ELi128ELi16EEvPKhPhii + 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +ptxas info : Used 16 registers, 2048 bytes smem, 56 bytes cmem[0] +ptxas info : Compiling entry function '_Z28bsc_st8_encode_cuda_postsortPyiyPi' for 'sm_20' +ptxas info : Function properties for _Z28bsc_st8_encode_cuda_postsortPyiyPi + 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +ptxas info : Used 13 registers, 64 bytes cmem[0] +ptxas info : Compiling entry function '_ZN3cub30DeviceRadixSortDownsweepKernelINS_21DeviceRadixSortPolicyIyNS_8NullTypeEiE9Policy520ELb0ELb0EyS2_iEEvPT2_S6_PT3_S8_PT4_S9_iiNS_13GridEvenShareIS9_EE' for 'sm_20' +ptxas info : Function properties for _ZN3cub30DeviceRadixSortDownsweepKernelINS_21DeviceRadixSortPolicyIyNS_8NullTypeEiE9Policy520ELb0ELb0EyS2_iEEvPT2_S6_PT3_S8_PT4_S9_iiNS_13GridEvenShareIS9_EE + 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +ptxas info : Used 52 registers, 4888 bytes smem, 120 bytes cmem[0] +ptxas info : Compiling entry function '_Z13mtf_cuda_4by8ILi4096ELi128ELi32EhLi256EEvPKhPhii' for 'sm_20' +ptxas info : Function properties for _Z13mtf_cuda_4by8ILi4096ELi128ELi32EhLi256EEvPKhPhii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 8192 bytes smem, 56 bytes cmem[0] -ptxas info : Compiling entry function '_Z14mtf_thread_by4ILi4096ELi32ELi16EEvPKhPhii' for 'sm_20' -ptxas info : Function properties for _Z14mtf_thread_by4ILi4096ELi32ELi16EEvPKhPhii +ptxas info : Compiling entry function '_Z19mtf_cuda_thread_by4ILi4096ELi128ELi64EEvPKhPhii' for 'sm_20' +ptxas info : Function properties for _Z19mtf_cuda_thread_by4ILi4096ELi128ELi64EEvPKhPhii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -ptxas info : Used 16 registers, 512 bytes smem, 56 bytes cmem[0] -ptxas info : Compiling entry function '_Z20mtf_2buffers_depth32ILi4096ELi4ELi2EjLi32EEvPKhPhii' for 'sm_20' -ptxas info : Function properties for _Z20mtf_2buffers_depth32ILi4096ELi4ELi2EjLi32EEvPKhPhii +ptxas info : Used 21 registers, 8192 bytes smem, 56 bytes cmem[0] +ptxas info : Compiling entry function '_Z17mtf_cuda_2symbolsILi4096ELi4EjEvPKhPhii' for 'sm_20' +ptxas info : Function properties for _Z17mtf_cuda_2symbolsILi4096ELi4EjEvPKhPhii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -ptxas info : Used 19 registers, 1056 bytes smem, 56 bytes cmem[0] -ptxas info : Compiling entry function '_Z10mtf_threadILi4096ELi32ELi16EEvPKhPhii' for 'sm_20' -ptxas info : Function properties for _Z10mtf_threadILi4096ELi32ELi16EEvPKhPhii +ptxas info : Used 26 registers, 4096 bytes smem, 56 bytes cmem[0] +ptxas info : Compiling entry function '_ZN3cub31DeviceRadixSortSingleTileKernelINS_21DeviceRadixSortPolicyIyhiE9Policy520ELb0EyhiEEvPT1_S5_PT2_S7_T3_ii' for 'sm_20' +ptxas info : Function properties for _ZN3cub31DeviceRadixSortSingleTileKernelINS_21DeviceRadixSortPolicyIyhiE9Policy520ELb0EyhiEEvPT1_S5_PT2_S7_T3_ii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -ptxas info : Used 16 registers, 512 bytes smem, 56 bytes cmem[0] -ptxas info : Compiling entry function '_Z20mtf_2buffers_depth32ILi4096ELi4ELi4EjLi32EEvPKhPhii' for 'sm_20' -ptxas info : Function properties for _Z20mtf_2buffers_depth32ILi4096ELi4ELi4EjLi32EEvPKhPhii +ptxas info : Used 53 registers, 9496 bytes smem, 76 bytes cmem[0] +ptxas info : Compiling entry function '_Z19mtf_cuda_thread_by4ILi4096ELi32ELi256EEvPKhPhii' for 'sm_20' +ptxas info : Function properties for _Z19mtf_cuda_thread_by4ILi4096ELi32ELi256EEvPKhPhii + 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +ptxas info : Used 23 registers, 8192 bytes smem, 56 bytes cmem[0] +ptxas info : Compiling entry function '_ZN3cub28DeviceRadixSortUpsweepKernelINS_21DeviceRadixSortPolicyIyNS_8NullTypeEiE9Policy520ELb1ELb0EyiEEvPT2_PT3_S7_iiNS_13GridEvenShareIS7_EE' for 'sm_20' +ptxas info : Function properties for _ZN3cub28DeviceRadixSortUpsweepKernelINS_21DeviceRadixSortPolicyIyNS_8NullTypeEiE9Policy520ELb1ELb0EyiEEvPT2_PT3_S7_iiNS_13GridEvenShareIS7_EE + 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +ptxas info : Used 45 registers, 2112 bytes smem, 96 bytes cmem[0] +ptxas info : Compiling entry function '_Z25mtf_cuda_2buffers_depth32ILi4096ELi4ELi4EjLi32EEvPKhPhii' for 'sm_20' +ptxas info : Function properties for _Z25mtf_cuda_2buffers_depth32ILi4096ELi4ELi4EjLi32EEvPKhPhii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 20 registers, 2112 bytes smem, 56 bytes cmem[0] -ptxas info : Compiling entry function '_Z10mtf_threadILi4096ELi32ELi64EEvPKhPhii' for 'sm_20' -ptxas info : Function properties for _Z10mtf_threadILi4096ELi32ELi64EEvPKhPhii +ptxas info : Compiling entry function '_Z15mtf_cuda_threadILi4096ELi128ELi64EEvPKhPhii' for 'sm_20' +ptxas info : Function properties for _Z15mtf_cuda_threadILi4096ELi128ELi64EEvPKhPhii + 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +ptxas info : Used 21 registers, 8192 bytes smem, 56 bytes cmem[0] +ptxas info : Compiling entry function '_Z27bsc_st8_encode_cuda_presortPhPyS_i' for 'sm_20' +ptxas info : Function properties for _Z27bsc_st8_encode_cuda_presortPhPyS_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -ptxas info : Used 23 registers, 2048 bytes smem, 56 bytes cmem[0] -ptxas info : Compiling entry function '_Z14mtf_thread_by4ILi4096ELi32ELi256EEvPKhPhii' for 'sm_20' -ptxas info : Function properties for _Z14mtf_thread_by4ILi4096ELi32ELi256EEvPKhPhii +ptxas info : Used 18 registers, 804 bytes smem, 60 bytes cmem[0] +ptxas info : Compiling entry function '_ZN3cub30DeviceRadixSortDownsweepKernelINS_21DeviceRadixSortPolicyIyhiE9Policy520ELb0ELb0EyhiEEvPT2_S5_PT3_S7_PT4_S8_iiNS_13GridEvenShareIS8_EE' for 'sm_20' +ptxas info : Function properties for _ZN3cub30DeviceRadixSortDownsweepKernelINS_21DeviceRadixSortPolicyIyhiE9Policy520ELb0ELb0EyhiEEvPT2_S5_PT3_S7_PT4_S8_iiNS_13GridEvenShareIS8_EE + 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +ptxas info : Used 46 registers, 9632 bytes smem, 120 bytes cmem[0] +ptxas info : Compiling entry function '_Z19mtf_cuda_thread_by4ILi4096ELi128ELi32EEvPKhPhii' for 'sm_20' +ptxas info : Function properties for _Z19mtf_cuda_thread_by4ILi4096ELi128ELi32EEvPKhPhii + 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +ptxas info : Used 19 registers, 4096 bytes smem, 56 bytes cmem[0] +ptxas info : Compiling entry function '_Z15mtf_cuda_scalarILi4096ELi4EjEvPKhPhii' for 'sm_20' +ptxas info : Function properties for _Z15mtf_cuda_scalarILi4096ELi4EjEvPKhPhii + 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +ptxas info : Used 21 registers, 4096 bytes smem, 56 bytes cmem[0] +ptxas info : Compiling entry function '_ZN3cub30DeviceRadixSortDownsweepKernelINS_21DeviceRadixSortPolicyIyNS_8NullTypeEiE9Policy520ELb1ELb0EyS2_iEEvPT2_S6_PT3_S8_PT4_S9_iiNS_13GridEvenShareIS9_EE' for 'sm_20' +ptxas info : Function properties for _ZN3cub30DeviceRadixSortDownsweepKernelINS_21DeviceRadixSortPolicyIyNS_8NullTypeEiE9Policy520ELb1ELb0EyS2_iEEvPT2_S6_PT3_S8_PT4_S9_iiNS_13GridEvenShareIS9_EE + 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +ptxas info : Used 52 registers, 4680 bytes smem, 120 bytes cmem[0] +ptxas info : Compiling entry function '_Z15mtf_cuda_threadILi4096ELi32ELi256EEvPKhPhii' for 'sm_20' +ptxas info : Function properties for _Z15mtf_cuda_threadILi4096ELi32ELi256EEvPKhPhii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 22 registers, 8192 bytes smem, 56 bytes cmem[0] -ptxas info : Compiling entry function '_Z14mtf_thread_by4ILi4096ELi32ELi64EEvPKhPhii' for 'sm_20' -ptxas info : Function properties for _Z14mtf_thread_by4ILi4096ELi32ELi64EEvPKhPhii +ptxas info : Compiling entry function '_ZN3cub28DeviceRadixSortUpsweepKernelINS_21DeviceRadixSortPolicyIyNS_8NullTypeEiE9Policy520ELb0ELb0EyiEEvPT2_PT3_S7_iiNS_13GridEvenShareIS7_EE' for 'sm_20' +ptxas info : Function properties for _ZN3cub28DeviceRadixSortUpsweepKernelINS_21DeviceRadixSortPolicyIyNS_8NullTypeEiE9Policy520ELb0ELb0EyiEEvPT2_PT3_S7_iiNS_13GridEvenShareIS7_EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -ptxas info : Used 22 registers, 2048 bytes smem, 56 bytes cmem[0] -ptxas info : Compiling entry function '_Z12mtf_2symbolsILi4096ELi4EjEvPKhPhii' for 'sm_20' -ptxas info : Function properties for _Z12mtf_2symbolsILi4096ELi4EjEvPKhPhii +ptxas info : Used 53 registers, 4224 bytes smem, 96 bytes cmem[0] +ptxas info : Compiling entry function '_Z25mtf_cuda_2buffers_depth32ILi4096ELi4ELi3EjLi32EEvPKhPhii' for 'sm_20' +ptxas info : Function properties for _Z25mtf_cuda_2buffers_depth32ILi4096ELi4ELi3EjLi32EEvPKhPhii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -ptxas info : Used 26 registers, 4096 bytes smem, 56 bytes cmem[0] -ptxas info : Compiling entry function '_Z8mtf_4by8ILi4096ELi256ELi32EhLi256EEvPKhPhii' for 'sm_20' -ptxas info : Function properties for _Z8mtf_4by8ILi4096ELi256ELi32EhLi256EEvPKhPhii +ptxas info : Used 19 registers, 1584 bytes smem, 56 bytes cmem[0] +ptxas info : Compiling entry function '_Z15mtf_cuda_threadILi4096ELi128ELi32EEvPKhPhii' for 'sm_20' +ptxas info : Function properties for _Z15mtf_cuda_threadILi4096ELi128ELi32EEvPKhPhii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -ptxas info : Used 23 registers, 8192 bytes smem, 56 bytes cmem[0] -ptxas info : Compiling entry function '_Z14mtf_thread_by4ILi4096ELi32ELi8EEvPKhPhii' for 'sm_20' -ptxas info : Function properties for _Z14mtf_thread_by4ILi4096ELi32ELi8EEvPKhPhii +ptxas info : Used 18 registers, 4096 bytes smem, 56 bytes cmem[0] +ptxas info : Compiling entry function '_Z29bsc_st567_encode_cuda_presortPhPyi' for 'sm_20' +ptxas info : Function properties for _Z29bsc_st567_encode_cuda_presortPhPyi + 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +ptxas info : Used 16 registers, 800 bytes smem, 52 bytes cmem[0] +ptxas info : Compiling entry function '_ZN3cub23RadixSortScanBinsKernelINS_21DeviceRadixSortPolicyIyhiE9Policy520EiEEvPT0_i' for 'sm_20' +ptxas info : Function properties for _ZN3cub23RadixSortScanBinsKernelINS_21DeviceRadixSortPolicyIyhiE9Policy520EiEEvPT0_i + 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +ptxas info : Used 31 registers, 2620 bytes smem, 44 bytes cmem[0], 4 bytes cmem[16] +ptxas info : Compiling entry function '_ZN3cub11EmptyKernelIvEEvv' for 'sm_20' +ptxas info : Function properties for _ZN3cub11EmptyKernelIvEEvv 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -ptxas info : Used 16 registers, 256 bytes smem, 56 bytes cmem[0] -ptxas info : Compiling entry function '_Z12mtf_2buffersILi4096ELi4EjEvPKhPhii' for 'sm_20' -ptxas info : Function properties for _Z12mtf_2buffersILi4096ELi4EjEvPKhPhii +ptxas info : Used 2 registers, 32 bytes cmem[0] +ptxas info : Compiling entry function '_Z17mtf_cuda_2buffersILi4096ELi4EjEvPKhPhii' for 'sm_20' +ptxas info : Function properties for _Z17mtf_cuda_2buffersILi4096ELi4EjEvPKhPhii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 27 registers, 8192 bytes smem, 56 bytes cmem[0] -ptxas info : Compiling entry function '_Z10mtf_threadILi4096ELi32ELi8EEvPKhPhii' for 'sm_20' -ptxas info : Function properties for _Z10mtf_threadILi4096ELi32ELi8EEvPKhPhii +ptxas info : Compiling entry function '_ZN3cub28DeviceRadixSortUpsweepKernelINS_21DeviceRadixSortPolicyIyhiE9Policy520ELb0ELb0EyiEEvPT2_PT3_S6_iiNS_13GridEvenShareIS6_EE' for 'sm_20' +ptxas info : Function properties for _ZN3cub28DeviceRadixSortUpsweepKernelINS_21DeviceRadixSortPolicyIyhiE9Policy520ELb0ELb0EyiEEvPT2_PT3_S6_iiNS_13GridEvenShareIS6_EE 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -ptxas info : Used 14 registers, 256 bytes smem, 56 bytes cmem[0] -ptxas info : Compiling entry function '_Z20mtf_2buffers_depth32ILi4096ELi4ELi3EjLi32EEvPKhPhii' for 'sm_20' -ptxas info : Function properties for _Z20mtf_2buffers_depth32ILi4096ELi4ELi3EjLi32EEvPKhPhii +ptxas info : Used 38 registers, 4224 bytes smem, 96 bytes cmem[0] +ptxas info : Compiling entry function '_Z15mtf_cuda_threadILi4096ELi128ELi8EEvPKhPhii' for 'sm_20' +ptxas info : Function properties for _Z15mtf_cuda_threadILi4096ELi128ELi8EEvPKhPhii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -ptxas info : Used 19 registers, 1584 bytes smem, 56 bytes cmem[0] -ptxas info : Compiling entry function '_Z10mtf_threadILi4096ELi32ELi32EEvPKhPhii' for 'sm_20' -ptxas info : Function properties for _Z10mtf_threadILi4096ELi32ELi32EEvPKhPhii +ptxas info : Used 14 registers, 1024 bytes smem, 56 bytes cmem[0] +ptxas info : Compiling entry function '_ZN3cub23RadixSortScanBinsKernelINS_21DeviceRadixSortPolicyIyNS_8NullTypeEiE9Policy520EiEEvPT0_i' for 'sm_20' +ptxas info : Function properties for _ZN3cub23RadixSortScanBinsKernelINS_21DeviceRadixSortPolicyIyNS_8NullTypeEiE9Policy520EiEEvPT0_i 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -ptxas info : Used 18 registers, 1024 bytes smem, 56 bytes cmem[0] -ptxas info : Compiling entry function '_Z10mtf_threadILi4096ELi32ELi256EEvPKhPhii' for 'sm_20' -ptxas info : Function properties for _Z10mtf_threadILi4096ELi32ELi256EEvPKhPhii +ptxas info : Used 31 registers, 2620 bytes smem, 44 bytes cmem[0], 4 bytes cmem[16] +ptxas info : Compiling entry function '_Z13mtf_cuda_4by8ILi4096ELi256ELi32EhLi256EEvPKhPhii' for 'sm_20' +ptxas info : Function properties for _Z13mtf_cuda_4by8ILi4096ELi256ELi32EhLi256EEvPKhPhii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads ptxas info : Used 23 registers, 8192 bytes smem, 56 bytes cmem[0] -ptxas info : Compiling entry function '_Z14mtf_thread_by4ILi4096ELi32ELi32EEvPKhPhii' for 'sm_20' -ptxas info : Function properties for _Z14mtf_thread_by4ILi4096ELi32ELi32EEvPKhPhii +ptxas info : Compiling entry function '_Z19mtf_cuda_thread_by4ILi4096ELi128ELi8EEvPKhPhii' for 'sm_20' +ptxas info : Function properties for _Z19mtf_cuda_thread_by4ILi4096ELi128ELi8EEvPKhPhii 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -ptxas info : Used 18 registers, 1024 bytes smem, 56 bytes cmem[0] -ptxas info : Compiling entry function '_Z10mtf_scalarILi4096ELi4EjEvPKhPhii' for 'sm_20' -ptxas info : Function properties for _Z10mtf_scalarILi4096ELi4EjEvPKhPhii +ptxas info : Used 16 registers, 1024 bytes smem, 56 bytes cmem[0] +ptxas info : Compiling entry function '_Z30bsc_st567_encode_cuda_postsortPhPyiyPi' for 'sm_20' +ptxas info : Function properties for _Z30bsc_st567_encode_cuda_postsortPhPyiyPi 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads -ptxas info : Used 21 registers, 4096 bytes smem, 56 bytes cmem[0] +ptxas info : Used 17 registers, 72 bytes cmem[0] +ptxas info : Compiling entry function '_ZN3cub30DeviceRadixSortDownsweepKernelINS_21DeviceRadixSortPolicyIyhiE9Policy520ELb1ELb0EyhiEEvPT2_S5_PT3_S7_PT4_S8_iiNS_13GridEvenShareIS8_EE' for 'sm_20' +ptxas info : Function properties for _ZN3cub30DeviceRadixSortDownsweepKernelINS_21DeviceRadixSortPolicyIyhiE9Policy520ELb1ELb0EyhiEEvPT2_S5_PT3_S7_PT4_S8_iiNS_13GridEvenShareIS8_EE + 0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads +ptxas info : Used 45 registers, 6216 bytes smem, 120 bytes cmem[0] +bslab.cu cuda.lib -mtf.cu - Creating library mtf.lib and object mtf.exp + Creating library bslab-cuda-x64.lib and object bslab-cuda-x64.exp