From 22b9d59c51429c06a2667ec26399b3fc77047713 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wu Date: Wed, 31 Jan 2024 16:11:49 +0800 Subject: [PATCH] sched: fix a scheduling issue The original code assumes the last 4 bits of the CPU cycle count are uniformly distributed, but that is not true: at least on Intel IceLake (Intel(R) Xeon(R) Platinum 8369B CPU @ 2.70GHz), the CPU cycle count is always an odd number. As a result, expensive ops are frequently scheduled to a single thread, which greatly increases the RT time (in a custom scenario, from ~30ms to ~45ms). Signed-off-by: Xiaoguang Wu --- tensorflow/core/common_runtime/executor.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index fd38329a1fa..3df0d2a15be 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -730,15 +730,16 @@ Status ExecutorState::ProcessSync( } else if (kernel_stats_->HasExpensiveMarker(item)) { KernelTimer timer; + static uint64 update_counter = 0; device->Compute(op_kernel, &ctx); - // For expensive kernels, always update the cost estimate. For inexpensive - // kernels, update the cost estimate with ~1/16 probability. This assumes - // that the last 4 bits of the CPU cycle count is uniformly distributed. + constexpr int kKernelExecutionTrackingInvocationSkipCount = 16; if (is_expensive || - timer.start_cycles % kKernelExecutionTrackingInvocationSkipCount == 0) { + update_counter % kKernelExecutionTrackingInvocationSkipCount == 0) { kernel_stats_->UpdateCostEstimate(item, timer.ElapsedCycles()); } + + update_counter++; } else { device->Compute(op_kernel, &ctx); }