Skip to content

Commit

Permalink
Move max frequency retrieval to the beginning and add manual input whe…
Browse files Browse the repository at this point in the history
…n error.
  • Loading branch information
waynehuangntu committed Jan 16, 2025
1 parent ce603f2 commit c531281
Show file tree
Hide file tree
Showing 4 changed files with 220 additions and 74 deletions.
80 changes: 48 additions & 32 deletions tensilelite/Tensile/LibraryLogic.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,7 @@ def addFromCSV(self, dataFileName, numSolutions, solutionMap):

# iterate over rows
rowIdx = 0
deviceMaxFreq = None
for row in csvFile:
rowIdx+=1
if rowIdx == 1:
Expand All @@ -472,13 +473,6 @@ def addFromCSV(self, dataFileName, numSolutions, solutionMap):
except ValueError as e:
csvHasWinnerColumn = False
print1(f"Error: Could not find WinnerGFlops or WinnerIdx column in CSV file: {e}")

# get the column index of Frequency(MHz)
try:
columnOfFreqIdx = row.index(" DeviceMaxFreq")
except ValueError as e:
columnOfFreqIdx = None
print1(f"Error: Could not find DeviceMaxFreq column in the CSV file: {e}")

# get the length of each row, and derive the first column of the solution instead of using wrong "solutionStartIdx = totalSizeIdx + 1"
rowLength = len(row)
Expand Down Expand Up @@ -516,20 +510,24 @@ def addFromCSV(self, dataFileName, numSolutions, solutionMap):
solutionIdx += 1

if globalParameters["UseEffLike"]:
if not deviceMaxFreq:
deviceMaxFreq = read_max_freq()
if not deviceMaxFreq or deviceMaxFreq <= 0 or math.isnan(deviceMaxFreq):
deviceMaxFreq = handle_frequency_issue("Warning: Error when reading frequency.")

# calculate effLike
# effLike = winnerGFlops / Frequency(MHz)
try:
frequency = float(row[columnOfFreqIdx])
if frequency != 0 and not math.isnan(frequency):
# calculate effLike
# effLike = winnerGFlops / Frequency(MHz)
performance_metric = round(float(winnerGFlops) / frequency, 2)
else:
handle_frequency_issue("Warning: Frequency is NaN or 0.")
performance_metric = float(winnerGFlops)
except(ValueError, TypeError):
handle_frequency_issue("Warning: Error when reading frequency.")
performance_metric = float(winnerGFlops)
performance_metric = round(float(winnerGFlops) / deviceMaxFreq, 2)
except:
print1("Error: Could not convert winnerGFlops to float.")
performance_metric = float('nan')
else:
performance_metric = float(winnerGFlops)
try:
performance_metric = float(winnerGFlops)
except:
print1("Error: Could not convert winnerGFlops to float.")
performance_metric = float('nan')

if winnerIdx != -1:
if problemSize in self.exactWinners:
Expand Down Expand Up @@ -1523,26 +1521,44 @@ def generateLogic(config, benchmarkDataPath, libraryLogicPath, cxxCompiler: str)
print1("%s\n# Finish Analysing data to %s in %.3fs\n%s" % (HR, os.path.split(libraryLogicPath)[0], elapsedTime, HR) )
popWorkingPath()

try:
os.remove("tmp_max_frequency.txt")
except FileNotFoundError:
pass

##############################################################################
# Error handling for frequency issues
##############################################################################
def handle_frequency_issue(message):
    """Report a frequency problem and prompt on stdin for a replacement value.

    Prints ``message`` and then keeps asking until the reply parses as a
    finite number greater than zero. Empty, non-numeric, NaN, infinite,
    zero and negative replies are rejected with an explanation and the
    prompt is repeated.

    Args:
        message: Text describing why the stored frequency could not be used.

    Returns:
        float: The frequency typed by the user. The caller divides GFlops by
        this value; a nearby comment there labels it "Frequency(MHz)", so the
        reply is presumably expected in MHz -- confirm before relying on it.

    Raises:
        EOFError: Propagated from ``input()`` when stdin is closed, e.g. in a
            non-interactive run.
    """
    # Local import so the function does not depend on a file-level import.
    import math

    print1(message)
    print1("Input the frequency manually to proceed.")

    while True:
        frequency_input = input("Frequency: ").strip()
        if frequency_input == "":
            print1("Frequency cannot be empty")
            continue

        try:
            frequency = float(frequency_input)
        except ValueError:
            print1("Invalid frequency. Please input a valid frequency.")
            continue

        # float() accepts "nan" and "inf"; neither is usable as a divisor for
        # an efficiency metric, so treat them like any other invalid reply.
        if not math.isfinite(frequency):
            print1("Invalid frequency. Please input a valid frequency.")
            continue

        if frequency > 0:
            return frequency
        print1("Frequency cannot be negative or zero.")

def read_max_freq(path="tmp_max_frequency.txt"):
    """Read a frequency value from a small text file.

    Args:
        path: File whose entire content (ignoring surrounding whitespace) is
            a single number. Defaults to "tmp_max_frequency.txt", resolved
            against the current working directory. Presumably this file is
            written by the benchmark client -- confirm against the producer.

    Returns:
        The parsed value as a float, or None when the file does not exist,
        cannot be read, or does not contain a number. The value is not
        range-checked here; zero, negative and NaN results are returned
        as-is for the caller to handle.
    """
    try:
        with open(path, "r") as f:
            return float(f.read().strip())
    except FileNotFoundError:
        print("Frequency file not found")
        return None
    except Exception as e:
        # Deliberately broad: any other failure (permissions, bad content,
        # decoding) is reported and treated as "no frequency available".
        print(f"Error reading from file: {e}")
        return None

################################################################################
################################################################################
###
Expand Down
112 changes: 72 additions & 40 deletions tensilelite/Tensile/Source/client/source/HardwareMonitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,19 +37,20 @@

#include "ResultReporter.hpp"

// Evaluates an rsmi call exactly once and throws std::runtime_error when the
// returned status is not RSMI_STATUS_SUCCESS. The exception message contains
// the numeric status, the name reported by rsmi_status_string, the source
// location, and the text of the failing expression.
// The name is null-checked before streaming in case rsmi_status_string does
// not fill it in; inserting a null const char* into a stream is not valid.
#define RSMI_CHECK_EXC(expr)                                                        \
    do                                                                              \
    {                                                                               \
        rsmi_status_t e = (expr);                                                   \
        if(e != RSMI_STATUS_SUCCESS)                                                \
        {                                                                           \
            const char* errName = nullptr;                                          \
            rsmi_status_string(e, &errName);                                        \
            std::ostringstream msg;                                                 \
            msg << "Error " << e << "(" << (errName ? errName : "unknown") << ") "  \
                << __FILE__ << ":" << __LINE__ << ": " << std::endl                 \
                << #expr << std::endl;                                              \
            throw std::runtime_error(msg.str());                                    \
        }                                                                           \
    } while(0)

namespace TensileLite
Expand All @@ -68,8 +69,9 @@ namespace TensileLite
HIP_CHECK_EXC(hipRuntimeGetVersion(&hip_version));
if(hip_version >= 50220730)
{
HIP_CHECK_EXC(hipDeviceGetAttribute(
&props.multiProcessorCount, hipDeviceAttributePhysicalMultiProcessorCount, hipDeviceIndex));
HIP_CHECK_EXC(hipDeviceGetAttribute(&props.multiProcessorCount,
hipDeviceAttributePhysicalMultiProcessorCount,
hipDeviceIndex));
}
#endif

Expand Down Expand Up @@ -102,7 +104,8 @@ namespace TensileLite
}

msg << "]" << std::endl;
std::time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
std::time_t now
= std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
msg << std::put_time(gmtime(&now), "%F %T %z");

throw std::runtime_error(concatenate("RSMI Can't find a device with PCI ID ",
Expand Down Expand Up @@ -179,7 +182,8 @@ namespace TensileLite
m_thread = std::thread([this]() { this->runLoop(); });
}

void HardwareMonitor::addTempMonitor(rsmi_temperature_type_t sensorType, rsmi_temperature_metric_t metric)
void HardwareMonitor::addTempMonitor(rsmi_temperature_type_t sensorType,
rsmi_temperature_metric_t metric)
{
assertNotActive();

Expand All @@ -203,7 +207,8 @@ namespace TensileLite
m_fanValues.resize(m_fanMetrics.size());
}

double HardwareMonitor::getAverageTemp(rsmi_temperature_type_t sensorType, rsmi_temperature_metric_t metric)
double HardwareMonitor::getAverageTemp(rsmi_temperature_type_t sensorType,
rsmi_temperature_metric_t metric)
{
assertNotActive();

Expand All @@ -222,8 +227,8 @@ namespace TensileLite
}
}

throw std::runtime_error(
concatenate("Can't read temp value that wasn't requested: ", sensorType, " - ", metric));
throw std::runtime_error(concatenate(
"Can't read temp value that wasn't requested: ", sensorType, " - ", metric));
}

double HardwareMonitor::getAverageClock(rsmi_clk_type_t clockType)
Expand Down Expand Up @@ -252,7 +257,8 @@ namespace TensileLite
}
}

throw std::runtime_error(concatenate("Can't read clock value that wasn't requested: ", clockType));
throw std::runtime_error(
concatenate("Can't read clock value that wasn't requested: ", clockType));
}

double HardwareMonitor::getAverageFanSpeed(uint32_t sensorIndex)
Expand All @@ -274,7 +280,8 @@ namespace TensileLite
}
}

throw std::runtime_error(concatenate("Can't read fan value that wasn't requested: ", sensorIndex));
throw std::runtime_error(
concatenate("Can't read fan value that wasn't requested: ", sensorIndex));
}

void HardwareMonitor::start()
Expand Down Expand Up @@ -303,7 +310,8 @@ namespace TensileLite

m_hasStopEvent = stopEvent != nullptr;

m_task = std::move(Task([this, startEvent, stopEvent]() { this->collect(startEvent, stopEvent); }));
m_task = std::move(Task(
[this, startEvent, stopEvent]() { this->collect(startEvent, stopEvent); }));
m_future = m_task.get_future();

m_stop = false;
Expand All @@ -326,8 +334,9 @@ namespace TensileLite
m_lastCollection = clock::time_point();
m_nextCollection = clock::time_point();

m_SYSCLK_sum = std::vector<uint64_t>(m_XCDCount, 0);
m_SYSCLK_array = std::vector<std::vector<uint64_t>>(m_XCDCount, std::vector<uint64_t>{});
m_SYSCLK_sum = std::vector<uint64_t>(m_XCDCount, 0);
m_SYSCLK_array
= std::vector<std::vector<uint64_t>>(m_XCDCount, std::vector<uint64_t>{});
}

void HardwareMonitor::collectOnce()
Expand All @@ -345,7 +354,8 @@ namespace TensileLite
std::tie(sensorType, metric) = m_tempMetrics[i];

int64_t newValue = 0;
auto status = rsmi_dev_temp_metric_get(m_smiDeviceIndex, sensorType, metric, &newValue);
auto status
= rsmi_dev_temp_metric_get(m_smiDeviceIndex, sensorType, metric, &newValue);
if(status != RSMI_STATUS_SUCCESS)
m_tempValues[i] = std::numeric_limits<int64_t>::max();
else
Expand All @@ -372,14 +382,16 @@ namespace TensileLite
for(uint32_t xcd = 0; xcd < m_XCDCount; xcd++)
{
m_SYSCLK_sum[xcd] += gpuMetrics.current_gfxclks[xcd] * cMhzToHz;
m_SYSCLK_array[xcd].push_back(gpuMetrics.current_gfxclks[xcd] * cMhzToHz);
m_SYSCLK_array[xcd].push_back(gpuMetrics.current_gfxclks[xcd]
* cMhzToHz);
sysclkSum += gpuMetrics.current_gfxclks[xcd] * cMhzToHz;
}
m_clockValues[i] += sysclkSum;
}
#else
// XCD0
auto status = rsmi_dev_gpu_clk_freq_get(m_smiDeviceIndex, m_clockMetrics[i], &freq);
auto status
= rsmi_dev_gpu_clk_freq_get(m_smiDeviceIndex, m_clockMetrics[i], &freq);
if(status != RSMI_STATUS_SUCCESS)
{
m_clockValues[i] = std::numeric_limits<uint64_t>::max();
Expand All @@ -392,7 +404,8 @@ namespace TensileLite
}
else
{
auto status = rsmi_dev_gpu_clk_freq_get(m_smiDeviceIndex, m_clockMetrics[i], &freq);
auto status
= rsmi_dev_gpu_clk_freq_get(m_smiDeviceIndex, m_clockMetrics[i], &freq);
if(status != RSMI_STATUS_SUCCESS)
{
m_clockValues[i] = std::numeric_limits<uint64_t>::max();
Expand All @@ -413,7 +426,7 @@ namespace TensileLite
rsmi_frequencies_t freq;

int64_t newValue = 0;
auto status = rsmi_dev_fan_rpms_get(m_smiDeviceIndex, m_fanMetrics[i], &newValue);
auto status = rsmi_dev_fan_rpms_get(m_smiDeviceIndex, m_fanMetrics[i], &newValue);
if(status != RSMI_STATUS_SUCCESS)
m_fanValues[i] = std::numeric_limits<int64_t>::max();
else
Expand All @@ -422,23 +435,42 @@ namespace TensileLite

// Retrieves the maximum hardware supported frequency.
rsmi_frequencies_t freqs;
auto status = rsmi_dev_gpu_clk_freq_get(m_smiDeviceIndex, RSMI_CLK_TYPE_SYS, &freqs);
if(status != RSMI_STATUS_SUCCESS)
{
m_hasInvalidGpuFreqStatus = true;
}
else
const int MAX_RETRY = 10;
const int SLEEP_TIME = 100; // sleep time in milliseconds
bool success = false;

if(!has_maxFreqValues && !m_hasInvalidGpuFreqStatus)
{
if(!m_hasInvalidGpuFreqStatus && !has_maxFreqValues)
for(int retry = 0; retry < MAX_RETRY; ++retry)
{
m_maxFreqValues = 0;
for(auto freq : freqs.frequency)
auto status
= rsmi_dev_gpu_clk_freq_get(m_smiDeviceIndex, RSMI_CLK_TYPE_SYS, &freqs);

if(status == RSMI_STATUS_SUCCESS)
{
m_maxFreqValues = std::max(m_maxFreqValues, freq);
success = true;
break;
}
// Sleep before next retry
std::this_thread::sleep_for(std::chrono::milliseconds(SLEEP_TIME));
}

if(!success)
{
m_hasInvalidGpuFreqStatus = true;
}
else if(freqs.num_supported > 0)
{
m_maxFreqValues
= *std::max_element(freqs.frequency, freqs.frequency + freqs.num_supported);

has_maxFreqValues = true;
m_maxFreqValues /= cMhzToHz; // Convert to MHz
}
else
{
m_hasInvalidGpuFreqStatus = true;
}
}
m_dataPoints++;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,6 @@ namespace TensileLite
m_output.setHeaderForKey(ResultKey::LDA, "LDA");
m_output.setHeaderForKey(ResultKey::LDB, "LDB");
m_output.setHeaderForKey(ResultKey::TotalFlops, "TotalFlops");
m_output.setHeaderForKey(ResultKey::GfxFrequency, "DeviceMaxFreq");
if(m_extraCol)
{
m_output.setHeaderForKey(ResultKey::TilesPerCu, "TilesPerCu");
Expand Down
Loading

0 comments on commit c531281

Please sign in to comment.