Add example benchmarks
(cherry picked from commit 4b6680e1046f24bb6f67fa016f7b9263bde92978)
Szczepaniak-M committed Nov 6, 2024
1 parent ffb7fc5 commit 1615e47
Showing 42 changed files with 6,366 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -11,6 +11,7 @@ The repository consists of 4 main parts:
- `benchmark-service` - Spring Boot service responsible for executing benchmarks
- `webpage-backend` - Spring Boot service responsible for exposing data about benchmark results
- `webpage-frontend` - Angular application responsible for presenting results to the user
- `example-benchmarks` - example benchmarks with their configuration files that can be executed by the service after moving them to a separate repository

## Architecture overview

14 changes: 14 additions & 0 deletions example-benchmarks/README.md
@@ -0,0 +1,14 @@
# TUManyBenchmarks - Benchmark repository
## About
This repository stores the benchmarks executed by the TUManyBenchmarks service.
Each directory contains a separate benchmark, which can be reproduced locally.
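For example, reproducing the `cache_latency` benchmark could look roughly like this (a sketch; it assumes an Ubuntu host with Ansible and Python 3 available and the benchmark directory checked out to `/home/ubuntu/cache_latency`, the path hard-coded in its playbook):

```bash
cd /home/ubuntu/cache_latency
ansible-playbook ansible.yml            # installs bc, g++, numactl and compiles latency.cpp
./run_latency.sh                        # appends one "size,latency" line per run to results.csv
python3 format_output.py results.csv    # aggregates results.csv into the JSON used by the plots
```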

## How to add a new benchmark?
1. Fork the repository
2. Add the benchmark in a new directory
3. Add an Ansible playbook that configures the benchmark environment
4. Configure the benchmark execution in the `configuration.yml` file
5. Create a pull request
6. The GitHub Actions pipeline validates the `configuration.yml` file
7. The repository maintainer accepts the new benchmark
8. The benchmark is executed automatically on a regular schedule
28 changes: 28 additions & 0 deletions example-benchmarks/cache_latency/ansible.yml
@@ -0,0 +1,28 @@
---
- name: Install benchmark and dependencies
  hosts: localhost
  tasks:
    - name: Update apt cache
      become: yes
      apt:
        update_cache: yes

    - name: Install dependencies
      become: yes
      apt:
        name:
          - bc
          - g++
          - numactl
        state: present

    - name: Compile latency.cpp with optimizations
      command: g++ latency.cpp -o latency -O3 -march=native
      args:
        chdir: /home/ubuntu/cache_latency

    - name: Make run_latency.sh executable
      file:
        path: /home/ubuntu/cache_latency/run_latency.sh
        mode: '0755'
64 changes: 64 additions & 0 deletions example-benchmarks/cache_latency/configuration.yml
@@ -0,0 +1,64 @@
configuration:
  name: "Cache latency"
  description: "Benchmark analyzing the basic properties of the CPU cache"
  directory: cache_latency
  cron: "0 * * * *"
  instance-number: 1
  instance-types:
    # Graviton
    - c6g.medium
    - c6g.2xlarge
    - c6g.metal

    - c7g.metal
    - c7g.2xlarge
    - c7g.medium

    - c8g.medium
    - c8g.2xlarge
    - c8g.metal-24xl

    # Intel
    - c5.metal
    - c5.2xlarge
    - c5.large

    - c6i.metal
    - c6i.2xlarge
    - c6i.large

    - c7i.metal-24xl
    - c7i.2xlarge
    - c7i.large

    # AMD
    - m5a.24xlarge
    - m5a.2xlarge
    - m5a.large

    - c5a.24xlarge
    - c5a.2xlarge
    - c5a.large

    - c6a.metal
    - c6a.2xlarge
    - c6a.large

    - c7a.metal-48xl
    - c7a.2xlarge
    - c7a.large

  nodes:
    - node-id: 0
      ansible-configuration: ansible.yml
      benchmark-command: ./run_latency.sh
      output-command: python3 format_output.py results.csv
      plots:
        - type: line
          title: Cache latency for different input sizes
          xaxis: Input size [10^x MB]
          yaxis: Latency [ns]
          series:
            - x: input_size_log10
              y: latency
              legend: Latency
34 changes: 34 additions & 0 deletions example-benchmarks/cache_latency/format_output.py
@@ -0,0 +1,34 @@
import json
import sys
import math
from collections import defaultdict


def parse_file_to_json(file_path):
    # Read the raw "size,time" lines produced by the benchmark
    with open(file_path, 'r') as file:
        lines = file.readlines()

    # Group all latency samples by working-set size
    results = defaultdict(list)
    data = {
        "input_size_log10": [],
        "latency": [],
    }
    for line in lines:
        size, time = line.split(",")
        results[size].append(float(time))

    # Average the samples per size and convert the size to log10(MB)
    for key, values in results.items():
        size = math.log10(int(key) / 1024 / 1024)
        time = sum(values) / len(values)
        data["input_size_log10"].append(size)
        data["latency"].append(time)

    json_output = json.dumps(data, indent=4)
    print(json_output)


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python format_output.py <file_path>")
    else:
        file_path = sys.argv[1]
        parse_file_to_json(file_path)
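The keys of the emitted JSON (`input_size_log10`, `latency`) are the names referenced by the `series` entries in `configuration.yml` above. A sketch of an invocation and the shape of its output (values elided):

```bash
python3 format_output.py results.csv
# {
#     "input_size_log10": [...],
#     "latency": [...]
# }
```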
44 changes: 44 additions & 0 deletions example-benchmarks/cache_latency/latency.cpp
@@ -0,0 +1,44 @@
#include <atomic>
#include <cassert>
#include <iostream>
#include <algorithm>
#include <pthread.h>
#include <cstdint>
#include <cstdlib>
#include <random>
#include <sys/time.h>

using namespace std;

// Wall-clock time in seconds with microsecond resolution
static inline double gettime(void) {
    struct timeval now_tv;
    gettimeofday(&now_tv, NULL);
    return ((double)now_tv.tv_sec) + ((double)now_tv.tv_usec) / 1000000.0;
}

int main(int argc, char** argv) {
    if (argc < 3) {
        cerr << "Usage: " << argv[0] << " <working set size in bytes> <repetitions>" << endl;
        return 1;
    }
    uint64_t n = atol(argv[1]) / 8;   // number of 64-bit elements in the working set
    unsigned rep = atoi(argv[2]);     // number of dependent loads to perform
    if (!n)
        n = 16;

    // Build a random permutation of the element indices
    uint64_t* v2 = new uint64_t[n];
    for (uint64_t i = 0; i < n; i++)
        v2[i] = i;
    // std::random_shuffle was removed in C++17, so use std::shuffle instead
    mt19937_64 rng(random_device{}());
    shuffle(v2, v2 + n, rng);

    // Turn the permutation into a single pointer-chasing cycle: v[x] holds the next index
    uint64_t* v = new uint64_t[n];
    for (uint64_t i = 0; i < n; i++)
        v[v2[i]] = v2[(i + 1) % n];

    uint64_t x = 0, count = 0;

    // Chase the cycle: every load depends on the previous one,
    // so the loop measures access latency rather than bandwidth
    double start = gettime(), end;
    for (unsigned i = 0; i < rep; i++)
        x = v[x];
    end = gettime();
    count += rep;

    // Output format: "<working set size in bytes>,<average latency per access in ns>"
    cout << (n * 8) << "," << (((end - start) * 1e9) / count) << endl;
    assert(x + 1);   // use x so the chase loop cannot be optimized away

    return 0;
}
67 changes: 67 additions & 0 deletions example-benchmarks/cache_latency/run_latency.sh
@@ -0,0 +1,67 @@
#!/usr/bin/env bash

convert_to_bytes() {
    size=$1
    unit=$2
    instances=$3

    case $unit in
        KiB) echo $(echo "$size * 1024 / $instances / 1" | bc) ;;
        MiB) echo $(echo "$size * 1024 * 1024 / $instances / 1" | bc) ;;
        GiB) echo $(echo "$size * 1024 * 1024 * 1024 / $instances / 1" | bc) ;;
        *) echo $size ;;
    esac
}

interpolate() {
    local s=$1
    local e=$2
    local num_points=$3

    for ((i=0; i<num_points; i++)); do
        # Position in [0, 1] along the interval
        x=$(echo "$i / ($num_points - 1)" | bc -l)

        # Sigmoid-like function using awk (slope factor 12 gives a higher
        # density of points towards the edges of the interval)
        adj_x=$(awk -v x="$x" 'BEGIN { print 1 / (1 + exp(-12 * (x - 0.5))) }')

        # Final value: the start s plus the span e scaled by adj_x
        value=$(awk -v s="$s" -v e="$e" -v adj_x="$adj_x" 'BEGIN { print int(s + e * adj_x) }')

        echo "$value"
    done
}

lscpu_output=$(lscpu)

# Cache sizes ("<size> <unit>") and instance counts as reported by lscpu
l1d=$(echo "$lscpu_output" | grep "L1d" | awk '{print $3, $4}')
l2=$(echo "$lscpu_output" | grep "L2" | awk '{print $3, $4}')
l3=$(echo "$lscpu_output" | grep "L3" | awk '{print $3, $4}')

l1d_instances=$(echo "$lscpu_output" | grep "L1d" | awk '{print $5}' | sed 's/[^0-9]//g')
l2_instances=$(echo "$lscpu_output" | grep "L2" | awk '{print $5}' | sed 's/[^0-9]//g')
l3_instances=$(echo "$lscpu_output" | grep "L3" | awk '{print $5}' | sed 's/[^0-9]//g')

# Per-instance cache sizes in bytes
l1d_bytes=$(convert_to_bytes $(echo $l1d) $l1d_instances)
l2_bytes=$(convert_to_bytes $(echo $l2) $l2_instances)
l3_bytes=$(convert_to_bytes $(echo $l3) $l3_instances)

l12_bytes=$((l1d_bytes + l2_bytes))
l123_bytes=$((l1d_bytes + l2_bytes + l3_bytes))
double_cache=$(echo "$l123_bytes * 2" | bc)
triple_cache=$(echo "$l123_bytes * 3" | bc)

# Working-set sizes: dense around the L1/L2/L3 boundaries, up to 3x the total cache size
cache_sizes=(
    $(interpolate 0 $l1d_bytes 10)
    "$l1d_bytes"
    $(interpolate $l1d_bytes $l2_bytes 10)
    "$l12_bytes"
    $(interpolate $l12_bytes $l3_bytes 16)
    "$l123_bytes"
    $(interpolate $l123_bytes $double_cache 8)
    "$triple_cache"
)

for cache_size in "${cache_sizes[@]}";
do
    numactl --cpubind=0 --membind=0 ./latency $cache_size 200000000 >> results.csv
done
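Each `./latency <bytes> <repetitions>` invocation appends a single CSV line of the form `<working set size in bytes>,<average latency per access in ns>`, which `format_output.py` later averages per size. A hand-run example for one working-set size (the 32768-byte size is illustrative):

```bash
numactl --cpubind=0 --membind=0 ./latency 32768 200000000 >> results.csv
```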
28 changes: 28 additions & 0 deletions example-benchmarks/memory_bandwidth/ansible.yml
@@ -0,0 +1,28 @@
---
- name: Install packages and compile programs
  hosts: localhost
  tasks:
    - name: Update APT package list
      become: yes
      apt:
        update_cache: yes

    - name: Install necessary packages
      become: yes
      apt:
        name:
          - g++
          - numactl
          - libtbb-dev
          - bc
        state: present

    - name: Compile membw.cpp with optimizations
      command: g++ membw.cpp -o membw -O3 -ltbb -march=native
      args:
        chdir: /home/ubuntu/memory_bandwidth

    - name: Make run_membw.sh executable
      file:
        path: /home/ubuntu/memory_bandwidth/run_membw.sh
        mode: '0755'
101 changes: 101 additions & 0 deletions example-benchmarks/memory_bandwidth/configuration.yml
@@ -0,0 +1,101 @@
---
configuration:
  name: "Memory Bandwidth"
  description: "Benchmark testing memory bandwidth for different numbers of threads"
  directory: memory_bandwidth
  cron: "0 * * * *"
  instance-number: 1
  instance-types:
    # Graviton
    - c6g.metal
    - c6g.4xlarge
    - c6g.2xlarge
    - c6g.xlarge
    - c6g.large

    - c7g.metal
    - c7g.4xlarge
    - c7g.2xlarge
    - c7g.xlarge
    - c7g.large

    - c8g.metal-24xl
    - c8g.8xlarge
    - c8g.4xlarge
    - c8g.2xlarge
    - c8g.xlarge
    - c8g.large

    # Intel
    - c4.8xlarge
    - c4.2xlarge
    - c4.xlarge
    - c4.large

    - c5.metal
    - c5.9xlarge
    - c5.4xlarge
    - c5.2xlarge
    - c5.xlarge
    - c5.large

    - c6i.metal
    - c6i.8xlarge
    - c6i.2xlarge
    - c6i.xlarge
    - c6i.large

    - c7i.metal-24xl
    - c7i.8xlarge
    - c7i.2xlarge
    - c7i.xlarge
    - c7i.large

    # AMD
    - m5a.24xlarge
    - m5a.2xlarge
    - m5a.xlarge
    - m5a.large

    - c5a.24xlarge
    - c5a.16xlarge
    - c5a.12xlarge
    - c5a.8xlarge
    - c5a.4xlarge
    - c5a.2xlarge
    - c5a.xlarge
    - c5a.large

    - c6a.metal
    - c6a.16xlarge
    - c6a.12xlarge
    - c6a.8xlarge
    - c6a.4xlarge
    - c6a.2xlarge
    - c6a.xlarge
    - c6a.large

    - c7a.metal-48xl
    - c7a.16xlarge
    - c7a.12xlarge
    - c7a.8xlarge
    - c7a.4xlarge
    - c7a.2xlarge
    - c7a.xlarge
    - c7a.large

  nodes:
    - node-id: 0
      ansible-configuration: ansible.yml
      benchmark-command: ./run_membw.sh
      output-command: python3 format_output.py results.csv

      plots:
        - type: line
          title: Bandwidth depending on the number of threads
          xaxis: Thread count
          yaxis: Memory Bandwidth [GB/s]
          series:
            - x: thread_counts
              y: memory_bandwidth
              legend: Memory bandwidth
