From 4d6452ed5b812963c0cf6e254d63187ca477a3b1 Mon Sep 17 00:00:00 2001 From: Vulcan <93451215+trholding@users.noreply.github.com> Date: Fri, 5 Apr 2024 21:37:48 +0530 Subject: [PATCH] Makefile: LLVM BOLT Support - Makefile: Add LLVM BOLT build Usage: make BOLTPREP=1 ; make run_bolt - run.c / runq.c : Enable exit command in prompt in embedded model builds - README.md: Update usage --- Makefile | 168 +++++++++++++++++++++++++++++++++++------------------- README.md | 8 +++ run.c | 1 + runq.c | 1 + 4 files changed, 120 insertions(+), 58 deletions(-) diff --git a/Makefile b/Makefile index b82f73ee..4d2fa7af 100644 --- a/Makefile +++ b/Makefile @@ -22,6 +22,17 @@ TOK_PATH = tokenizer.bin CC = gcc +#LLVM BOLT +#Set required linker flags +BOLT := +ifeq ($(BOLTPREP),1) + ifeq ($(CC),gcc) + BOLT := -fno-reorder-blocks-and-partition -Wl,--emit-relocs + else + BOLT := -Wl,--emit-relocs + endif +endif + ##@ Simple Builds # the most basic way of building that is most likely to work on most systems .PHONY: run @@ -32,11 +43,11 @@ runq: runq_cc .PHONY: run_cc run_cc: ## - Standard build with basic optimizations - $(CC) -O3 -march=native -mtune=native -o run run.c -lm + $(CC) -O3 -march=native -mtune=native -o run run.c $(BOLT) -lm .PHONY: runq_cc runq_cc: ## - Same for quantized build - $(CC) -O3 -march=native -mtune=native -o run runq.c -lm + $(CC) -O3 -march=native -mtune=native -o run runq.c $(BOLT) -lm # https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html # https://simonbyrne.github.io/notes/fastmath/ @@ -49,20 +60,20 @@ runq_cc: ## - Same for quantized build # In our specific application this is *probably* okay to use .PHONY: run_cc_fast run_cc_fast: ## - More Optimized build. 
Disregards strict standards compliance - $(CC) -Ofast -march=native -mtune=native -o run run.c -lm + $(CC) -Ofast -march=native -mtune=native -o run run.c $(BOLT) -lm .PHONY: runq_cc_fast runq_cc_fast: ## - Same for quantized build - $(CC) -Ofast -march=native -mtune=native -o run runq.c -lm + $(CC) -Ofast -march=native -mtune=native -o run runq.c $(BOLT) -lm # compiles with gnu99 standard flags for amazon linux, coreos, etc. compatibility .PHONY: run_cc_gnu run_cc_gnu: ## - Optimized Generic linux distro build - $(CC) -Ofast -march=native -mtune=native -std=gnu11 -o run run.c -lm + $(CC) -Ofast -march=native -mtune=native -std=gnu11 -o run run.c $(BOLT) -lm .PHONY: runq_cc_gnu runq_cc_gnu: ## - Same for quantized build - $(CC) -Ofast -march=native -mtune=native -std=gnu11 -o run runq.c -lm + $(CC) -Ofast -march=native -mtune=native -std=gnu11 -o run runq.c $(BOLT) -lm ##@ Accelerated Builds # additionally compiles with OpenMP, allowing multithreaded runs @@ -71,92 +82,92 @@ runq_cc_gnu: ## - Same for quantized build .PHONY: run_cc_avx run_cc_avx: ## - ***NEW*** AVX accelerated build - $(CC) -D OPENMP -D ACCELAVX -Ofast -fopenmp -mavx -march=native -mtune=native run.c -lm -o run + $(CC) -D OPENMP -D ACCELAVX -Ofast -fopenmp -mavx -march=native -mtune=native run.c $(BOLT) -lm -o run .PHONY: run_cc_openmp run_cc_openmp: ## - OpenMP accelerated build - $(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native run.c -lm -o run + $(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native run.c $(BOLT) -lm -o run .PHONY: runq_cc_openmp runq_cc_openmp: ## - Same for quantized build - $(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c -lm -o run + $(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -o run .PHONY: run_cc_openacc run_cc_openacc: ## - OpenACC accelerated build - $(CC) -D OPENACC -Ofast -fopenacc -march=native -mtune=native run.c -lm -o run + $(CC) -D OPENACC -Ofast -fopenacc -march=native -mtune=native run.c 
$(BOLT) -lm -o run .PHONY: runq_cc_openacc runq_cc_openacc: ## - Same for quantized build - $(CC) -D OPENACC -Ofast -fopenacc -march=native -mtune=native runq.c -lm -o run + $(CC) -D OPENACC -Ofast -fopenacc -march=native -mtune=native runq.c $(BOLT) -lm -o run .PHONY: run_cc_omp_gnu run_cc_omp_gnu: ## - Generic linux distro + OpenMP build - $(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native -std=gnu11 run.c -lm -o run + $(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native -std=gnu11 run.c $(BOLT) -lm -o run .PHONY: runq_cc_omp_gnu runq_cc_omp_gnu: ## - Same for quantized build - $(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native -std=gnu11 runq.c -lm -o run + $(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native -std=gnu11 runq.c $(BOLT) -lm -o run .PHONY: run_cc_clblast run_cc_clblast: ## - CLBlast OpenCL CBLAS GPU accelerated build - $(CC) -D OPENMP -D CLBLAST -Ofast -fopenmp -march=native -mtune=native run.c -lm -lclblast -o run + $(CC) -D OPENMP -D CLBLAST -Ofast -fopenmp -march=native -mtune=native run.c $(BOLT) -lm -lclblast -o run .PHONY: runq_cc_clblast runq_cc_clblast: ## - Same for quantized build - $(CC) -D OPENMP -D CLBLAST -Ofast -fopenmp -march=native -mtune=native runq.c -lm -lclblast -o run + $(CC) -D OPENMP -D CLBLAST -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lclblast -o run .PHONY: run_cc_openblas run_cc_openblas: ## - Openblas CBLAS accelerated build - $(CC) -D OPENMP -D OPENBLAS -Ofast -fopenmp -march=native -mtune=native -I$(OPENBLAS_INC) run.c -lm -lopenblas -o run + $(CC) -D OPENMP -D OPENBLAS -Ofast -fopenmp -march=native -mtune=native -I$(OPENBLAS_INC) run.c $(BOLT) -lm -lopenblas -o run .PHONY: runq_cc_openblas runq_cc_openblas: ## - Same for quantized build - $(CC) -D OPENMP -D OPENBLAS -Ofast -fopenmp -march=native -mtune=native -I$(OPENBLAS_INC) runq.c -lm -lopenblas -o run + $(CC) -D OPENMP -D OPENBLAS -Ofast -fopenmp -march=native -mtune=native -I$(OPENBLAS_INC) runq.c $(BOLT) -lm 
-lopenblas -o run .PHONY: run_cc_cblas run_cc_cblas: ## - Generic CBLAS accelerated build - $(CC) -D CBLAS -Ofast -fopenmp -march=native -mtune=native run.c -lm -lcblas -o run + $(CC) -D CBLAS -Ofast -fopenmp -march=native -mtune=native run.c $(BOLT) -lm -lcblas -o run .PHONY: runq_cc_cblas runq_cc_cblas: ## - Same for quantized build - $(CC) -D CBLAS -Ofast -fopenmp -march=native -mtune=native runq.c -lm -lcblas -o run + $(CC) -D CBLAS -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lcblas -o run .PHONY: run_cc_blis run_cc_blis: ## - BLIS accelerated build - $(CC) -D BLIS -Ofast -fopenmp -march=native -mtune=native -I$(BLIS_INC) run.c -lm -lblis -o run + $(CC) -D BLIS -Ofast -fopenmp -march=native -mtune=native -I$(BLIS_INC) run.c $(BOLT) -lm -lblis -o run .PHONY: runq_cc_blis runq_cc_blis: ## - Same for quantized build - $(CC) -D BLIS -Ofast -fopenmp -march=native -mtune=native -I$(BLIS_INC) runq.c -lm -lblis -o run + $(CC) -D BLIS -Ofast -fopenmp -march=native -mtune=native -I$(BLIS_INC) runq.c $(BOLT) -lm -lblis -o run ##@ Special Builds ##@ ---> x86_64 # amd64 (x86_64) / Intel Mac (WIP) Do not use! 
.PHONY: run_cc_mkl run_cc_mkl: ## - ***NEW*** OpenMP + Intel MKL CBLAS build (x86_64 / intel Mac) - $(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native -I$(MKL_INC) -L$(MKL_LIB) run.c -lmkl_rt -lpthread -lm -o run + $(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native -I$(MKL_INC) -L$(MKL_LIB) run.c -lmkl_rt -lpthread $(BOLT) -lm -o run .PHONY: runq_cc_mkl runq_cc_mkl: ## - Same for quantized build - $(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native -I$(MKL_INC) -L$(MKL_LIB) runq.c -lmkl_rt -lpthread -lm -o run + $(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native -I$(MKL_INC) -L$(MKL_LIB) runq.c -lmkl_rt -lpthread $(BOLT) -lm -o run ##@ ---> ARM64 / aarch64 .PHONY: run_cc_armpl run_cc_armpl: ## - ARM PL BLAS accelerated build (ARM64 & Mac) (WIP) - $(CC) -D ARMPL -D OPENMP -Ofast -fopenmp -march=native -mtune=native run.c -lm -larmpl_lp64_mp -o run + $(CC) -D ARMPL -D OPENMP -Ofast -fopenmp -march=native -mtune=native run.c $(BOLT) -lm -larmpl_lp64_mp -o run .PHONY: runq_cc_armpl runq_cc_armpl: ## - Same for quantized build - $(CC) -D ARMPL -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c -lm -larmpl_lp64_mp -o run + $(CC) -D ARMPL -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -larmpl_lp64_mp -o run ##@ ---> Macintosh .PHONY: run_cc_mac_accel run_cc_mac_accel: ## - Mac OS OPENMP + CBLAS via Accelerate Framework build (WIP/TEST) - $(CC) -D AAF -D OPENMP -Ofast -fopenmp -march=native -mtune=native run.c -lm -framework Accelerate -o run + $(CC) -D AAF -D OPENMP -Ofast -fopenmp -march=native -mtune=native run.c $(BOLT) -lm -framework Accelerate -o run .PHONY: runq_cc_mac_accel runq_cc_mac_accel: ## - Same for quantized build - $(CC) -D AAF -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c -lm -framework Accelerate -o run + $(CC) -D AAF -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -framework Accelerate -o run ##@ ---> Windows .PHONY: 
run_win64 @@ -179,151 +190,192 @@ runq_win_msvc: ## - Same for quantized build # Cosmocc .PHONY: run_cosmocc run_cosmocc: ## - Optimized Portable + cosmocc (runs on all OSes) - cosmocc -Ofast -D COSMO_BLINK -D COSMO_METAL run.c -lm -o run.com + cosmocc -Ofast -D COSMO_BLINK -D COSMO_METAL run.c $(BOLT) -lm -o run.com .PHONY: runq_cosmocc runq_cosmocc: ## - Same for quantized build - cosmocc -Ofast -D COSMO_BLINK -D COSMO_METAL runq.c -lm -o run.com + cosmocc -Ofast -D COSMO_BLINK -D COSMO_METAL runq.c $(BOLT) -lm -o run.com ##@ ---> MultiOS Builds ---> with Embedded Models # Cosmocc + embedded model & tokenizer .PHONY: run_cosmocc_zipos run_cosmocc_zipos: ## - Optimized Portable + cosmocc + embedded zip model build (runs on all OSes) - cosmocc -Ofast -D COSMO_BLINK -D COSMO_METAL -D COSMO_ZIP run.c -lm -o run.com + cosmocc -Ofast -D COSMO_BLINK -D COSMO_METAL -D COSMO_ZIP run.c $(BOLT) -lm -o run.com zip run.com $(MOD_PATH) zip run.com $(TOK_PATH) .PHONY: runq_cosmocc_zipos runq_cosmocc_zipos: ## - Same for quantized build - cosmocc -Ofast -D COSMO_BLINK -D COSMO_METAL -D COSMO_ZIP runq.c -lm -o run.com + cosmocc -Ofast -D COSMO_BLINK -D COSMO_METAL -D COSMO_ZIP runq.c $(BOLT) -lm -o run.com zip run.com $(MOD_PATH) zip run.com $(TOK_PATH) .PHONY: run_cosmocc_incbin run_cosmocc_incbin: ## - Optimized Portable + cosmocc + embedded model fast build (runs on all OSes) - cosmocc -Ofast -D COSMO_BLINK -D COSMO_METAL -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP run.c -lm -o run.com + cosmocc -Ofast -D COSMO_BLINK -D COSMO_METAL -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP run.c $(BOLT) -lm -o run.com .PHONY: runq_cosmocc_incbin runq_cosmocc_incbin: ## - Same for quantized build - cosmocc -Ofast -D COSMO_BLINK -D COSMO_METAL -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP runq.c -lm -o run.com + cosmocc -Ofast -D COSMO_BLINK -D COSMO_METAL -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP runq.c 
$(BOLT) -lm -o run.com .PHONY: run_cosmocc_strlit run_cosmocc_strlit: ## - Optimized Portable + cosmocc + embedded model build (runs on all OSes) gcc -Ofast strliteral.c -o strlit ./strlit -i emb_Model_data $(MOD_PATH) model.h ./strlit -i emb_Tokenizer_data $(TOK_PATH) tokenizer.h - cosmocc -Ofast -D COSMO_BLINK -D COSMO_METAL -D STRLIT -D LLOOP run.c -lm -o run.com + cosmocc -Ofast -D COSMO_BLINK -D COSMO_METAL -D STRLIT -D LLOOP run.c $(BOLT) -lm -o run.com .PHONY: runq_cosmocc_strlit runq_cosmocc_strlit: ## - Same for quantized build gcc -Ofast strliteral.c -o strlit ./strlit -i emb_Model_data $(MOD_PATH) model.h ./strlit -i emb_Tokenizer_data $(TOK_PATH) tokenizer.h - cosmocc -Ofast -D COSMO_BLINK -D COSMO_METAL -D STRLIT -D LLOOP runq.c -lm -o run.com + cosmocc -Ofast -D COSMO_BLINK -D COSMO_METAL -D STRLIT -D LLOOP runq.c $(BOLT) -lm -o run.com ##@ ---> GCC/Clang Embedded Model Builds # GCC OpenMP + embedded model & tokenizer .PHONY: run_gcc_openmp_incbin run_gcc_openmp_incbin: ## - Gcc + OpenMP + embedded model fast build - gcc -D OPENMP -Ofast -fopenmp -foffload-options="-Ofast -lm" -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP run.c -lm -o run + gcc -D OPENMP -Ofast -fopenmp -foffload-options="-Ofast $(BOLT) -lm" -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP run.c $(BOLT) -lm -o run .PHONY: runq_gcc_openmp_incbin runq_gcc_openmp_incbin: ## - Same for quantized build - gcc -D OPENMP -Ofast -fopenmp -foffload-options="-Ofast -lm" -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP runq.c -lm -o run + gcc -D OPENMP -Ofast -fopenmp -foffload-options="-Ofast $(BOLT) -lm" -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP runq.c $(BOLT) -lm -o run .PHONY: run_gcc_openmp_strlit run_gcc_openmp_strlit: ## - Gcc + OpenMP + embedded model build gcc -Ofast strliteral.c -o 
strlit ./strlit -i emb_Model_data $(MOD_PATH) model.h ./strlit -i emb_Tokenizer_data $(TOK_PATH) tokenizer.h - gcc -D OPENMP -Ofast -fopenmp -foffload-options="-Ofast -lm" -march=native -mtune=native -D STRLIT -D LLOOP run.c -lm -o run + gcc -D OPENMP -Ofast -fopenmp -foffload-options="-Ofast $(BOLT) -lm" -march=native -mtune=native -D STRLIT -D LLOOP run.c $(BOLT) -lm -o run .PHONY: runq_gcc_openmp_strlit runq_gcc_openmp_strlit: ## - Same for quantized build gcc -Ofast strliteral.c -o strlit ./strlit -i emb_Model_data $(MOD_PATH) model.h ./strlit -i emb_Tokenizer_data $(TOK_PATH) tokenizer.h - gcc -D OPENMP -Ofast -fopenmp -foffload-options="-Ofast -lm" -march=native -mtune=native -D STRLIT -D LLOOP runq.c -lm -o run + gcc -D OPENMP -Ofast -fopenmp -foffload-options="-Ofast $(BOLT) -lm" -march=native -mtune=native -D STRLIT -D LLOOP runq.c $(BOLT) -lm -o run # Clang OpenMP + embedded model & tokenizer .PHONY: run_clang_openmp_incbin run_clang_openmp_incbin: ## - Clang + OpenMP + embedded model fast build - clang -D OPENMP -Ofast -fopenmp -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP run.c -lm -o run + clang -D OPENMP -Ofast -fopenmp -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP run.c $(BOLT) -lm -o run .PHONY: runq_clang_openmp_incbin runq_clang_openmp_incbin: ## - Same for quantized build - clang -D OPENMP -Ofast -fopenmp -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP runq.c -lm -o run + clang -D OPENMP -Ofast -fopenmp -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP runq.c $(BOLT) -lm -o run .PHONY: run_clang_openmp_strlit run_clang_openmp_strlit: ## - Clang + OpenMP + embedded model build clang -Ofast strliteral.c -o strlit ./strlit -i emb_Model_data $(MOD_PATH) model.h ./strlit -i emb_Tokenizer_data $(TOK_PATH) tokenizer.h - clang -D OPENMP -Ofast -fopenmp -march=native 
-mtune=native -D STRLIT -D LLOOP run.c -lm -o run + clang -D OPENMP -Ofast -fopenmp -march=native -mtune=native -D STRLIT -D LLOOP run.c $(BOLT) -lm -o run .PHONY: runq_clang_openmp_strlit runq_clang_openmp_strlit: ## - Same for quantized build clang -Ofast strliteral.c -o strlit ./strlit -i emb_Model_data $(MOD_PATH) model.h ./strlit -i emb_Tokenizer_data $(TOK_PATH) tokenizer.h - clang -D OPENMP -Ofast -fopenmp -march=native -mtune=native -D STRLIT -D LLOOP runq.c -lm -o run + clang -D OPENMP -Ofast -fopenmp -march=native -mtune=native -D STRLIT -D LLOOP runq.c $(BOLT) -lm -o run ##@ ---> GCC/Clang Embedded Model Builds ---> Statically Linked # GCC static + embedded model & tokenizer .PHONY: run_gcc_static_incbin run_gcc_static_incbin: ## - Optimized Static gcc + embedded model fast build - gcc -Ofast -static -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP run.c -lm -o run + gcc -Ofast -static -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP run.c $(BOLT) -lm -o run .PHONY: runq_gcc_static_incbin runq_gcc_static_incbin: ## - Same for quantized build - gcc -Ofast -static -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP runq.c -lm -o run + gcc -Ofast -static -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP runq.c $(BOLT) -lm -o run .PHONY: run_gcc_static_strlit run_gcc_static_strlit: ## - Optimized Static gcc + embedded model build gcc -Ofast strliteral.c -o strlit ./strlit -i emb_Model_data $(MOD_PATH) model.h ./strlit -i emb_Tokenizer_data $(TOK_PATH) tokenizer.h - gcc -Ofast -static -march=native -mtune=native -D STRLIT -D LLOOP run.c -lm -o run + gcc -Ofast -static -march=native -mtune=native -D STRLIT -D LLOOP run.c $(BOLT) -lm -o run .PHONY: runq_gcc_static_strlit runq_gcc_static_strlit: ## - Same for quantized build gcc -Ofast strliteral.c -o strlit ./strlit -i emb_Model_data 
$(MOD_PATH) model.h ./strlit -i emb_Tokenizer_data $(TOK_PATH) tokenizer.h - gcc -Ofast -static -march=native -mtune=native -D STRLIT -D LLOOP runq.c -lm -o run + gcc -Ofast -static -march=native -mtune=native -D STRLIT -D LLOOP runq.c $(BOLT) -lm -o run # Clang static + embedded model & tokenizer .PHONY: run_clang_static_incbin run_clang_static_incbin: ## - Optimized Static clang + embedded model fast build - clang -Ofast -static -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP run.c -lm -o run + clang -Ofast -static -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP run.c $(BOLT) -lm -o run .PHONY: runq_clang_static_incbin runq_clang_static_incbin: ## - Same for quantized build - clang -Ofast -static -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP runq.c -lm -o run + clang -Ofast -static -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP runq.c $(BOLT) -lm -o run .PHONY: run_clang_static_strlit run_clang_static_strlit: ## - Optimized Static clang + embedded model build clang -Ofast strliteral.c -o strlit ./strlit -i emb_Model_data $(MOD_PATH) model.h ./strlit -i emb_Tokenizer_data $(TOK_PATH) tokenizer.h - clang -Ofast -static -march=native -mtune=native -D STRLIT -D LLOOP run.c -lm -o run + clang -Ofast -static -march=native -mtune=native -D STRLIT -D LLOOP run.c $(BOLT) -lm -o run .PHONY: runq_clang_static_strlit runq_clang_static_strlit: ## - Same for quantized build clang -Ofast strliteral.c -o strlit ./strlit -i emb_Model_data $(MOD_PATH) model.h ./strlit -i emb_Tokenizer_data $(TOK_PATH) tokenizer.h - clang -Ofast -static -march=native -mtune=native -D STRLIT -D LLOOP runq.c -lm -o run + clang -Ofast -static -march=native -mtune=native -D STRLIT -D LLOOP runq.c $(BOLT) -lm -o run # Build for termux on Android ##@ ---> Android .PHONY: run_incbin_tmux run_incbin_tmux: get_model ## - 
Optimized build + Embedded Model for Termux on Android - $(CC) -Ofast -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -o run run.c -lm + $(CC) -Ofast -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -o run run.c $(BOLT) -lm .PHONY: runq_incbin_tmux runq_incbin_tmux: get_model ## - Same for quantized build - $(CC) -Ofast -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -o run runq.c -lm + $(CC) -Ofast -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -o run runq.c $(BOLT) -lm + + +##@ Post Compile Optimizations +##@ Binary Optimization +# These are applied after the compile step and require respective tools. +# -Wl,--emit-relocs option needed +.PHONY: run_bolt +run_bolt: ## - ***NEW*** Apply llvm bolt binary optimization + echo "NOTE: Needs llvm-bolt to be installed. Needs two passes, follow usage info." + echo "Usage: make BOLTPREP=1 ; make run_bolt" + echo "Note: Not all targets are supported. Does not work without the first \"make BOLTPREP=1 \" pass. Does not work for statically linked builds yet as well as any builds that have been stripped..." + echo "Instrumenting..." + + if [ -f "run" ]; then echo "Optimizing run binary" ;\ + llvm-bolt run -instrument -o run.bolt.instrumentation ;\ + echo "Run the executable once to create profile data..." ;\ + echo "NOTE: For embedded model builds type a prompt, wait for response then type exit on next prompt..." ;\ + ./run.bolt.instrumentation out/model.bin ;\ + echo "Create run.bolt optimized binary..." ;\ + llvm-bolt run -o run.bolt --use-gnu-stack -data=/tmp/prof.fdata -reorder-blocks=ext-tsp -reorder-functions=hfsort -split-functions -split-all-cold -split-eh -dyno-stats ;\ + fi + + if [ -f "runq" ]; then echo "Optimizing runq binary" ;\ + llvm-bolt runq -instrument -o runq.bolt.instrumentation ;\ + echo "Run the executable once to create profile data..." 
;\ + echo "NOTE: For embedded model builds type a prompt, wait for response then type exit on next prompt..." ;\ + ./runq.bolt.instrumentation out/model.bin ;\ + echo "Create runq.bolt optimized binary..." ;\ + llvm-bolt runq -o runq.bolt --use-gnu-stack -data=/tmp/prof.fdata -reorder-blocks=ext-tsp -reorder-functions=hfsort -split-functions -split-all-cold -split-eh -dyno-stats ;\ + fi + + echo "Done. Check out run.bolt or runq.bolt binaries." + +##@ Strip Symbols +# Stripping symbols makes binaries smaller; binaries that do not exist are skipped +.PHONY: run_strip +run_strip: ## - ***NEW*** Strip symbols to make binaries smaller + if [ -f run ]; then strip -s run; fi + if [ -f runq ]; then strip -s runq; fi + if [ -f run.bolt ]; then strip -s run.bolt; fi + if [ -f runq.bolt ]; then strip -s runq.bolt; fi ##@ ---> L2E Unikernel (Asteroid) # Unikraft Unikernel build @@ -521,7 +573,7 @@ boot_l2e_iso_uefi: ## - Boot L2E OS ISO Image with UEFI in qemu # $ valgrind --leak-check=full ./run out/model.bin -n 3 .PHONY: run_debug run_debug: ## - Debug build which can be analyzed with tools like valgrind. - $(CC) -g -o run run.c -lm + $(CC) -g -o run run.c $(BOLT) -lm .PHONY: run_cc_bcdebug run_cc_bcdebug: ## - ***NEW*** Emit LLVM bitcode & transpile to C debug build @@ -529,7 +581,7 @@ run_cc_bcdebug: ## - ***NEW*** Emit LLVM bitcode & transpile to C debug build echo "Get llvm-cbe here: https://github.com/JuliaHubOSS/llvm-cbe" clang-17 -march=native -mtune=native -S -emit-llvm -g run.c llvm-cbe run.ll - $(CC) -Ofast -march=native -mtune=native -o run run.cbe.c -lm + $(CC) -Ofast -march=native -mtune=native -o run run.cbe.c $(BOLT) -lm .PHONY: runq_cc_bcdebug runq_cc_bcdebug: ## - Same for quantized build @@ -537,11 +589,11 @@ runq_cc_bcdebug: ## - Same for quantized build echo "Get llvm-cbe here: https://github.com/JuliaHubOSS/llvm-cbe" clang-17 -march=native -mtune=native -S -emit-llvm -g runq.c llvm-cbe runq.ll - $(CC) -Ofast -march=native -mtune=native -o run runq.cbe.c -lm + $(CC) -Ofast -march=native -mtune=native -o run runq.cbe.c $(BOLT) -lm .PHONY: run_cc_mmdebug run_cc_mmdebug: ## - ***NEW*** Matmul Debug Log 
build (Warning: Huge Logs) - $(CC) -D MMDEBUG -Ofast -march=native -mtune=native run.c -lm -o run + $(CC) -D MMDEBUG -Ofast -march=native -mtune=native run.c $(BOLT) -lm -o run ##@ Testing @@ -557,7 +609,7 @@ testc: ## - run only tests for run.c C implementation (needs python) VERBOSITY ?= 0 .PHONY: testcc testcc: ## - run the C tests, without touching pytest / python - $(CC) -DVERBOSITY=$(VERBOSITY) -O3 -o testc test.c -lm + $(CC) -DVERBOSITY=$(VERBOSITY) -O3 -o testc test.c $(BOLT) -lm ./testc ##@ Clean/ Purge diff --git a/README.md b/README.md index 8b87e2e1..019b84ee 100644 --- a/README.md +++ b/README.md @@ -529,6 +529,14 @@ Special Builds run_incbin_tmux - Optimized build + Embedded Model for Termux on Android runq_incbin_tmux - Same for quantized build +Post Compile Optimizations + +Binary Optimization + run_bolt - ***NEW*** Apply llvm bolt binary optimization + +Strip Symbols + run_strip - ***NEW*** Strip symbols to make binaries smaller + ---> L2E Unikernel (Asteroid) l2e_unik_qemu - L2E Unikernel (Asteroid) for kvm / qemu x86_64 diff --git a/run.c b/run.c index 571d1d69..d77fa3df 100644 --- a/run.c +++ b/run.c @@ -1293,6 +1293,7 @@ int main(int argc, char *argv[]) { printf("\n" DEFTOSTR(OSPROMPT)" "); fflush(stdout); inprompt(prompt); // read prompt + if (!strcmp(prompt, "exit")) { exit(0);} // Exit when prompt contains exit #else #ifdef MMDEBUG FILE* dLogFile = freopen("debug_matmul.log", "w", stderr); diff --git a/runq.c b/runq.c index 8f4d7a3b..6e101fa1 100644 --- a/runq.c +++ b/runq.c @@ -1322,6 +1322,7 @@ int main(int argc, char *argv[]) { printf("\n" DEFTOSTR(OSPROMPT)" "); fflush(stdout); inprompt(prompt); // read prompt + if (!strcmp(prompt, "exit")) { exit(0);} // Exit when prompt contains exit #else // END L2E Addition // poor man's C argparse so we can override the defaults above from the command line