diff --git a/.clang-format b/.clang-format index 3e8142bf7a..8178613b64 100644 --- a/.clang-format +++ b/.clang-format @@ -63,6 +63,8 @@ ForEachMacros: SortIncludes: true IncludeBlocks: Regroup IncludeCategories: + - Regex: '"V3Pch.*\.h"' + Priority: -2 # Precompiled headers - Regex: '"(config_build|verilated_config|verilatedos)\.h"' Priority: -1 # Sepecials before main header - Regex: '(<|")verilated.*' diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ee8fc91629..42eb920036 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -63,7 +63,7 @@ jobs: CC: ${{ matrix.compiler.cc }} CXX: ${{ matrix.compiler.cxx }} CACHE_BASE_KEY: build-${{ matrix.os }}-${{ matrix.compiler.cc }}-m32=${{ matrix.m32 }} - CCACHE_MAXSIZE: 250M # Per build matrix entry (2000M in total) + CCACHE_MAXSIZE: 1000M # Per build matrix entry (* 5 = 5000M in total) VERILATOR_ARCHIVE: verilator-${{ github.sha }}-${{ matrix.os }}-${{ matrix.compiler.cc }}${{ matrix.m32 && '-m32' || '' }}.tar.gz steps: @@ -136,7 +136,7 @@ jobs: CC: ${{ matrix.compiler.cc }} CXX: ${{ matrix.compiler.cxx }} CACHE_BASE_KEY: test-${{ matrix.os }}-${{ matrix.compiler.cc }}-m32=${{ matrix.m32 }}-${{ matrix.suite }} - CCACHE_MAXSIZE: 64M # Per build matrix entry (2160M in total) + CCACHE_MAXSIZE: 100M # Per build per suite (* 5 * 5 = 2500M in total) VERILATOR_ARCHIVE: verilator-${{ github.sha }}-${{ matrix.os }}-${{ matrix.compiler.cc }}${{ matrix.m32 && '-m32' || '' }}.tar.gz steps: @@ -167,3 +167,30 @@ jobs: env: TESTS: ${{ matrix.suite }} run: ./ci/ci-script.bash + + lint-py: + runs-on: ubuntu-22.04 + name: Lint Python + env: + CI_BUILD_STAGE_NAME: build + CI_RUNS_ON: ubuntu-22.04 + CI_M32: 0 + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + path: repo + + - name: Install packages for build + run: ./ci/ci-install.bash + + # We use specific version numbers, otherwise a Python package + # update may add a warning and break our build + - name: Install packages for lint + run: sudo pip3 install pylint==3.0.2 ruff==0.1.3 clang sphinx sphinx_rtd_theme sphinxcontrib-spelling breathe ruff + + - name: Configure + run: autoconf && ./configure --enable-longtests --enable-ccwarn + + - name: Lint + run: make -k lint-py diff --git a/.gitignore b/.gitignore index e7e3d788a3..c04dcedeb2 100644 --- a/.gitignore +++ b/.gitignore @@ -43,3 +43,4 @@ verilator-config-version.cmake /.vscode/ /.idea/ /cmake-build-*/ +/test_regress/snapshot/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 87a695c0b1..0041e6d579 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,7 @@ cmake_minimum_required(VERSION 3.15) cmake_policy(SET CMP0091 NEW) # Use MSVC_RUNTIME_LIBRARY to select the runtime project(Verilator - VERSION 5.017 + VERSION 5.019 HOMEPAGE_URL https://verilator.org LANGUAGES CXX ) @@ -35,7 +35,7 @@ if (NOT WIN32) message(WARNING "CMake support on Linux/OSX is experimental.") endif() -if (MSVC) +if (WIN32) if (DEFINED ENV{WIN_FLEX_BISON}) set(WIN_FLEX_BISON "$ENV{WIN_FLEX_BISON}") endif() @@ -48,6 +48,27 @@ if (MSVC) set(CMAKE_CXX_STANDARD 20) endif() +set(OBJCACHE "" CACHE STRING "Path for ccache, auto-detected if empty") +option(OBJCACHE_ENABLED "Compile Verilator with ccache" ON) + +if (OBJCACHE_ENABLED) + if (OBJCACHE STREQUAL "") + find_program(OBJCACHE_PATH ccache) + if (OBJCACHE_PATH STREQUAL "OBJCACHE_PATH-NOTFOUND") + set(OBJCACHE_PATH "") + endif() + else() + set(OBJCACHE_PATH "${OBJCACHE}") + endif() + if (NOT OBJCACHE_PATH STREQUAL "") + execute_process(COMMAND "${OBJCACHE_PATH}" --version + OUTPUT_VARIABLE objcache_version) + string(REGEX MATCH "[^\n\r]+" objcache_version "${objcache_version}") + message(STATUS "Found ccache: ${OBJCACHE_PATH} (\"${objcache_version}\")") + set(CMAKE_CXX_COMPILER_LAUNCHER "${OBJCACHE_PATH}") + endif() +endif() + find_package(BISON) find_package(FLEX) diff --git a/Changes b/Changes index cc313c39fd..083a2c4bc1 100644 --- a/Changes +++ b/Changes @@ -8,19 +8,66 @@ The changes in each Verilator version are described below. The contributors that suggested a given feature are shown in []. Thanks! -Verilator 5.017 devel +Verilator 5.019 devel ========================== +**Major:** + +* Support compilation with precompiled headers with Make and GCC or CLang. +* Change include of systemc instead of systemc.h (#4622) (#4623). [Chih-Mao Chen] + This may require that SystemC programs add 'using namespace sc_core', 'using namespace sc_dt'. + +**Minor:** + +* Support ccache when compiling Verilator with CMake (#4678). [Anthony Donlon] +* Support passing constraints to --xml-only output (still otherwise unsupported) (#4683). [Shahid Ikram] +* Remove deprecated options (#4663). [Geza Lore] +* Optimize timing-delayed queue (#4584). [qrqiuren] +* Fix VPI TOP level variable iteration (#3919) (#4618). [Marlon James] +* Fix display with no % printing assoc array (#4376). [Alex Solomatnikov] +* Fix scheduling of external force signals (#4577) (#4668). [Geza Lore] +* Fix a memory leak in V3Fork (#4628). [Krzysztof Boroński] +* Fix linking parameterized hierarchical blocks and recursive hierarchical blocks (#4654). [Anthony Donlon] +* Fix identifiers that end with '_' on Windows (#4655). [Anthony Donlon] +* Fix 'for' loop with outside variable reference (#4660). [David Harris] +* Fix interface parameters used in loop generate constructs (#4664) (#4665). [Anthony Donlon] +* Fix MingW compilation (#4675). [David Ledger] +* Fix trace when using SystemC with certain configurations (#4676). [Anthony Donlon] +* Fix C++20 compilation errors (#4670). + + +Verilator 5.018 2023-10-30 +========================== + +**Major:** + +* Support compilation with precompiled headers with Make and GCC or CLang. +* Change include of systemc instead of systemc.h (#4622) (#4623). [Chih-Mao Chen] + This may require that SystemC programs add 'using namespace sc_core', 'using namespace sc_dt'. + **Minor:** * Add SIDEEFFECT warning on mishandled side effect cases. * Add trace() API even when Verilated without --trace (#4462). [phelter] * Add warning on interface instantiation without parens (#4094). [Gökçe Aydos] -* Support randc (#4349). +* Add sv_vpi_user.h from IEEE 1800-2017 Annex M (#4606). [Marlon James] +* Support 'disable fork' (#4125) (#4569). [Aleksander Kiryk, Antmicro Ltd.] +* Support 'wait fork' (#4586). [Aleksander Kiryk, Antmicro Ltd.] +* Support 'randc' (#4349). +* Support assigning events (#4403). [Krzysztof Boroński] * Support resizing function call inout arguments (#4467). +* Support NBAs in non-inlined functions/tasks (#4496) (#4572). [Krzysztof Bieganski, Antmicro Ltd.] * Support converting parameters inside modules to localparams (#4511). [Anthony Donlon] +* Support concatenation of unpacked arrays (#4558). [Yutetsu TAKATSUKASA] +* Support Clang 16 (#4592). [Mariusz Glebocki] +* Support VPI variables of real and string data types (#4594). [Marlon James] +* Support making VL_LOCK_SPINS configurable (#4599). [Geza Lore] +* Change code --stats output (#4597). [Geza Lore] +* Change --prof-exec infrastructure and report (#4602). [Geza Lore] * Change lint_off to not propagate upwards to files including where the lint_off is. * Optimize empty expression statements (#4544). +* Optimize trace internals (#4610) (#4612). [Geza Lore] +* Optimize internal performance issues (#4638). [Geza Lore] * Fix conversion of impure logical expressions to bit expressions (#487 partial) (#4437). [Ryszard Rozak, Antmicro Ltd.] * Fix enum functions in localparams (#3999). [Andrew Nolte] * Fix passing arguments by reference (#3385 partial) (#4489). [Ryszard Rozak, Antmicro Ltd.] @@ -38,8 +85,30 @@ Verilator 5.017 devel * Fix object destruction after a copy constructor (#4540) (#4541). [Ryszard Rozak, Antmicro Ltd.] * Fix inlining of real functions miscasting (#4543). [Andrew Nolte] * Fix broken link error for enum references (#4551). [Anthony Donlon] +* Fix logical expressions with class objects - caching in v3Const (#4552). [Ryszard Rozak, Antmicro Ltd.] +* Fix using functions/tasks following class definition inside module (#4553). [Anthony Donlon] +* Fix large constant buffer overflow (#4556). [Varun Koyyalagunta] * Fix instance arrays connecting to array of structs (#4557). [raphmaster] +* Fix error message for invalid parameter overrides (#4559). [Anthony Donlon] * Fix shift to remove operation side effects (#4563). +* Fix compile warning on unused member function variable (#4567). +* Fix method narrowing conversion compiler error (#4568). +* Fix interface comparison (#4570). [Krzysztof Bieganski, Antmicro Ltd.] +* Fix dynamic triggers for named events (#4571). [Krzysztof Bieganski, Antmicro Ltd.] +* Fix dictionaries with keys of class types (#4576). [Ryszard Rozak, Antmicro Ltd.] +* Fix to not remap local assign intervals in forks (#4583). [Krzysztof Bieganski, Antmicro Ltd.] +* Fix display optimization ignoring side effects (#4585). +* Fix PLI/DPI user defined system task/function grammar (#4587) (#4588). [Quentin Corradi] +* Fix fault on empty clocking block (#4593). [Alex Mykyta] +* Fix creating implicit nets for inputs of gate primitives (#4603). [Geza Lore] +* Fix try_put method of unbounded mailbox (#4608). [Ryszard Rozak, Antmicro Ltd.] +* Fix stable name generation in V3Fork (#4615) (#4624). [Krzysztof Boroński] +* Fix virtual methods (#4616). [Ryszard Rozak, Antmicro Ltd.] +* Fix insertion at queue end (#4619). [Krzysztof Boroński] +* Fix rand fields of reference types (#4627). [Ryszard Rozak, Antmicro Ltd.] +* Fix dynamic casts of null values (#4631). [Ryszard Rozak, Antmicro Ltd.] +* Fix signals read via virtual interfaces being misoptimized (#4645). [Krzysztof Bieganski, Antmicro Ltd.] +* Fix handling of static keyword in methods (#4649). [Ryszard Rozak, Antmicro Ltd.] * Fix preprocessor to show `line 2 on resumed file. @@ -469,7 +538,7 @@ Verilator 5.002 2022-10-29 * Fix null access on optimized-out fork statements (#3658). [Krzysztof Bieganski, Antmicro Ltd] * Fix VPI inline module naming mismatch (#3690) (#3694). [Jiuyang Liu] * Fix deadlock in timeprecision when using SystemC (#3707). [Kamil Rakoczy, Antmicro Ltd] -* Fix width mismatch on inside operator (#3714). [Alex Torregrosa] +* Fix width mismatch on inside operator (#3714). [Àlex Torregrosa] Verilator 4.228 2022-10-01 @@ -699,7 +768,7 @@ Verilator 4.214 2021-10-17 * Fix removing if statement with side effect in condition (#3131). [Alexander Grobman] * Fix --waiver-output for multiline warnings (#2429) (#3141). [Keith Colbert] * Fix internal error on bad widths (#3140) (#3145). [Zhanglei Wang] -* Fix crash on clang 12/13 (#3148). [Kouping Hsu] +* Fix crash on clang 12/13 (#3148). [Kuoping Hsu] * Fix cygwin compile error due to missing -std=gnu++14 (#3149). [Sun Kim] * Fix $urandom_range when the range is 0 ... UINT_MAX (#3161). [Iru Cai] * Fix constructor-parameter argument comma-separation in C++ (#3162). [Matthew Ballance] @@ -814,12 +883,12 @@ Verilator 4.202 2021-04-24 * Mark --no-relative-cfuncs as scheduled for deprecation. * Add --coverage-max-width (#2853). [xuejiazidi] * Add VerilatedCovContext::forcePerInstance (#2793). [Kevin Laeufer] -* Add FST SystemC tracing (#2806). [Alex Torregrosa] +* Add FST SystemC tracing (#2806). [Àlex Torregrosa] * Add PINNOTFOUND warning in place of error (#2868). [Udi Finkelstein] * Support overlaps in priority case statements (#2864). [Rupert Swarbrick] * Support for null ports (#2875). [Udi Finkelstein] * Fix class unpacked-array compile error (#2774). [Iru Cai] -* Fix scope types in FST and VCD traces (#2805). [Alex Torregrosa] +* Fix scope types in FST and VCD traces (#2805). [Àlex Torregrosa] * Fix exceeding command-line ar limit (#2834). [Yinan Xu] * Fix false $dumpfile warning on model save (#2834). [Yinan Xu] * Fix --timescale-override not suppressing TIMESCALEMOD (#2838). [Kaleb Barrett] @@ -876,7 +945,7 @@ Verilator 4.110 2021-02-25 **Minor:** * Support concat selection (#2721). -* Support struct scopes when dumping structs to VCD (#2776) [Alex Torregrosa] +* Support struct scopes when dumping structs to VCD (#2776) [Àlex Torregrosa] * Generate SELRANGE for potentially unreachable code (#2625) (#2754) [Pierre-Henri Horrein] * For --flatten, override inlining of public and no_inline modules (#2761) [James Hanlon] * Fix little endian interface pin swizzling (#2475). [Don Owen] @@ -887,7 +956,7 @@ Verilator 4.110 2021-02-25 * Fix class extends with VM_PARALLEL_BUILDS (#2775). [Iru Cai] * Fix shifts by > 32 bit values (#2785). [qrq992] * Fix examples not flushing vcd (#2787). [Richard E George] -* Fix little endian packed array pattern assignment (#2795). [Alex Torregrosa] +* Fix little endian packed array pattern assignment (#2795). [Àlex Torregrosa] Verilator 4.108 2021-01-10 @@ -1363,7 +1432,7 @@ Verilator 4.014 2019-05-08 **Minor:** * Add --trace-fst-thread. -* Support '#' comments in $readmem. (#1411) [Frederick Requin] +* Support '#' comments in $readmem. (#1411) [Frédéric Requin] * Support "'dx" constants. (#1423) [Udi Finkelstein] * For FST tracing use LZ4 compression. [Tony Bybell] * Add error when use parameters without value. (#1424) [Peter Gerst] @@ -1633,19 +1702,19 @@ Verilator 3.912 2017-09-23 **Minor:** -* Support or/and/xor array intrinsic methods. (#1210) [Mike Popoloski] +* Support or/and/xor array intrinsic methods. (#1210) [Michael Popoloski] * Support package export. (#1217) [Usuario Eda] -* Support module port parameters without defaults. (#1213) [Mike Popoloski] +* Support module port parameters without defaults. (#1213) [Michael Popoloski] * Add performance information to --stats file. * Simplify VL_CONST_W macro generation for faster compiles. * Optimize improvements for Shift-And, and replication constructs. -* Fix ordering of arrayed cell wide connections. (#1202 partial) [Mike Popoloski] -* Fix LITENDIAN warning on arrayed cells. (#1202) [Mike Popoloski] -* Fix enum ranges without colons. (#1204) [Mike Popoloski] -* Fix GCC noreturn compile error. (#1209) [Mike Popoloski] -* Fix constant function default parameters. (#1211) [Mike Popoloski] -* Fix non-colon array of interface modports. (#1212) [Mike Popoloski] -* Fix .name connections on interfaces. (#1214) [Mike Popoloski] +* Fix ordering of arrayed cell wide connections. (#1202 partial) [Michael Popoloski] +* Fix LITENDIAN warning on arrayed cells. (#1202) [Michael Popoloski] +* Fix enum ranges without colons. (#1204) [Michael Popoloski] +* Fix GCC noreturn compile error. (#1209) [Michael Popoloski] +* Fix constant function default parameters. (#1211) [Michael Popoloski] +* Fix non-colon array of interface modports. (#1212) [Michael Popoloski] +* Fix .name connections on interfaces. (#1214) [Michael Popoloski] * Fix wide array indices causing compile error. @@ -1767,8 +1836,8 @@ Verilator 3.888 2016-10-14 **Minor:** -* Add --no-decoration to remove output comments, msg2015. [Frederic Requin] -* If VM_PARALLEL_BUILDS=1, use OPT_FAST and OPT_SLOW. [Frederic Requin] +* Add --no-decoration to remove output comments, msg2015. [Frédéric Requin] +* If VM_PARALLEL_BUILDS=1, use OPT_FAST and OPT_SLOW. [Frédéric Requin] Set VM_DEFAULT_RULES=0 for old behavior. * Add error on DPI functions > 32 bits. (#1898) [Elliot Mednick] * Improve Verilation performance on internal strings. (#1896) [Johan Bjork] @@ -1804,7 +1873,7 @@ Verilator 3.884 2016-05-18 * Fix --output-split of constructors. (#1035) [Johan Bjork] * Fix removal of empty packages, modules and cells. (#1034) [Johan Bjork] * Fix core dump on Arch Linux/GCC 6.1.1. (#1058) [Jannis Harder] -* Fix $value$plusargs to string. (#1880) [Frederic Requin] +* Fix $value$plusargs to string. (#1880) [Frédéric Requin] Verilator 3.882 2016-03-01 @@ -1883,13 +1952,13 @@ Verilator 3.876 2015-08-12 **Minor:** -* Add tracing_on, etc to vlt files. (#932) [Frederic Requin] +* Add tracing_on, etc to vlt files. (#932) [Frédéric Requin] * Support extraction of enum bits. (#951) [Jonathon Donaldson] * Fix MinGW compiler error. (#927) (#929) [Hans Tichelaar] * Fix .c files to be treated as .cpp. (#930) [Jonathon Donaldson] * Fix string-to-int space conversion. (#931) [Fabrizio Ferrandi] * Fix dpi imports inside generates. [Michael Tresidder] -* Fix rounding in trace $timescale. (#946) [Frederic Requin] +* Fix rounding in trace $timescale. (#946) [Frédéric Requin] * Fix $fopen with SV string. (#947) [Sven Stucki] * Fix hashed error with typedef inside block. (#948) [Sven Stucki] * Fix makefile with --coverage. (#953) [Eivind Liland] @@ -2129,7 +2198,7 @@ Verilator 3.852 2013-09-29 * Support named function and task arguments. [Chris Randall] * Report SELRANGE warning for non-generate if. (#675) [Roland Kruse] -* Fix ordering of $fgetc. (#1808) [Frederic Requin] +* Fix ordering of $fgetc. (#1808) [Frédéric Requin] * Fix --output-split-cfunc to count internal functions. [Chris Randall] * Fix crash on 32-bit Ubuntu. (#670) [Mark Jackson Pulver] @@ -2197,7 +2266,7 @@ Verilator 3.846 2013-03-09 * Fix DETECTARRAY on packed structures. (#610) [Jeremy Bennett] * Fix LITENDIAN on unpacked structures. (#614) [Wai Sum Mong] * Fix 32-bit OS VPI scan issue. (#615) [Jeremy Bennett, Rich Porter] -* Fix opening a VerilatedVcdC file multiple times. (#1774) [Frederic Requin] +* Fix opening a VerilatedVcdC file multiple times. (#1774) [Frédéric Requin] * Fix UNOPTFLAT circular array bounds crossing. (#630) [Jie Xu] @@ -3509,7 +3578,7 @@ Verilator 3.460 2005-07-27 Beta * Fix false warning when a clock is constant. * Fix X/Z in decimal numbers. [Wim Michiels] * Fix genvar statements in non-named generate blocks. -* Fix core dump when missing newline in `define. [David van der bokke] +* Fix core dump when missing newline in `define. [David van der Bokke] Verilator 3.450 2005-07-12 diff --git a/Makefile.in b/Makefile.in index 3588234111..8eb581f1e0 100644 --- a/Makefile.in +++ b/Makefile.in @@ -173,6 +173,10 @@ smoke-test: all_nomsg test_regress: all_nomsg $(MAKE) -C test_regress +.PHONY: test-snap test-diff +test-snap test-diff: + $(MAKE) -C test_regress $@ + examples: all_nomsg for p in $(EXAMPLES) ; do \ $(MAKE) -C $$p VERILATOR_ROOT=`pwd` || exit 10; \ @@ -430,9 +434,15 @@ PYLINT_FLAGS = --score=n --disable=R0801 RUFF = ruff RUFF_FLAGS = check --ignore=E402,E501,E701 +# "make -k" so can see all tool result errors lint-py: - -$(PYLINT) $(PYLINT_FLAGS) $(PY_PROGRAMS) - -$(RUFF) $(RUFF_FLAGS) $(PY_PROGRAMS) + $(MAKE) -k lint-py-pylint lint-py-ruff + +lint-py-pylint: + $(PYLINT) $(PYLINT_FLAGS) $(PY_PROGRAMS) + +lint-py-ruff: + $(RUFF) $(RUFF_FLAGS) $(PY_PROGRAMS) format-pl-exec: -chmod a+x test_regress/t/*.pl diff --git a/README.rst b/README.rst index 2c3d989616..d6fed59755 100644 --- a/README.rst +++ b/README.rst @@ -150,7 +150,7 @@ the terms of either the GNU Lesser General Public License Version 3 or the Perl Artistic License Version 2.0. See the documentation for more details. .. _CHIPS Alliance: https://chipsalliance.org -.. _Icarus Verilog: http://iverilog.icarus.com +.. _Icarus Verilog: https://steveicarus.github.io/iverilog .. _Linux Foundation: https://www.linuxfoundation.org .. |Logo| image:: https://www.veripool.org/img/verilator_256_200_min.png .. |verilator multithreaded performance| image:: https://www.veripool.org/img/verilator_multithreaded_performance_bg-min.png diff --git a/bin/verilator_gantt b/bin/verilator_gantt index 0015cb57f3..60814b209f 100755 --- a/bin/verilator_gantt +++ b/bin/verilator_gantt @@ -1,33 +1,32 @@ #!/usr/bin/env python3 -# pylint: disable=C0103,C0114,C0116,C0209,C0301,R0914,R0912,R0915,W0511,eval-used +# pylint: disable=C0103,C0114,C0116,C0209,C0301,R0914,R0912,R0915,W0511,W0603,eval-used ###################################################################### import argparse +import bisect import collections import math import re import statistics # from pprint import pprint -Threads = collections.defaultdict(lambda: collections.defaultdict(lambda: {})) -Mtasks = collections.defaultdict(lambda: {}) -Evals = collections.defaultdict(lambda: {}) -EvalLoops = collections.defaultdict(lambda: {}) +Sections = [] +LongestVcdStrValueLength = 0 +Threads = collections.defaultdict(lambda: []) # List of records per thread id +Mtasks = collections.defaultdict(lambda: {'elapsed': 0, 'end': 0}) +Cpus = collections.defaultdict(lambda: {'mtask_time': 0}) Global = { 'args': {}, 'cpuinfo': collections.defaultdict(lambda: {}), - 'rdtsc_cycle_time': 0, 'stats': {} } +ElapsedTime = None # total elapsed time +ExecGraphTime = 0 # total elapsed time excuting an exec graph +ExecGraphIntervals = [] # list of (start, end) pairs ###################################################################### -def process(filename): - read_data(filename) - report() - - def read_data(filename): with open(filename, "r", encoding="utf8") as fh: re_thread = re.compile(r'^VLPROFTHREAD (\d+)$') @@ -39,14 +38,17 @@ def read_data(filename): re_arg1 = re.compile(r'VLPROF arg\s+(\S+)\+([0-9.]*)\s*') re_arg2 = re.compile(r'VLPROF arg\s+(\S+)\s+([0-9.]*)\s*$') re_stat = re.compile(r'VLPROF stat\s+(\S+)\s+([0-9.]+)') - re_time = re.compile(r'rdtsc time = (\d+) ticks') re_proc_cpu = re.compile(r'VLPROFPROC processor\s*:\s*(\d+)\s*$') re_proc_dat = re.compile(r'VLPROFPROC ([a-z_ ]+)\s*:\s*(.*)$') cpu = None thread = None + execGraphStart = None + + global LongestVcdStrValueLength + global ExecGraphTime - lastEvalBeginTick = None - lastEvalLoopBeginTick = None + SectionStack = [] + mTaskThread = {} for line in fh: recordMatch = re_record.match(line) @@ -54,29 +56,31 @@ def read_data(filename): kind, tick, payload = recordMatch.groups() tick = int(tick) payload = payload.strip() - if kind == "EVAL_BEGIN": - Evals[tick]['start'] = tick - lastEvalBeginTick = tick - elif kind == "EVAL_END": - Evals[lastEvalBeginTick]['end'] = tick - lastEvalBeginTick = None - elif kind == "EVAL_LOOP_BEGIN": - EvalLoops[tick]['start'] = tick - lastEvalLoopBeginTick = tick - elif kind == "EVAL_LOOP_END": - EvalLoops[lastEvalLoopBeginTick]['end'] = tick - lastEvalLoopBeginTick = None + if kind == "SECTION_PUSH": + LongestVcdStrValueLength = max(LongestVcdStrValueLength, + len(payload)) + SectionStack.append(payload) + Sections.append((tick, tuple(SectionStack))) + elif kind == "SECTION_POP": + assert SectionStack, "SECTION_POP without SECTION_PUSH" + SectionStack.pop() + Sections.append((tick, tuple(SectionStack))) elif kind == "MTASK_BEGIN": mtask, predict_start, ecpu = re_payload_mtaskBegin.match( payload).groups() mtask = int(mtask) predict_start = int(predict_start) ecpu = int(ecpu) - Threads[thread][tick]['mtask'] = mtask - Threads[thread][tick]['predict_start'] = predict_start - Threads[thread][tick]['cpu'] = ecpu - if 'elapsed' not in Mtasks[mtask]: - Mtasks[mtask] = {'end': 0, 'elapsed': 0} + mTaskThread[mtask] = thread + records = Threads[thread] + assert not records or records[-1]['start'] <= records[-1][ + 'end'] <= tick + records.append({ + 'start': tick, + 'mtask': mtask, + 'predict_start': predict_start, + 'cpu': ecpu + }) Mtasks[mtask]['begin'] = tick Mtasks[mtask]['thread'] = thread Mtasks[mtask]['predict_start'] = predict_start @@ -86,11 +90,18 @@ def read_data(filename): mtask = int(mtask) predict_cost = int(predict_cost) begin = Mtasks[mtask]['begin'] - Threads[thread][begin]['end'] = tick - Threads[thread][begin]['predict_cost'] = predict_cost + record = Threads[mTaskThread[mtask]][-1] + record['end'] = tick + record['predict_cost'] = predict_cost Mtasks[mtask]['elapsed'] += tick - begin Mtasks[mtask]['predict_cost'] = predict_cost Mtasks[mtask]['end'] = max(Mtasks[mtask]['end'], tick) + elif kind == "EXEC_GRAPH_BEGIN": + execGraphStart = tick + elif kind == "EXEC_GRAPH_END": + ExecGraphTime += tick - execGraphStart + ExecGraphIntervals.append((execGraphStart, tick)) + execGraphStart = None elif Args.debug: print("-Unknown execution trace record: %s" % line) elif re_thread.match(line): @@ -109,7 +120,7 @@ def read_data(filename): elif re_proc_cpu.match(line): match = re_proc_cpu.match(line) cpu = int(match.group(1)) - elif cpu and re_proc_dat.match(line): + elif cpu is not None and re_proc_dat.match(line): match = re_proc_dat.match(line) term = match.group(1) value = match.group(2) @@ -121,11 +132,6 @@ def read_data(filename): pass elif Args.debug: print("-Unk: %s" % line) - # TODO -- this is parsing text printed by a client. - # Really, verilator proper should generate this - # if it's useful... - if re_time.match(line): - Global['rdtsc_cycle_time'] = re_time.group(1) def re_match_result(regexp, line, result_to): @@ -144,125 +150,33 @@ def report(): plus = "+" if re.match(r'^\+', arg) else " " print(" %s%s%s" % (arg, plus, Global['args'][arg])) - nthreads = int(Global['stats']['threads']) - Global['cpus'] = {} - for thread in Threads: - # Make potentially multiple characters per column - for start in Threads[thread]: - if not Threads[thread][start]: - continue - cpu = Threads[thread][start]['cpu'] - elapsed = Threads[thread][start]['end'] - start - if cpu not in Global['cpus']: - Global['cpus'][cpu] = {'cpu_time': 0} - Global['cpus'][cpu]['cpu_time'] += elapsed - - measured_mt_mtask_time = 0 - predict_mt_mtask_time = 0 - long_mtask_time = 0 - measured_last_end = 0 - predict_last_end = 0 - for mtask in Mtasks: - measured_mt_mtask_time += Mtasks[mtask]['elapsed'] - predict_mt_mtask_time += Mtasks[mtask]['predict_cost'] - measured_last_end = max(measured_last_end, Mtasks[mtask]['end']) - predict_last_end = max( - predict_last_end, - Mtasks[mtask]['predict_start'] + Mtasks[mtask]['predict_cost']) - long_mtask_time = max(long_mtask_time, Mtasks[mtask]['elapsed']) - Global['measured_last_end'] = measured_last_end - Global['predict_last_end'] = predict_last_end - - # If we know cycle time in the same (rdtsc) units, - # this will give us an actual utilization number, - # (how effectively we keep the cores busy.) - # - # It also gives us a number we can compare against - # serial mode, to estimate the overhead of data sharing, - # which will show up in the total elapsed time. (Overhead - # of synchronization and scheduling should not.) - print("\nAnalysis:") - print(" Total threads = %d" % nthreads) - print(" Total mtasks = %d" % len(Mtasks)) - ncpus = max(len(Global['cpus']), 1) - print(" Total cpus used = %d" % ncpus) - print(" Total yields = %d" % - int(Global['stats'].get('yields', 0))) - print(" Total evals = %d" % len(Evals)) - print(" Total eval loops = %d" % len(EvalLoops)) - if Mtasks: - print(" Total eval time = %d rdtsc ticks" % - Global['measured_last_end']) - print(" Longest mtask time = %d rdtsc ticks" % long_mtask_time) - print(" All-thread mtask time = %d rdtsc ticks" % - measured_mt_mtask_time) - long_efficiency = long_mtask_time / (Global.get( - 'measured_last_end', 1) or 1) - print(" Longest-thread efficiency = %0.1f%%" % - (long_efficiency * 100.0)) - mt_efficiency = measured_mt_mtask_time / ( - Global.get('measured_last_end', 1) * nthreads or 1) - print(" All-thread efficiency = %0.1f%%" % - (mt_efficiency * 100.0)) - print(" All-thread speedup = %0.1f" % - (mt_efficiency * nthreads)) - if Global['rdtsc_cycle_time'] > 0: - ut = measured_mt_mtask_time / Global['rdtsc_cycle_time'] - print("tot_mtask_cpu=" + measured_mt_mtask_time + " cyc=" + - Global['rdtsc_cycle_time'] + " ut=" + ut) - - predict_mt_efficiency = predict_mt_mtask_time / ( - Global.get('predict_last_end', 1) * nthreads or 1) - print("\nPrediction (what Verilator used for scheduling):") - print(" All-thread efficiency = %0.1f%%" % - (predict_mt_efficiency * 100.0)) - print(" All-thread speedup = %0.1f" % - (predict_mt_efficiency * nthreads)) - - p2e_ratios = [] - min_p2e = 1000000 - min_mtask = None - max_p2e = -1000000 - max_mtask = None - - for mtask in sorted(Mtasks.keys()): - if Mtasks[mtask]['elapsed'] > 0: - if Mtasks[mtask]['predict_cost'] == 0: - Mtasks[mtask]['predict_cost'] = 1 # don't log(0) below - p2e_ratio = math.log(Mtasks[mtask]['predict_cost'] / - Mtasks[mtask]['elapsed']) - p2e_ratios.append(p2e_ratio) - - if p2e_ratio > max_p2e: - max_p2e = p2e_ratio - max_mtask = mtask - if p2e_ratio < min_p2e: - min_p2e = p2e_ratio - min_mtask = mtask - - print("\nMTask statistics:") - print(" min log(p2e) = %0.3f" % min_p2e, end="") - print(" from mtask %d (predict %d," % - (min_mtask, Mtasks[min_mtask]['predict_cost']), - end="") - print(" elapsed %d)" % Mtasks[min_mtask]['elapsed']) - print(" max log(p2e) = %0.3f" % max_p2e, end="") - print(" from mtask %d (predict %d," % - (max_mtask, Mtasks[max_mtask]['predict_cost']), - end="") - print(" elapsed %d)" % Mtasks[max_mtask]['elapsed']) - - stddev = statistics.pstdev(p2e_ratios) - mean = statistics.mean(p2e_ratios) - print(" mean = %0.3f" % mean) - print(" stddev = %0.3f" % stddev) - print(" e ^ stddev = %0.3f" % math.exp(stddev)) + for records in Threads.values(): + for record in records: + cpu = record['cpu'] + elapsed = record['end'] - record['start'] + Cpus[cpu]['mtask_time'] += elapsed + global ElapsedTime + ElapsedTime = int(Global['stats']['ticks']) + nthreads = int(Global['stats']['threads']) + ncpus = max(len(Cpus), 1) + + print("\nSummary:") + print(" Total elapsed time = {} rdtsc ticks".format(ElapsedTime)) + print(" Parallelized code = {:.2%} of elapsed time".format( + ExecGraphTime / ElapsedTime)) + print(" Total threads = %d" % nthreads) + print(" Total CPUs used = %d" % ncpus) + print(" Total mtasks = %d" % len(Mtasks)) + print(" Total yields = %d" % int(Global['stats'].get('yields', 0))) + + report_mtasks() report_cpus() + report_sections() if nthreads > ncpus: print() - print("%%Warning: There were fewer CPUs (%d) then threads (%d)." % + print("%%Warning: There were fewer CPUs (%d) than threads (%d)." % (ncpus, nthreads)) print(" : See docs on use of numactl.") else: @@ -279,32 +193,130 @@ def report(): print() +def report_mtasks(): + if not Mtasks: + return + + nthreads = int(Global['stats']['threads']) + + # If we know cycle time in the same (rdtsc) units, + # this will give us an actual utilization number, + # (how effectively we keep the cores busy.) + # + # It also gives us a number we can compare against + # serial mode, to estimate the overhead of data sharing, + # which will show up in the total elapsed time. (Overhead + # of synchronization and scheduling should not.) + total_mtask_time = 0 + thread_mtask_time = collections.defaultdict(lambda: 0) + long_mtask_time = 0 + long_mtask = None + predict_mtask_time = 0 + predict_elapsed = 0 + for mtaskId in Mtasks: + record = Mtasks[mtaskId] + predict_mtask_time += record['predict_cost'] + total_mtask_time += record['elapsed'] + thread_mtask_time[record['thread']] += record['elapsed'] + predict_end = record['predict_start'] + record['predict_cost'] + predict_elapsed = max(predict_elapsed, predict_end) + if record['elapsed'] > long_mtask_time: + long_mtask_time = record['elapsed'] + long_mtask = mtaskId + Global['predict_last_end'] = predict_elapsed + + serialTime = ElapsedTime - ExecGraphTime + + def subReport(elapsed, work): + print(" Thread utilization = {:7.2%}".format(work / + (elapsed * nthreads))) + print(" Speedup = {:6.3}x".format(work / elapsed)) + + print("\nParallelized code, measured:") + subReport(ExecGraphTime, total_mtask_time) + + print("\nParallelized code, predicted during static scheduling:") + subReport(predict_elapsed, predict_mtask_time) + + print("\nAll code, measured:") + subReport(ElapsedTime, serialTime + total_mtask_time) + + print("\nAll code, measured, scaled by predicted speedup:") + expectedParallelSpeedup = predict_mtask_time / predict_elapsed + scaledElapsed = serialTime + total_mtask_time / expectedParallelSpeedup + subReport(scaledElapsed, serialTime + total_mtask_time) + + p2e_ratios = [] + min_p2e = 1000000 + min_mtask = None + max_p2e = -1000000 + max_mtask = None + + for mtask in sorted(Mtasks.keys()): + if Mtasks[mtask]['elapsed'] > 0: + if Mtasks[mtask]['predict_cost'] == 0: + Mtasks[mtask]['predict_cost'] = 1 # don't log(0) below + p2e_ratio = math.log(Mtasks[mtask]['predict_cost'] / + Mtasks[mtask]['elapsed']) + p2e_ratios.append(p2e_ratio) + + if p2e_ratio > max_p2e: + max_p2e = p2e_ratio + max_mtask = mtask + if p2e_ratio < min_p2e: + min_p2e = p2e_ratio + min_mtask = mtask + + print("\nMTask statistics:") + print(" Longest mtask id = {}".format(long_mtask)) + print(" Longest mtask time = {:.2%} of time elapsed in parallelized code". + format(long_mtask_time / ExecGraphTime)) + print(" min log(p2e) = %0.3f" % min_p2e, end="") + + print(" from mtask %d (predict %d," % + (min_mtask, Mtasks[min_mtask]['predict_cost']), + end="") + print(" elapsed %d)" % Mtasks[min_mtask]['elapsed']) + print(" max log(p2e) = %0.3f" % max_p2e, end="") + print(" from mtask %d (predict %d," % + (max_mtask, Mtasks[max_mtask]['predict_cost']), + end="") + print(" elapsed %d)" % Mtasks[max_mtask]['elapsed']) + + stddev = statistics.pstdev(p2e_ratios) + mean = statistics.mean(p2e_ratios) + print(" mean = %0.3f" % mean) + print(" stddev = %0.3f" % stddev) + print(" e ^ stddev = %0.3f" % math.exp(stddev)) + + def report_cpus(): - print("\nCPUs:") + print("\nCPU info:") Global['cpu_sockets'] = collections.defaultdict(lambda: 0) Global['cpu_socket_cores'] = collections.defaultdict(lambda: 0) - for cpu in sorted(Global['cpus'].keys()): - print(" cpu %d: " % cpu, end='') - print("cpu_time=%d" % Global['cpus'][cpu]['cpu_time'], end='') - - socket = None + print(" Id | Time spent executing MTask | Socket | Core | Model") + print(" | % of elapsed ticks / ticks | | |") + print(" ====|============================|========|======|======") + for cpu in sorted(Cpus): + socket = "" + core = "" + model = "" if cpu in Global['cpuinfo']: cpuinfo = Global['cpuinfo'][cpu] if 'physical_id' in cpuinfo and 'core_id' in cpuinfo: - socket = int(cpuinfo['physical_id']) + socket = cpuinfo['physical_id'] Global['cpu_sockets'][socket] += 1 - print(" socket=%d" % socket, end='') - - core = int(cpuinfo['core_id']) - Global['cpu_socket_cores'][str(socket) + "__" + str(core)] += 1 - print(" core=%d" % core, end='') + core = cpuinfo['core_id'] + Global['cpu_socket_cores'][socket + "__" + core] += 1 if 'model_name' in cpuinfo: model = cpuinfo['model_name'] - print(" %s" % model, end='') - print() + + print(" {:3d} | {:7.2%} / {:16d} | {:>6s} | {:>4s} | {}".format( + cpu, Cpus[cpu]['mtask_time'] / ElapsedTime, + Cpus[cpu]['mtask_time'], socket, core, model)) if len(Global['cpu_sockets']) > 1: Global['cpu_sockets_warning'] = True @@ -313,25 +325,68 @@ def report_cpus(): Global['cpu_socket_cores_warning'] = True +def report_sections(): + if not Sections: + return + print("\nSection profile:") + + totalTime = collections.defaultdict(lambda: 0) + selfTime = collections.defaultdict(lambda: 0) + + sectionTree = [0, {}, 1] # [selfTime, childTrees, numberOfTimesEntered] + prevTime = 0 + prevStack = () + for time, stack in Sections: + if len(stack) > len(prevStack): + scope = sectionTree + for item in stack: + scope = scope[1].setdefault(item, [0, {}, 0]) + scope[2] += 1 + dt = time - prevTime + scope = sectionTree + for item in prevStack: + scope = scope[1].setdefault(item, [0, {}, 0]) + scope[0] += dt + + if prevStack: + for name in prevStack: + totalTime[name] += dt + selfTime[prevStack[-1]] += dt + prevTime = time + prevStack = stack + + def treeSum(tree): + n = tree[0] + for subTree in tree[1].values(): + n += treeSum(subTree) + return n + + # Make sure the tree sums to the elapsed time + sectionTree[0] += ElapsedTime - treeSum(sectionTree) + + def printTree(prefix, name, entries, tree): + print(" {:7.2%} | {:7.2%} | {:8} | {:10.2f} | {}".format( + treeSum(tree) / ElapsedTime, tree[0] / ElapsedTime, tree[2], + tree[2] / entries, prefix + name)) + for k in sorted(tree[1], key=lambda _: -treeSum(tree[1][_])): + printTree(prefix + " ", k, tree[2], tree[1][k]) + + print(" Total | Self | Total | Relative | Section") + print(" time | time | entries | entries | name ") + print("==========|=========|==========|============|========") + printTree("", "*TOTAL*", 1, sectionTree) + + ###################################################################### def write_vcd(filename): print("Writing %s" % filename) with open(filename, "w", encoding="utf8") as fh: - vcd = { - 'values': - collections.defaultdict(lambda: {}), # {