diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3ae16aa5d..97e933807 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,7 +8,8 @@ on: workflow_dispatch: jobs: - test-callgrind: + test: + timeout-minutes: 30 strategy: matrix: runner: @@ -16,6 +17,9 @@ jobs: ubuntu-version: 22.04 - platform: ubuntu-24.04 ubuntu-version: 24.04 + tool: + - callgrind + - tracegrind runs-on: ${{ matrix.runner.platform }} @@ -32,7 +36,7 @@ jobs: path-exclude /usr/share/man/* path-exclude /usr/share/info/* EOF - + - name: Update apt-get cache run: sudo apt-get update @@ -51,6 +55,10 @@ jobs: docbook-xml \ xsltproc + - name: Install uv + if: matrix.tool == 'tracegrind' + uses: astral-sh/setup-uv@v7 + - name: Run autogen run: ./autogen.sh @@ -63,11 +71,11 @@ jobs: - name: Build test dependencies run: | make -C tests arch_test os_test true - make -C callgrind/tests check + make -C ${{ matrix.tool }}/tests check - - name: Run Callgrind tests + - name: Run tests run: | - cd callgrind/tests + cd ${{ matrix.tool }}/tests TESTS=$(ls *.vgtest | grep -v bug497723.vgtest) perl ../../tests/vg_regtest --valgrind=../../vg-in-place $TESTS @@ -75,5 +83,5 @@ jobs: if: failure() uses: actions/upload-artifact@v4 with: - name: callgrind-test-logs-${{ matrix.runner.ubuntu-version }} - path: callgrind/tests/*.log + name: ${{ matrix.tool }}-test-logs-${{ matrix.runner.ubuntu-version }} + path: ${{ matrix.tool }}/tests/*.log diff --git a/.gitignore b/.gitignore index ea71bb0aa..132e768e7 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,8 @@ /autom4te.cache /bin /cachegrind.out.* +/callgrind.out.* +/tracegrind.out.* /compile /config.guess /config.h* @@ -17,6 +19,7 @@ /config.status /config.sub /configure +/configure~ /default.supp /depcomp /glibc-2.X.supp @@ -161,6 +164,31 @@ /callgrind/tests/inline-samefile /callgrind/tests/inline-crossfile +# /tracegrind/ +/tracegrind/*.so +/tracegrind/.deps +/tracegrind/tracegrind-*-darwin +/tracegrind/tracegrind-*-linux +/tracegrind/tracegrind-*-solaris +/tracegrind/tracegrind-*-freebsd +/tracegrind/Makefile +/tracegrind/Makefile.in + +# /tracegrind/tests/ +/tracegrind/tests/*.dSYM +/tracegrind/tests/*.post.diff* +/tracegrind/tests/*.post.out +/tracegrind/tests/*.stderr.diff* +/tracegrind/tests/*.stderr.out +/tracegrind/tests/*.stdout.diff* +/tracegrind/tests/*.stdout.out +/tracegrind/tests/.deps +/tracegrind/tests/Makefile +/tracegrind/tests/Makefile.in +/tracegrind/tests/tracegrind.out.* +/tracegrind/tests/fibo +/tracegrind/tests/*.bin + # /coregrind/ /coregrind/*.a /coregrind/*.dSYM diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..bc102d4a9 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,7 @@ +repos: + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v14.0.6 + hooks: + - id: clang-format + files: ^tracegrind/ + types_or: [c, c++] diff --git a/Makefile.am b/Makefile.am index 6c5b9f5b6..2cfe16d16 100644 --- a/Makefile.am +++ b/Makefile.am @@ -9,6 +9,7 @@ TOOLS = \ memcheck \ cachegrind \ callgrind \ + tracegrind \ helgrind \ drd \ massif \ diff --git a/bench/bench.py b/bench/bench.py index 18e2c472a..d577a67ec 100755 --- a/bench/bench.py +++ b/bench/bench.py @@ -41,16 +41,33 @@ def __init__( raise RuntimeError(f"Valgrind not found at: {self.valgrind_path}") self.valgrind_version = result.stdout.strip() - def run_valgrind(self, *args: str) -> None: - """Execute valgrind with given arguments. 
+ # Check which tools are available + self.available_tools = self._detect_available_tools() + + def _detect_available_tools(self) -> set: + """Detect which valgrind tools are available.""" + tools = set() + for tool in ["callgrind", "tracegrind"]: + result = subprocess.run( + [self.valgrind_path, f"--tool={tool}", "--help"], + capture_output=True, + text=True, + ) + if result.returncode == 0: + tools.add(tool) + return tools + + def run_valgrind(self, tool: str, *args: str) -> None: + """Execute valgrind with given tool and arguments. Args: + tool: Valgrind tool to use (callgrind, tracegrind) *args: Valgrind arguments """ cmd = [ self.valgrind_path, - "--tool=callgrind", + f"--tool={tool}", "--log-file=/dev/null", *args, *shlex.split(self.cmd), @@ -75,76 +92,119 @@ def runner(request): return request.config._valgrind_runner +CACHE_SIM_OPTIONS = [ + "--cache-sim=yes", + "--I1=32768,8,64", + "--D1=32768,8,64", + "--LL=8388608,16,64", +] + def pytest_generate_tests(metafunc): """Parametrize tests with valgrind configurations.""" - if "valgrind_args" in metafunc.fixturenames: + if "tool_and_args" in metafunc.fixturenames: runner = getattr(metafunc.config, "_valgrind_runner", None) if not runner: return - # Define valgrind configurations - configs = [ - (["--read-inline-info=no"], "no-inline"), - (["--read-inline-info=yes"], "inline"), + # Define configurations for each tool + # Format: (tool, args, config_name) + all_configs = [ + # Callgrind configurations + ("callgrind", ["--read-inline-info=no"], "cg/no-inline"), + ("callgrind", ["--read-inline-info=yes"], "cg/inline"), ( + "callgrind", [ + *CACHE_SIM_OPTIONS, "--trace-children=yes", - "--cache-sim=yes", - "--I1=32768,8,64", - "--D1=32768,8,64", - "--LL=8388608,16,64", "--collect-systime=nsec", "--compress-strings=no", "--combine-dumps=yes", "--dump-line=no", "--read-inline-info=yes", ], - "full-with-inline", + "cg/full-inline", ), ( + "callgrind", [ + *CACHE_SIM_OPTIONS, "--trace-children=yes", - "--cache-sim=yes", - "--I1=32768,8,64", - "--D1=32768,8,64", - "--LL=8388608,16,64", "--collect-systime=nsec", "--compress-strings=no", "--combine-dumps=yes", "--dump-line=no", + "--read-inline-info=no", + ], + "cg/full-no-inline", + ), + # Tracegrind configurations (only available in codspeed fork) + ("tracegrind", ["--read-inline-info=no"], "tg/no-inline"), + ("tracegrind", ["--read-inline-info=yes"], "tg/inline"), + ( + "tracegrind", + [ + *CACHE_SIM_OPTIONS, + "--trace-children=yes", + "--collect-systime=nsec", + "--read-inline-info=no", ], - "full-no-inline", + "tg/full-no-inline", + ), + ( + "tracegrind", + [ + *CACHE_SIM_OPTIONS, + "--trace-children=yes", + "--collect-systime=nsec", + "--read-inline-info=yes", + ], + "tg/full-inline", ), ] + # Filter configs to only include available tools + configs = [ + (tool, args, name) + for tool, args, name in all_configs + if tool in runner.available_tools + ] + + if not configs: + return + # If the valgrind version is from CodSpeed, we don't want to display the exact version - # to allow comparison against older versions. + # to allow comparison against older versions. 
if ".codspeed" in runner.valgrind_version: - runner.valgrind_version = "valgrind.codspeed" + runner.valgrind_version = "codspeed" + # Clean valgrind version names + else: + runner.valgrind_version.removeprefix("valgrind-") # Create test IDs with format: valgrind-version, command, config-name test_ids = [ - f"{runner.valgrind_version}, {runner.cmd}, {config_name}" - for _, config_name in configs + f"{runner.valgrind_version}/{config_name}, {runner.cmd}" + for _, _, config_name in configs ] - # Parametrize with just the args + # Parametrize with (tool, args) tuples metafunc.parametrize( - "valgrind_args", - [args for args, _ in configs], + "tool_and_args", + [(tool, args) for tool, args, _ in configs], ids=test_ids, ) @pytest.mark.benchmark -def test_valgrind(runner, valgrind_args): +def test_valgrind(runner, tool_and_args): if runner: - runner.run_valgrind(*valgrind_args) + tool, args = tool_and_args + runner.run_valgrind(tool, *args) def main(): parser = argparse.ArgumentParser( - description="Benchmark Valgrind with pytest-codspeed", + description="Benchmark Valgrind tools (callgrind, tracegrind) with pytest-codspeed", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: @@ -179,6 +239,7 @@ def main(): valgrind_path=args.valgrind_path, ) print(f"Valgrind version: {runner.valgrind_version}") + print(f"Available tools: {', '.join(sorted(runner.available_tools))}") print(f"Command: {args.cmd}") # Plugin to pass runner to tests @@ -187,7 +248,7 @@ def pytest_configure(self, config): config._valgrind_runner = runner exit_code = pytest.main( - [__file__, "-v", "--codspeed", "--codspeed-warmup-time=0", "--codspeed-max-time=5"], + [__file__, "-v", "--codspeed", "--codspeed-warmup-time=0", "--codspeed-max-time=30"], plugins=[RunnerPlugin()], ) if exit_code != 0 and exit_code != 5: diff --git a/configure.ac b/configure.ac index f3f3867ef..fcc1afea4 100644 --- a/configure.ac +++ b/configure.ac @@ -5807,6 +5807,8 @@ AC_CONFIG_FILES([ callgrind/callgrind_annotate callgrind/callgrind_control callgrind/tests/Makefile + tracegrind/Makefile + tracegrind/tests/Makefile helgrind/Makefile helgrind/tests/Makefile drd/Makefile diff --git a/tracegrind/Makefile.am b/tracegrind/Makefile.am new file mode 100644 index 000000000..562c4f6c6 --- /dev/null +++ b/tracegrind/Makefile.am @@ -0,0 +1,88 @@ +include $(top_srcdir)/Makefile.tool.am + +EXTRA_DIST = + +#---------------------------------------------------------------------------- +# Headers, etc +#---------------------------------------------------------------------------- + +pkginclude_HEADERS = tracegrind.h + +noinst_HEADERS = \ + costs.h \ + events.h \ + global.h \ + lz4.c \ + lz4.h \ + tg_lz4.h \ + tg_msgpack.h + +#---------------------------------------------------------------------------- +# tracegrind- +#---------------------------------------------------------------------------- + +noinst_PROGRAMS = tracegrind-@VGCONF_ARCH_PRI@-@VGCONF_OS@ +if VGCONF_HAVE_PLATFORM_SEC +noinst_PROGRAMS += tracegrind-@VGCONF_ARCH_SEC@-@VGCONF_OS@ +endif + +TRACEGRIND_SOURCES_COMMON = \ + bb.c \ + bbcc.c \ + callstack.c \ + clo.c \ + context.c \ + costs.c \ + debug.c \ + dump.c \ + events.c \ + fn.c \ + jumps.c \ + main.c \ + sim.c \ + threads.c \ + tg_lz4.c \ + tg_msgpack.c + +# We sneakily include "cg_branchpred.c" and "cg_arch.c" from cachegrind +TRACEGRIND_CFLAGS_COMMON = -I$(top_srcdir)/cachegrind + +tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_SOURCES = \ + $(TRACEGRIND_SOURCES_COMMON) +tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_CPPFLAGS = \ + 
$(AM_CPPFLAGS_@VGCONF_PLATFORM_PRI_CAPS@) +tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_CFLAGS = $(LTO_CFLAGS) \ + $(AM_CFLAGS_@VGCONF_PLATFORM_PRI_CAPS@) $(TRACEGRIND_CFLAGS_COMMON) +tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_DEPENDENCIES = \ + $(TOOL_DEPENDENCIES_@VGCONF_PLATFORM_PRI_CAPS@) +tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_LDADD = \ + $(TOOL_LDADD_@VGCONF_PLATFORM_PRI_CAPS@) +tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_LDFLAGS = \ + $(TOOL_LDFLAGS_@VGCONF_PLATFORM_PRI_CAPS@) +tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_LINK = \ + $(top_builddir)/coregrind/link_tool_exe_@VGCONF_OS@ \ + @VALT_LOAD_ADDRESS_PRI@ \ + $(LINK) \ + $(tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_CFLAGS) \ + $(tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_LDFLAGS) + +if VGCONF_HAVE_PLATFORM_SEC +tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_SOURCES = \ + $(TRACEGRIND_SOURCES_COMMON) +tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_CPPFLAGS = \ + $(AM_CPPFLAGS_@VGCONF_PLATFORM_SEC_CAPS@) +tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_CFLAGS = $(LTO_CFLAGS) \ + $(AM_CFLAGS_@VGCONF_PLATFORM_SEC_CAPS@) $(TRACEGRIND_CFLAGS_COMMON) +tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_DEPENDENCIES = \ + $(TOOL_DEPENDENCIES_@VGCONF_PLATFORM_SEC_CAPS@) +tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LDADD = \ + $(TOOL_LDADD_@VGCONF_PLATFORM_SEC_CAPS@) +tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LDFLAGS = \ + $(TOOL_LDFLAGS_@VGCONF_PLATFORM_SEC_CAPS@) +tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LINK = \ + $(top_builddir)/coregrind/link_tool_exe_@VGCONF_OS@ \ + @VALT_LOAD_ADDRESS_SEC@ \ + $(LINK) \ + $(tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_CFLAGS) \ + $(tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LDFLAGS) +endif diff --git a/tracegrind/bb.c b/tracegrind/bb.c new file mode 100644 index 000000000..ff5f4111c --- /dev/null +++ b/tracegrind/bb.c @@ -0,0 +1,341 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- bb.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. 
+*/ + +#include "global.h" + +/*------------------------------------------------------------*/ +/*--- Basic block (BB) operations ---*/ +/*------------------------------------------------------------*/ + +/* BB hash, resizable */ +bb_hash bbs; + +void TG_(init_bb_hash)(void) +{ + Int i; + + bbs.size = 8437; + bbs.entries = 0; + bbs.table = (BB**)TG_MALLOC("cl.bb.ibh.1", bbs.size * sizeof(BB*)); + + for (i = 0; i < bbs.size; i++) + bbs.table[i] = NULL; +} + +bb_hash* TG_(get_bb_hash)(void) { return &bbs; } + +/* The hash stores BBs according to + * - ELF object (is 0 for code in anonymous mapping) + * - BB base as object file offset + */ +static __inline__ UInt bb_hash_idx(obj_node* obj, PtrdiffT offset, UInt size) +{ + return (((Addr)obj) + offset) % size; +} + +/* double size of bb table */ +static void resize_bb_table(void) +{ + Int i, new_size, conflicts1 = 0, conflicts2 = 0; + BB **new_table, *curr, *next; + UInt new_idx; + + new_size = 2 * bbs.size + 3; + new_table = (BB**)TG_MALLOC("cl.bb.rbt.1", new_size * sizeof(BB*)); + + for (i = 0; i < new_size; i++) + new_table[i] = NULL; + + for (i = 0; i < bbs.size; i++) { + if (bbs.table[i] == NULL) + continue; + + curr = bbs.table[i]; + while (NULL != curr) { + next = curr->next; + + new_idx = bb_hash_idx(curr->obj, curr->offset, new_size); + + curr->next = new_table[new_idx]; + new_table[new_idx] = curr; + if (curr->next) { + conflicts1++; + if (curr->next->next) + conflicts2++; + } + + curr = next; + } + } + + VG_(free)(bbs.table); + + TG_DEBUG(0, "Resize BB Hash: %u => %d (entries %u, conflicts %d/%d)\n", + bbs.size, new_size, bbs.entries, conflicts1, conflicts2); + + bbs.size = new_size; + bbs.table = new_table; + TG_(stat).bb_hash_resizes++; +} + +/** + * Allocate new BB structure (including space for event type list) + * Not initialized: + * - instr_len, cost_count, instr[] + */ +static BB* new_bb(obj_node* obj, + PtrdiffT offset, + UInt instr_count, + UInt cjmp_count, + Bool cjmp_inverted) +{ + BB* bb; + UInt idx, size; + + /* check fill degree of bb hash table and resize if needed (>80%) */ + bbs.entries++; + if (10 * bbs.entries / bbs.size > 8) + resize_bb_table(); + + size = sizeof(BB) + instr_count * sizeof(InstrInfo) + + (cjmp_count + 1) * sizeof(CJmpInfo); + bb = (BB*)TG_MALLOC("cl.bb.nb.1", size); + VG_(memset)(bb, 0, size); + + bb->obj = obj; + bb->offset = offset; + + bb->instr_count = instr_count; + bb->cjmp_count = cjmp_count; + bb->cjmp_inverted = cjmp_inverted; + bb->jmp = (CJmpInfo*)&(bb->instr[instr_count]); + bb->instr_len = 0; + bb->cost_count = 0; + bb->sect_kind = VG_(DebugInfo_sect_kind)(NULL, offset + obj->offset); + bb->fn = 0; + bb->line = 0; + bb->is_entry = 0; + bb->inl_fns = NULL; + bb->inl_depth = 0; + bb->bbcc_list = 0; + bb->last_bbcc = 0; + + /* insert into BB hash table */ + idx = bb_hash_idx(obj, offset, bbs.size); + bb->next = bbs.table[idx]; + bbs.table[idx] = bb; + + TG_(stat).distinct_bbs++; + +#if TG_ENABLE_DEBUG + TG_DEBUGIF(3) + { + VG_(printf)( + " new_bb (instr %u, jmps %u, inv %s) [now %d]: ", instr_count, + cjmp_count, cjmp_inverted ? 
"yes" : "no", TG_(stat).distinct_bbs); + TG_(print_bb)(0, bb); + VG_(printf)("\n"); + } +#endif + + TG_(get_fn_node)(bb); + + return bb; +} + +/* get the BB structure for a BB start address */ +static __inline__ BB* lookup_bb(obj_node* obj, PtrdiffT offset) +{ + BB* bb; + Int idx; + + idx = bb_hash_idx(obj, offset, bbs.size); + bb = bbs.table[idx]; + + while (bb) { + if ((bb->obj == obj) && (bb->offset == offset)) + break; + bb = bb->next; + } + + TG_DEBUG(5, " lookup_bb (Obj %s, off %#lx): %p\n", obj->name, (UWord)offset, + bb); + return bb; +} + +static __inline__ obj_node* obj_of_address(Addr addr) +{ + obj_node* obj; + DebugInfo* di; + PtrdiffT offset; + + DiEpoch ep = VG_(current_DiEpoch)(); + di = VG_(find_DebugInfo)(ep, addr); + obj = TG_(get_obj_node)(di); + + /* Update symbol offset in object if remapped */ + /* FIXME (or at least check this) 2008 Feb 19: 'offset' is + only correct for text symbols, not for data symbols */ + offset = di ? VG_(DebugInfo_get_text_bias)(di) : 0; + if (obj->offset != offset) { + Addr start = di ? VG_(DebugInfo_get_text_avma)(di) : 0; + + TG_DEBUG(0, "Mapping changed for '%s': %#lx -> %#lx\n", obj->name, + obj->start, start); + + /* Size should be the same, and offset diff == start diff */ + TG_ASSERT(obj->size == (di ? VG_(DebugInfo_get_text_size)(di) : 0)); + TG_ASSERT(obj->start - start == obj->offset - offset); + obj->offset = offset; + obj->start = start; + } + + return obj; +} + +/* Get the BB structure for a BB start address. + * If the BB has to be created, the IRBB is needed to + * compute the event type list for costs, and seen_before is + * set to False. Otherwise, seen_before is set to True. + * + * BBs are never discarded. There are 2 cases where this function + * is called from TG_(instrument)() and a BB already exists: + * - The instrumented version was removed from Valgrinds TT cache + * - The ELF object of the BB was unmapped and mapped again. + * This involves a possibly different address, but is handled by + * looking up a BB keyed by (obj_node, file offset). + * + * bbIn==0 is possible for artificial BB without real code. + * Such a BB is created when returning to an unknown function. + */ +BB* TG_(get_bb)(Addr addr, IRSB* bbIn, /*OUT*/ Bool* seen_before) +{ + BB* bb; + obj_node* obj; + UInt n_instrs, n_jmps; + Bool cjmp_inverted = False; + + TG_DEBUG(5, "+ get_bb(BB %#lx)\n", addr); + + obj = obj_of_address(addr); + bb = lookup_bb(obj, addr - obj->offset); + + n_instrs = 0; + n_jmps = 0; + TG_(collectBlockInfo)(bbIn, &n_instrs, &n_jmps, &cjmp_inverted); + + *seen_before = bb ? True : False; + if (*seen_before) { + if (bb->instr_count != n_instrs) { + VG_(message)(Vg_DebugMsg, + "ERROR: BB Retranslation Mismatch at BB %#lx\n", addr); + VG_(message)( + Vg_DebugMsg, " new: Obj %s, Off %#lx, BBOff %#lx, Instrs %u\n", + obj->name, (UWord)obj->offset, addr - obj->offset, n_instrs); + VG_(message)(Vg_DebugMsg, + " old: Obj %s, Off %#lx, BBOff %#lx, Instrs %u\n", + bb->obj->name, (UWord)bb->obj->offset, (UWord)bb->offset, + bb->instr_count); + TG_ASSERT(bb->instr_count == n_instrs); + } + TG_ASSERT(bb->cjmp_count == n_jmps); + TG_(stat).bb_retranslations++; + + TG_DEBUG(5, "- get_bb(BB %#lx): seen before.\n", addr); + return bb; + } + + bb = new_bb(obj, addr - obj->offset, n_instrs, n_jmps, cjmp_inverted); + + TG_DEBUG(5, "- get_bb(BB %#lx)\n", addr); + + return bb; +} + +/* Delete the BB info for the bb with unredirected entry-point + address 'addr'. 
*/ +void TG_(delete_bb)(Addr addr) +{ + BB *bb, *bp; + Int idx, size; + + obj_node* obj = obj_of_address(addr); + PtrdiffT offset = addr - obj->offset; + + idx = bb_hash_idx(obj, offset, bbs.size); + bb = bbs.table[idx]; + + /* bb points at the current bb under consideration, and bp is the + one before. */ + bp = NULL; + while (bb) { + if ((bb->obj == obj) && (bb->offset == offset)) + break; + bp = bb; + bb = bb->next; + } + + if (bb == NULL) { + TG_DEBUG(3, " delete_bb (Obj %s, off %#lx): NOT FOUND\n", obj->name, + (UWord)offset); + + /* we didn't find it. + * this happens when tracegrinds instrumentation mode + * was off at BB translation time, ie. no BB was created. + */ + return; + } + + /* unlink it from hash table */ + + if (bp == NULL) { + /* we found the first one in the list. */ + tl_assert(bb == bbs.table[idx]); + bbs.table[idx] = bb->next; + } else { + tl_assert(bb != bbs.table[idx]); + bp->next = bb->next; + } + + TG_DEBUG(3, " delete_bb (Obj %s, off %#lx): %p, BBCC head: %p\n", obj->name, + (UWord)offset, bb, bb->bbcc_list); + + if (bb->bbcc_list == 0) { + /* can be safely deleted */ + + if (bb->inl_fns) + VG_(free)(bb->inl_fns); + + /* Fill the block up with junk and then free it, so we will + hopefully get a segfault if it is used again by mistake. */ + size = sizeof(BB) + bb->instr_count * sizeof(InstrInfo) + + (bb->cjmp_count + 1) * sizeof(CJmpInfo); + VG_(memset)(bb, 0xAA, size); + TG_FREE(bb); + return; + } + TG_DEBUG(3, " delete_bb: BB in use, can not free!\n"); +} diff --git a/tracegrind/bbcc.c b/tracegrind/bbcc.c new file mode 100644 index 000000000..15143c621 --- /dev/null +++ b/tracegrind/bbcc.c @@ -0,0 +1,864 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- bbcc.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. 
+*/ + +#include "costs.h" +#include "global.h" + +#include "pub_tool_threadstate.h" + +/*------------------------------------------------------------*/ +/*--- BBCC operations ---*/ +/*------------------------------------------------------------*/ + +#define N_BBCC_INITIAL_ENTRIES 10437 + +/* BBCC table (key is BB/Context), per thread, resizable */ +bbcc_hash current_bbccs; + +void TG_(init_bbcc_hash)(bbcc_hash* bbccs) +{ + Int i; + + TG_ASSERT(bbccs != 0); + + bbccs->size = N_BBCC_INITIAL_ENTRIES; + bbccs->entries = 0; + bbccs->table = + (BBCC**)TG_MALLOC("cl.bbcc.ibh.1", bbccs->size * sizeof(BBCC*)); + + for (i = 0; i < bbccs->size; i++) + bbccs->table[i] = NULL; +} + +void TG_(copy_current_bbcc_hash)(bbcc_hash* dst) +{ + TG_ASSERT(dst != 0); + + dst->size = current_bbccs.size; + dst->entries = current_bbccs.entries; + dst->table = current_bbccs.table; +} + +bbcc_hash* TG_(get_current_bbcc_hash)(void) { return ¤t_bbccs; } + +void TG_(set_current_bbcc_hash)(bbcc_hash* h) +{ + TG_ASSERT(h != 0); + + current_bbccs.size = h->size; + current_bbccs.entries = h->entries; + current_bbccs.table = h->table; +} + +/* All BBCCs for recursion level 0 are inserted into a + * thread specific hash table with key + * - address of BB structure (unique, as never freed) + * - current context (includes caller chain) + * BBCCs for other recursion levels are in bbcc->rec_array. + * + * The hash is used in setup_bb(), i.e. to find the cost + * counters to be changed in the execution of a BB. + */ + +static __inline__ UInt bbcc_hash_idx(BB* bb, Context* cxt, UInt size) +{ + TG_ASSERT(bb != 0); + TG_ASSERT(cxt != 0); + + return ((Addr)bb + (Addr)cxt) % size; +} + +/* Lookup for a BBCC in hash. + */ +static BBCC* lookup_bbcc(BB* bb, Context* cxt) +{ + BBCC* bbcc = bb->last_bbcc; + UInt idx; + + /* check LRU */ + if (bbcc->cxt == cxt) { + if (!TG_(clo).separate_threads) { + /* if we don't dump threads separate, tid doesn't have to match */ + return bbcc; + } + if (bbcc->tid == TG_(current_tid)) + return bbcc; + } + + TG_(stat).bbcc_lru_misses++; + + idx = bbcc_hash_idx(bb, cxt, current_bbccs.size); + bbcc = current_bbccs.table[idx]; + while (bbcc && (bb != bbcc->bb || cxt != bbcc->cxt)) { + bbcc = bbcc->next; + } + + TG_DEBUG(2, " lookup_bbcc(BB %#lx, Cxt %u, fn '%s'): %p (tid %u)\n", + bb_addr(bb), cxt->base_number, cxt->fn[0]->name, bbcc, + bbcc ? 
bbcc->tid : 0); + + TG_DEBUGIF(2) + if (bbcc) + TG_(print_bbcc)(-2, bbcc); + + return bbcc; +} + +/* double size of hash table 1 (addr->BBCC) */ +static void resize_bbcc_hash(void) +{ + Int i, new_size, conflicts1 = 0, conflicts2 = 0; + BBCC** new_table; + UInt new_idx; + BBCC * curr_BBCC, *next_BBCC; + + new_size = 2 * current_bbccs.size + 3; + new_table = (BBCC**)TG_MALLOC("cl.bbcc.rbh.1", new_size * sizeof(BBCC*)); + + for (i = 0; i < new_size; i++) + new_table[i] = NULL; + + for (i = 0; i < current_bbccs.size; i++) { + if (current_bbccs.table[i] == NULL) + continue; + + curr_BBCC = current_bbccs.table[i]; + while (NULL != curr_BBCC) { + next_BBCC = curr_BBCC->next; + + new_idx = bbcc_hash_idx(curr_BBCC->bb, curr_BBCC->cxt, new_size); + + curr_BBCC->next = new_table[new_idx]; + new_table[new_idx] = curr_BBCC; + if (curr_BBCC->next) { + conflicts1++; + if (curr_BBCC->next->next) + conflicts2++; + } + + curr_BBCC = next_BBCC; + } + } + + VG_(free)(current_bbccs.table); + + TG_DEBUG(0, "Resize BBCC Hash: %u => %d (entries %u, conflicts %d/%d)\n", + current_bbccs.size, new_size, current_bbccs.entries, conflicts1, + conflicts2); + + current_bbccs.size = new_size; + current_bbccs.table = new_table; + TG_(stat).bbcc_hash_resizes++; +} + +static __inline BBCC** new_recursion(int size) +{ + BBCC** bbccs; + int i; + + bbccs = (BBCC**)TG_MALLOC("cl.bbcc.nr.1", sizeof(BBCC*) * size); + for (i = 0; i < size; i++) + bbccs[i] = 0; + + TG_DEBUG(3, " new_recursion(size %d): %p\n", size, bbccs); + + return bbccs; +} + +/* + * Allocate a new BBCC + * + * Uninitialized: + * cxt, rec_index, rec_array, next_bbcc, next1, next2 + */ +static __inline__ BBCC* new_bbcc(BB* bb) +{ + BBCC* bbcc; + Int i; + + /* We need cjmp_count+1 JmpData structs: + * the last is for the unconditional jump/call/ret at end of BB + */ + bbcc = (BBCC*)TG_MALLOC("cl.bbcc.nb.1", sizeof(BBCC) + (bb->cjmp_count + 1) * + sizeof(JmpData)); + bbcc->bb = bb; + bbcc->tid = TG_(current_tid); + + bbcc->skipped = 0; + bbcc->cost = TG_(get_costarray)(bb->cost_count); + for (i = 0; i < bb->cost_count; i++) + bbcc->cost[i] = 0; + for (i = 0; i <= bb->cjmp_count; i++) { + bbcc->jmp[i].ecounter = 0; + bbcc->jmp[i].jcc_list = 0; + } + bbcc->ecounter_sum = 0; + + /* Init pointer caches (LRU) */ + bbcc->lru_next_bbcc = 0; + bbcc->lru_from_jcc = 0; + bbcc->lru_to_jcc = 0; + + TG_(stat).distinct_bbccs++; + + TG_DEBUG(3, " new_bbcc(BB %#lx): %p (now %d)\n", bb_addr(bb), bbcc, + TG_(stat).distinct_bbccs); + + return bbcc; +} + +/** + * Inserts a new BBCC into hashes. + * BBCC specific items must be set as this is used for the hash + * keys: + * fn : current function + * tid : current thread ID + * from : position where current function is called from + * + * Recursion level doesn't need to be set as this is not included + * in the hash key: Only BBCCs with rec level 0 are in hashes. + */ +static void insert_bbcc_into_hash(BBCC* bbcc) +{ + UInt idx; + + TG_ASSERT(bbcc->cxt != 0); + + TG_DEBUG(3, "+ insert_bbcc_into_hash(BB %#lx, fn '%s')\n", bb_addr(bbcc->bb), + bbcc->cxt->fn[0]->name); + + /* check fill degree of hash and resize if needed (>90%) */ + current_bbccs.entries++; + if (100 * current_bbccs.entries / current_bbccs.size > 90) + resize_bbcc_hash(); + + idx = bbcc_hash_idx(bbcc->bb, bbcc->cxt, current_bbccs.size); + bbcc->next = current_bbccs.table[idx]; + current_bbccs.table[idx] = bbcc; + + TG_DEBUG(3, "- insert_bbcc_into_hash: %u entries\n", current_bbccs.entries); +} + +/* String is returned in a dynamically allocated buffer. 
Caller is + responsible for free'ing it. */ +static HChar* mangled_cxt(const Context* cxt, Int rec_index) +{ + Int i, p; + + if (!cxt) + return VG_(strdup)("cl.bbcc.mcxt", "(no context)"); + + /* Overestimate the number of bytes we need to hold the string. */ + SizeT need = 20; // rec_index + nul-terminator + for (i = 0; i < cxt->size; ++i) + need += VG_(strlen)(cxt->fn[i]->name) + 1; // 1 for leading ' + + HChar* mangled = TG_MALLOC("cl.bbcc.mcxt", need); + p = VG_(sprintf)(mangled, "%s", cxt->fn[0]->name); + if (rec_index > 0) + p += VG_(sprintf)(mangled + p, "'%d", rec_index + 1); + for (i = 1; i < cxt->size; i++) + p += VG_(sprintf)(mangled + p, "'%s", cxt->fn[i]->name); + + return mangled; +} + +/* Create a new BBCC as a copy of an existing one, + * but with costs set to 0 and jcc chains empty. + * + * This is needed when a BB is executed in another context than + * the one at instrumentation time of the BB. + * + * Use cases: + * rec_index == 0: clone from a BBCC with differing tid/cxt + * and insert into hashes + * rec_index >0 : clone from a BBCC with same tid/cxt and rec_index 0 + * don't insert into hashes + */ +static BBCC* clone_bbcc(BBCC* orig, Context* cxt, Int rec_index) +{ + BBCC* bbcc; + + TG_DEBUG(3, "+ clone_bbcc(BB %#lx, rec %d, fn %s)\n", bb_addr(orig->bb), + rec_index, cxt->fn[0]->name); + + bbcc = new_bbcc(orig->bb); + + if (rec_index == 0) { + + /* hash insertion is only allowed if tid or cxt is different */ + TG_ASSERT((orig->tid != TG_(current_tid)) || (orig->cxt != cxt)); + + bbcc->rec_index = 0; + bbcc->cxt = cxt; + bbcc->rec_array = new_recursion(cxt->fn[0]->separate_recursions); + bbcc->rec_array[0] = bbcc; + + insert_bbcc_into_hash(bbcc); + } else { + if (TG_(clo).separate_threads) + TG_ASSERT(orig->tid == TG_(current_tid)); + + TG_ASSERT(orig->cxt == cxt); + TG_ASSERT(orig->rec_array); + TG_ASSERT(cxt->fn[0]->separate_recursions > rec_index); + TG_ASSERT(orig->rec_array[rec_index] == 0); + + /* new BBCC will only have differing recursion level */ + bbcc->rec_index = rec_index; + bbcc->cxt = cxt; + bbcc->rec_array = orig->rec_array; + bbcc->rec_array[rec_index] = bbcc; + } + + /* update list of BBCCs for same BB */ + bbcc->next_bbcc = orig->bb->bbcc_list; + orig->bb->bbcc_list = bbcc; + + TG_DEBUGIF(3) + TG_(print_bbcc)(-2, bbcc); + + HChar* mangled_orig = mangled_cxt(orig->cxt, orig->rec_index); + HChar* mangled_bbcc = mangled_cxt(bbcc->cxt, bbcc->rec_index); + TG_DEBUG(2, + "- clone_BBCC(%p, %d) for BB %#lx\n" + " orig %s\n" + " new %s\n", + orig, rec_index, bb_addr(orig->bb), mangled_orig, mangled_bbcc); + TG_FREE(mangled_orig); + TG_FREE(mangled_bbcc); + + TG_(stat).bbcc_clones++; + + return bbcc; +}; + +/* Get a pointer to the cost centre structure for given basic block + * address. If created, the BBCC is inserted into the BBCC hash. + * Also sets BB_seen_before by reference. + * + */ +BBCC* TG_(get_bbcc)(BB* bb) +{ + BBCC* bbcc; + + TG_DEBUG(3, "+ get_bbcc(BB %#lx)\n", bb_addr(bb)); + + bbcc = bb->bbcc_list; + + if (!bbcc) { + bbcc = new_bbcc(bb); + + /* initialize BBCC */ + bbcc->cxt = 0; + bbcc->rec_array = 0; + bbcc->rec_index = 0; + + bbcc->next_bbcc = bb->bbcc_list; + bb->bbcc_list = bbcc; + bb->last_bbcc = bbcc; + + TG_DEBUGIF(3) + TG_(print_bbcc)(-2, bbcc); + } + + TG_DEBUG(3, "- get_bbcc(BB %#lx): BBCC %p\n", bb_addr(bb), bbcc); + + return bbcc; +} + +/* Tracegrind manages its own call stack for each thread. + * When leaving a function, a underflow can happen when + * Tracegrind's tracing was switched on in the middle of + * a run, i.e. 
when Tracegrind was not able to trace the + * call instruction. + * This function tries to reconstruct the original call. + * As we know the return address (the address following + * the CALL instruction), we can detect the function + * we return back to, but the original call site is unknown. + * We suppose a call site at return address - 1. + * (TODO: other heuristic: lookup info of instrumented BBs). + */ +static void handleUnderflow(BB* bb) +{ + /* RET at top of call stack */ + BBCC* source_bbcc; + BB* source_bb; + Bool seen_before; + fn_node* caller; + int fn_number; + unsigned* pactive; + call_entry* call_entry_up; + + TG_DEBUG(1, " Callstack underflow !\n"); + + /* we emulate an old call from the function we return to + * by using ( -1) */ + source_bb = TG_(get_bb)(bb_addr(bb) - 1, 0, &seen_before); + source_bbcc = TG_(get_bbcc)(source_bb); + + /* seen_before can be true if RET from a signal handler */ + if (!seen_before) { + source_bbcc->ecounter_sum = TG_(current_state).collect ? 1 : 0; + } else if (TG_(current_state).collect) + source_bbcc->ecounter_sum++; + + /* Force a new top context, will be set active by push_cxt() */ + TG_(current_fn_stack).top--; + TG_(current_state).cxt = 0; + caller = TG_(get_fn_node)(bb); + TG_(push_cxt)(caller); + + if (!seen_before) { + /* set rec array for source BBCC: this is at rec level 1 */ + source_bbcc->rec_array = new_recursion(caller->separate_recursions); + source_bbcc->rec_array[0] = source_bbcc; + + TG_ASSERT(source_bbcc->cxt == 0); + source_bbcc->cxt = TG_(current_state).cxt; + insert_bbcc_into_hash(source_bbcc); + } + TG_ASSERT(TG_(current_state).bbcc); + + /* correct active counts */ + fn_number = TG_(current_state).bbcc->cxt->fn[0]->number; + pactive = TG_(get_fn_entry)(fn_number); + (*pactive)--; + + /* This assertion is not correct for reentrant + * signal handlers */ + /* TG_ASSERT(*pactive == 0); */ + + TG_(current_state).nonskipped = 0; /* we didn't skip this function */ + /* back to current context */ + TG_(push_cxt)(TG_(current_state).bbcc->cxt->fn[0]); + TG_(push_call_stack) + (source_bbcc, 0, TG_(current_state).bbcc, (Addr)-1, False); + call_entry_up = + &(TG_(current_call_stack).entry[TG_(current_call_stack).sp - 1]); + /* assume this call is lasting since last dump or + * for a signal handler since it's call */ + if (TG_(current_state).sig == 0) + TG_(copy_cost) + (TG_(sets).full, call_entry_up->enter_cost, + TG_(get_current_thread)()->lastdump_cost); + else TG_(zero_cost)(TG_(sets).full, call_entry_up->enter_cost); +} + +/* + * Helper function called at start of each instrumented BB to setup + * pointer to costs for current thread/context/recursion level + */ + +VG_REGPARM(1) +void TG_(setup_bbcc)(BB* bb) +{ + BBCC * bbcc, *last_bbcc; + Bool call_emulation = False, delayed_push = False, skip = False; + Addr sp; + BB* last_bb; + ThreadId tid; + TgJumpKind jmpkind; + Bool isConditionalJump; + Int passed = 0, csp; + Bool ret_without_call = False; + Int popcount_on_return = 1; + + TG_DEBUG(3, "+ setup_bbcc(BB %#lx)\n", bb_addr(bb)); + + /* This is needed because thread switches can not reliable be tracked + * with callback TG_(run_thread) only: we have otherwise no way to get + * the thread ID after a signal handler returns. + * This could be removed again if that bug is fixed in Valgrind. + * This is in the hot path but hopefully not to costly. + */ + tid = VG_(get_running_tid)(); +#if 1 + /* TG_(switch_thread) is a no-op when tid is equal to TG_(current_tid). 
+ * As this is on the hot path, we only call TG_(switch_thread)(tid) + * if tid differs from the TG_(current_tid). + */ + if (UNLIKELY(tid != TG_(current_tid))) + TG_(switch_thread)(tid); +#else + TG_ASSERT(VG_(get_running_tid)() == TG_(current_tid)); +#endif + + sp = VG_(get_SP)(tid); + last_bbcc = TG_(current_state).bbcc; + last_bb = last_bbcc ? last_bbcc->bb : 0; + + if (last_bb) { + passed = TG_(current_state).jmps_passed; + TG_ASSERT(passed <= last_bb->cjmp_count); + jmpkind = last_bb->jmp[passed].jmpkind; + isConditionalJump = (passed < last_bb->cjmp_count); + + if (TG_(current_state).collect) { + if (!TG_(current_state).nonskipped) { + last_bbcc->ecounter_sum++; + last_bbcc->jmp[passed].ecounter++; + if (!TG_(clo).simulate_cache) { + /* update Ir cost */ + UInt instr_count = last_bb->jmp[passed].instr + 1; + TG_(current_state).cost[fullOffset(EG_IR)] += instr_count; + } + } else { + /* do not increment exe counter of BBs in skipped functions, as it + * would fool dumping code */ + if (!TG_(clo).simulate_cache) { + /* update Ir cost */ + UInt instr_count = last_bb->jmp[passed].instr + 1; + TG_(current_state).cost[fullOffset(EG_IR)] += instr_count; + TG_(current_state).nonskipped->skipped[fullOffset(EG_IR)] += + instr_count; + } + } + } + + TG_DEBUGIF(4) + { + TG_(print_execstate)(-2, &TG_(current_state)); + TG_(print_bbcc_cost)(-2, last_bbcc); + } + } else { + jmpkind = jk_None; + isConditionalJump = False; + } + + /* Manipulate JmpKind if needed, only using BB specific info */ + + csp = TG_(current_call_stack).sp; + + /* A return not matching the top call in our callstack is a jump */ + if ((jmpkind == jk_Return) && (csp > 0)) { + Int csp_up = csp - 1; + call_entry* top_ce = &(TG_(current_call_stack).entry[csp_up]); + + /* We have a real return if + * - the stack pointer (SP) left the current stack frame, or + * - SP has the same value as when reaching the current function + * and the address of this BB is the return address of last call + * (we even allow to leave multiple frames if the SP stays the + * same and we find a matching return address) + * The latter condition is needed because on PPC, SP can stay + * the same over CALL=b(c)l / RET=b(c)lr boundaries + */ + if (sp < top_ce->sp) + popcount_on_return = 0; + else if (top_ce->sp == sp) { + while (1) { + if (top_ce->ret_addr == bb_addr(bb)) + break; + if (csp_up > 0) { + csp_up--; + top_ce = &(TG_(current_call_stack).entry[csp_up]); + if (top_ce->sp == sp) { + popcount_on_return++; + continue; + } + } + popcount_on_return = 0; + break; + } + } + if (popcount_on_return == 0) { + jmpkind = jk_Jump; + ret_without_call = True; + } + } + + /* Should this jump be converted to call or pop/call ? */ + if ((jmpkind != jk_Return) && (jmpkind != jk_Call) && last_bb) { + + /* We simulate a JMP/Cont to be a CALL if + * - jump is in another ELF object or section kind + * - jump is to first instruction of a function (tail recursion) + */ + if (ret_without_call || + /* This is for detection of optimized tail recursion. + * On PPC, this is only detected as call when going to another + * function. The problem is that on PPC it can go wrong + * more easily (no stack frame setup needed) + */ +#if defined(VGA_ppc32) + (bb->is_entry && (last_bb->fn != bb->fn)) || +#else + bb->is_entry || +#endif + (last_bb->sect_kind != bb->sect_kind) || + (last_bb->obj->number != bb->obj->number)) { + + TG_DEBUG(1, " JMP: %s[%s] to %s[%s]%s!\n", last_bb->fn->name, + last_bb->obj->name, bb->fn->name, bb->obj->name, + ret_without_call ? 
" (RET w/o CALL)" : ""); + + if (TG_(get_fn_node)(last_bb)->pop_on_jump && (csp > 0)) { + + call_entry* top_ce = &(TG_(current_call_stack).entry[csp - 1]); + + if (top_ce->jcc) { + + TG_DEBUG(1, " Pop on Jump!\n"); + + /* change source for delayed push */ + TG_(current_state).bbcc = top_ce->jcc->from; + sp = top_ce->sp; + passed = top_ce->jcc->jmp; + TG_(pop_call_stack)(); + } else { + TG_ASSERT(TG_(current_state).nonskipped != 0); + } + } + + jmpkind = jk_Call; + call_emulation = True; + } + } + + if (jmpkind == jk_Call) { + fn_node* node = TG_(get_fn_node)(bb); + skip = node->skip; + } + + TG_DEBUGIF(1) + { + if (isConditionalJump) + VG_(printf)("Cond-"); + switch (jmpkind) { + case jk_None: + VG_(printf)("Fall-through"); + break; + case jk_Jump: + VG_(printf)("Jump"); + break; + case jk_Call: + VG_(printf)("Call"); + break; + case jk_Return: + VG_(printf)("Return"); + break; + default: + tl_assert(0); + } + VG_(printf)(" %08lx -> %08lx, SP %08lx\n", + last_bb ? bb_jmpaddr(last_bb) : 0, bb_addr(bb), sp); + } + + /* Handle CALL/RET and update context to get correct BBCC */ + + if (jmpkind == jk_Return) { + + if ((csp == 0) || + ((TG_(current_fn_stack).top > TG_(current_fn_stack).bottom) && + (*(TG_(current_fn_stack).top - 1) == 0))) { + + /* On an empty call stack or at a signal separation marker, + * a RETURN generates an call stack underflow. + */ + handleUnderflow(bb); + TG_(pop_call_stack)(); + } else { + TG_ASSERT(popcount_on_return > 0); + TG_(unwind_call_stack)(sp, popcount_on_return); + } + } else { + Int unwind_count = TG_(unwind_call_stack)(sp, 0); + if (unwind_count > 0) { + /* if unwinding was done, this actually is a return */ + jmpkind = jk_Return; + } + + if (jmpkind == jk_Call) { + delayed_push = True; + + csp = TG_(current_call_stack).sp; + if (call_emulation && csp > 0) + sp = TG_(current_call_stack).entry[csp - 1].sp; + } + } + + /* Change new context if needed, taking delayed_push into account */ + if ((delayed_push && !skip) || (TG_(current_state).cxt == 0)) { + TG_(push_cxt)(TG_(get_fn_node)(bb)); + } + TG_ASSERT(TG_(current_fn_stack).top > TG_(current_fn_stack).bottom); + + /* If there is a fresh instrumented BBCC, assign current context */ + bbcc = TG_(get_bbcc)(bb); + if (bbcc->cxt == 0) { + TG_ASSERT(bbcc->rec_array == 0); + + bbcc->cxt = TG_(current_state).cxt; + bbcc->rec_array = + new_recursion((*TG_(current_fn_stack).top)->separate_recursions); + bbcc->rec_array[0] = bbcc; + + insert_bbcc_into_hash(bbcc); + } else { + /* get BBCC with current context */ + + /* first check LRU of last bbcc executed */ + + if (last_bbcc) { + bbcc = last_bbcc->lru_next_bbcc; + if (bbcc && + ((bbcc->bb != bb) || (bbcc->cxt != TG_(current_state).cxt))) + bbcc = 0; + } else + bbcc = 0; + + if (!bbcc) + bbcc = lookup_bbcc(bb, TG_(current_state).cxt); + if (!bbcc) + bbcc = clone_bbcc(bb->bbcc_list, TG_(current_state).cxt, 0); + + bb->last_bbcc = bbcc; + } + + /* save for fast lookup */ + if (last_bbcc) + last_bbcc->lru_next_bbcc = bbcc; + + if ((*TG_(current_fn_stack).top)->separate_recursions > 1) { + UInt level, idx; + fn_node* top = *(TG_(current_fn_stack).top); + + level = *TG_(get_fn_entry)(top->number); + + if (delayed_push && !skip) { + level++; + } + if (level > top->separate_recursions) + level = top->separate_recursions; + + if (level == 0) { + /* can only happen if instrumentation just was switched on */ + level = 1; + *TG_(get_fn_entry)(top->number) = 1; + } + + idx = level - 1; + if (bbcc->rec_array[idx]) + bbcc = bbcc->rec_array[idx]; + else + bbcc = clone_bbcc(bbcc, 
TG_(current_state).cxt, idx); + + TG_ASSERT(bbcc->rec_array[bbcc->rec_index] == bbcc); + } + + if (delayed_push) { + if (!skip && TG_(current_state).nonskipped) { + /* a call from skipped to nonskipped */ + TG_(current_state).bbcc = TG_(current_state).nonskipped; + /* FIXME: take the real passed count from shadow stack */ + passed = TG_(current_state).bbcc->bb->cjmp_count; + } + TG_(push_call_stack)(TG_(current_state).bbcc, passed, bbcc, sp, skip); + } + + if (TG_(clo).collect_jumps && (jmpkind == jk_Jump)) { + + /* Handle conditional jumps followed, i.e. trace arcs + * This uses JCC structures, too */ + + jCC* jcc = TG_(get_jcc)(last_bbcc, passed, bbcc); + TG_ASSERT(jcc != 0); + // Change from default, and check if already changed + if (jcc->jmpkind == jk_Call) + jcc->jmpkind = isConditionalJump ? jk_CondJump : jk_Jump; + else { + // FIXME: Why can this fail? + // TG_ASSERT(jcc->jmpkind == jmpkind); + } + + jcc->call_counter++; + if (isConditionalJump) + TG_(stat).jcnd_counter++; + else + TG_(stat).jump_counter++; + } + + TG_(current_state).bbcc = bbcc; + + /* Check for inline function transitions */ + if (TG_(current_state).collect) { + thread_info* ti = TG_(get_current_thread)(); + if (ti) { + UInt old_depth = ti->cur_inl_depth; + UInt new_depth = bb->inl_depth; + + /* Fast path: both empty (most BBs) */ + if (old_depth != 0 || new_depth != 0) { + /* Find longest common prefix */ + UInt common = 0; + UInt min_depth = old_depth < new_depth ? old_depth : new_depth; + while (common < min_depth && + ti->cur_inl_fns[common] == bb->inl_fns[common]) + common++; + + /* EXIT from deepest down to common level */ + for (Int i = (Int)old_depth - 1; i >= (Int)common; i--) + TG_(trace_emit_exit_inlined) + (TG_(current_tid), bb, ti->cur_inl_fns[i]); + + /* ENTER from common level up to new deepest */ + for (UInt i = common; i < new_depth; i++) + TG_(trace_emit_enter_inlined) + (TG_(current_tid), bb, bb->inl_fns[i]); + + /* Update thread state */ + for (UInt i = 0; i < new_depth; i++) + ti->cur_inl_fns[i] = bb->inl_fns[i]; + ti->cur_inl_depth = new_depth; + } + } + } + + /* Even though this will be set in instrumented code directly before + * side exits, it needs to be set to 0 here in case an exception + * happens in first instructions of the BB */ + TG_(current_state).jmps_passed = 0; + // needed for log_* handlers called in this BB + TG_(bb_base) = bb->obj->offset + bb->offset; + TG_(cost_base) = bbcc->cost; + + TG_DEBUGIF(1) + { + VG_(printf)(" "); + TG_(print_bbcc_fn)(bbcc); + VG_(printf)("\n"); + } + + TG_DEBUG(3, "- setup_bbcc (BB %#lx): Cost %p (Len %u), Instrs %u (Len %u)\n", + bb_addr(bb), bbcc->cost, bb->cost_count, bb->instr_count, + bb->instr_len); + TG_DEBUGIF(3) + TG_(print_cxt)(-8, TG_(current_state).cxt, bbcc->rec_index); + TG_DEBUG(3, "\n"); + + TG_(stat).bb_executions++; +} diff --git a/tracegrind/callstack.c b/tracegrind/callstack.c new file mode 100644 index 000000000..1cf056a3f --- /dev/null +++ b/tracegrind/callstack.c @@ -0,0 +1,420 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- ct_callstack.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. 
+ + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "global.h" + +/*------------------------------------------------------------*/ +/*--- Call stack, operations ---*/ +/*------------------------------------------------------------*/ + +/* Stack of current thread. Gets initialized when switching to 1st thread. + * + * The artificial call stack is an array of call_entry's, representing + * stack frames of the executing program. + * Array call_stack and call_stack_esp have same size and grow on demand. + * Array call_stack_esp holds SPs of corresponding stack frames. + * + */ + +#define N_CALL_STACK_INITIAL_ENTRIES 500 + +call_stack TG_(current_call_stack); + +void TG_(init_call_stack)(call_stack* s) +{ + Int i; + + TG_ASSERT(s != 0); + + s->size = N_CALL_STACK_INITIAL_ENTRIES; + s->entry = (call_entry*)TG_MALLOC("cl.callstack.ics.1", + s->size * sizeof(call_entry)); + s->sp = 0; + s->entry[0].cxt = 0; /* for assertion in push_cxt() */ + + for (i = 0; i < s->size; i++) + s->entry[i].enter_cost = 0; +} + +call_entry* TG_(get_call_entry)(Int sp) +{ + TG_ASSERT(sp <= TG_(current_call_stack).sp); + return &(TG_(current_call_stack).entry[sp]); +} + +void TG_(copy_current_call_stack)(call_stack* dst) +{ + TG_ASSERT(dst != 0); + + dst->size = TG_(current_call_stack).size; + dst->entry = TG_(current_call_stack).entry; + dst->sp = TG_(current_call_stack).sp; +} + +void TG_(set_current_call_stack)(call_stack* s) +{ + TG_ASSERT(s != 0); + + TG_(current_call_stack).size = s->size; + TG_(current_call_stack).entry = s->entry; + TG_(current_call_stack).sp = s->sp; +} + +static __inline__ void ensure_stack_size(Int i) +{ + Int oldsize; + call_stack* cs = &TG_(current_call_stack); + + if (i < cs->size) + return; + + oldsize = cs->size; + cs->size *= 2; + while (i > cs->size) + cs->size *= 2; + + cs->entry = (call_entry*)VG_(realloc)("cl.callstack.ess.1", cs->entry, + cs->size * sizeof(call_entry)); + + for (i = oldsize; i < cs->size; i++) + cs->entry[i].enter_cost = 0; + + TG_(stat).call_stack_resizes++; + + TG_DEBUGIF(2) + VG_(printf)(" call stack enlarged to %u entries\n", + TG_(current_call_stack).size); +} + +/* Called when function entered nonrecursive */ +static void function_entered(fn_node* fn) +{ + TG_ASSERT(fn != 0); + +#if TG_ENABLE_DEBUG + if (fn->verbosity >= 0) { + Int old = TG_(clo).verbose; + TG_(clo).verbose = fn->verbosity; + fn->verbosity = old; + VG_(message)(Vg_DebugMsg, "Entering %s: Verbosity set to %d\n", fn->name, + TG_(clo).verbose); + } +#endif + + if (fn->toggle_collect) { + TG_(current_state).collect = !TG_(current_state).collect; + TG_DEBUG(2, " entering %s: toggled collection state to %s\n", fn->name, + TG_(current_state).collect ? 
"ON" : "OFF"); + } +} + +/* Called when function left (no recursive level active) */ +static void function_left(fn_node* fn) +{ + TG_ASSERT(fn != 0); + + if (fn->toggle_collect) { + TG_(current_state).collect = !TG_(current_state).collect; + TG_DEBUG(2, " leaving %s: toggled collection state to %s\n", fn->name, + TG_(current_state).collect ? "ON" : "OFF"); + } + +#if TG_ENABLE_DEBUG + if (fn->verbosity >= 0) { + Int old = TG_(clo).verbose; + TG_(clo).verbose = fn->verbosity; + fn->verbosity = old; + VG_(message)(Vg_DebugMsg, "Leaving %s: Verbosity set back to %d\n", + fn->name, TG_(clo).verbose); + } +#endif +} + +/* Push call on call stack. + * + * Increment the usage count for the function called. + * A jump from to , with . + * If is true, this is a call to a function to be skipped; + * for this, we set jcc = 0. + */ +void TG_(push_call_stack)(BBCC* from, UInt jmp, BBCC* to, Addr sp, Bool skip) +{ + jCC* jcc; + UInt* pdepth; + call_entry* current_entry; + Addr ret_addr; + + /* Ensure a call stack of size +1. + * The +1 is needed as push_cxt will store the + * context at [current_sp] + */ + ensure_stack_size(TG_(current_call_stack).sp + 1); + current_entry = &(TG_(current_call_stack).entry[TG_(current_call_stack).sp]); + + if (skip) { + jcc = 0; + } else { + fn_node* to_fn = to->cxt->fn[0]; + + if (TG_(current_state).nonskipped) { + /* this is a jmp from skipped to nonskipped */ + TG_ASSERT(TG_(current_state).nonskipped == from); + } + + /* As push_cxt() has to be called before push_call_stack if not + * skipping, the old context should already be saved on the stack */ + TG_ASSERT(current_entry->cxt != 0); + TG_(copy_cost_lz) + (TG_(sets).full, &(current_entry->enter_cost), TG_(current_state).cost); + + jcc = TG_(get_jcc)(from, jmp, to); + TG_ASSERT(jcc != 0); + + pdepth = TG_(get_fn_entry)(to_fn->number); + (*pdepth)++; + + if (*pdepth > 1) + TG_(stat).rec_call_counter++; + + jcc->call_counter++; + TG_(stat).call_counter++; + + if (*pdepth == 1) + function_entered(to_fn); + } + + /* return address is only is useful with a real call; + * used to detect RET w/o CALL */ + if (from->bb->jmp[jmp].jmpkind == jk_Call) { + UInt instr = from->bb->jmp[jmp].instr; + ret_addr = bb_addr(from->bb) + from->bb->instr[instr].instr_offset + + from->bb->instr[instr].instr_size; + } else + ret_addr = 0; + + /* put jcc on call stack */ + current_entry->jcc = jcc; + current_entry->sp = sp; + current_entry->ret_addr = ret_addr; + current_entry->nonskipped = TG_(current_state).nonskipped; + + TG_(current_call_stack).sp++; + + /* Emit trace sample on function entry */ + if (!skip && TG_(current_state).collect) { + /* Exit entire inline stack, deepest first */ + thread_info* ti = TG_(get_current_thread)(); + if (ti && ti->cur_inl_depth > 0 && TG_(current_state).bbcc) { + for (Int i = (Int)ti->cur_inl_depth - 1; i >= 0; i--) + TG_(trace_emit_exit_inlined) + (TG_(current_tid), TG_(current_state).bbcc->bb, ti->cur_inl_fns[i]); + ti->cur_inl_depth = 0; + } + fn_node* to_fn = to->cxt->fn[0]; + TG_(trace_emit_sample)(TG_(current_tid), True, to_fn); + } + + /* To allow for above assertion we set context of next frame to 0 */ + TG_ASSERT(TG_(current_call_stack).sp < TG_(current_call_stack).size); + current_entry++; + current_entry->cxt = 0; + + if (!skip) + TG_(current_state).nonskipped = 0; + else if (!TG_(current_state).nonskipped) { + /* a call from nonskipped to skipped */ + TG_(current_state).nonskipped = from; + if (!TG_(current_state).nonskipped->skipped) { + TG_(init_cost_lz) + (TG_(sets).full, 
&TG_(current_state).nonskipped->skipped); + TG_(stat).distinct_skips++; + } + } + +#if TG_ENABLE_DEBUG + TG_DEBUGIF(0) + { + if (TG_(clo).verbose < 2) { + if (jcc && jcc->to && jcc->to->bb) { + const HChar spaces[][41] = { + " . . . . . . . . . .", + " . . . . . . . . . . ", + " . . . . . . . . . . ", + ". . . . . . . . . . "}; + + int s = TG_(current_call_stack).sp; + UInt* pars = (UInt*)sp; + + BB* bb = jcc->to->bb; + if (s > 40) + s = 40; + VG_(printf)( + "%s> %s(0x%x, 0x%x, ...) [%s / %#lx]\n", spaces[s % 4] + 40 - s, + bb->fn->name, pars ? pars[1] : 0, pars ? pars[2] : 0, + bb->obj->name + bb->obj->last_slash_pos, (UWord)bb->offset); + } + } else if (TG_(clo).verbose < 4) { + VG_(printf)("+ %2d ", TG_(current_call_stack).sp); + TG_(print_short_jcc)(jcc); + VG_(printf)(", SP %#lx, RA %#lx\n", sp, ret_addr); + } else { + VG_(printf)(" Pushed "); + TG_(print_stackentry)(3, TG_(current_call_stack).sp - 1); + } + } +#endif +} + +/* Pop call stack and update inclusive sums. + * Returns modified fcc. + * + * If the JCC becomes inactive, call entries are freed if possible + */ +void TG_(pop_call_stack)(void) +{ + jCC* jcc; + Int depth = 0; + call_entry* lower_entry; + + if (TG_(current_state).sig > 0) { + /* Check if we leave a signal handler; this can happen when + * calling longjmp() in the handler */ + TG_(run_post_signal_on_call_stack_bottom)(); + } + + lower_entry = + &(TG_(current_call_stack).entry[TG_(current_call_stack).sp - 1]); + + TG_DEBUG(4, "+ pop_call_stack: frame %d, jcc %p\n", + TG_(current_call_stack).sp, lower_entry->jcc); + + /* jCC item not any more on real stack: pop */ + jcc = lower_entry->jcc; + TG_(current_state).nonskipped = lower_entry->nonskipped; + + if (jcc) { + fn_node* to_fn = jcc->to->cxt->fn[0]; + UInt* pdepth = TG_(get_fn_entry)(to_fn->number); + (*pdepth)--; + depth = *pdepth; + + /* add cost difference to sum */ + if (TG_(add_diff_cost_lz)(TG_(sets).full, &(jcc->cost), + lower_entry->enter_cost, + TG_(current_state).cost)) { + } + TG_(stat).ret_counter++; + + /* Emit trace sample on function exit */ + if (TG_(current_state).collect) { + /* Exit entire inline stack, deepest first */ + thread_info* ti = TG_(get_current_thread)(); + if (ti && ti->cur_inl_depth > 0 && TG_(current_state).bbcc) { + for (Int i = (Int)ti->cur_inl_depth - 1; i >= 0; i--) + TG_(trace_emit_exit_inlined) + (TG_(current_tid), TG_(current_state).bbcc->bb, ti->cur_inl_fns[i]); + ti->cur_inl_depth = 0; + } + TG_(trace_emit_sample)(TG_(current_tid), False, to_fn); + } + + /* restore context */ + TG_(current_state).cxt = lower_entry->cxt; + TG_(current_fn_stack).top = + TG_(current_fn_stack).bottom + lower_entry->fn_sp; + TG_ASSERT(TG_(current_state).cxt != 0); + + if (depth == 0) + function_left(to_fn); + } + + /* To allow for an assertion in push_call_stack() */ + lower_entry->cxt = 0; + + TG_(current_call_stack).sp--; + +#if TG_ENABLE_DEBUG + TG_DEBUGIF(1) + { + if (TG_(clo).verbose < 4) { + if (jcc) { + /* popped JCC target first */ + VG_(printf)("- %2d %#lx => ", TG_(current_call_stack).sp, + bb_addr(jcc->to->bb)); + TG_(print_addr)(bb_jmpaddr(jcc->from->bb)); + VG_(printf)( + ", SP %#lx\n", + TG_(current_call_stack).entry[TG_(current_call_stack).sp].sp); + TG_(print_cost)(10, TG_(sets).full, jcc->cost); + } else + VG_(printf)( + "- %2d [Skipped JCC], SP %#lx\n", TG_(current_call_stack).sp, + TG_(current_call_stack).entry[TG_(current_call_stack).sp].sp); + } else { + VG_(printf)(" Popped "); + TG_(print_stackentry)(7, TG_(current_call_stack).sp); + if (jcc) { + VG_(printf)(" returned to 
"); + TG_(print_addr_ln)(bb_jmpaddr(jcc->from->bb)); + } + } + } +#endif +} + +/* Unwind enough CallStack items to sync with current stack pointer. + * Returns the number of stack frames unwinded. + */ +Int TG_(unwind_call_stack)(Addr sp, Int minpops) +{ + Int csp; + Int unwind_count = 0; + TG_DEBUG(4, "+ unwind_call_stack(sp %#lx, minpops %d): frame %d\n", sp, + minpops, TG_(current_call_stack).sp); + + /* We pop old stack frames. + * For a call, be p the stack address with return address. + * - call_stack_esp[] has SP after the CALL: p-4 + * - current sp is after a RET: >= p + */ + + while ((csp = TG_(current_call_stack).sp) > 0) { + call_entry* top_ce = &(TG_(current_call_stack).entry[csp - 1]); + + if ((top_ce->sp < sp) || ((top_ce->sp == sp) && minpops > 0)) { + + minpops--; + unwind_count++; + TG_(pop_call_stack)(); + csp = TG_(current_call_stack).sp; + continue; + } + break; + } + + TG_DEBUG(4, "- unwind_call_stack\n"); + return unwind_count; +} diff --git a/tracegrind/clo.c b/tracegrind/clo.c new file mode 100644 index 000000000..5bfa108fa --- /dev/null +++ b/tracegrind/clo.c @@ -0,0 +1,613 @@ +/* + This file is part of Tracegrind, a Valgrind tool for call graph + profiling programs. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This tool is derived from and contains lot of code from Cachegrind + Copyright (C) 2002-2017 Nicholas Nethercote (njn@valgrind.org) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "config.h" // for VG_PREFIX + +#include "global.h" + +/*------------------------------------------------------------*/ +/*--- Function specific configuration options ---*/ +/*------------------------------------------------------------*/ + +/* Special value for separate_callers: automatic = adaptive */ +#define CONFIG_AUTO -1 + +#define CONFIG_DEFAULT -1 +#define CONFIG_FALSE 0 +#define CONFIG_TRUE 1 + +/* Logging configuration for a function */ +struct _fn_config { + Int toggle_collect; + + Int group; /* don't change caller dependency inside group !=0 */ + Int pop_on_jump; + + Int separate_callers; /* separate logging dependent on caller */ + Int separate_recursions; /* separate logging of rec. levels */ + +#if TG_ENABLE_DEBUG + Int verbosity; /* Change debug verbosity level while in function */ +#endif +}; + +/* Configurations for function name prefix patterns. + * Currently, only very limit patterns are possible: + * Exact prefix patterns and "*::" are allowed. + * E.g. + * - "abc" matches all functions starting with "abc". + * - "abc*::def" matches all functions starting with "abc" and + * starting with "def" after the first "::" separator. + * - "*::print(" matches C++ methods "print" in all classes + * without namespace. I.e. "*" doesn't match a "::". + * + * We build a trie from patterns, and for a given function, we + * go down the tree and apply all non-default configurations. 
+ */ + +#define NODE_DEGREE 30 + +/* node of compressed trie search structure */ +typedef struct _config_node config_node; +struct _config_node { + Int length; + + fn_config* config; + config_node* sub_node[NODE_DEGREE]; + config_node* next; + config_node* wild_star; + config_node* wild_char; + + HChar name[1]; +}; + +/* root of trie */ +static config_node* fn_configs = 0; + +static __inline__ fn_config* new_fnc(void) +{ + fn_config* fnc = (fn_config*)TG_MALLOC("cl.clo.nf.1", sizeof(fn_config)); + + fnc->toggle_collect = CONFIG_DEFAULT; + fnc->pop_on_jump = CONFIG_DEFAULT; + fnc->group = CONFIG_DEFAULT; + fnc->separate_callers = CONFIG_DEFAULT; + fnc->separate_recursions = CONFIG_DEFAULT; + +#if TG_ENABLE_DEBUG + fnc->verbosity = CONFIG_DEFAULT; +#endif + + return fnc; +} + +static config_node* new_config(const HChar* name, int length) +{ + int i; + config_node* node = + (config_node*)TG_MALLOC("cl.clo.nc.1", sizeof(config_node) + length); + + for (i = 0; i < length; i++) { + if (name[i] == 0) + break; + node->name[i] = name[i]; + } + node->name[i] = 0; + + node->length = length; + node->config = 0; + for (i = 0; i < NODE_DEGREE; i++) + node->sub_node[i] = 0; + node->next = 0; + node->wild_char = 0; + node->wild_star = 0; + + TG_DEBUG(3, " new_config('%s', len %d)\n", node->name, length); + + return node; +} + +static __inline__ Bool is_wild(HChar n) { return (n == '*') || (n == '?'); } + +/* Recursively build up function matching tree (prefix tree). + * Returns function config object for pattern + * and starting at tree node <*pnode>. + * + * Tree nodes (config_node) are created as needed, + * tree root is stored into <*pnode>, and the created + * leaf (fn_config) for the given pattern is returned. + */ +static fn_config* get_fnc2(config_node* node, const HChar* name) +{ + config_node *new_sub, *n, *nprev; + int offset, len; + + TG_DEBUG(3, " get_fnc2(%p, '%s')\n", node, name); + + if (name[0] == 0) { + if (!node->config) + node->config = new_fnc(); + return node->config; + } + + if (is_wild(*name)) { + if (*name == '*') { + while (name[1] == '*') + name++; + new_sub = node->wild_star; + } else + new_sub = node->wild_char; + + if (!new_sub) { + new_sub = new_config(name, 1); + if (*name == '*') + node->wild_star = new_sub; + else + node->wild_char = new_sub; + } + + return get_fnc2(new_sub, name + 1); + } + + n = node->sub_node[name[0] % NODE_DEGREE]; + nprev = 0; + len = 0; + while (n) { + for (len = 0; name[len] == n->name[len]; len++) + ; + if (len > 0) + break; + nprev = n; + n = n->next; + } + + if (!n) { + len = 1; + while (name[len] && (!is_wild(name[len]))) + len++; + new_sub = new_config(name, len); + new_sub->next = node->sub_node[name[0] % NODE_DEGREE]; + node->sub_node[name[0] % NODE_DEGREE] = new_sub; + + if (name[len] == 0) { + new_sub->config = new_fnc(); + return new_sub->config; + } + + /* recurse on wildcard */ + return get_fnc2(new_sub, name + len); + } + + if (len < n->length) { + + /* split up the subnode */ + config_node* new_node; + int i; + + new_node = new_config(n->name, len); + if (nprev) + nprev->next = new_node; + else + node->sub_node[n->name[0] % NODE_DEGREE] = new_node; + new_node->next = n->next; + + new_node->sub_node[n->name[len] % NODE_DEGREE] = n; + + for (i = 0, offset = len; offset < n->length; i++, offset++) + n->name[i] = n->name[offset]; + n->name[i] = 0; + n->length = i; + + name += len; + offset = 0; + while (name[offset] && (!is_wild(name[offset]))) + offset++; + new_sub = new_config(name, offset); + /* this sub_node of new_node could already 
be set: chain! */ + new_sub->next = new_node->sub_node[name[0] % NODE_DEGREE]; + new_node->sub_node[name[0] % NODE_DEGREE] = new_sub; + + if (name[offset] == 0) { + new_sub->config = new_fnc(); + return new_sub->config; + } + + /* recurse on wildcard */ + return get_fnc2(new_sub, name + offset); + } + + name += n->length; + + if (name[0] == 0) { + /* name and node name are the same */ + if (!n->config) + n->config = new_fnc(); + return n->config; + } + + offset = 1; + while (name[offset] && (!is_wild(name[offset]))) + offset++; + + new_sub = new_config(name, offset); + new_sub->next = n->sub_node[name[0] % NODE_DEGREE]; + n->sub_node[name[0] % NODE_DEGREE] = new_sub; + + return get_fnc2(new_sub, name + offset); +} + +static void print_config_node(int depth, int hash, config_node* node) +{ + config_node* n; + int i; + + if (node != fn_configs) { + const HChar sp[] = " "; + + if (depth > 40) + depth = 40; + VG_(printf)("%s", sp + 40 - depth); + if (hash >= 0) + VG_(printf)(" [hash %2d]", hash); + else if (hash == -2) + VG_(printf)(" [wildc ?]"); + else if (hash == -3) + VG_(printf)(" [wildc *]"); + VG_(printf)(" '%s' (len %d)\n", node->name, node->length); + } + for (i = 0; i < NODE_DEGREE; i++) { + n = node->sub_node[i]; + while (n) { + print_config_node(depth + 1, i, n); + n = n->next; + } + } + if (node->wild_char) + print_config_node(depth + 1, -2, node->wild_char); + if (node->wild_star) + print_config_node(depth + 1, -3, node->wild_star); +} + +/* get a function config for a name pattern (from command line) */ +static fn_config* get_fnc(const HChar* name) +{ + fn_config* fnc; + + TG_DEBUG(3, " +get_fnc(%s)\n", name); + if (fn_configs == 0) + fn_configs = new_config(name, 0); + fnc = get_fnc2(fn_configs, name); + + TG_DEBUGIF(3) + { + TG_DEBUG(3, " -get_fnc(%s):\n", name); + print_config_node(3, -1, fn_configs); + } + return fnc; +} + +static void update_fn_config1(fn_node* fn, fn_config* fnc) +{ + if (fnc->toggle_collect != CONFIG_DEFAULT) + fn->toggle_collect = (fnc->toggle_collect == CONFIG_TRUE); + + if (fnc->pop_on_jump != CONFIG_DEFAULT) + fn->pop_on_jump = (fnc->pop_on_jump == CONFIG_TRUE); + + if (fnc->group != CONFIG_DEFAULT) + fn->group = fnc->group; + + if (fnc->separate_callers != CONFIG_DEFAULT) + fn->separate_callers = fnc->separate_callers; + + if (fnc->separate_recursions != CONFIG_DEFAULT) + fn->separate_recursions = fnc->separate_recursions; + +#if TG_ENABLE_DEBUG + if (fnc->verbosity != CONFIG_DEFAULT) + fn->verbosity = fnc->verbosity; +#endif +} + +/* Recursively go down the function matching tree, + * looking for a match to . For every matching leaf, + * is updated with the pattern config. 
+ */ +static void update_fn_config2(fn_node* fn, const HChar* name, config_node* node) +{ + config_node* n; + + TG_DEBUG(3, " update_fn_config2('%s', node '%s'): \n", name, node->name); + if ((*name == 0) && node->config) { + TG_DEBUG(3, " found!\n"); + update_fn_config1(fn, node->config); + return; + } + + n = node->sub_node[name[0] % NODE_DEGREE]; + while (n) { + if (VG_(strncmp)(name, n->name, n->length) == 0) + break; + n = n->next; + } + if (n) { + TG_DEBUG(3, " '%s' matching at hash %d\n", n->name, + name[0] % NODE_DEGREE); + update_fn_config2(fn, name + n->length, n); + } + + if (node->wild_char) { + TG_DEBUG(3, " skip '%c' for wildcard '?'\n", *name); + update_fn_config2(fn, name + 1, node->wild_char); + } + + if (node->wild_star) { + TG_DEBUG(3, " wildcard '*'\n"); + while (*name) { + update_fn_config2(fn, name, node->wild_star); + name++; + } + update_fn_config2(fn, name, node->wild_star); + } +} + +/* Update function config according to configs of name prefixes */ +void TG_(update_fn_config)(fn_node* fn) +{ + TG_DEBUG(3, " update_fn_config('%s')\n", fn->name); + if (fn_configs) + update_fn_config2(fn, fn->name, fn_configs); +} + +/*--------------------------------------------------------------------*/ +/*--- Command line processing ---*/ +/*--------------------------------------------------------------------*/ + +Bool TG_(process_cmd_line_option)(const HChar* arg) +{ + const HChar* tmp_str; + + if VG_BOOL_CLO (arg, "--skip-plt", TG_(clo).skip_plt) { + } + + else if VG_BOOL_CLO (arg, "--collect-jumps", TG_(clo).collect_jumps) { + } + /* compatibility alias, deprecated option */ + else if VG_BOOL_CLO (arg, "--trace-jump", TG_(clo).collect_jumps) { + } + + else if VG_BOOL_CLO (arg, "--collect-atstart", TG_(clo).collect_atstart) { + } + + else if VG_BOOL_CLO (arg, "--instr-atstart", TG_(clo).instrument_atstart) { + } + + else if VG_BOOL_CLO (arg, "--separate-threads", TG_(clo).separate_threads) { + } + + else if VG_STR_CLO (arg, "--toggle-collect", tmp_str) { + fn_config* fnc = get_fnc(tmp_str); + fnc->toggle_collect = CONFIG_TRUE; + /* defaults to initial collection off */ + TG_(clo).collect_atstart = False; + } + + else if VG_INT_CLO (arg, "--separate-recs", TG_(clo).separate_recursions) { + } + + /* change handling of a jump between functions to ret+call */ + else if VG_XACT_CLO (arg, "--pop-on-jump", TG_(clo).pop_on_jump, True) { + } else if VG_STR_CLO (arg, "--pop-on-jump", tmp_str) { + fn_config* fnc = get_fnc(tmp_str); + fnc->pop_on_jump = CONFIG_TRUE; + } + +#if TG_ENABLE_DEBUG + else if VG_INT_CLO (arg, "--ct-verbose", TG_(clo).verbose) { + } else if VG_INT_CLO (arg, "--ct-vstart", TG_(clo).verbose_start) { + } + + else if VG_STREQN (12, arg, "--ct-verbose") { + fn_config* fnc; + HChar* s; + UInt n = VG_(strtoll10)(arg + 12, &s); + if ((n <= 0) || *s != '=') + return False; + fnc = get_fnc(s + 1); + fnc->verbosity = n; + } +#endif + + else if VG_XACT_CLO (arg, "--separate-callers=auto", + TG_(clo).separate_callers, CONFIG_AUTO) { + } else if VG_INT_CLO (arg, "--separate-callers", TG_(clo).separate_callers) { + } + + else if VG_STREQN (10, arg, "--fn-group") { + fn_config* fnc; + HChar* s; + UInt n = VG_(strtoll10)(arg + 10, &s); + if ((n <= 0) || *s != '=') + return False; + fnc = get_fnc(s + 1); + fnc->group = n; + } + + else if VG_STREQN (18, arg, "--separate-callers") { + fn_config* fnc; + HChar* s; + UInt n = VG_(strtoll10)(arg + 18, &s); + if ((n <= 0) || *s != '=') + return False; + fnc = get_fnc(s + 1); + fnc->separate_callers = n; + } + + else if VG_STREQN (15, arg, 
"--separate-recs") { + fn_config* fnc; + HChar* s; + UInt n = VG_(strtoll10)(arg + 15, &s); + if ((n <= 0) || *s != '=') + return False; + fnc = get_fnc(s + 1); + fnc->separate_recursions = n; + } + + else if VG_STR_CLO (arg, "--tracegrind-out-file", TG_(clo).out_format) { + } + + else if VG_XACT_CLO (arg, "--collect-systime=no", TG_(clo).collect_systime, + systime_no) { + } else if VG_XACT_CLO (arg, "--collect-systime=msec", + TG_(clo).collect_systime, systime_msec) { + } else if VG_XACT_CLO (arg, + "--collect-systime=yes", /* backward compatibility. */ + TG_(clo).collect_systime, systime_msec) { + } else if VG_XACT_CLO (arg, "--collect-systime=usec", + TG_(clo).collect_systime, systime_usec) { + } else if VG_XACT_CLO (arg, "--collect-systime=nsec", + TG_(clo).collect_systime, systime_nsec) { +#if defined(VGO_darwin) + VG_(fmsg_bad_option) + (arg, "--collect-systime=nsec not supported on darwin\n"); +#endif + } + + else if VG_BOOL_CLO (arg, "--collect-bus", TG_(clo).collect_bus) { + } + /* for option compatibility with cachegrind */ + else if VG_BOOL_CLO (arg, "--cache-sim", TG_(clo).simulate_cache) { + } + /* compatibility alias, deprecated option */ + else if VG_BOOL_CLO (arg, "--simulate-cache", TG_(clo).simulate_cache) { + } + /* for option compatibility with cachegrind */ + else if VG_BOOL_CLO (arg, "--branch-sim", TG_(clo).simulate_branch) { + } else { + Bool isCachesimOption = (*TG_(cachesim).parse_opt)(arg); + + /* cache simulator is used if a simulator option is given */ + if (isCachesimOption) + TG_(clo).simulate_cache = True; + + return isCachesimOption; + } + + return True; +} + +void TG_(print_usage)(void) +{ + VG_(printf)( + "\n output options:\n" + " --tracegrind-out-file= Output file name " + "[tracegrind.out.%%p.msgpack.lz4]\n" + + "\n data collection options:\n" + " --instr-atstart=no|yes Do instrumentation at tracegrind start " + "[yes]\n" + " --collect-atstart=no|yes Collect at process/thread start [yes]\n" + " --toggle-collect= Toggle collection on enter/leave " + "function\n" + " --collect-jumps=no|yes Collect jumps? [no]\n" + " --collect-bus=no|yes Collect global bus events? [no]\n" + " --collect-systime=no|yes|msec|usec|nsec Collect system call time " + "info? [no]\n" + " no Do not collect system call time info.\n" + " msec|yes Collect syscount, syscall elapsed time " + "(milli-seconds).\n" + " usec Collect syscount, syscall elapsed time " + "(micro-seconds).\n" + " nsec Collect syscount, syscall elapsed and syscall cpu " + "time (nano-seconds).\n" + + "\n cost entity separation options:\n" + " --separate-threads=no|yes Separate data per thread [no]\n" + " --separate-callers= Separate functions by call chain length " + "[0]\n" + " --separate-callers= Separate callers for function \n" + " --separate-recs= Separate function recursions up to level " + "[2]\n" + " --separate-recs= Separate recursions for function \n" + " --skip-plt=no|yes Ignore calls to/from PLT sections? 
[yes]\n" +#if TG_EXPERIMENTAL + " --fn-group= Put function into separation group \n" +#endif + "\n simulation options:\n" + " --branch-sim=no|yes Do branch prediction simulation [no]\n" + " --cache-sim=no|yes Do cache simulation [no]\n"); + + (*TG_(cachesim).print_opts)(); + + // VG_(printf)("\n" + // " For full tracegrind documentation, see\n" + // " "VG_PREFIX"/share/doc/tracegrind/html/tracegrind.html\n\n"); +} + +void TG_(print_debug_usage)(void) +{ + VG_(printf)( + +#if TG_ENABLE_DEBUG + " --ct-verbose= Verbosity of standard debug output [0]\n" + " --ct-vstart= Only be verbose after basic block [0]\n" + " --ct-verbose= Verbosity while in \n" +#else + " (none)\n" +#endif + + ); +} + +void TG_(set_clo_defaults)(void) +{ + /* Default values for command line arguments */ + + /* Output */ + TG_(clo).out_format = 0; + + /* Collection */ + TG_(clo).separate_threads = False; + TG_(clo).collect_atstart = True; + TG_(clo).collect_jumps = False; + TG_(clo).collect_systime = systime_no; + TG_(clo).collect_bus = False; + + TG_(clo).skip_plt = True; + TG_(clo).separate_callers = 0; + TG_(clo).separate_recursions = 2; + /* Instrumentation */ + TG_(clo).instrument_atstart = True; + TG_(clo).simulate_cache = False; + TG_(clo).simulate_branch = False; + + /* Call graph */ + TG_(clo).pop_on_jump = False; + +#if TG_ENABLE_DEBUG + TG_(clo).verbose = 0; + TG_(clo).verbose_start = 0; +#endif +} diff --git a/tracegrind/context.c b/tracegrind/context.c new file mode 100644 index 000000000..44fc16331 --- /dev/null +++ b/tracegrind/context.c @@ -0,0 +1,335 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- ct_context.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. 
+*/ + +#include "global.h" + +/*------------------------------------------------------------*/ +/*--- Context operations ---*/ +/*------------------------------------------------------------*/ + +#define N_FNSTACK_INITIAL_ENTRIES 500 +#define N_CXT_INITIAL_ENTRIES 2537 + +fn_stack TG_(current_fn_stack); + +void TG_(init_fn_stack)(fn_stack* s) +{ + TG_ASSERT(s != 0); + + s->size = N_FNSTACK_INITIAL_ENTRIES; + s->bottom = + (fn_node**)TG_MALLOC("cl.context.ifs.1", s->size * sizeof(fn_node*)); + s->top = s->bottom; + s->bottom[0] = 0; +} + +void TG_(copy_current_fn_stack)(fn_stack* dst) +{ + TG_ASSERT(dst != 0); + + dst->size = TG_(current_fn_stack).size; + dst->bottom = TG_(current_fn_stack).bottom; + dst->top = TG_(current_fn_stack).top; +} + +void TG_(set_current_fn_stack)(fn_stack* s) +{ + TG_ASSERT(s != 0); + + TG_(current_fn_stack).size = s->size; + TG_(current_fn_stack).bottom = s->bottom; + TG_(current_fn_stack).top = s->top; +} + +static cxt_hash cxts; + +void TG_(init_cxt_table)(void) +{ + Int i; + + cxts.size = N_CXT_INITIAL_ENTRIES; + cxts.entries = 0; + cxts.table = + (Context**)TG_MALLOC("cl.context.ict.1", cxts.size * sizeof(Context*)); + + for (i = 0; i < cxts.size; i++) + cxts.table[i] = 0; +} + +/* double size of cxt table */ +static void resize_cxt_table(void) +{ + UInt i, new_size, conflicts1 = 0, conflicts2 = 0; + Context **new_table, *curr, *next; + UInt new_idx; + + new_size = 2 * cxts.size + 3; + new_table = + (Context**)TG_MALLOC("cl.context.rct.1", new_size * sizeof(Context*)); + + for (i = 0; i < new_size; i++) + new_table[i] = NULL; + + for (i = 0; i < cxts.size; i++) { + if (cxts.table[i] == NULL) + continue; + + curr = cxts.table[i]; + while (NULL != curr) { + next = curr->next; + + new_idx = (UInt)(curr->hash % new_size); + + curr->next = new_table[new_idx]; + new_table[new_idx] = curr; + if (curr->next) { + conflicts1++; + if (curr->next->next) + conflicts2++; + } + + curr = next; + } + } + + VG_(free)(cxts.table); + + TG_DEBUG(0, "Resize Context Hash: %u => %u (entries %u, conflicts %u/%u)\n", + cxts.size, new_size, cxts.entries, conflicts1, conflicts2); + + cxts.size = new_size; + cxts.table = new_table; + TG_(stat).cxt_hash_resizes++; +} + +__inline__ static UWord cxt_hash_val(fn_node** fn, UInt size) +{ + UWord hash = 0; + UInt count = size; + while (*fn != 0) { + hash = (hash << 7) + (hash >> 25) + (UWord)(*fn); + fn--; + count--; + if (count == 0) + break; + } + return hash; +} + +__inline__ static Bool is_cxt(UWord hash, fn_node** fn, Context* cxt) +{ + int count; + fn_node** cxt_fn; + + if (hash != cxt->hash) + return False; + + count = cxt->size; + cxt_fn = &(cxt->fn[0]); + while ((*fn != 0) && (count > 0)) { + if (*cxt_fn != *fn) + return False; + fn--; + cxt_fn++; + count--; + } + return True; +} + +/** + * Allocate new Context structure + */ +static Context* new_cxt(fn_node** fn) +{ + Context* cxt; + UInt idx, offset; + UWord hash; + int size, recs; + fn_node* top_fn; + + TG_ASSERT(fn); + top_fn = *fn; + if (top_fn == 0) + return 0; + + size = top_fn->separate_callers + 1; + recs = top_fn->separate_recursions; + if (recs < 1) + recs = 1; + + /* check fill degree of context hash table and resize if needed (>80%) */ + cxts.entries++; + if (10 * cxts.entries / cxts.size > 8) + resize_cxt_table(); + + cxt = (Context*)TG_MALLOC("cl.context.nc.1", + sizeof(Context) + sizeof(fn_node*) * size); + + // hash value calculation similar to cxt_hash_val(), but additionally + // copying function pointers in one run + hash = 0; + offset = 0; + while (*fn != 0) { + 
hash = (hash << 7) + (hash >> 25) + (UWord)(*fn); + cxt->fn[offset] = *fn; + offset++; + fn--; + if (offset >= size) + break; + } + if (offset < size) + size = offset; + + cxt->size = size; + cxt->base_number = TG_(stat).context_counter; + cxt->hash = hash; + + TG_(stat).context_counter += recs; + TG_(stat).distinct_contexts++; + + /* insert into Context hash table */ + idx = (UInt)(hash % cxts.size); + cxt->next = cxts.table[idx]; + cxts.table[idx] = cxt; + +#if TG_ENABLE_DEBUG + TG_DEBUGIF(3) + { + VG_(printf)(" new_cxt ox%p: ", cxt); + TG_(print_cxt)(12, cxt, 0); + } +#endif + + return cxt; +} + +/* get the Context structure for current context */ +Context* TG_(get_cxt)(fn_node** fn) +{ + Context* cxt; + UInt size, idx; + UWord hash; + + TG_ASSERT(fn != 0); + if (*fn == 0) + return 0; + size = (*fn)->separate_callers + 1; + if (size <= 0) { + size = -size + 1; + } + + TG_DEBUG(5, "+ get_cxt(fn '%s'): size %u\n", (*fn)->name, size); + + hash = cxt_hash_val(fn, size); + + if (((cxt = (*fn)->last_cxt) != 0) && is_cxt(hash, fn, cxt)) { + TG_DEBUG(5, "- get_cxt: %p\n", cxt); + return cxt; + } + + TG_(stat).cxt_lru_misses++; + + idx = (UInt)(hash % cxts.size); + cxt = cxts.table[idx]; + + while (cxt) { + if (is_cxt(hash, fn, cxt)) + break; + cxt = cxt->next; + } + + if (!cxt) + cxt = new_cxt(fn); + + (*fn)->last_cxt = cxt; + + TG_DEBUG(5, "- get_cxt: %p\n", cxt); + + return cxt; +} + +/** + * Change execution context by calling a new function from current context + * Pushing 0x0 specifies a marker for a signal handler entry + */ +void TG_(push_cxt)(fn_node* fn) +{ + call_stack* cs = &TG_(current_call_stack); + Int fn_entries; + + TG_DEBUG(5, "+ push_cxt(fn '%s'): old ctx %d\n", fn ? fn->name : "0x0", + TG_(current_state).cxt ? (Int)TG_(current_state).cxt->base_number + : -1); + + /* save old context on stack (even if not changed at all!) */ + TG_ASSERT(cs->sp < cs->size); + TG_ASSERT(cs->entry[cs->sp].cxt == 0); + cs->entry[cs->sp].cxt = TG_(current_state).cxt; + cs->entry[cs->sp].fn_sp = + TG_(current_fn_stack).top - TG_(current_fn_stack).bottom; + + if (fn && (*(TG_(current_fn_stack).top) == fn)) + return; + if (fn && (fn->group > 0) && + ((*(TG_(current_fn_stack).top))->group == fn->group)) + return; + + /* resizing needed ? */ + fn_entries = TG_(current_fn_stack).top - TG_(current_fn_stack).bottom; + if (fn_entries == TG_(current_fn_stack).size - 1) { + UInt new_size = TG_(current_fn_stack).size * 2; + fn_node** new_array = + (fn_node**)TG_MALLOC("cl.context.pc.1", new_size * sizeof(fn_node*)); + int i; + for (i = 0; i < TG_(current_fn_stack).size; i++) + new_array[i] = TG_(current_fn_stack).bottom[i]; + VG_(free)(TG_(current_fn_stack).bottom); + TG_(current_fn_stack).top = new_array + fn_entries; + TG_(current_fn_stack).bottom = new_array; + + TG_DEBUG(0, "Resize Context Stack: %u => %u (pushing '%s')\n", + TG_(current_fn_stack).size, new_size, fn ? fn->name : "0x0"); + + TG_(current_fn_stack).size = new_size; + } + + if (fn && (*(TG_(current_fn_stack).top) == 0)) { + UInt* pactive; + + /* this is first function: increment its active count */ + pactive = TG_(get_fn_entry)(fn->number); + (*pactive)++; + } + + TG_(current_fn_stack).top++; + *(TG_(current_fn_stack).top) = fn; + TG_(current_state).cxt = TG_(get_cxt)(TG_(current_fn_stack).top); + + TG_DEBUG( + 5, "- push_cxt(fn '%s'): new cxt %d, fn_sp %ld\n", fn ? fn->name : "0x0", + TG_(current_state).cxt ? 
(Int)TG_(current_state).cxt->base_number : -1, + TG_(current_fn_stack).top - TG_(current_fn_stack).bottom + 0L); +} diff --git a/tracegrind/costs.c b/tracegrind/costs.c new file mode 100644 index 000000000..bc7cd41eb --- /dev/null +++ b/tracegrind/costs.c @@ -0,0 +1,68 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- ct_costs.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "global.h" + +#include "pub_tool_mallocfree.h" + +#define COSTCHUNK_SIZE 100000 + +UInt TG_(costarray_entries) = 0; +UInt TG_(costarray_chunks) = 0; +static CostChunk* cost_chunk_base = 0; +static CostChunk* cost_chunk_current = 0; + +ULong* TG_(get_costarray)(Int size) +{ + ULong* ptr; + + if (!cost_chunk_current || + (cost_chunk_current->size - cost_chunk_current->used < size)) { + CostChunk* cc = (CostChunk*)TG_MALLOC( + "cl.costs.gc.1", sizeof(CostChunk) + COSTCHUNK_SIZE * sizeof(ULong)); + TG_ASSERT(size < COSTCHUNK_SIZE); + + cc->size = COSTCHUNK_SIZE; + cc->used = 0; + cc->next = 0; + + if (cost_chunk_current) + cost_chunk_current->next = cc; + cost_chunk_current = cc; + + if (!cost_chunk_base) + cost_chunk_base = cc; + + TG_(costarray_chunks)++; + } + + ptr = &(cost_chunk_current->data[cost_chunk_current->used]); + cost_chunk_current->used += size; + + TG_(costarray_entries) += size; + + return ptr; +} diff --git a/tracegrind/costs.h b/tracegrind/costs.h new file mode 100644 index 000000000..2e51c344d --- /dev/null +++ b/tracegrind/costs.h @@ -0,0 +1,54 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind cost array interface. costs.h ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Valgrind, a dynamic binary instrumentation + framework. + + Copyright (C) 2004-2017 Josef Weidendorfer + josef.weidendorfer@gmx.de + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. 
+*/ + +#ifndef TG_COSTS +#define TG_COSTS + +#include "pub_tool_basics.h" + +#define TG_(str) VGAPPEND(vgTracegrind_, str) + +extern UInt TG_(costarray_entries); +extern UInt TG_(costarray_chunks); + +/* Array of 64bit costs. This is separated from other structs + * to support a dynamic number of costs for a cost item. + * Chunks are allocated on demand. + */ +typedef struct _CostChunk CostChunk; +struct _CostChunk { + Int size; + Int used; + CostChunk *next, *prev; + ULong data[0]; +}; + +/* Allocate a number of 64bit cost values. + * Typically used from ct_events.c */ +ULong* TG_(get_costarray)(Int size); + +#endif /* TG_COSTS */ diff --git a/tracegrind/debug.c b/tracegrind/debug.c new file mode 100644 index 000000000..fa8f876e2 --- /dev/null +++ b/tracegrind/debug.c @@ -0,0 +1,451 @@ +/* + This file is part of Tracegrind, a Valgrind tool for call graph + profiling programs. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This tool is derived from and contains lot of code from Cachegrind + Copyright (C) 2002-2017 Nicholas Nethercote (njn@valgrind.org) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "events.h" +#include "global.h" + +/* If debugging mode of, dummy functions are provided (see below) + */ +#if TG_ENABLE_DEBUG + +/*------------------------------------------------------------*/ +/*--- Debug output helpers ---*/ +/*------------------------------------------------------------*/ + +static void print_indent(int s) +{ + /* max of 40 spaces */ + const HChar sp[] = " "; + if (s > 40) + s = 40; + VG_(printf)("%s", sp + 40 - s); +} + +void TG_(print_bb)(int s, BB* bb) +{ + if (s < 0) { + s = -s; + print_indent(s); + } + + VG_(printf)("BB %#lx (Obj '%s')", bb_addr(bb), bb->obj->name); +} + +static void print_mangled_cxt(Context* cxt, int rec_index) +{ + int i; + + if (!cxt) + VG_(printf)("(none)"); + else { + VG_(printf)("%s", cxt->fn[0]->name); + if (rec_index > 0) + VG_(printf)("'%d", rec_index + 1); + for (i = 1; i < cxt->size; i++) + VG_(printf)("'%s", cxt->fn[i]->name); + } +} + +void TG_(print_cxt)(Int s, Context* cxt, int rec_index) +{ + if (s < 0) { + s = -s; + print_indent(s); + } + + if (cxt) { + UInt* pactive = TG_(get_fn_entry)(cxt->fn[0]->number); + TG_ASSERT(rec_index < cxt->fn[0]->separate_recursions); + + VG_(printf)("Cxt %u", cxt->base_number + rec_index); + if (*pactive > 0) + VG_(printf)(" [active=%u]", *pactive); + VG_(printf)(": "); + print_mangled_cxt(cxt, rec_index); + VG_(printf)("\n"); + } else + VG_(printf)("(no context)\n"); +} + +void TG_(print_execstate)(int s, exec_state* es) +{ + if (s < 0) { + s = -s; + print_indent(s); + } + + if (!es) { + VG_(printf)("ExecState 0x0\n"); + return; + } + + VG_(printf)( + "ExecState [Sig %d, collect %s, nonskipped %p]: jmps_passed %d\n", + es->sig, es->collect ? 
"yes" : "no", es->nonskipped, es->jmps_passed); +} + +void TG_(print_bbcc)(int s, BBCC* bbcc) +{ + BB* bb; + + if (s < 0) { + s = -s; + print_indent(s); + } + + if (!bbcc) { + VG_(printf)("BBCC 0x0\n"); + return; + } + + bb = bbcc->bb; + TG_ASSERT(bb != 0); + + VG_(printf)("%s +%#lx=%#lx, ", bb->obj->name + bb->obj->last_slash_pos, + (UWord)bb->offset, bb_addr(bb)); + TG_(print_cxt)(s + 8, bbcc->cxt, bbcc->rec_index); +} + +void TG_(print_eventset)(int s, EventSet* es) +{ + int i, j; + UInt mask; + EventGroup* eg; + + if (s < 0) { + s = -s; + print_indent(s); + } + + if (!es) { + VG_(printf)("(EventSet not set)\n"); + return; + } + + VG_(printf)("EventSet %u (%d groups, size %d):", es->mask, es->count, + es->size); + + if (es->count == 0) { + VG_(printf)("-\n"); + return; + } + + for (i = 0, mask = 1; i < MAX_EVENTGROUP_COUNT; i++, mask = mask << 1) { + if ((es->mask & mask) == 0) + continue; + eg = TG_(get_event_group)(i); + if (!eg) + continue; + VG_(printf)(" (%d: %s", i, eg->name[0]); + for (j = 1; j < eg->size; j++) + VG_(printf)(" %s", eg->name[j]); + VG_(printf)(")"); + } + VG_(printf)("\n"); +} + +void TG_(print_cost)(int s, EventSet* es, ULong* c) +{ + Int i, j, pos, off; + UInt mask; + EventGroup* eg; + + if (s < 0) { + s = -s; + print_indent(s); + } + + if (!es) { + VG_(printf)("Cost (Nothing, EventSet not set)\n"); + return; + } + if (!c) { + VG_(printf)("Cost (Null, EventSet %u)\n", es->mask); + return; + } + + if (es->size == 0) { + VG_(printf)("Cost (Nothing, EventSet with len 0)\n"); + return; + } + + pos = s; + pos += VG_(printf)("Cost [%p]: ", c); + off = 0; + for (i = 0, mask = 1; i < MAX_EVENTGROUP_COUNT; i++, mask = mask << 1) { + if ((es->mask & mask) == 0) + continue; + eg = TG_(get_event_group)(i); + if (!eg) + continue; + for (j = 0; j < eg->size; j++) { + + if (off > 0) { + if (pos > 70) { + VG_(printf)(",\n"); + print_indent(s + 5); + pos = s + 5; + } else + pos += VG_(printf)(", "); + } + + pos += VG_(printf)("%s %llu", eg->name[j], c[off++]); + } + } + VG_(printf)("\n"); +} + +void TG_(print_short_jcc)(jCC* jcc) +{ + if (jcc) + VG_(printf)("%#lx => %#lx [calls %llu/Ir %llu, Dr %llu, Dw %llu]", + bb_jmpaddr(jcc->from->bb), bb_addr(jcc->to->bb), + jcc->call_counter, + jcc->cost ? jcc->cost[fullOffset(EG_IR)] : 0, + jcc->cost ? jcc->cost[fullOffset(EG_DR)] : 0, + jcc->cost ? jcc->cost[fullOffset(EG_DW)] : 0); + else + VG_(printf)("[Skipped JCC]"); +} + +void TG_(print_jcc)(int s, jCC* jcc) +{ + if (s < 0) { + s = -s; + print_indent(s); + } + + if (!jcc) { + VG_(printf)("JCC to skipped function\n"); + return; + } + VG_(printf)("JCC %p from ", jcc); + TG_(print_bbcc)(s + 9, jcc->from); + print_indent(s + 4); + VG_(printf)("to "); + TG_(print_bbcc)(s + 9, jcc->to); + print_indent(s + 4); + VG_(printf)("Calls %llu\n", jcc->call_counter); + print_indent(s + 4); + TG_(print_cost)(s + 9, TG_(sets).full, jcc->cost); +} + +/* dump out the current call stack */ +void TG_(print_stackentry)(int s, int sp) +{ + call_entry* ce; + + if (s < 0) { + s = -s; + print_indent(s); + } + + ce = TG_(get_call_entry)(sp); + VG_(printf)("[%-2d] SP %#lx, RA %#lx", sp, ce->sp, ce->ret_addr); + if (ce->nonskipped) + VG_(printf)(" NonSkipped BB %#lx / %s", bb_addr(ce->nonskipped->bb), + ce->nonskipped->cxt->fn[0]->name); + VG_(printf)("\n"); + print_indent(s + 5); + TG_(print_jcc)(5, ce->jcc); +} + +/* debug output */ +#if 0 +static void print_call_stack() +{ + int c; + + VG_(printf)("Call Stack:\n"); + for(c=0;cbb), + (bbcc->bb->sect_kind == Vg_SectText) ? 
'T' + : (bbcc->bb->sect_kind == Vg_SectData) ? 'D' + : (bbcc->bb->sect_kind == Vg_SectBSS) ? 'B' + : (bbcc->bb->sect_kind == Vg_SectGOT) ? 'G' + : (bbcc->bb->sect_kind == Vg_SectPLT) ? 'P' + : 'U', + bbcc->cxt->base_number + bbcc->rec_index); + print_mangled_cxt(bbcc->cxt, bbcc->rec_index); + + obj = bbcc->cxt->fn[0]->file->obj; + if (obj->name[0]) + VG_(printf)(" %s", obj->name + obj->last_slash_pos); + + if (VG_(strcmp)(bbcc->cxt->fn[0]->file->name, "???") != 0) { + VG_(printf)(" %s", bbcc->cxt->fn[0]->file->name); + if ((bbcc->cxt->fn[0] == bbcc->bb->fn) && (bbcc->bb->line > 0)) + VG_(printf)(":%u", bbcc->bb->line); + } +} + +void TG_(print_bbcc_cost)(int s, BBCC* bbcc) +{ + BB* bb; + Int i, cjmpNo; + ULong ecounter; + + if (s < 0) { + s = -s; + print_indent(s); + } + + if (!bbcc) { + VG_(printf)("BBCC 0x0\n"); + return; + } + + bb = bbcc->bb; + TG_ASSERT(bb != 0); + + TG_(print_bbcc)(s, bbcc); + + ecounter = bbcc->ecounter_sum; + + print_indent(s + 2); + VG_(printf)("ECounter: sum %llu ", ecounter); + for (i = 0; i < bb->cjmp_count; i++) { + VG_(printf)("[%u]=%llu ", bb->jmp[i].instr, bbcc->jmp[i].ecounter); + } + VG_(printf)("\n"); + + cjmpNo = 0; + for (i = 0; i < bb->instr_count; i++) { + InstrInfo* ii = &(bb->instr[i]); + print_indent(s + 2); + VG_(printf)("[%2d] IOff %2u ecnt %3llu ", i, ii->instr_offset, ecounter); + TG_(print_cost)(s + 5, ii->eventset, bbcc->cost + ii->cost_offset); + + /* update execution counter */ + if (cjmpNo < bb->cjmp_count) + if (bb->jmp[cjmpNo].instr == i) { + ecounter -= bbcc->jmp[cjmpNo].ecounter; + cjmpNo++; + } + } +} + +/* dump out an address with source info if available */ +void TG_(print_addr)(Addr addr) +{ + const HChar *fn_buf, *fl_buf, *dir_buf; + const HChar* obj_name; + DebugInfo* di; + UInt ln, i = 0, opos = 0; + + if (addr == 0) { + VG_(printf)("%08lx", addr); + return; + } + + TG_(get_debug_info)(addr, &dir_buf, &fl_buf, &fn_buf, &ln, &di); + + if (VG_(strcmp)(fn_buf, "???") == 0) + VG_(printf)("%#lx", addr); + else + VG_(printf)("%#lx %s", addr, fn_buf); + + if (di) { + obj_name = VG_(DebugInfo_get_filename)(di); + if (obj_name) { + while (obj_name[i]) { + if (obj_name[i] == '/') + opos = i + 1; + i++; + } + if (obj_name[0]) + VG_(printf)(" %s", obj_name + opos); + } + } + + if (ln > 0) { + if (dir_buf[0]) + VG_(printf)(" (%s/%s:%u)", dir_buf, fl_buf, ln); + else + VG_(printf)(" (%s:%u)", fl_buf, ln); + } +} + +void TG_(print_addr_ln)(Addr addr) +{ + TG_(print_addr)(addr); + VG_(printf)("\n"); +} + +static ULong bb_written = 0; + +void TG_(print_bbno)(void) +{ + if (bb_written != TG_(stat).bb_executions) { + bb_written = TG_(stat).bb_executions; + VG_(printf)("BB# %llu\n", TG_(stat).bb_executions); + } +} + +void TG_(print_context)(void) +{ + BBCC* bbcc; + + TG_DEBUG(0, "In tid %u [%d] ", TG_(current_tid), TG_(current_call_stack).sp); + bbcc = TG_(current_state).bbcc; + print_mangled_cxt(TG_(current_state).cxt, bbcc ? 
bbcc->rec_index : 0); + VG_(printf)("\n"); +} + +void* TG_(malloc)(const HChar* cc, UWord s, const HChar* f) +{ + TG_DEBUG(3, "Malloc(%lu) in %s.\n", s, f); + return VG_(malloc)(cc, s); +} + +#else /* TG_ENABLE_DEBUG */ + +void TG_(print_bbno)(void) {} +void TG_(print_context)(void) {} +void TG_(print_jcc)(int s, jCC* jcc) {} +void TG_(print_bbcc)(int s, BBCC* bbcc) {} +void TG_(print_bbcc_fn)(BBCC* bbcc) {} +void TG_(print_cost)(int s, EventSet* es, ULong* cost) {} +void TG_(print_bb)(int s, BB* bb) {} +void TG_(print_cxt)(int s, Context* cxt, int rec_index) {} +void TG_(print_short_jcc)(jCC* jcc) {} +void TG_(print_stackentry)(int s, int sp) {} +void TG_(print_addr)(Addr addr) {} +void TG_(print_addr_ln)(Addr addr) {} + +#endif diff --git a/tracegrind/docs/tracegrind-msgpack-format.md b/tracegrind/docs/tracegrind-msgpack-format.md new file mode 100644 index 000000000..f6dabeb31 --- /dev/null +++ b/tracegrind/docs/tracegrind-msgpack-format.md @@ -0,0 +1,228 @@ +# Tracegrind MsgPack+LZ4 Output Format + +## Overview + +Tracegrind produces a binary trace file combining MsgPack serialization with LZ4 block compression. The default output file name is `tracegrind.out..msgpack.lz4`. + +## File Structure + +``` +┌─────────────────────────────────┐ +│ File Header (8 bytes) │ +├─────────────────────────────────┤ +│ Schema Chunk │ +├─────────────────────────────────┤ +│ Data Chunk 1..N │ +├─────────────────────────────────┤ +│ End Marker (8 bytes) │ +└─────────────────────────────────┘ +``` + +## File Header + +| Offset | Size | Field | Description | +|--------|------|---------|-------------| +| 0 | 4 | magic | ASCII `TGMP` (0x54 0x47 0x4D 0x50) | +| 4 | 4 | version | Format version, uint32 LE (currently 4) | + +## Chunk Format + +Each chunk (schema and data) has the same header: + +| Offset | Size | Field | Description | +|--------|------|-------------------|-------------| +| 0 | 4 | uncompressed_size | Size after decompression, uint32 LE | +| 4 | 4 | compressed_size | Size of LZ4 block, uint32 LE | +| 8 | N | data | LZ4 block-compressed MsgPack data | + +## Schema Chunk + +The first chunk contains a MsgPack map describing the discriminated union schema: + +```json +{ + "version": 4, + "format": "tracegrind-msgpack", + "creator": "valgrind-tracegrind", + "creator_version": "3.26.0.codspeed", + "event_schemas": { + "0": ["seq", "tid", "event", "marker"], + "1": ["seq", "tid", "event", "fn", "obj", "file", "line", "counters"], + "2": ["seq", "tid", "event", "fn", "obj", "file", "line", "counters"], + "3": ["seq", "tid", "event", "fn", "obj", "file", "line", "counters"], + "4": ["seq", "tid", "event", "fn", "obj", "file", "line", "counters"], + "5": ["seq", "tid", "event", "child_pid"], + "6": ["seq", "tid", "event", "child_tid"] + }, + "counters": ["Ir"], + "counter_units": { + "sysTime": "ns", + "sysCpuTime": "ns" + } +} +``` + +The `counters` array lists the dynamic counter column names (e.g. `["Ir"]` or `["Ir", "sysCount", "sysTime", "sysCpuTime"]`). Event schemas for types 1-4 use `"counters"` as a sentinel at index 7 to indicate that a sub-array of counter deltas appears at that position in data rows. 
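+
+As an illustration, the following minimal sketch (the helper name `row_to_dict`
+is ours, not part of the format) shows how a consumer could apply this schema
+to one already-decoded data row:
+
+```python
+# Sketch: map one decoded MsgPack row onto the schema chunk shown above.
+# `schema` is the decoded schema map, `row` is one decoded data-row array.
+def row_to_dict(schema: dict, row: list) -> dict:
+    fields = schema["event_schemas"][str(row[2])]  # event type is at index 2
+    out = {}
+    for name, value in zip(fields, row):
+        if name == "counters":
+            # Sentinel column: the value is a sub-array of counter deltas,
+            # ordered like the top-level "counters" list in the schema.
+            out[name] = dict(zip(schema["counters"], value))
+        else:
+            out[name] = value
+    return out
+
+# With the schema above (counters == ["Ir"]):
+#   row_to_dict(schema, [17, 1, 1, "main", "/usr/bin/app", "app.c", 0, [1234]])
+#   == {"seq": 17, "tid": 1, "event": 1, "fn": "main", "obj": "/usr/bin/app",
+#       "file": "app.c", "line": 0, "counters": {"Ir": 1234}}
+```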
+ +### Event Types + +| Type | Name | Description | +|------|--------|-------------| +| 0 | MARKER | Named marker | +| 1 | ENTER_FN | Function entry | +| 2 | EXIT_FN | Function exit | +| 3 | ENTER_INLINED_FN | Inlined function entry | +| 4 | EXIT_INLINED_FN | Inlined function exit | +| 5 | FORK | Child process created | +| 6 | THREAD_CREATE | New thread created | + +### Row Schemas + +**MARKER rows (event 0):** + +| Index | Name | Type | Description | +|-------|--------|--------|-------------| +| 0 | seq | uint64 | Sequence number | +| 1 | tid | int32 | Thread ID | +| 2 | event | int | 0 = MARKER | +| 3 | marker | string | Marker label | + +**ENTER_FN/EXIT_FN rows (event 1, 2):** + +| Index | Name | Type | Description | +|-------|----------|--------|-------------| +| 0 | seq | uint64 | Sequence number | +| 1 | tid | int32 | Thread ID | +| 2 | event | int | 1 = ENTER_FN, 2 = EXIT_FN | +| 3 | fn | string | Function name | +| 4 | obj | string | Shared object path | +| 5 | file | string | Source file path | +| 6 | line | int32 | Line number (0 if unknown) | +| 7 | counters | array | Counter deltas sub-array (Ir, Dr, Dw, etc.) | + +**ENTER_INLINED_FN/EXIT_INLINED_FN rows (event 3, 4):** + +Same schema as ENTER_FN/EXIT_FN rows. + +| Index | Name | Type | Description | +|-------|----------|--------|-------------| +| 0 | seq | uint64 | Sequence number | +| 1 | tid | int32 | Thread ID | +| 2 | event | int | 3 = ENTER_INLINED_FN, 4 = EXIT_INLINED_FN | +| 3 | fn | string | Function name | +| 4 | obj | string | Shared object path | +| 5 | file | string | Source file path | +| 6 | line | int32 | Line number (0 if unknown) | +| 7 | counters | array | Counter deltas sub-array (Ir, Dr, Dw, etc.) | + +**FORK rows (event 5):** + +| Index | Name | Type | Description | +|-------|-----------|--------|-------------| +| 0 | seq | uint64 | Sequence number | +| 1 | tid | int32 | Thread ID that called fork | +| 2 | event | int | 5 = FORK | +| 3 | child_pid | int32 | PID of the new child process | + +**THREAD_CREATE rows (event 6):** + +| Index | Name | Type | Description | +|-------|-----------|--------|-------------| +| 0 | seq | uint64 | Sequence number | +| 1 | tid | int32 | Thread ID that created the new thread | +| 2 | event | int | 6 = THREAD_CREATE | +| 3 | child_tid | int32 | Thread ID of the new child thread | + +### Event Counter Columns + +For ENTER_FN/EXIT_FN/ENTER_INLINED_FN/EXIT_INLINED_FN rows, event counters appear as a sub-array at index 7. The order of values in the sub-array corresponds to the top-level `counters` array in the schema. Which counters are present depends on Tracegrind options: + +`Ir`, `Dr`, `Dw`, `I1mr`, `D1mr`, `D1mw`, `ILmr`, `DLmr`, `DLmw`, `Bc`, `Bcm`, `Bi`, `Bim` + +### Counter Units + +The `counter_units` field is a map from event counter name to its unit string. Only time-based counters are listed; counters absent from the map are dimensionless. + +| `--collect-systime` | Entries in `counter_units` | +|---------------------|--------------------| +| `msec` | `"sysTime": "ms"` | +| `usec` | `"sysTime": "us"` | +| `nsec` | `"sysTime": "ns"`, `"sysCpuTime": "ns"` | + +When `--collect-systime` is not set, the `counter_units` map is empty. + +## Data Chunks + +Each data chunk contains concatenated MsgPack arrays. 
The row format depends on the event type (index 2): + +``` +[seq, tid, 0, marker] # MARKER +[seq, tid, 1, fn, obj, file, line, [delta_Ir, ...]] # ENTER_FN +[seq, tid, 2, fn, obj, file, line, [delta_Ir, ...]] # EXIT_FN +[seq, tid, 3, fn, obj, file, line, [delta_Ir, ...]] # ENTER_INLINED_FN +[seq, tid, 4, fn, obj, file, line, [delta_Ir, ...]] # EXIT_INLINED_FN +[seq, tid, 5, child_pid] # FORK +[seq, tid, 6, child_tid] # THREAD_CREATE +``` + +The reference implementation writes 4096 rows per chunk. + +## End Marker + +8 zero bytes (uncompressed_size = 0, compressed_size = 0). + +## Example: Reading in Python + +```python +import struct, lz4.block, msgpack + +def read_tracegrind(filepath): + with open(filepath, 'rb') as f: + assert f.read(4) == b'TGMP' + version = struct.unpack('. + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "config.h" +#include "global.h" +#include "tg_lz4.h" +#include "tg_msgpack.h" + +#include "pub_tool_libcfile.h" +#include "pub_tool_threadstate.h" + +/* Total reads/writes/misses sum over all threads. */ +FullCost TG_(total_cost) = 0; + +EventMapping* TG_(dumpmap) = 0; + +/* ================================================================== */ +/* === Trace output === */ +/* ================================================================== */ + +trace_output TG_(trace_out) = { + .fd = -1, .seq = 0, .initialized = False, .header_written = False}; + +/* ================================================================== */ +/* === MsgPack + LZ4 output === */ +/* ================================================================== */ + +#define MSGPACK_CHUNK_ROWS 4096 /* Rows per compressed chunk */ +#define MSGPACK_INITIAL_BUF (256 * 1024) /* Initial buffer size */ + +typedef struct { + msgpack_buffer buf; /* Buffer for serializing rows */ + UInt rows_in_chunk; /* Number of rows in current chunk */ + UInt n_event_cols; /* Number of dynamic event columns */ + const HChar** col_names; /* Column names (for header) */ + Int ncols; /* Total columns including events */ +} msgpack_state; + +static msgpack_state mp_state; + +/* Write a compressed chunk to the trace output */ +static void msgpack_flush_chunk(void) +{ + if (mp_state.rows_in_chunk == 0) + return; + if (TG_(trace_out).fd < 0) + return; + + /* Compress the msgpack data with zstd */ + SizeT src_size = mp_state.buf.size; + SizeT dst_capacity = tg_lz4_compress_bound(src_size); + UChar* compressed = VG_(malloc)("tg.mp.compress", dst_capacity); + + SizeT compressed_size = + tg_lz4_compress(compressed, dst_capacity, mp_state.buf.data, src_size); + + if (compressed_size == 0) { + /* Compression failed, write raw with size=0 marker */ + VG_(free)(compressed); + return; + } + + /* Write chunk header: 4 bytes uncompressed size, 4 bytes compressed size */ + UChar hdr[8]; + hdr[0] = (UChar)(src_size & 0xff); + hdr[1] = (UChar)((src_size >> 8) & 0xff); + hdr[2] = (UChar)((src_size >> 16) & 0xff); + hdr[3] = (UChar)((src_size >> 24) & 0xff); + hdr[4] = (UChar)(compressed_size & 0xff); + hdr[5] = (UChar)((compressed_size >> 8) & 0xff); + hdr[6] = (UChar)((compressed_size >> 16) & 0xff); + hdr[7] = (UChar)((compressed_size >> 24) & 0xff); + VG_(write)(TG_(trace_out).fd, hdr, 8); + + /* Write compressed data */ + VG_(write)(TG_(trace_out).fd, compressed, compressed_size); + + VG_(free)(compressed); + + /* Reset buffer for next chunk */ + msgpack_reset(&mp_state.buf); + mp_state.rows_in_chunk = 0; +} + +/* Write file header with schema metadata (discriminated union format) */ +static void 
msgpack_write_header(void) +{ + msgpack_buffer hdr; + msgpack_init(&hdr, 2048); + + /* Header is a map with metadata */ + msgpack_write_map_header(&hdr, 7); + + /* version */ + msgpack_write_key(&hdr, "version"); + msgpack_write_uint(&hdr, 4); + + /* format */ + msgpack_write_key(&hdr, "format"); + msgpack_write_str(&hdr, "tracegrind-msgpack", -1); + + /* creator */ + msgpack_write_key(&hdr, "creator"); + msgpack_write_str(&hdr, "valgrind-tracegrind", -1); + + /* creator_version */ + msgpack_write_key(&hdr, "creator_version"); + msgpack_write_str(&hdr, VERSION, -1); + + /* event_schemas - discriminated union: each event type has its own schema */ + msgpack_write_key(&hdr, "event_schemas"); + msgpack_write_map_header(&hdr, 7); /* 7 event types */ + + /* Event type 0 (MARKER) schema */ + msgpack_write_key(&hdr, "0"); + msgpack_write_array_header(&hdr, 4); + msgpack_write_str(&hdr, "seq", -1); + msgpack_write_str(&hdr, "tid", -1); + msgpack_write_str(&hdr, "event", -1); + msgpack_write_str(&hdr, "marker", -1); + + /* Event types 1-4: 7 fixed columns + "counters" sentinel */ + { + const HChar* ev_keys[] = {"1", "2", "3", "4"}; + Int k; + for (k = 0; k < 4; k++) { + msgpack_write_key(&hdr, ev_keys[k]); + msgpack_write_array_header(&hdr, 8); + msgpack_write_str(&hdr, "seq", -1); + msgpack_write_str(&hdr, "tid", -1); + msgpack_write_str(&hdr, "event", -1); + msgpack_write_str(&hdr, "fn", -1); + msgpack_write_str(&hdr, "obj", -1); + msgpack_write_str(&hdr, "file", -1); + msgpack_write_str(&hdr, "line", -1); + msgpack_write_str(&hdr, "counters", -1); + } + } + + /* Event type 5 (FORK) schema */ + msgpack_write_key(&hdr, "5"); + msgpack_write_array_header(&hdr, 4); + msgpack_write_str(&hdr, "seq", -1); + msgpack_write_str(&hdr, "tid", -1); + msgpack_write_str(&hdr, "event", -1); + msgpack_write_str(&hdr, "child_pid", -1); + + /* Event type 6 (THREAD_CREATE) schema */ + msgpack_write_key(&hdr, "6"); + msgpack_write_array_header(&hdr, 4); + msgpack_write_str(&hdr, "seq", -1); + msgpack_write_str(&hdr, "tid", -1); + msgpack_write_str(&hdr, "event", -1); + msgpack_write_str(&hdr, "child_tid", -1); + + /* counters - array of dynamic counter column names */ + msgpack_write_key(&hdr, "counters"); + msgpack_write_array_header(&hdr, mp_state.n_event_cols); + { + Int i; + for (i = 7; i < mp_state.ncols; i++) { + msgpack_write_str(&hdr, mp_state.col_names[i], -1); + } + } + + /* counter_units - map from counter name to unit string. + Following callgrind's convention: only time counters get units. 
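+     E.g. with --collect-systime=nsec the map becomes
+     {"sysTime": "ns", "sysCpuTime": "ns"}; with =msec or =usec only
+     "sysTime" is present; otherwise it stays empty.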
*/ + msgpack_write_key(&hdr, "counter_units"); + { + Int n_units = 0; + const HChar* unit_str = NULL; + switch (TG_(clo).collect_systime) { + case systime_no: + break; + case systime_msec: + unit_str = "ms"; + n_units = 1; + break; + case systime_usec: + unit_str = "us"; + n_units = 1; + break; + case systime_nsec: + unit_str = "ns"; + n_units = 2; + break; + } + msgpack_write_map_header(&hdr, n_units); + if (unit_str) { + msgpack_write_key(&hdr, "sysTime"); + msgpack_write_str(&hdr, unit_str, -1); + if (TG_(clo).collect_systime == systime_nsec) { + msgpack_write_key(&hdr, "sysCpuTime"); + msgpack_write_str(&hdr, unit_str, -1); + } + } + } + + /* Compress and write header chunk */ + SizeT src_size = hdr.size; + SizeT dst_capacity = tg_lz4_compress_bound(src_size); + UChar* compressed = VG_(malloc)("tg.mp.hdr", dst_capacity); + + SizeT compressed_size = + tg_lz4_compress(compressed, dst_capacity, hdr.data, src_size); + + /* Magic + version (8 bytes): "TGMP" + version(4) - version 4 */ + UChar magic[8] = {'T', 'G', 'M', 'P', 0x04, 0x00, 0x00, 0x00}; + VG_(write)(TG_(trace_out).fd, magic, 8); + + /* Header chunk size (4 bytes uncompressed, 4 bytes compressed) */ + UChar hdr_size[8]; + hdr_size[0] = (UChar)(src_size & 0xff); + hdr_size[1] = (UChar)((src_size >> 8) & 0xff); + hdr_size[2] = (UChar)((src_size >> 16) & 0xff); + hdr_size[3] = (UChar)((src_size >> 24) & 0xff); + hdr_size[4] = (UChar)(compressed_size & 0xff); + hdr_size[5] = (UChar)((compressed_size >> 8) & 0xff); + hdr_size[6] = (UChar)((compressed_size >> 16) & 0xff); + hdr_size[7] = (UChar)((compressed_size >> 24) & 0xff); + VG_(write)(TG_(trace_out).fd, hdr_size, 8); + + /* Compressed header data */ + VG_(write)(TG_(trace_out).fd, compressed, compressed_size); + + VG_(free)(compressed); + msgpack_free(&hdr); +} + +/* Initialize msgpack state with schema from event sets */ +static void msgpack_init_state(void) +{ + EventSet* es = TG_(sets).full; + Int g, i; + + /* Count dynamic event columns */ + Int n_events = 0; + for (g = 0; g < MAX_EVENTGROUP_COUNT; g++) { + if (!(es->mask & (1u << g))) + continue; + EventGroup* eg = TG_(get_event_group)(g); + if (!eg) + continue; + n_events += eg->size; + } + + mp_state.n_event_cols = n_events; + mp_state.ncols = 7 + n_events; /* 7 fixed + dynamic */ + + /* Allocate column names array */ + mp_state.col_names = + VG_(malloc)("tg.mp.cols", mp_state.ncols * sizeof(HChar*)); + + /* Fixed columns */ + mp_state.col_names[0] = "seq"; + mp_state.col_names[1] = "tid"; + mp_state.col_names[2] = "event"; + mp_state.col_names[3] = "fn"; + mp_state.col_names[4] = "obj"; + mp_state.col_names[5] = "file"; + mp_state.col_names[6] = "line"; + + /* Dynamic event columns */ + Int c = 7; + for (g = 0; g < MAX_EVENTGROUP_COUNT; g++) { + if (!(es->mask & (1u << g))) + continue; + EventGroup* eg = TG_(get_event_group)(g); + if (!eg) + continue; + for (i = 0; i < eg->size; i++) { + mp_state.col_names[c++] = eg->name[i]; + } + } + + /* Initialize buffer */ + msgpack_init(&mp_state.buf, MSGPACK_INITIAL_BUF); + mp_state.rows_in_chunk = 0; + + /* Write file header */ + msgpack_write_header(); +} + +/* Add an ENTER/EXIT row to the msgpack output */ +static void msgpack_add_row(ULong seq, + Int tid, + Int event, + const HChar* fn_name, + Int fn_len, + const HChar* obj_name, + Int obj_len, + const HChar* file_name, + Int file_len, + Int line, + const ULong* deltas, + Int n_deltas) +{ + /* Each row is a msgpack array: 7 fixed + 1 counters sub-array */ + msgpack_write_array_header(&mp_state.buf, 8); + + /* Fixed columns */ + 
msgpack_write_uint(&mp_state.buf, seq); + msgpack_write_int(&mp_state.buf, tid); + msgpack_write_int(&mp_state.buf, event); + msgpack_write_str(&mp_state.buf, fn_name, fn_len); + msgpack_write_str(&mp_state.buf, obj_name, obj_len); + msgpack_write_str(&mp_state.buf, file_name, file_len); + msgpack_write_int(&mp_state.buf, line); + + /* Counters sub-array */ + msgpack_write_array_header(&mp_state.buf, n_deltas); + for (Int i = 0; i < n_deltas; i++) { + msgpack_write_uint(&mp_state.buf, deltas[i]); + } + + mp_state.rows_in_chunk++; + + /* Flush if chunk is full */ + if (mp_state.rows_in_chunk >= MSGPACK_CHUNK_ROWS) { + msgpack_flush_chunk(); + } +} + +/* Add a FORK row to the msgpack output (minimal schema: seq, tid, event, + * child_pid) */ +static void msgpack_add_fork_row(ULong seq, Int tid, Int child_pid) +{ + /* FORK row is a 4-element array */ + msgpack_write_array_header(&mp_state.buf, 4); + msgpack_write_uint(&mp_state.buf, seq); + msgpack_write_int(&mp_state.buf, tid); + msgpack_write_int(&mp_state.buf, TG_EV_FORK); + msgpack_write_int(&mp_state.buf, child_pid); + + mp_state.rows_in_chunk++; + + /* Flush if chunk is full */ + if (mp_state.rows_in_chunk >= MSGPACK_CHUNK_ROWS) { + msgpack_flush_chunk(); + } +} + +/* Add a THREAD_CREATE row to the msgpack output (seq, tid, event, child_tid) */ +static void msgpack_add_thread_create_row(ULong seq, Int tid, Int child_tid) +{ + msgpack_write_array_header(&mp_state.buf, 4); + msgpack_write_uint(&mp_state.buf, seq); + msgpack_write_int(&mp_state.buf, tid); + msgpack_write_int(&mp_state.buf, TG_EV_THREAD_CREATE); + msgpack_write_int(&mp_state.buf, child_tid); + + mp_state.rows_in_chunk++; + + if (mp_state.rows_in_chunk >= MSGPACK_CHUNK_ROWS) { + msgpack_flush_chunk(); + } +} + +/* Add a MARKER row to the msgpack output (seq, tid, event, marker_str) */ +static void msgpack_add_marker_row(ULong seq, Int tid, const HChar* marker) +{ + msgpack_write_array_header(&mp_state.buf, 4); + msgpack_write_uint(&mp_state.buf, seq); + msgpack_write_int(&mp_state.buf, tid); + msgpack_write_int(&mp_state.buf, TG_EV_MARKER); + msgpack_write_str(&mp_state.buf, marker, -1); + + mp_state.rows_in_chunk++; + + if (mp_state.rows_in_chunk >= MSGPACK_CHUNK_ROWS) { + msgpack_flush_chunk(); + } +} + +/* Close msgpack output */ +static void msgpack_close_output(void) +{ + /* Flush any remaining rows */ + msgpack_flush_chunk(); + + /* Write end marker (zero-size chunk) */ + UChar end[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + VG_(write)(TG_(trace_out).fd, end, 8); + + /* Cleanup */ + msgpack_free(&mp_state.buf); + if (mp_state.col_names) { + VG_(free)(mp_state.col_names); + mp_state.col_names = NULL; + } +} + +void TG_(trace_open_output)(void) +{ + SysRes res; + HChar filename[512]; + + if (TG_(trace_out).initialized) + return; + + if (!TG_(clo).out_format) + TG_(clo).out_format = DEFAULT_OUTFORMAT; + + HChar* expanded = + VG_(expand_file_name)("--tracegrind-out-file", TG_(clo).out_format); + VG_(strncpy)(filename, expanded, sizeof(filename) - 1); + filename[sizeof(filename) - 1] = '\0'; + VG_(free)(expanded); + + res = VG_(open)(filename, VKI_O_CREAT | VKI_O_WRONLY | VKI_O_TRUNC, + VKI_S_IRUSR | VKI_S_IWUSR); + if (sr_isError(res)) { + VG_(message)(Vg_UserMsg, "Error: cannot open trace output file '%s'\n", + filename); + VG_(exit)(1); + } + + TG_(trace_out).fd = (Int)sr_Res(res); + TG_(trace_out).seq = 0; + TG_(trace_out).initialized = True; + TG_(trace_out).header_written = False; + + /* Initialize msgpack writer */ + msgpack_init_state(); + + if (VG_(clo_verbosity) > 1) + 
VG_(message)(Vg_DebugMsg, "Trace output to %s\n", filename); +} + +/* + * Called in child process after fork. + * Closes the inherited file descriptor (without writing end marker) + * and opens a new trace file with the child's PID. + */ +void TG_(trace_reopen_child)(void) +{ + /* Close inherited fd without flushing/finalizing (that's parent's job) */ + if (TG_(trace_out).fd >= 0) { + VG_(close)(TG_(trace_out).fd); + } + + /* Reset state completely */ + TG_(trace_out).fd = -1; + TG_(trace_out).seq = 0; + TG_(trace_out).initialized = False; + TG_(trace_out).header_written = False; + + /* Open new trace file with child's PID (also re-inits msgpack state) */ + TG_(trace_open_output)(); +} + +void TG_(trace_emit_sample)(ThreadId tid, Bool is_enter, fn_node* fn) +{ + Int i; + + if (!TG_(trace_out).initialized) + return; + if (TG_(trace_out).fd < 0) + return; + + /* Get current thread info for per-thread last_sample_cost */ + thread_info* ti = TG_(get_current_thread)(); + if (!ti) + return; + + EventSet* es = TG_(sets).full; + FullCost current_cost = TG_(current_state).cost; + + /* If last_sample_cost not yet allocated, allocate and zero it */ + if (!ti->last_sample_cost) { + ti->last_sample_cost = TG_(get_eventset_cost)(es); + TG_(init_cost)(es, ti->last_sample_cost); + } + + TG_(trace_out).seq++; + + /* Resolve function info with cached lengths */ + const HChar* fn_name; + Int fn_len; + const HChar* obj_name; + Int obj_len; + const HChar* file_name; + Int file_len; + + if (fn) { + fn_name = fn->name; + fn_len = (Int)fn->name_len; + if (fn->file) { + file_name = fn->file->name; + file_len = (Int)fn->file->name_len; + if (fn->file->obj) { + obj_name = fn->file->obj->name; + obj_len = (Int)fn->file->obj->name_len; + } else { + obj_name = "???"; + obj_len = 3; + } + } else { + file_name = "???"; + file_len = 3; + obj_name = "???"; + obj_len = 3; + } + } else { + fn_name = "???"; + fn_len = 3; + obj_name = "???"; + obj_len = 3; + file_name = "???"; + file_len = 3; + } + + /* Compute deltas for all event counters */ + ULong deltas[64]; /* es->size is always small */ + tl_assert(es->size <= 64); + if (current_cost && ti->last_sample_cost) { + for (i = 0; i < es->size; i++) { + deltas[i] = current_cost[i] - ti->last_sample_cost[i]; + } + TG_(copy_cost)(es, ti->last_sample_cost, current_cost); + } else { + for (i = 0; i < es->size; i++) { + deltas[i] = 0; + } + } + + Int event_val = is_enter ? 
TG_EV_ENTER_FN : TG_EV_EXIT_FN; + + msgpack_add_row(TG_(trace_out).seq, (Int)tid, event_val, fn_name, fn_len, + obj_name, obj_len, file_name, file_len, 0, deltas, es->size); +} + +void TG_(trace_emit_enter_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn) +{ + Int i; + + if (!TG_(trace_out).initialized) + return; + if (TG_(trace_out).fd < 0) + return; + + thread_info* ti = TG_(get_current_thread)(); + if (!ti) + return; + + EventSet* es = TG_(sets).full; + FullCost current_cost = TG_(current_state).cost; + + if (!ti->last_sample_cost) { + ti->last_sample_cost = TG_(get_eventset_cost)(es); + TG_(init_cost)(es, ti->last_sample_cost); + } + + TG_(trace_out).seq++; + + const HChar* fn_name = inl_fn; + Int fn_len = -1; /* inlined fn names not cached, use strlen */ + const HChar* obj_name; + Int obj_len; + const HChar* file_name; + Int file_len; + + if (bb->obj) { + obj_name = bb->obj->name; + obj_len = (Int)bb->obj->name_len; + } else { + obj_name = "???"; + obj_len = 3; + } + if (bb->fn && bb->fn->file) { + file_name = bb->fn->file->name; + file_len = (Int)bb->fn->file->name_len; + } else { + file_name = "???"; + file_len = 3; + } + + ULong deltas[64]; + tl_assert(es->size <= 64); + if (current_cost && ti->last_sample_cost) { + for (i = 0; i < es->size; i++) { + deltas[i] = current_cost[i] - ti->last_sample_cost[i]; + } + TG_(copy_cost)(es, ti->last_sample_cost, current_cost); + } else { + for (i = 0; i < es->size; i++) { + deltas[i] = 0; + } + } + + msgpack_add_row(TG_(trace_out).seq, (Int)tid, TG_EV_ENTER_INLINED_FN, + fn_name, fn_len, obj_name, obj_len, file_name, file_len, + (Int)bb->line, deltas, es->size); +} + +void TG_(trace_emit_exit_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn) +{ + Int i; + + if (!TG_(trace_out).initialized) + return; + if (TG_(trace_out).fd < 0) + return; + + thread_info* ti = TG_(get_current_thread)(); + if (!ti) + return; + + EventSet* es = TG_(sets).full; + FullCost current_cost = TG_(current_state).cost; + + if (!ti->last_sample_cost) { + ti->last_sample_cost = TG_(get_eventset_cost)(es); + TG_(init_cost)(es, ti->last_sample_cost); + } + + TG_(trace_out).seq++; + + const HChar* fn_name = inl_fn; + Int fn_len = -1; /* inlined fn names not cached, use strlen */ + const HChar* obj_name; + Int obj_len; + const HChar* file_name; + Int file_len; + + if (bb->obj) { + obj_name = bb->obj->name; + obj_len = (Int)bb->obj->name_len; + } else { + obj_name = "???"; + obj_len = 3; + } + if (bb->fn && bb->fn->file) { + file_name = bb->fn->file->name; + file_len = (Int)bb->fn->file->name_len; + } else { + file_name = "???"; + file_len = 3; + } + + ULong deltas[64]; + tl_assert(es->size <= 64); + if (current_cost && ti->last_sample_cost) { + for (i = 0; i < es->size; i++) { + deltas[i] = current_cost[i] - ti->last_sample_cost[i]; + } + TG_(copy_cost)(es, ti->last_sample_cost, current_cost); + } else { + for (i = 0; i < es->size; i++) { + deltas[i] = 0; + } + } + + msgpack_add_row(TG_(trace_out).seq, (Int)tid, TG_EV_EXIT_INLINED_FN, fn_name, + fn_len, obj_name, obj_len, file_name, file_len, + (Int)bb->line, deltas, es->size); +} + +/* + * Emit a FORK event when a child process is created. + * Called from the post-syscall handler when fork/clone returns in parent. + * child_pid is the PID of the newly created child process. 
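+ *
+ * A trace consumer can always dispatch on the third element of a row (the
+ * event code): FORK, THREAD_CREATE and MARKER rows are 4-element arrays,
+ * while ENTER/EXIT samples carry the full schema including the counters.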
+ */ +void TG_(trace_emit_fork)(ThreadId tid, Int child_pid) +{ + if (!TG_(trace_out).initialized) + return; + if (TG_(trace_out).fd < 0) + return; + + TG_(trace_out).seq++; + + /* FORK uses minimal schema: [seq, tid, event, child_pid] */ + msgpack_add_fork_row(TG_(trace_out).seq, (Int)tid, child_pid); +} + +void TG_(trace_emit_thread_create)(ThreadId tid, ThreadId child) +{ + if (!TG_(trace_out).initialized) + return; + if (TG_(trace_out).fd < 0) + return; + + TG_(trace_out).seq++; + + msgpack_add_thread_create_row(TG_(trace_out).seq, (Int)tid, (Int)child); +} + +void TG_(trace_emit_marker)(ThreadId tid, const HChar* marker) +{ + if (!TG_(trace_out).initialized) + return; + if (TG_(trace_out).fd < 0) + return; + + TG_(trace_out).seq++; + + msgpack_add_marker_row(TG_(trace_out).seq, (Int)tid, marker); +} + +void TG_(trace_close_output)(void) +{ + if (!TG_(trace_out).initialized) + return; + if (TG_(trace_out).fd < 0) + return; + + /* Flush remaining rows, write end marker */ + msgpack_close_output(); + VG_(close)(TG_(trace_out).fd); + + TG_(trace_out).fd = -1; + TG_(trace_out).initialized = False; + + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, "Trace output closed (%llu samples written)\n", + TG_(trace_out).seq); +} + +/* Sum costs from all threads into total_cost */ +void TG_(compute_total_cost)(void) +{ + if (!TG_(total_cost)) { + TG_(total_cost) = TG_(get_eventset_cost)(TG_(sets).full); + TG_(init_cost)(TG_(sets).full, TG_(total_cost)); + } + + { + Int t; + thread_info** thr = TG_(get_threads)(); + for (t = 1; t < VG_N_THREADS; t++) { + if (!thr[t]) + continue; + TG_(add_diff_cost) + (TG_(sets).full, TG_(total_cost), thr[t]->lastdump_cost, + thr[t]->states.entry[0]->cost); + TG_(copy_cost) + (TG_(sets).full, thr[t]->lastdump_cost, thr[t]->states.entry[0]->cost); + } + } +} diff --git a/tracegrind/events.c b/tracegrind/events.c new file mode 100644 index 000000000..4e91967b8 --- /dev/null +++ b/tracegrind/events.c @@ -0,0 +1,524 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- events.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. 
+*/ + +#include "global.h" + +/* This should be 2**MAX_EVENTGROUP_COUNT */ +#define MAX_EVENTSET_COUNT 1024 + +static EventGroup* eventGroup[MAX_EVENTGROUP_COUNT]; +static EventSet* eventSetTable[MAX_EVENTSET_COUNT]; +static Bool eventSets_initialized = 0; + +static void initialize_event_sets(void) +{ + Int i; + + if (eventSets_initialized) + return; + + for (i = 0; i < MAX_EVENTGROUP_COUNT; i++) + eventGroup[i] = 0; + + for (i = 0; i < MAX_EVENTSET_COUNT; i++) + eventSetTable[i] = 0; + + eventSets_initialized = 1; +} + +static EventGroup* new_event_group(int id, int n) +{ + EventGroup* eg; + + initialize_event_sets(); + + TG_ASSERT(id >= 0 && id < MAX_EVENTGROUP_COUNT); + TG_ASSERT(eventGroup[id] == 0); + + eg = (EventGroup*)TG_MALLOC("cl.events.group.1", + sizeof(EventGroup) + n * sizeof(HChar*)); + eg->size = n; + eventGroup[id] = eg; + return eg; +} + +EventGroup* TG_(register_event_group)(int id, const HChar* n1) +{ + EventGroup* eg = new_event_group(id, 1); + eg->name[0] = n1; + + return eg; +} + +EventGroup* TG_(register_event_group2)(int id, const HChar* n1, const HChar* n2) +{ + EventGroup* eg = new_event_group(id, 2); + eg->name[0] = n1; + eg->name[1] = n2; + + return eg; +} + +EventGroup* TG_(register_event_group3)(int id, + const HChar* n1, + const HChar* n2, + const HChar* n3) +{ + EventGroup* eg = new_event_group(id, 3); + eg->name[0] = n1; + eg->name[1] = n2; + eg->name[2] = n3; + + return eg; +} + +EventGroup* TG_(register_event_group4)( + int id, const HChar* n1, const HChar* n2, const HChar* n3, const HChar* n4) +{ + EventGroup* eg = new_event_group(id, 4); + eg->name[0] = n1; + eg->name[1] = n2; + eg->name[2] = n3; + eg->name[3] = n4; + + return eg; +} + +EventGroup* TG_(get_event_group)(int id) +{ + TG_ASSERT(id >= 0 && id < MAX_EVENTGROUP_COUNT); + + return eventGroup[id]; +} + +static EventSet* eventset_from_mask(UInt mask) +{ + EventSet* es; + Int i, count, offset; + + if (mask >= MAX_EVENTSET_COUNT) + return 0; + + initialize_event_sets(); + if (eventSetTable[mask]) + return eventSetTable[mask]; + + es = (EventSet*)TG_MALLOC("cl.events.eventset.1", sizeof(EventSet)); + es->mask = mask; + + offset = 0; + count = 0; + for (i = 0; i < MAX_EVENTGROUP_COUNT; i++) { + es->offset[i] = offset; + if (((mask & (1u << i)) == 0) || (eventGroup[i] == 0)) + continue; + + offset += eventGroup[i]->size; + count++; + } + es->size = offset; + es->count = count; + + eventSetTable[mask] = es; + return es; +} + +EventSet* TG_(get_event_set)(Int id) +{ + TG_ASSERT(id >= 0 && id < MAX_EVENTGROUP_COUNT); + return eventset_from_mask(1u << id); +} + +EventSet* TG_(get_event_set2)(Int id1, Int id2) +{ + TG_ASSERT(id1 >= 0 && id1 < MAX_EVENTGROUP_COUNT); + TG_ASSERT(id2 >= 0 && id2 < MAX_EVENTGROUP_COUNT); + return eventset_from_mask((1u << id1) | (1u << id2)); +} + +EventSet* TG_(add_event_group)(EventSet* es, Int id) +{ + TG_ASSERT(id >= 0 && id < MAX_EVENTGROUP_COUNT); + if (!es) + es = eventset_from_mask(0); + return eventset_from_mask(es->mask | (1u << id)); +} + +EventSet* TG_(add_event_group2)(EventSet* es, Int id1, Int id2) +{ + TG_ASSERT(id1 >= 0 && id1 < MAX_EVENTGROUP_COUNT); + TG_ASSERT(id2 >= 0 && id2 < MAX_EVENTGROUP_COUNT); + if (!es) + es = eventset_from_mask(0); + return eventset_from_mask(es->mask | (1u << id1) | (1u << id2)); +} + +EventSet* TG_(add_event_set)(EventSet* es1, EventSet* es2) +{ + if (!es1) + es1 = eventset_from_mask(0); + if (!es2) + es2 = eventset_from_mask(0); + return eventset_from_mask(es1->mask | es2->mask); +} + +/* Get cost array for an event set */ 
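+/* The array has es->size slots, one 64-bit counter per event in the set. */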
+ULong* TG_(get_eventset_cost)(EventSet* es) +{ + return TG_(get_costarray)(es->size); +} + +/* Set all costs of an event set to zero */ +void TG_(init_cost)(EventSet* es, ULong* cost) +{ + Int i; + + if (!cost) + return; + + for (i = 0; i < es->size; i++) + cost[i] = 0; +} + +/* Set all costs of an event set to zero */ +void TG_(init_cost_lz)(EventSet* es, ULong** cost) +{ + Int i; + + TG_ASSERT(cost != 0); + if (!(*cost)) + *cost = TG_(get_eventset_cost)(es); + + for (i = 0; i < es->size; i++) + (*cost)[i] = 0; +} + +void TG_(zero_cost)(EventSet* es, ULong* cost) +{ + Int i; + + if (!cost) + return; + + for (i = 0; i < es->size; i++) + cost[i] = 0; +} + +Bool TG_(is_zero_cost)(EventSet* es, ULong* cost) +{ + Int i; + + if (!cost) + return True; + + for (i = 0; i < es->size; i++) + if (cost[i] != 0) + return False; + + return True; +} + +void TG_(copy_cost)(EventSet* es, ULong* dst, ULong* src) +{ + Int i; + + if (!src) { + TG_(zero_cost)(es, dst); + return; + } + TG_ASSERT(dst != 0); + + for (i = 0; i < es->size; i++) + dst[i] = src[i]; +} + +void TG_(copy_cost_lz)(EventSet* es, ULong** pdst, ULong* src) +{ + Int i; + ULong* dst; + + TG_ASSERT(pdst != 0); + + if (!src) { + TG_(zero_cost)(es, *pdst); + return; + } + dst = *pdst; + if (!dst) + dst = *pdst = TG_(get_eventset_cost)(es); + + for (i = 0; i < es->size; i++) + dst[i] = src[i]; +} + +void TG_(add_cost)(EventSet* es, ULong* dst, ULong* src) +{ + Int i; + + if (!src) + return; + TG_ASSERT(dst != 0); + + for (i = 0; i < es->size; i++) + dst[i] += src[i]; +} + +void TG_(add_cost_lz)(EventSet* es, ULong** pdst, ULong* src) +{ + Int i; + ULong* dst; + + if (!src) + return; + TG_ASSERT(pdst != 0); + + dst = *pdst; + if (!dst) { + dst = *pdst = TG_(get_eventset_cost)(es); + TG_(copy_cost)(es, dst, src); + return; + } + + for (i = 0; i < es->size; i++) + dst[i] += src[i]; +} + +/* Adds src to dst and zeros src. Returns false if nothing changed */ +Bool TG_(add_and_zero_cost)(EventSet* es, ULong* dst, ULong* src) +{ + Int i; + Bool is_nonzero = False; + + TG_ASSERT((es != 0) && (dst != 0)); + if (!src) + return False; + + for (i = 0; i < es->size; i++) { + if (src[i] == 0) + continue; + dst[i] += src[i]; + src[i] = 0; + is_nonzero = True; + } + + return is_nonzero; +} + +/* Adds src to dst and zeros src. Returns false if nothing changed */ +Bool TG_(add_and_zero_cost2)(EventSet* esDst, + ULong* dst, + EventSet* esSrc, + ULong* src) +{ + Int i, j; + Bool is_nonzero = False; + UInt mask; + EventGroup* eg; + ULong * egDst, *egSrc; + + TG_ASSERT((esDst != 0) && (dst != 0) && (esSrc != 0)); + if (!src) + return False; + + for (i = 0, mask = 1; i < MAX_EVENTGROUP_COUNT; i++, mask = mask << 1) { + if ((esSrc->mask & mask) == 0) + continue; + if (eventGroup[i] == 0) + continue; + + /* if src has a subset, dst must have, too */ + TG_ASSERT((esDst->mask & mask) > 0); + eg = eventGroup[i]; + egSrc = src + esSrc->offset[i]; + egDst = dst + esDst->offset[i]; + for (j = 0; j < eg->size; j++) { + if (egSrc[j] == 0) + continue; + egDst[j] += egSrc[j]; + egSrc[j] = 0; + is_nonzero = True; + } + } + + return is_nonzero; +} + +/* Adds difference of new and old to dst, and set old to new. 
+ * Returns false if nothing changed */ +Bool TG_(add_diff_cost)(EventSet* es, ULong* dst, ULong* old, ULong* new_cost) +{ + Int i; + Bool is_nonzero = False; + + TG_ASSERT((es != 0) && (dst != 0)); + TG_ASSERT(old && new_cost); + + for (i = 0; i < es->size; i++) { + if (new_cost[i] == old[i]) + continue; + dst[i] += new_cost[i] - old[i]; + old[i] = new_cost[i]; + is_nonzero = True; + } + + return is_nonzero; +} + +Bool TG_(add_diff_cost_lz)(EventSet* es, + ULong** pdst, + ULong* old, + ULong* new_cost) +{ + Int i; + ULong* dst; + Bool is_nonzero = False; + + TG_ASSERT((es != 0) && (pdst != 0)); + TG_ASSERT(old && new_cost); + + dst = *pdst; + if (!dst) { + dst = *pdst = TG_(get_eventset_cost)(es); + TG_(zero_cost)(es, dst); + } + + for (i = 0; i < es->size; i++) { + if (new_cost[i] == old[i]) + continue; + dst[i] += new_cost[i] - old[i]; + old[i] = new_cost[i]; + is_nonzero = True; + } + + return is_nonzero; +} + +/* Allocate space for an event mapping */ +EventMapping* TG_(get_eventmapping)(EventSet* es) +{ + EventMapping* em; + + TG_ASSERT(es != 0); + + em = (EventMapping*)TG_MALLOC( + "cl.events.geMapping.1", + sizeof(EventMapping) + sizeof(struct EventMappingEntry) * es->size); + em->capacity = es->size; + em->size = 0; + em->es = es; + + return em; +} + +void TG_(append_event)(EventMapping* em, const HChar* n) +{ + Int i, j, offset = 0; + UInt mask; + EventGroup* eg; + + TG_ASSERT(em != 0); + for (i = 0, mask = 1; i < MAX_EVENTGROUP_COUNT; i++, mask = mask << 1) { + if ((em->es->mask & mask) == 0) + continue; + if (eventGroup[i] == 0) + continue; + + eg = eventGroup[i]; + for (j = 0; j < eg->size; j++, offset++) { + if (VG_(strcmp)(n, eg->name[j]) != 0) + continue; + + TG_ASSERT(em->capacity > em->size); + em->entry[em->size].group = i; + em->entry[em->size].index = j; + em->entry[em->size].offset = offset; + em->size++; + return; + } + } +} + +/* Returns pointer to dynamically string. The string will be overwritten + with each invocation. */ +HChar* TG_(eventmapping_as_string)(const EventMapping* em) +{ + Int i; + EventGroup* eg; + + TG_ASSERT(em != 0); + + XArray* xa = + VG_(newXA)(VG_(malloc), "cl.events.emas", VG_(free), sizeof(HChar)); + + for (i = 0; i < em->size; i++) { + if (i > 0) { + VG_(xaprintf)(xa, "%c", ' '); + } + eg = eventGroup[em->entry[i].group]; + TG_ASSERT(eg != 0); + VG_(xaprintf)(xa, "%s", eg->name[em->entry[i].index]); + } + VG_(xaprintf)(xa, "%c", '\0'); // zero terminate the string + + HChar* buf = VG_(strdup)("cl.events.emas", VG_(indexXA)(xa, 0)); + VG_(deleteXA)(xa); + + return buf; +} + +/* Returns pointer to dynamically allocated string. Caller needs to + VG_(free) it. 
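+   Worked illustration (not taken from a real dump): for a mapping of four
+   events with counter values 5, 0, 3 and 0 the result is "5 0 3": interior
+   zeros are kept, trailing zeros are dropped.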
*/ +HChar* TG_(mappingcost_as_string)(const EventMapping* em, const ULong* c) +{ + Int i, skipped = 0; + + if (!c || em->size == 0) + return VG_(strdup)("cl.events.mcas", ""); + + XArray* xa = + VG_(newXA)(VG_(malloc), "cl.events.mcas", VG_(free), sizeof(HChar)); + + /* At least one entry */ + VG_(xaprintf)(xa, "%llu", c[em->entry[0].offset]); + + for (i = 1; i < em->size; i++) { + if (c[em->entry[i].offset] == 0) { + skipped++; + continue; + } + while (skipped > 0) { + VG_(xaprintf)(xa, " 0"); + skipped--; + } + VG_(xaprintf)(xa, " %llu", c[em->entry[i].offset]); + } + VG_(xaprintf)(xa, "%c", '\0'); // zero terminate the string + + HChar* buf = VG_(strdup)("cl.events.mas", VG_(indexXA)(xa, 0)); + VG_(deleteXA)(xa); + + return buf; +} diff --git a/tracegrind/events.h b/tracegrind/events.h new file mode 100644 index 000000000..bac264c45 --- /dev/null +++ b/tracegrind/events.h @@ -0,0 +1,131 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- events.h ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +/* Abstractions for 64-bit cost lists (events.h) */ + +#ifndef TG_EVENTS +#define TG_EVENTS + +#include "pub_tool_basics.h" + +#define TG_(str) VGAPPEND(vgTracegrind_, str) + +/* Event groups consist of one or more named event types. + * Event sets are constructed from such event groups. + * + * Event groups have to be registered globally with a unique ID + * before they can be used in an event set. + * A group can appear at most once in a event set. + */ + +#define MAX_EVENTGROUP_COUNT 10 + +typedef struct _EventGroup EventGroup; +struct _EventGroup { + Int size; + const HChar* name[0]; +}; + +/* return 0 if event group can not be registered */ +EventGroup* TG_(register_event_group)(int id, const HChar*); +EventGroup* TG_(register_event_group2)(int id, const HChar*, const HChar*); +EventGroup* + TG_(register_event_group3)(int id, const HChar*, const HChar*, const HChar*); +EventGroup* TG_(register_event_group4)( + int id, const HChar*, const HChar*, const HChar*, const HChar*); +EventGroup* TG_(get_event_group)(int id); + +/* Event sets are defined by event groups they consist of. 
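+ *
+ * The member groups are encoded as a bit mask, and the per-group counters
+ * are laid out back to back in group-ID order.
+ * Purely illustrative example: with group 0 of size 2 and group 2 of size 3
+ * registered, the set with mask 0b101 gets offset[0] = 0, offset[2] = 2,
+ * size = 5 and count = 2.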
*/ + +typedef struct _EventSet EventSet; +struct _EventSet { + /* if subset with ID x is in the set, then bit x is set */ + UInt mask; + Int count; + Int size; + Int offset[MAX_EVENTGROUP_COUNT]; +}; + +/* Same event set is returned when requesting same event groups */ +EventSet* TG_(get_event_set)(Int id); +EventSet* TG_(get_event_set2)(Int id1, Int id2); +EventSet* TG_(add_event_group)(EventSet*, Int id); +EventSet* TG_(add_event_group2)(EventSet*, Int id1, Int id2); +EventSet* TG_(add_event_set)(EventSet*, EventSet*); + +/* Operations on costs. A cost pointer of 0 means zero cost. + * Functions ending in _lz allocate cost arrays only when needed + */ +ULong* TG_(get_eventset_cost)(EventSet*); +/* Set costs of event set to 0 */ +void TG_(init_cost)(EventSet*, ULong*); +/* This always allocates counter and sets them to 0 */ +void TG_(init_cost_lz)(EventSet*, ULong**); +/* Set costs of an event set to zero */ +void TG_(zero_cost)(EventSet*, ULong*); +Bool TG_(is_zero_cost)(EventSet*, ULong*); +void TG_(copy_cost)(EventSet*, ULong* dst, ULong* src); +void TG_(copy_cost_lz)(EventSet*, ULong** pdst, ULong* src); +void TG_(add_cost)(EventSet*, ULong* dst, ULong* src); +void TG_(add_cost_lz)(EventSet*, ULong** pdst, ULong* src); +/* Adds src to dst and zeros src. Returns false if nothing changed */ +Bool TG_(add_and_zero_cost)(EventSet*, ULong* dst, ULong* src); +Bool TG_(add_and_zero_cost2)(EventSet*, ULong* dst, EventSet*, ULong* src); +/* Adds difference of new and old to to dst, and set old to new. + * Returns false if nothing changed */ +Bool TG_(add_diff_cost)(EventSet*, ULong* dst, ULong* old, ULong* new_cost); +Bool + TG_(add_diff_cost_lz)(EventSet*, ULong** pdst, ULong* old, ULong* new_cost); + +/* EventMapping: An ordered subset of events from an event set. + * This is used to print out part of an EventSet, or in another order. + */ +struct EventMappingEntry { + Int group; + Int index; + Int offset; +}; +typedef struct _EventMapping EventMapping; +struct _EventMapping { + EventSet* es; + Int size; + Int capacity; + struct EventMappingEntry entry[0]; +}; + +/* Allocate space for an event mapping */ +EventMapping* TG_(get_eventmapping)(EventSet*); +void TG_(append_event)(EventMapping*, const HChar*); +/* Returns event mapping as a character string. That string is dynamically + allocated and it is the caller's responsibility to free it. + The function never returns NULL. */ +HChar* TG_(eventmapping_as_string)(const EventMapping*); +/* Returns mapping cost as a character string. That string is dynamically + allocated and it is the caller's responsibility to free it. + The function never returns NULL. */ +HChar* TG_(mappingcost_as_string)(const EventMapping*, const ULong*); + +#endif /* TG_EVENTS */ diff --git a/tracegrind/examples/.gitignore b/tracegrind/examples/.gitignore new file mode 100644 index 000000000..585f5d244 --- /dev/null +++ b/tracegrind/examples/.gitignore @@ -0,0 +1,2 @@ +*.tgtrace +*.txt diff --git a/tracegrind/examples/README.md b/tracegrind/examples/README.md new file mode 100644 index 000000000..d1a6b2834 --- /dev/null +++ b/tracegrind/examples/README.md @@ -0,0 +1,59 @@ +# Tracegrind example output files + +This directory contains pre-generated tracegrind output files for use as +reference material when implementing a trace parser. 
+ +Each test produces two files: + +- **`.tgtrace`** — binary trace file (msgpack + lz4 compressed) +- **`.txt`** — full human-readable dump from `tracegrind-analyzer` + +## Files + +| Name | Description | Extra options | +|------|-------------|---------------| +| `test_basic` | Full program trace (loader + libc + main) | — | +| `test_marker` | `VALGRIND_TRACEGRIND_MARKER` client request | — | +| `test_toggle_collect` | `--toggle-collect` style collection | — | +| `test_foo_bar_baz` | Simple call chain: `foo -> bar -> baz` | `--instr-atstart=no` | +| `test_inline` | Inlined function calls | `--instr-atstart=no` | +| `test_enter_inlined` | `ENTER_INLINED_FN` / `EXIT_INLINED_FN` events | `--instr-atstart=no --read-inline-info=yes` | +| `test_nested_inlined` | Nested inlined function calls | `--instr-atstart=no --read-inline-info=yes` | +| `test_recursion` | Recursive function calls | `--instr-atstart=no` | +| `test_tailcall` | Tail-call optimized functions | `--instr-atstart=no` | +| `test_longjmp` | `setjmp` / `longjmp` unwinding | `--instr-atstart=no` | +| `test_signal` | Signal handler invocation | `--instr-atstart=no` | +| `test_exception` | C++ exception throw/catch | `--instr-atstart=no` | +| `test_thread_create` | `THREAD_CREATE` events | `--instr-atstart=no` | +| `test_thread_interleave` | Multi-thread interleaved callstacks | `--instr-atstart=no` | +| `test_syscall` | System call timing (`sysCount`, `sysTime` counters) | `--instr-atstart=no --collect-systime=nsec` | +| `test_instr_toggle` | Instrumentation toggle on/off mid-run | `--instr-atstart=no` | + +## Regenerating + +From the repository root (after building valgrind): + +```bash +bash tracegrind/examples/generate.sh +``` + +## Format + +The `.tgtrace` files use the tracegrind msgpack format (lz4-compressed msgpack). +See `tracegrind/docs/tracegrind-msgpack-format.md` for the format specification. + +Use `tracegrind/scripts/tracegrind-analyzer` to inspect any trace file: + +```bash +# Full dump +./tracegrind/scripts/tracegrind-analyzer tracegrind/examples/test_foo_bar_baz.tgtrace + +# Schema only +./tracegrind/scripts/tracegrind-analyzer tracegrind/examples/test_foo_bar_baz.tgtrace --schema + +# Statistics +./tracegrind/scripts/tracegrind-analyzer tracegrind/examples/test_foo_bar_baz.tgtrace --stats + +# Filter by event type +./tracegrind/scripts/tracegrind-analyzer tracegrind/examples/test_foo_bar_baz.tgtrace --event ENTER_FN +``` diff --git a/tracegrind/examples/generate.sh b/tracegrind/examples/generate.sh new file mode 100755 index 000000000..00aa6b072 --- /dev/null +++ b/tracegrind/examples/generate.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# +# Generate tracegrind example output files. +# +# Run from the valgrind-codspeed repository root: +# bash tracegrind/examples/generate.sh +# +set -euo pipefail + +ROOT="$(cd "$(dirname "$0")/../.." && pwd)" +VG="$ROOT/vg-in-place" +ANALYZER="$ROOT/tracegrind/scripts/tracegrind-analyzer" +TESTS="$ROOT/tracegrind/tests" +OUT="$ROOT/tracegrind/examples" + +if [ ! -x "$VG" ]; then + echo "Error: vg-in-place not found at $VG" >&2 + echo "Build valgrind first (./configure && make)" >&2 + exit 1 +fi + +generate() { + local name="$1" + local binary="$2" + shift 2 + local vgopts=("$@") + + local trace="$OUT/${name}.tgtrace" + local txt="$OUT/${name}.txt" + + echo "Generating $name ..." 
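+  # Run the test binary under tracegrind to produce the binary trace, then
+  # convert it to a human-readable dump with tracegrind-analyzer.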
+ "$VG" --tool=tracegrind \ + --tracegrind-out-file="$trace" \ + "${vgopts[@]}" \ + "$TESTS/$binary" > /dev/null 2>&1 + + "$ANALYZER" "$trace" > "$txt" 2>&1 + + echo " -> $(wc -c < "$trace") bytes, $(wc -l < "$txt") lines" +} + +# Remove previous outputs +rm -f "$OUT"/*.tgtrace "$OUT"/*.txt + +generate test_basic test_basic.bin +generate test_marker test_marker.bin +generate test_toggle_collect test_toggle_collect.bin +generate test_foo_bar_baz test_foo_bar_baz.bin --instr-atstart=no +generate test_inline test_inline.bin --instr-atstart=no +generate test_enter_inlined test_enter_inlined.bin --instr-atstart=no --read-inline-info=yes +generate test_nested_inlined test_nested_inlined.bin --instr-atstart=no --read-inline-info=yes +generate test_recursion test_recursion.bin --instr-atstart=no +generate test_tailcall test_tailcall.bin --instr-atstart=no +generate test_longjmp test_longjmp.bin --instr-atstart=no +generate test_signal test_signal.bin --instr-atstart=no +generate test_exception test_exception.bin --instr-atstart=no +generate test_thread_create test_thread_create.bin --instr-atstart=no +generate test_thread_interleave test_thread_interleave.bin --instr-atstart=no +generate test_syscall test_syscall.bin --instr-atstart=no --collect-systime=nsec +generate test_instr_toggle test_instr_toggle.bin --instr-atstart=no + +echo "" +echo "Done. Generated $(ls "$OUT"/*.tgtrace 2>/dev/null | wc -l) trace files." diff --git a/tracegrind/fn.c b/tracegrind/fn.c new file mode 100644 index 000000000..47702dccc --- /dev/null +++ b/tracegrind/fn.c @@ -0,0 +1,809 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- ct_fn.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "global.h" + +#define N_INITIAL_FN_ARRAY_SIZE 10071 + +static fn_array current_fn_active; + +/* x86_64 defines 4 variants. */ +#define MAX_RESOLVE_ADDRS 4 +static int runtime_resolve_addrs = 0; +static Addr runtime_resolve_addr[MAX_RESOLVE_ADDRS]; +static int runtime_resolve_length[MAX_RESOLVE_ADDRS]; + +// a code pattern is a list of tuples (start offset, length) +struct chunk_t { + int start, len; +}; +struct pattern { + const HChar* name; + int len; + struct chunk_t chunk[]; +}; + +/* Scan for a pattern in the code of an ELF object. + * If found, return true and set runtime_resolve_{addr,length} + */ +__attribute__((unused)) // Possibly; depends on the platform. 
+static Bool +check_code(obj_node* obj, UChar code[], struct pattern* pat) +{ + Bool found; + Addr addr, end; + int chunk, start, len; + + /* first chunk of pattern should always start at offset 0 and + * have at least 3 bytes */ + TG_ASSERT((pat->chunk[0].start == 0) && (pat->chunk[0].len > 2)); + + /* and we cannot be called more than MAX_RESOLVE_ADDRS times */ + TG_ASSERT(runtime_resolve_addrs < MAX_RESOLVE_ADDRS); + + TG_DEBUG(1, "check_code: %s, pattern %s, check %d bytes of [%x %x %x...]\n", + obj->name, pat->name, pat->chunk[0].len, code[0], code[1], code[2]); + + end = obj->start + obj->size - pat->len; + addr = obj->start; + while (addr < end) { + found = (VG_(memcmp)((void*)addr, code, pat->chunk[0].len) == 0); + + if (found) { + chunk = 1; + while (1) { + start = pat->chunk[chunk].start; + len = pat->chunk[chunk].len; + if (len == 0) + break; + + TG_ASSERT(len > 2); + TG_DEBUG(1, + " found chunk %d at %#lx, checking %d bytes " + "of [%x %x %x...]\n", + chunk - 1, addr - obj->start, len, code[start], + code[start + 1], code[start + 2]); + + if (VG_(memcmp)((void*)(addr + start), code + start, len) != 0) { + found = False; + break; + } + chunk++; + } + + if (found) { + TG_DEBUG(1, "found at offset %#lx.\n", addr - obj->start); + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, + "Found runtime_resolve (%s): " + "%s +%#lx=%#lx, length %d\n", + pat->name, obj->name + obj->last_slash_pos, + addr - obj->start, addr, pat->len); + + runtime_resolve_addr[runtime_resolve_addrs] = addr; + runtime_resolve_length[runtime_resolve_addrs] = pat->len; + runtime_resolve_addrs++; + return True; + } + } + addr++; + } + TG_DEBUG(1, " found nothing.\n"); + return False; +} + +/* _ld_runtime_resolve, located in ld.so, needs special handling: + * The jump at end into the resolved function should not be + * represented as a call (as usually done in tracegrind with jumps), + * but as a return + call. Otherwise, the repeated existence of + * _ld_runtime_resolve in call chains will lead to huge cycles, + * making the profile almost worthless. + * + * If ld.so is stripped, the symbol will not appear. But as this + * function is handcrafted assembler, we search for it. 
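+ *
+ * The byte patterns below are compared chunk-wise; the gaps between chunks
+ * skip bytes that differ from build to build, such as absolute addresses
+ * and PC-relative call targets (e.g. the call to _dl_fixup).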
+ * + * We stop if the ELF object name does not seem to be the runtime linker + */ +static Bool search_runtime_resolve(obj_node* obj) +{ +#if defined(VGP_x86_linux) + static UChar code[] = { + /* 0*/ 0x50, 0x51, 0x52, 0x8b, 0x54, 0x24, 0x10, 0x8b, + /* 8*/ 0x44, 0x24, 0x0c, 0xe8, 0x70, 0x01, 0x00, 0x00, + /*16*/ 0x5a, 0x59, 0x87, 0x04, 0x24, 0xc2, 0x08, 0x00}; + /* Check ranges [0-11] and [16-23] ([12-15] is an absolute address) */ + static struct pattern pat = {"x86-def", 24, {{0, 12}, {16, 8}, {24, 0}}}; + + /* Pattern for glibc-2.8 on OpenSuse11.0 */ + static UChar code_28[] = { + /* 0*/ 0x50, 0x51, 0x52, 0x8b, 0x54, 0x24, 0x10, 0x8b, + /* 8*/ 0x44, 0x24, 0x0c, 0xe8, 0x70, 0x01, 0x00, 0x00, + /*16*/ 0x5a, 0x8b, 0x0c, 0x24, 0x89, 0x04, 0x24, 0x8b, + /*24*/ 0x44, 0x24, 0x04, 0xc2, 0x0c, 0x00}; + static struct pattern pat_28 = { + "x86-glibc2.8", 30, {{0, 12}, {16, 14}, {30, 0}}}; + + if (VG_(strncmp)(obj->name, "/lib/ld", 7) != 0) + return False; + Bool pat_p = check_code(obj, code, &pat); + Bool pat_28_p = check_code(obj, code_28, &pat_28); + if (pat_p || pat_28_p) + return True; + return False; +#endif + +#if defined(VGP_ppc32_linux) + static UChar code[] = {/* 0*/ 0x94, 0x21, 0xff, 0xc0, 0x90, + 0x01, 0x00, 0x0c, + /* 8*/ 0x90, 0x61, 0x00, 0x10, 0x90, + 0x81, 0x00, 0x14, + /*16*/ 0x7d, 0x83, 0x63, 0x78, 0x90, + 0xa1, 0x00, 0x18, + /*24*/ 0x7d, 0x64, 0x5b, 0x78, 0x90, + 0xc1, 0x00, 0x1c, + /*32*/ 0x7c, 0x08, 0x02, 0xa6, 0x90, + 0xe1, 0x00, 0x20, + /*40*/ 0x90, 0x01, 0x00, 0x30, 0x91, + 0x01, 0x00, 0x24, + /*48*/ 0x7c, 0x00, 0x00, 0x26, 0x91, + 0x21, 0x00, 0x28, + /*56*/ 0x91, 0x41, 0x00, 0x2c, 0x90, + 0x01, 0x00, 0x08, + /*64*/ 0x48, 0x00, 0x02, 0x91, 0x7c, + 0x69, 0x03, 0xa6, /* at 64: bl aff0 */ + /*72*/ 0x80, 0x01, 0x00, 0x30, 0x81, + 0x41, 0x00, 0x2c, + /*80*/ 0x81, 0x21, 0x00, 0x28, 0x7c, + 0x08, 0x03, 0xa6, + /*88*/ 0x81, 0x01, 0x00, 0x24, 0x80, + 0x01, 0x00, 0x08, + /*96*/ 0x80, 0xe1, 0x00, 0x20, 0x80, + 0xc1, 0x00, 0x1c, + /*104*/ 0x7c, 0x0f, 0xf1, 0x20, 0x80, + 0xa1, 0x00, 0x18, + /*112*/ 0x80, 0x81, 0x00, 0x14, 0x80, + 0x61, 0x00, 0x10, + /*120*/ 0x80, 0x01, 0x00, 0x0c, 0x38, + 0x21, 0x00, 0x40, + /*128*/ 0x4e, 0x80, 0x04, 0x20}; + static struct pattern pat = { + "ppc32-def", 132, {{0, 65}, {68, 64}, {132, 0}}}; + + if (VG_(strncmp)(obj->name, "/lib/ld", 7) != 0) + return False; + return check_code(obj, code, &pat); +#endif + +#if defined(VGP_amd64_linux) + static UChar code[] = { + /* 0*/ 0x48, 0x83, 0xec, 0x38, 0x48, 0x89, 0x04, 0x24, + /* 8*/ 0x48, 0x89, 0x4c, 0x24, 0x08, 0x48, 0x89, 0x54, 0x24, 0x10, + /*18*/ 0x48, 0x89, 0x74, 0x24, 0x18, 0x48, 0x89, 0x7c, 0x24, 0x20, + /*28*/ 0x4c, 0x89, 0x44, 0x24, 0x28, 0x4c, 0x89, 0x4c, 0x24, 0x30, + /*38*/ 0x48, 0x8b, 0x74, 0x24, 0x40, 0x49, 0x89, 0xf3, + /*46*/ 0x4c, 0x01, 0xde, 0x4c, 0x01, 0xde, 0x48, 0xc1, 0xe6, 0x03, + /*56*/ 0x48, 0x8b, 0x7c, 0x24, 0x38, 0xe8, 0xee, 0x01, 0x00, 0x00, + /*66*/ 0x49, 0x89, 0xc3, 0x4c, 0x8b, 0x4c, 0x24, 0x30, + /*74*/ 0x4c, 0x8b, 0x44, 0x24, 0x28, 0x48, 0x8b, 0x7c, 0x24, 0x20, + /*84*/ 0x48, 0x8b, 0x74, 0x24, 0x18, 0x48, 0x8b, 0x54, 0x24, 0x10, + /*94*/ 0x48, 0x8b, 0x4c, 0x24, 0x08, 0x48, 0x8b, 0x04, 0x24, + /*103*/ 0x48, 0x83, 0xc4, 0x48, 0x41, 0xff, 0xe3}; + static struct pattern pat = { + "amd64-def", 110, {{0, 62}, {66, 44}, {110, 0}}}; + + static UChar code_xsavec[] = { + /* 0*/ 0x53, 0x48, 0x89, 0xe3, 0x48, 0x83, 0xe4, 0xc0, + /* 8*/ 0x48, 0x2b, 0x25, 0x00, 0x00, 0x00, 0x00, /* sub (%rip),%rsp + */ + /*15*/ 0x48, + /*16*/ 0x89, 0x04, 0x24, 0x48, 0x89, 0x4c, 0x24, 0x08, + /*24*/ 0x48, 0x89, 
0x54, 0x24, 0x10, 0x48, 0x89, 0x74, + /*32*/ 0x24, 0x18, 0x48, 0x89, 0x7c, 0x24, 0x20, 0x4c, + /*40*/ 0x89, 0x44, 0x24, 0x28, 0x4c, 0x89, 0x4c, 0x24, + /*48*/ 0x30, 0xb8, 0xee, 0x00, 0x00, 0x00, 0x31, 0xd2, + /*56*/ 0x48, 0x89, 0x94, 0x24, 0x50, 0x02, 0x00, 0x00, + /*64*/ 0x48, 0x89, 0x94, 0x24, 0x58, 0x02, 0x00, 0x00, + /*72*/ 0x48, 0x89, 0x94, 0x24, 0x60, 0x02, 0x00, 0x00, + /*80*/ 0x48, 0x89, 0x94, 0x24, 0x68, 0x02, 0x00, 0x00, + /*88*/ 0x48, 0x89, 0x94, 0x24, 0x70, 0x02, 0x00, 0x00, + /*96*/ 0x48, 0x89, 0x94, 0x24, 0x78, 0x02, 0x00, 0x00, + /*04*/ 0x0f, 0xc7, 0x64, 0x24, 0x40, 0x48, 0x8b, 0x73, + /*112*/ 0x10, 0x48, 0x8b, 0x7b, 0x08, + /*117*/ 0xe8, 0x00, 0x00, 0x00, 0x00, /* callq <_dl_fixup> */ + /*122*/ 0x49, 0x89, 0xc3, 0xb8, 0xee, 0x00, + /*128*/ 0x00, 0x00, 0x31, 0xd2, 0x0f, 0xae, 0x6c, 0x24, + /*136*/ 0x40, 0x4c, 0x8b, 0x4c, 0x24, 0x30, 0x4c, 0x8b, + /*144*/ 0x44, 0x24, 0x28, 0x48, 0x8b, 0x7c, 0x24, 0x20, + /*152*/ 0x48, 0x8b, 0x74, 0x24, 0x18, 0x48, 0x8b, 0x54, + /*160*/ 0x24, 0x10, 0x48, 0x8b, 0x4c, 0x24, 0x08, 0x48, + /*168*/ 0x8b, 0x04, 0x24, 0x48, 0x89, 0xdc, 0x48, 0x8b, + /*176*/ 0x1c, 0x24, 0x48, 0x83, 0xc4, 0x18, 0xf2, 0x41, + /*184*/ 0xff, 0xe3}; + static struct pattern pat_xsavec = { + "amd64-xsavec", 186, {{0, 11}, {15, 103}, {122, 64}, {186, 0}}}; + + static UChar code_xsave[] = { + /* 0*/ 0x53, 0x48, 0x89, 0xe3, 0x48, 0x83, 0xe4, 0xc0, + /* 8*/ 0x48, 0x2b, 0x25, 0x00, 0x00, 0x00, 0x00, /* sub (%rip),%rsp + */ + /*15*/ 0x48, + /*16*/ 0x89, 0x04, 0x24, 0x48, 0x89, 0x4c, 0x24, 0x08, + /*24*/ 0x48, 0x89, 0x54, 0x24, 0x10, 0x48, 0x89, 0x74, + /*32*/ 0x24, 0x18, 0x48, 0x89, 0x7c, 0x24, 0x20, 0x4c, + /*40*/ 0x89, 0x44, 0x24, 0x28, 0x4c, 0x89, 0x4c, 0x24, + /*48*/ 0x30, 0xb8, 0xee, 0x00, 0x00, 0x00, 0x31, 0xd2, + /*56*/ 0x48, 0x89, 0x94, 0x24, 0x40, 0x02, 0x00, 0x00, + /*64*/ 0x48, 0x89, 0x94, 0x24, 0x48, 0x02, 0x00, 0x00, + /*72*/ 0x48, 0x89, 0x94, 0x24, 0x50, 0x02, 0x00, 0x00, + /*80*/ 0x48, 0x89, 0x94, 0x24, 0x58, 0x02, 0x00, 0x00, + /*88*/ 0x48, 0x89, 0x94, 0x24, 0x60, 0x02, 0x00, 0x00, + /*96*/ 0x48, 0x89, 0x94, 0x24, 0x68, 0x02, 0x00, 0x00, + /*104*/ 0x48, 0x89, 0x94, 0x24, 0x70, 0x02, 0x00, 0x00, + /*112*/ 0x48, 0x89, 0x94, 0x24, 0x78, 0x02, 0x00, 0x00, + /*120*/ 0x0f, 0xae, 0x64, 0x24, 0x40, 0x48, 0x8b, 0x73, + /*128*/ 0x10, 0x48, 0x8b, 0x7b, 0x08, + /*133*/ 0xe8, 0x00, 0x00, 0x00, 0x00, /* callq <_dl_fixup> */ + /*138*/ 0x49, 0x89, 0xc3, 0xb8, 0xee, 0x00, + /*144*/ 0x00, 0x00, 0x31, 0xd2, 0x0f, 0xae, 0x6c, 0x24, + /*152*/ 0x40, 0x4c, 0x8b, 0x4c, 0x24, 0x30, 0x4c, 0x8b, + /*160*/ 0x44, 0x24, 0x28, 0x48, 0x8b, 0x7c, 0x24, 0x20, + /*168*/ 0x48, 0x8b, 0x74, 0x24, 0x18, 0x48, 0x8b, 0x54, + /*176*/ 0x24, 0x10, 0x48, 0x8b, 0x4c, 0x24, 0x08, 0x48, + /*184*/ 0x8b, 0x04, 0x24, 0x48, 0x89, 0xdc, 0x48, 0x8b, + /*192*/ 0x1c, 0x24, 0x48, 0x83, 0xc4, 0x18, 0xf2, 0x41, + /*200*/ 0xff, 0xe3}; + static struct pattern pat_xsave = { + "amd64-xsave", 202, {{0, 11}, {15, 119}, {138, 64}, {202, 0}}}; + + static UChar code_fxsave[] = { + /* 0*/ 0x53, 0x48, 0x89, 0xe3, 0x48, 0x83, 0xe4, 0xf0, + /* 8*/ 0x48, 0x81, 0xec, 0x40, 0x02, 0x00, 0x00, 0x48, + /*16*/ 0x89, 0x04, 0x24, 0x48, 0x89, 0x4c, 0x24, 0x08, + /*24*/ 0x48, 0x89, 0x54, 0x24, 0x10, 0x48, 0x89, 0x74, + /*32*/ 0x24, 0x18, 0x48, 0x89, 0x7c, 0x24, 0x20, 0x4c, + /*40*/ 0x89, 0x44, 0x24, 0x28, 0x4c, 0x89, 0x4c, 0x24, + /*48*/ 0x30, 0x0f, 0xae, 0x44, 0x24, 0x40, 0x48, 0x8b, + /*56*/ 0x73, 0x10, 0x48, 0x8b, 0x7b, 0x08, + /*62*/ 0xe8, 0x00, 0x00, 0x00, 0x00, /* callq <_dl_fixup> */ + /*67*/ 0x49, 0x89, 0xc3, 0x0f, 0xae, + /*72*/ 0x4c, 
0x24, 0x40, 0x4c, 0x8b, 0x4c, 0x24, 0x30, + /*80*/ 0x4c, 0x8b, 0x44, 0x24, 0x28, 0x48, 0x8b, 0x7c, + /*88*/ 0x24, 0x20, 0x48, 0x8b, 0x74, 0x24, 0x18, 0x48, + /*96*/ 0x8b, 0x54, 0x24, 0x10, 0x48, 0x8b, 0x4c, 0x24, + /*104*/ 0x08, 0x48, 0x8b, 0x04, 0x24, 0x48, 0x89, 0xdc, + /*112*/ 0x48, 0x8b, 0x1c, 0x24, 0x48, 0x83, 0xc4, 0x18, + /*120*/ 0xf2, 0x41, 0xff, 0xe3}; + static struct pattern pat_fxsave = { + "amd64-fxsave", 124, {{0, 63}, {67, 57}, {124, 0}}}; + + if ((VG_(strncmp)(obj->name, "/lib/ld", 7) != 0) && + (VG_(strncmp)(obj->name, "/lib64/ld", 9) != 0) && + (VG_(strncmp)(obj->name, "/usr/lib/ld", 11) != 0) && + (VG_(strncmp)(obj->name, "/usr/lib64/ld", 13) != 0)) + return False; + Bool pat_p = check_code(obj, code, &pat); + Bool pat_xsavec_p = check_code(obj, code_xsavec, &pat_xsavec); + Bool pat_xsave_p = check_code(obj, code_xsave, &pat_xsave); + Bool pat_fxsave_p = check_code(obj, code_fxsave, &pat_fxsave); + if (pat_p || pat_xsavec_p || pat_xsave_p || pat_fxsave_p) + return True; +#endif + + /* For other platforms, no patterns known */ + return False; +} + +/*------------------------------------------------------------*/ +/*--- Object/File/Function hash entry operations ---*/ +/*------------------------------------------------------------*/ + +/* Object hash table, fixed */ +static obj_node* obj_table[N_OBJ_ENTRIES]; + +void TG_(init_obj_table)(void) +{ + Int i; + for (i = 0; i < N_OBJ_ENTRIES; i++) + obj_table[i] = 0; +} + +#define HASH_CONSTANT 256 + +static UInt str_hash(const HChar* s, UInt table_size) +{ + int hash_value = 0; + for (; *s; s++) + hash_value = (HASH_CONSTANT * hash_value + *s) % table_size; + return hash_value; +} + +static const HChar* anonymous_obj = "???"; + +static __inline__ obj_node* new_obj_node(DebugInfo* di, obj_node* next) +{ + Int i; + obj_node* obj; + + obj = (obj_node*)TG_MALLOC("cl.fn.non.1", sizeof(obj_node)); + obj->name = di ? VG_(strdup)("cl.fn.non.2", VG_(DebugInfo_get_filename)(di)) + : anonymous_obj; + for (i = 0; i < N_FILE_ENTRIES; i++) { + obj->files[i] = NULL; + } + TG_(stat).distinct_objs++; + obj->number = TG_(stat).distinct_objs; + /* JRS 2008 Feb 19: maybe rename .start/.size/.offset to + .text_avma/.text_size/.test_bias to make it clearer what these + fields really mean */ + obj->start = di ? VG_(DebugInfo_get_text_avma)(di) : 0; + obj->size = di ? VG_(DebugInfo_get_text_size)(di) : 0; + obj->offset = di ? VG_(DebugInfo_get_text_bias)(di) : 0; + obj->next = next; + + // not only used for debug output (see static.c) + obj->last_slash_pos = 0; + i = 0; + while (obj->name[i]) { + if (obj->name[i] == '/') + obj->last_slash_pos = i + 1; + i++; + } + obj->name_len = i; + + if (runtime_resolve_addrs == 0) + search_runtime_resolve(obj); + + return obj; +} + +obj_node* TG_(get_obj_node)(DebugInfo* di) +{ + obj_node* curr_obj_node; + UInt objname_hash; + const HChar* obj_name; + + obj_name = di ? 
VG_(DebugInfo_get_filename)(di) : anonymous_obj; + + /* lookup in obj hash */ + objname_hash = str_hash(obj_name, N_OBJ_ENTRIES); + curr_obj_node = obj_table[objname_hash]; + while (NULL != curr_obj_node && + VG_(strcmp)(obj_name, curr_obj_node->name) != 0) { + curr_obj_node = curr_obj_node->next; + } + if (NULL == curr_obj_node) { + obj_table[objname_hash] = curr_obj_node = + new_obj_node(di, obj_table[objname_hash]); + } + + return curr_obj_node; +} + +static __inline__ file_node* +new_file_node(const HChar* filename, obj_node* obj, file_node* next) +{ + Int i; + file_node* file = (file_node*)TG_MALLOC("cl.fn.nfn.1", sizeof(file_node)); + file->name = VG_(strdup)("cl.fn.nfn.2", filename); + file->name_len = VG_(strlen)(filename); + for (i = 0; i < N_FN_ENTRIES; i++) { + file->fns[i] = NULL; + } + TG_(stat).distinct_files++; + file->obj = obj; + file->next = next; + return file; +} + +file_node* +TG_(get_file_node)(obj_node* curr_obj_node, const HChar* dir, const HChar* file) +{ + file_node* curr_file_node; + UInt filename_hash; + + /* Build up an absolute pathname, if there is a directory available */ + HChar filename[VG_(strlen)(dir) + 1 + VG_(strlen)(file) + 1]; + VG_(strcpy)(filename, dir); + if (filename[0] != '\0') { + VG_(strcat)(filename, "/"); + } + VG_(strcat)(filename, file); + + /* lookup in file hash */ + filename_hash = str_hash(filename, N_FILE_ENTRIES); + curr_file_node = curr_obj_node->files[filename_hash]; + while (NULL != curr_file_node && + VG_(strcmp)(filename, curr_file_node->name) != 0) { + curr_file_node = curr_file_node->next; + } + if (NULL == curr_file_node) { + curr_obj_node->files[filename_hash] = curr_file_node = new_file_node( + filename, curr_obj_node, curr_obj_node->files[filename_hash]); + } + + return curr_file_node; +} + +/* forward decl. */ +static void resize_fn_array(void); + +static __inline__ fn_node* +new_fn_node(const HChar* fnname, file_node* file, fn_node* next) +{ + fn_node* fn = (fn_node*)TG_MALLOC("cl.fn.nfnnd.1", sizeof(fn_node)); + fn->name = VG_(strdup)("cl.fn.nfnnd.2", fnname); + fn->name_len = VG_(strlen)(fnname); + + TG_(stat).distinct_fns++; + fn->number = TG_(stat).distinct_fns; + fn->last_cxt = 0; + fn->pure_cxt = 0; + fn->file = file; + fn->next = next; + + fn->toggle_collect = False; + fn->skip = False; + fn->pop_on_jump = TG_(clo).pop_on_jump; + fn->group = 0; + fn->separate_callers = TG_(clo).separate_callers; + fn->separate_recursions = TG_(clo).separate_recursions; + +#if TG_ENABLE_DEBUG + fn->verbosity = -1; +#endif + + if (TG_(stat).distinct_fns >= current_fn_active.size) + resize_fn_array(); + + return fn; +} + +/* Get a function node in hash2 with known file node. + * hash nodes are created if needed + */ +static fn_node* get_fn_node_infile(file_node* curr_file_node, + const HChar* fnname) +{ + fn_node* curr_fn_node; + UInt fnname_hash; + + TG_ASSERT(curr_file_node != 0); + + /* lookup in function hash */ + fnname_hash = str_hash(fnname, N_FN_ENTRIES); + curr_fn_node = curr_file_node->fns[fnname_hash]; + while (NULL != curr_fn_node && + VG_(strcmp)(fnname, curr_fn_node->name) != 0) { + curr_fn_node = curr_fn_node->next; + } + if (NULL == curr_fn_node) { + curr_file_node->fns[fnname_hash] = curr_fn_node = + new_fn_node(fnname, curr_file_node, curr_file_node->fns[fnname_hash]); + } + + return curr_fn_node; +} + +/* Get a function node in a Segment. + * Hash nodes are created if needed. 
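+ *
+ * The lookup walks three chained hash tables: object (keyed by ELF object
+ * name) -> file (keyed by dir/filename) -> function (keyed by symbol name);
+ * missing nodes are created on first use.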
+ */ +static __inline__ fn_node* get_fn_node_inseg(DebugInfo* di, + const HChar* dirname, + const HChar* filename, + const HChar* fnname) +{ + obj_node* obj = TG_(get_obj_node)(di); + file_node* file = TG_(get_file_node)(obj, dirname, filename); + fn_node* fn = get_fn_node_infile(file, fnname); + + return fn; +} + +Bool TG_(get_debug_info)(Addr instr_addr, + const HChar** dir, + const HChar** file, + const HChar** fn_name, + UInt* line_num, + DebugInfo** pDebugInfo) +{ + Bool found_file_line, found_fn, result = True; + UInt line; + + TG_DEBUG(6, " + get_debug_info(%#lx)\n", instr_addr); + + DiEpoch ep = VG_(current_DiEpoch)(); + if (pDebugInfo) { + *pDebugInfo = VG_(find_DebugInfo)(ep, instr_addr); + + // for generated code in anonymous space, pSegInfo is 0 + } + + found_file_line = + VG_(get_filename_linenum)(ep, instr_addr, file, dir, &line); + found_fn = VG_(get_fnname)(ep, instr_addr, fn_name); + + if (!found_file_line && !found_fn) { + TG_(stat).no_debug_BBs++; + *file = "???"; + *fn_name = "???"; + if (line_num) + *line_num = 0; + result = False; + + } else if (found_file_line && found_fn) { + TG_(stat).full_debug_BBs++; + if (line_num) + *line_num = line; + + } else if (found_file_line && !found_fn) { + TG_(stat).file_line_debug_BBs++; + *fn_name = "???"; + if (line_num) + *line_num = line; + + } else /*(!found_file_line && found_fn)*/ { + TG_(stat).fn_name_debug_BBs++; + *file = "???"; + if (line_num) + *line_num = 0; + } + + TG_DEBUG(6, " - get_debug_info(%#lx): seg '%s', fn %s\n", instr_addr, + !pDebugInfo ? "-" + : (*pDebugInfo) ? VG_(DebugInfo_get_filename)(*pDebugInfo) + : "(None)", + *fn_name); + + return result; +} + +/* for _libc_freeres_wrapper => _exit renaming */ +static BB* exit_bb = 0; + +/* + * Attach function struct to a BB from debug info. + */ +fn_node* TG_(get_fn_node)(BB* bb) +{ + const HChar *fnname, *filename, *dirname; + DebugInfo* di; + UInt line_num; + fn_node* fn; + Int i; + + /* fn from debug info is idempotent for a BB */ + if (bb->fn) + return bb->fn; + + TG_DEBUG(3, "+ get_fn_node(BB %#lx)\n", bb_addr(bb)); + + /* get function/file name, line number and object of + * the BB according to debug information + */ + TG_(get_debug_info) + (bb_addr(bb), &dirname, &filename, &fnname, &line_num, &di); + + DiEpoch ep = VG_(current_DiEpoch)(); + + /* Build inline stack for this BB using InlIPCursor */ + { + InlIPCursor* iipc = VG_(new_IIPC)(ep, bb_addr(bb)); + if (iipc) { + const HChar* tmp[TG_MAX_INL_DEPTH + 1]; + Int total = 0; + do { + const HChar* fn_name = NULL; + VG_(get_fnname_inl)(ep, bb_addr(bb), &fn_name, iipc); + if (fn_name && total < TG_MAX_INL_DEPTH + 1) + tmp[total++] = fn_name; + } while (VG_(next_IIPC)(iipc)); + VG_(delete_IIPC)(iipc); + + /* tmp[] is innermost-first; last entry is the non-inlined function + * (skip it) */ + Int inl_count = total - 1; + if (inl_count > 0) { + bb->inl_depth = inl_count; + bb->inl_fns = VG_(malloc)("tg.bb.inl", inl_count * sizeof(HChar*)); + /* Reverse into outermost-first order */ + for (Int i = 0; i < inl_count; i++) + bb->inl_fns[i] = tmp[inl_count - 1 - i]; + } + } + } + + if (0 == VG_(strcmp)(fnname, "???")) { + int p; + static HChar buf[32]; // for sure large enough + /* Use address as found in library */ + if (sizeof(Addr) == 4) + p = VG_(sprintf)(buf, "%#08lx", (UWord)bb->offset); + else + // 64bit address + p = VG_(sprintf)(buf, "%#016lx", (UWord)bb->offset); + + VG_(sprintf)(buf + p, "%s", + (bb->sect_kind == Vg_SectData) ? " [Data]" + : (bb->sect_kind == Vg_SectBSS) ? 
" [BSS]" + : (bb->sect_kind == Vg_SectGOT) ? " [GOT]" + : (bb->sect_kind == Vg_SectPLT) ? " [PLT]" + : ""); + fnname = buf; + } else { + if (VG_(get_fnname_if_entry)(ep, bb_addr(bb), &fnname)) + bb->is_entry = 1; + } + + /* HACK for correct _exit: + * _exit is redirected to VG_(__libc_freeres_wrapper) by valgrind, + * so we rename it back again :-) + */ + if (0 == VG_(strcmp)(fnname, "vgPlain___libc_freeres_wrapper") && exit_bb) { + TG_(get_debug_info) + (bb_addr(exit_bb), &dirname, &filename, &fnname, &line_num, &di); + + TG_DEBUG(1, "__libc_freeres_wrapper renamed to _exit\n"); + } + if (0 == VG_(strcmp)(fnname, "_exit") && !exit_bb) + exit_bb = bb; + + for (i = 0; i < runtime_resolve_addrs; i++) { + if ((bb_addr(bb) >= runtime_resolve_addr[i]) && + (bb_addr(bb) < runtime_resolve_addr[i] + runtime_resolve_length[i])) { + /* BB in runtime_resolve found by code check; use this name */ + fnname = "_dl_runtime_resolve"; + break; + } + } + + /* get fn_node struct for this function */ + fn = get_fn_node_inseg(di, dirname, filename, fnname); + + /* if this is the 1st time the function is seen, + * some attributes are set */ + if (fn->pure_cxt == 0) { + + /* Every function gets a "pure" context, i.e. a context with stack + * depth 1 only with this function. This is for compression of mangled + * names + */ + fn_node* pure[2]; + pure[0] = 0; + pure[1] = fn; + fn->pure_cxt = TG_(get_cxt)(pure + 1); + + if (bb->sect_kind == Vg_SectPLT || bb->sect_kind == Vg_SectPLTSEC) + fn->skip = TG_(clo).skip_plt; + + if (VG_(strncmp)(fn->name, "_dl_runtime_resolve", 19) == 0) { + fn->pop_on_jump = True; + + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, + "Symbol match: found runtime_resolve:" + " %s +%#lx=%#lx\n", + bb->obj->name + bb->obj->last_slash_pos, + (UWord)bb->offset, bb_addr(bb)); + } + + /* apply config options from function name patterns + * given on command line */ + TG_(update_fn_config)(fn); + } + + bb->fn = fn; + bb->line = line_num; + + if (dirname[0]) { + TG_DEBUG(3, "- get_fn_node(BB %#lx): %s (in %s:%u)\n", bb_addr(bb), + fnname, filename, line_num); + } else + TG_DEBUG(3, "- get_fn_node(BB %#lx): %s (in %s/%s:%u)\n", bb_addr(bb), + fnname, dirname, filename, line_num); + + return fn; +} + +/*------------------------------------------------------------*/ +/*--- Active function array operations ---*/ +/*------------------------------------------------------------*/ + +/* The active function array is a thread-specific array + * of UInts, mapping function numbers to the active count of + * functions. + * The active count is the number of times a function appears + * in the current call stack, and is used when costs for recursion + * levels should be separated. 
+ */ + +UInt* TG_(get_fn_entry)(Int n) +{ + TG_ASSERT(n < current_fn_active.size); + return current_fn_active.array + n; +} + +void TG_(init_fn_array)(fn_array* a) +{ + Int i; + + TG_ASSERT(a != 0); + + a->size = N_INITIAL_FN_ARRAY_SIZE; + if (a->size <= TG_(stat).distinct_fns) + a->size = TG_(stat).distinct_fns + 1; + + a->array = (UInt*)TG_MALLOC("cl.fn.gfe.1", a->size * sizeof(UInt)); + for (i = 0; i < a->size; i++) + a->array[i] = 0; +} + +void TG_(copy_current_fn_array)(fn_array* dst) +{ + TG_ASSERT(dst != 0); + + dst->size = current_fn_active.size; + dst->array = current_fn_active.array; +} + +fn_array* TG_(get_current_fn_array)(void) { return ¤t_fn_active; } + +void TG_(set_current_fn_array)(fn_array* a) +{ + TG_ASSERT(a != 0); + + current_fn_active.size = a->size; + current_fn_active.array = a->array; + if (current_fn_active.size <= TG_(stat).distinct_fns) + resize_fn_array(); +} + +/* ensure that active_array is big enough: + * is the highest index, so + * has to be bigger than that. + */ +static void resize_fn_array(void) +{ + UInt* new_array; + Int i; + + UInt newsize = current_fn_active.size; + while (newsize <= TG_(stat).distinct_fns) + newsize *= 2; + + TG_DEBUG(0, "Resize fn_active_array: %u => %u\n", current_fn_active.size, + newsize); + + new_array = (UInt*)TG_MALLOC("cl.fn.rfa.1", newsize * sizeof(UInt)); + for (i = 0; i < current_fn_active.size; i++) + new_array[i] = current_fn_active.array[i]; + while (i < newsize) + new_array[i++] = 0; + + VG_(free)(current_fn_active.array); + current_fn_active.size = newsize; + current_fn_active.array = new_array; + TG_(stat).fn_array_resizes++; +} diff --git a/tracegrind/global.h b/tracegrind/global.h new file mode 100644 index 000000000..1c6196c52 --- /dev/null +++ b/tracegrind/global.h @@ -0,0 +1,828 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind data structures, functions. global.h ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Valgrind, a dynamic binary instrumentation + framework. + + Copyright (C) 2004-2017 Josef Weidendorfer + josef.weidendorfer@gmx.de + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. 
+*/ + +#ifndef TG_GLOBAL +#define TG_GLOBAL + +#include "pub_tool_basics.h" +#include "pub_tool_clientstate.h" +#include "pub_tool_debuginfo.h" +#include "pub_tool_libcassert.h" +#include "pub_tool_libcbase.h" +#include "pub_tool_libcfile.h" +#include "pub_tool_libcprint.h" +#include "pub_tool_libcproc.h" +#include "pub_tool_machine.h" +#include "pub_tool_machine.h" // VG_(fnptr_to_fnentry) +#include "pub_tool_mallocfree.h" +#include "pub_tool_options.h" +#include "pub_tool_tooliface.h" +#include "pub_tool_vki.h" +#include "pub_tool_vkiscnums.h" +#include "pub_tool_xarray.h" + +#include "costs.h" +#include "events.h" // defines TG_ macro + +/*------------------------------------------------------------*/ +/*--- Tracegrind compile options --- */ +/*------------------------------------------------------------*/ + +/* Enable debug output */ +#define TG_ENABLE_DEBUG 1 + +/* Enable experimental features? */ +#define TG_EXPERIMENTAL 0 + +/* Maximum depth of inline call stack tracking */ +#define TG_MAX_INL_DEPTH 16 + +/*------------------------------------------------------------*/ +/*--- Command line options ---*/ +/*------------------------------------------------------------*/ + +#define DEFAULT_OUTFORMAT "tracegrind.out.%p.msgpack.lz4" + +/* If and how to collect syscall time. + systime_no : do not collect systime + systime_msec : collect syscount, systime elapsed, milli second precision. + systime_usec : collect syscount, systime elapsed, micro second precision. + systime_nsec : collect syscount, systime elapsed, systime cpu, nano second + precision. */ +typedef enum { + systime_no, + systime_msec, + systime_usec, + systime_nsec +} Collect_Systime; + +/* Trace event types */ +typedef enum { + TG_EV_MARKER = 0, + TG_EV_ENTER_FN = 1, + TG_EV_EXIT_FN = 2, + TG_EV_ENTER_INLINED_FN = 3, + TG_EV_EXIT_INLINED_FN = 4, + TG_EV_FORK = 5, + TG_EV_THREAD_CREATE = 6 +} TraceEventType; + +typedef struct _CommandLineOptions CommandLineOptions; +struct _CommandLineOptions { + + /* Output options */ + const HChar* out_format; /* Format string for tracegrind output file name */ + + /* Collection options */ + Bool separate_threads; /* Separate threads in dump? */ + Int separate_callers; /* Separate dependent on how many callers? */ + Int separate_recursions; /* Max level of recursions to separate */ + Bool skip_plt; /* Skip functions in PLT section? */ + + Bool collect_atstart; /* Start in collecting state ? */ + Bool collect_jumps; /* Collect (cond.) jumps in functions ? */ + + Collect_Systime collect_systime; /* Collect time for system calls */ + + Bool collect_bus; /* Collect global bus events */ + + /* Instrument options */ + Bool instrument_atstart; /* Instrument at start? */ + Bool simulate_cache; /* Call into cache simulator ? */ + Bool simulate_branch; /* Call into branch prediction simulator ? 
*/ + + /* Call graph generation */ + Bool pop_on_jump; /* Handle a jump between functions as ret+call */ + +#if TG_ENABLE_DEBUG + Int verbose; + ULong verbose_start; +#endif +}; + +/*------------------------------------------------------------*/ +/*--- Constants ---*/ +/*------------------------------------------------------------*/ + +/* Minimum cache line size allowed */ +#define MIN_LINE_SIZE 16 + +/*------------------------------------------------------------*/ +/*--- Statistics ---*/ +/*------------------------------------------------------------*/ + +typedef struct _Statistics Statistics; +struct _Statistics { + ULong call_counter; + ULong jcnd_counter; + ULong jump_counter; + ULong rec_call_counter; + ULong ret_counter; + ULong bb_executions; + + Int context_counter; + Int bb_retranslations; + + Int distinct_objs; + Int distinct_files; + Int distinct_fns; + Int distinct_contexts; + Int distinct_bbs; + Int distinct_jccs; + Int distinct_bbccs; + Int distinct_instrs; + Int distinct_skips; + + Int bb_hash_resizes; + Int bbcc_hash_resizes; + Int jcc_hash_resizes; + Int cxt_hash_resizes; + Int fn_array_resizes; + Int call_stack_resizes; + Int fn_stack_resizes; + + Int full_debug_BBs; + Int file_line_debug_BBs; + Int fn_name_debug_BBs; + Int no_debug_BBs; + Int bbcc_lru_misses; + Int jcc_lru_misses; + Int cxt_lru_misses; + Int bbcc_clones; +}; + +/*------------------------------------------------------------*/ +/*--- Structure declarations ---*/ +/*------------------------------------------------------------*/ + +typedef struct _Context Context; +typedef struct _CC CC; +typedef struct _BB BB; +typedef struct _BBCC BBCC; +typedef struct _jCC jCC; +typedef struct _fn_node fn_node; +typedef struct _file_node file_node; +typedef struct _obj_node obj_node; +typedef struct _fn_config fn_config; +typedef struct _call_entry call_entry; +typedef struct _thread_info thread_info; + +/* Cost arrays: aliases to arrays of 64-bit event counters */ +typedef ULong* FullCost; + +/* The types of control flow changes that can happen between + * execution of two BBs in a thread. + */ +typedef enum { + jk_None = 0, /* no explicit change by a guest instruction */ + jk_Jump, /* regular jump */ + jk_Call, + jk_Return, + jk_CondJump /* conditional jump taken (only used as jCC type) */ +} TgJumpKind; + +/* JmpCall cost center + * for subroutine call (from->bb->jmp_addr => to->bb->addr) + * + * Each BB has at most one CALL instruction. The list of JCC from + * this call is a pointer to the list head (stored in BBCC), and + * in the JCC struct. + * + * For fast lookup, JCCs are reachable with a hash table, keyed by + * the (from_bbcc,to) pair. is used for the JCC chain + * of one hash table entry. + * + * Cost holds event counts for already returned executions. + * are the event counters at last enter of the subroutine. + * is updated on returning from the subroutine by + * adding the diff of and current event counters to . + * + * After updating, is set to current event counters. Thus, + * events are not counted twice for recursive calls (TODO: True?) + */ + +struct _jCC { + TgJumpKind jmpkind; /* jk_Call, jk_Jump, jk_CondJump */ + jCC* next_hash; /* for hash entry chain */ + jCC* next_from; /* next JCC from a BBCC */ + BBCC * from, *to; /* call arc from/to this BBCC */ + UInt jmp; /* jump no. in source */ + + ULong call_counter; /* no wraparound with 64 bit */ + + FullCost cost; /* simulator + user counters */ +}; + +/* + * Info for one instruction of a basic block. 
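+ *
+ * instr_offset/instr_size give the position and length of the instruction
+ * within the BB; cost_offset locates this instruction's counters inside the
+ * owning BBCC's cost array, and eventset names the events counted for it.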
+ */
+typedef struct _InstrInfo InstrInfo;
+struct _InstrInfo {
+  UInt instr_offset;
+  UInt instr_size;
+  UInt cost_offset;
+  EventSet* eventset;
+};
+
+/*
+ * Info for a side exit in a BB
+ */
+typedef struct _CJmpInfo CJmpInfo;
+struct _CJmpInfo {
+  UInt instr;         /* instruction index for BB.instr array */
+  TgJumpKind jmpkind; /* jump kind when leaving BB at this side exit */
+};
+
+/**
+ * An instrumented basic block (BB).
+ *
+ * BBs are put into a resizable hash to allow for fast detection if a
+ * BB is to be retranslated but cost info is already available.
+ * The key for a BB is an (object, offset) tuple, making it independent
+ * from possibly multiple mappings of the same ELF object.
+ *
+ * At the beginning of each instrumented BB,
+ * a call to setup_bbcc(), specifying a pointer to the
+ * corresponding BB structure, is added.
+ *
+ * As the cost of a BB has to be distinguished depending on the context,
+ * multiple cost centers for one BB (struct BBCC) exist and the
+ * corresponding BBCC is set by setup_bbcc.
+ */
+struct _BB {
+  obj_node* obj;   /* ELF object of BB */
+  PtrdiffT offset; /* offset of BB in ELF object file */
+  BB* next;        /* chaining for a hash entry */
+
+  VgSectKind sect_kind; /* section of this BB, e.g. PLT */
+  UInt instr_count;
+
+  /* filled by TG_(get_fn_node) if debug info is available */
+  fn_node* fn;   /* debug info for this BB */
+  UInt line;
+  Bool is_entry; /* True if this BB is a function entry */
+
+  BBCC* bbcc_list; /* BBCCs for same BB (see next_bbcc in BBCC) */
+  BBCC* last_bbcc; /* Temporary: Cached for faster access (LRU) */
+
+  /* filled by TG_(instrument) if not seen before */
+  UInt cjmp_count;    /* number of side exits */
+  CJmpInfo* jmp;      /* array of info for condition jumps,
+                       * allocated directly after this struct */
+  Bool cjmp_inverted; /* is last side exit actually fall through? */
+
+  const HChar**
+      inl_fns;    /* inlined fn names at BB start (outermost first), or NULL */
+  UInt inl_depth; /* number of entries in inl_fns */
+
+  UInt instr_len;
+  UInt cost_count;
+  InstrInfo instr[0]; /* info on instruction sizes and costs */
+};
+
+/**
+ * Function context
+ *
+ * Basic blocks are always executed in the scope of a context.
+ * A function context is a list of function nodes representing
+ * the call chain to the current context: I.e. fn[0] is the
+ * function we are currently in, fn[1] has called fn[0], and so on.
+ * Recursion levels are used for fn[0].
+ *
+ * To get a unique number for a full execution context, use
+ *   rec_index = min(<separate_recursions of fn[0]>, <recursion level>) - 1;
+ *   unique_no = <base_number> + rec_index
+ *
+ * For each Context, recursion index and BB, there can be a BBCC.
+ */
+struct _Context {
+  UInt size;        // number of function dependencies
+  UInt base_number; // for context compression & dump array
+  Context* next;    // entry chaining for hash
+  UWord hash;       // for faster lookup...
+  fn_node* fn[0];
+};
+
+/*
+ * Cost info for a side exit from a BB
+ */
+typedef struct _JmpData JmpData;
+struct _JmpData {
+  ULong ecounter; /* number of times the BB was left at this exit */
+  jCC* jcc_list;  /* JCCs used for this exit */
+};
+
+/*
+ * Basic Block Cost Center
+ *
+ * On demand, multiple BBCCs will be created for the same BB
+ * dependent on command line options and:
+ * - current function (it's possible that a BB is executed in the
+ *   context of different functions, e.g. in manual assembler/PLT)
+ * - current thread ID
+ * - position where current function is called from
+ * - recursion level of current function
+ *
+ * The cost centres for the instructions of a basic block are
+ * stored in a contiguous array.
+ * They are distinguishable by their tag field.
+ */
+struct _BBCC {
+  BB* bb; /* BB for this cost center */
+
+  Context* cxt;     /* execution context of this BBCC */
+  ThreadId tid;     /* only for assertion check purpose */
+  UInt rec_index;   /* Recursion index in rec->bbcc for this bbcc */
+  BBCC** rec_array; /* Variable sized array of pointers to
+                     * recursion BBCCs. Shared. */
+  BBCC* next_bbcc;     /* Chain of BBCCs for same BB */
+  BBCC* lru_next_bbcc; /* BBCC executed next the last time */
+
+  jCC* lru_from_jcc; /* Temporary: Cached for faster access (LRU) */
+  jCC* lru_to_jcc;   /* Temporary: Cached for faster access (LRU) */
+  FullCost skipped;  /* cost for skipped functions called from
+                      * jmp_addr. Allocated lazily */
+
+  BBCC* next;         /* entry chain in hash */
+  ULong* cost;        /* start of 64bit costs for this BBCC */
+  ULong ecounter_sum; /* execution counter for first instruction of BB */
+  JmpData jmp[0];
+};
+
+struct _fn_node {
+  HChar* name;
+  UInt name_len;
+  UInt number;
+  Context* last_cxt; /* LRU info */
+  Context* pure_cxt; /* the context with only the function itself */
+  file_node* file;   /* reverse mapping for 2nd hash */
+  fn_node* next;
+
+  Bool toggle_collect : 1;
+  Bool skip : 1;
+  Bool pop_on_jump : 1;
+
+  Int group;
+  Int separate_callers;
+  Int separate_recursions;
+#if TG_ENABLE_DEBUG
+  Int verbosity; /* Stores old verbosity level while in function */
+#endif
+};
+
+/* Quite arbitrary fixed hash sizes */
+
+#define N_OBJ_ENTRIES 47
+#define N_FILE_ENTRIES 53
+#define N_FN_ENTRIES 87
+
+struct _file_node {
+  HChar* name;
+  UInt name_len;
+  fn_node* fns[N_FN_ENTRIES];
+  obj_node* obj;
+  file_node* next;
+};
+
+/* If an object is dlopened multiple times, we hope that <name> is unique;
+ * <start> and <offset> can change with each dlopen, and <start> is
+ * zero when object is unmapped (possible at dump time).
+ */
+struct _obj_node {
+  const HChar* name;
+  UInt name_len;
+  UInt last_slash_pos;
+
+  Addr start;      /* Start address of text segment mapping */
+  SizeT size;      /* Length of mapping */
+  PtrdiffT offset; /* Offset between symbol address and file offset */
+
+  file_node* files[N_FILE_ENTRIES];
+  UInt number;
+  obj_node* next;
+};
+
+/* an entry in the callstack
+ *
+ * <nonskipped> is 0 if the function called is not skipped (usual case).
+ * Otherwise, it is the last non-skipped BBCC. This one gets all
+ * the calls to non-skipped functions and all costs in skipped
+ * instructions.
+ */
+struct _call_entry {
+  jCC* jcc;            /* jCC for this call */
+  FullCost enter_cost; /* cost event counters at entering frame */
+  Addr sp;             /* stack pointer directly after call */
+  Addr ret_addr;       /* address to return to;
+                        * is 0 on a simulated call */
+  BBCC* nonskipped;    /* see above */
+  Context* cxt;        /* context before call */
+  Int fn_sp;           /* function stack index before call */
+};
+
+/*
+ * Execution state of main thread or a running signal handler in
+ * a thread while interrupted by another signal handler.
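+ * (For example, the handler of a second signal that preempts a handler
+ * already running in the same thread.)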
+ * As there's no scheduling among running signal handlers of one thread, + * we only need a subset of a full thread state: + * - event counter + * - collect state + * - last BB, last jump kind, last nonskipped BB + * - callstack pointer for sanity checking and correct unwinding + * after exit + */ +typedef struct _exec_state exec_state; +struct _exec_state { + + /* the signum of the handler, 0 for main thread context + */ + Int sig; + + /* the old call stack pointer at entering the signal handler */ + Int orig_sp; + + FullCost cost; + Bool collect; + Context* cxt; + + /* number of conditional jumps passed in last BB */ + Int jmps_passed; + BBCC* bbcc; /* last BB executed */ + BBCC* nonskipped; + + Int call_stack_bottom; /* Index into fn_stack */ +}; + +/* Global state structures */ +typedef struct _bb_hash bb_hash; +struct _bb_hash { + UInt size, entries; + BB** table; +}; + +typedef struct _cxt_hash cxt_hash; +struct _cxt_hash { + UInt size, entries; + Context** table; +}; + +/* Thread specific state structures, i.e. parts of a thread state. + * There are variables for the current state of each part, + * on which a thread state is copied at thread switch. + */ +typedef struct _bbcc_hash bbcc_hash; +struct _bbcc_hash { + UInt size, entries; + BBCC** table; +}; + +typedef struct _jcc_hash jcc_hash; +struct _jcc_hash { + UInt size, entries; + jCC** table; + jCC* spontaneous; +}; + +typedef struct _fn_array fn_array; +struct _fn_array { + UInt size; + UInt* array; +}; + +typedef struct _call_stack call_stack; +struct _call_stack { + UInt size; + Int sp; + call_entry* entry; +}; + +typedef struct _fn_stack fn_stack; +struct _fn_stack { + UInt size; + fn_node **bottom, **top; +}; + +/* The maximum number of simultaneous running signal handlers per thread. + * This is the number of execution states storable in a thread. + */ +#define MAX_SIGHANDLERS 10 + +typedef struct _exec_stack exec_stack; +struct _exec_stack { + Int sp; /* > 0 if a handler is running */ + exec_state* entry[MAX_SIGHANDLERS]; +}; + +/* Thread State + * + * This structure stores thread specific info while a thread is *not* + * running. See function switch_thread() for save/restore on thread switch. + * + * If --separate-threads=no, BBCCs and JCCs can be shared by all threads, i.e. + * only structures of thread 1 are used. + * This involves variables fn_info_table, bbcc_table and jcc_table. 
+ */ +struct _thread_info { + + /* state */ + fn_stack fns; /* function stack */ + call_stack calls; /* context call arc stack */ + exec_stack states; /* execution states interrupted by signals */ + + /* cost tracking */ + FullCost lastdump_cost; /* Cost at last total cost computation */ + + /* CSV trace: per-thread snapshot of cost at last sample emission */ + FullCost last_sample_cost; + + /* Inline tracking: current inline call stack (outermost first) */ + const HChar* cur_inl_fns[TG_MAX_INL_DEPTH]; + UInt cur_inl_depth; + + /* thread specific data structure containers */ + fn_array fn_active; + jcc_hash jccs; + bbcc_hash bbccs; +}; + +/*------------------------------------------------------------*/ +/*--- Cache simulator interface ---*/ +/*------------------------------------------------------------*/ + +struct cachesim_if { + void (*print_opts)(void); + Bool (*parse_opt)(const HChar* arg); + void (*post_clo_init)(void); + void (*clear)(void); + void (*printstat)(Int, Int, Int); + void (*finish)(void); + + void (*log_1I0D)(InstrInfo*) VG_REGPARM(1); + void (*log_2I0D)(InstrInfo*, InstrInfo*) VG_REGPARM(2); + void (*log_3I0D)(InstrInfo*, InstrInfo*, InstrInfo*) VG_REGPARM(3); + + void (*log_1I1Dr)(InstrInfo*, Addr, Word) VG_REGPARM(3); + void (*log_1I1Dw)(InstrInfo*, Addr, Word) VG_REGPARM(3); + + void (*log_0I1Dr)(InstrInfo*, Addr, Word) VG_REGPARM(3); + void (*log_0I1Dw)(InstrInfo*, Addr, Word) VG_REGPARM(3); + + // function names of helpers (for debugging generated code) + const HChar *log_1I0D_name, *log_2I0D_name, *log_3I0D_name; + const HChar *log_1I1Dr_name, *log_1I1Dw_name; + const HChar *log_0I1Dr_name, *log_0I1Dw_name; +}; + +// Event groups +#define EG_USE 0 +#define EG_IR 1 +#define EG_DR 2 +#define EG_DW 3 +#define EG_BC 4 +#define EG_BI 5 +#define EG_BUS 6 +#define EG_SYS 7 + +struct event_sets { + EventSet *base, *full; +}; + +#define fullOffset(group) (TG_(sets).full->offset[group]) + +/*------------------------------------------------------------*/ +/*--- Trace output state ---*/ +/*------------------------------------------------------------*/ + +typedef struct { + Int fd; /* Output file descriptor (-1 if not open) */ + ULong seq; /* Global sequence counter */ + Bool initialized; /* Has the output been opened? */ + Bool header_written; /* Has the schema chunk been written? */ +} trace_output; + +/*------------------------------------------------------------*/ +/*--- Functions ---*/ +/*------------------------------------------------------------*/ + +/* from clo.c */ + +void TG_(set_clo_defaults)(void); +void TG_(update_fn_config)(fn_node*); +Bool TG_(process_cmd_line_option)(const HChar*); +void TG_(print_usage)(void); +void TG_(print_debug_usage)(void); + +/* from sim.c */ +void TG_(init_eventsets)(void); + +/* from main.c */ +Bool TG_(get_debug_info)(Addr, + const HChar** dirname, + const HChar** filename, + const HChar** fn_name, + UInt*, + DebugInfo**); +void TG_(collectBlockInfo)(IRSB* bbIn, UInt*, UInt*, Bool*); +void TG_(set_instrument_state)(const HChar*, Bool); +void TG_(compute_total_cost)(void); +void TG_(fini)(Int exitcode); + +/* from bb.c */ +void TG_(init_bb_hash)(void); +bb_hash* TG_(get_bb_hash)(void); +BB* TG_(get_bb)(Addr addr, IRSB* bb_in, Bool* seen_before); +void TG_(delete_bb)(Addr addr); + +static __inline__ Addr bb_addr(BB* bb) { return bb->offset + bb->obj->offset; } +static __inline__ Addr bb_jmpaddr(BB* bb) +{ + UInt off = + (bb->instr_count > 0) ? 
bb->instr[bb->instr_count - 1].instr_offset : 0; + return off + bb->offset + bb->obj->offset; +} + +/* from fn.c */ +void TG_(init_fn_array)(fn_array*); +void TG_(copy_current_fn_array)(fn_array* dst); +fn_array* TG_(get_current_fn_array)(void); +void TG_(set_current_fn_array)(fn_array*); +UInt* TG_(get_fn_entry)(Int n); + +void TG_(init_obj_table)(void); +obj_node* TG_(get_obj_node)(DebugInfo* si); +file_node* + TG_(get_file_node)(obj_node*, const HChar* dirname, const HChar* filename); +fn_node* TG_(get_fn_node)(BB* bb); + +/* from bbcc.c */ +void TG_(init_bbcc_hash)(bbcc_hash* bbccs); +void TG_(copy_current_bbcc_hash)(bbcc_hash* dst); +bbcc_hash* TG_(get_current_bbcc_hash)(void); +void TG_(set_current_bbcc_hash)(bbcc_hash*); +BBCC* TG_(get_bbcc)(BB* bb); +void TG_(setup_bbcc)(BB* bb) VG_REGPARM(1); + +/* from jumps.c */ +void TG_(init_jcc_hash)(jcc_hash*); +void TG_(copy_current_jcc_hash)(jcc_hash* dst); +void TG_(set_current_jcc_hash)(jcc_hash*); +jCC* TG_(get_jcc)(BBCC* from, UInt, BBCC* to); + +/* from callstack.c */ +void TG_(init_call_stack)(call_stack*); +void TG_(copy_current_call_stack)(call_stack* dst); +void TG_(set_current_call_stack)(call_stack*); +call_entry* TG_(get_call_entry)(Int n); + +void TG_(push_call_stack)(BBCC* from, UInt jmp, BBCC* to, Addr sp, Bool skip); +void TG_(pop_call_stack)(void); +Int TG_(unwind_call_stack)(Addr sp, Int); + +/* from context.c */ +void TG_(init_fn_stack)(fn_stack*); +void TG_(copy_current_fn_stack)(fn_stack*); +void TG_(set_current_fn_stack)(fn_stack*); + +void TG_(init_cxt_table)(void); +Context* TG_(get_cxt)(fn_node** fn); +void TG_(push_cxt)(fn_node* fn); + +/* from threads.c */ +void TG_(init_threads)(void); +thread_info** TG_(get_threads)(void); +thread_info* TG_(get_current_thread)(void); +void TG_(switch_thread)(ThreadId tid); +void TG_(forall_threads)(void (*func)(thread_info*)); +void TG_(run_thread)(ThreadId tid); + +void TG_(init_exec_state)(exec_state* es); +void TG_(init_exec_stack)(exec_stack*); +void TG_(copy_current_exec_stack)(exec_stack*); +void TG_(set_current_exec_stack)(exec_stack*); +void TG_(pre_signal)(ThreadId tid, Int sigNum, Bool alt_stack); +void TG_(post_signal)(ThreadId tid, Int sigNum); +void TG_(run_post_signal_on_call_stack_bottom)(void); + +/* from dump.c */ + +/* Trace output (from dump.c) */ +void TG_(trace_open_output)(void); +void TG_(trace_reopen_child)(void); +void TG_(trace_emit_sample)(ThreadId tid, Bool is_enter, fn_node* fn); +void TG_(trace_emit_enter_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn); +void TG_(trace_emit_exit_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn); +void TG_(trace_emit_fork)(ThreadId tid, Int child_pid); +void TG_(trace_emit_thread_create)(ThreadId tid, ThreadId child); +void TG_(trace_emit_marker)(ThreadId tid, const HChar* marker); +void TG_(trace_close_output)(void); + +/*------------------------------------------------------------*/ +/*--- Exported global variables ---*/ +/*------------------------------------------------------------*/ + +extern CommandLineOptions TG_(clo); +extern Statistics TG_(stat); +extern EventMapping* TG_(dumpmap); +extern trace_output TG_(trace_out); + +/* Function active counter array, indexed by function number */ +extern UInt* TG_(fn_active_array); +extern Bool TG_(instrument_state); +/* min of L1 and LL cache line sizes */ +extern Int TG_(min_line_size); +extern call_stack TG_(current_call_stack); +extern fn_stack TG_(current_fn_stack); +extern exec_state TG_(current_state); +extern ThreadId TG_(current_tid); +extern FullCost 
TG_(total_cost); +extern struct cachesim_if TG_(cachesim); +extern struct event_sets TG_(sets); + +// set by setup_bbcc at start of every BB, and needed by log_* helpers +extern Addr TG_(bb_base); +extern ULong* TG_(cost_base); + +/*------------------------------------------------------------*/ +/*--- Debug output ---*/ +/*------------------------------------------------------------*/ + +#if TG_ENABLE_DEBUG + +#define TG_DEBUGIF(x) \ + if (UNLIKELY((TG_(clo).verbose > x) && \ + (TG_(stat).bb_executions >= TG_(clo).verbose_start))) + +#define TG_DEBUG(x, format, args...) \ + TG_DEBUGIF(x) \ + { \ + TG_(print_bbno)(); \ + VG_(printf)(format, ##args); \ + } + +#define TG_ASSERT(cond) \ + if (UNLIKELY(!(cond))) { \ + TG_(print_context)(); \ + TG_(print_bbno)(); \ + tl_assert(cond); \ + } + +#else +#define TG_DEBUGIF(x) if (0) +#define TG_DEBUG(x...) \ + { \ + } +#define TG_ASSERT(cond) tl_assert(cond); +#endif + +/* from debug.c */ +void TG_(print_bbno)(void); +void TG_(print_context)(void); +void TG_(print_jcc)(int s, jCC* jcc); +void TG_(print_bbcc)(int s, BBCC* bbcc); +void TG_(print_bbcc_fn)(BBCC* bbcc); +void TG_(print_execstate)(int s, exec_state* es); +void TG_(print_eventset)(int s, EventSet* es); +void TG_(print_cost)(int s, EventSet*, ULong* cost); +void TG_(print_bb)(int s, BB* bb); +void TG_(print_bbcc_cost)(int s, BBCC*); +void TG_(print_cxt)(int s, Context* cxt, int rec_index); +void TG_(print_short_jcc)(jCC* jcc); +void TG_(print_stackentry)(int s, int sp); +void TG_(print_addr)(Addr addr); +void TG_(print_addr_ln)(Addr addr); + +void* TG_(malloc)(const HChar* cc, UWord s, const HChar* f); +void* TG_(free)(void* p, const HChar* f); +#if 0 +#define TG_MALLOC(_cc, x) TG_(malloc)((_cc), x, __FUNCTION__) +#define TG_FREE(p) TG_(free)(p, __FUNCTION__) +#else +#define TG_MALLOC(_cc, x) VG_(malloc)((_cc), x) +#define TG_FREE(p) VG_(free)(p) +#endif + +#endif /* TG_GLOBAL */ diff --git a/tracegrind/jumps.c b/tracegrind/jumps.c new file mode 100644 index 000000000..f25d062cb --- /dev/null +++ b/tracegrind/jumps.c @@ -0,0 +1,219 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- ct_jumps.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. 
+*/ + +#include "global.h" + +/*------------------------------------------------------------*/ +/*--- Jump Cost Center (JCC) operations, including Calls ---*/ +/*------------------------------------------------------------*/ + +#define N_JCC_INITIAL_ENTRIES 4437 + +static jcc_hash current_jccs; + +void TG_(init_jcc_hash)(jcc_hash* jccs) +{ + Int i; + + TG_ASSERT(jccs != 0); + + jccs->size = N_JCC_INITIAL_ENTRIES; + jccs->entries = 0; + jccs->table = (jCC**)TG_MALLOC("cl.jumps.ijh.1", jccs->size * sizeof(jCC*)); + jccs->spontaneous = 0; + + for (i = 0; i < jccs->size; i++) + jccs->table[i] = 0; +} + +void TG_(copy_current_jcc_hash)(jcc_hash* dst) +{ + TG_ASSERT(dst != 0); + + dst->size = current_jccs.size; + dst->entries = current_jccs.entries; + dst->table = current_jccs.table; + dst->spontaneous = current_jccs.spontaneous; +} + +void TG_(set_current_jcc_hash)(jcc_hash* h) +{ + TG_ASSERT(h != 0); + + current_jccs.size = h->size; + current_jccs.entries = h->entries; + current_jccs.table = h->table; + current_jccs.spontaneous = h->spontaneous; +} + +__inline__ static UInt jcc_hash_idx(BBCC* from, UInt jmp, BBCC* to, UInt size) +{ + return (UInt)((UWord)from + 7 * (UWord)to + 13 * jmp) % size; +} + +/* double size of jcc table */ +static void resize_jcc_table(void) +{ + Int i, new_size, conflicts1 = 0, conflicts2 = 0; + jCC** new_table; + UInt new_idx; + jCC * curr_jcc, *next_jcc; + + new_size = 2 * current_jccs.size + 3; + new_table = (jCC**)TG_MALLOC("cl.jumps.rjt.1", new_size * sizeof(jCC*)); + + for (i = 0; i < new_size; i++) + new_table[i] = NULL; + + for (i = 0; i < current_jccs.size; i++) { + if (current_jccs.table[i] == NULL) + continue; + + curr_jcc = current_jccs.table[i]; + while (NULL != curr_jcc) { + next_jcc = curr_jcc->next_hash; + + new_idx = + jcc_hash_idx(curr_jcc->from, curr_jcc->jmp, curr_jcc->to, new_size); + + curr_jcc->next_hash = new_table[new_idx]; + new_table[new_idx] = curr_jcc; + if (curr_jcc->next_hash) { + conflicts1++; + if (curr_jcc->next_hash->next_hash) + conflicts2++; + } + + curr_jcc = next_jcc; + } + } + + VG_(free)(current_jccs.table); + + TG_DEBUG(0, "Resize JCC Hash: %u => %d (entries %u, conflicts %d/%d)\n", + current_jccs.size, new_size, current_jccs.entries, conflicts1, + conflicts2); + + current_jccs.size = new_size; + current_jccs.table = new_table; + TG_(stat).jcc_hash_resizes++; +} + +/* new jCC structure: a call was done to a BB of a BBCC + * for a spontaneous call, from is 0 (i.e. caller unknown) + */ +static jCC* new_jcc(BBCC* from, UInt jmp, BBCC* to) +{ + jCC* jcc; + UInt new_idx; + + /* check fill degree of jcc hash table and resize if needed (>80%) */ + current_jccs.entries++; + if (10 * current_jccs.entries / current_jccs.size > 8) + resize_jcc_table(); + + jcc = (jCC*)TG_MALLOC("cl.jumps.nj.1", sizeof(jCC)); + + jcc->from = from; + jcc->jmp = jmp; + jcc->to = to; + jcc->jmpkind = jk_Call; + jcc->call_counter = 0; + jcc->cost = 0; + + /* insert into JCC chain of calling BBCC. 
+ * This list is only used at dumping time */ + + if (from) { + /* Prohibit corruption by array overrun */ + TG_ASSERT(jmp <= from->bb->cjmp_count); + jcc->next_from = from->jmp[jmp].jcc_list; + from->jmp[jmp].jcc_list = jcc; + } else { + jcc->next_from = current_jccs.spontaneous; + current_jccs.spontaneous = jcc; + } + + /* insert into JCC hash table */ + new_idx = jcc_hash_idx(from, jmp, to, current_jccs.size); + jcc->next_hash = current_jccs.table[new_idx]; + current_jccs.table[new_idx] = jcc; + + TG_(stat).distinct_jccs++; + + TG_DEBUGIF(3) + { + VG_(printf)(" new_jcc (now %d): %p\n", TG_(stat).distinct_jccs, jcc); + } + + return jcc; +} + +/* get the jCC for a call arc (BBCC->BBCC) */ +jCC* TG_(get_jcc)(BBCC* from, UInt jmp, BBCC* to) +{ + jCC* jcc; + UInt idx; + + TG_DEBUG(5, "+ get_jcc(bbcc %p/%u => bbcc %p)\n", from, jmp, to); + + /* first check last recently used JCC */ + jcc = to->lru_to_jcc; + if (jcc && (jcc->from == from) && (jcc->jmp == jmp)) { + TG_ASSERT(to == jcc->to); + TG_DEBUG(5, "- get_jcc: [LRU to] jcc %p\n", jcc); + return jcc; + } + + jcc = from->lru_from_jcc; + if (jcc && (jcc->to == to) && (jcc->jmp == jmp)) { + TG_ASSERT(from == jcc->from); + TG_DEBUG(5, "- get_jcc: [LRU from] jcc %p\n", jcc); + return jcc; + } + + TG_(stat).jcc_lru_misses++; + + idx = jcc_hash_idx(from, jmp, to, current_jccs.size); + jcc = current_jccs.table[idx]; + + while (jcc) { + if ((jcc->from == from) && (jcc->jmp == jmp) && (jcc->to == to)) + break; + jcc = jcc->next_hash; + } + + if (!jcc) + jcc = new_jcc(from, jmp, to); + + /* set LRU */ + from->lru_from_jcc = jcc; + to->lru_to_jcc = jcc; + + TG_DEBUG(5, "- get_jcc(bbcc %p => bbcc %p)\n", from, to); + + return jcc; +} diff --git a/tracegrind/lz4.c b/tracegrind/lz4.c new file mode 100644 index 000000000..e0af37e2b --- /dev/null +++ b/tracegrind/lz4.c @@ -0,0 +1,3417 @@ +/* + LZ4 - Fast LZ compression algorithm + Copyright (c) Yann Collet. All rights reserved. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + You can contact the author at : + - LZ4 homepage : http://www.lz4.org + - LZ4 source repository : https://github.com/lz4/lz4 +*/ + +/*-************************************ + * Tuning parameters + **************************************/ +/* + * LZ4_HEAPMODE : + * Select how stateless compression functions like `LZ4_compress_default()` + * allocate memory for their hash table, + * in memory stack (0:default, fastest), or in memory heap (1:requires + * malloc()). + */ +#ifndef LZ4_HEAPMODE +#define LZ4_HEAPMODE 0 +#endif + +/* + * LZ4_ACCELERATION_DEFAULT : + * Select "acceleration" for LZ4_compress_fast() when parameter value <= 0 + */ +#define LZ4_ACCELERATION_DEFAULT 1 +/* + * LZ4_ACCELERATION_MAX : + * Any "acceleration" value higher than this threshold + * get treated as LZ4_ACCELERATION_MAX instead (fix #876) + */ +#define LZ4_ACCELERATION_MAX 65537 + +/*-************************************ + * CPU Feature Detection + **************************************/ +/* LZ4_FORCE_MEMORY_ACCESS + * By default, access to unaligned memory is controlled by `memcpy()`, which is + * safe and portable. Unfortunately, on some target/compiler combinations, the + * generated assembly is sub-optimal. The below switch allow to select different + * access method for improved performance. Method 0 (default) : use `memcpy()`. + * Safe and portable. Method 1 : `__packed` statement. It depends on compiler + * extension (ie, not portable). This method is safe if your compiler supports + * it, and *generally* as fast or faster than `memcpy`. Method 2 : direct + * access. This method is portable but violate C standard. It can generate buggy + * code on targets which assembly generation depends on alignment. But in some + * circumstances, it's the only known way to get the most performance (ie GCC + + * ARMv6) See + * https://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html + * for details. 
Prefer these methods in priority order (0 > 1 > 2) + */ +#ifndef LZ4_FORCE_MEMORY_ACCESS /* can be defined externally */ +#if defined(__GNUC__) && \ + (defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ + defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || \ + defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) || \ + (defined(__riscv) && defined(__riscv_zicclsm))) +#define LZ4_FORCE_MEMORY_ACCESS 2 +#elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || defined(__GNUC__) || \ + defined(_MSC_VER) +#define LZ4_FORCE_MEMORY_ACCESS 1 +#endif +#endif + +/* + * LZ4_FORCE_SW_BITCOUNT + * Define this parameter if your target system or compiler does not support + * hardware bit count + */ +#if defined(_MSC_VER) && \ + defined(_WIN32_WCE) /* Visual Studio for WinCE doesn't support Hardware bit \ + count */ +#undef LZ4_FORCE_SW_BITCOUNT /* avoid double def */ +#define LZ4_FORCE_SW_BITCOUNT +#endif + +/*-************************************ + * Dependency + **************************************/ +/* + * LZ4_SRC_INCLUDED: + * Amalgamation flag, whether lz4.c is included + */ +#ifndef LZ4_SRC_INCLUDED +#define LZ4_SRC_INCLUDED 1 +#endif + +#ifndef LZ4_DISABLE_DEPRECATE_WARNINGS +#define LZ4_DISABLE_DEPRECATE_WARNINGS /* due to \ + LZ4_decompress_safe_withPrefix64k */ +#endif + +#ifndef LZ4_STATIC_LINKING_ONLY +#define LZ4_STATIC_LINKING_ONLY +#endif +#include "lz4.h" +/* see also "memory routines" below */ + +/*-************************************ + * Compiler Options + **************************************/ +#if defined(_MSC_VER) && (_MSC_VER >= 1400) /* Visual Studio 2005+ */ +#include /* only present in VS2005+ */ +#pragma warning( \ + disable : 4127) /* disable: C4127: conditional expression is constant */ +#pragma warning( \ + disable : 6237) /* disable: C6237: conditional expression is always 0 */ +#pragma warning( \ + disable : 6239) /* disable: C6239: ( && ) \ + always evaluates to the result of */ +#pragma warning( \ + disable : 6240) /* disable: C6240: ( && ) \ + always evaluates to the result of */ +#pragma warning(disable : 6326) /* disable: C6326: Potential comparison of a \ + constant with another constant */ +#endif /* _MSC_VER */ + +#ifndef LZ4_FORCE_INLINE +#if defined(_MSC_VER) && !defined(__clang__) /* MSVC */ +#define LZ4_FORCE_INLINE static __forceinline +#else +#if defined(__cplusplus) || \ + defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +#if defined(__GNUC__) || defined(__clang__) +#define LZ4_FORCE_INLINE static inline __attribute__((always_inline)) +#else +#define LZ4_FORCE_INLINE static inline +#endif +#else +#define LZ4_FORCE_INLINE static +#endif /* __STDC_VERSION__ */ +#endif /* _MSC_VER */ +#endif /* LZ4_FORCE_INLINE */ + +/* LZ4_FORCE_O2 and LZ4_FORCE_INLINE + * gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy8, + * together with a simple 8-byte copy loop as a fall-back path. + * However, this optimization hurts the decompression speed by >30%, + * because the execution does not go to the optimized loop + * for typical compressible data, and all of the preamble checks + * before going to the fall-back path become useless overhead. + * This optimization happens only with the -O3 flag, and -O2 generates + * a simple 8-byte copy loop. + * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy8 + * functions are annotated with __attribute__((optimize("O2"))), + * and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute + * of LZ4_wildCopy8 does not affect the compression speed. 
+ */ +#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && \ + !defined(__clang__) +#define LZ4_FORCE_O2 __attribute__((optimize("O2"))) +#undef LZ4_FORCE_INLINE +#define LZ4_FORCE_INLINE \ + static __inline __attribute__((optimize("O2"), always_inline)) +#else +#define LZ4_FORCE_O2 +#endif + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) || \ + (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || \ + defined(__clang__) +#define expect(expr, value) (__builtin_expect((expr), (value))) +#else +#define expect(expr, value) (expr) +#endif + +#ifndef likely +#define likely(expr) expect((expr) != 0, 1) +#endif +#ifndef unlikely +#define unlikely(expr) expect((expr) != 0, 0) +#endif + +/* Should the alignment test prove unreliable, for some reason, + * it can be disabled by setting LZ4_ALIGN_TEST to 0 */ +#ifndef LZ4_ALIGN_TEST /* can be externally provided */ +#define LZ4_ALIGN_TEST 1 +#endif + +/*-************************************ + * Memory routines + **************************************/ + +/*! LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION : + * Disable relatively high-level LZ4/HC functions that use dynamic memory + * allocation functions (malloc(), calloc(), free()). + * + * Note that this is a compile-time switch. And since it disables + * public/stable LZ4 v1 API functions, we don't recommend using this + * symbol to generate a library for distribution. + * + * The following public functions are removed when this symbol is defined. + * - lz4 : LZ4_createStream, LZ4_freeStream, + * LZ4_createStreamDecode, LZ4_freeStreamDecode, LZ4_create + * (deprecated) + * - lz4hc : LZ4_createStreamHC, LZ4_freeStreamHC, + * LZ4_createHC (deprecated), LZ4_freeHC (deprecated) + * - lz4frame, lz4file : All LZ4F_* functions + */ +#if defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +#define ALLOC(s) lz4_error_memory_allocation_is_disabled +#define ALLOC_AND_ZERO(s) lz4_error_memory_allocation_is_disabled +#define FREEMEM(p) lz4_error_memory_allocation_is_disabled +#elif defined(LZ4_USER_MEMORY_FUNCTIONS) +/* memory management functions can be customized by user project. 
+ * Below functions must exist somewhere in the Project + * and be available at link time */ +void* LZ4_malloc(size_t s); +void* LZ4_calloc(size_t n, size_t s); +void LZ4_free(void* p); +#define ALLOC(s) LZ4_malloc(s) +#define ALLOC_AND_ZERO(s) LZ4_calloc(1, s) +#define FREEMEM(p) LZ4_free(p) +#else +#include /* malloc, calloc, free */ +#define ALLOC(s) malloc(s) +#define ALLOC_AND_ZERO(s) calloc(1, s) +#define FREEMEM(p) free(p) +#endif + +#if !LZ4_FREESTANDING +#include /* memset, memcpy */ +#endif +#if !defined(LZ4_memset) +#define LZ4_memset(p, v, s) memset((p), (v), (s)) +#endif +#define MEM_INIT(p, v, s) LZ4_memset((p), (v), (s)) + +/*-************************************ + * Common Constants + **************************************/ +#define MINMATCH 4 + +#define WILDCOPYLENGTH 8 +#define LASTLITERALS \ + 5 /* see ../doc/lz4_Block_format.md#parsing-restrictions \ + */ +#define MFLIMIT 12 /* see ../doc/lz4_Block_format.md#parsing-restrictions */ +#define MATCH_SAFEGUARD_DISTANCE \ + ((2 * WILDCOPYLENGTH) - \ + MINMATCH) /* ensure it's possible to write 2 x wildcopyLength without \ + overflowing output buffer */ +#define FASTLOOP_SAFE_DISTANCE 64 +static const int LZ4_minLength = (MFLIMIT + 1); + +#define KB *(1 << 10) +#define MB *(1 << 20) +#define GB *(1U << 30) + +#define LZ4_DISTANCE_ABSOLUTE_MAX 65535 +#if (LZ4_DISTANCE_MAX > \ + LZ4_DISTANCE_ABSOLUTE_MAX) /* max supported by LZ4 format */ +#error "LZ4_DISTANCE_MAX is too big : must be <= 65535" +#endif + +#define ML_BITS 4 +#define ML_MASK ((1U << ML_BITS) - 1) +#define RUN_BITS (8 - ML_BITS) +#define RUN_MASK ((1U << RUN_BITS) - 1) + +/*-************************************ + * Error detection + **************************************/ +#if defined(LZ4_DEBUG) && (LZ4_DEBUG >= 1) +#include +#else +#ifndef assert +#define assert(condition) ((void)0) +#endif +#endif + +#define LZ4_STATIC_ASSERT(c) \ + { \ + enum { LZ4_static_assert = 1 / (int)(!!(c)) }; \ + } /* use after variable declarations */ + +#if defined(LZ4_DEBUG) && (LZ4_DEBUG >= 2) +#include +static int g_debuglog_enable = 1; +#define DEBUGLOG(l, ...) \ + { \ + if ((g_debuglog_enable) && (l <= LZ4_DEBUG)) { \ + fprintf(stderr, __FILE__ " %i: ", __LINE__); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, " \n"); \ + } \ + } +#else +#define DEBUGLOG(l, ...) 
\ + { \ + } /* disabled */ +#endif + +static int LZ4_isAligned(const void* ptr, size_t alignment) +{ + return ((size_t)ptr & (alignment - 1)) == 0; +} + +/*-************************************ + * Types + **************************************/ +#include +#if defined(__cplusplus) || \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +#include +typedef unsigned char + BYTE; /*uint8_t not necessarily blessed to alias arbitrary type*/ +typedef uint16_t U16; +typedef uint32_t U32; +typedef int32_t S32; +typedef uint64_t U64; +typedef uintptr_t uptrval; +#else +#if UINT_MAX != 4294967295UL +#error "LZ4 code (when not C++ or C99) assumes that sizeof(int) == 4" +#endif +typedef unsigned char BYTE; +typedef unsigned short U16; +typedef unsigned int U32; +typedef signed int S32; +typedef unsigned long long U64; +typedef size_t uptrval; /* generally true, except OpenVMS-64 */ +#endif + +#if defined(__x86_64__) +typedef U64 reg_t; /* 64-bits in x32 mode */ +#else +typedef size_t reg_t; /* 32-bits in x32 mode */ +#endif + +typedef enum { + notLimited = 0, + limitedOutput = 1, + fillOutput = 2 +} limitedOutput_directive; + +/*-************************************ + * Reading and writing into memory + **************************************/ + +/** + * LZ4 relies on memcpy with a constant size being inlined. In freestanding + * environments, the compiler can't assume the implementation of memcpy() is + * standard compliant, so it can't apply its specialized memcpy() inlining + * logic. When possible, use __builtin_memcpy() to tell the compiler to analyze + * memcpy() as if it were standard compliant, so it can inline it in + * freestanding environments. This is needed when decompressing the Linux + * Kernel, for example. + */ +#if !defined(LZ4_memcpy) +#if defined(__GNUC__) && (__GNUC__ >= 4) +#define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size) +#else +#define LZ4_memcpy(dst, src, size) memcpy(dst, src, size) +#endif +#endif + +#if !defined(LZ4_memmove) +#if defined(__GNUC__) && (__GNUC__ >= 4) +#define LZ4_memmove __builtin_memmove +#else +#define LZ4_memmove memmove +#endif +#endif + +static unsigned LZ4_isLittleEndian(void) +{ + const union { + U32 u; + BYTE c[4]; + } one = {1}; /* don't use static : performance detrimental */ + return one.c[0]; +} + +#if defined(__GNUC__) || defined(__INTEL_COMPILER) +#define LZ4_PACK(__Declaration__) __Declaration__ __attribute__((__packed__)) +#elif defined(_MSC_VER) +#define LZ4_PACK(__Declaration__) \ + __pragma(pack(push, 1)) __Declaration__ __pragma(pack(pop)) +#endif + +#if defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS == 2) +/* lie to the compiler about data alignment; use with caution */ + +static U16 LZ4_read16(const void* memPtr) { return *(const U16*)memPtr; } +static U32 LZ4_read32(const void* memPtr) { return *(const U32*)memPtr; } +static reg_t LZ4_read_ARCH(const void* memPtr) { return *(const reg_t*)memPtr; } + +static void LZ4_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } +static void LZ4_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } + +#elif defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS == 1) + +/* __pack instructions are safer, but compiler specific, hence potentially + * problematic for some compilers */ +/* currently only defined for gcc and icc */ +LZ4_PACK(typedef struct { U16 u16; }) LZ4_unalign16; +LZ4_PACK(typedef struct { U32 u32; }) LZ4_unalign32; +LZ4_PACK(typedef struct { reg_t uArch; }) LZ4_unalignST; + +static U16 LZ4_read16(const void* 
ptr) +{ + return ((const LZ4_unalign16*)ptr)->u16; +} +static U32 LZ4_read32(const void* ptr) +{ + return ((const LZ4_unalign32*)ptr)->u32; +} +static reg_t LZ4_read_ARCH(const void* ptr) +{ + return ((const LZ4_unalignST*)ptr)->uArch; +} + +static void LZ4_write16(void* memPtr, U16 value) +{ + ((LZ4_unalign16*)memPtr)->u16 = value; +} +static void LZ4_write32(void* memPtr, U32 value) +{ + ((LZ4_unalign32*)memPtr)->u32 = value; +} + +#else /* safe and portable access using memcpy() */ + +static U16 LZ4_read16(const void* memPtr) +{ + U16 val; + LZ4_memcpy(&val, memPtr, sizeof(val)); + return val; +} + +static U32 LZ4_read32(const void* memPtr) +{ + U32 val; + LZ4_memcpy(&val, memPtr, sizeof(val)); + return val; +} + +static reg_t LZ4_read_ARCH(const void* memPtr) +{ + reg_t val; + LZ4_memcpy(&val, memPtr, sizeof(val)); + return val; +} + +static void LZ4_write16(void* memPtr, U16 value) +{ + LZ4_memcpy(memPtr, &value, sizeof(value)); +} + +static void LZ4_write32(void* memPtr, U32 value) +{ + LZ4_memcpy(memPtr, &value, sizeof(value)); +} + +#endif /* LZ4_FORCE_MEMORY_ACCESS */ + +static U16 LZ4_readLE16(const void* memPtr) +{ + if (LZ4_isLittleEndian()) { + return LZ4_read16(memPtr); + } else { + const BYTE* p = (const BYTE*)memPtr; + return (U16)((U16)p[0] | (p[1] << 8)); + } +} + +#ifdef LZ4_STATIC_LINKING_ONLY_ENDIANNESS_INDEPENDENT_OUTPUT +static U32 LZ4_readLE32(const void* memPtr) +{ + if (LZ4_isLittleEndian()) { + return LZ4_read32(memPtr); + } else { + const BYTE* p = (const BYTE*)memPtr; + return (U32)p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24); + } +} +#endif + +static void LZ4_writeLE16(void* memPtr, U16 value) +{ + if (LZ4_isLittleEndian()) { + LZ4_write16(memPtr, value); + } else { + BYTE* p = (BYTE*)memPtr; + p[0] = (BYTE)value; + p[1] = (BYTE)(value >> 8); + } +} + +/* customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd + */ +LZ4_FORCE_INLINE +void LZ4_wildCopy8(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; + + do { + LZ4_memcpy(d, s, 8); + d += 8; + s += 8; + } while (d < e); +} + +static const unsigned inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4}; +static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3}; + +#ifndef LZ4_FAST_DEC_LOOP +#if defined __i386__ || defined _M_IX86 || defined __x86_64__ || defined _M_X64 +#define LZ4_FAST_DEC_LOOP 1 +#elif defined(__aarch64__) +#if defined(__clang__) && defined(__ANDROID__) +/* On Android aarch64, we disable this optimization for clang because + * on certain mobile chipsets, performance is reduced with clang. 
For + * more information refer to https://github.com/lz4/lz4/pull/707 */ +#define LZ4_FAST_DEC_LOOP 0 +#else +#define LZ4_FAST_DEC_LOOP 1 +#endif +#else +#define LZ4_FAST_DEC_LOOP 0 +#endif +#endif + +#if LZ4_FAST_DEC_LOOP + +LZ4_FORCE_INLINE void LZ4_memcpy_using_offset_base(BYTE* dstPtr, + const BYTE* srcPtr, + BYTE* dstEnd, + const size_t offset) +{ + assert(srcPtr + offset == dstPtr); + if (offset < 8) { + LZ4_write32(dstPtr, 0); /* silence an msan warning when offset==0 */ + dstPtr[0] = srcPtr[0]; + dstPtr[1] = srcPtr[1]; + dstPtr[2] = srcPtr[2]; + dstPtr[3] = srcPtr[3]; + srcPtr += inc32table[offset]; + LZ4_memcpy(dstPtr + 4, srcPtr, 4); + srcPtr -= dec64table[offset]; + dstPtr += 8; + } else { + LZ4_memcpy(dstPtr, srcPtr, 8); + dstPtr += 8; + srcPtr += 8; + } + + LZ4_wildCopy8(dstPtr, srcPtr, dstEnd); +} + +/* customized variant of memcpy, which can overwrite up to 32 bytes beyond + * dstEnd this version copies two times 16 bytes (instead of one time 32 bytes) + * because it must be compatible with offsets >= 16. */ +LZ4_FORCE_INLINE void +LZ4_wildCopy32(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; + + do { + LZ4_memcpy(d, s, 16); + LZ4_memcpy(d + 16, s + 16, 16); + d += 32; + s += 32; + } while (d < e); +} + +/* LZ4_memcpy_using_offset() presumes : + * - dstEnd >= dstPtr + MINMATCH + * - there is at least 12 bytes available to write after dstEnd */ +LZ4_FORCE_INLINE void LZ4_memcpy_using_offset(BYTE* dstPtr, + const BYTE* srcPtr, + BYTE* dstEnd, + const size_t offset) +{ + BYTE v[8]; + + assert(dstEnd >= dstPtr + MINMATCH); + + switch (offset) { + case 1: + MEM_INIT(v, *srcPtr, 8); + break; + case 2: + LZ4_memcpy(v, srcPtr, 2); + LZ4_memcpy(&v[2], srcPtr, 2); +#if defined(_MSC_VER) && (_MSC_VER <= 1937) /* MSVC 2022 ver 17.7 or earlier \ + */ +#pragma warning(push) +#pragma warning( \ + disable : 6385) /* warning C6385: Reading invalid data from 'v'. */ +#endif + LZ4_memcpy(&v[4], v, 4); +#if defined(_MSC_VER) && (_MSC_VER <= 1937) /* MSVC 2022 ver 17.7 or earlier \ + */ +#pragma warning(pop) +#endif + break; + case 4: + LZ4_memcpy(v, srcPtr, 4); + LZ4_memcpy(&v[4], srcPtr, 4); + break; + default: + LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset); + return; + } + + LZ4_memcpy(dstPtr, v, 8); + dstPtr += 8; + while (dstPtr < dstEnd) { + LZ4_memcpy(dstPtr, v, 8); + dstPtr += 8; + } +} +#endif + +/*-************************************ + * Common functions + **************************************/ +static unsigned LZ4_NbCommonBytes(reg_t val) +{ + assert(val != 0); + if (LZ4_isLittleEndian()) { + if (sizeof(val) == 8) { +#if defined(_MSC_VER) && (_MSC_VER >= 1800) && \ + (defined(_M_AMD64) && !defined(_M_ARM64EC)) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) +/*-************************************************************************************************* + * ARM64EC is a Microsoft-designed ARM64 ABI compatible with AMD64 applications + *on ARM64 Windows 11. The ARM64EC ABI does not support AVX/AVX2/AVX512 + *instructions, nor their relevant intrinsics including _tzcnt_u64. Therefore, + *we need to neuter the _tzcnt_u64 code path for ARM64EC. + ****************************************************************************************************/ +#if defined(__clang__) && (__clang_major__ < 10) + /* Avoid undefined clang-cl intrinsics issue. + * See https://github.com/lz4/lz4/pull/1017 for details. 
*/ + return (unsigned)__builtin_ia32_tzcnt_u64(val) >> 3; +#else + /* x64 CPUS without BMI support interpret `TZCNT` as `REP BSF` */ + return (unsigned)_tzcnt_u64(val) >> 3; +#endif +#elif defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward64(&r, (U64)val); + return (unsigned)r >> 3; +#elif (defined(__clang__) || \ + (defined(__GNUC__) && \ + ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_ctzll((U64)val) >> 3; +#else + const U64 m = 0x0101010101010101ULL; + val ^= val - 1; + return (unsigned)(((U64)((val & (m - 1)) * m)) >> 56); +#endif + } else /* 32 bits */ { +#if defined(_MSC_VER) && (_MSC_VER >= 1400) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanForward(&r, (U32)val); + return (unsigned)r >> 3; +#elif (defined(__clang__) || \ + (defined(__GNUC__) && \ + ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(__TINYC__) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_ctz((U32)val) >> 3; +#else + const U32 m = 0x01010101; + return (unsigned)((((val - 1) ^ val) & (m - 1)) * m) >> 24; +#endif + } + } else /* Big Endian CPU */ { + if (sizeof(val) == 8) { +#if (defined(__clang__) || \ + (defined(__GNUC__) && \ + ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(__TINYC__) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_clzll((U64)val) >> 3; +#else +#if 1 + /* this method is probably faster, + * but adds a 128 bytes lookup table */ + static const unsigned char ctz7_tab[128] = { + 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, + 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, + 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, + 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, + 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, + 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + }; + U64 const mask = 0x0101010101010101ULL; + U64 const t = (((val >> 8) - mask) | val) & mask; + return ctz7_tab[(t * 0x0080402010080402ULL) >> 57]; +#else + /* this method doesn't consume memory space like the previous one, + * but it contains several branches, + * that may end up slowing execution */ + static const U32 by32 = + sizeof(val) * 4; /* 32 on 64 bits (goal), 16 on 32 bits. +Just to avoid some static analyzer complaining about shift by 32 on 32-bits +target. Note that this code path is never triggered in 32-bits mode. 
*/ + unsigned r; + if (!(val >> by32)) { + r = 4; + } else { + r = 0; + val >>= by32; + } + if (!(val >> 16)) { + r += 2; + val >>= 8; + } else { + val >>= 24; + } + r += (!val); + return r; +#endif +#endif + } else /* 32 bits */ { +#if (defined(__clang__) || \ + (defined(__GNUC__) && \ + ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_clz((U32)val) >> 3; +#else + val >>= 8; + val = + ((((val + 0x00FFFF00) | 0x00FFFFFF) + val) | (val + 0x00FF0000)) >> + 24; + return (unsigned)val ^ 3; +#endif + } + } +} + +#define STEPSIZE sizeof(reg_t) +LZ4_FORCE_INLINE +unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit) +{ + const BYTE* const pStart = pIn; + + if (likely(pIn < pInLimit - (STEPSIZE - 1))) { + reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); + if (!diff) { + pIn += STEPSIZE; + pMatch += STEPSIZE; + } else { + return LZ4_NbCommonBytes(diff); + } + } + + while (likely(pIn < pInLimit - (STEPSIZE - 1))) { + reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); + if (!diff) { + pIn += STEPSIZE; + pMatch += STEPSIZE; + continue; + } + pIn += LZ4_NbCommonBytes(diff); + return (unsigned)(pIn - pStart); + } + + if ((STEPSIZE == 8) && (pIn < (pInLimit - 3)) && + (LZ4_read32(pMatch) == LZ4_read32(pIn))) { + pIn += 4; + pMatch += 4; + } + if ((pIn < (pInLimit - 1)) && (LZ4_read16(pMatch) == LZ4_read16(pIn))) { + pIn += 2; + pMatch += 2; + } + if ((pIn < pInLimit) && (*pMatch == *pIn)) + pIn++; + return (unsigned)(pIn - pStart); +} + +#ifndef LZ4_COMMONDEFS_ONLY +/*-************************************ + * Local Constants + **************************************/ +static const int LZ4_64Klimit = ((64 KB) + (MFLIMIT - 1)); +static const U32 LZ4_skipTrigger = 6; /* Increase this value ==> compression run + slower on incompressible data */ + +/*-************************************ + * Local Structures and types + **************************************/ +typedef enum { clearedTable = 0, byPtr, byU32, byU16 } tableType_t; + +/** + * This enum distinguishes several different modes of accessing previous + * content in the stream. + * + * - noDict : There is no preceding content. + * - withPrefix64k : Table entries up to ctx->dictSize before the current blob + * blob being compressed are valid and refer to the preceding + * content (of length ctx->dictSize), which is available + * contiguously preceding in memory the content currently + * being compressed. + * - usingExtDict : Like withPrefix64k, but the preceding content is somewhere + * else in memory, starting at ctx->dictionary with length + * ctx->dictSize. + * - usingDictCtx : Everything concerning the preceding content is + * in a separate context, pointed to by ctx->dictCtx. + * ctx->dictionary, ctx->dictSize, and table entries + * in the current context that refer to positions + * preceding the beginning of the current compression are + * ignored. Instead, ctx->dictCtx->dictionary and ctx->dictCtx + * ->dictSize describe the location and size of the preceding + * content, and matches are found by looking in the ctx + * ->dictCtx->hashTable. 
+ */ +typedef enum { + noDict = 0, + withPrefix64k, + usingExtDict, + usingDictCtx +} dict_directive; +typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; + +/*-************************************ + * Local Utils + **************************************/ +int LZ4_versionNumber(void) { return LZ4_VERSION_NUMBER; } +const char* LZ4_versionString(void) { return LZ4_VERSION_STRING; } +int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } +int LZ4_sizeofState(void) { return sizeof(LZ4_stream_t); } + +/*-**************************************** + * Internal Definitions, used only in Tests + *******************************************/ +#if defined(__cplusplus) +extern "C" { +#endif + +int LZ4_compress_forceExtDict(LZ4_stream_t* LZ4_dict, + const char* source, + char* dest, + int srcSize); + +int LZ4_decompress_safe_forceExtDict(const char* source, + char* dest, + int compressedSize, + int maxOutputSize, + const void* dictStart, + size_t dictSize); +int LZ4_decompress_safe_partial_forceExtDict(const char* source, + char* dest, + int compressedSize, + int targetOutputSize, + int dstCapacity, + const void* dictStart, + size_t dictSize); +#if defined(__cplusplus) +} +#endif + +/*-****************************** + * Compression functions + ********************************/ +LZ4_FORCE_INLINE U32 LZ4_hash4(U32 sequence, tableType_t const tableType) +{ + if (tableType == byU16) + return ((sequence * 2654435761U) >> ((MINMATCH * 8) - (LZ4_HASHLOG + 1))); + else + return ((sequence * 2654435761U) >> ((MINMATCH * 8) - LZ4_HASHLOG)); +} + +LZ4_FORCE_INLINE U32 LZ4_hash5(U64 sequence, tableType_t const tableType) +{ + const U32 hashLog = (tableType == byU16) ? LZ4_HASHLOG + 1 : LZ4_HASHLOG; + if (LZ4_isLittleEndian()) { + const U64 prime5bytes = 889523592379ULL; + return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); + } else { + const U64 prime8bytes = 11400714785074694791ULL; + return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); + } +} + +LZ4_FORCE_INLINE U32 LZ4_hashPosition(const void* const p, + tableType_t const tableType) +{ + if ((sizeof(reg_t) == 8) && (tableType != byU16)) + return LZ4_hash5(LZ4_read_ARCH(p), tableType); + +#ifdef LZ4_STATIC_LINKING_ONLY_ENDIANNESS_INDEPENDENT_OUTPUT + return LZ4_hash4(LZ4_readLE32(p), tableType); +#else + return LZ4_hash4(LZ4_read32(p), tableType); +#endif +} + +LZ4_FORCE_INLINE void +LZ4_clearHash(U32 h, void* tableBase, tableType_t const tableType) +{ + switch (tableType) { + default: /* fallthrough */ + case clearedTable: { /* illegal! */ + assert(0); + return; + } + case byPtr: { + const BYTE** hashTable = (const BYTE**)tableBase; + hashTable[h] = NULL; + return; + } + case byU32: { + U32* hashTable = (U32*)tableBase; + hashTable[h] = 0; + return; + } + case byU16: { + U16* hashTable = (U16*)tableBase; + hashTable[h] = 0; + return; + } + } +} + +LZ4_FORCE_INLINE void +LZ4_putIndexOnHash(U32 idx, U32 h, void* tableBase, tableType_t const tableType) +{ + switch (tableType) { + default: /* fallthrough */ + case clearedTable: /* fallthrough */ + case byPtr: { /* illegal! 
*/ + assert(0); + return; + } + case byU32: { + U32* hashTable = (U32*)tableBase; + hashTable[h] = idx; + return; + } + case byU16: { + U16* hashTable = (U16*)tableBase; + assert(idx < 65536); + hashTable[h] = (U16)idx; + return; + } + } +} + +/* LZ4_putPosition*() : only used in byPtr mode */ +LZ4_FORCE_INLINE void LZ4_putPositionOnHash(const BYTE* p, + U32 h, + void* tableBase, + tableType_t const tableType) +{ + const BYTE** const hashTable = (const BYTE**)tableBase; + assert(tableType == byPtr); + (void)tableType; + hashTable[h] = p; +} + +LZ4_FORCE_INLINE void +LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType) +{ + U32 const h = LZ4_hashPosition(p, tableType); + LZ4_putPositionOnHash(p, h, tableBase, tableType); +} + +/* LZ4_getIndexOnHash() : + * Index of match position registered in hash table. + * hash position must be calculated by using base+index, or dictBase+index. + * Assumption 1 : only valid if tableType == byU32 or byU16. + * Assumption 2 : h is presumed valid (within limits of hash table) + */ +LZ4_FORCE_INLINE U32 LZ4_getIndexOnHash(U32 h, + const void* tableBase, + tableType_t tableType) +{ + LZ4_STATIC_ASSERT(LZ4_MEMORY_USAGE > 2); + if (tableType == byU32) { + const U32* const hashTable = (const U32*)tableBase; + assert(h < (1U << (LZ4_MEMORY_USAGE - 2))); + return hashTable[h]; + } + if (tableType == byU16) { + const U16* const hashTable = (const U16*)tableBase; + assert(h < (1U << (LZ4_MEMORY_USAGE - 1))); + return hashTable[h]; + } + assert(0); + return 0; /* forbidden case */ +} + +static const BYTE* +LZ4_getPositionOnHash(U32 h, const void* tableBase, tableType_t tableType) +{ + assert(tableType == byPtr); + (void)tableType; + { + const BYTE* const* hashTable = (const BYTE* const*)tableBase; + return hashTable[h]; + } +} + +LZ4_FORCE_INLINE const BYTE* +LZ4_getPosition(const BYTE* p, const void* tableBase, tableType_t tableType) +{ + U32 const h = LZ4_hashPosition(p, tableType); + return LZ4_getPositionOnHash(h, tableBase, tableType); +} + +LZ4_FORCE_INLINE void LZ4_prepareTable(LZ4_stream_t_internal* const cctx, + const int inputSize, + const tableType_t tableType) +{ + /* If the table hasn't been used, it's guaranteed to be zeroed out, and is + * therefore safe to use no matter what mode we're in. Otherwise, we figure + * out if it's safe to leave as is or whether it needs to be reset. + */ + if ((tableType_t)cctx->tableType != clearedTable) { + assert(inputSize >= 0); + if ((tableType_t)cctx->tableType != tableType || + ((tableType == byU16) && + cctx->currentOffset + (unsigned)inputSize >= 0xFFFFU) || + ((tableType == byU32) && cctx->currentOffset > 1 GB) || + tableType == byPtr || inputSize >= 4 KB) { + DEBUGLOG(4, "LZ4_prepareTable: Resetting table in %p", (void*)cctx); + MEM_INIT(cctx->hashTable, 0, LZ4_HASHTABLESIZE); + cctx->currentOffset = 0; + cctx->tableType = (U32)clearedTable; + } else { + DEBUGLOG(4, "LZ4_prepareTable: Re-use hash table (no reset)"); + } + } + + /* Adding a gap, so all previous entries are > LZ4_DISTANCE_MAX back, + * is faster than compressing without a gap. + * However, compressing with currentOffset == 0 is faster still, + * so we preserve that case. 
+ */ + if (cctx->currentOffset != 0 && tableType == byU32) { + DEBUGLOG(5, "LZ4_prepareTable: adding 64KB to currentOffset"); + cctx->currentOffset += 64 KB; + } + + /* Finally, clear history */ + cctx->dictCtx = NULL; + cctx->dictionary = NULL; + cctx->dictSize = 0; +} + +/** LZ4_compress_generic_validated() : + * inlined, to ensure branches are decided at compilation time. + * The following conditions are presumed already validated: + * - source != NULL + * - inputSize > 0 + */ +LZ4_FORCE_INLINE int LZ4_compress_generic_validated( + LZ4_stream_t_internal* const cctx, + const char* const source, + char* const dest, + const int inputSize, + int* inputConsumed, /* only written when outputDirective == fillOutput */ + const int maxOutputSize, + const limitedOutput_directive outputDirective, + const tableType_t tableType, + const dict_directive dictDirective, + const dictIssue_directive dictIssue, + const int acceleration) +{ + int result; + const BYTE* ip = (const BYTE*)source; + + U32 const startIndex = cctx->currentOffset; + const BYTE* base = (const BYTE*)source - startIndex; + const BYTE* lowLimit; + + const LZ4_stream_t_internal* dictCtx = + (const LZ4_stream_t_internal*)cctx->dictCtx; + const BYTE* const dictionary = + dictDirective == usingDictCtx ? dictCtx->dictionary : cctx->dictionary; + const U32 dictSize = + dictDirective == usingDictCtx ? dictCtx->dictSize : cctx->dictSize; + const U32 dictDelta = (dictDirective == usingDictCtx) + ? startIndex - dictCtx->currentOffset + : 0; /* make indexes in dictCtx comparable with + indexes in current context */ + + int const maybe_extMem = + (dictDirective == usingExtDict) || (dictDirective == usingDictCtx); + U32 const prefixIdxLimit = + startIndex - dictSize; /* used when dictDirective == dictSmall */ + const BYTE* const dictEnd = dictionary ? dictionary + dictSize : dictionary; + const BYTE* anchor = (const BYTE*)source; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimitPlusOne = iend - MFLIMIT + 1; + const BYTE* const matchlimit = iend - LASTLITERALS; + + /* the dictCtx currentOffset is indexed on the start of the dictionary, + * while a dictionary in the current context precedes the currentOffset */ + const BYTE* dictBase = (dictionary == NULL) ? NULL + : (dictDirective == usingDictCtx) + ? dictionary + dictSize - dictCtx->currentOffset + : dictionary + dictSize - startIndex; + + BYTE* op = (BYTE*)dest; + BYTE* const olimit = op + maxOutputSize; + + U32 offset = 0; + U32 forwardH; + + DEBUGLOG(5, "LZ4_compress_generic_validated: srcSize=%i, tableType=%u", + inputSize, tableType); + assert(ip != NULL); + if (tableType == byU16) + assert(inputSize < + LZ4_64Klimit); /* Size too large (not within 64K limit) */ + if (tableType == byPtr) + assert(dictDirective == noDict); /* only supported use case with byPtr */ + /* If init conditions are not met, we don't have to mark stream + * as having dirty context, since no action was taken yet */ + if (outputDirective == fillOutput && maxOutputSize < 1) { + return 0; + } /* Impossible to store anything */ + assert(acceleration >= 1); + + lowLimit = + (const BYTE*)source - (dictDirective == withPrefix64k ? dictSize : 0); + + /* Update context state */ + if (dictDirective == usingDictCtx) { + /* Subsequent linked blocks can't use the dictionary. */ + /* Instead, they use the block we just compressed. 
*/ + cctx->dictCtx = NULL; + cctx->dictSize = (U32)inputSize; + } else { + cctx->dictSize += (U32)inputSize; + } + cctx->currentOffset += (U32)inputSize; + cctx->tableType = (U32)tableType; + + if (inputSize < LZ4_minLength) + goto _last_literals; /* Input too small, no compression (all literals) */ + + /* First Byte */ + { + U32 const h = LZ4_hashPosition(ip, tableType); + if (tableType == byPtr) { + LZ4_putPositionOnHash(ip, h, cctx->hashTable, byPtr); + } else { + LZ4_putIndexOnHash(startIndex, h, cctx->hashTable, tableType); + } + } + ip++; + forwardH = LZ4_hashPosition(ip, tableType); + + /* Main Loop */ + for (;;) { + const BYTE* match; + BYTE* token; + const BYTE* filledIp; + + /* Find a match */ + if (tableType == byPtr) { + const BYTE* forwardIp = ip; + int step = 1; + int searchMatchNb = acceleration << LZ4_skipTrigger; + do { + U32 const h = forwardH; + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimitPlusOne)) + goto _last_literals; + assert(ip < mflimitPlusOne); + + match = LZ4_getPositionOnHash(h, cctx->hashTable, tableType); + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putPositionOnHash(ip, h, cctx->hashTable, tableType); + + } while ((match + LZ4_DISTANCE_MAX < ip) || + (LZ4_read32(match) != LZ4_read32(ip))); + + } else { /* byU32, byU16 */ + + const BYTE* forwardIp = ip; + int step = 1; + int searchMatchNb = acceleration << LZ4_skipTrigger; + do { + U32 const h = forwardH; + U32 const current = (U32)(forwardIp - base); + U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType); + assert(matchIndex <= current); + assert(forwardIp - base < (ptrdiff_t)(2 GB - 1)); + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimitPlusOne)) + goto _last_literals; + assert(ip < mflimitPlusOne); + + if (dictDirective == usingDictCtx) { + if (matchIndex < startIndex) { + /* there was no match, try the dictionary */ + assert(tableType == byU32); + matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32); + match = dictBase + matchIndex; + matchIndex += dictDelta; /* make dictCtx index comparable with + current context */ + lowLimit = dictionary; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; + } + } else if (dictDirective == usingExtDict) { + if (matchIndex < startIndex) { + DEBUGLOG( + 7, "extDict candidate: matchIndex=%5u < startIndex=%5u", + matchIndex, startIndex); + assert(startIndex - matchIndex >= MINMATCH); + assert(dictBase); + match = dictBase + matchIndex; + lowLimit = dictionary; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; + } + } else { /* single continuous memory segment */ + match = base + matchIndex; + } + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); + + DEBUGLOG(7, "candidate at pos=%u (offset=%u \n", matchIndex, + current - matchIndex); + if ((dictIssue == dictSmall) && (matchIndex < prefixIdxLimit)) { + continue; + } /* match outside of valid area */ + assert(matchIndex < current); + if (((tableType != byU16) || + (LZ4_DISTANCE_MAX < LZ4_DISTANCE_ABSOLUTE_MAX)) && + (matchIndex + LZ4_DISTANCE_MAX < current)) { + continue; + } /* too far */ + assert((current - matchIndex) <= + LZ4_DISTANCE_MAX); /* match now expected within distance */ + + if (LZ4_read32(match) == LZ4_read32(ip)) { + if (maybe_extMem) + offset = current - matchIndex; + break; /* match found */ + } + + } while (1); + } + + 
/* Catch up */ + filledIp = ip; + assert(ip > anchor); /* this is always true as ip has been advanced before + entering the main loop */ + if ((match > lowLimit) && unlikely(ip[-1] == match[-1])) { + do { + ip--; + match--; + } while (((ip > anchor) & (match > lowLimit)) && + (unlikely(ip[-1] == match[-1]))); + } + + /* Encode Literals */ + { + unsigned const litLength = (unsigned)(ip - anchor); + token = op++; + if ((outputDirective == + limitedOutput) && /* Check output buffer overflow */ + (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + + (litLength / 255) > + olimit))) { + return 0; /* cannot compress within `dst` budget. Stored indexes in + hash table are nonetheless fine */ + } + if ((outputDirective == fillOutput) && + (unlikely(op + (litLength + 240) / 255 /* litlen */ + + litLength /* literals */ + 2 /* offset */ + + 1 /* token */ + MFLIMIT - + MINMATCH /* min last literals so last match is <= end + - MFLIMIT */ + > olimit))) { + op--; + goto _last_literals; + } + if (litLength >= RUN_MASK) { + unsigned len = litLength - RUN_MASK; + *token = (RUN_MASK << ML_BITS); + for (; len >= 255; len -= 255) + *op++ = 255; + *op++ = (BYTE)len; + } else + *token = (BYTE)(litLength << ML_BITS); + + /* Copy Literals */ + LZ4_wildCopy8(op, anchor, op + litLength); + op += litLength; + DEBUGLOG(6, "seq.start:%i, literals=%u, match.start:%i", + (int)(anchor - (const BYTE*)source), litLength, + (int)(ip - (const BYTE*)source)); + } + + _next_match: + /* at this stage, the following variables must be correctly set : + * - ip : at start of LZ operation + * - match : at start of previous pattern occurrence; can be within + * current prefix, or within extDict + * - offset : if maybe_ext_memSegment==1 (constant) + * - lowLimit : must be == dictionary to mean "match is within extDict"; + * must be == source otherwise + * - token and *token : position to write 4-bits for match length; higher + * 4-bits for literal length supposed already written + */ + + if ((outputDirective == fillOutput) && + (op + 2 /* offset */ + 1 /* token */ + MFLIMIT - + MINMATCH /* min last literals so last match is <= end - MFLIMIT */ + > olimit)) { + /* the match was too close to the end, rewind and go to last literals + */ + op = token; + goto _last_literals; + } + + /* Encode Offset */ + if (maybe_extMem) { /* static test */ + DEBUGLOG(6, " with offset=%u (ext if > %i)", offset, + (int)(ip - (const BYTE*)source)); + assert(offset <= LZ4_DISTANCE_MAX && offset > 0); + LZ4_writeLE16(op, (U16)offset); + op += 2; + } else { + DEBUGLOG(6, " with offset=%u (same segment)", + (U32)(ip - match)); + assert(ip - match <= LZ4_DISTANCE_MAX); + LZ4_writeLE16(op, (U16)(ip - match)); + op += 2; + } + + /* Encode MatchLength */ + { + unsigned matchCode; + + if ((dictDirective == usingExtDict || dictDirective == usingDictCtx) && + (lowLimit == dictionary) /* match within extDict */) { + const BYTE* limit = ip + (dictEnd - match); + assert(dictEnd > match); + if (limit > matchlimit) + limit = matchlimit; + matchCode = LZ4_count(ip + MINMATCH, match + MINMATCH, limit); + ip += (size_t)matchCode + MINMATCH; + if (ip == limit) { + unsigned const more = + LZ4_count(limit, (const BYTE*)source, matchlimit); + matchCode += more; + ip += more; + } + DEBUGLOG(6, " with matchLength=%u starting in extDict", + matchCode + MINMATCH); + } else { + matchCode = LZ4_count(ip + MINMATCH, match + MINMATCH, matchlimit); + ip += (size_t)matchCode + MINMATCH; + DEBUGLOG(6, " with matchLength=%u", + matchCode + MINMATCH); + } + + if ((outputDirective) && /* Check 
output buffer overflow */ + (unlikely(op + (1 + LASTLITERALS) + (matchCode + 240) / 255 > + olimit))) { + if (outputDirective == fillOutput) { + /* Match description too long : reduce it */ + U32 newMatchCode = 15 /* in token */ - + 1 /* to avoid needing a zero byte */ + + ((U32)(olimit - op) - 1 - LASTLITERALS) * 255; + ip -= matchCode - newMatchCode; + assert(newMatchCode < matchCode); + matchCode = newMatchCode; + if (unlikely(ip <= filledIp)) { + /* We have already filled up to filledIp so if ip ends up less + * than filledIp we have positions in the hash table beyond + * the current position. This is a problem if we reuse the + * hash table. So we have to remove these positions from the + * hash table. + */ + const BYTE* ptr; + DEBUGLOG(5, "Clearing %u positions", (U32)(filledIp - ip)); + for (ptr = ip; ptr <= filledIp; ++ptr) { + U32 const h = LZ4_hashPosition(ptr, tableType); + LZ4_clearHash(h, cctx->hashTable, tableType); + } + } + } else { + assert(outputDirective == limitedOutput); + return 0; /* cannot compress within `dst` budget. Stored indexes + in hash table are nonetheless fine */ + } + } + if (matchCode >= ML_MASK) { + *token += ML_MASK; + matchCode -= ML_MASK; + LZ4_write32(op, 0xFFFFFFFF); + while (matchCode >= 4 * 255) { + op += 4; + LZ4_write32(op, 0xFFFFFFFF); + matchCode -= 4 * 255; + } + op += matchCode / 255; + *op++ = (BYTE)(matchCode % 255); + } else + *token += (BYTE)(matchCode); + } + /* Ensure we have enough space for the last literals. */ + assert( + !(outputDirective == fillOutput && op + 1 + LASTLITERALS > olimit)); + + anchor = ip; + + /* Test end of chunk */ + if (ip >= mflimitPlusOne) + break; + + /* Fill table */ + { + U32 const h = LZ4_hashPosition(ip - 2, tableType); + if (tableType == byPtr) { + LZ4_putPositionOnHash(ip - 2, h, cctx->hashTable, byPtr); + } else { + U32 const idx = (U32)((ip - 2) - base); + LZ4_putIndexOnHash(idx, h, cctx->hashTable, tableType); + } + } + + /* Test next position */ + if (tableType == byPtr) { + + match = LZ4_getPosition(ip, cctx->hashTable, tableType); + LZ4_putPosition(ip, cctx->hashTable, tableType); + if ((match + LZ4_DISTANCE_MAX >= ip) && + (LZ4_read32(match) == LZ4_read32(ip))) { + token = op++; + *token = 0; + goto _next_match; + } + + } else { /* byU32, byU16 */ + + U32 const h = LZ4_hashPosition(ip, tableType); + U32 const current = (U32)(ip - base); + U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType); + assert(matchIndex < current); + if (dictDirective == usingDictCtx) { + if (matchIndex < startIndex) { + /* there was no match, try the dictionary */ + assert(tableType == byU32); + matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32); + match = dictBase + matchIndex; + lowLimit = dictionary; /* required for match length counter */ + matchIndex += dictDelta; + } else { + match = base + matchIndex; + lowLimit = + (const BYTE*)source; /* required for match length counter */ + } + } else if (dictDirective == usingExtDict) { + if (matchIndex < startIndex) { + assert(dictBase); + match = dictBase + matchIndex; + lowLimit = dictionary; /* required for match length counter */ + } else { + match = base + matchIndex; + lowLimit = + (const BYTE*)source; /* required for match length counter */ + } + } else { /* single memory segment */ + match = base + matchIndex; + } + LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); + assert(matchIndex < current); + if (((dictIssue == dictSmall) ? 
(matchIndex >= prefixIdxLimit) : 1) && + (((tableType == byU16) && + (LZ4_DISTANCE_MAX == LZ4_DISTANCE_ABSOLUTE_MAX)) + ? 1 + : (matchIndex + LZ4_DISTANCE_MAX >= current)) && + (LZ4_read32(match) == LZ4_read32(ip))) { + token = op++; + *token = 0; + if (maybe_extMem) + offset = current - matchIndex; + DEBUGLOG(6, "seq.start:%i, literals=%u, match.start:%i", + (int)(anchor - (const BYTE*)source), 0, + (int)(ip - (const BYTE*)source)); + goto _next_match; + } + } + + /* Prepare next loop */ + forwardH = LZ4_hashPosition(++ip, tableType); + } + +_last_literals: + /* Encode Last Literals */ + { + size_t lastRun = (size_t)(iend - anchor); + if ((outputDirective) && /* Check output buffer overflow */ + (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) > olimit)) { + if (outputDirective == fillOutput) { + /* adapt lastRun to fill 'dst' */ + assert(olimit >= op); + lastRun = (size_t)(olimit - op) - 1 /*token*/; + lastRun -= + (lastRun + 256 - RUN_MASK) / 256; /*additional length tokens*/ + } else { + assert(outputDirective == limitedOutput); + return 0; /* cannot compress within `dst` budget. Stored indexes in + hash table are nonetheless fine */ + } + } + DEBUGLOG(6, "Final literal run : %i literals", (int)lastRun); + if (lastRun >= RUN_MASK) { + size_t accumulator = lastRun - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for (; accumulator >= 255; accumulator -= 255) + *op++ = 255; + *op++ = (BYTE)accumulator; + } else { + *op++ = (BYTE)(lastRun << ML_BITS); + } + LZ4_memcpy(op, anchor, lastRun); + ip = anchor + lastRun; + op += lastRun; + } + + if (outputDirective == fillOutput) { + *inputConsumed = (int)(((const char*)ip) - source); + } + result = (int)(((char*)op) - dest); + assert(result > 0); + DEBUGLOG(5, "LZ4_compress_generic: compressed %i bytes into %i bytes", + inputSize, result); + return result; +} + +/** LZ4_compress_generic() : + * inlined, to ensure branches are decided at compilation time; + * takes care of src == (NULL, 0) + * and forward the rest to LZ4_compress_generic_validated */ +LZ4_FORCE_INLINE int LZ4_compress_generic( + LZ4_stream_t_internal* const cctx, + const char* const src, + char* const dst, + const int srcSize, + int* inputConsumed, /* only written when outputDirective == fillOutput */ + const int dstCapacity, + const limitedOutput_directive outputDirective, + const tableType_t tableType, + const dict_directive dictDirective, + const dictIssue_directive dictIssue, + const int acceleration) +{ + DEBUGLOG(5, "LZ4_compress_generic: srcSize=%i, dstCapacity=%i", srcSize, + dstCapacity); + + if ((U32)srcSize > (U32)LZ4_MAX_INPUT_SIZE) { + return 0; + } /* Unsupported srcSize, too large (or negative) */ + if (srcSize == 0) { /* src == NULL supported if srcSize == 0 */ + if (outputDirective != notLimited && dstCapacity <= 0) + return 0; /* no output, can't write anything */ + DEBUGLOG(5, "Generating an empty block"); + assert(outputDirective == notLimited || dstCapacity >= 1); + assert(dst != NULL); + dst[0] = 0; + if (outputDirective == fillOutput) { + assert(inputConsumed != NULL); + *inputConsumed = 0; + } + return 1; + } + assert(src != NULL); + + return LZ4_compress_generic_validated( + cctx, src, dst, srcSize, + inputConsumed, /* only written into if outputDirective == fillOutput */ + dstCapacity, outputDirective, tableType, dictDirective, dictIssue, + acceleration); +} + +int LZ4_compress_fast_extState(void* state, + const char* source, + char* dest, + int inputSize, + int maxOutputSize, + int acceleration) +{ + LZ4_stream_t_internal* const ctx = + 
&LZ4_initStream(state, sizeof(LZ4_stream_t))->internal_donotuse; + assert(ctx != NULL); + if (acceleration < 1) + acceleration = LZ4_ACCELERATION_DEFAULT; + if (acceleration > LZ4_ACCELERATION_MAX) + acceleration = LZ4_ACCELERATION_MAX; + if (maxOutputSize >= LZ4_compressBound(inputSize)) { + if (inputSize < LZ4_64Klimit) { + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, + notLimited, byU16, noDict, noDictIssue, + acceleration); + } else { + const tableType_t tableType = + ((sizeof(void*) == 4) && ((uptrval)source > LZ4_DISTANCE_MAX)) + ? byPtr + : byU32; + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, + notLimited, tableType, noDict, noDictIssue, + acceleration); + } + } else { + if (inputSize < LZ4_64Klimit) { + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, + maxOutputSize, limitedOutput, byU16, + noDict, noDictIssue, acceleration); + } else { + const tableType_t tableType = + ((sizeof(void*) == 4) && ((uptrval)source > LZ4_DISTANCE_MAX)) + ? byPtr + : byU32; + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, + maxOutputSize, limitedOutput, tableType, + noDict, noDictIssue, acceleration); + } + } +} + +/** + * LZ4_compress_fast_extState_fastReset() : + * A variant of LZ4_compress_fast_extState(). + * + * Using this variant avoids an expensive initialization step. It is only safe + * to call if the state buffer is known to be correctly initialized already + * (see comment in lz4.h on LZ4_resetStream_fast() for a definition of + * "correctly initialized"). + */ +int LZ4_compress_fast_extState_fastReset(void* state, + const char* src, + char* dst, + int srcSize, + int dstCapacity, + int acceleration) +{ + LZ4_stream_t_internal* const ctx = + &((LZ4_stream_t*)state)->internal_donotuse; + if (acceleration < 1) + acceleration = LZ4_ACCELERATION_DEFAULT; + if (acceleration > LZ4_ACCELERATION_MAX) + acceleration = LZ4_ACCELERATION_MAX; + assert(ctx != NULL); + + if (dstCapacity >= LZ4_compressBound(srcSize)) { + if (srcSize < LZ4_64Klimit) { + const tableType_t tableType = byU16; + LZ4_prepareTable(ctx, srcSize, tableType); + if (ctx->currentOffset) { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, + notLimited, tableType, noDict, + dictSmall, acceleration); + } else { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, + notLimited, tableType, noDict, + noDictIssue, acceleration); + } + } else { + const tableType_t tableType = + ((sizeof(void*) == 4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr + : byU32; + LZ4_prepareTable(ctx, srcSize, tableType); + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, + notLimited, tableType, noDict, noDictIssue, + acceleration); + } + } else { + if (srcSize < LZ4_64Klimit) { + const tableType_t tableType = byU16; + LZ4_prepareTable(ctx, srcSize, tableType); + if (ctx->currentOffset) { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, + dstCapacity, limitedOutput, tableType, + noDict, dictSmall, acceleration); + } else { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, + dstCapacity, limitedOutput, tableType, + noDict, noDictIssue, acceleration); + } + } else { + const tableType_t tableType = + ((sizeof(void*) == 4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? 
byPtr + : byU32; + LZ4_prepareTable(ctx, srcSize, tableType); + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, + limitedOutput, tableType, noDict, + noDictIssue, acceleration); + } + } +} + +int LZ4_compress_fast( + const char* src, char* dest, int srcSize, int dstCapacity, int acceleration) +{ + int result; +#if (LZ4_HEAPMODE) + LZ4_stream_t* const ctxPtr = (LZ4_stream_t*)ALLOC( + sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ + if (ctxPtr == NULL) + return 0; +#else + LZ4_stream_t ctx; + LZ4_stream_t* const ctxPtr = &ctx; +#endif + result = LZ4_compress_fast_extState(ctxPtr, src, dest, srcSize, dstCapacity, + acceleration); + +#if (LZ4_HEAPMODE) + FREEMEM(ctxPtr); +#endif + return result; +} + +int LZ4_compress_default(const char* src, + char* dst, + int srcSize, + int dstCapacity) +{ + return LZ4_compress_fast(src, dst, srcSize, dstCapacity, 1); +} + +/* Note!: This function leaves the stream in an unclean/broken state! + * It is not safe to subsequently use the same state with a _fastReset() or + * _continue() call without resetting it. */ +static int LZ4_compress_destSize_extState_internal(LZ4_stream_t* state, + const char* src, + char* dst, + int* srcSizePtr, + int targetDstSize, + int acceleration) +{ + void* const s = LZ4_initStream(state, sizeof(*state)); + assert(s != NULL); + (void)s; + + if (targetDstSize >= + LZ4_compressBound(*srcSizePtr)) { /* compression success is guaranteed */ + return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, + targetDstSize, acceleration); + } else { + if (*srcSizePtr < LZ4_64Klimit) { + return LZ4_compress_generic(&state->internal_donotuse, src, dst, + *srcSizePtr, srcSizePtr, targetDstSize, + fillOutput, byU16, noDict, noDictIssue, + acceleration); + } else { + tableType_t const addrMode = + ((sizeof(void*) == 4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? 
byPtr + : byU32; + return LZ4_compress_generic(&state->internal_donotuse, src, dst, + *srcSizePtr, srcSizePtr, targetDstSize, + fillOutput, addrMode, noDict, noDictIssue, + acceleration); + } + } +} + +int LZ4_compress_destSize_extState(void* state, + const char* src, + char* dst, + int* srcSizePtr, + int targetDstSize, + int acceleration) +{ + int const r = LZ4_compress_destSize_extState_internal( + (LZ4_stream_t*)state, src, dst, srcSizePtr, targetDstSize, acceleration); + /* clean the state on exit */ + LZ4_initStream(state, sizeof(LZ4_stream_t)); + return r; +} + +int LZ4_compress_destSize(const char* src, + char* dst, + int* srcSizePtr, + int targetDstSize) +{ +#if (LZ4_HEAPMODE) + LZ4_stream_t* const ctx = (LZ4_stream_t*)ALLOC( + sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ + if (ctx == NULL) + return 0; +#else + LZ4_stream_t ctxBody; + LZ4_stream_t* const ctx = &ctxBody; +#endif + + int result = LZ4_compress_destSize_extState_internal( + ctx, src, dst, srcSizePtr, targetDstSize, 1); + +#if (LZ4_HEAPMODE) + FREEMEM(ctx); +#endif + return result; +} + +/*-****************************** + * Streaming functions + ********************************/ + +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +LZ4_stream_t* LZ4_createStream(void) +{ + LZ4_stream_t* const lz4s = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); + LZ4_STATIC_ASSERT(sizeof(LZ4_stream_t) >= sizeof(LZ4_stream_t_internal)); + DEBUGLOG(4, "LZ4_createStream %p", (void*)lz4s); + if (lz4s == NULL) + return NULL; + LZ4_initStream(lz4s, sizeof(*lz4s)); + return lz4s; +} +#endif + +static size_t LZ4_stream_t_alignment(void) +{ +#if LZ4_ALIGN_TEST + typedef struct { + char c; + LZ4_stream_t t; + } t_a; + return sizeof(t_a) - sizeof(LZ4_stream_t); +#else + return 1; /* effectively disabled */ +#endif +} + +LZ4_stream_t* LZ4_initStream(void* buffer, size_t size) +{ + DEBUGLOG(5, "LZ4_initStream"); + if (buffer == NULL) { + return NULL; + } + if (size < sizeof(LZ4_stream_t)) { + return NULL; + } + if (!LZ4_isAligned(buffer, LZ4_stream_t_alignment())) + return NULL; + MEM_INIT(buffer, 0, sizeof(LZ4_stream_t_internal)); + return (LZ4_stream_t*)buffer; +} + +/* resetStream is now deprecated, + * prefer initStream() which is more general */ +void LZ4_resetStream(LZ4_stream_t* LZ4_stream) +{ + DEBUGLOG(5, "LZ4_resetStream (ctx:%p)", (void*)LZ4_stream); + MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t_internal)); +} + +void LZ4_resetStream_fast(LZ4_stream_t* ctx) +{ + LZ4_prepareTable(&(ctx->internal_donotuse), 0, byU32); +} + +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +int LZ4_freeStream(LZ4_stream_t* LZ4_stream) +{ + if (!LZ4_stream) + return 0; /* support free on NULL */ + DEBUGLOG(5, "LZ4_freeStream %p", (void*)LZ4_stream); + FREEMEM(LZ4_stream); + return (0); +} +#endif + +typedef enum { _ld_fast, _ld_slow } LoadDict_mode_e; +#define HASH_UNIT sizeof(reg_t) +int LZ4_loadDict_internal(LZ4_stream_t* LZ4_dict, + const char* dictionary, + int dictSize, + LoadDict_mode_e _ld) +{ + LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse; + const tableType_t tableType = byU32; + const BYTE* p = (const BYTE*)dictionary; + const BYTE* const dictEnd = p + dictSize; + U32 idx32; + + DEBUGLOG(4, "LZ4_loadDict (%i bytes from %p into %p)", dictSize, + (void*)dictionary, (void*)LZ4_dict); + + /* It's necessary to reset the context, + * and not just continue it with prepareTable() + * to avoid any risk of generating overflowing matchIndex + * when compressing using this dictionary */ + 
LZ4_resetStream(LZ4_dict); + + /* We always increment the offset by 64 KB, since, if the dict is longer, + * we truncate it to the last 64k, and if it's shorter, we still want to + * advance by a whole window length so we can provide the guarantee that + * there are only valid offsets in the window, which allows an optimization + * in LZ4_compress_fast_continue() where it uses noDictIssue even when the + * dictionary isn't a full 64k. */ + dict->currentOffset += 64 KB; + + if (dictSize < (int)HASH_UNIT) { + return 0; + } + + if ((dictEnd - p) > 64 KB) + p = dictEnd - 64 KB; + dict->dictionary = p; + dict->dictSize = (U32)(dictEnd - p); + dict->tableType = (U32)tableType; + idx32 = dict->currentOffset - dict->dictSize; + + while (p <= dictEnd - HASH_UNIT) { + U32 const h = LZ4_hashPosition(p, tableType); + /* Note: overwriting => favors positions end of dictionary */ + LZ4_putIndexOnHash(idx32, h, dict->hashTable, tableType); + p += 3; + idx32 += 3; + } + + if (_ld == _ld_slow) { + /* Fill hash table with additional references, to improve compression + * capability */ + p = dict->dictionary; + idx32 = dict->currentOffset - dict->dictSize; + while (p <= dictEnd - HASH_UNIT) { + U32 const h = LZ4_hashPosition(p, tableType); + U32 const limit = dict->currentOffset - 64 KB; + if (LZ4_getIndexOnHash(h, dict->hashTable, tableType) <= limit) { + /* Note: not overwriting => favors positions beginning of dictionary + */ + LZ4_putIndexOnHash(idx32, h, dict->hashTable, tableType); + } + p++; + idx32++; + } + } + + return (int)dict->dictSize; +} + +int LZ4_loadDict(LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) +{ + return LZ4_loadDict_internal(LZ4_dict, dictionary, dictSize, _ld_fast); +} + +int LZ4_loadDictSlow(LZ4_stream_t* LZ4_dict, + const char* dictionary, + int dictSize) +{ + return LZ4_loadDict_internal(LZ4_dict, dictionary, dictSize, _ld_slow); +} + +void LZ4_attach_dictionary(LZ4_stream_t* workingStream, + const LZ4_stream_t* dictionaryStream) +{ + const LZ4_stream_t_internal* dictCtx = + (dictionaryStream == NULL) ? NULL + : &(dictionaryStream->internal_donotuse); + + DEBUGLOG(4, "LZ4_attach_dictionary (%p, %p, size %u)", (void*)workingStream, + (void*)dictionaryStream, dictCtx != NULL ? dictCtx->dictSize : 0); + + if (dictCtx != NULL) { + /* If the current offset is zero, we will never look in the + * external dictionary context, since there is no value a table + * entry can take that indicate a miss. In that case, we need + * to bump the offset to something non-zero. + */ + if (workingStream->internal_donotuse.currentOffset == 0) { + workingStream->internal_donotuse.currentOffset = 64 KB; + } + + /* Don't actually attach an empty dictionary. 
+ */ + if (dictCtx->dictSize == 0) { + dictCtx = NULL; + } + } + workingStream->internal_donotuse.dictCtx = dictCtx; +} + +static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, int nextSize) +{ + assert(nextSize >= 0); + if (LZ4_dict->currentOffset + (unsigned)nextSize > + 0x80000000) { /* potential ptrdiff_t overflow (32-bits mode) */ + /* rescale hash table */ + U32 const delta = LZ4_dict->currentOffset - 64 KB; + const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; + int i; + DEBUGLOG(4, "LZ4_renormDictT"); + for (i = 0; i < LZ4_HASH_SIZE_U32; i++) { + if (LZ4_dict->hashTable[i] < delta) + LZ4_dict->hashTable[i] = 0; + else + LZ4_dict->hashTable[i] -= delta; + } + LZ4_dict->currentOffset = 64 KB; + if (LZ4_dict->dictSize > 64 KB) + LZ4_dict->dictSize = 64 KB; + LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; + } +} + +int LZ4_compress_fast_continue(LZ4_stream_t* LZ4_stream, + const char* source, + char* dest, + int inputSize, + int maxOutputSize, + int acceleration) +{ + const tableType_t tableType = byU32; + LZ4_stream_t_internal* const streamPtr = &LZ4_stream->internal_donotuse; + const char* dictEnd = + streamPtr->dictSize + ? (const char*)streamPtr->dictionary + streamPtr->dictSize + : NULL; + + DEBUGLOG(5, "LZ4_compress_fast_continue (inputSize=%i, dictSize=%u)", + inputSize, streamPtr->dictSize); + + LZ4_renormDictT(streamPtr, inputSize); /* fix index overflow */ + if (acceleration < 1) + acceleration = LZ4_ACCELERATION_DEFAULT; + if (acceleration > LZ4_ACCELERATION_MAX) + acceleration = LZ4_ACCELERATION_MAX; + + /* invalidate tiny dictionaries */ + if ((streamPtr->dictSize < 4) /* tiny dictionary : not enough for a hash */ + && (dictEnd != source) /* prefix mode */ + && (inputSize > 0) /* tolerance : don't lose history, in case next + invocation would use prefix mode */ + && (streamPtr->dictCtx == NULL) /* usingDictCtx */ + ) { + DEBUGLOG( + 5, "LZ4_compress_fast_continue: dictSize(%u) at addr:%p is too small", + streamPtr->dictSize, (void*)streamPtr->dictionary); + /* remove dictionary existence from history, to employ faster prefix mode + */ + streamPtr->dictSize = 0; + streamPtr->dictionary = (const BYTE*)source; + dictEnd = source; + } + + /* Check overlapping input/dictionary space */ + { + const char* const sourceEnd = source + inputSize; + if ((sourceEnd > (const char*)streamPtr->dictionary) && + (sourceEnd < dictEnd)) { + streamPtr->dictSize = (U32)(dictEnd - sourceEnd); + if (streamPtr->dictSize > 64 KB) + streamPtr->dictSize = 64 KB; + if (streamPtr->dictSize < 4) + streamPtr->dictSize = 0; + streamPtr->dictionary = (const BYTE*)dictEnd - streamPtr->dictSize; + } + } + + /* prefix mode : source data follows dictionary */ + if (dictEnd == source) { + if ((streamPtr->dictSize < 64 KB) && + (streamPtr->dictSize < streamPtr->currentOffset)) + return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, + maxOutputSize, limitedOutput, tableType, + withPrefix64k, dictSmall, acceleration); + else + return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, + maxOutputSize, limitedOutput, tableType, + withPrefix64k, noDictIssue, acceleration); + } + + /* external dictionary mode */ + { + int result; + if (streamPtr->dictCtx) { + /* We depend here on the fact that dictCtx'es (produced by + * LZ4_loadDict) guarantee that their tables contain no references + * to offsets between dictCtx->currentOffset - 64 KB and + * dictCtx->currentOffset - dictCtx->dictSize. 
This makes it safe + * to use noDictIssue even when the dict isn't a full 64 KB. + */ + if (inputSize > 4 KB) { + /* For compressing large blobs, it is faster to pay the setup + * cost to copy the dictionary's tables into the active context, + * so that the compression loop is only looking into one table. + */ + LZ4_memcpy(streamPtr, streamPtr->dictCtx, sizeof(*streamPtr)); + result = + LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, + maxOutputSize, limitedOutput, tableType, + usingExtDict, noDictIssue, acceleration); + } else { + result = + LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, + maxOutputSize, limitedOutput, tableType, + usingDictCtx, noDictIssue, acceleration); + } + } else { /* small data <= 4 KB */ + if ((streamPtr->dictSize < 64 KB) && + (streamPtr->dictSize < streamPtr->currentOffset)) { + result = LZ4_compress_generic( + streamPtr, source, dest, inputSize, NULL, maxOutputSize, + limitedOutput, tableType, usingExtDict, dictSmall, acceleration); + } else { + result = + LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, + maxOutputSize, limitedOutput, tableType, + usingExtDict, noDictIssue, acceleration); + } + } + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)inputSize; + return result; + } +} + +/* Hidden debug function, to force-test external dictionary mode */ +int LZ4_compress_forceExtDict(LZ4_stream_t* LZ4_dict, + const char* source, + char* dest, + int srcSize) +{ + LZ4_stream_t_internal* const streamPtr = &LZ4_dict->internal_donotuse; + int result; + + LZ4_renormDictT(streamPtr, srcSize); + + if ((streamPtr->dictSize < 64 KB) && + (streamPtr->dictSize < streamPtr->currentOffset)) { + result = + LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, + notLimited, byU32, usingExtDict, dictSmall, 1); + } else { + result = + LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, + notLimited, byU32, usingExtDict, noDictIssue, 1); + } + + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)srcSize; + + return result; +} + +/*! LZ4_saveDict() : + * If previously compressed data block is not guaranteed to remain available at + * its memory location, save it into a safer place (char* safeBuffer). Note : no + * need to call LZ4_loadDict() afterwards, dictionary is immediately usable, one + * can therefore call LZ4_compress_fast_continue() right after. + * @return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if + * error. + */ +int LZ4_saveDict(LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) +{ + LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse; + + DEBUGLOG(5, "LZ4_saveDict : dictSize=%i, safeBuffer=%p", dictSize, + (void*)safeBuffer); + + if ((U32)dictSize > 64 KB) { + dictSize = 64 KB; + } /* useless to define a dictionary > 64 KB */ + if ((U32)dictSize > dict->dictSize) { + dictSize = (int)dict->dictSize; + } + + if (safeBuffer == NULL) + assert(dictSize == 0); + if (dictSize > 0) { + const BYTE* const previousDictEnd = dict->dictionary + dict->dictSize; + assert(dict->dictionary); + LZ4_memmove(safeBuffer, previousDictEnd - dictSize, (size_t)dictSize); + } + + dict->dictionary = (const BYTE*)safeBuffer; + dict->dictSize = (U32)dictSize; + + return dictSize; +} + +/*-******************************* + * Decompression functions + ********************************/ + +typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive; + +#undef MIN +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) + +/* variant for decompress_unsafe() + * does not know end of input + * presumes input is well formed + * note : will consume at least one byte */ +static size_t read_long_length_no_check(const BYTE** pp) +{ + size_t b, l = 0; + do { + b = **pp; + (*pp)++; + l += b; + } while (b == 255); + DEBUGLOG(6, "read_long_length_no_check: +length=%zu using %zu input bytes", + l, l / 255 + 1) + return l; +} + +/* core decoder variant for LZ4_decompress_fast*() + * for legacy support only : these entry points are deprecated. + * - Presumes input is correctly formed (no defense vs malformed inputs) + * - Does not know input size (presume input buffer is "large enough") + * - Decompress a full block (only) + * @return : nb of bytes read from input. + * Note : this variant is not optimized for speed, just for maintenance. + * the goal is to remove support of decompress_fast*() variants by v2.0 + **/ +LZ4_FORCE_INLINE int LZ4_decompress_unsafe_generic( + const BYTE* const istart, + BYTE* const ostart, + int decompressedSize, + + size_t prefixSize, + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note: =0 if dictStart==NULL */ +) +{ + const BYTE* ip = istart; + BYTE* op = (BYTE*)ostart; + BYTE* const oend = ostart + decompressedSize; + const BYTE* const prefixStart = ostart - prefixSize; + + DEBUGLOG(5, "LZ4_decompress_unsafe_generic"); + if (dictStart == NULL) + assert(dictSize == 0); + + while (1) { + /* start new sequence */ + unsigned token = *ip++; + + /* literals */ + { + size_t ll = token >> ML_BITS; + if (ll == 15) { + /* long literal length */ + ll += read_long_length_no_check(&ip); + } + if ((size_t)(oend - op) < ll) + return -1; /* output buffer overflow */ + LZ4_memmove(op, ip, ll); /* support in-place decompression */ + op += ll; + ip += ll; + if ((size_t)(oend - op) < MFLIMIT) { + if (op == oend) + break; /* end of block */ + DEBUGLOG(5, + "invalid: literals end at distance %zi from end of block", + oend - op); + /* incorrect end of block : + * last match must start at least MFLIMIT==12 bytes before end of + * output block */ + return -1; + } + } + + /* match */ + { + size_t ml = token & 15; + size_t const offset = LZ4_readLE16(ip); + ip += 2; + + if (ml == 15) { + /* long literal length */ + ml += read_long_length_no_check(&ip); + } + ml += MINMATCH; + + if ((size_t)(oend - op) < ml) + return -1; /* output buffer overflow */ + + { + const BYTE* match = op - offset; + + /* out of range */ + if (offset > (size_t)(op - prefixStart) + dictSize) { + DEBUGLOG(6, "offset out of range"); + return -1; + } + + /* check special case : extDict */ + if (offset > (size_t)(op - prefixStart)) { + /* extDict scenario */ + const BYTE* const dictEnd = dictStart + dictSize; + const BYTE* extMatch = + dictEnd - (offset - (size_t)(op - prefixStart)); + size_t const extml = (size_t)(dictEnd - extMatch); + if (extml > ml) { + /* match entirely within extDict */ + LZ4_memmove(op, extMatch, ml); + op += ml; + ml = 0; + } else { + /* match split between extDict & prefix */ + LZ4_memmove(op, extMatch, extml); + op += extml; + ml -= extml; + } + match = prefixStart; + } + + /* match copy - slow variant, supporting overlap copy */ + { + size_t u; + for (u = 0; u < ml; u++) { + op[u] = match[u]; + } + } + } + op += ml; + if ((size_t)(oend - op) < LASTLITERALS) { + DEBUGLOG(5, "invalid: match ends at distance %zi from end of block", + oend - op); + /* incorrect end of block : + * last match must stop at least LASTLITERALS==5 bytes before end of + * output block */ + 
return -1; + } + } /* match */ + } /* main loop */ + return (int)(ip - istart); +} + +/* Read the variable-length literal or match length. + * + * @ip : input pointer + * @ilimit : position after which if length is not decoded, the input is + *necessarily corrupted. + * @initial_check - check ip >= ipmax before start of loop. Returns + *initial_error if so. + * @error (output) - error code. Must be set to 0 before call. + **/ +typedef size_t Rvl_t; +static const Rvl_t rvl_error = (Rvl_t)(-1); +LZ4_FORCE_INLINE Rvl_t read_variable_length(const BYTE** ip, + const BYTE* ilimit, + int initial_check) +{ + Rvl_t s, length = 0; + assert(ip != NULL); + assert(*ip != NULL); + assert(ilimit != NULL); + if (initial_check && unlikely((*ip) >= ilimit)) { /* read limit reached */ + return rvl_error; + } + s = **ip; + (*ip)++; + length += s; + if (unlikely((*ip) > ilimit)) { /* read limit reached */ + return rvl_error; + } + /* accumulator overflow detection (32-bit mode only) */ + if ((sizeof(length) < 8) && unlikely(length > ((Rvl_t)(-1) / 2))) { + return rvl_error; + } + if (likely(s != 255)) + return length; + do { + s = **ip; + (*ip)++; + length += s; + if (unlikely((*ip) > ilimit)) { /* read limit reached */ + return rvl_error; + } + /* accumulator overflow detection (32-bit mode only) */ + if ((sizeof(length) < 8) && unlikely(length > ((Rvl_t)(-1) / 2))) { + return rvl_error; + } + } while (s == 255); + + return length; +} + +/*! LZ4_decompress_generic() : + * This generic decompression function covers all use cases. + * It shall be instantiated several times, using different sets of directives. + * Note that it is important for performance that this function really get + * inlined, in order to remove useless branches during compilation optimization. + */ +LZ4_FORCE_INLINE int LZ4_decompress_generic( + const char* const src, + char* const dst, + int srcSize, + int outputSize, /* If endOnInput==endOnInputSize, this value is `dstCapacity` + */ + + earlyEnd_directive partialDecoding, /* full, partial */ + dict_directive dict, /* noDict, withPrefix64k, usingExtDict */ + const BYTE* const lowPrefix, /* always <= dst, == dst when no prefix */ + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note : = 0 if noDict */ +) +{ + if ((src == NULL) || (outputSize < 0)) { + return -1; + } + + { + const BYTE* ip = (const BYTE*)src; + const BYTE* const iend = ip + srcSize; + + BYTE* op = (BYTE*)dst; + BYTE* const oend = op + outputSize; + BYTE* cpy; + + const BYTE* const dictEnd = + (dictStart == NULL) ? NULL : dictStart + dictSize; + + const int checkOffset = (dictSize < (int)(64 KB)); + + /* Set up the "end" pointers for the shortcut. */ + const BYTE* const shortiend = iend - 14 /*maxLL*/ - 2 /*offset*/; + const BYTE* const shortoend = oend - 14 /*maxLL*/ - 18 /*maxML*/; + + const BYTE* match; + size_t offset; + unsigned token; + size_t length; + + DEBUGLOG(5, "LZ4_decompress_generic (srcSize:%i, dstSize:%i)", srcSize, + outputSize); + + /* Special cases */ + assert(lowPrefix <= op); + if (unlikely(outputSize == 0)) { + /* Empty output buffer */ + if (partialDecoding) + return 0; + return ((srcSize == 1) && (*ip == 0)) ? 0 : -1; + } + if (unlikely(srcSize == 0)) { + return -1; + } + + /* LZ4_FAST_DEC_LOOP: + * designed for modern OoO performance cpus, + * where copying reliably 32-bytes is preferable to an unpredictable + * branch. note : fast loop may show a regression for some client arm + * chips. 
*/ +#if LZ4_FAST_DEC_LOOP + if ((oend - op) < FASTLOOP_SAFE_DISTANCE) { + DEBUGLOG(6, "move to safe decode loop"); + goto safe_decode; + } + + /* Fast loop : decode sequences as long as output < + * oend-FASTLOOP_SAFE_DISTANCE */ + DEBUGLOG(6, "using fast decode loop"); + while (1) { + /* Main fastloop assertion: We can always wildcopy + * FASTLOOP_SAFE_DISTANCE */ + assert(oend - op >= FASTLOOP_SAFE_DISTANCE); + assert(ip < iend); + token = *ip++; + length = token >> ML_BITS; /* literal length */ + DEBUGLOG(7, "blockPos%6u: litLength token = %u", + (unsigned)(op - (BYTE*)dst), (unsigned)length); + + /* decode literal length */ + if (length == RUN_MASK) { + size_t const addl = read_variable_length(&ip, iend - RUN_MASK, 1); + if (addl == rvl_error) { + DEBUGLOG(6, "error reading long literal length"); + goto _output_error; + } + length += addl; + if (unlikely((uptrval)(op) + length < (uptrval)(op))) { + goto _output_error; + } /* overflow detection */ + if (unlikely((uptrval)(ip) + length < (uptrval)(ip))) { + goto _output_error; + } /* overflow detection */ + + /* copy literals */ + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if ((op + length > oend - 32) || (ip + length > iend - 32)) { + goto safe_literal_copy; + } + LZ4_wildCopy32(op, ip, op + length); + ip += length; + op += length; + } else if (ip <= iend - (16 + 1 /*max lit + offset + nextToken*/)) { + /* We don't need to check oend, since we check it once for each loop + * below */ + DEBUGLOG(7, "copy %u bytes in a 16-bytes stripe", (unsigned)length); + /* Literals can only be <= 14, but hope compilers optimize better + * when copy by a register size */ + LZ4_memcpy(op, ip, 16); + ip += length; + op += length; + } else { + goto safe_literal_copy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); + ip += 2; + DEBUGLOG(6, "blockPos%6u: offset = %u", (unsigned)(op - (BYTE*)dst), + (unsigned)offset); + match = op - offset; + assert(match <= op); /* overflow check */ + + /* get matchlength */ + length = token & ML_MASK; + DEBUGLOG(7, " match length token = %u (len==%u)", (unsigned)length, + (unsigned)length + MINMATCH); + + if (length == ML_MASK) { + size_t const addl = + read_variable_length(&ip, iend - LASTLITERALS + 1, 0); + if (addl == rvl_error) { + DEBUGLOG(5, "error reading long match length"); + goto _output_error; + } + length += addl; + length += MINMATCH; + DEBUGLOG(7, " long match length == %u", (unsigned)length); + if (unlikely((uptrval)(op) + length < (uptrval)op)) { + goto _output_error; + } /* overflow detection */ + if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { + goto safe_match_copy; + } + } else { + length += MINMATCH; + if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { + DEBUGLOG(7, "moving to safe_match_copy (ml==%u)", + (unsigned)length); + goto safe_match_copy; + } + + /* Fastpath check: skip LZ4_wildCopy32 when true */ + if ((dict == withPrefix64k) || (match >= lowPrefix)) { + if (offset >= 8) { + assert(match >= lowPrefix); + assert(match <= op); + assert(op + 18 <= oend); + + LZ4_memcpy(op, match, 8); + LZ4_memcpy(op + 8, match + 8, 8); + LZ4_memcpy(op + 16, match + 16, 2); + op += length; + continue; + } + } + } + + if (checkOffset && (unlikely(match + dictSize < lowPrefix))) { + DEBUGLOG(5, "Error : pos=%zi, offset=%zi => outside buffers", + op - lowPrefix, op - match); + goto _output_error; + } + /* match starting within external dictionary */ + if ((dict == usingExtDict) && (match < lowPrefix)) { + assert(dictEnd != NULL); + if (unlikely(op + length > oend - LASTLITERALS)) { + if 
(partialDecoding) { + DEBUGLOG( + 7, "partialDecoding: dictionary match, close to dstEnd"); + length = MIN(length, (size_t)(oend - op)); + } else { + DEBUGLOG(6, "end-of-block condition violated") + goto _output_error; + } + } + + if (length <= (size_t)(lowPrefix - match)) { + /* match fits entirely within external dictionary : just copy */ + LZ4_memmove(op, dictEnd - (lowPrefix - match), length); + op += length; + } else { + /* match stretches into both external dictionary and current + * block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + LZ4_memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) { + *op++ = *copyFrom++; + } + } else { + LZ4_memcpy(op, lowPrefix, restSize); + op += restSize; + } + } + continue; + } + + /* copy match within block */ + cpy = op + length; + + assert((op <= oend) && (oend - op >= 32)); + if (unlikely(offset < 16)) { + LZ4_memcpy_using_offset(op, match, cpy, offset); + } else { + LZ4_wildCopy32(op, match, cpy); + } + + op = cpy; /* wildcopy correction */ + } + safe_decode: +#endif + + /* Main Loop : decode remaining sequences where output < + * FASTLOOP_SAFE_DISTANCE */ + DEBUGLOG(6, "using safe decode loop"); + while (1) { + assert(ip < iend); + token = *ip++; + length = token >> ML_BITS; /* literal length */ + DEBUGLOG(7, "blockPos%6u: litLength token = %u", + (unsigned)(op - (BYTE*)dst), (unsigned)length); + + /* A two-stage shortcut for the most common case: + * 1) If the literal length is 0..14, and there is enough space, + * enter the shortcut and copy 16 bytes on behalf of the literals + * (in the fast mode, only 8 bytes can be safely copied this way). + * 2) Further if the match length is 4..18, copy 18 bytes in a similar + * manner; but we ensure that there's enough space in the output for + * those 18 bytes earlier, upon entering the shortcut (in other words, + * there is a combined check for both stages). + */ + if ((length != RUN_MASK) + /* strictly "less than" on input, to re-enter the loop with at + least one byte */ + && likely((ip < shortiend) & (op <= shortoend))) { + /* Copy the literals */ + LZ4_memcpy(op, ip, 16); + op += length; + ip += length; + + /* The second stage: prepare for match copying, decode full info. + * If it doesn't work out, the info won't be wasted. */ + length = token & ML_MASK; /* match length */ + DEBUGLOG(7, "blockPos%6u: matchLength token = %u (len=%u)", + (unsigned)(op - (BYTE*)dst), (unsigned)length, + (unsigned)length + 4); + offset = LZ4_readLE16(ip); + ip += 2; + match = op - offset; + assert(match <= op); /* check overflow */ + + /* Do not deal with overlapping matches. */ + if ((length != ML_MASK) && (offset >= 8) && + (dict == withPrefix64k || match >= lowPrefix)) { + /* Copy the match. */ + LZ4_memcpy(op + 0, match + 0, 8); + LZ4_memcpy(op + 8, match + 8, 8); + LZ4_memcpy(op + 16, match + 16, 2); + op += length + MINMATCH; + /* Both stages worked, load the next token. */ + continue; + } + + /* The second stage didn't work out, but the info is ready. + * Propel it right to the point of match copying. 
*/ + goto _copy_match; + } + + /* decode literal length */ + if (length == RUN_MASK) { + size_t const addl = read_variable_length(&ip, iend - RUN_MASK, 1); + if (addl == rvl_error) { + goto _output_error; + } + length += addl; + if (unlikely((uptrval)(op) + length < (uptrval)(op))) { + goto _output_error; + } /* overflow detection */ + if (unlikely((uptrval)(ip) + length < (uptrval)(ip))) { + goto _output_error; + } /* overflow detection */ + } + +#if LZ4_FAST_DEC_LOOP + safe_literal_copy: +#endif + /* copy literals */ + cpy = op + length; + + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if ((cpy > oend - MFLIMIT) || + (ip + length > iend - (2 + 1 + LASTLITERALS))) { + /* We've either hit the input parsing restriction or the output + * parsing restriction. In the normal scenario, decoding a full + * block, it must be the last sequence, otherwise it's an error + * (invalid input or dimensions). In partialDecoding scenario, it's + * necessary to ensure there is no buffer overflow. + */ + if (partialDecoding) { + /* Since we are partial decoding we may be in this block because + * of the output parsing restriction, which is not valid since + * the output buffer is allowed to be undersized. + */ + DEBUGLOG(7, "partialDecoding: copying literals, close to input " + "or output end") + DEBUGLOG(7, "partialDecoding: literal length = %u", + (unsigned)length); + DEBUGLOG(7, "partialDecoding: remaining space in dstBuffer : %i", + (int)(oend - op)); + DEBUGLOG(7, "partialDecoding: remaining space in srcBuffer : %i", + (int)(iend - ip)); + /* Finishing in the middle of a literals segment, + * due to lack of input. + */ + if (ip + length > iend) { + length = (size_t)(iend - ip); + cpy = op + length; + } + /* Finishing in the middle of a literals segment, + * due to lack of output space. + */ + if (cpy > oend) { + cpy = oend; + assert(op <= oend); + length = (size_t)(oend - op); + } + } else { + /* We must be on the last sequence (or invalid) because of the + * parsing limitations so check that we exactly consume the input + * and don't overrun the output buffer. + */ + if ((ip + length != iend) || (cpy > oend)) { + DEBUGLOG(5, "should have been last run of literals") + DEBUGLOG(5, "ip(%p) + length(%i) = %p != iend (%p)", + (void*)ip, (int)length, (void*)(ip + length), + (void*)iend); + DEBUGLOG(5, "or cpy(%p) > (oend-MFLIMIT)(%p)", (void*)cpy, + (void*)(oend - MFLIMIT)); + DEBUGLOG(5, "after writing %u bytes / %i bytes available", + (unsigned)(op - (BYTE*)dst), outputSize); + goto _output_error; + } + } + LZ4_memmove(op, ip, + length); /* supports overlapping memory regions, for + in-place decompression scenarios */ + ip += length; + op += length; + /* Necessarily EOF when !partialDecoding. + * When partialDecoding, it is EOF if we've either + * filled the output buffer or + * can't proceed with reading an offset for following match. 
+ */ + if (!partialDecoding || (cpy == oend) || (ip >= (iend - 2))) { + break; + } + } else { + LZ4_wildCopy8(op, ip, + cpy); /* can overwrite up to 8 bytes beyond cpy */ + ip += length; + op = cpy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); + ip += 2; + match = op - offset; + + /* get matchlength */ + length = token & ML_MASK; + DEBUGLOG(7, "blockPos%6u: matchLength token = %u", + (unsigned)(op - (BYTE*)dst), (unsigned)length); + + _copy_match: + if (length == ML_MASK) { + size_t const addl = + read_variable_length(&ip, iend - LASTLITERALS + 1, 0); + if (addl == rvl_error) { + goto _output_error; + } + length += addl; + if (unlikely((uptrval)(op) + length < (uptrval)op)) + goto _output_error; /* overflow detection */ + } + length += MINMATCH; + +#if LZ4_FAST_DEC_LOOP + safe_match_copy: +#endif + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) + goto _output_error; /* Error : offset outside buffers */ + /* match starting within external dictionary */ + if ((dict == usingExtDict) && (match < lowPrefix)) { + assert(dictEnd != NULL); + if (unlikely(op + length > oend - LASTLITERALS)) { + if (partialDecoding) + length = MIN(length, (size_t)(oend - op)); + else + goto _output_error; /* doesn't respect parsing restriction */ + } + + if (length <= (size_t)(lowPrefix - match)) { + /* match fits entirely within external dictionary : just copy */ + LZ4_memmove(op, dictEnd - (lowPrefix - match), length); + op += length; + } else { + /* match stretches into both external dictionary and current + * block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + LZ4_memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) + *op++ = *copyFrom++; + } else { + LZ4_memcpy(op, lowPrefix, restSize); + op += restSize; + } + } + continue; + } + assert(match >= lowPrefix); + + /* copy match within block */ + cpy = op + length; + + /* partialDecoding : may end anywhere within the block */ + assert(op <= oend); + if (partialDecoding && (cpy > oend - MATCH_SAFEGUARD_DISTANCE)) { + size_t const mlen = MIN(length, (size_t)(oend - op)); + const BYTE* const matchEnd = match + mlen; + BYTE* const copyEnd = op + mlen; + if (matchEnd > op) { /* overlap copy */ + while (op < copyEnd) { + *op++ = *match++; + } + } else { + LZ4_memcpy(op, match, mlen); + } + op = copyEnd; + if (op == oend) { + break; + } + continue; + } + + if (unlikely(offset < 8)) { + LZ4_write32(op, 0); /* silence msan warning when offset==0 */ + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += inc32table[offset]; + LZ4_memcpy(op + 4, match, 4); + match -= dec64table[offset]; + } else { + LZ4_memcpy(op, match, 8); + match += 8; + } + op += 8; + + if (unlikely(cpy > oend - MATCH_SAFEGUARD_DISTANCE)) { + BYTE* const oCopyLimit = oend - (WILDCOPYLENGTH - 1); + if (cpy > oend - LASTLITERALS) { + goto _output_error; + } /* Error : last LASTLITERALS bytes must be literals (uncompressed) + */ + if (op < oCopyLimit) { + LZ4_wildCopy8(op, match, oCopyLimit); + match += oCopyLimit - op; + op = oCopyLimit; + } + while (op < cpy) { + *op++ = *match++; + } + } else { + LZ4_memcpy(op, match, 8); + if (length > 16) { + LZ4_wildCopy8(op + 8, match + 8, cpy); + } + } + op = cpy; /* wildcopy correction */ + } + + /* end of decoding */ + DEBUGLOG(5, "decoded %i bytes", (int)(((char*)op) - dst)); 
+ return (int)(((char*)op) - dst); /* Nb of output bytes decoded */ + + /* Overflow error detected */ + _output_error: + return (int)(-(((const char*)ip) - src)) - 1; + } +} + +/*===== Instantiate the API decoding functions. =====*/ + +LZ4_FORCE_O2 +int LZ4_decompress_safe(const char* source, + char* dest, + int compressedSize, + int maxDecompressedSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, + maxDecompressedSize, decode_full_block, noDict, + (BYTE*)dest, NULL, 0); +} + +LZ4_FORCE_O2 +int LZ4_decompress_safe_partial(const char* src, + char* dst, + int compressedSize, + int targetOutputSize, + int dstCapacity) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(src, dst, compressedSize, dstCapacity, + partial_decode, noDict, (BYTE*)dst, NULL, 0); +} + +LZ4_FORCE_O2 +int LZ4_decompress_fast(const char* source, char* dest, int originalSize) +{ + DEBUGLOG(5, "LZ4_decompress_fast"); + return LZ4_decompress_unsafe_generic((const BYTE*)source, (BYTE*)dest, + originalSize, 0, NULL, 0); +} + +/*===== Instantiate a few more decoding cases, used more than once. =====*/ + +LZ4_FORCE_O2 /* Exported, an obsolete API function. */ + int + LZ4_decompress_safe_withPrefix64k(const char* source, + char* dest, + int compressedSize, + int maxOutputSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + decode_full_block, withPrefix64k, + (BYTE*)dest - 64 KB, NULL, 0); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_safe_partial_withPrefix64k(const char* source, + char* dest, + int compressedSize, + int targetOutputSize, + int dstCapacity) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, + partial_decode, withPrefix64k, + (BYTE*)dest - 64 KB, NULL, 0); +} + +/* Another obsolete API function, paired with the previous one. 
*/ +int LZ4_decompress_fast_withPrefix64k(const char* source, + char* dest, + int originalSize) +{ + return LZ4_decompress_unsafe_generic((const BYTE*)source, (BYTE*)dest, + originalSize, 64 KB, NULL, 0); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_safe_withSmallPrefix(const char* source, + char* dest, + int compressedSize, + int maxOutputSize, + size_t prefixSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + decode_full_block, noDict, + (BYTE*)dest - prefixSize, NULL, 0); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_safe_partial_withSmallPrefix(const char* source, + char* dest, + int compressedSize, + int targetOutputSize, + int dstCapacity, + size_t prefixSize) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, + partial_decode, noDict, + (BYTE*)dest - prefixSize, NULL, 0); +} + +LZ4_FORCE_O2 +int LZ4_decompress_safe_forceExtDict(const char* source, + char* dest, + int compressedSize, + int maxOutputSize, + const void* dictStart, + size_t dictSize) +{ + DEBUGLOG(5, "LZ4_decompress_safe_forceExtDict"); + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + decode_full_block, usingExtDict, (BYTE*)dest, + (const BYTE*)dictStart, dictSize); +} + +LZ4_FORCE_O2 +int LZ4_decompress_safe_partial_forceExtDict(const char* source, + char* dest, + int compressedSize, + int targetOutputSize, + int dstCapacity, + const void* dictStart, + size_t dictSize) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, + partial_decode, usingExtDict, (BYTE*)dest, + (const BYTE*)dictStart, dictSize); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_fast_extDict(const char* source, + char* dest, + int originalSize, + const void* dictStart, + size_t dictSize) +{ + return LZ4_decompress_unsafe_generic((const BYTE*)source, (BYTE*)dest, + originalSize, 0, (const BYTE*)dictStart, + dictSize); +} + +/* The "double dictionary" mode, for use with e.g. ring buffers: the first part + * of the dictionary is passed as prefix, and the second via dictStart + + * dictSize. These routines are used only once, in LZ4_decompress_*_continue(). + */ +LZ4_FORCE_INLINE +int LZ4_decompress_safe_doubleDict(const char* source, + char* dest, + int compressedSize, + int maxOutputSize, + size_t prefixSize, + const void* dictStart, + size_t dictSize) +{ + return LZ4_decompress_generic( + source, dest, compressedSize, maxOutputSize, decode_full_block, + usingExtDict, (BYTE*)dest - prefixSize, (const BYTE*)dictStart, dictSize); +} + +/*===== streaming decompression functions =====*/ + +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +LZ4_streamDecode_t* LZ4_createStreamDecode(void) +{ + LZ4_STATIC_ASSERT(sizeof(LZ4_streamDecode_t) >= + sizeof(LZ4_streamDecode_t_internal)); + return (LZ4_streamDecode_t*)ALLOC_AND_ZERO(sizeof(LZ4_streamDecode_t)); +} + +int LZ4_freeStreamDecode(LZ4_streamDecode_t* LZ4_stream) +{ + if (LZ4_stream == NULL) { + return 0; + } /* support free on NULL */ + FREEMEM(LZ4_stream); + return 0; +} +#endif + +/*! LZ4_setStreamDecode() : + * Use this function to instruct where to find the dictionary. + * This function is not necessary if previous data is still available where it + * was decoded. Loading a size of 0 is allowed (same effect as no dictionary). 
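An illustrative sketch of how a caller might drive the streaming decoder that LZ4_setStreamDecode() configures; decode_two_blocks and its buffers are hypothetical names, and the two blocks are assumed to come from one dependent stream:

    #include "lz4.h"

    /* Decode two dependent blocks produced by a streaming compressor. */
    static int decode_two_blocks(const char* blk1, int size1,
                                 const char* blk2, int size2,
                                 char* dst, int dstCapacity)
    {
        LZ4_streamDecode_t* sd = LZ4_createStreamDecode();
        int d1 = -1, d2 = -1;
        if (sd != NULL && LZ4_setStreamDecode(sd, NULL, 0)) {
            d1 = LZ4_decompress_safe_continue(sd, blk1, dst, size1, dstCapacity);
            if (d1 >= 0)   /* block 2 may reference block 1's decoded bytes */
                d2 = LZ4_decompress_safe_continue(sd, blk2, dst + d1,
                                                  size2, dstCapacity - d1);
        }
        LZ4_freeStreamDecode(sd);   /* accepts NULL */
        return (d1 < 0 || d2 < 0) ? -1 : d1 + d2;
    }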
+ * @return : 1 if OK, 0 if error + */ +int LZ4_setStreamDecode(LZ4_streamDecode_t* LZ4_streamDecode, + const char* dictionary, + int dictSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + lz4sd->prefixSize = (size_t)dictSize; + if (dictSize) { + assert(dictionary != NULL); + lz4sd->prefixEnd = (const BYTE*)dictionary + dictSize; + } else { + lz4sd->prefixEnd = (const BYTE*)dictionary; + } + lz4sd->externalDict = NULL; + lz4sd->extDictSize = 0; + return 1; +} + +/*! LZ4_decoderRingBufferSize() : + * when setting a ring buffer for streaming decompression (optional scenario), + * provides the minimum size of this ring buffer + * to be compatible with any source respecting maxBlockSize condition. + * Note : in a ring buffer scenario, + * blocks are presumed decompressed next to each other. + * When not enough space remains for next block (remainingSize < maxBlockSize), + * decoding resumes from beginning of ring buffer. + * @return : minimum ring buffer size, + * or 0 if there is an error (invalid maxBlockSize). + */ +int LZ4_decoderRingBufferSize(int maxBlockSize) +{ + if (maxBlockSize < 0) + return 0; + if (maxBlockSize > LZ4_MAX_INPUT_SIZE) + return 0; + if (maxBlockSize < 16) + maxBlockSize = 16; + return LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize); +} + +/* +*_continue() : + These decoding functions allow decompression of multiple blocks in +"streaming" mode. Previously decoded blocks must still be available at the +memory position where they were decoded. If it's not possible, save the relevant +part of decoded data into a safe buffer, and indicate where it stands using +LZ4_setStreamDecode() +*/ +LZ4_FORCE_O2 +int LZ4_decompress_safe_continue(LZ4_streamDecode_t* LZ4_streamDecode, + const char* source, + char* dest, + int compressedSize, + int maxOutputSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + int result; + + if (lz4sd->prefixSize == 0) { + /* The first call, no dictionary yet. */ + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); + if (result <= 0) + return result; + lz4sd->prefixSize = (size_t)result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } else if (lz4sd->prefixEnd == (BYTE*)dest) { + /* They're rolling the current segment. */ + if (lz4sd->prefixSize >= 64 KB - 1) + result = LZ4_decompress_safe_withPrefix64k( + source, dest, compressedSize, maxOutputSize); + else if (lz4sd->extDictSize == 0) + result = LZ4_decompress_safe_withSmallPrefix( + source, dest, compressedSize, maxOutputSize, lz4sd->prefixSize); + else + result = LZ4_decompress_safe_doubleDict( + source, dest, compressedSize, maxOutputSize, lz4sd->prefixSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) + return result; + lz4sd->prefixSize += (size_t)result; + lz4sd->prefixEnd += result; + } else { + /* The buffer wraps around, or they're switching to another buffer. 
*/ + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_safe_forceExtDict( + source, dest, compressedSize, maxOutputSize, lz4sd->externalDict, + lz4sd->extDictSize); + if (result <= 0) + return result; + lz4sd->prefixSize = (size_t)result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } + + return result; +} + +LZ4_FORCE_O2 int +LZ4_decompress_fast_continue(LZ4_streamDecode_t* LZ4_streamDecode, + const char* source, + char* dest, + int originalSize) +{ + LZ4_streamDecode_t_internal* const lz4sd = + (assert(LZ4_streamDecode != NULL), &LZ4_streamDecode->internal_donotuse); + int result; + + DEBUGLOG(5, "LZ4_decompress_fast_continue (toDecodeSize=%i)", originalSize); + assert(originalSize >= 0); + + if (lz4sd->prefixSize == 0) { + DEBUGLOG(5, "first invocation : no prefix nor extDict"); + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_fast(source, dest, originalSize); + if (result <= 0) + return result; + lz4sd->prefixSize = (size_t)originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } else if (lz4sd->prefixEnd == (BYTE*)dest) { + DEBUGLOG(5, "continue using existing prefix"); + result = LZ4_decompress_unsafe_generic( + (const BYTE*)source, (BYTE*)dest, originalSize, lz4sd->prefixSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) + return result; + lz4sd->prefixSize += (size_t)originalSize; + lz4sd->prefixEnd += originalSize; + } else { + DEBUGLOG(5, "prefix becomes extDict"); + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_fast_extDict( + source, dest, originalSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) + return result; + lz4sd->prefixSize = (size_t)originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } + + return result; +} + +/* +Advanced decoding functions : +*_usingDict() : + These decoding functions work the same as "_continue" ones, + the dictionary must be explicitly provided within parameters +*/ + +int LZ4_decompress_safe_usingDict(const char* source, + char* dest, + int compressedSize, + int maxOutputSize, + const char* dictStart, + int dictSize) +{ + if (dictSize == 0) + return LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); + if (dictStart + dictSize == dest) { + if (dictSize >= 64 KB - 1) { + return LZ4_decompress_safe_withPrefix64k(source, dest, compressedSize, + maxOutputSize); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_withSmallPrefix( + source, dest, compressedSize, maxOutputSize, (size_t)dictSize); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_forceExtDict( + source, dest, compressedSize, maxOutputSize, dictStart, (size_t)dictSize); +} + +int LZ4_decompress_safe_partial_usingDict(const char* source, + char* dest, + int compressedSize, + int targetOutputSize, + int dstCapacity, + const char* dictStart, + int dictSize) +{ + if (dictSize == 0) + return LZ4_decompress_safe_partial(source, dest, compressedSize, + targetOutputSize, dstCapacity); + if (dictStart + dictSize == dest) { + if (dictSize >= 64 KB - 1) { + return LZ4_decompress_safe_partial_withPrefix64k( + source, dest, compressedSize, targetOutputSize, dstCapacity); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_partial_withSmallPrefix( + source, dest, compressedSize, targetOutputSize, dstCapacity, + (size_t)dictSize); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_partial_forceExtDict( + source, dest, compressedSize, 
targetOutputSize, dstCapacity, dictStart, + (size_t)dictSize); +} + +int LZ4_decompress_fast_usingDict(const char* source, + char* dest, + int originalSize, + const char* dictStart, + int dictSize) +{ + if (dictSize == 0 || dictStart + dictSize == dest) + return LZ4_decompress_unsafe_generic((const BYTE*)source, (BYTE*)dest, + originalSize, (size_t)dictSize, NULL, + 0); + assert(dictSize >= 0); + return LZ4_decompress_fast_extDict(source, dest, originalSize, dictStart, + (size_t)dictSize); +} + +/*=************************************************* + * Obsolete Functions + ***************************************************/ +/* obsolete compression functions */ +int LZ4_compress_limitedOutput(const char* source, + char* dest, + int inputSize, + int maxOutputSize) +{ + return LZ4_compress_default(source, dest, inputSize, maxOutputSize); +} +int LZ4_compress(const char* src, char* dest, int srcSize) +{ + return LZ4_compress_default(src, dest, srcSize, LZ4_compressBound(srcSize)); +} +int LZ4_compress_limitedOutput_withState( + void* state, const char* src, char* dst, int srcSize, int dstSize) +{ + return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); +} +int LZ4_compress_withState(void* state, const char* src, char* dst, int srcSize) +{ + return LZ4_compress_fast_extState(state, src, dst, srcSize, + LZ4_compressBound(srcSize), 1); +} +int LZ4_compress_limitedOutput_continue(LZ4_stream_t* LZ4_stream, + const char* src, + char* dst, + int srcSize, + int dstCapacity) +{ + return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, dstCapacity, + 1); +} +int LZ4_compress_continue(LZ4_stream_t* LZ4_stream, + const char* source, + char* dest, + int inputSize) +{ + return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, + LZ4_compressBound(inputSize), 1); +} + +/* +These decompression functions are deprecated and should no longer be used. +They are only provided here for compatibility with older user programs. +- LZ4_uncompress is totally equivalent to LZ4_decompress_fast +- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe +*/ +int LZ4_uncompress(const char* source, char* dest, int outputSize) +{ + return LZ4_decompress_fast(source, dest, outputSize); +} +int LZ4_uncompress_unknownOutputSize(const char* source, + char* dest, + int isize, + int maxOutputSize) +{ + return LZ4_decompress_safe(source, dest, isize, maxOutputSize); +} + +/* Obsolete Streaming functions */ + +int LZ4_sizeofStreamState(void) { return sizeof(LZ4_stream_t); } + +int LZ4_resetStreamState(void* state, char* inputBuffer) +{ + (void)inputBuffer; + LZ4_resetStream((LZ4_stream_t*)state); + return 0; +} + +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +void* LZ4_create(char* inputBuffer) +{ + (void)inputBuffer; + return LZ4_createStream(); +} +#endif + +char* LZ4_slideInputBuffer(void* state) +{ + /* avoid const char * -> char * conversion warning */ + return (char*)(uptrval)((LZ4_stream_t*)state)->internal_donotuse.dictionary; +} + +#endif /* LZ4_COMMONDEFS_ONLY */ diff --git a/tracegrind/lz4.h b/tracegrind/lz4.h new file mode 100644 index 000000000..a08439161 --- /dev/null +++ b/tracegrind/lz4.h @@ -0,0 +1,1053 @@ +/* + * LZ4 - Fast LZ compression algorithm + * Header File + * Copyright (c) Yann Collet. All rights reserved. 
+ + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 homepage : http://www.lz4.org + - LZ4 source repository : https://github.com/lz4/lz4 +*/ +#if defined(__cplusplus) +extern "C" { +#endif + +#ifndef LZ4_H_2983827168210 +#define LZ4_H_2983827168210 + +/* --- Dependency --- */ +#if !LZ4_FREESTANDING +#include <stddef.h> /* size_t */ +#endif + +/** + Introduction + + LZ4 is lossless compression algorithm, providing compression speed >500 MB/s + per core, scalable with multi-cores CPU. It features an extremely fast + decoder, with speed in multiple GB/s per core, typically reaching RAM speed + limits on multi-core systems. + + The LZ4 compression library provides in-memory compression and decompression + functions. It gives full buffer control to user. Compression can be done in: + - a single step (described as Simple Functions) + - a single step, reusing a context (described in Advanced Functions) + - unbounded multiple steps (described as Streaming compression) + + lz4.h generates and decodes LZ4-compressed blocks (doc/lz4_Block_format.md). + Decompressing such a compressed block requires additional metadata. + Exact metadata depends on exact decompression function. + For the typical case of LZ4_decompress_safe(), + metadata includes block's compressed size, and maximum bound of decompressed + size. Each application is free to encode and pass such metadata in whichever + way it wants. + + lz4.h only handle blocks, it can not generate Frames. + + Blocks are different from Frames (doc/lz4_Frame_format.md). + Frames bundle both blocks and metadata in a specified manner. + Embedding metadata is required for compressed data to be self-contained and + portable. Frame format is delivered through a companion API, declared in + lz4frame.h. The `lz4` CLI can only manage frames. +*/ + +/*^*************************************************************** + * Export parameters + *****************************************************************/ +/* + * LZ4_DLL_EXPORT : + * Enable exporting of functions when building a Windows DLL + * LZ4LIB_VISIBILITY : + * Control library symbols visibility.
+ */ +#ifndef LZ4LIB_VISIBILITY +#if defined(__GNUC__) && (__GNUC__ >= 4) +#define LZ4LIB_VISIBILITY __attribute__((visibility("default"))) +#else +#define LZ4LIB_VISIBILITY +#endif +#endif +#if defined(LZ4_DLL_EXPORT) && (LZ4_DLL_EXPORT == 1) +#define LZ4LIB_API __declspec(dllexport) LZ4LIB_VISIBILITY +#elif defined(LZ4_DLL_IMPORT) && (LZ4_DLL_IMPORT == 1) +#define LZ4LIB_API \ + __declspec(dllimport) \ + LZ4LIB_VISIBILITY /* It isn't required but allows to generate better \ + code, saving a function pointer load from the IAT \ + and an indirect jump.*/ +#else +#define LZ4LIB_API LZ4LIB_VISIBILITY +#endif + +/*! LZ4_FREESTANDING : + * When this macro is set to 1, it enables "freestanding mode" that is + * suitable for typical freestanding environment which doesn't support + * standard C library. + * + * - LZ4_FREESTANDING is a compile-time switch. + * - It requires the following macros to be defined: + * LZ4_memcpy, LZ4_memmove, LZ4_memset. + * - It only enables LZ4/HC functions which don't use heap. + * All LZ4F_* functions are not supported. + * - See tests/freestanding.c to check its basic setup. + */ +#if defined(LZ4_FREESTANDING) && (LZ4_FREESTANDING == 1) +#define LZ4_HEAPMODE 0 +#define LZ4HC_HEAPMODE 0 +#define LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION 1 +#if !defined(LZ4_memcpy) +#error "LZ4_FREESTANDING requires macro 'LZ4_memcpy'." +#endif +#if !defined(LZ4_memset) +#error "LZ4_FREESTANDING requires macro 'LZ4_memset'." +#endif +#if !defined(LZ4_memmove) +#error "LZ4_FREESTANDING requires macro 'LZ4_memmove'." +#endif +#elif !defined(LZ4_FREESTANDING) +#define LZ4_FREESTANDING 0 +#endif + +/*------ Version ------*/ +#define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */ +#define LZ4_VERSION_MINOR \ + 10 /* for new (non-breaking) interface capabilities \ + */ +#define LZ4_VERSION_RELEASE 0 /* for tweaks, bug-fixes, or development */ + +#define LZ4_VERSION_NUMBER \ + (LZ4_VERSION_MAJOR * 100 * 100 + LZ4_VERSION_MINOR * 100 + \ + LZ4_VERSION_RELEASE) + +#define LZ4_LIB_VERSION LZ4_VERSION_MAJOR.LZ4_VERSION_MINOR.LZ4_VERSION_RELEASE +#define LZ4_QUOTE(str) #str +#define LZ4_EXPAND_AND_QUOTE(str) LZ4_QUOTE(str) +#define LZ4_VERSION_STRING \ + LZ4_EXPAND_AND_QUOTE(LZ4_LIB_VERSION) /* requires v1.7.3+ */ + +LZ4LIB_API int +LZ4_versionNumber(void); /**< library version number; useful to check dll + version; requires v1.3.0+ */ +LZ4LIB_API const char* +LZ4_versionString(void); /**< library version string; useful to check dll + version; requires v1.7.5+ */ + +/*-************************************ + * Tuning memory usage + **************************************/ +/*! + * LZ4_MEMORY_USAGE : + * Can be selected at compile time, by setting LZ4_MEMORY_USAGE. + * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> + * 64KB; 20 -> 1MB) Increasing memory usage improves compression ratio, + * generally at the cost of speed. Reduced memory usage may improve speed at the + * cost of ratio, thanks to better cache locality. Default value is 14, for + * 16KB, which nicely fits into most L1 caches. + */ +#ifndef LZ4_MEMORY_USAGE +#define LZ4_MEMORY_USAGE LZ4_MEMORY_USAGE_DEFAULT +#endif + +/* These are absolute limits, they should not be changed by users */ +#define LZ4_MEMORY_USAGE_MIN 10 +#define LZ4_MEMORY_USAGE_DEFAULT 14 +#define LZ4_MEMORY_USAGE_MAX 20 + +#if (LZ4_MEMORY_USAGE < LZ4_MEMORY_USAGE_MIN) +#error "LZ4_MEMORY_USAGE is too small !" +#endif + +#if (LZ4_MEMORY_USAGE > LZ4_MEMORY_USAGE_MAX) +#error "LZ4_MEMORY_USAGE is too large !" 
+#endif + +/*-************************************ + * Simple Functions + **************************************/ +/*! LZ4_compress_default() : + * Compresses 'srcSize' bytes from buffer 'src' + * into already allocated 'dst' buffer of size 'dstCapacity'. + * Compression is guaranteed to succeed if 'dstCapacity' >= + * LZ4_compressBound(srcSize). It also runs faster, so it's a recommended + * setting. If the function cannot compress 'src' into a more limited 'dst' + * budget, compression stops *immediately*, and the function result is zero. In + * which case, 'dst' content is undefined (invalid). srcSize : max supported + * value is LZ4_MAX_INPUT_SIZE. dstCapacity : size of buffer 'dst' (which must + * be already allocated) + * @return : the number of bytes written into buffer 'dst' (necessarily <= + * dstCapacity) or 0 if compression fails Note : This function is protected + * against buffer overflow scenarios (never writes outside 'dst' buffer, nor + * read outside 'source' buffer). + */ +LZ4LIB_API int +LZ4_compress_default(const char* src, char* dst, int srcSize, int dstCapacity); + +/*! LZ4_decompress_safe() : + * @compressedSize : is the exact complete size of the compressed block. + * @dstCapacity : is the size of destination buffer (which must be already + * allocated), presumed an upper bound of decompressed size. + * @return : the number of bytes decompressed into destination buffer + * (necessarily <= dstCapacity) If destination buffer is not large enough, + * decoding will stop and output an error code (negative value). If the source + * stream is detected malformed, the function will stop decoding and return a + * negative result. Note 1 : This function is protected against malicious data + * packets : it will never writes outside 'dst' buffer, nor read outside + * 'source' buffer, even if the compressed block is maliciously modified to + * order the decoder to do these actions. In such case, the decoder stops + * immediately, and considers the compressed block malformed. Note 2 : + * compressedSize and dstCapacity must be provided to the function, the + * compressed block does not contain them. The implementation is free to send / + * store / derive this information in whichever way is most beneficial. If there + * is a need for a different format which bundles together both compressed data + * and its metadata, consider looking at lz4frame.h instead. + */ +LZ4LIB_API int LZ4_decompress_safe(const char* src, + char* dst, + int compressedSize, + int dstCapacity); + +/*-************************************ + * Advanced Functions + **************************************/ +#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ +#define LZ4_COMPRESSBOUND(isize) \ + ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE \ + ? 0 \ + : (isize) + ((isize) / 255) + 16) + +/*! LZ4_compressBound() : + Provides the maximum size that LZ4 compression may output in a "worst case" + scenario (input data not compressible) This function is primarily useful for + memory allocation purposes (destination buffer size). Macro + LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack + memory allocation for example). Note that LZ4_compress_default() compresses + faster when dstCapacity is >= LZ4_compressBound(srcSize) inputSize : max + supported value is LZ4_MAX_INPUT_SIZE return : maximum output size in a + "worst case" scenario or 0, if input size is incorrect (too large or + negative) +*/ +LZ4LIB_API int LZ4_compressBound(int inputSize); + +/*! 
LZ4_compress_fast() : + Same as LZ4_compress_default(), but allows selection of "acceleration" + factor. The larger the acceleration value, the faster the algorithm, but also + the lesser the compression. It's a trade-off. It can be fine tuned, with each + successive value providing roughly +~3% to speed. An acceleration value of + "1" is the same as regular LZ4_compress_default() Values <= 0 will be + replaced by LZ4_ACCELERATION_DEFAULT (currently == 1, see lz4.c). Values > + LZ4_ACCELERATION_MAX will be replaced by LZ4_ACCELERATION_MAX (currently == + 65537, see lz4.c). +*/ +LZ4LIB_API int LZ4_compress_fast( + const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + +/*! LZ4_compress_fast_extState() : + * Same as LZ4_compress_fast(), using an externally allocated memory space for + * its state. Use LZ4_sizeofState() to know how much memory must be allocated, + * and allocate it on 8-bytes boundaries (using `malloc()` typically). + * Then, provide this buffer as `void* state` to compression function. + */ +LZ4LIB_API int LZ4_sizeofState(void); +LZ4LIB_API int LZ4_compress_fast_extState(void* state, + const char* src, + char* dst, + int srcSize, + int dstCapacity, + int acceleration); + +/*! LZ4_compress_destSize() : + * Reverse the logic : compresses as much data as possible from 'src' buffer + * into already allocated buffer 'dst', of size >= 'dstCapacity'. + * This function either compresses the entire 'src' content into 'dst' if it's + * large enough, or fill 'dst' buffer completely with as much data as possible + * from 'src'. note: acceleration parameter is fixed to "default". + * + * *srcSizePtr : in+out parameter. Initially contains size of input. + * Will be modified to indicate how many bytes where read from + * 'src' to fill 'dst'. New value is necessarily <= input value. + * @return : Nb bytes written into 'dst' (necessarily <= dstCapacity) + * or 0 if compression fails. + * + * Note : 'targetDstSize' must be >= 1, because it's the smallest valid lz4 + * payload. + * + * Note 2:from v1.8.2 to v1.9.1, this function had a bug (fixed in v1.9.2+): + * the produced compressed content could, in rare circumstances, + * require to be decompressed into a destination buffer + * larger by at least 1 byte than decompressesSize. + * If an application uses `LZ4_compress_destSize()`, + * it's highly recommended to update liblz4 to v1.9.2 or better. + * If this can't be done or ensured, + * the receiving decompression function should provide + * a dstCapacity which is > decompressedSize, by at least 1 byte. + * See https://github.com/lz4/lz4/issues/859 for details + */ +LZ4LIB_API int LZ4_compress_destSize(const char* src, + char* dst, + int* srcSizePtr, + int targetDstSize); + +/*! LZ4_decompress_safe_partial() : + * Decompress an LZ4 compressed block, of size 'srcSize' at position 'src', + * into destination buffer 'dst' of size 'dstCapacity'. + * Up to 'targetOutputSize' bytes will be decoded. + * The function stops decoding on reaching this objective. + * This can be useful to boost performance + * whenever only the beginning of a block is required. + * + * @return : the number of bytes decoded in `dst` (necessarily <= + * targetOutputSize) If source stream is detected malformed, function returns a + * negative result. + * + * Note 1 : @return can be < targetOutputSize, if compressed block contains + * less data. 
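As a hedged aside to the notes above and below, a minimal round-trip sketch; the 4 KB bound and the 100-byte target are arbitrary illustration values:

    #include "lz4.h"
    #include <string.h>

    static void partial_roundtrip_demo(const char* text)
    {
        char comp[LZ4_COMPRESSBOUND(4096)];
        char head[100];
        int srcSize = (int)strlen(text);
        int compSize = LZ4_compress_default(text, comp, srcSize, (int)sizeof(comp));
        if (compSize > 0) {
            /* Decode only the first (up to) 100 bytes of the original data. */
            int got = LZ4_decompress_safe_partial(comp, head, compSize,
                                                  (int)sizeof(head),
                                                  (int)sizeof(head));
            (void)got;   /* got <= 100, or negative if the block is malformed */
        }
    }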
+ * + * Note 2 : targetOutputSize must be <= dstCapacity + * + * Note 3 : this function effectively stops decoding on reaching + * targetOutputSize, so dstCapacity is kind of redundant. This is because in + * older versions of this function, decoding operation would still write + * complete sequences. Therefore, there was no guarantee that it would stop + * writing at exactly targetOutputSize, it could write more bytes, though only + * up to dstCapacity. Some "margin" used to be required for this operation to + * work properly. Thankfully, this is no longer necessary. The function + * nonetheless keeps the same signature, in an effort to preserve API + * compatibility. + * + * Note 4 : If srcSize is the exact size of the block, + * then targetOutputSize can be any value, + * including larger than the block's decompressed size. + * The function will, at most, generate block's decompressed size. + * + * Note 5 : If srcSize is _larger_ than block's compressed size, + * then targetOutputSize **MUST** be <= block's decompressed size. + * Otherwise, *silent corruption will occur*. + */ +LZ4LIB_API int LZ4_decompress_safe_partial(const char* src, + char* dst, + int srcSize, + int targetOutputSize, + int dstCapacity); + +/*-********************************************* + * Streaming Compression Functions + ***********************************************/ +typedef union LZ4_stream_u LZ4_stream_t; /* incomplete type (defined later) */ + +/*! + Note about RC_INVOKED + + - RC_INVOKED is predefined symbol of rc.exe (the resource compiler which is + part of MSVC/Visual Studio). + https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros + + - Since rc.exe is a legacy compiler, it truncates long symbol (> 30 chars) + and reports warning "RC4011: identifier truncated". + + - To eliminate the warning, we surround long preprocessor symbol with + "#if !defined(RC_INVOKED) ... #endif" block that means + "skip this block when rc.exe is trying to read it". +*/ +#if !defined( \ + RC_INVOKED) /* https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros \ + */ +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +LZ4LIB_API LZ4_stream_t* LZ4_createStream(void); +LZ4LIB_API int LZ4_freeStream(LZ4_stream_t* streamPtr); +#endif /* !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) */ +#endif + +/*! LZ4_resetStream_fast() : v1.9.0+ + * Use this to prepare an LZ4_stream_t for a new chain of dependent blocks + * (e.g., LZ4_compress_fast_continue()). + * + * An LZ4_stream_t must be initialized once before usage. + * This is automatically done when created by LZ4_createStream(). + * However, should the LZ4_stream_t be simply declared on stack (for example), + * it's necessary to initialize it first, using LZ4_initStream(). + * + * After init, start any new stream with LZ4_resetStream_fast(). + * A same LZ4_stream_t can be re-used multiple times consecutively + * and compress multiple streams, + * provided that it starts each new stream with LZ4_resetStream_fast(). + * + * LZ4_resetStream_fast() is much faster than LZ4_initStream(), + * but is not compatible with memory regions containing garbage data. + * + * Note: it's only useful to call LZ4_resetStream_fast() + * in the context of streaming compression. + * The *extState* functions perform their own resets. + * Invoking LZ4_resetStream_fast() before is redundant, and even + * counterproductive. + */ +LZ4LIB_API void LZ4_resetStream_fast(LZ4_stream_t* streamPtr); + +/*! 
LZ4_loadDict() : + * Use this function to reference a static dictionary into LZ4_stream_t. + * The dictionary must remain available during compression. + * LZ4_loadDict() triggers a reset, so any previous data will be forgotten. + * The same dictionary will have to be loaded on decompression side for + * successful decoding. Dictionary are useful for better compression of small + * data (KB range). While LZ4 itself accepts any input as dictionary, dictionary + * efficiency is also a topic. When in doubt, employ the Zstandard's Dictionary + * Builder. Loading a size of 0 is allowed, and is the same as reset. + * @return : loaded dictionary size, in bytes (note: only the last 64 KB are + * loaded) + */ +LZ4LIB_API int +LZ4_loadDict(LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); + +/*! LZ4_loadDictSlow() : v1.10.0+ + * Same as LZ4_loadDict(), + * but uses a bit more cpu to reference the dictionary content more thoroughly. + * This is expected to slightly improve compression ratio. + * The extra-cpu cost is likely worth it if the dictionary is re-used across + * multiple sessions. + * @return : loaded dictionary size, in bytes (note: only the last 64 KB are + * loaded) + */ +LZ4LIB_API int +LZ4_loadDictSlow(LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); + +/*! LZ4_attach_dictionary() : stable since v1.10.0 + * + * This allows efficient re-use of a static dictionary multiple times. + * + * Rather than re-loading the dictionary buffer into a working context before + * each compression, or copying a pre-loaded dictionary's LZ4_stream_t into a + * working LZ4_stream_t, this function introduces a no-copy setup mechanism, + * in which the working stream references @dictionaryStream in-place. + * + * Several assumptions are made about the state of @dictionaryStream. + * Currently, only states which have been prepared by LZ4_loadDict() or + * LZ4_loadDictSlow() should be expected to work. + * + * Alternatively, the provided @dictionaryStream may be NULL, + * in which case any existing dictionary stream is unset. + * + * If a dictionary is provided, it replaces any pre-existing stream history. + * The dictionary contents are the only history that can be referenced and + * logically immediately precede the data compressed in the first subsequent + * compression call. + * + * The dictionary will only remain attached to the working stream through the + * first compression call, at the end of which it is cleared. + * @dictionaryStream stream (and source buffer) must remain in-place / + * accessible / unchanged through the completion of the compression session. + * + * Note: there is no equivalent LZ4_attach_*() method on the decompression side + * because there is no initialization cost, hence no need to share the cost + * across multiple sessions. To decompress LZ4 blocks using dictionary, attached + * or not, just employ the regular LZ4_setStreamDecode() for streaming, or the + * stateless LZ4_decompress_safe_usingDict() for one-shot decompression. + */ +LZ4LIB_API void LZ4_attach_dictionary(LZ4_stream_t* workingStream, + const LZ4_stream_t* dictionaryStream); + +/*! LZ4_compress_fast_continue() : + * Compress 'src' content using data from previously compressed blocks, for + * better compression ratio. 'dst' buffer must be already allocated. If + * dstCapacity >= LZ4_compressBound(srcSize), compression is guaranteed to + * succeed, and runs faster. + * + * @return : size of compressed block + * or 0 if there is an error (typically, cannot fit into 'dst'). 
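An illustrative sketch of the chained-block call pattern described above; msg1 and msg2 are hypothetical chunks that must stay unmodified at their addresses between calls:

    #include "lz4.h"

    static void compress_two_dependent_blocks(const char* msg1, int len1,
                                              const char* msg2, int len2,
                                              char* out, int outCapacity)
    {
        LZ4_stream_t stream;                        /* stack-allocated state ... */
        LZ4_initStream(&stream, sizeof(stream));    /* ... initialized once */
        int c1 = LZ4_compress_fast_continue(&stream, msg1, out, len1,
                                            outCapacity, 1);
        if (c1 <= 0)
            return;                                 /* block did not fit in 'out' */
        /* The second block may reference msg1's data, improving its ratio. */
        int c2 = LZ4_compress_fast_continue(&stream, msg2, out + c1, len2,
                                            outCapacity - c1, 1);
        (void)c2;
    }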
+ * + * Note 1 : Each invocation to LZ4_compress_fast_continue() generates a new + * block. Each block has precise boundaries. Each block must be decompressed + * separately, calling LZ4_decompress_*() with relevant metadata. It's not + * possible to append blocks together and expect a single invocation of + * LZ4_decompress_*() to decompress them together. + * + * Note 2 : The previous 64KB of source data is __assumed__ to remain present, + * unmodified, at same address in memory ! + * + * Note 3 : When input is structured as a double-buffer, each buffer can have + * any size, including < 64 KB. Make sure that buffers are separated, by at + * least one byte. This construction ensures that each block only depends on + * previous block. + * + * Note 4 : If input buffer is a ring-buffer, it can have any size, including < + * 64 KB. + * + * Note 5 : After an error, the stream status is undefined (invalid), it can + * only be reset or freed. + */ +LZ4LIB_API int LZ4_compress_fast_continue(LZ4_stream_t* streamPtr, + const char* src, + char* dst, + int srcSize, + int dstCapacity, + int acceleration); + +/*! LZ4_saveDict() : + * If last 64KB data cannot be guaranteed to remain available at its current + * memory location, save it into a safer place (char* safeBuffer). This is + * schematically equivalent to a memcpy() followed by LZ4_loadDict(), but is + * much faster, because LZ4_saveDict() doesn't need to rebuild tables. + * @return : saved dictionary size in bytes (necessarily <= maxDictSize), or 0 + * if error. + */ +LZ4LIB_API int +LZ4_saveDict(LZ4_stream_t* streamPtr, char* safeBuffer, int maxDictSize); + +/*-********************************************** + * Streaming Decompression Functions + * Bufferless synchronous API + ************************************************/ +typedef union LZ4_streamDecode_u LZ4_streamDecode_t; /* tracking context */ + +/*! LZ4_createStreamDecode() and LZ4_freeStreamDecode() : + * creation / destruction of streaming decompression tracking context. + * A tracking context can be re-used multiple times. + */ +#if !defined( \ + RC_INVOKED) /* https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros \ + */ +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +LZ4LIB_API LZ4_streamDecode_t* LZ4_createStreamDecode(void); +LZ4LIB_API int LZ4_freeStreamDecode(LZ4_streamDecode_t* LZ4_stream); +#endif /* !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) */ +#endif + +/*! LZ4_setStreamDecode() : + * An LZ4_streamDecode_t context can be allocated once and re-used multiple + * times. Use this function to start decompression of a new stream of blocks. A + * dictionary can optionally be set. Use NULL or size 0 for a reset order. + * Dictionary is presumed stable : it must remain accessible and unmodified + * during next decompression. + * @return : 1 if OK, 0 if error + */ +LZ4LIB_API int LZ4_setStreamDecode(LZ4_streamDecode_t* LZ4_streamDecode, + const char* dictionary, + int dictSize); + +/*! LZ4_decoderRingBufferSize() : v1.8.2+ + * Note : in a ring buffer scenario (optional), + * blocks are presumed decompressed next to each other + * up to the moment there is not enough remaining space for next block + * (remainingSize < maxBlockSize), at which stage it resumes from beginning of + * ring buffer. When setting such a ring buffer for streaming decompression, + * provides the minimum size of this ring buffer + * to be compatible with any source respecting maxBlockSize condition. 
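For instance, a worked use of the LZ4_DECODER_RING_BUFFER_SIZE macro defined just below, assuming lz4.h is included and a hypothetical 64 KB maxBlockSize:

    /* 65536 + 14 + 65536 = 131086 bytes */
    static char ringBuffer[LZ4_DECODER_RING_BUFFER_SIZE(64 * 1024)];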
+ * @return : minimum ring buffer size, + * or 0 if there is an error (invalid maxBlockSize). + */ +LZ4LIB_API int LZ4_decoderRingBufferSize(int maxBlockSize); +#define LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize) \ + (65536 + 14 + \ + (maxBlockSize)) /* for static allocation; maxBlockSize presumed valid */ + +/*! LZ4_decompress_safe_continue() : + * This decoding function allows decompression of consecutive blocks in + * "streaming" mode. The difference with the usual independent blocks is that + * new blocks are allowed to find references into former blocks. + * A block is an unsplittable entity, and must be presented entirely to the + * decompression function. LZ4_decompress_safe_continue() only accepts one block + * at a time. It's modeled after `LZ4_decompress_safe()` and behaves similarly. + * + * @LZ4_streamDecode : decompression state, tracking the position in memory of + * past data + * @compressedSize : exact complete size of one compressed block. + * @dstCapacity : size of destination buffer (which must be already allocated), + * must be an upper bound of decompressed size. + * @return : number of bytes decompressed into destination buffer (necessarily + * <= dstCapacity) If destination buffer is not large enough, decoding will stop + * and output an error code (negative value). If the source stream is detected + * malformed, the function will stop decoding and return a negative result. + * + * The last 64KB of previously decoded data *must* remain available and + * unmodified at the memory position where they were previously decoded. If less + * than 64KB of data has been decoded, all the data must be present. + * + * Special : if decompression side sets a ring buffer, it must respect one of + * the following conditions : + * - Decompression buffer size is _at least_ + * LZ4_decoderRingBufferSize(maxBlockSize). maxBlockSize is the maximum size of + * any single block. It can have any value > 16 bytes. In which case, encoding + * and decoding buffers do not need to be synchronized. Actually, data can be + * produced by any source compliant with LZ4 format specification, and + * respecting maxBlockSize. + * - Synchronized mode : + * Decompression buffer size is _exactly_ the same as compression buffer + * size, and follows exactly same update rule (block boundaries at same + * positions), and decoding function is provided with exact decompressed size of + * each block (exception for last block of the stream), _then_ decoding & + * encoding ring buffer can have any size, including small ones ( < 64 KB). + * - Decompression buffer is larger than encoding buffer, by a minimum of + * maxBlockSize more bytes. In which case, encoding and decoding buffers do not + * need to be synchronized, and encoding ring buffer can have any size, + * including small ones ( < 64 KB). + * + * Whenever these conditions are not possible, + * save the last 64KB of decoded data into a safe buffer where it can't be + * modified during decompression, then indicate where this data is saved using + * LZ4_setStreamDecode(), before decompressing next block. + */ +LZ4LIB_API int +LZ4_decompress_safe_continue(LZ4_streamDecode_t* LZ4_streamDecode, + const char* src, + char* dst, + int srcSize, + int dstCapacity); + +/*! LZ4_decompress_safe_usingDict() : + * Works the same as + * a combination of LZ4_setStreamDecode() followed by + * LZ4_decompress_safe_continue() However, it's stateless: it doesn't need any + * LZ4_streamDecode_t state. 
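A minimal illustrative wrapper for the stateless call described here; the buffer names are hypothetical, and the same dictionary is assumed to have been used when the block was compressed:

    #include "lz4.h"

    static int decode_with_dict(const char* comp, int compSize,
                                const char* dict, int dictSize,
                                char* dst, int dstCapacity)
    {
        /* No LZ4_streamDecode_t state is needed for a single block. */
        return LZ4_decompress_safe_usingDict(comp, dst, compSize, dstCapacity,
                                             dict, dictSize);
    }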
Dictionary is presumed stable : it must remain + * accessible and unmodified during decompression. Performance tip : + * Decompression speed can be substantially increased when dst == dictStart + + * dictSize. + */ +LZ4LIB_API int LZ4_decompress_safe_usingDict(const char* src, + char* dst, + int srcSize, + int dstCapacity, + const char* dictStart, + int dictSize); + +/*! LZ4_decompress_safe_partial_usingDict() : + * Behaves the same as LZ4_decompress_safe_partial() + * with the added ability to specify a memory segment for past data. + * Performance tip : Decompression speed can be substantially increased + * when dst == dictStart + dictSize. + */ +LZ4LIB_API int LZ4_decompress_safe_partial_usingDict(const char* src, + char* dst, + int compressedSize, + int targetOutputSize, + int maxOutputSize, + const char* dictStart, + int dictSize); + +#endif /* LZ4_H_2983827168210 */ + +/*^************************************* + * !!!!!! STATIC LINKING ONLY !!!!!! + ***************************************/ + +/*-**************************************************************************** + * Experimental section + * + * Symbols declared in this section must be considered unstable. Their + * signatures or semantics may change, or they may be removed altogether in the + * future. They are therefore only safe to depend on when the caller is + * statically linked against the library. + * + * To protect against unsafe usage, not only are the declarations guarded, + * the definitions are hidden by default + * when building LZ4 as a shared/dynamic library. + * + * In order to access these declarations, + * define LZ4_STATIC_LINKING_ONLY in your application + * before including LZ4's headers. + * + * In order to make their implementations accessible dynamically, you must + * define LZ4_PUBLISH_STATIC_FUNCTIONS when building the LZ4 library. + ******************************************************************************/ + +#ifdef LZ4_STATIC_LINKING_ONLY + +#ifndef LZ4_STATIC_3504398509 +#define LZ4_STATIC_3504398509 + +#ifdef LZ4_PUBLISH_STATIC_FUNCTIONS +#define LZ4LIB_STATIC_API LZ4LIB_API +#else +#define LZ4LIB_STATIC_API +#endif + +/*! LZ4_compress_fast_extState_fastReset() : + * A variant of LZ4_compress_fast_extState(). + * + * Using this variant avoids an expensive initialization step. + * It is only safe to call if the state buffer is known to be correctly + * initialized already (see above comment on LZ4_resetStream_fast() for a + * definition of "correctly initialized"). From a high level, the difference is + * that this function initializes the provided state with a call to something + * like LZ4_resetStream_fast() while LZ4_compress_fast_extState() starts with a + * call to LZ4_resetStream(). + */ +LZ4LIB_STATIC_API int LZ4_compress_fast_extState_fastReset(void* state, + const char* src, + char* dst, + int srcSize, + int dstCapacity, + int acceleration); + +/*! LZ4_compress_destSize_extState() : introduced in v1.10.0 + * Same as LZ4_compress_destSize(), but using an externally allocated state. + * Also: exposes @acceleration + */ +int LZ4_compress_destSize_extState(void* state, + const char* src, + char* dst, + int* srcSizePtr, + int targetDstSize, + int acceleration); + +/*! In-place compression and decompression + * + * It's possible to have input and output sharing the same buffer, + * for highly constrained memory environments. + * In both cases, it requires input to lay at the end of the buffer, + * and decompression to start at beginning of the buffer. 
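A sketch of the in-place decompression layout described here, assuming decompressedSize > compressedSize as noted below; LZ4_STATIC_LINKING_ONLY is required because the sizing macro lives in this experimental section:

    #define LZ4_STATIC_LINKING_ONLY
    #include "lz4.h"
    #include <stdlib.h>
    #include <string.h>

    static int decompress_in_place(const char* comp, int compSize,
                                   int decompressedSize)
    {
        int const bufSize = LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize);
        char* const buf = (char*)malloc((size_t)bufSize);
        if (buf == NULL)
            return -1;
        /* Input sits at the end of the buffer, output starts at its head. */
        memcpy(buf + bufSize - compSize, comp, (size_t)compSize);
        int const r = LZ4_decompress_safe(buf + bufSize - compSize, buf,
                                          compSize, decompressedSize);
        free(buf);
        return r;   /* bytes decoded, or negative on error */
    }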
+ * Buffer size must feature some margin, hence be larger than final size. + * + * |<------------------------buffer--------------------------------->| + * |<-----------compressed data--------->| + * |<-----------decompressed size------------------>| + * |<----margin---->| + * + * This technique is more useful for decompression, + * since decompressed size is typically larger, + * and margin is short. + * + * In-place decompression will work inside any buffer + * which size is >= LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize). + * This presumes that decompressedSize > compressedSize. + * Otherwise, it means compression actually expanded data, + * and it would be more efficient to store such data with a flag indicating it's + * not compressed. This can happen when data is not compressible (already + * compressed, or encrypted). + * + * For in-place compression, margin is larger, as it must be able to cope with + * both history preservation, requiring input data to remain unmodified up to + * LZ4_DISTANCE_MAX, and data expansion, which can happen when input is not + * compressible. As a consequence, buffer size requirements are much higher, and + * memory savings offered by in-place compression are more limited. + * + * There are ways to limit this cost for compression : + * - Reduce history size, by modifying LZ4_DISTANCE_MAX. + * Note that it is a compile-time constant, so all compressions will apply + * this limit. Lower values will reduce compression ratio, except when + * input_size < LZ4_DISTANCE_MAX, so it's a reasonable trick when inputs are + * known to be small. + * - Require the compressor to deliver a "maximum compressed size". + * This is the `dstCapacity` parameter in `LZ4_compress*()`. + * When this size is < LZ4_COMPRESSBOUND(inputSize), then compression can + * fail, in which case, the return code will be 0 (zero). The caller must be + * ready for these cases to happen, and typically design a backup scheme to send + * data uncompressed. The combination of both techniques can significantly + * reduce the amount of margin required for in-place compression. + * + * In-place compression can work in any buffer + * which size is >= (maxCompressedSize) + * with maxCompressedSize == LZ4_COMPRESSBOUND(srcSize) for guaranteed + * compression success. LZ4_COMPRESS_INPLACE_BUFFER_SIZE() depends on both + * maxCompressedSize and LZ4_DISTANCE_MAX, so it's possible to reduce memory + * requirements by playing with them. + */ + +#define LZ4_DECOMPRESS_INPLACE_MARGIN(compressedSize) \ + (((compressedSize) >> 8) + 32) +#define LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize) \ + ((decompressedSize) + \ + LZ4_DECOMPRESS_INPLACE_MARGIN( \ + decompressedSize)) /**< note: presumes that compressedSize < \ + decompressedSize. 
note2: margin is overestimated \ + a bit, since it could use compressedSize instead \ + */ + +#ifndef LZ4_DISTANCE_MAX /* history window size; can be user-defined at \ + compile time */ +#define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ +#endif + +#define LZ4_COMPRESS_INPLACE_MARGIN \ + (LZ4_DISTANCE_MAX + 32) /* LZ4_DISTANCE_MAX can be safely replaced by \ + srcSize when it's smaller */ +#define LZ4_COMPRESS_INPLACE_BUFFER_SIZE(maxCompressedSize) \ + ((maxCompressedSize) + \ + LZ4_COMPRESS_INPLACE_MARGIN) /**< maxCompressedSize is generally \ + LZ4_COMPRESSBOUND(inputSize), but can be \ + set to any lower value, with the risk that \ + compression can fail (return code 0(zero)) \ + */ + +#endif /* LZ4_STATIC_3504398509 */ +#endif /* LZ4_STATIC_LINKING_ONLY */ + +#ifndef LZ4_H_98237428734687 +#define LZ4_H_98237428734687 + +/*-************************************************************ + * Private Definitions + ************************************************************** + * Do not use these definitions directly. + * They are only exposed to allow static allocation of `LZ4_stream_t` and + *`LZ4_streamDecode_t`. Accessing members will expose user code to API and/or + *ABI break in future versions of the library. + **************************************************************/ +#define LZ4_HASHLOG (LZ4_MEMORY_USAGE - 2) +#define LZ4_HASHTABLESIZE (1 << LZ4_MEMORY_USAGE) +#define LZ4_HASH_SIZE_U32 \ + (1 << LZ4_HASHLOG) /* required as macro for static allocation */ + +#if defined(__cplusplus) || \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +#include <stdint.h> +typedef int8_t LZ4_i8; +typedef unsigned char LZ4_byte; +typedef uint16_t LZ4_u16; +typedef uint32_t LZ4_u32; +#else +typedef signed char LZ4_i8; +typedef unsigned char LZ4_byte; +typedef unsigned short LZ4_u16; +typedef unsigned int LZ4_u32; +#endif + +/*! LZ4_stream_t : + * Never ever use below internal definitions directly ! + * These definitions are not API/ABI safe, and may change in future versions. + * If you need static allocation, declare or allocate an LZ4_stream_t object. + **/ + +typedef struct LZ4_stream_t_internal LZ4_stream_t_internal; +struct LZ4_stream_t_internal { + LZ4_u32 hashTable[LZ4_HASH_SIZE_U32]; + const LZ4_byte* dictionary; + const LZ4_stream_t_internal* dictCtx; + LZ4_u32 currentOffset; + LZ4_u32 tableType; + LZ4_u32 dictSize; + /* Implicit padding to ensure structure is aligned */ +}; + +#define LZ4_STREAM_MINSIZE \ + ((1UL << (LZ4_MEMORY_USAGE)) + \ + 32) /* static size, for inter-version compatibility */ +union LZ4_stream_u { + char minStateSize[LZ4_STREAM_MINSIZE]; + LZ4_stream_t_internal internal_donotuse; +}; /* previously typedef'd to LZ4_stream_t */ + +/*! LZ4_initStream() : v1.9.0+ + * An LZ4_stream_t structure must be initialized at least once. + * This is automatically done when invoking LZ4_createStream(), + * but it's not when the structure is simply declared on stack (for example). + * + * Use LZ4_initStream() to properly initialize a newly declared LZ4_stream_t. + * It can also initialize any arbitrary buffer of sufficient size, + * and will @return a pointer of proper type upon initialization. + * + * Note : initialization fails if size and alignment conditions are not + *respected. In which case, the function will @return NULL. Note2: An + *LZ4_stream_t structure guarantees correct alignment and size. Note3: Before + *v1.9.0, use LZ4_resetStream() instead + **/ +LZ4LIB_API LZ4_stream_t* LZ4_initStream(void* stateBuffer, size_t size); + +/*!
LZ4_streamDecode_t : + * Never ever use below internal definitions directly ! + * These definitions are not API/ABI safe, and may change in future versions. + * If you need static allocation, declare or allocate an LZ4_streamDecode_t + *object. + **/ +typedef struct { + const LZ4_byte* externalDict; + const LZ4_byte* prefixEnd; + size_t extDictSize; + size_t prefixSize; +} LZ4_streamDecode_t_internal; + +#define LZ4_STREAMDECODE_MINSIZE 32 +union LZ4_streamDecode_u { + char minStateSize[LZ4_STREAMDECODE_MINSIZE]; + LZ4_streamDecode_t_internal internal_donotuse; +}; /* previously typedef'd to LZ4_streamDecode_t */ + +/*-************************************ + * Obsolete Functions + **************************************/ + +/*! Deprecation warnings + * + * Deprecated functions make the compiler generate a warning when invoked. + * This is meant to invite users to update their source code. + * Should deprecation warnings be a problem, it is generally possible to + * disable them, typically with -Wno-deprecated-declarations for gcc or + * _CRT_SECURE_NO_WARNINGS in Visual. + * + * Another method is to define LZ4_DISABLE_DEPRECATE_WARNINGS + * before including the header file. + */ +#ifdef LZ4_DISABLE_DEPRECATE_WARNINGS +#define LZ4_DEPRECATED(message) /* disable deprecation warnings */ +#else +#if defined(__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ +#define LZ4_DEPRECATED(message) [[deprecated(message)]] +#elif defined(_MSC_VER) +#define LZ4_DEPRECATED(message) __declspec(deprecated(message)) +#elif defined(__clang__) || \ + (defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 45)) +#define LZ4_DEPRECATED(message) __attribute__((deprecated(message))) +#elif defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 31) +#define LZ4_DEPRECATED(message) __attribute__((deprecated)) +#else +#pragma message( \ + "WARNING: LZ4_DEPRECATED needs custom implementation for this compiler") +#define LZ4_DEPRECATED(message) /* disabled */ +#endif +#endif /* LZ4_DISABLE_DEPRECATE_WARNINGS */ + +/*! Obsolete compression functions (since v1.7.3) */ +LZ4_DEPRECATED("use LZ4_compress_default() instead") +LZ4LIB_API int LZ4_compress(const char* src, char* dest, int srcSize); +LZ4_DEPRECATED("use LZ4_compress_default() instead") +LZ4LIB_API int LZ4_compress_limitedOutput(const char* src, + char* dest, + int srcSize, + int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") +LZ4LIB_API int LZ4_compress_withState(void* state, + const char* source, + char* dest, + int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") +LZ4LIB_API int LZ4_compress_limitedOutput_withState(void* state, + const char* source, + char* dest, + int inputSize, + int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") +LZ4LIB_API int LZ4_compress_continue(LZ4_stream_t* LZ4_streamPtr, + const char* source, + char* dest, + int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") +LZ4LIB_API int LZ4_compress_limitedOutput_continue(LZ4_stream_t* LZ4_streamPtr, + const char* source, + char* dest, + int inputSize, + int maxOutputSize); + +/*! 
Obsolete decompression functions (since v1.8.0) */ +LZ4_DEPRECATED("use LZ4_decompress_fast() instead") +LZ4LIB_API int LZ4_uncompress(const char* source, char* dest, int outputSize); +LZ4_DEPRECATED("use LZ4_decompress_safe() instead") +LZ4LIB_API int LZ4_uncompress_unknownOutputSize(const char* source, + char* dest, + int isize, + int maxOutputSize); + +/* Obsolete streaming functions (since v1.7.0) + * degraded functionality; do not use! + * + * In order to perform streaming compression, these functions depended on data + * that is no longer tracked in the state. They have been preserved as well as + * possible: using them will still produce a correct output. However, they don't + * actually retain any history between compression calls. The compression ratio + * achieved will therefore be no better than compressing each chunk + * independently. + */ +LZ4_DEPRECATED("Use LZ4_createStream() instead") +LZ4LIB_API void* LZ4_create(char* inputBuffer); +LZ4_DEPRECATED("Use LZ4_createStream() instead") +LZ4LIB_API int LZ4_sizeofStreamState(void); +LZ4_DEPRECATED("Use LZ4_resetStream() instead") +LZ4LIB_API int LZ4_resetStreamState(void* state, char* inputBuffer); +LZ4_DEPRECATED("Use LZ4_saveDict() instead") +LZ4LIB_API char* LZ4_slideInputBuffer(void* state); + +/*! Obsolete streaming decoding functions (since v1.7.0) */ +LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") +LZ4LIB_API int LZ4_decompress_safe_withPrefix64k(const char* src, + char* dst, + int compressedSize, + int maxDstSize); +LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") +LZ4LIB_API int +LZ4_decompress_fast_withPrefix64k(const char* src, char* dst, int originalSize); + +/*! Obsolete LZ4_decompress_fast variants (since v1.9.0) : + * These functions used to be faster than LZ4_decompress_safe(), + * but this is no longer the case. They are now slower. + * This is because LZ4_decompress_fast() doesn't know the input size, + * and therefore must progress more cautiously into the input buffer to not + * read beyond the end of block. On top of that `LZ4_decompress_fast()` is not + * protected vs malformed or malicious inputs, making it a security liability. + * As a consequence, LZ4_decompress_fast() is strongly discouraged, and + * deprecated. + * + * The last remaining LZ4_decompress_fast() specificity is that + * it can decompress a block without knowing its compressed size. + * Such functionality can be achieved in a more secure manner + * by employing LZ4_decompress_safe_partial(). + * + * Parameters: + * originalSize : is the uncompressed size to regenerate. + * `dst` must be already allocated, its size must be >= + * 'originalSize' bytes. + * @return : number of bytes read from source buffer (== compressed size). + * The function expects to finish at block's end exactly. + * If the source stream is detected malformed, the function stops + * decoding and returns a negative result. note : LZ4_decompress_fast*() + * requires originalSize. Thanks to this information, it never writes past the + * output buffer. However, since it doesn't know its 'src' size, it may read an + * unknown amount of input, past input buffer bounds. Also, since match offsets + * are not validated, match reads from 'src' may underflow too. These issues + * never happen if input (compressed) data is correct. But they may happen if + * input data is invalid (error or intentional tampering). As a consequence, use + * these functions in trusted environments with trusted data **only**. 
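An illustrative migration sketch for the recommendation above; the caller must additionally know the block's compressed size, and the return value differs (bytes written rather than bytes read):

    #include "lz4.h"

    /* Before: LZ4_decompress_fast(src, dst, originalSize);  (unsafe, deprecated) */
    static int decode_block_safely(const char* src, int compressedSize,
                                   char* dst, int originalSize)
    {
        return LZ4_decompress_safe_partial(src, dst, compressedSize,
                                           originalSize, originalSize);
    }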
+ */
+LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using "
+               "LZ4_decompress_safe_partial() instead")
+LZ4LIB_API int
+LZ4_decompress_fast(const char* src, char* dst, int originalSize);
+LZ4_DEPRECATED("This function is deprecated and unsafe. Consider migrating "
+               "towards LZ4_decompress_safe_continue() instead. "
+               "Note that the contract will change (requires block's "
+               "compressed size, instead of decompressed size)")
+LZ4LIB_API int
+LZ4_decompress_fast_continue(LZ4_streamDecode_t* LZ4_streamDecode,
+                             const char* src,
+                             char* dst,
+                             int originalSize);
+LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using "
+               "LZ4_decompress_safe_partial_usingDict() instead")
+LZ4LIB_API int LZ4_decompress_fast_usingDict(const char* src,
+                                             char* dst,
+                                             int originalSize,
+                                             const char* dictStart,
+                                             int dictSize);
+
+/*! LZ4_resetStream() :
+ * An LZ4_stream_t structure must be initialized at least once.
+ * This is done with LZ4_initStream(), or LZ4_resetStream().
+ * Consider switching to LZ4_initStream(),
+ * invoking LZ4_resetStream() will trigger deprecation warnings in the future.
+ */
+LZ4LIB_API void LZ4_resetStream(LZ4_stream_t* streamPtr);
+
+#endif /* LZ4_H_98237428734687 */
+
+#if defined(__cplusplus)
+}
+#endif
diff --git a/tracegrind/main.c b/tracegrind/main.c
new file mode 100644
index 000000000..91d2b9498
--- /dev/null
+++ b/tracegrind/main.c
@@ -0,0 +1,2123 @@
+
+/*--------------------------------------------------------------------*/
+/*--- Tracegrind ---*/
+/*--- main.c ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+   This file is part of Tracegrind, a Valgrind tool for call graph
+   profiling programs.
+
+   Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
+
+   This tool is derived from and contains code from Cachegrind
+   Copyright (C) 2002-2017 Nicholas Nethercote (njn@valgrind.org)
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 3 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+   The GNU General Public License is contained in the file COPYING.
+*/
+
+#include "config.h"
+#include "global.h"
+#include "tracegrind.h"
+
+#include "pub_tool_gdbserver.h"
+#include "pub_tool_threadstate.h"
+#include "pub_tool_transtab.h" // VG_(discard_translations_safely)
+
+#include "cg_branchpred.c"
+
+/*------------------------------------------------------------*/
+/*--- Global variables ---*/
+/*------------------------------------------------------------*/
+
+/* for all threads */
+CommandLineOptions TG_(clo);
+Statistics TG_(stat);
+Bool TG_(instrument_state) = True; /* Instrumentation on ? */
+
+/* thread and signal handler specific */
+exec_state TG_(current_state);
+
+/* min of L1 and LL cache line sizes. This only gets set to a
+   non-zero value if we are doing cache simulation. */
+Int TG_(min_line_size) = 0;
+
+/*------------------------------------------------------------*/
+/*--- Statistics ---*/
+/*------------------------------------------------------------*/
+
+static void TG_(init_statistics)(Statistics* s)
+{
+  s->call_counter = 0;
+  s->jcnd_counter = 0;
+  s->jump_counter = 0;
+  s->rec_call_counter = 0;
+  s->ret_counter = 0;
+  s->bb_executions = 0;
+
+  s->context_counter = 0;
+  s->bb_retranslations = 0;
+
+  s->distinct_objs = 0;
+  s->distinct_files = 0;
+  s->distinct_fns = 0;
+  s->distinct_contexts = 0;
+  s->distinct_bbs = 0;
+  s->distinct_bbccs = 0;
+  s->distinct_instrs = 0;
+  s->distinct_skips = 0;
+
+  s->bb_hash_resizes = 0;
+  s->bbcc_hash_resizes = 0;
+  s->jcc_hash_resizes = 0;
+  s->cxt_hash_resizes = 0;
+  s->fn_array_resizes = 0;
+  s->call_stack_resizes = 0;
+  s->fn_stack_resizes = 0;
+
+  s->full_debug_BBs = 0;
+  s->file_line_debug_BBs = 0;
+  s->fn_name_debug_BBs = 0;
+  s->no_debug_BBs = 0;
+  s->bbcc_lru_misses = 0;
+  s->jcc_lru_misses = 0;
+  s->cxt_lru_misses = 0;
+  s->bbcc_clones = 0;
+}
+
+/*------------------------------------------------------------*/
+/*--- Simple callbacks (not cache simulator) ---*/
+/*------------------------------------------------------------*/
+
+VG_REGPARM(1)
+static void log_global_event(InstrInfo* ii)
+{
+  ULong* cost_Bus;
+
+  TG_DEBUG(6, "log_global_event: Ir %#lx/%u\n",
+           TG_(bb_base) + ii->instr_offset, ii->instr_size);
+
+  if (!TG_(current_state).collect)
+    return;
+
+  TG_ASSERT((ii->eventset->mask & (1u << EG_BUS)) > 0);
+
+  TG_(current_state).cost[fullOffset(EG_BUS)]++;
+
+  if (TG_(current_state).nonskipped)
+    cost_Bus = TG_(current_state).nonskipped->skipped + fullOffset(EG_BUS);
+  else
+    cost_Bus =
+        TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BUS];
+  cost_Bus[0]++;
+}
+
+/* For branches, we consult two different predictors, one which
+   predicts taken/untaken for conditional branches, and the other
+   which predicts the branch target address for indirect branches
+   (jump-to-register style ones).
*/ + +static VG_REGPARM(2) void log_cond_branch(InstrInfo* ii, Word taken) +{ + Bool miss; + Int fullOffset_Bc; + ULong* cost_Bc; + + TG_DEBUG(6, "log_cond_branch: Ir %#lx, taken %ld\n", + TG_(bb_base) + ii->instr_offset, taken); + + miss = 1 & do_cond_branch_predict(TG_(bb_base) + ii->instr_offset, taken); + + if (!TG_(current_state).collect) + return; + + TG_ASSERT((ii->eventset->mask & (1u << EG_BC)) > 0); + + if (TG_(current_state).nonskipped) + cost_Bc = TG_(current_state).nonskipped->skipped + fullOffset(EG_BC); + else + cost_Bc = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BC]; + + fullOffset_Bc = fullOffset(EG_BC); + TG_(current_state).cost[fullOffset_Bc]++; + cost_Bc[0]++; + if (miss) { + TG_(current_state).cost[fullOffset_Bc + 1]++; + cost_Bc[1]++; + } +} + +static VG_REGPARM(2) void log_ind_branch(InstrInfo* ii, UWord actual_dst) +{ + Bool miss; + Int fullOffset_Bi; + ULong* cost_Bi; + + TG_DEBUG(6, "log_ind_branch: Ir %#lx, dst %#lx\n", + TG_(bb_base) + ii->instr_offset, actual_dst); + + miss = + 1 & do_ind_branch_predict(TG_(bb_base) + ii->instr_offset, actual_dst); + + if (!TG_(current_state).collect) + return; + + TG_ASSERT((ii->eventset->mask & (1u << EG_BI)) > 0); + + if (TG_(current_state).nonskipped) + cost_Bi = TG_(current_state).nonskipped->skipped + fullOffset(EG_BI); + else + cost_Bi = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BI]; + + fullOffset_Bi = fullOffset(EG_BI); + TG_(current_state).cost[fullOffset_Bi]++; + cost_Bi[0]++; + if (miss) { + TG_(current_state).cost[fullOffset_Bi + 1]++; + cost_Bi[1]++; + } +} + +/*------------------------------------------------------------*/ +/*--- Instrumentation structures and event queue handling ---*/ +/*------------------------------------------------------------*/ + +/* Maintain an ordered list of memory events which are outstanding, in + the sense that no IR has yet been generated to do the relevant + helper calls. The BB is scanned top to bottom and memory events + are added to the end of the list, merging with the most recent + notified event where possible (Dw immediately following Dr and + having the same size and EA can be merged). + + This merging is done so that for architectures which have + load-op-store instructions (x86, amd64), the insn is treated as if + it makes just one memory reference (a modify), rather than two (a + read followed by a write at the same address). + + At various points the list will need to be flushed, that is, IR + generated from it. That must happen before any possible exit from + the block (the end, or an IRStmt_Exit). Flushing also takes place + when there is no space to add a new event. + + If we require the simulation statistics to be up to date with + respect to possible memory exceptions, then the list would have to + be flushed before each memory reference. That would however lose + performance by inhibiting event-merging during flushing. + + Flushing the list consists of walking it start to end and emitting + instrumentation IR for each event, in the order in which they + appear. It may be possible to emit a single call for two adjacent + events in order to reduce the number of helper function calls made. + For example, it could well be profitable to handle two adjacent Ir + events with a single helper call. 
*/ + +typedef IRExpr IRAtom; + +typedef enum { + Ev_Ir, // Instruction read + Ev_Dr, // Data read + Ev_Dw, // Data write + Ev_Dm, // Data modify (read then write) + Ev_Bc, // branch conditional + Ev_Bi, // branch indirect (to unknown destination) + Ev_G // Global bus event +} EventTag; + +typedef struct { + EventTag tag; + InstrInfo* inode; + union { + struct { + } Ir; + struct { + IRAtom* ea; + Int szB; + } Dr; + struct { + IRAtom* ea; + Int szB; + } Dw; + struct { + IRAtom* ea; + Int szB; + } Dm; + struct { + IRAtom* taken; /* :: Ity_I1 */ + } Bc; + struct { + IRAtom* dst; + } Bi; + struct { + } G; + } Ev; +} Event; + +static void init_Event(Event* ev) { VG_(memset)(ev, 0, sizeof(Event)); } + +static IRAtom* get_Event_dea(Event* ev) +{ + switch (ev->tag) { + case Ev_Dr: + return ev->Ev.Dr.ea; + case Ev_Dw: + return ev->Ev.Dw.ea; + case Ev_Dm: + return ev->Ev.Dm.ea; + default: + tl_assert(0); + } +} + +static Int get_Event_dszB(Event* ev) +{ + switch (ev->tag) { + case Ev_Dr: + return ev->Ev.Dr.szB; + case Ev_Dw: + return ev->Ev.Dw.szB; + case Ev_Dm: + return ev->Ev.Dm.szB; + default: + tl_assert(0); + } +} + +/* Up to this many unnotified events are allowed. Number is + arbitrary. Larger numbers allow more event merging to occur, but + potentially induce more spilling due to extending live ranges of + address temporaries. */ +#define N_EVENTS 16 + +/* A struct which holds all the running state during instrumentation. + Mostly to avoid passing loads of parameters everywhere. */ +typedef struct { + /* The current outstanding-memory-event list. */ + Event events[N_EVENTS]; + Int events_used; + + /* The array of InstrInfo's is part of BB struct. */ + BB* bb; + + /* BB seen before (ie. re-instrumentation) */ + Bool seen_before; + + /* Number InstrInfo bins 'used' so far. */ + UInt ii_index; + + // current offset of guest instructions from BB start + UInt instr_offset; + + /* The output SB being constructed. */ + IRSB* sbOut; +} ClgState; + +static void showEvent(Event* ev) +{ + switch (ev->tag) { + case Ev_Ir: + VG_(printf)("Ir (InstrInfo %p) at +%u\n", ev->inode, + ev->inode->instr_offset); + break; + case Ev_Dr: + VG_(printf)("Dr (InstrInfo %p) at +%u %d EA=", ev->inode, + ev->inode->instr_offset, ev->Ev.Dr.szB); + ppIRExpr(ev->Ev.Dr.ea); + VG_(printf)("\n"); + break; + case Ev_Dw: + VG_(printf)("Dw (InstrInfo %p) at +%u %d EA=", ev->inode, + ev->inode->instr_offset, ev->Ev.Dw.szB); + ppIRExpr(ev->Ev.Dw.ea); + VG_(printf)("\n"); + break; + case Ev_Dm: + VG_(printf)("Dm (InstrInfo %p) at +%u %d EA=", ev->inode, + ev->inode->instr_offset, ev->Ev.Dm.szB); + ppIRExpr(ev->Ev.Dm.ea); + VG_(printf)("\n"); + break; + case Ev_Bc: + VG_(printf)("Bc %p GA=", ev->inode); + ppIRExpr(ev->Ev.Bc.taken); + VG_(printf)("\n"); + break; + case Ev_Bi: + VG_(printf)("Bi %p DST=", ev->inode); + ppIRExpr(ev->Ev.Bi.dst); + VG_(printf)("\n"); + break; + case Ev_G: + VG_(printf)("G %p\n", ev->inode); + break; + default: + tl_assert(0); + break; + } +} + +/* Generate code for all outstanding memory events, and mark the queue + empty. Code is generated into cgs->sbOut, and this activity + 'consumes' slots in cgs->bb. 
*/ + +static void flushEvents(ClgState* clgs) +{ + Int i, regparms, inew; + const HChar* helperName; + void* helperAddr; + IRExpr** argv; + IRExpr* i_node_expr; + IRDirty* di; + Event* ev; + Event* ev2; + Event* ev3; + + if (!clgs->seen_before) { + // extend event sets as needed + // available sets: D0 Dr + for (i = 0; i < clgs->events_used; i++) { + ev = &clgs->events[i]; + switch (ev->tag) { + case Ev_Ir: + // Ir event always is first for a guest instruction + TG_ASSERT(ev->inode->eventset == 0); + ev->inode->eventset = TG_(sets).base; + break; + case Ev_Dr: + // extend event set by Dr counters + ev->inode->eventset = + TG_(add_event_group)(ev->inode->eventset, EG_DR); + break; + case Ev_Dw: + case Ev_Dm: + // extend event set by Dw counters + ev->inode->eventset = + TG_(add_event_group)(ev->inode->eventset, EG_DW); + break; + case Ev_Bc: + // extend event set by Bc counters + ev->inode->eventset = + TG_(add_event_group)(ev->inode->eventset, EG_BC); + break; + case Ev_Bi: + // extend event set by Bi counters + ev->inode->eventset = + TG_(add_event_group)(ev->inode->eventset, EG_BI); + break; + case Ev_G: + // extend event set by Bus counter + ev->inode->eventset = + TG_(add_event_group)(ev->inode->eventset, EG_BUS); + break; + default: + tl_assert(0); + } + } + } + + for (i = 0; i < clgs->events_used; i = inew) { + + helperName = NULL; + helperAddr = NULL; + argv = NULL; + regparms = 0; + + /* generate IR to notify event i and possibly the ones + immediately following it. */ + tl_assert(i >= 0 && i < clgs->events_used); + + ev = &clgs->events[i]; + ev2 = (i < clgs->events_used - 1 ? &clgs->events[i + 1] : NULL); + ev3 = (i < clgs->events_used - 2 ? &clgs->events[i + 2] : NULL); + + TG_DEBUGIF(5) + { + VG_(printf)(" flush "); + showEvent(ev); + } + + i_node_expr = mkIRExpr_HWord((HWord)ev->inode); + + /* Decide on helper fn to call and args to pass it, and advance + i appropriately. + Dm events have same effect as Dw events */ + switch (ev->tag) { + case Ev_Ir: + /* Merge an Ir with a following Dr. */ + if (ev2 && ev2->tag == Ev_Dr) { + /* Why is this true? It's because we're merging an Ir + with a following Dr. The Ir derives from the + instruction's IMark and the Dr from data + references which follow it. In short it holds + because each insn starts with an IMark, hence an + Ev_Ir, and so these Dr must pertain to the + immediately preceding Ir. Same applies to analogous + assertions in the subsequent cases. */ + tl_assert(ev2->inode == ev->inode); + helperName = TG_(cachesim).log_1I1Dr_name; + helperAddr = TG_(cachesim).log_1I1Dr; + argv = mkIRExprVec_3(i_node_expr, get_Event_dea(ev2), + mkIRExpr_HWord(get_Event_dszB(ev2))); + regparms = 3; + inew = i + 2; + } + /* Merge an Ir with a following Dw/Dm. */ + else if (ev2 && (ev2->tag == Ev_Dw || ev2->tag == Ev_Dm)) { + tl_assert(ev2->inode == ev->inode); + helperName = TG_(cachesim).log_1I1Dw_name; + helperAddr = TG_(cachesim).log_1I1Dw; + argv = mkIRExprVec_3(i_node_expr, get_Event_dea(ev2), + mkIRExpr_HWord(get_Event_dszB(ev2))); + regparms = 3; + inew = i + 2; + } + /* Merge an Ir with two following Irs. */ + else if (ev2 && ev3 && ev2->tag == Ev_Ir && ev3->tag == Ev_Ir) { + helperName = TG_(cachesim).log_3I0D_name; + helperAddr = TG_(cachesim).log_3I0D; + argv = mkIRExprVec_3(i_node_expr, mkIRExpr_HWord((HWord)ev2->inode), + mkIRExpr_HWord((HWord)ev3->inode)); + regparms = 3; + inew = i + 3; + } + /* Merge an Ir with one following Ir. 
*/ + else if (ev2 && ev2->tag == Ev_Ir) { + helperName = TG_(cachesim).log_2I0D_name; + helperAddr = TG_(cachesim).log_2I0D; + argv = + mkIRExprVec_2(i_node_expr, mkIRExpr_HWord((HWord)ev2->inode)); + regparms = 2; + inew = i + 2; + } + /* No merging possible; emit as-is. */ + else { + helperName = TG_(cachesim).log_1I0D_name; + helperAddr = TG_(cachesim).log_1I0D; + argv = mkIRExprVec_1(i_node_expr); + regparms = 1; + inew = i + 1; + } + break; + case Ev_Dr: + /* Data read or modify */ + helperName = TG_(cachesim).log_0I1Dr_name; + helperAddr = TG_(cachesim).log_0I1Dr; + argv = mkIRExprVec_3(i_node_expr, get_Event_dea(ev), + mkIRExpr_HWord(get_Event_dszB(ev))); + regparms = 3; + inew = i + 1; + break; + case Ev_Dw: + case Ev_Dm: + /* Data write */ + helperName = TG_(cachesim).log_0I1Dw_name; + helperAddr = TG_(cachesim).log_0I1Dw; + argv = mkIRExprVec_3(i_node_expr, get_Event_dea(ev), + mkIRExpr_HWord(get_Event_dszB(ev))); + regparms = 3; + inew = i + 1; + break; + case Ev_Bc: + /* Conditional branch */ + helperName = "log_cond_branch"; + helperAddr = &log_cond_branch; + argv = mkIRExprVec_2(i_node_expr, ev->Ev.Bc.taken); + regparms = 2; + inew = i + 1; + break; + case Ev_Bi: + /* Branch to an unknown destination */ + helperName = "log_ind_branch"; + helperAddr = &log_ind_branch; + argv = mkIRExprVec_2(i_node_expr, ev->Ev.Bi.dst); + regparms = 2; + inew = i + 1; + break; + case Ev_G: + /* Global bus event (CAS, LOCK-prefix, LL-SC, etc) */ + helperName = "log_global_event"; + helperAddr = &log_global_event; + argv = mkIRExprVec_1(i_node_expr); + regparms = 1; + inew = i + 1; + break; + default: + tl_assert(0); + } + + TG_DEBUGIF(5) + { + if (inew > i + 1) { + VG_(printf)(" merge "); + showEvent(ev2); + } + if (inew > i + 2) { + VG_(printf)(" merge "); + showEvent(ev3); + } + if (helperAddr) + VG_(printf)(" call %s (%p)\n", helperName, helperAddr); + } + + /* helper could be unset depending on the simulator used */ + if (helperAddr == 0) + continue; + + /* Add the helper. 
*/ + tl_assert(helperName); + tl_assert(helperAddr); + tl_assert(argv); + di = unsafeIRDirty_0_N(regparms, helperName, + VG_(fnptr_to_fnentry)(helperAddr), argv); + addStmtToIRSB(clgs->sbOut, IRStmt_Dirty(di)); + } + + clgs->events_used = 0; +} + +static void addEvent_Ir(ClgState* clgs, InstrInfo* inode) +{ + Event* evt; + tl_assert(clgs->seen_before || (inode->eventset == 0)); + if (!TG_(clo).simulate_cache) + return; + + if (clgs->events_used == N_EVENTS) + flushEvents(clgs); + tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS); + evt = &clgs->events[clgs->events_used]; + init_Event(evt); + evt->tag = Ev_Ir; + evt->inode = inode; + clgs->events_used++; +} + +static void +addEvent_Dr(ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea) +{ + Event* evt; + tl_assert(isIRAtom(ea)); + tl_assert(datasize >= 1); + if (!TG_(clo).simulate_cache) + return; + tl_assert(datasize <= TG_(min_line_size)); + + if (clgs->events_used == N_EVENTS) + flushEvents(clgs); + tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS); + evt = &clgs->events[clgs->events_used]; + init_Event(evt); + evt->tag = Ev_Dr; + evt->inode = inode; + evt->Ev.Dr.szB = datasize; + evt->Ev.Dr.ea = ea; + clgs->events_used++; +} + +static void +addEvent_Dw(ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea) +{ + Event* evt; + tl_assert(isIRAtom(ea)); + tl_assert(datasize >= 1); + if (!TG_(clo).simulate_cache) + return; + tl_assert(datasize <= TG_(min_line_size)); + + /* Is it possible to merge this write with the preceding read? */ + if (clgs->events_used > 0) { + Event* lastEvt = &clgs->events[clgs->events_used - 1]; + if (lastEvt->tag == Ev_Dr && lastEvt->Ev.Dr.szB == datasize && + lastEvt->inode == inode && eqIRAtom(lastEvt->Ev.Dr.ea, ea)) { + lastEvt->tag = Ev_Dm; + return; + } + } + + /* No. Add as normal. */ + if (clgs->events_used == N_EVENTS) + flushEvents(clgs); + tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS); + evt = &clgs->events[clgs->events_used]; + init_Event(evt); + evt->tag = Ev_Dw; + evt->inode = inode; + evt->Ev.Dw.szB = datasize; + evt->Ev.Dw.ea = ea; + clgs->events_used++; +} + +static void addEvent_D_guarded(ClgState* clgs, + InstrInfo* inode, + Int datasize, + IRAtom* ea, + IRAtom* guard, + Bool isWrite) +{ + tl_assert(isIRAtom(ea)); + tl_assert(guard); + tl_assert(isIRAtom(guard)); + tl_assert(datasize >= 1); + if (!TG_(clo).simulate_cache) + return; + tl_assert(datasize <= TG_(min_line_size)); + + /* Adding guarded memory actions and merging them with the existing + queue is too complex. Simply flush the queue and add this + action immediately. Since guarded loads and stores are pretty + rare, this is not thought likely to cause any noticeable + performance loss as a result of the loss of event-merging + opportunities. */ + tl_assert(clgs->events_used >= 0); + flushEvents(clgs); + tl_assert(clgs->events_used == 0); + /* Same as case Ev_Dw / case Ev_Dr in flushEvents, except with guard */ + IRExpr* i_node_expr; + const HChar* helperName; + void* helperAddr; + IRExpr** argv; + Int regparms; + IRDirty* di; + i_node_expr = mkIRExpr_HWord((HWord)inode); + helperName = + isWrite ? TG_(cachesim).log_0I1Dw_name : TG_(cachesim).log_0I1Dr_name; + helperAddr = isWrite ? 
TG_(cachesim).log_0I1Dw : TG_(cachesim).log_0I1Dr; + argv = mkIRExprVec_3(i_node_expr, ea, mkIRExpr_HWord(datasize)); + regparms = 3; + di = unsafeIRDirty_0_N(regparms, helperName, + VG_(fnptr_to_fnentry)(helperAddr), argv); + di->guard = guard; + addStmtToIRSB(clgs->sbOut, IRStmt_Dirty(di)); +} + +static void addEvent_Bc(ClgState* clgs, InstrInfo* inode, IRAtom* guard) +{ + Event* evt; + tl_assert(isIRAtom(guard)); + tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, guard) == + (sizeof(RegWord) == 4 ? Ity_I32 : Ity_I64)); + if (!TG_(clo).simulate_branch) + return; + + if (clgs->events_used == N_EVENTS) + flushEvents(clgs); + tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS); + evt = &clgs->events[clgs->events_used]; + init_Event(evt); + evt->tag = Ev_Bc; + evt->inode = inode; + evt->Ev.Bc.taken = guard; + clgs->events_used++; +} + +static void addEvent_Bi(ClgState* clgs, InstrInfo* inode, IRAtom* whereTo) +{ + Event* evt; + tl_assert(isIRAtom(whereTo)); + tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, whereTo) == + (sizeof(RegWord) == 4 ? Ity_I32 : Ity_I64)); + if (!TG_(clo).simulate_branch) + return; + + if (clgs->events_used == N_EVENTS) + flushEvents(clgs); + tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS); + evt = &clgs->events[clgs->events_used]; + init_Event(evt); + evt->tag = Ev_Bi; + evt->inode = inode; + evt->Ev.Bi.dst = whereTo; + clgs->events_used++; +} + +static void addEvent_G(ClgState* clgs, InstrInfo* inode) +{ + Event* evt; + if (!TG_(clo).collect_bus) + return; + + if (clgs->events_used == N_EVENTS) + flushEvents(clgs); + tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS); + evt = &clgs->events[clgs->events_used]; + init_Event(evt); + evt->tag = Ev_G; + evt->inode = inode; + clgs->events_used++; +} + +/* Initialise or check (if already seen before) an InstrInfo for next insn. + We only can set instr_offset/instr_size here. The required event set and + resulting cost offset depend on events (Ir/Dr/Dw/Dm) in guest + instructions. The event set is extended as required on flush of the event + queue (when Dm events were determined), cost offsets are determined at + end of BB instrumentation. */ +static InstrInfo* next_InstrInfo(ClgState* clgs, UInt instr_size) +{ + InstrInfo* ii; + tl_assert(clgs->ii_index < clgs->bb->instr_count); + ii = &clgs->bb->instr[clgs->ii_index]; + + if (clgs->seen_before) { + TG_ASSERT(ii->instr_offset == clgs->instr_offset); + TG_ASSERT(ii->instr_size == instr_size); + } else { + ii->instr_offset = clgs->instr_offset; + ii->instr_size = instr_size; + ii->cost_offset = 0; + ii->eventset = 0; + } + + clgs->ii_index++; + clgs->instr_offset += instr_size; + TG_(stat).distinct_instrs++; + + return ii; +} + +// return total number of cost values needed for this BB +static UInt update_cost_offsets(ClgState* clgs) +{ + Int i; + InstrInfo* ii; + UInt cost_offset = 0; + + TG_ASSERT(clgs->bb->instr_count == clgs->ii_index); + for (i = 0; i < clgs->ii_index; i++) { + ii = &clgs->bb->instr[i]; + if (clgs->seen_before) { + TG_ASSERT(ii->cost_offset == cost_offset); + } else + ii->cost_offset = cost_offset; + cost_offset += ii->eventset ? 
ii->eventset->size : 0;
+  }
+
+  return cost_offset;
+}
+
+/*------------------------------------------------------------*/
+/*--- Instrumentation ---*/
+/*------------------------------------------------------------*/
+
+#if defined(VG_BIGENDIAN)
+#define CLGEndness Iend_BE
+#elif defined(VG_LITTLEENDIAN)
+#define CLGEndness Iend_LE
+#else
+#error "Unknown endianness"
+#endif
+
+static Addr IRConst2Addr(IRConst* con)
+{
+  Addr addr;
+
+  if (sizeof(RegWord) == 4) {
+    TG_ASSERT(con->tag == Ico_U32);
+    addr = con->Ico.U32;
+  } else if (sizeof(RegWord) == 8) {
+    TG_ASSERT(con->tag == Ico_U64);
+    addr = con->Ico.U64;
+  } else
+    VG_(tool_panic)("Tracegrind: invalid Addr type");
+
+  return addr;
+}
+
+/* First pass over a BB to instrument, counting instructions and jumps.
+ * This is needed for the size of the BB struct to allocate.
+ *
+ * Called from TG_(get_bb)
+ */
+void TG_(collectBlockInfo)(IRSB* sbIn,
+                           /*INOUT*/ UInt* instrs,
+                           /*INOUT*/ UInt* cjmps,
+                           /*INOUT*/ Bool* cjmp_inverted)
+{
+  Int i;
+  IRStmt* st;
+  Addr instrAddr = 0, jumpDst;
+  UInt instrLen = 0;
+  Bool toNextInstr = False;
+
+  // Ist_Exit has to be ignored in preamble code, before first IMark:
+  // preamble code is added by VEX for self-modifying code, and has
+  // nothing to do with client code
+  Bool inPreamble = True;
+
+  if (!sbIn)
+    return;
+
+  for (i = 0; i < sbIn->stmts_used; i++) {
+    st = sbIn->stmts[i];
+    if (Ist_IMark == st->tag) {
+      inPreamble = False;
+
+      instrAddr = st->Ist.IMark.addr;
+      instrLen = st->Ist.IMark.len;
+
+      (*instrs)++;
+      toNextInstr = False;
+    }
+    if (inPreamble)
+      continue;
+    if (Ist_Exit == st->tag) {
+      jumpDst = IRConst2Addr(st->Ist.Exit.dst);
+      toNextInstr = (jumpDst == instrAddr + instrLen);
+
+      (*cjmps)++;
+    }
+  }
+
+  /* if the last instruction of a BB conditionally jumps to the next
+   * instruction (= first instruction of next BB in memory), this is
+   * inverted by VEX.
+   */
+  *cjmp_inverted = toNextInstr;
+}
+
+static void
+addConstMemStoreStmt(IRSB* bbOut, UWord addr, UInt val, IRType hWordTy)
+{
+  addStmtToIRSB(
+      bbOut, IRStmt_Store(CLGEndness,
+                          IRExpr_Const(hWordTy == Ity_I32 ? IRConst_U32(addr)
+                                                          : IRConst_U64(addr)),
+                          IRExpr_Const(IRConst_U32(val))));
+}
+
+/* add helper call to setup_bbcc, with pointer to BB struct as argument
+ *
+ * precondition for setup_bbcc:
+ * - jmps_passed has number of cond.jumps passed in last executed BB
+ * - current_bbcc has a pointer to the BBCC of the last executed BB
+ *   Thus, if bbcc_jmpkind is != -1 (JmpNone),
+ *     current_bbcc->bb->jmp_addr
+ *   gives the address of the jump source.
+ * + * the setup does 2 things: + * - trace call: + * * Unwind own call stack, i.e sync our ESP with real ESP + * This is for ESP manipulation (longjmps, C++ exec handling) and RET + * * For CALLs or JMPs crossing objects, record call arg + + * push are on own call stack + * + * - prepare for cache log functions: + * set current_bbcc to BBCC that gets the costs for this BB execution + * attached + */ +static void addBBSetupCall(ClgState* clgs) +{ + IRDirty* di; + IRExpr * arg1, **argv; + + arg1 = mkIRExpr_HWord((HWord)clgs->bb); + argv = mkIRExprVec_1(arg1); + di = unsafeIRDirty_0_N(1, "setup_bbcc", + VG_(fnptr_to_fnentry)(&TG_(setup_bbcc)), argv); + addStmtToIRSB(clgs->sbOut, IRStmt_Dirty(di)); +} + +static IRSB* TG_(instrument)(VgCallbackClosure* closure, + IRSB* sbIn, + const VexGuestLayout* layout, + const VexGuestExtents* vge, + const VexArchInfo* archinfo_host, + IRType gWordTy, + IRType hWordTy) +{ + Int i; + IRStmt* st; + Addr origAddr; + InstrInfo* curr_inode = NULL; + ClgState clgs; + UInt cJumps = 0; + IRTypeEnv* tyenv = sbIn->tyenv; + + if (gWordTy != hWordTy) { + /* We don't currently support this case. */ + VG_(tool_panic)("host/guest word size mismatch"); + } + + // No instrumentation if it is switched off + if (!TG_(instrument_state)) { + TG_DEBUG(5, "instrument(BB %#lx) [Instrumentation OFF]\n", + (Addr)closure->readdr); + return sbIn; + } + + TG_DEBUG(3, "+ instrument(BB %#lx)\n", (Addr)closure->readdr); + + /* Set up SB for instrumented IR */ + clgs.sbOut = deepCopyIRSBExceptStmts(sbIn); + + // Copy verbatim any IR preamble preceding the first IMark + i = 0; + while (i < sbIn->stmts_used && sbIn->stmts[i]->tag != Ist_IMark) { + addStmtToIRSB(clgs.sbOut, sbIn->stmts[i]); + i++; + } + + // Get the first statement, and origAddr from it + TG_ASSERT(sbIn->stmts_used > 0); + TG_ASSERT(i < sbIn->stmts_used); + st = sbIn->stmts[i]; + TG_ASSERT(Ist_IMark == st->tag); + + origAddr = st->Ist.IMark.addr + st->Ist.IMark.delta; + TG_ASSERT(origAddr == st->Ist.IMark.addr + + st->Ist.IMark.delta); // XXX: check no overflow + + /* Get BB struct (creating if necessary). + * JS: The hash table is keyed with orig_addr_noredir -- important! + * JW: Why? If it is because of different chasing of the redirection, + * this is not needed, as chasing is switched off in tracegrind + */ + clgs.bb = TG_(get_bb)(origAddr, sbIn, &(clgs.seen_before)); + + addBBSetupCall(&clgs); + + // Set up running state + clgs.events_used = 0; + clgs.ii_index = 0; + clgs.instr_offset = 0; + + for (/*use current i*/; i < sbIn->stmts_used; i++) { + + st = sbIn->stmts[i]; + TG_ASSERT(isFlatIRStmt(st)); + + switch (st->tag) { + case Ist_NoOp: + case Ist_AbiHint: + case Ist_Put: + case Ist_PutI: + case Ist_MBE: + break; + + case Ist_IMark: { + Addr cia = st->Ist.IMark.addr + st->Ist.IMark.delta; + UInt isize = st->Ist.IMark.len; + TG_ASSERT(clgs.instr_offset == cia - origAddr); + // If Vex fails to decode an instruction, the size will be zero. + // Pretend otherwise. + if (isize == 0) + isize = VG_MIN_INSTR_SZB; + + // Sanity-check size. + tl_assert((VG_MIN_INSTR_SZB <= isize && isize <= VG_MAX_INSTR_SZB) || + VG_CLREQ_SZB == isize); + + // Init the inode, record it as the current one. + // Subsequent Dr/Dw/Dm events from the same instruction will + // also use it. 
+ curr_inode = next_InstrInfo(&clgs, isize); + + addEvent_Ir(&clgs, curr_inode); + break; + } + + case Ist_WrTmp: { + IRExpr* data = st->Ist.WrTmp.data; + if (data->tag == Iex_Load) { + IRExpr* aexpr = data->Iex.Load.addr; + // Note also, endianness info is ignored. I guess + // that's not interesting. + addEvent_Dr(&clgs, curr_inode, sizeofIRType(data->Iex.Load.ty), + aexpr); + } + break; + } + + case Ist_Store: { + IRExpr* data = st->Ist.Store.data; + IRExpr* aexpr = st->Ist.Store.addr; + addEvent_Dw(&clgs, curr_inode, + sizeofIRType(typeOfIRExpr(sbIn->tyenv, data)), aexpr); + break; + } + + case Ist_StoreG: { + IRStoreG* sg = st->Ist.StoreG.details; + IRExpr* data = sg->data; + IRExpr* addr = sg->addr; + IRType type = typeOfIRExpr(tyenv, data); + tl_assert(type != Ity_INVALID); + addEvent_D_guarded(&clgs, curr_inode, sizeofIRType(type), addr, + sg->guard, True /*isWrite*/); + break; + } + + case Ist_LoadG: { + IRLoadG* lg = st->Ist.LoadG.details; + IRType type = Ity_INVALID; /* loaded type */ + IRType typeWide = Ity_INVALID; /* after implicit widening */ + IRExpr* addr = lg->addr; + typeOfIRLoadGOp(lg->cvt, &typeWide, &type); + tl_assert(type != Ity_INVALID); + addEvent_D_guarded(&clgs, curr_inode, sizeofIRType(type), addr, + lg->guard, False /*!isWrite*/); + break; + } + + case Ist_Dirty: { + Int dataSize; + IRDirty* d = st->Ist.Dirty.details; + if (d->mFx != Ifx_None) { + /* This dirty helper accesses memory. Collect the details. */ + tl_assert(d->mAddr != NULL); + tl_assert(d->mSize != 0); + dataSize = d->mSize; + // Large (eg. 28B, 108B, 512B on x86) data-sized + // instructions will be done inaccurately, but they're + // very rare and this avoids errors from hitting more + // than two cache lines in the simulation. + if (TG_(clo).simulate_cache && dataSize > TG_(min_line_size)) + dataSize = TG_(min_line_size); + if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) + addEvent_Dr(&clgs, curr_inode, dataSize, d->mAddr); + if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) + addEvent_Dw(&clgs, curr_inode, dataSize, d->mAddr); + } else { + tl_assert(d->mAddr == NULL); + tl_assert(d->mSize == 0); + } + break; + } + + case Ist_CAS: { + /* We treat it as a read and a write of the location. I + think that is the same behaviour as it was before IRCAS + was introduced, since prior to that point, the Vex + front ends would translate a lock-prefixed instruction + into a (normal) read followed by a (normal) write. */ + Int dataSize; + IRCAS* cas = st->Ist.CAS.details; + TG_ASSERT(cas->addr && isIRAtom(cas->addr)); + TG_ASSERT(cas->dataLo); + dataSize = sizeofIRType(typeOfIRExpr(sbIn->tyenv, cas->dataLo)); + if (cas->dataHi != NULL) + dataSize *= 2; /* since this is a doubleword-cas */ + addEvent_Dr(&clgs, curr_inode, dataSize, cas->addr); + addEvent_Dw(&clgs, curr_inode, dataSize, cas->addr); + addEvent_G(&clgs, curr_inode); + break; + } + + case Ist_LLSC: { + IRType dataTy; + if (st->Ist.LLSC.storedata == NULL) { + /* LL */ + dataTy = typeOfIRTemp(sbIn->tyenv, st->Ist.LLSC.result); + addEvent_Dr(&clgs, curr_inode, sizeofIRType(dataTy), + st->Ist.LLSC.addr); + /* flush events before LL, should help SC to succeed */ + flushEvents(&clgs); + } else { + /* SC */ + dataTy = typeOfIRExpr(sbIn->tyenv, st->Ist.LLSC.storedata); + addEvent_Dw(&clgs, curr_inode, sizeofIRType(dataTy), + st->Ist.LLSC.addr); + /* I don't know whether the global-bus-lock cost should + be attributed to the LL or the SC, but it doesn't + really matter since they always have to be used in + pairs anyway. 
Hence put it (quite arbitrarily) on + the SC. */ + addEvent_G(&clgs, curr_inode); + } + break; + } + + case Ist_Exit: { + Bool guest_exit, inverted; + + /* VEX code generation sometimes inverts conditional branches. + * As Tracegrind counts (conditional) jumps, it has to correct + * inversions. The heuristic is the following: + * (1) Tracegrind switches off SB chasing and unrolling, and + * therefore it assumes that a candidate for inversion only is + * the last conditional branch in an SB. + * (2) inversion is assumed if the branch jumps to the address of + * the next guest instruction in memory. + * This heuristic is precalculated in TG_(collectBlockInfo)(). + * + * Branching behavior is also used for branch prediction. Note that + * above heuristic is different from what Cachegrind does. + * Cachegrind uses (2) for all branches. + */ + if (cJumps + 1 == clgs.bb->cjmp_count) + inverted = clgs.bb->cjmp_inverted; + else + inverted = False; + + // call branch predictor only if this is a branch in guest code + guest_exit = (st->Ist.Exit.jk == Ijk_Boring) || + (st->Ist.Exit.jk == Ijk_Call) || + (st->Ist.Exit.jk == Ijk_Ret); + + if (guest_exit) { + /* Stuff to widen the guard expression to a host word, so + we can pass it to the branch predictor simulation + functions easily. */ + IRType tyW = hWordTy; + IROp widen = tyW == Ity_I32 ? Iop_1Uto32 : Iop_1Uto64; + IROp opXOR = tyW == Ity_I32 ? Iop_Xor32 : Iop_Xor64; + IRTemp guard1 = newIRTemp(clgs.sbOut->tyenv, Ity_I1); + IRTemp guardW = newIRTemp(clgs.sbOut->tyenv, tyW); + IRTemp guard = newIRTemp(clgs.sbOut->tyenv, tyW); + IRExpr* one = tyW == Ity_I32 ? IRExpr_Const(IRConst_U32(1)) + : IRExpr_Const(IRConst_U64(1)); + + /* Widen the guard expression. */ + addStmtToIRSB(clgs.sbOut, IRStmt_WrTmp(guard1, st->Ist.Exit.guard)); + addStmtToIRSB( + clgs.sbOut, + IRStmt_WrTmp(guardW, IRExpr_Unop(widen, IRExpr_RdTmp(guard1)))); + /* If the exit is inverted, invert the sense of the guard. */ + addStmtToIRSB( + clgs.sbOut, + IRStmt_WrTmp(guard, + inverted + ? IRExpr_Binop(opXOR, IRExpr_RdTmp(guardW), one) + : IRExpr_RdTmp(guardW))); + /* And post the event. */ + addEvent_Bc(&clgs, curr_inode, IRExpr_RdTmp(guard)); + } + + /* We may never reach the next statement, so need to flush + all outstanding transactions now. */ + flushEvents(&clgs); + + TG_ASSERT(clgs.ii_index > 0); + if (!clgs.seen_before) { + TgJumpKind jk; + + if (st->Ist.Exit.jk == Ijk_Call) + jk = jk_Call; + else if (st->Ist.Exit.jk == Ijk_Ret) + jk = jk_Return; + else { + if (IRConst2Addr(st->Ist.Exit.dst) == + origAddr + curr_inode->instr_offset + curr_inode->instr_size) + jk = jk_None; + else + jk = jk_Jump; + } + + clgs.bb->jmp[cJumps].instr = clgs.ii_index - 1; + clgs.bb->jmp[cJumps].jmpkind = jk; + } + + /* Update global variable jmps_passed before the jump + * A correction is needed if VEX inverted the last jump condition + */ + UInt val = inverted ? cJumps + 1 : cJumps; + addConstMemStoreStmt( + clgs.sbOut, (UWord)&TG_(current_state).jmps_passed, val, hWordTy); + cJumps++; + + break; + } + + default: + tl_assert(0); + break; + } + + /* Copy the original statement */ + addStmtToIRSB(clgs.sbOut, st); + + TG_DEBUGIF(5) + { + VG_(printf)(" pass "); + ppIRStmt(st); + VG_(printf)("\n"); + } + } + + /* Deal with branches to unknown destinations. Except ignore ones + which are function returns as we assume the return stack + predictor never mispredicts. 
*/ + if ((sbIn->jumpkind == Ijk_Boring) || (sbIn->jumpkind == Ijk_Call)) { + if (0) { + ppIRExpr(sbIn->next); + VG_(printf)("\n"); + } + switch (sbIn->next->tag) { + case Iex_Const: + break; /* boring - branch to known address */ + case Iex_RdTmp: + /* looks like an indirect branch (branch to unknown) */ + addEvent_Bi(&clgs, curr_inode, sbIn->next); + break; + default: + /* shouldn't happen - if the incoming IR is properly + flattened, should only have tmp and const cases to + consider. */ + tl_assert(0); + } + } + + /* At the end of the bb. Flush outstandings. */ + flushEvents(&clgs); + + /* Update global variable jmps_passed at end of SB. + * As TG_(current_state).jmps_passed is reset to 0 in setup_bbcc, + * this can be omitted if there is no conditional jump in this SB. + * A correction is needed if VEX inverted the last jump condition + */ + if (cJumps > 0) { + UInt jmps_passed = cJumps; + if (clgs.bb->cjmp_inverted) + jmps_passed--; + addConstMemStoreStmt(clgs.sbOut, (UWord)&TG_(current_state).jmps_passed, + jmps_passed, hWordTy); + } + TG_ASSERT(clgs.bb->cjmp_count == cJumps); + TG_ASSERT(clgs.bb->instr_count == clgs.ii_index); + + /* Info for final exit from BB */ + { + TgJumpKind jk; + + if (sbIn->jumpkind == Ijk_Call) + jk = jk_Call; + else if (sbIn->jumpkind == Ijk_Ret) + jk = jk_Return; + else { + jk = jk_Jump; + if ((sbIn->next->tag == Iex_Const) && + (IRConst2Addr(sbIn->next->Iex.Const.con) == + origAddr + clgs.instr_offset)) + jk = jk_None; + } + clgs.bb->jmp[cJumps].jmpkind = jk; + /* Instruction index of the call/ret at BB end + * (it is wrong for fall-through, but does not matter) */ + clgs.bb->jmp[cJumps].instr = clgs.ii_index - 1; + } + + /* swap information of last exit with final exit if inverted */ + if (clgs.bb->cjmp_inverted) { + TgJumpKind jk; + UInt instr; + + jk = clgs.bb->jmp[cJumps].jmpkind; + clgs.bb->jmp[cJumps].jmpkind = clgs.bb->jmp[cJumps - 1].jmpkind; + clgs.bb->jmp[cJumps - 1].jmpkind = jk; + instr = clgs.bb->jmp[cJumps].instr; + clgs.bb->jmp[cJumps].instr = clgs.bb->jmp[cJumps - 1].instr; + clgs.bb->jmp[cJumps - 1].instr = instr; + } + + if (clgs.seen_before) { + TG_ASSERT(clgs.bb->cost_count == update_cost_offsets(&clgs)); + TG_ASSERT(clgs.bb->instr_len == clgs.instr_offset); + } else { + clgs.bb->cost_count = update_cost_offsets(&clgs); + clgs.bb->instr_len = clgs.instr_offset; + } + + TG_DEBUG(3, "- instrument(BB %#lx): byteLen %u, CJumps %u, CostLen %u\n", + origAddr, clgs.bb->instr_len, clgs.bb->cjmp_count, + clgs.bb->cost_count); + if (cJumps > 0) { + TG_DEBUG(3, " [ "); + for (i = 0; i < cJumps; i++) + TG_DEBUG(3, "%u ", clgs.bb->jmp[i].instr); + TG_DEBUG(3, "], last inverted: %s \n", + clgs.bb->cjmp_inverted ? "yes" : "no"); + } + + return clgs.sbOut; +} + +/*--------------------------------------------------------------------*/ +/*--- Discarding BB info ---*/ +/*--------------------------------------------------------------------*/ + +// Called when a translation is removed from the translation cache for +// any reason at all: to free up space, because the guest code was +// unmapped or modified, or for any arbitrary reason. +static void tg_discard_superblock_info(Addr orig_addr, VexGuestExtents vge) +{ + tl_assert(vge.n_used > 0); + + if (0) + VG_(printf)("discard_superblock_info: %p, %p, %llu\n", (void*)orig_addr, + (void*)vge.base[0], (ULong)vge.len[0]); + + // Get BB info, remove from table, free BB info. Simple! + // When created, the BB is keyed by the first instruction address, + // (not orig_addr, but eventually redirected address). 
Thus, we + // use the first instruction address in vge. + TG_(delete_bb)(vge.base[0]); +} + +/*------------------------------------------------------------*/ +/*--- TG_(fini)() and related function ---*/ +/*------------------------------------------------------------*/ + +static void unwind_thread(thread_info* t) +{ + /* unwind signal handlers */ + while (TG_(current_state).sig != 0) + TG_(post_signal)(TG_(current_tid), TG_(current_state).sig); + + /* unwind regular call stack */ + while (TG_(current_call_stack).sp > 0) + TG_(pop_call_stack)(); + + /* reset context and function stack for context generation */ + TG_(init_exec_state)(&TG_(current_state)); + TG_(current_fn_stack).top = TG_(current_fn_stack).bottom; +} + +static void zero_state_cost(thread_info* t) +{ + TG_(zero_cost)(TG_(sets).full, TG_(current_state).cost); +} + +void TG_(set_instrument_state)(const HChar* reason, Bool state) +{ + if (TG_(instrument_state) == state) { + TG_DEBUG(2, "%s: instrumentation already %s\n", reason, + state ? "ON" : "OFF"); + return; + } + TG_(instrument_state) = state; + TG_DEBUG(2, "%s: Switching instrumentation %s ...\n", reason, + state ? "ON" : "OFF"); + + VG_(discard_translations_safely)((Addr)0x1000, ~(SizeT)0xfff, "tracegrind"); + + /* reset internal state: call stacks, simulator */ + TG_(forall_threads)(unwind_thread); + TG_(forall_threads)(zero_state_cost); + (*TG_(cachesim).clear)(); + + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, "%s: instrumentation switched %s\n", reason, + state ? "ON" : "OFF"); +} + +/* helper for dump_state_togdb */ +static void dump_state_of_thread_togdb(thread_info* ti) +{ + static FullCost sum = 0, tmp = 0; + Int t, i; + BBCC * from, *to; + call_entry* ce; + HChar* mcost; + + t = TG_(current_tid); + TG_(init_cost_lz)(TG_(sets).full, &sum); + TG_(copy_cost_lz)(TG_(sets).full, &tmp, ti->lastdump_cost); + TG_(add_diff_cost) + (TG_(sets).full, sum, ti->lastdump_cost, ti->states.entry[0]->cost); + TG_(copy_cost)(TG_(sets).full, ti->lastdump_cost, tmp); + mcost = TG_(mappingcost_as_string)(TG_(dumpmap), sum); + VG_(gdb_printf)("events-%d: %s\n", t, mcost); + VG_(free)(mcost); + VG_(gdb_printf)("frames-%d: %d\n", t, TG_(current_call_stack).sp); + + ce = 0; + for (i = 0; i < TG_(current_call_stack).sp; i++) { + ce = TG_(get_call_entry)(i); + /* if this frame is skipped, we don't have counters */ + if (!ce->jcc) + continue; + + from = ce->jcc->from; + VG_(gdb_printf)("function-%d-%d: %s\n", t, i, from->cxt->fn[0]->name); + VG_(gdb_printf)("calls-%d-%d: %llu\n", t, i, ce->jcc->call_counter); + + /* FIXME: EventSets! */ + TG_(copy_cost)(TG_(sets).full, sum, ce->jcc->cost); + TG_(copy_cost)(TG_(sets).full, tmp, ce->enter_cost); + TG_(add_diff_cost) + (TG_(sets).full, sum, ce->enter_cost, TG_(current_state).cost); + TG_(copy_cost)(TG_(sets).full, ce->enter_cost, tmp); + + mcost = TG_(mappingcost_as_string)(TG_(dumpmap), sum); + VG_(gdb_printf)("events-%d-%d: %s\n", t, i, mcost); + VG_(free)(mcost); + } + if (ce && ce->jcc) { + to = ce->jcc->to; + VG_(gdb_printf)("function-%d-%d: %s\n", t, i, to->cxt->fn[0]->name); + } +} + +/* Dump current state */ +static void dump_state_togdb(void) +{ + thread_info** th; + int t; + Int orig_tid = TG_(current_tid); + + VG_(gdb_printf)("instrumentation: %s\n", + TG_(instrument_state) ? 
"on" : "off"); + if (!TG_(instrument_state)) + return; + + VG_(gdb_printf)("executed-bbs: %llu\n", TG_(stat).bb_executions); + VG_(gdb_printf)("executed-calls: %llu\n", TG_(stat).call_counter); + VG_(gdb_printf)("distinct-bbs: %d\n", TG_(stat).distinct_bbs); + VG_(gdb_printf)("distinct-calls: %d\n", TG_(stat).distinct_jccs); + VG_(gdb_printf)("distinct-functions: %d\n", TG_(stat).distinct_fns); + VG_(gdb_printf)("distinct-contexts: %d\n", TG_(stat).distinct_contexts); + + /* "events:" line. Given here because it will be dynamic in the future */ + HChar* evmap = TG_(eventmapping_as_string)(TG_(dumpmap)); + VG_(gdb_printf)("events: %s\n", evmap); + VG_(free)(evmap); + /* Total cost summary */ + + /* threads */ + th = TG_(get_threads)(); + VG_(gdb_printf)("threads:"); + for (t = 1; t < VG_N_THREADS; t++) { + if (!th[t]) + continue; + VG_(gdb_printf)(" %d", t); + } + VG_(gdb_printf)("\n"); + VG_(gdb_printf)("current-tid: %d\n", orig_tid); + TG_(forall_threads)(dump_state_of_thread_togdb); +} + +static void print_monitor_help(void) +{ + VG_(gdb_printf)("\n"); + VG_(gdb_printf)("tracegrind monitor commands:\n"); + VG_(gdb_printf)(" status\n"); + VG_(gdb_printf)(" print status\n"); + VG_(gdb_printf)(" instrumentation [on|off]\n"); + VG_(gdb_printf)(" get/set (if on/off given) instrumentation state\n"); + VG_(gdb_printf)("\n"); +} + +/* return True if request recognised, False otherwise */ +static Bool handle_gdb_monitor_command(ThreadId tid, const HChar* req) +{ + HChar* wcmd; + HChar s[VG_(strlen)(req) + 1]; /* copy for strtok_r */ + HChar* ssaveptr; + + VG_(strcpy)(s, req); + + wcmd = VG_(strtok_r)(s, " ", &ssaveptr); + switch (VG_(keyword_id)("help status instrumentation", wcmd, + kwd_report_duplicated_matches)) { + case -2: /* multiple matches */ + return True; + case -1: /* not found */ + return False; + case 0: /* help */ + print_monitor_help(); + return True; + + case 1: { /* status */ + HChar* arg = VG_(strtok_r)(0, " ", &ssaveptr); + if (arg && (VG_(strcmp)(arg, "internal") == 0)) { + /* internal interface to tracegrind_control */ + dump_state_togdb(); + return True; + } + + if (!TG_(instrument_state)) { + VG_(gdb_printf)( + "No status available as instrumentation is switched off\n"); + } else { + // Status information to be improved ... + thread_info** th = TG_(get_threads)(); + Int t, tcount = 0; + for (t = 1; t < VG_N_THREADS; t++) + if (th[t]) + tcount++; + VG_(gdb_printf)("%d thread(s) running.\n", tcount); + } + return True; + } + + case 2: { /* instrumentation */ + HChar* arg = VG_(strtok_r)(0, " ", &ssaveptr); + if (!arg) { + VG_(gdb_printf)("instrumentation: %s\n", + TG_(instrument_state) ? "on" : "off"); + } else + TG_(set_instrument_state)("Command", VG_(strcmp)(arg, "off") != 0); + return True; + } + + default: + tl_assert(0); + return False; + } +} + +static Bool TG_(handle_client_request)(ThreadId tid, UWord* args, UWord* ret) +{ + if (!VG_IS_TOOL_USERREQ('C', 'T', args[0]) && + VG_USERREQ__GDB_MONITOR_COMMAND != args[0]) + return False; + + switch (args[0]) { + case VG_USERREQ__TOGGLE_COLLECT: + TG_(current_state).collect = !TG_(current_state).collect; + TG_DEBUG(2, "Client Request: toggled collection state to %s\n", + TG_(current_state).collect ? 
"ON" : "OFF"); + *ret = 0; /* meaningless */ + break; + + case VG_USERREQ__ADD_MARKER: { + const HChar* marker = (HChar*)args[1]; + TG_DEBUG(2, "Client Request: add marker '%s'\n", marker); + TG_(trace_emit_marker)(tid, marker); + *ret = 0; /* meaningless */ + } break; + + case VG_USERREQ__START_INSTRUMENTATION: + TG_(set_instrument_state)("Client Request", True); + *ret = 0; /* meaningless */ + break; + + case VG_USERREQ__STOP_INSTRUMENTATION: + TG_(set_instrument_state)("Client Request", False); + *ret = 0; /* meaningless */ + break; + + case VG_USERREQ__GDB_MONITOR_COMMAND: { + Bool handled = handle_gdb_monitor_command(tid, (HChar*)args[1]); + if (handled) + *ret = 1; + else + *ret = 0; + return handled; + } + case VG_USERREQ__DUMP_STATS: + case VG_USERREQ__ZERO_STATS: + TG_DEBUG(2, "Client Request: ignoring %llx\n", (ULong)args[0]); + *ret = 0; /* meaningless */ + break; + + default: + VG_(message)(Vg_UserMsg, + "Warning: unknown tracegrind client request code %llx\n", + (ULong)args[0]); + return False; + } + + return True; +} + +/* Syscall Timing. syscalltime[tid] is the time at which thread tid last + started a syscall. */ + +/* struct vki_timespec syscalltime[VG_N_THREADS]; + Whatever the syscall we use to measure the syscall time, we convert to + seconds and nanoseconds. */ +struct vki_timespec* syscalltime; +struct vki_timespec* syscallcputime; + +static void collect_time(struct vki_timespec* systime, + struct vki_timespec* syscputime) +{ + switch (TG_(clo).collect_systime) { + default: + tl_assert(0); + case systime_msec: { + UInt ms_timer = VG_(read_millisecond_timer)(); + systime->tv_sec = ms_timer / 1000; + systime->tv_nsec = (ms_timer % 1000) * 1000000L; + break; + } + case systime_usec: { + struct vki_timeval tv_now; + VG_(gettimeofday)(&tv_now, NULL); + systime->tv_sec = tv_now.tv_sec; + systime->tv_nsec = tv_now.tv_usec * 1000; + break; + } + case systime_nsec: +#if defined(VGO_linux) || defined(VGO_solaris) || defined(VGO_freebsd) + VG_(clock_gettime)(systime, VKI_CLOCK_MONOTONIC); + VG_(clock_gettime)(syscputime, VKI_CLOCK_THREAD_CPUTIME_ID); + +#elif defined(VGO_darwin) + tl_assert(0); +#else +#error "Unknown OS" +#endif + break; + } +} + +static void +TG_(pre_syscall)(ThreadId tid, UInt syscallno, UWord* args, UInt nArgs) +{ + /* Collect time for systime tracking if enabled */ + if (TG_(clo).collect_systime != systime_no) { + collect_time(&syscalltime[tid], TG_(clo).collect_systime == systime_nsec + ? &syscallcputime[tid] + : NULL); + } +} + +/* Returns "after - before" in the unit as specified by --collect-systime. + after is supposed to be >= before, and tv_nsec must be >= 0 and < + One_Second_In_Nsec. */ +static ULong vki_timespec_diff(struct vki_timespec after, + struct vki_timespec before) +{ + vki_time_t diff_sec = after.tv_sec - before.tv_sec; + long diff_nsec = after.tv_nsec - before.tv_nsec; + ULong nsec_factor; // factor to convert the desired unit into nsec. 
+ + if (diff_nsec < 0) { + diff_sec--; + diff_nsec += 1000000000ULL; + } + switch (TG_(clo).collect_systime) { + case systime_no: + tl_assert(0); + case systime_msec: + nsec_factor = 1000000ULL; + break; + case systime_usec: + nsec_factor = 1000ULL; + break; + case systime_nsec: + nsec_factor = 1ULL; + break; + default: + tl_assert(0); + } + return ((ULong)diff_sec * 1000000000ULL + diff_nsec) / nsec_factor; +} + +/* Check if syscall is a fork-like call that creates a new process */ +static Bool is_fork_syscall(UInt syscallno) +{ +#if defined(VGO_linux) + return syscallno == __NR_clone || syscallno == __NR_fork || + syscallno == __NR_vfork +#if defined(__NR_clone3) + || syscallno == __NR_clone3 +#endif + ; +#else + return False; /* TODO: support other OSes */ +#endif +} + +static void TG_(post_syscall)( + ThreadId tid, UInt syscallno, UWord* args, UInt nArgs, SysRes res) +{ + /* Handle fork/clone: emit FORK event with child PID. + Skip if this was a thread-creating clone (CLONE_THREAD), + since we emit THREAD_CREATE via track_pre_thread_ll_create instead. */ + if (is_fork_syscall(syscallno) && !sr_isError(res) && sr_Res(res) > 0) { + Bool is_thread = False; +#if defined(VGO_linux) + if (syscallno == __NR_clone && nArgs > 0) + is_thread = (args[0] & VKI_CLONE_THREAD) != 0; +#if defined(__NR_clone3) + if (syscallno == __NR_clone3 && nArgs > 0) { + /* clone3 first arg is pointer to struct clone_args; + flags is the first field (ULong / __u64). */ + ULong flags = *(ULong*)(Addr)args[0]; + is_thread = (flags & VKI_CLONE_THREAD) != 0; + } +#endif +#endif + if (!is_thread) { + Int child_pid = (Int)sr_Res(res); + TG_(trace_emit_fork)(tid, child_pid); + } + } + + /* Handle systime collection if enabled */ + if (TG_(clo).collect_systime != systime_no && TG_(current_state).bbcc) { + Int o; + struct vki_timespec ts_now; + struct vki_timespec ts_cpunow; + ULong diff; + + collect_time( + &ts_now, TG_(clo).collect_systime == systime_nsec ? &ts_cpunow : NULL); + + diff = vki_timespec_diff(ts_now, syscalltime[tid]); + + /* offset o is for "SysCount", o+1 for "SysTime", + o+2 is (optionally) "SysCpuTime". 
*/ + o = fullOffset(EG_SYS); + TG_ASSERT(o >= 0); + TG_DEBUG(0, " Time (Off %d) for Syscall %u: %llu\n", o, syscallno, + diff); + + if (!TG_(current_state).bbcc->skipped) + TG_(init_cost_lz)(TG_(sets).full, &(TG_(current_state).bbcc->skipped)); + TG_(current_state).cost[o]++; + TG_(current_state).cost[o + 1] += diff; + TG_(current_state).bbcc->skipped[o]++; + TG_(current_state).bbcc->skipped[o + 1] += diff; + if (TG_(clo).collect_systime == systime_nsec) { + diff = vki_timespec_diff(ts_cpunow, syscallcputime[tid]); + TG_DEBUG(0, " SysCpuTime (Off %d) for Syscall %u: %llu\n", o + 2, + syscallno, diff); + TG_(current_state).cost[o + 2] += diff; + TG_(current_state).bbcc->skipped[o + 2] += diff; + } + } +} + +static UInt ULong_width(ULong n) +{ + UInt w = 0; + while (n > 0) { + n = n / 10; + w++; + } + if (w == 0) + w = 1; + return w + (w - 1) / 3; // add space for commas +} + +static void branchsim_printstat(int l1, int l2, int l3) +{ + static HChar fmt[128]; // large enough + FullCost total; + ULong Bc_total_b, Bc_total_mp, Bi_total_b, Bi_total_mp; + ULong B_total_b, B_total_mp; + + total = TG_(total_cost); + Bc_total_b = total[fullOffset(EG_BC)]; + Bc_total_mp = total[fullOffset(EG_BC) + 1]; + Bi_total_b = total[fullOffset(EG_BI)]; + Bi_total_mp = total[fullOffset(EG_BI) + 1]; + + /* Make format string, getting width right for numbers */ + VG_(sprintf)(fmt, "%%s %%,%dllu (%%,%dllu cond + %%,%dllu ind)\n", l1, l2, + l3); + + if (0 == Bc_total_b) + Bc_total_b = 1; + if (0 == Bi_total_b) + Bi_total_b = 1; + B_total_b = Bc_total_b + Bi_total_b; + B_total_mp = Bc_total_mp + Bi_total_mp; + + VG_(umsg)("\n"); + VG_(umsg)(fmt, "Branches: ", B_total_b, Bc_total_b, Bi_total_b); + + VG_(umsg)(fmt, "Mispredicts: ", B_total_mp, Bc_total_mp, Bi_total_mp); + + VG_(umsg)("Mispred rate: %*.1f%% (%*.1f%% + %*.1f%% )\n", l1, + B_total_mp * 100.0 / B_total_b, l2, + Bc_total_mp * 100.0 / Bc_total_b, l3, + Bi_total_mp * 100.0 / Bi_total_b); +} + +static void tg_print_stats(void) +{ + int BB_lookups = TG_(stat).full_debug_BBs + TG_(stat).fn_name_debug_BBs + + TG_(stat).file_line_debug_BBs + TG_(stat).no_debug_BBs; + + /* Hash table stats */ + VG_(message)(Vg_DebugMsg, "Distinct objects: %d\n", TG_(stat).distinct_objs); + VG_(message)(Vg_DebugMsg, "Distinct files: %d\n", + TG_(stat).distinct_files); + VG_(message)(Vg_DebugMsg, "Distinct fns: %d\n", TG_(stat).distinct_fns); + VG_(message)(Vg_DebugMsg, "Distinct contexts:%d\n", + TG_(stat).distinct_contexts); + VG_(message)(Vg_DebugMsg, "Distinct BBs: %d\n", TG_(stat).distinct_bbs); + VG_(message)(Vg_DebugMsg, "Cost entries: %u (Chunks %u)\n", + TG_(costarray_entries), TG_(costarray_chunks)); + VG_(message)(Vg_DebugMsg, "Distinct BBCCs: %d\n", + TG_(stat).distinct_bbccs); + VG_(message)(Vg_DebugMsg, "Distinct JCCs: %d\n", TG_(stat).distinct_jccs); + VG_(message)(Vg_DebugMsg, "Distinct skips: %d\n", + TG_(stat).distinct_skips); + VG_(message)(Vg_DebugMsg, "BB lookups: %d\n", BB_lookups); + if (BB_lookups > 0) { + VG_(message)(Vg_DebugMsg, "With full debug info:%3d%% (%d)\n", + TG_(stat).full_debug_BBs * 100 / BB_lookups, + TG_(stat).full_debug_BBs); + VG_(message)(Vg_DebugMsg, "With file/line debug info:%3d%% (%d)\n", + TG_(stat).file_line_debug_BBs * 100 / BB_lookups, + TG_(stat).file_line_debug_BBs); + VG_(message)(Vg_DebugMsg, "With fn name debug info:%3d%% (%d)\n", + TG_(stat).fn_name_debug_BBs * 100 / BB_lookups, + TG_(stat).fn_name_debug_BBs); + VG_(message)(Vg_DebugMsg, "With no debug info:%3d%% (%d)\n", + TG_(stat).no_debug_BBs * 100 / BB_lookups, + 
TG_(stat).no_debug_BBs); + } + VG_(message)(Vg_DebugMsg, "BBCC Clones: %d\n", TG_(stat).bbcc_clones); + VG_(message)(Vg_DebugMsg, "BBs Retranslated: %d\n", + TG_(stat).bb_retranslations); + VG_(message)(Vg_DebugMsg, "Distinct instrs: %d\n", + TG_(stat).distinct_instrs); + + VG_(message)(Vg_DebugMsg, "LRU Contxt Misses: %d\n", + TG_(stat).cxt_lru_misses); + VG_(message)(Vg_DebugMsg, "LRU BBCC Misses: %d\n", + TG_(stat).bbcc_lru_misses); + VG_(message)(Vg_DebugMsg, "LRU JCC Misses: %d\n", + TG_(stat).jcc_lru_misses); + VG_(message)(Vg_DebugMsg, "BBs Executed: %llu\n", + TG_(stat).bb_executions); + VG_(message)(Vg_DebugMsg, "Calls: %llu\n", + TG_(stat).call_counter); + VG_(message)(Vg_DebugMsg, "CondJMP followed: %llu\n", + TG_(stat).jcnd_counter); + VG_(message)(Vg_DebugMsg, "Boring JMPs: %llu\n", + TG_(stat).jump_counter); + VG_(message)(Vg_DebugMsg, "Recursive calls: %llu\n", + TG_(stat).rec_call_counter); + VG_(message)(Vg_DebugMsg, "Returns: %llu\n", + TG_(stat).ret_counter); +} + +static void finish(void) +{ + HChar fmt[128]; // large enough + Int l1, l2, l3; + FullCost total; + + TG_DEBUG(0, "finish()\n"); + + (*TG_(cachesim).finish)(); + + /* pop all remaining items from CallStack for correct sum + */ + TG_(forall_threads)(unwind_thread); + + TG_(compute_total_cost)(); + + /* Close CSV trace output */ + TG_(trace_close_output)(); + + if (VG_(clo_verbosity) == 0) + return; + + if (VG_(clo_stats)) { + VG_(message)(Vg_DebugMsg, "\n"); + tg_print_stats(); + VG_(message)(Vg_DebugMsg, "\n"); + } + + HChar* evmap = TG_(eventmapping_as_string)(TG_(dumpmap)); + VG_(message)(Vg_UserMsg, "Events : %s\n", evmap); + VG_(free)(evmap); + HChar* mcost = TG_(mappingcost_as_string)(TG_(dumpmap), TG_(total_cost)); + VG_(message)(Vg_UserMsg, "Collected : %s\n", mcost); + VG_(free)(mcost); + VG_(message)(Vg_UserMsg, "\n"); + + /* determine value widths for statistics */ + total = TG_(total_cost); + l1 = ULong_width(total[fullOffset(EG_IR)]); + l2 = l3 = 0; + if (TG_(clo).simulate_cache) { + l2 = ULong_width(total[fullOffset(EG_DR)]); + l3 = ULong_width(total[fullOffset(EG_DW)]); + } + if (TG_(clo).simulate_branch) { + int l2b = ULong_width(total[fullOffset(EG_BC)]); + int l3b = ULong_width(total[fullOffset(EG_BI)]); + if (l2b > l2) + l2 = l2b; + if (l3b > l3) + l3 = l3b; + } + + /* Make format string, getting width right for numbers */ + VG_(sprintf)(fmt, "%%s %%,%dllu\n", l1); + + /* Always print this */ + VG_(umsg)(fmt, "I refs: ", total[fullOffset(EG_IR)]); + + if (TG_(clo).simulate_cache) + (*TG_(cachesim).printstat)(l1, l2, l3); + + if (TG_(clo).simulate_branch) + branchsim_printstat(l1, l2, l3); +} + +void TG_(fini)(Int exitcode) { finish(); } + +/*--------------------------------------------------------------------*/ +/*--- Setup ---*/ +/*--------------------------------------------------------------------*/ + +static void tg_start_client_code_callback(ThreadId tid, ULong blocks_done) +{ + static ULong last_blocks_done = 0; + + if (0) + VG_(printf)("%d R %llu\n", (Int)tid, blocks_done); + + /* throttle calls to TG_(run_thread) by number of BBs executed */ + if (blocks_done - last_blocks_done < 5000) + return; + last_blocks_done = blocks_done; + + TG_(run_thread)(tid); +} + +/* + * Called after fork() in the child process. + * Reopens the trace file with the child's PID. 
+ */ +static void tg_atfork_child(ThreadId tid) { TG_(trace_reopen_child)(); } + +static void tg_pre_thread_ll_create(ThreadId tid, ThreadId child) +{ + /* Skip Valgrind's internal scheduler thread (tid 0) creating the + initial client thread -- that's not a user-visible thread creation. */ + if (tid == 0) + return; + TG_(trace_emit_thread_create)(tid, child); +} + +static void TG_(post_clo_init)(void) +{ + if (VG_(clo_vex_control).iropt_register_updates_default != + VexRegUpdSpAtMemAccess) { + TG_DEBUG(1, " Using user specified value for " + "--vex-iropt-register-updates\n"); + } else { + TG_DEBUG(1, " Using default --vex-iropt-register-updates=" + "sp-at-mem-access\n"); + } + + /* Always register syscall wrappers for fork/clone detection. + Also handles systime collection if enabled. */ + VG_(needs_syscall_wrapper)(TG_(pre_syscall), TG_(post_syscall)); + + if (TG_(clo).collect_systime != systime_no) { + syscalltime = + TG_MALLOC("cl.main.pci.1", VG_N_THREADS * sizeof syscalltime[0]); + for (UInt i = 0; i < VG_N_THREADS; ++i) { + syscalltime[i].tv_sec = 0; + syscalltime[i].tv_nsec = 0; + } + if (TG_(clo).collect_systime == systime_nsec) { + syscallcputime = + TG_MALLOC("cl.main.pci.2", VG_N_THREADS * sizeof syscallcputime[0]); + for (UInt i = 0; i < VG_N_THREADS; ++i) { + syscallcputime[i].tv_sec = 0; + syscallcputime[i].tv_nsec = 0; + } + } + } + + if (VG_(clo_px_file_backed) != VexRegUpdSpAtMemAccess) { + TG_DEBUG(1, " Using user specified value for " + "--px-file-backed\n"); + } else { + TG_DEBUG(1, " Using default --px-file-backed=" + "sp-at-mem-access\n"); + } + + if (VG_(clo_vex_control).iropt_unroll_thresh != 0) { + VG_(message)(Vg_UserMsg, + "tracegrind only works with --vex-iropt-unroll-thresh=0\n" + "=> resetting it back to 0\n"); + VG_(clo_vex_control).iropt_unroll_thresh = 0; // cannot be overridden. + } + if (VG_(clo_vex_control).guest_chase) { + VG_(message)(Vg_UserMsg, + "tracegrind only works with --vex-guest-chase=no\n" + "=> resetting it back to 'no'\n"); + VG_(clo_vex_control).guest_chase = False; // cannot be overridden. + } + + TG_DEBUG(1, " dump threads: %s\n", + TG_(clo).separate_threads ? "Yes" : "No"); + TG_DEBUG(1, " call sep. : %d\n", TG_(clo).separate_callers); + TG_DEBUG(1, " rec. sep. : %d\n", TG_(clo).separate_recursions); + + (*TG_(cachesim).post_clo_init)(); + + TG_(init_eventsets)(); + TG_(init_statistics)(&TG_(stat)); + TG_(init_cost_lz)(TG_(sets).full, &TG_(total_cost)); + + /* initialize hash tables */ + TG_(init_obj_table)(); + TG_(init_cxt_table)(); + TG_(init_bb_hash)(); + + TG_(init_threads)(); + TG_(run_thread)(1); + + TG_(instrument_state) = TG_(clo).instrument_atstart; + + /* Open trace output file */ + TG_(trace_open_output)(); + + /* Register fork handler to emit FORK events */ + VG_(atfork)(NULL, NULL, tg_atfork_child); + + if (VG_(clo_verbosity) > 0) { + VG_(message)(Vg_UserMsg, "Streaming trace output to tracegrind.out.%d\n", + VG_(getpid)()); + } +} + +static void TG_(pre_clo_init)(void) +{ + VG_(details_name)("Tracegrind"); + VG_(details_version)(NULL); + VG_(details_description)("a streaming trace cache profiler"); + VG_(details_copyright_author)( + "Copyright (C) 2026, and GNU GPL'd, " + "by CodSpeed Technology SAS. " + "Based on Callgrind by Josef Weidendorfer et al."); + VG_(details_bug_reports_to)(VG_BUGS_TO); + VG_(details_avg_translation_sizeB)(500); + + VG_(clo_vex_control).iropt_register_updates_default = + VG_(clo_px_file_backed) = + VexRegUpdSpAtMemAccess; // overridable by the user. 
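+  /* Defaults only; TG_(post_clo_init)() warns about and re-forces the two
+     settings below if a command line option changed them. */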
+ + VG_(clo_vex_control).iropt_unroll_thresh = 0; // cannot be overridden. + VG_(clo_vex_control).guest_chase = False; // cannot be overridden. + + VG_(basic_tool_funcs)(TG_(post_clo_init), TG_(instrument), TG_(fini)); + + VG_(needs_superblock_discards)(tg_discard_superblock_info); + + VG_(needs_command_line_options)(TG_(process_cmd_line_option), + TG_(print_usage), TG_(print_debug_usage)); + + VG_(needs_client_requests)(TG_(handle_client_request)); + VG_(needs_print_stats)(tg_print_stats); + + VG_(track_start_client_code)(&tg_start_client_code_callback); + VG_(track_pre_deliver_signal)(&TG_(pre_signal)); + VG_(track_post_deliver_signal)(&TG_(post_signal)); + VG_(track_pre_thread_ll_create)(&tg_pre_thread_ll_create); + + TG_(set_clo_defaults)(); +} + +VG_DETERMINE_INTERFACE_VERSION(TG_(pre_clo_init)) + +/*--------------------------------------------------------------------*/ +/*--- end main.c ---*/ +/*--------------------------------------------------------------------*/ diff --git a/tracegrind/scripts/tracegrind-analyzer b/tracegrind/scripts/tracegrind-analyzer new file mode 100755 index 000000000..185ed5d2d --- /dev/null +++ b/tracegrind/scripts/tracegrind-analyzer @@ -0,0 +1,302 @@ +#!/usr/bin/env -S uvx --with lz4 --with msgpack python3 +# /// script +# requires-python = ">=3.8" +# dependencies = ["lz4", "msgpack"] +# /// +""" +Decode and debug tracegrind MsgPack+LZ4 trace files. + +Usage: + ./decode-trace.py [options] + +Examples: + ./decode-trace.py tracegrind.out.12345.msgpack.lz4 + ./decode-trace.py trace.msgpack.lz4 --head 20 + ./decode-trace.py trace.msgpack.lz4 --schema + ./decode-trace.py trace.msgpack.lz4 --stats + ./decode-trace.py trace.msgpack.lz4 --json +""" + +import argparse +import json +import os +import struct +import sys +from collections import Counter +from typing import Any, BinaryIO, Dict, Iterator, List, Tuple + +import lz4.block +import msgpack + + +MAGIC = b'TGMP' + + +def read_header(f: BinaryIO) -> int: + """Read and validate file header, return version.""" + magic = f.read(4) + if magic != MAGIC: + raise ValueError(f"Invalid magic: {magic!r}, expected {MAGIC!r}") + version = struct.unpack(' bytes | None: + """Read a single chunk, return decompressed data or None for end marker.""" + header = f.read(8) + if len(header) < 8: + return None + usize, csize = struct.unpack(' Dict[str, Any]: + """Decode schema chunk into Python dict.""" + schema = msgpack.unpackb(data, raw=False) + return schema + + +def iter_rows(data: bytes) -> Iterator[List[Any]]: + """Iterate over rows in a data chunk.""" + unpacker = msgpack.Unpacker(raw=False) + unpacker.feed(data) + yield from unpacker + + +def decode_trace(filepath: str) -> Tuple[int, Dict[str, Any], List[List[Any]]]: + """Decode entire trace file, return (version, schema, rows).""" + with open(filepath, 'rb') as f: + version = read_header(f) + + # Read schema chunk + schema_data = read_chunk(f) + if schema_data is None: + raise ValueError("Missing schema chunk") + schema = decode_schema(schema_data) + + # Read all data chunks + rows = [] + while True: + chunk_data = read_chunk(f) + if chunk_data is None: + break + rows.extend(iter_rows(chunk_data)) + + return version, schema, rows + + +def get_event_name(event_type: int) -> str: + """Convert event type to name.""" + return {0: 'MARKER', 1: 'ENTER_FN', 2: 'EXIT_FN', 3: 'ENTER_INLINED_FN', 4: 'EXIT_INLINED_FN', 5: 'FORK', 6: 'THREAD_CREATE'}.get(event_type, f'UNKNOWN({event_type})') + + +def format_row(row: List[Any], schema: Dict[str, Any]) -> Dict[str, Any]: + """Format a 
row as a dict using the appropriate schema.""" + if len(row) < 3: + return {'_raw': row} + + event_type = row[2] + event_schemas = schema.get('event_schemas', {}) + columns = event_schemas.get(str(event_type), []) + + if not columns: + # Fallback for old format with 'columns' key + columns = schema.get('columns', []) + + counter_names = schema.get('counters', []) + + result = {} + for i, val in enumerate(row): + if i < len(columns): + key = columns[i] + if key == 'event': + result[key] = get_event_name(val) + elif key == 'counters' and isinstance(val, list): + # Expand counters sub-array using top-level counter names + for j, cval in enumerate(val): + if j < len(counter_names): + result[counter_names[j]] = cval + else: + result[f'_counter{j}'] = cval + else: + result[key] = val + else: + result[f'_col{i}'] = val + + return result + + +def print_schema(schema: Dict[str, Any], version: int) -> None: + """Print schema information.""" + print(f"Format Version: {version}") + print(f"Format Name: {schema.get('format', 'unknown')}") + print(f"Schema Version: {schema.get('version', 'unknown')}") + print() + + if 'event_schemas' in schema: + print("Event Schemas (discriminated union):") + for event_type, columns in sorted(schema['event_schemas'].items()): + event_name = get_event_name(int(event_type)) + print(f" {event_type} ({event_name}): {columns}") + elif 'columns' in schema: + print(f"Columns: {schema['columns']}") + + if schema.get('counters'): + print(f"Counters: {schema['counters']}") + + if schema.get('counter_units'): + print(f"Counter Units: {dict(sorted(schema['counter_units'].items()))}") + print() + + +def print_stats(rows: List[List[Any]], schema: Dict[str, Any]) -> None: + """Print statistics about the trace.""" + print(f"Total rows: {len(rows):,}") + + if not rows: + return + + # Count by event type + event_counts = Counter(row[2] for row in rows if len(row) > 2) + print("\nEvents by type:") + for event_type, count in sorted(event_counts.items()): + event_name = get_event_name(event_type) + pct = 100 * count / len(rows) + print(f" {event_name}: {count:,} ({pct:.1f}%)") + + # Thread stats + thread_ids = set(row[1] for row in rows if len(row) > 1) + print(f"\nThreads: {len(thread_ids)} ({sorted(thread_ids)})") + + # Sequence range + seqs = [row[0] for row in rows if len(row) > 0] + if seqs: + print(f"Sequence range: {min(seqs):,} - {max(seqs):,}") + + # Function stats (for ENTER/EXIT events) + fn_counts = Counter() + for row in rows: + if len(row) > 3 and row[2] in (1, 2, 3, 4): # ENTER_FN, EXIT_FN, ENTER_INLINED_FN, or EXIT_INLINED_FN + fn_counts[row[3]] += 1 + + if fn_counts: + print(f"\nTop 10 functions by event count:") + for fn, count in fn_counts.most_common(10): + print(f" {count:8,} {fn}") + + # FORK events + fork_rows = [row for row in rows if len(row) > 2 and row[2] == 5] + if fork_rows: + print(f"\nFork events: {len(fork_rows)}") + for row in fork_rows[:5]: + formatted = format_row(row, schema) + child_pid = formatted.get('child_pid', 'unknown') + print(f" seq={formatted.get('seq')}, tid={formatted.get('tid')}, child_pid={child_pid}") + + # THREAD_CREATE events + thread_create_rows = [row for row in rows if len(row) > 2 and row[2] == 6] + if thread_create_rows: + print(f"\nThread create events: {len(thread_create_rows)}") + for row in thread_create_rows[:5]: + formatted = format_row(row, schema) + child_tid = formatted.get('child_tid', 'unknown') + print(f" seq={formatted.get('seq')}, tid={formatted.get('tid')}, child_tid={child_tid}") + + +def print_rows(rows: 
List[List[Any]], schema: Dict[str, Any], + head: int | None = None, raw: bool = False, as_json: bool = False) -> None: + """Print rows in various formats.""" + display_rows = rows[:head] if head else rows + + if as_json: + output = [format_row(row, schema) for row in display_rows] + print(json.dumps(output, indent=2)) + return + + for row in display_rows: + if raw: + print(row) + else: + formatted = format_row(row, schema) + # Compact single-line format + parts = [] + for k, v in formatted.items(): + if isinstance(v, str) and k in ('obj', 'file'): + v = os.path.basename(v) + parts.append(f"{k}={v}") + print(' | '.join(parts)) + + +def main(): + parser = argparse.ArgumentParser( + description='Decode and debug tracegrind MsgPack+LZ4 trace files.', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__ + ) + parser.add_argument('file', help='Trace file to decode (.msgpack.lz4)') + parser.add_argument('--schema', action='store_true', + help='Print schema information only') + parser.add_argument('--stats', action='store_true', + help='Print statistics about the trace') + parser.add_argument('--head', type=int, metavar='N', + help='Print only first N rows') + parser.add_argument('--tail', type=int, metavar='N', + help='Print only last N rows') + parser.add_argument('--raw', action='store_true', + help='Print raw row arrays') + parser.add_argument('--json', action='store_true', + help='Output as JSON') + parser.add_argument('--event', type=str, choices=['MARKER', 'ENTER_FN', 'EXIT_FN', 'ENTER_INLINED_FN', 'EXIT_INLINED_FN', 'FORK', 'THREAD_CREATE'], + help='Filter by event type') + parser.add_argument('--fn', type=str, metavar='PATTERN', + help='Filter by function name (substring match)') + + args = parser.parse_args() + + try: + version, schema, rows = decode_trace(args.file) + except Exception as e: + print(f"Error reading trace file: {e}", file=sys.stderr) + sys.exit(1) + + # Schema only mode + if args.schema: + print_schema(schema, version) + sys.exit(0) + + # Apply filters + filtered_rows = rows + + if args.event: + event_map = {'MARKER': 0, 'ENTER_FN': 1, 'EXIT_FN': 2, 'ENTER_INLINED_FN': 3, 'EXIT_INLINED_FN': 4, 'FORK': 5, 'THREAD_CREATE': 6} + event_type = event_map[args.event] + filtered_rows = [r for r in filtered_rows if len(r) > 2 and r[2] == event_type] + + if args.fn: + pattern = args.fn.lower() + filtered_rows = [r for r in filtered_rows + if len(r) > 3 and isinstance(r[3], str) and pattern in r[3].lower()] + + # Stats mode + if args.stats: + print_schema(schema, version) + print_stats(filtered_rows, schema) + sys.exit(0) + + # Default: print rows + if args.tail: + filtered_rows = filtered_rows[-args.tail:] + + print_schema(schema, version) + print(f"Showing {min(args.head or len(filtered_rows), len(filtered_rows)):,} of {len(filtered_rows):,} rows") + print("-" * 80) + print_rows(filtered_rows, schema, head=args.head, raw=args.raw, as_json=args.json) + + +if __name__ == '__main__': + main() diff --git a/tracegrind/sim.c b/tracegrind/sim.c new file mode 100644 index 000000000..25d8cf983 --- /dev/null +++ b/tracegrind/sim.c @@ -0,0 +1,1703 @@ +/*--------------------------------------------------------------------*/ +/*--- Cache simulation. ---*/ +/*--- sim.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call graph + profiling programs. 
+ + Copyright (C) 2003-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This tool is derived from and contains code from Cachegrind + Copyright (C) 2002-2017 Nicholas Nethercote (njn@valgrind.org) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "global.h" + +/* Notes: + - simulates a write-allocate cache + - (block --> set) hash function uses simple bit selection + - handling of references straddling two cache blocks: + - counts as only one cache access (not two) + - both blocks hit --> one hit + - one block hits, the other misses --> one miss + - both blocks miss --> one miss (not two) +*/ + +/* Cache configuration */ +#include "cg_arch.c" + +/* additional structures for cache use info, separated + * according usage frequency: + * - line_loaded : pointer to cost center of instruction + * which loaded the line into cache. + * Needed to increment counters when line is evicted. + * - line_use : updated on every access + */ +typedef struct { + UInt count; + UInt mask; /* e.g. for 64Byte line size 1bit/2Byte */ +} line_use; + +typedef struct { + Addr memline, iaddr; + line_use* dep_use; /* point to higher-level cacheblock for this memline */ + ULong* use_base; +} line_loaded; + +/* Cache state */ +typedef struct { + const HChar* name; + int size; /* bytes */ + int assoc; + int line_size; /* bytes */ + Bool sectored; /* prefetch nearside cacheline on read */ + int sets; + int sets_min_1; + int line_size_bits; + int tag_shift; + UWord tag_mask; + HChar desc_line[128]; // large enough + UWord* tags; + + /* for cache use */ + int line_size_mask; + int* line_start_mask; + int* line_end_mask; + line_loaded* loaded; + line_use* use; +} cache_t2; + +/* + * States of flat caches in our model. + * We use a 2-level hierarchy, + */ +static cache_t2 I1, D1, LL; + +/* Lower bits of cache tags are used as flags for a cache line */ +#define CACHELINE_FLAGMASK (MIN_LINE_SIZE - 1) +#define CACHELINE_DIRTY 1 + +/* Cache simulator Options */ +static Bool clo_simulate_writeback = False; +static Bool clo_simulate_hwpref = False; +static Bool clo_simulate_sectors = False; +static Bool clo_collect_cacheuse = False; + +/* Following global vars are setup before by setup_bbcc(): + * + * - Addr TG_(bb_base) (instruction start address of original BB) + * - ULong* TG_(cost_base) (start of cost array for BB) + */ + +Addr TG_(bb_base); +ULong* TG_(cost_base); + +static InstrInfo* current_ii; + +/* Cache use offsets */ +/* The offsets are only correct because all per-instruction event sets get + * the "Use" set added first ! 
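+ * (AcCost1/SpLoss1 are the L1 slots, AcCost2/SpLoss2 the LL slots of the
+ *  4-entry "Use" event group registered in TG_(init_eventsets).)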
+ */ +static Int off_I1_AcCost = 0; +static Int off_I1_SpLoss = 1; +static Int off_D1_AcCost = 0; +static Int off_D1_SpLoss = 1; +static Int off_LL_AcCost = 2; +static Int off_LL_SpLoss = 3; + +/* Cache access types */ +typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType; + +/* Result of a reference into a flat cache */ +typedef enum { Hit = 0, Miss, MissDirty } CacheResult; + +/* Result of a reference into a hierarchical cache model */ +typedef enum { L1_Hit, LL_Hit, MemAccess, WriteBackMemAccess } CacheModelResult; + +typedef CacheModelResult (*simcall_type)(Addr, UChar); + +static struct { + simcall_type I1_Read; + simcall_type D1_Read; + simcall_type D1_Write; +} simulator; + +/*------------------------------------------------------------*/ +/*--- Cache Simulator Initialization ---*/ +/*------------------------------------------------------------*/ + +static void cachesim_clearcache(cache_t2* c) +{ + Int i; + + for (i = 0; i < c->sets * c->assoc; i++) + c->tags[i] = 0; + if (c->use) { + for (i = 0; i < c->sets * c->assoc; i++) { + c->loaded[i].memline = 0; + c->loaded[i].use_base = 0; + c->loaded[i].dep_use = 0; + c->loaded[i].iaddr = 0; + c->use[i].mask = 0; + c->use[i].count = 0; + c->tags[i] = i % c->assoc; /* init lower bits as pointer */ + } + } +} + +static void cacheuse_initcache(cache_t2* c); + +/* By this point, the size/assoc/line_size has been checked. */ +static void cachesim_initcache(cache_t config, cache_t2* c) +{ + c->size = config.size; + c->assoc = config.assoc; + c->line_size = config.line_size; + c->sectored = False; // FIXME + + c->sets = (c->size / c->line_size) / c->assoc; + c->sets_min_1 = c->sets - 1; + c->line_size_bits = VG_(log2)(c->line_size); + c->tag_shift = c->line_size_bits + VG_(log2)(c->sets); + c->tag_mask = ~((1u << c->tag_shift) - 1); + + /* Can bits in tag entries be used for flags? + * Should be always true as MIN_LINE_SIZE >= 16 */ + TG_ASSERT((c->tag_mask & CACHELINE_FLAGMASK) == 0); + + if (c->assoc == 1) { + VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s", c->size, + c->line_size, c->sectored ? ", sectored" : ""); + } else { + VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s", c->size, + c->line_size, c->assoc, c->sectored ? ", sectored" : ""); + } + + c->tags = + (UWord*)TG_MALLOC("cl.sim.cs_ic.1", sizeof(UWord) * c->sets * c->assoc); + if (clo_collect_cacheuse) + cacheuse_initcache(c); + else + c->use = 0; + cachesim_clearcache(c); +} + +#if 0 +static void print_cache(cache_t2* c) +{ + UInt set, way, i; + + /* Note initialisation and update of 'i'. */ + for (i = 0, set = 0; set < c->sets; set++) { + for (way = 0; way < c->assoc; way++, i++) { + VG_(printf)("%8x ", c->tags[i]); + } + VG_(printf)("\n"); + } +} +#endif + +/*------------------------------------------------------------*/ +/*--- Simple Cache Simulation ---*/ +/*------------------------------------------------------------*/ + +/* + * Model: single inclusive, 2-level cache hierarchy (L1/LL) + * with write-allocate + * + * For simple cache hit/miss counts, we do not have to + * maintain the dirty state of lines (no need to distinguish + * read/write references), and the resulting counts are the + * same for write-through and write-back caches. 
+ * + * Simulator functions: + * CacheModelResult cachesim_I1_ref(Addr a, UChar size) + * CacheModelResult cachesim_D1_ref(Addr a, UChar size) + */ +__attribute__((always_inline)) static __inline__ CacheResult +cachesim_setref(cache_t2* c, UInt set_no, UWord tag) +{ + int i, j; + UWord* set; + + set = &(c->tags[set_no * c->assoc]); + + /* This loop is unrolled for just the first case, which is the most */ + /* common. We can't unroll any further because it would screw up */ + /* if we have a direct-mapped (1-way) cache. */ + if (tag == set[0]) + return Hit; + + /* If the tag is one other than the MRU, move it into the MRU spot */ + /* and shuffle the rest down. */ + for (i = 1; i < c->assoc; i++) { + if (tag == set[i]) { + for (j = i; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tag; + return Hit; + } + } + + /* A miss; install this tag as MRU, shuffle rest down. */ + for (j = c->assoc - 1; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tag; + + return Miss; +} + +__attribute__((always_inline)) static __inline__ CacheResult +cachesim_ref(cache_t2* c, Addr a, UChar size) +{ + UWord block1 = a >> c->line_size_bits; + UWord block2 = (a + size - 1) >> c->line_size_bits; + UInt set1 = block1 & c->sets_min_1; + /* the tag does not need to include bits specifying the set, + * but it can, and this saves instructions */ + UWord tag1 = block1; + + /* Access entirely within line. */ + if (block1 == block2) + return cachesim_setref(c, set1, tag1); + + /* Access straddles two lines. */ + else if (block1 + 1 == block2) { + UInt set2 = block2 & c->sets_min_1; + UWord tag2 = block2; + + /* the call updates cache structures as side effect */ + CacheResult res1 = cachesim_setref(c, set1, tag1); + CacheResult res2 = cachesim_setref(c, set2, tag2); + return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit; + + } else { + VG_(printf)("addr: %lx size: %u blocks: %lu %lu", a, size, block1, + block2); + VG_(tool_panic)("item straddles more than two cache sets"); + } + return Hit; +} + +static CacheModelResult cachesim_I1_ref(Addr a, UChar size) +{ + if (cachesim_ref(&I1, a, size) == Hit) + return L1_Hit; + if (cachesim_ref(&LL, a, size) == Hit) + return LL_Hit; + return MemAccess; +} + +static CacheModelResult cachesim_D1_ref(Addr a, UChar size) +{ + if (cachesim_ref(&D1, a, size) == Hit) + return L1_Hit; + if (cachesim_ref(&LL, a, size) == Hit) + return LL_Hit; + return MemAccess; +} + +/*------------------------------------------------------------*/ +/*--- Write Back Cache Simulation ---*/ +/*------------------------------------------------------------*/ + +/* + * More complex model: L1 Write-through, LL Write-back + * This needs to distinguish among read and write references. + * + * Simulator functions: + * CacheModelResult cachesim_I1_Read(Addr a, UChar size) + * CacheModelResult cachesim_D1_Read(Addr a, UChar size) + * CacheModelResult cachesim_D1_Write(Addr a, UChar size) + */ + +/* + * With write-back, result can be a miss evicting a dirty line + * The dirty state of a cache line is stored in Bit0 of the tag for + * this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference + * type (Read/Write), the line gets dirty on a write. + */ +__attribute__((always_inline)) static __inline__ CacheResult +cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag) +{ + int i, j; + UWord *set, tmp_tag; + + set = &(c->tags[set_no * c->assoc]); + + /* This loop is unrolled for just the first case, which is the most */ + /* common. 
We can't unroll any further because it would screw up */ + /* if we have a direct-mapped (1-way) cache. */ + if (tag == (set[0] & ~CACHELINE_DIRTY)) { + set[0] |= ref; + return Hit; + } + /* If the tag is one other than the MRU, move it into the MRU spot */ + /* and shuffle the rest down. */ + for (i = 1; i < c->assoc; i++) { + if (tag == (set[i] & ~CACHELINE_DIRTY)) { + tmp_tag = set[i] | ref; // update dirty flag + for (j = i; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tmp_tag; + return Hit; + } + } + + /* A miss; install this tag as MRU, shuffle rest down. */ + tmp_tag = set[c->assoc - 1]; + for (j = c->assoc - 1; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tag | ref; + + return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss; +} + +__attribute__((always_inline)) static __inline__ CacheResult +cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size) +{ + UInt set1 = (a >> c->line_size_bits) & (c->sets_min_1); + UInt set2 = ((a + size - 1) >> c->line_size_bits) & (c->sets_min_1); + UWord tag = a & c->tag_mask; + + /* Access entirely within line. */ + if (set1 == set2) + return cachesim_setref_wb(c, ref, set1, tag); + + /* Access straddles two lines. */ + /* Nb: this is a fast way of doing ((set1+1) % c->sets) */ + else if (((set1 + 1) & (c->sets_min_1)) == set2) { + UWord tag2 = (a + size - 1) & c->tag_mask; + + /* the call updates cache structures as side effect */ + CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag); + CacheResult res2 = cachesim_setref_wb(c, ref, set2, tag2); + + if ((res1 == MissDirty) || (res2 == MissDirty)) + return MissDirty; + return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit; + + } else { + VG_(printf)("addr: %lx size: %u sets: %u %u", a, size, set1, set2); + VG_(tool_panic)("item straddles more than two cache sets"); + } + return Hit; +} + +static CacheModelResult cachesim_I1_Read(Addr a, UChar size) +{ + if (cachesim_ref(&I1, a, size) == Hit) + return L1_Hit; + switch (cachesim_ref_wb(&LL, Read, a, size)) { + case Hit: + return LL_Hit; + case Miss: + return MemAccess; + default: + break; + } + return WriteBackMemAccess; +} + +static CacheModelResult cachesim_D1_Read(Addr a, UChar size) +{ + if (cachesim_ref(&D1, a, size) == Hit) + return L1_Hit; + switch (cachesim_ref_wb(&LL, Read, a, size)) { + case Hit: + return LL_Hit; + case Miss: + return MemAccess; + default: + break; + } + return WriteBackMemAccess; +} + +static CacheModelResult cachesim_D1_Write(Addr a, UChar size) +{ + if (cachesim_ref(&D1, a, size) == Hit) { + /* Even for a L1 hit, the write-trough L1 passes + * the write to the LL to make the LL line dirty. + * But this causes no latency, so return the hit. + */ + cachesim_ref_wb(&LL, Write, a, size); + return L1_Hit; + } + switch (cachesim_ref_wb(&LL, Write, a, size)) { + case Hit: + return LL_Hit; + case Miss: + return MemAccess; + default: + break; + } + return WriteBackMemAccess; +} + +/*------------------------------------------------------------*/ +/*--- Hardware Prefetch Simulation ---*/ +/*------------------------------------------------------------*/ + +static ULong prefetch_up = 0; +static ULong prefetch_down = 0; + +#define PF_STREAMS 8 +#define PF_PAGEBITS 12 + +static UInt pf_lastblock[PF_STREAMS]; +static Int pf_seqblocks[PF_STREAMS]; + +static void prefetch_clear(void) +{ + int i; + for (i = 0; i < PF_STREAMS; i++) + pf_lastblock[i] = pf_seqblocks[i] = 0; +} + +/* + * HW Prefetch emulation + * Start prefetching when detecting sequential access to 3 memory blocks. 
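+ * Once a stream is detected, every further sequential access also touches
+ * the LL line 5 blocks ahead (or behind) to emulate the prefetcher.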
+ * One stream can be detected per 4k page. + */ +static __inline__ void prefetch_LL_doref(Addr a) +{ + UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS; + UInt block = (a >> LL.line_size_bits); + + if (block != pf_lastblock[stream]) { + if (pf_seqblocks[stream] == 0) { + if (pf_lastblock[stream] + 1 == block) + pf_seqblocks[stream]++; + else if (pf_lastblock[stream] - 1 == block) + pf_seqblocks[stream]--; + } else if (pf_seqblocks[stream] > 0) { + if (pf_lastblock[stream] + 1 == block) { + pf_seqblocks[stream]++; + if (pf_seqblocks[stream] >= 2) { + prefetch_up++; + cachesim_ref(&LL, a + 5 * LL.line_size, 1); + } + } else + pf_seqblocks[stream] = 0; + } else if (pf_seqblocks[stream] < 0) { + if (pf_lastblock[stream] - 1 == block) { + pf_seqblocks[stream]--; + if (pf_seqblocks[stream] <= -2) { + prefetch_down++; + cachesim_ref(&LL, a - 5 * LL.line_size, 1); + } + } else + pf_seqblocks[stream] = 0; + } + pf_lastblock[stream] = block; + } +} + +/* simple model with hardware prefetch */ + +static CacheModelResult prefetch_I1_ref(Addr a, UChar size) +{ + if (cachesim_ref(&I1, a, size) == Hit) + return L1_Hit; + prefetch_LL_doref(a); + if (cachesim_ref(&LL, a, size) == Hit) + return LL_Hit; + return MemAccess; +} + +static CacheModelResult prefetch_D1_ref(Addr a, UChar size) +{ + if (cachesim_ref(&D1, a, size) == Hit) + return L1_Hit; + prefetch_LL_doref(a); + if (cachesim_ref(&LL, a, size) == Hit) + return LL_Hit; + return MemAccess; +} + +/* complex model with hardware prefetch */ + +static CacheModelResult prefetch_I1_Read(Addr a, UChar size) +{ + if (cachesim_ref(&I1, a, size) == Hit) + return L1_Hit; + prefetch_LL_doref(a); + switch (cachesim_ref_wb(&LL, Read, a, size)) { + case Hit: + return LL_Hit; + case Miss: + return MemAccess; + default: + break; + } + return WriteBackMemAccess; +} + +static CacheModelResult prefetch_D1_Read(Addr a, UChar size) +{ + if (cachesim_ref(&D1, a, size) == Hit) + return L1_Hit; + prefetch_LL_doref(a); + switch (cachesim_ref_wb(&LL, Read, a, size)) { + case Hit: + return LL_Hit; + case Miss: + return MemAccess; + default: + break; + } + return WriteBackMemAccess; +} + +static CacheModelResult prefetch_D1_Write(Addr a, UChar size) +{ + prefetch_LL_doref(a); + if (cachesim_ref(&D1, a, size) == Hit) { + /* Even for a L1 hit, the write-trough L1 passes + * the write to the LL to make the LL line dirty. + * But this causes no latency, so return the hit. 
+ */ + cachesim_ref_wb(&LL, Write, a, size); + return L1_Hit; + } + switch (cachesim_ref_wb(&LL, Write, a, size)) { + case Hit: + return LL_Hit; + case Miss: + return MemAccess; + default: + break; + } + return WriteBackMemAccess; +} + +/*------------------------------------------------------------*/ +/*--- Cache Simulation with use metric collection ---*/ +/*------------------------------------------------------------*/ + +/* can not be combined with write-back or prefetch */ + +static void cacheuse_initcache(cache_t2* c) +{ + int i; + unsigned int start_mask, start_val; + unsigned int end_mask, end_val; + + c->use = TG_MALLOC("cl.sim.cu_ic.1", sizeof(line_use) * c->sets * c->assoc); + c->loaded = + TG_MALLOC("cl.sim.cu_ic.2", sizeof(line_loaded) * c->sets * c->assoc); + c->line_start_mask = TG_MALLOC("cl.sim.cu_ic.3", sizeof(int) * c->line_size); + c->line_end_mask = TG_MALLOC("cl.sim.cu_ic.4", sizeof(int) * c->line_size); + + c->line_size_mask = c->line_size - 1; + + /* Meaning of line_start_mask/line_end_mask + * Example: for a given cache line, you get an access starting at + * byte offset 5, length 4, byte 5 - 8 was touched. For a cache + * line size of 32, you have 1 bit per byte in the mask: + * + * bit31 bit8 bit5 bit 0 + * | | | | + * 11..111111100000 line_start_mask[5] + * 00..000111111111 line_end_mask[(5+4)-1] + * + * use_mask |= line_start_mask[5] && line_end_mask[8] + * + */ + start_val = end_val = ~0; + if (c->line_size < 32) { + int bits_per_byte = 32 / c->line_size; + start_mask = (1 << bits_per_byte) - 1; + end_mask = start_mask << (32 - bits_per_byte); + for (i = 0; i < c->line_size; i++) { + c->line_start_mask[i] = start_val; + start_val = start_val & ~start_mask; + start_mask = start_mask << bits_per_byte; + + c->line_end_mask[c->line_size - i - 1] = end_val; + end_val = end_val & ~end_mask; + end_mask = end_mask >> bits_per_byte; + } + } else { + int bytes_per_bit = c->line_size / 32; + start_mask = 1; + end_mask = 1u << 31; + for (i = 0; i < c->line_size; i++) { + c->line_start_mask[i] = start_val; + c->line_end_mask[c->line_size - i - 1] = end_val; + if (((i + 1) % bytes_per_bit) == 0) { + start_val &= ~start_mask; + end_val &= ~end_mask; + start_mask <<= 1; + end_mask >>= 1; + } + } + } + + TG_DEBUG(6, "Config %s:\n", c->desc_line); + for (i = 0; i < c->line_size; i++) { + TG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n", i, + (UInt)c->line_start_mask[i], (UInt)c->line_end_mask[i]); + } + + /* We use lower tag bits as offset pointers to cache use info. + * I.e. some cache parameters don't work. + */ + if ((1 << c->tag_shift) < c->assoc) { + VG_(message)(Vg_DebugMsg, + "error: Use associativity < %d for cache use statistics!\n", + (1 << c->tag_shift)); + VG_(tool_panic)("Unsupported cache configuration"); + } +} + +/* for I1/D1 caches */ +#define CACHEUSE(L) \ + \ + static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \ + { \ + UInt set1 = (a >> L.line_size_bits) & (L.sets_min_1); \ + UInt set2 = ((a + size - 1) >> L.line_size_bits) & (L.sets_min_1); \ + UWord tag = a & L.tag_mask; \ + UWord tag2; \ + int i, j, idx; \ + UWord *set, tmp_tag; \ + UInt use_mask; \ + \ + TG_DEBUG(6, "%s.Acc(Addr %#lx, size %d): Sets [%u/%u]\n", L.name, a, \ + size, set1, set2); \ + \ + /* First case: word entirely within line. 
*/ \ + if (set1 == set2) { \ + \ + set = &(L.tags[set1 * L.assoc]); \ + use_mask = L.line_start_mask[a & L.line_size_mask] & \ + L.line_end_mask[(a + size - 1) & L.line_size_mask]; \ + \ + /* This loop is unrolled for just the first case, which is the most \ + */ \ + /* common. We can't unroll any further because it would screw up */ \ + /* if we have a direct-mapped (1-way) cache. */ \ + if (tag == (set[0] & L.tag_mask)) { \ + idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \ + L.use[idx].count++; \ + L.use[idx].mask |= use_mask; \ + TG_DEBUG( \ + 6, \ + " Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n", \ + idx, L.loaded[idx].memline, L.loaded[idx].iaddr, use_mask, \ + L.use[idx].mask, L.use[idx].count); \ + return L1_Hit; \ + } \ + /* If the tag is one other than the MRU, move it into the MRU spot */ \ + /* and shuffle the rest down. */ \ + for (i = 1; i < L.assoc; i++) { \ + if (tag == (set[i] & L.tag_mask)) { \ + tmp_tag = set[i]; \ + for (j = i; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tmp_tag; \ + idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \ + L.use[idx].count++; \ + L.use[idx].mask |= use_mask; \ + TG_DEBUG(6, \ + " Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, " \ + "count %u\n", \ + i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ + use_mask, L.use[idx].mask, L.use[idx].count); \ + return L1_Hit; \ + } \ + } \ + \ + /* A miss; install this tag as MRU, shuffle rest down. */ \ + tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \ + for (j = L.assoc - 1; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tag | tmp_tag; \ + idx = (set1 * L.assoc) + tmp_tag; \ + return update_##L##_use(&L, idx, use_mask, a & ~L.line_size_mask); \ + \ + /* Second case: word straddles two lines. */ \ + /* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \ + } else if (((set1 + 1) & (L.sets_min_1)) == set2) { \ + Int miss1 = 0, miss2 = 0; /* 0: L1 hit, 1:L1 miss, 2:LL miss */ \ + set = &(L.tags[set1 * L.assoc]); \ + use_mask = L.line_start_mask[a & L.line_size_mask]; \ + if (tag == (set[0] & L.tag_mask)) { \ + idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \ + L.use[idx].count++; \ + L.use[idx].mask |= use_mask; \ + TG_DEBUG( \ + 6, \ + " Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n", \ + idx, L.loaded[idx].memline, L.loaded[idx].iaddr, use_mask, \ + L.use[idx].mask, L.use[idx].count); \ + goto block2; \ + } \ + for (i = 1; i < L.assoc; i++) { \ + if (tag == (set[i] & L.tag_mask)) { \ + tmp_tag = set[i]; \ + for (j = i; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tmp_tag; \ + idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \ + L.use[idx].count++; \ + L.use[idx].mask |= use_mask; \ + TG_DEBUG(6, \ + " Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, " \ + "count %u\n", \ + i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ + use_mask, L.use[idx].mask, L.use[idx].count); \ + goto block2; \ + } \ + } \ + tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \ + for (j = L.assoc - 1; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tag | tmp_tag; \ + idx = (set1 * L.assoc) + tmp_tag; \ + miss1 = update_##L##_use(&L, idx, use_mask, a & ~L.line_size_mask); \ + block2: \ + set = &(L.tags[set2 * L.assoc]); \ + use_mask = L.line_end_mask[(a + size - 1) & L.line_size_mask]; \ + tag2 = (a + size - 1) & L.tag_mask; \ + if (tag2 == (set[0] & L.tag_mask)) { \ + idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask); \ + L.use[idx].count++; \ + L.use[idx].mask |= use_mask; \ + TG_DEBUG( \ + 6, \ + " Hit0 [idx %d] (line %#lx from %#lx): %x => 
%08x, count %u\n", \ + idx, L.loaded[idx].memline, L.loaded[idx].iaddr, use_mask, \ + L.use[idx].mask, L.use[idx].count); \ + return miss1; \ + } \ + for (i = 1; i < L.assoc; i++) { \ + if (tag2 == (set[i] & L.tag_mask)) { \ + tmp_tag = set[i]; \ + for (j = i; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tmp_tag; \ + idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask); \ + L.use[idx].count++; \ + L.use[idx].mask |= use_mask; \ + TG_DEBUG(6, \ + " Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, " \ + "count %u\n", \ + i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ + use_mask, L.use[idx].mask, L.use[idx].count); \ + return miss1; \ + } \ + } \ + tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \ + for (j = L.assoc - 1; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tag2 | tmp_tag; \ + idx = (set2 * L.assoc) + tmp_tag; \ + miss2 = update_##L##_use(&L, idx, use_mask, \ + (a + size - 1) & ~L.line_size_mask); \ + return (miss1 == MemAccess || miss2 == MemAccess) ? MemAccess \ + : LL_Hit; \ + \ + } else { \ + VG_(printf)("addr: %#lx size: %u sets: %u %u", a, size, set1, \ + set2); \ + VG_(tool_panic)("item straddles more than two cache sets"); \ + } \ + return 0; \ + } + +/* logarithmic bitcounting algorithm, see + * http://graphics.stanford.edu/~seander/bithacks.html + */ +static __inline__ unsigned int countBits(unsigned int bits) +{ + unsigned int c; // store the total here + const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers + const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF}; + + c = bits; + c = ((c >> S[0]) & B[0]) + (c & B[0]); + c = ((c >> S[1]) & B[1]) + (c & B[1]); + c = ((c >> S[2]) & B[2]) + (c & B[2]); + c = ((c >> S[3]) & B[3]) + (c & B[3]); + c = ((c >> S[4]) & B[4]) + (c & B[4]); + return c; +} + +static void update_LL_use(int idx, Addr memline) +{ + line_loaded* loaded = &(LL.loaded[idx]); + line_use* use = &(LL.use[idx]); + int i = ((32 - countBits(use->mask)) * LL.line_size) >> 5; + + TG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n", idx, + TG_(bb_base) + current_ii->instr_offset, memline); + if (use->count > 0) { + TG_DEBUG(2, + " old: used %u, loss bits %d (%08x) [line %#lx from %#lx]\n", + use->count, i, use->mask, loaded->memline, loaded->iaddr); + TG_DEBUG(2, " collect: %d, use_base %p\n", TG_(current_state).collect, + loaded->use_base); + + if (TG_(current_state).collect && loaded->use_base) { + (loaded->use_base)[off_LL_AcCost] += 1000 / use->count; + (loaded->use_base)[off_LL_SpLoss] += i; + } + } + + use->count = 0; + use->mask = 0; + + loaded->memline = memline; + loaded->iaddr = TG_(bb_base) + current_ii->instr_offset; + loaded->use_base = (TG_(current_state).nonskipped) + ? 
TG_(current_state).nonskipped->skipped + : TG_(cost_base) + current_ii->cost_offset; +} + +static CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded) +{ + UInt setNo = (memline >> LL.line_size_bits) & (LL.sets_min_1); + UWord* set = &(LL.tags[setNo * LL.assoc]); + UWord tag = memline & LL.tag_mask; + + int i, j, idx; + UWord tmp_tag; + + TG_DEBUG(6, "LL.Acc(Memline %#lx): Set %u\n", memline, setNo); + + if (tag == (set[0] & LL.tag_mask)) { + idx = (setNo * LL.assoc) + (set[0] & ~LL.tag_mask); + l1_loaded->dep_use = &(LL.use[idx]); + + TG_DEBUG(6, " Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %u\n", + idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr, + LL.use[idx].mask, LL.use[idx].count); + return LL_Hit; + } + for (i = 1; i < LL.assoc; i++) { + if (tag == (set[i] & LL.tag_mask)) { + tmp_tag = set[i]; + for (j = i; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tmp_tag; + idx = (setNo * LL.assoc) + (tmp_tag & ~LL.tag_mask); + l1_loaded->dep_use = &(LL.use[idx]); + + TG_DEBUG(6, + " Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %u\n", + i, idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr, + LL.use[idx].mask, LL.use[idx].count); + return LL_Hit; + } + } + + /* A miss; install this tag as MRU, shuffle rest down. */ + tmp_tag = set[LL.assoc - 1] & ~LL.tag_mask; + for (j = LL.assoc - 1; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tag | tmp_tag; + idx = (setNo * LL.assoc) + tmp_tag; + l1_loaded->dep_use = &(LL.use[idx]); + + update_LL_use(idx, memline); + + return MemAccess; +} + +#define UPDATE_USE(L) \ + \ + static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \ + UInt mask, Addr memline) \ + { \ + line_loaded* loaded = &(cache->loaded[idx]); \ + line_use* use = &(cache->use[idx]); \ + int c = ((32 - countBits(use->mask)) * cache->line_size) >> 5; \ + \ + TG_DEBUG(2, \ + " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \ + cache->name, idx, TG_(bb_base) + current_ii->instr_offset, \ + memline, mask); \ + if (use->count > 0) { \ + TG_DEBUG( \ + 2, " old: used %u, loss bits %d (%08x) [line %#lx from %#lx]\n", \ + use->count, c, use->mask, loaded->memline, loaded->iaddr); \ + TG_DEBUG(2, " collect: %d, use_base %p\n", \ + TG_(current_state).collect, loaded->use_base); \ + \ + if (TG_(current_state).collect && loaded->use_base) { \ + (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count; \ + (loaded->use_base)[off_##L##_SpLoss] += c; \ + \ + /* FIXME (?): L1/LL line sizes must be equal ! */ \ + loaded->dep_use->mask |= use->mask; \ + loaded->dep_use->count += use->count; \ + } \ + } \ + \ + use->count = 1; \ + use->mask = mask; \ + loaded->memline = memline; \ + loaded->iaddr = TG_(bb_base) + current_ii->instr_offset; \ + loaded->use_base = (TG_(current_state).nonskipped) \ + ? 
TG_(current_state).nonskipped->skipped \ + : TG_(cost_base) + current_ii->cost_offset; \ + \ + if (memline == 0) \ + return LL_Hit; \ + return cacheuse_LL_access(memline, loaded); \ + } + +UPDATE_USE(I1); +UPDATE_USE(D1); + +CACHEUSE(I1); +CACHEUSE(D1); + +static void cacheuse_finish(void) +{ + int i; + InstrInfo ii = {0, 0, 0, 0}; + + if (!TG_(current_state).collect) + return; + + TG_(bb_base) = 0; + current_ii = ⅈ /* needs to be set for update_XX_use */ + TG_(cost_base) = 0; + + /* update usage counters */ + if (I1.use) + for (i = 0; i < I1.sets * I1.assoc; i++) + if (I1.loaded[i].use_base) + update_I1_use(&I1, i, 0, 0); + + if (D1.use) + for (i = 0; i < D1.sets * D1.assoc; i++) + if (D1.loaded[i].use_base) + update_D1_use(&D1, i, 0, 0); + + if (LL.use) + for (i = 0; i < LL.sets * LL.assoc; i++) + if (LL.loaded[i].use_base) + update_LL_use(i, 0); + + current_ii = 0; +} + +/*------------------------------------------------------------*/ +/*--- Helper functions called by instrumented code ---*/ +/*------------------------------------------------------------*/ + +static __inline__ void inc_costs(CacheModelResult r, ULong* c1, ULong* c2) +{ + switch (r) { + case WriteBackMemAccess: + if (clo_simulate_writeback) { + c1[3]++; + c2[3]++; + } + // fall through + + case MemAccess: + c1[2]++; + c2[2]++; + // fall through + + case LL_Hit: + c1[1]++; + c2[1]++; + // fall through + + default: + c1[0]++; + c2[0]++; + } +} + +static const HChar* cacheRes(CacheModelResult r) +{ + switch (r) { + case L1_Hit: + return "L1 Hit "; + case LL_Hit: + return "LL Hit "; + case MemAccess: + return "LL Miss"; + case WriteBackMemAccess: + return "LL Miss (dirty)"; + default: + tl_assert(0); + } + return "??"; +} + +VG_REGPARM(1) +static void log_1I0D(InstrInfo* ii) +{ + CacheModelResult IrRes; + + current_ii = ii; + IrRes = + (*simulator.I1_Read)(TG_(bb_base) + ii->instr_offset, ii->instr_size); + + TG_DEBUG(6, "log_1I0D: Ir %#lx/%u => %s\n", + TG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes)); + + if (TG_(current_state).collect) { + ULong* cost_Ir; + + if (TG_(current_state).nonskipped) + cost_Ir = TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); + else + cost_Ir = + TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR]; + + inc_costs(IrRes, cost_Ir, TG_(current_state).cost + fullOffset(EG_IR)); + } +} + +VG_REGPARM(2) +static void log_2I0D(InstrInfo* ii1, InstrInfo* ii2) +{ + CacheModelResult Ir1Res, Ir2Res; + ULong* global_cost_Ir; + + current_ii = ii1; + Ir1Res = + (*simulator.I1_Read)(TG_(bb_base) + ii1->instr_offset, ii1->instr_size); + current_ii = ii2; + Ir2Res = + (*simulator.I1_Read)(TG_(bb_base) + ii2->instr_offset, ii2->instr_size); + + TG_DEBUG(6, "log_2I0D: Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n", + TG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res), + TG_(bb_base) + ii2->instr_offset, ii2->instr_size, + cacheRes(Ir2Res)); + + if (!TG_(current_state).collect) + return; + + global_cost_Ir = TG_(current_state).cost + fullOffset(EG_IR); + if (TG_(current_state).nonskipped) { + ULong* skipped_cost_Ir = + TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); + + inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir); + inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir); + return; + } + + inc_costs(Ir1Res, global_cost_Ir, + TG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]); + inc_costs(Ir2Res, global_cost_Ir, + TG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]); +} + +VG_REGPARM(3) +static void log_3I0D(InstrInfo* ii1, 
InstrInfo* ii2, InstrInfo* ii3) +{ + CacheModelResult Ir1Res, Ir2Res, Ir3Res; + ULong* global_cost_Ir; + + current_ii = ii1; + Ir1Res = + (*simulator.I1_Read)(TG_(bb_base) + ii1->instr_offset, ii1->instr_size); + current_ii = ii2; + Ir2Res = + (*simulator.I1_Read)(TG_(bb_base) + ii2->instr_offset, ii2->instr_size); + current_ii = ii3; + Ir3Res = + (*simulator.I1_Read)(TG_(bb_base) + ii3->instr_offset, ii3->instr_size); + + TG_DEBUG( + 6, "log_3I0D: Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n", + TG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res), + TG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res), + TG_(bb_base) + ii3->instr_offset, ii3->instr_size, cacheRes(Ir3Res)); + + if (!TG_(current_state).collect) + return; + + global_cost_Ir = TG_(current_state).cost + fullOffset(EG_IR); + if (TG_(current_state).nonskipped) { + ULong* skipped_cost_Ir = + TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); + inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir); + inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir); + inc_costs(Ir3Res, global_cost_Ir, skipped_cost_Ir); + return; + } + + inc_costs(Ir1Res, global_cost_Ir, + TG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]); + inc_costs(Ir2Res, global_cost_Ir, + TG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]); + inc_costs(Ir3Res, global_cost_Ir, + TG_(cost_base) + ii3->cost_offset + ii3->eventset->offset[EG_IR]); +} + +/* Instruction doing a read access */ + +VG_REGPARM(3) +static void log_1I1Dr(InstrInfo* ii, Addr data_addr, Word data_size) +{ + CacheModelResult IrRes, DrRes; + + current_ii = ii; + IrRes = + (*simulator.I1_Read)(TG_(bb_base) + ii->instr_offset, ii->instr_size); + DrRes = (*simulator.D1_Read)(data_addr, data_size); + + TG_DEBUG(6, "log_1I1Dr: Ir %#lx/%u => %s, Dr %#lx/%ld => %s\n", + TG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes), + data_addr, data_size, cacheRes(DrRes)); + + if (TG_(current_state).collect) { + ULong *cost_Ir, *cost_Dr; + + if (TG_(current_state).nonskipped) { + cost_Ir = TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); + cost_Dr = TG_(current_state).nonskipped->skipped + fullOffset(EG_DR); + } else { + cost_Ir = + TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR]; + cost_Dr = + TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR]; + } + + inc_costs(IrRes, cost_Ir, TG_(current_state).cost + fullOffset(EG_IR)); + inc_costs(DrRes, cost_Dr, TG_(current_state).cost + fullOffset(EG_DR)); + } +} + +/* Note that addEvent_D_guarded assumes that log_0I1Dr and log_0I1Dw + have exactly the same prototype. If you change them, you must + change addEvent_D_guarded too. 
*/ +VG_REGPARM(3) +static void log_0I1Dr(InstrInfo* ii, Addr data_addr, Word data_size) +{ + CacheModelResult DrRes; + + current_ii = ii; + DrRes = (*simulator.D1_Read)(data_addr, data_size); + + TG_DEBUG(6, "log_0I1Dr: Dr %#lx/%ld => %s\n", data_addr, data_size, + cacheRes(DrRes)); + + if (TG_(current_state).collect) { + ULong* cost_Dr; + + if (TG_(current_state).nonskipped) + cost_Dr = TG_(current_state).nonskipped->skipped + fullOffset(EG_DR); + else + cost_Dr = + TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR]; + + inc_costs(DrRes, cost_Dr, TG_(current_state).cost + fullOffset(EG_DR)); + } +} + +/* Instruction doing a write access */ + +VG_REGPARM(3) +static void log_1I1Dw(InstrInfo* ii, Addr data_addr, Word data_size) +{ + CacheModelResult IrRes, DwRes; + + current_ii = ii; + IrRes = + (*simulator.I1_Read)(TG_(bb_base) + ii->instr_offset, ii->instr_size); + DwRes = (*simulator.D1_Write)(data_addr, data_size); + + TG_DEBUG(6, "log_1I1Dw: Ir %#lx/%u => %s, Dw %#lx/%ld => %s\n", + TG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes), + data_addr, data_size, cacheRes(DwRes)); + + if (TG_(current_state).collect) { + ULong *cost_Ir, *cost_Dw; + + if (TG_(current_state).nonskipped) { + cost_Ir = TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); + cost_Dw = TG_(current_state).nonskipped->skipped + fullOffset(EG_DW); + } else { + cost_Ir = + TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR]; + cost_Dw = + TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW]; + } + + inc_costs(IrRes, cost_Ir, TG_(current_state).cost + fullOffset(EG_IR)); + inc_costs(DwRes, cost_Dw, TG_(current_state).cost + fullOffset(EG_DW)); + } +} + +/* See comment on log_0I1Dr. */ +VG_REGPARM(3) +static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size) +{ + CacheModelResult DwRes; + + current_ii = ii; + DwRes = (*simulator.D1_Write)(data_addr, data_size); + + TG_DEBUG(6, "log_0I1Dw: Dw %#lx/%ld => %s\n", data_addr, data_size, + cacheRes(DwRes)); + + if (TG_(current_state).collect) { + ULong* cost_Dw; + + if (TG_(current_state).nonskipped) + cost_Dw = TG_(current_state).nonskipped->skipped + fullOffset(EG_DW); + else + cost_Dw = + TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW]; + + inc_costs(DwRes, cost_Dw, TG_(current_state).cost + fullOffset(EG_DW)); + } +} + +/*------------------------------------------------------------*/ +/*--- Cache configuration ---*/ +/*------------------------------------------------------------*/ + +static cache_t clo_I1_cache = UNDEFINED_CACHE; +static cache_t clo_D1_cache = UNDEFINED_CACHE; +static cache_t clo_LL_cache = UNDEFINED_CACHE; + +/* Initialize and clear simulator state */ +static void cachesim_post_clo_init(void) +{ + /* Cache configurations. 
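+     (Taken from --I1/--D1/--LL when given, otherwise chosen by the core.)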
*/ + cache_t I1c, D1c, LLc; + + /* Initialize access handlers */ + if (!TG_(clo).simulate_cache) { + TG_(cachesim).log_1I0D = 0; + TG_(cachesim).log_1I0D_name = "(no function)"; + TG_(cachesim).log_2I0D = 0; + TG_(cachesim).log_2I0D_name = "(no function)"; + TG_(cachesim).log_3I0D = 0; + TG_(cachesim).log_3I0D_name = "(no function)"; + + TG_(cachesim).log_1I1Dr = 0; + TG_(cachesim).log_1I1Dr_name = "(no function)"; + TG_(cachesim).log_1I1Dw = 0; + TG_(cachesim).log_1I1Dw_name = "(no function)"; + + TG_(cachesim).log_0I1Dr = 0; + TG_(cachesim).log_0I1Dr_name = "(no function)"; + TG_(cachesim).log_0I1Dw = 0; + TG_(cachesim).log_0I1Dw_name = "(no function)"; + return; + } + + /* Configuration of caches only needed with real cache simulation */ + VG_(post_clo_init_configure_caches)(&I1c, &D1c, &LLc, &clo_I1_cache, + &clo_D1_cache, &clo_LL_cache); + + I1.name = "I1"; + D1.name = "D1"; + LL.name = "LL"; + + // min_line_size is used to make sure that we never feed + // accesses to the simulator straddling more than two + // cache lines at any cache level + TG_(min_line_size) = + (I1c.line_size < D1c.line_size) ? I1c.line_size : D1c.line_size; + TG_(min_line_size) = + (LLc.line_size < TG_(min_line_size)) ? LLc.line_size : TG_(min_line_size); + + Int largest_load_or_store_size = + VG_(machine_get_size_of_largest_guest_register)(); + if (TG_(min_line_size) < largest_load_or_store_size) { + /* We can't continue, because the cache simulation might + straddle more than 2 lines, and it will assert. So let's + just stop before we start. */ + VG_(umsg)("Tracegrind: cannot continue: the minimum line size (%d)\n", + (Int)TG_(min_line_size)); + VG_(umsg)( + " must be equal to or larger than the maximum register size (%d)\n", + largest_load_or_store_size); + VG_(umsg)(" but it is not. 
Exiting now.\n"); + VG_(exit)(1); + } + + cachesim_initcache(I1c, &I1); + cachesim_initcache(D1c, &D1); + cachesim_initcache(LLc, &LL); + + /* the other cache simulators use the standard helpers + * with dispatching via simulator struct */ + + TG_(cachesim).log_1I0D = log_1I0D; + TG_(cachesim).log_1I0D_name = "log_1I0D"; + TG_(cachesim).log_2I0D = log_2I0D; + TG_(cachesim).log_2I0D_name = "log_2I0D"; + TG_(cachesim).log_3I0D = log_3I0D; + TG_(cachesim).log_3I0D_name = "log_3I0D"; + + TG_(cachesim).log_1I1Dr = log_1I1Dr; + TG_(cachesim).log_1I1Dw = log_1I1Dw; + TG_(cachesim).log_1I1Dr_name = "log_1I1Dr"; + TG_(cachesim).log_1I1Dw_name = "log_1I1Dw"; + + TG_(cachesim).log_0I1Dr = log_0I1Dr; + TG_(cachesim).log_0I1Dw = log_0I1Dw; + TG_(cachesim).log_0I1Dr_name = "log_0I1Dr"; + TG_(cachesim).log_0I1Dw_name = "log_0I1Dw"; + + if (clo_collect_cacheuse) { + + /* Output warning for not supported option combinations */ + if (clo_simulate_hwpref) { + VG_(message)(Vg_DebugMsg, "warning: prefetch simulation can not be " + "used with cache usage\n"); + clo_simulate_hwpref = False; + } + + if (clo_simulate_writeback) { + VG_(message)(Vg_DebugMsg, "warning: write-back simulation can not be " + "used with cache usage\n"); + clo_simulate_writeback = False; + } + + simulator.I1_Read = cacheuse_I1_doRead; + simulator.D1_Read = cacheuse_D1_doRead; + simulator.D1_Write = cacheuse_D1_doRead; + return; + } + + if (clo_simulate_hwpref) { + prefetch_clear(); + + if (clo_simulate_writeback) { + simulator.I1_Read = prefetch_I1_Read; + simulator.D1_Read = prefetch_D1_Read; + simulator.D1_Write = prefetch_D1_Write; + } else { + simulator.I1_Read = prefetch_I1_ref; + simulator.D1_Read = prefetch_D1_ref; + simulator.D1_Write = prefetch_D1_ref; + } + + return; + } + + if (clo_simulate_writeback) { + simulator.I1_Read = cachesim_I1_Read; + simulator.D1_Read = cachesim_D1_Read; + simulator.D1_Write = cachesim_D1_Write; + } else { + simulator.I1_Read = cachesim_I1_ref; + simulator.D1_Read = cachesim_D1_ref; + simulator.D1_Write = cachesim_D1_ref; + } +} + +/* Clear simulator state. Has to be initialized before */ +static void cachesim_clear(void) +{ + cachesim_clearcache(&I1); + cachesim_clearcache(&D1); + cachesim_clearcache(&LL); + + prefetch_clear(); +} + +static void cachesim_print_opts(void) +{ + VG_(printf)( + "\n cache simulator options (does cache simulation if used):\n" + " --simulate-wb=no|yes Count write-back events [no]\n" + " --simulate-hwpref=no|yes Simulate hardware prefetch [no]\n" +#if TG_EXPERIMENTAL + " --simulate-sectors=no|yes Simulate sectored behaviour [no]\n" +#endif + " --cacheuse=no|yes Collect cache block use [no]\n"); + VG_(print_cache_clo_opts)(); +} + +/* Check for command line option for cache configuration. + * Return False if unknown and not handled. 
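+ * Handles --simulate-wb, --simulate-hwpref, --simulate-sectors, --cacheuse
+ * and the cache geometry options (--I1/--D1/--LL).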
+ * + * Called from TG_(process_cmd_line_option)() in clo.c + */ +static Bool cachesim_parse_opt(const HChar* arg) +{ + if VG_BOOL_CLO (arg, "--simulate-wb", clo_simulate_writeback) { + } else if VG_BOOL_CLO (arg, "--simulate-hwpref", clo_simulate_hwpref) { + } else if VG_BOOL_CLO (arg, "--simulate-sectors", clo_simulate_sectors) { + } + + else if VG_BOOL_CLO (arg, "--cacheuse", clo_collect_cacheuse) { + } + + else if (VG_(str_clo_cache_opt)(arg, &clo_I1_cache, &clo_D1_cache, + &clo_LL_cache)) { + } + + else + return False; + + return True; +} + +static void cachesim_printstat(Int l1, Int l2, Int l3) +{ + FullCost total = TG_(total_cost), D_total = 0; + ULong LL_total_m, LL_total_mr, LL_total_mw, LL_total, LL_total_r, LL_total_w; + + if ((VG_(clo_verbosity) > 1) && clo_simulate_hwpref) { + VG_(message)(Vg_DebugMsg, "Prefetch Up: %llu\n", prefetch_up); + VG_(message)(Vg_DebugMsg, "Prefetch Down: %llu\n", prefetch_down); + VG_(message)(Vg_DebugMsg, "\n"); + } + + VG_(message)(Vg_UserMsg, "I1 misses: %'*llu\n", l1, + total[fullOffset(EG_IR) + 1]); + + VG_(message)(Vg_UserMsg, "LLi misses: %'*llu\n", l1, + total[fullOffset(EG_IR) + 2]); + + if (0 == total[fullOffset(EG_IR)]) + total[fullOffset(EG_IR)] = 1; + + VG_(message)(Vg_UserMsg, "I1 miss rate: %*.2f%%\n", l1, + total[fullOffset(EG_IR) + 1] * 100.0 / + total[fullOffset(EG_IR)]); + + VG_(message)(Vg_UserMsg, "LLi miss rate: %*.2f%%\n", l1, + total[fullOffset(EG_IR) + 2] * 100.0 / + total[fullOffset(EG_IR)]); + + VG_(message)(Vg_UserMsg, "\n"); + + /* D cache results. + Use the D_refs.rd and D_refs.wr values to determine the + * width of columns 2 & 3. */ + + D_total = TG_(get_eventset_cost)(TG_(sets).full); + TG_(init_cost)(TG_(sets).full, D_total); + // we only use the first 3 values of D_total, adding up Dr and Dw costs + TG_(copy_cost) + (TG_(get_event_set)(EG_DR), D_total, total + fullOffset(EG_DR)); + TG_(add_cost)(TG_(get_event_set)(EG_DW), D_total, total + fullOffset(EG_DW)); + + VG_(message)(Vg_UserMsg, "D refs: %'*llu (%'*llu rd + %'*llu wr)\n", + l1, D_total[0], l2, total[fullOffset(EG_DR)], l3, + total[fullOffset(EG_DW)]); + + VG_(message)(Vg_UserMsg, "D1 misses: %'*llu (%'*llu rd + %'*llu wr)\n", + l1, D_total[1], l2, total[fullOffset(EG_DR) + 1], l3, + total[fullOffset(EG_DW) + 1]); + + VG_(message)(Vg_UserMsg, "LLd misses: %'*llu (%'*llu rd + %'*llu wr)\n", + l1, D_total[2], l2, total[fullOffset(EG_DR) + 2], l3, + total[fullOffset(EG_DW) + 2]); + + if (0 == D_total[0]) + D_total[0] = 1; + if (0 == total[fullOffset(EG_DR)]) + total[fullOffset(EG_DR)] = 1; + if (0 == total[fullOffset(EG_DW)]) + total[fullOffset(EG_DW)] = 1; + + VG_(message)( + Vg_UserMsg, "D1 miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n", l1, + D_total[1] * 100.0 / D_total[0], l2, + total[fullOffset(EG_DR) + 1] * 100.0 / total[fullOffset(EG_DR)], l3, + total[fullOffset(EG_DW) + 1] * 100.0 / total[fullOffset(EG_DW)]); + + VG_(message)( + Vg_UserMsg, "LLd miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n", l1, + D_total[2] * 100.0 / D_total[0], l2, + total[fullOffset(EG_DR) + 2] * 100.0 / total[fullOffset(EG_DR)], l3, + total[fullOffset(EG_DW) + 2] * 100.0 / total[fullOffset(EG_DW)]); + VG_(message)(Vg_UserMsg, "\n"); + + /* LL overall results */ + + LL_total = total[fullOffset(EG_DR) + 1] + total[fullOffset(EG_DW) + 1] + + total[fullOffset(EG_IR) + 1]; + LL_total_r = total[fullOffset(EG_DR) + 1] + total[fullOffset(EG_IR) + 1]; + LL_total_w = total[fullOffset(EG_DW) + 1]; + VG_(message)(Vg_UserMsg, "LL refs: %'*llu (%'*llu rd + %'*llu wr)\n", + l1, LL_total, l2, LL_total_r, 
l3, LL_total_w); + + LL_total_m = total[fullOffset(EG_DR) + 2] + total[fullOffset(EG_DW) + 2] + + total[fullOffset(EG_IR) + 2]; + LL_total_mr = total[fullOffset(EG_DR) + 2] + total[fullOffset(EG_IR) + 2]; + LL_total_mw = total[fullOffset(EG_DW) + 2]; + VG_(message)(Vg_UserMsg, "LL misses: %'*llu (%'*llu rd + %'*llu wr)\n", + l1, LL_total_m, l2, LL_total_mr, l3, LL_total_mw); + + VG_(message)( + Vg_UserMsg, "LL miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n", l1, + LL_total_m * 100.0 / (total[fullOffset(EG_IR)] + D_total[0]), l2, + LL_total_mr * 100.0 / + (total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]), + l3, LL_total_mw * 100.0 / total[fullOffset(EG_DW)]); +} + +/*------------------------------------------------------------*/ +/*--- Setup for Event set. ---*/ +/*------------------------------------------------------------*/ + +struct event_sets TG_(sets); + +void TG_(init_eventsets)(void) +{ + // Event groups from which the event sets are composed + // the "Use" group only is used with "cacheuse" simulation + if (clo_collect_cacheuse) + TG_(register_event_group4) + (EG_USE, "AcCost1", "SpLoss1", "AcCost2", "SpLoss2"); + + if (!TG_(clo).simulate_cache) + TG_(register_event_group)(EG_IR, "Ir"); + else if (!clo_simulate_writeback) { + TG_(register_event_group3)(EG_IR, "Ir", "I1mr", "ILmr"); + TG_(register_event_group3)(EG_DR, "Dr", "D1mr", "DLmr"); + TG_(register_event_group3)(EG_DW, "Dw", "D1mw", "DLmw"); + } else { // clo_simulate_writeback + TG_(register_event_group4)(EG_IR, "Ir", "I1mr", "ILmr", "ILdmr"); + TG_(register_event_group4)(EG_DR, "Dr", "D1mr", "DLmr", "DLdmr"); + TG_(register_event_group4)(EG_DW, "Dw", "D1mw", "DLmw", "DLdmw"); + } + + if (TG_(clo).simulate_branch) { + TG_(register_event_group2)(EG_BC, "Bc", "Bcm"); + TG_(register_event_group2)(EG_BI, "Bi", "Bim"); + } + + if (TG_(clo).collect_bus) + TG_(register_event_group)(EG_BUS, "Ge"); + + if (TG_(clo).collect_systime != systime_no) { + if (TG_(clo).collect_systime == systime_nsec) + TG_(register_event_group3) + (EG_SYS, "sysCount", "sysTime", "sysCpuTime"); + else TG_(register_event_group2)(EG_SYS, "sysCount", "sysTime"); + } + + // event set used as base for instruction self cost + TG_(sets).base = TG_(get_event_set2)(EG_USE, EG_IR); + + // event set comprising all event groups, used for inclusive cost + TG_(sets).full = TG_(add_event_group2)(TG_(sets).base, EG_DR, EG_DW); + TG_(sets).full = TG_(add_event_group2)(TG_(sets).full, EG_BC, EG_BI); + TG_(sets).full = TG_(add_event_group)(TG_(sets).full, EG_BUS); + TG_(sets).full = TG_(add_event_group)(TG_(sets).full, EG_SYS); + + TG_DEBUGIF(1) + { + TG_DEBUG(1, "EventSets:\n"); + TG_(print_eventset)(-2, TG_(sets).base); + TG_(print_eventset)(-2, TG_(sets).full); + } + + /* Not-existing events are silently ignored */ + TG_(dumpmap) = TG_(get_eventmapping)(TG_(sets).full); + TG_(append_event)(TG_(dumpmap), "Ir"); + TG_(append_event)(TG_(dumpmap), "Dr"); + TG_(append_event)(TG_(dumpmap), "Dw"); + TG_(append_event)(TG_(dumpmap), "I1mr"); + TG_(append_event)(TG_(dumpmap), "D1mr"); + TG_(append_event)(TG_(dumpmap), "D1mw"); + TG_(append_event)(TG_(dumpmap), "ILmr"); + TG_(append_event)(TG_(dumpmap), "DLmr"); + TG_(append_event)(TG_(dumpmap), "DLmw"); + TG_(append_event)(TG_(dumpmap), "ILdmr"); + TG_(append_event)(TG_(dumpmap), "DLdmr"); + TG_(append_event)(TG_(dumpmap), "DLdmw"); + TG_(append_event)(TG_(dumpmap), "Bc"); + TG_(append_event)(TG_(dumpmap), "Bcm"); + TG_(append_event)(TG_(dumpmap), "Bi"); + TG_(append_event)(TG_(dumpmap), "Bim"); + TG_(append_event)(TG_(dumpmap), 
"AcCost1"); + TG_(append_event)(TG_(dumpmap), "SpLoss1"); + TG_(append_event)(TG_(dumpmap), "AcCost2"); + TG_(append_event)(TG_(dumpmap), "SpLoss2"); + TG_(append_event)(TG_(dumpmap), "Ge"); + TG_(append_event)(TG_(dumpmap), "allocCount"); + TG_(append_event)(TG_(dumpmap), "allocSize"); + TG_(append_event)(TG_(dumpmap), "sysCount"); + TG_(append_event)(TG_(dumpmap), "sysTime"); + TG_(append_event)(TG_(dumpmap), "sysCpuTime"); +} + +static void cachesim_finish(void) +{ + if (clo_collect_cacheuse) + cacheuse_finish(); +} + +/*------------------------------------------------------------*/ +/*--- The simulator defined in this file ---*/ +/*------------------------------------------------------------*/ + +struct cachesim_if TG_(cachesim) = { + .print_opts = cachesim_print_opts, + .parse_opt = cachesim_parse_opt, + .post_clo_init = cachesim_post_clo_init, + .clear = cachesim_clear, + .printstat = cachesim_printstat, + .finish = cachesim_finish, + + /* these will be set by cachesim_post_clo_init */ + .log_1I0D = 0, + .log_2I0D = 0, + .log_3I0D = 0, + + .log_1I1Dr = 0, + .log_1I1Dw = 0, + + .log_0I1Dr = 0, + .log_0I1Dw = 0, + + .log_1I0D_name = "(no function)", + .log_2I0D_name = "(no function)", + .log_3I0D_name = "(no function)", + + .log_1I1Dr_name = "(no function)", + .log_1I1Dw_name = "(no function)", + + .log_0I1Dr_name = "(no function)", + .log_0I1Dw_name = "(no function)", +}; + +/*--------------------------------------------------------------------*/ +/*--- end ct_sim.c ---*/ +/*--------------------------------------------------------------------*/ diff --git a/tracegrind/tests/Makefile.am b/tracegrind/tests/Makefile.am new file mode 100644 index 000000000..0ad3b6ae5 --- /dev/null +++ b/tracegrind/tests/Makefile.am @@ -0,0 +1,71 @@ + +include $(top_srcdir)/Makefile.tool-tests.am + +SUBDIRS = . +DIST_SUBDIRS = . 
+ +dist_noinst_SCRIPTS = filter_stderr filter_trace + +check_PROGRAMS = \ + test_basic.bin \ + test_marker.bin \ + test_instr_toggle.bin \ + test_toggle_collect.bin \ + test_foo_bar_baz.bin \ + test_inline.bin \ + test_enter_inlined.bin \ + test_nested_inlined.bin \ + test_signal.bin \ + test_exception.bin \ + test_longjmp.bin \ + test_tailcall.bin \ + test_recursion.bin \ + test_thread_create.bin \ + test_thread_interleave.bin \ + test_syscall.bin + +AM_CPPFLAGS += -I$(top_srcdir)/tracegrind +AM_CFLAGS += $(AM_FLAG_M3264_PRI) +AM_CXXFLAGS += $(AM_FLAG_M3264_PRI) + +test_basic_bin_SOURCES = test_basic.c +test_marker_bin_SOURCES = test_marker.c +test_instr_toggle_bin_SOURCES = test_instr_toggle.c +test_toggle_collect_bin_SOURCES = test_toggle_collect.c +test_foo_bar_baz_bin_SOURCES = test_foo_bar_baz.c +test_inline_bin_SOURCES = test_inline.c +test_inline_bin_CFLAGS = $(AM_CFLAGS) -O2 -g +test_enter_inlined_bin_SOURCES = test_enter_inlined.c +test_enter_inlined_bin_CFLAGS = $(AM_CFLAGS) -O2 -g +test_nested_inlined_bin_SOURCES = test_nested_inlined.c +test_nested_inlined_bin_CFLAGS = $(AM_CFLAGS) -O1 -g +test_signal_bin_SOURCES = test_signal.c +test_exception_bin_SOURCES = test_exception.cpp +test_longjmp_bin_SOURCES = test_longjmp.c +test_tailcall_bin_SOURCES = test_tailcall.c +test_tailcall_bin_CFLAGS = $(AM_CFLAGS) -O2 -g +test_recursion_bin_SOURCES = test_recursion.c +test_thread_create_bin_SOURCES = test_thread_create.c +test_thread_create_bin_LDADD = -lpthread +test_thread_interleave_bin_SOURCES = test_thread_interleave.c +test_thread_interleave_bin_LDADD = -lpthread +test_syscall_bin_SOURCES = test_syscall.c + +EXTRA_DIST = \ + test_basic.vgtest test_basic.stderr.exp test_basic.post.exp \ + test_marker.vgtest test_marker.stderr.exp test_marker.post.exp \ + test_instr_toggle.vgtest test_instr_toggle.stderr.exp test_instr_toggle.post.exp \ + test_toggle_collect.vgtest test_toggle_collect.stderr.exp test_toggle_collect.post.exp \ + test_foo_bar_baz.vgtest test_foo_bar_baz.stderr.exp test_foo_bar_baz.post.exp \ + test_inline.vgtest test_inline.stderr.exp test_inline.post.exp \ + test_enter_inlined.vgtest test_enter_inlined.stderr.exp test_enter_inlined.post.exp \ + test_nested_inlined.vgtest test_nested_inlined.stderr.exp test_nested_inlined.post.exp \ + test_signal.vgtest test_signal.stderr.exp test_signal.post.exp \ + test_exception.vgtest test_exception.stderr.exp test_exception.post.exp \ + test_longjmp.vgtest test_longjmp.stderr.exp test_longjmp.post.exp \ + test_tailcall.vgtest test_tailcall.stderr.exp test_tailcall.post.exp \ + test_recursion.vgtest test_recursion.stderr.exp test_recursion.post.exp \ + test_thread_create.vgtest test_thread_create.stderr.exp test_thread_create.post.exp \ + test_thread_interleave.vgtest test_thread_interleave.stderr.exp test_thread_interleave.post.exp \ + test_syscall.vgtest test_syscall.stderr.exp test_syscall.post.exp \ + test_schema.vgtest test_schema.stderr.exp test_schema.post.exp diff --git a/tracegrind/tests/filter_stderr b/tracegrind/tests/filter_stderr new file mode 100755 index 000000000..c62611e02 --- /dev/null +++ b/tracegrind/tests/filter_stderr @@ -0,0 +1,36 @@ +#! /bin/sh + +dir=`dirname $0` + +$dir/../../tests/filter_stderr_basic | + +# Remove "Tracegrind, ..." line and the following copyright line. 
+sed "/^Tracegrind, a streaming trace cache profiler/ , /./ d" | + +# Remove pointer to tracegrind_control +sed "/^For interactive control,.*$/d" | + +# Remove numbers from "Collected" line +sed "s/^\(Collected *:\)[ 0-9]*$/\1/" | + +# Remove numbers from I/D/LL "refs:" lines +perl -p -e 's/((I|D|LL) *refs:)[ 0-9,()+rdw]*$/\1/' | + +# Remove numbers from I1/D1/LL/LLi/LLd "misses:" and "miss rates:" lines +perl -p -e 's/((I1|D1|LL|LLi|LLd) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' | + +# Remove numbers from "Branches:", "Mispredicts:, and "Mispred rate:" lines +perl -p -e 's/((Branches|Mispredicts|Mispred rate):)[ 0-9,()+condi%\.]*$/\1/' | + +# Remove CPUID warnings lines for P4s and other machines +sed "/warning: Pentium 4 with 12 KB micro-op instruction trace cache/d" | +sed "/Simulating a 16 KB I-cache with 32 B lines/d" | +sed "/warning: L3 cache found, using its data for the LL simulation./d" | +sed "/warning: L4 cache found, using its data for the LL simulation./d" | +sed "/Warning: Cannot auto-detect cache config, using defaults./d" | +sed "/Run with -v to see./d" | +sed "/warning: specified LL cache: line_size .*$/d" | +sed "/warning: simulated LL cache: line_size .*$/d" | + +# Remove trace output file path messages +sed "/^Streaming trace output to /d" diff --git a/tracegrind/tests/filter_trace b/tracegrind/tests/filter_trace new file mode 100755 index 000000000..1ccbbf46a --- /dev/null +++ b/tracegrind/tests/filter_trace @@ -0,0 +1,62 @@ +#!/bin/sh +# +# Filter tracegrind trace output (from tracegrind-analyzer) +# to normalize machine-dependent values for regression testing. +# + +# Normalize format/schema version numbers +sed 's/^Format Version: [0-9]\+$/Format Version: N/' | +sed 's/^Schema Version: [0-9]\+$/Schema Version: N/' | + +# Normalize object paths: replace full path to test binary with just the basename +# e.g. obj=/home/user/valgrind/tracegrind/tests/test_marker -> obj=test_marker +sed 's|obj=[^ |]*[/]||g' | + +# Normalize file paths: replace full source paths with just the basename +# e.g. file=/home/user/.../test_marker.c -> file=test_marker.c +sed 's|file=[^ |]*[/]||g' | + +# Normalize function address/stats that vary: Ir counts +# Replace Ir= with Ir=N +sed 's|Ir=[0-9]\+|Ir=N|g' | + +# Normalize syscall timing values (non-deterministic) +# Replace nonzero sysTime/sysCpuTime with >0 to assert they are measured +sed 's|sysTime=[1-9][0-9]*|sysTime=T|g' | +sed 's|sysCpuTime=[1-9][0-9]*|sysCpuTime=T|g' | + +# Remove the separator line +sed '/^-\{10,\}$/d' | + +# Normalize "Total rows:" count +sed 's/^Total rows: [0-9,]\+$/Total rows: N/' | + +# Normalize "Showing X of Y rows" +sed 's/^Showing [0-9,]\+ of [0-9,]\+ rows$/Showing N of N rows/' | + +# Normalize "Sequence range:" numbers +sed 's/^Sequence range: [0-9,]\+ - [0-9,]\+$/Sequence range: N - N/' | + +# Normalize event count percentages in stats +sed 's/\([0-9,]\+\) ([0-9.]\+%)/N (P%)/g' | + +# Normalize "Threads: N ([...])" +sed 's/^Threads: \([0-9]\+\) (\[.*\])/Threads: \1/' | + +# Remove "Top 10 functions" section (platform-dependent) +sed '/^Top 10 functions/,/^$/d' | + +# Remove "Fork events" section (platform-dependent) +sed '/^Fork events/,/^$/d' | + +# Remove "Thread create events" section (platform-dependent) +sed '/^Thread create events/,/^$/d' | + +# Normalize seq numbers in raw arrays: [1234, ...] -> [N, ...] +sed 's/^\[\([0-9]\+\),/[N,/g' | + +# Normalize seq= in formatted output +sed 's/seq=[0-9]\+/seq=N/g' | + +# Strip GCC optimization suffixes from function names (e.g. 
.constprop.0, .isra.0, .part.0) +sed 's/\.\(constprop\|isra\|part\|cold\|lto_priv\)\.[0-9]*//g' diff --git a/tracegrind/tests/test_basic.c b/tracegrind/tests/test_basic.c new file mode 100644 index 000000000..2dddef620 --- /dev/null +++ b/tracegrind/tests/test_basic.c @@ -0,0 +1,14 @@ +#include "tracegrind.h" + +static int factorial(int n) +{ + if (n <= 1) + return 1; + return n * factorial(n - 1); +} + +int main(void) +{ + int result = factorial(5); + return result != 120; +} diff --git a/tracegrind/tests/test_basic.post.exp b/tracegrind/tests/test_basic.post.exp new file mode 100644 index 000000000..19397d9bc --- /dev/null +++ b/tracegrind/tests/test_basic.post.exp @@ -0,0 +1,23 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] + +Total rows: N + +Events by type: + ENTER_FN: N (P%) + EXIT_FN: N (P%) + +Threads: 1 +Sequence range: N - N + diff --git a/tracegrind/tests/test_basic.stderr.exp b/tracegrind/tests/test_basic.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_basic.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_basic.vgtest b/tracegrind/tests/test_basic.vgtest new file mode 100644 index 000000000..4f2a05cd8 --- /dev/null +++ b/tracegrind/tests/test_basic.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_basic.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_basic.msgpack.lz4 +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_basic.msgpack.lz4 --stats | ./filter_trace +cleanup: rm -f tracegrind.out.test_basic.msgpack.lz4 diff --git a/tracegrind/tests/test_enter_inlined.c b/tracegrind/tests/test_enter_inlined.c new file mode 100644 index 000000000..70aa99e84 --- /dev/null +++ b/tracegrind/tests/test_enter_inlined.c @@ -0,0 +1,35 @@ +#include "tracegrind.h" + +/* Force inlining - with --read-inline-info=yes these should produce + * ENTER_INLINED / EXIT_INLINED events in the trace */ +static inline __attribute__((always_inline)) int inlined_work(int a, int b) +{ + /* Make the function large enough to span multiple basic blocks + * so at least one BB boundary falls inside inlined code */ + int result = 0; + if (a > 0) { + result = a * b; + } else { + result = a + b; + } + return result; +} + +/* Prevent inlining - SHOULD appear as ENTER/EXIT */ +static int __attribute__((noinline)) not_inlined_caller(int n) +{ + /* Use volatile to prevent constant propagation */ + volatile int x = n; + return inlined_work(x, x + 1); +} + +int main(void) +{ + volatile int input = 3; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = not_inlined_caller(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return result != 12; +} diff --git a/tracegrind/tests/test_enter_inlined.post.exp b/tracegrind/tests/test_enter_inlined.post.exp new file mode 100644 index 000000000..f63eb2906 --- /dev/null 
+++ b/tracegrind/tests/test_enter_inlined.post.exp @@ -0,0 +1,21 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] + +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER_FN | fn=not_inlined_caller | obj=test_enter_inlined.bin | file=test_enter_inlined.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_INLINED_FN | fn=inlined_work | obj=test_enter_inlined.bin | file=test_enter_inlined.c | line=10 | Ir=N +seq=N | tid=1 | event=EXIT_INLINED_FN | fn=inlined_work | obj=test_enter_inlined.bin | file=test_enter_inlined.c | line=10 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=not_inlined_caller | obj=test_enter_inlined.bin | file=test_enter_inlined.c | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_enter_inlined.stderr.exp b/tracegrind/tests/test_enter_inlined.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_enter_inlined.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_enter_inlined.vgtest b/tracegrind/tests/test_enter_inlined.vgtest new file mode 100644 index 000000000..1b5d7c55d --- /dev/null +++ b/tracegrind/tests/test_enter_inlined.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_enter_inlined.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_enter_inlined.msgpack.lz4 --instr-atstart=no --read-inline-info=yes +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_enter_inlined.msgpack.lz4 | ./filter_trace +cleanup: rm -f tracegrind.out.test_enter_inlined.msgpack.lz4 diff --git a/tracegrind/tests/test_exception.cpp b/tracegrind/tests/test_exception.cpp new file mode 100644 index 000000000..b9b599bdd --- /dev/null +++ b/tracegrind/tests/test_exception.cpp @@ -0,0 +1,46 @@ +#include "tracegrind.h" +#include <stdexcept> + +/* + * Test: C++ exception unwinding through regular (non-inlined) functions. + * + * catcher() calls thrower(), which calls do_throw(). + * do_throw() throws an exception that unwinds back through thrower() + * to catcher()'s catch block. Verifies the call stack is properly + * maintained across exception unwinding.
+ * + * Call chain: catcher -> thrower -> do_throw (throws) + */ + +static void __attribute__((noinline)) do_throw(int x) +{ + if (x > 0) + throw std::runtime_error("boom"); +} + +static int __attribute__((noinline)) thrower(int n) +{ + volatile int x = n; + do_throw(x); + return x; +} + +static int __attribute__((noinline)) catcher(int n) +{ + try { + return thrower(n); + } catch (const std::exception&) { + return -1; + } +} + +int main() +{ + volatile int input = 5; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = catcher(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return result != -1; +} diff --git a/tracegrind/tests/test_exception.post.exp b/tracegrind/tests/test_exception.post.exp new file mode 100644 index 000000000..7089c29d3 --- /dev/null +++ b/tracegrind/tests/test_exception.post.exp @@ -0,0 +1,15 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] +Counters: ['Ir'] +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER_FN | fn=catcher(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=thrower(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=do_throw(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=do_throw(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=thrower(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=catcher(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_exception.stderr.exp b/tracegrind/tests/test_exception.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_exception.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_exception.vgtest b/tracegrind/tests/test_exception.vgtest new file mode 100644 index 000000000..1567cd9d2 --- /dev/null +++ b/tracegrind/tests/test_exception.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_exception.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_exception.msgpack.lz4 --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_exception.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counters:|Showing |MARKER|fn=catcher|fn=thrower|fn=do_throw)' +cleanup: rm -f tracegrind.out.test_exception.msgpack.lz4 diff --git a/tracegrind/tests/test_foo_bar_baz.c b/tracegrind/tests/test_foo_bar_baz.c new file mode 100644 index 000000000..f4f2560f4 --- /dev/null +++ b/tracegrind/tests/test_foo_bar_baz.c @@ -0,0 +1,18 @@ +#include "tracegrind.h" + +static int __attribute__((noinline)) baz(int n) { return n * 2; } + +static int __attribute__((noinline)) bar(int n) { return baz(n) + 1; } + +static int __attribute__((noinline)) foo(int n) { return bar(n) + bar(n + 1); } + +int main(void) +{ + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = foo(3); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + + return result != (baz(3) + 1 + baz(4) + 1); +} diff --git 
a/tracegrind/tests/test_foo_bar_baz.post.exp b/tracegrind/tests/test_foo_bar_baz.post.exp new file mode 100644 index 000000000..ad3a60185 --- /dev/null +++ b/tracegrind/tests/test_foo_bar_baz.post.exp @@ -0,0 +1,27 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] + +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER_FN | fn=foo | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=bar | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=baz | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=baz | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=bar | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=bar | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=baz | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=baz | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=bar | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=foo | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_foo_bar_baz.stderr.exp b/tracegrind/tests/test_foo_bar_baz.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_foo_bar_baz.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_foo_bar_baz.vgtest b/tracegrind/tests/test_foo_bar_baz.vgtest new file mode 100644 index 000000000..c2a7b3efb --- /dev/null +++ b/tracegrind/tests/test_foo_bar_baz.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_foo_bar_baz.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_foo_bar_baz.msgpack.lz4 --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_foo_bar_baz.msgpack.lz4 | ./filter_trace +cleanup: rm -f tracegrind.out.test_foo_bar_baz.msgpack.lz4 diff --git a/tracegrind/tests/test_inline.c b/tracegrind/tests/test_inline.c new file mode 100644 index 000000000..0533ee592 --- /dev/null +++ b/tracegrind/tests/test_inline.c @@ -0,0 +1,29 @@ +#include "tracegrind.h" + +/* Force inlining - these should NOT appear as ENTER/EXIT in the trace */ +static inline __attribute__((always_inline)) int inlined_add(int a, int b) +{ + return a + b; +} + +static inline __attribute__((always_inline)) int inlined_mul(int a, int b) +{ + return a * b; +} + +/* Prevent inlining - these SHOULD appear as ENTER/EXIT in the trace */ +static int __attribute__((noinline)) not_inlined_work(int n) +{ + return 
inlined_add(n, inlined_mul(n, 2)); +} + +int main(void) +{ + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = not_inlined_work(5); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + + return result != 15; +} diff --git a/tracegrind/tests/test_inline.post.exp b/tracegrind/tests/test_inline.post.exp new file mode 100644 index 000000000..f06c345cb --- /dev/null +++ b/tracegrind/tests/test_inline.post.exp @@ -0,0 +1,19 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] + +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER_FN | fn=not_inlined_work | obj=test_inline.bin | file=test_inline.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=not_inlined_work | obj=test_inline.bin | file=test_inline.c | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_inline.stderr.exp b/tracegrind/tests/test_inline.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_inline.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_inline.vgtest b/tracegrind/tests/test_inline.vgtest new file mode 100644 index 000000000..5c96843d2 --- /dev/null +++ b/tracegrind/tests/test_inline.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_inline.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_inline.msgpack.lz4 --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_inline.msgpack.lz4 | ./filter_trace +cleanup: rm -f tracegrind.out.test_inline.msgpack.lz4 diff --git a/tracegrind/tests/test_instr_toggle.c b/tracegrind/tests/test_instr_toggle.c new file mode 100644 index 000000000..07d5f46f8 --- /dev/null +++ b/tracegrind/tests/test_instr_toggle.c @@ -0,0 +1,21 @@ +#include "tracegrind.h" + +static int __attribute__((noinline)) fibo(int n) +{ + if (n <= 1) + return n; + return fibo(n - 1) + fibo(n - 2); +} + +int main(void) +{ + /* Instrumentation is off (--instr-atstart=no). + Only the fibo(2) call will be traced. 
*/ + TRACEGRIND_ADD_MARKER("before-fibo"); + TRACEGRIND_START_INSTRUMENTATION; + int result = fibo(2); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("after-fibo"); + + return result != 1; +} diff --git a/tracegrind/tests/test_instr_toggle.post.exp b/tracegrind/tests/test_instr_toggle.post.exp new file mode 100644 index 000000000..1ee05299d --- /dev/null +++ b/tracegrind/tests/test_instr_toggle.post.exp @@ -0,0 +1,23 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] + +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=before-fibo +seq=N | tid=1 | event=ENTER_FN | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=after-fibo diff --git a/tracegrind/tests/test_instr_toggle.stderr.exp b/tracegrind/tests/test_instr_toggle.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_instr_toggle.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_instr_toggle.vgtest b/tracegrind/tests/test_instr_toggle.vgtest new file mode 100644 index 000000000..3247a09e4 --- /dev/null +++ b/tracegrind/tests/test_instr_toggle.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_instr_toggle.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_instr_toggle.msgpack.lz4 --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_instr_toggle.msgpack.lz4 | ./filter_trace +cleanup: rm -f tracegrind.out.test_instr_toggle.msgpack.lz4 diff --git a/tracegrind/tests/test_longjmp.c b/tracegrind/tests/test_longjmp.c new file mode 100644 index 000000000..5659431b2 --- /dev/null +++ b/tracegrind/tests/test_longjmp.c @@ -0,0 +1,51 @@ +#include "tracegrind.h" +#include <setjmp.h> + +/* + * Test: longjmp unwinding multiple call frames. + * + * outer() calls middle(), which calls inner(). + * inner() does longjmp back to outer(), skipping middle()'s return. + * Verifies tracegrind properly unwinds the call stack on non-local jumps.
+ * + * Call chain: outer -> middle -> inner (longjmp back to outer) + */ + +static jmp_buf env; + +static void __attribute__((noinline)) inner(int n) +{ + volatile int x = n * 2; + (void)x; + longjmp(env, 42); +} + +static void __attribute__((noinline)) middle(int n) +{ + volatile int x = n + 1; + inner(x); + /* never reached */ + x = x + 1; +} + +static int __attribute__((noinline)) outer(int n) +{ + int val = setjmp(env); + if (val == 0) { + middle(n); + /* never reached */ + return -1; + } + return val; +} + +int main(void) +{ + volatile int input = 5; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = outer(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return result != 42; +} diff --git a/tracegrind/tests/test_longjmp.post.exp b/tracegrind/tests/test_longjmp.post.exp new file mode 100644 index 000000000..d0524b77e --- /dev/null +++ b/tracegrind/tests/test_longjmp.post.exp @@ -0,0 +1,15 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] +Counters: ['Ir'] +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER_FN | fn=outer | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=middle | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=inner | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=inner | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=middle | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=outer | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_longjmp.stderr.exp b/tracegrind/tests/test_longjmp.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_longjmp.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_longjmp.vgtest b/tracegrind/tests/test_longjmp.vgtest new file mode 100644 index 000000000..0291a7fbe --- /dev/null +++ b/tracegrind/tests/test_longjmp.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_longjmp.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_longjmp.msgpack.lz4 --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_longjmp.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counters:|Showing |MARKER|fn=outer|fn=middle|fn=inner)' +cleanup: rm -f tracegrind.out.test_longjmp.msgpack.lz4 diff --git a/tracegrind/tests/test_marker.c b/tracegrind/tests/test_marker.c new file mode 100644 index 000000000..721883b3b --- /dev/null +++ b/tracegrind/tests/test_marker.c @@ -0,0 +1,17 @@ +#include "tracegrind.h" + +static int compute(int n) +{ + int sum = 0; + for (int i = 0; i < n; i++) + sum += i * i; + return sum; +} + +int main(void) +{ + TRACEGRIND_ADD_MARKER("start-work"); + int result = compute(1000); + TRACEGRIND_ADD_MARKER("end-work"); + return result == 0; +} diff --git a/tracegrind/tests/test_marker.post.exp b/tracegrind/tests/test_marker.post.exp new file mode 100644 index 000000000..cd8748b02 --- /dev/null +++ b/tracegrind/tests/test_marker.post.exp @@ -0,0 +1,17 @@ +Format Version: N +Format Name: tracegrind-msgpack 
+Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] + +Showing N of N rows +[N, 1, 0, 'start-work'] +[N, 1, 0, 'end-work'] diff --git a/tracegrind/tests/test_marker.stderr.exp b/tracegrind/tests/test_marker.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_marker.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_marker.vgtest b/tracegrind/tests/test_marker.vgtest new file mode 100644 index 000000000..9165191e0 --- /dev/null +++ b/tracegrind/tests/test_marker.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_marker.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_marker.msgpack.lz4 +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_marker.msgpack.lz4 --event MARKER --raw | ./filter_trace +cleanup: rm -f tracegrind.out.test_marker.msgpack.lz4 diff --git a/tracegrind/tests/test_nested_inlined.c b/tracegrind/tests/test_nested_inlined.c new file mode 100644 index 000000000..a0daca1e1 --- /dev/null +++ b/tracegrind/tests/test_nested_inlined.c @@ -0,0 +1,51 @@ +#include "tracegrind.h" + +/* Inner inlined function. + * With --read-inline-info=yes, should produce ENTER_INLINED / EXIT_INLINED + * events with fn=inner_inline. */ +static inline __attribute__((always_inline)) int inner_inline(int a) +{ + int result; + if (a > 0) { + result = a * 3; + } else { + result = a + 1; + } + return result; +} + +/* Outer inlined function - calls inner_inline. + * Should produce ENTER_INLINED events for both outer_inline and inner_inline, + * showing nested inline transitions. + * Uses volatile stores in both branches to prevent the compiler from + * converting the if-else to a branchless cmov. 
*/ +static inline __attribute__((always_inline)) int outer_inline(int a, int b) +{ + volatile int x; + if (a > b) { + x = a - b; + } else { + x = b - a; + } + int y = inner_inline(x); + return y + a; +} + +/* Non-inlined caller */ +static int __attribute__((noinline)) caller(int n) +{ + volatile int x = n; + return outer_inline(x, x + 1); +} + +int main(void) +{ + volatile int input = 5; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = caller(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + /* caller(5) -> outer_inline(5, 6): x=1, inner_inline(1)=3, 3+5=8 */ + return result != 8; +} diff --git a/tracegrind/tests/test_nested_inlined.post.exp b/tracegrind/tests/test_nested_inlined.post.exp new file mode 100644 index 000000000..0d0571af2 --- /dev/null +++ b/tracegrind/tests/test_nested_inlined.post.exp @@ -0,0 +1,23 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] + +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER_FN | fn=caller | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_INLINED_FN | fn=outer_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=26 | Ir=N +seq=N | tid=1 | event=ENTER_INLINED_FN | fn=inner_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=9 | Ir=N +seq=N | tid=1 | event=EXIT_INLINED_FN | fn=inner_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=9 | Ir=N +seq=N | tid=1 | event=EXIT_INLINED_FN | fn=outer_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=9 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=caller | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_nested_inlined.stderr.exp b/tracegrind/tests/test_nested_inlined.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_nested_inlined.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_nested_inlined.vgtest b/tracegrind/tests/test_nested_inlined.vgtest new file mode 100644 index 000000000..adaf9a895 --- /dev/null +++ b/tracegrind/tests/test_nested_inlined.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_nested_inlined.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_nested_inlined.msgpack.lz4 --instr-atstart=no --read-inline-info=yes +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_nested_inlined.msgpack.lz4 | ./filter_trace +cleanup: rm -f tracegrind.out.test_nested_inlined.msgpack.lz4 diff --git a/tracegrind/tests/test_recursion.c b/tracegrind/tests/test_recursion.c new file mode 100644 index 000000000..e3589ae6c --- /dev/null +++ b/tracegrind/tests/test_recursion.c @@ -0,0 +1,28 @@ +#include "tracegrind.h" + +/* + * Test: 
deep recursion (100 levels). + * + * recurse() calls itself 100 times, then returns back through + * all frames. Verifies the call stack handles deep nesting and + * produces balanced ENTER/EXIT pairs. + */ + +static int __attribute__((noinline)) recurse(int depth) +{ + volatile int d = depth; + if (d <= 0) + return 0; + return recurse(d - 1) + 1; +} + +int main(void) +{ + volatile int input = 100; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = recurse(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return result != 100; +} diff --git a/tracegrind/tests/test_recursion.post.exp b/tracegrind/tests/test_recursion.post.exp new file mode 100644 index 000000000..06977039b --- /dev/null +++ b/tracegrind/tests/test_recursion.post.exp @@ -0,0 +1,4 @@ +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=MARKER | marker=end +ENTER_FN count: 101 +EXIT_FN count: 101 diff --git a/tracegrind/tests/test_recursion.stderr.exp b/tracegrind/tests/test_recursion.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_recursion.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_recursion.vgtest b/tracegrind/tests/test_recursion.vgtest new file mode 100644 index 000000000..bfff7defe --- /dev/null +++ b/tracegrind/tests/test_recursion.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_recursion.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_recursion.msgpack.lz4 --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_recursion.msgpack.lz4 | ./filter_trace | grep -E '(event=MARKER|fn=recurse)' | awk '/MARKER/{print} /ENTER_FN/{e++} /EXIT_FN/{x++} END{print "ENTER_FN count: "e; print "EXIT_FN count: "x}' +cleanup: rm -f tracegrind.out.test_recursion.msgpack.lz4 diff --git a/tracegrind/tests/test_schema.post.exp b/tracegrind/tests/test_schema.post.exp new file mode 100644 index 000000000..d30dbc939 --- /dev/null +++ b/tracegrind/tests/test_schema.post.exp @@ -0,0 +1,14 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] + diff --git a/tracegrind/tests/test_schema.stderr.exp b/tracegrind/tests/test_schema.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_schema.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_schema.vgtest b/tracegrind/tests/test_schema.vgtest new file mode 100644 index 000000000..482a552de --- /dev/null +++ b/tracegrind/tests/test_schema.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_basic.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_schema.msgpack.lz4 +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_schema.msgpack.lz4 --schema | ./filter_trace +cleanup: rm -f 
tracegrind.out.test_schema.msgpack.lz4 diff --git a/tracegrind/tests/test_signal.c b/tracegrind/tests/test_signal.c new file mode 100644 index 000000000..028354780 --- /dev/null +++ b/tracegrind/tests/test_signal.c @@ -0,0 +1,42 @@ +#include "tracegrind.h" +#include <signal.h> +#include <string.h> + +/* + * Test: signal handler interrupting normal function execution. + * + * caller() raises SIGALRM to itself. The signal handler (handler_fn) + * runs, then execution returns to caller(). Verifies the call stack + * is properly maintained across signal delivery. + */ + +static volatile sig_atomic_t got_signal = 0; + +static void __attribute__((noinline)) handler_fn(int sig) +{ + (void)sig; + got_signal = 1; +} + +static int __attribute__((noinline)) caller(int n) +{ + volatile int x = n; + raise(SIGALRM); + return x + 1; +} + +int main(void) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = handler_fn; + sigaction(SIGALRM, &sa, NULL); + + volatile int input = 5; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = caller(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return (result != 6) || !got_signal; +} diff --git a/tracegrind/tests/test_signal.post.exp b/tracegrind/tests/test_signal.post.exp new file mode 100644 index 000000000..ec413adf6 --- /dev/null +++ b/tracegrind/tests/test_signal.post.exp @@ -0,0 +1,11 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] +Counters: ['Ir'] +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER_FN | fn=caller | obj=test_signal.bin | file=test_signal.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=caller | obj=test_signal.bin | file=test_signal.c | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_signal.stderr.exp b/tracegrind/tests/test_signal.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_signal.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_signal.vgtest b/tracegrind/tests/test_signal.vgtest new file mode 100644 index 000000000..66391dfa1 --- /dev/null +++ b/tracegrind/tests/test_signal.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_signal.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_signal.msgpack.lz4 --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_signal.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counters:|Showing |MARKER|fn=caller|fn=handler_fn)' +cleanup: rm -f tracegrind.out.test_signal.msgpack.lz4 diff --git a/tracegrind/tests/test_syscall.c b/tracegrind/tests/test_syscall.c new file mode 100644 index 000000000..9aac40a48 --- /dev/null +++ b/tracegrind/tests/test_syscall.c @@ -0,0 +1,29 @@ +#include "tracegrind.h" +#include <fcntl.h> +#include <unistd.h> + +static int __attribute__((noinline)) do_getpid(void) { return getpid(); } + +static void __attribute__((noinline)) do_write(int fd) +{ + const char msg[] = "hello\n"; + write(fd, msg, sizeof(msg) - 1); +} + +static void __attribute__((noinline)) caller(int fd) +{ + do_getpid(); + do_write(fd); +} + +int main(void) +{ + int fd = open("/dev/null", O_WRONLY); + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + caller(fd); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); +
close(fd); + return 0; +} diff --git a/tracegrind/tests/test_syscall.post.exp b/tracegrind/tests/test_syscall.post.exp new file mode 100644 index 000000000..4f8a1c6fa --- /dev/null +++ b/tracegrind/tests/test_syscall.post.exp @@ -0,0 +1,26 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir', 'sysCount', 'sysTime', 'sysCpuTime'] +Counter Units: {'sysCpuTime': 'ns', 'sysTime': 'ns'} +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER_FN | fn=caller | obj=test_syscall.bin | file=test_syscall.c | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=ENTER_FN | fn=do_getpid | obj=test_syscall.bin | file=test_syscall.c | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=ENTER_FN | fn=getpid | obj=libc.so.6 | file=syscall-template.S | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=EXIT_FN | fn=getpid | obj=libc.so.6 | file=syscall-template.S | line=0 | Ir=N | sysCount=1 | sysTime=T | sysCpuTime=T +seq=N | tid=1 | event=EXIT_FN | fn=do_getpid | obj=test_syscall.bin | file=test_syscall.c | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=ENTER_FN | fn=do_write | obj=test_syscall.bin | file=test_syscall.c | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=ENTER_FN | fn=write | obj=libc.so.6 | file=write.c | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=EXIT_FN | fn=write | obj=libc.so.6 | file=write.c | line=0 | Ir=N | sysCount=1 | sysTime=T | sysCpuTime=T +seq=N | tid=1 | event=EXIT_FN | fn=do_write | obj=test_syscall.bin | file=test_syscall.c | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=EXIT_FN | fn=caller | obj=test_syscall.bin | file=test_syscall.c | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_syscall.stderr.exp b/tracegrind/tests/test_syscall.stderr.exp new file mode 100644 index 000000000..838c3d735 --- /dev/null +++ b/tracegrind/tests/test_syscall.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir sysCount sysTime sysCpuTime +Collected : + +I refs: diff --git a/tracegrind/tests/test_syscall.vgtest b/tracegrind/tests/test_syscall.vgtest new file mode 100644 index 000000000..848ca69f7 --- /dev/null +++ b/tracegrind/tests/test_syscall.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_syscall.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_syscall.msgpack.lz4 --instr-atstart=no --collect-systime=nsec +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_syscall.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counters:|Counter Units:|Showing |\(|MARKER|fn=caller |fn=do_getpid |fn=do_write |fn=getpid |fn=write )' +cleanup: rm -f tracegrind.out.test_syscall.msgpack.lz4 diff --git a/tracegrind/tests/test_tailcall.c 
b/tracegrind/tests/test_tailcall.c new file mode 100644 index 000000000..b5524c69d --- /dev/null +++ b/tracegrind/tests/test_tailcall.c @@ -0,0 +1,28 @@ +#include "tracegrind.h" + +/* + * Test: tail call optimization. + * + * chain_a() tail-calls chain_b(), which tail-calls chain_c(). + * At -O2, the compiler should optimize these into JMP instructions + * rather than CALL+RET. Verifies tracegrind handles sibling calls. + * + * Call chain: chain_a --(tail call)--> chain_b --(tail call)--> chain_c + */ + +static int __attribute__((noinline)) chain_c(int n) { return n + 3; } + +static int __attribute__((noinline)) chain_b(int n) { return chain_c(n + 2); } + +static int __attribute__((noinline)) chain_a(int n) { return chain_b(n + 1); } + +int main(void) +{ + volatile int input = 10; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = chain_a(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return result != 16; +} diff --git a/tracegrind/tests/test_tailcall.post.exp b/tracegrind/tests/test_tailcall.post.exp new file mode 100644 index 000000000..9d08266f6 --- /dev/null +++ b/tracegrind/tests/test_tailcall.post.exp @@ -0,0 +1,15 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] +Counters: ['Ir'] +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER_FN | fn=chain_a | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=chain_b | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=chain_c | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=chain_c | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=chain_b | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=chain_a | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_tailcall.stderr.exp b/tracegrind/tests/test_tailcall.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_tailcall.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_tailcall.vgtest b/tracegrind/tests/test_tailcall.vgtest new file mode 100644 index 000000000..c5acf2b7b --- /dev/null +++ b/tracegrind/tests/test_tailcall.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_tailcall.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_tailcall.msgpack.lz4 --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_tailcall.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counters:|Showing |MARKER|fn=chain_)' +cleanup: rm -f tracegrind.out.test_tailcall.msgpack.lz4 diff --git a/tracegrind/tests/test_thread_create.c b/tracegrind/tests/test_thread_create.c new file mode 100644 index 000000000..29b340691 --- /dev/null +++ b/tracegrind/tests/test_thread_create.c @@ -0,0 +1,20 @@ +#include "tracegrind.h" +#include <pthread.h> + +static void* thread_fn(void* arg) +{ + (void)arg; + return NULL; +} + +int main(void) +{ + pthread_t t; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + pthread_create(&t, NULL, thread_fn, NULL); + pthread_join(t, NULL); +
TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return 0; +} diff --git a/tracegrind/tests/test_thread_create.post.exp b/tracegrind/tests/test_thread_create.post.exp new file mode 100644 index 000000000..d695fbc3d --- /dev/null +++ b/tracegrind/tests/test_thread_create.post.exp @@ -0,0 +1,18 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=THREAD_CREATE | child_tid=2 +seq=N | tid=2 | event=ENTER_FN | fn=thread_fn | obj=test_thread_create.bin | file=test_thread_create.c | line=0 | Ir=N +seq=N | tid=2 | event=EXIT_FN | fn=thread_fn | obj=test_thread_create.bin | file=test_thread_create.c | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_thread_create.stderr.exp b/tracegrind/tests/test_thread_create.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_thread_create.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_thread_create.vgtest b/tracegrind/tests/test_thread_create.vgtest new file mode 100644 index 000000000..6ce6328d2 --- /dev/null +++ b/tracegrind/tests/test_thread_create.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_thread_create.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_thread_create.msgpack.lz4 --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_thread_create.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counters:|MARKER|THREAD_CREATE|FORK|thread_fn|Showing |\()' | sed 's/child_pid=[0-9]*/child_pid=N/' +cleanup: rm -f tracegrind.out.test_thread_create.msgpack.lz4 diff --git a/tracegrind/tests/test_thread_interleave.c b/tracegrind/tests/test_thread_interleave.c new file mode 100644 index 000000000..93efcec47 --- /dev/null +++ b/tracegrind/tests/test_thread_interleave.c @@ -0,0 +1,54 @@ +#include "tracegrind.h" +#include <pthread.h> + +__attribute__((noinline)) static void depth_a2(void) {} + +__attribute__((noinline)) static void depth_a1(void) { depth_a2(); } + +__attribute__((noinline)) static void* work_a(void* arg) +{ + (void)arg; + depth_a1(); + return NULL; +} + +__attribute__((noinline)) static void depth_b1(void) {} + +__attribute__((noinline)) static void* work_b(void* arg) +{ + (void)arg; + depth_b1(); + return NULL; +} + +__attribute__((noinline)) static void depth_c2(void) {} + +__attribute__((noinline)) static void depth_c1(void) { depth_c2(); } + +__attribute__((noinline)) static void* work_c(void* arg) +{ + (void)arg; + depth_c1(); + return NULL; +} + +int main(void) +{ + pthread_t t1, t2, t3; + + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + + pthread_create(&t1, NULL, work_a, NULL); + pthread_create(&t2, NULL, work_b, NULL); + pthread_create(&t3, NULL, work_c, NULL); + + pthread_join(t1, NULL); +
pthread_join(t2, NULL); + pthread_join(t3, NULL); + + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + + return 0; +} diff --git a/tracegrind/tests/test_thread_interleave.post.exp b/tracegrind/tests/test_thread_interleave.post.exp new file mode 100644 index 000000000..63f1c227f --- /dev/null +++ b/tracegrind/tests/test_thread_interleave.post.exp @@ -0,0 +1,34 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=THREAD_CREATE | child_tid=2 +seq=N | tid=1 | event=THREAD_CREATE | child_tid=3 +seq=N | tid=1 | event=THREAD_CREATE | child_tid=4 +seq=N | tid=1 | event=MARKER | marker=end +seq=N | tid=2 | event=ENTER_FN | fn=work_a | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=2 | event=ENTER_FN | fn=depth_a1 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=2 | event=ENTER_FN | fn=depth_a2 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=2 | event=EXIT_FN | fn=depth_a2 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=2 | event=EXIT_FN | fn=depth_a1 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=2 | event=EXIT_FN | fn=work_a | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=3 | event=ENTER_FN | fn=work_b | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=3 | event=ENTER_FN | fn=depth_b1 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=3 | event=EXIT_FN | fn=depth_b1 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=3 | event=EXIT_FN | fn=work_b | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=4 | event=ENTER_FN | fn=work_c | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=4 | event=ENTER_FN | fn=depth_c1 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=4 | event=ENTER_FN | fn=depth_c2 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=4 | event=EXIT_FN | fn=depth_c2 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=4 | event=EXIT_FN | fn=depth_c1 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=4 | event=EXIT_FN | fn=work_c | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N diff --git a/tracegrind/tests/test_thread_interleave.stderr.exp b/tracegrind/tests/test_thread_interleave.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_thread_interleave.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir 
+Collected : + +I refs: diff --git a/tracegrind/tests/test_thread_interleave.vgtest b/tracegrind/tests/test_thread_interleave.vgtest new file mode 100644 index 000000000..02ea2cd8d --- /dev/null +++ b/tracegrind/tests/test_thread_interleave.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_thread_interleave.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_thread_interleave.msgpack.lz4 --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_thread_interleave.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counters:|Showing |\(|MARKER|THREAD_CREATE|fn=work_a |fn=work_b |fn=work_c |fn=depth_a1 |fn=depth_a2 |fn=depth_b1 |fn=depth_c1 |fn=depth_c2 )' | sort -t'|' -k2,2 -s +cleanup: rm -f tracegrind.out.test_thread_interleave.msgpack.lz4 diff --git a/tracegrind/tests/test_toggle_collect.c b/tracegrind/tests/test_toggle_collect.c new file mode 100644 index 000000000..635caaacc --- /dev/null +++ b/tracegrind/tests/test_toggle_collect.c @@ -0,0 +1,25 @@ +#include "tracegrind.h" + +static int work(int n) +{ + int sum = 0; + for (int i = 0; i < n; i++) + sum += i; + return sum; +} + +int main(void) +{ + /* Collection on by default, do some traced work */ + int result = work(10); + + /* Toggle collection off */ + TRACEGRIND_TOGGLE_COLLECT; + result += work(20); /* not collected */ + + /* Toggle collection back on */ + TRACEGRIND_TOGGLE_COLLECT; + result += work(30); /* collected again */ + + return result == 0; +} diff --git a/tracegrind/tests/test_toggle_collect.post.exp b/tracegrind/tests/test_toggle_collect.post.exp new file mode 100644 index 000000000..19397d9bc --- /dev/null +++ b/tracegrind/tests/test_toggle_collect.post.exp @@ -0,0 +1,23 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] + +Total rows: N + +Events by type: + ENTER_FN: N (P%) + EXIT_FN: N (P%) + +Threads: 1 +Sequence range: N - N + diff --git a/tracegrind/tests/test_toggle_collect.stderr.exp b/tracegrind/tests/test_toggle_collect.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_toggle_collect.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_toggle_collect.vgtest b/tracegrind/tests/test_toggle_collect.vgtest new file mode 100644 index 000000000..0f1123dfb --- /dev/null +++ b/tracegrind/tests/test_toggle_collect.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_toggle_collect.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_toggle_collect.msgpack.lz4 +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_toggle_collect.msgpack.lz4 --stats | ./filter_trace +cleanup: rm -f tracegrind.out.test_toggle_collect.msgpack.lz4 diff --git a/tracegrind/tg_lz4.c b/tracegrind/tg_lz4.c new file mode 100644 index 000000000..6a6dd3bcc --- /dev/null +++ b/tracegrind/tg_lz4.c @@ -0,0 
+1,92 @@ +/* + * LZ4 compression wrapper for Tracegrind. + * Uses vendored LZ4 library adapted for Valgrind (no libc). + * + * BSD 2-Clause License - see lz4.c for full license. + */ + +#include "pub_tool_basics.h" +#include "pub_tool_libcassert.h" +#include "pub_tool_libcbase.h" +#include "pub_tool_mallocfree.h" + +#include "tg_lz4.h" + +/*------------------------------------------------------------*/ +/*--- LZ4 Configuration for Valgrind ---*/ +/*------------------------------------------------------------*/ + +/* Disable memory allocation functions (we provide them below) */ +#define LZ4_USER_MEMORY_FUNCTIONS 1 + +/* Freestanding mode - no string.h */ +#define LZ4_FREESTANDING 1 + +/* Provide size_t */ +#ifndef size_t +#define size_t SizeT +#endif + +/* Provide INT_MAX from limits.h */ +#ifndef INT_MAX +#define INT_MAX 2147483647 +#endif + +#ifndef UINT_MAX +#define UINT_MAX 4294967295U +#endif + +/*------------------------------------------------------------*/ +/*--- Memory function replacements ---*/ +/*------------------------------------------------------------*/ + +/* Define LZ4_memcpy, LZ4_memmove, LZ4_memset before including lz4 */ +#define LZ4_memcpy(dst, src, size) VG_(memcpy)((dst), (src), (size)) +#define LZ4_memmove(dst, src, size) VG_(memmove)((dst), (src), (size)) +#define LZ4_memset(p, v, s) VG_(memset)((p), (v), (s)) + +/*------------------------------------------------------------*/ +/*--- Memory allocation functions (LZ4_USER_MEMORY_FUNCTIONS) */ +/*------------------------------------------------------------*/ + +void* LZ4_malloc(size_t s) { return VG_(malloc)("tg.lz4", s); } + +void* LZ4_calloc(size_t n, size_t s) { return VG_(calloc)("tg.lz4", n, s); } + +void LZ4_free(void* p) +{ + if (p) + VG_(free)(p); +} + +/*------------------------------------------------------------*/ +/*--- Include the original LZ4 implementation ---*/ +/*------------------------------------------------------------*/ + +/* Disable assert (LZ4 has its own fallback) */ +#define LZ4_DEBUG 0 + +/* Include the main LZ4 source */ +#include "lz4.c" + +/*------------------------------------------------------------*/ +/*--- Wrapper API ---*/ +/*------------------------------------------------------------*/ + +SizeT tg_lz4_compress_bound(SizeT src_size) +{ + return LZ4_compressBound((int)src_size); +} + +SizeT tg_lz4_compress(void* dst, + SizeT dst_capacity, + const void* src, + SizeT src_size) +{ + int result = LZ4_compress_fast((const char*)src, (char*)dst, (int)src_size, + (int)dst_capacity, 2 /* acceleration */); + if (result <= 0) { + return 0; + } + return (SizeT)result; +} diff --git a/tracegrind/tg_lz4.h b/tracegrind/tg_lz4.h new file mode 100644 index 000000000..7e127c0b2 --- /dev/null +++ b/tracegrind/tg_lz4.h @@ -0,0 +1,23 @@ +/* + * LZ4 compression wrapper for Tracegrind. + * Uses vendored LZ4 library adapted for Valgrind (no libc). + */ + +#ifndef TG_LZ4_H +#define TG_LZ4_H + +#include "pub_tool_basics.h" + +/* Return the maximum compressed size for a given source length */ +SizeT tg_lz4_compress_bound(SizeT src_size); + +/* Compress src[0..src_size-1] into dst. + * dst_capacity must be >= tg_lz4_compress_bound(src_size). + * Returns the compressed size on success, 0 on error. 
+ */ +SizeT tg_lz4_compress(void* dst, + SizeT dst_capacity, + const void* src, + SizeT src_size); + +#endif /* TG_LZ4_H */ diff --git a/tracegrind/tg_msgpack.c b/tracegrind/tg_msgpack.c new file mode 100644 index 000000000..aa202f739 --- /dev/null +++ b/tracegrind/tg_msgpack.c @@ -0,0 +1,210 @@ +/* + * Minimal MsgPack encoder for Tracegrind. + * Write-only, adapted for Valgrind (no libc). + * + * MsgPack format spec: https://github.com/msgpack/msgpack/blob/master/spec.md + */ + +#include "pub_tool_basics.h" +#include "pub_tool_libcassert.h" +#include "pub_tool_libcbase.h" +#include "pub_tool_mallocfree.h" + +#include "tg_msgpack.h" + +/* Ensure at least `needed` bytes of capacity */ +static void msgpack_ensure(msgpack_buffer* mb, Int needed) +{ + if (mb->size + needed <= mb->capacity) + return; + Int new_cap = mb->capacity * 2; + if (new_cap < mb->size + needed) + new_cap = mb->size + needed; + mb->data = VG_(realloc)("tg.msgpack.buf", mb->data, new_cap); + mb->capacity = new_cap; +} + +static void write_byte(msgpack_buffer* mb, UChar b) +{ + msgpack_ensure(mb, 1); + mb->data[mb->size++] = b; +} + +static void write_bytes(msgpack_buffer* mb, const void* data, Int len) +{ + msgpack_ensure(mb, len); + VG_(memcpy)(mb->data + mb->size, data, len); + mb->size += len; +} + +/* Write big-endian integers */ +static void write_be16(msgpack_buffer* mb, UShort val) +{ + UChar buf[2]; + buf[0] = (UChar)(val >> 8); + buf[1] = (UChar)(val); + write_bytes(mb, buf, 2); +} + +static void write_be32(msgpack_buffer* mb, UInt val) +{ + UChar buf[4]; + buf[0] = (UChar)(val >> 24); + buf[1] = (UChar)(val >> 16); + buf[2] = (UChar)(val >> 8); + buf[3] = (UChar)(val); + write_bytes(mb, buf, 4); +} + +static void write_be64(msgpack_buffer* mb, ULong val) +{ + UChar buf[8]; + buf[0] = (UChar)(val >> 56); + buf[1] = (UChar)(val >> 48); + buf[2] = (UChar)(val >> 40); + buf[3] = (UChar)(val >> 32); + buf[4] = (UChar)(val >> 24); + buf[5] = (UChar)(val >> 16); + buf[6] = (UChar)(val >> 8); + buf[7] = (UChar)(val); + write_bytes(mb, buf, 8); +} + +void msgpack_init(msgpack_buffer* mb, Int capacity) +{ + if (capacity < 256) + capacity = 256; + mb->data = VG_(malloc)("tg.msgpack.init", capacity); + mb->size = 0; + mb->capacity = capacity; +} + +void msgpack_free(msgpack_buffer* mb) +{ + if (mb->data) { + VG_(free)(mb->data); + mb->data = NULL; + } + mb->size = 0; + mb->capacity = 0; +} + +void msgpack_reset(msgpack_buffer* mb) { mb->size = 0; } + +void msgpack_write_nil(msgpack_buffer* mb) { write_byte(mb, 0xc0); } + +void msgpack_write_bool(msgpack_buffer* mb, Bool val) +{ + write_byte(mb, val ? 
0xc3 : 0xc2); +} + +void msgpack_write_int(msgpack_buffer* mb, Long val) +{ + if (val >= 0) { + msgpack_write_uint(mb, (ULong)val); + } else if (val >= -32) { + /* negative fixint: 111xxxxx */ + write_byte(mb, (UChar)(val & 0xff)); + } else if (val >= -128) { + write_byte(mb, 0xd0); /* int8 */ + write_byte(mb, (UChar)(val & 0xff)); + } else if (val >= -32768) { + write_byte(mb, 0xd1); /* int16 */ + write_be16(mb, (UShort)(val & 0xffff)); + } else if (val >= -2147483648LL) { + write_byte(mb, 0xd2); /* int32 */ + write_be32(mb, (UInt)(val & 0xffffffff)); + } else { + write_byte(mb, 0xd3); /* int64 */ + write_be64(mb, (ULong)val); + } +} + +void msgpack_write_uint(msgpack_buffer* mb, ULong val) +{ + if (val <= 0x7f) { + /* positive fixint: 0xxxxxxx */ + write_byte(mb, (UChar)val); + } else if (val <= 0xff) { + write_byte(mb, 0xcc); /* uint8 */ + write_byte(mb, (UChar)val); + } else if (val <= 0xffff) { + write_byte(mb, 0xcd); /* uint16 */ + write_be16(mb, (UShort)val); + } else if (val <= 0xffffffff) { + write_byte(mb, 0xce); /* uint32 */ + write_be32(mb, (UInt)val); + } else { + write_byte(mb, 0xcf); /* uint64 */ + write_be64(mb, val); + } +} + +void msgpack_write_str(msgpack_buffer* mb, const HChar* str, Int len) +{ + if (len < 0) + len = VG_(strlen)(str); + + if (len <= 31) { + /* fixstr: 101xxxxx */ + write_byte(mb, (UChar)(0xa0 | len)); + } else if (len <= 0xff) { + write_byte(mb, 0xd9); /* str8 */ + write_byte(mb, (UChar)len); + } else if (len <= 0xffff) { + write_byte(mb, 0xda); /* str16 */ + write_be16(mb, (UShort)len); + } else { + write_byte(mb, 0xdb); /* str32 */ + write_be32(mb, (UInt)len); + } + write_bytes(mb, str, len); +} + +void msgpack_write_bin(msgpack_buffer* mb, const UChar* data, Int len) +{ + if (len <= 0xff) { + write_byte(mb, 0xc4); /* bin8 */ + write_byte(mb, (UChar)len); + } else if (len <= 0xffff) { + write_byte(mb, 0xc5); /* bin16 */ + write_be16(mb, (UShort)len); + } else { + write_byte(mb, 0xc6); /* bin32 */ + write_be32(mb, (UInt)len); + } + write_bytes(mb, data, len); +} + +void msgpack_write_array_header(msgpack_buffer* mb, UInt count) +{ + if (count <= 15) { + /* fixarray: 1001xxxx */ + write_byte(mb, (UChar)(0x90 | count)); + } else if (count <= 0xffff) { + write_byte(mb, 0xdc); /* array16 */ + write_be16(mb, (UShort)count); + } else { + write_byte(mb, 0xdd); /* array32 */ + write_be32(mb, count); + } +} + +void msgpack_write_map_header(msgpack_buffer* mb, UInt count) +{ + if (count <= 15) { + /* fixmap: 1000xxxx */ + write_byte(mb, (UChar)(0x80 | count)); + } else if (count <= 0xffff) { + write_byte(mb, 0xde); /* map16 */ + write_be16(mb, (UShort)count); + } else { + write_byte(mb, 0xdf); /* map32 */ + write_be32(mb, count); + } +} + +void msgpack_write_key(msgpack_buffer* mb, const HChar* key) +{ + msgpack_write_str(mb, key, -1); +} diff --git a/tracegrind/tg_msgpack.h b/tracegrind/tg_msgpack.h new file mode 100644 index 000000000..ae447970b --- /dev/null +++ b/tracegrind/tg_msgpack.h @@ -0,0 +1,36 @@ +/* + * Minimal MsgPack encoder for Tracegrind. + * Write-only, adapted for Valgrind (no libc). 
+ */ + +#ifndef TG_MSGPACK_H +#define TG_MSGPACK_H + +#include "pub_tool_basics.h" + +typedef struct { + UChar* data; + Int size; + Int capacity; +} msgpack_buffer; + +void msgpack_init(msgpack_buffer* mb, Int capacity); +void msgpack_free(msgpack_buffer* mb); +void msgpack_reset(msgpack_buffer* mb); + +/* Encode primitives */ +void msgpack_write_nil(msgpack_buffer* mb); +void msgpack_write_bool(msgpack_buffer* mb, Bool val); +void msgpack_write_int(msgpack_buffer* mb, Long val); +void msgpack_write_uint(msgpack_buffer* mb, ULong val); +void msgpack_write_str(msgpack_buffer* mb, const HChar* str, Int len); +void msgpack_write_bin(msgpack_buffer* mb, const UChar* data, Int len); + +/* Containers */ +void msgpack_write_array_header(msgpack_buffer* mb, UInt count); +void msgpack_write_map_header(msgpack_buffer* mb, UInt count); + +/* Convenience: write a string key (for maps) */ +void msgpack_write_key(msgpack_buffer* mb, const HChar* key); + +#endif /* TG_MSGPACK_H */ diff --git a/tracegrind/threads.c b/tracegrind/threads.c new file mode 100644 index 000000000..eaac68851 --- /dev/null +++ b/tracegrind/threads.c @@ -0,0 +1,424 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- threads.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "global.h" + +#include "pub_tool_threadstate.h" + +/* forward decls */ +static exec_state* exec_state_save(void); +static exec_state* exec_state_restore(void); +static exec_state* push_exec_state(int); +static exec_state* top_exec_state(void); + +static exec_stack current_states; + +/*------------------------------------------------------------*/ +/*--- Support for multi-threading ---*/ +/*------------------------------------------------------------*/ + +/* + * For Valgrind, MT is cooperative (no preempting in our code), + * so we don't need locks... + * + * Per-thread data: + * - BBCCs + * - call stack + * - call hash + * - event counters: last, current + * + * Even when ignoring MT, we need these functions to set up some + * data structures for the process (= Thread 1).
+ */ + +/* current running thread */ +ThreadId TG_(current_tid); + +static thread_info** thread; + +thread_info** TG_(get_threads)(void) { return thread; } + +thread_info* TG_(get_current_thread)(void) { return thread[TG_(current_tid)]; } + +void TG_(init_threads)(void) +{ + UInt i; + + thread = TG_MALLOC("cl.threads.it.1", VG_N_THREADS * sizeof thread[0]); + + for (i = 0; i < VG_N_THREADS; i++) + thread[i] = 0; + TG_(current_tid) = VG_INVALID_THREADID; +} + +/* switches through all threads and calls func */ +void TG_(forall_threads)(void (*func)(thread_info*)) +{ + Int t, orig_tid = TG_(current_tid); + + for (t = 1; t < VG_N_THREADS; t++) { + if (!thread[t]) + continue; + TG_(switch_thread)(t); + (*func)(thread[t]); + } + TG_(switch_thread)(orig_tid); +} + +static thread_info* new_thread(void) +{ + thread_info* t; + + t = (thread_info*)TG_MALLOC("cl.threads.nt.1", sizeof(thread_info)); + + /* init state */ + TG_(init_exec_stack)(&(t->states)); + TG_(init_call_stack)(&(t->calls)); + TG_(init_fn_stack)(&(t->fns)); + /* t->states.entry[0]->cxt = TG_(get_cxt)(t->fns.bottom); */ + + /* event counters */ + t->lastdump_cost = TG_(get_eventset_cost)(TG_(sets).full); + TG_(init_cost)(TG_(sets).full, t->lastdump_cost); + + /* CSV trace: per-thread sample snapshot (allocated lazily in + * trace_emit_sample) */ + t->last_sample_cost = 0; + + /* init data containers */ + TG_(init_fn_array)(&(t->fn_active)); + TG_(init_bbcc_hash)(&(t->bbccs)); + TG_(init_jcc_hash)(&(t->jccs)); + + return t; +} + +void TG_(switch_thread)(ThreadId tid) +{ + if (tid == TG_(current_tid)) + return; + + TG_DEBUG(0, ">> thread %u (was %u)\n", tid, TG_(current_tid)); + + if (TG_(current_tid) != VG_INVALID_THREADID) { + /* save thread state */ + thread_info* t = thread[TG_(current_tid)]; + + TG_ASSERT(t != 0); + + /* current context (including signal handler contexts) */ + exec_state_save(); + TG_(copy_current_exec_stack)(&(t->states)); + TG_(copy_current_call_stack)(&(t->calls)); + TG_(copy_current_fn_stack)(&(t->fns)); + + TG_(copy_current_fn_array)(&(t->fn_active)); + /* If we cumulate costs of threads, use TID 1 for all jccs/bccs */ + if (!TG_(clo).separate_threads) + t = thread[1]; + TG_(copy_current_bbcc_hash)(&(t->bbccs)); + TG_(copy_current_jcc_hash)(&(t->jccs)); + } + + TG_(current_tid) = tid; + TG_ASSERT(tid < VG_N_THREADS); + + if (tid != VG_INVALID_THREADID) { + thread_info* t; + + /* load thread state */ + + if (thread[tid] == 0) + thread[tid] = new_thread(); + t = thread[tid]; + + /* current context (including signal handler contexts) */ + TG_(set_current_exec_stack)(&(t->states)); + exec_state_restore(); + TG_(set_current_call_stack)(&(t->calls)); + TG_(set_current_fn_stack)(&(t->fns)); + + TG_(set_current_fn_array)(&(t->fn_active)); + /* If we cumulate costs of threads, use TID 1 for all jccs/bccs */ + if (!TG_(clo).separate_threads) + t = thread[1]; + TG_(set_current_bbcc_hash)(&(t->bbccs)); + TG_(set_current_jcc_hash)(&(t->jccs)); + } +} + +void TG_(run_thread)(ThreadId tid) { TG_(switch_thread)(tid); } + +void TG_(pre_signal)(ThreadId tid, Int sigNum, Bool alt_stack) +{ + exec_state* es; + + TG_DEBUG(0, ">> pre_signal(TID %u, sig %d, alt_st %s)\n", tid, sigNum, + alt_stack ? 
"yes" : "no"); + + /* switch to the thread the handler runs in */ + TG_(switch_thread)(tid); + + /* save current execution state */ + exec_state_save(); + + /* setup new cxtinfo struct for this signal handler */ + es = push_exec_state(sigNum); + TG_(zero_cost)(TG_(sets).full, es->cost); + TG_(current_state).cost = es->cost; + es->call_stack_bottom = TG_(current_call_stack).sp; + + /* setup current state for a spontaneous call */ + TG_(init_exec_state)(&TG_(current_state)); + TG_(current_state).sig = sigNum; + TG_(push_cxt)(0); +} + +/* Run post-signal if the stackpointer for call stack is at + * the bottom in current exec state (e.g. a signal handler) + * + * Called from TG_(pop_call_stack) + */ +void TG_(run_post_signal_on_call_stack_bottom)(void) +{ + exec_state* es = top_exec_state(); + TG_ASSERT(es != 0); + TG_ASSERT(TG_(current_state).sig > 0); + + if (TG_(current_call_stack).sp == es->call_stack_bottom) + TG_(post_signal)(TG_(current_tid), TG_(current_state).sig); +} + +void TG_(post_signal)(ThreadId tid, Int sigNum) +{ + exec_state* es; + UInt fn_number, *pactive; + + TG_DEBUG(0, ">> post_signal(TID %u, sig %d)\n", tid, sigNum); + + /* thread switching potentially needed, eg. with instrumentation off */ + TG_(switch_thread)(tid); + TG_ASSERT(sigNum == TG_(current_state).sig); + + /* Unwind call stack of this signal handler. + * This should only be needed at finalisation time + */ + es = top_exec_state(); + TG_ASSERT(es != 0); + while (TG_(current_call_stack).sp > es->call_stack_bottom) + TG_(pop_call_stack)(); + + if (TG_(current_state).cxt) { + /* correct active counts */ + fn_number = TG_(current_state).cxt->fn[0]->number; + pactive = TG_(get_fn_entry)(fn_number); + (*pactive)--; + TG_DEBUG(0, " set active count of %s back to %u\n", + TG_(current_state).cxt->fn[0]->name, *pactive); + } + + if (TG_(current_fn_stack).top > TG_(current_fn_stack).bottom) { + /* set fn_stack_top back. + * top can point to 0 if nothing was executed in the signal handler; + * this is possible at end on unwinding handlers. + */ + if (*(TG_(current_fn_stack).top) != 0) { + TG_(current_fn_stack).top--; + TG_ASSERT(*(TG_(current_fn_stack).top) == 0); + } + if (TG_(current_fn_stack).top > TG_(current_fn_stack).bottom) + TG_(current_fn_stack).top--; + } + + /* zero signal handler costs before restoring previous context */ + TG_ASSERT(TG_(current_state).cost == es->cost); + TG_(zero_cost)(TG_(sets).full, TG_(current_state).cost); + + /* restore previous context */ + es->sig = -1; + current_states.sp--; + es = top_exec_state(); + TG_(current_state).sig = es->sig; + exec_state_restore(); + + /* There is no way to reliable get the thread ID we are switching to + * after this handler returns. So we sync with actual TID at start of + * TG_(setup_bb)(), which should be the next for tracegrind. + */ +} + +/*------------------------------------------------------------*/ +/*--- Execution states in a thread & signal handlers ---*/ +/*------------------------------------------------------------*/ + +/* Each thread can be interrupted by a signal handler, and they + * themselves again. But as there's no scheduling among handlers + * of the same thread, we don't need additional stacks. + * So storing execution contexts and + * adding separators in the callstack(needed to not intermix normal/handler + * functions in contexts) should be enough. 
+ */ + +/* not initialized: call_stack_bottom, sig */ +void TG_(init_exec_state)(exec_state* es) +{ + es->collect = TG_(clo).collect_atstart; + es->cxt = 0; + es->jmps_passed = 0; + es->bbcc = 0; + es->nonskipped = 0; +} + +static exec_state* new_exec_state(Int sigNum) +{ + exec_state* es; + es = (exec_state*)TG_MALLOC("cl.threads.nes.1", sizeof(exec_state)); + + /* allocate real cost space: needed as incremented by + * simulation functions */ + es->cost = TG_(get_eventset_cost)(TG_(sets).full); + TG_(zero_cost)(TG_(sets).full, es->cost); + TG_(init_exec_state)(es); + es->sig = sigNum; + es->call_stack_bottom = 0; + + return es; +} + +void TG_(init_exec_stack)(exec_stack* es) +{ + Int i; + + /* The first element is for the main thread */ + es->entry[0] = new_exec_state(0); + for (i = 1; i < MAX_SIGHANDLERS; i++) + es->entry[i] = 0; + es->sp = 0; +} + +void TG_(copy_current_exec_stack)(exec_stack* dst) +{ + Int i; + + dst->sp = current_states.sp; + for (i = 0; i < MAX_SIGHANDLERS; i++) + dst->entry[i] = current_states.entry[i]; +} + +void TG_(set_current_exec_stack)(exec_stack* dst) +{ + Int i; + + current_states.sp = dst->sp; + for (i = 0; i < MAX_SIGHANDLERS; i++) + current_states.entry[i] = dst->entry[i]; +} + +/* Get top context info struct of current thread */ +static exec_state* top_exec_state(void) +{ + Int sp = current_states.sp; + exec_state* es; + + TG_ASSERT((sp >= 0) && (sp < MAX_SIGHANDLERS)); + es = current_states.entry[sp]; + TG_ASSERT(es != 0); + return es; +} + +/* Allocates a free context info structure for a new entered + * signal handler, putting it on the context stack. + * Returns a pointer to the structure. + */ +static exec_state* push_exec_state(int sigNum) +{ + Int sp; + exec_state* es; + + current_states.sp++; + sp = current_states.sp; + + TG_ASSERT((sigNum > 0) && (sigNum <= _VKI_NSIG)); + TG_ASSERT((sp > 0) && (sp < MAX_SIGHANDLERS)); + es = current_states.entry[sp]; + if (!es) { + es = new_exec_state(sigNum); + current_states.entry[sp] = es; + } else + es->sig = sigNum; + + return es; +} + +/* Save current context to top cxtinfo struct */ +static exec_state* exec_state_save(void) +{ + exec_state* es = top_exec_state(); + + es->cxt = TG_(current_state).cxt; + es->collect = TG_(current_state).collect; + es->jmps_passed = TG_(current_state).jmps_passed; + es->bbcc = TG_(current_state).bbcc; + es->nonskipped = TG_(current_state).nonskipped; + TG_ASSERT(es->cost == TG_(current_state).cost); + + TG_DEBUGIF(1) + { + TG_DEBUG(1, " cxtinfo_save(sig %d): collect %s, jmps_passed %d\n", + es->sig, es->collect ? "Yes" : "No", es->jmps_passed); + TG_(print_bbcc)(-9, es->bbcc); + TG_(print_cost)(-9, TG_(sets).full, es->cost); + } + + /* signal number does not need to be saved */ + TG_ASSERT(TG_(current_state).sig == es->sig); + + return es; +} + +static exec_state* exec_state_restore(void) +{ + exec_state* es = top_exec_state(); + + TG_(current_state).cxt = es->cxt; + TG_(current_state).collect = es->collect; + TG_(current_state).jmps_passed = es->jmps_passed; + TG_(current_state).bbcc = es->bbcc; + TG_(current_state).nonskipped = es->nonskipped; + TG_(current_state).cost = es->cost; + TG_(current_state).sig = es->sig; + + TG_DEBUGIF(1) + { + TG_DEBUG(1, " exec_state_restore(sig %d): collect %s, jmps_passed %d\n", + es->sig, es->collect ? 
"Yes" : "No", es->jmps_passed); + TG_(print_bbcc)(-9, es->bbcc); + TG_(print_cxt)(-9, es->cxt, 0); + TG_(print_cost)(-9, TG_(sets).full, es->cost); + } + + return es; +} diff --git a/tracegrind/tracegrind.h b/tracegrind/tracegrind.h new file mode 100644 index 000000000..f600cf2b7 --- /dev/null +++ b/tracegrind/tracegrind.h @@ -0,0 +1,129 @@ + +/* + ---------------------------------------------------------------- + + Notice that the following BSD-style license applies to this one + file (tracegrind.h) only. The rest of Valgrind is licensed under the + terms of the GNU General Public License, version 3, unless + otherwise indicated. See the COPYING file in the source + distribution for details. + + ---------------------------------------------------------------- + + This file is part of tracegrind, a valgrind tool for cache simulation + and streaming CSV trace output. + + Based on callgrind, Copyright (C) 2003-2017 Josef Weidendorfer. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. The origin of this software must not be misrepresented; you must + not claim that you wrote the original software. If you use this + software in a product, an acknowledgment in the product + documentation would be appreciated but is not required. + + 3. Altered source versions must be plainly marked as such, and must + not be misrepresented as being the original software. + + 4. The name of the author may not be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS + OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ---------------------------------------------------------------- + + Notice that the above BSD-style license applies to this one file + (tracegrind.h) only. The entire rest of Valgrind is licensed under + the terms of the GNU General Public License, version 3. See the + COPYING file in the source distribution for details. + + ---------------------------------------------------------------- +*/ + +#ifndef __TRACEGRIND_H +#define __TRACEGRIND_H + +#include "valgrind.h" + +/* !! ABIWARNING !! ABIWARNING !! ABIWARNING !! ABIWARNING !! + This enum comprises an ABI exported by Valgrind to programs + which use client requests. DO NOT CHANGE THE ORDER OF THESE + ENTRIES, NOR DELETE ANY -- add new ones at the end. + + The identification ('C','T') for Tracegrind has historical + reasons: it was called "Calltree" before. Besides, ('C','G') would + clash with cachegrind. We keep ('C','T') for compatibility with + callgrind client request macros. 
+ */ + +typedef enum { + VG_USERREQ__DUMP_STATS = VG_USERREQ_TOOL_BASE('C', 'T'), // ignored + VG_USERREQ__ZERO_STATS, // ignored + VG_USERREQ__TOGGLE_COLLECT, + VG_USERREQ__ADD_MARKER, + VG_USERREQ__START_INSTRUMENTATION, + VG_USERREQ__STOP_INSTRUMENTATION +} Vg_TracegrindClientRequest; + +/* Toggles the collection state. + The collection state specifies whether events should be noted or + ignored. Events are noted by incrementing counters in a cost center. + + Same as CALLGRIND_TOGGLE_COLLECT + */ +#define TRACEGRIND_TOGGLE_COLLECT \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__TOGGLE_COLLECT, 0, 0, 0, 0, 0) + +/* Add a named marker into the trace output. The argument is a string + that will be recorded as a marker label. + + Same as CALLGRIND_DUMP_STATS_AT + */ +#define TRACEGRIND_ADD_MARKER(marker_str) \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__ADD_MARKER, marker_str, 0, 0, \ + 0, 0) + +/* Start full tracegrind instrumentation if not already switched on. + When cache simulation is done, it will flush the simulated cache; + this will lead to an artificial cache warmup phase afterwards with + cache misses which would not have happened in reality. + + Same as CALLGRIND_START_INSTRUMENTATION + */ +#define TRACEGRIND_START_INSTRUMENTATION \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__START_INSTRUMENTATION, 0, 0, 0, \ + 0, 0) + +/* Stop full tracegrind instrumentation if not already switched off. + This flushes Valgrind's translation cache and does no additional + instrumentation afterwards, so the program effectively runs at the same + speed as the "none" tool (i.e. at minimal slowdown). + Use this to bypass Tracegrind aggregation for uninteresting code parts. + To start Tracegrind in this mode and skip the setup phase, use + the option "--instr-atstart=no". + + Same as CALLGRIND_STOP_INSTRUMENTATION + */ +#define TRACEGRIND_STOP_INSTRUMENTATION \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__STOP_INSTRUMENTATION, 0, 0, 0, \ + 0, 0) + +#endif /* __TRACEGRIND_H */
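
Usage sketch (illustrative only, not part of the patch): a client program combines these requests the same way the tests earlier in this patch do. The workload function hot_path below is made up for the example; it assumes tracegrind.h and valgrind.h are on the include path, and the requests are harmless no-ops when the binary is not run under Valgrind.

#include "tracegrind.h"

/* Made-up workload so there is something to count. */
static int __attribute__((noinline)) hot_path(int n)
{
    int sum = 0;
    for (int i = 0; i < n; i++)
        sum += i;
    return sum;
}

int main(void)
{
    TRACEGRIND_ADD_MARKER("setup-done");  /* named marker in the trace */
    TRACEGRIND_START_INSTRUMENTATION;     /* pairs with --instr-atstart=no */
    int r = hot_path(1000);
    TRACEGRIND_TOGGLE_COLLECT;            /* events no longer counted */
    r += hot_path(1000);
    TRACEGRIND_TOGGLE_COLLECT;            /* counting resumes */
    TRACEGRIND_STOP_INSTRUMENTATION;
    TRACEGRIND_ADD_MARKER("end");
    return r == 0;                        /* fail only if the workload collapsed */
}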
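
The records in the .msgpack.lz4 output are built with the tg_msgpack.c primitives above; the exact per-event layout is produced elsewhere in the tool, so the snippet below is only a standalone, libc-based illustration of the byte-level rules those primitives implement (fixmap, fixstr, positive fixint), with field names borrowed from the analyzer output for flavour.

#include <stdio.h>
#include <string.h>

static unsigned char buf[64];
static size_t pos;

static void put(unsigned char b) { buf[pos++] = b; }

/* fixstr: 0xa0 | length, for strings up to 31 bytes */
static void put_fixstr(const char* s)
{
    size_t len = strlen(s);
    put((unsigned char)(0xa0 | len));
    memcpy(buf + pos, s, len);
    pos += len;
}

int main(void)
{
    put(0x82);              /* fixmap with 2 key/value pairs */
    put_fixstr("tid");
    put(0x01);              /* positive fixint 1 */
    put_fixstr("event");
    put_fixstr("MARKER");

    for (size_t i = 0; i < pos; i++)
        printf("%02x ", buf[i]);
    printf("\n");  /* 82 a3 74 69 64 01 a5 65 76 65 6e 74 a6 4d 41 52 4b 45 52 */
    return 0;
}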