diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3ae16aa5d..97e933807 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,7 +8,8 @@ on: workflow_dispatch: jobs: - test-callgrind: + test: + timeout-minutes: 30 strategy: matrix: runner: @@ -16,6 +17,9 @@ jobs: ubuntu-version: 22.04 - platform: ubuntu-24.04 ubuntu-version: 24.04 + tool: + - callgrind + - tracegrind runs-on: ${{ matrix.runner.platform }} @@ -32,7 +36,7 @@ jobs: path-exclude /usr/share/man/* path-exclude /usr/share/info/* EOF - + - name: Update apt-get cache run: sudo apt-get update @@ -51,6 +55,10 @@ jobs: docbook-xml \ xsltproc + - name: Install uv + if: matrix.tool == 'tracegrind' + uses: astral-sh/setup-uv@v7 + - name: Run autogen run: ./autogen.sh @@ -63,11 +71,11 @@ jobs: - name: Build test dependencies run: | make -C tests arch_test os_test true - make -C callgrind/tests check + make -C ${{ matrix.tool }}/tests check - - name: Run Callgrind tests + - name: Run tests run: | - cd callgrind/tests + cd ${{ matrix.tool }}/tests TESTS=$(ls *.vgtest | grep -v bug497723.vgtest) perl ../../tests/vg_regtest --valgrind=../../vg-in-place $TESTS @@ -75,5 +83,5 @@ jobs: if: failure() uses: actions/upload-artifact@v4 with: - name: callgrind-test-logs-${{ matrix.runner.ubuntu-version }} - path: callgrind/tests/*.log + name: ${{ matrix.tool }}-test-logs-${{ matrix.runner.ubuntu-version }} + path: ${{ matrix.tool }}/tests/*.log diff --git a/.gitignore b/.gitignore index ea71bb0aa..132e768e7 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,8 @@ /autom4te.cache /bin /cachegrind.out.* +/callgrind.out.* +/tracegrind.out.* /compile /config.guess /config.h* @@ -17,6 +19,7 @@ /config.status /config.sub /configure +/configure~ /default.supp /depcomp /glibc-2.X.supp @@ -161,6 +164,31 @@ /callgrind/tests/inline-samefile /callgrind/tests/inline-crossfile +# /tracegrind/ +/tracegrind/*.so +/tracegrind/.deps +/tracegrind/tracegrind-*-darwin +/tracegrind/tracegrind-*-linux +/tracegrind/tracegrind-*-solaris +/tracegrind/tracegrind-*-freebsd +/tracegrind/Makefile +/tracegrind/Makefile.in + +# /tracegrind/tests/ +/tracegrind/tests/*.dSYM +/tracegrind/tests/*.post.diff* +/tracegrind/tests/*.post.out +/tracegrind/tests/*.stderr.diff* +/tracegrind/tests/*.stderr.out +/tracegrind/tests/*.stdout.diff* +/tracegrind/tests/*.stdout.out +/tracegrind/tests/.deps +/tracegrind/tests/Makefile +/tracegrind/tests/Makefile.in +/tracegrind/tests/tracegrind.out.* +/tracegrind/tests/fibo +/tracegrind/tests/*.bin + # /coregrind/ /coregrind/*.a /coregrind/*.dSYM diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..bc102d4a9 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,7 @@ +repos: + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v14.0.6 + hooks: + - id: clang-format + files: ^tracegrind/ + types_or: [c, c++] diff --git a/Makefile.am b/Makefile.am index 6c5b9f5b6..2cfe16d16 100644 --- a/Makefile.am +++ b/Makefile.am @@ -9,6 +9,7 @@ TOOLS = \ memcheck \ cachegrind \ callgrind \ + tracegrind \ helgrind \ drd \ massif \ diff --git a/bench/bench.py b/bench/bench.py index 18e2c472a..d577a67ec 100755 --- a/bench/bench.py +++ b/bench/bench.py @@ -41,16 +41,33 @@ def __init__( raise RuntimeError(f"Valgrind not found at: {self.valgrind_path}") self.valgrind_version = result.stdout.strip() - def run_valgrind(self, *args: str) -> None: - """Execute valgrind with given arguments. 
+ # Check which tools are available + self.available_tools = self._detect_available_tools() + + def _detect_available_tools(self) -> set: + """Detect which valgrind tools are available.""" + tools = set() + for tool in ["callgrind", "tracegrind"]: + result = subprocess.run( + [self.valgrind_path, f"--tool={tool}", "--help"], + capture_output=True, + text=True, + ) + if result.returncode == 0: + tools.add(tool) + return tools + + def run_valgrind(self, tool: str, *args: str) -> None: + """Execute valgrind with given tool and arguments. Args: + tool: Valgrind tool to use (callgrind, tracegrind) *args: Valgrind arguments """ cmd = [ self.valgrind_path, - "--tool=callgrind", + f"--tool={tool}", "--log-file=/dev/null", *args, *shlex.split(self.cmd), @@ -75,76 +92,119 @@ def runner(request): return request.config._valgrind_runner +CACHE_SIM_OPTIONS = [ + "--cache-sim=yes", + "--I1=32768,8,64", + "--D1=32768,8,64", + "--LL=8388608,16,64", +] + def pytest_generate_tests(metafunc): """Parametrize tests with valgrind configurations.""" - if "valgrind_args" in metafunc.fixturenames: + if "tool_and_args" in metafunc.fixturenames: runner = getattr(metafunc.config, "_valgrind_runner", None) if not runner: return - # Define valgrind configurations - configs = [ - (["--read-inline-info=no"], "no-inline"), - (["--read-inline-info=yes"], "inline"), + # Define configurations for each tool + # Format: (tool, args, config_name) + all_configs = [ + # Callgrind configurations + ("callgrind", ["--read-inline-info=no"], "cg/no-inline"), + ("callgrind", ["--read-inline-info=yes"], "cg/inline"), ( + "callgrind", [ + *CACHE_SIM_OPTIONS, "--trace-children=yes", - "--cache-sim=yes", - "--I1=32768,8,64", - "--D1=32768,8,64", - "--LL=8388608,16,64", "--collect-systime=nsec", "--compress-strings=no", "--combine-dumps=yes", "--dump-line=no", "--read-inline-info=yes", ], - "full-with-inline", + "cg/full-inline", ), ( + "callgrind", [ + *CACHE_SIM_OPTIONS, "--trace-children=yes", - "--cache-sim=yes", - "--I1=32768,8,64", - "--D1=32768,8,64", - "--LL=8388608,16,64", "--collect-systime=nsec", "--compress-strings=no", "--combine-dumps=yes", "--dump-line=no", + "--read-inline-info=no", + ], + "cg/full-no-inline", + ), + # Tracegrind configurations (only available in codspeed fork) + ("tracegrind", ["--read-inline-info=no"], "tg/no-inline"), + ("tracegrind", ["--read-inline-info=yes"], "tg/inline"), + ( + "tracegrind", + [ + *CACHE_SIM_OPTIONS, + "--trace-children=yes", + "--collect-systime=nsec", + "--read-inline-info=no", ], - "full-no-inline", + "tg/full-no-inline", + ), + ( + "tracegrind", + [ + *CACHE_SIM_OPTIONS, + "--trace-children=yes", + "--collect-systime=nsec", + "--read-inline-info=yes", + ], + "tg/full-inline", ), ] + # Filter configs to only include available tools + configs = [ + (tool, args, name) + for tool, args, name in all_configs + if tool in runner.available_tools + ] + + if not configs: + return + # If the valgrind version is from CodSpeed, we don't want to display the exact version - # to allow comparison against older versions. + # to allow comparison against older versions. 
if ".codspeed" in runner.valgrind_version: - runner.valgrind_version = "valgrind.codspeed" + runner.valgrind_version = "codspeed" + # Clean valgrind version names + else: + runner.valgrind_version.removeprefix("valgrind-") # Create test IDs with format: valgrind-version, command, config-name test_ids = [ - f"{runner.valgrind_version}, {runner.cmd}, {config_name}" - for _, config_name in configs + f"{runner.valgrind_version}/{config_name}, {runner.cmd}" + for _, _, config_name in configs ] - # Parametrize with just the args + # Parametrize with (tool, args) tuples metafunc.parametrize( - "valgrind_args", - [args for args, _ in configs], + "tool_and_args", + [(tool, args) for tool, args, _ in configs], ids=test_ids, ) @pytest.mark.benchmark -def test_valgrind(runner, valgrind_args): +def test_valgrind(runner, tool_and_args): if runner: - runner.run_valgrind(*valgrind_args) + tool, args = tool_and_args + runner.run_valgrind(tool, *args) def main(): parser = argparse.ArgumentParser( - description="Benchmark Valgrind with pytest-codspeed", + description="Benchmark Valgrind tools (callgrind, tracegrind) with pytest-codspeed", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: @@ -179,6 +239,7 @@ def main(): valgrind_path=args.valgrind_path, ) print(f"Valgrind version: {runner.valgrind_version}") + print(f"Available tools: {', '.join(sorted(runner.available_tools))}") print(f"Command: {args.cmd}") # Plugin to pass runner to tests @@ -187,7 +248,7 @@ def pytest_configure(self, config): config._valgrind_runner = runner exit_code = pytest.main( - [__file__, "-v", "--codspeed", "--codspeed-warmup-time=0", "--codspeed-max-time=5"], + [__file__, "-v", "--codspeed", "--codspeed-warmup-time=0", "--codspeed-max-time=30"], plugins=[RunnerPlugin()], ) if exit_code != 0 and exit_code != 5: diff --git a/configure.ac b/configure.ac index f3f3867ef..fcc1afea4 100644 --- a/configure.ac +++ b/configure.ac @@ -5807,6 +5807,8 @@ AC_CONFIG_FILES([ callgrind/callgrind_annotate callgrind/callgrind_control callgrind/tests/Makefile + tracegrind/Makefile + tracegrind/tests/Makefile helgrind/Makefile helgrind/tests/Makefile drd/Makefile diff --git a/tracegrind/Makefile.am b/tracegrind/Makefile.am new file mode 100644 index 000000000..562c4f6c6 --- /dev/null +++ b/tracegrind/Makefile.am @@ -0,0 +1,88 @@ +include $(top_srcdir)/Makefile.tool.am + +EXTRA_DIST = + +#---------------------------------------------------------------------------- +# Headers, etc +#---------------------------------------------------------------------------- + +pkginclude_HEADERS = tracegrind.h + +noinst_HEADERS = \ + costs.h \ + events.h \ + global.h \ + lz4.c \ + lz4.h \ + tg_lz4.h \ + tg_msgpack.h + +#---------------------------------------------------------------------------- +# tracegrind- +#---------------------------------------------------------------------------- + +noinst_PROGRAMS = tracegrind-@VGCONF_ARCH_PRI@-@VGCONF_OS@ +if VGCONF_HAVE_PLATFORM_SEC +noinst_PROGRAMS += tracegrind-@VGCONF_ARCH_SEC@-@VGCONF_OS@ +endif + +TRACEGRIND_SOURCES_COMMON = \ + bb.c \ + bbcc.c \ + callstack.c \ + clo.c \ + context.c \ + costs.c \ + debug.c \ + dump.c \ + events.c \ + fn.c \ + jumps.c \ + main.c \ + sim.c \ + threads.c \ + tg_lz4.c \ + tg_msgpack.c + +# We sneakily include "cg_branchpred.c" and "cg_arch.c" from cachegrind +TRACEGRIND_CFLAGS_COMMON = -I$(top_srcdir)/cachegrind + +tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_SOURCES = \ + $(TRACEGRIND_SOURCES_COMMON) +tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_CPPFLAGS = \ + 
$(AM_CPPFLAGS_@VGCONF_PLATFORM_PRI_CAPS@) +tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_CFLAGS = $(LTO_CFLAGS) \ + $(AM_CFLAGS_@VGCONF_PLATFORM_PRI_CAPS@) $(TRACEGRIND_CFLAGS_COMMON) +tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_DEPENDENCIES = \ + $(TOOL_DEPENDENCIES_@VGCONF_PLATFORM_PRI_CAPS@) +tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_LDADD = \ + $(TOOL_LDADD_@VGCONF_PLATFORM_PRI_CAPS@) +tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_LDFLAGS = \ + $(TOOL_LDFLAGS_@VGCONF_PLATFORM_PRI_CAPS@) +tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_LINK = \ + $(top_builddir)/coregrind/link_tool_exe_@VGCONF_OS@ \ + @VALT_LOAD_ADDRESS_PRI@ \ + $(LINK) \ + $(tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_CFLAGS) \ + $(tracegrind_@VGCONF_ARCH_PRI@_@VGCONF_OS@_LDFLAGS) + +if VGCONF_HAVE_PLATFORM_SEC +tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_SOURCES = \ + $(TRACEGRIND_SOURCES_COMMON) +tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_CPPFLAGS = \ + $(AM_CPPFLAGS_@VGCONF_PLATFORM_SEC_CAPS@) +tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_CFLAGS = $(LTO_CFLAGS) \ + $(AM_CFLAGS_@VGCONF_PLATFORM_SEC_CAPS@) $(TRACEGRIND_CFLAGS_COMMON) +tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_DEPENDENCIES = \ + $(TOOL_DEPENDENCIES_@VGCONF_PLATFORM_SEC_CAPS@) +tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LDADD = \ + $(TOOL_LDADD_@VGCONF_PLATFORM_SEC_CAPS@) +tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LDFLAGS = \ + $(TOOL_LDFLAGS_@VGCONF_PLATFORM_SEC_CAPS@) +tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LINK = \ + $(top_builddir)/coregrind/link_tool_exe_@VGCONF_OS@ \ + @VALT_LOAD_ADDRESS_SEC@ \ + $(LINK) \ + $(tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_CFLAGS) \ + $(tracegrind_@VGCONF_ARCH_SEC@_@VGCONF_OS@_LDFLAGS) +endif diff --git a/tracegrind/bb.c b/tracegrind/bb.c new file mode 100644 index 000000000..ff5f4111c --- /dev/null +++ b/tracegrind/bb.c @@ -0,0 +1,341 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- bb.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. 
+*/ + +#include "global.h" + +/*------------------------------------------------------------*/ +/*--- Basic block (BB) operations ---*/ +/*------------------------------------------------------------*/ + +/* BB hash, resizable */ +bb_hash bbs; + +void TG_(init_bb_hash)(void) +{ + Int i; + + bbs.size = 8437; + bbs.entries = 0; + bbs.table = (BB**)TG_MALLOC("cl.bb.ibh.1", bbs.size * sizeof(BB*)); + + for (i = 0; i < bbs.size; i++) + bbs.table[i] = NULL; +} + +bb_hash* TG_(get_bb_hash)(void) { return &bbs; } + +/* The hash stores BBs according to + * - ELF object (is 0 for code in anonymous mapping) + * - BB base as object file offset + */ +static __inline__ UInt bb_hash_idx(obj_node* obj, PtrdiffT offset, UInt size) +{ + return (((Addr)obj) + offset) % size; +} + +/* double size of bb table */ +static void resize_bb_table(void) +{ + Int i, new_size, conflicts1 = 0, conflicts2 = 0; + BB **new_table, *curr, *next; + UInt new_idx; + + new_size = 2 * bbs.size + 3; + new_table = (BB**)TG_MALLOC("cl.bb.rbt.1", new_size * sizeof(BB*)); + + for (i = 0; i < new_size; i++) + new_table[i] = NULL; + + for (i = 0; i < bbs.size; i++) { + if (bbs.table[i] == NULL) + continue; + + curr = bbs.table[i]; + while (NULL != curr) { + next = curr->next; + + new_idx = bb_hash_idx(curr->obj, curr->offset, new_size); + + curr->next = new_table[new_idx]; + new_table[new_idx] = curr; + if (curr->next) { + conflicts1++; + if (curr->next->next) + conflicts2++; + } + + curr = next; + } + } + + VG_(free)(bbs.table); + + TG_DEBUG(0, "Resize BB Hash: %u => %d (entries %u, conflicts %d/%d)\n", + bbs.size, new_size, bbs.entries, conflicts1, conflicts2); + + bbs.size = new_size; + bbs.table = new_table; + TG_(stat).bb_hash_resizes++; +} + +/** + * Allocate new BB structure (including space for event type list) + * Not initialized: + * - instr_len, cost_count, instr[] + */ +static BB* new_bb(obj_node* obj, + PtrdiffT offset, + UInt instr_count, + UInt cjmp_count, + Bool cjmp_inverted) +{ + BB* bb; + UInt idx, size; + + /* check fill degree of bb hash table and resize if needed (>80%) */ + bbs.entries++; + if (10 * bbs.entries / bbs.size > 8) + resize_bb_table(); + + size = sizeof(BB) + instr_count * sizeof(InstrInfo) + + (cjmp_count + 1) * sizeof(CJmpInfo); + bb = (BB*)TG_MALLOC("cl.bb.nb.1", size); + VG_(memset)(bb, 0, size); + + bb->obj = obj; + bb->offset = offset; + + bb->instr_count = instr_count; + bb->cjmp_count = cjmp_count; + bb->cjmp_inverted = cjmp_inverted; + bb->jmp = (CJmpInfo*)&(bb->instr[instr_count]); + bb->instr_len = 0; + bb->cost_count = 0; + bb->sect_kind = VG_(DebugInfo_sect_kind)(NULL, offset + obj->offset); + bb->fn = 0; + bb->line = 0; + bb->is_entry = 0; + bb->inl_fns = NULL; + bb->inl_depth = 0; + bb->bbcc_list = 0; + bb->last_bbcc = 0; + + /* insert into BB hash table */ + idx = bb_hash_idx(obj, offset, bbs.size); + bb->next = bbs.table[idx]; + bbs.table[idx] = bb; + + TG_(stat).distinct_bbs++; + +#if TG_ENABLE_DEBUG + TG_DEBUGIF(3) + { + VG_(printf)( + " new_bb (instr %u, jmps %u, inv %s) [now %d]: ", instr_count, + cjmp_count, cjmp_inverted ? 
"yes" : "no", TG_(stat).distinct_bbs); + TG_(print_bb)(0, bb); + VG_(printf)("\n"); + } +#endif + + TG_(get_fn_node)(bb); + + return bb; +} + +/* get the BB structure for a BB start address */ +static __inline__ BB* lookup_bb(obj_node* obj, PtrdiffT offset) +{ + BB* bb; + Int idx; + + idx = bb_hash_idx(obj, offset, bbs.size); + bb = bbs.table[idx]; + + while (bb) { + if ((bb->obj == obj) && (bb->offset == offset)) + break; + bb = bb->next; + } + + TG_DEBUG(5, " lookup_bb (Obj %s, off %#lx): %p\n", obj->name, (UWord)offset, + bb); + return bb; +} + +static __inline__ obj_node* obj_of_address(Addr addr) +{ + obj_node* obj; + DebugInfo* di; + PtrdiffT offset; + + DiEpoch ep = VG_(current_DiEpoch)(); + di = VG_(find_DebugInfo)(ep, addr); + obj = TG_(get_obj_node)(di); + + /* Update symbol offset in object if remapped */ + /* FIXME (or at least check this) 2008 Feb 19: 'offset' is + only correct for text symbols, not for data symbols */ + offset = di ? VG_(DebugInfo_get_text_bias)(di) : 0; + if (obj->offset != offset) { + Addr start = di ? VG_(DebugInfo_get_text_avma)(di) : 0; + + TG_DEBUG(0, "Mapping changed for '%s': %#lx -> %#lx\n", obj->name, + obj->start, start); + + /* Size should be the same, and offset diff == start diff */ + TG_ASSERT(obj->size == (di ? VG_(DebugInfo_get_text_size)(di) : 0)); + TG_ASSERT(obj->start - start == obj->offset - offset); + obj->offset = offset; + obj->start = start; + } + + return obj; +} + +/* Get the BB structure for a BB start address. + * If the BB has to be created, the IRBB is needed to + * compute the event type list for costs, and seen_before is + * set to False. Otherwise, seen_before is set to True. + * + * BBs are never discarded. There are 2 cases where this function + * is called from TG_(instrument)() and a BB already exists: + * - The instrumented version was removed from Valgrinds TT cache + * - The ELF object of the BB was unmapped and mapped again. + * This involves a possibly different address, but is handled by + * looking up a BB keyed by (obj_node, file offset). + * + * bbIn==0 is possible for artificial BB without real code. + * Such a BB is created when returning to an unknown function. + */ +BB* TG_(get_bb)(Addr addr, IRSB* bbIn, /*OUT*/ Bool* seen_before) +{ + BB* bb; + obj_node* obj; + UInt n_instrs, n_jmps; + Bool cjmp_inverted = False; + + TG_DEBUG(5, "+ get_bb(BB %#lx)\n", addr); + + obj = obj_of_address(addr); + bb = lookup_bb(obj, addr - obj->offset); + + n_instrs = 0; + n_jmps = 0; + TG_(collectBlockInfo)(bbIn, &n_instrs, &n_jmps, &cjmp_inverted); + + *seen_before = bb ? True : False; + if (*seen_before) { + if (bb->instr_count != n_instrs) { + VG_(message)(Vg_DebugMsg, + "ERROR: BB Retranslation Mismatch at BB %#lx\n", addr); + VG_(message)( + Vg_DebugMsg, " new: Obj %s, Off %#lx, BBOff %#lx, Instrs %u\n", + obj->name, (UWord)obj->offset, addr - obj->offset, n_instrs); + VG_(message)(Vg_DebugMsg, + " old: Obj %s, Off %#lx, BBOff %#lx, Instrs %u\n", + bb->obj->name, (UWord)bb->obj->offset, (UWord)bb->offset, + bb->instr_count); + TG_ASSERT(bb->instr_count == n_instrs); + } + TG_ASSERT(bb->cjmp_count == n_jmps); + TG_(stat).bb_retranslations++; + + TG_DEBUG(5, "- get_bb(BB %#lx): seen before.\n", addr); + return bb; + } + + bb = new_bb(obj, addr - obj->offset, n_instrs, n_jmps, cjmp_inverted); + + TG_DEBUG(5, "- get_bb(BB %#lx)\n", addr); + + return bb; +} + +/* Delete the BB info for the bb with unredirected entry-point + address 'addr'. 
*/ +void TG_(delete_bb)(Addr addr) +{ + BB *bb, *bp; + Int idx, size; + + obj_node* obj = obj_of_address(addr); + PtrdiffT offset = addr - obj->offset; + + idx = bb_hash_idx(obj, offset, bbs.size); + bb = bbs.table[idx]; + + /* bb points at the current bb under consideration, and bp is the + one before. */ + bp = NULL; + while (bb) { + if ((bb->obj == obj) && (bb->offset == offset)) + break; + bp = bb; + bb = bb->next; + } + + if (bb == NULL) { + TG_DEBUG(3, " delete_bb (Obj %s, off %#lx): NOT FOUND\n", obj->name, + (UWord)offset); + + /* we didn't find it. + * this happens when tracegrinds instrumentation mode + * was off at BB translation time, ie. no BB was created. + */ + return; + } + + /* unlink it from hash table */ + + if (bp == NULL) { + /* we found the first one in the list. */ + tl_assert(bb == bbs.table[idx]); + bbs.table[idx] = bb->next; + } else { + tl_assert(bb != bbs.table[idx]); + bp->next = bb->next; + } + + TG_DEBUG(3, " delete_bb (Obj %s, off %#lx): %p, BBCC head: %p\n", obj->name, + (UWord)offset, bb, bb->bbcc_list); + + if (bb->bbcc_list == 0) { + /* can be safely deleted */ + + if (bb->inl_fns) + VG_(free)(bb->inl_fns); + + /* Fill the block up with junk and then free it, so we will + hopefully get a segfault if it is used again by mistake. */ + size = sizeof(BB) + bb->instr_count * sizeof(InstrInfo) + + (bb->cjmp_count + 1) * sizeof(CJmpInfo); + VG_(memset)(bb, 0xAA, size); + TG_FREE(bb); + return; + } + TG_DEBUG(3, " delete_bb: BB in use, can not free!\n"); +} diff --git a/tracegrind/bbcc.c b/tracegrind/bbcc.c new file mode 100644 index 000000000..15143c621 --- /dev/null +++ b/tracegrind/bbcc.c @@ -0,0 +1,864 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- bbcc.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. 
+*/ + +#include "costs.h" +#include "global.h" + +#include "pub_tool_threadstate.h" + +/*------------------------------------------------------------*/ +/*--- BBCC operations ---*/ +/*------------------------------------------------------------*/ + +#define N_BBCC_INITIAL_ENTRIES 10437 + +/* BBCC table (key is BB/Context), per thread, resizable */ +bbcc_hash current_bbccs; + +void TG_(init_bbcc_hash)(bbcc_hash* bbccs) +{ + Int i; + + TG_ASSERT(bbccs != 0); + + bbccs->size = N_BBCC_INITIAL_ENTRIES; + bbccs->entries = 0; + bbccs->table = + (BBCC**)TG_MALLOC("cl.bbcc.ibh.1", bbccs->size * sizeof(BBCC*)); + + for (i = 0; i < bbccs->size; i++) + bbccs->table[i] = NULL; +} + +void TG_(copy_current_bbcc_hash)(bbcc_hash* dst) +{ + TG_ASSERT(dst != 0); + + dst->size = current_bbccs.size; + dst->entries = current_bbccs.entries; + dst->table = current_bbccs.table; +} + +bbcc_hash* TG_(get_current_bbcc_hash)(void) { return ¤t_bbccs; } + +void TG_(set_current_bbcc_hash)(bbcc_hash* h) +{ + TG_ASSERT(h != 0); + + current_bbccs.size = h->size; + current_bbccs.entries = h->entries; + current_bbccs.table = h->table; +} + +/* All BBCCs for recursion level 0 are inserted into a + * thread specific hash table with key + * - address of BB structure (unique, as never freed) + * - current context (includes caller chain) + * BBCCs for other recursion levels are in bbcc->rec_array. + * + * The hash is used in setup_bb(), i.e. to find the cost + * counters to be changed in the execution of a BB. + */ + +static __inline__ UInt bbcc_hash_idx(BB* bb, Context* cxt, UInt size) +{ + TG_ASSERT(bb != 0); + TG_ASSERT(cxt != 0); + + return ((Addr)bb + (Addr)cxt) % size; +} + +/* Lookup for a BBCC in hash. + */ +static BBCC* lookup_bbcc(BB* bb, Context* cxt) +{ + BBCC* bbcc = bb->last_bbcc; + UInt idx; + + /* check LRU */ + if (bbcc->cxt == cxt) { + if (!TG_(clo).separate_threads) { + /* if we don't dump threads separate, tid doesn't have to match */ + return bbcc; + } + if (bbcc->tid == TG_(current_tid)) + return bbcc; + } + + TG_(stat).bbcc_lru_misses++; + + idx = bbcc_hash_idx(bb, cxt, current_bbccs.size); + bbcc = current_bbccs.table[idx]; + while (bbcc && (bb != bbcc->bb || cxt != bbcc->cxt)) { + bbcc = bbcc->next; + } + + TG_DEBUG(2, " lookup_bbcc(BB %#lx, Cxt %u, fn '%s'): %p (tid %u)\n", + bb_addr(bb), cxt->base_number, cxt->fn[0]->name, bbcc, + bbcc ? 
bbcc->tid : 0); + + TG_DEBUGIF(2) + if (bbcc) + TG_(print_bbcc)(-2, bbcc); + + return bbcc; +} + +/* double size of hash table 1 (addr->BBCC) */ +static void resize_bbcc_hash(void) +{ + Int i, new_size, conflicts1 = 0, conflicts2 = 0; + BBCC** new_table; + UInt new_idx; + BBCC * curr_BBCC, *next_BBCC; + + new_size = 2 * current_bbccs.size + 3; + new_table = (BBCC**)TG_MALLOC("cl.bbcc.rbh.1", new_size * sizeof(BBCC*)); + + for (i = 0; i < new_size; i++) + new_table[i] = NULL; + + for (i = 0; i < current_bbccs.size; i++) { + if (current_bbccs.table[i] == NULL) + continue; + + curr_BBCC = current_bbccs.table[i]; + while (NULL != curr_BBCC) { + next_BBCC = curr_BBCC->next; + + new_idx = bbcc_hash_idx(curr_BBCC->bb, curr_BBCC->cxt, new_size); + + curr_BBCC->next = new_table[new_idx]; + new_table[new_idx] = curr_BBCC; + if (curr_BBCC->next) { + conflicts1++; + if (curr_BBCC->next->next) + conflicts2++; + } + + curr_BBCC = next_BBCC; + } + } + + VG_(free)(current_bbccs.table); + + TG_DEBUG(0, "Resize BBCC Hash: %u => %d (entries %u, conflicts %d/%d)\n", + current_bbccs.size, new_size, current_bbccs.entries, conflicts1, + conflicts2); + + current_bbccs.size = new_size; + current_bbccs.table = new_table; + TG_(stat).bbcc_hash_resizes++; +} + +static __inline BBCC** new_recursion(int size) +{ + BBCC** bbccs; + int i; + + bbccs = (BBCC**)TG_MALLOC("cl.bbcc.nr.1", sizeof(BBCC*) * size); + for (i = 0; i < size; i++) + bbccs[i] = 0; + + TG_DEBUG(3, " new_recursion(size %d): %p\n", size, bbccs); + + return bbccs; +} + +/* + * Allocate a new BBCC + * + * Uninitialized: + * cxt, rec_index, rec_array, next_bbcc, next1, next2 + */ +static __inline__ BBCC* new_bbcc(BB* bb) +{ + BBCC* bbcc; + Int i; + + /* We need cjmp_count+1 JmpData structs: + * the last is for the unconditional jump/call/ret at end of BB + */ + bbcc = (BBCC*)TG_MALLOC("cl.bbcc.nb.1", sizeof(BBCC) + (bb->cjmp_count + 1) * + sizeof(JmpData)); + bbcc->bb = bb; + bbcc->tid = TG_(current_tid); + + bbcc->skipped = 0; + bbcc->cost = TG_(get_costarray)(bb->cost_count); + for (i = 0; i < bb->cost_count; i++) + bbcc->cost[i] = 0; + for (i = 0; i <= bb->cjmp_count; i++) { + bbcc->jmp[i].ecounter = 0; + bbcc->jmp[i].jcc_list = 0; + } + bbcc->ecounter_sum = 0; + + /* Init pointer caches (LRU) */ + bbcc->lru_next_bbcc = 0; + bbcc->lru_from_jcc = 0; + bbcc->lru_to_jcc = 0; + + TG_(stat).distinct_bbccs++; + + TG_DEBUG(3, " new_bbcc(BB %#lx): %p (now %d)\n", bb_addr(bb), bbcc, + TG_(stat).distinct_bbccs); + + return bbcc; +} + +/** + * Inserts a new BBCC into hashes. + * BBCC specific items must be set as this is used for the hash + * keys: + * fn : current function + * tid : current thread ID + * from : position where current function is called from + * + * Recursion level doesn't need to be set as this is not included + * in the hash key: Only BBCCs with rec level 0 are in hashes. + */ +static void insert_bbcc_into_hash(BBCC* bbcc) +{ + UInt idx; + + TG_ASSERT(bbcc->cxt != 0); + + TG_DEBUG(3, "+ insert_bbcc_into_hash(BB %#lx, fn '%s')\n", bb_addr(bbcc->bb), + bbcc->cxt->fn[0]->name); + + /* check fill degree of hash and resize if needed (>90%) */ + current_bbccs.entries++; + if (100 * current_bbccs.entries / current_bbccs.size > 90) + resize_bbcc_hash(); + + idx = bbcc_hash_idx(bbcc->bb, bbcc->cxt, current_bbccs.size); + bbcc->next = current_bbccs.table[idx]; + current_bbccs.table[idx] = bbcc; + + TG_DEBUG(3, "- insert_bbcc_into_hash: %u entries\n", current_bbccs.entries); +} + +/* String is returned in a dynamically allocated buffer. 
Caller is + responsible for free'ing it. */ +static HChar* mangled_cxt(const Context* cxt, Int rec_index) +{ + Int i, p; + + if (!cxt) + return VG_(strdup)("cl.bbcc.mcxt", "(no context)"); + + /* Overestimate the number of bytes we need to hold the string. */ + SizeT need = 20; // rec_index + nul-terminator + for (i = 0; i < cxt->size; ++i) + need += VG_(strlen)(cxt->fn[i]->name) + 1; // 1 for leading ' + + HChar* mangled = TG_MALLOC("cl.bbcc.mcxt", need); + p = VG_(sprintf)(mangled, "%s", cxt->fn[0]->name); + if (rec_index > 0) + p += VG_(sprintf)(mangled + p, "'%d", rec_index + 1); + for (i = 1; i < cxt->size; i++) + p += VG_(sprintf)(mangled + p, "'%s", cxt->fn[i]->name); + + return mangled; +} + +/* Create a new BBCC as a copy of an existing one, + * but with costs set to 0 and jcc chains empty. + * + * This is needed when a BB is executed in another context than + * the one at instrumentation time of the BB. + * + * Use cases: + * rec_index == 0: clone from a BBCC with differing tid/cxt + * and insert into hashes + * rec_index >0 : clone from a BBCC with same tid/cxt and rec_index 0 + * don't insert into hashes + */ +static BBCC* clone_bbcc(BBCC* orig, Context* cxt, Int rec_index) +{ + BBCC* bbcc; + + TG_DEBUG(3, "+ clone_bbcc(BB %#lx, rec %d, fn %s)\n", bb_addr(orig->bb), + rec_index, cxt->fn[0]->name); + + bbcc = new_bbcc(orig->bb); + + if (rec_index == 0) { + + /* hash insertion is only allowed if tid or cxt is different */ + TG_ASSERT((orig->tid != TG_(current_tid)) || (orig->cxt != cxt)); + + bbcc->rec_index = 0; + bbcc->cxt = cxt; + bbcc->rec_array = new_recursion(cxt->fn[0]->separate_recursions); + bbcc->rec_array[0] = bbcc; + + insert_bbcc_into_hash(bbcc); + } else { + if (TG_(clo).separate_threads) + TG_ASSERT(orig->tid == TG_(current_tid)); + + TG_ASSERT(orig->cxt == cxt); + TG_ASSERT(orig->rec_array); + TG_ASSERT(cxt->fn[0]->separate_recursions > rec_index); + TG_ASSERT(orig->rec_array[rec_index] == 0); + + /* new BBCC will only have differing recursion level */ + bbcc->rec_index = rec_index; + bbcc->cxt = cxt; + bbcc->rec_array = orig->rec_array; + bbcc->rec_array[rec_index] = bbcc; + } + + /* update list of BBCCs for same BB */ + bbcc->next_bbcc = orig->bb->bbcc_list; + orig->bb->bbcc_list = bbcc; + + TG_DEBUGIF(3) + TG_(print_bbcc)(-2, bbcc); + + HChar* mangled_orig = mangled_cxt(orig->cxt, orig->rec_index); + HChar* mangled_bbcc = mangled_cxt(bbcc->cxt, bbcc->rec_index); + TG_DEBUG(2, + "- clone_BBCC(%p, %d) for BB %#lx\n" + " orig %s\n" + " new %s\n", + orig, rec_index, bb_addr(orig->bb), mangled_orig, mangled_bbcc); + TG_FREE(mangled_orig); + TG_FREE(mangled_bbcc); + + TG_(stat).bbcc_clones++; + + return bbcc; +}; + +/* Get a pointer to the cost centre structure for given basic block + * address. If created, the BBCC is inserted into the BBCC hash. + * Also sets BB_seen_before by reference. + * + */ +BBCC* TG_(get_bbcc)(BB* bb) +{ + BBCC* bbcc; + + TG_DEBUG(3, "+ get_bbcc(BB %#lx)\n", bb_addr(bb)); + + bbcc = bb->bbcc_list; + + if (!bbcc) { + bbcc = new_bbcc(bb); + + /* initialize BBCC */ + bbcc->cxt = 0; + bbcc->rec_array = 0; + bbcc->rec_index = 0; + + bbcc->next_bbcc = bb->bbcc_list; + bb->bbcc_list = bbcc; + bb->last_bbcc = bbcc; + + TG_DEBUGIF(3) + TG_(print_bbcc)(-2, bbcc); + } + + TG_DEBUG(3, "- get_bbcc(BB %#lx): BBCC %p\n", bb_addr(bb), bbcc); + + return bbcc; +} + +/* Tracegrind manages its own call stack for each thread. + * When leaving a function, a underflow can happen when + * Tracegrind's tracing was switched on in the middle of + * a run, i.e. 
when Tracegrind was not able to trace the + * call instruction. + * This function tries to reconstruct the original call. + * As we know the return address (the address following + * the CALL instruction), we can detect the function + * we return back to, but the original call site is unknown. + * We suppose a call site at return address - 1. + * (TODO: other heuristic: lookup info of instrumented BBs). + */ +static void handleUnderflow(BB* bb) +{ + /* RET at top of call stack */ + BBCC* source_bbcc; + BB* source_bb; + Bool seen_before; + fn_node* caller; + int fn_number; + unsigned* pactive; + call_entry* call_entry_up; + + TG_DEBUG(1, " Callstack underflow !\n"); + + /* we emulate an old call from the function we return to + * by using ( -1) */ + source_bb = TG_(get_bb)(bb_addr(bb) - 1, 0, &seen_before); + source_bbcc = TG_(get_bbcc)(source_bb); + + /* seen_before can be true if RET from a signal handler */ + if (!seen_before) { + source_bbcc->ecounter_sum = TG_(current_state).collect ? 1 : 0; + } else if (TG_(current_state).collect) + source_bbcc->ecounter_sum++; + + /* Force a new top context, will be set active by push_cxt() */ + TG_(current_fn_stack).top--; + TG_(current_state).cxt = 0; + caller = TG_(get_fn_node)(bb); + TG_(push_cxt)(caller); + + if (!seen_before) { + /* set rec array for source BBCC: this is at rec level 1 */ + source_bbcc->rec_array = new_recursion(caller->separate_recursions); + source_bbcc->rec_array[0] = source_bbcc; + + TG_ASSERT(source_bbcc->cxt == 0); + source_bbcc->cxt = TG_(current_state).cxt; + insert_bbcc_into_hash(source_bbcc); + } + TG_ASSERT(TG_(current_state).bbcc); + + /* correct active counts */ + fn_number = TG_(current_state).bbcc->cxt->fn[0]->number; + pactive = TG_(get_fn_entry)(fn_number); + (*pactive)--; + + /* This assertion is not correct for reentrant + * signal handlers */ + /* TG_ASSERT(*pactive == 0); */ + + TG_(current_state).nonskipped = 0; /* we didn't skip this function */ + /* back to current context */ + TG_(push_cxt)(TG_(current_state).bbcc->cxt->fn[0]); + TG_(push_call_stack) + (source_bbcc, 0, TG_(current_state).bbcc, (Addr)-1, False); + call_entry_up = + &(TG_(current_call_stack).entry[TG_(current_call_stack).sp - 1]); + /* assume this call is lasting since last dump or + * for a signal handler since it's call */ + if (TG_(current_state).sig == 0) + TG_(copy_cost) + (TG_(sets).full, call_entry_up->enter_cost, + TG_(get_current_thread)()->lastdump_cost); + else TG_(zero_cost)(TG_(sets).full, call_entry_up->enter_cost); +} + +/* + * Helper function called at start of each instrumented BB to setup + * pointer to costs for current thread/context/recursion level + */ + +VG_REGPARM(1) +void TG_(setup_bbcc)(BB* bb) +{ + BBCC * bbcc, *last_bbcc; + Bool call_emulation = False, delayed_push = False, skip = False; + Addr sp; + BB* last_bb; + ThreadId tid; + TgJumpKind jmpkind; + Bool isConditionalJump; + Int passed = 0, csp; + Bool ret_without_call = False; + Int popcount_on_return = 1; + + TG_DEBUG(3, "+ setup_bbcc(BB %#lx)\n", bb_addr(bb)); + + /* This is needed because thread switches can not reliable be tracked + * with callback TG_(run_thread) only: we have otherwise no way to get + * the thread ID after a signal handler returns. + * This could be removed again if that bug is fixed in Valgrind. + * This is in the hot path but hopefully not to costly. + */ + tid = VG_(get_running_tid)(); +#if 1 + /* TG_(switch_thread) is a no-op when tid is equal to TG_(current_tid). 
+ * As this is on the hot path, we only call TG_(switch_thread)(tid) + * if tid differs from the TG_(current_tid). + */ + if (UNLIKELY(tid != TG_(current_tid))) + TG_(switch_thread)(tid); +#else + TG_ASSERT(VG_(get_running_tid)() == TG_(current_tid)); +#endif + + sp = VG_(get_SP)(tid); + last_bbcc = TG_(current_state).bbcc; + last_bb = last_bbcc ? last_bbcc->bb : 0; + + if (last_bb) { + passed = TG_(current_state).jmps_passed; + TG_ASSERT(passed <= last_bb->cjmp_count); + jmpkind = last_bb->jmp[passed].jmpkind; + isConditionalJump = (passed < last_bb->cjmp_count); + + if (TG_(current_state).collect) { + if (!TG_(current_state).nonskipped) { + last_bbcc->ecounter_sum++; + last_bbcc->jmp[passed].ecounter++; + if (!TG_(clo).simulate_cache) { + /* update Ir cost */ + UInt instr_count = last_bb->jmp[passed].instr + 1; + TG_(current_state).cost[fullOffset(EG_IR)] += instr_count; + } + } else { + /* do not increment exe counter of BBs in skipped functions, as it + * would fool dumping code */ + if (!TG_(clo).simulate_cache) { + /* update Ir cost */ + UInt instr_count = last_bb->jmp[passed].instr + 1; + TG_(current_state).cost[fullOffset(EG_IR)] += instr_count; + TG_(current_state).nonskipped->skipped[fullOffset(EG_IR)] += + instr_count; + } + } + } + + TG_DEBUGIF(4) + { + TG_(print_execstate)(-2, &TG_(current_state)); + TG_(print_bbcc_cost)(-2, last_bbcc); + } + } else { + jmpkind = jk_None; + isConditionalJump = False; + } + + /* Manipulate JmpKind if needed, only using BB specific info */ + + csp = TG_(current_call_stack).sp; + + /* A return not matching the top call in our callstack is a jump */ + if ((jmpkind == jk_Return) && (csp > 0)) { + Int csp_up = csp - 1; + call_entry* top_ce = &(TG_(current_call_stack).entry[csp_up]); + + /* We have a real return if + * - the stack pointer (SP) left the current stack frame, or + * - SP has the same value as when reaching the current function + * and the address of this BB is the return address of last call + * (we even allow to leave multiple frames if the SP stays the + * same and we find a matching return address) + * The latter condition is needed because on PPC, SP can stay + * the same over CALL=b(c)l / RET=b(c)lr boundaries + */ + if (sp < top_ce->sp) + popcount_on_return = 0; + else if (top_ce->sp == sp) { + while (1) { + if (top_ce->ret_addr == bb_addr(bb)) + break; + if (csp_up > 0) { + csp_up--; + top_ce = &(TG_(current_call_stack).entry[csp_up]); + if (top_ce->sp == sp) { + popcount_on_return++; + continue; + } + } + popcount_on_return = 0; + break; + } + } + if (popcount_on_return == 0) { + jmpkind = jk_Jump; + ret_without_call = True; + } + } + + /* Should this jump be converted to call or pop/call ? */ + if ((jmpkind != jk_Return) && (jmpkind != jk_Call) && last_bb) { + + /* We simulate a JMP/Cont to be a CALL if + * - jump is in another ELF object or section kind + * - jump is to first instruction of a function (tail recursion) + */ + if (ret_without_call || + /* This is for detection of optimized tail recursion. + * On PPC, this is only detected as call when going to another + * function. The problem is that on PPC it can go wrong + * more easily (no stack frame setup needed) + */ +#if defined(VGA_ppc32) + (bb->is_entry && (last_bb->fn != bb->fn)) || +#else + bb->is_entry || +#endif + (last_bb->sect_kind != bb->sect_kind) || + (last_bb->obj->number != bb->obj->number)) { + + TG_DEBUG(1, " JMP: %s[%s] to %s[%s]%s!\n", last_bb->fn->name, + last_bb->obj->name, bb->fn->name, bb->obj->name, + ret_without_call ? 
" (RET w/o CALL)" : ""); + + if (TG_(get_fn_node)(last_bb)->pop_on_jump && (csp > 0)) { + + call_entry* top_ce = &(TG_(current_call_stack).entry[csp - 1]); + + if (top_ce->jcc) { + + TG_DEBUG(1, " Pop on Jump!\n"); + + /* change source for delayed push */ + TG_(current_state).bbcc = top_ce->jcc->from; + sp = top_ce->sp; + passed = top_ce->jcc->jmp; + TG_(pop_call_stack)(); + } else { + TG_ASSERT(TG_(current_state).nonskipped != 0); + } + } + + jmpkind = jk_Call; + call_emulation = True; + } + } + + if (jmpkind == jk_Call) { + fn_node* node = TG_(get_fn_node)(bb); + skip = node->skip; + } + + TG_DEBUGIF(1) + { + if (isConditionalJump) + VG_(printf)("Cond-"); + switch (jmpkind) { + case jk_None: + VG_(printf)("Fall-through"); + break; + case jk_Jump: + VG_(printf)("Jump"); + break; + case jk_Call: + VG_(printf)("Call"); + break; + case jk_Return: + VG_(printf)("Return"); + break; + default: + tl_assert(0); + } + VG_(printf)(" %08lx -> %08lx, SP %08lx\n", + last_bb ? bb_jmpaddr(last_bb) : 0, bb_addr(bb), sp); + } + + /* Handle CALL/RET and update context to get correct BBCC */ + + if (jmpkind == jk_Return) { + + if ((csp == 0) || + ((TG_(current_fn_stack).top > TG_(current_fn_stack).bottom) && + (*(TG_(current_fn_stack).top - 1) == 0))) { + + /* On an empty call stack or at a signal separation marker, + * a RETURN generates an call stack underflow. + */ + handleUnderflow(bb); + TG_(pop_call_stack)(); + } else { + TG_ASSERT(popcount_on_return > 0); + TG_(unwind_call_stack)(sp, popcount_on_return); + } + } else { + Int unwind_count = TG_(unwind_call_stack)(sp, 0); + if (unwind_count > 0) { + /* if unwinding was done, this actually is a return */ + jmpkind = jk_Return; + } + + if (jmpkind == jk_Call) { + delayed_push = True; + + csp = TG_(current_call_stack).sp; + if (call_emulation && csp > 0) + sp = TG_(current_call_stack).entry[csp - 1].sp; + } + } + + /* Change new context if needed, taking delayed_push into account */ + if ((delayed_push && !skip) || (TG_(current_state).cxt == 0)) { + TG_(push_cxt)(TG_(get_fn_node)(bb)); + } + TG_ASSERT(TG_(current_fn_stack).top > TG_(current_fn_stack).bottom); + + /* If there is a fresh instrumented BBCC, assign current context */ + bbcc = TG_(get_bbcc)(bb); + if (bbcc->cxt == 0) { + TG_ASSERT(bbcc->rec_array == 0); + + bbcc->cxt = TG_(current_state).cxt; + bbcc->rec_array = + new_recursion((*TG_(current_fn_stack).top)->separate_recursions); + bbcc->rec_array[0] = bbcc; + + insert_bbcc_into_hash(bbcc); + } else { + /* get BBCC with current context */ + + /* first check LRU of last bbcc executed */ + + if (last_bbcc) { + bbcc = last_bbcc->lru_next_bbcc; + if (bbcc && + ((bbcc->bb != bb) || (bbcc->cxt != TG_(current_state).cxt))) + bbcc = 0; + } else + bbcc = 0; + + if (!bbcc) + bbcc = lookup_bbcc(bb, TG_(current_state).cxt); + if (!bbcc) + bbcc = clone_bbcc(bb->bbcc_list, TG_(current_state).cxt, 0); + + bb->last_bbcc = bbcc; + } + + /* save for fast lookup */ + if (last_bbcc) + last_bbcc->lru_next_bbcc = bbcc; + + if ((*TG_(current_fn_stack).top)->separate_recursions > 1) { + UInt level, idx; + fn_node* top = *(TG_(current_fn_stack).top); + + level = *TG_(get_fn_entry)(top->number); + + if (delayed_push && !skip) { + level++; + } + if (level > top->separate_recursions) + level = top->separate_recursions; + + if (level == 0) { + /* can only happen if instrumentation just was switched on */ + level = 1; + *TG_(get_fn_entry)(top->number) = 1; + } + + idx = level - 1; + if (bbcc->rec_array[idx]) + bbcc = bbcc->rec_array[idx]; + else + bbcc = clone_bbcc(bbcc, 
TG_(current_state).cxt, idx); + + TG_ASSERT(bbcc->rec_array[bbcc->rec_index] == bbcc); + } + + if (delayed_push) { + if (!skip && TG_(current_state).nonskipped) { + /* a call from skipped to nonskipped */ + TG_(current_state).bbcc = TG_(current_state).nonskipped; + /* FIXME: take the real passed count from shadow stack */ + passed = TG_(current_state).bbcc->bb->cjmp_count; + } + TG_(push_call_stack)(TG_(current_state).bbcc, passed, bbcc, sp, skip); + } + + if (TG_(clo).collect_jumps && (jmpkind == jk_Jump)) { + + /* Handle conditional jumps followed, i.e. trace arcs + * This uses JCC structures, too */ + + jCC* jcc = TG_(get_jcc)(last_bbcc, passed, bbcc); + TG_ASSERT(jcc != 0); + // Change from default, and check if already changed + if (jcc->jmpkind == jk_Call) + jcc->jmpkind = isConditionalJump ? jk_CondJump : jk_Jump; + else { + // FIXME: Why can this fail? + // TG_ASSERT(jcc->jmpkind == jmpkind); + } + + jcc->call_counter++; + if (isConditionalJump) + TG_(stat).jcnd_counter++; + else + TG_(stat).jump_counter++; + } + + TG_(current_state).bbcc = bbcc; + + /* Check for inline function transitions */ + if (TG_(current_state).collect) { + thread_info* ti = TG_(get_current_thread)(); + if (ti) { + UInt old_depth = ti->cur_inl_depth; + UInt new_depth = bb->inl_depth; + + /* Fast path: both empty (most BBs) */ + if (old_depth != 0 || new_depth != 0) { + /* Find longest common prefix */ + UInt common = 0; + UInt min_depth = old_depth < new_depth ? old_depth : new_depth; + while (common < min_depth && + ti->cur_inl_fns[common] == bb->inl_fns[common]) + common++; + + /* EXIT from deepest down to common level */ + for (Int i = (Int)old_depth - 1; i >= (Int)common; i--) + TG_(trace_emit_exit_inlined) + (TG_(current_tid), bb, ti->cur_inl_fns[i]); + + /* ENTER from common level up to new deepest */ + for (UInt i = common; i < new_depth; i++) + TG_(trace_emit_enter_inlined) + (TG_(current_tid), bb, bb->inl_fns[i]); + + /* Update thread state */ + for (UInt i = 0; i < new_depth; i++) + ti->cur_inl_fns[i] = bb->inl_fns[i]; + ti->cur_inl_depth = new_depth; + } + } + } + + /* Even though this will be set in instrumented code directly before + * side exits, it needs to be set to 0 here in case an exception + * happens in first instructions of the BB */ + TG_(current_state).jmps_passed = 0; + // needed for log_* handlers called in this BB + TG_(bb_base) = bb->obj->offset + bb->offset; + TG_(cost_base) = bbcc->cost; + + TG_DEBUGIF(1) + { + VG_(printf)(" "); + TG_(print_bbcc_fn)(bbcc); + VG_(printf)("\n"); + } + + TG_DEBUG(3, "- setup_bbcc (BB %#lx): Cost %p (Len %u), Instrs %u (Len %u)\n", + bb_addr(bb), bbcc->cost, bb->cost_count, bb->instr_count, + bb->instr_len); + TG_DEBUGIF(3) + TG_(print_cxt)(-8, TG_(current_state).cxt, bbcc->rec_index); + TG_DEBUG(3, "\n"); + + TG_(stat).bb_executions++; +} diff --git a/tracegrind/callstack.c b/tracegrind/callstack.c new file mode 100644 index 000000000..1cf056a3f --- /dev/null +++ b/tracegrind/callstack.c @@ -0,0 +1,420 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- ct_callstack.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. 
+ + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "global.h" + +/*------------------------------------------------------------*/ +/*--- Call stack, operations ---*/ +/*------------------------------------------------------------*/ + +/* Stack of current thread. Gets initialized when switching to 1st thread. + * + * The artificial call stack is an array of call_entry's, representing + * stack frames of the executing program. + * Array call_stack and call_stack_esp have same size and grow on demand. + * Array call_stack_esp holds SPs of corresponding stack frames. + * + */ + +#define N_CALL_STACK_INITIAL_ENTRIES 500 + +call_stack TG_(current_call_stack); + +void TG_(init_call_stack)(call_stack* s) +{ + Int i; + + TG_ASSERT(s != 0); + + s->size = N_CALL_STACK_INITIAL_ENTRIES; + s->entry = (call_entry*)TG_MALLOC("cl.callstack.ics.1", + s->size * sizeof(call_entry)); + s->sp = 0; + s->entry[0].cxt = 0; /* for assertion in push_cxt() */ + + for (i = 0; i < s->size; i++) + s->entry[i].enter_cost = 0; +} + +call_entry* TG_(get_call_entry)(Int sp) +{ + TG_ASSERT(sp <= TG_(current_call_stack).sp); + return &(TG_(current_call_stack).entry[sp]); +} + +void TG_(copy_current_call_stack)(call_stack* dst) +{ + TG_ASSERT(dst != 0); + + dst->size = TG_(current_call_stack).size; + dst->entry = TG_(current_call_stack).entry; + dst->sp = TG_(current_call_stack).sp; +} + +void TG_(set_current_call_stack)(call_stack* s) +{ + TG_ASSERT(s != 0); + + TG_(current_call_stack).size = s->size; + TG_(current_call_stack).entry = s->entry; + TG_(current_call_stack).sp = s->sp; +} + +static __inline__ void ensure_stack_size(Int i) +{ + Int oldsize; + call_stack* cs = &TG_(current_call_stack); + + if (i < cs->size) + return; + + oldsize = cs->size; + cs->size *= 2; + while (i > cs->size) + cs->size *= 2; + + cs->entry = (call_entry*)VG_(realloc)("cl.callstack.ess.1", cs->entry, + cs->size * sizeof(call_entry)); + + for (i = oldsize; i < cs->size; i++) + cs->entry[i].enter_cost = 0; + + TG_(stat).call_stack_resizes++; + + TG_DEBUGIF(2) + VG_(printf)(" call stack enlarged to %u entries\n", + TG_(current_call_stack).size); +} + +/* Called when function entered nonrecursive */ +static void function_entered(fn_node* fn) +{ + TG_ASSERT(fn != 0); + +#if TG_ENABLE_DEBUG + if (fn->verbosity >= 0) { + Int old = TG_(clo).verbose; + TG_(clo).verbose = fn->verbosity; + fn->verbosity = old; + VG_(message)(Vg_DebugMsg, "Entering %s: Verbosity set to %d\n", fn->name, + TG_(clo).verbose); + } +#endif + + if (fn->toggle_collect) { + TG_(current_state).collect = !TG_(current_state).collect; + TG_DEBUG(2, " entering %s: toggled collection state to %s\n", fn->name, + TG_(current_state).collect ? 
"ON" : "OFF"); + } +} + +/* Called when function left (no recursive level active) */ +static void function_left(fn_node* fn) +{ + TG_ASSERT(fn != 0); + + if (fn->toggle_collect) { + TG_(current_state).collect = !TG_(current_state).collect; + TG_DEBUG(2, " leaving %s: toggled collection state to %s\n", fn->name, + TG_(current_state).collect ? "ON" : "OFF"); + } + +#if TG_ENABLE_DEBUG + if (fn->verbosity >= 0) { + Int old = TG_(clo).verbose; + TG_(clo).verbose = fn->verbosity; + fn->verbosity = old; + VG_(message)(Vg_DebugMsg, "Leaving %s: Verbosity set back to %d\n", + fn->name, TG_(clo).verbose); + } +#endif +} + +/* Push call on call stack. + * + * Increment the usage count for the function called. + * A jump from to , with . + * If is true, this is a call to a function to be skipped; + * for this, we set jcc = 0. + */ +void TG_(push_call_stack)(BBCC* from, UInt jmp, BBCC* to, Addr sp, Bool skip) +{ + jCC* jcc; + UInt* pdepth; + call_entry* current_entry; + Addr ret_addr; + + /* Ensure a call stack of size +1. + * The +1 is needed as push_cxt will store the + * context at [current_sp] + */ + ensure_stack_size(TG_(current_call_stack).sp + 1); + current_entry = &(TG_(current_call_stack).entry[TG_(current_call_stack).sp]); + + if (skip) { + jcc = 0; + } else { + fn_node* to_fn = to->cxt->fn[0]; + + if (TG_(current_state).nonskipped) { + /* this is a jmp from skipped to nonskipped */ + TG_ASSERT(TG_(current_state).nonskipped == from); + } + + /* As push_cxt() has to be called before push_call_stack if not + * skipping, the old context should already be saved on the stack */ + TG_ASSERT(current_entry->cxt != 0); + TG_(copy_cost_lz) + (TG_(sets).full, &(current_entry->enter_cost), TG_(current_state).cost); + + jcc = TG_(get_jcc)(from, jmp, to); + TG_ASSERT(jcc != 0); + + pdepth = TG_(get_fn_entry)(to_fn->number); + (*pdepth)++; + + if (*pdepth > 1) + TG_(stat).rec_call_counter++; + + jcc->call_counter++; + TG_(stat).call_counter++; + + if (*pdepth == 1) + function_entered(to_fn); + } + + /* return address is only is useful with a real call; + * used to detect RET w/o CALL */ + if (from->bb->jmp[jmp].jmpkind == jk_Call) { + UInt instr = from->bb->jmp[jmp].instr; + ret_addr = bb_addr(from->bb) + from->bb->instr[instr].instr_offset + + from->bb->instr[instr].instr_size; + } else + ret_addr = 0; + + /* put jcc on call stack */ + current_entry->jcc = jcc; + current_entry->sp = sp; + current_entry->ret_addr = ret_addr; + current_entry->nonskipped = TG_(current_state).nonskipped; + + TG_(current_call_stack).sp++; + + /* Emit trace sample on function entry */ + if (!skip && TG_(current_state).collect) { + /* Exit entire inline stack, deepest first */ + thread_info* ti = TG_(get_current_thread)(); + if (ti && ti->cur_inl_depth > 0 && TG_(current_state).bbcc) { + for (Int i = (Int)ti->cur_inl_depth - 1; i >= 0; i--) + TG_(trace_emit_exit_inlined) + (TG_(current_tid), TG_(current_state).bbcc->bb, ti->cur_inl_fns[i]); + ti->cur_inl_depth = 0; + } + fn_node* to_fn = to->cxt->fn[0]; + TG_(trace_emit_sample)(TG_(current_tid), True, to_fn); + } + + /* To allow for above assertion we set context of next frame to 0 */ + TG_ASSERT(TG_(current_call_stack).sp < TG_(current_call_stack).size); + current_entry++; + current_entry->cxt = 0; + + if (!skip) + TG_(current_state).nonskipped = 0; + else if (!TG_(current_state).nonskipped) { + /* a call from nonskipped to skipped */ + TG_(current_state).nonskipped = from; + if (!TG_(current_state).nonskipped->skipped) { + TG_(init_cost_lz) + (TG_(sets).full, 
&TG_(current_state).nonskipped->skipped); + TG_(stat).distinct_skips++; + } + } + +#if TG_ENABLE_DEBUG + TG_DEBUGIF(0) + { + if (TG_(clo).verbose < 2) { + if (jcc && jcc->to && jcc->to->bb) { + const HChar spaces[][41] = { + " . . . . . . . . . .", + " . . . . . . . . . . ", + " . . . . . . . . . . ", + ". . . . . . . . . . "}; + + int s = TG_(current_call_stack).sp; + UInt* pars = (UInt*)sp; + + BB* bb = jcc->to->bb; + if (s > 40) + s = 40; + VG_(printf)( + "%s> %s(0x%x, 0x%x, ...) [%s / %#lx]\n", spaces[s % 4] + 40 - s, + bb->fn->name, pars ? pars[1] : 0, pars ? pars[2] : 0, + bb->obj->name + bb->obj->last_slash_pos, (UWord)bb->offset); + } + } else if (TG_(clo).verbose < 4) { + VG_(printf)("+ %2d ", TG_(current_call_stack).sp); + TG_(print_short_jcc)(jcc); + VG_(printf)(", SP %#lx, RA %#lx\n", sp, ret_addr); + } else { + VG_(printf)(" Pushed "); + TG_(print_stackentry)(3, TG_(current_call_stack).sp - 1); + } + } +#endif +} + +/* Pop call stack and update inclusive sums. + * Returns modified fcc. + * + * If the JCC becomes inactive, call entries are freed if possible + */ +void TG_(pop_call_stack)(void) +{ + jCC* jcc; + Int depth = 0; + call_entry* lower_entry; + + if (TG_(current_state).sig > 0) { + /* Check if we leave a signal handler; this can happen when + * calling longjmp() in the handler */ + TG_(run_post_signal_on_call_stack_bottom)(); + } + + lower_entry = + &(TG_(current_call_stack).entry[TG_(current_call_stack).sp - 1]); + + TG_DEBUG(4, "+ pop_call_stack: frame %d, jcc %p\n", + TG_(current_call_stack).sp, lower_entry->jcc); + + /* jCC item not any more on real stack: pop */ + jcc = lower_entry->jcc; + TG_(current_state).nonskipped = lower_entry->nonskipped; + + if (jcc) { + fn_node* to_fn = jcc->to->cxt->fn[0]; + UInt* pdepth = TG_(get_fn_entry)(to_fn->number); + (*pdepth)--; + depth = *pdepth; + + /* add cost difference to sum */ + if (TG_(add_diff_cost_lz)(TG_(sets).full, &(jcc->cost), + lower_entry->enter_cost, + TG_(current_state).cost)) { + } + TG_(stat).ret_counter++; + + /* Emit trace sample on function exit */ + if (TG_(current_state).collect) { + /* Exit entire inline stack, deepest first */ + thread_info* ti = TG_(get_current_thread)(); + if (ti && ti->cur_inl_depth > 0 && TG_(current_state).bbcc) { + for (Int i = (Int)ti->cur_inl_depth - 1; i >= 0; i--) + TG_(trace_emit_exit_inlined) + (TG_(current_tid), TG_(current_state).bbcc->bb, ti->cur_inl_fns[i]); + ti->cur_inl_depth = 0; + } + TG_(trace_emit_sample)(TG_(current_tid), False, to_fn); + } + + /* restore context */ + TG_(current_state).cxt = lower_entry->cxt; + TG_(current_fn_stack).top = + TG_(current_fn_stack).bottom + lower_entry->fn_sp; + TG_ASSERT(TG_(current_state).cxt != 0); + + if (depth == 0) + function_left(to_fn); + } + + /* To allow for an assertion in push_call_stack() */ + lower_entry->cxt = 0; + + TG_(current_call_stack).sp--; + +#if TG_ENABLE_DEBUG + TG_DEBUGIF(1) + { + if (TG_(clo).verbose < 4) { + if (jcc) { + /* popped JCC target first */ + VG_(printf)("- %2d %#lx => ", TG_(current_call_stack).sp, + bb_addr(jcc->to->bb)); + TG_(print_addr)(bb_jmpaddr(jcc->from->bb)); + VG_(printf)( + ", SP %#lx\n", + TG_(current_call_stack).entry[TG_(current_call_stack).sp].sp); + TG_(print_cost)(10, TG_(sets).full, jcc->cost); + } else + VG_(printf)( + "- %2d [Skipped JCC], SP %#lx\n", TG_(current_call_stack).sp, + TG_(current_call_stack).entry[TG_(current_call_stack).sp].sp); + } else { + VG_(printf)(" Popped "); + TG_(print_stackentry)(7, TG_(current_call_stack).sp); + if (jcc) { + VG_(printf)(" returned to 
"); + TG_(print_addr_ln)(bb_jmpaddr(jcc->from->bb)); + } + } + } +#endif +} + +/* Unwind enough CallStack items to sync with current stack pointer. + * Returns the number of stack frames unwinded. + */ +Int TG_(unwind_call_stack)(Addr sp, Int minpops) +{ + Int csp; + Int unwind_count = 0; + TG_DEBUG(4, "+ unwind_call_stack(sp %#lx, minpops %d): frame %d\n", sp, + minpops, TG_(current_call_stack).sp); + + /* We pop old stack frames. + * For a call, be p the stack address with return address. + * - call_stack_esp[] has SP after the CALL: p-4 + * - current sp is after a RET: >= p + */ + + while ((csp = TG_(current_call_stack).sp) > 0) { + call_entry* top_ce = &(TG_(current_call_stack).entry[csp - 1]); + + if ((top_ce->sp < sp) || ((top_ce->sp == sp) && minpops > 0)) { + + minpops--; + unwind_count++; + TG_(pop_call_stack)(); + csp = TG_(current_call_stack).sp; + continue; + } + break; + } + + TG_DEBUG(4, "- unwind_call_stack\n"); + return unwind_count; +} diff --git a/tracegrind/clo.c b/tracegrind/clo.c new file mode 100644 index 000000000..5bfa108fa --- /dev/null +++ b/tracegrind/clo.c @@ -0,0 +1,613 @@ +/* + This file is part of Tracegrind, a Valgrind tool for call graph + profiling programs. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This tool is derived from and contains lot of code from Cachegrind + Copyright (C) 2002-2017 Nicholas Nethercote (njn@valgrind.org) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "config.h" // for VG_PREFIX + +#include "global.h" + +/*------------------------------------------------------------*/ +/*--- Function specific configuration options ---*/ +/*------------------------------------------------------------*/ + +/* Special value for separate_callers: automatic = adaptive */ +#define CONFIG_AUTO -1 + +#define CONFIG_DEFAULT -1 +#define CONFIG_FALSE 0 +#define CONFIG_TRUE 1 + +/* Logging configuration for a function */ +struct _fn_config { + Int toggle_collect; + + Int group; /* don't change caller dependency inside group !=0 */ + Int pop_on_jump; + + Int separate_callers; /* separate logging dependent on caller */ + Int separate_recursions; /* separate logging of rec. levels */ + +#if TG_ENABLE_DEBUG + Int verbosity; /* Change debug verbosity level while in function */ +#endif +}; + +/* Configurations for function name prefix patterns. + * Currently, only very limit patterns are possible: + * Exact prefix patterns and "*::" are allowed. + * E.g. + * - "abc" matches all functions starting with "abc". + * - "abc*::def" matches all functions starting with "abc" and + * starting with "def" after the first "::" separator. + * - "*::print(" matches C++ methods "print" in all classes + * without namespace. I.e. "*" doesn't match a "::". + * + * We build a trie from patterns, and for a given function, we + * go down the tree and apply all non-default configurations. 
+ */ + +#define NODE_DEGREE 30 + +/* node of compressed trie search structure */ +typedef struct _config_node config_node; +struct _config_node { + Int length; + + fn_config* config; + config_node* sub_node[NODE_DEGREE]; + config_node* next; + config_node* wild_star; + config_node* wild_char; + + HChar name[1]; +}; + +/* root of trie */ +static config_node* fn_configs = 0; + +static __inline__ fn_config* new_fnc(void) +{ + fn_config* fnc = (fn_config*)TG_MALLOC("cl.clo.nf.1", sizeof(fn_config)); + + fnc->toggle_collect = CONFIG_DEFAULT; + fnc->pop_on_jump = CONFIG_DEFAULT; + fnc->group = CONFIG_DEFAULT; + fnc->separate_callers = CONFIG_DEFAULT; + fnc->separate_recursions = CONFIG_DEFAULT; + +#if TG_ENABLE_DEBUG + fnc->verbosity = CONFIG_DEFAULT; +#endif + + return fnc; +} + +static config_node* new_config(const HChar* name, int length) +{ + int i; + config_node* node = + (config_node*)TG_MALLOC("cl.clo.nc.1", sizeof(config_node) + length); + + for (i = 0; i < length; i++) { + if (name[i] == 0) + break; + node->name[i] = name[i]; + } + node->name[i] = 0; + + node->length = length; + node->config = 0; + for (i = 0; i < NODE_DEGREE; i++) + node->sub_node[i] = 0; + node->next = 0; + node->wild_char = 0; + node->wild_star = 0; + + TG_DEBUG(3, " new_config('%s', len %d)\n", node->name, length); + + return node; +} + +static __inline__ Bool is_wild(HChar n) { return (n == '*') || (n == '?'); } + +/* Recursively build up function matching tree (prefix tree). + * Returns function config object for pattern + * and starting at tree node <*pnode>. + * + * Tree nodes (config_node) are created as needed, + * tree root is stored into <*pnode>, and the created + * leaf (fn_config) for the given pattern is returned. + */ +static fn_config* get_fnc2(config_node* node, const HChar* name) +{ + config_node *new_sub, *n, *nprev; + int offset, len; + + TG_DEBUG(3, " get_fnc2(%p, '%s')\n", node, name); + + if (name[0] == 0) { + if (!node->config) + node->config = new_fnc(); + return node->config; + } + + if (is_wild(*name)) { + if (*name == '*') { + while (name[1] == '*') + name++; + new_sub = node->wild_star; + } else + new_sub = node->wild_char; + + if (!new_sub) { + new_sub = new_config(name, 1); + if (*name == '*') + node->wild_star = new_sub; + else + node->wild_char = new_sub; + } + + return get_fnc2(new_sub, name + 1); + } + + n = node->sub_node[name[0] % NODE_DEGREE]; + nprev = 0; + len = 0; + while (n) { + for (len = 0; name[len] == n->name[len]; len++) + ; + if (len > 0) + break; + nprev = n; + n = n->next; + } + + if (!n) { + len = 1; + while (name[len] && (!is_wild(name[len]))) + len++; + new_sub = new_config(name, len); + new_sub->next = node->sub_node[name[0] % NODE_DEGREE]; + node->sub_node[name[0] % NODE_DEGREE] = new_sub; + + if (name[len] == 0) { + new_sub->config = new_fnc(); + return new_sub->config; + } + + /* recurse on wildcard */ + return get_fnc2(new_sub, name + len); + } + + if (len < n->length) { + + /* split up the subnode */ + config_node* new_node; + int i; + + new_node = new_config(n->name, len); + if (nprev) + nprev->next = new_node; + else + node->sub_node[n->name[0] % NODE_DEGREE] = new_node; + new_node->next = n->next; + + new_node->sub_node[n->name[len] % NODE_DEGREE] = n; + + for (i = 0, offset = len; offset < n->length; i++, offset++) + n->name[i] = n->name[offset]; + n->name[i] = 0; + n->length = i; + + name += len; + offset = 0; + while (name[offset] && (!is_wild(name[offset]))) + offset++; + new_sub = new_config(name, offset); + /* this sub_node of new_node could already 
be set: chain! */ + new_sub->next = new_node->sub_node[name[0] % NODE_DEGREE]; + new_node->sub_node[name[0] % NODE_DEGREE] = new_sub; + + if (name[offset] == 0) { + new_sub->config = new_fnc(); + return new_sub->config; + } + + /* recurse on wildcard */ + return get_fnc2(new_sub, name + offset); + } + + name += n->length; + + if (name[0] == 0) { + /* name and node name are the same */ + if (!n->config) + n->config = new_fnc(); + return n->config; + } + + offset = 1; + while (name[offset] && (!is_wild(name[offset]))) + offset++; + + new_sub = new_config(name, offset); + new_sub->next = n->sub_node[name[0] % NODE_DEGREE]; + n->sub_node[name[0] % NODE_DEGREE] = new_sub; + + return get_fnc2(new_sub, name + offset); +} + +static void print_config_node(int depth, int hash, config_node* node) +{ + config_node* n; + int i; + + if (node != fn_configs) { + const HChar sp[] = " "; + + if (depth > 40) + depth = 40; + VG_(printf)("%s", sp + 40 - depth); + if (hash >= 0) + VG_(printf)(" [hash %2d]", hash); + else if (hash == -2) + VG_(printf)(" [wildc ?]"); + else if (hash == -3) + VG_(printf)(" [wildc *]"); + VG_(printf)(" '%s' (len %d)\n", node->name, node->length); + } + for (i = 0; i < NODE_DEGREE; i++) { + n = node->sub_node[i]; + while (n) { + print_config_node(depth + 1, i, n); + n = n->next; + } + } + if (node->wild_char) + print_config_node(depth + 1, -2, node->wild_char); + if (node->wild_star) + print_config_node(depth + 1, -3, node->wild_star); +} + +/* get a function config for a name pattern (from command line) */ +static fn_config* get_fnc(const HChar* name) +{ + fn_config* fnc; + + TG_DEBUG(3, " +get_fnc(%s)\n", name); + if (fn_configs == 0) + fn_configs = new_config(name, 0); + fnc = get_fnc2(fn_configs, name); + + TG_DEBUGIF(3) + { + TG_DEBUG(3, " -get_fnc(%s):\n", name); + print_config_node(3, -1, fn_configs); + } + return fnc; +} + +static void update_fn_config1(fn_node* fn, fn_config* fnc) +{ + if (fnc->toggle_collect != CONFIG_DEFAULT) + fn->toggle_collect = (fnc->toggle_collect == CONFIG_TRUE); + + if (fnc->pop_on_jump != CONFIG_DEFAULT) + fn->pop_on_jump = (fnc->pop_on_jump == CONFIG_TRUE); + + if (fnc->group != CONFIG_DEFAULT) + fn->group = fnc->group; + + if (fnc->separate_callers != CONFIG_DEFAULT) + fn->separate_callers = fnc->separate_callers; + + if (fnc->separate_recursions != CONFIG_DEFAULT) + fn->separate_recursions = fnc->separate_recursions; + +#if TG_ENABLE_DEBUG + if (fnc->verbosity != CONFIG_DEFAULT) + fn->verbosity = fnc->verbosity; +#endif +} + +/* Recursively go down the function matching tree, + * looking for a match to . For every matching leaf, + * is updated with the pattern config. 
+ */ +static void update_fn_config2(fn_node* fn, const HChar* name, config_node* node) +{ + config_node* n; + + TG_DEBUG(3, " update_fn_config2('%s', node '%s'): \n", name, node->name); + if ((*name == 0) && node->config) { + TG_DEBUG(3, " found!\n"); + update_fn_config1(fn, node->config); + return; + } + + n = node->sub_node[name[0] % NODE_DEGREE]; + while (n) { + if (VG_(strncmp)(name, n->name, n->length) == 0) + break; + n = n->next; + } + if (n) { + TG_DEBUG(3, " '%s' matching at hash %d\n", n->name, + name[0] % NODE_DEGREE); + update_fn_config2(fn, name + n->length, n); + } + + if (node->wild_char) { + TG_DEBUG(3, " skip '%c' for wildcard '?'\n", *name); + update_fn_config2(fn, name + 1, node->wild_char); + } + + if (node->wild_star) { + TG_DEBUG(3, " wildcard '*'\n"); + while (*name) { + update_fn_config2(fn, name, node->wild_star); + name++; + } + update_fn_config2(fn, name, node->wild_star); + } +} + +/* Update function config according to configs of name prefixes */ +void TG_(update_fn_config)(fn_node* fn) +{ + TG_DEBUG(3, " update_fn_config('%s')\n", fn->name); + if (fn_configs) + update_fn_config2(fn, fn->name, fn_configs); +} + +/*--------------------------------------------------------------------*/ +/*--- Command line processing ---*/ +/*--------------------------------------------------------------------*/ + +Bool TG_(process_cmd_line_option)(const HChar* arg) +{ + const HChar* tmp_str; + + if VG_BOOL_CLO (arg, "--skip-plt", TG_(clo).skip_plt) { + } + + else if VG_BOOL_CLO (arg, "--collect-jumps", TG_(clo).collect_jumps) { + } + /* compatibility alias, deprecated option */ + else if VG_BOOL_CLO (arg, "--trace-jump", TG_(clo).collect_jumps) { + } + + else if VG_BOOL_CLO (arg, "--collect-atstart", TG_(clo).collect_atstart) { + } + + else if VG_BOOL_CLO (arg, "--instr-atstart", TG_(clo).instrument_atstart) { + } + + else if VG_BOOL_CLO (arg, "--separate-threads", TG_(clo).separate_threads) { + } + + else if VG_STR_CLO (arg, "--toggle-collect", tmp_str) { + fn_config* fnc = get_fnc(tmp_str); + fnc->toggle_collect = CONFIG_TRUE; + /* defaults to initial collection off */ + TG_(clo).collect_atstart = False; + } + + else if VG_INT_CLO (arg, "--separate-recs", TG_(clo).separate_recursions) { + } + + /* change handling of a jump between functions to ret+call */ + else if VG_XACT_CLO (arg, "--pop-on-jump", TG_(clo).pop_on_jump, True) { + } else if VG_STR_CLO (arg, "--pop-on-jump", tmp_str) { + fn_config* fnc = get_fnc(tmp_str); + fnc->pop_on_jump = CONFIG_TRUE; + } + +#if TG_ENABLE_DEBUG + else if VG_INT_CLO (arg, "--ct-verbose", TG_(clo).verbose) { + } else if VG_INT_CLO (arg, "--ct-vstart", TG_(clo).verbose_start) { + } + + else if VG_STREQN (12, arg, "--ct-verbose") { + fn_config* fnc; + HChar* s; + UInt n = VG_(strtoll10)(arg + 12, &s); + if ((n <= 0) || *s != '=') + return False; + fnc = get_fnc(s + 1); + fnc->verbosity = n; + } +#endif + + else if VG_XACT_CLO (arg, "--separate-callers=auto", + TG_(clo).separate_callers, CONFIG_AUTO) { + } else if VG_INT_CLO (arg, "--separate-callers", TG_(clo).separate_callers) { + } + + else if VG_STREQN (10, arg, "--fn-group") { + fn_config* fnc; + HChar* s; + UInt n = VG_(strtoll10)(arg + 10, &s); + if ((n <= 0) || *s != '=') + return False; + fnc = get_fnc(s + 1); + fnc->group = n; + } + + else if VG_STREQN (18, arg, "--separate-callers") { + fn_config* fnc; + HChar* s; + UInt n = VG_(strtoll10)(arg + 18, &s); + if ((n <= 0) || *s != '=') + return False; + fnc = get_fnc(s + 1); + fnc->separate_callers = n; + } + + else if VG_STREQN (15, arg, 
"--separate-recs") { + fn_config* fnc; + HChar* s; + UInt n = VG_(strtoll10)(arg + 15, &s); + if ((n <= 0) || *s != '=') + return False; + fnc = get_fnc(s + 1); + fnc->separate_recursions = n; + } + + else if VG_STR_CLO (arg, "--tracegrind-out-file", TG_(clo).out_format) { + } + + else if VG_XACT_CLO (arg, "--collect-systime=no", TG_(clo).collect_systime, + systime_no) { + } else if VG_XACT_CLO (arg, "--collect-systime=msec", + TG_(clo).collect_systime, systime_msec) { + } else if VG_XACT_CLO (arg, + "--collect-systime=yes", /* backward compatibility. */ + TG_(clo).collect_systime, systime_msec) { + } else if VG_XACT_CLO (arg, "--collect-systime=usec", + TG_(clo).collect_systime, systime_usec) { + } else if VG_XACT_CLO (arg, "--collect-systime=nsec", + TG_(clo).collect_systime, systime_nsec) { +#if defined(VGO_darwin) + VG_(fmsg_bad_option) + (arg, "--collect-systime=nsec not supported on darwin\n"); +#endif + } + + else if VG_BOOL_CLO (arg, "--collect-bus", TG_(clo).collect_bus) { + } + /* for option compatibility with cachegrind */ + else if VG_BOOL_CLO (arg, "--cache-sim", TG_(clo).simulate_cache) { + } + /* compatibility alias, deprecated option */ + else if VG_BOOL_CLO (arg, "--simulate-cache", TG_(clo).simulate_cache) { + } + /* for option compatibility with cachegrind */ + else if VG_BOOL_CLO (arg, "--branch-sim", TG_(clo).simulate_branch) { + } else { + Bool isCachesimOption = (*TG_(cachesim).parse_opt)(arg); + + /* cache simulator is used if a simulator option is given */ + if (isCachesimOption) + TG_(clo).simulate_cache = True; + + return isCachesimOption; + } + + return True; +} + +void TG_(print_usage)(void) +{ + VG_(printf)( + "\n output options:\n" + " --tracegrind-out-file= Output file name " + "[tracegrind.out.%%p.msgpack.lz4]\n" + + "\n data collection options:\n" + " --instr-atstart=no|yes Do instrumentation at tracegrind start " + "[yes]\n" + " --collect-atstart=no|yes Collect at process/thread start [yes]\n" + " --toggle-collect= Toggle collection on enter/leave " + "function\n" + " --collect-jumps=no|yes Collect jumps? [no]\n" + " --collect-bus=no|yes Collect global bus events? [no]\n" + " --collect-systime=no|yes|msec|usec|nsec Collect system call time " + "info? [no]\n" + " no Do not collect system call time info.\n" + " msec|yes Collect syscount, syscall elapsed time " + "(milli-seconds).\n" + " usec Collect syscount, syscall elapsed time " + "(micro-seconds).\n" + " nsec Collect syscount, syscall elapsed and syscall cpu " + "time (nano-seconds).\n" + + "\n cost entity separation options:\n" + " --separate-threads=no|yes Separate data per thread [no]\n" + " --separate-callers= Separate functions by call chain length " + "[0]\n" + " --separate-callers= Separate callers for function \n" + " --separate-recs= Separate function recursions up to level " + "[2]\n" + " --separate-recs= Separate recursions for function \n" + " --skip-plt=no|yes Ignore calls to/from PLT sections? 
[yes]\n" +#if TG_EXPERIMENTAL + " --fn-group= Put function into separation group \n" +#endif + "\n simulation options:\n" + " --branch-sim=no|yes Do branch prediction simulation [no]\n" + " --cache-sim=no|yes Do cache simulation [no]\n"); + + (*TG_(cachesim).print_opts)(); + + // VG_(printf)("\n" + // " For full tracegrind documentation, see\n" + // " "VG_PREFIX"/share/doc/tracegrind/html/tracegrind.html\n\n"); +} + +void TG_(print_debug_usage)(void) +{ + VG_(printf)( + +#if TG_ENABLE_DEBUG + " --ct-verbose= Verbosity of standard debug output [0]\n" + " --ct-vstart= Only be verbose after basic block [0]\n" + " --ct-verbose= Verbosity while in \n" +#else + " (none)\n" +#endif + + ); +} + +void TG_(set_clo_defaults)(void) +{ + /* Default values for command line arguments */ + + /* Output */ + TG_(clo).out_format = 0; + + /* Collection */ + TG_(clo).separate_threads = False; + TG_(clo).collect_atstart = True; + TG_(clo).collect_jumps = False; + TG_(clo).collect_systime = systime_no; + TG_(clo).collect_bus = False; + + TG_(clo).skip_plt = True; + TG_(clo).separate_callers = 0; + TG_(clo).separate_recursions = 2; + /* Instrumentation */ + TG_(clo).instrument_atstart = True; + TG_(clo).simulate_cache = False; + TG_(clo).simulate_branch = False; + + /* Call graph */ + TG_(clo).pop_on_jump = False; + +#if TG_ENABLE_DEBUG + TG_(clo).verbose = 0; + TG_(clo).verbose_start = 0; +#endif +} diff --git a/tracegrind/context.c b/tracegrind/context.c new file mode 100644 index 000000000..44fc16331 --- /dev/null +++ b/tracegrind/context.c @@ -0,0 +1,335 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- ct_context.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. 
+*/ + +#include "global.h" + +/*------------------------------------------------------------*/ +/*--- Context operations ---*/ +/*------------------------------------------------------------*/ + +#define N_FNSTACK_INITIAL_ENTRIES 500 +#define N_CXT_INITIAL_ENTRIES 2537 + +fn_stack TG_(current_fn_stack); + +void TG_(init_fn_stack)(fn_stack* s) +{ + TG_ASSERT(s != 0); + + s->size = N_FNSTACK_INITIAL_ENTRIES; + s->bottom = + (fn_node**)TG_MALLOC("cl.context.ifs.1", s->size * sizeof(fn_node*)); + s->top = s->bottom; + s->bottom[0] = 0; +} + +void TG_(copy_current_fn_stack)(fn_stack* dst) +{ + TG_ASSERT(dst != 0); + + dst->size = TG_(current_fn_stack).size; + dst->bottom = TG_(current_fn_stack).bottom; + dst->top = TG_(current_fn_stack).top; +} + +void TG_(set_current_fn_stack)(fn_stack* s) +{ + TG_ASSERT(s != 0); + + TG_(current_fn_stack).size = s->size; + TG_(current_fn_stack).bottom = s->bottom; + TG_(current_fn_stack).top = s->top; +} + +static cxt_hash cxts; + +void TG_(init_cxt_table)(void) +{ + Int i; + + cxts.size = N_CXT_INITIAL_ENTRIES; + cxts.entries = 0; + cxts.table = + (Context**)TG_MALLOC("cl.context.ict.1", cxts.size * sizeof(Context*)); + + for (i = 0; i < cxts.size; i++) + cxts.table[i] = 0; +} + +/* double size of cxt table */ +static void resize_cxt_table(void) +{ + UInt i, new_size, conflicts1 = 0, conflicts2 = 0; + Context **new_table, *curr, *next; + UInt new_idx; + + new_size = 2 * cxts.size + 3; + new_table = + (Context**)TG_MALLOC("cl.context.rct.1", new_size * sizeof(Context*)); + + for (i = 0; i < new_size; i++) + new_table[i] = NULL; + + for (i = 0; i < cxts.size; i++) { + if (cxts.table[i] == NULL) + continue; + + curr = cxts.table[i]; + while (NULL != curr) { + next = curr->next; + + new_idx = (UInt)(curr->hash % new_size); + + curr->next = new_table[new_idx]; + new_table[new_idx] = curr; + if (curr->next) { + conflicts1++; + if (curr->next->next) + conflicts2++; + } + + curr = next; + } + } + + VG_(free)(cxts.table); + + TG_DEBUG(0, "Resize Context Hash: %u => %u (entries %u, conflicts %u/%u)\n", + cxts.size, new_size, cxts.entries, conflicts1, conflicts2); + + cxts.size = new_size; + cxts.table = new_table; + TG_(stat).cxt_hash_resizes++; +} + +__inline__ static UWord cxt_hash_val(fn_node** fn, UInt size) +{ + UWord hash = 0; + UInt count = size; + while (*fn != 0) { + hash = (hash << 7) + (hash >> 25) + (UWord)(*fn); + fn--; + count--; + if (count == 0) + break; + } + return hash; +} + +__inline__ static Bool is_cxt(UWord hash, fn_node** fn, Context* cxt) +{ + int count; + fn_node** cxt_fn; + + if (hash != cxt->hash) + return False; + + count = cxt->size; + cxt_fn = &(cxt->fn[0]); + while ((*fn != 0) && (count > 0)) { + if (*cxt_fn != *fn) + return False; + fn--; + cxt_fn++; + count--; + } + return True; +} + +/** + * Allocate new Context structure + */ +static Context* new_cxt(fn_node** fn) +{ + Context* cxt; + UInt idx, offset; + UWord hash; + int size, recs; + fn_node* top_fn; + + TG_ASSERT(fn); + top_fn = *fn; + if (top_fn == 0) + return 0; + + size = top_fn->separate_callers + 1; + recs = top_fn->separate_recursions; + if (recs < 1) + recs = 1; + + /* check fill degree of context hash table and resize if needed (>80%) */ + cxts.entries++; + if (10 * cxts.entries / cxts.size > 8) + resize_cxt_table(); + + cxt = (Context*)TG_MALLOC("cl.context.nc.1", + sizeof(Context) + sizeof(fn_node*) * size); + + // hash value calculation similar to cxt_hash_val(), but additionally + // copying function pointers in one run + hash = 0; + offset = 0; + while (*fn != 0) { + 
hash = (hash << 7) + (hash >> 25) + (UWord)(*fn); + cxt->fn[offset] = *fn; + offset++; + fn--; + if (offset >= size) + break; + } + if (offset < size) + size = offset; + + cxt->size = size; + cxt->base_number = TG_(stat).context_counter; + cxt->hash = hash; + + TG_(stat).context_counter += recs; + TG_(stat).distinct_contexts++; + + /* insert into Context hash table */ + idx = (UInt)(hash % cxts.size); + cxt->next = cxts.table[idx]; + cxts.table[idx] = cxt; + +#if TG_ENABLE_DEBUG + TG_DEBUGIF(3) + { + VG_(printf)(" new_cxt ox%p: ", cxt); + TG_(print_cxt)(12, cxt, 0); + } +#endif + + return cxt; +} + +/* get the Context structure for current context */ +Context* TG_(get_cxt)(fn_node** fn) +{ + Context* cxt; + UInt size, idx; + UWord hash; + + TG_ASSERT(fn != 0); + if (*fn == 0) + return 0; + size = (*fn)->separate_callers + 1; + if (size <= 0) { + size = -size + 1; + } + + TG_DEBUG(5, "+ get_cxt(fn '%s'): size %u\n", (*fn)->name, size); + + hash = cxt_hash_val(fn, size); + + if (((cxt = (*fn)->last_cxt) != 0) && is_cxt(hash, fn, cxt)) { + TG_DEBUG(5, "- get_cxt: %p\n", cxt); + return cxt; + } + + TG_(stat).cxt_lru_misses++; + + idx = (UInt)(hash % cxts.size); + cxt = cxts.table[idx]; + + while (cxt) { + if (is_cxt(hash, fn, cxt)) + break; + cxt = cxt->next; + } + + if (!cxt) + cxt = new_cxt(fn); + + (*fn)->last_cxt = cxt; + + TG_DEBUG(5, "- get_cxt: %p\n", cxt); + + return cxt; +} + +/** + * Change execution context by calling a new function from current context + * Pushing 0x0 specifies a marker for a signal handler entry + */ +void TG_(push_cxt)(fn_node* fn) +{ + call_stack* cs = &TG_(current_call_stack); + Int fn_entries; + + TG_DEBUG(5, "+ push_cxt(fn '%s'): old ctx %d\n", fn ? fn->name : "0x0", + TG_(current_state).cxt ? (Int)TG_(current_state).cxt->base_number + : -1); + + /* save old context on stack (even if not changed at all!) */ + TG_ASSERT(cs->sp < cs->size); + TG_ASSERT(cs->entry[cs->sp].cxt == 0); + cs->entry[cs->sp].cxt = TG_(current_state).cxt; + cs->entry[cs->sp].fn_sp = + TG_(current_fn_stack).top - TG_(current_fn_stack).bottom; + + if (fn && (*(TG_(current_fn_stack).top) == fn)) + return; + if (fn && (fn->group > 0) && + ((*(TG_(current_fn_stack).top))->group == fn->group)) + return; + + /* resizing needed ? */ + fn_entries = TG_(current_fn_stack).top - TG_(current_fn_stack).bottom; + if (fn_entries == TG_(current_fn_stack).size - 1) { + UInt new_size = TG_(current_fn_stack).size * 2; + fn_node** new_array = + (fn_node**)TG_MALLOC("cl.context.pc.1", new_size * sizeof(fn_node*)); + int i; + for (i = 0; i < TG_(current_fn_stack).size; i++) + new_array[i] = TG_(current_fn_stack).bottom[i]; + VG_(free)(TG_(current_fn_stack).bottom); + TG_(current_fn_stack).top = new_array + fn_entries; + TG_(current_fn_stack).bottom = new_array; + + TG_DEBUG(0, "Resize Context Stack: %u => %u (pushing '%s')\n", + TG_(current_fn_stack).size, new_size, fn ? fn->name : "0x0"); + + TG_(current_fn_stack).size = new_size; + } + + if (fn && (*(TG_(current_fn_stack).top) == 0)) { + UInt* pactive; + + /* this is first function: increment its active count */ + pactive = TG_(get_fn_entry)(fn->number); + (*pactive)++; + } + + TG_(current_fn_stack).top++; + *(TG_(current_fn_stack).top) = fn; + TG_(current_state).cxt = TG_(get_cxt)(TG_(current_fn_stack).top); + + TG_DEBUG( + 5, "- push_cxt(fn '%s'): new cxt %d, fn_sp %ld\n", fn ? fn->name : "0x0", + TG_(current_state).cxt ? 
(Int)TG_(current_state).cxt->base_number : -1, + TG_(current_fn_stack).top - TG_(current_fn_stack).bottom + 0L); +} diff --git a/tracegrind/costs.c b/tracegrind/costs.c new file mode 100644 index 000000000..bc7cd41eb --- /dev/null +++ b/tracegrind/costs.c @@ -0,0 +1,68 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- ct_costs.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "global.h" + +#include "pub_tool_mallocfree.h" + +#define COSTCHUNK_SIZE 100000 + +UInt TG_(costarray_entries) = 0; +UInt TG_(costarray_chunks) = 0; +static CostChunk* cost_chunk_base = 0; +static CostChunk* cost_chunk_current = 0; + +ULong* TG_(get_costarray)(Int size) +{ + ULong* ptr; + + if (!cost_chunk_current || + (cost_chunk_current->size - cost_chunk_current->used < size)) { + CostChunk* cc = (CostChunk*)TG_MALLOC( + "cl.costs.gc.1", sizeof(CostChunk) + COSTCHUNK_SIZE * sizeof(ULong)); + TG_ASSERT(size < COSTCHUNK_SIZE); + + cc->size = COSTCHUNK_SIZE; + cc->used = 0; + cc->next = 0; + + if (cost_chunk_current) + cost_chunk_current->next = cc; + cost_chunk_current = cc; + + if (!cost_chunk_base) + cost_chunk_base = cc; + + TG_(costarray_chunks)++; + } + + ptr = &(cost_chunk_current->data[cost_chunk_current->used]); + cost_chunk_current->used += size; + + TG_(costarray_entries) += size; + + return ptr; +} diff --git a/tracegrind/costs.h b/tracegrind/costs.h new file mode 100644 index 000000000..2e51c344d --- /dev/null +++ b/tracegrind/costs.h @@ -0,0 +1,54 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind cost array interface. costs.h ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Valgrind, a dynamic binary instrumentation + framework. + + Copyright (C) 2004-2017 Josef Weidendorfer + josef.weidendorfer@gmx.de + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. 
+*/ + +#ifndef TG_COSTS +#define TG_COSTS + +#include "pub_tool_basics.h" + +#define TG_(str) VGAPPEND(vgTracegrind_, str) + +extern UInt TG_(costarray_entries); +extern UInt TG_(costarray_chunks); + +/* Array of 64bit costs. This is separated from other structs + * to support a dynamic number of costs for a cost item. + * Chunks are allocated on demand. + */ +typedef struct _CostChunk CostChunk; +struct _CostChunk { + Int size; + Int used; + CostChunk *next, *prev; + ULong data[0]; +}; + +/* Allocate a number of 64bit cost values. + * Typically used from ct_events.c */ +ULong* TG_(get_costarray)(Int size); + +#endif /* TG_COSTS */ diff --git a/tracegrind/debug.c b/tracegrind/debug.c new file mode 100644 index 000000000..fa8f876e2 --- /dev/null +++ b/tracegrind/debug.c @@ -0,0 +1,451 @@ +/* + This file is part of Tracegrind, a Valgrind tool for call graph + profiling programs. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This tool is derived from and contains lot of code from Cachegrind + Copyright (C) 2002-2017 Nicholas Nethercote (njn@valgrind.org) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "events.h" +#include "global.h" + +/* If debugging mode of, dummy functions are provided (see below) + */ +#if TG_ENABLE_DEBUG + +/*------------------------------------------------------------*/ +/*--- Debug output helpers ---*/ +/*------------------------------------------------------------*/ + +static void print_indent(int s) +{ + /* max of 40 spaces */ + const HChar sp[] = " "; + if (s > 40) + s = 40; + VG_(printf)("%s", sp + 40 - s); +} + +void TG_(print_bb)(int s, BB* bb) +{ + if (s < 0) { + s = -s; + print_indent(s); + } + + VG_(printf)("BB %#lx (Obj '%s')", bb_addr(bb), bb->obj->name); +} + +static void print_mangled_cxt(Context* cxt, int rec_index) +{ + int i; + + if (!cxt) + VG_(printf)("(none)"); + else { + VG_(printf)("%s", cxt->fn[0]->name); + if (rec_index > 0) + VG_(printf)("'%d", rec_index + 1); + for (i = 1; i < cxt->size; i++) + VG_(printf)("'%s", cxt->fn[i]->name); + } +} + +void TG_(print_cxt)(Int s, Context* cxt, int rec_index) +{ + if (s < 0) { + s = -s; + print_indent(s); + } + + if (cxt) { + UInt* pactive = TG_(get_fn_entry)(cxt->fn[0]->number); + TG_ASSERT(rec_index < cxt->fn[0]->separate_recursions); + + VG_(printf)("Cxt %u", cxt->base_number + rec_index); + if (*pactive > 0) + VG_(printf)(" [active=%u]", *pactive); + VG_(printf)(": "); + print_mangled_cxt(cxt, rec_index); + VG_(printf)("\n"); + } else + VG_(printf)("(no context)\n"); +} + +void TG_(print_execstate)(int s, exec_state* es) +{ + if (s < 0) { + s = -s; + print_indent(s); + } + + if (!es) { + VG_(printf)("ExecState 0x0\n"); + return; + } + + VG_(printf)( + "ExecState [Sig %d, collect %s, nonskipped %p]: jmps_passed %d\n", + es->sig, es->collect ? 
"yes" : "no", es->nonskipped, es->jmps_passed); +} + +void TG_(print_bbcc)(int s, BBCC* bbcc) +{ + BB* bb; + + if (s < 0) { + s = -s; + print_indent(s); + } + + if (!bbcc) { + VG_(printf)("BBCC 0x0\n"); + return; + } + + bb = bbcc->bb; + TG_ASSERT(bb != 0); + + VG_(printf)("%s +%#lx=%#lx, ", bb->obj->name + bb->obj->last_slash_pos, + (UWord)bb->offset, bb_addr(bb)); + TG_(print_cxt)(s + 8, bbcc->cxt, bbcc->rec_index); +} + +void TG_(print_eventset)(int s, EventSet* es) +{ + int i, j; + UInt mask; + EventGroup* eg; + + if (s < 0) { + s = -s; + print_indent(s); + } + + if (!es) { + VG_(printf)("(EventSet not set)\n"); + return; + } + + VG_(printf)("EventSet %u (%d groups, size %d):", es->mask, es->count, + es->size); + + if (es->count == 0) { + VG_(printf)("-\n"); + return; + } + + for (i = 0, mask = 1; i < MAX_EVENTGROUP_COUNT; i++, mask = mask << 1) { + if ((es->mask & mask) == 0) + continue; + eg = TG_(get_event_group)(i); + if (!eg) + continue; + VG_(printf)(" (%d: %s", i, eg->name[0]); + for (j = 1; j < eg->size; j++) + VG_(printf)(" %s", eg->name[j]); + VG_(printf)(")"); + } + VG_(printf)("\n"); +} + +void TG_(print_cost)(int s, EventSet* es, ULong* c) +{ + Int i, j, pos, off; + UInt mask; + EventGroup* eg; + + if (s < 0) { + s = -s; + print_indent(s); + } + + if (!es) { + VG_(printf)("Cost (Nothing, EventSet not set)\n"); + return; + } + if (!c) { + VG_(printf)("Cost (Null, EventSet %u)\n", es->mask); + return; + } + + if (es->size == 0) { + VG_(printf)("Cost (Nothing, EventSet with len 0)\n"); + return; + } + + pos = s; + pos += VG_(printf)("Cost [%p]: ", c); + off = 0; + for (i = 0, mask = 1; i < MAX_EVENTGROUP_COUNT; i++, mask = mask << 1) { + if ((es->mask & mask) == 0) + continue; + eg = TG_(get_event_group)(i); + if (!eg) + continue; + for (j = 0; j < eg->size; j++) { + + if (off > 0) { + if (pos > 70) { + VG_(printf)(",\n"); + print_indent(s + 5); + pos = s + 5; + } else + pos += VG_(printf)(", "); + } + + pos += VG_(printf)("%s %llu", eg->name[j], c[off++]); + } + } + VG_(printf)("\n"); +} + +void TG_(print_short_jcc)(jCC* jcc) +{ + if (jcc) + VG_(printf)("%#lx => %#lx [calls %llu/Ir %llu, Dr %llu, Dw %llu]", + bb_jmpaddr(jcc->from->bb), bb_addr(jcc->to->bb), + jcc->call_counter, + jcc->cost ? jcc->cost[fullOffset(EG_IR)] : 0, + jcc->cost ? jcc->cost[fullOffset(EG_DR)] : 0, + jcc->cost ? jcc->cost[fullOffset(EG_DW)] : 0); + else + VG_(printf)("[Skipped JCC]"); +} + +void TG_(print_jcc)(int s, jCC* jcc) +{ + if (s < 0) { + s = -s; + print_indent(s); + } + + if (!jcc) { + VG_(printf)("JCC to skipped function\n"); + return; + } + VG_(printf)("JCC %p from ", jcc); + TG_(print_bbcc)(s + 9, jcc->from); + print_indent(s + 4); + VG_(printf)("to "); + TG_(print_bbcc)(s + 9, jcc->to); + print_indent(s + 4); + VG_(printf)("Calls %llu\n", jcc->call_counter); + print_indent(s + 4); + TG_(print_cost)(s + 9, TG_(sets).full, jcc->cost); +} + +/* dump out the current call stack */ +void TG_(print_stackentry)(int s, int sp) +{ + call_entry* ce; + + if (s < 0) { + s = -s; + print_indent(s); + } + + ce = TG_(get_call_entry)(sp); + VG_(printf)("[%-2d] SP %#lx, RA %#lx", sp, ce->sp, ce->ret_addr); + if (ce->nonskipped) + VG_(printf)(" NonSkipped BB %#lx / %s", bb_addr(ce->nonskipped->bb), + ce->nonskipped->cxt->fn[0]->name); + VG_(printf)("\n"); + print_indent(s + 5); + TG_(print_jcc)(5, ce->jcc); +} + +/* debug output */ +#if 0 +static void print_call_stack() +{ + int c; + + VG_(printf)("Call Stack:\n"); + for(c=0;cbb), + (bbcc->bb->sect_kind == Vg_SectText) ? 
'T' + : (bbcc->bb->sect_kind == Vg_SectData) ? 'D' + : (bbcc->bb->sect_kind == Vg_SectBSS) ? 'B' + : (bbcc->bb->sect_kind == Vg_SectGOT) ? 'G' + : (bbcc->bb->sect_kind == Vg_SectPLT) ? 'P' + : 'U', + bbcc->cxt->base_number + bbcc->rec_index); + print_mangled_cxt(bbcc->cxt, bbcc->rec_index); + + obj = bbcc->cxt->fn[0]->file->obj; + if (obj->name[0]) + VG_(printf)(" %s", obj->name + obj->last_slash_pos); + + if (VG_(strcmp)(bbcc->cxt->fn[0]->file->name, "???") != 0) { + VG_(printf)(" %s", bbcc->cxt->fn[0]->file->name); + if ((bbcc->cxt->fn[0] == bbcc->bb->fn) && (bbcc->bb->line > 0)) + VG_(printf)(":%u", bbcc->bb->line); + } +} + +void TG_(print_bbcc_cost)(int s, BBCC* bbcc) +{ + BB* bb; + Int i, cjmpNo; + ULong ecounter; + + if (s < 0) { + s = -s; + print_indent(s); + } + + if (!bbcc) { + VG_(printf)("BBCC 0x0\n"); + return; + } + + bb = bbcc->bb; + TG_ASSERT(bb != 0); + + TG_(print_bbcc)(s, bbcc); + + ecounter = bbcc->ecounter_sum; + + print_indent(s + 2); + VG_(printf)("ECounter: sum %llu ", ecounter); + for (i = 0; i < bb->cjmp_count; i++) { + VG_(printf)("[%u]=%llu ", bb->jmp[i].instr, bbcc->jmp[i].ecounter); + } + VG_(printf)("\n"); + + cjmpNo = 0; + for (i = 0; i < bb->instr_count; i++) { + InstrInfo* ii = &(bb->instr[i]); + print_indent(s + 2); + VG_(printf)("[%2d] IOff %2u ecnt %3llu ", i, ii->instr_offset, ecounter); + TG_(print_cost)(s + 5, ii->eventset, bbcc->cost + ii->cost_offset); + + /* update execution counter */ + if (cjmpNo < bb->cjmp_count) + if (bb->jmp[cjmpNo].instr == i) { + ecounter -= bbcc->jmp[cjmpNo].ecounter; + cjmpNo++; + } + } +} + +/* dump out an address with source info if available */ +void TG_(print_addr)(Addr addr) +{ + const HChar *fn_buf, *fl_buf, *dir_buf; + const HChar* obj_name; + DebugInfo* di; + UInt ln, i = 0, opos = 0; + + if (addr == 0) { + VG_(printf)("%08lx", addr); + return; + } + + TG_(get_debug_info)(addr, &dir_buf, &fl_buf, &fn_buf, &ln, &di); + + if (VG_(strcmp)(fn_buf, "???") == 0) + VG_(printf)("%#lx", addr); + else + VG_(printf)("%#lx %s", addr, fn_buf); + + if (di) { + obj_name = VG_(DebugInfo_get_filename)(di); + if (obj_name) { + while (obj_name[i]) { + if (obj_name[i] == '/') + opos = i + 1; + i++; + } + if (obj_name[0]) + VG_(printf)(" %s", obj_name + opos); + } + } + + if (ln > 0) { + if (dir_buf[0]) + VG_(printf)(" (%s/%s:%u)", dir_buf, fl_buf, ln); + else + VG_(printf)(" (%s:%u)", fl_buf, ln); + } +} + +void TG_(print_addr_ln)(Addr addr) +{ + TG_(print_addr)(addr); + VG_(printf)("\n"); +} + +static ULong bb_written = 0; + +void TG_(print_bbno)(void) +{ + if (bb_written != TG_(stat).bb_executions) { + bb_written = TG_(stat).bb_executions; + VG_(printf)("BB# %llu\n", TG_(stat).bb_executions); + } +} + +void TG_(print_context)(void) +{ + BBCC* bbcc; + + TG_DEBUG(0, "In tid %u [%d] ", TG_(current_tid), TG_(current_call_stack).sp); + bbcc = TG_(current_state).bbcc; + print_mangled_cxt(TG_(current_state).cxt, bbcc ? 
bbcc->rec_index : 0); + VG_(printf)("\n"); +} + +void* TG_(malloc)(const HChar* cc, UWord s, const HChar* f) +{ + TG_DEBUG(3, "Malloc(%lu) in %s.\n", s, f); + return VG_(malloc)(cc, s); +} + +#else /* TG_ENABLE_DEBUG */ + +void TG_(print_bbno)(void) {} +void TG_(print_context)(void) {} +void TG_(print_jcc)(int s, jCC* jcc) {} +void TG_(print_bbcc)(int s, BBCC* bbcc) {} +void TG_(print_bbcc_fn)(BBCC* bbcc) {} +void TG_(print_cost)(int s, EventSet* es, ULong* cost) {} +void TG_(print_bb)(int s, BB* bb) {} +void TG_(print_cxt)(int s, Context* cxt, int rec_index) {} +void TG_(print_short_jcc)(jCC* jcc) {} +void TG_(print_stackentry)(int s, int sp) {} +void TG_(print_addr)(Addr addr) {} +void TG_(print_addr_ln)(Addr addr) {} + +#endif diff --git a/tracegrind/docs/tracegrind-msgpack-format.md b/tracegrind/docs/tracegrind-msgpack-format.md new file mode 100644 index 000000000..f6dabeb31 --- /dev/null +++ b/tracegrind/docs/tracegrind-msgpack-format.md @@ -0,0 +1,228 @@ +# Tracegrind MsgPack+LZ4 Output Format + +## Overview + +Tracegrind produces a binary trace file combining MsgPack serialization with LZ4 block compression. The default output file name is `tracegrind.out..msgpack.lz4`. + +## File Structure + +``` +┌─────────────────────────────────┐ +│ File Header (8 bytes) │ +├─────────────────────────────────┤ +│ Schema Chunk │ +├─────────────────────────────────┤ +│ Data Chunk 1..N │ +├─────────────────────────────────┤ +│ End Marker (8 bytes) │ +└─────────────────────────────────┘ +``` + +## File Header + +| Offset | Size | Field | Description | +|--------|------|---------|-------------| +| 0 | 4 | magic | ASCII `TGMP` (0x54 0x47 0x4D 0x50) | +| 4 | 4 | version | Format version, uint32 LE (currently 4) | + +## Chunk Format + +Each chunk (schema and data) has the same header: + +| Offset | Size | Field | Description | +|--------|------|-------------------|-------------| +| 0 | 4 | uncompressed_size | Size after decompression, uint32 LE | +| 4 | 4 | compressed_size | Size of LZ4 block, uint32 LE | +| 8 | N | data | LZ4 block-compressed MsgPack data | + +## Schema Chunk + +The first chunk contains a MsgPack map describing the discriminated union schema: + +```json +{ + "version": 4, + "format": "tracegrind-msgpack", + "creator": "valgrind-tracegrind", + "creator_version": "3.26.0.codspeed", + "event_schemas": { + "0": ["seq", "tid", "event", "marker"], + "1": ["seq", "tid", "event", "fn", "obj", "file", "line", "counters"], + "2": ["seq", "tid", "event", "fn", "obj", "file", "line", "counters"], + "3": ["seq", "tid", "event", "fn", "obj", "file", "line", "counters"], + "4": ["seq", "tid", "event", "fn", "obj", "file", "line", "counters"], + "5": ["seq", "tid", "event", "child_pid"], + "6": ["seq", "tid", "event", "child_tid"] + }, + "counters": ["Ir"], + "counter_units": { + "sysTime": "ns", + "sysCpuTime": "ns" + } +} +``` + +The `counters` array lists the dynamic counter column names (e.g. `["Ir"]` or `["Ir", "sysCount", "sysTime", "sysCpuTime"]`). Event schemas for types 1-4 use `"counters"` as a sentinel at index 7 to indicate that a sub-array of counter deltas appears at that position in data rows. 
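+
+As an illustration, the following minimal sketch (the helper name `row_to_dict`
+is ours, not part of the format) shows how a consumer could apply this schema
+to one already-decoded data row:
+
+```python
+# Sketch: map one decoded MsgPack row onto the schema chunk shown above.
+# `schema` is the decoded schema map, `row` is one decoded data-row array.
+def row_to_dict(schema: dict, row: list) -> dict:
+    fields = schema["event_schemas"][str(row[2])]  # event type is at index 2
+    out = {}
+    for name, value in zip(fields, row):
+        if name == "counters":
+            # Sentinel column: the value is a sub-array of counter deltas,
+            # ordered like the top-level "counters" list in the schema.
+            out[name] = dict(zip(schema["counters"], value))
+        else:
+            out[name] = value
+    return out
+
+# With the schema above (counters == ["Ir"]):
+#   row_to_dict(schema, [17, 1, 1, "main", "/usr/bin/app", "app.c", 0, [1234]])
+#   == {"seq": 17, "tid": 1, "event": 1, "fn": "main", "obj": "/usr/bin/app",
+#       "file": "app.c", "line": 0, "counters": {"Ir": 1234}}
+```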
+ +### Event Types + +| Type | Name | Description | +|------|--------|-------------| +| 0 | MARKER | Named marker | +| 1 | ENTER_FN | Function entry | +| 2 | EXIT_FN | Function exit | +| 3 | ENTER_INLINED_FN | Inlined function entry | +| 4 | EXIT_INLINED_FN | Inlined function exit | +| 5 | FORK | Child process created | +| 6 | THREAD_CREATE | New thread created | + +### Row Schemas + +**MARKER rows (event 0):** + +| Index | Name | Type | Description | +|-------|--------|--------|-------------| +| 0 | seq | uint64 | Sequence number | +| 1 | tid | int32 | Thread ID | +| 2 | event | int | 0 = MARKER | +| 3 | marker | string | Marker label | + +**ENTER_FN/EXIT_FN rows (event 1, 2):** + +| Index | Name | Type | Description | +|-------|----------|--------|-------------| +| 0 | seq | uint64 | Sequence number | +| 1 | tid | int32 | Thread ID | +| 2 | event | int | 1 = ENTER_FN, 2 = EXIT_FN | +| 3 | fn | string | Function name | +| 4 | obj | string | Shared object path | +| 5 | file | string | Source file path | +| 6 | line | int32 | Line number (0 if unknown) | +| 7 | counters | array | Counter deltas sub-array (Ir, Dr, Dw, etc.) | + +**ENTER_INLINED_FN/EXIT_INLINED_FN rows (event 3, 4):** + +Same schema as ENTER_FN/EXIT_FN rows. + +| Index | Name | Type | Description | +|-------|----------|--------|-------------| +| 0 | seq | uint64 | Sequence number | +| 1 | tid | int32 | Thread ID | +| 2 | event | int | 3 = ENTER_INLINED_FN, 4 = EXIT_INLINED_FN | +| 3 | fn | string | Function name | +| 4 | obj | string | Shared object path | +| 5 | file | string | Source file path | +| 6 | line | int32 | Line number (0 if unknown) | +| 7 | counters | array | Counter deltas sub-array (Ir, Dr, Dw, etc.) | + +**FORK rows (event 5):** + +| Index | Name | Type | Description | +|-------|-----------|--------|-------------| +| 0 | seq | uint64 | Sequence number | +| 1 | tid | int32 | Thread ID that called fork | +| 2 | event | int | 5 = FORK | +| 3 | child_pid | int32 | PID of the new child process | + +**THREAD_CREATE rows (event 6):** + +| Index | Name | Type | Description | +|-------|-----------|--------|-------------| +| 0 | seq | uint64 | Sequence number | +| 1 | tid | int32 | Thread ID that created the new thread | +| 2 | event | int | 6 = THREAD_CREATE | +| 3 | child_tid | int32 | Thread ID of the new child thread | + +### Event Counter Columns + +For ENTER_FN/EXIT_FN/ENTER_INLINED_FN/EXIT_INLINED_FN rows, event counters appear as a sub-array at index 7. The order of values in the sub-array corresponds to the top-level `counters` array in the schema. Which counters are present depends on Tracegrind options: + +`Ir`, `Dr`, `Dw`, `I1mr`, `D1mr`, `D1mw`, `ILmr`, `DLmr`, `DLmw`, `Bc`, `Bcm`, `Bi`, `Bim` + +### Counter Units + +The `counter_units` field is a map from event counter name to its unit string. Only time-based counters are listed; counters absent from the map are dimensionless. + +| `--collect-systime` | Entries in `counter_units` | +|---------------------|--------------------| +| `msec` | `"sysTime": "ms"` | +| `usec` | `"sysTime": "us"` | +| `nsec` | `"sysTime": "ns"`, `"sysCpuTime": "ns"` | + +When `--collect-systime` is not set, the `counter_units` map is empty. + +## Data Chunks + +Each data chunk contains concatenated MsgPack arrays. 
The row format depends on the event type (index 2): + +``` +[seq, tid, 0, marker] # MARKER +[seq, tid, 1, fn, obj, file, line, [delta_Ir, ...]] # ENTER_FN +[seq, tid, 2, fn, obj, file, line, [delta_Ir, ...]] # EXIT_FN +[seq, tid, 3, fn, obj, file, line, [delta_Ir, ...]] # ENTER_INLINED_FN +[seq, tid, 4, fn, obj, file, line, [delta_Ir, ...]] # EXIT_INLINED_FN +[seq, tid, 5, child_pid] # FORK +[seq, tid, 6, child_tid] # THREAD_CREATE +``` + +The reference implementation writes 4096 rows per chunk. + +## End Marker + +8 zero bytes (uncompressed_size = 0, compressed_size = 0). + +## Example: Reading in Python + +```python +import struct, lz4.block, msgpack + +def read_tracegrind(filepath): + with open(filepath, 'rb') as f: + assert f.read(4) == b'TGMP' + version = struct.unpack('. + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "config.h" +#include "global.h" +#include "tg_lz4.h" +#include "tg_msgpack.h" + +#include "pub_tool_libcfile.h" +#include "pub_tool_threadstate.h" + +/* Total reads/writes/misses sum over all threads. */ +FullCost TG_(total_cost) = 0; + +EventMapping* TG_(dumpmap) = 0; + +/* ================================================================== */ +/* === Trace output === */ +/* ================================================================== */ + +trace_output TG_(trace_out) = { + .fd = -1, .seq = 0, .initialized = False, .header_written = False}; + +/* ================================================================== */ +/* === MsgPack + LZ4 output === */ +/* ================================================================== */ + +#define MSGPACK_CHUNK_ROWS 4096 /* Rows per compressed chunk */ +#define MSGPACK_INITIAL_BUF (256 * 1024) /* Initial buffer size */ + +typedef struct { + msgpack_buffer buf; /* Buffer for serializing rows */ + UInt rows_in_chunk; /* Number of rows in current chunk */ + UInt n_event_cols; /* Number of dynamic event columns */ + const HChar** col_names; /* Column names (for header) */ + Int ncols; /* Total columns including events */ +} msgpack_state; + +static msgpack_state mp_state; + +/* Write a compressed chunk to the trace output */ +static void msgpack_flush_chunk(void) +{ + if (mp_state.rows_in_chunk == 0) + return; + if (TG_(trace_out).fd < 0) + return; + + /* Compress the msgpack data with zstd */ + SizeT src_size = mp_state.buf.size; + SizeT dst_capacity = tg_lz4_compress_bound(src_size); + UChar* compressed = VG_(malloc)("tg.mp.compress", dst_capacity); + + SizeT compressed_size = + tg_lz4_compress(compressed, dst_capacity, mp_state.buf.data, src_size); + + if (compressed_size == 0) { + /* Compression failed, write raw with size=0 marker */ + VG_(free)(compressed); + return; + } + + /* Write chunk header: 4 bytes uncompressed size, 4 bytes compressed size */ + UChar hdr[8]; + hdr[0] = (UChar)(src_size & 0xff); + hdr[1] = (UChar)((src_size >> 8) & 0xff); + hdr[2] = (UChar)((src_size >> 16) & 0xff); + hdr[3] = (UChar)((src_size >> 24) & 0xff); + hdr[4] = (UChar)(compressed_size & 0xff); + hdr[5] = (UChar)((compressed_size >> 8) & 0xff); + hdr[6] = (UChar)((compressed_size >> 16) & 0xff); + hdr[7] = (UChar)((compressed_size >> 24) & 0xff); + VG_(write)(TG_(trace_out).fd, hdr, 8); + + /* Write compressed data */ + VG_(write)(TG_(trace_out).fd, compressed, compressed_size); + + VG_(free)(compressed); + + /* Reset buffer for next chunk */ + msgpack_reset(&mp_state.buf); + mp_state.rows_in_chunk = 0; +} + +/* Write file header with schema metadata (discriminated union format) */ +static void 
msgpack_write_header(void) +{ + msgpack_buffer hdr; + msgpack_init(&hdr, 2048); + + /* Header is a map with metadata */ + msgpack_write_map_header(&hdr, 7); + + /* version */ + msgpack_write_key(&hdr, "version"); + msgpack_write_uint(&hdr, 4); + + /* format */ + msgpack_write_key(&hdr, "format"); + msgpack_write_str(&hdr, "tracegrind-msgpack", -1); + + /* creator */ + msgpack_write_key(&hdr, "creator"); + msgpack_write_str(&hdr, "valgrind-tracegrind", -1); + + /* creator_version */ + msgpack_write_key(&hdr, "creator_version"); + msgpack_write_str(&hdr, VERSION, -1); + + /* event_schemas - discriminated union: each event type has its own schema */ + msgpack_write_key(&hdr, "event_schemas"); + msgpack_write_map_header(&hdr, 7); /* 7 event types */ + + /* Event type 0 (MARKER) schema */ + msgpack_write_key(&hdr, "0"); + msgpack_write_array_header(&hdr, 4); + msgpack_write_str(&hdr, "seq", -1); + msgpack_write_str(&hdr, "tid", -1); + msgpack_write_str(&hdr, "event", -1); + msgpack_write_str(&hdr, "marker", -1); + + /* Event types 1-4: 7 fixed columns + "counters" sentinel */ + { + const HChar* ev_keys[] = {"1", "2", "3", "4"}; + Int k; + for (k = 0; k < 4; k++) { + msgpack_write_key(&hdr, ev_keys[k]); + msgpack_write_array_header(&hdr, 8); + msgpack_write_str(&hdr, "seq", -1); + msgpack_write_str(&hdr, "tid", -1); + msgpack_write_str(&hdr, "event", -1); + msgpack_write_str(&hdr, "fn", -1); + msgpack_write_str(&hdr, "obj", -1); + msgpack_write_str(&hdr, "file", -1); + msgpack_write_str(&hdr, "line", -1); + msgpack_write_str(&hdr, "counters", -1); + } + } + + /* Event type 5 (FORK) schema */ + msgpack_write_key(&hdr, "5"); + msgpack_write_array_header(&hdr, 4); + msgpack_write_str(&hdr, "seq", -1); + msgpack_write_str(&hdr, "tid", -1); + msgpack_write_str(&hdr, "event", -1); + msgpack_write_str(&hdr, "child_pid", -1); + + /* Event type 6 (THREAD_CREATE) schema */ + msgpack_write_key(&hdr, "6"); + msgpack_write_array_header(&hdr, 4); + msgpack_write_str(&hdr, "seq", -1); + msgpack_write_str(&hdr, "tid", -1); + msgpack_write_str(&hdr, "event", -1); + msgpack_write_str(&hdr, "child_tid", -1); + + /* counters - array of dynamic counter column names */ + msgpack_write_key(&hdr, "counters"); + msgpack_write_array_header(&hdr, mp_state.n_event_cols); + { + Int i; + for (i = 7; i < mp_state.ncols; i++) { + msgpack_write_str(&hdr, mp_state.col_names[i], -1); + } + } + + /* counter_units - map from counter name to unit string. + Following callgrind's convention: only time counters get units. 
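+     E.g. with --collect-systime=nsec the map becomes
+     {"sysTime": "ns", "sysCpuTime": "ns"}; with =msec or =usec only
+     "sysTime" is present; otherwise it stays empty.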
*/ + msgpack_write_key(&hdr, "counter_units"); + { + Int n_units = 0; + const HChar* unit_str = NULL; + switch (TG_(clo).collect_systime) { + case systime_no: + break; + case systime_msec: + unit_str = "ms"; + n_units = 1; + break; + case systime_usec: + unit_str = "us"; + n_units = 1; + break; + case systime_nsec: + unit_str = "ns"; + n_units = 2; + break; + } + msgpack_write_map_header(&hdr, n_units); + if (unit_str) { + msgpack_write_key(&hdr, "sysTime"); + msgpack_write_str(&hdr, unit_str, -1); + if (TG_(clo).collect_systime == systime_nsec) { + msgpack_write_key(&hdr, "sysCpuTime"); + msgpack_write_str(&hdr, unit_str, -1); + } + } + } + + /* Compress and write header chunk */ + SizeT src_size = hdr.size; + SizeT dst_capacity = tg_lz4_compress_bound(src_size); + UChar* compressed = VG_(malloc)("tg.mp.hdr", dst_capacity); + + SizeT compressed_size = + tg_lz4_compress(compressed, dst_capacity, hdr.data, src_size); + + /* Magic + version (8 bytes): "TGMP" + version(4) - version 4 */ + UChar magic[8] = {'T', 'G', 'M', 'P', 0x04, 0x00, 0x00, 0x00}; + VG_(write)(TG_(trace_out).fd, magic, 8); + + /* Header chunk size (4 bytes uncompressed, 4 bytes compressed) */ + UChar hdr_size[8]; + hdr_size[0] = (UChar)(src_size & 0xff); + hdr_size[1] = (UChar)((src_size >> 8) & 0xff); + hdr_size[2] = (UChar)((src_size >> 16) & 0xff); + hdr_size[3] = (UChar)((src_size >> 24) & 0xff); + hdr_size[4] = (UChar)(compressed_size & 0xff); + hdr_size[5] = (UChar)((compressed_size >> 8) & 0xff); + hdr_size[6] = (UChar)((compressed_size >> 16) & 0xff); + hdr_size[7] = (UChar)((compressed_size >> 24) & 0xff); + VG_(write)(TG_(trace_out).fd, hdr_size, 8); + + /* Compressed header data */ + VG_(write)(TG_(trace_out).fd, compressed, compressed_size); + + VG_(free)(compressed); + msgpack_free(&hdr); +} + +/* Initialize msgpack state with schema from event sets */ +static void msgpack_init_state(void) +{ + EventSet* es = TG_(sets).full; + Int g, i; + + /* Count dynamic event columns */ + Int n_events = 0; + for (g = 0; g < MAX_EVENTGROUP_COUNT; g++) { + if (!(es->mask & (1u << g))) + continue; + EventGroup* eg = TG_(get_event_group)(g); + if (!eg) + continue; + n_events += eg->size; + } + + mp_state.n_event_cols = n_events; + mp_state.ncols = 7 + n_events; /* 7 fixed + dynamic */ + + /* Allocate column names array */ + mp_state.col_names = + VG_(malloc)("tg.mp.cols", mp_state.ncols * sizeof(HChar*)); + + /* Fixed columns */ + mp_state.col_names[0] = "seq"; + mp_state.col_names[1] = "tid"; + mp_state.col_names[2] = "event"; + mp_state.col_names[3] = "fn"; + mp_state.col_names[4] = "obj"; + mp_state.col_names[5] = "file"; + mp_state.col_names[6] = "line"; + + /* Dynamic event columns */ + Int c = 7; + for (g = 0; g < MAX_EVENTGROUP_COUNT; g++) { + if (!(es->mask & (1u << g))) + continue; + EventGroup* eg = TG_(get_event_group)(g); + if (!eg) + continue; + for (i = 0; i < eg->size; i++) { + mp_state.col_names[c++] = eg->name[i]; + } + } + + /* Initialize buffer */ + msgpack_init(&mp_state.buf, MSGPACK_INITIAL_BUF); + mp_state.rows_in_chunk = 0; + + /* Write file header */ + msgpack_write_header(); +} + +/* Add an ENTER/EXIT row to the msgpack output */ +static void msgpack_add_row(ULong seq, + Int tid, + Int event, + const HChar* fn_name, + Int fn_len, + const HChar* obj_name, + Int obj_len, + const HChar* file_name, + Int file_len, + Int line, + const ULong* deltas, + Int n_deltas) +{ + /* Each row is a msgpack array: 7 fixed + 1 counters sub-array */ + msgpack_write_array_header(&mp_state.buf, 8); + + /* Fixed columns */ + 
msgpack_write_uint(&mp_state.buf, seq); + msgpack_write_int(&mp_state.buf, tid); + msgpack_write_int(&mp_state.buf, event); + msgpack_write_str(&mp_state.buf, fn_name, fn_len); + msgpack_write_str(&mp_state.buf, obj_name, obj_len); + msgpack_write_str(&mp_state.buf, file_name, file_len); + msgpack_write_int(&mp_state.buf, line); + + /* Counters sub-array */ + msgpack_write_array_header(&mp_state.buf, n_deltas); + for (Int i = 0; i < n_deltas; i++) { + msgpack_write_uint(&mp_state.buf, deltas[i]); + } + + mp_state.rows_in_chunk++; + + /* Flush if chunk is full */ + if (mp_state.rows_in_chunk >= MSGPACK_CHUNK_ROWS) { + msgpack_flush_chunk(); + } +} + +/* Add a FORK row to the msgpack output (minimal schema: seq, tid, event, + * child_pid) */ +static void msgpack_add_fork_row(ULong seq, Int tid, Int child_pid) +{ + /* FORK row is a 4-element array */ + msgpack_write_array_header(&mp_state.buf, 4); + msgpack_write_uint(&mp_state.buf, seq); + msgpack_write_int(&mp_state.buf, tid); + msgpack_write_int(&mp_state.buf, TG_EV_FORK); + msgpack_write_int(&mp_state.buf, child_pid); + + mp_state.rows_in_chunk++; + + /* Flush if chunk is full */ + if (mp_state.rows_in_chunk >= MSGPACK_CHUNK_ROWS) { + msgpack_flush_chunk(); + } +} + +/* Add a THREAD_CREATE row to the msgpack output (seq, tid, event, child_tid) */ +static void msgpack_add_thread_create_row(ULong seq, Int tid, Int child_tid) +{ + msgpack_write_array_header(&mp_state.buf, 4); + msgpack_write_uint(&mp_state.buf, seq); + msgpack_write_int(&mp_state.buf, tid); + msgpack_write_int(&mp_state.buf, TG_EV_THREAD_CREATE); + msgpack_write_int(&mp_state.buf, child_tid); + + mp_state.rows_in_chunk++; + + if (mp_state.rows_in_chunk >= MSGPACK_CHUNK_ROWS) { + msgpack_flush_chunk(); + } +} + +/* Add a MARKER row to the msgpack output (seq, tid, event, marker_str) */ +static void msgpack_add_marker_row(ULong seq, Int tid, const HChar* marker) +{ + msgpack_write_array_header(&mp_state.buf, 4); + msgpack_write_uint(&mp_state.buf, seq); + msgpack_write_int(&mp_state.buf, tid); + msgpack_write_int(&mp_state.buf, TG_EV_MARKER); + msgpack_write_str(&mp_state.buf, marker, -1); + + mp_state.rows_in_chunk++; + + if (mp_state.rows_in_chunk >= MSGPACK_CHUNK_ROWS) { + msgpack_flush_chunk(); + } +} + +/* Close msgpack output */ +static void msgpack_close_output(void) +{ + /* Flush any remaining rows */ + msgpack_flush_chunk(); + + /* Write end marker (zero-size chunk) */ + UChar end[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + VG_(write)(TG_(trace_out).fd, end, 8); + + /* Cleanup */ + msgpack_free(&mp_state.buf); + if (mp_state.col_names) { + VG_(free)(mp_state.col_names); + mp_state.col_names = NULL; + } +} + +void TG_(trace_open_output)(void) +{ + SysRes res; + HChar filename[512]; + + if (TG_(trace_out).initialized) + return; + + if (!TG_(clo).out_format) + TG_(clo).out_format = DEFAULT_OUTFORMAT; + + HChar* expanded = + VG_(expand_file_name)("--tracegrind-out-file", TG_(clo).out_format); + VG_(strncpy)(filename, expanded, sizeof(filename) - 1); + filename[sizeof(filename) - 1] = '\0'; + VG_(free)(expanded); + + res = VG_(open)(filename, VKI_O_CREAT | VKI_O_WRONLY | VKI_O_TRUNC, + VKI_S_IRUSR | VKI_S_IWUSR); + if (sr_isError(res)) { + VG_(message)(Vg_UserMsg, "Error: cannot open trace output file '%s'\n", + filename); + VG_(exit)(1); + } + + TG_(trace_out).fd = (Int)sr_Res(res); + TG_(trace_out).seq = 0; + TG_(trace_out).initialized = True; + TG_(trace_out).header_written = False; + + /* Initialize msgpack writer */ + msgpack_init_state(); + + if (VG_(clo_verbosity) > 1) + 
VG_(message)(Vg_DebugMsg, "Trace output to %s\n", filename); +} + +/* + * Called in child process after fork. + * Closes the inherited file descriptor (without writing end marker) + * and opens a new trace file with the child's PID. + */ +void TG_(trace_reopen_child)(void) +{ + /* Close inherited fd without flushing/finalizing (that's parent's job) */ + if (TG_(trace_out).fd >= 0) { + VG_(close)(TG_(trace_out).fd); + } + + /* Reset state completely */ + TG_(trace_out).fd = -1; + TG_(trace_out).seq = 0; + TG_(trace_out).initialized = False; + TG_(trace_out).header_written = False; + + /* Open new trace file with child's PID (also re-inits msgpack state) */ + TG_(trace_open_output)(); +} + +void TG_(trace_emit_sample)(ThreadId tid, Bool is_enter, fn_node* fn) +{ + Int i; + + if (!TG_(trace_out).initialized) + return; + if (TG_(trace_out).fd < 0) + return; + + /* Get current thread info for per-thread last_sample_cost */ + thread_info* ti = TG_(get_current_thread)(); + if (!ti) + return; + + EventSet* es = TG_(sets).full; + FullCost current_cost = TG_(current_state).cost; + + /* If last_sample_cost not yet allocated, allocate and zero it */ + if (!ti->last_sample_cost) { + ti->last_sample_cost = TG_(get_eventset_cost)(es); + TG_(init_cost)(es, ti->last_sample_cost); + } + + TG_(trace_out).seq++; + + /* Resolve function info with cached lengths */ + const HChar* fn_name; + Int fn_len; + const HChar* obj_name; + Int obj_len; + const HChar* file_name; + Int file_len; + + if (fn) { + fn_name = fn->name; + fn_len = (Int)fn->name_len; + if (fn->file) { + file_name = fn->file->name; + file_len = (Int)fn->file->name_len; + if (fn->file->obj) { + obj_name = fn->file->obj->name; + obj_len = (Int)fn->file->obj->name_len; + } else { + obj_name = "???"; + obj_len = 3; + } + } else { + file_name = "???"; + file_len = 3; + obj_name = "???"; + obj_len = 3; + } + } else { + fn_name = "???"; + fn_len = 3; + obj_name = "???"; + obj_len = 3; + file_name = "???"; + file_len = 3; + } + + /* Compute deltas for all event counters */ + ULong deltas[64]; /* es->size is always small */ + tl_assert(es->size <= 64); + if (current_cost && ti->last_sample_cost) { + for (i = 0; i < es->size; i++) { + deltas[i] = current_cost[i] - ti->last_sample_cost[i]; + } + TG_(copy_cost)(es, ti->last_sample_cost, current_cost); + } else { + for (i = 0; i < es->size; i++) { + deltas[i] = 0; + } + } + + Int event_val = is_enter ? 
TG_EV_ENTER_FN : TG_EV_EXIT_FN; + + msgpack_add_row(TG_(trace_out).seq, (Int)tid, event_val, fn_name, fn_len, + obj_name, obj_len, file_name, file_len, 0, deltas, es->size); +} + +void TG_(trace_emit_enter_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn) +{ + Int i; + + if (!TG_(trace_out).initialized) + return; + if (TG_(trace_out).fd < 0) + return; + + thread_info* ti = TG_(get_current_thread)(); + if (!ti) + return; + + EventSet* es = TG_(sets).full; + FullCost current_cost = TG_(current_state).cost; + + if (!ti->last_sample_cost) { + ti->last_sample_cost = TG_(get_eventset_cost)(es); + TG_(init_cost)(es, ti->last_sample_cost); + } + + TG_(trace_out).seq++; + + const HChar* fn_name = inl_fn; + Int fn_len = -1; /* inlined fn names not cached, use strlen */ + const HChar* obj_name; + Int obj_len; + const HChar* file_name; + Int file_len; + + if (bb->obj) { + obj_name = bb->obj->name; + obj_len = (Int)bb->obj->name_len; + } else { + obj_name = "???"; + obj_len = 3; + } + if (bb->fn && bb->fn->file) { + file_name = bb->fn->file->name; + file_len = (Int)bb->fn->file->name_len; + } else { + file_name = "???"; + file_len = 3; + } + + ULong deltas[64]; + tl_assert(es->size <= 64); + if (current_cost && ti->last_sample_cost) { + for (i = 0; i < es->size; i++) { + deltas[i] = current_cost[i] - ti->last_sample_cost[i]; + } + TG_(copy_cost)(es, ti->last_sample_cost, current_cost); + } else { + for (i = 0; i < es->size; i++) { + deltas[i] = 0; + } + } + + msgpack_add_row(TG_(trace_out).seq, (Int)tid, TG_EV_ENTER_INLINED_FN, + fn_name, fn_len, obj_name, obj_len, file_name, file_len, + (Int)bb->line, deltas, es->size); +} + +void TG_(trace_emit_exit_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn) +{ + Int i; + + if (!TG_(trace_out).initialized) + return; + if (TG_(trace_out).fd < 0) + return; + + thread_info* ti = TG_(get_current_thread)(); + if (!ti) + return; + + EventSet* es = TG_(sets).full; + FullCost current_cost = TG_(current_state).cost; + + if (!ti->last_sample_cost) { + ti->last_sample_cost = TG_(get_eventset_cost)(es); + TG_(init_cost)(es, ti->last_sample_cost); + } + + TG_(trace_out).seq++; + + const HChar* fn_name = inl_fn; + Int fn_len = -1; /* inlined fn names not cached, use strlen */ + const HChar* obj_name; + Int obj_len; + const HChar* file_name; + Int file_len; + + if (bb->obj) { + obj_name = bb->obj->name; + obj_len = (Int)bb->obj->name_len; + } else { + obj_name = "???"; + obj_len = 3; + } + if (bb->fn && bb->fn->file) { + file_name = bb->fn->file->name; + file_len = (Int)bb->fn->file->name_len; + } else { + file_name = "???"; + file_len = 3; + } + + ULong deltas[64]; + tl_assert(es->size <= 64); + if (current_cost && ti->last_sample_cost) { + for (i = 0; i < es->size; i++) { + deltas[i] = current_cost[i] - ti->last_sample_cost[i]; + } + TG_(copy_cost)(es, ti->last_sample_cost, current_cost); + } else { + for (i = 0; i < es->size; i++) { + deltas[i] = 0; + } + } + + msgpack_add_row(TG_(trace_out).seq, (Int)tid, TG_EV_EXIT_INLINED_FN, fn_name, + fn_len, obj_name, obj_len, file_name, file_len, + (Int)bb->line, deltas, es->size); +} + +/* + * Emit a FORK event when a child process is created. + * Called from the post-syscall handler when fork/clone returns in parent. + * child_pid is the PID of the newly created child process. 
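+ *
+ * A trace consumer can always dispatch on the third element of a row (the
+ * event code): FORK, THREAD_CREATE and MARKER rows are 4-element arrays,
+ * while ENTER/EXIT samples carry the full schema including the counters.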
+ */ +void TG_(trace_emit_fork)(ThreadId tid, Int child_pid) +{ + if (!TG_(trace_out).initialized) + return; + if (TG_(trace_out).fd < 0) + return; + + TG_(trace_out).seq++; + + /* FORK uses minimal schema: [seq, tid, event, child_pid] */ + msgpack_add_fork_row(TG_(trace_out).seq, (Int)tid, child_pid); +} + +void TG_(trace_emit_thread_create)(ThreadId tid, ThreadId child) +{ + if (!TG_(trace_out).initialized) + return; + if (TG_(trace_out).fd < 0) + return; + + TG_(trace_out).seq++; + + msgpack_add_thread_create_row(TG_(trace_out).seq, (Int)tid, (Int)child); +} + +void TG_(trace_emit_marker)(ThreadId tid, const HChar* marker) +{ + if (!TG_(trace_out).initialized) + return; + if (TG_(trace_out).fd < 0) + return; + + TG_(trace_out).seq++; + + msgpack_add_marker_row(TG_(trace_out).seq, (Int)tid, marker); +} + +void TG_(trace_close_output)(void) +{ + if (!TG_(trace_out).initialized) + return; + if (TG_(trace_out).fd < 0) + return; + + /* Flush remaining rows, write end marker */ + msgpack_close_output(); + VG_(close)(TG_(trace_out).fd); + + TG_(trace_out).fd = -1; + TG_(trace_out).initialized = False; + + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, "Trace output closed (%llu samples written)\n", + TG_(trace_out).seq); +} + +/* Sum costs from all threads into total_cost */ +void TG_(compute_total_cost)(void) +{ + if (!TG_(total_cost)) { + TG_(total_cost) = TG_(get_eventset_cost)(TG_(sets).full); + TG_(init_cost)(TG_(sets).full, TG_(total_cost)); + } + + { + Int t; + thread_info** thr = TG_(get_threads)(); + for (t = 1; t < VG_N_THREADS; t++) { + if (!thr[t]) + continue; + TG_(add_diff_cost) + (TG_(sets).full, TG_(total_cost), thr[t]->lastdump_cost, + thr[t]->states.entry[0]->cost); + TG_(copy_cost) + (TG_(sets).full, thr[t]->lastdump_cost, thr[t]->states.entry[0]->cost); + } + } +} diff --git a/tracegrind/events.c b/tracegrind/events.c new file mode 100644 index 000000000..4e91967b8 --- /dev/null +++ b/tracegrind/events.c @@ -0,0 +1,524 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- events.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. 
+*/ + +#include "global.h" + +/* This should be 2**MAX_EVENTGROUP_COUNT */ +#define MAX_EVENTSET_COUNT 1024 + +static EventGroup* eventGroup[MAX_EVENTGROUP_COUNT]; +static EventSet* eventSetTable[MAX_EVENTSET_COUNT]; +static Bool eventSets_initialized = 0; + +static void initialize_event_sets(void) +{ + Int i; + + if (eventSets_initialized) + return; + + for (i = 0; i < MAX_EVENTGROUP_COUNT; i++) + eventGroup[i] = 0; + + for (i = 0; i < MAX_EVENTSET_COUNT; i++) + eventSetTable[i] = 0; + + eventSets_initialized = 1; +} + +static EventGroup* new_event_group(int id, int n) +{ + EventGroup* eg; + + initialize_event_sets(); + + TG_ASSERT(id >= 0 && id < MAX_EVENTGROUP_COUNT); + TG_ASSERT(eventGroup[id] == 0); + + eg = (EventGroup*)TG_MALLOC("cl.events.group.1", + sizeof(EventGroup) + n * sizeof(HChar*)); + eg->size = n; + eventGroup[id] = eg; + return eg; +} + +EventGroup* TG_(register_event_group)(int id, const HChar* n1) +{ + EventGroup* eg = new_event_group(id, 1); + eg->name[0] = n1; + + return eg; +} + +EventGroup* TG_(register_event_group2)(int id, const HChar* n1, const HChar* n2) +{ + EventGroup* eg = new_event_group(id, 2); + eg->name[0] = n1; + eg->name[1] = n2; + + return eg; +} + +EventGroup* TG_(register_event_group3)(int id, + const HChar* n1, + const HChar* n2, + const HChar* n3) +{ + EventGroup* eg = new_event_group(id, 3); + eg->name[0] = n1; + eg->name[1] = n2; + eg->name[2] = n3; + + return eg; +} + +EventGroup* TG_(register_event_group4)( + int id, const HChar* n1, const HChar* n2, const HChar* n3, const HChar* n4) +{ + EventGroup* eg = new_event_group(id, 4); + eg->name[0] = n1; + eg->name[1] = n2; + eg->name[2] = n3; + eg->name[3] = n4; + + return eg; +} + +EventGroup* TG_(get_event_group)(int id) +{ + TG_ASSERT(id >= 0 && id < MAX_EVENTGROUP_COUNT); + + return eventGroup[id]; +} + +static EventSet* eventset_from_mask(UInt mask) +{ + EventSet* es; + Int i, count, offset; + + if (mask >= MAX_EVENTSET_COUNT) + return 0; + + initialize_event_sets(); + if (eventSetTable[mask]) + return eventSetTable[mask]; + + es = (EventSet*)TG_MALLOC("cl.events.eventset.1", sizeof(EventSet)); + es->mask = mask; + + offset = 0; + count = 0; + for (i = 0; i < MAX_EVENTGROUP_COUNT; i++) { + es->offset[i] = offset; + if (((mask & (1u << i)) == 0) || (eventGroup[i] == 0)) + continue; + + offset += eventGroup[i]->size; + count++; + } + es->size = offset; + es->count = count; + + eventSetTable[mask] = es; + return es; +} + +EventSet* TG_(get_event_set)(Int id) +{ + TG_ASSERT(id >= 0 && id < MAX_EVENTGROUP_COUNT); + return eventset_from_mask(1u << id); +} + +EventSet* TG_(get_event_set2)(Int id1, Int id2) +{ + TG_ASSERT(id1 >= 0 && id1 < MAX_EVENTGROUP_COUNT); + TG_ASSERT(id2 >= 0 && id2 < MAX_EVENTGROUP_COUNT); + return eventset_from_mask((1u << id1) | (1u << id2)); +} + +EventSet* TG_(add_event_group)(EventSet* es, Int id) +{ + TG_ASSERT(id >= 0 && id < MAX_EVENTGROUP_COUNT); + if (!es) + es = eventset_from_mask(0); + return eventset_from_mask(es->mask | (1u << id)); +} + +EventSet* TG_(add_event_group2)(EventSet* es, Int id1, Int id2) +{ + TG_ASSERT(id1 >= 0 && id1 < MAX_EVENTGROUP_COUNT); + TG_ASSERT(id2 >= 0 && id2 < MAX_EVENTGROUP_COUNT); + if (!es) + es = eventset_from_mask(0); + return eventset_from_mask(es->mask | (1u << id1) | (1u << id2)); +} + +EventSet* TG_(add_event_set)(EventSet* es1, EventSet* es2) +{ + if (!es1) + es1 = eventset_from_mask(0); + if (!es2) + es2 = eventset_from_mask(0); + return eventset_from_mask(es1->mask | es2->mask); +} + +/* Get cost array for an event set */ 
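+/* The array has es->size slots, one 64-bit counter per event in the set. */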
+ULong* TG_(get_eventset_cost)(EventSet* es) +{ + return TG_(get_costarray)(es->size); +} + +/* Set all costs of an event set to zero */ +void TG_(init_cost)(EventSet* es, ULong* cost) +{ + Int i; + + if (!cost) + return; + + for (i = 0; i < es->size; i++) + cost[i] = 0; +} + +/* Set all costs of an event set to zero */ +void TG_(init_cost_lz)(EventSet* es, ULong** cost) +{ + Int i; + + TG_ASSERT(cost != 0); + if (!(*cost)) + *cost = TG_(get_eventset_cost)(es); + + for (i = 0; i < es->size; i++) + (*cost)[i] = 0; +} + +void TG_(zero_cost)(EventSet* es, ULong* cost) +{ + Int i; + + if (!cost) + return; + + for (i = 0; i < es->size; i++) + cost[i] = 0; +} + +Bool TG_(is_zero_cost)(EventSet* es, ULong* cost) +{ + Int i; + + if (!cost) + return True; + + for (i = 0; i < es->size; i++) + if (cost[i] != 0) + return False; + + return True; +} + +void TG_(copy_cost)(EventSet* es, ULong* dst, ULong* src) +{ + Int i; + + if (!src) { + TG_(zero_cost)(es, dst); + return; + } + TG_ASSERT(dst != 0); + + for (i = 0; i < es->size; i++) + dst[i] = src[i]; +} + +void TG_(copy_cost_lz)(EventSet* es, ULong** pdst, ULong* src) +{ + Int i; + ULong* dst; + + TG_ASSERT(pdst != 0); + + if (!src) { + TG_(zero_cost)(es, *pdst); + return; + } + dst = *pdst; + if (!dst) + dst = *pdst = TG_(get_eventset_cost)(es); + + for (i = 0; i < es->size; i++) + dst[i] = src[i]; +} + +void TG_(add_cost)(EventSet* es, ULong* dst, ULong* src) +{ + Int i; + + if (!src) + return; + TG_ASSERT(dst != 0); + + for (i = 0; i < es->size; i++) + dst[i] += src[i]; +} + +void TG_(add_cost_lz)(EventSet* es, ULong** pdst, ULong* src) +{ + Int i; + ULong* dst; + + if (!src) + return; + TG_ASSERT(pdst != 0); + + dst = *pdst; + if (!dst) { + dst = *pdst = TG_(get_eventset_cost)(es); + TG_(copy_cost)(es, dst, src); + return; + } + + for (i = 0; i < es->size; i++) + dst[i] += src[i]; +} + +/* Adds src to dst and zeros src. Returns false if nothing changed */ +Bool TG_(add_and_zero_cost)(EventSet* es, ULong* dst, ULong* src) +{ + Int i; + Bool is_nonzero = False; + + TG_ASSERT((es != 0) && (dst != 0)); + if (!src) + return False; + + for (i = 0; i < es->size; i++) { + if (src[i] == 0) + continue; + dst[i] += src[i]; + src[i] = 0; + is_nonzero = True; + } + + return is_nonzero; +} + +/* Adds src to dst and zeros src. Returns false if nothing changed */ +Bool TG_(add_and_zero_cost2)(EventSet* esDst, + ULong* dst, + EventSet* esSrc, + ULong* src) +{ + Int i, j; + Bool is_nonzero = False; + UInt mask; + EventGroup* eg; + ULong * egDst, *egSrc; + + TG_ASSERT((esDst != 0) && (dst != 0) && (esSrc != 0)); + if (!src) + return False; + + for (i = 0, mask = 1; i < MAX_EVENTGROUP_COUNT; i++, mask = mask << 1) { + if ((esSrc->mask & mask) == 0) + continue; + if (eventGroup[i] == 0) + continue; + + /* if src has a subset, dst must have, too */ + TG_ASSERT((esDst->mask & mask) > 0); + eg = eventGroup[i]; + egSrc = src + esSrc->offset[i]; + egDst = dst + esDst->offset[i]; + for (j = 0; j < eg->size; j++) { + if (egSrc[j] == 0) + continue; + egDst[j] += egSrc[j]; + egSrc[j] = 0; + is_nonzero = True; + } + } + + return is_nonzero; +} + +/* Adds difference of new and old to dst, and set old to new. 
+ * Returns false if nothing changed */ +Bool TG_(add_diff_cost)(EventSet* es, ULong* dst, ULong* old, ULong* new_cost) +{ + Int i; + Bool is_nonzero = False; + + TG_ASSERT((es != 0) && (dst != 0)); + TG_ASSERT(old && new_cost); + + for (i = 0; i < es->size; i++) { + if (new_cost[i] == old[i]) + continue; + dst[i] += new_cost[i] - old[i]; + old[i] = new_cost[i]; + is_nonzero = True; + } + + return is_nonzero; +} + +Bool TG_(add_diff_cost_lz)(EventSet* es, + ULong** pdst, + ULong* old, + ULong* new_cost) +{ + Int i; + ULong* dst; + Bool is_nonzero = False; + + TG_ASSERT((es != 0) && (pdst != 0)); + TG_ASSERT(old && new_cost); + + dst = *pdst; + if (!dst) { + dst = *pdst = TG_(get_eventset_cost)(es); + TG_(zero_cost)(es, dst); + } + + for (i = 0; i < es->size; i++) { + if (new_cost[i] == old[i]) + continue; + dst[i] += new_cost[i] - old[i]; + old[i] = new_cost[i]; + is_nonzero = True; + } + + return is_nonzero; +} + +/* Allocate space for an event mapping */ +EventMapping* TG_(get_eventmapping)(EventSet* es) +{ + EventMapping* em; + + TG_ASSERT(es != 0); + + em = (EventMapping*)TG_MALLOC( + "cl.events.geMapping.1", + sizeof(EventMapping) + sizeof(struct EventMappingEntry) * es->size); + em->capacity = es->size; + em->size = 0; + em->es = es; + + return em; +} + +void TG_(append_event)(EventMapping* em, const HChar* n) +{ + Int i, j, offset = 0; + UInt mask; + EventGroup* eg; + + TG_ASSERT(em != 0); + for (i = 0, mask = 1; i < MAX_EVENTGROUP_COUNT; i++, mask = mask << 1) { + if ((em->es->mask & mask) == 0) + continue; + if (eventGroup[i] == 0) + continue; + + eg = eventGroup[i]; + for (j = 0; j < eg->size; j++, offset++) { + if (VG_(strcmp)(n, eg->name[j]) != 0) + continue; + + TG_ASSERT(em->capacity > em->size); + em->entry[em->size].group = i; + em->entry[em->size].index = j; + em->entry[em->size].offset = offset; + em->size++; + return; + } + } +} + +/* Returns pointer to dynamically string. The string will be overwritten + with each invocation. */ +HChar* TG_(eventmapping_as_string)(const EventMapping* em) +{ + Int i; + EventGroup* eg; + + TG_ASSERT(em != 0); + + XArray* xa = + VG_(newXA)(VG_(malloc), "cl.events.emas", VG_(free), sizeof(HChar)); + + for (i = 0; i < em->size; i++) { + if (i > 0) { + VG_(xaprintf)(xa, "%c", ' '); + } + eg = eventGroup[em->entry[i].group]; + TG_ASSERT(eg != 0); + VG_(xaprintf)(xa, "%s", eg->name[em->entry[i].index]); + } + VG_(xaprintf)(xa, "%c", '\0'); // zero terminate the string + + HChar* buf = VG_(strdup)("cl.events.emas", VG_(indexXA)(xa, 0)); + VG_(deleteXA)(xa); + + return buf; +} + +/* Returns pointer to dynamically allocated string. Caller needs to + VG_(free) it. 
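+   Worked illustration (not taken from a real dump): for a mapping of four
+   events with counter values 5, 0, 3 and 0 the result is "5 0 3": interior
+   zeros are kept, trailing zeros are dropped.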
*/ +HChar* TG_(mappingcost_as_string)(const EventMapping* em, const ULong* c) +{ + Int i, skipped = 0; + + if (!c || em->size == 0) + return VG_(strdup)("cl.events.mcas", ""); + + XArray* xa = + VG_(newXA)(VG_(malloc), "cl.events.mcas", VG_(free), sizeof(HChar)); + + /* At least one entry */ + VG_(xaprintf)(xa, "%llu", c[em->entry[0].offset]); + + for (i = 1; i < em->size; i++) { + if (c[em->entry[i].offset] == 0) { + skipped++; + continue; + } + while (skipped > 0) { + VG_(xaprintf)(xa, " 0"); + skipped--; + } + VG_(xaprintf)(xa, " %llu", c[em->entry[i].offset]); + } + VG_(xaprintf)(xa, "%c", '\0'); // zero terminate the string + + HChar* buf = VG_(strdup)("cl.events.mas", VG_(indexXA)(xa, 0)); + VG_(deleteXA)(xa); + + return buf; +} diff --git a/tracegrind/events.h b/tracegrind/events.h new file mode 100644 index 000000000..bac264c45 --- /dev/null +++ b/tracegrind/events.h @@ -0,0 +1,131 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- events.h ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +/* Abstractions for 64-bit cost lists (events.h) */ + +#ifndef TG_EVENTS +#define TG_EVENTS + +#include "pub_tool_basics.h" + +#define TG_(str) VGAPPEND(vgTracegrind_, str) + +/* Event groups consist of one or more named event types. + * Event sets are constructed from such event groups. + * + * Event groups have to be registered globally with a unique ID + * before they can be used in an event set. + * A group can appear at most once in a event set. + */ + +#define MAX_EVENTGROUP_COUNT 10 + +typedef struct _EventGroup EventGroup; +struct _EventGroup { + Int size; + const HChar* name[0]; +}; + +/* return 0 if event group can not be registered */ +EventGroup* TG_(register_event_group)(int id, const HChar*); +EventGroup* TG_(register_event_group2)(int id, const HChar*, const HChar*); +EventGroup* + TG_(register_event_group3)(int id, const HChar*, const HChar*, const HChar*); +EventGroup* TG_(register_event_group4)( + int id, const HChar*, const HChar*, const HChar*, const HChar*); +EventGroup* TG_(get_event_group)(int id); + +/* Event sets are defined by event groups they consist of. 
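+ *
+ * The member groups are encoded as a bit mask, and the per-group counters
+ * are laid out back to back in group-ID order.
+ * Purely illustrative example: with group 0 of size 2 and group 2 of size 3
+ * registered, the set with mask 0b101 gets offset[0] = 0, offset[2] = 2,
+ * size = 5 and count = 2.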
*/ + +typedef struct _EventSet EventSet; +struct _EventSet { + /* if subset with ID x is in the set, then bit x is set */ + UInt mask; + Int count; + Int size; + Int offset[MAX_EVENTGROUP_COUNT]; +}; + +/* Same event set is returned when requesting same event groups */ +EventSet* TG_(get_event_set)(Int id); +EventSet* TG_(get_event_set2)(Int id1, Int id2); +EventSet* TG_(add_event_group)(EventSet*, Int id); +EventSet* TG_(add_event_group2)(EventSet*, Int id1, Int id2); +EventSet* TG_(add_event_set)(EventSet*, EventSet*); + +/* Operations on costs. A cost pointer of 0 means zero cost. + * Functions ending in _lz allocate cost arrays only when needed + */ +ULong* TG_(get_eventset_cost)(EventSet*); +/* Set costs of event set to 0 */ +void TG_(init_cost)(EventSet*, ULong*); +/* This always allocates counter and sets them to 0 */ +void TG_(init_cost_lz)(EventSet*, ULong**); +/* Set costs of an event set to zero */ +void TG_(zero_cost)(EventSet*, ULong*); +Bool TG_(is_zero_cost)(EventSet*, ULong*); +void TG_(copy_cost)(EventSet*, ULong* dst, ULong* src); +void TG_(copy_cost_lz)(EventSet*, ULong** pdst, ULong* src); +void TG_(add_cost)(EventSet*, ULong* dst, ULong* src); +void TG_(add_cost_lz)(EventSet*, ULong** pdst, ULong* src); +/* Adds src to dst and zeros src. Returns false if nothing changed */ +Bool TG_(add_and_zero_cost)(EventSet*, ULong* dst, ULong* src); +Bool TG_(add_and_zero_cost2)(EventSet*, ULong* dst, EventSet*, ULong* src); +/* Adds difference of new and old to to dst, and set old to new. + * Returns false if nothing changed */ +Bool TG_(add_diff_cost)(EventSet*, ULong* dst, ULong* old, ULong* new_cost); +Bool + TG_(add_diff_cost_lz)(EventSet*, ULong** pdst, ULong* old, ULong* new_cost); + +/* EventMapping: An ordered subset of events from an event set. + * This is used to print out part of an EventSet, or in another order. + */ +struct EventMappingEntry { + Int group; + Int index; + Int offset; +}; +typedef struct _EventMapping EventMapping; +struct _EventMapping { + EventSet* es; + Int size; + Int capacity; + struct EventMappingEntry entry[0]; +}; + +/* Allocate space for an event mapping */ +EventMapping* TG_(get_eventmapping)(EventSet*); +void TG_(append_event)(EventMapping*, const HChar*); +/* Returns event mapping as a character string. That string is dynamically + allocated and it is the caller's responsibility to free it. + The function never returns NULL. */ +HChar* TG_(eventmapping_as_string)(const EventMapping*); +/* Returns mapping cost as a character string. That string is dynamically + allocated and it is the caller's responsibility to free it. + The function never returns NULL. */ +HChar* TG_(mappingcost_as_string)(const EventMapping*, const ULong*); + +#endif /* TG_EVENTS */ diff --git a/tracegrind/examples/.gitignore b/tracegrind/examples/.gitignore new file mode 100644 index 000000000..585f5d244 --- /dev/null +++ b/tracegrind/examples/.gitignore @@ -0,0 +1,2 @@ +*.tgtrace +*.txt diff --git a/tracegrind/examples/README.md b/tracegrind/examples/README.md new file mode 100644 index 000000000..d1a6b2834 --- /dev/null +++ b/tracegrind/examples/README.md @@ -0,0 +1,59 @@ +# Tracegrind example output files + +This directory contains pre-generated tracegrind output files for use as +reference material when implementing a trace parser. 
+ +Each test produces two files: + +- **`.tgtrace`** — binary trace file (msgpack + lz4 compressed) +- **`.txt`** — full human-readable dump from `tracegrind-analyzer` + +## Files + +| Name | Description | Extra options | +|------|-------------|---------------| +| `test_basic` | Full program trace (loader + libc + main) | — | +| `test_marker` | `VALGRIND_TRACEGRIND_MARKER` client request | — | +| `test_toggle_collect` | `--toggle-collect` style collection | — | +| `test_foo_bar_baz` | Simple call chain: `foo -> bar -> baz` | `--instr-atstart=no` | +| `test_inline` | Inlined function calls | `--instr-atstart=no` | +| `test_enter_inlined` | `ENTER_INLINED_FN` / `EXIT_INLINED_FN` events | `--instr-atstart=no --read-inline-info=yes` | +| `test_nested_inlined` | Nested inlined function calls | `--instr-atstart=no --read-inline-info=yes` | +| `test_recursion` | Recursive function calls | `--instr-atstart=no` | +| `test_tailcall` | Tail-call optimized functions | `--instr-atstart=no` | +| `test_longjmp` | `setjmp` / `longjmp` unwinding | `--instr-atstart=no` | +| `test_signal` | Signal handler invocation | `--instr-atstart=no` | +| `test_exception` | C++ exception throw/catch | `--instr-atstart=no` | +| `test_thread_create` | `THREAD_CREATE` events | `--instr-atstart=no` | +| `test_thread_interleave` | Multi-thread interleaved callstacks | `--instr-atstart=no` | +| `test_syscall` | System call timing (`sysCount`, `sysTime` counters) | `--instr-atstart=no --collect-systime=nsec` | +| `test_instr_toggle` | Instrumentation toggle on/off mid-run | `--instr-atstart=no` | + +## Regenerating + +From the repository root (after building valgrind): + +```bash +bash tracegrind/examples/generate.sh +``` + +## Format + +The `.tgtrace` files use the tracegrind msgpack format (lz4-compressed msgpack). +See `tracegrind/docs/tracegrind-msgpack-format.md` for the format specification. + +Use `tracegrind/scripts/tracegrind-analyzer` to inspect any trace file: + +```bash +# Full dump +./tracegrind/scripts/tracegrind-analyzer tracegrind/examples/test_foo_bar_baz.tgtrace + +# Schema only +./tracegrind/scripts/tracegrind-analyzer tracegrind/examples/test_foo_bar_baz.tgtrace --schema + +# Statistics +./tracegrind/scripts/tracegrind-analyzer tracegrind/examples/test_foo_bar_baz.tgtrace --stats + +# Filter by event type +./tracegrind/scripts/tracegrind-analyzer tracegrind/examples/test_foo_bar_baz.tgtrace --event ENTER_FN +``` diff --git a/tracegrind/examples/generate.sh b/tracegrind/examples/generate.sh new file mode 100755 index 000000000..00aa6b072 --- /dev/null +++ b/tracegrind/examples/generate.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# +# Generate tracegrind example output files. +# +# Run from the valgrind-codspeed repository root: +# bash tracegrind/examples/generate.sh +# +set -euo pipefail + +ROOT="$(cd "$(dirname "$0")/../.." && pwd)" +VG="$ROOT/vg-in-place" +ANALYZER="$ROOT/tracegrind/scripts/tracegrind-analyzer" +TESTS="$ROOT/tracegrind/tests" +OUT="$ROOT/tracegrind/examples" + +if [ ! -x "$VG" ]; then + echo "Error: vg-in-place not found at $VG" >&2 + echo "Build valgrind first (./configure && make)" >&2 + exit 1 +fi + +generate() { + local name="$1" + local binary="$2" + shift 2 + local vgopts=("$@") + + local trace="$OUT/${name}.tgtrace" + local txt="$OUT/${name}.txt" + + echo "Generating $name ..." 
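+  # Run the test binary under tracegrind to produce the binary trace, then
+  # convert it to a human-readable dump with tracegrind-analyzer.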
+ "$VG" --tool=tracegrind \ + --tracegrind-out-file="$trace" \ + "${vgopts[@]}" \ + "$TESTS/$binary" > /dev/null 2>&1 + + "$ANALYZER" "$trace" > "$txt" 2>&1 + + echo " -> $(wc -c < "$trace") bytes, $(wc -l < "$txt") lines" +} + +# Remove previous outputs +rm -f "$OUT"/*.tgtrace "$OUT"/*.txt + +generate test_basic test_basic.bin +generate test_marker test_marker.bin +generate test_toggle_collect test_toggle_collect.bin +generate test_foo_bar_baz test_foo_bar_baz.bin --instr-atstart=no +generate test_inline test_inline.bin --instr-atstart=no +generate test_enter_inlined test_enter_inlined.bin --instr-atstart=no --read-inline-info=yes +generate test_nested_inlined test_nested_inlined.bin --instr-atstart=no --read-inline-info=yes +generate test_recursion test_recursion.bin --instr-atstart=no +generate test_tailcall test_tailcall.bin --instr-atstart=no +generate test_longjmp test_longjmp.bin --instr-atstart=no +generate test_signal test_signal.bin --instr-atstart=no +generate test_exception test_exception.bin --instr-atstart=no +generate test_thread_create test_thread_create.bin --instr-atstart=no +generate test_thread_interleave test_thread_interleave.bin --instr-atstart=no +generate test_syscall test_syscall.bin --instr-atstart=no --collect-systime=nsec +generate test_instr_toggle test_instr_toggle.bin --instr-atstart=no + +echo "" +echo "Done. Generated $(ls "$OUT"/*.tgtrace 2>/dev/null | wc -l) trace files." diff --git a/tracegrind/fn.c b/tracegrind/fn.c new file mode 100644 index 000000000..47702dccc --- /dev/null +++ b/tracegrind/fn.c @@ -0,0 +1,809 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- ct_fn.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "global.h" + +#define N_INITIAL_FN_ARRAY_SIZE 10071 + +static fn_array current_fn_active; + +/* x86_64 defines 4 variants. */ +#define MAX_RESOLVE_ADDRS 4 +static int runtime_resolve_addrs = 0; +static Addr runtime_resolve_addr[MAX_RESOLVE_ADDRS]; +static int runtime_resolve_length[MAX_RESOLVE_ADDRS]; + +// a code pattern is a list of tuples (start offset, length) +struct chunk_t { + int start, len; +}; +struct pattern { + const HChar* name; + int len; + struct chunk_t chunk[]; +}; + +/* Scan for a pattern in the code of an ELF object. + * If found, return true and set runtime_resolve_{addr,length} + */ +__attribute__((unused)) // Possibly; depends on the platform. 
+static Bool +check_code(obj_node* obj, UChar code[], struct pattern* pat) +{ + Bool found; + Addr addr, end; + int chunk, start, len; + + /* first chunk of pattern should always start at offset 0 and + * have at least 3 bytes */ + TG_ASSERT((pat->chunk[0].start == 0) && (pat->chunk[0].len > 2)); + + /* and we cannot be called more than MAX_RESOLVE_ADDRS times */ + TG_ASSERT(runtime_resolve_addrs < MAX_RESOLVE_ADDRS); + + TG_DEBUG(1, "check_code: %s, pattern %s, check %d bytes of [%x %x %x...]\n", + obj->name, pat->name, pat->chunk[0].len, code[0], code[1], code[2]); + + end = obj->start + obj->size - pat->len; + addr = obj->start; + while (addr < end) { + found = (VG_(memcmp)((void*)addr, code, pat->chunk[0].len) == 0); + + if (found) { + chunk = 1; + while (1) { + start = pat->chunk[chunk].start; + len = pat->chunk[chunk].len; + if (len == 0) + break; + + TG_ASSERT(len > 2); + TG_DEBUG(1, + " found chunk %d at %#lx, checking %d bytes " + "of [%x %x %x...]\n", + chunk - 1, addr - obj->start, len, code[start], + code[start + 1], code[start + 2]); + + if (VG_(memcmp)((void*)(addr + start), code + start, len) != 0) { + found = False; + break; + } + chunk++; + } + + if (found) { + TG_DEBUG(1, "found at offset %#lx.\n", addr - obj->start); + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, + "Found runtime_resolve (%s): " + "%s +%#lx=%#lx, length %d\n", + pat->name, obj->name + obj->last_slash_pos, + addr - obj->start, addr, pat->len); + + runtime_resolve_addr[runtime_resolve_addrs] = addr; + runtime_resolve_length[runtime_resolve_addrs] = pat->len; + runtime_resolve_addrs++; + return True; + } + } + addr++; + } + TG_DEBUG(1, " found nothing.\n"); + return False; +} + +/* _ld_runtime_resolve, located in ld.so, needs special handling: + * The jump at end into the resolved function should not be + * represented as a call (as usually done in tracegrind with jumps), + * but as a return + call. Otherwise, the repeated existence of + * _ld_runtime_resolve in call chains will lead to huge cycles, + * making the profile almost worthless. + * + * If ld.so is stripped, the symbol will not appear. But as this + * function is handcrafted assembler, we search for it. 
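+ *
+ * The byte patterns below are compared chunk-wise; the gaps between chunks
+ * skip bytes that differ from build to build, such as absolute addresses
+ * and PC-relative call targets (e.g. the call to _dl_fixup).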
+ * + * We stop if the ELF object name does not seem to be the runtime linker + */ +static Bool search_runtime_resolve(obj_node* obj) +{ +#if defined(VGP_x86_linux) + static UChar code[] = { + /* 0*/ 0x50, 0x51, 0x52, 0x8b, 0x54, 0x24, 0x10, 0x8b, + /* 8*/ 0x44, 0x24, 0x0c, 0xe8, 0x70, 0x01, 0x00, 0x00, + /*16*/ 0x5a, 0x59, 0x87, 0x04, 0x24, 0xc2, 0x08, 0x00}; + /* Check ranges [0-11] and [16-23] ([12-15] is an absolute address) */ + static struct pattern pat = {"x86-def", 24, {{0, 12}, {16, 8}, {24, 0}}}; + + /* Pattern for glibc-2.8 on OpenSuse11.0 */ + static UChar code_28[] = { + /* 0*/ 0x50, 0x51, 0x52, 0x8b, 0x54, 0x24, 0x10, 0x8b, + /* 8*/ 0x44, 0x24, 0x0c, 0xe8, 0x70, 0x01, 0x00, 0x00, + /*16*/ 0x5a, 0x8b, 0x0c, 0x24, 0x89, 0x04, 0x24, 0x8b, + /*24*/ 0x44, 0x24, 0x04, 0xc2, 0x0c, 0x00}; + static struct pattern pat_28 = { + "x86-glibc2.8", 30, {{0, 12}, {16, 14}, {30, 0}}}; + + if (VG_(strncmp)(obj->name, "/lib/ld", 7) != 0) + return False; + Bool pat_p = check_code(obj, code, &pat); + Bool pat_28_p = check_code(obj, code_28, &pat_28); + if (pat_p || pat_28_p) + return True; + return False; +#endif + +#if defined(VGP_ppc32_linux) + static UChar code[] = {/* 0*/ 0x94, 0x21, 0xff, 0xc0, 0x90, + 0x01, 0x00, 0x0c, + /* 8*/ 0x90, 0x61, 0x00, 0x10, 0x90, + 0x81, 0x00, 0x14, + /*16*/ 0x7d, 0x83, 0x63, 0x78, 0x90, + 0xa1, 0x00, 0x18, + /*24*/ 0x7d, 0x64, 0x5b, 0x78, 0x90, + 0xc1, 0x00, 0x1c, + /*32*/ 0x7c, 0x08, 0x02, 0xa6, 0x90, + 0xe1, 0x00, 0x20, + /*40*/ 0x90, 0x01, 0x00, 0x30, 0x91, + 0x01, 0x00, 0x24, + /*48*/ 0x7c, 0x00, 0x00, 0x26, 0x91, + 0x21, 0x00, 0x28, + /*56*/ 0x91, 0x41, 0x00, 0x2c, 0x90, + 0x01, 0x00, 0x08, + /*64*/ 0x48, 0x00, 0x02, 0x91, 0x7c, + 0x69, 0x03, 0xa6, /* at 64: bl aff0 */ + /*72*/ 0x80, 0x01, 0x00, 0x30, 0x81, + 0x41, 0x00, 0x2c, + /*80*/ 0x81, 0x21, 0x00, 0x28, 0x7c, + 0x08, 0x03, 0xa6, + /*88*/ 0x81, 0x01, 0x00, 0x24, 0x80, + 0x01, 0x00, 0x08, + /*96*/ 0x80, 0xe1, 0x00, 0x20, 0x80, + 0xc1, 0x00, 0x1c, + /*104*/ 0x7c, 0x0f, 0xf1, 0x20, 0x80, + 0xa1, 0x00, 0x18, + /*112*/ 0x80, 0x81, 0x00, 0x14, 0x80, + 0x61, 0x00, 0x10, + /*120*/ 0x80, 0x01, 0x00, 0x0c, 0x38, + 0x21, 0x00, 0x40, + /*128*/ 0x4e, 0x80, 0x04, 0x20}; + static struct pattern pat = { + "ppc32-def", 132, {{0, 65}, {68, 64}, {132, 0}}}; + + if (VG_(strncmp)(obj->name, "/lib/ld", 7) != 0) + return False; + return check_code(obj, code, &pat); +#endif + +#if defined(VGP_amd64_linux) + static UChar code[] = { + /* 0*/ 0x48, 0x83, 0xec, 0x38, 0x48, 0x89, 0x04, 0x24, + /* 8*/ 0x48, 0x89, 0x4c, 0x24, 0x08, 0x48, 0x89, 0x54, 0x24, 0x10, + /*18*/ 0x48, 0x89, 0x74, 0x24, 0x18, 0x48, 0x89, 0x7c, 0x24, 0x20, + /*28*/ 0x4c, 0x89, 0x44, 0x24, 0x28, 0x4c, 0x89, 0x4c, 0x24, 0x30, + /*38*/ 0x48, 0x8b, 0x74, 0x24, 0x40, 0x49, 0x89, 0xf3, + /*46*/ 0x4c, 0x01, 0xde, 0x4c, 0x01, 0xde, 0x48, 0xc1, 0xe6, 0x03, + /*56*/ 0x48, 0x8b, 0x7c, 0x24, 0x38, 0xe8, 0xee, 0x01, 0x00, 0x00, + /*66*/ 0x49, 0x89, 0xc3, 0x4c, 0x8b, 0x4c, 0x24, 0x30, + /*74*/ 0x4c, 0x8b, 0x44, 0x24, 0x28, 0x48, 0x8b, 0x7c, 0x24, 0x20, + /*84*/ 0x48, 0x8b, 0x74, 0x24, 0x18, 0x48, 0x8b, 0x54, 0x24, 0x10, + /*94*/ 0x48, 0x8b, 0x4c, 0x24, 0x08, 0x48, 0x8b, 0x04, 0x24, + /*103*/ 0x48, 0x83, 0xc4, 0x48, 0x41, 0xff, 0xe3}; + static struct pattern pat = { + "amd64-def", 110, {{0, 62}, {66, 44}, {110, 0}}}; + + static UChar code_xsavec[] = { + /* 0*/ 0x53, 0x48, 0x89, 0xe3, 0x48, 0x83, 0xe4, 0xc0, + /* 8*/ 0x48, 0x2b, 0x25, 0x00, 0x00, 0x00, 0x00, /* sub (%rip),%rsp + */ + /*15*/ 0x48, + /*16*/ 0x89, 0x04, 0x24, 0x48, 0x89, 0x4c, 0x24, 0x08, + /*24*/ 0x48, 0x89, 
0x54, 0x24, 0x10, 0x48, 0x89, 0x74, + /*32*/ 0x24, 0x18, 0x48, 0x89, 0x7c, 0x24, 0x20, 0x4c, + /*40*/ 0x89, 0x44, 0x24, 0x28, 0x4c, 0x89, 0x4c, 0x24, + /*48*/ 0x30, 0xb8, 0xee, 0x00, 0x00, 0x00, 0x31, 0xd2, + /*56*/ 0x48, 0x89, 0x94, 0x24, 0x50, 0x02, 0x00, 0x00, + /*64*/ 0x48, 0x89, 0x94, 0x24, 0x58, 0x02, 0x00, 0x00, + /*72*/ 0x48, 0x89, 0x94, 0x24, 0x60, 0x02, 0x00, 0x00, + /*80*/ 0x48, 0x89, 0x94, 0x24, 0x68, 0x02, 0x00, 0x00, + /*88*/ 0x48, 0x89, 0x94, 0x24, 0x70, 0x02, 0x00, 0x00, + /*96*/ 0x48, 0x89, 0x94, 0x24, 0x78, 0x02, 0x00, 0x00, + /*04*/ 0x0f, 0xc7, 0x64, 0x24, 0x40, 0x48, 0x8b, 0x73, + /*112*/ 0x10, 0x48, 0x8b, 0x7b, 0x08, + /*117*/ 0xe8, 0x00, 0x00, 0x00, 0x00, /* callq <_dl_fixup> */ + /*122*/ 0x49, 0x89, 0xc3, 0xb8, 0xee, 0x00, + /*128*/ 0x00, 0x00, 0x31, 0xd2, 0x0f, 0xae, 0x6c, 0x24, + /*136*/ 0x40, 0x4c, 0x8b, 0x4c, 0x24, 0x30, 0x4c, 0x8b, + /*144*/ 0x44, 0x24, 0x28, 0x48, 0x8b, 0x7c, 0x24, 0x20, + /*152*/ 0x48, 0x8b, 0x74, 0x24, 0x18, 0x48, 0x8b, 0x54, + /*160*/ 0x24, 0x10, 0x48, 0x8b, 0x4c, 0x24, 0x08, 0x48, + /*168*/ 0x8b, 0x04, 0x24, 0x48, 0x89, 0xdc, 0x48, 0x8b, + /*176*/ 0x1c, 0x24, 0x48, 0x83, 0xc4, 0x18, 0xf2, 0x41, + /*184*/ 0xff, 0xe3}; + static struct pattern pat_xsavec = { + "amd64-xsavec", 186, {{0, 11}, {15, 103}, {122, 64}, {186, 0}}}; + + static UChar code_xsave[] = { + /* 0*/ 0x53, 0x48, 0x89, 0xe3, 0x48, 0x83, 0xe4, 0xc0, + /* 8*/ 0x48, 0x2b, 0x25, 0x00, 0x00, 0x00, 0x00, /* sub (%rip),%rsp + */ + /*15*/ 0x48, + /*16*/ 0x89, 0x04, 0x24, 0x48, 0x89, 0x4c, 0x24, 0x08, + /*24*/ 0x48, 0x89, 0x54, 0x24, 0x10, 0x48, 0x89, 0x74, + /*32*/ 0x24, 0x18, 0x48, 0x89, 0x7c, 0x24, 0x20, 0x4c, + /*40*/ 0x89, 0x44, 0x24, 0x28, 0x4c, 0x89, 0x4c, 0x24, + /*48*/ 0x30, 0xb8, 0xee, 0x00, 0x00, 0x00, 0x31, 0xd2, + /*56*/ 0x48, 0x89, 0x94, 0x24, 0x40, 0x02, 0x00, 0x00, + /*64*/ 0x48, 0x89, 0x94, 0x24, 0x48, 0x02, 0x00, 0x00, + /*72*/ 0x48, 0x89, 0x94, 0x24, 0x50, 0x02, 0x00, 0x00, + /*80*/ 0x48, 0x89, 0x94, 0x24, 0x58, 0x02, 0x00, 0x00, + /*88*/ 0x48, 0x89, 0x94, 0x24, 0x60, 0x02, 0x00, 0x00, + /*96*/ 0x48, 0x89, 0x94, 0x24, 0x68, 0x02, 0x00, 0x00, + /*104*/ 0x48, 0x89, 0x94, 0x24, 0x70, 0x02, 0x00, 0x00, + /*112*/ 0x48, 0x89, 0x94, 0x24, 0x78, 0x02, 0x00, 0x00, + /*120*/ 0x0f, 0xae, 0x64, 0x24, 0x40, 0x48, 0x8b, 0x73, + /*128*/ 0x10, 0x48, 0x8b, 0x7b, 0x08, + /*133*/ 0xe8, 0x00, 0x00, 0x00, 0x00, /* callq <_dl_fixup> */ + /*138*/ 0x49, 0x89, 0xc3, 0xb8, 0xee, 0x00, + /*144*/ 0x00, 0x00, 0x31, 0xd2, 0x0f, 0xae, 0x6c, 0x24, + /*152*/ 0x40, 0x4c, 0x8b, 0x4c, 0x24, 0x30, 0x4c, 0x8b, + /*160*/ 0x44, 0x24, 0x28, 0x48, 0x8b, 0x7c, 0x24, 0x20, + /*168*/ 0x48, 0x8b, 0x74, 0x24, 0x18, 0x48, 0x8b, 0x54, + /*176*/ 0x24, 0x10, 0x48, 0x8b, 0x4c, 0x24, 0x08, 0x48, + /*184*/ 0x8b, 0x04, 0x24, 0x48, 0x89, 0xdc, 0x48, 0x8b, + /*192*/ 0x1c, 0x24, 0x48, 0x83, 0xc4, 0x18, 0xf2, 0x41, + /*200*/ 0xff, 0xe3}; + static struct pattern pat_xsave = { + "amd64-xsave", 202, {{0, 11}, {15, 119}, {138, 64}, {202, 0}}}; + + static UChar code_fxsave[] = { + /* 0*/ 0x53, 0x48, 0x89, 0xe3, 0x48, 0x83, 0xe4, 0xf0, + /* 8*/ 0x48, 0x81, 0xec, 0x40, 0x02, 0x00, 0x00, 0x48, + /*16*/ 0x89, 0x04, 0x24, 0x48, 0x89, 0x4c, 0x24, 0x08, + /*24*/ 0x48, 0x89, 0x54, 0x24, 0x10, 0x48, 0x89, 0x74, + /*32*/ 0x24, 0x18, 0x48, 0x89, 0x7c, 0x24, 0x20, 0x4c, + /*40*/ 0x89, 0x44, 0x24, 0x28, 0x4c, 0x89, 0x4c, 0x24, + /*48*/ 0x30, 0x0f, 0xae, 0x44, 0x24, 0x40, 0x48, 0x8b, + /*56*/ 0x73, 0x10, 0x48, 0x8b, 0x7b, 0x08, + /*62*/ 0xe8, 0x00, 0x00, 0x00, 0x00, /* callq <_dl_fixup> */ + /*67*/ 0x49, 0x89, 0xc3, 0x0f, 0xae, + /*72*/ 0x4c, 
0x24, 0x40, 0x4c, 0x8b, 0x4c, 0x24, 0x30, + /*80*/ 0x4c, 0x8b, 0x44, 0x24, 0x28, 0x48, 0x8b, 0x7c, + /*88*/ 0x24, 0x20, 0x48, 0x8b, 0x74, 0x24, 0x18, 0x48, + /*96*/ 0x8b, 0x54, 0x24, 0x10, 0x48, 0x8b, 0x4c, 0x24, + /*104*/ 0x08, 0x48, 0x8b, 0x04, 0x24, 0x48, 0x89, 0xdc, + /*112*/ 0x48, 0x8b, 0x1c, 0x24, 0x48, 0x83, 0xc4, 0x18, + /*120*/ 0xf2, 0x41, 0xff, 0xe3}; + static struct pattern pat_fxsave = { + "amd64-fxsave", 124, {{0, 63}, {67, 57}, {124, 0}}}; + + if ((VG_(strncmp)(obj->name, "/lib/ld", 7) != 0) && + (VG_(strncmp)(obj->name, "/lib64/ld", 9) != 0) && + (VG_(strncmp)(obj->name, "/usr/lib/ld", 11) != 0) && + (VG_(strncmp)(obj->name, "/usr/lib64/ld", 13) != 0)) + return False; + Bool pat_p = check_code(obj, code, &pat); + Bool pat_xsavec_p = check_code(obj, code_xsavec, &pat_xsavec); + Bool pat_xsave_p = check_code(obj, code_xsave, &pat_xsave); + Bool pat_fxsave_p = check_code(obj, code_fxsave, &pat_fxsave); + if (pat_p || pat_xsavec_p || pat_xsave_p || pat_fxsave_p) + return True; +#endif + + /* For other platforms, no patterns known */ + return False; +} + +/*------------------------------------------------------------*/ +/*--- Object/File/Function hash entry operations ---*/ +/*------------------------------------------------------------*/ + +/* Object hash table, fixed */ +static obj_node* obj_table[N_OBJ_ENTRIES]; + +void TG_(init_obj_table)(void) +{ + Int i; + for (i = 0; i < N_OBJ_ENTRIES; i++) + obj_table[i] = 0; +} + +#define HASH_CONSTANT 256 + +static UInt str_hash(const HChar* s, UInt table_size) +{ + int hash_value = 0; + for (; *s; s++) + hash_value = (HASH_CONSTANT * hash_value + *s) % table_size; + return hash_value; +} + +static const HChar* anonymous_obj = "???"; + +static __inline__ obj_node* new_obj_node(DebugInfo* di, obj_node* next) +{ + Int i; + obj_node* obj; + + obj = (obj_node*)TG_MALLOC("cl.fn.non.1", sizeof(obj_node)); + obj->name = di ? VG_(strdup)("cl.fn.non.2", VG_(DebugInfo_get_filename)(di)) + : anonymous_obj; + for (i = 0; i < N_FILE_ENTRIES; i++) { + obj->files[i] = NULL; + } + TG_(stat).distinct_objs++; + obj->number = TG_(stat).distinct_objs; + /* JRS 2008 Feb 19: maybe rename .start/.size/.offset to + .text_avma/.text_size/.test_bias to make it clearer what these + fields really mean */ + obj->start = di ? VG_(DebugInfo_get_text_avma)(di) : 0; + obj->size = di ? VG_(DebugInfo_get_text_size)(di) : 0; + obj->offset = di ? VG_(DebugInfo_get_text_bias)(di) : 0; + obj->next = next; + + // not only used for debug output (see static.c) + obj->last_slash_pos = 0; + i = 0; + while (obj->name[i]) { + if (obj->name[i] == '/') + obj->last_slash_pos = i + 1; + i++; + } + obj->name_len = i; + + if (runtime_resolve_addrs == 0) + search_runtime_resolve(obj); + + return obj; +} + +obj_node* TG_(get_obj_node)(DebugInfo* di) +{ + obj_node* curr_obj_node; + UInt objname_hash; + const HChar* obj_name; + + obj_name = di ? 
VG_(DebugInfo_get_filename)(di) : anonymous_obj; + + /* lookup in obj hash */ + objname_hash = str_hash(obj_name, N_OBJ_ENTRIES); + curr_obj_node = obj_table[objname_hash]; + while (NULL != curr_obj_node && + VG_(strcmp)(obj_name, curr_obj_node->name) != 0) { + curr_obj_node = curr_obj_node->next; + } + if (NULL == curr_obj_node) { + obj_table[objname_hash] = curr_obj_node = + new_obj_node(di, obj_table[objname_hash]); + } + + return curr_obj_node; +} + +static __inline__ file_node* +new_file_node(const HChar* filename, obj_node* obj, file_node* next) +{ + Int i; + file_node* file = (file_node*)TG_MALLOC("cl.fn.nfn.1", sizeof(file_node)); + file->name = VG_(strdup)("cl.fn.nfn.2", filename); + file->name_len = VG_(strlen)(filename); + for (i = 0; i < N_FN_ENTRIES; i++) { + file->fns[i] = NULL; + } + TG_(stat).distinct_files++; + file->obj = obj; + file->next = next; + return file; +} + +file_node* +TG_(get_file_node)(obj_node* curr_obj_node, const HChar* dir, const HChar* file) +{ + file_node* curr_file_node; + UInt filename_hash; + + /* Build up an absolute pathname, if there is a directory available */ + HChar filename[VG_(strlen)(dir) + 1 + VG_(strlen)(file) + 1]; + VG_(strcpy)(filename, dir); + if (filename[0] != '\0') { + VG_(strcat)(filename, "/"); + } + VG_(strcat)(filename, file); + + /* lookup in file hash */ + filename_hash = str_hash(filename, N_FILE_ENTRIES); + curr_file_node = curr_obj_node->files[filename_hash]; + while (NULL != curr_file_node && + VG_(strcmp)(filename, curr_file_node->name) != 0) { + curr_file_node = curr_file_node->next; + } + if (NULL == curr_file_node) { + curr_obj_node->files[filename_hash] = curr_file_node = new_file_node( + filename, curr_obj_node, curr_obj_node->files[filename_hash]); + } + + return curr_file_node; +} + +/* forward decl. */ +static void resize_fn_array(void); + +static __inline__ fn_node* +new_fn_node(const HChar* fnname, file_node* file, fn_node* next) +{ + fn_node* fn = (fn_node*)TG_MALLOC("cl.fn.nfnnd.1", sizeof(fn_node)); + fn->name = VG_(strdup)("cl.fn.nfnnd.2", fnname); + fn->name_len = VG_(strlen)(fnname); + + TG_(stat).distinct_fns++; + fn->number = TG_(stat).distinct_fns; + fn->last_cxt = 0; + fn->pure_cxt = 0; + fn->file = file; + fn->next = next; + + fn->toggle_collect = False; + fn->skip = False; + fn->pop_on_jump = TG_(clo).pop_on_jump; + fn->group = 0; + fn->separate_callers = TG_(clo).separate_callers; + fn->separate_recursions = TG_(clo).separate_recursions; + +#if TG_ENABLE_DEBUG + fn->verbosity = -1; +#endif + + if (TG_(stat).distinct_fns >= current_fn_active.size) + resize_fn_array(); + + return fn; +} + +/* Get a function node in hash2 with known file node. + * hash nodes are created if needed + */ +static fn_node* get_fn_node_infile(file_node* curr_file_node, + const HChar* fnname) +{ + fn_node* curr_fn_node; + UInt fnname_hash; + + TG_ASSERT(curr_file_node != 0); + + /* lookup in function hash */ + fnname_hash = str_hash(fnname, N_FN_ENTRIES); + curr_fn_node = curr_file_node->fns[fnname_hash]; + while (NULL != curr_fn_node && + VG_(strcmp)(fnname, curr_fn_node->name) != 0) { + curr_fn_node = curr_fn_node->next; + } + if (NULL == curr_fn_node) { + curr_file_node->fns[fnname_hash] = curr_fn_node = + new_fn_node(fnname, curr_file_node, curr_file_node->fns[fnname_hash]); + } + + return curr_fn_node; +} + +/* Get a function node in a Segment. + * Hash nodes are created if needed. 
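+ *
+ * The lookup walks three chained hash tables: object (keyed by ELF object
+ * name) -> file (keyed by dir/filename) -> function (keyed by symbol name);
+ * missing nodes are created on first use.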
+ */ +static __inline__ fn_node* get_fn_node_inseg(DebugInfo* di, + const HChar* dirname, + const HChar* filename, + const HChar* fnname) +{ + obj_node* obj = TG_(get_obj_node)(di); + file_node* file = TG_(get_file_node)(obj, dirname, filename); + fn_node* fn = get_fn_node_infile(file, fnname); + + return fn; +} + +Bool TG_(get_debug_info)(Addr instr_addr, + const HChar** dir, + const HChar** file, + const HChar** fn_name, + UInt* line_num, + DebugInfo** pDebugInfo) +{ + Bool found_file_line, found_fn, result = True; + UInt line; + + TG_DEBUG(6, " + get_debug_info(%#lx)\n", instr_addr); + + DiEpoch ep = VG_(current_DiEpoch)(); + if (pDebugInfo) { + *pDebugInfo = VG_(find_DebugInfo)(ep, instr_addr); + + // for generated code in anonymous space, pSegInfo is 0 + } + + found_file_line = + VG_(get_filename_linenum)(ep, instr_addr, file, dir, &line); + found_fn = VG_(get_fnname)(ep, instr_addr, fn_name); + + if (!found_file_line && !found_fn) { + TG_(stat).no_debug_BBs++; + *file = "???"; + *fn_name = "???"; + if (line_num) + *line_num = 0; + result = False; + + } else if (found_file_line && found_fn) { + TG_(stat).full_debug_BBs++; + if (line_num) + *line_num = line; + + } else if (found_file_line && !found_fn) { + TG_(stat).file_line_debug_BBs++; + *fn_name = "???"; + if (line_num) + *line_num = line; + + } else /*(!found_file_line && found_fn)*/ { + TG_(stat).fn_name_debug_BBs++; + *file = "???"; + if (line_num) + *line_num = 0; + } + + TG_DEBUG(6, " - get_debug_info(%#lx): seg '%s', fn %s\n", instr_addr, + !pDebugInfo ? "-" + : (*pDebugInfo) ? VG_(DebugInfo_get_filename)(*pDebugInfo) + : "(None)", + *fn_name); + + return result; +} + +/* for _libc_freeres_wrapper => _exit renaming */ +static BB* exit_bb = 0; + +/* + * Attach function struct to a BB from debug info. + */ +fn_node* TG_(get_fn_node)(BB* bb) +{ + const HChar *fnname, *filename, *dirname; + DebugInfo* di; + UInt line_num; + fn_node* fn; + Int i; + + /* fn from debug info is idempotent for a BB */ + if (bb->fn) + return bb->fn; + + TG_DEBUG(3, "+ get_fn_node(BB %#lx)\n", bb_addr(bb)); + + /* get function/file name, line number and object of + * the BB according to debug information + */ + TG_(get_debug_info) + (bb_addr(bb), &dirname, &filename, &fnname, &line_num, &di); + + DiEpoch ep = VG_(current_DiEpoch)(); + + /* Build inline stack for this BB using InlIPCursor */ + { + InlIPCursor* iipc = VG_(new_IIPC)(ep, bb_addr(bb)); + if (iipc) { + const HChar* tmp[TG_MAX_INL_DEPTH + 1]; + Int total = 0; + do { + const HChar* fn_name = NULL; + VG_(get_fnname_inl)(ep, bb_addr(bb), &fn_name, iipc); + if (fn_name && total < TG_MAX_INL_DEPTH + 1) + tmp[total++] = fn_name; + } while (VG_(next_IIPC)(iipc)); + VG_(delete_IIPC)(iipc); + + /* tmp[] is innermost-first; last entry is the non-inlined function + * (skip it) */ + Int inl_count = total - 1; + if (inl_count > 0) { + bb->inl_depth = inl_count; + bb->inl_fns = VG_(malloc)("tg.bb.inl", inl_count * sizeof(HChar*)); + /* Reverse into outermost-first order */ + for (Int i = 0; i < inl_count; i++) + bb->inl_fns[i] = tmp[inl_count - 1 - i]; + } + } + } + + if (0 == VG_(strcmp)(fnname, "???")) { + int p; + static HChar buf[32]; // for sure large enough + /* Use address as found in library */ + if (sizeof(Addr) == 4) + p = VG_(sprintf)(buf, "%#08lx", (UWord)bb->offset); + else + // 64bit address + p = VG_(sprintf)(buf, "%#016lx", (UWord)bb->offset); + + VG_(sprintf)(buf + p, "%s", + (bb->sect_kind == Vg_SectData) ? " [Data]" + : (bb->sect_kind == Vg_SectBSS) ? 
" [BSS]" + : (bb->sect_kind == Vg_SectGOT) ? " [GOT]" + : (bb->sect_kind == Vg_SectPLT) ? " [PLT]" + : ""); + fnname = buf; + } else { + if (VG_(get_fnname_if_entry)(ep, bb_addr(bb), &fnname)) + bb->is_entry = 1; + } + + /* HACK for correct _exit: + * _exit is redirected to VG_(__libc_freeres_wrapper) by valgrind, + * so we rename it back again :-) + */ + if (0 == VG_(strcmp)(fnname, "vgPlain___libc_freeres_wrapper") && exit_bb) { + TG_(get_debug_info) + (bb_addr(exit_bb), &dirname, &filename, &fnname, &line_num, &di); + + TG_DEBUG(1, "__libc_freeres_wrapper renamed to _exit\n"); + } + if (0 == VG_(strcmp)(fnname, "_exit") && !exit_bb) + exit_bb = bb; + + for (i = 0; i < runtime_resolve_addrs; i++) { + if ((bb_addr(bb) >= runtime_resolve_addr[i]) && + (bb_addr(bb) < runtime_resolve_addr[i] + runtime_resolve_length[i])) { + /* BB in runtime_resolve found by code check; use this name */ + fnname = "_dl_runtime_resolve"; + break; + } + } + + /* get fn_node struct for this function */ + fn = get_fn_node_inseg(di, dirname, filename, fnname); + + /* if this is the 1st time the function is seen, + * some attributes are set */ + if (fn->pure_cxt == 0) { + + /* Every function gets a "pure" context, i.e. a context with stack + * depth 1 only with this function. This is for compression of mangled + * names + */ + fn_node* pure[2]; + pure[0] = 0; + pure[1] = fn; + fn->pure_cxt = TG_(get_cxt)(pure + 1); + + if (bb->sect_kind == Vg_SectPLT || bb->sect_kind == Vg_SectPLTSEC) + fn->skip = TG_(clo).skip_plt; + + if (VG_(strncmp)(fn->name, "_dl_runtime_resolve", 19) == 0) { + fn->pop_on_jump = True; + + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, + "Symbol match: found runtime_resolve:" + " %s +%#lx=%#lx\n", + bb->obj->name + bb->obj->last_slash_pos, + (UWord)bb->offset, bb_addr(bb)); + } + + /* apply config options from function name patterns + * given on command line */ + TG_(update_fn_config)(fn); + } + + bb->fn = fn; + bb->line = line_num; + + if (dirname[0]) { + TG_DEBUG(3, "- get_fn_node(BB %#lx): %s (in %s:%u)\n", bb_addr(bb), + fnname, filename, line_num); + } else + TG_DEBUG(3, "- get_fn_node(BB %#lx): %s (in %s/%s:%u)\n", bb_addr(bb), + fnname, dirname, filename, line_num); + + return fn; +} + +/*------------------------------------------------------------*/ +/*--- Active function array operations ---*/ +/*------------------------------------------------------------*/ + +/* The active function array is a thread-specific array + * of UInts, mapping function numbers to the active count of + * functions. + * The active count is the number of times a function appears + * in the current call stack, and is used when costs for recursion + * levels should be separated. 
+ */ + +UInt* TG_(get_fn_entry)(Int n) +{ + TG_ASSERT(n < current_fn_active.size); + return current_fn_active.array + n; +} + +void TG_(init_fn_array)(fn_array* a) +{ + Int i; + + TG_ASSERT(a != 0); + + a->size = N_INITIAL_FN_ARRAY_SIZE; + if (a->size <= TG_(stat).distinct_fns) + a->size = TG_(stat).distinct_fns + 1; + + a->array = (UInt*)TG_MALLOC("cl.fn.gfe.1", a->size * sizeof(UInt)); + for (i = 0; i < a->size; i++) + a->array[i] = 0; +} + +void TG_(copy_current_fn_array)(fn_array* dst) +{ + TG_ASSERT(dst != 0); + + dst->size = current_fn_active.size; + dst->array = current_fn_active.array; +} + +fn_array* TG_(get_current_fn_array)(void) { return ¤t_fn_active; } + +void TG_(set_current_fn_array)(fn_array* a) +{ + TG_ASSERT(a != 0); + + current_fn_active.size = a->size; + current_fn_active.array = a->array; + if (current_fn_active.size <= TG_(stat).distinct_fns) + resize_fn_array(); +} + +/* ensure that active_array is big enough: + * is the highest index, so + * has to be bigger than that. + */ +static void resize_fn_array(void) +{ + UInt* new_array; + Int i; + + UInt newsize = current_fn_active.size; + while (newsize <= TG_(stat).distinct_fns) + newsize *= 2; + + TG_DEBUG(0, "Resize fn_active_array: %u => %u\n", current_fn_active.size, + newsize); + + new_array = (UInt*)TG_MALLOC("cl.fn.rfa.1", newsize * sizeof(UInt)); + for (i = 0; i < current_fn_active.size; i++) + new_array[i] = current_fn_active.array[i]; + while (i < newsize) + new_array[i++] = 0; + + VG_(free)(current_fn_active.array); + current_fn_active.size = newsize; + current_fn_active.array = new_array; + TG_(stat).fn_array_resizes++; +} diff --git a/tracegrind/global.h b/tracegrind/global.h new file mode 100644 index 000000000..1c6196c52 --- /dev/null +++ b/tracegrind/global.h @@ -0,0 +1,828 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind data structures, functions. global.h ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Valgrind, a dynamic binary instrumentation + framework. + + Copyright (C) 2004-2017 Josef Weidendorfer + josef.weidendorfer@gmx.de + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. 
+*/ + +#ifndef TG_GLOBAL +#define TG_GLOBAL + +#include "pub_tool_basics.h" +#include "pub_tool_clientstate.h" +#include "pub_tool_debuginfo.h" +#include "pub_tool_libcassert.h" +#include "pub_tool_libcbase.h" +#include "pub_tool_libcfile.h" +#include "pub_tool_libcprint.h" +#include "pub_tool_libcproc.h" +#include "pub_tool_machine.h" +#include "pub_tool_machine.h" // VG_(fnptr_to_fnentry) +#include "pub_tool_mallocfree.h" +#include "pub_tool_options.h" +#include "pub_tool_tooliface.h" +#include "pub_tool_vki.h" +#include "pub_tool_vkiscnums.h" +#include "pub_tool_xarray.h" + +#include "costs.h" +#include "events.h" // defines TG_ macro + +/*------------------------------------------------------------*/ +/*--- Tracegrind compile options --- */ +/*------------------------------------------------------------*/ + +/* Enable debug output */ +#define TG_ENABLE_DEBUG 1 + +/* Enable experimental features? */ +#define TG_EXPERIMENTAL 0 + +/* Maximum depth of inline call stack tracking */ +#define TG_MAX_INL_DEPTH 16 + +/*------------------------------------------------------------*/ +/*--- Command line options ---*/ +/*------------------------------------------------------------*/ + +#define DEFAULT_OUTFORMAT "tracegrind.out.%p.msgpack.lz4" + +/* If and how to collect syscall time. + systime_no : do not collect systime + systime_msec : collect syscount, systime elapsed, milli second precision. + systime_usec : collect syscount, systime elapsed, micro second precision. + systime_nsec : collect syscount, systime elapsed, systime cpu, nano second + precision. */ +typedef enum { + systime_no, + systime_msec, + systime_usec, + systime_nsec +} Collect_Systime; + +/* Trace event types */ +typedef enum { + TG_EV_MARKER = 0, + TG_EV_ENTER_FN = 1, + TG_EV_EXIT_FN = 2, + TG_EV_ENTER_INLINED_FN = 3, + TG_EV_EXIT_INLINED_FN = 4, + TG_EV_FORK = 5, + TG_EV_THREAD_CREATE = 6 +} TraceEventType; + +typedef struct _CommandLineOptions CommandLineOptions; +struct _CommandLineOptions { + + /* Output options */ + const HChar* out_format; /* Format string for tracegrind output file name */ + + /* Collection options */ + Bool separate_threads; /* Separate threads in dump? */ + Int separate_callers; /* Separate dependent on how many callers? */ + Int separate_recursions; /* Max level of recursions to separate */ + Bool skip_plt; /* Skip functions in PLT section? */ + + Bool collect_atstart; /* Start in collecting state ? */ + Bool collect_jumps; /* Collect (cond.) jumps in functions ? */ + + Collect_Systime collect_systime; /* Collect time for system calls */ + + Bool collect_bus; /* Collect global bus events */ + + /* Instrument options */ + Bool instrument_atstart; /* Instrument at start? */ + Bool simulate_cache; /* Call into cache simulator ? */ + Bool simulate_branch; /* Call into branch prediction simulator ? 
*/ + + /* Call graph generation */ + Bool pop_on_jump; /* Handle a jump between functions as ret+call */ + +#if TG_ENABLE_DEBUG + Int verbose; + ULong verbose_start; +#endif +}; + +/*------------------------------------------------------------*/ +/*--- Constants ---*/ +/*------------------------------------------------------------*/ + +/* Minimum cache line size allowed */ +#define MIN_LINE_SIZE 16 + +/*------------------------------------------------------------*/ +/*--- Statistics ---*/ +/*------------------------------------------------------------*/ + +typedef struct _Statistics Statistics; +struct _Statistics { + ULong call_counter; + ULong jcnd_counter; + ULong jump_counter; + ULong rec_call_counter; + ULong ret_counter; + ULong bb_executions; + + Int context_counter; + Int bb_retranslations; + + Int distinct_objs; + Int distinct_files; + Int distinct_fns; + Int distinct_contexts; + Int distinct_bbs; + Int distinct_jccs; + Int distinct_bbccs; + Int distinct_instrs; + Int distinct_skips; + + Int bb_hash_resizes; + Int bbcc_hash_resizes; + Int jcc_hash_resizes; + Int cxt_hash_resizes; + Int fn_array_resizes; + Int call_stack_resizes; + Int fn_stack_resizes; + + Int full_debug_BBs; + Int file_line_debug_BBs; + Int fn_name_debug_BBs; + Int no_debug_BBs; + Int bbcc_lru_misses; + Int jcc_lru_misses; + Int cxt_lru_misses; + Int bbcc_clones; +}; + +/*------------------------------------------------------------*/ +/*--- Structure declarations ---*/ +/*------------------------------------------------------------*/ + +typedef struct _Context Context; +typedef struct _CC CC; +typedef struct _BB BB; +typedef struct _BBCC BBCC; +typedef struct _jCC jCC; +typedef struct _fn_node fn_node; +typedef struct _file_node file_node; +typedef struct _obj_node obj_node; +typedef struct _fn_config fn_config; +typedef struct _call_entry call_entry; +typedef struct _thread_info thread_info; + +/* Cost arrays: aliases to arrays of 64-bit event counters */ +typedef ULong* FullCost; + +/* The types of control flow changes that can happen between + * execution of two BBs in a thread. + */ +typedef enum { + jk_None = 0, /* no explicit change by a guest instruction */ + jk_Jump, /* regular jump */ + jk_Call, + jk_Return, + jk_CondJump /* conditional jump taken (only used as jCC type) */ +} TgJumpKind; + +/* JmpCall cost center + * for subroutine call (from->bb->jmp_addr => to->bb->addr) + * + * Each BB has at most one CALL instruction. The list of JCC from + * this call is a pointer to the list head (stored in BBCC), and + * in the JCC struct. + * + * For fast lookup, JCCs are reachable with a hash table, keyed by + * the (from_bbcc,to) pair. is used for the JCC chain + * of one hash table entry. + * + * Cost holds event counts for already returned executions. + * are the event counters at last enter of the subroutine. + * is updated on returning from the subroutine by + * adding the diff of and current event counters to . + * + * After updating, is set to current event counters. Thus, + * events are not counted twice for recursive calls (TODO: True?) + */ + +struct _jCC { + TgJumpKind jmpkind; /* jk_Call, jk_Jump, jk_CondJump */ + jCC* next_hash; /* for hash entry chain */ + jCC* next_from; /* next JCC from a BBCC */ + BBCC * from, *to; /* call arc from/to this BBCC */ + UInt jmp; /* jump no. in source */ + + ULong call_counter; /* no wraparound with 64 bit */ + + FullCost cost; /* simulator + user counters */ +}; + +/* + * Info for one instruction of a basic block. 
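+ *
+ * instr_offset/instr_size give the position and length of the instruction
+ * within the BB; cost_offset locates this instruction's counters inside the
+ * owning BBCC's cost array, and eventset names the events counted for it.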
+ */
+typedef struct _InstrInfo InstrInfo;
+struct _InstrInfo {
+  UInt instr_offset;
+  UInt instr_size;
+  UInt cost_offset;
+  EventSet* eventset;
+};
+
+/*
+ * Info for a side exit in a BB
+ */
+typedef struct _CJmpInfo CJmpInfo;
+struct _CJmpInfo {
+  UInt instr;         /* instruction index for BB.instr array */
+  TgJumpKind jmpkind; /* jump kind when leaving BB at this side exit */
+};
+
+/**
+ * An instrumented basic block (BB).
+ *
+ * BBs are put into a resizable hash to allow for fast detection if a
+ * BB is to be retranslated but cost info is already available.
+ * The key for a BB is an (object, offset) tuple, making it independent
+ * from possibly multiple mappings of the same ELF object.
+ *
+ * At the beginning of each instrumented BB,
+ * a call to setup_bbcc(), specifying a pointer to the
+ * corresponding BB structure, is added.
+ *
+ * As the cost of a BB has to be distinguished depending on the context,
+ * multiple cost centers for one BB (struct BBCC) exist and the
+ * corresponding BBCC is set by setup_bbcc.
+ */
+struct _BB {
+  obj_node* obj;   /* ELF object of BB */
+  PtrdiffT offset; /* offset of BB in ELF object file */
+  BB* next;        /* chaining for a hash entry */
+
+  VgSectKind sect_kind; /* section of this BB, e.g. PLT */
+  UInt instr_count;
+
+  /* filled by TG_(get_fn_node) if debug info is available */
+  fn_node* fn;   /* debug info for this BB */
+  UInt line;
+  Bool is_entry; /* True if this BB is a function entry */
+
+  BBCC* bbcc_list; /* BBCCs for same BB (see next_bbcc in BBCC) */
+  BBCC* last_bbcc; /* Temporary: Cached for faster access (LRU) */
+
+  /* filled by TG_(instrument) if not seen before */
+  UInt cjmp_count;    /* number of side exits */
+  CJmpInfo* jmp;      /* array of info for condition jumps,
+                       * allocated directly after this struct */
+  Bool cjmp_inverted; /* is last side exit actually fall through? */
+
+  const HChar**
+      inl_fns;    /* inlined fn names at BB start (outermost first), or NULL */
+  UInt inl_depth; /* number of entries in inl_fns */
+
+  UInt instr_len;
+  UInt cost_count;
+  InstrInfo instr[0]; /* info on instruction sizes and costs */
+};
+
+/**
+ * Function context
+ *
+ * Basic blocks are always executed in the scope of a context.
+ * A function context is a list of function nodes representing
+ * the call chain to the current context: I.e. fn[0] is the
+ * function we are currently in, fn[1] has called fn[0], and so on.
+ * Recursion levels are used for fn[0].
+ *
+ * To get a unique number for a full execution context, use
+ *   rec_index = min(<separate_recursions of fn[0]>, <recursion level>) - 1;
+ *   unique_no = <base_number> + rec_index
+ *
+ * For each Context, recursion index and BB, there can be a BBCC.
+ */
+struct _Context {
+  UInt size;        // number of function dependencies
+  UInt base_number; // for context compression & dump array
+  Context* next;    // entry chaining for hash
+  UWord hash;       // for faster lookup...
+  fn_node* fn[0];
+};
+
+/*
+ * Cost info for a side exit from a BB
+ */
+typedef struct _JmpData JmpData;
+struct _JmpData {
+  ULong ecounter; /* number of times the BB was left at this exit */
+  jCC* jcc_list;  /* JCCs used for this exit */
+};
+
+/*
+ * Basic Block Cost Center
+ *
+ * On demand, multiple BBCCs will be created for the same BB
+ * dependent on command line options and:
+ * - current function (it's possible that a BB is executed in the
+ *   context of different functions, e.g. in manual assembler/PLT)
+ * - current thread ID
+ * - position where current function is called from
+ * - recursion level of current function
+ *
+ * The cost centres for the instructions of a basic block are
+ * stored in a contiguous array.
+ * They are distinguishable by their tag field.
+ */
+struct _BBCC {
+  BB* bb; /* BB for this cost center */
+
+  Context* cxt;     /* execution context of this BBCC */
+  ThreadId tid;     /* only for assertion check purpose */
+  UInt rec_index;   /* Recursion index in rec->bbcc for this bbcc */
+  BBCC** rec_array; /* Variable sized array of pointers to
+                     * recursion BBCCs. Shared. */
+  BBCC* next_bbcc;     /* Chain of BBCCs for same BB */
+  BBCC* lru_next_bbcc; /* BBCC executed next the last time */
+
+  jCC* lru_from_jcc; /* Temporary: Cached for faster access (LRU) */
+  jCC* lru_to_jcc;   /* Temporary: Cached for faster access (LRU) */
+  FullCost skipped;  /* cost for skipped functions called from
+                      * jmp_addr. Allocated lazily */
+
+  BBCC* next;         /* entry chain in hash */
+  ULong* cost;        /* start of 64bit costs for this BBCC */
+  ULong ecounter_sum; /* execution counter for first instruction of BB */
+  JmpData jmp[0];
+};
+
+struct _fn_node {
+  HChar* name;
+  UInt name_len;
+  UInt number;
+  Context* last_cxt; /* LRU info */
+  Context* pure_cxt; /* the context with only the function itself */
+  file_node* file;   /* reverse mapping for 2nd hash */
+  fn_node* next;
+
+  Bool toggle_collect : 1;
+  Bool skip : 1;
+  Bool pop_on_jump : 1;
+
+  Int group;
+  Int separate_callers;
+  Int separate_recursions;
+#if TG_ENABLE_DEBUG
+  Int verbosity; /* Stores old verbosity level while in function */
+#endif
+};
+
+/* Quite arbitrary fixed hash sizes */
+
+#define N_OBJ_ENTRIES 47
+#define N_FILE_ENTRIES 53
+#define N_FN_ENTRIES 87
+
+struct _file_node {
+  HChar* name;
+  UInt name_len;
+  fn_node* fns[N_FN_ENTRIES];
+  obj_node* obj;
+  file_node* next;
+};
+
+/* If an object is dlopened multiple times, we hope that <name> is unique;
+ * <start> and <offset> can change with each dlopen, and <start> is
+ * zero when object is unmapped (possible at dump time).
+ */
+struct _obj_node {
+  const HChar* name;
+  UInt name_len;
+  UInt last_slash_pos;
+
+  Addr start;      /* Start address of text segment mapping */
+  SizeT size;      /* Length of mapping */
+  PtrdiffT offset; /* Offset between symbol address and file offset */
+
+  file_node* files[N_FILE_ENTRIES];
+  UInt number;
+  obj_node* next;
+};
+
+/* an entry in the callstack
+ *
+ * <nonskipped> is 0 if the function called is not skipped (usual case).
+ * Otherwise, it is the last non-skipped BBCC. This one gets all
+ * the calls to non-skipped functions and all costs in skipped
+ * instructions.
+ */
+struct _call_entry {
+  jCC* jcc;            /* jCC for this call */
+  FullCost enter_cost; /* cost event counters at entering frame */
+  Addr sp;             /* stack pointer directly after call */
+  Addr ret_addr;       /* address to return to;
+                        * is 0 on a simulated call */
+  BBCC* nonskipped;    /* see above */
+  Context* cxt;        /* context before call */
+  Int fn_sp;           /* function stack index before call */
+};
+
+/*
+ * Execution state of main thread or a running signal handler in
+ * a thread while interrupted by another signal handler.
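+ * (For example, the handler of a second signal that preempts a handler
+ * already running in the same thread.)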
+ * As there's no scheduling among running signal handlers of one thread, + * we only need a subset of a full thread state: + * - event counter + * - collect state + * - last BB, last jump kind, last nonskipped BB + * - callstack pointer for sanity checking and correct unwinding + * after exit + */ +typedef struct _exec_state exec_state; +struct _exec_state { + + /* the signum of the handler, 0 for main thread context + */ + Int sig; + + /* the old call stack pointer at entering the signal handler */ + Int orig_sp; + + FullCost cost; + Bool collect; + Context* cxt; + + /* number of conditional jumps passed in last BB */ + Int jmps_passed; + BBCC* bbcc; /* last BB executed */ + BBCC* nonskipped; + + Int call_stack_bottom; /* Index into fn_stack */ +}; + +/* Global state structures */ +typedef struct _bb_hash bb_hash; +struct _bb_hash { + UInt size, entries; + BB** table; +}; + +typedef struct _cxt_hash cxt_hash; +struct _cxt_hash { + UInt size, entries; + Context** table; +}; + +/* Thread specific state structures, i.e. parts of a thread state. + * There are variables for the current state of each part, + * on which a thread state is copied at thread switch. + */ +typedef struct _bbcc_hash bbcc_hash; +struct _bbcc_hash { + UInt size, entries; + BBCC** table; +}; + +typedef struct _jcc_hash jcc_hash; +struct _jcc_hash { + UInt size, entries; + jCC** table; + jCC* spontaneous; +}; + +typedef struct _fn_array fn_array; +struct _fn_array { + UInt size; + UInt* array; +}; + +typedef struct _call_stack call_stack; +struct _call_stack { + UInt size; + Int sp; + call_entry* entry; +}; + +typedef struct _fn_stack fn_stack; +struct _fn_stack { + UInt size; + fn_node **bottom, **top; +}; + +/* The maximum number of simultaneous running signal handlers per thread. + * This is the number of execution states storable in a thread. + */ +#define MAX_SIGHANDLERS 10 + +typedef struct _exec_stack exec_stack; +struct _exec_stack { + Int sp; /* > 0 if a handler is running */ + exec_state* entry[MAX_SIGHANDLERS]; +}; + +/* Thread State + * + * This structure stores thread specific info while a thread is *not* + * running. See function switch_thread() for save/restore on thread switch. + * + * If --separate-threads=no, BBCCs and JCCs can be shared by all threads, i.e. + * only structures of thread 1 are used. + * This involves variables fn_info_table, bbcc_table and jcc_table. 
+ */ +struct _thread_info { + + /* state */ + fn_stack fns; /* function stack */ + call_stack calls; /* context call arc stack */ + exec_stack states; /* execution states interrupted by signals */ + + /* cost tracking */ + FullCost lastdump_cost; /* Cost at last total cost computation */ + + /* CSV trace: per-thread snapshot of cost at last sample emission */ + FullCost last_sample_cost; + + /* Inline tracking: current inline call stack (outermost first) */ + const HChar* cur_inl_fns[TG_MAX_INL_DEPTH]; + UInt cur_inl_depth; + + /* thread specific data structure containers */ + fn_array fn_active; + jcc_hash jccs; + bbcc_hash bbccs; +}; + +/*------------------------------------------------------------*/ +/*--- Cache simulator interface ---*/ +/*------------------------------------------------------------*/ + +struct cachesim_if { + void (*print_opts)(void); + Bool (*parse_opt)(const HChar* arg); + void (*post_clo_init)(void); + void (*clear)(void); + void (*printstat)(Int, Int, Int); + void (*finish)(void); + + void (*log_1I0D)(InstrInfo*) VG_REGPARM(1); + void (*log_2I0D)(InstrInfo*, InstrInfo*) VG_REGPARM(2); + void (*log_3I0D)(InstrInfo*, InstrInfo*, InstrInfo*) VG_REGPARM(3); + + void (*log_1I1Dr)(InstrInfo*, Addr, Word) VG_REGPARM(3); + void (*log_1I1Dw)(InstrInfo*, Addr, Word) VG_REGPARM(3); + + void (*log_0I1Dr)(InstrInfo*, Addr, Word) VG_REGPARM(3); + void (*log_0I1Dw)(InstrInfo*, Addr, Word) VG_REGPARM(3); + + // function names of helpers (for debugging generated code) + const HChar *log_1I0D_name, *log_2I0D_name, *log_3I0D_name; + const HChar *log_1I1Dr_name, *log_1I1Dw_name; + const HChar *log_0I1Dr_name, *log_0I1Dw_name; +}; + +// Event groups +#define EG_USE 0 +#define EG_IR 1 +#define EG_DR 2 +#define EG_DW 3 +#define EG_BC 4 +#define EG_BI 5 +#define EG_BUS 6 +#define EG_SYS 7 + +struct event_sets { + EventSet *base, *full; +}; + +#define fullOffset(group) (TG_(sets).full->offset[group]) + +/*------------------------------------------------------------*/ +/*--- Trace output state ---*/ +/*------------------------------------------------------------*/ + +typedef struct { + Int fd; /* Output file descriptor (-1 if not open) */ + ULong seq; /* Global sequence counter */ + Bool initialized; /* Has the output been opened? */ + Bool header_written; /* Has the schema chunk been written? */ +} trace_output; + +/*------------------------------------------------------------*/ +/*--- Functions ---*/ +/*------------------------------------------------------------*/ + +/* from clo.c */ + +void TG_(set_clo_defaults)(void); +void TG_(update_fn_config)(fn_node*); +Bool TG_(process_cmd_line_option)(const HChar*); +void TG_(print_usage)(void); +void TG_(print_debug_usage)(void); + +/* from sim.c */ +void TG_(init_eventsets)(void); + +/* from main.c */ +Bool TG_(get_debug_info)(Addr, + const HChar** dirname, + const HChar** filename, + const HChar** fn_name, + UInt*, + DebugInfo**); +void TG_(collectBlockInfo)(IRSB* bbIn, UInt*, UInt*, Bool*); +void TG_(set_instrument_state)(const HChar*, Bool); +void TG_(compute_total_cost)(void); +void TG_(fini)(Int exitcode); + +/* from bb.c */ +void TG_(init_bb_hash)(void); +bb_hash* TG_(get_bb_hash)(void); +BB* TG_(get_bb)(Addr addr, IRSB* bb_in, Bool* seen_before); +void TG_(delete_bb)(Addr addr); + +static __inline__ Addr bb_addr(BB* bb) { return bb->offset + bb->obj->offset; } +static __inline__ Addr bb_jmpaddr(BB* bb) +{ + UInt off = + (bb->instr_count > 0) ? 
bb->instr[bb->instr_count - 1].instr_offset : 0; + return off + bb->offset + bb->obj->offset; +} + +/* from fn.c */ +void TG_(init_fn_array)(fn_array*); +void TG_(copy_current_fn_array)(fn_array* dst); +fn_array* TG_(get_current_fn_array)(void); +void TG_(set_current_fn_array)(fn_array*); +UInt* TG_(get_fn_entry)(Int n); + +void TG_(init_obj_table)(void); +obj_node* TG_(get_obj_node)(DebugInfo* si); +file_node* + TG_(get_file_node)(obj_node*, const HChar* dirname, const HChar* filename); +fn_node* TG_(get_fn_node)(BB* bb); + +/* from bbcc.c */ +void TG_(init_bbcc_hash)(bbcc_hash* bbccs); +void TG_(copy_current_bbcc_hash)(bbcc_hash* dst); +bbcc_hash* TG_(get_current_bbcc_hash)(void); +void TG_(set_current_bbcc_hash)(bbcc_hash*); +BBCC* TG_(get_bbcc)(BB* bb); +void TG_(setup_bbcc)(BB* bb) VG_REGPARM(1); + +/* from jumps.c */ +void TG_(init_jcc_hash)(jcc_hash*); +void TG_(copy_current_jcc_hash)(jcc_hash* dst); +void TG_(set_current_jcc_hash)(jcc_hash*); +jCC* TG_(get_jcc)(BBCC* from, UInt, BBCC* to); + +/* from callstack.c */ +void TG_(init_call_stack)(call_stack*); +void TG_(copy_current_call_stack)(call_stack* dst); +void TG_(set_current_call_stack)(call_stack*); +call_entry* TG_(get_call_entry)(Int n); + +void TG_(push_call_stack)(BBCC* from, UInt jmp, BBCC* to, Addr sp, Bool skip); +void TG_(pop_call_stack)(void); +Int TG_(unwind_call_stack)(Addr sp, Int); + +/* from context.c */ +void TG_(init_fn_stack)(fn_stack*); +void TG_(copy_current_fn_stack)(fn_stack*); +void TG_(set_current_fn_stack)(fn_stack*); + +void TG_(init_cxt_table)(void); +Context* TG_(get_cxt)(fn_node** fn); +void TG_(push_cxt)(fn_node* fn); + +/* from threads.c */ +void TG_(init_threads)(void); +thread_info** TG_(get_threads)(void); +thread_info* TG_(get_current_thread)(void); +void TG_(switch_thread)(ThreadId tid); +void TG_(forall_threads)(void (*func)(thread_info*)); +void TG_(run_thread)(ThreadId tid); + +void TG_(init_exec_state)(exec_state* es); +void TG_(init_exec_stack)(exec_stack*); +void TG_(copy_current_exec_stack)(exec_stack*); +void TG_(set_current_exec_stack)(exec_stack*); +void TG_(pre_signal)(ThreadId tid, Int sigNum, Bool alt_stack); +void TG_(post_signal)(ThreadId tid, Int sigNum); +void TG_(run_post_signal_on_call_stack_bottom)(void); + +/* from dump.c */ + +/* Trace output (from dump.c) */ +void TG_(trace_open_output)(void); +void TG_(trace_reopen_child)(void); +void TG_(trace_emit_sample)(ThreadId tid, Bool is_enter, fn_node* fn); +void TG_(trace_emit_enter_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn); +void TG_(trace_emit_exit_inlined)(ThreadId tid, BB* bb, const HChar* inl_fn); +void TG_(trace_emit_fork)(ThreadId tid, Int child_pid); +void TG_(trace_emit_thread_create)(ThreadId tid, ThreadId child); +void TG_(trace_emit_marker)(ThreadId tid, const HChar* marker); +void TG_(trace_close_output)(void); + +/*------------------------------------------------------------*/ +/*--- Exported global variables ---*/ +/*------------------------------------------------------------*/ + +extern CommandLineOptions TG_(clo); +extern Statistics TG_(stat); +extern EventMapping* TG_(dumpmap); +extern trace_output TG_(trace_out); + +/* Function active counter array, indexed by function number */ +extern UInt* TG_(fn_active_array); +extern Bool TG_(instrument_state); +/* min of L1 and LL cache line sizes */ +extern Int TG_(min_line_size); +extern call_stack TG_(current_call_stack); +extern fn_stack TG_(current_fn_stack); +extern exec_state TG_(current_state); +extern ThreadId TG_(current_tid); +extern FullCost 
TG_(total_cost); +extern struct cachesim_if TG_(cachesim); +extern struct event_sets TG_(sets); + +// set by setup_bbcc at start of every BB, and needed by log_* helpers +extern Addr TG_(bb_base); +extern ULong* TG_(cost_base); + +/*------------------------------------------------------------*/ +/*--- Debug output ---*/ +/*------------------------------------------------------------*/ + +#if TG_ENABLE_DEBUG + +#define TG_DEBUGIF(x) \ + if (UNLIKELY((TG_(clo).verbose > x) && \ + (TG_(stat).bb_executions >= TG_(clo).verbose_start))) + +#define TG_DEBUG(x, format, args...) \ + TG_DEBUGIF(x) \ + { \ + TG_(print_bbno)(); \ + VG_(printf)(format, ##args); \ + } + +#define TG_ASSERT(cond) \ + if (UNLIKELY(!(cond))) { \ + TG_(print_context)(); \ + TG_(print_bbno)(); \ + tl_assert(cond); \ + } + +#else +#define TG_DEBUGIF(x) if (0) +#define TG_DEBUG(x...) \ + { \ + } +#define TG_ASSERT(cond) tl_assert(cond); +#endif + +/* from debug.c */ +void TG_(print_bbno)(void); +void TG_(print_context)(void); +void TG_(print_jcc)(int s, jCC* jcc); +void TG_(print_bbcc)(int s, BBCC* bbcc); +void TG_(print_bbcc_fn)(BBCC* bbcc); +void TG_(print_execstate)(int s, exec_state* es); +void TG_(print_eventset)(int s, EventSet* es); +void TG_(print_cost)(int s, EventSet*, ULong* cost); +void TG_(print_bb)(int s, BB* bb); +void TG_(print_bbcc_cost)(int s, BBCC*); +void TG_(print_cxt)(int s, Context* cxt, int rec_index); +void TG_(print_short_jcc)(jCC* jcc); +void TG_(print_stackentry)(int s, int sp); +void TG_(print_addr)(Addr addr); +void TG_(print_addr_ln)(Addr addr); + +void* TG_(malloc)(const HChar* cc, UWord s, const HChar* f); +void* TG_(free)(void* p, const HChar* f); +#if 0 +#define TG_MALLOC(_cc, x) TG_(malloc)((_cc), x, __FUNCTION__) +#define TG_FREE(p) TG_(free)(p, __FUNCTION__) +#else +#define TG_MALLOC(_cc, x) VG_(malloc)((_cc), x) +#define TG_FREE(p) VG_(free)(p) +#endif + +#endif /* TG_GLOBAL */ diff --git a/tracegrind/jumps.c b/tracegrind/jumps.c new file mode 100644 index 000000000..f25d062cb --- /dev/null +++ b/tracegrind/jumps.c @@ -0,0 +1,219 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- ct_jumps.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. 
+*/ + +#include "global.h" + +/*------------------------------------------------------------*/ +/*--- Jump Cost Center (JCC) operations, including Calls ---*/ +/*------------------------------------------------------------*/ + +#define N_JCC_INITIAL_ENTRIES 4437 + +static jcc_hash current_jccs; + +void TG_(init_jcc_hash)(jcc_hash* jccs) +{ + Int i; + + TG_ASSERT(jccs != 0); + + jccs->size = N_JCC_INITIAL_ENTRIES; + jccs->entries = 0; + jccs->table = (jCC**)TG_MALLOC("cl.jumps.ijh.1", jccs->size * sizeof(jCC*)); + jccs->spontaneous = 0; + + for (i = 0; i < jccs->size; i++) + jccs->table[i] = 0; +} + +void TG_(copy_current_jcc_hash)(jcc_hash* dst) +{ + TG_ASSERT(dst != 0); + + dst->size = current_jccs.size; + dst->entries = current_jccs.entries; + dst->table = current_jccs.table; + dst->spontaneous = current_jccs.spontaneous; +} + +void TG_(set_current_jcc_hash)(jcc_hash* h) +{ + TG_ASSERT(h != 0); + + current_jccs.size = h->size; + current_jccs.entries = h->entries; + current_jccs.table = h->table; + current_jccs.spontaneous = h->spontaneous; +} + +__inline__ static UInt jcc_hash_idx(BBCC* from, UInt jmp, BBCC* to, UInt size) +{ + return (UInt)((UWord)from + 7 * (UWord)to + 13 * jmp) % size; +} + +/* double size of jcc table */ +static void resize_jcc_table(void) +{ + Int i, new_size, conflicts1 = 0, conflicts2 = 0; + jCC** new_table; + UInt new_idx; + jCC * curr_jcc, *next_jcc; + + new_size = 2 * current_jccs.size + 3; + new_table = (jCC**)TG_MALLOC("cl.jumps.rjt.1", new_size * sizeof(jCC*)); + + for (i = 0; i < new_size; i++) + new_table[i] = NULL; + + for (i = 0; i < current_jccs.size; i++) { + if (current_jccs.table[i] == NULL) + continue; + + curr_jcc = current_jccs.table[i]; + while (NULL != curr_jcc) { + next_jcc = curr_jcc->next_hash; + + new_idx = + jcc_hash_idx(curr_jcc->from, curr_jcc->jmp, curr_jcc->to, new_size); + + curr_jcc->next_hash = new_table[new_idx]; + new_table[new_idx] = curr_jcc; + if (curr_jcc->next_hash) { + conflicts1++; + if (curr_jcc->next_hash->next_hash) + conflicts2++; + } + + curr_jcc = next_jcc; + } + } + + VG_(free)(current_jccs.table); + + TG_DEBUG(0, "Resize JCC Hash: %u => %d (entries %u, conflicts %d/%d)\n", + current_jccs.size, new_size, current_jccs.entries, conflicts1, + conflicts2); + + current_jccs.size = new_size; + current_jccs.table = new_table; + TG_(stat).jcc_hash_resizes++; +} + +/* new jCC structure: a call was done to a BB of a BBCC + * for a spontaneous call, from is 0 (i.e. caller unknown) + */ +static jCC* new_jcc(BBCC* from, UInt jmp, BBCC* to) +{ + jCC* jcc; + UInt new_idx; + + /* check fill degree of jcc hash table and resize if needed (>80%) */ + current_jccs.entries++; + if (10 * current_jccs.entries / current_jccs.size > 8) + resize_jcc_table(); + + jcc = (jCC*)TG_MALLOC("cl.jumps.nj.1", sizeof(jCC)); + + jcc->from = from; + jcc->jmp = jmp; + jcc->to = to; + jcc->jmpkind = jk_Call; + jcc->call_counter = 0; + jcc->cost = 0; + + /* insert into JCC chain of calling BBCC. 
+ * This list is only used at dumping time */ + + if (from) { + /* Prohibit corruption by array overrun */ + TG_ASSERT(jmp <= from->bb->cjmp_count); + jcc->next_from = from->jmp[jmp].jcc_list; + from->jmp[jmp].jcc_list = jcc; + } else { + jcc->next_from = current_jccs.spontaneous; + current_jccs.spontaneous = jcc; + } + + /* insert into JCC hash table */ + new_idx = jcc_hash_idx(from, jmp, to, current_jccs.size); + jcc->next_hash = current_jccs.table[new_idx]; + current_jccs.table[new_idx] = jcc; + + TG_(stat).distinct_jccs++; + + TG_DEBUGIF(3) + { + VG_(printf)(" new_jcc (now %d): %p\n", TG_(stat).distinct_jccs, jcc); + } + + return jcc; +} + +/* get the jCC for a call arc (BBCC->BBCC) */ +jCC* TG_(get_jcc)(BBCC* from, UInt jmp, BBCC* to) +{ + jCC* jcc; + UInt idx; + + TG_DEBUG(5, "+ get_jcc(bbcc %p/%u => bbcc %p)\n", from, jmp, to); + + /* first check last recently used JCC */ + jcc = to->lru_to_jcc; + if (jcc && (jcc->from == from) && (jcc->jmp == jmp)) { + TG_ASSERT(to == jcc->to); + TG_DEBUG(5, "- get_jcc: [LRU to] jcc %p\n", jcc); + return jcc; + } + + jcc = from->lru_from_jcc; + if (jcc && (jcc->to == to) && (jcc->jmp == jmp)) { + TG_ASSERT(from == jcc->from); + TG_DEBUG(5, "- get_jcc: [LRU from] jcc %p\n", jcc); + return jcc; + } + + TG_(stat).jcc_lru_misses++; + + idx = jcc_hash_idx(from, jmp, to, current_jccs.size); + jcc = current_jccs.table[idx]; + + while (jcc) { + if ((jcc->from == from) && (jcc->jmp == jmp) && (jcc->to == to)) + break; + jcc = jcc->next_hash; + } + + if (!jcc) + jcc = new_jcc(from, jmp, to); + + /* set LRU */ + from->lru_from_jcc = jcc; + to->lru_to_jcc = jcc; + + TG_DEBUG(5, "- get_jcc(bbcc %p => bbcc %p)\n", from, to); + + return jcc; +} diff --git a/tracegrind/lz4.c b/tracegrind/lz4.c new file mode 100644 index 000000000..e0af37e2b --- /dev/null +++ b/tracegrind/lz4.c @@ -0,0 +1,3417 @@ +/* + LZ4 - Fast LZ compression algorithm + Copyright (c) Yann Collet. All rights reserved. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + You can contact the author at : + - LZ4 homepage : http://www.lz4.org + - LZ4 source repository : https://github.com/lz4/lz4 +*/ + +/*-************************************ + * Tuning parameters + **************************************/ +/* + * LZ4_HEAPMODE : + * Select how stateless compression functions like `LZ4_compress_default()` + * allocate memory for their hash table, + * in memory stack (0:default, fastest), or in memory heap (1:requires + * malloc()). + */ +#ifndef LZ4_HEAPMODE +#define LZ4_HEAPMODE 0 +#endif + +/* + * LZ4_ACCELERATION_DEFAULT : + * Select "acceleration" for LZ4_compress_fast() when parameter value <= 0 + */ +#define LZ4_ACCELERATION_DEFAULT 1 +/* + * LZ4_ACCELERATION_MAX : + * Any "acceleration" value higher than this threshold + * get treated as LZ4_ACCELERATION_MAX instead (fix #876) + */ +#define LZ4_ACCELERATION_MAX 65537 + +/*-************************************ + * CPU Feature Detection + **************************************/ +/* LZ4_FORCE_MEMORY_ACCESS + * By default, access to unaligned memory is controlled by `memcpy()`, which is + * safe and portable. Unfortunately, on some target/compiler combinations, the + * generated assembly is sub-optimal. The below switch allow to select different + * access method for improved performance. Method 0 (default) : use `memcpy()`. + * Safe and portable. Method 1 : `__packed` statement. It depends on compiler + * extension (ie, not portable). This method is safe if your compiler supports + * it, and *generally* as fast or faster than `memcpy`. Method 2 : direct + * access. This method is portable but violate C standard. It can generate buggy + * code on targets which assembly generation depends on alignment. But in some + * circumstances, it's the only known way to get the most performance (ie GCC + + * ARMv6) See + * https://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html + * for details. 
Prefer these methods in priority order (0 > 1 > 2) + */ +#ifndef LZ4_FORCE_MEMORY_ACCESS /* can be defined externally */ +#if defined(__GNUC__) && \ + (defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ + defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || \ + defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) || \ + (defined(__riscv) && defined(__riscv_zicclsm))) +#define LZ4_FORCE_MEMORY_ACCESS 2 +#elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || defined(__GNUC__) || \ + defined(_MSC_VER) +#define LZ4_FORCE_MEMORY_ACCESS 1 +#endif +#endif + +/* + * LZ4_FORCE_SW_BITCOUNT + * Define this parameter if your target system or compiler does not support + * hardware bit count + */ +#if defined(_MSC_VER) && \ + defined(_WIN32_WCE) /* Visual Studio for WinCE doesn't support Hardware bit \ + count */ +#undef LZ4_FORCE_SW_BITCOUNT /* avoid double def */ +#define LZ4_FORCE_SW_BITCOUNT +#endif + +/*-************************************ + * Dependency + **************************************/ +/* + * LZ4_SRC_INCLUDED: + * Amalgamation flag, whether lz4.c is included + */ +#ifndef LZ4_SRC_INCLUDED +#define LZ4_SRC_INCLUDED 1 +#endif + +#ifndef LZ4_DISABLE_DEPRECATE_WARNINGS +#define LZ4_DISABLE_DEPRECATE_WARNINGS /* due to \ + LZ4_decompress_safe_withPrefix64k */ +#endif + +#ifndef LZ4_STATIC_LINKING_ONLY +#define LZ4_STATIC_LINKING_ONLY +#endif +#include "lz4.h" +/* see also "memory routines" below */ + +/*-************************************ + * Compiler Options + **************************************/ +#if defined(_MSC_VER) && (_MSC_VER >= 1400) /* Visual Studio 2005+ */ +#include /* only present in VS2005+ */ +#pragma warning( \ + disable : 4127) /* disable: C4127: conditional expression is constant */ +#pragma warning( \ + disable : 6237) /* disable: C6237: conditional expression is always 0 */ +#pragma warning( \ + disable : 6239) /* disable: C6239: ( && ) \ + always evaluates to the result of */ +#pragma warning( \ + disable : 6240) /* disable: C6240: ( && ) \ + always evaluates to the result of */ +#pragma warning(disable : 6326) /* disable: C6326: Potential comparison of a \ + constant with another constant */ +#endif /* _MSC_VER */ + +#ifndef LZ4_FORCE_INLINE +#if defined(_MSC_VER) && !defined(__clang__) /* MSVC */ +#define LZ4_FORCE_INLINE static __forceinline +#else +#if defined(__cplusplus) || \ + defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +#if defined(__GNUC__) || defined(__clang__) +#define LZ4_FORCE_INLINE static inline __attribute__((always_inline)) +#else +#define LZ4_FORCE_INLINE static inline +#endif +#else +#define LZ4_FORCE_INLINE static +#endif /* __STDC_VERSION__ */ +#endif /* _MSC_VER */ +#endif /* LZ4_FORCE_INLINE */ + +/* LZ4_FORCE_O2 and LZ4_FORCE_INLINE + * gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy8, + * together with a simple 8-byte copy loop as a fall-back path. + * However, this optimization hurts the decompression speed by >30%, + * because the execution does not go to the optimized loop + * for typical compressible data, and all of the preamble checks + * before going to the fall-back path become useless overhead. + * This optimization happens only with the -O3 flag, and -O2 generates + * a simple 8-byte copy loop. + * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy8 + * functions are annotated with __attribute__((optimize("O2"))), + * and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute + * of LZ4_wildCopy8 does not affect the compression speed. 
+ */ +#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && \ + !defined(__clang__) +#define LZ4_FORCE_O2 __attribute__((optimize("O2"))) +#undef LZ4_FORCE_INLINE +#define LZ4_FORCE_INLINE \ + static __inline __attribute__((optimize("O2"), always_inline)) +#else +#define LZ4_FORCE_O2 +#endif + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) || \ + (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || \ + defined(__clang__) +#define expect(expr, value) (__builtin_expect((expr), (value))) +#else +#define expect(expr, value) (expr) +#endif + +#ifndef likely +#define likely(expr) expect((expr) != 0, 1) +#endif +#ifndef unlikely +#define unlikely(expr) expect((expr) != 0, 0) +#endif + +/* Should the alignment test prove unreliable, for some reason, + * it can be disabled by setting LZ4_ALIGN_TEST to 0 */ +#ifndef LZ4_ALIGN_TEST /* can be externally provided */ +#define LZ4_ALIGN_TEST 1 +#endif + +/*-************************************ + * Memory routines + **************************************/ + +/*! LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION : + * Disable relatively high-level LZ4/HC functions that use dynamic memory + * allocation functions (malloc(), calloc(), free()). + * + * Note that this is a compile-time switch. And since it disables + * public/stable LZ4 v1 API functions, we don't recommend using this + * symbol to generate a library for distribution. + * + * The following public functions are removed when this symbol is defined. + * - lz4 : LZ4_createStream, LZ4_freeStream, + * LZ4_createStreamDecode, LZ4_freeStreamDecode, LZ4_create + * (deprecated) + * - lz4hc : LZ4_createStreamHC, LZ4_freeStreamHC, + * LZ4_createHC (deprecated), LZ4_freeHC (deprecated) + * - lz4frame, lz4file : All LZ4F_* functions + */ +#if defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +#define ALLOC(s) lz4_error_memory_allocation_is_disabled +#define ALLOC_AND_ZERO(s) lz4_error_memory_allocation_is_disabled +#define FREEMEM(p) lz4_error_memory_allocation_is_disabled +#elif defined(LZ4_USER_MEMORY_FUNCTIONS) +/* memory management functions can be customized by user project. 
+ * Below functions must exist somewhere in the Project + * and be available at link time */ +void* LZ4_malloc(size_t s); +void* LZ4_calloc(size_t n, size_t s); +void LZ4_free(void* p); +#define ALLOC(s) LZ4_malloc(s) +#define ALLOC_AND_ZERO(s) LZ4_calloc(1, s) +#define FREEMEM(p) LZ4_free(p) +#else +#include /* malloc, calloc, free */ +#define ALLOC(s) malloc(s) +#define ALLOC_AND_ZERO(s) calloc(1, s) +#define FREEMEM(p) free(p) +#endif + +#if !LZ4_FREESTANDING +#include /* memset, memcpy */ +#endif +#if !defined(LZ4_memset) +#define LZ4_memset(p, v, s) memset((p), (v), (s)) +#endif +#define MEM_INIT(p, v, s) LZ4_memset((p), (v), (s)) + +/*-************************************ + * Common Constants + **************************************/ +#define MINMATCH 4 + +#define WILDCOPYLENGTH 8 +#define LASTLITERALS \ + 5 /* see ../doc/lz4_Block_format.md#parsing-restrictions \ + */ +#define MFLIMIT 12 /* see ../doc/lz4_Block_format.md#parsing-restrictions */ +#define MATCH_SAFEGUARD_DISTANCE \ + ((2 * WILDCOPYLENGTH) - \ + MINMATCH) /* ensure it's possible to write 2 x wildcopyLength without \ + overflowing output buffer */ +#define FASTLOOP_SAFE_DISTANCE 64 +static const int LZ4_minLength = (MFLIMIT + 1); + +#define KB *(1 << 10) +#define MB *(1 << 20) +#define GB *(1U << 30) + +#define LZ4_DISTANCE_ABSOLUTE_MAX 65535 +#if (LZ4_DISTANCE_MAX > \ + LZ4_DISTANCE_ABSOLUTE_MAX) /* max supported by LZ4 format */ +#error "LZ4_DISTANCE_MAX is too big : must be <= 65535" +#endif + +#define ML_BITS 4 +#define ML_MASK ((1U << ML_BITS) - 1) +#define RUN_BITS (8 - ML_BITS) +#define RUN_MASK ((1U << RUN_BITS) - 1) + +/*-************************************ + * Error detection + **************************************/ +#if defined(LZ4_DEBUG) && (LZ4_DEBUG >= 1) +#include +#else +#ifndef assert +#define assert(condition) ((void)0) +#endif +#endif + +#define LZ4_STATIC_ASSERT(c) \ + { \ + enum { LZ4_static_assert = 1 / (int)(!!(c)) }; \ + } /* use after variable declarations */ + +#if defined(LZ4_DEBUG) && (LZ4_DEBUG >= 2) +#include +static int g_debuglog_enable = 1; +#define DEBUGLOG(l, ...) \ + { \ + if ((g_debuglog_enable) && (l <= LZ4_DEBUG)) { \ + fprintf(stderr, __FILE__ " %i: ", __LINE__); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, " \n"); \ + } \ + } +#else +#define DEBUGLOG(l, ...) 
\ + { \ + } /* disabled */ +#endif + +static int LZ4_isAligned(const void* ptr, size_t alignment) +{ + return ((size_t)ptr & (alignment - 1)) == 0; +} + +/*-************************************ + * Types + **************************************/ +#include +#if defined(__cplusplus) || \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +#include +typedef unsigned char + BYTE; /*uint8_t not necessarily blessed to alias arbitrary type*/ +typedef uint16_t U16; +typedef uint32_t U32; +typedef int32_t S32; +typedef uint64_t U64; +typedef uintptr_t uptrval; +#else +#if UINT_MAX != 4294967295UL +#error "LZ4 code (when not C++ or C99) assumes that sizeof(int) == 4" +#endif +typedef unsigned char BYTE; +typedef unsigned short U16; +typedef unsigned int U32; +typedef signed int S32; +typedef unsigned long long U64; +typedef size_t uptrval; /* generally true, except OpenVMS-64 */ +#endif + +#if defined(__x86_64__) +typedef U64 reg_t; /* 64-bits in x32 mode */ +#else +typedef size_t reg_t; /* 32-bits in x32 mode */ +#endif + +typedef enum { + notLimited = 0, + limitedOutput = 1, + fillOutput = 2 +} limitedOutput_directive; + +/*-************************************ + * Reading and writing into memory + **************************************/ + +/** + * LZ4 relies on memcpy with a constant size being inlined. In freestanding + * environments, the compiler can't assume the implementation of memcpy() is + * standard compliant, so it can't apply its specialized memcpy() inlining + * logic. When possible, use __builtin_memcpy() to tell the compiler to analyze + * memcpy() as if it were standard compliant, so it can inline it in + * freestanding environments. This is needed when decompressing the Linux + * Kernel, for example. + */ +#if !defined(LZ4_memcpy) +#if defined(__GNUC__) && (__GNUC__ >= 4) +#define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size) +#else +#define LZ4_memcpy(dst, src, size) memcpy(dst, src, size) +#endif +#endif + +#if !defined(LZ4_memmove) +#if defined(__GNUC__) && (__GNUC__ >= 4) +#define LZ4_memmove __builtin_memmove +#else +#define LZ4_memmove memmove +#endif +#endif + +static unsigned LZ4_isLittleEndian(void) +{ + const union { + U32 u; + BYTE c[4]; + } one = {1}; /* don't use static : performance detrimental */ + return one.c[0]; +} + +#if defined(__GNUC__) || defined(__INTEL_COMPILER) +#define LZ4_PACK(__Declaration__) __Declaration__ __attribute__((__packed__)) +#elif defined(_MSC_VER) +#define LZ4_PACK(__Declaration__) \ + __pragma(pack(push, 1)) __Declaration__ __pragma(pack(pop)) +#endif + +#if defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS == 2) +/* lie to the compiler about data alignment; use with caution */ + +static U16 LZ4_read16(const void* memPtr) { return *(const U16*)memPtr; } +static U32 LZ4_read32(const void* memPtr) { return *(const U32*)memPtr; } +static reg_t LZ4_read_ARCH(const void* memPtr) { return *(const reg_t*)memPtr; } + +static void LZ4_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } +static void LZ4_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } + +#elif defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS == 1) + +/* __pack instructions are safer, but compiler specific, hence potentially + * problematic for some compilers */ +/* currently only defined for gcc and icc */ +LZ4_PACK(typedef struct { U16 u16; }) LZ4_unalign16; +LZ4_PACK(typedef struct { U32 u32; }) LZ4_unalign32; +LZ4_PACK(typedef struct { reg_t uArch; }) LZ4_unalignST; + +static U16 LZ4_read16(const void* 
ptr) +{ + return ((const LZ4_unalign16*)ptr)->u16; +} +static U32 LZ4_read32(const void* ptr) +{ + return ((const LZ4_unalign32*)ptr)->u32; +} +static reg_t LZ4_read_ARCH(const void* ptr) +{ + return ((const LZ4_unalignST*)ptr)->uArch; +} + +static void LZ4_write16(void* memPtr, U16 value) +{ + ((LZ4_unalign16*)memPtr)->u16 = value; +} +static void LZ4_write32(void* memPtr, U32 value) +{ + ((LZ4_unalign32*)memPtr)->u32 = value; +} + +#else /* safe and portable access using memcpy() */ + +static U16 LZ4_read16(const void* memPtr) +{ + U16 val; + LZ4_memcpy(&val, memPtr, sizeof(val)); + return val; +} + +static U32 LZ4_read32(const void* memPtr) +{ + U32 val; + LZ4_memcpy(&val, memPtr, sizeof(val)); + return val; +} + +static reg_t LZ4_read_ARCH(const void* memPtr) +{ + reg_t val; + LZ4_memcpy(&val, memPtr, sizeof(val)); + return val; +} + +static void LZ4_write16(void* memPtr, U16 value) +{ + LZ4_memcpy(memPtr, &value, sizeof(value)); +} + +static void LZ4_write32(void* memPtr, U32 value) +{ + LZ4_memcpy(memPtr, &value, sizeof(value)); +} + +#endif /* LZ4_FORCE_MEMORY_ACCESS */ + +static U16 LZ4_readLE16(const void* memPtr) +{ + if (LZ4_isLittleEndian()) { + return LZ4_read16(memPtr); + } else { + const BYTE* p = (const BYTE*)memPtr; + return (U16)((U16)p[0] | (p[1] << 8)); + } +} + +#ifdef LZ4_STATIC_LINKING_ONLY_ENDIANNESS_INDEPENDENT_OUTPUT +static U32 LZ4_readLE32(const void* memPtr) +{ + if (LZ4_isLittleEndian()) { + return LZ4_read32(memPtr); + } else { + const BYTE* p = (const BYTE*)memPtr; + return (U32)p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24); + } +} +#endif + +static void LZ4_writeLE16(void* memPtr, U16 value) +{ + if (LZ4_isLittleEndian()) { + LZ4_write16(memPtr, value); + } else { + BYTE* p = (BYTE*)memPtr; + p[0] = (BYTE)value; + p[1] = (BYTE)(value >> 8); + } +} + +/* customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd + */ +LZ4_FORCE_INLINE +void LZ4_wildCopy8(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; + + do { + LZ4_memcpy(d, s, 8); + d += 8; + s += 8; + } while (d < e); +} + +static const unsigned inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4}; +static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3}; + +#ifndef LZ4_FAST_DEC_LOOP +#if defined __i386__ || defined _M_IX86 || defined __x86_64__ || defined _M_X64 +#define LZ4_FAST_DEC_LOOP 1 +#elif defined(__aarch64__) +#if defined(__clang__) && defined(__ANDROID__) +/* On Android aarch64, we disable this optimization for clang because + * on certain mobile chipsets, performance is reduced with clang. 
For + * more information refer to https://github.com/lz4/lz4/pull/707 */ +#define LZ4_FAST_DEC_LOOP 0 +#else +#define LZ4_FAST_DEC_LOOP 1 +#endif +#else +#define LZ4_FAST_DEC_LOOP 0 +#endif +#endif + +#if LZ4_FAST_DEC_LOOP + +LZ4_FORCE_INLINE void LZ4_memcpy_using_offset_base(BYTE* dstPtr, + const BYTE* srcPtr, + BYTE* dstEnd, + const size_t offset) +{ + assert(srcPtr + offset == dstPtr); + if (offset < 8) { + LZ4_write32(dstPtr, 0); /* silence an msan warning when offset==0 */ + dstPtr[0] = srcPtr[0]; + dstPtr[1] = srcPtr[1]; + dstPtr[2] = srcPtr[2]; + dstPtr[3] = srcPtr[3]; + srcPtr += inc32table[offset]; + LZ4_memcpy(dstPtr + 4, srcPtr, 4); + srcPtr -= dec64table[offset]; + dstPtr += 8; + } else { + LZ4_memcpy(dstPtr, srcPtr, 8); + dstPtr += 8; + srcPtr += 8; + } + + LZ4_wildCopy8(dstPtr, srcPtr, dstEnd); +} + +/* customized variant of memcpy, which can overwrite up to 32 bytes beyond + * dstEnd this version copies two times 16 bytes (instead of one time 32 bytes) + * because it must be compatible with offsets >= 16. */ +LZ4_FORCE_INLINE void +LZ4_wildCopy32(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* const e = (BYTE*)dstEnd; + + do { + LZ4_memcpy(d, s, 16); + LZ4_memcpy(d + 16, s + 16, 16); + d += 32; + s += 32; + } while (d < e); +} + +/* LZ4_memcpy_using_offset() presumes : + * - dstEnd >= dstPtr + MINMATCH + * - there is at least 12 bytes available to write after dstEnd */ +LZ4_FORCE_INLINE void LZ4_memcpy_using_offset(BYTE* dstPtr, + const BYTE* srcPtr, + BYTE* dstEnd, + const size_t offset) +{ + BYTE v[8]; + + assert(dstEnd >= dstPtr + MINMATCH); + + switch (offset) { + case 1: + MEM_INIT(v, *srcPtr, 8); + break; + case 2: + LZ4_memcpy(v, srcPtr, 2); + LZ4_memcpy(&v[2], srcPtr, 2); +#if defined(_MSC_VER) && (_MSC_VER <= 1937) /* MSVC 2022 ver 17.7 or earlier \ + */ +#pragma warning(push) +#pragma warning( \ + disable : 6385) /* warning C6385: Reading invalid data from 'v'. */ +#endif + LZ4_memcpy(&v[4], v, 4); +#if defined(_MSC_VER) && (_MSC_VER <= 1937) /* MSVC 2022 ver 17.7 or earlier \ + */ +#pragma warning(pop) +#endif + break; + case 4: + LZ4_memcpy(v, srcPtr, 4); + LZ4_memcpy(&v[4], srcPtr, 4); + break; + default: + LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset); + return; + } + + LZ4_memcpy(dstPtr, v, 8); + dstPtr += 8; + while (dstPtr < dstEnd) { + LZ4_memcpy(dstPtr, v, 8); + dstPtr += 8; + } +} +#endif + +/*-************************************ + * Common functions + **************************************/ +static unsigned LZ4_NbCommonBytes(reg_t val) +{ + assert(val != 0); + if (LZ4_isLittleEndian()) { + if (sizeof(val) == 8) { +#if defined(_MSC_VER) && (_MSC_VER >= 1800) && \ + (defined(_M_AMD64) && !defined(_M_ARM64EC)) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) +/*-************************************************************************************************* + * ARM64EC is a Microsoft-designed ARM64 ABI compatible with AMD64 applications + *on ARM64 Windows 11. The ARM64EC ABI does not support AVX/AVX2/AVX512 + *instructions, nor their relevant intrinsics including _tzcnt_u64. Therefore, + *we need to neuter the _tzcnt_u64 code path for ARM64EC. + ****************************************************************************************************/ +#if defined(__clang__) && (__clang_major__ < 10) + /* Avoid undefined clang-cl intrinsics issue. + * See https://github.com/lz4/lz4/pull/1017 for details. 
*/ + return (unsigned)__builtin_ia32_tzcnt_u64(val) >> 3; +#else + /* x64 CPUS without BMI support interpret `TZCNT` as `REP BSF` */ + return (unsigned)_tzcnt_u64(val) >> 3; +#endif +#elif defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward64(&r, (U64)val); + return (unsigned)r >> 3; +#elif (defined(__clang__) || \ + (defined(__GNUC__) && \ + ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_ctzll((U64)val) >> 3; +#else + const U64 m = 0x0101010101010101ULL; + val ^= val - 1; + return (unsigned)(((U64)((val & (m - 1)) * m)) >> 56); +#endif + } else /* 32 bits */ { +#if defined(_MSC_VER) && (_MSC_VER >= 1400) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanForward(&r, (U32)val); + return (unsigned)r >> 3; +#elif (defined(__clang__) || \ + (defined(__GNUC__) && \ + ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(__TINYC__) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_ctz((U32)val) >> 3; +#else + const U32 m = 0x01010101; + return (unsigned)((((val - 1) ^ val) & (m - 1)) * m) >> 24; +#endif + } + } else /* Big Endian CPU */ { + if (sizeof(val) == 8) { +#if (defined(__clang__) || \ + (defined(__GNUC__) && \ + ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(__TINYC__) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_clzll((U64)val) >> 3; +#else +#if 1 + /* this method is probably faster, + * but adds a 128 bytes lookup table */ + static const unsigned char ctz7_tab[128] = { + 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, + 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, + 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, + 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, + 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, + 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, + }; + U64 const mask = 0x0101010101010101ULL; + U64 const t = (((val >> 8) - mask) | val) & mask; + return ctz7_tab[(t * 0x0080402010080402ULL) >> 57]; +#else + /* this method doesn't consume memory space like the previous one, + * but it contains several branches, + * that may end up slowing execution */ + static const U32 by32 = + sizeof(val) * 4; /* 32 on 64 bits (goal), 16 on 32 bits. +Just to avoid some static analyzer complaining about shift by 32 on 32-bits +target. Note that this code path is never triggered in 32-bits mode. 
*/ + unsigned r; + if (!(val >> by32)) { + r = 4; + } else { + r = 0; + val >>= by32; + } + if (!(val >> 16)) { + r += 2; + val >>= 8; + } else { + val >>= 24; + } + r += (!val); + return r; +#endif +#endif + } else /* 32 bits */ { +#if (defined(__clang__) || \ + (defined(__GNUC__) && \ + ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \ + !defined(LZ4_FORCE_SW_BITCOUNT) + return (unsigned)__builtin_clz((U32)val) >> 3; +#else + val >>= 8; + val = + ((((val + 0x00FFFF00) | 0x00FFFFFF) + val) | (val + 0x00FF0000)) >> + 24; + return (unsigned)val ^ 3; +#endif + } + } +} + +#define STEPSIZE sizeof(reg_t) +LZ4_FORCE_INLINE +unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit) +{ + const BYTE* const pStart = pIn; + + if (likely(pIn < pInLimit - (STEPSIZE - 1))) { + reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); + if (!diff) { + pIn += STEPSIZE; + pMatch += STEPSIZE; + } else { + return LZ4_NbCommonBytes(diff); + } + } + + while (likely(pIn < pInLimit - (STEPSIZE - 1))) { + reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); + if (!diff) { + pIn += STEPSIZE; + pMatch += STEPSIZE; + continue; + } + pIn += LZ4_NbCommonBytes(diff); + return (unsigned)(pIn - pStart); + } + + if ((STEPSIZE == 8) && (pIn < (pInLimit - 3)) && + (LZ4_read32(pMatch) == LZ4_read32(pIn))) { + pIn += 4; + pMatch += 4; + } + if ((pIn < (pInLimit - 1)) && (LZ4_read16(pMatch) == LZ4_read16(pIn))) { + pIn += 2; + pMatch += 2; + } + if ((pIn < pInLimit) && (*pMatch == *pIn)) + pIn++; + return (unsigned)(pIn - pStart); +} + +#ifndef LZ4_COMMONDEFS_ONLY +/*-************************************ + * Local Constants + **************************************/ +static const int LZ4_64Klimit = ((64 KB) + (MFLIMIT - 1)); +static const U32 LZ4_skipTrigger = 6; /* Increase this value ==> compression run + slower on incompressible data */ + +/*-************************************ + * Local Structures and types + **************************************/ +typedef enum { clearedTable = 0, byPtr, byU32, byU16 } tableType_t; + +/** + * This enum distinguishes several different modes of accessing previous + * content in the stream. + * + * - noDict : There is no preceding content. + * - withPrefix64k : Table entries up to ctx->dictSize before the current blob + * blob being compressed are valid and refer to the preceding + * content (of length ctx->dictSize), which is available + * contiguously preceding in memory the content currently + * being compressed. + * - usingExtDict : Like withPrefix64k, but the preceding content is somewhere + * else in memory, starting at ctx->dictionary with length + * ctx->dictSize. + * - usingDictCtx : Everything concerning the preceding content is + * in a separate context, pointed to by ctx->dictCtx. + * ctx->dictionary, ctx->dictSize, and table entries + * in the current context that refer to positions + * preceding the beginning of the current compression are + * ignored. Instead, ctx->dictCtx->dictionary and ctx->dictCtx + * ->dictSize describe the location and size of the preceding + * content, and matches are found by looking in the ctx + * ->dictCtx->hashTable. 
+ */ +typedef enum { + noDict = 0, + withPrefix64k, + usingExtDict, + usingDictCtx +} dict_directive; +typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; + +/*-************************************ + * Local Utils + **************************************/ +int LZ4_versionNumber(void) { return LZ4_VERSION_NUMBER; } +const char* LZ4_versionString(void) { return LZ4_VERSION_STRING; } +int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } +int LZ4_sizeofState(void) { return sizeof(LZ4_stream_t); } + +/*-**************************************** + * Internal Definitions, used only in Tests + *******************************************/ +#if defined(__cplusplus) +extern "C" { +#endif + +int LZ4_compress_forceExtDict(LZ4_stream_t* LZ4_dict, + const char* source, + char* dest, + int srcSize); + +int LZ4_decompress_safe_forceExtDict(const char* source, + char* dest, + int compressedSize, + int maxOutputSize, + const void* dictStart, + size_t dictSize); +int LZ4_decompress_safe_partial_forceExtDict(const char* source, + char* dest, + int compressedSize, + int targetOutputSize, + int dstCapacity, + const void* dictStart, + size_t dictSize); +#if defined(__cplusplus) +} +#endif + +/*-****************************** + * Compression functions + ********************************/ +LZ4_FORCE_INLINE U32 LZ4_hash4(U32 sequence, tableType_t const tableType) +{ + if (tableType == byU16) + return ((sequence * 2654435761U) >> ((MINMATCH * 8) - (LZ4_HASHLOG + 1))); + else + return ((sequence * 2654435761U) >> ((MINMATCH * 8) - LZ4_HASHLOG)); +} + +LZ4_FORCE_INLINE U32 LZ4_hash5(U64 sequence, tableType_t const tableType) +{ + const U32 hashLog = (tableType == byU16) ? LZ4_HASHLOG + 1 : LZ4_HASHLOG; + if (LZ4_isLittleEndian()) { + const U64 prime5bytes = 889523592379ULL; + return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); + } else { + const U64 prime8bytes = 11400714785074694791ULL; + return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); + } +} + +LZ4_FORCE_INLINE U32 LZ4_hashPosition(const void* const p, + tableType_t const tableType) +{ + if ((sizeof(reg_t) == 8) && (tableType != byU16)) + return LZ4_hash5(LZ4_read_ARCH(p), tableType); + +#ifdef LZ4_STATIC_LINKING_ONLY_ENDIANNESS_INDEPENDENT_OUTPUT + return LZ4_hash4(LZ4_readLE32(p), tableType); +#else + return LZ4_hash4(LZ4_read32(p), tableType); +#endif +} + +LZ4_FORCE_INLINE void +LZ4_clearHash(U32 h, void* tableBase, tableType_t const tableType) +{ + switch (tableType) { + default: /* fallthrough */ + case clearedTable: { /* illegal! */ + assert(0); + return; + } + case byPtr: { + const BYTE** hashTable = (const BYTE**)tableBase; + hashTable[h] = NULL; + return; + } + case byU32: { + U32* hashTable = (U32*)tableBase; + hashTable[h] = 0; + return; + } + case byU16: { + U16* hashTable = (U16*)tableBase; + hashTable[h] = 0; + return; + } + } +} + +LZ4_FORCE_INLINE void +LZ4_putIndexOnHash(U32 idx, U32 h, void* tableBase, tableType_t const tableType) +{ + switch (tableType) { + default: /* fallthrough */ + case clearedTable: /* fallthrough */ + case byPtr: { /* illegal! 
*/ + assert(0); + return; + } + case byU32: { + U32* hashTable = (U32*)tableBase; + hashTable[h] = idx; + return; + } + case byU16: { + U16* hashTable = (U16*)tableBase; + assert(idx < 65536); + hashTable[h] = (U16)idx; + return; + } + } +} + +/* LZ4_putPosition*() : only used in byPtr mode */ +LZ4_FORCE_INLINE void LZ4_putPositionOnHash(const BYTE* p, + U32 h, + void* tableBase, + tableType_t const tableType) +{ + const BYTE** const hashTable = (const BYTE**)tableBase; + assert(tableType == byPtr); + (void)tableType; + hashTable[h] = p; +} + +LZ4_FORCE_INLINE void +LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType) +{ + U32 const h = LZ4_hashPosition(p, tableType); + LZ4_putPositionOnHash(p, h, tableBase, tableType); +} + +/* LZ4_getIndexOnHash() : + * Index of match position registered in hash table. + * hash position must be calculated by using base+index, or dictBase+index. + * Assumption 1 : only valid if tableType == byU32 or byU16. + * Assumption 2 : h is presumed valid (within limits of hash table) + */ +LZ4_FORCE_INLINE U32 LZ4_getIndexOnHash(U32 h, + const void* tableBase, + tableType_t tableType) +{ + LZ4_STATIC_ASSERT(LZ4_MEMORY_USAGE > 2); + if (tableType == byU32) { + const U32* const hashTable = (const U32*)tableBase; + assert(h < (1U << (LZ4_MEMORY_USAGE - 2))); + return hashTable[h]; + } + if (tableType == byU16) { + const U16* const hashTable = (const U16*)tableBase; + assert(h < (1U << (LZ4_MEMORY_USAGE - 1))); + return hashTable[h]; + } + assert(0); + return 0; /* forbidden case */ +} + +static const BYTE* +LZ4_getPositionOnHash(U32 h, const void* tableBase, tableType_t tableType) +{ + assert(tableType == byPtr); + (void)tableType; + { + const BYTE* const* hashTable = (const BYTE* const*)tableBase; + return hashTable[h]; + } +} + +LZ4_FORCE_INLINE const BYTE* +LZ4_getPosition(const BYTE* p, const void* tableBase, tableType_t tableType) +{ + U32 const h = LZ4_hashPosition(p, tableType); + return LZ4_getPositionOnHash(h, tableBase, tableType); +} + +LZ4_FORCE_INLINE void LZ4_prepareTable(LZ4_stream_t_internal* const cctx, + const int inputSize, + const tableType_t tableType) +{ + /* If the table hasn't been used, it's guaranteed to be zeroed out, and is + * therefore safe to use no matter what mode we're in. Otherwise, we figure + * out if it's safe to leave as is or whether it needs to be reset. + */ + if ((tableType_t)cctx->tableType != clearedTable) { + assert(inputSize >= 0); + if ((tableType_t)cctx->tableType != tableType || + ((tableType == byU16) && + cctx->currentOffset + (unsigned)inputSize >= 0xFFFFU) || + ((tableType == byU32) && cctx->currentOffset > 1 GB) || + tableType == byPtr || inputSize >= 4 KB) { + DEBUGLOG(4, "LZ4_prepareTable: Resetting table in %p", (void*)cctx); + MEM_INIT(cctx->hashTable, 0, LZ4_HASHTABLESIZE); + cctx->currentOffset = 0; + cctx->tableType = (U32)clearedTable; + } else { + DEBUGLOG(4, "LZ4_prepareTable: Re-use hash table (no reset)"); + } + } + + /* Adding a gap, so all previous entries are > LZ4_DISTANCE_MAX back, + * is faster than compressing without a gap. + * However, compressing with currentOffset == 0 is faster still, + * so we preserve that case. 
+ */ + if (cctx->currentOffset != 0 && tableType == byU32) { + DEBUGLOG(5, "LZ4_prepareTable: adding 64KB to currentOffset"); + cctx->currentOffset += 64 KB; + } + + /* Finally, clear history */ + cctx->dictCtx = NULL; + cctx->dictionary = NULL; + cctx->dictSize = 0; +} + +/** LZ4_compress_generic_validated() : + * inlined, to ensure branches are decided at compilation time. + * The following conditions are presumed already validated: + * - source != NULL + * - inputSize > 0 + */ +LZ4_FORCE_INLINE int LZ4_compress_generic_validated( + LZ4_stream_t_internal* const cctx, + const char* const source, + char* const dest, + const int inputSize, + int* inputConsumed, /* only written when outputDirective == fillOutput */ + const int maxOutputSize, + const limitedOutput_directive outputDirective, + const tableType_t tableType, + const dict_directive dictDirective, + const dictIssue_directive dictIssue, + const int acceleration) +{ + int result; + const BYTE* ip = (const BYTE*)source; + + U32 const startIndex = cctx->currentOffset; + const BYTE* base = (const BYTE*)source - startIndex; + const BYTE* lowLimit; + + const LZ4_stream_t_internal* dictCtx = + (const LZ4_stream_t_internal*)cctx->dictCtx; + const BYTE* const dictionary = + dictDirective == usingDictCtx ? dictCtx->dictionary : cctx->dictionary; + const U32 dictSize = + dictDirective == usingDictCtx ? dictCtx->dictSize : cctx->dictSize; + const U32 dictDelta = (dictDirective == usingDictCtx) + ? startIndex - dictCtx->currentOffset + : 0; /* make indexes in dictCtx comparable with + indexes in current context */ + + int const maybe_extMem = + (dictDirective == usingExtDict) || (dictDirective == usingDictCtx); + U32 const prefixIdxLimit = + startIndex - dictSize; /* used when dictDirective == dictSmall */ + const BYTE* const dictEnd = dictionary ? dictionary + dictSize : dictionary; + const BYTE* anchor = (const BYTE*)source; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimitPlusOne = iend - MFLIMIT + 1; + const BYTE* const matchlimit = iend - LASTLITERALS; + + /* the dictCtx currentOffset is indexed on the start of the dictionary, + * while a dictionary in the current context precedes the currentOffset */ + const BYTE* dictBase = (dictionary == NULL) ? NULL + : (dictDirective == usingDictCtx) + ? dictionary + dictSize - dictCtx->currentOffset + : dictionary + dictSize - startIndex; + + BYTE* op = (BYTE*)dest; + BYTE* const olimit = op + maxOutputSize; + + U32 offset = 0; + U32 forwardH; + + DEBUGLOG(5, "LZ4_compress_generic_validated: srcSize=%i, tableType=%u", + inputSize, tableType); + assert(ip != NULL); + if (tableType == byU16) + assert(inputSize < + LZ4_64Klimit); /* Size too large (not within 64K limit) */ + if (tableType == byPtr) + assert(dictDirective == noDict); /* only supported use case with byPtr */ + /* If init conditions are not met, we don't have to mark stream + * as having dirty context, since no action was taken yet */ + if (outputDirective == fillOutput && maxOutputSize < 1) { + return 0; + } /* Impossible to store anything */ + assert(acceleration >= 1); + + lowLimit = + (const BYTE*)source - (dictDirective == withPrefix64k ? dictSize : 0); + + /* Update context state */ + if (dictDirective == usingDictCtx) { + /* Subsequent linked blocks can't use the dictionary. */ + /* Instead, they use the block we just compressed. 
*/ + cctx->dictCtx = NULL; + cctx->dictSize = (U32)inputSize; + } else { + cctx->dictSize += (U32)inputSize; + } + cctx->currentOffset += (U32)inputSize; + cctx->tableType = (U32)tableType; + + if (inputSize < LZ4_minLength) + goto _last_literals; /* Input too small, no compression (all literals) */ + + /* First Byte */ + { + U32 const h = LZ4_hashPosition(ip, tableType); + if (tableType == byPtr) { + LZ4_putPositionOnHash(ip, h, cctx->hashTable, byPtr); + } else { + LZ4_putIndexOnHash(startIndex, h, cctx->hashTable, tableType); + } + } + ip++; + forwardH = LZ4_hashPosition(ip, tableType); + + /* Main Loop */ + for (;;) { + const BYTE* match; + BYTE* token; + const BYTE* filledIp; + + /* Find a match */ + if (tableType == byPtr) { + const BYTE* forwardIp = ip; + int step = 1; + int searchMatchNb = acceleration << LZ4_skipTrigger; + do { + U32 const h = forwardH; + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimitPlusOne)) + goto _last_literals; + assert(ip < mflimitPlusOne); + + match = LZ4_getPositionOnHash(h, cctx->hashTable, tableType); + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putPositionOnHash(ip, h, cctx->hashTable, tableType); + + } while ((match + LZ4_DISTANCE_MAX < ip) || + (LZ4_read32(match) != LZ4_read32(ip))); + + } else { /* byU32, byU16 */ + + const BYTE* forwardIp = ip; + int step = 1; + int searchMatchNb = acceleration << LZ4_skipTrigger; + do { + U32 const h = forwardH; + U32 const current = (U32)(forwardIp - base); + U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType); + assert(matchIndex <= current); + assert(forwardIp - base < (ptrdiff_t)(2 GB - 1)); + ip = forwardIp; + forwardIp += step; + step = (searchMatchNb++ >> LZ4_skipTrigger); + + if (unlikely(forwardIp > mflimitPlusOne)) + goto _last_literals; + assert(ip < mflimitPlusOne); + + if (dictDirective == usingDictCtx) { + if (matchIndex < startIndex) { + /* there was no match, try the dictionary */ + assert(tableType == byU32); + matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32); + match = dictBase + matchIndex; + matchIndex += dictDelta; /* make dictCtx index comparable with + current context */ + lowLimit = dictionary; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; + } + } else if (dictDirective == usingExtDict) { + if (matchIndex < startIndex) { + DEBUGLOG( + 7, "extDict candidate: matchIndex=%5u < startIndex=%5u", + matchIndex, startIndex); + assert(startIndex - matchIndex >= MINMATCH); + assert(dictBase); + match = dictBase + matchIndex; + lowLimit = dictionary; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; + } + } else { /* single continuous memory segment */ + match = base + matchIndex; + } + forwardH = LZ4_hashPosition(forwardIp, tableType); + LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); + + DEBUGLOG(7, "candidate at pos=%u (offset=%u \n", matchIndex, + current - matchIndex); + if ((dictIssue == dictSmall) && (matchIndex < prefixIdxLimit)) { + continue; + } /* match outside of valid area */ + assert(matchIndex < current); + if (((tableType != byU16) || + (LZ4_DISTANCE_MAX < LZ4_DISTANCE_ABSOLUTE_MAX)) && + (matchIndex + LZ4_DISTANCE_MAX < current)) { + continue; + } /* too far */ + assert((current - matchIndex) <= + LZ4_DISTANCE_MAX); /* match now expected within distance */ + + if (LZ4_read32(match) == LZ4_read32(ip)) { + if (maybe_extMem) + offset = current - matchIndex; + break; /* match found */ + } + + } while (1); + } + + 
/* Catch up */ + filledIp = ip; + assert(ip > anchor); /* this is always true as ip has been advanced before + entering the main loop */ + if ((match > lowLimit) && unlikely(ip[-1] == match[-1])) { + do { + ip--; + match--; + } while (((ip > anchor) & (match > lowLimit)) && + (unlikely(ip[-1] == match[-1]))); + } + + /* Encode Literals */ + { + unsigned const litLength = (unsigned)(ip - anchor); + token = op++; + if ((outputDirective == + limitedOutput) && /* Check output buffer overflow */ + (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + + (litLength / 255) > + olimit))) { + return 0; /* cannot compress within `dst` budget. Stored indexes in + hash table are nonetheless fine */ + } + if ((outputDirective == fillOutput) && + (unlikely(op + (litLength + 240) / 255 /* litlen */ + + litLength /* literals */ + 2 /* offset */ + + 1 /* token */ + MFLIMIT - + MINMATCH /* min last literals so last match is <= end + - MFLIMIT */ + > olimit))) { + op--; + goto _last_literals; + } + if (litLength >= RUN_MASK) { + unsigned len = litLength - RUN_MASK; + *token = (RUN_MASK << ML_BITS); + for (; len >= 255; len -= 255) + *op++ = 255; + *op++ = (BYTE)len; + } else + *token = (BYTE)(litLength << ML_BITS); + + /* Copy Literals */ + LZ4_wildCopy8(op, anchor, op + litLength); + op += litLength; + DEBUGLOG(6, "seq.start:%i, literals=%u, match.start:%i", + (int)(anchor - (const BYTE*)source), litLength, + (int)(ip - (const BYTE*)source)); + } + + _next_match: + /* at this stage, the following variables must be correctly set : + * - ip : at start of LZ operation + * - match : at start of previous pattern occurrence; can be within + * current prefix, or within extDict + * - offset : if maybe_ext_memSegment==1 (constant) + * - lowLimit : must be == dictionary to mean "match is within extDict"; + * must be == source otherwise + * - token and *token : position to write 4-bits for match length; higher + * 4-bits for literal length supposed already written + */ + + if ((outputDirective == fillOutput) && + (op + 2 /* offset */ + 1 /* token */ + MFLIMIT - + MINMATCH /* min last literals so last match is <= end - MFLIMIT */ + > olimit)) { + /* the match was too close to the end, rewind and go to last literals + */ + op = token; + goto _last_literals; + } + + /* Encode Offset */ + if (maybe_extMem) { /* static test */ + DEBUGLOG(6, " with offset=%u (ext if > %i)", offset, + (int)(ip - (const BYTE*)source)); + assert(offset <= LZ4_DISTANCE_MAX && offset > 0); + LZ4_writeLE16(op, (U16)offset); + op += 2; + } else { + DEBUGLOG(6, " with offset=%u (same segment)", + (U32)(ip - match)); + assert(ip - match <= LZ4_DISTANCE_MAX); + LZ4_writeLE16(op, (U16)(ip - match)); + op += 2; + } + + /* Encode MatchLength */ + { + unsigned matchCode; + + if ((dictDirective == usingExtDict || dictDirective == usingDictCtx) && + (lowLimit == dictionary) /* match within extDict */) { + const BYTE* limit = ip + (dictEnd - match); + assert(dictEnd > match); + if (limit > matchlimit) + limit = matchlimit; + matchCode = LZ4_count(ip + MINMATCH, match + MINMATCH, limit); + ip += (size_t)matchCode + MINMATCH; + if (ip == limit) { + unsigned const more = + LZ4_count(limit, (const BYTE*)source, matchlimit); + matchCode += more; + ip += more; + } + DEBUGLOG(6, " with matchLength=%u starting in extDict", + matchCode + MINMATCH); + } else { + matchCode = LZ4_count(ip + MINMATCH, match + MINMATCH, matchlimit); + ip += (size_t)matchCode + MINMATCH; + DEBUGLOG(6, " with matchLength=%u", + matchCode + MINMATCH); + } + + if ((outputDirective) && /* Check 
output buffer overflow */ + (unlikely(op + (1 + LASTLITERALS) + (matchCode + 240) / 255 > + olimit))) { + if (outputDirective == fillOutput) { + /* Match description too long : reduce it */ + U32 newMatchCode = 15 /* in token */ - + 1 /* to avoid needing a zero byte */ + + ((U32)(olimit - op) - 1 - LASTLITERALS) * 255; + ip -= matchCode - newMatchCode; + assert(newMatchCode < matchCode); + matchCode = newMatchCode; + if (unlikely(ip <= filledIp)) { + /* We have already filled up to filledIp so if ip ends up less + * than filledIp we have positions in the hash table beyond + * the current position. This is a problem if we reuse the + * hash table. So we have to remove these positions from the + * hash table. + */ + const BYTE* ptr; + DEBUGLOG(5, "Clearing %u positions", (U32)(filledIp - ip)); + for (ptr = ip; ptr <= filledIp; ++ptr) { + U32 const h = LZ4_hashPosition(ptr, tableType); + LZ4_clearHash(h, cctx->hashTable, tableType); + } + } + } else { + assert(outputDirective == limitedOutput); + return 0; /* cannot compress within `dst` budget. Stored indexes + in hash table are nonetheless fine */ + } + } + if (matchCode >= ML_MASK) { + *token += ML_MASK; + matchCode -= ML_MASK; + LZ4_write32(op, 0xFFFFFFFF); + while (matchCode >= 4 * 255) { + op += 4; + LZ4_write32(op, 0xFFFFFFFF); + matchCode -= 4 * 255; + } + op += matchCode / 255; + *op++ = (BYTE)(matchCode % 255); + } else + *token += (BYTE)(matchCode); + } + /* Ensure we have enough space for the last literals. */ + assert( + !(outputDirective == fillOutput && op + 1 + LASTLITERALS > olimit)); + + anchor = ip; + + /* Test end of chunk */ + if (ip >= mflimitPlusOne) + break; + + /* Fill table */ + { + U32 const h = LZ4_hashPosition(ip - 2, tableType); + if (tableType == byPtr) { + LZ4_putPositionOnHash(ip - 2, h, cctx->hashTable, byPtr); + } else { + U32 const idx = (U32)((ip - 2) - base); + LZ4_putIndexOnHash(idx, h, cctx->hashTable, tableType); + } + } + + /* Test next position */ + if (tableType == byPtr) { + + match = LZ4_getPosition(ip, cctx->hashTable, tableType); + LZ4_putPosition(ip, cctx->hashTable, tableType); + if ((match + LZ4_DISTANCE_MAX >= ip) && + (LZ4_read32(match) == LZ4_read32(ip))) { + token = op++; + *token = 0; + goto _next_match; + } + + } else { /* byU32, byU16 */ + + U32 const h = LZ4_hashPosition(ip, tableType); + U32 const current = (U32)(ip - base); + U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType); + assert(matchIndex < current); + if (dictDirective == usingDictCtx) { + if (matchIndex < startIndex) { + /* there was no match, try the dictionary */ + assert(tableType == byU32); + matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32); + match = dictBase + matchIndex; + lowLimit = dictionary; /* required for match length counter */ + matchIndex += dictDelta; + } else { + match = base + matchIndex; + lowLimit = + (const BYTE*)source; /* required for match length counter */ + } + } else if (dictDirective == usingExtDict) { + if (matchIndex < startIndex) { + assert(dictBase); + match = dictBase + matchIndex; + lowLimit = dictionary; /* required for match length counter */ + } else { + match = base + matchIndex; + lowLimit = + (const BYTE*)source; /* required for match length counter */ + } + } else { /* single memory segment */ + match = base + matchIndex; + } + LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); + assert(matchIndex < current); + if (((dictIssue == dictSmall) ? 
(matchIndex >= prefixIdxLimit) : 1) && + (((tableType == byU16) && + (LZ4_DISTANCE_MAX == LZ4_DISTANCE_ABSOLUTE_MAX)) + ? 1 + : (matchIndex + LZ4_DISTANCE_MAX >= current)) && + (LZ4_read32(match) == LZ4_read32(ip))) { + token = op++; + *token = 0; + if (maybe_extMem) + offset = current - matchIndex; + DEBUGLOG(6, "seq.start:%i, literals=%u, match.start:%i", + (int)(anchor - (const BYTE*)source), 0, + (int)(ip - (const BYTE*)source)); + goto _next_match; + } + } + + /* Prepare next loop */ + forwardH = LZ4_hashPosition(++ip, tableType); + } + +_last_literals: + /* Encode Last Literals */ + { + size_t lastRun = (size_t)(iend - anchor); + if ((outputDirective) && /* Check output buffer overflow */ + (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) > olimit)) { + if (outputDirective == fillOutput) { + /* adapt lastRun to fill 'dst' */ + assert(olimit >= op); + lastRun = (size_t)(olimit - op) - 1 /*token*/; + lastRun -= + (lastRun + 256 - RUN_MASK) / 256; /*additional length tokens*/ + } else { + assert(outputDirective == limitedOutput); + return 0; /* cannot compress within `dst` budget. Stored indexes in + hash table are nonetheless fine */ + } + } + DEBUGLOG(6, "Final literal run : %i literals", (int)lastRun); + if (lastRun >= RUN_MASK) { + size_t accumulator = lastRun - RUN_MASK; + *op++ = RUN_MASK << ML_BITS; + for (; accumulator >= 255; accumulator -= 255) + *op++ = 255; + *op++ = (BYTE)accumulator; + } else { + *op++ = (BYTE)(lastRun << ML_BITS); + } + LZ4_memcpy(op, anchor, lastRun); + ip = anchor + lastRun; + op += lastRun; + } + + if (outputDirective == fillOutput) { + *inputConsumed = (int)(((const char*)ip) - source); + } + result = (int)(((char*)op) - dest); + assert(result > 0); + DEBUGLOG(5, "LZ4_compress_generic: compressed %i bytes into %i bytes", + inputSize, result); + return result; +} + +/** LZ4_compress_generic() : + * inlined, to ensure branches are decided at compilation time; + * takes care of src == (NULL, 0) + * and forward the rest to LZ4_compress_generic_validated */ +LZ4_FORCE_INLINE int LZ4_compress_generic( + LZ4_stream_t_internal* const cctx, + const char* const src, + char* const dst, + const int srcSize, + int* inputConsumed, /* only written when outputDirective == fillOutput */ + const int dstCapacity, + const limitedOutput_directive outputDirective, + const tableType_t tableType, + const dict_directive dictDirective, + const dictIssue_directive dictIssue, + const int acceleration) +{ + DEBUGLOG(5, "LZ4_compress_generic: srcSize=%i, dstCapacity=%i", srcSize, + dstCapacity); + + if ((U32)srcSize > (U32)LZ4_MAX_INPUT_SIZE) { + return 0; + } /* Unsupported srcSize, too large (or negative) */ + if (srcSize == 0) { /* src == NULL supported if srcSize == 0 */ + if (outputDirective != notLimited && dstCapacity <= 0) + return 0; /* no output, can't write anything */ + DEBUGLOG(5, "Generating an empty block"); + assert(outputDirective == notLimited || dstCapacity >= 1); + assert(dst != NULL); + dst[0] = 0; + if (outputDirective == fillOutput) { + assert(inputConsumed != NULL); + *inputConsumed = 0; + } + return 1; + } + assert(src != NULL); + + return LZ4_compress_generic_validated( + cctx, src, dst, srcSize, + inputConsumed, /* only written into if outputDirective == fillOutput */ + dstCapacity, outputDirective, tableType, dictDirective, dictIssue, + acceleration); +} + +int LZ4_compress_fast_extState(void* state, + const char* source, + char* dest, + int inputSize, + int maxOutputSize, + int acceleration) +{ + LZ4_stream_t_internal* const ctx = + 
&LZ4_initStream(state, sizeof(LZ4_stream_t))->internal_donotuse; + assert(ctx != NULL); + if (acceleration < 1) + acceleration = LZ4_ACCELERATION_DEFAULT; + if (acceleration > LZ4_ACCELERATION_MAX) + acceleration = LZ4_ACCELERATION_MAX; + if (maxOutputSize >= LZ4_compressBound(inputSize)) { + if (inputSize < LZ4_64Klimit) { + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, + notLimited, byU16, noDict, noDictIssue, + acceleration); + } else { + const tableType_t tableType = + ((sizeof(void*) == 4) && ((uptrval)source > LZ4_DISTANCE_MAX)) + ? byPtr + : byU32; + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, + notLimited, tableType, noDict, noDictIssue, + acceleration); + } + } else { + if (inputSize < LZ4_64Klimit) { + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, + maxOutputSize, limitedOutput, byU16, + noDict, noDictIssue, acceleration); + } else { + const tableType_t tableType = + ((sizeof(void*) == 4) && ((uptrval)source > LZ4_DISTANCE_MAX)) + ? byPtr + : byU32; + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, + maxOutputSize, limitedOutput, tableType, + noDict, noDictIssue, acceleration); + } + } +} + +/** + * LZ4_compress_fast_extState_fastReset() : + * A variant of LZ4_compress_fast_extState(). + * + * Using this variant avoids an expensive initialization step. It is only safe + * to call if the state buffer is known to be correctly initialized already + * (see comment in lz4.h on LZ4_resetStream_fast() for a definition of + * "correctly initialized"). + */ +int LZ4_compress_fast_extState_fastReset(void* state, + const char* src, + char* dst, + int srcSize, + int dstCapacity, + int acceleration) +{ + LZ4_stream_t_internal* const ctx = + &((LZ4_stream_t*)state)->internal_donotuse; + if (acceleration < 1) + acceleration = LZ4_ACCELERATION_DEFAULT; + if (acceleration > LZ4_ACCELERATION_MAX) + acceleration = LZ4_ACCELERATION_MAX; + assert(ctx != NULL); + + if (dstCapacity >= LZ4_compressBound(srcSize)) { + if (srcSize < LZ4_64Klimit) { + const tableType_t tableType = byU16; + LZ4_prepareTable(ctx, srcSize, tableType); + if (ctx->currentOffset) { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, + notLimited, tableType, noDict, + dictSmall, acceleration); + } else { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, + notLimited, tableType, noDict, + noDictIssue, acceleration); + } + } else { + const tableType_t tableType = + ((sizeof(void*) == 4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr + : byU32; + LZ4_prepareTable(ctx, srcSize, tableType); + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, + notLimited, tableType, noDict, noDictIssue, + acceleration); + } + } else { + if (srcSize < LZ4_64Klimit) { + const tableType_t tableType = byU16; + LZ4_prepareTable(ctx, srcSize, tableType); + if (ctx->currentOffset) { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, + dstCapacity, limitedOutput, tableType, + noDict, dictSmall, acceleration); + } else { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, + dstCapacity, limitedOutput, tableType, + noDict, noDictIssue, acceleration); + } + } else { + const tableType_t tableType = + ((sizeof(void*) == 4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? 
byPtr + : byU32; + LZ4_prepareTable(ctx, srcSize, tableType); + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, + limitedOutput, tableType, noDict, + noDictIssue, acceleration); + } + } +} + +int LZ4_compress_fast( + const char* src, char* dest, int srcSize, int dstCapacity, int acceleration) +{ + int result; +#if (LZ4_HEAPMODE) + LZ4_stream_t* const ctxPtr = (LZ4_stream_t*)ALLOC( + sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ + if (ctxPtr == NULL) + return 0; +#else + LZ4_stream_t ctx; + LZ4_stream_t* const ctxPtr = &ctx; +#endif + result = LZ4_compress_fast_extState(ctxPtr, src, dest, srcSize, dstCapacity, + acceleration); + +#if (LZ4_HEAPMODE) + FREEMEM(ctxPtr); +#endif + return result; +} + +int LZ4_compress_default(const char* src, + char* dst, + int srcSize, + int dstCapacity) +{ + return LZ4_compress_fast(src, dst, srcSize, dstCapacity, 1); +} + +/* Note!: This function leaves the stream in an unclean/broken state! + * It is not safe to subsequently use the same state with a _fastReset() or + * _continue() call without resetting it. */ +static int LZ4_compress_destSize_extState_internal(LZ4_stream_t* state, + const char* src, + char* dst, + int* srcSizePtr, + int targetDstSize, + int acceleration) +{ + void* const s = LZ4_initStream(state, sizeof(*state)); + assert(s != NULL); + (void)s; + + if (targetDstSize >= + LZ4_compressBound(*srcSizePtr)) { /* compression success is guaranteed */ + return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, + targetDstSize, acceleration); + } else { + if (*srcSizePtr < LZ4_64Klimit) { + return LZ4_compress_generic(&state->internal_donotuse, src, dst, + *srcSizePtr, srcSizePtr, targetDstSize, + fillOutput, byU16, noDict, noDictIssue, + acceleration); + } else { + tableType_t const addrMode = + ((sizeof(void*) == 4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? 
byPtr + : byU32; + return LZ4_compress_generic(&state->internal_donotuse, src, dst, + *srcSizePtr, srcSizePtr, targetDstSize, + fillOutput, addrMode, noDict, noDictIssue, + acceleration); + } + } +} + +int LZ4_compress_destSize_extState(void* state, + const char* src, + char* dst, + int* srcSizePtr, + int targetDstSize, + int acceleration) +{ + int const r = LZ4_compress_destSize_extState_internal( + (LZ4_stream_t*)state, src, dst, srcSizePtr, targetDstSize, acceleration); + /* clean the state on exit */ + LZ4_initStream(state, sizeof(LZ4_stream_t)); + return r; +} + +int LZ4_compress_destSize(const char* src, + char* dst, + int* srcSizePtr, + int targetDstSize) +{ +#if (LZ4_HEAPMODE) + LZ4_stream_t* const ctx = (LZ4_stream_t*)ALLOC( + sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ + if (ctx == NULL) + return 0; +#else + LZ4_stream_t ctxBody; + LZ4_stream_t* const ctx = &ctxBody; +#endif + + int result = LZ4_compress_destSize_extState_internal( + ctx, src, dst, srcSizePtr, targetDstSize, 1); + +#if (LZ4_HEAPMODE) + FREEMEM(ctx); +#endif + return result; +} + +/*-****************************** + * Streaming functions + ********************************/ + +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +LZ4_stream_t* LZ4_createStream(void) +{ + LZ4_stream_t* const lz4s = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); + LZ4_STATIC_ASSERT(sizeof(LZ4_stream_t) >= sizeof(LZ4_stream_t_internal)); + DEBUGLOG(4, "LZ4_createStream %p", (void*)lz4s); + if (lz4s == NULL) + return NULL; + LZ4_initStream(lz4s, sizeof(*lz4s)); + return lz4s; +} +#endif + +static size_t LZ4_stream_t_alignment(void) +{ +#if LZ4_ALIGN_TEST + typedef struct { + char c; + LZ4_stream_t t; + } t_a; + return sizeof(t_a) - sizeof(LZ4_stream_t); +#else + return 1; /* effectively disabled */ +#endif +} + +LZ4_stream_t* LZ4_initStream(void* buffer, size_t size) +{ + DEBUGLOG(5, "LZ4_initStream"); + if (buffer == NULL) { + return NULL; + } + if (size < sizeof(LZ4_stream_t)) { + return NULL; + } + if (!LZ4_isAligned(buffer, LZ4_stream_t_alignment())) + return NULL; + MEM_INIT(buffer, 0, sizeof(LZ4_stream_t_internal)); + return (LZ4_stream_t*)buffer; +} + +/* resetStream is now deprecated, + * prefer initStream() which is more general */ +void LZ4_resetStream(LZ4_stream_t* LZ4_stream) +{ + DEBUGLOG(5, "LZ4_resetStream (ctx:%p)", (void*)LZ4_stream); + MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t_internal)); +} + +void LZ4_resetStream_fast(LZ4_stream_t* ctx) +{ + LZ4_prepareTable(&(ctx->internal_donotuse), 0, byU32); +} + +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +int LZ4_freeStream(LZ4_stream_t* LZ4_stream) +{ + if (!LZ4_stream) + return 0; /* support free on NULL */ + DEBUGLOG(5, "LZ4_freeStream %p", (void*)LZ4_stream); + FREEMEM(LZ4_stream); + return (0); +} +#endif + +typedef enum { _ld_fast, _ld_slow } LoadDict_mode_e; +#define HASH_UNIT sizeof(reg_t) +int LZ4_loadDict_internal(LZ4_stream_t* LZ4_dict, + const char* dictionary, + int dictSize, + LoadDict_mode_e _ld) +{ + LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse; + const tableType_t tableType = byU32; + const BYTE* p = (const BYTE*)dictionary; + const BYTE* const dictEnd = p + dictSize; + U32 idx32; + + DEBUGLOG(4, "LZ4_loadDict (%i bytes from %p into %p)", dictSize, + (void*)dictionary, (void*)LZ4_dict); + + /* It's necessary to reset the context, + * and not just continue it with prepareTable() + * to avoid any risk of generating overflowing matchIndex + * when compressing using this dictionary */ + 
LZ4_resetStream(LZ4_dict); + + /* We always increment the offset by 64 KB, since, if the dict is longer, + * we truncate it to the last 64k, and if it's shorter, we still want to + * advance by a whole window length so we can provide the guarantee that + * there are only valid offsets in the window, which allows an optimization + * in LZ4_compress_fast_continue() where it uses noDictIssue even when the + * dictionary isn't a full 64k. */ + dict->currentOffset += 64 KB; + + if (dictSize < (int)HASH_UNIT) { + return 0; + } + + if ((dictEnd - p) > 64 KB) + p = dictEnd - 64 KB; + dict->dictionary = p; + dict->dictSize = (U32)(dictEnd - p); + dict->tableType = (U32)tableType; + idx32 = dict->currentOffset - dict->dictSize; + + while (p <= dictEnd - HASH_UNIT) { + U32 const h = LZ4_hashPosition(p, tableType); + /* Note: overwriting => favors positions end of dictionary */ + LZ4_putIndexOnHash(idx32, h, dict->hashTable, tableType); + p += 3; + idx32 += 3; + } + + if (_ld == _ld_slow) { + /* Fill hash table with additional references, to improve compression + * capability */ + p = dict->dictionary; + idx32 = dict->currentOffset - dict->dictSize; + while (p <= dictEnd - HASH_UNIT) { + U32 const h = LZ4_hashPosition(p, tableType); + U32 const limit = dict->currentOffset - 64 KB; + if (LZ4_getIndexOnHash(h, dict->hashTable, tableType) <= limit) { + /* Note: not overwriting => favors positions beginning of dictionary + */ + LZ4_putIndexOnHash(idx32, h, dict->hashTable, tableType); + } + p++; + idx32++; + } + } + + return (int)dict->dictSize; +} + +int LZ4_loadDict(LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) +{ + return LZ4_loadDict_internal(LZ4_dict, dictionary, dictSize, _ld_fast); +} + +int LZ4_loadDictSlow(LZ4_stream_t* LZ4_dict, + const char* dictionary, + int dictSize) +{ + return LZ4_loadDict_internal(LZ4_dict, dictionary, dictSize, _ld_slow); +} + +void LZ4_attach_dictionary(LZ4_stream_t* workingStream, + const LZ4_stream_t* dictionaryStream) +{ + const LZ4_stream_t_internal* dictCtx = + (dictionaryStream == NULL) ? NULL + : &(dictionaryStream->internal_donotuse); + + DEBUGLOG(4, "LZ4_attach_dictionary (%p, %p, size %u)", (void*)workingStream, + (void*)dictionaryStream, dictCtx != NULL ? dictCtx->dictSize : 0); + + if (dictCtx != NULL) { + /* If the current offset is zero, we will never look in the + * external dictionary context, since there is no value a table + * entry can take that indicate a miss. In that case, we need + * to bump the offset to something non-zero. + */ + if (workingStream->internal_donotuse.currentOffset == 0) { + workingStream->internal_donotuse.currentOffset = 64 KB; + } + + /* Don't actually attach an empty dictionary. 
+ */ + if (dictCtx->dictSize == 0) { + dictCtx = NULL; + } + } + workingStream->internal_donotuse.dictCtx = dictCtx; +} + +static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, int nextSize) +{ + assert(nextSize >= 0); + if (LZ4_dict->currentOffset + (unsigned)nextSize > + 0x80000000) { /* potential ptrdiff_t overflow (32-bits mode) */ + /* rescale hash table */ + U32 const delta = LZ4_dict->currentOffset - 64 KB; + const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; + int i; + DEBUGLOG(4, "LZ4_renormDictT"); + for (i = 0; i < LZ4_HASH_SIZE_U32; i++) { + if (LZ4_dict->hashTable[i] < delta) + LZ4_dict->hashTable[i] = 0; + else + LZ4_dict->hashTable[i] -= delta; + } + LZ4_dict->currentOffset = 64 KB; + if (LZ4_dict->dictSize > 64 KB) + LZ4_dict->dictSize = 64 KB; + LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; + } +} + +int LZ4_compress_fast_continue(LZ4_stream_t* LZ4_stream, + const char* source, + char* dest, + int inputSize, + int maxOutputSize, + int acceleration) +{ + const tableType_t tableType = byU32; + LZ4_stream_t_internal* const streamPtr = &LZ4_stream->internal_donotuse; + const char* dictEnd = + streamPtr->dictSize + ? (const char*)streamPtr->dictionary + streamPtr->dictSize + : NULL; + + DEBUGLOG(5, "LZ4_compress_fast_continue (inputSize=%i, dictSize=%u)", + inputSize, streamPtr->dictSize); + + LZ4_renormDictT(streamPtr, inputSize); /* fix index overflow */ + if (acceleration < 1) + acceleration = LZ4_ACCELERATION_DEFAULT; + if (acceleration > LZ4_ACCELERATION_MAX) + acceleration = LZ4_ACCELERATION_MAX; + + /* invalidate tiny dictionaries */ + if ((streamPtr->dictSize < 4) /* tiny dictionary : not enough for a hash */ + && (dictEnd != source) /* prefix mode */ + && (inputSize > 0) /* tolerance : don't lose history, in case next + invocation would use prefix mode */ + && (streamPtr->dictCtx == NULL) /* usingDictCtx */ + ) { + DEBUGLOG( + 5, "LZ4_compress_fast_continue: dictSize(%u) at addr:%p is too small", + streamPtr->dictSize, (void*)streamPtr->dictionary); + /* remove dictionary existence from history, to employ faster prefix mode + */ + streamPtr->dictSize = 0; + streamPtr->dictionary = (const BYTE*)source; + dictEnd = source; + } + + /* Check overlapping input/dictionary space */ + { + const char* const sourceEnd = source + inputSize; + if ((sourceEnd > (const char*)streamPtr->dictionary) && + (sourceEnd < dictEnd)) { + streamPtr->dictSize = (U32)(dictEnd - sourceEnd); + if (streamPtr->dictSize > 64 KB) + streamPtr->dictSize = 64 KB; + if (streamPtr->dictSize < 4) + streamPtr->dictSize = 0; + streamPtr->dictionary = (const BYTE*)dictEnd - streamPtr->dictSize; + } + } + + /* prefix mode : source data follows dictionary */ + if (dictEnd == source) { + if ((streamPtr->dictSize < 64 KB) && + (streamPtr->dictSize < streamPtr->currentOffset)) + return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, + maxOutputSize, limitedOutput, tableType, + withPrefix64k, dictSmall, acceleration); + else + return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, + maxOutputSize, limitedOutput, tableType, + withPrefix64k, noDictIssue, acceleration); + } + + /* external dictionary mode */ + { + int result; + if (streamPtr->dictCtx) { + /* We depend here on the fact that dictCtx'es (produced by + * LZ4_loadDict) guarantee that their tables contain no references + * to offsets between dictCtx->currentOffset - 64 KB and + * dictCtx->currentOffset - dictCtx->dictSize. 
This makes it safe + * to use noDictIssue even when the dict isn't a full 64 KB. + */ + if (inputSize > 4 KB) { + /* For compressing large blobs, it is faster to pay the setup + * cost to copy the dictionary's tables into the active context, + * so that the compression loop is only looking into one table. + */ + LZ4_memcpy(streamPtr, streamPtr->dictCtx, sizeof(*streamPtr)); + result = + LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, + maxOutputSize, limitedOutput, tableType, + usingExtDict, noDictIssue, acceleration); + } else { + result = + LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, + maxOutputSize, limitedOutput, tableType, + usingDictCtx, noDictIssue, acceleration); + } + } else { /* small data <= 4 KB */ + if ((streamPtr->dictSize < 64 KB) && + (streamPtr->dictSize < streamPtr->currentOffset)) { + result = LZ4_compress_generic( + streamPtr, source, dest, inputSize, NULL, maxOutputSize, + limitedOutput, tableType, usingExtDict, dictSmall, acceleration); + } else { + result = + LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, + maxOutputSize, limitedOutput, tableType, + usingExtDict, noDictIssue, acceleration); + } + } + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)inputSize; + return result; + } +} + +/* Hidden debug function, to force-test external dictionary mode */ +int LZ4_compress_forceExtDict(LZ4_stream_t* LZ4_dict, + const char* source, + char* dest, + int srcSize) +{ + LZ4_stream_t_internal* const streamPtr = &LZ4_dict->internal_donotuse; + int result; + + LZ4_renormDictT(streamPtr, srcSize); + + if ((streamPtr->dictSize < 64 KB) && + (streamPtr->dictSize < streamPtr->currentOffset)) { + result = + LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, + notLimited, byU32, usingExtDict, dictSmall, 1); + } else { + result = + LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, + notLimited, byU32, usingExtDict, noDictIssue, 1); + } + + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)srcSize; + + return result; +} + +/*! LZ4_saveDict() : + * If previously compressed data block is not guaranteed to remain available at + * its memory location, save it into a safer place (char* safeBuffer). Note : no + * need to call LZ4_loadDict() afterwards, dictionary is immediately usable, one + * can therefore call LZ4_compress_fast_continue() right after. + * @return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if + * error. + */ +int LZ4_saveDict(LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) +{ + LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse; + + DEBUGLOG(5, "LZ4_saveDict : dictSize=%i, safeBuffer=%p", dictSize, + (void*)safeBuffer); + + if ((U32)dictSize > 64 KB) { + dictSize = 64 KB; + } /* useless to define a dictionary > 64 KB */ + if ((U32)dictSize > dict->dictSize) { + dictSize = (int)dict->dictSize; + } + + if (safeBuffer == NULL) + assert(dictSize == 0); + if (dictSize > 0) { + const BYTE* const previousDictEnd = dict->dictionary + dict->dictSize; + assert(dict->dictionary); + LZ4_memmove(safeBuffer, previousDictEnd - dictSize, (size_t)dictSize); + } + + dict->dictionary = (const BYTE*)safeBuffer; + dict->dictSize = (U32)dictSize; + + return dictSize; +} + +/*-******************************* + * Decompression functions + ********************************/ + +typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive; + +#undef MIN +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) + +/* variant for decompress_unsafe() + * does not know end of input + * presumes input is well formed + * note : will consume at least one byte */ +static size_t read_long_length_no_check(const BYTE** pp) +{ + size_t b, l = 0; + do { + b = **pp; + (*pp)++; + l += b; + } while (b == 255); + DEBUGLOG(6, "read_long_length_no_check: +length=%zu using %zu input bytes", + l, l / 255 + 1) + return l; +} + +/* core decoder variant for LZ4_decompress_fast*() + * for legacy support only : these entry points are deprecated. + * - Presumes input is correctly formed (no defense vs malformed inputs) + * - Does not know input size (presume input buffer is "large enough") + * - Decompress a full block (only) + * @return : nb of bytes read from input. + * Note : this variant is not optimized for speed, just for maintenance. + * the goal is to remove support of decompress_fast*() variants by v2.0 + **/ +LZ4_FORCE_INLINE int LZ4_decompress_unsafe_generic( + const BYTE* const istart, + BYTE* const ostart, + int decompressedSize, + + size_t prefixSize, + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note: =0 if dictStart==NULL */ +) +{ + const BYTE* ip = istart; + BYTE* op = (BYTE*)ostart; + BYTE* const oend = ostart + decompressedSize; + const BYTE* const prefixStart = ostart - prefixSize; + + DEBUGLOG(5, "LZ4_decompress_unsafe_generic"); + if (dictStart == NULL) + assert(dictSize == 0); + + while (1) { + /* start new sequence */ + unsigned token = *ip++; + + /* literals */ + { + size_t ll = token >> ML_BITS; + if (ll == 15) { + /* long literal length */ + ll += read_long_length_no_check(&ip); + } + if ((size_t)(oend - op) < ll) + return -1; /* output buffer overflow */ + LZ4_memmove(op, ip, ll); /* support in-place decompression */ + op += ll; + ip += ll; + if ((size_t)(oend - op) < MFLIMIT) { + if (op == oend) + break; /* end of block */ + DEBUGLOG(5, + "invalid: literals end at distance %zi from end of block", + oend - op); + /* incorrect end of block : + * last match must start at least MFLIMIT==12 bytes before end of + * output block */ + return -1; + } + } + + /* match */ + { + size_t ml = token & 15; + size_t const offset = LZ4_readLE16(ip); + ip += 2; + + if (ml == 15) { + /* long literal length */ + ml += read_long_length_no_check(&ip); + } + ml += MINMATCH; + + if ((size_t)(oend - op) < ml) + return -1; /* output buffer overflow */ + + { + const BYTE* match = op - offset; + + /* out of range */ + if (offset > (size_t)(op - prefixStart) + dictSize) { + DEBUGLOG(6, "offset out of range"); + return -1; + } + + /* check special case : extDict */ + if (offset > (size_t)(op - prefixStart)) { + /* extDict scenario */ + const BYTE* const dictEnd = dictStart + dictSize; + const BYTE* extMatch = + dictEnd - (offset - (size_t)(op - prefixStart)); + size_t const extml = (size_t)(dictEnd - extMatch); + if (extml > ml) { + /* match entirely within extDict */ + LZ4_memmove(op, extMatch, ml); + op += ml; + ml = 0; + } else { + /* match split between extDict & prefix */ + LZ4_memmove(op, extMatch, extml); + op += extml; + ml -= extml; + } + match = prefixStart; + } + + /* match copy - slow variant, supporting overlap copy */ + { + size_t u; + for (u = 0; u < ml; u++) { + op[u] = match[u]; + } + } + } + op += ml; + if ((size_t)(oend - op) < LASTLITERALS) { + DEBUGLOG(5, "invalid: match ends at distance %zi from end of block", + oend - op); + /* incorrect end of block : + * last match must stop at least LASTLITERALS==5 bytes before end of + * output block */ + 
return -1; + } + } /* match */ + } /* main loop */ + return (int)(ip - istart); +} + +/* Read the variable-length literal or match length. + * + * @ip : input pointer + * @ilimit : position after which if length is not decoded, the input is + *necessarily corrupted. + * @initial_check - check ip >= ipmax before start of loop. Returns + *initial_error if so. + * @error (output) - error code. Must be set to 0 before call. + **/ +typedef size_t Rvl_t; +static const Rvl_t rvl_error = (Rvl_t)(-1); +LZ4_FORCE_INLINE Rvl_t read_variable_length(const BYTE** ip, + const BYTE* ilimit, + int initial_check) +{ + Rvl_t s, length = 0; + assert(ip != NULL); + assert(*ip != NULL); + assert(ilimit != NULL); + if (initial_check && unlikely((*ip) >= ilimit)) { /* read limit reached */ + return rvl_error; + } + s = **ip; + (*ip)++; + length += s; + if (unlikely((*ip) > ilimit)) { /* read limit reached */ + return rvl_error; + } + /* accumulator overflow detection (32-bit mode only) */ + if ((sizeof(length) < 8) && unlikely(length > ((Rvl_t)(-1) / 2))) { + return rvl_error; + } + if (likely(s != 255)) + return length; + do { + s = **ip; + (*ip)++; + length += s; + if (unlikely((*ip) > ilimit)) { /* read limit reached */ + return rvl_error; + } + /* accumulator overflow detection (32-bit mode only) */ + if ((sizeof(length) < 8) && unlikely(length > ((Rvl_t)(-1) / 2))) { + return rvl_error; + } + } while (s == 255); + + return length; +} + +/*! LZ4_decompress_generic() : + * This generic decompression function covers all use cases. + * It shall be instantiated several times, using different sets of directives. + * Note that it is important for performance that this function really get + * inlined, in order to remove useless branches during compilation optimization. + */ +LZ4_FORCE_INLINE int LZ4_decompress_generic( + const char* const src, + char* const dst, + int srcSize, + int outputSize, /* If endOnInput==endOnInputSize, this value is `dstCapacity` + */ + + earlyEnd_directive partialDecoding, /* full, partial */ + dict_directive dict, /* noDict, withPrefix64k, usingExtDict */ + const BYTE* const lowPrefix, /* always <= dst, == dst when no prefix */ + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note : = 0 if noDict */ +) +{ + if ((src == NULL) || (outputSize < 0)) { + return -1; + } + + { + const BYTE* ip = (const BYTE*)src; + const BYTE* const iend = ip + srcSize; + + BYTE* op = (BYTE*)dst; + BYTE* const oend = op + outputSize; + BYTE* cpy; + + const BYTE* const dictEnd = + (dictStart == NULL) ? NULL : dictStart + dictSize; + + const int checkOffset = (dictSize < (int)(64 KB)); + + /* Set up the "end" pointers for the shortcut. */ + const BYTE* const shortiend = iend - 14 /*maxLL*/ - 2 /*offset*/; + const BYTE* const shortoend = oend - 14 /*maxLL*/ - 18 /*maxML*/; + + const BYTE* match; + size_t offset; + unsigned token; + size_t length; + + DEBUGLOG(5, "LZ4_decompress_generic (srcSize:%i, dstSize:%i)", srcSize, + outputSize); + + /* Special cases */ + assert(lowPrefix <= op); + if (unlikely(outputSize == 0)) { + /* Empty output buffer */ + if (partialDecoding) + return 0; + return ((srcSize == 1) && (*ip == 0)) ? 0 : -1; + } + if (unlikely(srcSize == 0)) { + return -1; + } + + /* LZ4_FAST_DEC_LOOP: + * designed for modern OoO performance cpus, + * where copying reliably 32-bytes is preferable to an unpredictable + * branch. note : fast loop may show a regression for some client arm + * chips. 
*/ +#if LZ4_FAST_DEC_LOOP + if ((oend - op) < FASTLOOP_SAFE_DISTANCE) { + DEBUGLOG(6, "move to safe decode loop"); + goto safe_decode; + } + + /* Fast loop : decode sequences as long as output < + * oend-FASTLOOP_SAFE_DISTANCE */ + DEBUGLOG(6, "using fast decode loop"); + while (1) { + /* Main fastloop assertion: We can always wildcopy + * FASTLOOP_SAFE_DISTANCE */ + assert(oend - op >= FASTLOOP_SAFE_DISTANCE); + assert(ip < iend); + token = *ip++; + length = token >> ML_BITS; /* literal length */ + DEBUGLOG(7, "blockPos%6u: litLength token = %u", + (unsigned)(op - (BYTE*)dst), (unsigned)length); + + /* decode literal length */ + if (length == RUN_MASK) { + size_t const addl = read_variable_length(&ip, iend - RUN_MASK, 1); + if (addl == rvl_error) { + DEBUGLOG(6, "error reading long literal length"); + goto _output_error; + } + length += addl; + if (unlikely((uptrval)(op) + length < (uptrval)(op))) { + goto _output_error; + } /* overflow detection */ + if (unlikely((uptrval)(ip) + length < (uptrval)(ip))) { + goto _output_error; + } /* overflow detection */ + + /* copy literals */ + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if ((op + length > oend - 32) || (ip + length > iend - 32)) { + goto safe_literal_copy; + } + LZ4_wildCopy32(op, ip, op + length); + ip += length; + op += length; + } else if (ip <= iend - (16 + 1 /*max lit + offset + nextToken*/)) { + /* We don't need to check oend, since we check it once for each loop + * below */ + DEBUGLOG(7, "copy %u bytes in a 16-bytes stripe", (unsigned)length); + /* Literals can only be <= 14, but hope compilers optimize better + * when copy by a register size */ + LZ4_memcpy(op, ip, 16); + ip += length; + op += length; + } else { + goto safe_literal_copy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); + ip += 2; + DEBUGLOG(6, "blockPos%6u: offset = %u", (unsigned)(op - (BYTE*)dst), + (unsigned)offset); + match = op - offset; + assert(match <= op); /* overflow check */ + + /* get matchlength */ + length = token & ML_MASK; + DEBUGLOG(7, " match length token = %u (len==%u)", (unsigned)length, + (unsigned)length + MINMATCH); + + if (length == ML_MASK) { + size_t const addl = + read_variable_length(&ip, iend - LASTLITERALS + 1, 0); + if (addl == rvl_error) { + DEBUGLOG(5, "error reading long match length"); + goto _output_error; + } + length += addl; + length += MINMATCH; + DEBUGLOG(7, " long match length == %u", (unsigned)length); + if (unlikely((uptrval)(op) + length < (uptrval)op)) { + goto _output_error; + } /* overflow detection */ + if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { + goto safe_match_copy; + } + } else { + length += MINMATCH; + if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { + DEBUGLOG(7, "moving to safe_match_copy (ml==%u)", + (unsigned)length); + goto safe_match_copy; + } + + /* Fastpath check: skip LZ4_wildCopy32 when true */ + if ((dict == withPrefix64k) || (match >= lowPrefix)) { + if (offset >= 8) { + assert(match >= lowPrefix); + assert(match <= op); + assert(op + 18 <= oend); + + LZ4_memcpy(op, match, 8); + LZ4_memcpy(op + 8, match + 8, 8); + LZ4_memcpy(op + 16, match + 16, 2); + op += length; + continue; + } + } + } + + if (checkOffset && (unlikely(match + dictSize < lowPrefix))) { + DEBUGLOG(5, "Error : pos=%zi, offset=%zi => outside buffers", + op - lowPrefix, op - match); + goto _output_error; + } + /* match starting within external dictionary */ + if ((dict == usingExtDict) && (match < lowPrefix)) { + assert(dictEnd != NULL); + if (unlikely(op + length > oend - LASTLITERALS)) { + if 
(partialDecoding) { + DEBUGLOG( + 7, "partialDecoding: dictionary match, close to dstEnd"); + length = MIN(length, (size_t)(oend - op)); + } else { + DEBUGLOG(6, "end-of-block condition violated") + goto _output_error; + } + } + + if (length <= (size_t)(lowPrefix - match)) { + /* match fits entirely within external dictionary : just copy */ + LZ4_memmove(op, dictEnd - (lowPrefix - match), length); + op += length; + } else { + /* match stretches into both external dictionary and current + * block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + LZ4_memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) { + *op++ = *copyFrom++; + } + } else { + LZ4_memcpy(op, lowPrefix, restSize); + op += restSize; + } + } + continue; + } + + /* copy match within block */ + cpy = op + length; + + assert((op <= oend) && (oend - op >= 32)); + if (unlikely(offset < 16)) { + LZ4_memcpy_using_offset(op, match, cpy, offset); + } else { + LZ4_wildCopy32(op, match, cpy); + } + + op = cpy; /* wildcopy correction */ + } + safe_decode: +#endif + + /* Main Loop : decode remaining sequences where output < + * FASTLOOP_SAFE_DISTANCE */ + DEBUGLOG(6, "using safe decode loop"); + while (1) { + assert(ip < iend); + token = *ip++; + length = token >> ML_BITS; /* literal length */ + DEBUGLOG(7, "blockPos%6u: litLength token = %u", + (unsigned)(op - (BYTE*)dst), (unsigned)length); + + /* A two-stage shortcut for the most common case: + * 1) If the literal length is 0..14, and there is enough space, + * enter the shortcut and copy 16 bytes on behalf of the literals + * (in the fast mode, only 8 bytes can be safely copied this way). + * 2) Further if the match length is 4..18, copy 18 bytes in a similar + * manner; but we ensure that there's enough space in the output for + * those 18 bytes earlier, upon entering the shortcut (in other words, + * there is a combined check for both stages). + */ + if ((length != RUN_MASK) + /* strictly "less than" on input, to re-enter the loop with at + least one byte */ + && likely((ip < shortiend) & (op <= shortoend))) { + /* Copy the literals */ + LZ4_memcpy(op, ip, 16); + op += length; + ip += length; + + /* The second stage: prepare for match copying, decode full info. + * If it doesn't work out, the info won't be wasted. */ + length = token & ML_MASK; /* match length */ + DEBUGLOG(7, "blockPos%6u: matchLength token = %u (len=%u)", + (unsigned)(op - (BYTE*)dst), (unsigned)length, + (unsigned)length + 4); + offset = LZ4_readLE16(ip); + ip += 2; + match = op - offset; + assert(match <= op); /* check overflow */ + + /* Do not deal with overlapping matches. */ + if ((length != ML_MASK) && (offset >= 8) && + (dict == withPrefix64k || match >= lowPrefix)) { + /* Copy the match. */ + LZ4_memcpy(op + 0, match + 0, 8); + LZ4_memcpy(op + 8, match + 8, 8); + LZ4_memcpy(op + 16, match + 16, 2); + op += length + MINMATCH; + /* Both stages worked, load the next token. */ + continue; + } + + /* The second stage didn't work out, but the info is ready. + * Propel it right to the point of match copying. 
*/ + goto _copy_match; + } + + /* decode literal length */ + if (length == RUN_MASK) { + size_t const addl = read_variable_length(&ip, iend - RUN_MASK, 1); + if (addl == rvl_error) { + goto _output_error; + } + length += addl; + if (unlikely((uptrval)(op) + length < (uptrval)(op))) { + goto _output_error; + } /* overflow detection */ + if (unlikely((uptrval)(ip) + length < (uptrval)(ip))) { + goto _output_error; + } /* overflow detection */ + } + +#if LZ4_FAST_DEC_LOOP + safe_literal_copy: +#endif + /* copy literals */ + cpy = op + length; + + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if ((cpy > oend - MFLIMIT) || + (ip + length > iend - (2 + 1 + LASTLITERALS))) { + /* We've either hit the input parsing restriction or the output + * parsing restriction. In the normal scenario, decoding a full + * block, it must be the last sequence, otherwise it's an error + * (invalid input or dimensions). In partialDecoding scenario, it's + * necessary to ensure there is no buffer overflow. + */ + if (partialDecoding) { + /* Since we are partial decoding we may be in this block because + * of the output parsing restriction, which is not valid since + * the output buffer is allowed to be undersized. + */ + DEBUGLOG(7, "partialDecoding: copying literals, close to input " + "or output end") + DEBUGLOG(7, "partialDecoding: literal length = %u", + (unsigned)length); + DEBUGLOG(7, "partialDecoding: remaining space in dstBuffer : %i", + (int)(oend - op)); + DEBUGLOG(7, "partialDecoding: remaining space in srcBuffer : %i", + (int)(iend - ip)); + /* Finishing in the middle of a literals segment, + * due to lack of input. + */ + if (ip + length > iend) { + length = (size_t)(iend - ip); + cpy = op + length; + } + /* Finishing in the middle of a literals segment, + * due to lack of output space. + */ + if (cpy > oend) { + cpy = oend; + assert(op <= oend); + length = (size_t)(oend - op); + } + } else { + /* We must be on the last sequence (or invalid) because of the + * parsing limitations so check that we exactly consume the input + * and don't overrun the output buffer. + */ + if ((ip + length != iend) || (cpy > oend)) { + DEBUGLOG(5, "should have been last run of literals") + DEBUGLOG(5, "ip(%p) + length(%i) = %p != iend (%p)", + (void*)ip, (int)length, (void*)(ip + length), + (void*)iend); + DEBUGLOG(5, "or cpy(%p) > (oend-MFLIMIT)(%p)", (void*)cpy, + (void*)(oend - MFLIMIT)); + DEBUGLOG(5, "after writing %u bytes / %i bytes available", + (unsigned)(op - (BYTE*)dst), outputSize); + goto _output_error; + } + } + LZ4_memmove(op, ip, + length); /* supports overlapping memory regions, for + in-place decompression scenarios */ + ip += length; + op += length; + /* Necessarily EOF when !partialDecoding. + * When partialDecoding, it is EOF if we've either + * filled the output buffer or + * can't proceed with reading an offset for following match. 
+ */ + if (!partialDecoding || (cpy == oend) || (ip >= (iend - 2))) { + break; + } + } else { + LZ4_wildCopy8(op, ip, + cpy); /* can overwrite up to 8 bytes beyond cpy */ + ip += length; + op = cpy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); + ip += 2; + match = op - offset; + + /* get matchlength */ + length = token & ML_MASK; + DEBUGLOG(7, "blockPos%6u: matchLength token = %u", + (unsigned)(op - (BYTE*)dst), (unsigned)length); + + _copy_match: + if (length == ML_MASK) { + size_t const addl = + read_variable_length(&ip, iend - LASTLITERALS + 1, 0); + if (addl == rvl_error) { + goto _output_error; + } + length += addl; + if (unlikely((uptrval)(op) + length < (uptrval)op)) + goto _output_error; /* overflow detection */ + } + length += MINMATCH; + +#if LZ4_FAST_DEC_LOOP + safe_match_copy: +#endif + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) + goto _output_error; /* Error : offset outside buffers */ + /* match starting within external dictionary */ + if ((dict == usingExtDict) && (match < lowPrefix)) { + assert(dictEnd != NULL); + if (unlikely(op + length > oend - LASTLITERALS)) { + if (partialDecoding) + length = MIN(length, (size_t)(oend - op)); + else + goto _output_error; /* doesn't respect parsing restriction */ + } + + if (length <= (size_t)(lowPrefix - match)) { + /* match fits entirely within external dictionary : just copy */ + LZ4_memmove(op, dictEnd - (lowPrefix - match), length); + op += length; + } else { + /* match stretches into both external dictionary and current + * block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + LZ4_memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) + *op++ = *copyFrom++; + } else { + LZ4_memcpy(op, lowPrefix, restSize); + op += restSize; + } + } + continue; + } + assert(match >= lowPrefix); + + /* copy match within block */ + cpy = op + length; + + /* partialDecoding : may end anywhere within the block */ + assert(op <= oend); + if (partialDecoding && (cpy > oend - MATCH_SAFEGUARD_DISTANCE)) { + size_t const mlen = MIN(length, (size_t)(oend - op)); + const BYTE* const matchEnd = match + mlen; + BYTE* const copyEnd = op + mlen; + if (matchEnd > op) { /* overlap copy */ + while (op < copyEnd) { + *op++ = *match++; + } + } else { + LZ4_memcpy(op, match, mlen); + } + op = copyEnd; + if (op == oend) { + break; + } + continue; + } + + if (unlikely(offset < 8)) { + LZ4_write32(op, 0); /* silence msan warning when offset==0 */ + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += inc32table[offset]; + LZ4_memcpy(op + 4, match, 4); + match -= dec64table[offset]; + } else { + LZ4_memcpy(op, match, 8); + match += 8; + } + op += 8; + + if (unlikely(cpy > oend - MATCH_SAFEGUARD_DISTANCE)) { + BYTE* const oCopyLimit = oend - (WILDCOPYLENGTH - 1); + if (cpy > oend - LASTLITERALS) { + goto _output_error; + } /* Error : last LASTLITERALS bytes must be literals (uncompressed) + */ + if (op < oCopyLimit) { + LZ4_wildCopy8(op, match, oCopyLimit); + match += oCopyLimit - op; + op = oCopyLimit; + } + while (op < cpy) { + *op++ = *match++; + } + } else { + LZ4_memcpy(op, match, 8); + if (length > 16) { + LZ4_wildCopy8(op + 8, match + 8, cpy); + } + } + op = cpy; /* wildcopy correction */ + } + + /* end of decoding */ + DEBUGLOG(5, "decoded %i bytes", (int)(((char*)op) - dst)); 
+ return (int)(((char*)op) - dst); /* Nb of output bytes decoded */ + + /* Overflow error detected */ + _output_error: + return (int)(-(((const char*)ip) - src)) - 1; + } +} + +/*===== Instantiate the API decoding functions. =====*/ + +LZ4_FORCE_O2 +int LZ4_decompress_safe(const char* source, + char* dest, + int compressedSize, + int maxDecompressedSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, + maxDecompressedSize, decode_full_block, noDict, + (BYTE*)dest, NULL, 0); +} + +LZ4_FORCE_O2 +int LZ4_decompress_safe_partial(const char* src, + char* dst, + int compressedSize, + int targetOutputSize, + int dstCapacity) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(src, dst, compressedSize, dstCapacity, + partial_decode, noDict, (BYTE*)dst, NULL, 0); +} + +LZ4_FORCE_O2 +int LZ4_decompress_fast(const char* source, char* dest, int originalSize) +{ + DEBUGLOG(5, "LZ4_decompress_fast"); + return LZ4_decompress_unsafe_generic((const BYTE*)source, (BYTE*)dest, + originalSize, 0, NULL, 0); +} + +/*===== Instantiate a few more decoding cases, used more than once. =====*/ + +LZ4_FORCE_O2 /* Exported, an obsolete API function. */ + int + LZ4_decompress_safe_withPrefix64k(const char* source, + char* dest, + int compressedSize, + int maxOutputSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + decode_full_block, withPrefix64k, + (BYTE*)dest - 64 KB, NULL, 0); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_safe_partial_withPrefix64k(const char* source, + char* dest, + int compressedSize, + int targetOutputSize, + int dstCapacity) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, + partial_decode, withPrefix64k, + (BYTE*)dest - 64 KB, NULL, 0); +} + +/* Another obsolete API function, paired with the previous one. 
*/ +int LZ4_decompress_fast_withPrefix64k(const char* source, + char* dest, + int originalSize) +{ + return LZ4_decompress_unsafe_generic((const BYTE*)source, (BYTE*)dest, + originalSize, 64 KB, NULL, 0); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_safe_withSmallPrefix(const char* source, + char* dest, + int compressedSize, + int maxOutputSize, + size_t prefixSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + decode_full_block, noDict, + (BYTE*)dest - prefixSize, NULL, 0); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_safe_partial_withSmallPrefix(const char* source, + char* dest, + int compressedSize, + int targetOutputSize, + int dstCapacity, + size_t prefixSize) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, + partial_decode, noDict, + (BYTE*)dest - prefixSize, NULL, 0); +} + +LZ4_FORCE_O2 +int LZ4_decompress_safe_forceExtDict(const char* source, + char* dest, + int compressedSize, + int maxOutputSize, + const void* dictStart, + size_t dictSize) +{ + DEBUGLOG(5, "LZ4_decompress_safe_forceExtDict"); + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + decode_full_block, usingExtDict, (BYTE*)dest, + (const BYTE*)dictStart, dictSize); +} + +LZ4_FORCE_O2 +int LZ4_decompress_safe_partial_forceExtDict(const char* source, + char* dest, + int compressedSize, + int targetOutputSize, + int dstCapacity, + const void* dictStart, + size_t dictSize) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(source, dest, compressedSize, dstCapacity, + partial_decode, usingExtDict, (BYTE*)dest, + (const BYTE*)dictStart, dictSize); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_fast_extDict(const char* source, + char* dest, + int originalSize, + const void* dictStart, + size_t dictSize) +{ + return LZ4_decompress_unsafe_generic((const BYTE*)source, (BYTE*)dest, + originalSize, 0, (const BYTE*)dictStart, + dictSize); +} + +/* The "double dictionary" mode, for use with e.g. ring buffers: the first part + * of the dictionary is passed as prefix, and the second via dictStart + + * dictSize. These routines are used only once, in LZ4_decompress_*_continue(). + */ +LZ4_FORCE_INLINE +int LZ4_decompress_safe_doubleDict(const char* source, + char* dest, + int compressedSize, + int maxOutputSize, + size_t prefixSize, + const void* dictStart, + size_t dictSize) +{ + return LZ4_decompress_generic( + source, dest, compressedSize, maxOutputSize, decode_full_block, + usingExtDict, (BYTE*)dest - prefixSize, (const BYTE*)dictStart, dictSize); +} + +/*===== streaming decompression functions =====*/ + +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +LZ4_streamDecode_t* LZ4_createStreamDecode(void) +{ + LZ4_STATIC_ASSERT(sizeof(LZ4_streamDecode_t) >= + sizeof(LZ4_streamDecode_t_internal)); + return (LZ4_streamDecode_t*)ALLOC_AND_ZERO(sizeof(LZ4_streamDecode_t)); +} + +int LZ4_freeStreamDecode(LZ4_streamDecode_t* LZ4_stream) +{ + if (LZ4_stream == NULL) { + return 0; + } /* support free on NULL */ + FREEMEM(LZ4_stream); + return 0; +} +#endif + +/*! LZ4_setStreamDecode() : + * Use this function to instruct where to find the dictionary. + * This function is not necessary if previous data is still available where it + * was decoded. Loading a size of 0 is allowed (same effect as no dictionary). 
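An illustrative sketch of how a caller might drive the streaming decoder that LZ4_setStreamDecode() configures; decode_two_blocks and its buffers are hypothetical names, and the two blocks are assumed to come from one dependent stream:

    #include "lz4.h"

    /* Decode two dependent blocks produced by a streaming compressor. */
    static int decode_two_blocks(const char* blk1, int size1,
                                 const char* blk2, int size2,
                                 char* dst, int dstCapacity)
    {
        LZ4_streamDecode_t* sd = LZ4_createStreamDecode();
        int d1 = -1, d2 = -1;
        if (sd != NULL && LZ4_setStreamDecode(sd, NULL, 0)) {
            d1 = LZ4_decompress_safe_continue(sd, blk1, dst, size1, dstCapacity);
            if (d1 >= 0)   /* block 2 may reference block 1's decoded bytes */
                d2 = LZ4_decompress_safe_continue(sd, blk2, dst + d1,
                                                  size2, dstCapacity - d1);
        }
        LZ4_freeStreamDecode(sd);   /* accepts NULL */
        return (d1 < 0 || d2 < 0) ? -1 : d1 + d2;
    }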
+ * @return : 1 if OK, 0 if error + */ +int LZ4_setStreamDecode(LZ4_streamDecode_t* LZ4_streamDecode, + const char* dictionary, + int dictSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + lz4sd->prefixSize = (size_t)dictSize; + if (dictSize) { + assert(dictionary != NULL); + lz4sd->prefixEnd = (const BYTE*)dictionary + dictSize; + } else { + lz4sd->prefixEnd = (const BYTE*)dictionary; + } + lz4sd->externalDict = NULL; + lz4sd->extDictSize = 0; + return 1; +} + +/*! LZ4_decoderRingBufferSize() : + * when setting a ring buffer for streaming decompression (optional scenario), + * provides the minimum size of this ring buffer + * to be compatible with any source respecting maxBlockSize condition. + * Note : in a ring buffer scenario, + * blocks are presumed decompressed next to each other. + * When not enough space remains for next block (remainingSize < maxBlockSize), + * decoding resumes from beginning of ring buffer. + * @return : minimum ring buffer size, + * or 0 if there is an error (invalid maxBlockSize). + */ +int LZ4_decoderRingBufferSize(int maxBlockSize) +{ + if (maxBlockSize < 0) + return 0; + if (maxBlockSize > LZ4_MAX_INPUT_SIZE) + return 0; + if (maxBlockSize < 16) + maxBlockSize = 16; + return LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize); +} + +/* +*_continue() : + These decoding functions allow decompression of multiple blocks in +"streaming" mode. Previously decoded blocks must still be available at the +memory position where they were decoded. If it's not possible, save the relevant +part of decoded data into a safe buffer, and indicate where it stands using +LZ4_setStreamDecode() +*/ +LZ4_FORCE_O2 +int LZ4_decompress_safe_continue(LZ4_streamDecode_t* LZ4_streamDecode, + const char* source, + char* dest, + int compressedSize, + int maxOutputSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + int result; + + if (lz4sd->prefixSize == 0) { + /* The first call, no dictionary yet. */ + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); + if (result <= 0) + return result; + lz4sd->prefixSize = (size_t)result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } else if (lz4sd->prefixEnd == (BYTE*)dest) { + /* They're rolling the current segment. */ + if (lz4sd->prefixSize >= 64 KB - 1) + result = LZ4_decompress_safe_withPrefix64k( + source, dest, compressedSize, maxOutputSize); + else if (lz4sd->extDictSize == 0) + result = LZ4_decompress_safe_withSmallPrefix( + source, dest, compressedSize, maxOutputSize, lz4sd->prefixSize); + else + result = LZ4_decompress_safe_doubleDict( + source, dest, compressedSize, maxOutputSize, lz4sd->prefixSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) + return result; + lz4sd->prefixSize += (size_t)result; + lz4sd->prefixEnd += result; + } else { + /* The buffer wraps around, or they're switching to another buffer. 
*/ + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_safe_forceExtDict( + source, dest, compressedSize, maxOutputSize, lz4sd->externalDict, + lz4sd->extDictSize); + if (result <= 0) + return result; + lz4sd->prefixSize = (size_t)result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } + + return result; +} + +LZ4_FORCE_O2 int +LZ4_decompress_fast_continue(LZ4_streamDecode_t* LZ4_streamDecode, + const char* source, + char* dest, + int originalSize) +{ + LZ4_streamDecode_t_internal* const lz4sd = + (assert(LZ4_streamDecode != NULL), &LZ4_streamDecode->internal_donotuse); + int result; + + DEBUGLOG(5, "LZ4_decompress_fast_continue (toDecodeSize=%i)", originalSize); + assert(originalSize >= 0); + + if (lz4sd->prefixSize == 0) { + DEBUGLOG(5, "first invocation : no prefix nor extDict"); + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_fast(source, dest, originalSize); + if (result <= 0) + return result; + lz4sd->prefixSize = (size_t)originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } else if (lz4sd->prefixEnd == (BYTE*)dest) { + DEBUGLOG(5, "continue using existing prefix"); + result = LZ4_decompress_unsafe_generic( + (const BYTE*)source, (BYTE*)dest, originalSize, lz4sd->prefixSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) + return result; + lz4sd->prefixSize += (size_t)originalSize; + lz4sd->prefixEnd += originalSize; + } else { + DEBUGLOG(5, "prefix becomes extDict"); + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_fast_extDict( + source, dest, originalSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) + return result; + lz4sd->prefixSize = (size_t)originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } + + return result; +} + +/* +Advanced decoding functions : +*_usingDict() : + These decoding functions work the same as "_continue" ones, + the dictionary must be explicitly provided within parameters +*/ + +int LZ4_decompress_safe_usingDict(const char* source, + char* dest, + int compressedSize, + int maxOutputSize, + const char* dictStart, + int dictSize) +{ + if (dictSize == 0) + return LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); + if (dictStart + dictSize == dest) { + if (dictSize >= 64 KB - 1) { + return LZ4_decompress_safe_withPrefix64k(source, dest, compressedSize, + maxOutputSize); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_withSmallPrefix( + source, dest, compressedSize, maxOutputSize, (size_t)dictSize); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_forceExtDict( + source, dest, compressedSize, maxOutputSize, dictStart, (size_t)dictSize); +} + +int LZ4_decompress_safe_partial_usingDict(const char* source, + char* dest, + int compressedSize, + int targetOutputSize, + int dstCapacity, + const char* dictStart, + int dictSize) +{ + if (dictSize == 0) + return LZ4_decompress_safe_partial(source, dest, compressedSize, + targetOutputSize, dstCapacity); + if (dictStart + dictSize == dest) { + if (dictSize >= 64 KB - 1) { + return LZ4_decompress_safe_partial_withPrefix64k( + source, dest, compressedSize, targetOutputSize, dstCapacity); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_partial_withSmallPrefix( + source, dest, compressedSize, targetOutputSize, dstCapacity, + (size_t)dictSize); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_partial_forceExtDict( + source, dest, compressedSize, 
targetOutputSize, dstCapacity, dictStart, + (size_t)dictSize); +} + +int LZ4_decompress_fast_usingDict(const char* source, + char* dest, + int originalSize, + const char* dictStart, + int dictSize) +{ + if (dictSize == 0 || dictStart + dictSize == dest) + return LZ4_decompress_unsafe_generic((const BYTE*)source, (BYTE*)dest, + originalSize, (size_t)dictSize, NULL, + 0); + assert(dictSize >= 0); + return LZ4_decompress_fast_extDict(source, dest, originalSize, dictStart, + (size_t)dictSize); +} + +/*=************************************************* + * Obsolete Functions + ***************************************************/ +/* obsolete compression functions */ +int LZ4_compress_limitedOutput(const char* source, + char* dest, + int inputSize, + int maxOutputSize) +{ + return LZ4_compress_default(source, dest, inputSize, maxOutputSize); +} +int LZ4_compress(const char* src, char* dest, int srcSize) +{ + return LZ4_compress_default(src, dest, srcSize, LZ4_compressBound(srcSize)); +} +int LZ4_compress_limitedOutput_withState( + void* state, const char* src, char* dst, int srcSize, int dstSize) +{ + return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); +} +int LZ4_compress_withState(void* state, const char* src, char* dst, int srcSize) +{ + return LZ4_compress_fast_extState(state, src, dst, srcSize, + LZ4_compressBound(srcSize), 1); +} +int LZ4_compress_limitedOutput_continue(LZ4_stream_t* LZ4_stream, + const char* src, + char* dst, + int srcSize, + int dstCapacity) +{ + return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, dstCapacity, + 1); +} +int LZ4_compress_continue(LZ4_stream_t* LZ4_stream, + const char* source, + char* dest, + int inputSize) +{ + return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, + LZ4_compressBound(inputSize), 1); +} + +/* +These decompression functions are deprecated and should no longer be used. +They are only provided here for compatibility with older user programs. +- LZ4_uncompress is totally equivalent to LZ4_decompress_fast +- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe +*/ +int LZ4_uncompress(const char* source, char* dest, int outputSize) +{ + return LZ4_decompress_fast(source, dest, outputSize); +} +int LZ4_uncompress_unknownOutputSize(const char* source, + char* dest, + int isize, + int maxOutputSize) +{ + return LZ4_decompress_safe(source, dest, isize, maxOutputSize); +} + +/* Obsolete Streaming functions */ + +int LZ4_sizeofStreamState(void) { return sizeof(LZ4_stream_t); } + +int LZ4_resetStreamState(void* state, char* inputBuffer) +{ + (void)inputBuffer; + LZ4_resetStream((LZ4_stream_t*)state); + return 0; +} + +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +void* LZ4_create(char* inputBuffer) +{ + (void)inputBuffer; + return LZ4_createStream(); +} +#endif + +char* LZ4_slideInputBuffer(void* state) +{ + /* avoid const char * -> char * conversion warning */ + return (char*)(uptrval)((LZ4_stream_t*)state)->internal_donotuse.dictionary; +} + +#endif /* LZ4_COMMONDEFS_ONLY */ diff --git a/tracegrind/lz4.h b/tracegrind/lz4.h new file mode 100644 index 000000000..a08439161 --- /dev/null +++ b/tracegrind/lz4.h @@ -0,0 +1,1053 @@ +/* + * LZ4 - Fast LZ compression algorithm + * Header File + * Copyright (c) Yann Collet. All rights reserved. 
+ + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 homepage : http://www.lz4.org + - LZ4 source repository : https://github.com/lz4/lz4 +*/ +#if defined(__cplusplus) +extern "C" { +#endif + +#ifndef LZ4_H_2983827168210 +#define LZ4_H_2983827168210 + +/* --- Dependency --- */ +#if !LZ4_FREESTANDING +#include <stddef.h> /* size_t */ +#endif + +/** + Introduction + + LZ4 is lossless compression algorithm, providing compression speed >500 MB/s + per core, scalable with multi-cores CPU. It features an extremely fast + decoder, with speed in multiple GB/s per core, typically reaching RAM speed + limits on multi-core systems. + + The LZ4 compression library provides in-memory compression and decompression + functions. It gives full buffer control to user. Compression can be done in: + - a single step (described as Simple Functions) + - a single step, reusing a context (described in Advanced Functions) + - unbounded multiple steps (described as Streaming compression) + + lz4.h generates and decodes LZ4-compressed blocks (doc/lz4_Block_format.md). + Decompressing such a compressed block requires additional metadata. + Exact metadata depends on exact decompression function. + For the typical case of LZ4_decompress_safe(), + metadata includes block's compressed size, and maximum bound of decompressed + size. Each application is free to encode and pass such metadata in whichever + way it wants. + + lz4.h only handle blocks, it can not generate Frames. + + Blocks are different from Frames (doc/lz4_Frame_format.md). + Frames bundle both blocks and metadata in a specified manner. + Embedding metadata is required for compressed data to be self-contained and + portable. Frame format is delivered through a companion API, declared in + lz4frame.h. The `lz4` CLI can only manage frames. +*/ + +/*^*************************************************************** + * Export parameters + *****************************************************************/ +/* + * LZ4_DLL_EXPORT : + * Enable exporting of functions when building a Windows DLL + * LZ4LIB_VISIBILITY : + * Control library symbols visibility.
+ */ +#ifndef LZ4LIB_VISIBILITY +#if defined(__GNUC__) && (__GNUC__ >= 4) +#define LZ4LIB_VISIBILITY __attribute__((visibility("default"))) +#else +#define LZ4LIB_VISIBILITY +#endif +#endif +#if defined(LZ4_DLL_EXPORT) && (LZ4_DLL_EXPORT == 1) +#define LZ4LIB_API __declspec(dllexport) LZ4LIB_VISIBILITY +#elif defined(LZ4_DLL_IMPORT) && (LZ4_DLL_IMPORT == 1) +#define LZ4LIB_API \ + __declspec(dllimport) \ + LZ4LIB_VISIBILITY /* It isn't required but allows to generate better \ + code, saving a function pointer load from the IAT \ + and an indirect jump.*/ +#else +#define LZ4LIB_API LZ4LIB_VISIBILITY +#endif + +/*! LZ4_FREESTANDING : + * When this macro is set to 1, it enables "freestanding mode" that is + * suitable for typical freestanding environment which doesn't support + * standard C library. + * + * - LZ4_FREESTANDING is a compile-time switch. + * - It requires the following macros to be defined: + * LZ4_memcpy, LZ4_memmove, LZ4_memset. + * - It only enables LZ4/HC functions which don't use heap. + * All LZ4F_* functions are not supported. + * - See tests/freestanding.c to check its basic setup. + */ +#if defined(LZ4_FREESTANDING) && (LZ4_FREESTANDING == 1) +#define LZ4_HEAPMODE 0 +#define LZ4HC_HEAPMODE 0 +#define LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION 1 +#if !defined(LZ4_memcpy) +#error "LZ4_FREESTANDING requires macro 'LZ4_memcpy'." +#endif +#if !defined(LZ4_memset) +#error "LZ4_FREESTANDING requires macro 'LZ4_memset'." +#endif +#if !defined(LZ4_memmove) +#error "LZ4_FREESTANDING requires macro 'LZ4_memmove'." +#endif +#elif !defined(LZ4_FREESTANDING) +#define LZ4_FREESTANDING 0 +#endif + +/*------ Version ------*/ +#define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */ +#define LZ4_VERSION_MINOR \ + 10 /* for new (non-breaking) interface capabilities \ + */ +#define LZ4_VERSION_RELEASE 0 /* for tweaks, bug-fixes, or development */ + +#define LZ4_VERSION_NUMBER \ + (LZ4_VERSION_MAJOR * 100 * 100 + LZ4_VERSION_MINOR * 100 + \ + LZ4_VERSION_RELEASE) + +#define LZ4_LIB_VERSION LZ4_VERSION_MAJOR.LZ4_VERSION_MINOR.LZ4_VERSION_RELEASE +#define LZ4_QUOTE(str) #str +#define LZ4_EXPAND_AND_QUOTE(str) LZ4_QUOTE(str) +#define LZ4_VERSION_STRING \ + LZ4_EXPAND_AND_QUOTE(LZ4_LIB_VERSION) /* requires v1.7.3+ */ + +LZ4LIB_API int +LZ4_versionNumber(void); /**< library version number; useful to check dll + version; requires v1.3.0+ */ +LZ4LIB_API const char* +LZ4_versionString(void); /**< library version string; useful to check dll + version; requires v1.7.5+ */ + +/*-************************************ + * Tuning memory usage + **************************************/ +/*! + * LZ4_MEMORY_USAGE : + * Can be selected at compile time, by setting LZ4_MEMORY_USAGE. + * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> + * 64KB; 20 -> 1MB) Increasing memory usage improves compression ratio, + * generally at the cost of speed. Reduced memory usage may improve speed at the + * cost of ratio, thanks to better cache locality. Default value is 14, for + * 16KB, which nicely fits into most L1 caches. + */ +#ifndef LZ4_MEMORY_USAGE +#define LZ4_MEMORY_USAGE LZ4_MEMORY_USAGE_DEFAULT +#endif + +/* These are absolute limits, they should not be changed by users */ +#define LZ4_MEMORY_USAGE_MIN 10 +#define LZ4_MEMORY_USAGE_DEFAULT 14 +#define LZ4_MEMORY_USAGE_MAX 20 + +#if (LZ4_MEMORY_USAGE < LZ4_MEMORY_USAGE_MIN) +#error "LZ4_MEMORY_USAGE is too small !" +#endif + +#if (LZ4_MEMORY_USAGE > LZ4_MEMORY_USAGE_MAX) +#error "LZ4_MEMORY_USAGE is too large !" 
+#endif + +/*-************************************ + * Simple Functions + **************************************/ +/*! LZ4_compress_default() : + * Compresses 'srcSize' bytes from buffer 'src' + * into already allocated 'dst' buffer of size 'dstCapacity'. + * Compression is guaranteed to succeed if 'dstCapacity' >= + * LZ4_compressBound(srcSize). It also runs faster, so it's a recommended + * setting. If the function cannot compress 'src' into a more limited 'dst' + * budget, compression stops *immediately*, and the function result is zero. In + * which case, 'dst' content is undefined (invalid). srcSize : max supported + * value is LZ4_MAX_INPUT_SIZE. dstCapacity : size of buffer 'dst' (which must + * be already allocated) + * @return : the number of bytes written into buffer 'dst' (necessarily <= + * dstCapacity) or 0 if compression fails Note : This function is protected + * against buffer overflow scenarios (never writes outside 'dst' buffer, nor + * read outside 'source' buffer). + */ +LZ4LIB_API int +LZ4_compress_default(const char* src, char* dst, int srcSize, int dstCapacity); + +/*! LZ4_decompress_safe() : + * @compressedSize : is the exact complete size of the compressed block. + * @dstCapacity : is the size of destination buffer (which must be already + * allocated), presumed an upper bound of decompressed size. + * @return : the number of bytes decompressed into destination buffer + * (necessarily <= dstCapacity) If destination buffer is not large enough, + * decoding will stop and output an error code (negative value). If the source + * stream is detected malformed, the function will stop decoding and return a + * negative result. Note 1 : This function is protected against malicious data + * packets : it will never writes outside 'dst' buffer, nor read outside + * 'source' buffer, even if the compressed block is maliciously modified to + * order the decoder to do these actions. In such case, the decoder stops + * immediately, and considers the compressed block malformed. Note 2 : + * compressedSize and dstCapacity must be provided to the function, the + * compressed block does not contain them. The implementation is free to send / + * store / derive this information in whichever way is most beneficial. If there + * is a need for a different format which bundles together both compressed data + * and its metadata, consider looking at lz4frame.h instead. + */ +LZ4LIB_API int LZ4_decompress_safe(const char* src, + char* dst, + int compressedSize, + int dstCapacity); + +/*-************************************ + * Advanced Functions + **************************************/ +#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ +#define LZ4_COMPRESSBOUND(isize) \ + ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE \ + ? 0 \ + : (isize) + ((isize) / 255) + 16) + +/*! LZ4_compressBound() : + Provides the maximum size that LZ4 compression may output in a "worst case" + scenario (input data not compressible) This function is primarily useful for + memory allocation purposes (destination buffer size). Macro + LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack + memory allocation for example). Note that LZ4_compress_default() compresses + faster when dstCapacity is >= LZ4_compressBound(srcSize) inputSize : max + supported value is LZ4_MAX_INPUT_SIZE return : maximum output size in a + "worst case" scenario or 0, if input size is incorrect (too large or + negative) +*/ +LZ4LIB_API int LZ4_compressBound(int inputSize); + +/*! 
LZ4_compress_fast() : + Same as LZ4_compress_default(), but allows selection of "acceleration" + factor. The larger the acceleration value, the faster the algorithm, but also + the lesser the compression. It's a trade-off. It can be fine tuned, with each + successive value providing roughly +~3% to speed. An acceleration value of + "1" is the same as regular LZ4_compress_default() Values <= 0 will be + replaced by LZ4_ACCELERATION_DEFAULT (currently == 1, see lz4.c). Values > + LZ4_ACCELERATION_MAX will be replaced by LZ4_ACCELERATION_MAX (currently == + 65537, see lz4.c). +*/ +LZ4LIB_API int LZ4_compress_fast( + const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + +/*! LZ4_compress_fast_extState() : + * Same as LZ4_compress_fast(), using an externally allocated memory space for + * its state. Use LZ4_sizeofState() to know how much memory must be allocated, + * and allocate it on 8-bytes boundaries (using `malloc()` typically). + * Then, provide this buffer as `void* state` to compression function. + */ +LZ4LIB_API int LZ4_sizeofState(void); +LZ4LIB_API int LZ4_compress_fast_extState(void* state, + const char* src, + char* dst, + int srcSize, + int dstCapacity, + int acceleration); + +/*! LZ4_compress_destSize() : + * Reverse the logic : compresses as much data as possible from 'src' buffer + * into already allocated buffer 'dst', of size >= 'dstCapacity'. + * This function either compresses the entire 'src' content into 'dst' if it's + * large enough, or fill 'dst' buffer completely with as much data as possible + * from 'src'. note: acceleration parameter is fixed to "default". + * + * *srcSizePtr : in+out parameter. Initially contains size of input. + * Will be modified to indicate how many bytes where read from + * 'src' to fill 'dst'. New value is necessarily <= input value. + * @return : Nb bytes written into 'dst' (necessarily <= dstCapacity) + * or 0 if compression fails. + * + * Note : 'targetDstSize' must be >= 1, because it's the smallest valid lz4 + * payload. + * + * Note 2:from v1.8.2 to v1.9.1, this function had a bug (fixed in v1.9.2+): + * the produced compressed content could, in rare circumstances, + * require to be decompressed into a destination buffer + * larger by at least 1 byte than decompressesSize. + * If an application uses `LZ4_compress_destSize()`, + * it's highly recommended to update liblz4 to v1.9.2 or better. + * If this can't be done or ensured, + * the receiving decompression function should provide + * a dstCapacity which is > decompressedSize, by at least 1 byte. + * See https://github.com/lz4/lz4/issues/859 for details + */ +LZ4LIB_API int LZ4_compress_destSize(const char* src, + char* dst, + int* srcSizePtr, + int targetDstSize); + +/*! LZ4_decompress_safe_partial() : + * Decompress an LZ4 compressed block, of size 'srcSize' at position 'src', + * into destination buffer 'dst' of size 'dstCapacity'. + * Up to 'targetOutputSize' bytes will be decoded. + * The function stops decoding on reaching this objective. + * This can be useful to boost performance + * whenever only the beginning of a block is required. + * + * @return : the number of bytes decoded in `dst` (necessarily <= + * targetOutputSize) If source stream is detected malformed, function returns a + * negative result. + * + * Note 1 : @return can be < targetOutputSize, if compressed block contains + * less data. 
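As a hedged aside to the notes above and below, a minimal round-trip sketch; the 4 KB bound and the 100-byte target are arbitrary illustration values:

    #include "lz4.h"
    #include <string.h>

    static void partial_roundtrip_demo(const char* text)
    {
        char comp[LZ4_COMPRESSBOUND(4096)];
        char head[100];
        int srcSize = (int)strlen(text);
        int compSize = LZ4_compress_default(text, comp, srcSize, (int)sizeof(comp));
        if (compSize > 0) {
            /* Decode only the first (up to) 100 bytes of the original data. */
            int got = LZ4_decompress_safe_partial(comp, head, compSize,
                                                  (int)sizeof(head),
                                                  (int)sizeof(head));
            (void)got;   /* got <= 100, or negative if the block is malformed */
        }
    }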
+ * + * Note 2 : targetOutputSize must be <= dstCapacity + * + * Note 3 : this function effectively stops decoding on reaching + * targetOutputSize, so dstCapacity is kind of redundant. This is because in + * older versions of this function, decoding operation would still write + * complete sequences. Therefore, there was no guarantee that it would stop + * writing at exactly targetOutputSize, it could write more bytes, though only + * up to dstCapacity. Some "margin" used to be required for this operation to + * work properly. Thankfully, this is no longer necessary. The function + * nonetheless keeps the same signature, in an effort to preserve API + * compatibility. + * + * Note 4 : If srcSize is the exact size of the block, + * then targetOutputSize can be any value, + * including larger than the block's decompressed size. + * The function will, at most, generate block's decompressed size. + * + * Note 5 : If srcSize is _larger_ than block's compressed size, + * then targetOutputSize **MUST** be <= block's decompressed size. + * Otherwise, *silent corruption will occur*. + */ +LZ4LIB_API int LZ4_decompress_safe_partial(const char* src, + char* dst, + int srcSize, + int targetOutputSize, + int dstCapacity); + +/*-********************************************* + * Streaming Compression Functions + ***********************************************/ +typedef union LZ4_stream_u LZ4_stream_t; /* incomplete type (defined later) */ + +/*! + Note about RC_INVOKED + + - RC_INVOKED is predefined symbol of rc.exe (the resource compiler which is + part of MSVC/Visual Studio). + https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros + + - Since rc.exe is a legacy compiler, it truncates long symbol (> 30 chars) + and reports warning "RC4011: identifier truncated". + + - To eliminate the warning, we surround long preprocessor symbol with + "#if !defined(RC_INVOKED) ... #endif" block that means + "skip this block when rc.exe is trying to read it". +*/ +#if !defined( \ + RC_INVOKED) /* https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros \ + */ +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +LZ4LIB_API LZ4_stream_t* LZ4_createStream(void); +LZ4LIB_API int LZ4_freeStream(LZ4_stream_t* streamPtr); +#endif /* !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) */ +#endif + +/*! LZ4_resetStream_fast() : v1.9.0+ + * Use this to prepare an LZ4_stream_t for a new chain of dependent blocks + * (e.g., LZ4_compress_fast_continue()). + * + * An LZ4_stream_t must be initialized once before usage. + * This is automatically done when created by LZ4_createStream(). + * However, should the LZ4_stream_t be simply declared on stack (for example), + * it's necessary to initialize it first, using LZ4_initStream(). + * + * After init, start any new stream with LZ4_resetStream_fast(). + * A same LZ4_stream_t can be re-used multiple times consecutively + * and compress multiple streams, + * provided that it starts each new stream with LZ4_resetStream_fast(). + * + * LZ4_resetStream_fast() is much faster than LZ4_initStream(), + * but is not compatible with memory regions containing garbage data. + * + * Note: it's only useful to call LZ4_resetStream_fast() + * in the context of streaming compression. + * The *extState* functions perform their own resets. + * Invoking LZ4_resetStream_fast() before is redundant, and even + * counterproductive. + */ +LZ4LIB_API void LZ4_resetStream_fast(LZ4_stream_t* streamPtr); + +/*! 
LZ4_loadDict() : + * Use this function to reference a static dictionary into LZ4_stream_t. + * The dictionary must remain available during compression. + * LZ4_loadDict() triggers a reset, so any previous data will be forgotten. + * The same dictionary will have to be loaded on decompression side for + * successful decoding. Dictionary are useful for better compression of small + * data (KB range). While LZ4 itself accepts any input as dictionary, dictionary + * efficiency is also a topic. When in doubt, employ the Zstandard's Dictionary + * Builder. Loading a size of 0 is allowed, and is the same as reset. + * @return : loaded dictionary size, in bytes (note: only the last 64 KB are + * loaded) + */ +LZ4LIB_API int +LZ4_loadDict(LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); + +/*! LZ4_loadDictSlow() : v1.10.0+ + * Same as LZ4_loadDict(), + * but uses a bit more cpu to reference the dictionary content more thoroughly. + * This is expected to slightly improve compression ratio. + * The extra-cpu cost is likely worth it if the dictionary is re-used across + * multiple sessions. + * @return : loaded dictionary size, in bytes (note: only the last 64 KB are + * loaded) + */ +LZ4LIB_API int +LZ4_loadDictSlow(LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); + +/*! LZ4_attach_dictionary() : stable since v1.10.0 + * + * This allows efficient re-use of a static dictionary multiple times. + * + * Rather than re-loading the dictionary buffer into a working context before + * each compression, or copying a pre-loaded dictionary's LZ4_stream_t into a + * working LZ4_stream_t, this function introduces a no-copy setup mechanism, + * in which the working stream references @dictionaryStream in-place. + * + * Several assumptions are made about the state of @dictionaryStream. + * Currently, only states which have been prepared by LZ4_loadDict() or + * LZ4_loadDictSlow() should be expected to work. + * + * Alternatively, the provided @dictionaryStream may be NULL, + * in which case any existing dictionary stream is unset. + * + * If a dictionary is provided, it replaces any pre-existing stream history. + * The dictionary contents are the only history that can be referenced and + * logically immediately precede the data compressed in the first subsequent + * compression call. + * + * The dictionary will only remain attached to the working stream through the + * first compression call, at the end of which it is cleared. + * @dictionaryStream stream (and source buffer) must remain in-place / + * accessible / unchanged through the completion of the compression session. + * + * Note: there is no equivalent LZ4_attach_*() method on the decompression side + * because there is no initialization cost, hence no need to share the cost + * across multiple sessions. To decompress LZ4 blocks using dictionary, attached + * or not, just employ the regular LZ4_setStreamDecode() for streaming, or the + * stateless LZ4_decompress_safe_usingDict() for one-shot decompression. + */ +LZ4LIB_API void LZ4_attach_dictionary(LZ4_stream_t* workingStream, + const LZ4_stream_t* dictionaryStream); + +/*! LZ4_compress_fast_continue() : + * Compress 'src' content using data from previously compressed blocks, for + * better compression ratio. 'dst' buffer must be already allocated. If + * dstCapacity >= LZ4_compressBound(srcSize), compression is guaranteed to + * succeed, and runs faster. + * + * @return : size of compressed block + * or 0 if there is an error (typically, cannot fit into 'dst'). 
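An illustrative sketch of the chained-block call pattern described above; msg1 and msg2 are hypothetical chunks that must stay unmodified at their addresses between calls:

    #include "lz4.h"

    static void compress_two_dependent_blocks(const char* msg1, int len1,
                                              const char* msg2, int len2,
                                              char* out, int outCapacity)
    {
        LZ4_stream_t stream;                        /* stack-allocated state ... */
        LZ4_initStream(&stream, sizeof(stream));    /* ... initialized once */
        int c1 = LZ4_compress_fast_continue(&stream, msg1, out, len1,
                                            outCapacity, 1);
        if (c1 <= 0)
            return;                                 /* block did not fit in 'out' */
        /* The second block may reference msg1's data, improving its ratio. */
        int c2 = LZ4_compress_fast_continue(&stream, msg2, out + c1, len2,
                                            outCapacity - c1, 1);
        (void)c2;
    }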
+ * + * Note 1 : Each invocation to LZ4_compress_fast_continue() generates a new + * block. Each block has precise boundaries. Each block must be decompressed + * separately, calling LZ4_decompress_*() with relevant metadata. It's not + * possible to append blocks together and expect a single invocation of + * LZ4_decompress_*() to decompress them together. + * + * Note 2 : The previous 64KB of source data is __assumed__ to remain present, + * unmodified, at same address in memory ! + * + * Note 3 : When input is structured as a double-buffer, each buffer can have + * any size, including < 64 KB. Make sure that buffers are separated, by at + * least one byte. This construction ensures that each block only depends on + * previous block. + * + * Note 4 : If input buffer is a ring-buffer, it can have any size, including < + * 64 KB. + * + * Note 5 : After an error, the stream status is undefined (invalid), it can + * only be reset or freed. + */ +LZ4LIB_API int LZ4_compress_fast_continue(LZ4_stream_t* streamPtr, + const char* src, + char* dst, + int srcSize, + int dstCapacity, + int acceleration); + +/*! LZ4_saveDict() : + * If last 64KB data cannot be guaranteed to remain available at its current + * memory location, save it into a safer place (char* safeBuffer). This is + * schematically equivalent to a memcpy() followed by LZ4_loadDict(), but is + * much faster, because LZ4_saveDict() doesn't need to rebuild tables. + * @return : saved dictionary size in bytes (necessarily <= maxDictSize), or 0 + * if error. + */ +LZ4LIB_API int +LZ4_saveDict(LZ4_stream_t* streamPtr, char* safeBuffer, int maxDictSize); + +/*-********************************************** + * Streaming Decompression Functions + * Bufferless synchronous API + ************************************************/ +typedef union LZ4_streamDecode_u LZ4_streamDecode_t; /* tracking context */ + +/*! LZ4_createStreamDecode() and LZ4_freeStreamDecode() : + * creation / destruction of streaming decompression tracking context. + * A tracking context can be re-used multiple times. + */ +#if !defined( \ + RC_INVOKED) /* https://docs.microsoft.com/en-us/windows/win32/menurc/predefined-macros \ + */ +#if !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) +LZ4LIB_API LZ4_streamDecode_t* LZ4_createStreamDecode(void); +LZ4LIB_API int LZ4_freeStreamDecode(LZ4_streamDecode_t* LZ4_stream); +#endif /* !defined(LZ4_STATIC_LINKING_ONLY_DISABLE_MEMORY_ALLOCATION) */ +#endif + +/*! LZ4_setStreamDecode() : + * An LZ4_streamDecode_t context can be allocated once and re-used multiple + * times. Use this function to start decompression of a new stream of blocks. A + * dictionary can optionally be set. Use NULL or size 0 for a reset order. + * Dictionary is presumed stable : it must remain accessible and unmodified + * during next decompression. + * @return : 1 if OK, 0 if error + */ +LZ4LIB_API int LZ4_setStreamDecode(LZ4_streamDecode_t* LZ4_streamDecode, + const char* dictionary, + int dictSize); + +/*! LZ4_decoderRingBufferSize() : v1.8.2+ + * Note : in a ring buffer scenario (optional), + * blocks are presumed decompressed next to each other + * up to the moment there is not enough remaining space for next block + * (remainingSize < maxBlockSize), at which stage it resumes from beginning of + * ring buffer. When setting such a ring buffer for streaming decompression, + * provides the minimum size of this ring buffer + * to be compatible with any source respecting maxBlockSize condition. 
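For instance, a worked use of the LZ4_DECODER_RING_BUFFER_SIZE macro defined just below, assuming lz4.h is included and a hypothetical 64 KB maxBlockSize:

    /* 65536 + 14 + 65536 = 131086 bytes */
    static char ringBuffer[LZ4_DECODER_RING_BUFFER_SIZE(64 * 1024)];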
+ * @return : minimum ring buffer size, + * or 0 if there is an error (invalid maxBlockSize). + */ +LZ4LIB_API int LZ4_decoderRingBufferSize(int maxBlockSize); +#define LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize) \ + (65536 + 14 + \ + (maxBlockSize)) /* for static allocation; maxBlockSize presumed valid */ + +/*! LZ4_decompress_safe_continue() : + * This decoding function allows decompression of consecutive blocks in + * "streaming" mode. The difference with the usual independent blocks is that + * new blocks are allowed to find references into former blocks. + * A block is an unsplittable entity, and must be presented entirely to the + * decompression function. LZ4_decompress_safe_continue() only accepts one block + * at a time. It's modeled after `LZ4_decompress_safe()` and behaves similarly. + * + * @LZ4_streamDecode : decompression state, tracking the position in memory of + * past data + * @compressedSize : exact complete size of one compressed block. + * @dstCapacity : size of destination buffer (which must be already allocated), + * must be an upper bound of decompressed size. + * @return : number of bytes decompressed into destination buffer (necessarily + * <= dstCapacity) If destination buffer is not large enough, decoding will stop + * and output an error code (negative value). If the source stream is detected + * malformed, the function will stop decoding and return a negative result. + * + * The last 64KB of previously decoded data *must* remain available and + * unmodified at the memory position where they were previously decoded. If less + * than 64KB of data has been decoded, all the data must be present. + * + * Special : if decompression side sets a ring buffer, it must respect one of + * the following conditions : + * - Decompression buffer size is _at least_ + * LZ4_decoderRingBufferSize(maxBlockSize). maxBlockSize is the maximum size of + * any single block. It can have any value > 16 bytes. In which case, encoding + * and decoding buffers do not need to be synchronized. Actually, data can be + * produced by any source compliant with LZ4 format specification, and + * respecting maxBlockSize. + * - Synchronized mode : + * Decompression buffer size is _exactly_ the same as compression buffer + * size, and follows exactly same update rule (block boundaries at same + * positions), and decoding function is provided with exact decompressed size of + * each block (exception for last block of the stream), _then_ decoding & + * encoding ring buffer can have any size, including small ones ( < 64 KB). + * - Decompression buffer is larger than encoding buffer, by a minimum of + * maxBlockSize more bytes. In which case, encoding and decoding buffers do not + * need to be synchronized, and encoding ring buffer can have any size, + * including small ones ( < 64 KB). + * + * Whenever these conditions are not possible, + * save the last 64KB of decoded data into a safe buffer where it can't be + * modified during decompression, then indicate where this data is saved using + * LZ4_setStreamDecode(), before decompressing next block. + */ +LZ4LIB_API int +LZ4_decompress_safe_continue(LZ4_streamDecode_t* LZ4_streamDecode, + const char* src, + char* dst, + int srcSize, + int dstCapacity); + +/*! LZ4_decompress_safe_usingDict() : + * Works the same as + * a combination of LZ4_setStreamDecode() followed by + * LZ4_decompress_safe_continue() However, it's stateless: it doesn't need any + * LZ4_streamDecode_t state. 
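A minimal illustrative wrapper for the stateless call described here; the buffer names are hypothetical, and the same dictionary is assumed to have been used when the block was compressed:

    #include "lz4.h"

    static int decode_with_dict(const char* comp, int compSize,
                                const char* dict, int dictSize,
                                char* dst, int dstCapacity)
    {
        /* No LZ4_streamDecode_t state is needed for a single block. */
        return LZ4_decompress_safe_usingDict(comp, dst, compSize, dstCapacity,
                                             dict, dictSize);
    }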
Dictionary is presumed stable : it must remain + * accessible and unmodified during decompression. Performance tip : + * Decompression speed can be substantially increased when dst == dictStart + + * dictSize. + */ +LZ4LIB_API int LZ4_decompress_safe_usingDict(const char* src, + char* dst, + int srcSize, + int dstCapacity, + const char* dictStart, + int dictSize); + +/*! LZ4_decompress_safe_partial_usingDict() : + * Behaves the same as LZ4_decompress_safe_partial() + * with the added ability to specify a memory segment for past data. + * Performance tip : Decompression speed can be substantially increased + * when dst == dictStart + dictSize. + */ +LZ4LIB_API int LZ4_decompress_safe_partial_usingDict(const char* src, + char* dst, + int compressedSize, + int targetOutputSize, + int maxOutputSize, + const char* dictStart, + int dictSize); + +#endif /* LZ4_H_2983827168210 */ + +/*^************************************* + * !!!!!! STATIC LINKING ONLY !!!!!! + ***************************************/ + +/*-**************************************************************************** + * Experimental section + * + * Symbols declared in this section must be considered unstable. Their + * signatures or semantics may change, or they may be removed altogether in the + * future. They are therefore only safe to depend on when the caller is + * statically linked against the library. + * + * To protect against unsafe usage, not only are the declarations guarded, + * the definitions are hidden by default + * when building LZ4 as a shared/dynamic library. + * + * In order to access these declarations, + * define LZ4_STATIC_LINKING_ONLY in your application + * before including LZ4's headers. + * + * In order to make their implementations accessible dynamically, you must + * define LZ4_PUBLISH_STATIC_FUNCTIONS when building the LZ4 library. + ******************************************************************************/ + +#ifdef LZ4_STATIC_LINKING_ONLY + +#ifndef LZ4_STATIC_3504398509 +#define LZ4_STATIC_3504398509 + +#ifdef LZ4_PUBLISH_STATIC_FUNCTIONS +#define LZ4LIB_STATIC_API LZ4LIB_API +#else +#define LZ4LIB_STATIC_API +#endif + +/*! LZ4_compress_fast_extState_fastReset() : + * A variant of LZ4_compress_fast_extState(). + * + * Using this variant avoids an expensive initialization step. + * It is only safe to call if the state buffer is known to be correctly + * initialized already (see above comment on LZ4_resetStream_fast() for a + * definition of "correctly initialized"). From a high level, the difference is + * that this function initializes the provided state with a call to something + * like LZ4_resetStream_fast() while LZ4_compress_fast_extState() starts with a + * call to LZ4_resetStream(). + */ +LZ4LIB_STATIC_API int LZ4_compress_fast_extState_fastReset(void* state, + const char* src, + char* dst, + int srcSize, + int dstCapacity, + int acceleration); + +/*! LZ4_compress_destSize_extState() : introduced in v1.10.0 + * Same as LZ4_compress_destSize(), but using an externally allocated state. + * Also: exposes @acceleration + */ +int LZ4_compress_destSize_extState(void* state, + const char* src, + char* dst, + int* srcSizePtr, + int targetDstSize, + int acceleration); + +/*! In-place compression and decompression + * + * It's possible to have input and output sharing the same buffer, + * for highly constrained memory environments. + * In both cases, it requires input to lay at the end of the buffer, + * and decompression to start at beginning of the buffer. 
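A sketch of the in-place decompression layout described here, assuming decompressedSize > compressedSize as noted below; LZ4_STATIC_LINKING_ONLY is required because the sizing macro lives in this experimental section:

    #define LZ4_STATIC_LINKING_ONLY
    #include "lz4.h"
    #include <stdlib.h>
    #include <string.h>

    static int decompress_in_place(const char* comp, int compSize,
                                   int decompressedSize)
    {
        int const bufSize = LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize);
        char* const buf = (char*)malloc((size_t)bufSize);
        if (buf == NULL)
            return -1;
        /* Input sits at the end of the buffer, output starts at its head. */
        memcpy(buf + bufSize - compSize, comp, (size_t)compSize);
        int const r = LZ4_decompress_safe(buf + bufSize - compSize, buf,
                                          compSize, decompressedSize);
        free(buf);
        return r;   /* bytes decoded, or negative on error */
    }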
+ * Buffer size must feature some margin, hence be larger than final size. + * + * |<------------------------buffer--------------------------------->| + * |<-----------compressed data--------->| + * |<-----------decompressed size------------------>| + * |<----margin---->| + * + * This technique is more useful for decompression, + * since decompressed size is typically larger, + * and margin is short. + * + * In-place decompression will work inside any buffer + * which size is >= LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize). + * This presumes that decompressedSize > compressedSize. + * Otherwise, it means compression actually expanded data, + * and it would be more efficient to store such data with a flag indicating it's + * not compressed. This can happen when data is not compressible (already + * compressed, or encrypted). + * + * For in-place compression, margin is larger, as it must be able to cope with + * both history preservation, requiring input data to remain unmodified up to + * LZ4_DISTANCE_MAX, and data expansion, which can happen when input is not + * compressible. As a consequence, buffer size requirements are much higher, and + * memory savings offered by in-place compression are more limited. + * + * There are ways to limit this cost for compression : + * - Reduce history size, by modifying LZ4_DISTANCE_MAX. + * Note that it is a compile-time constant, so all compressions will apply + * this limit. Lower values will reduce compression ratio, except when + * input_size < LZ4_DISTANCE_MAX, so it's a reasonable trick when inputs are + * known to be small. + * - Require the compressor to deliver a "maximum compressed size". + * This is the `dstCapacity` parameter in `LZ4_compress*()`. + * When this size is < LZ4_COMPRESSBOUND(inputSize), then compression can + * fail, in which case, the return code will be 0 (zero). The caller must be + * ready for these cases to happen, and typically design a backup scheme to send + * data uncompressed. The combination of both techniques can significantly + * reduce the amount of margin required for in-place compression. + * + * In-place compression can work in any buffer + * which size is >= (maxCompressedSize) + * with maxCompressedSize == LZ4_COMPRESSBOUND(srcSize) for guaranteed + * compression success. LZ4_COMPRESS_INPLACE_BUFFER_SIZE() depends on both + * maxCompressedSize and LZ4_DISTANCE_MAX, so it's possible to reduce memory + * requirements by playing with them. + */ + +#define LZ4_DECOMPRESS_INPLACE_MARGIN(compressedSize) \ + (((compressedSize) >> 8) + 32) +#define LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize) \ + ((decompressedSize) + \ + LZ4_DECOMPRESS_INPLACE_MARGIN( \ + decompressedSize)) /**< note: presumes that compressedSize < \ + decompressedSize. 
note2: margin is overestimated \ + a bit, since it could use compressedSize instead \ + */ + +#ifndef LZ4_DISTANCE_MAX /* history window size; can be user-defined at \ + compile time */ +#define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ +#endif + +#define LZ4_COMPRESS_INPLACE_MARGIN \ + (LZ4_DISTANCE_MAX + 32) /* LZ4_DISTANCE_MAX can be safely replaced by \ + srcSize when it's smaller */ +#define LZ4_COMPRESS_INPLACE_BUFFER_SIZE(maxCompressedSize) \ + ((maxCompressedSize) + \ + LZ4_COMPRESS_INPLACE_MARGIN) /**< maxCompressedSize is generally \ + LZ4_COMPRESSBOUND(inputSize), but can be \ + set to any lower value, with the risk that \ + compression can fail (return code 0(zero)) \ + */ + +#endif /* LZ4_STATIC_3504398509 */ +#endif /* LZ4_STATIC_LINKING_ONLY */ + +#ifndef LZ4_H_98237428734687 +#define LZ4_H_98237428734687 + +/*-************************************************************ + * Private Definitions + ************************************************************** + * Do not use these definitions directly. + * They are only exposed to allow static allocation of `LZ4_stream_t` and + *`LZ4_streamDecode_t`. Accessing members will expose user code to API and/or + *ABI break in future versions of the library. + **************************************************************/ +#define LZ4_HASHLOG (LZ4_MEMORY_USAGE - 2) +#define LZ4_HASHTABLESIZE (1 << LZ4_MEMORY_USAGE) +#define LZ4_HASH_SIZE_U32 \ + (1 << LZ4_HASHLOG) /* required as macro for static allocation */ + +#if defined(__cplusplus) || \ + (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +#include <stdint.h> +typedef int8_t LZ4_i8; +typedef unsigned char LZ4_byte; +typedef uint16_t LZ4_u16; +typedef uint32_t LZ4_u32; +#else +typedef signed char LZ4_i8; +typedef unsigned char LZ4_byte; +typedef unsigned short LZ4_u16; +typedef unsigned int LZ4_u32; +#endif + +/*! LZ4_stream_t : + * Never ever use below internal definitions directly ! + * These definitions are not API/ABI safe, and may change in future versions. + * If you need static allocation, declare or allocate an LZ4_stream_t object. + **/ + +typedef struct LZ4_stream_t_internal LZ4_stream_t_internal; +struct LZ4_stream_t_internal { + LZ4_u32 hashTable[LZ4_HASH_SIZE_U32]; + const LZ4_byte* dictionary; + const LZ4_stream_t_internal* dictCtx; + LZ4_u32 currentOffset; + LZ4_u32 tableType; + LZ4_u32 dictSize; + /* Implicit padding to ensure structure is aligned */ +}; + +#define LZ4_STREAM_MINSIZE \ + ((1UL << (LZ4_MEMORY_USAGE)) + \ + 32) /* static size, for inter-version compatibility */ +union LZ4_stream_u { + char minStateSize[LZ4_STREAM_MINSIZE]; + LZ4_stream_t_internal internal_donotuse; +}; /* previously typedef'd to LZ4_stream_t */ + +/*! LZ4_initStream() : v1.9.0+ + * An LZ4_stream_t structure must be initialized at least once. + * This is automatically done when invoking LZ4_createStream(), + * but it's not when the structure is simply declared on stack (for example). + * + * Use LZ4_initStream() to properly initialize a newly declared LZ4_stream_t. + * It can also initialize any arbitrary buffer of sufficient size, + * and will @return a pointer of proper type upon initialization. + * + * Note : initialization fails if size and alignment conditions are not + *respected. In which case, the function will @return NULL. Note2: An + *LZ4_stream_t structure guarantees correct alignment and size. Note3: Before + *v1.9.0, use LZ4_resetStream() instead + **/ +LZ4LIB_API LZ4_stream_t* LZ4_initStream(void* stateBuffer, size_t size); + +/*!
LZ4_streamDecode_t : + * Never ever use below internal definitions directly ! + * These definitions are not API/ABI safe, and may change in future versions. + * If you need static allocation, declare or allocate an LZ4_streamDecode_t + *object. + **/ +typedef struct { + const LZ4_byte* externalDict; + const LZ4_byte* prefixEnd; + size_t extDictSize; + size_t prefixSize; +} LZ4_streamDecode_t_internal; + +#define LZ4_STREAMDECODE_MINSIZE 32 +union LZ4_streamDecode_u { + char minStateSize[LZ4_STREAMDECODE_MINSIZE]; + LZ4_streamDecode_t_internal internal_donotuse; +}; /* previously typedef'd to LZ4_streamDecode_t */ + +/*-************************************ + * Obsolete Functions + **************************************/ + +/*! Deprecation warnings + * + * Deprecated functions make the compiler generate a warning when invoked. + * This is meant to invite users to update their source code. + * Should deprecation warnings be a problem, it is generally possible to + * disable them, typically with -Wno-deprecated-declarations for gcc or + * _CRT_SECURE_NO_WARNINGS in Visual. + * + * Another method is to define LZ4_DISABLE_DEPRECATE_WARNINGS + * before including the header file. + */ +#ifdef LZ4_DISABLE_DEPRECATE_WARNINGS +#define LZ4_DEPRECATED(message) /* disable deprecation warnings */ +#else +#if defined(__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ +#define LZ4_DEPRECATED(message) [[deprecated(message)]] +#elif defined(_MSC_VER) +#define LZ4_DEPRECATED(message) __declspec(deprecated(message)) +#elif defined(__clang__) || \ + (defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 45)) +#define LZ4_DEPRECATED(message) __attribute__((deprecated(message))) +#elif defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 31) +#define LZ4_DEPRECATED(message) __attribute__((deprecated)) +#else +#pragma message( \ + "WARNING: LZ4_DEPRECATED needs custom implementation for this compiler") +#define LZ4_DEPRECATED(message) /* disabled */ +#endif +#endif /* LZ4_DISABLE_DEPRECATE_WARNINGS */ + +/*! Obsolete compression functions (since v1.7.3) */ +LZ4_DEPRECATED("use LZ4_compress_default() instead") +LZ4LIB_API int LZ4_compress(const char* src, char* dest, int srcSize); +LZ4_DEPRECATED("use LZ4_compress_default() instead") +LZ4LIB_API int LZ4_compress_limitedOutput(const char* src, + char* dest, + int srcSize, + int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") +LZ4LIB_API int LZ4_compress_withState(void* state, + const char* source, + char* dest, + int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") +LZ4LIB_API int LZ4_compress_limitedOutput_withState(void* state, + const char* source, + char* dest, + int inputSize, + int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") +LZ4LIB_API int LZ4_compress_continue(LZ4_stream_t* LZ4_streamPtr, + const char* source, + char* dest, + int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") +LZ4LIB_API int LZ4_compress_limitedOutput_continue(LZ4_stream_t* LZ4_streamPtr, + const char* source, + char* dest, + int inputSize, + int maxOutputSize); + +/*! 
Obsolete decompression functions (since v1.8.0) */ +LZ4_DEPRECATED("use LZ4_decompress_fast() instead") +LZ4LIB_API int LZ4_uncompress(const char* source, char* dest, int outputSize); +LZ4_DEPRECATED("use LZ4_decompress_safe() instead") +LZ4LIB_API int LZ4_uncompress_unknownOutputSize(const char* source, + char* dest, + int isize, + int maxOutputSize); + +/* Obsolete streaming functions (since v1.7.0) + * degraded functionality; do not use! + * + * In order to perform streaming compression, these functions depended on data + * that is no longer tracked in the state. They have been preserved as well as + * possible: using them will still produce a correct output. However, they don't + * actually retain any history between compression calls. The compression ratio + * achieved will therefore be no better than compressing each chunk + * independently. + */ +LZ4_DEPRECATED("Use LZ4_createStream() instead") +LZ4LIB_API void* LZ4_create(char* inputBuffer); +LZ4_DEPRECATED("Use LZ4_createStream() instead") +LZ4LIB_API int LZ4_sizeofStreamState(void); +LZ4_DEPRECATED("Use LZ4_resetStream() instead") +LZ4LIB_API int LZ4_resetStreamState(void* state, char* inputBuffer); +LZ4_DEPRECATED("Use LZ4_saveDict() instead") +LZ4LIB_API char* LZ4_slideInputBuffer(void* state); + +/*! Obsolete streaming decoding functions (since v1.7.0) */ +LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") +LZ4LIB_API int LZ4_decompress_safe_withPrefix64k(const char* src, + char* dst, + int compressedSize, + int maxDstSize); +LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") +LZ4LIB_API int +LZ4_decompress_fast_withPrefix64k(const char* src, char* dst, int originalSize); + +/*! Obsolete LZ4_decompress_fast variants (since v1.9.0) : + * These functions used to be faster than LZ4_decompress_safe(), + * but this is no longer the case. They are now slower. + * This is because LZ4_decompress_fast() doesn't know the input size, + * and therefore must progress more cautiously into the input buffer to not + * read beyond the end of block. On top of that `LZ4_decompress_fast()` is not + * protected vs malformed or malicious inputs, making it a security liability. + * As a consequence, LZ4_decompress_fast() is strongly discouraged, and + * deprecated. + * + * The last remaining LZ4_decompress_fast() specificity is that + * it can decompress a block without knowing its compressed size. + * Such functionality can be achieved in a more secure manner + * by employing LZ4_decompress_safe_partial(). + * + * Parameters: + * originalSize : is the uncompressed size to regenerate. + * `dst` must be already allocated, its size must be >= + * 'originalSize' bytes. + * @return : number of bytes read from source buffer (== compressed size). + * The function expects to finish at block's end exactly. + * If the source stream is detected malformed, the function stops + * decoding and returns a negative result. note : LZ4_decompress_fast*() + * requires originalSize. Thanks to this information, it never writes past the + * output buffer. However, since it doesn't know its 'src' size, it may read an + * unknown amount of input, past input buffer bounds. Also, since match offsets + * are not validated, match reads from 'src' may underflow too. These issues + * never happen if input (compressed) data is correct. But they may happen if + * input data is invalid (error or intentional tampering). As a consequence, use + * these functions in trusted environments with trusted data **only**. 
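An illustrative migration sketch for the recommendation above; the caller must additionally know the block's compressed size, and the return value differs (bytes written rather than bytes read):

    #include "lz4.h"

    /* Before: LZ4_decompress_fast(src, dst, originalSize);  (unsafe, deprecated) */
    static int decode_block_safely(const char* src, int compressedSize,
                                   char* dst, int originalSize)
    {
        return LZ4_decompress_safe_partial(src, dst, compressedSize,
                                           originalSize, originalSize);
    }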
+ */
+LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using "
+               "LZ4_decompress_safe_partial() instead")
+LZ4LIB_API int
+LZ4_decompress_fast(const char* src, char* dst, int originalSize);
+LZ4_DEPRECATED("This function is deprecated and unsafe. Consider migrating "
+               "towards LZ4_decompress_safe_continue() instead. "
+               "Note that the contract will change (requires block's "
+               "compressed size, instead of decompressed size)")
+LZ4LIB_API int
+LZ4_decompress_fast_continue(LZ4_streamDecode_t* LZ4_streamDecode,
+                             const char* src,
+                             char* dst,
+                             int originalSize);
+LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using "
+               "LZ4_decompress_safe_partial_usingDict() instead")
+LZ4LIB_API int LZ4_decompress_fast_usingDict(const char* src,
+                                             char* dst,
+                                             int originalSize,
+                                             const char* dictStart,
+                                             int dictSize);
+
+/*! LZ4_resetStream() :
+ * An LZ4_stream_t structure must be initialized at least once.
+ * This is done with LZ4_initStream(), or LZ4_resetStream().
+ * Consider switching to LZ4_initStream(),
+ * invoking LZ4_resetStream() will trigger deprecation warnings in the future.
+ */
+LZ4LIB_API void LZ4_resetStream(LZ4_stream_t* streamPtr);
+
+#endif /* LZ4_H_98237428734687 */
+
+#if defined(__cplusplus)
+}
+#endif
diff --git a/tracegrind/main.c b/tracegrind/main.c
new file mode 100644
index 000000000..91d2b9498
--- /dev/null
+++ b/tracegrind/main.c
@@ -0,0 +1,2123 @@
+
+/*--------------------------------------------------------------------*/
+/*--- Tracegrind ---*/
+/*--- main.c ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+   This file is part of Tracegrind, a Valgrind tool for call graph
+   profiling programs.
+
+   Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
+
+   This tool is derived from and contains code from Cachegrind
+   Copyright (C) 2002-2017 Nicholas Nethercote (njn@valgrind.org)
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 3 of the
+   License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+   The GNU General Public License is contained in the file COPYING.
+*/
+
+#include "config.h"
+#include "global.h"
+#include "tracegrind.h"
+
+#include "pub_tool_gdbserver.h"
+#include "pub_tool_threadstate.h"
+#include "pub_tool_transtab.h" // VG_(discard_translations_safely)
+
+#include "cg_branchpred.c"
+
+/*------------------------------------------------------------*/
+/*--- Global variables ---*/
+/*------------------------------------------------------------*/
+
+/* for all threads */
+CommandLineOptions TG_(clo);
+Statistics TG_(stat);
+Bool TG_(instrument_state) = True; /* Instrumentation on ? */
+
+/* thread and signal handler specific */
+exec_state TG_(current_state);
+
+/* min of L1 and LL cache line sizes. This only gets set to a
+   non-zero value if we are doing cache simulation. */
+Int TG_(min_line_size) = 0;
+
+/*------------------------------------------------------------*/
+/*--- Statistics ---*/
+/*------------------------------------------------------------*/
+
+static void TG_(init_statistics)(Statistics* s)
+{
+  s->call_counter = 0;
+  s->jcnd_counter = 0;
+  s->jump_counter = 0;
+  s->rec_call_counter = 0;
+  s->ret_counter = 0;
+  s->bb_executions = 0;
+
+  s->context_counter = 0;
+  s->bb_retranslations = 0;
+
+  s->distinct_objs = 0;
+  s->distinct_files = 0;
+  s->distinct_fns = 0;
+  s->distinct_contexts = 0;
+  s->distinct_bbs = 0;
+  s->distinct_bbccs = 0;
+  s->distinct_instrs = 0;
+  s->distinct_skips = 0;
+
+  s->bb_hash_resizes = 0;
+  s->bbcc_hash_resizes = 0;
+  s->jcc_hash_resizes = 0;
+  s->cxt_hash_resizes = 0;
+  s->fn_array_resizes = 0;
+  s->call_stack_resizes = 0;
+  s->fn_stack_resizes = 0;
+
+  s->full_debug_BBs = 0;
+  s->file_line_debug_BBs = 0;
+  s->fn_name_debug_BBs = 0;
+  s->no_debug_BBs = 0;
+  s->bbcc_lru_misses = 0;
+  s->jcc_lru_misses = 0;
+  s->cxt_lru_misses = 0;
+  s->bbcc_clones = 0;
+}
+
+/*------------------------------------------------------------*/
+/*--- Simple callbacks (not cache simulator) ---*/
+/*------------------------------------------------------------*/
+
+VG_REGPARM(1)
+static void log_global_event(InstrInfo* ii)
+{
+  ULong* cost_Bus;
+
+  TG_DEBUG(6, "log_global_event: Ir %#lx/%u\n",
+           TG_(bb_base) + ii->instr_offset, ii->instr_size);
+
+  if (!TG_(current_state).collect)
+    return;
+
+  TG_ASSERT((ii->eventset->mask & (1u << EG_BUS)) > 0);
+
+  TG_(current_state).cost[fullOffset(EG_BUS)]++;
+
+  if (TG_(current_state).nonskipped)
+    cost_Bus = TG_(current_state).nonskipped->skipped + fullOffset(EG_BUS);
+  else
+    cost_Bus =
+        TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BUS];
+  cost_Bus[0]++;
+}
+
+/* For branches, we consult two different predictors, one which
+   predicts taken/untaken for conditional branches, and the other
+   which predicts the branch target address for indirect branches
+   (jump-to-register style ones).
*/ + +static VG_REGPARM(2) void log_cond_branch(InstrInfo* ii, Word taken) +{ + Bool miss; + Int fullOffset_Bc; + ULong* cost_Bc; + + TG_DEBUG(6, "log_cond_branch: Ir %#lx, taken %ld\n", + TG_(bb_base) + ii->instr_offset, taken); + + miss = 1 & do_cond_branch_predict(TG_(bb_base) + ii->instr_offset, taken); + + if (!TG_(current_state).collect) + return; + + TG_ASSERT((ii->eventset->mask & (1u << EG_BC)) > 0); + + if (TG_(current_state).nonskipped) + cost_Bc = TG_(current_state).nonskipped->skipped + fullOffset(EG_BC); + else + cost_Bc = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BC]; + + fullOffset_Bc = fullOffset(EG_BC); + TG_(current_state).cost[fullOffset_Bc]++; + cost_Bc[0]++; + if (miss) { + TG_(current_state).cost[fullOffset_Bc + 1]++; + cost_Bc[1]++; + } +} + +static VG_REGPARM(2) void log_ind_branch(InstrInfo* ii, UWord actual_dst) +{ + Bool miss; + Int fullOffset_Bi; + ULong* cost_Bi; + + TG_DEBUG(6, "log_ind_branch: Ir %#lx, dst %#lx\n", + TG_(bb_base) + ii->instr_offset, actual_dst); + + miss = + 1 & do_ind_branch_predict(TG_(bb_base) + ii->instr_offset, actual_dst); + + if (!TG_(current_state).collect) + return; + + TG_ASSERT((ii->eventset->mask & (1u << EG_BI)) > 0); + + if (TG_(current_state).nonskipped) + cost_Bi = TG_(current_state).nonskipped->skipped + fullOffset(EG_BI); + else + cost_Bi = TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BI]; + + fullOffset_Bi = fullOffset(EG_BI); + TG_(current_state).cost[fullOffset_Bi]++; + cost_Bi[0]++; + if (miss) { + TG_(current_state).cost[fullOffset_Bi + 1]++; + cost_Bi[1]++; + } +} + +/*------------------------------------------------------------*/ +/*--- Instrumentation structures and event queue handling ---*/ +/*------------------------------------------------------------*/ + +/* Maintain an ordered list of memory events which are outstanding, in + the sense that no IR has yet been generated to do the relevant + helper calls. The BB is scanned top to bottom and memory events + are added to the end of the list, merging with the most recent + notified event where possible (Dw immediately following Dr and + having the same size and EA can be merged). + + This merging is done so that for architectures which have + load-op-store instructions (x86, amd64), the insn is treated as if + it makes just one memory reference (a modify), rather than two (a + read followed by a write at the same address). + + At various points the list will need to be flushed, that is, IR + generated from it. That must happen before any possible exit from + the block (the end, or an IRStmt_Exit). Flushing also takes place + when there is no space to add a new event. + + If we require the simulation statistics to be up to date with + respect to possible memory exceptions, then the list would have to + be flushed before each memory reference. That would however lose + performance by inhibiting event-merging during flushing. + + Flushing the list consists of walking it start to end and emitting + instrumentation IR for each event, in the order in which they + appear. It may be possible to emit a single call for two adjacent + events in order to reduce the number of helper function calls made. + For example, it could well be profitable to handle two adjacent Ir + events with a single helper call. 
*/ + +typedef IRExpr IRAtom; + +typedef enum { + Ev_Ir, // Instruction read + Ev_Dr, // Data read + Ev_Dw, // Data write + Ev_Dm, // Data modify (read then write) + Ev_Bc, // branch conditional + Ev_Bi, // branch indirect (to unknown destination) + Ev_G // Global bus event +} EventTag; + +typedef struct { + EventTag tag; + InstrInfo* inode; + union { + struct { + } Ir; + struct { + IRAtom* ea; + Int szB; + } Dr; + struct { + IRAtom* ea; + Int szB; + } Dw; + struct { + IRAtom* ea; + Int szB; + } Dm; + struct { + IRAtom* taken; /* :: Ity_I1 */ + } Bc; + struct { + IRAtom* dst; + } Bi; + struct { + } G; + } Ev; +} Event; + +static void init_Event(Event* ev) { VG_(memset)(ev, 0, sizeof(Event)); } + +static IRAtom* get_Event_dea(Event* ev) +{ + switch (ev->tag) { + case Ev_Dr: + return ev->Ev.Dr.ea; + case Ev_Dw: + return ev->Ev.Dw.ea; + case Ev_Dm: + return ev->Ev.Dm.ea; + default: + tl_assert(0); + } +} + +static Int get_Event_dszB(Event* ev) +{ + switch (ev->tag) { + case Ev_Dr: + return ev->Ev.Dr.szB; + case Ev_Dw: + return ev->Ev.Dw.szB; + case Ev_Dm: + return ev->Ev.Dm.szB; + default: + tl_assert(0); + } +} + +/* Up to this many unnotified events are allowed. Number is + arbitrary. Larger numbers allow more event merging to occur, but + potentially induce more spilling due to extending live ranges of + address temporaries. */ +#define N_EVENTS 16 + +/* A struct which holds all the running state during instrumentation. + Mostly to avoid passing loads of parameters everywhere. */ +typedef struct { + /* The current outstanding-memory-event list. */ + Event events[N_EVENTS]; + Int events_used; + + /* The array of InstrInfo's is part of BB struct. */ + BB* bb; + + /* BB seen before (ie. re-instrumentation) */ + Bool seen_before; + + /* Number InstrInfo bins 'used' so far. */ + UInt ii_index; + + // current offset of guest instructions from BB start + UInt instr_offset; + + /* The output SB being constructed. */ + IRSB* sbOut; +} ClgState; + +static void showEvent(Event* ev) +{ + switch (ev->tag) { + case Ev_Ir: + VG_(printf)("Ir (InstrInfo %p) at +%u\n", ev->inode, + ev->inode->instr_offset); + break; + case Ev_Dr: + VG_(printf)("Dr (InstrInfo %p) at +%u %d EA=", ev->inode, + ev->inode->instr_offset, ev->Ev.Dr.szB); + ppIRExpr(ev->Ev.Dr.ea); + VG_(printf)("\n"); + break; + case Ev_Dw: + VG_(printf)("Dw (InstrInfo %p) at +%u %d EA=", ev->inode, + ev->inode->instr_offset, ev->Ev.Dw.szB); + ppIRExpr(ev->Ev.Dw.ea); + VG_(printf)("\n"); + break; + case Ev_Dm: + VG_(printf)("Dm (InstrInfo %p) at +%u %d EA=", ev->inode, + ev->inode->instr_offset, ev->Ev.Dm.szB); + ppIRExpr(ev->Ev.Dm.ea); + VG_(printf)("\n"); + break; + case Ev_Bc: + VG_(printf)("Bc %p GA=", ev->inode); + ppIRExpr(ev->Ev.Bc.taken); + VG_(printf)("\n"); + break; + case Ev_Bi: + VG_(printf)("Bi %p DST=", ev->inode); + ppIRExpr(ev->Ev.Bi.dst); + VG_(printf)("\n"); + break; + case Ev_G: + VG_(printf)("G %p\n", ev->inode); + break; + default: + tl_assert(0); + break; + } +} + +/* Generate code for all outstanding memory events, and mark the queue + empty. Code is generated into cgs->sbOut, and this activity + 'consumes' slots in cgs->bb. 
*/ + +static void flushEvents(ClgState* clgs) +{ + Int i, regparms, inew; + const HChar* helperName; + void* helperAddr; + IRExpr** argv; + IRExpr* i_node_expr; + IRDirty* di; + Event* ev; + Event* ev2; + Event* ev3; + + if (!clgs->seen_before) { + // extend event sets as needed + // available sets: D0 Dr + for (i = 0; i < clgs->events_used; i++) { + ev = &clgs->events[i]; + switch (ev->tag) { + case Ev_Ir: + // Ir event always is first for a guest instruction + TG_ASSERT(ev->inode->eventset == 0); + ev->inode->eventset = TG_(sets).base; + break; + case Ev_Dr: + // extend event set by Dr counters + ev->inode->eventset = + TG_(add_event_group)(ev->inode->eventset, EG_DR); + break; + case Ev_Dw: + case Ev_Dm: + // extend event set by Dw counters + ev->inode->eventset = + TG_(add_event_group)(ev->inode->eventset, EG_DW); + break; + case Ev_Bc: + // extend event set by Bc counters + ev->inode->eventset = + TG_(add_event_group)(ev->inode->eventset, EG_BC); + break; + case Ev_Bi: + // extend event set by Bi counters + ev->inode->eventset = + TG_(add_event_group)(ev->inode->eventset, EG_BI); + break; + case Ev_G: + // extend event set by Bus counter + ev->inode->eventset = + TG_(add_event_group)(ev->inode->eventset, EG_BUS); + break; + default: + tl_assert(0); + } + } + } + + for (i = 0; i < clgs->events_used; i = inew) { + + helperName = NULL; + helperAddr = NULL; + argv = NULL; + regparms = 0; + + /* generate IR to notify event i and possibly the ones + immediately following it. */ + tl_assert(i >= 0 && i < clgs->events_used); + + ev = &clgs->events[i]; + ev2 = (i < clgs->events_used - 1 ? &clgs->events[i + 1] : NULL); + ev3 = (i < clgs->events_used - 2 ? &clgs->events[i + 2] : NULL); + + TG_DEBUGIF(5) + { + VG_(printf)(" flush "); + showEvent(ev); + } + + i_node_expr = mkIRExpr_HWord((HWord)ev->inode); + + /* Decide on helper fn to call and args to pass it, and advance + i appropriately. + Dm events have same effect as Dw events */ + switch (ev->tag) { + case Ev_Ir: + /* Merge an Ir with a following Dr. */ + if (ev2 && ev2->tag == Ev_Dr) { + /* Why is this true? It's because we're merging an Ir + with a following Dr. The Ir derives from the + instruction's IMark and the Dr from data + references which follow it. In short it holds + because each insn starts with an IMark, hence an + Ev_Ir, and so these Dr must pertain to the + immediately preceding Ir. Same applies to analogous + assertions in the subsequent cases. */ + tl_assert(ev2->inode == ev->inode); + helperName = TG_(cachesim).log_1I1Dr_name; + helperAddr = TG_(cachesim).log_1I1Dr; + argv = mkIRExprVec_3(i_node_expr, get_Event_dea(ev2), + mkIRExpr_HWord(get_Event_dszB(ev2))); + regparms = 3; + inew = i + 2; + } + /* Merge an Ir with a following Dw/Dm. */ + else if (ev2 && (ev2->tag == Ev_Dw || ev2->tag == Ev_Dm)) { + tl_assert(ev2->inode == ev->inode); + helperName = TG_(cachesim).log_1I1Dw_name; + helperAddr = TG_(cachesim).log_1I1Dw; + argv = mkIRExprVec_3(i_node_expr, get_Event_dea(ev2), + mkIRExpr_HWord(get_Event_dszB(ev2))); + regparms = 3; + inew = i + 2; + } + /* Merge an Ir with two following Irs. */ + else if (ev2 && ev3 && ev2->tag == Ev_Ir && ev3->tag == Ev_Ir) { + helperName = TG_(cachesim).log_3I0D_name; + helperAddr = TG_(cachesim).log_3I0D; + argv = mkIRExprVec_3(i_node_expr, mkIRExpr_HWord((HWord)ev2->inode), + mkIRExpr_HWord((HWord)ev3->inode)); + regparms = 3; + inew = i + 3; + } + /* Merge an Ir with one following Ir. 
*/ + else if (ev2 && ev2->tag == Ev_Ir) { + helperName = TG_(cachesim).log_2I0D_name; + helperAddr = TG_(cachesim).log_2I0D; + argv = + mkIRExprVec_2(i_node_expr, mkIRExpr_HWord((HWord)ev2->inode)); + regparms = 2; + inew = i + 2; + } + /* No merging possible; emit as-is. */ + else { + helperName = TG_(cachesim).log_1I0D_name; + helperAddr = TG_(cachesim).log_1I0D; + argv = mkIRExprVec_1(i_node_expr); + regparms = 1; + inew = i + 1; + } + break; + case Ev_Dr: + /* Data read or modify */ + helperName = TG_(cachesim).log_0I1Dr_name; + helperAddr = TG_(cachesim).log_0I1Dr; + argv = mkIRExprVec_3(i_node_expr, get_Event_dea(ev), + mkIRExpr_HWord(get_Event_dszB(ev))); + regparms = 3; + inew = i + 1; + break; + case Ev_Dw: + case Ev_Dm: + /* Data write */ + helperName = TG_(cachesim).log_0I1Dw_name; + helperAddr = TG_(cachesim).log_0I1Dw; + argv = mkIRExprVec_3(i_node_expr, get_Event_dea(ev), + mkIRExpr_HWord(get_Event_dszB(ev))); + regparms = 3; + inew = i + 1; + break; + case Ev_Bc: + /* Conditional branch */ + helperName = "log_cond_branch"; + helperAddr = &log_cond_branch; + argv = mkIRExprVec_2(i_node_expr, ev->Ev.Bc.taken); + regparms = 2; + inew = i + 1; + break; + case Ev_Bi: + /* Branch to an unknown destination */ + helperName = "log_ind_branch"; + helperAddr = &log_ind_branch; + argv = mkIRExprVec_2(i_node_expr, ev->Ev.Bi.dst); + regparms = 2; + inew = i + 1; + break; + case Ev_G: + /* Global bus event (CAS, LOCK-prefix, LL-SC, etc) */ + helperName = "log_global_event"; + helperAddr = &log_global_event; + argv = mkIRExprVec_1(i_node_expr); + regparms = 1; + inew = i + 1; + break; + default: + tl_assert(0); + } + + TG_DEBUGIF(5) + { + if (inew > i + 1) { + VG_(printf)(" merge "); + showEvent(ev2); + } + if (inew > i + 2) { + VG_(printf)(" merge "); + showEvent(ev3); + } + if (helperAddr) + VG_(printf)(" call %s (%p)\n", helperName, helperAddr); + } + + /* helper could be unset depending on the simulator used */ + if (helperAddr == 0) + continue; + + /* Add the helper. 
*/ + tl_assert(helperName); + tl_assert(helperAddr); + tl_assert(argv); + di = unsafeIRDirty_0_N(regparms, helperName, + VG_(fnptr_to_fnentry)(helperAddr), argv); + addStmtToIRSB(clgs->sbOut, IRStmt_Dirty(di)); + } + + clgs->events_used = 0; +} + +static void addEvent_Ir(ClgState* clgs, InstrInfo* inode) +{ + Event* evt; + tl_assert(clgs->seen_before || (inode->eventset == 0)); + if (!TG_(clo).simulate_cache) + return; + + if (clgs->events_used == N_EVENTS) + flushEvents(clgs); + tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS); + evt = &clgs->events[clgs->events_used]; + init_Event(evt); + evt->tag = Ev_Ir; + evt->inode = inode; + clgs->events_used++; +} + +static void +addEvent_Dr(ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea) +{ + Event* evt; + tl_assert(isIRAtom(ea)); + tl_assert(datasize >= 1); + if (!TG_(clo).simulate_cache) + return; + tl_assert(datasize <= TG_(min_line_size)); + + if (clgs->events_used == N_EVENTS) + flushEvents(clgs); + tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS); + evt = &clgs->events[clgs->events_used]; + init_Event(evt); + evt->tag = Ev_Dr; + evt->inode = inode; + evt->Ev.Dr.szB = datasize; + evt->Ev.Dr.ea = ea; + clgs->events_used++; +} + +static void +addEvent_Dw(ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea) +{ + Event* evt; + tl_assert(isIRAtom(ea)); + tl_assert(datasize >= 1); + if (!TG_(clo).simulate_cache) + return; + tl_assert(datasize <= TG_(min_line_size)); + + /* Is it possible to merge this write with the preceding read? */ + if (clgs->events_used > 0) { + Event* lastEvt = &clgs->events[clgs->events_used - 1]; + if (lastEvt->tag == Ev_Dr && lastEvt->Ev.Dr.szB == datasize && + lastEvt->inode == inode && eqIRAtom(lastEvt->Ev.Dr.ea, ea)) { + lastEvt->tag = Ev_Dm; + return; + } + } + + /* No. Add as normal. */ + if (clgs->events_used == N_EVENTS) + flushEvents(clgs); + tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS); + evt = &clgs->events[clgs->events_used]; + init_Event(evt); + evt->tag = Ev_Dw; + evt->inode = inode; + evt->Ev.Dw.szB = datasize; + evt->Ev.Dw.ea = ea; + clgs->events_used++; +} + +static void addEvent_D_guarded(ClgState* clgs, + InstrInfo* inode, + Int datasize, + IRAtom* ea, + IRAtom* guard, + Bool isWrite) +{ + tl_assert(isIRAtom(ea)); + tl_assert(guard); + tl_assert(isIRAtom(guard)); + tl_assert(datasize >= 1); + if (!TG_(clo).simulate_cache) + return; + tl_assert(datasize <= TG_(min_line_size)); + + /* Adding guarded memory actions and merging them with the existing + queue is too complex. Simply flush the queue and add this + action immediately. Since guarded loads and stores are pretty + rare, this is not thought likely to cause any noticeable + performance loss as a result of the loss of event-merging + opportunities. */ + tl_assert(clgs->events_used >= 0); + flushEvents(clgs); + tl_assert(clgs->events_used == 0); + /* Same as case Ev_Dw / case Ev_Dr in flushEvents, except with guard */ + IRExpr* i_node_expr; + const HChar* helperName; + void* helperAddr; + IRExpr** argv; + Int regparms; + IRDirty* di; + i_node_expr = mkIRExpr_HWord((HWord)inode); + helperName = + isWrite ? TG_(cachesim).log_0I1Dw_name : TG_(cachesim).log_0I1Dr_name; + helperAddr = isWrite ? 
TG_(cachesim).log_0I1Dw : TG_(cachesim).log_0I1Dr; + argv = mkIRExprVec_3(i_node_expr, ea, mkIRExpr_HWord(datasize)); + regparms = 3; + di = unsafeIRDirty_0_N(regparms, helperName, + VG_(fnptr_to_fnentry)(helperAddr), argv); + di->guard = guard; + addStmtToIRSB(clgs->sbOut, IRStmt_Dirty(di)); +} + +static void addEvent_Bc(ClgState* clgs, InstrInfo* inode, IRAtom* guard) +{ + Event* evt; + tl_assert(isIRAtom(guard)); + tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, guard) == + (sizeof(RegWord) == 4 ? Ity_I32 : Ity_I64)); + if (!TG_(clo).simulate_branch) + return; + + if (clgs->events_used == N_EVENTS) + flushEvents(clgs); + tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS); + evt = &clgs->events[clgs->events_used]; + init_Event(evt); + evt->tag = Ev_Bc; + evt->inode = inode; + evt->Ev.Bc.taken = guard; + clgs->events_used++; +} + +static void addEvent_Bi(ClgState* clgs, InstrInfo* inode, IRAtom* whereTo) +{ + Event* evt; + tl_assert(isIRAtom(whereTo)); + tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, whereTo) == + (sizeof(RegWord) == 4 ? Ity_I32 : Ity_I64)); + if (!TG_(clo).simulate_branch) + return; + + if (clgs->events_used == N_EVENTS) + flushEvents(clgs); + tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS); + evt = &clgs->events[clgs->events_used]; + init_Event(evt); + evt->tag = Ev_Bi; + evt->inode = inode; + evt->Ev.Bi.dst = whereTo; + clgs->events_used++; +} + +static void addEvent_G(ClgState* clgs, InstrInfo* inode) +{ + Event* evt; + if (!TG_(clo).collect_bus) + return; + + if (clgs->events_used == N_EVENTS) + flushEvents(clgs); + tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS); + evt = &clgs->events[clgs->events_used]; + init_Event(evt); + evt->tag = Ev_G; + evt->inode = inode; + clgs->events_used++; +} + +/* Initialise or check (if already seen before) an InstrInfo for next insn. + We only can set instr_offset/instr_size here. The required event set and + resulting cost offset depend on events (Ir/Dr/Dw/Dm) in guest + instructions. The event set is extended as required on flush of the event + queue (when Dm events were determined), cost offsets are determined at + end of BB instrumentation. */ +static InstrInfo* next_InstrInfo(ClgState* clgs, UInt instr_size) +{ + InstrInfo* ii; + tl_assert(clgs->ii_index < clgs->bb->instr_count); + ii = &clgs->bb->instr[clgs->ii_index]; + + if (clgs->seen_before) { + TG_ASSERT(ii->instr_offset == clgs->instr_offset); + TG_ASSERT(ii->instr_size == instr_size); + } else { + ii->instr_offset = clgs->instr_offset; + ii->instr_size = instr_size; + ii->cost_offset = 0; + ii->eventset = 0; + } + + clgs->ii_index++; + clgs->instr_offset += instr_size; + TG_(stat).distinct_instrs++; + + return ii; +} + +// return total number of cost values needed for this BB +static UInt update_cost_offsets(ClgState* clgs) +{ + Int i; + InstrInfo* ii; + UInt cost_offset = 0; + + TG_ASSERT(clgs->bb->instr_count == clgs->ii_index); + for (i = 0; i < clgs->ii_index; i++) { + ii = &clgs->bb->instr[i]; + if (clgs->seen_before) { + TG_ASSERT(ii->cost_offset == cost_offset); + } else + ii->cost_offset = cost_offset; + cost_offset += ii->eventset ? 
ii->eventset->size : 0;
+  }
+
+  return cost_offset;
+}
+
+/*------------------------------------------------------------*/
+/*--- Instrumentation ---*/
+/*------------------------------------------------------------*/
+
+#if defined(VG_BIGENDIAN)
+#define CLGEndness Iend_BE
+#elif defined(VG_LITTLEENDIAN)
+#define CLGEndness Iend_LE
+#else
+#error "Unknown endianness"
+#endif
+
+static Addr IRConst2Addr(IRConst* con)
+{
+  Addr addr;
+
+  if (sizeof(RegWord) == 4) {
+    TG_ASSERT(con->tag == Ico_U32);
+    addr = con->Ico.U32;
+  } else if (sizeof(RegWord) == 8) {
+    TG_ASSERT(con->tag == Ico_U64);
+    addr = con->Ico.U64;
+  } else
+    VG_(tool_panic)("Tracegrind: invalid Addr type");
+
+  return addr;
+}
+
+/* First pass over a BB to instrument, counting instructions and jumps.
+ * This is needed for the size of the BB struct to allocate.
+ *
+ * Called from TG_(get_bb)
+ */
+void TG_(collectBlockInfo)(IRSB* sbIn,
+                           /*INOUT*/ UInt* instrs,
+                           /*INOUT*/ UInt* cjmps,
+                           /*INOUT*/ Bool* cjmp_inverted)
+{
+  Int i;
+  IRStmt* st;
+  Addr instrAddr = 0, jumpDst;
+  UInt instrLen = 0;
+  Bool toNextInstr = False;
+
+  // Ist_Exit has to be ignored in preamble code, before first IMark:
+  // preamble code is added by VEX for self-modifying code, and has
+  // nothing to do with client code
+  Bool inPreamble = True;
+
+  if (!sbIn)
+    return;
+
+  for (i = 0; i < sbIn->stmts_used; i++) {
+    st = sbIn->stmts[i];
+    if (Ist_IMark == st->tag) {
+      inPreamble = False;
+
+      instrAddr = st->Ist.IMark.addr;
+      instrLen = st->Ist.IMark.len;
+
+      (*instrs)++;
+      toNextInstr = False;
+    }
+    if (inPreamble)
+      continue;
+    if (Ist_Exit == st->tag) {
+      jumpDst = IRConst2Addr(st->Ist.Exit.dst);
+      toNextInstr = (jumpDst == instrAddr + instrLen);
+
+      (*cjmps)++;
+    }
+  }
+
+  /* if the last instruction of a BB conditionally jumps to the next
+   * instruction (= first instruction of next BB in memory), this is
+   * inverted by VEX.
+   */
+  *cjmp_inverted = toNextInstr;
+}
+
+static void
+addConstMemStoreStmt(IRSB* bbOut, UWord addr, UInt val, IRType hWordTy)
+{
+  addStmtToIRSB(
+      bbOut, IRStmt_Store(CLGEndness,
+                          IRExpr_Const(hWordTy == Ity_I32 ? IRConst_U32(addr)
+                                                          : IRConst_U64(addr)),
+                          IRExpr_Const(IRConst_U32(val))));
+}
+
+/* add helper call to setup_bbcc, with pointer to BB struct as argument
+ *
+ * precondition for setup_bbcc:
+ * - jmps_passed has number of cond.jumps passed in last executed BB
+ * - current_bbcc has a pointer to the BBCC of the last executed BB
+ *   Thus, if bbcc_jmpkind is != -1 (JmpNone),
+ *     current_bbcc->bb->jmp_addr
+ *   gives the address of the jump source.
+ * + * the setup does 2 things: + * - trace call: + * * Unwind own call stack, i.e sync our ESP with real ESP + * This is for ESP manipulation (longjmps, C++ exec handling) and RET + * * For CALLs or JMPs crossing objects, record call arg + + * push are on own call stack + * + * - prepare for cache log functions: + * set current_bbcc to BBCC that gets the costs for this BB execution + * attached + */ +static void addBBSetupCall(ClgState* clgs) +{ + IRDirty* di; + IRExpr * arg1, **argv; + + arg1 = mkIRExpr_HWord((HWord)clgs->bb); + argv = mkIRExprVec_1(arg1); + di = unsafeIRDirty_0_N(1, "setup_bbcc", + VG_(fnptr_to_fnentry)(&TG_(setup_bbcc)), argv); + addStmtToIRSB(clgs->sbOut, IRStmt_Dirty(di)); +} + +static IRSB* TG_(instrument)(VgCallbackClosure* closure, + IRSB* sbIn, + const VexGuestLayout* layout, + const VexGuestExtents* vge, + const VexArchInfo* archinfo_host, + IRType gWordTy, + IRType hWordTy) +{ + Int i; + IRStmt* st; + Addr origAddr; + InstrInfo* curr_inode = NULL; + ClgState clgs; + UInt cJumps = 0; + IRTypeEnv* tyenv = sbIn->tyenv; + + if (gWordTy != hWordTy) { + /* We don't currently support this case. */ + VG_(tool_panic)("host/guest word size mismatch"); + } + + // No instrumentation if it is switched off + if (!TG_(instrument_state)) { + TG_DEBUG(5, "instrument(BB %#lx) [Instrumentation OFF]\n", + (Addr)closure->readdr); + return sbIn; + } + + TG_DEBUG(3, "+ instrument(BB %#lx)\n", (Addr)closure->readdr); + + /* Set up SB for instrumented IR */ + clgs.sbOut = deepCopyIRSBExceptStmts(sbIn); + + // Copy verbatim any IR preamble preceding the first IMark + i = 0; + while (i < sbIn->stmts_used && sbIn->stmts[i]->tag != Ist_IMark) { + addStmtToIRSB(clgs.sbOut, sbIn->stmts[i]); + i++; + } + + // Get the first statement, and origAddr from it + TG_ASSERT(sbIn->stmts_used > 0); + TG_ASSERT(i < sbIn->stmts_used); + st = sbIn->stmts[i]; + TG_ASSERT(Ist_IMark == st->tag); + + origAddr = st->Ist.IMark.addr + st->Ist.IMark.delta; + TG_ASSERT(origAddr == st->Ist.IMark.addr + + st->Ist.IMark.delta); // XXX: check no overflow + + /* Get BB struct (creating if necessary). + * JS: The hash table is keyed with orig_addr_noredir -- important! + * JW: Why? If it is because of different chasing of the redirection, + * this is not needed, as chasing is switched off in tracegrind + */ + clgs.bb = TG_(get_bb)(origAddr, sbIn, &(clgs.seen_before)); + + addBBSetupCall(&clgs); + + // Set up running state + clgs.events_used = 0; + clgs.ii_index = 0; + clgs.instr_offset = 0; + + for (/*use current i*/; i < sbIn->stmts_used; i++) { + + st = sbIn->stmts[i]; + TG_ASSERT(isFlatIRStmt(st)); + + switch (st->tag) { + case Ist_NoOp: + case Ist_AbiHint: + case Ist_Put: + case Ist_PutI: + case Ist_MBE: + break; + + case Ist_IMark: { + Addr cia = st->Ist.IMark.addr + st->Ist.IMark.delta; + UInt isize = st->Ist.IMark.len; + TG_ASSERT(clgs.instr_offset == cia - origAddr); + // If Vex fails to decode an instruction, the size will be zero. + // Pretend otherwise. + if (isize == 0) + isize = VG_MIN_INSTR_SZB; + + // Sanity-check size. + tl_assert((VG_MIN_INSTR_SZB <= isize && isize <= VG_MAX_INSTR_SZB) || + VG_CLREQ_SZB == isize); + + // Init the inode, record it as the current one. + // Subsequent Dr/Dw/Dm events from the same instruction will + // also use it. 
+ curr_inode = next_InstrInfo(&clgs, isize); + + addEvent_Ir(&clgs, curr_inode); + break; + } + + case Ist_WrTmp: { + IRExpr* data = st->Ist.WrTmp.data; + if (data->tag == Iex_Load) { + IRExpr* aexpr = data->Iex.Load.addr; + // Note also, endianness info is ignored. I guess + // that's not interesting. + addEvent_Dr(&clgs, curr_inode, sizeofIRType(data->Iex.Load.ty), + aexpr); + } + break; + } + + case Ist_Store: { + IRExpr* data = st->Ist.Store.data; + IRExpr* aexpr = st->Ist.Store.addr; + addEvent_Dw(&clgs, curr_inode, + sizeofIRType(typeOfIRExpr(sbIn->tyenv, data)), aexpr); + break; + } + + case Ist_StoreG: { + IRStoreG* sg = st->Ist.StoreG.details; + IRExpr* data = sg->data; + IRExpr* addr = sg->addr; + IRType type = typeOfIRExpr(tyenv, data); + tl_assert(type != Ity_INVALID); + addEvent_D_guarded(&clgs, curr_inode, sizeofIRType(type), addr, + sg->guard, True /*isWrite*/); + break; + } + + case Ist_LoadG: { + IRLoadG* lg = st->Ist.LoadG.details; + IRType type = Ity_INVALID; /* loaded type */ + IRType typeWide = Ity_INVALID; /* after implicit widening */ + IRExpr* addr = lg->addr; + typeOfIRLoadGOp(lg->cvt, &typeWide, &type); + tl_assert(type != Ity_INVALID); + addEvent_D_guarded(&clgs, curr_inode, sizeofIRType(type), addr, + lg->guard, False /*!isWrite*/); + break; + } + + case Ist_Dirty: { + Int dataSize; + IRDirty* d = st->Ist.Dirty.details; + if (d->mFx != Ifx_None) { + /* This dirty helper accesses memory. Collect the details. */ + tl_assert(d->mAddr != NULL); + tl_assert(d->mSize != 0); + dataSize = d->mSize; + // Large (eg. 28B, 108B, 512B on x86) data-sized + // instructions will be done inaccurately, but they're + // very rare and this avoids errors from hitting more + // than two cache lines in the simulation. + if (TG_(clo).simulate_cache && dataSize > TG_(min_line_size)) + dataSize = TG_(min_line_size); + if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) + addEvent_Dr(&clgs, curr_inode, dataSize, d->mAddr); + if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) + addEvent_Dw(&clgs, curr_inode, dataSize, d->mAddr); + } else { + tl_assert(d->mAddr == NULL); + tl_assert(d->mSize == 0); + } + break; + } + + case Ist_CAS: { + /* We treat it as a read and a write of the location. I + think that is the same behaviour as it was before IRCAS + was introduced, since prior to that point, the Vex + front ends would translate a lock-prefixed instruction + into a (normal) read followed by a (normal) write. */ + Int dataSize; + IRCAS* cas = st->Ist.CAS.details; + TG_ASSERT(cas->addr && isIRAtom(cas->addr)); + TG_ASSERT(cas->dataLo); + dataSize = sizeofIRType(typeOfIRExpr(sbIn->tyenv, cas->dataLo)); + if (cas->dataHi != NULL) + dataSize *= 2; /* since this is a doubleword-cas */ + addEvent_Dr(&clgs, curr_inode, dataSize, cas->addr); + addEvent_Dw(&clgs, curr_inode, dataSize, cas->addr); + addEvent_G(&clgs, curr_inode); + break; + } + + case Ist_LLSC: { + IRType dataTy; + if (st->Ist.LLSC.storedata == NULL) { + /* LL */ + dataTy = typeOfIRTemp(sbIn->tyenv, st->Ist.LLSC.result); + addEvent_Dr(&clgs, curr_inode, sizeofIRType(dataTy), + st->Ist.LLSC.addr); + /* flush events before LL, should help SC to succeed */ + flushEvents(&clgs); + } else { + /* SC */ + dataTy = typeOfIRExpr(sbIn->tyenv, st->Ist.LLSC.storedata); + addEvent_Dw(&clgs, curr_inode, sizeofIRType(dataTy), + st->Ist.LLSC.addr); + /* I don't know whether the global-bus-lock cost should + be attributed to the LL or the SC, but it doesn't + really matter since they always have to be used in + pairs anyway. 
Hence put it (quite arbitrarily) on + the SC. */ + addEvent_G(&clgs, curr_inode); + } + break; + } + + case Ist_Exit: { + Bool guest_exit, inverted; + + /* VEX code generation sometimes inverts conditional branches. + * As Tracegrind counts (conditional) jumps, it has to correct + * inversions. The heuristic is the following: + * (1) Tracegrind switches off SB chasing and unrolling, and + * therefore it assumes that a candidate for inversion only is + * the last conditional branch in an SB. + * (2) inversion is assumed if the branch jumps to the address of + * the next guest instruction in memory. + * This heuristic is precalculated in TG_(collectBlockInfo)(). + * + * Branching behavior is also used for branch prediction. Note that + * above heuristic is different from what Cachegrind does. + * Cachegrind uses (2) for all branches. + */ + if (cJumps + 1 == clgs.bb->cjmp_count) + inverted = clgs.bb->cjmp_inverted; + else + inverted = False; + + // call branch predictor only if this is a branch in guest code + guest_exit = (st->Ist.Exit.jk == Ijk_Boring) || + (st->Ist.Exit.jk == Ijk_Call) || + (st->Ist.Exit.jk == Ijk_Ret); + + if (guest_exit) { + /* Stuff to widen the guard expression to a host word, so + we can pass it to the branch predictor simulation + functions easily. */ + IRType tyW = hWordTy; + IROp widen = tyW == Ity_I32 ? Iop_1Uto32 : Iop_1Uto64; + IROp opXOR = tyW == Ity_I32 ? Iop_Xor32 : Iop_Xor64; + IRTemp guard1 = newIRTemp(clgs.sbOut->tyenv, Ity_I1); + IRTemp guardW = newIRTemp(clgs.sbOut->tyenv, tyW); + IRTemp guard = newIRTemp(clgs.sbOut->tyenv, tyW); + IRExpr* one = tyW == Ity_I32 ? IRExpr_Const(IRConst_U32(1)) + : IRExpr_Const(IRConst_U64(1)); + + /* Widen the guard expression. */ + addStmtToIRSB(clgs.sbOut, IRStmt_WrTmp(guard1, st->Ist.Exit.guard)); + addStmtToIRSB( + clgs.sbOut, + IRStmt_WrTmp(guardW, IRExpr_Unop(widen, IRExpr_RdTmp(guard1)))); + /* If the exit is inverted, invert the sense of the guard. */ + addStmtToIRSB( + clgs.sbOut, + IRStmt_WrTmp(guard, + inverted + ? IRExpr_Binop(opXOR, IRExpr_RdTmp(guardW), one) + : IRExpr_RdTmp(guardW))); + /* And post the event. */ + addEvent_Bc(&clgs, curr_inode, IRExpr_RdTmp(guard)); + } + + /* We may never reach the next statement, so need to flush + all outstanding transactions now. */ + flushEvents(&clgs); + + TG_ASSERT(clgs.ii_index > 0); + if (!clgs.seen_before) { + TgJumpKind jk; + + if (st->Ist.Exit.jk == Ijk_Call) + jk = jk_Call; + else if (st->Ist.Exit.jk == Ijk_Ret) + jk = jk_Return; + else { + if (IRConst2Addr(st->Ist.Exit.dst) == + origAddr + curr_inode->instr_offset + curr_inode->instr_size) + jk = jk_None; + else + jk = jk_Jump; + } + + clgs.bb->jmp[cJumps].instr = clgs.ii_index - 1; + clgs.bb->jmp[cJumps].jmpkind = jk; + } + + /* Update global variable jmps_passed before the jump + * A correction is needed if VEX inverted the last jump condition + */ + UInt val = inverted ? cJumps + 1 : cJumps; + addConstMemStoreStmt( + clgs.sbOut, (UWord)&TG_(current_state).jmps_passed, val, hWordTy); + cJumps++; + + break; + } + + default: + tl_assert(0); + break; + } + + /* Copy the original statement */ + addStmtToIRSB(clgs.sbOut, st); + + TG_DEBUGIF(5) + { + VG_(printf)(" pass "); + ppIRStmt(st); + VG_(printf)("\n"); + } + } + + /* Deal with branches to unknown destinations. Except ignore ones + which are function returns as we assume the return stack + predictor never mispredicts. 
*/ + if ((sbIn->jumpkind == Ijk_Boring) || (sbIn->jumpkind == Ijk_Call)) { + if (0) { + ppIRExpr(sbIn->next); + VG_(printf)("\n"); + } + switch (sbIn->next->tag) { + case Iex_Const: + break; /* boring - branch to known address */ + case Iex_RdTmp: + /* looks like an indirect branch (branch to unknown) */ + addEvent_Bi(&clgs, curr_inode, sbIn->next); + break; + default: + /* shouldn't happen - if the incoming IR is properly + flattened, should only have tmp and const cases to + consider. */ + tl_assert(0); + } + } + + /* At the end of the bb. Flush outstandings. */ + flushEvents(&clgs); + + /* Update global variable jmps_passed at end of SB. + * As TG_(current_state).jmps_passed is reset to 0 in setup_bbcc, + * this can be omitted if there is no conditional jump in this SB. + * A correction is needed if VEX inverted the last jump condition + */ + if (cJumps > 0) { + UInt jmps_passed = cJumps; + if (clgs.bb->cjmp_inverted) + jmps_passed--; + addConstMemStoreStmt(clgs.sbOut, (UWord)&TG_(current_state).jmps_passed, + jmps_passed, hWordTy); + } + TG_ASSERT(clgs.bb->cjmp_count == cJumps); + TG_ASSERT(clgs.bb->instr_count == clgs.ii_index); + + /* Info for final exit from BB */ + { + TgJumpKind jk; + + if (sbIn->jumpkind == Ijk_Call) + jk = jk_Call; + else if (sbIn->jumpkind == Ijk_Ret) + jk = jk_Return; + else { + jk = jk_Jump; + if ((sbIn->next->tag == Iex_Const) && + (IRConst2Addr(sbIn->next->Iex.Const.con) == + origAddr + clgs.instr_offset)) + jk = jk_None; + } + clgs.bb->jmp[cJumps].jmpkind = jk; + /* Instruction index of the call/ret at BB end + * (it is wrong for fall-through, but does not matter) */ + clgs.bb->jmp[cJumps].instr = clgs.ii_index - 1; + } + + /* swap information of last exit with final exit if inverted */ + if (clgs.bb->cjmp_inverted) { + TgJumpKind jk; + UInt instr; + + jk = clgs.bb->jmp[cJumps].jmpkind; + clgs.bb->jmp[cJumps].jmpkind = clgs.bb->jmp[cJumps - 1].jmpkind; + clgs.bb->jmp[cJumps - 1].jmpkind = jk; + instr = clgs.bb->jmp[cJumps].instr; + clgs.bb->jmp[cJumps].instr = clgs.bb->jmp[cJumps - 1].instr; + clgs.bb->jmp[cJumps - 1].instr = instr; + } + + if (clgs.seen_before) { + TG_ASSERT(clgs.bb->cost_count == update_cost_offsets(&clgs)); + TG_ASSERT(clgs.bb->instr_len == clgs.instr_offset); + } else { + clgs.bb->cost_count = update_cost_offsets(&clgs); + clgs.bb->instr_len = clgs.instr_offset; + } + + TG_DEBUG(3, "- instrument(BB %#lx): byteLen %u, CJumps %u, CostLen %u\n", + origAddr, clgs.bb->instr_len, clgs.bb->cjmp_count, + clgs.bb->cost_count); + if (cJumps > 0) { + TG_DEBUG(3, " [ "); + for (i = 0; i < cJumps; i++) + TG_DEBUG(3, "%u ", clgs.bb->jmp[i].instr); + TG_DEBUG(3, "], last inverted: %s \n", + clgs.bb->cjmp_inverted ? "yes" : "no"); + } + + return clgs.sbOut; +} + +/*--------------------------------------------------------------------*/ +/*--- Discarding BB info ---*/ +/*--------------------------------------------------------------------*/ + +// Called when a translation is removed from the translation cache for +// any reason at all: to free up space, because the guest code was +// unmapped or modified, or for any arbitrary reason. +static void tg_discard_superblock_info(Addr orig_addr, VexGuestExtents vge) +{ + tl_assert(vge.n_used > 0); + + if (0) + VG_(printf)("discard_superblock_info: %p, %p, %llu\n", (void*)orig_addr, + (void*)vge.base[0], (ULong)vge.len[0]); + + // Get BB info, remove from table, free BB info. Simple! + // When created, the BB is keyed by the first instruction address, + // (not orig_addr, but eventually redirected address). 
Thus, we + // use the first instruction address in vge. + TG_(delete_bb)(vge.base[0]); +} + +/*------------------------------------------------------------*/ +/*--- TG_(fini)() and related function ---*/ +/*------------------------------------------------------------*/ + +static void unwind_thread(thread_info* t) +{ + /* unwind signal handlers */ + while (TG_(current_state).sig != 0) + TG_(post_signal)(TG_(current_tid), TG_(current_state).sig); + + /* unwind regular call stack */ + while (TG_(current_call_stack).sp > 0) + TG_(pop_call_stack)(); + + /* reset context and function stack for context generation */ + TG_(init_exec_state)(&TG_(current_state)); + TG_(current_fn_stack).top = TG_(current_fn_stack).bottom; +} + +static void zero_state_cost(thread_info* t) +{ + TG_(zero_cost)(TG_(sets).full, TG_(current_state).cost); +} + +void TG_(set_instrument_state)(const HChar* reason, Bool state) +{ + if (TG_(instrument_state) == state) { + TG_DEBUG(2, "%s: instrumentation already %s\n", reason, + state ? "ON" : "OFF"); + return; + } + TG_(instrument_state) = state; + TG_DEBUG(2, "%s: Switching instrumentation %s ...\n", reason, + state ? "ON" : "OFF"); + + VG_(discard_translations_safely)((Addr)0x1000, ~(SizeT)0xfff, "tracegrind"); + + /* reset internal state: call stacks, simulator */ + TG_(forall_threads)(unwind_thread); + TG_(forall_threads)(zero_state_cost); + (*TG_(cachesim).clear)(); + + if (VG_(clo_verbosity) > 1) + VG_(message)(Vg_DebugMsg, "%s: instrumentation switched %s\n", reason, + state ? "ON" : "OFF"); +} + +/* helper for dump_state_togdb */ +static void dump_state_of_thread_togdb(thread_info* ti) +{ + static FullCost sum = 0, tmp = 0; + Int t, i; + BBCC * from, *to; + call_entry* ce; + HChar* mcost; + + t = TG_(current_tid); + TG_(init_cost_lz)(TG_(sets).full, &sum); + TG_(copy_cost_lz)(TG_(sets).full, &tmp, ti->lastdump_cost); + TG_(add_diff_cost) + (TG_(sets).full, sum, ti->lastdump_cost, ti->states.entry[0]->cost); + TG_(copy_cost)(TG_(sets).full, ti->lastdump_cost, tmp); + mcost = TG_(mappingcost_as_string)(TG_(dumpmap), sum); + VG_(gdb_printf)("events-%d: %s\n", t, mcost); + VG_(free)(mcost); + VG_(gdb_printf)("frames-%d: %d\n", t, TG_(current_call_stack).sp); + + ce = 0; + for (i = 0; i < TG_(current_call_stack).sp; i++) { + ce = TG_(get_call_entry)(i); + /* if this frame is skipped, we don't have counters */ + if (!ce->jcc) + continue; + + from = ce->jcc->from; + VG_(gdb_printf)("function-%d-%d: %s\n", t, i, from->cxt->fn[0]->name); + VG_(gdb_printf)("calls-%d-%d: %llu\n", t, i, ce->jcc->call_counter); + + /* FIXME: EventSets! */ + TG_(copy_cost)(TG_(sets).full, sum, ce->jcc->cost); + TG_(copy_cost)(TG_(sets).full, tmp, ce->enter_cost); + TG_(add_diff_cost) + (TG_(sets).full, sum, ce->enter_cost, TG_(current_state).cost); + TG_(copy_cost)(TG_(sets).full, ce->enter_cost, tmp); + + mcost = TG_(mappingcost_as_string)(TG_(dumpmap), sum); + VG_(gdb_printf)("events-%d-%d: %s\n", t, i, mcost); + VG_(free)(mcost); + } + if (ce && ce->jcc) { + to = ce->jcc->to; + VG_(gdb_printf)("function-%d-%d: %s\n", t, i, to->cxt->fn[0]->name); + } +} + +/* Dump current state */ +static void dump_state_togdb(void) +{ + thread_info** th; + int t; + Int orig_tid = TG_(current_tid); + + VG_(gdb_printf)("instrumentation: %s\n", + TG_(instrument_state) ? 
"on" : "off"); + if (!TG_(instrument_state)) + return; + + VG_(gdb_printf)("executed-bbs: %llu\n", TG_(stat).bb_executions); + VG_(gdb_printf)("executed-calls: %llu\n", TG_(stat).call_counter); + VG_(gdb_printf)("distinct-bbs: %d\n", TG_(stat).distinct_bbs); + VG_(gdb_printf)("distinct-calls: %d\n", TG_(stat).distinct_jccs); + VG_(gdb_printf)("distinct-functions: %d\n", TG_(stat).distinct_fns); + VG_(gdb_printf)("distinct-contexts: %d\n", TG_(stat).distinct_contexts); + + /* "events:" line. Given here because it will be dynamic in the future */ + HChar* evmap = TG_(eventmapping_as_string)(TG_(dumpmap)); + VG_(gdb_printf)("events: %s\n", evmap); + VG_(free)(evmap); + /* Total cost summary */ + + /* threads */ + th = TG_(get_threads)(); + VG_(gdb_printf)("threads:"); + for (t = 1; t < VG_N_THREADS; t++) { + if (!th[t]) + continue; + VG_(gdb_printf)(" %d", t); + } + VG_(gdb_printf)("\n"); + VG_(gdb_printf)("current-tid: %d\n", orig_tid); + TG_(forall_threads)(dump_state_of_thread_togdb); +} + +static void print_monitor_help(void) +{ + VG_(gdb_printf)("\n"); + VG_(gdb_printf)("tracegrind monitor commands:\n"); + VG_(gdb_printf)(" status\n"); + VG_(gdb_printf)(" print status\n"); + VG_(gdb_printf)(" instrumentation [on|off]\n"); + VG_(gdb_printf)(" get/set (if on/off given) instrumentation state\n"); + VG_(gdb_printf)("\n"); +} + +/* return True if request recognised, False otherwise */ +static Bool handle_gdb_monitor_command(ThreadId tid, const HChar* req) +{ + HChar* wcmd; + HChar s[VG_(strlen)(req) + 1]; /* copy for strtok_r */ + HChar* ssaveptr; + + VG_(strcpy)(s, req); + + wcmd = VG_(strtok_r)(s, " ", &ssaveptr); + switch (VG_(keyword_id)("help status instrumentation", wcmd, + kwd_report_duplicated_matches)) { + case -2: /* multiple matches */ + return True; + case -1: /* not found */ + return False; + case 0: /* help */ + print_monitor_help(); + return True; + + case 1: { /* status */ + HChar* arg = VG_(strtok_r)(0, " ", &ssaveptr); + if (arg && (VG_(strcmp)(arg, "internal") == 0)) { + /* internal interface to tracegrind_control */ + dump_state_togdb(); + return True; + } + + if (!TG_(instrument_state)) { + VG_(gdb_printf)( + "No status available as instrumentation is switched off\n"); + } else { + // Status information to be improved ... + thread_info** th = TG_(get_threads)(); + Int t, tcount = 0; + for (t = 1; t < VG_N_THREADS; t++) + if (th[t]) + tcount++; + VG_(gdb_printf)("%d thread(s) running.\n", tcount); + } + return True; + } + + case 2: { /* instrumentation */ + HChar* arg = VG_(strtok_r)(0, " ", &ssaveptr); + if (!arg) { + VG_(gdb_printf)("instrumentation: %s\n", + TG_(instrument_state) ? "on" : "off"); + } else + TG_(set_instrument_state)("Command", VG_(strcmp)(arg, "off") != 0); + return True; + } + + default: + tl_assert(0); + return False; + } +} + +static Bool TG_(handle_client_request)(ThreadId tid, UWord* args, UWord* ret) +{ + if (!VG_IS_TOOL_USERREQ('C', 'T', args[0]) && + VG_USERREQ__GDB_MONITOR_COMMAND != args[0]) + return False; + + switch (args[0]) { + case VG_USERREQ__TOGGLE_COLLECT: + TG_(current_state).collect = !TG_(current_state).collect; + TG_DEBUG(2, "Client Request: toggled collection state to %s\n", + TG_(current_state).collect ? 
"ON" : "OFF"); + *ret = 0; /* meaningless */ + break; + + case VG_USERREQ__ADD_MARKER: { + const HChar* marker = (HChar*)args[1]; + TG_DEBUG(2, "Client Request: add marker '%s'\n", marker); + TG_(trace_emit_marker)(tid, marker); + *ret = 0; /* meaningless */ + } break; + + case VG_USERREQ__START_INSTRUMENTATION: + TG_(set_instrument_state)("Client Request", True); + *ret = 0; /* meaningless */ + break; + + case VG_USERREQ__STOP_INSTRUMENTATION: + TG_(set_instrument_state)("Client Request", False); + *ret = 0; /* meaningless */ + break; + + case VG_USERREQ__GDB_MONITOR_COMMAND: { + Bool handled = handle_gdb_monitor_command(tid, (HChar*)args[1]); + if (handled) + *ret = 1; + else + *ret = 0; + return handled; + } + case VG_USERREQ__DUMP_STATS: + case VG_USERREQ__ZERO_STATS: + TG_DEBUG(2, "Client Request: ignoring %llx\n", (ULong)args[0]); + *ret = 0; /* meaningless */ + break; + + default: + VG_(message)(Vg_UserMsg, + "Warning: unknown tracegrind client request code %llx\n", + (ULong)args[0]); + return False; + } + + return True; +} + +/* Syscall Timing. syscalltime[tid] is the time at which thread tid last + started a syscall. */ + +/* struct vki_timespec syscalltime[VG_N_THREADS]; + Whatever the syscall we use to measure the syscall time, we convert to + seconds and nanoseconds. */ +struct vki_timespec* syscalltime; +struct vki_timespec* syscallcputime; + +static void collect_time(struct vki_timespec* systime, + struct vki_timespec* syscputime) +{ + switch (TG_(clo).collect_systime) { + default: + tl_assert(0); + case systime_msec: { + UInt ms_timer = VG_(read_millisecond_timer)(); + systime->tv_sec = ms_timer / 1000; + systime->tv_nsec = (ms_timer % 1000) * 1000000L; + break; + } + case systime_usec: { + struct vki_timeval tv_now; + VG_(gettimeofday)(&tv_now, NULL); + systime->tv_sec = tv_now.tv_sec; + systime->tv_nsec = tv_now.tv_usec * 1000; + break; + } + case systime_nsec: +#if defined(VGO_linux) || defined(VGO_solaris) || defined(VGO_freebsd) + VG_(clock_gettime)(systime, VKI_CLOCK_MONOTONIC); + VG_(clock_gettime)(syscputime, VKI_CLOCK_THREAD_CPUTIME_ID); + +#elif defined(VGO_darwin) + tl_assert(0); +#else +#error "Unknown OS" +#endif + break; + } +} + +static void +TG_(pre_syscall)(ThreadId tid, UInt syscallno, UWord* args, UInt nArgs) +{ + /* Collect time for systime tracking if enabled */ + if (TG_(clo).collect_systime != systime_no) { + collect_time(&syscalltime[tid], TG_(clo).collect_systime == systime_nsec + ? &syscallcputime[tid] + : NULL); + } +} + +/* Returns "after - before" in the unit as specified by --collect-systime. + after is supposed to be >= before, and tv_nsec must be >= 0 and < + One_Second_In_Nsec. */ +static ULong vki_timespec_diff(struct vki_timespec after, + struct vki_timespec before) +{ + vki_time_t diff_sec = after.tv_sec - before.tv_sec; + long diff_nsec = after.tv_nsec - before.tv_nsec; + ULong nsec_factor; // factor to convert the desired unit into nsec. 
+ + if (diff_nsec < 0) { + diff_sec--; + diff_nsec += 1000000000ULL; + } + switch (TG_(clo).collect_systime) { + case systime_no: + tl_assert(0); + case systime_msec: + nsec_factor = 1000000ULL; + break; + case systime_usec: + nsec_factor = 1000ULL; + break; + case systime_nsec: + nsec_factor = 1ULL; + break; + default: + tl_assert(0); + } + return ((ULong)diff_sec * 1000000000ULL + diff_nsec) / nsec_factor; +} + +/* Check if syscall is a fork-like call that creates a new process */ +static Bool is_fork_syscall(UInt syscallno) +{ +#if defined(VGO_linux) + return syscallno == __NR_clone || syscallno == __NR_fork || + syscallno == __NR_vfork +#if defined(__NR_clone3) + || syscallno == __NR_clone3 +#endif + ; +#else + return False; /* TODO: support other OSes */ +#endif +} + +static void TG_(post_syscall)( + ThreadId tid, UInt syscallno, UWord* args, UInt nArgs, SysRes res) +{ + /* Handle fork/clone: emit FORK event with child PID. + Skip if this was a thread-creating clone (CLONE_THREAD), + since we emit THREAD_CREATE via track_pre_thread_ll_create instead. */ + if (is_fork_syscall(syscallno) && !sr_isError(res) && sr_Res(res) > 0) { + Bool is_thread = False; +#if defined(VGO_linux) + if (syscallno == __NR_clone && nArgs > 0) + is_thread = (args[0] & VKI_CLONE_THREAD) != 0; +#if defined(__NR_clone3) + if (syscallno == __NR_clone3 && nArgs > 0) { + /* clone3 first arg is pointer to struct clone_args; + flags is the first field (ULong / __u64). */ + ULong flags = *(ULong*)(Addr)args[0]; + is_thread = (flags & VKI_CLONE_THREAD) != 0; + } +#endif +#endif + if (!is_thread) { + Int child_pid = (Int)sr_Res(res); + TG_(trace_emit_fork)(tid, child_pid); + } + } + + /* Handle systime collection if enabled */ + if (TG_(clo).collect_systime != systime_no && TG_(current_state).bbcc) { + Int o; + struct vki_timespec ts_now; + struct vki_timespec ts_cpunow; + ULong diff; + + collect_time( + &ts_now, TG_(clo).collect_systime == systime_nsec ? &ts_cpunow : NULL); + + diff = vki_timespec_diff(ts_now, syscalltime[tid]); + + /* offset o is for "SysCount", o+1 for "SysTime", + o+2 is (optionally) "SysCpuTime". 
*/ + o = fullOffset(EG_SYS); + TG_ASSERT(o >= 0); + TG_DEBUG(0, " Time (Off %d) for Syscall %u: %llu\n", o, syscallno, + diff); + + if (!TG_(current_state).bbcc->skipped) + TG_(init_cost_lz)(TG_(sets).full, &(TG_(current_state).bbcc->skipped)); + TG_(current_state).cost[o]++; + TG_(current_state).cost[o + 1] += diff; + TG_(current_state).bbcc->skipped[o]++; + TG_(current_state).bbcc->skipped[o + 1] += diff; + if (TG_(clo).collect_systime == systime_nsec) { + diff = vki_timespec_diff(ts_cpunow, syscallcputime[tid]); + TG_DEBUG(0, " SysCpuTime (Off %d) for Syscall %u: %llu\n", o + 2, + syscallno, diff); + TG_(current_state).cost[o + 2] += diff; + TG_(current_state).bbcc->skipped[o + 2] += diff; + } + } +} + +static UInt ULong_width(ULong n) +{ + UInt w = 0; + while (n > 0) { + n = n / 10; + w++; + } + if (w == 0) + w = 1; + return w + (w - 1) / 3; // add space for commas +} + +static void branchsim_printstat(int l1, int l2, int l3) +{ + static HChar fmt[128]; // large enough + FullCost total; + ULong Bc_total_b, Bc_total_mp, Bi_total_b, Bi_total_mp; + ULong B_total_b, B_total_mp; + + total = TG_(total_cost); + Bc_total_b = total[fullOffset(EG_BC)]; + Bc_total_mp = total[fullOffset(EG_BC) + 1]; + Bi_total_b = total[fullOffset(EG_BI)]; + Bi_total_mp = total[fullOffset(EG_BI) + 1]; + + /* Make format string, getting width right for numbers */ + VG_(sprintf)(fmt, "%%s %%,%dllu (%%,%dllu cond + %%,%dllu ind)\n", l1, l2, + l3); + + if (0 == Bc_total_b) + Bc_total_b = 1; + if (0 == Bi_total_b) + Bi_total_b = 1; + B_total_b = Bc_total_b + Bi_total_b; + B_total_mp = Bc_total_mp + Bi_total_mp; + + VG_(umsg)("\n"); + VG_(umsg)(fmt, "Branches: ", B_total_b, Bc_total_b, Bi_total_b); + + VG_(umsg)(fmt, "Mispredicts: ", B_total_mp, Bc_total_mp, Bi_total_mp); + + VG_(umsg)("Mispred rate: %*.1f%% (%*.1f%% + %*.1f%% )\n", l1, + B_total_mp * 100.0 / B_total_b, l2, + Bc_total_mp * 100.0 / Bc_total_b, l3, + Bi_total_mp * 100.0 / Bi_total_b); +} + +static void tg_print_stats(void) +{ + int BB_lookups = TG_(stat).full_debug_BBs + TG_(stat).fn_name_debug_BBs + + TG_(stat).file_line_debug_BBs + TG_(stat).no_debug_BBs; + + /* Hash table stats */ + VG_(message)(Vg_DebugMsg, "Distinct objects: %d\n", TG_(stat).distinct_objs); + VG_(message)(Vg_DebugMsg, "Distinct files: %d\n", + TG_(stat).distinct_files); + VG_(message)(Vg_DebugMsg, "Distinct fns: %d\n", TG_(stat).distinct_fns); + VG_(message)(Vg_DebugMsg, "Distinct contexts:%d\n", + TG_(stat).distinct_contexts); + VG_(message)(Vg_DebugMsg, "Distinct BBs: %d\n", TG_(stat).distinct_bbs); + VG_(message)(Vg_DebugMsg, "Cost entries: %u (Chunks %u)\n", + TG_(costarray_entries), TG_(costarray_chunks)); + VG_(message)(Vg_DebugMsg, "Distinct BBCCs: %d\n", + TG_(stat).distinct_bbccs); + VG_(message)(Vg_DebugMsg, "Distinct JCCs: %d\n", TG_(stat).distinct_jccs); + VG_(message)(Vg_DebugMsg, "Distinct skips: %d\n", + TG_(stat).distinct_skips); + VG_(message)(Vg_DebugMsg, "BB lookups: %d\n", BB_lookups); + if (BB_lookups > 0) { + VG_(message)(Vg_DebugMsg, "With full debug info:%3d%% (%d)\n", + TG_(stat).full_debug_BBs * 100 / BB_lookups, + TG_(stat).full_debug_BBs); + VG_(message)(Vg_DebugMsg, "With file/line debug info:%3d%% (%d)\n", + TG_(stat).file_line_debug_BBs * 100 / BB_lookups, + TG_(stat).file_line_debug_BBs); + VG_(message)(Vg_DebugMsg, "With fn name debug info:%3d%% (%d)\n", + TG_(stat).fn_name_debug_BBs * 100 / BB_lookups, + TG_(stat).fn_name_debug_BBs); + VG_(message)(Vg_DebugMsg, "With no debug info:%3d%% (%d)\n", + TG_(stat).no_debug_BBs * 100 / BB_lookups, + 
TG_(stat).no_debug_BBs); + } + VG_(message)(Vg_DebugMsg, "BBCC Clones: %d\n", TG_(stat).bbcc_clones); + VG_(message)(Vg_DebugMsg, "BBs Retranslated: %d\n", + TG_(stat).bb_retranslations); + VG_(message)(Vg_DebugMsg, "Distinct instrs: %d\n", + TG_(stat).distinct_instrs); + + VG_(message)(Vg_DebugMsg, "LRU Contxt Misses: %d\n", + TG_(stat).cxt_lru_misses); + VG_(message)(Vg_DebugMsg, "LRU BBCC Misses: %d\n", + TG_(stat).bbcc_lru_misses); + VG_(message)(Vg_DebugMsg, "LRU JCC Misses: %d\n", + TG_(stat).jcc_lru_misses); + VG_(message)(Vg_DebugMsg, "BBs Executed: %llu\n", + TG_(stat).bb_executions); + VG_(message)(Vg_DebugMsg, "Calls: %llu\n", + TG_(stat).call_counter); + VG_(message)(Vg_DebugMsg, "CondJMP followed: %llu\n", + TG_(stat).jcnd_counter); + VG_(message)(Vg_DebugMsg, "Boring JMPs: %llu\n", + TG_(stat).jump_counter); + VG_(message)(Vg_DebugMsg, "Recursive calls: %llu\n", + TG_(stat).rec_call_counter); + VG_(message)(Vg_DebugMsg, "Returns: %llu\n", + TG_(stat).ret_counter); +} + +static void finish(void) +{ + HChar fmt[128]; // large enough + Int l1, l2, l3; + FullCost total; + + TG_DEBUG(0, "finish()\n"); + + (*TG_(cachesim).finish)(); + + /* pop all remaining items from CallStack for correct sum + */ + TG_(forall_threads)(unwind_thread); + + TG_(compute_total_cost)(); + + /* Close CSV trace output */ + TG_(trace_close_output)(); + + if (VG_(clo_verbosity) == 0) + return; + + if (VG_(clo_stats)) { + VG_(message)(Vg_DebugMsg, "\n"); + tg_print_stats(); + VG_(message)(Vg_DebugMsg, "\n"); + } + + HChar* evmap = TG_(eventmapping_as_string)(TG_(dumpmap)); + VG_(message)(Vg_UserMsg, "Events : %s\n", evmap); + VG_(free)(evmap); + HChar* mcost = TG_(mappingcost_as_string)(TG_(dumpmap), TG_(total_cost)); + VG_(message)(Vg_UserMsg, "Collected : %s\n", mcost); + VG_(free)(mcost); + VG_(message)(Vg_UserMsg, "\n"); + + /* determine value widths for statistics */ + total = TG_(total_cost); + l1 = ULong_width(total[fullOffset(EG_IR)]); + l2 = l3 = 0; + if (TG_(clo).simulate_cache) { + l2 = ULong_width(total[fullOffset(EG_DR)]); + l3 = ULong_width(total[fullOffset(EG_DW)]); + } + if (TG_(clo).simulate_branch) { + int l2b = ULong_width(total[fullOffset(EG_BC)]); + int l3b = ULong_width(total[fullOffset(EG_BI)]); + if (l2b > l2) + l2 = l2b; + if (l3b > l3) + l3 = l3b; + } + + /* Make format string, getting width right for numbers */ + VG_(sprintf)(fmt, "%%s %%,%dllu\n", l1); + + /* Always print this */ + VG_(umsg)(fmt, "I refs: ", total[fullOffset(EG_IR)]); + + if (TG_(clo).simulate_cache) + (*TG_(cachesim).printstat)(l1, l2, l3); + + if (TG_(clo).simulate_branch) + branchsim_printstat(l1, l2, l3); +} + +void TG_(fini)(Int exitcode) { finish(); } + +/*--------------------------------------------------------------------*/ +/*--- Setup ---*/ +/*--------------------------------------------------------------------*/ + +static void tg_start_client_code_callback(ThreadId tid, ULong blocks_done) +{ + static ULong last_blocks_done = 0; + + if (0) + VG_(printf)("%d R %llu\n", (Int)tid, blocks_done); + + /* throttle calls to TG_(run_thread) by number of BBs executed */ + if (blocks_done - last_blocks_done < 5000) + return; + last_blocks_done = blocks_done; + + TG_(run_thread)(tid); +} + +/* + * Called after fork() in the child process. + * Reopens the trace file with the child's PID. 
+ */ +static void tg_atfork_child(ThreadId tid) { TG_(trace_reopen_child)(); } + +static void tg_pre_thread_ll_create(ThreadId tid, ThreadId child) +{ + /* Skip Valgrind's internal scheduler thread (tid 0) creating the + initial client thread -- that's not a user-visible thread creation. */ + if (tid == 0) + return; + TG_(trace_emit_thread_create)(tid, child); +} + +static void TG_(post_clo_init)(void) +{ + if (VG_(clo_vex_control).iropt_register_updates_default != + VexRegUpdSpAtMemAccess) { + TG_DEBUG(1, " Using user specified value for " + "--vex-iropt-register-updates\n"); + } else { + TG_DEBUG(1, " Using default --vex-iropt-register-updates=" + "sp-at-mem-access\n"); + } + + /* Always register syscall wrappers for fork/clone detection. + Also handles systime collection if enabled. */ + VG_(needs_syscall_wrapper)(TG_(pre_syscall), TG_(post_syscall)); + + if (TG_(clo).collect_systime != systime_no) { + syscalltime = + TG_MALLOC("cl.main.pci.1", VG_N_THREADS * sizeof syscalltime[0]); + for (UInt i = 0; i < VG_N_THREADS; ++i) { + syscalltime[i].tv_sec = 0; + syscalltime[i].tv_nsec = 0; + } + if (TG_(clo).collect_systime == systime_nsec) { + syscallcputime = + TG_MALLOC("cl.main.pci.2", VG_N_THREADS * sizeof syscallcputime[0]); + for (UInt i = 0; i < VG_N_THREADS; ++i) { + syscallcputime[i].tv_sec = 0; + syscallcputime[i].tv_nsec = 0; + } + } + } + + if (VG_(clo_px_file_backed) != VexRegUpdSpAtMemAccess) { + TG_DEBUG(1, " Using user specified value for " + "--px-file-backed\n"); + } else { + TG_DEBUG(1, " Using default --px-file-backed=" + "sp-at-mem-access\n"); + } + + if (VG_(clo_vex_control).iropt_unroll_thresh != 0) { + VG_(message)(Vg_UserMsg, + "tracegrind only works with --vex-iropt-unroll-thresh=0\n" + "=> resetting it back to 0\n"); + VG_(clo_vex_control).iropt_unroll_thresh = 0; // cannot be overridden. + } + if (VG_(clo_vex_control).guest_chase) { + VG_(message)(Vg_UserMsg, + "tracegrind only works with --vex-guest-chase=no\n" + "=> resetting it back to 'no'\n"); + VG_(clo_vex_control).guest_chase = False; // cannot be overridden. + } + + TG_DEBUG(1, " dump threads: %s\n", + TG_(clo).separate_threads ? "Yes" : "No"); + TG_DEBUG(1, " call sep. : %d\n", TG_(clo).separate_callers); + TG_DEBUG(1, " rec. sep. : %d\n", TG_(clo).separate_recursions); + + (*TG_(cachesim).post_clo_init)(); + + TG_(init_eventsets)(); + TG_(init_statistics)(&TG_(stat)); + TG_(init_cost_lz)(TG_(sets).full, &TG_(total_cost)); + + /* initialize hash tables */ + TG_(init_obj_table)(); + TG_(init_cxt_table)(); + TG_(init_bb_hash)(); + + TG_(init_threads)(); + TG_(run_thread)(1); + + TG_(instrument_state) = TG_(clo).instrument_atstart; + + /* Open trace output file */ + TG_(trace_open_output)(); + + /* Register fork handler to emit FORK events */ + VG_(atfork)(NULL, NULL, tg_atfork_child); + + if (VG_(clo_verbosity) > 0) { + VG_(message)(Vg_UserMsg, "Streaming trace output to tracegrind.out.%d\n", + VG_(getpid)()); + } +} + +static void TG_(pre_clo_init)(void) +{ + VG_(details_name)("Tracegrind"); + VG_(details_version)(NULL); + VG_(details_description)("a streaming trace cache profiler"); + VG_(details_copyright_author)( + "Copyright (C) 2026, and GNU GPL'd, " + "by CodSpeed Technology SAS. " + "Based on Callgrind by Josef Weidendorfer et al."); + VG_(details_bug_reports_to)(VG_BUGS_TO); + VG_(details_avg_translation_sizeB)(500); + + VG_(clo_vex_control).iropt_register_updates_default = + VG_(clo_px_file_backed) = + VexRegUpdSpAtMemAccess; // overridable by the user. 
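+  /* Defaults only; TG_(post_clo_init)() warns about and re-forces the two
+     settings below if a command line option changed them. */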
+ + VG_(clo_vex_control).iropt_unroll_thresh = 0; // cannot be overridden. + VG_(clo_vex_control).guest_chase = False; // cannot be overridden. + + VG_(basic_tool_funcs)(TG_(post_clo_init), TG_(instrument), TG_(fini)); + + VG_(needs_superblock_discards)(tg_discard_superblock_info); + + VG_(needs_command_line_options)(TG_(process_cmd_line_option), + TG_(print_usage), TG_(print_debug_usage)); + + VG_(needs_client_requests)(TG_(handle_client_request)); + VG_(needs_print_stats)(tg_print_stats); + + VG_(track_start_client_code)(&tg_start_client_code_callback); + VG_(track_pre_deliver_signal)(&TG_(pre_signal)); + VG_(track_post_deliver_signal)(&TG_(post_signal)); + VG_(track_pre_thread_ll_create)(&tg_pre_thread_ll_create); + + TG_(set_clo_defaults)(); +} + +VG_DETERMINE_INTERFACE_VERSION(TG_(pre_clo_init)) + +/*--------------------------------------------------------------------*/ +/*--- end main.c ---*/ +/*--------------------------------------------------------------------*/ diff --git a/tracegrind/scripts/tracegrind-analyzer b/tracegrind/scripts/tracegrind-analyzer new file mode 100755 index 000000000..185ed5d2d --- /dev/null +++ b/tracegrind/scripts/tracegrind-analyzer @@ -0,0 +1,302 @@ +#!/usr/bin/env -S uvx --with lz4 --with msgpack python3 +# /// script +# requires-python = ">=3.8" +# dependencies = ["lz4", "msgpack"] +# /// +""" +Decode and debug tracegrind MsgPack+LZ4 trace files. + +Usage: + ./decode-trace.py [options] + +Examples: + ./decode-trace.py tracegrind.out.12345.msgpack.lz4 + ./decode-trace.py trace.msgpack.lz4 --head 20 + ./decode-trace.py trace.msgpack.lz4 --schema + ./decode-trace.py trace.msgpack.lz4 --stats + ./decode-trace.py trace.msgpack.lz4 --json +""" + +import argparse +import json +import os +import struct +import sys +from collections import Counter +from typing import Any, BinaryIO, Dict, Iterator, List, Tuple + +import lz4.block +import msgpack + + +MAGIC = b'TGMP' + + +def read_header(f: BinaryIO) -> int: + """Read and validate file header, return version.""" + magic = f.read(4) + if magic != MAGIC: + raise ValueError(f"Invalid magic: {magic!r}, expected {MAGIC!r}") + version = struct.unpack(' bytes | None: + """Read a single chunk, return decompressed data or None for end marker.""" + header = f.read(8) + if len(header) < 8: + return None + usize, csize = struct.unpack(' Dict[str, Any]: + """Decode schema chunk into Python dict.""" + schema = msgpack.unpackb(data, raw=False) + return schema + + +def iter_rows(data: bytes) -> Iterator[List[Any]]: + """Iterate over rows in a data chunk.""" + unpacker = msgpack.Unpacker(raw=False) + unpacker.feed(data) + yield from unpacker + + +def decode_trace(filepath: str) -> Tuple[int, Dict[str, Any], List[List[Any]]]: + """Decode entire trace file, return (version, schema, rows).""" + with open(filepath, 'rb') as f: + version = read_header(f) + + # Read schema chunk + schema_data = read_chunk(f) + if schema_data is None: + raise ValueError("Missing schema chunk") + schema = decode_schema(schema_data) + + # Read all data chunks + rows = [] + while True: + chunk_data = read_chunk(f) + if chunk_data is None: + break + rows.extend(iter_rows(chunk_data)) + + return version, schema, rows + + +def get_event_name(event_type: int) -> str: + """Convert event type to name.""" + return {0: 'MARKER', 1: 'ENTER_FN', 2: 'EXIT_FN', 3: 'ENTER_INLINED_FN', 4: 'EXIT_INLINED_FN', 5: 'FORK', 6: 'THREAD_CREATE'}.get(event_type, f'UNKNOWN({event_type})') + + +def format_row(row: List[Any], schema: Dict[str, Any]) -> Dict[str, Any]: + """Format a 
row as a dict using the appropriate schema.""" + if len(row) < 3: + return {'_raw': row} + + event_type = row[2] + event_schemas = schema.get('event_schemas', {}) + columns = event_schemas.get(str(event_type), []) + + if not columns: + # Fallback for old format with 'columns' key + columns = schema.get('columns', []) + + counter_names = schema.get('counters', []) + + result = {} + for i, val in enumerate(row): + if i < len(columns): + key = columns[i] + if key == 'event': + result[key] = get_event_name(val) + elif key == 'counters' and isinstance(val, list): + # Expand counters sub-array using top-level counter names + for j, cval in enumerate(val): + if j < len(counter_names): + result[counter_names[j]] = cval + else: + result[f'_counter{j}'] = cval + else: + result[key] = val + else: + result[f'_col{i}'] = val + + return result + + +def print_schema(schema: Dict[str, Any], version: int) -> None: + """Print schema information.""" + print(f"Format Version: {version}") + print(f"Format Name: {schema.get('format', 'unknown')}") + print(f"Schema Version: {schema.get('version', 'unknown')}") + print() + + if 'event_schemas' in schema: + print("Event Schemas (discriminated union):") + for event_type, columns in sorted(schema['event_schemas'].items()): + event_name = get_event_name(int(event_type)) + print(f" {event_type} ({event_name}): {columns}") + elif 'columns' in schema: + print(f"Columns: {schema['columns']}") + + if schema.get('counters'): + print(f"Counters: {schema['counters']}") + + if schema.get('counter_units'): + print(f"Counter Units: {dict(sorted(schema['counter_units'].items()))}") + print() + + +def print_stats(rows: List[List[Any]], schema: Dict[str, Any]) -> None: + """Print statistics about the trace.""" + print(f"Total rows: {len(rows):,}") + + if not rows: + return + + # Count by event type + event_counts = Counter(row[2] for row in rows if len(row) > 2) + print("\nEvents by type:") + for event_type, count in sorted(event_counts.items()): + event_name = get_event_name(event_type) + pct = 100 * count / len(rows) + print(f" {event_name}: {count:,} ({pct:.1f}%)") + + # Thread stats + thread_ids = set(row[1] for row in rows if len(row) > 1) + print(f"\nThreads: {len(thread_ids)} ({sorted(thread_ids)})") + + # Sequence range + seqs = [row[0] for row in rows if len(row) > 0] + if seqs: + print(f"Sequence range: {min(seqs):,} - {max(seqs):,}") + + # Function stats (for ENTER/EXIT events) + fn_counts = Counter() + for row in rows: + if len(row) > 3 and row[2] in (1, 2, 3, 4): # ENTER_FN, EXIT_FN, ENTER_INLINED_FN, or EXIT_INLINED_FN + fn_counts[row[3]] += 1 + + if fn_counts: + print(f"\nTop 10 functions by event count:") + for fn, count in fn_counts.most_common(10): + print(f" {count:8,} {fn}") + + # FORK events + fork_rows = [row for row in rows if len(row) > 2 and row[2] == 5] + if fork_rows: + print(f"\nFork events: {len(fork_rows)}") + for row in fork_rows[:5]: + formatted = format_row(row, schema) + child_pid = formatted.get('child_pid', 'unknown') + print(f" seq={formatted.get('seq')}, tid={formatted.get('tid')}, child_pid={child_pid}") + + # THREAD_CREATE events + thread_create_rows = [row for row in rows if len(row) > 2 and row[2] == 6] + if thread_create_rows: + print(f"\nThread create events: {len(thread_create_rows)}") + for row in thread_create_rows[:5]: + formatted = format_row(row, schema) + child_tid = formatted.get('child_tid', 'unknown') + print(f" seq={formatted.get('seq')}, tid={formatted.get('tid')}, child_tid={child_tid}") + + +def print_rows(rows: 
List[List[Any]], schema: Dict[str, Any], + head: int | None = None, raw: bool = False, as_json: bool = False) -> None: + """Print rows in various formats.""" + display_rows = rows[:head] if head else rows + + if as_json: + output = [format_row(row, schema) for row in display_rows] + print(json.dumps(output, indent=2)) + return + + for row in display_rows: + if raw: + print(row) + else: + formatted = format_row(row, schema) + # Compact single-line format + parts = [] + for k, v in formatted.items(): + if isinstance(v, str) and k in ('obj', 'file'): + v = os.path.basename(v) + parts.append(f"{k}={v}") + print(' | '.join(parts)) + + +def main(): + parser = argparse.ArgumentParser( + description='Decode and debug tracegrind MsgPack+LZ4 trace files.', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__ + ) + parser.add_argument('file', help='Trace file to decode (.msgpack.lz4)') + parser.add_argument('--schema', action='store_true', + help='Print schema information only') + parser.add_argument('--stats', action='store_true', + help='Print statistics about the trace') + parser.add_argument('--head', type=int, metavar='N', + help='Print only first N rows') + parser.add_argument('--tail', type=int, metavar='N', + help='Print only last N rows') + parser.add_argument('--raw', action='store_true', + help='Print raw row arrays') + parser.add_argument('--json', action='store_true', + help='Output as JSON') + parser.add_argument('--event', type=str, choices=['MARKER', 'ENTER_FN', 'EXIT_FN', 'ENTER_INLINED_FN', 'EXIT_INLINED_FN', 'FORK', 'THREAD_CREATE'], + help='Filter by event type') + parser.add_argument('--fn', type=str, metavar='PATTERN', + help='Filter by function name (substring match)') + + args = parser.parse_args() + + try: + version, schema, rows = decode_trace(args.file) + except Exception as e: + print(f"Error reading trace file: {e}", file=sys.stderr) + sys.exit(1) + + # Schema only mode + if args.schema: + print_schema(schema, version) + sys.exit(0) + + # Apply filters + filtered_rows = rows + + if args.event: + event_map = {'MARKER': 0, 'ENTER_FN': 1, 'EXIT_FN': 2, 'ENTER_INLINED_FN': 3, 'EXIT_INLINED_FN': 4, 'FORK': 5, 'THREAD_CREATE': 6} + event_type = event_map[args.event] + filtered_rows = [r for r in filtered_rows if len(r) > 2 and r[2] == event_type] + + if args.fn: + pattern = args.fn.lower() + filtered_rows = [r for r in filtered_rows + if len(r) > 3 and isinstance(r[3], str) and pattern in r[3].lower()] + + # Stats mode + if args.stats: + print_schema(schema, version) + print_stats(filtered_rows, schema) + sys.exit(0) + + # Default: print rows + if args.tail: + filtered_rows = filtered_rows[-args.tail:] + + print_schema(schema, version) + print(f"Showing {min(args.head or len(filtered_rows), len(filtered_rows)):,} of {len(filtered_rows):,} rows") + print("-" * 80) + print_rows(filtered_rows, schema, head=args.head, raw=args.raw, as_json=args.json) + + +if __name__ == '__main__': + main() diff --git a/tracegrind/sim.c b/tracegrind/sim.c new file mode 100644 index 000000000..25d8cf983 --- /dev/null +++ b/tracegrind/sim.c @@ -0,0 +1,1703 @@ +/*--------------------------------------------------------------------*/ +/*--- Cache simulation. ---*/ +/*--- sim.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call graph + profiling programs. 
+ + Copyright (C) 2003-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This tool is derived from and contains code from Cachegrind + Copyright (C) 2002-2017 Nicholas Nethercote (njn@valgrind.org) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see . + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "global.h" + +/* Notes: + - simulates a write-allocate cache + - (block --> set) hash function uses simple bit selection + - handling of references straddling two cache blocks: + - counts as only one cache access (not two) + - both blocks hit --> one hit + - one block hits, the other misses --> one miss + - both blocks miss --> one miss (not two) +*/ + +/* Cache configuration */ +#include "cg_arch.c" + +/* additional structures for cache use info, separated + * according usage frequency: + * - line_loaded : pointer to cost center of instruction + * which loaded the line into cache. + * Needed to increment counters when line is evicted. + * - line_use : updated on every access + */ +typedef struct { + UInt count; + UInt mask; /* e.g. for 64Byte line size 1bit/2Byte */ +} line_use; + +typedef struct { + Addr memline, iaddr; + line_use* dep_use; /* point to higher-level cacheblock for this memline */ + ULong* use_base; +} line_loaded; + +/* Cache state */ +typedef struct { + const HChar* name; + int size; /* bytes */ + int assoc; + int line_size; /* bytes */ + Bool sectored; /* prefetch nearside cacheline on read */ + int sets; + int sets_min_1; + int line_size_bits; + int tag_shift; + UWord tag_mask; + HChar desc_line[128]; // large enough + UWord* tags; + + /* for cache use */ + int line_size_mask; + int* line_start_mask; + int* line_end_mask; + line_loaded* loaded; + line_use* use; +} cache_t2; + +/* + * States of flat caches in our model. + * We use a 2-level hierarchy, + */ +static cache_t2 I1, D1, LL; + +/* Lower bits of cache tags are used as flags for a cache line */ +#define CACHELINE_FLAGMASK (MIN_LINE_SIZE - 1) +#define CACHELINE_DIRTY 1 + +/* Cache simulator Options */ +static Bool clo_simulate_writeback = False; +static Bool clo_simulate_hwpref = False; +static Bool clo_simulate_sectors = False; +static Bool clo_collect_cacheuse = False; + +/* Following global vars are setup before by setup_bbcc(): + * + * - Addr TG_(bb_base) (instruction start address of original BB) + * - ULong* TG_(cost_base) (start of cost array for BB) + */ + +Addr TG_(bb_base); +ULong* TG_(cost_base); + +static InstrInfo* current_ii; + +/* Cache use offsets */ +/* The offsets are only correct because all per-instruction event sets get + * the "Use" set added first ! 
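+ * (AcCost1/SpLoss1 are the L1 slots, AcCost2/SpLoss2 the LL slots of the
+ *  4-entry "Use" event group registered in TG_(init_eventsets).)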
+ */ +static Int off_I1_AcCost = 0; +static Int off_I1_SpLoss = 1; +static Int off_D1_AcCost = 0; +static Int off_D1_SpLoss = 1; +static Int off_LL_AcCost = 2; +static Int off_LL_SpLoss = 3; + +/* Cache access types */ +typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType; + +/* Result of a reference into a flat cache */ +typedef enum { Hit = 0, Miss, MissDirty } CacheResult; + +/* Result of a reference into a hierarchical cache model */ +typedef enum { L1_Hit, LL_Hit, MemAccess, WriteBackMemAccess } CacheModelResult; + +typedef CacheModelResult (*simcall_type)(Addr, UChar); + +static struct { + simcall_type I1_Read; + simcall_type D1_Read; + simcall_type D1_Write; +} simulator; + +/*------------------------------------------------------------*/ +/*--- Cache Simulator Initialization ---*/ +/*------------------------------------------------------------*/ + +static void cachesim_clearcache(cache_t2* c) +{ + Int i; + + for (i = 0; i < c->sets * c->assoc; i++) + c->tags[i] = 0; + if (c->use) { + for (i = 0; i < c->sets * c->assoc; i++) { + c->loaded[i].memline = 0; + c->loaded[i].use_base = 0; + c->loaded[i].dep_use = 0; + c->loaded[i].iaddr = 0; + c->use[i].mask = 0; + c->use[i].count = 0; + c->tags[i] = i % c->assoc; /* init lower bits as pointer */ + } + } +} + +static void cacheuse_initcache(cache_t2* c); + +/* By this point, the size/assoc/line_size has been checked. */ +static void cachesim_initcache(cache_t config, cache_t2* c) +{ + c->size = config.size; + c->assoc = config.assoc; + c->line_size = config.line_size; + c->sectored = False; // FIXME + + c->sets = (c->size / c->line_size) / c->assoc; + c->sets_min_1 = c->sets - 1; + c->line_size_bits = VG_(log2)(c->line_size); + c->tag_shift = c->line_size_bits + VG_(log2)(c->sets); + c->tag_mask = ~((1u << c->tag_shift) - 1); + + /* Can bits in tag entries be used for flags? + * Should be always true as MIN_LINE_SIZE >= 16 */ + TG_ASSERT((c->tag_mask & CACHELINE_FLAGMASK) == 0); + + if (c->assoc == 1) { + VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s", c->size, + c->line_size, c->sectored ? ", sectored" : ""); + } else { + VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s", c->size, + c->line_size, c->assoc, c->sectored ? ", sectored" : ""); + } + + c->tags = + (UWord*)TG_MALLOC("cl.sim.cs_ic.1", sizeof(UWord) * c->sets * c->assoc); + if (clo_collect_cacheuse) + cacheuse_initcache(c); + else + c->use = 0; + cachesim_clearcache(c); +} + +#if 0 +static void print_cache(cache_t2* c) +{ + UInt set, way, i; + + /* Note initialisation and update of 'i'. */ + for (i = 0, set = 0; set < c->sets; set++) { + for (way = 0; way < c->assoc; way++, i++) { + VG_(printf)("%8x ", c->tags[i]); + } + VG_(printf)("\n"); + } +} +#endif + +/*------------------------------------------------------------*/ +/*--- Simple Cache Simulation ---*/ +/*------------------------------------------------------------*/ + +/* + * Model: single inclusive, 2-level cache hierarchy (L1/LL) + * with write-allocate + * + * For simple cache hit/miss counts, we do not have to + * maintain the dirty state of lines (no need to distinguish + * read/write references), and the resulting counts are the + * same for write-through and write-back caches. 
+ * + * Simulator functions: + * CacheModelResult cachesim_I1_ref(Addr a, UChar size) + * CacheModelResult cachesim_D1_ref(Addr a, UChar size) + */ +__attribute__((always_inline)) static __inline__ CacheResult +cachesim_setref(cache_t2* c, UInt set_no, UWord tag) +{ + int i, j; + UWord* set; + + set = &(c->tags[set_no * c->assoc]); + + /* This loop is unrolled for just the first case, which is the most */ + /* common. We can't unroll any further because it would screw up */ + /* if we have a direct-mapped (1-way) cache. */ + if (tag == set[0]) + return Hit; + + /* If the tag is one other than the MRU, move it into the MRU spot */ + /* and shuffle the rest down. */ + for (i = 1; i < c->assoc; i++) { + if (tag == set[i]) { + for (j = i; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tag; + return Hit; + } + } + + /* A miss; install this tag as MRU, shuffle rest down. */ + for (j = c->assoc - 1; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tag; + + return Miss; +} + +__attribute__((always_inline)) static __inline__ CacheResult +cachesim_ref(cache_t2* c, Addr a, UChar size) +{ + UWord block1 = a >> c->line_size_bits; + UWord block2 = (a + size - 1) >> c->line_size_bits; + UInt set1 = block1 & c->sets_min_1; + /* the tag does not need to include bits specifying the set, + * but it can, and this saves instructions */ + UWord tag1 = block1; + + /* Access entirely within line. */ + if (block1 == block2) + return cachesim_setref(c, set1, tag1); + + /* Access straddles two lines. */ + else if (block1 + 1 == block2) { + UInt set2 = block2 & c->sets_min_1; + UWord tag2 = block2; + + /* the call updates cache structures as side effect */ + CacheResult res1 = cachesim_setref(c, set1, tag1); + CacheResult res2 = cachesim_setref(c, set2, tag2); + return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit; + + } else { + VG_(printf)("addr: %lx size: %u blocks: %lu %lu", a, size, block1, + block2); + VG_(tool_panic)("item straddles more than two cache sets"); + } + return Hit; +} + +static CacheModelResult cachesim_I1_ref(Addr a, UChar size) +{ + if (cachesim_ref(&I1, a, size) == Hit) + return L1_Hit; + if (cachesim_ref(&LL, a, size) == Hit) + return LL_Hit; + return MemAccess; +} + +static CacheModelResult cachesim_D1_ref(Addr a, UChar size) +{ + if (cachesim_ref(&D1, a, size) == Hit) + return L1_Hit; + if (cachesim_ref(&LL, a, size) == Hit) + return LL_Hit; + return MemAccess; +} + +/*------------------------------------------------------------*/ +/*--- Write Back Cache Simulation ---*/ +/*------------------------------------------------------------*/ + +/* + * More complex model: L1 Write-through, LL Write-back + * This needs to distinguish among read and write references. + * + * Simulator functions: + * CacheModelResult cachesim_I1_Read(Addr a, UChar size) + * CacheModelResult cachesim_D1_Read(Addr a, UChar size) + * CacheModelResult cachesim_D1_Write(Addr a, UChar size) + */ + +/* + * With write-back, result can be a miss evicting a dirty line + * The dirty state of a cache line is stored in Bit0 of the tag for + * this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference + * type (Read/Write), the line gets dirty on a write. + */ +__attribute__((always_inline)) static __inline__ CacheResult +cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag) +{ + int i, j; + UWord *set, tmp_tag; + + set = &(c->tags[set_no * c->assoc]); + + /* This loop is unrolled for just the first case, which is the most */ + /* common. 
We can't unroll any further because it would screw up */ + /* if we have a direct-mapped (1-way) cache. */ + if (tag == (set[0] & ~CACHELINE_DIRTY)) { + set[0] |= ref; + return Hit; + } + /* If the tag is one other than the MRU, move it into the MRU spot */ + /* and shuffle the rest down. */ + for (i = 1; i < c->assoc; i++) { + if (tag == (set[i] & ~CACHELINE_DIRTY)) { + tmp_tag = set[i] | ref; // update dirty flag + for (j = i; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tmp_tag; + return Hit; + } + } + + /* A miss; install this tag as MRU, shuffle rest down. */ + tmp_tag = set[c->assoc - 1]; + for (j = c->assoc - 1; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tag | ref; + + return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss; +} + +__attribute__((always_inline)) static __inline__ CacheResult +cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size) +{ + UInt set1 = (a >> c->line_size_bits) & (c->sets_min_1); + UInt set2 = ((a + size - 1) >> c->line_size_bits) & (c->sets_min_1); + UWord tag = a & c->tag_mask; + + /* Access entirely within line. */ + if (set1 == set2) + return cachesim_setref_wb(c, ref, set1, tag); + + /* Access straddles two lines. */ + /* Nb: this is a fast way of doing ((set1+1) % c->sets) */ + else if (((set1 + 1) & (c->sets_min_1)) == set2) { + UWord tag2 = (a + size - 1) & c->tag_mask; + + /* the call updates cache structures as side effect */ + CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag); + CacheResult res2 = cachesim_setref_wb(c, ref, set2, tag2); + + if ((res1 == MissDirty) || (res2 == MissDirty)) + return MissDirty; + return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit; + + } else { + VG_(printf)("addr: %lx size: %u sets: %u %u", a, size, set1, set2); + VG_(tool_panic)("item straddles more than two cache sets"); + } + return Hit; +} + +static CacheModelResult cachesim_I1_Read(Addr a, UChar size) +{ + if (cachesim_ref(&I1, a, size) == Hit) + return L1_Hit; + switch (cachesim_ref_wb(&LL, Read, a, size)) { + case Hit: + return LL_Hit; + case Miss: + return MemAccess; + default: + break; + } + return WriteBackMemAccess; +} + +static CacheModelResult cachesim_D1_Read(Addr a, UChar size) +{ + if (cachesim_ref(&D1, a, size) == Hit) + return L1_Hit; + switch (cachesim_ref_wb(&LL, Read, a, size)) { + case Hit: + return LL_Hit; + case Miss: + return MemAccess; + default: + break; + } + return WriteBackMemAccess; +} + +static CacheModelResult cachesim_D1_Write(Addr a, UChar size) +{ + if (cachesim_ref(&D1, a, size) == Hit) { + /* Even for a L1 hit, the write-trough L1 passes + * the write to the LL to make the LL line dirty. + * But this causes no latency, so return the hit. + */ + cachesim_ref_wb(&LL, Write, a, size); + return L1_Hit; + } + switch (cachesim_ref_wb(&LL, Write, a, size)) { + case Hit: + return LL_Hit; + case Miss: + return MemAccess; + default: + break; + } + return WriteBackMemAccess; +} + +/*------------------------------------------------------------*/ +/*--- Hardware Prefetch Simulation ---*/ +/*------------------------------------------------------------*/ + +static ULong prefetch_up = 0; +static ULong prefetch_down = 0; + +#define PF_STREAMS 8 +#define PF_PAGEBITS 12 + +static UInt pf_lastblock[PF_STREAMS]; +static Int pf_seqblocks[PF_STREAMS]; + +static void prefetch_clear(void) +{ + int i; + for (i = 0; i < PF_STREAMS; i++) + pf_lastblock[i] = pf_seqblocks[i] = 0; +} + +/* + * HW Prefetch emulation + * Start prefetching when detecting sequential access to 3 memory blocks. 
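+ * Once a stream is detected, every further sequential access also touches
+ * the LL line 5 blocks ahead (or behind) to emulate the prefetcher.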
+ * One stream can be detected per 4k page. + */ +static __inline__ void prefetch_LL_doref(Addr a) +{ + UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS; + UInt block = (a >> LL.line_size_bits); + + if (block != pf_lastblock[stream]) { + if (pf_seqblocks[stream] == 0) { + if (pf_lastblock[stream] + 1 == block) + pf_seqblocks[stream]++; + else if (pf_lastblock[stream] - 1 == block) + pf_seqblocks[stream]--; + } else if (pf_seqblocks[stream] > 0) { + if (pf_lastblock[stream] + 1 == block) { + pf_seqblocks[stream]++; + if (pf_seqblocks[stream] >= 2) { + prefetch_up++; + cachesim_ref(&LL, a + 5 * LL.line_size, 1); + } + } else + pf_seqblocks[stream] = 0; + } else if (pf_seqblocks[stream] < 0) { + if (pf_lastblock[stream] - 1 == block) { + pf_seqblocks[stream]--; + if (pf_seqblocks[stream] <= -2) { + prefetch_down++; + cachesim_ref(&LL, a - 5 * LL.line_size, 1); + } + } else + pf_seqblocks[stream] = 0; + } + pf_lastblock[stream] = block; + } +} + +/* simple model with hardware prefetch */ + +static CacheModelResult prefetch_I1_ref(Addr a, UChar size) +{ + if (cachesim_ref(&I1, a, size) == Hit) + return L1_Hit; + prefetch_LL_doref(a); + if (cachesim_ref(&LL, a, size) == Hit) + return LL_Hit; + return MemAccess; +} + +static CacheModelResult prefetch_D1_ref(Addr a, UChar size) +{ + if (cachesim_ref(&D1, a, size) == Hit) + return L1_Hit; + prefetch_LL_doref(a); + if (cachesim_ref(&LL, a, size) == Hit) + return LL_Hit; + return MemAccess; +} + +/* complex model with hardware prefetch */ + +static CacheModelResult prefetch_I1_Read(Addr a, UChar size) +{ + if (cachesim_ref(&I1, a, size) == Hit) + return L1_Hit; + prefetch_LL_doref(a); + switch (cachesim_ref_wb(&LL, Read, a, size)) { + case Hit: + return LL_Hit; + case Miss: + return MemAccess; + default: + break; + } + return WriteBackMemAccess; +} + +static CacheModelResult prefetch_D1_Read(Addr a, UChar size) +{ + if (cachesim_ref(&D1, a, size) == Hit) + return L1_Hit; + prefetch_LL_doref(a); + switch (cachesim_ref_wb(&LL, Read, a, size)) { + case Hit: + return LL_Hit; + case Miss: + return MemAccess; + default: + break; + } + return WriteBackMemAccess; +} + +static CacheModelResult prefetch_D1_Write(Addr a, UChar size) +{ + prefetch_LL_doref(a); + if (cachesim_ref(&D1, a, size) == Hit) { + /* Even for a L1 hit, the write-trough L1 passes + * the write to the LL to make the LL line dirty. + * But this causes no latency, so return the hit. 
+ */ + cachesim_ref_wb(&LL, Write, a, size); + return L1_Hit; + } + switch (cachesim_ref_wb(&LL, Write, a, size)) { + case Hit: + return LL_Hit; + case Miss: + return MemAccess; + default: + break; + } + return WriteBackMemAccess; +} + +/*------------------------------------------------------------*/ +/*--- Cache Simulation with use metric collection ---*/ +/*------------------------------------------------------------*/ + +/* can not be combined with write-back or prefetch */ + +static void cacheuse_initcache(cache_t2* c) +{ + int i; + unsigned int start_mask, start_val; + unsigned int end_mask, end_val; + + c->use = TG_MALLOC("cl.sim.cu_ic.1", sizeof(line_use) * c->sets * c->assoc); + c->loaded = + TG_MALLOC("cl.sim.cu_ic.2", sizeof(line_loaded) * c->sets * c->assoc); + c->line_start_mask = TG_MALLOC("cl.sim.cu_ic.3", sizeof(int) * c->line_size); + c->line_end_mask = TG_MALLOC("cl.sim.cu_ic.4", sizeof(int) * c->line_size); + + c->line_size_mask = c->line_size - 1; + + /* Meaning of line_start_mask/line_end_mask + * Example: for a given cache line, you get an access starting at + * byte offset 5, length 4, byte 5 - 8 was touched. For a cache + * line size of 32, you have 1 bit per byte in the mask: + * + * bit31 bit8 bit5 bit 0 + * | | | | + * 11..111111100000 line_start_mask[5] + * 00..000111111111 line_end_mask[(5+4)-1] + * + * use_mask |= line_start_mask[5] && line_end_mask[8] + * + */ + start_val = end_val = ~0; + if (c->line_size < 32) { + int bits_per_byte = 32 / c->line_size; + start_mask = (1 << bits_per_byte) - 1; + end_mask = start_mask << (32 - bits_per_byte); + for (i = 0; i < c->line_size; i++) { + c->line_start_mask[i] = start_val; + start_val = start_val & ~start_mask; + start_mask = start_mask << bits_per_byte; + + c->line_end_mask[c->line_size - i - 1] = end_val; + end_val = end_val & ~end_mask; + end_mask = end_mask >> bits_per_byte; + } + } else { + int bytes_per_bit = c->line_size / 32; + start_mask = 1; + end_mask = 1u << 31; + for (i = 0; i < c->line_size; i++) { + c->line_start_mask[i] = start_val; + c->line_end_mask[c->line_size - i - 1] = end_val; + if (((i + 1) % bytes_per_bit) == 0) { + start_val &= ~start_mask; + end_val &= ~end_mask; + start_mask <<= 1; + end_mask >>= 1; + } + } + } + + TG_DEBUG(6, "Config %s:\n", c->desc_line); + for (i = 0; i < c->line_size; i++) { + TG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n", i, + (UInt)c->line_start_mask[i], (UInt)c->line_end_mask[i]); + } + + /* We use lower tag bits as offset pointers to cache use info. + * I.e. some cache parameters don't work. + */ + if ((1 << c->tag_shift) < c->assoc) { + VG_(message)(Vg_DebugMsg, + "error: Use associativity < %d for cache use statistics!\n", + (1 << c->tag_shift)); + VG_(tool_panic)("Unsupported cache configuration"); + } +} + +/* for I1/D1 caches */ +#define CACHEUSE(L) \ + \ + static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \ + { \ + UInt set1 = (a >> L.line_size_bits) & (L.sets_min_1); \ + UInt set2 = ((a + size - 1) >> L.line_size_bits) & (L.sets_min_1); \ + UWord tag = a & L.tag_mask; \ + UWord tag2; \ + int i, j, idx; \ + UWord *set, tmp_tag; \ + UInt use_mask; \ + \ + TG_DEBUG(6, "%s.Acc(Addr %#lx, size %d): Sets [%u/%u]\n", L.name, a, \ + size, set1, set2); \ + \ + /* First case: word entirely within line. 
*/ \ + if (set1 == set2) { \ + \ + set = &(L.tags[set1 * L.assoc]); \ + use_mask = L.line_start_mask[a & L.line_size_mask] & \ + L.line_end_mask[(a + size - 1) & L.line_size_mask]; \ + \ + /* This loop is unrolled for just the first case, which is the most \ + */ \ + /* common. We can't unroll any further because it would screw up */ \ + /* if we have a direct-mapped (1-way) cache. */ \ + if (tag == (set[0] & L.tag_mask)) { \ + idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \ + L.use[idx].count++; \ + L.use[idx].mask |= use_mask; \ + TG_DEBUG( \ + 6, \ + " Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n", \ + idx, L.loaded[idx].memline, L.loaded[idx].iaddr, use_mask, \ + L.use[idx].mask, L.use[idx].count); \ + return L1_Hit; \ + } \ + /* If the tag is one other than the MRU, move it into the MRU spot */ \ + /* and shuffle the rest down. */ \ + for (i = 1; i < L.assoc; i++) { \ + if (tag == (set[i] & L.tag_mask)) { \ + tmp_tag = set[i]; \ + for (j = i; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tmp_tag; \ + idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \ + L.use[idx].count++; \ + L.use[idx].mask |= use_mask; \ + TG_DEBUG(6, \ + " Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, " \ + "count %u\n", \ + i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ + use_mask, L.use[idx].mask, L.use[idx].count); \ + return L1_Hit; \ + } \ + } \ + \ + /* A miss; install this tag as MRU, shuffle rest down. */ \ + tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \ + for (j = L.assoc - 1; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tag | tmp_tag; \ + idx = (set1 * L.assoc) + tmp_tag; \ + return update_##L##_use(&L, idx, use_mask, a & ~L.line_size_mask); \ + \ + /* Second case: word straddles two lines. */ \ + /* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \ + } else if (((set1 + 1) & (L.sets_min_1)) == set2) { \ + Int miss1 = 0, miss2 = 0; /* 0: L1 hit, 1:L1 miss, 2:LL miss */ \ + set = &(L.tags[set1 * L.assoc]); \ + use_mask = L.line_start_mask[a & L.line_size_mask]; \ + if (tag == (set[0] & L.tag_mask)) { \ + idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \ + L.use[idx].count++; \ + L.use[idx].mask |= use_mask; \ + TG_DEBUG( \ + 6, \ + " Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n", \ + idx, L.loaded[idx].memline, L.loaded[idx].iaddr, use_mask, \ + L.use[idx].mask, L.use[idx].count); \ + goto block2; \ + } \ + for (i = 1; i < L.assoc; i++) { \ + if (tag == (set[i] & L.tag_mask)) { \ + tmp_tag = set[i]; \ + for (j = i; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tmp_tag; \ + idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \ + L.use[idx].count++; \ + L.use[idx].mask |= use_mask; \ + TG_DEBUG(6, \ + " Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, " \ + "count %u\n", \ + i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ + use_mask, L.use[idx].mask, L.use[idx].count); \ + goto block2; \ + } \ + } \ + tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \ + for (j = L.assoc - 1; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tag | tmp_tag; \ + idx = (set1 * L.assoc) + tmp_tag; \ + miss1 = update_##L##_use(&L, idx, use_mask, a & ~L.line_size_mask); \ + block2: \ + set = &(L.tags[set2 * L.assoc]); \ + use_mask = L.line_end_mask[(a + size - 1) & L.line_size_mask]; \ + tag2 = (a + size - 1) & L.tag_mask; \ + if (tag2 == (set[0] & L.tag_mask)) { \ + idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask); \ + L.use[idx].count++; \ + L.use[idx].mask |= use_mask; \ + TG_DEBUG( \ + 6, \ + " Hit0 [idx %d] (line %#lx from %#lx): %x => 
%08x, count %u\n", \ + idx, L.loaded[idx].memline, L.loaded[idx].iaddr, use_mask, \ + L.use[idx].mask, L.use[idx].count); \ + return miss1; \ + } \ + for (i = 1; i < L.assoc; i++) { \ + if (tag2 == (set[i] & L.tag_mask)) { \ + tmp_tag = set[i]; \ + for (j = i; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tmp_tag; \ + idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask); \ + L.use[idx].count++; \ + L.use[idx].mask |= use_mask; \ + TG_DEBUG(6, \ + " Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, " \ + "count %u\n", \ + i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \ + use_mask, L.use[idx].mask, L.use[idx].count); \ + return miss1; \ + } \ + } \ + tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \ + for (j = L.assoc - 1; j > 0; j--) { \ + set[j] = set[j - 1]; \ + } \ + set[0] = tag2 | tmp_tag; \ + idx = (set2 * L.assoc) + tmp_tag; \ + miss2 = update_##L##_use(&L, idx, use_mask, \ + (a + size - 1) & ~L.line_size_mask); \ + return (miss1 == MemAccess || miss2 == MemAccess) ? MemAccess \ + : LL_Hit; \ + \ + } else { \ + VG_(printf)("addr: %#lx size: %u sets: %u %u", a, size, set1, \ + set2); \ + VG_(tool_panic)("item straddles more than two cache sets"); \ + } \ + return 0; \ + } + +/* logarithmic bitcounting algorithm, see + * http://graphics.stanford.edu/~seander/bithacks.html + */ +static __inline__ unsigned int countBits(unsigned int bits) +{ + unsigned int c; // store the total here + const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers + const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF}; + + c = bits; + c = ((c >> S[0]) & B[0]) + (c & B[0]); + c = ((c >> S[1]) & B[1]) + (c & B[1]); + c = ((c >> S[2]) & B[2]) + (c & B[2]); + c = ((c >> S[3]) & B[3]) + (c & B[3]); + c = ((c >> S[4]) & B[4]) + (c & B[4]); + return c; +} + +static void update_LL_use(int idx, Addr memline) +{ + line_loaded* loaded = &(LL.loaded[idx]); + line_use* use = &(LL.use[idx]); + int i = ((32 - countBits(use->mask)) * LL.line_size) >> 5; + + TG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n", idx, + TG_(bb_base) + current_ii->instr_offset, memline); + if (use->count > 0) { + TG_DEBUG(2, + " old: used %u, loss bits %d (%08x) [line %#lx from %#lx]\n", + use->count, i, use->mask, loaded->memline, loaded->iaddr); + TG_DEBUG(2, " collect: %d, use_base %p\n", TG_(current_state).collect, + loaded->use_base); + + if (TG_(current_state).collect && loaded->use_base) { + (loaded->use_base)[off_LL_AcCost] += 1000 / use->count; + (loaded->use_base)[off_LL_SpLoss] += i; + } + } + + use->count = 0; + use->mask = 0; + + loaded->memline = memline; + loaded->iaddr = TG_(bb_base) + current_ii->instr_offset; + loaded->use_base = (TG_(current_state).nonskipped) + ? 
TG_(current_state).nonskipped->skipped + : TG_(cost_base) + current_ii->cost_offset; +} + +static CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded) +{ + UInt setNo = (memline >> LL.line_size_bits) & (LL.sets_min_1); + UWord* set = &(LL.tags[setNo * LL.assoc]); + UWord tag = memline & LL.tag_mask; + + int i, j, idx; + UWord tmp_tag; + + TG_DEBUG(6, "LL.Acc(Memline %#lx): Set %u\n", memline, setNo); + + if (tag == (set[0] & LL.tag_mask)) { + idx = (setNo * LL.assoc) + (set[0] & ~LL.tag_mask); + l1_loaded->dep_use = &(LL.use[idx]); + + TG_DEBUG(6, " Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %u\n", + idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr, + LL.use[idx].mask, LL.use[idx].count); + return LL_Hit; + } + for (i = 1; i < LL.assoc; i++) { + if (tag == (set[i] & LL.tag_mask)) { + tmp_tag = set[i]; + for (j = i; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tmp_tag; + idx = (setNo * LL.assoc) + (tmp_tag & ~LL.tag_mask); + l1_loaded->dep_use = &(LL.use[idx]); + + TG_DEBUG(6, + " Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %u\n", + i, idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr, + LL.use[idx].mask, LL.use[idx].count); + return LL_Hit; + } + } + + /* A miss; install this tag as MRU, shuffle rest down. */ + tmp_tag = set[LL.assoc - 1] & ~LL.tag_mask; + for (j = LL.assoc - 1; j > 0; j--) { + set[j] = set[j - 1]; + } + set[0] = tag | tmp_tag; + idx = (setNo * LL.assoc) + tmp_tag; + l1_loaded->dep_use = &(LL.use[idx]); + + update_LL_use(idx, memline); + + return MemAccess; +} + +#define UPDATE_USE(L) \ + \ + static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \ + UInt mask, Addr memline) \ + { \ + line_loaded* loaded = &(cache->loaded[idx]); \ + line_use* use = &(cache->use[idx]); \ + int c = ((32 - countBits(use->mask)) * cache->line_size) >> 5; \ + \ + TG_DEBUG(2, \ + " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \ + cache->name, idx, TG_(bb_base) + current_ii->instr_offset, \ + memline, mask); \ + if (use->count > 0) { \ + TG_DEBUG( \ + 2, " old: used %u, loss bits %d (%08x) [line %#lx from %#lx]\n", \ + use->count, c, use->mask, loaded->memline, loaded->iaddr); \ + TG_DEBUG(2, " collect: %d, use_base %p\n", \ + TG_(current_state).collect, loaded->use_base); \ + \ + if (TG_(current_state).collect && loaded->use_base) { \ + (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count; \ + (loaded->use_base)[off_##L##_SpLoss] += c; \ + \ + /* FIXME (?): L1/LL line sizes must be equal ! */ \ + loaded->dep_use->mask |= use->mask; \ + loaded->dep_use->count += use->count; \ + } \ + } \ + \ + use->count = 1; \ + use->mask = mask; \ + loaded->memline = memline; \ + loaded->iaddr = TG_(bb_base) + current_ii->instr_offset; \ + loaded->use_base = (TG_(current_state).nonskipped) \ + ? 
TG_(current_state).nonskipped->skipped \ + : TG_(cost_base) + current_ii->cost_offset; \ + \ + if (memline == 0) \ + return LL_Hit; \ + return cacheuse_LL_access(memline, loaded); \ + } + +UPDATE_USE(I1); +UPDATE_USE(D1); + +CACHEUSE(I1); +CACHEUSE(D1); + +static void cacheuse_finish(void) +{ + int i; + InstrInfo ii = {0, 0, 0, 0}; + + if (!TG_(current_state).collect) + return; + + TG_(bb_base) = 0; + current_ii = ⅈ /* needs to be set for update_XX_use */ + TG_(cost_base) = 0; + + /* update usage counters */ + if (I1.use) + for (i = 0; i < I1.sets * I1.assoc; i++) + if (I1.loaded[i].use_base) + update_I1_use(&I1, i, 0, 0); + + if (D1.use) + for (i = 0; i < D1.sets * D1.assoc; i++) + if (D1.loaded[i].use_base) + update_D1_use(&D1, i, 0, 0); + + if (LL.use) + for (i = 0; i < LL.sets * LL.assoc; i++) + if (LL.loaded[i].use_base) + update_LL_use(i, 0); + + current_ii = 0; +} + +/*------------------------------------------------------------*/ +/*--- Helper functions called by instrumented code ---*/ +/*------------------------------------------------------------*/ + +static __inline__ void inc_costs(CacheModelResult r, ULong* c1, ULong* c2) +{ + switch (r) { + case WriteBackMemAccess: + if (clo_simulate_writeback) { + c1[3]++; + c2[3]++; + } + // fall through + + case MemAccess: + c1[2]++; + c2[2]++; + // fall through + + case LL_Hit: + c1[1]++; + c2[1]++; + // fall through + + default: + c1[0]++; + c2[0]++; + } +} + +static const HChar* cacheRes(CacheModelResult r) +{ + switch (r) { + case L1_Hit: + return "L1 Hit "; + case LL_Hit: + return "LL Hit "; + case MemAccess: + return "LL Miss"; + case WriteBackMemAccess: + return "LL Miss (dirty)"; + default: + tl_assert(0); + } + return "??"; +} + +VG_REGPARM(1) +static void log_1I0D(InstrInfo* ii) +{ + CacheModelResult IrRes; + + current_ii = ii; + IrRes = + (*simulator.I1_Read)(TG_(bb_base) + ii->instr_offset, ii->instr_size); + + TG_DEBUG(6, "log_1I0D: Ir %#lx/%u => %s\n", + TG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes)); + + if (TG_(current_state).collect) { + ULong* cost_Ir; + + if (TG_(current_state).nonskipped) + cost_Ir = TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); + else + cost_Ir = + TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR]; + + inc_costs(IrRes, cost_Ir, TG_(current_state).cost + fullOffset(EG_IR)); + } +} + +VG_REGPARM(2) +static void log_2I0D(InstrInfo* ii1, InstrInfo* ii2) +{ + CacheModelResult Ir1Res, Ir2Res; + ULong* global_cost_Ir; + + current_ii = ii1; + Ir1Res = + (*simulator.I1_Read)(TG_(bb_base) + ii1->instr_offset, ii1->instr_size); + current_ii = ii2; + Ir2Res = + (*simulator.I1_Read)(TG_(bb_base) + ii2->instr_offset, ii2->instr_size); + + TG_DEBUG(6, "log_2I0D: Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n", + TG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res), + TG_(bb_base) + ii2->instr_offset, ii2->instr_size, + cacheRes(Ir2Res)); + + if (!TG_(current_state).collect) + return; + + global_cost_Ir = TG_(current_state).cost + fullOffset(EG_IR); + if (TG_(current_state).nonskipped) { + ULong* skipped_cost_Ir = + TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); + + inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir); + inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir); + return; + } + + inc_costs(Ir1Res, global_cost_Ir, + TG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]); + inc_costs(Ir2Res, global_cost_Ir, + TG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]); +} + +VG_REGPARM(3) +static void log_3I0D(InstrInfo* ii1, 
InstrInfo* ii2, InstrInfo* ii3) +{ + CacheModelResult Ir1Res, Ir2Res, Ir3Res; + ULong* global_cost_Ir; + + current_ii = ii1; + Ir1Res = + (*simulator.I1_Read)(TG_(bb_base) + ii1->instr_offset, ii1->instr_size); + current_ii = ii2; + Ir2Res = + (*simulator.I1_Read)(TG_(bb_base) + ii2->instr_offset, ii2->instr_size); + current_ii = ii3; + Ir3Res = + (*simulator.I1_Read)(TG_(bb_base) + ii3->instr_offset, ii3->instr_size); + + TG_DEBUG( + 6, "log_3I0D: Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n", + TG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res), + TG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res), + TG_(bb_base) + ii3->instr_offset, ii3->instr_size, cacheRes(Ir3Res)); + + if (!TG_(current_state).collect) + return; + + global_cost_Ir = TG_(current_state).cost + fullOffset(EG_IR); + if (TG_(current_state).nonskipped) { + ULong* skipped_cost_Ir = + TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); + inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir); + inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir); + inc_costs(Ir3Res, global_cost_Ir, skipped_cost_Ir); + return; + } + + inc_costs(Ir1Res, global_cost_Ir, + TG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]); + inc_costs(Ir2Res, global_cost_Ir, + TG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]); + inc_costs(Ir3Res, global_cost_Ir, + TG_(cost_base) + ii3->cost_offset + ii3->eventset->offset[EG_IR]); +} + +/* Instruction doing a read access */ + +VG_REGPARM(3) +static void log_1I1Dr(InstrInfo* ii, Addr data_addr, Word data_size) +{ + CacheModelResult IrRes, DrRes; + + current_ii = ii; + IrRes = + (*simulator.I1_Read)(TG_(bb_base) + ii->instr_offset, ii->instr_size); + DrRes = (*simulator.D1_Read)(data_addr, data_size); + + TG_DEBUG(6, "log_1I1Dr: Ir %#lx/%u => %s, Dr %#lx/%ld => %s\n", + TG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes), + data_addr, data_size, cacheRes(DrRes)); + + if (TG_(current_state).collect) { + ULong *cost_Ir, *cost_Dr; + + if (TG_(current_state).nonskipped) { + cost_Ir = TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); + cost_Dr = TG_(current_state).nonskipped->skipped + fullOffset(EG_DR); + } else { + cost_Ir = + TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR]; + cost_Dr = + TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR]; + } + + inc_costs(IrRes, cost_Ir, TG_(current_state).cost + fullOffset(EG_IR)); + inc_costs(DrRes, cost_Dr, TG_(current_state).cost + fullOffset(EG_DR)); + } +} + +/* Note that addEvent_D_guarded assumes that log_0I1Dr and log_0I1Dw + have exactly the same prototype. If you change them, you must + change addEvent_D_guarded too. 
*/ +VG_REGPARM(3) +static void log_0I1Dr(InstrInfo* ii, Addr data_addr, Word data_size) +{ + CacheModelResult DrRes; + + current_ii = ii; + DrRes = (*simulator.D1_Read)(data_addr, data_size); + + TG_DEBUG(6, "log_0I1Dr: Dr %#lx/%ld => %s\n", data_addr, data_size, + cacheRes(DrRes)); + + if (TG_(current_state).collect) { + ULong* cost_Dr; + + if (TG_(current_state).nonskipped) + cost_Dr = TG_(current_state).nonskipped->skipped + fullOffset(EG_DR); + else + cost_Dr = + TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR]; + + inc_costs(DrRes, cost_Dr, TG_(current_state).cost + fullOffset(EG_DR)); + } +} + +/* Instruction doing a write access */ + +VG_REGPARM(3) +static void log_1I1Dw(InstrInfo* ii, Addr data_addr, Word data_size) +{ + CacheModelResult IrRes, DwRes; + + current_ii = ii; + IrRes = + (*simulator.I1_Read)(TG_(bb_base) + ii->instr_offset, ii->instr_size); + DwRes = (*simulator.D1_Write)(data_addr, data_size); + + TG_DEBUG(6, "log_1I1Dw: Ir %#lx/%u => %s, Dw %#lx/%ld => %s\n", + TG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes), + data_addr, data_size, cacheRes(DwRes)); + + if (TG_(current_state).collect) { + ULong *cost_Ir, *cost_Dw; + + if (TG_(current_state).nonskipped) { + cost_Ir = TG_(current_state).nonskipped->skipped + fullOffset(EG_IR); + cost_Dw = TG_(current_state).nonskipped->skipped + fullOffset(EG_DW); + } else { + cost_Ir = + TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR]; + cost_Dw = + TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW]; + } + + inc_costs(IrRes, cost_Ir, TG_(current_state).cost + fullOffset(EG_IR)); + inc_costs(DwRes, cost_Dw, TG_(current_state).cost + fullOffset(EG_DW)); + } +} + +/* See comment on log_0I1Dr. */ +VG_REGPARM(3) +static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size) +{ + CacheModelResult DwRes; + + current_ii = ii; + DwRes = (*simulator.D1_Write)(data_addr, data_size); + + TG_DEBUG(6, "log_0I1Dw: Dw %#lx/%ld => %s\n", data_addr, data_size, + cacheRes(DwRes)); + + if (TG_(current_state).collect) { + ULong* cost_Dw; + + if (TG_(current_state).nonskipped) + cost_Dw = TG_(current_state).nonskipped->skipped + fullOffset(EG_DW); + else + cost_Dw = + TG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW]; + + inc_costs(DwRes, cost_Dw, TG_(current_state).cost + fullOffset(EG_DW)); + } +} + +/*------------------------------------------------------------*/ +/*--- Cache configuration ---*/ +/*------------------------------------------------------------*/ + +static cache_t clo_I1_cache = UNDEFINED_CACHE; +static cache_t clo_D1_cache = UNDEFINED_CACHE; +static cache_t clo_LL_cache = UNDEFINED_CACHE; + +/* Initialize and clear simulator state */ +static void cachesim_post_clo_init(void) +{ + /* Cache configurations. 
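+     (Taken from --I1/--D1/--LL when given, otherwise chosen by the core.)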
*/ + cache_t I1c, D1c, LLc; + + /* Initialize access handlers */ + if (!TG_(clo).simulate_cache) { + TG_(cachesim).log_1I0D = 0; + TG_(cachesim).log_1I0D_name = "(no function)"; + TG_(cachesim).log_2I0D = 0; + TG_(cachesim).log_2I0D_name = "(no function)"; + TG_(cachesim).log_3I0D = 0; + TG_(cachesim).log_3I0D_name = "(no function)"; + + TG_(cachesim).log_1I1Dr = 0; + TG_(cachesim).log_1I1Dr_name = "(no function)"; + TG_(cachesim).log_1I1Dw = 0; + TG_(cachesim).log_1I1Dw_name = "(no function)"; + + TG_(cachesim).log_0I1Dr = 0; + TG_(cachesim).log_0I1Dr_name = "(no function)"; + TG_(cachesim).log_0I1Dw = 0; + TG_(cachesim).log_0I1Dw_name = "(no function)"; + return; + } + + /* Configuration of caches only needed with real cache simulation */ + VG_(post_clo_init_configure_caches)(&I1c, &D1c, &LLc, &clo_I1_cache, + &clo_D1_cache, &clo_LL_cache); + + I1.name = "I1"; + D1.name = "D1"; + LL.name = "LL"; + + // min_line_size is used to make sure that we never feed + // accesses to the simulator straddling more than two + // cache lines at any cache level + TG_(min_line_size) = + (I1c.line_size < D1c.line_size) ? I1c.line_size : D1c.line_size; + TG_(min_line_size) = + (LLc.line_size < TG_(min_line_size)) ? LLc.line_size : TG_(min_line_size); + + Int largest_load_or_store_size = + VG_(machine_get_size_of_largest_guest_register)(); + if (TG_(min_line_size) < largest_load_or_store_size) { + /* We can't continue, because the cache simulation might + straddle more than 2 lines, and it will assert. So let's + just stop before we start. */ + VG_(umsg)("Tracegrind: cannot continue: the minimum line size (%d)\n", + (Int)TG_(min_line_size)); + VG_(umsg)( + " must be equal to or larger than the maximum register size (%d)\n", + largest_load_or_store_size); + VG_(umsg)(" but it is not. 
Exiting now.\n"); + VG_(exit)(1); + } + + cachesim_initcache(I1c, &I1); + cachesim_initcache(D1c, &D1); + cachesim_initcache(LLc, &LL); + + /* the other cache simulators use the standard helpers + * with dispatching via simulator struct */ + + TG_(cachesim).log_1I0D = log_1I0D; + TG_(cachesim).log_1I0D_name = "log_1I0D"; + TG_(cachesim).log_2I0D = log_2I0D; + TG_(cachesim).log_2I0D_name = "log_2I0D"; + TG_(cachesim).log_3I0D = log_3I0D; + TG_(cachesim).log_3I0D_name = "log_3I0D"; + + TG_(cachesim).log_1I1Dr = log_1I1Dr; + TG_(cachesim).log_1I1Dw = log_1I1Dw; + TG_(cachesim).log_1I1Dr_name = "log_1I1Dr"; + TG_(cachesim).log_1I1Dw_name = "log_1I1Dw"; + + TG_(cachesim).log_0I1Dr = log_0I1Dr; + TG_(cachesim).log_0I1Dw = log_0I1Dw; + TG_(cachesim).log_0I1Dr_name = "log_0I1Dr"; + TG_(cachesim).log_0I1Dw_name = "log_0I1Dw"; + + if (clo_collect_cacheuse) { + + /* Output warning for not supported option combinations */ + if (clo_simulate_hwpref) { + VG_(message)(Vg_DebugMsg, "warning: prefetch simulation can not be " + "used with cache usage\n"); + clo_simulate_hwpref = False; + } + + if (clo_simulate_writeback) { + VG_(message)(Vg_DebugMsg, "warning: write-back simulation can not be " + "used with cache usage\n"); + clo_simulate_writeback = False; + } + + simulator.I1_Read = cacheuse_I1_doRead; + simulator.D1_Read = cacheuse_D1_doRead; + simulator.D1_Write = cacheuse_D1_doRead; + return; + } + + if (clo_simulate_hwpref) { + prefetch_clear(); + + if (clo_simulate_writeback) { + simulator.I1_Read = prefetch_I1_Read; + simulator.D1_Read = prefetch_D1_Read; + simulator.D1_Write = prefetch_D1_Write; + } else { + simulator.I1_Read = prefetch_I1_ref; + simulator.D1_Read = prefetch_D1_ref; + simulator.D1_Write = prefetch_D1_ref; + } + + return; + } + + if (clo_simulate_writeback) { + simulator.I1_Read = cachesim_I1_Read; + simulator.D1_Read = cachesim_D1_Read; + simulator.D1_Write = cachesim_D1_Write; + } else { + simulator.I1_Read = cachesim_I1_ref; + simulator.D1_Read = cachesim_D1_ref; + simulator.D1_Write = cachesim_D1_ref; + } +} + +/* Clear simulator state. Has to be initialized before */ +static void cachesim_clear(void) +{ + cachesim_clearcache(&I1); + cachesim_clearcache(&D1); + cachesim_clearcache(&LL); + + prefetch_clear(); +} + +static void cachesim_print_opts(void) +{ + VG_(printf)( + "\n cache simulator options (does cache simulation if used):\n" + " --simulate-wb=no|yes Count write-back events [no]\n" + " --simulate-hwpref=no|yes Simulate hardware prefetch [no]\n" +#if TG_EXPERIMENTAL + " --simulate-sectors=no|yes Simulate sectored behaviour [no]\n" +#endif + " --cacheuse=no|yes Collect cache block use [no]\n"); + VG_(print_cache_clo_opts)(); +} + +/* Check for command line option for cache configuration. + * Return False if unknown and not handled. 
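+ * Handles --simulate-wb, --simulate-hwpref, --simulate-sectors, --cacheuse
+ * and the cache geometry options (--I1/--D1/--LL).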
+ * + * Called from TG_(process_cmd_line_option)() in clo.c + */ +static Bool cachesim_parse_opt(const HChar* arg) +{ + if VG_BOOL_CLO (arg, "--simulate-wb", clo_simulate_writeback) { + } else if VG_BOOL_CLO (arg, "--simulate-hwpref", clo_simulate_hwpref) { + } else if VG_BOOL_CLO (arg, "--simulate-sectors", clo_simulate_sectors) { + } + + else if VG_BOOL_CLO (arg, "--cacheuse", clo_collect_cacheuse) { + } + + else if (VG_(str_clo_cache_opt)(arg, &clo_I1_cache, &clo_D1_cache, + &clo_LL_cache)) { + } + + else + return False; + + return True; +} + +static void cachesim_printstat(Int l1, Int l2, Int l3) +{ + FullCost total = TG_(total_cost), D_total = 0; + ULong LL_total_m, LL_total_mr, LL_total_mw, LL_total, LL_total_r, LL_total_w; + + if ((VG_(clo_verbosity) > 1) && clo_simulate_hwpref) { + VG_(message)(Vg_DebugMsg, "Prefetch Up: %llu\n", prefetch_up); + VG_(message)(Vg_DebugMsg, "Prefetch Down: %llu\n", prefetch_down); + VG_(message)(Vg_DebugMsg, "\n"); + } + + VG_(message)(Vg_UserMsg, "I1 misses: %'*llu\n", l1, + total[fullOffset(EG_IR) + 1]); + + VG_(message)(Vg_UserMsg, "LLi misses: %'*llu\n", l1, + total[fullOffset(EG_IR) + 2]); + + if (0 == total[fullOffset(EG_IR)]) + total[fullOffset(EG_IR)] = 1; + + VG_(message)(Vg_UserMsg, "I1 miss rate: %*.2f%%\n", l1, + total[fullOffset(EG_IR) + 1] * 100.0 / + total[fullOffset(EG_IR)]); + + VG_(message)(Vg_UserMsg, "LLi miss rate: %*.2f%%\n", l1, + total[fullOffset(EG_IR) + 2] * 100.0 / + total[fullOffset(EG_IR)]); + + VG_(message)(Vg_UserMsg, "\n"); + + /* D cache results. + Use the D_refs.rd and D_refs.wr values to determine the + * width of columns 2 & 3. */ + + D_total = TG_(get_eventset_cost)(TG_(sets).full); + TG_(init_cost)(TG_(sets).full, D_total); + // we only use the first 3 values of D_total, adding up Dr and Dw costs + TG_(copy_cost) + (TG_(get_event_set)(EG_DR), D_total, total + fullOffset(EG_DR)); + TG_(add_cost)(TG_(get_event_set)(EG_DW), D_total, total + fullOffset(EG_DW)); + + VG_(message)(Vg_UserMsg, "D refs: %'*llu (%'*llu rd + %'*llu wr)\n", + l1, D_total[0], l2, total[fullOffset(EG_DR)], l3, + total[fullOffset(EG_DW)]); + + VG_(message)(Vg_UserMsg, "D1 misses: %'*llu (%'*llu rd + %'*llu wr)\n", + l1, D_total[1], l2, total[fullOffset(EG_DR) + 1], l3, + total[fullOffset(EG_DW) + 1]); + + VG_(message)(Vg_UserMsg, "LLd misses: %'*llu (%'*llu rd + %'*llu wr)\n", + l1, D_total[2], l2, total[fullOffset(EG_DR) + 2], l3, + total[fullOffset(EG_DW) + 2]); + + if (0 == D_total[0]) + D_total[0] = 1; + if (0 == total[fullOffset(EG_DR)]) + total[fullOffset(EG_DR)] = 1; + if (0 == total[fullOffset(EG_DW)]) + total[fullOffset(EG_DW)] = 1; + + VG_(message)( + Vg_UserMsg, "D1 miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n", l1, + D_total[1] * 100.0 / D_total[0], l2, + total[fullOffset(EG_DR) + 1] * 100.0 / total[fullOffset(EG_DR)], l3, + total[fullOffset(EG_DW) + 1] * 100.0 / total[fullOffset(EG_DW)]); + + VG_(message)( + Vg_UserMsg, "LLd miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n", l1, + D_total[2] * 100.0 / D_total[0], l2, + total[fullOffset(EG_DR) + 2] * 100.0 / total[fullOffset(EG_DR)], l3, + total[fullOffset(EG_DW) + 2] * 100.0 / total[fullOffset(EG_DW)]); + VG_(message)(Vg_UserMsg, "\n"); + + /* LL overall results */ + + LL_total = total[fullOffset(EG_DR) + 1] + total[fullOffset(EG_DW) + 1] + + total[fullOffset(EG_IR) + 1]; + LL_total_r = total[fullOffset(EG_DR) + 1] + total[fullOffset(EG_IR) + 1]; + LL_total_w = total[fullOffset(EG_DW) + 1]; + VG_(message)(Vg_UserMsg, "LL refs: %'*llu (%'*llu rd + %'*llu wr)\n", + l1, LL_total, l2, LL_total_r, 
l3, LL_total_w); + + LL_total_m = total[fullOffset(EG_DR) + 2] + total[fullOffset(EG_DW) + 2] + + total[fullOffset(EG_IR) + 2]; + LL_total_mr = total[fullOffset(EG_DR) + 2] + total[fullOffset(EG_IR) + 2]; + LL_total_mw = total[fullOffset(EG_DW) + 2]; + VG_(message)(Vg_UserMsg, "LL misses: %'*llu (%'*llu rd + %'*llu wr)\n", + l1, LL_total_m, l2, LL_total_mr, l3, LL_total_mw); + + VG_(message)( + Vg_UserMsg, "LL miss rate: %*.1f%% (%*.1f%% + %*.1f%% )\n", l1, + LL_total_m * 100.0 / (total[fullOffset(EG_IR)] + D_total[0]), l2, + LL_total_mr * 100.0 / + (total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]), + l3, LL_total_mw * 100.0 / total[fullOffset(EG_DW)]); +} + +/*------------------------------------------------------------*/ +/*--- Setup for Event set. ---*/ +/*------------------------------------------------------------*/ + +struct event_sets TG_(sets); + +void TG_(init_eventsets)(void) +{ + // Event groups from which the event sets are composed + // the "Use" group only is used with "cacheuse" simulation + if (clo_collect_cacheuse) + TG_(register_event_group4) + (EG_USE, "AcCost1", "SpLoss1", "AcCost2", "SpLoss2"); + + if (!TG_(clo).simulate_cache) + TG_(register_event_group)(EG_IR, "Ir"); + else if (!clo_simulate_writeback) { + TG_(register_event_group3)(EG_IR, "Ir", "I1mr", "ILmr"); + TG_(register_event_group3)(EG_DR, "Dr", "D1mr", "DLmr"); + TG_(register_event_group3)(EG_DW, "Dw", "D1mw", "DLmw"); + } else { // clo_simulate_writeback + TG_(register_event_group4)(EG_IR, "Ir", "I1mr", "ILmr", "ILdmr"); + TG_(register_event_group4)(EG_DR, "Dr", "D1mr", "DLmr", "DLdmr"); + TG_(register_event_group4)(EG_DW, "Dw", "D1mw", "DLmw", "DLdmw"); + } + + if (TG_(clo).simulate_branch) { + TG_(register_event_group2)(EG_BC, "Bc", "Bcm"); + TG_(register_event_group2)(EG_BI, "Bi", "Bim"); + } + + if (TG_(clo).collect_bus) + TG_(register_event_group)(EG_BUS, "Ge"); + + if (TG_(clo).collect_systime != systime_no) { + if (TG_(clo).collect_systime == systime_nsec) + TG_(register_event_group3) + (EG_SYS, "sysCount", "sysTime", "sysCpuTime"); + else TG_(register_event_group2)(EG_SYS, "sysCount", "sysTime"); + } + + // event set used as base for instruction self cost + TG_(sets).base = TG_(get_event_set2)(EG_USE, EG_IR); + + // event set comprising all event groups, used for inclusive cost + TG_(sets).full = TG_(add_event_group2)(TG_(sets).base, EG_DR, EG_DW); + TG_(sets).full = TG_(add_event_group2)(TG_(sets).full, EG_BC, EG_BI); + TG_(sets).full = TG_(add_event_group)(TG_(sets).full, EG_BUS); + TG_(sets).full = TG_(add_event_group)(TG_(sets).full, EG_SYS); + + TG_DEBUGIF(1) + { + TG_DEBUG(1, "EventSets:\n"); + TG_(print_eventset)(-2, TG_(sets).base); + TG_(print_eventset)(-2, TG_(sets).full); + } + + /* Not-existing events are silently ignored */ + TG_(dumpmap) = TG_(get_eventmapping)(TG_(sets).full); + TG_(append_event)(TG_(dumpmap), "Ir"); + TG_(append_event)(TG_(dumpmap), "Dr"); + TG_(append_event)(TG_(dumpmap), "Dw"); + TG_(append_event)(TG_(dumpmap), "I1mr"); + TG_(append_event)(TG_(dumpmap), "D1mr"); + TG_(append_event)(TG_(dumpmap), "D1mw"); + TG_(append_event)(TG_(dumpmap), "ILmr"); + TG_(append_event)(TG_(dumpmap), "DLmr"); + TG_(append_event)(TG_(dumpmap), "DLmw"); + TG_(append_event)(TG_(dumpmap), "ILdmr"); + TG_(append_event)(TG_(dumpmap), "DLdmr"); + TG_(append_event)(TG_(dumpmap), "DLdmw"); + TG_(append_event)(TG_(dumpmap), "Bc"); + TG_(append_event)(TG_(dumpmap), "Bcm"); + TG_(append_event)(TG_(dumpmap), "Bi"); + TG_(append_event)(TG_(dumpmap), "Bim"); + TG_(append_event)(TG_(dumpmap), 
"AcCost1"); + TG_(append_event)(TG_(dumpmap), "SpLoss1"); + TG_(append_event)(TG_(dumpmap), "AcCost2"); + TG_(append_event)(TG_(dumpmap), "SpLoss2"); + TG_(append_event)(TG_(dumpmap), "Ge"); + TG_(append_event)(TG_(dumpmap), "allocCount"); + TG_(append_event)(TG_(dumpmap), "allocSize"); + TG_(append_event)(TG_(dumpmap), "sysCount"); + TG_(append_event)(TG_(dumpmap), "sysTime"); + TG_(append_event)(TG_(dumpmap), "sysCpuTime"); +} + +static void cachesim_finish(void) +{ + if (clo_collect_cacheuse) + cacheuse_finish(); +} + +/*------------------------------------------------------------*/ +/*--- The simulator defined in this file ---*/ +/*------------------------------------------------------------*/ + +struct cachesim_if TG_(cachesim) = { + .print_opts = cachesim_print_opts, + .parse_opt = cachesim_parse_opt, + .post_clo_init = cachesim_post_clo_init, + .clear = cachesim_clear, + .printstat = cachesim_printstat, + .finish = cachesim_finish, + + /* these will be set by cachesim_post_clo_init */ + .log_1I0D = 0, + .log_2I0D = 0, + .log_3I0D = 0, + + .log_1I1Dr = 0, + .log_1I1Dw = 0, + + .log_0I1Dr = 0, + .log_0I1Dw = 0, + + .log_1I0D_name = "(no function)", + .log_2I0D_name = "(no function)", + .log_3I0D_name = "(no function)", + + .log_1I1Dr_name = "(no function)", + .log_1I1Dw_name = "(no function)", + + .log_0I1Dr_name = "(no function)", + .log_0I1Dw_name = "(no function)", +}; + +/*--------------------------------------------------------------------*/ +/*--- end ct_sim.c ---*/ +/*--------------------------------------------------------------------*/ diff --git a/tracegrind/tests/Makefile.am b/tracegrind/tests/Makefile.am new file mode 100644 index 000000000..0ad3b6ae5 --- /dev/null +++ b/tracegrind/tests/Makefile.am @@ -0,0 +1,71 @@ + +include $(top_srcdir)/Makefile.tool-tests.am + +SUBDIRS = . +DIST_SUBDIRS = . 
+ +dist_noinst_SCRIPTS = filter_stderr filter_trace + +check_PROGRAMS = \ + test_basic.bin \ + test_marker.bin \ + test_instr_toggle.bin \ + test_toggle_collect.bin \ + test_foo_bar_baz.bin \ + test_inline.bin \ + test_enter_inlined.bin \ + test_nested_inlined.bin \ + test_signal.bin \ + test_exception.bin \ + test_longjmp.bin \ + test_tailcall.bin \ + test_recursion.bin \ + test_thread_create.bin \ + test_thread_interleave.bin \ + test_syscall.bin + +AM_CPPFLAGS += -I$(top_srcdir)/tracegrind +AM_CFLAGS += $(AM_FLAG_M3264_PRI) +AM_CXXFLAGS += $(AM_FLAG_M3264_PRI) + +test_basic_bin_SOURCES = test_basic.c +test_marker_bin_SOURCES = test_marker.c +test_instr_toggle_bin_SOURCES = test_instr_toggle.c +test_toggle_collect_bin_SOURCES = test_toggle_collect.c +test_foo_bar_baz_bin_SOURCES = test_foo_bar_baz.c +test_inline_bin_SOURCES = test_inline.c +test_inline_bin_CFLAGS = $(AM_CFLAGS) -O2 -g +test_enter_inlined_bin_SOURCES = test_enter_inlined.c +test_enter_inlined_bin_CFLAGS = $(AM_CFLAGS) -O2 -g +test_nested_inlined_bin_SOURCES = test_nested_inlined.c +test_nested_inlined_bin_CFLAGS = $(AM_CFLAGS) -O1 -g +test_signal_bin_SOURCES = test_signal.c +test_exception_bin_SOURCES = test_exception.cpp +test_longjmp_bin_SOURCES = test_longjmp.c +test_tailcall_bin_SOURCES = test_tailcall.c +test_tailcall_bin_CFLAGS = $(AM_CFLAGS) -O2 -g +test_recursion_bin_SOURCES = test_recursion.c +test_thread_create_bin_SOURCES = test_thread_create.c +test_thread_create_bin_LDADD = -lpthread +test_thread_interleave_bin_SOURCES = test_thread_interleave.c +test_thread_interleave_bin_LDADD = -lpthread +test_syscall_bin_SOURCES = test_syscall.c + +EXTRA_DIST = \ + test_basic.vgtest test_basic.stderr.exp test_basic.post.exp \ + test_marker.vgtest test_marker.stderr.exp test_marker.post.exp \ + test_instr_toggle.vgtest test_instr_toggle.stderr.exp test_instr_toggle.post.exp \ + test_toggle_collect.vgtest test_toggle_collect.stderr.exp test_toggle_collect.post.exp \ + test_foo_bar_baz.vgtest test_foo_bar_baz.stderr.exp test_foo_bar_baz.post.exp \ + test_inline.vgtest test_inline.stderr.exp test_inline.post.exp \ + test_enter_inlined.vgtest test_enter_inlined.stderr.exp test_enter_inlined.post.exp \ + test_nested_inlined.vgtest test_nested_inlined.stderr.exp test_nested_inlined.post.exp \ + test_signal.vgtest test_signal.stderr.exp test_signal.post.exp \ + test_exception.vgtest test_exception.stderr.exp test_exception.post.exp \ + test_longjmp.vgtest test_longjmp.stderr.exp test_longjmp.post.exp \ + test_tailcall.vgtest test_tailcall.stderr.exp test_tailcall.post.exp \ + test_recursion.vgtest test_recursion.stderr.exp test_recursion.post.exp \ + test_thread_create.vgtest test_thread_create.stderr.exp test_thread_create.post.exp \ + test_thread_interleave.vgtest test_thread_interleave.stderr.exp test_thread_interleave.post.exp \ + test_syscall.vgtest test_syscall.stderr.exp test_syscall.post.exp \ + test_schema.vgtest test_schema.stderr.exp test_schema.post.exp diff --git a/tracegrind/tests/filter_stderr b/tracegrind/tests/filter_stderr new file mode 100755 index 000000000..c62611e02 --- /dev/null +++ b/tracegrind/tests/filter_stderr @@ -0,0 +1,36 @@ +#! /bin/sh + +dir=`dirname $0` + +$dir/../../tests/filter_stderr_basic | + +# Remove "Tracegrind, ..." line and the following copyright line. 
+sed "/^Tracegrind, a streaming trace cache profiler/ , /./ d" | + +# Remove pointer to tracegrind_control +sed "/^For interactive control,.*$/d" | + +# Remove numbers from "Collected" line +sed "s/^\(Collected *:\)[ 0-9]*$/\1/" | + +# Remove numbers from I/D/LL "refs:" lines +perl -p -e 's/((I|D|LL) *refs:)[ 0-9,()+rdw]*$/\1/' | + +# Remove numbers from I1/D1/LL/LLi/LLd "misses:" and "miss rates:" lines +perl -p -e 's/((I1|D1|LL|LLi|LLd) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' | + +# Remove numbers from "Branches:", "Mispredicts:, and "Mispred rate:" lines +perl -p -e 's/((Branches|Mispredicts|Mispred rate):)[ 0-9,()+condi%\.]*$/\1/' | + +# Remove CPUID warnings lines for P4s and other machines +sed "/warning: Pentium 4 with 12 KB micro-op instruction trace cache/d" | +sed "/Simulating a 16 KB I-cache with 32 B lines/d" | +sed "/warning: L3 cache found, using its data for the LL simulation./d" | +sed "/warning: L4 cache found, using its data for the LL simulation./d" | +sed "/Warning: Cannot auto-detect cache config, using defaults./d" | +sed "/Run with -v to see./d" | +sed "/warning: specified LL cache: line_size .*$/d" | +sed "/warning: simulated LL cache: line_size .*$/d" | + +# Remove trace output file path messages +sed "/^Streaming trace output to /d" diff --git a/tracegrind/tests/filter_trace b/tracegrind/tests/filter_trace new file mode 100755 index 000000000..1ccbbf46a --- /dev/null +++ b/tracegrind/tests/filter_trace @@ -0,0 +1,62 @@ +#!/bin/sh +# +# Filter tracegrind trace output (from tracegrind-analyzer) +# to normalize machine-dependent values for regression testing. +# + +# Normalize format/schema version numbers +sed 's/^Format Version: [0-9]\+$/Format Version: N/' | +sed 's/^Schema Version: [0-9]\+$/Schema Version: N/' | + +# Normalize object paths: replace full path to test binary with just the basename +# e.g. obj=/home/user/valgrind/tracegrind/tests/test_marker -> obj=test_marker +sed 's|obj=[^ |]*[/]||g' | + +# Normalize file paths: replace full source paths with just the basename +# e.g. file=/home/user/.../test_marker.c -> file=test_marker.c +sed 's|file=[^ |]*[/]||g' | + +# Normalize function address/stats that vary: Ir counts +# Replace Ir= with Ir=N +sed 's|Ir=[0-9]\+|Ir=N|g' | + +# Normalize syscall timing values (non-deterministic) +# Replace nonzero sysTime/sysCpuTime with >0 to assert they are measured +sed 's|sysTime=[1-9][0-9]*|sysTime=T|g' | +sed 's|sysCpuTime=[1-9][0-9]*|sysCpuTime=T|g' | + +# Remove the separator line +sed '/^-\{10,\}$/d' | + +# Normalize "Total rows:" count +sed 's/^Total rows: [0-9,]\+$/Total rows: N/' | + +# Normalize "Showing X of Y rows" +sed 's/^Showing [0-9,]\+ of [0-9,]\+ rows$/Showing N of N rows/' | + +# Normalize "Sequence range:" numbers +sed 's/^Sequence range: [0-9,]\+ - [0-9,]\+$/Sequence range: N - N/' | + +# Normalize event count percentages in stats +sed 's/\([0-9,]\+\) ([0-9.]\+%)/N (P%)/g' | + +# Normalize "Threads: N ([...])" +sed 's/^Threads: \([0-9]\+\) (\[.*\])/Threads: \1/' | + +# Remove "Top 10 functions" section (platform-dependent) +sed '/^Top 10 functions/,/^$/d' | + +# Remove "Fork events" section (platform-dependent) +sed '/^Fork events/,/^$/d' | + +# Remove "Thread create events" section (platform-dependent) +sed '/^Thread create events/,/^$/d' | + +# Normalize seq numbers in raw arrays: [1234, ...] -> [N, ...] +sed 's/^\[\([0-9]\+\),/[N,/g' | + +# Normalize seq= in formatted output +sed 's/seq=[0-9]\+/seq=N/g' | + +# Strip GCC optimization suffixes from function names (e.g. 
.constprop.0, .isra.0, .part.0) +sed 's/\.\(constprop\|isra\|part\|cold\|lto_priv\)\.[0-9]*//g' diff --git a/tracegrind/tests/test_basic.c b/tracegrind/tests/test_basic.c new file mode 100644 index 000000000..2dddef620 --- /dev/null +++ b/tracegrind/tests/test_basic.c @@ -0,0 +1,14 @@ +#include "tracegrind.h" + +static int factorial(int n) +{ + if (n <= 1) + return 1; + return n * factorial(n - 1); +} + +int main(void) +{ + int result = factorial(5); + return result != 120; +} diff --git a/tracegrind/tests/test_basic.post.exp b/tracegrind/tests/test_basic.post.exp new file mode 100644 index 000000000..19397d9bc --- /dev/null +++ b/tracegrind/tests/test_basic.post.exp @@ -0,0 +1,23 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] + +Total rows: N + +Events by type: + ENTER_FN: N (P%) + EXIT_FN: N (P%) + +Threads: 1 +Sequence range: N - N + diff --git a/tracegrind/tests/test_basic.stderr.exp b/tracegrind/tests/test_basic.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_basic.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_basic.vgtest b/tracegrind/tests/test_basic.vgtest new file mode 100644 index 000000000..4f2a05cd8 --- /dev/null +++ b/tracegrind/tests/test_basic.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_basic.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_basic.msgpack.lz4 +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_basic.msgpack.lz4 --stats | ./filter_trace +cleanup: rm -f tracegrind.out.test_basic.msgpack.lz4 diff --git a/tracegrind/tests/test_enter_inlined.c b/tracegrind/tests/test_enter_inlined.c new file mode 100644 index 000000000..70aa99e84 --- /dev/null +++ b/tracegrind/tests/test_enter_inlined.c @@ -0,0 +1,35 @@ +#include "tracegrind.h" + +/* Force inlining - with --read-inline-info=yes these should produce + * ENTER_INLINED / EXIT_INLINED events in the trace */ +static inline __attribute__((always_inline)) int inlined_work(int a, int b) +{ + /* Make the function large enough to span multiple basic blocks + * so at least one BB boundary falls inside inlined code */ + int result = 0; + if (a > 0) { + result = a * b; + } else { + result = a + b; + } + return result; +} + +/* Prevent inlining - SHOULD appear as ENTER/EXIT */ +static int __attribute__((noinline)) not_inlined_caller(int n) +{ + /* Use volatile to prevent constant propagation */ + volatile int x = n; + return inlined_work(x, x + 1); +} + +int main(void) +{ + volatile int input = 3; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = not_inlined_caller(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return result != 12; +} diff --git a/tracegrind/tests/test_enter_inlined.post.exp b/tracegrind/tests/test_enter_inlined.post.exp new file mode 100644 index 000000000..f63eb2906 --- /dev/null 
+++ b/tracegrind/tests/test_enter_inlined.post.exp @@ -0,0 +1,21 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] + +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER_FN | fn=not_inlined_caller | obj=test_enter_inlined.bin | file=test_enter_inlined.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_INLINED_FN | fn=inlined_work | obj=test_enter_inlined.bin | file=test_enter_inlined.c | line=10 | Ir=N +seq=N | tid=1 | event=EXIT_INLINED_FN | fn=inlined_work | obj=test_enter_inlined.bin | file=test_enter_inlined.c | line=10 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=not_inlined_caller | obj=test_enter_inlined.bin | file=test_enter_inlined.c | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_enter_inlined.stderr.exp b/tracegrind/tests/test_enter_inlined.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_enter_inlined.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_enter_inlined.vgtest b/tracegrind/tests/test_enter_inlined.vgtest new file mode 100644 index 000000000..1b5d7c55d --- /dev/null +++ b/tracegrind/tests/test_enter_inlined.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_enter_inlined.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_enter_inlined.msgpack.lz4 --instr-atstart=no --read-inline-info=yes +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_enter_inlined.msgpack.lz4 | ./filter_trace +cleanup: rm -f tracegrind.out.test_enter_inlined.msgpack.lz4 diff --git a/tracegrind/tests/test_exception.cpp b/tracegrind/tests/test_exception.cpp new file mode 100644 index 000000000..b9b599bdd --- /dev/null +++ b/tracegrind/tests/test_exception.cpp @@ -0,0 +1,46 @@ +#include "tracegrind.h" +#include <stdexcept> + +/* + * Test: C++ exception unwinding through regular (non-inlined) functions. + * + * catcher() calls thrower(), which calls do_throw(). + * do_throw() throws an exception that unwinds back through thrower() + * to catcher()'s catch block. Verifies the call stack is properly + * maintained across exception unwinding.
+ * + * Call chain: catcher -> thrower -> do_throw (throws) + */ + +static void __attribute__((noinline)) do_throw(int x) +{ + if (x > 0) + throw std::runtime_error("boom"); +} + +static int __attribute__((noinline)) thrower(int n) +{ + volatile int x = n; + do_throw(x); + return x; +} + +static int __attribute__((noinline)) catcher(int n) +{ + try { + return thrower(n); + } catch (const std::exception&) { + return -1; + } +} + +int main() +{ + volatile int input = 5; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = catcher(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return result != -1; +} diff --git a/tracegrind/tests/test_exception.post.exp b/tracegrind/tests/test_exception.post.exp new file mode 100644 index 000000000..7089c29d3 --- /dev/null +++ b/tracegrind/tests/test_exception.post.exp @@ -0,0 +1,15 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] +Counters: ['Ir'] +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER_FN | fn=catcher(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=thrower(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=do_throw(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=do_throw(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=thrower(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=catcher(int) | obj=test_exception.bin | file=test_exception.cpp | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_exception.stderr.exp b/tracegrind/tests/test_exception.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_exception.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_exception.vgtest b/tracegrind/tests/test_exception.vgtest new file mode 100644 index 000000000..1567cd9d2 --- /dev/null +++ b/tracegrind/tests/test_exception.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_exception.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_exception.msgpack.lz4 --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_exception.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counters:|Showing |MARKER|fn=catcher|fn=thrower|fn=do_throw)' +cleanup: rm -f tracegrind.out.test_exception.msgpack.lz4 diff --git a/tracegrind/tests/test_foo_bar_baz.c b/tracegrind/tests/test_foo_bar_baz.c new file mode 100644 index 000000000..f4f2560f4 --- /dev/null +++ b/tracegrind/tests/test_foo_bar_baz.c @@ -0,0 +1,18 @@ +#include "tracegrind.h" + +static int __attribute__((noinline)) baz(int n) { return n * 2; } + +static int __attribute__((noinline)) bar(int n) { return baz(n) + 1; } + +static int __attribute__((noinline)) foo(int n) { return bar(n) + bar(n + 1); } + +int main(void) +{ + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = foo(3); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + + return result != (baz(3) + 1 + baz(4) + 1); +} diff --git 
a/tracegrind/tests/test_foo_bar_baz.post.exp b/tracegrind/tests/test_foo_bar_baz.post.exp new file mode 100644 index 000000000..ad3a60185 --- /dev/null +++ b/tracegrind/tests/test_foo_bar_baz.post.exp @@ -0,0 +1,27 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] + +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER_FN | fn=foo | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=bar | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=baz | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=baz | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=bar | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=bar | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=baz | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=baz | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=bar | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=foo | obj=test_foo_bar_baz.bin | file=test_foo_bar_baz.c | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_foo_bar_baz.stderr.exp b/tracegrind/tests/test_foo_bar_baz.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_foo_bar_baz.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_foo_bar_baz.vgtest b/tracegrind/tests/test_foo_bar_baz.vgtest new file mode 100644 index 000000000..c2a7b3efb --- /dev/null +++ b/tracegrind/tests/test_foo_bar_baz.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_foo_bar_baz.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_foo_bar_baz.msgpack.lz4 --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_foo_bar_baz.msgpack.lz4 | ./filter_trace +cleanup: rm -f tracegrind.out.test_foo_bar_baz.msgpack.lz4 diff --git a/tracegrind/tests/test_inline.c b/tracegrind/tests/test_inline.c new file mode 100644 index 000000000..0533ee592 --- /dev/null +++ b/tracegrind/tests/test_inline.c @@ -0,0 +1,29 @@ +#include "tracegrind.h" + +/* Force inlining - these should NOT appear as ENTER/EXIT in the trace */ +static inline __attribute__((always_inline)) int inlined_add(int a, int b) +{ + return a + b; +} + +static inline __attribute__((always_inline)) int inlined_mul(int a, int b) +{ + return a * b; +} + +/* Prevent inlining - these SHOULD appear as ENTER/EXIT in the trace */ +static int __attribute__((noinline)) not_inlined_work(int n) +{ + return 
inlined_add(n, inlined_mul(n, 2)); +} + +int main(void) +{ + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = not_inlined_work(5); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + + return result != 15; +} diff --git a/tracegrind/tests/test_inline.post.exp b/tracegrind/tests/test_inline.post.exp new file mode 100644 index 000000000..f06c345cb --- /dev/null +++ b/tracegrind/tests/test_inline.post.exp @@ -0,0 +1,19 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] + +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER_FN | fn=not_inlined_work | obj=test_inline.bin | file=test_inline.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=not_inlined_work | obj=test_inline.bin | file=test_inline.c | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_inline.stderr.exp b/tracegrind/tests/test_inline.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_inline.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_inline.vgtest b/tracegrind/tests/test_inline.vgtest new file mode 100644 index 000000000..5c96843d2 --- /dev/null +++ b/tracegrind/tests/test_inline.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_inline.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_inline.msgpack.lz4 --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_inline.msgpack.lz4 | ./filter_trace +cleanup: rm -f tracegrind.out.test_inline.msgpack.lz4 diff --git a/tracegrind/tests/test_instr_toggle.c b/tracegrind/tests/test_instr_toggle.c new file mode 100644 index 000000000..07d5f46f8 --- /dev/null +++ b/tracegrind/tests/test_instr_toggle.c @@ -0,0 +1,21 @@ +#include "tracegrind.h" + +static int __attribute__((noinline)) fibo(int n) +{ + if (n <= 1) + return n; + return fibo(n - 1) + fibo(n - 2); +} + +int main(void) +{ + /* Instrumentation is off (--instr-atstart=no). + Only the fibo(2) call will be traced. 
*/ + TRACEGRIND_ADD_MARKER("before-fibo"); + TRACEGRIND_START_INSTRUMENTATION; + int result = fibo(2); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("after-fibo"); + + return result != 1; +} diff --git a/tracegrind/tests/test_instr_toggle.post.exp b/tracegrind/tests/test_instr_toggle.post.exp new file mode 100644 index 000000000..1ee05299d --- /dev/null +++ b/tracegrind/tests/test_instr_toggle.post.exp @@ -0,0 +1,23 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] + +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=before-fibo +seq=N | tid=1 | event=ENTER_FN | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=fibo | obj=test_instr_toggle.bin | file=test_instr_toggle.c | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=after-fibo diff --git a/tracegrind/tests/test_instr_toggle.stderr.exp b/tracegrind/tests/test_instr_toggle.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_instr_toggle.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_instr_toggle.vgtest b/tracegrind/tests/test_instr_toggle.vgtest new file mode 100644 index 000000000..3247a09e4 --- /dev/null +++ b/tracegrind/tests/test_instr_toggle.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_instr_toggle.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_instr_toggle.msgpack.lz4 --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_instr_toggle.msgpack.lz4 | ./filter_trace +cleanup: rm -f tracegrind.out.test_instr_toggle.msgpack.lz4 diff --git a/tracegrind/tests/test_longjmp.c b/tracegrind/tests/test_longjmp.c new file mode 100644 index 000000000..5659431b2 --- /dev/null +++ b/tracegrind/tests/test_longjmp.c @@ -0,0 +1,51 @@ +#include "tracegrind.h" +#include <setjmp.h> + +/* + * Test: longjmp unwinding multiple call frames. + * + * outer() calls middle(), which calls inner(). + * inner() does longjmp back to outer(), skipping middle()'s return. + * Verifies tracegrind properly unwinds the call stack on non-local jumps.
+ * + * Call chain: outer -> middle -> inner (longjmp back to outer) + */ + +static jmp_buf env; + +static void __attribute__((noinline)) inner(int n) +{ + volatile int x = n * 2; + (void)x; + longjmp(env, 42); +} + +static void __attribute__((noinline)) middle(int n) +{ + volatile int x = n + 1; + inner(x); + /* never reached */ + x = x + 1; +} + +static int __attribute__((noinline)) outer(int n) +{ + int val = setjmp(env); + if (val == 0) { + middle(n); + /* never reached */ + return -1; + } + return val; +} + +int main(void) +{ + volatile int input = 5; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = outer(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return result != 42; +} diff --git a/tracegrind/tests/test_longjmp.post.exp b/tracegrind/tests/test_longjmp.post.exp new file mode 100644 index 000000000..d0524b77e --- /dev/null +++ b/tracegrind/tests/test_longjmp.post.exp @@ -0,0 +1,15 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] +Counters: ['Ir'] +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER_FN | fn=outer | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=middle | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=inner | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=inner | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=middle | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=outer | obj=test_longjmp.bin | file=test_longjmp.c | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_longjmp.stderr.exp b/tracegrind/tests/test_longjmp.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_longjmp.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_longjmp.vgtest b/tracegrind/tests/test_longjmp.vgtest new file mode 100644 index 000000000..0291a7fbe --- /dev/null +++ b/tracegrind/tests/test_longjmp.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_longjmp.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_longjmp.msgpack.lz4 --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_longjmp.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counters:|Showing |MARKER|fn=outer|fn=middle|fn=inner)' +cleanup: rm -f tracegrind.out.test_longjmp.msgpack.lz4 diff --git a/tracegrind/tests/test_marker.c b/tracegrind/tests/test_marker.c new file mode 100644 index 000000000..721883b3b --- /dev/null +++ b/tracegrind/tests/test_marker.c @@ -0,0 +1,17 @@ +#include "tracegrind.h" + +static int compute(int n) +{ + int sum = 0; + for (int i = 0; i < n; i++) + sum += i * i; + return sum; +} + +int main(void) +{ + TRACEGRIND_ADD_MARKER("start-work"); + int result = compute(1000); + TRACEGRIND_ADD_MARKER("end-work"); + return result == 0; +} diff --git a/tracegrind/tests/test_marker.post.exp b/tracegrind/tests/test_marker.post.exp new file mode 100644 index 000000000..cd8748b02 --- /dev/null +++ b/tracegrind/tests/test_marker.post.exp @@ -0,0 +1,17 @@ +Format Version: N +Format Name: tracegrind-msgpack 
+Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] + +Showing N of N rows +[N, 1, 0, 'start-work'] +[N, 1, 0, 'end-work'] diff --git a/tracegrind/tests/test_marker.stderr.exp b/tracegrind/tests/test_marker.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_marker.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_marker.vgtest b/tracegrind/tests/test_marker.vgtest new file mode 100644 index 000000000..9165191e0 --- /dev/null +++ b/tracegrind/tests/test_marker.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_marker.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_marker.msgpack.lz4 +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_marker.msgpack.lz4 --event MARKER --raw | ./filter_trace +cleanup: rm -f tracegrind.out.test_marker.msgpack.lz4 diff --git a/tracegrind/tests/test_nested_inlined.c b/tracegrind/tests/test_nested_inlined.c new file mode 100644 index 000000000..a0daca1e1 --- /dev/null +++ b/tracegrind/tests/test_nested_inlined.c @@ -0,0 +1,51 @@ +#include "tracegrind.h" + +/* Inner inlined function. + * With --read-inline-info=yes, should produce ENTER_INLINED / EXIT_INLINED + * events with fn=inner_inline. */ +static inline __attribute__((always_inline)) int inner_inline(int a) +{ + int result; + if (a > 0) { + result = a * 3; + } else { + result = a + 1; + } + return result; +} + +/* Outer inlined function - calls inner_inline. + * Should produce ENTER_INLINED events for both outer_inline and inner_inline, + * showing nested inline transitions. + * Uses volatile stores in both branches to prevent the compiler from + * converting the if-else to a branchless cmov. 
*/ +static inline __attribute__((always_inline)) int outer_inline(int a, int b) +{ + volatile int x; + if (a > b) { + x = a - b; + } else { + x = b - a; + } + int y = inner_inline(x); + return y + a; +} + +/* Non-inlined caller */ +static int __attribute__((noinline)) caller(int n) +{ + volatile int x = n; + return outer_inline(x, x + 1); +} + +int main(void) +{ + volatile int input = 5; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = caller(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + /* caller(5) -> outer_inline(5, 6): x=1, inner_inline(1)=3, 3+5=8 */ + return result != 8; +} diff --git a/tracegrind/tests/test_nested_inlined.post.exp b/tracegrind/tests/test_nested_inlined.post.exp new file mode 100644 index 000000000..0d0571af2 --- /dev/null +++ b/tracegrind/tests/test_nested_inlined.post.exp @@ -0,0 +1,23 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] + +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER_FN | fn=caller | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_INLINED_FN | fn=outer_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=26 | Ir=N +seq=N | tid=1 | event=ENTER_INLINED_FN | fn=inner_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=9 | Ir=N +seq=N | tid=1 | event=EXIT_INLINED_FN | fn=inner_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=9 | Ir=N +seq=N | tid=1 | event=EXIT_INLINED_FN | fn=outer_inline | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=9 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=caller | obj=test_nested_inlined.bin | file=test_nested_inlined.c | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_nested_inlined.stderr.exp b/tracegrind/tests/test_nested_inlined.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_nested_inlined.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_nested_inlined.vgtest b/tracegrind/tests/test_nested_inlined.vgtest new file mode 100644 index 000000000..adaf9a895 --- /dev/null +++ b/tracegrind/tests/test_nested_inlined.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_nested_inlined.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_nested_inlined.msgpack.lz4 --instr-atstart=no --read-inline-info=yes +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_nested_inlined.msgpack.lz4 | ./filter_trace +cleanup: rm -f tracegrind.out.test_nested_inlined.msgpack.lz4 diff --git a/tracegrind/tests/test_recursion.c b/tracegrind/tests/test_recursion.c new file mode 100644 index 000000000..e3589ae6c --- /dev/null +++ b/tracegrind/tests/test_recursion.c @@ -0,0 +1,28 @@ +#include "tracegrind.h" + +/* + * Test: 
deep recursion (100 levels). + * + * recurse() calls itself 100 times, then returns back through + * all frames. Verifies the call stack handles deep nesting and + * produces balanced ENTER/EXIT pairs. + */ + +static int __attribute__((noinline)) recurse(int depth) +{ + volatile int d = depth; + if (d <= 0) + return 0; + return recurse(d - 1) + 1; +} + +int main(void) +{ + volatile int input = 100; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = recurse(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return result != 100; +} diff --git a/tracegrind/tests/test_recursion.post.exp b/tracegrind/tests/test_recursion.post.exp new file mode 100644 index 000000000..06977039b --- /dev/null +++ b/tracegrind/tests/test_recursion.post.exp @@ -0,0 +1,4 @@ +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=MARKER | marker=end +ENTER_FN count: 101 +EXIT_FN count: 101 diff --git a/tracegrind/tests/test_recursion.stderr.exp b/tracegrind/tests/test_recursion.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_recursion.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_recursion.vgtest b/tracegrind/tests/test_recursion.vgtest new file mode 100644 index 000000000..bfff7defe --- /dev/null +++ b/tracegrind/tests/test_recursion.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_recursion.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_recursion.msgpack.lz4 --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_recursion.msgpack.lz4 | ./filter_trace | grep -E '(event=MARKER|fn=recurse)' | awk '/MARKER/{print} /ENTER_FN/{e++} /EXIT_FN/{x++} END{print "ENTER_FN count: "e; print "EXIT_FN count: "x}' +cleanup: rm -f tracegrind.out.test_recursion.msgpack.lz4 diff --git a/tracegrind/tests/test_schema.post.exp b/tracegrind/tests/test_schema.post.exp new file mode 100644 index 000000000..d30dbc939 --- /dev/null +++ b/tracegrind/tests/test_schema.post.exp @@ -0,0 +1,14 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] + diff --git a/tracegrind/tests/test_schema.stderr.exp b/tracegrind/tests/test_schema.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_schema.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_schema.vgtest b/tracegrind/tests/test_schema.vgtest new file mode 100644 index 000000000..482a552de --- /dev/null +++ b/tracegrind/tests/test_schema.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_basic.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_schema.msgpack.lz4 +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_schema.msgpack.lz4 --schema | ./filter_trace +cleanup: rm -f 
tracegrind.out.test_schema.msgpack.lz4 diff --git a/tracegrind/tests/test_signal.c b/tracegrind/tests/test_signal.c new file mode 100644 index 000000000..028354780 --- /dev/null +++ b/tracegrind/tests/test_signal.c @@ -0,0 +1,42 @@ +#include "tracegrind.h" +#include <signal.h> +#include <string.h> + +/* + * Test: signal handler interrupting normal function execution. + * + * caller() raises SIGALRM to itself. The signal handler (handler_fn) + * runs, then execution returns to caller(). Verifies the call stack + * is properly maintained across signal delivery. + */ + +static volatile sig_atomic_t got_signal = 0; + +static void __attribute__((noinline)) handler_fn(int sig) +{ + (void)sig; + got_signal = 1; +} + +static int __attribute__((noinline)) caller(int n) +{ + volatile int x = n; + raise(SIGALRM); + return x + 1; +} + +int main(void) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = handler_fn; + sigaction(SIGALRM, &sa, NULL); + + volatile int input = 5; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = caller(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return (result != 6) || !got_signal; +} diff --git a/tracegrind/tests/test_signal.post.exp b/tracegrind/tests/test_signal.post.exp new file mode 100644 index 000000000..ec413adf6 --- /dev/null +++ b/tracegrind/tests/test_signal.post.exp @@ -0,0 +1,11 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] +Counters: ['Ir'] +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER_FN | fn=caller | obj=test_signal.bin | file=test_signal.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=caller | obj=test_signal.bin | file=test_signal.c | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_signal.stderr.exp b/tracegrind/tests/test_signal.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_signal.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_signal.vgtest b/tracegrind/tests/test_signal.vgtest new file mode 100644 index 000000000..66391dfa1 --- /dev/null +++ b/tracegrind/tests/test_signal.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_signal.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_signal.msgpack.lz4 --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_signal.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counters:|Showing |MARKER|fn=caller|fn=handler_fn)' +cleanup: rm -f tracegrind.out.test_signal.msgpack.lz4 diff --git a/tracegrind/tests/test_syscall.c b/tracegrind/tests/test_syscall.c new file mode 100644 index 000000000..9aac40a48 --- /dev/null +++ b/tracegrind/tests/test_syscall.c @@ -0,0 +1,29 @@ +#include "tracegrind.h" +#include <fcntl.h> +#include <unistd.h> + +static int __attribute__((noinline)) do_getpid(void) { return getpid(); } + +static void __attribute__((noinline)) do_write(int fd) +{ + const char msg[] = "hello\n"; + write(fd, msg, sizeof(msg) - 1); +} + +static void __attribute__((noinline)) caller(int fd) +{ + do_getpid(); + do_write(fd); +} + +int main(void) +{ + int fd = open("/dev/null", O_WRONLY); + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + caller(fd); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); +
close(fd); + return 0; +} diff --git a/tracegrind/tests/test_syscall.post.exp b/tracegrind/tests/test_syscall.post.exp new file mode 100644 index 000000000..4f8a1c6fa --- /dev/null +++ b/tracegrind/tests/test_syscall.post.exp @@ -0,0 +1,26 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir', 'sysCount', 'sysTime', 'sysCpuTime'] +Counter Units: {'sysCpuTime': 'ns', 'sysTime': 'ns'} +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER_FN | fn=caller | obj=test_syscall.bin | file=test_syscall.c | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=ENTER_FN | fn=do_getpid | obj=test_syscall.bin | file=test_syscall.c | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=ENTER_FN | fn=getpid | obj=libc.so.6 | file=syscall-template.S | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=EXIT_FN | fn=getpid | obj=libc.so.6 | file=syscall-template.S | line=0 | Ir=N | sysCount=1 | sysTime=T | sysCpuTime=T +seq=N | tid=1 | event=EXIT_FN | fn=do_getpid | obj=test_syscall.bin | file=test_syscall.c | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=ENTER_FN | fn=do_write | obj=test_syscall.bin | file=test_syscall.c | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=ENTER_FN | fn=write | obj=libc.so.6 | file=write.c | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=EXIT_FN | fn=write | obj=libc.so.6 | file=write.c | line=0 | Ir=N | sysCount=1 | sysTime=T | sysCpuTime=T +seq=N | tid=1 | event=EXIT_FN | fn=do_write | obj=test_syscall.bin | file=test_syscall.c | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=EXIT_FN | fn=caller | obj=test_syscall.bin | file=test_syscall.c | line=0 | Ir=N | sysCount=0 | sysTime=0 | sysCpuTime=0 +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_syscall.stderr.exp b/tracegrind/tests/test_syscall.stderr.exp new file mode 100644 index 000000000..838c3d735 --- /dev/null +++ b/tracegrind/tests/test_syscall.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir sysCount sysTime sysCpuTime +Collected : + +I refs: diff --git a/tracegrind/tests/test_syscall.vgtest b/tracegrind/tests/test_syscall.vgtest new file mode 100644 index 000000000..848ca69f7 --- /dev/null +++ b/tracegrind/tests/test_syscall.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_syscall.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_syscall.msgpack.lz4 --instr-atstart=no --collect-systime=nsec +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_syscall.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counters:|Counter Units:|Showing |\(|MARKER|fn=caller |fn=do_getpid |fn=do_write |fn=getpid |fn=write )' +cleanup: rm -f tracegrind.out.test_syscall.msgpack.lz4 diff --git a/tracegrind/tests/test_tailcall.c 
b/tracegrind/tests/test_tailcall.c new file mode 100644 index 000000000..b5524c69d --- /dev/null +++ b/tracegrind/tests/test_tailcall.c @@ -0,0 +1,28 @@ +#include "tracegrind.h" + +/* + * Test: tail call optimization. + * + * chain_a() tail-calls chain_b(), which tail-calls chain_c(). + * At -O2, the compiler should optimize these into JMP instructions + * rather than CALL+RET. Verifies tracegrind handles sibling calls. + * + * Call chain: chain_a --(tail call)--> chain_b --(tail call)--> chain_c + */ + +static int __attribute__((noinline)) chain_c(int n) { return n + 3; } + +static int __attribute__((noinline)) chain_b(int n) { return chain_c(n + 2); } + +static int __attribute__((noinline)) chain_a(int n) { return chain_b(n + 1); } + +int main(void) +{ + volatile int input = 10; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + int result = chain_a(input); + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return result != 16; +} diff --git a/tracegrind/tests/test_tailcall.post.exp b/tracegrind/tests/test_tailcall.post.exp new file mode 100644 index 000000000..9d08266f6 --- /dev/null +++ b/tracegrind/tests/test_tailcall.post.exp @@ -0,0 +1,15 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] +Counters: ['Ir'] +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=ENTER_FN | fn=chain_a | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=chain_b | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=ENTER_FN | fn=chain_c | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=chain_c | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=chain_b | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=EXIT_FN | fn=chain_a | obj=test_tailcall.bin | file=test_tailcall.c | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_tailcall.stderr.exp b/tracegrind/tests/test_tailcall.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_tailcall.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_tailcall.vgtest b/tracegrind/tests/test_tailcall.vgtest new file mode 100644 index 000000000..c5acf2b7b --- /dev/null +++ b/tracegrind/tests/test_tailcall.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_tailcall.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_tailcall.msgpack.lz4 --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_tailcall.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counters:|Showing |MARKER|fn=chain_)' +cleanup: rm -f tracegrind.out.test_tailcall.msgpack.lz4 diff --git a/tracegrind/tests/test_thread_create.c b/tracegrind/tests/test_thread_create.c new file mode 100644 index 000000000..29b340691 --- /dev/null +++ b/tracegrind/tests/test_thread_create.c @@ -0,0 +1,20 @@ +#include "tracegrind.h" +#include <pthread.h> + +static void* thread_fn(void* arg) +{ + (void)arg; + return NULL; +} + +int main(void) +{ + pthread_t t; + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + pthread_create(&t, NULL, thread_fn, NULL); + pthread_join(t, NULL); +
TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + return 0; +} diff --git a/tracegrind/tests/test_thread_create.post.exp b/tracegrind/tests/test_thread_create.post.exp new file mode 100644 index 000000000..d695fbc3d --- /dev/null +++ b/tracegrind/tests/test_thread_create.post.exp @@ -0,0 +1,18 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=THREAD_CREATE | child_tid=2 +seq=N | tid=2 | event=ENTER_FN | fn=thread_fn | obj=test_thread_create.bin | file=test_thread_create.c | line=0 | Ir=N +seq=N | tid=2 | event=EXIT_FN | fn=thread_fn | obj=test_thread_create.bin | file=test_thread_create.c | line=0 | Ir=N +seq=N | tid=1 | event=MARKER | marker=end diff --git a/tracegrind/tests/test_thread_create.stderr.exp b/tracegrind/tests/test_thread_create.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_thread_create.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_thread_create.vgtest b/tracegrind/tests/test_thread_create.vgtest new file mode 100644 index 000000000..6ce6328d2 --- /dev/null +++ b/tracegrind/tests/test_thread_create.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_thread_create.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_thread_create.msgpack.lz4 --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_thread_create.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counters:|MARKER|THREAD_CREATE|FORK|thread_fn|Showing |\()' | sed 's/child_pid=[0-9]*/child_pid=N/' +cleanup: rm -f tracegrind.out.test_thread_create.msgpack.lz4 diff --git a/tracegrind/tests/test_thread_interleave.c b/tracegrind/tests/test_thread_interleave.c new file mode 100644 index 000000000..93efcec47 --- /dev/null +++ b/tracegrind/tests/test_thread_interleave.c @@ -0,0 +1,54 @@ +#include "tracegrind.h" +#include <pthread.h> + +__attribute__((noinline)) static void depth_a2(void) {} + +__attribute__((noinline)) static void depth_a1(void) { depth_a2(); } + +__attribute__((noinline)) static void* work_a(void* arg) +{ + (void)arg; + depth_a1(); + return NULL; +} + +__attribute__((noinline)) static void depth_b1(void) {} + +__attribute__((noinline)) static void* work_b(void* arg) +{ + (void)arg; + depth_b1(); + return NULL; +} + +__attribute__((noinline)) static void depth_c2(void) {} + +__attribute__((noinline)) static void depth_c1(void) { depth_c2(); } + +__attribute__((noinline)) static void* work_c(void* arg) +{ + (void)arg; + depth_c1(); + return NULL; +} + +int main(void) +{ + pthread_t t1, t2, t3; + + TRACEGRIND_ADD_MARKER("start"); + TRACEGRIND_START_INSTRUMENTATION; + + pthread_create(&t1, NULL, work_a, NULL); + pthread_create(&t2, NULL, work_b, NULL); + pthread_create(&t3, NULL, work_c, NULL); + + pthread_join(t1, NULL); +
pthread_join(t2, NULL); + pthread_join(t3, NULL); + + TRACEGRIND_STOP_INSTRUMENTATION; + TRACEGRIND_ADD_MARKER("end"); + + return 0; +} diff --git a/tracegrind/tests/test_thread_interleave.post.exp b/tracegrind/tests/test_thread_interleave.post.exp new file mode 100644 index 000000000..63f1c227f --- /dev/null +++ b/tracegrind/tests/test_thread_interleave.post.exp @@ -0,0 +1,34 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] +Showing N of N rows +seq=N | tid=1 | event=MARKER | marker=start +seq=N | tid=1 | event=THREAD_CREATE | child_tid=2 +seq=N | tid=1 | event=THREAD_CREATE | child_tid=3 +seq=N | tid=1 | event=THREAD_CREATE | child_tid=4 +seq=N | tid=1 | event=MARKER | marker=end +seq=N | tid=2 | event=ENTER_FN | fn=work_a | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=2 | event=ENTER_FN | fn=depth_a1 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=2 | event=ENTER_FN | fn=depth_a2 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=2 | event=EXIT_FN | fn=depth_a2 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=2 | event=EXIT_FN | fn=depth_a1 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=2 | event=EXIT_FN | fn=work_a | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=3 | event=ENTER_FN | fn=work_b | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=3 | event=ENTER_FN | fn=depth_b1 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=3 | event=EXIT_FN | fn=depth_b1 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=3 | event=EXIT_FN | fn=work_b | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=4 | event=ENTER_FN | fn=work_c | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=4 | event=ENTER_FN | fn=depth_c1 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=4 | event=ENTER_FN | fn=depth_c2 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=4 | event=EXIT_FN | fn=depth_c2 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=4 | event=EXIT_FN | fn=depth_c1 | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N +seq=N | tid=4 | event=EXIT_FN | fn=work_c | obj=test_thread_interleave.bin | file=test_thread_interleave.c | line=0 | Ir=N diff --git a/tracegrind/tests/test_thread_interleave.stderr.exp b/tracegrind/tests/test_thread_interleave.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_thread_interleave.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir 
+Collected : + +I refs: diff --git a/tracegrind/tests/test_thread_interleave.vgtest b/tracegrind/tests/test_thread_interleave.vgtest new file mode 100644 index 000000000..02ea2cd8d --- /dev/null +++ b/tracegrind/tests/test_thread_interleave.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_thread_interleave.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_thread_interleave.msgpack.lz4 --instr-atstart=no +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_thread_interleave.msgpack.lz4 | ./filter_trace | grep -E '(Format |Schema |Event Schemas|Counters:|Showing |\(|MARKER|THREAD_CREATE|fn=work_a |fn=work_b |fn=work_c |fn=depth_a1 |fn=depth_a2 |fn=depth_b1 |fn=depth_c1 |fn=depth_c2 )' | sort -t'|' -k2,2 -s +cleanup: rm -f tracegrind.out.test_thread_interleave.msgpack.lz4 diff --git a/tracegrind/tests/test_toggle_collect.c b/tracegrind/tests/test_toggle_collect.c new file mode 100644 index 000000000..635caaacc --- /dev/null +++ b/tracegrind/tests/test_toggle_collect.c @@ -0,0 +1,25 @@ +#include "tracegrind.h" + +static int work(int n) +{ + int sum = 0; + for (int i = 0; i < n; i++) + sum += i; + return sum; +} + +int main(void) +{ + /* Collection on by default, do some traced work */ + int result = work(10); + + /* Toggle collection off */ + TRACEGRIND_TOGGLE_COLLECT; + result += work(20); /* not collected */ + + /* Toggle collection back on */ + TRACEGRIND_TOGGLE_COLLECT; + result += work(30); /* collected again */ + + return result == 0; +} diff --git a/tracegrind/tests/test_toggle_collect.post.exp b/tracegrind/tests/test_toggle_collect.post.exp new file mode 100644 index 000000000..19397d9bc --- /dev/null +++ b/tracegrind/tests/test_toggle_collect.post.exp @@ -0,0 +1,23 @@ +Format Version: N +Format Name: tracegrind-msgpack +Schema Version: N + +Event Schemas (discriminated union): + 0 (MARKER): ['seq', 'tid', 'event', 'marker'] + 1 (ENTER_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 2 (EXIT_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 3 (ENTER_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 4 (EXIT_INLINED_FN): ['seq', 'tid', 'event', 'fn', 'obj', 'file', 'line', 'counters'] + 5 (FORK): ['seq', 'tid', 'event', 'child_pid'] + 6 (THREAD_CREATE): ['seq', 'tid', 'event', 'child_tid'] +Counters: ['Ir'] + +Total rows: N + +Events by type: + ENTER_FN: N (P%) + EXIT_FN: N (P%) + +Threads: 1 +Sequence range: N - N + diff --git a/tracegrind/tests/test_toggle_collect.stderr.exp b/tracegrind/tests/test_toggle_collect.stderr.exp new file mode 100644 index 000000000..d0b7820ae --- /dev/null +++ b/tracegrind/tests/test_toggle_collect.stderr.exp @@ -0,0 +1,6 @@ + + +Events : Ir +Collected : + +I refs: diff --git a/tracegrind/tests/test_toggle_collect.vgtest b/tracegrind/tests/test_toggle_collect.vgtest new file mode 100644 index 000000000..0f1123dfb --- /dev/null +++ b/tracegrind/tests/test_toggle_collect.vgtest @@ -0,0 +1,5 @@ +prereq: ../../tracegrind/scripts/tracegrind-analyzer --help 2>/dev/null +prog: test_toggle_collect.bin +vgopts: --tracegrind-out-file=tracegrind.out.test_toggle_collect.msgpack.lz4 +post: ../../tracegrind/scripts/tracegrind-analyzer tracegrind.out.test_toggle_collect.msgpack.lz4 --stats | ./filter_trace +cleanup: rm -f tracegrind.out.test_toggle_collect.msgpack.lz4 diff --git a/tracegrind/tg_lz4.c b/tracegrind/tg_lz4.c new file mode 100644 index 000000000..6a6dd3bcc --- /dev/null +++ b/tracegrind/tg_lz4.c @@ -0,0 
+1,92 @@ +/* + * LZ4 compression wrapper for Tracegrind. + * Uses vendored LZ4 library adapted for Valgrind (no libc). + * + * BSD 2-Clause License - see lz4.c for full license. + */ + +#include "pub_tool_basics.h" +#include "pub_tool_libcassert.h" +#include "pub_tool_libcbase.h" +#include "pub_tool_mallocfree.h" + +#include "tg_lz4.h" + +/*------------------------------------------------------------*/ +/*--- LZ4 Configuration for Valgrind ---*/ +/*------------------------------------------------------------*/ + +/* Disable memory allocation functions (we provide them below) */ +#define LZ4_USER_MEMORY_FUNCTIONS 1 + +/* Freestanding mode - no string.h */ +#define LZ4_FREESTANDING 1 + +/* Provide size_t */ +#ifndef size_t +#define size_t SizeT +#endif + +/* Provide INT_MAX from limits.h */ +#ifndef INT_MAX +#define INT_MAX 2147483647 +#endif + +#ifndef UINT_MAX +#define UINT_MAX 4294967295U +#endif + +/*------------------------------------------------------------*/ +/*--- Memory function replacements ---*/ +/*------------------------------------------------------------*/ + +/* Define LZ4_memcpy, LZ4_memmove, LZ4_memset before including lz4 */ +#define LZ4_memcpy(dst, src, size) VG_(memcpy)((dst), (src), (size)) +#define LZ4_memmove(dst, src, size) VG_(memmove)((dst), (src), (size)) +#define LZ4_memset(p, v, s) VG_(memset)((p), (v), (s)) + +/*------------------------------------------------------------*/ +/*--- Memory allocation functions (LZ4_USER_MEMORY_FUNCTIONS) */ +/*------------------------------------------------------------*/ + +void* LZ4_malloc(size_t s) { return VG_(malloc)("tg.lz4", s); } + +void* LZ4_calloc(size_t n, size_t s) { return VG_(calloc)("tg.lz4", n, s); } + +void LZ4_free(void* p) +{ + if (p) + VG_(free)(p); +} + +/*------------------------------------------------------------*/ +/*--- Include the original LZ4 implementation ---*/ +/*------------------------------------------------------------*/ + +/* Disable assert (LZ4 has its own fallback) */ +#define LZ4_DEBUG 0 + +/* Include the main LZ4 source */ +#include "lz4.c" + +/*------------------------------------------------------------*/ +/*--- Wrapper API ---*/ +/*------------------------------------------------------------*/ + +SizeT tg_lz4_compress_bound(SizeT src_size) +{ + return LZ4_compressBound((int)src_size); +} + +SizeT tg_lz4_compress(void* dst, + SizeT dst_capacity, + const void* src, + SizeT src_size) +{ + int result = LZ4_compress_fast((const char*)src, (char*)dst, (int)src_size, + (int)dst_capacity, 2 /* acceleration */); + if (result <= 0) { + return 0; + } + return (SizeT)result; +} diff --git a/tracegrind/tg_lz4.h b/tracegrind/tg_lz4.h new file mode 100644 index 000000000..7e127c0b2 --- /dev/null +++ b/tracegrind/tg_lz4.h @@ -0,0 +1,23 @@ +/* + * LZ4 compression wrapper for Tracegrind. + * Uses vendored LZ4 library adapted for Valgrind (no libc). + */ + +#ifndef TG_LZ4_H +#define TG_LZ4_H + +#include "pub_tool_basics.h" + +/* Return the maximum compressed size for a given source length */ +SizeT tg_lz4_compress_bound(SizeT src_size); + +/* Compress src[0..src_size-1] into dst. + * dst_capacity must be >= tg_lz4_compress_bound(src_size). + * Returns the compressed size on success, 0 on error. 
+ */ +SizeT tg_lz4_compress(void* dst, + SizeT dst_capacity, + const void* src, + SizeT src_size); + +#endif /* TG_LZ4_H */ diff --git a/tracegrind/tg_msgpack.c b/tracegrind/tg_msgpack.c new file mode 100644 index 000000000..aa202f739 --- /dev/null +++ b/tracegrind/tg_msgpack.c @@ -0,0 +1,210 @@ +/* + * Minimal MsgPack encoder for Tracegrind. + * Write-only, adapted for Valgrind (no libc). + * + * MsgPack format spec: https://github.com/msgpack/msgpack/blob/master/spec.md + */ + +#include "pub_tool_basics.h" +#include "pub_tool_libcassert.h" +#include "pub_tool_libcbase.h" +#include "pub_tool_mallocfree.h" + +#include "tg_msgpack.h" + +/* Ensure at least `needed` bytes of capacity */ +static void msgpack_ensure(msgpack_buffer* mb, Int needed) +{ + if (mb->size + needed <= mb->capacity) + return; + Int new_cap = mb->capacity * 2; + if (new_cap < mb->size + needed) + new_cap = mb->size + needed; + mb->data = VG_(realloc)("tg.msgpack.buf", mb->data, new_cap); + mb->capacity = new_cap; +} + +static void write_byte(msgpack_buffer* mb, UChar b) +{ + msgpack_ensure(mb, 1); + mb->data[mb->size++] = b; +} + +static void write_bytes(msgpack_buffer* mb, const void* data, Int len) +{ + msgpack_ensure(mb, len); + VG_(memcpy)(mb->data + mb->size, data, len); + mb->size += len; +} + +/* Write big-endian integers */ +static void write_be16(msgpack_buffer* mb, UShort val) +{ + UChar buf[2]; + buf[0] = (UChar)(val >> 8); + buf[1] = (UChar)(val); + write_bytes(mb, buf, 2); +} + +static void write_be32(msgpack_buffer* mb, UInt val) +{ + UChar buf[4]; + buf[0] = (UChar)(val >> 24); + buf[1] = (UChar)(val >> 16); + buf[2] = (UChar)(val >> 8); + buf[3] = (UChar)(val); + write_bytes(mb, buf, 4); +} + +static void write_be64(msgpack_buffer* mb, ULong val) +{ + UChar buf[8]; + buf[0] = (UChar)(val >> 56); + buf[1] = (UChar)(val >> 48); + buf[2] = (UChar)(val >> 40); + buf[3] = (UChar)(val >> 32); + buf[4] = (UChar)(val >> 24); + buf[5] = (UChar)(val >> 16); + buf[6] = (UChar)(val >> 8); + buf[7] = (UChar)(val); + write_bytes(mb, buf, 8); +} + +void msgpack_init(msgpack_buffer* mb, Int capacity) +{ + if (capacity < 256) + capacity = 256; + mb->data = VG_(malloc)("tg.msgpack.init", capacity); + mb->size = 0; + mb->capacity = capacity; +} + +void msgpack_free(msgpack_buffer* mb) +{ + if (mb->data) { + VG_(free)(mb->data); + mb->data = NULL; + } + mb->size = 0; + mb->capacity = 0; +} + +void msgpack_reset(msgpack_buffer* mb) { mb->size = 0; } + +void msgpack_write_nil(msgpack_buffer* mb) { write_byte(mb, 0xc0); } + +void msgpack_write_bool(msgpack_buffer* mb, Bool val) +{ + write_byte(mb, val ? 
0xc3 : 0xc2); +} + +void msgpack_write_int(msgpack_buffer* mb, Long val) +{ + if (val >= 0) { + msgpack_write_uint(mb, (ULong)val); + } else if (val >= -32) { + /* negative fixint: 111xxxxx */ + write_byte(mb, (UChar)(val & 0xff)); + } else if (val >= -128) { + write_byte(mb, 0xd0); /* int8 */ + write_byte(mb, (UChar)(val & 0xff)); + } else if (val >= -32768) { + write_byte(mb, 0xd1); /* int16 */ + write_be16(mb, (UShort)(val & 0xffff)); + } else if (val >= -2147483648LL) { + write_byte(mb, 0xd2); /* int32 */ + write_be32(mb, (UInt)(val & 0xffffffff)); + } else { + write_byte(mb, 0xd3); /* int64 */ + write_be64(mb, (ULong)val); + } +} + +void msgpack_write_uint(msgpack_buffer* mb, ULong val) +{ + if (val <= 0x7f) { + /* positive fixint: 0xxxxxxx */ + write_byte(mb, (UChar)val); + } else if (val <= 0xff) { + write_byte(mb, 0xcc); /* uint8 */ + write_byte(mb, (UChar)val); + } else if (val <= 0xffff) { + write_byte(mb, 0xcd); /* uint16 */ + write_be16(mb, (UShort)val); + } else if (val <= 0xffffffff) { + write_byte(mb, 0xce); /* uint32 */ + write_be32(mb, (UInt)val); + } else { + write_byte(mb, 0xcf); /* uint64 */ + write_be64(mb, val); + } +} + +void msgpack_write_str(msgpack_buffer* mb, const HChar* str, Int len) +{ + if (len < 0) + len = VG_(strlen)(str); + + if (len <= 31) { + /* fixstr: 101xxxxx */ + write_byte(mb, (UChar)(0xa0 | len)); + } else if (len <= 0xff) { + write_byte(mb, 0xd9); /* str8 */ + write_byte(mb, (UChar)len); + } else if (len <= 0xffff) { + write_byte(mb, 0xda); /* str16 */ + write_be16(mb, (UShort)len); + } else { + write_byte(mb, 0xdb); /* str32 */ + write_be32(mb, (UInt)len); + } + write_bytes(mb, str, len); +} + +void msgpack_write_bin(msgpack_buffer* mb, const UChar* data, Int len) +{ + if (len <= 0xff) { + write_byte(mb, 0xc4); /* bin8 */ + write_byte(mb, (UChar)len); + } else if (len <= 0xffff) { + write_byte(mb, 0xc5); /* bin16 */ + write_be16(mb, (UShort)len); + } else { + write_byte(mb, 0xc6); /* bin32 */ + write_be32(mb, (UInt)len); + } + write_bytes(mb, data, len); +} + +void msgpack_write_array_header(msgpack_buffer* mb, UInt count) +{ + if (count <= 15) { + /* fixarray: 1001xxxx */ + write_byte(mb, (UChar)(0x90 | count)); + } else if (count <= 0xffff) { + write_byte(mb, 0xdc); /* array16 */ + write_be16(mb, (UShort)count); + } else { + write_byte(mb, 0xdd); /* array32 */ + write_be32(mb, count); + } +} + +void msgpack_write_map_header(msgpack_buffer* mb, UInt count) +{ + if (count <= 15) { + /* fixmap: 1000xxxx */ + write_byte(mb, (UChar)(0x80 | count)); + } else if (count <= 0xffff) { + write_byte(mb, 0xde); /* map16 */ + write_be16(mb, (UShort)count); + } else { + write_byte(mb, 0xdf); /* map32 */ + write_be32(mb, count); + } +} + +void msgpack_write_key(msgpack_buffer* mb, const HChar* key) +{ + msgpack_write_str(mb, key, -1); +} diff --git a/tracegrind/tg_msgpack.h b/tracegrind/tg_msgpack.h new file mode 100644 index 000000000..ae447970b --- /dev/null +++ b/tracegrind/tg_msgpack.h @@ -0,0 +1,36 @@ +/* + * Minimal MsgPack encoder for Tracegrind. + * Write-only, adapted for Valgrind (no libc). 
+ */ + +#ifndef TG_MSGPACK_H +#define TG_MSGPACK_H + +#include "pub_tool_basics.h" + +typedef struct { + UChar* data; + Int size; + Int capacity; +} msgpack_buffer; + +void msgpack_init(msgpack_buffer* mb, Int capacity); +void msgpack_free(msgpack_buffer* mb); +void msgpack_reset(msgpack_buffer* mb); + +/* Encode primitives */ +void msgpack_write_nil(msgpack_buffer* mb); +void msgpack_write_bool(msgpack_buffer* mb, Bool val); +void msgpack_write_int(msgpack_buffer* mb, Long val); +void msgpack_write_uint(msgpack_buffer* mb, ULong val); +void msgpack_write_str(msgpack_buffer* mb, const HChar* str, Int len); +void msgpack_write_bin(msgpack_buffer* mb, const UChar* data, Int len); + +/* Containers */ +void msgpack_write_array_header(msgpack_buffer* mb, UInt count); +void msgpack_write_map_header(msgpack_buffer* mb, UInt count); + +/* Convenience: write a string key (for maps) */ +void msgpack_write_key(msgpack_buffer* mb, const HChar* key); + +#endif /* TG_MSGPACK_H */ diff --git a/tracegrind/threads.c b/tracegrind/threads.c new file mode 100644 index 000000000..eaac68851 --- /dev/null +++ b/tracegrind/threads.c @@ -0,0 +1,424 @@ +/*--------------------------------------------------------------------*/ +/*--- Tracegrind ---*/ +/*--- threads.c ---*/ +/*--------------------------------------------------------------------*/ + +/* + This file is part of Tracegrind, a Valgrind tool for call tracing. + + Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de) + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. + + The GNU General Public License is contained in the file COPYING. +*/ + +#include "global.h" + +#include "pub_tool_threadstate.h" + +/* forward decls */ +static exec_state* exec_state_save(void); +static exec_state* exec_state_restore(void); +static exec_state* push_exec_state(int); +static exec_state* top_exec_state(void); + +static exec_stack current_states; + +/*------------------------------------------------------------*/ +/*--- Support for multi-threading ---*/ +/*------------------------------------------------------------*/ + +/* + * For Valgrind, MT is cooperative (no preempting in our code), + * so we don't need locks... + * + * Per-thread data: + * - BBCCs + * - call stack + * - call hash + * - event counters: last, current + * + * Even when ignoring MT, we need these functions to set up some + * data structures for the process (= Thread 1).
+ */ + +/* current running thread */ +ThreadId TG_(current_tid); + +static thread_info** thread; + +thread_info** TG_(get_threads)(void) { return thread; } + +thread_info* TG_(get_current_thread)(void) { return thread[TG_(current_tid)]; } + +void TG_(init_threads)(void) +{ + UInt i; + + thread = TG_MALLOC("cl.threads.it.1", VG_N_THREADS * sizeof thread[0]); + + for (i = 0; i < VG_N_THREADS; i++) + thread[i] = 0; + TG_(current_tid) = VG_INVALID_THREADID; +} + +/* switches through all threads and calls func */ +void TG_(forall_threads)(void (*func)(thread_info*)) +{ + Int t, orig_tid = TG_(current_tid); + + for (t = 1; t < VG_N_THREADS; t++) { + if (!thread[t]) + continue; + TG_(switch_thread)(t); + (*func)(thread[t]); + } + TG_(switch_thread)(orig_tid); +} + +static thread_info* new_thread(void) +{ + thread_info* t; + + t = (thread_info*)TG_MALLOC("cl.threads.nt.1", sizeof(thread_info)); + + /* init state */ + TG_(init_exec_stack)(&(t->states)); + TG_(init_call_stack)(&(t->calls)); + TG_(init_fn_stack)(&(t->fns)); + /* t->states.entry[0]->cxt = TG_(get_cxt)(t->fns.bottom); */ + + /* event counters */ + t->lastdump_cost = TG_(get_eventset_cost)(TG_(sets).full); + TG_(init_cost)(TG_(sets).full, t->lastdump_cost); + + /* CSV trace: per-thread sample snapshot (allocated lazily in + * trace_emit_sample) */ + t->last_sample_cost = 0; + + /* init data containers */ + TG_(init_fn_array)(&(t->fn_active)); + TG_(init_bbcc_hash)(&(t->bbccs)); + TG_(init_jcc_hash)(&(t->jccs)); + + return t; +} + +void TG_(switch_thread)(ThreadId tid) +{ + if (tid == TG_(current_tid)) + return; + + TG_DEBUG(0, ">> thread %u (was %u)\n", tid, TG_(current_tid)); + + if (TG_(current_tid) != VG_INVALID_THREADID) { + /* save thread state */ + thread_info* t = thread[TG_(current_tid)]; + + TG_ASSERT(t != 0); + + /* current context (including signal handler contexts) */ + exec_state_save(); + TG_(copy_current_exec_stack)(&(t->states)); + TG_(copy_current_call_stack)(&(t->calls)); + TG_(copy_current_fn_stack)(&(t->fns)); + + TG_(copy_current_fn_array)(&(t->fn_active)); + /* If we cumulate costs of threads, use TID 1 for all jccs/bccs */ + if (!TG_(clo).separate_threads) + t = thread[1]; + TG_(copy_current_bbcc_hash)(&(t->bbccs)); + TG_(copy_current_jcc_hash)(&(t->jccs)); + } + + TG_(current_tid) = tid; + TG_ASSERT(tid < VG_N_THREADS); + + if (tid != VG_INVALID_THREADID) { + thread_info* t; + + /* load thread state */ + + if (thread[tid] == 0) + thread[tid] = new_thread(); + t = thread[tid]; + + /* current context (including signal handler contexts) */ + TG_(set_current_exec_stack)(&(t->states)); + exec_state_restore(); + TG_(set_current_call_stack)(&(t->calls)); + TG_(set_current_fn_stack)(&(t->fns)); + + TG_(set_current_fn_array)(&(t->fn_active)); + /* If we cumulate costs of threads, use TID 1 for all jccs/bccs */ + if (!TG_(clo).separate_threads) + t = thread[1]; + TG_(set_current_bbcc_hash)(&(t->bbccs)); + TG_(set_current_jcc_hash)(&(t->jccs)); + } +} + +void TG_(run_thread)(ThreadId tid) { TG_(switch_thread)(tid); } + +void TG_(pre_signal)(ThreadId tid, Int sigNum, Bool alt_stack) +{ + exec_state* es; + + TG_DEBUG(0, ">> pre_signal(TID %u, sig %d, alt_st %s)\n", tid, sigNum, + alt_stack ? 
"yes" : "no"); + + /* switch to the thread the handler runs in */ + TG_(switch_thread)(tid); + + /* save current execution state */ + exec_state_save(); + + /* setup new cxtinfo struct for this signal handler */ + es = push_exec_state(sigNum); + TG_(zero_cost)(TG_(sets).full, es->cost); + TG_(current_state).cost = es->cost; + es->call_stack_bottom = TG_(current_call_stack).sp; + + /* setup current state for a spontaneous call */ + TG_(init_exec_state)(&TG_(current_state)); + TG_(current_state).sig = sigNum; + TG_(push_cxt)(0); +} + +/* Run post-signal if the stackpointer for call stack is at + * the bottom in current exec state (e.g. a signal handler) + * + * Called from TG_(pop_call_stack) + */ +void TG_(run_post_signal_on_call_stack_bottom)(void) +{ + exec_state* es = top_exec_state(); + TG_ASSERT(es != 0); + TG_ASSERT(TG_(current_state).sig > 0); + + if (TG_(current_call_stack).sp == es->call_stack_bottom) + TG_(post_signal)(TG_(current_tid), TG_(current_state).sig); +} + +void TG_(post_signal)(ThreadId tid, Int sigNum) +{ + exec_state* es; + UInt fn_number, *pactive; + + TG_DEBUG(0, ">> post_signal(TID %u, sig %d)\n", tid, sigNum); + + /* thread switching potentially needed, eg. with instrumentation off */ + TG_(switch_thread)(tid); + TG_ASSERT(sigNum == TG_(current_state).sig); + + /* Unwind call stack of this signal handler. + * This should only be needed at finalisation time + */ + es = top_exec_state(); + TG_ASSERT(es != 0); + while (TG_(current_call_stack).sp > es->call_stack_bottom) + TG_(pop_call_stack)(); + + if (TG_(current_state).cxt) { + /* correct active counts */ + fn_number = TG_(current_state).cxt->fn[0]->number; + pactive = TG_(get_fn_entry)(fn_number); + (*pactive)--; + TG_DEBUG(0, " set active count of %s back to %u\n", + TG_(current_state).cxt->fn[0]->name, *pactive); + } + + if (TG_(current_fn_stack).top > TG_(current_fn_stack).bottom) { + /* set fn_stack_top back. + * top can point to 0 if nothing was executed in the signal handler; + * this is possible at end on unwinding handlers. + */ + if (*(TG_(current_fn_stack).top) != 0) { + TG_(current_fn_stack).top--; + TG_ASSERT(*(TG_(current_fn_stack).top) == 0); + } + if (TG_(current_fn_stack).top > TG_(current_fn_stack).bottom) + TG_(current_fn_stack).top--; + } + + /* zero signal handler costs before restoring previous context */ + TG_ASSERT(TG_(current_state).cost == es->cost); + TG_(zero_cost)(TG_(sets).full, TG_(current_state).cost); + + /* restore previous context */ + es->sig = -1; + current_states.sp--; + es = top_exec_state(); + TG_(current_state).sig = es->sig; + exec_state_restore(); + + /* There is no way to reliable get the thread ID we are switching to + * after this handler returns. So we sync with actual TID at start of + * TG_(setup_bb)(), which should be the next for tracegrind. + */ +} + +/*------------------------------------------------------------*/ +/*--- Execution states in a thread & signal handlers ---*/ +/*------------------------------------------------------------*/ + +/* Each thread can be interrupted by a signal handler, and they + * themselves again. But as there's no scheduling among handlers + * of the same thread, we don't need additional stacks. + * So storing execution contexts and + * adding separators in the callstack(needed to not intermix normal/handler + * functions in contexts) should be enough. 
+ */ + +/* not initialized: call_stack_bottom, sig */ +void TG_(init_exec_state)(exec_state* es) +{ + es->collect = TG_(clo).collect_atstart; + es->cxt = 0; + es->jmps_passed = 0; + es->bbcc = 0; + es->nonskipped = 0; +} + +static exec_state* new_exec_state(Int sigNum) +{ + exec_state* es; + es = (exec_state*)TG_MALLOC("cl.threads.nes.1", sizeof(exec_state)); + + /* allocate real cost space: needed as incremented by + * simulation functions */ + es->cost = TG_(get_eventset_cost)(TG_(sets).full); + TG_(zero_cost)(TG_(sets).full, es->cost); + TG_(init_exec_state)(es); + es->sig = sigNum; + es->call_stack_bottom = 0; + + return es; +} + +void TG_(init_exec_stack)(exec_stack* es) +{ + Int i; + + /* The first element is for the main thread */ + es->entry[0] = new_exec_state(0); + for (i = 1; i < MAX_SIGHANDLERS; i++) + es->entry[i] = 0; + es->sp = 0; +} + +void TG_(copy_current_exec_stack)(exec_stack* dst) +{ + Int i; + + dst->sp = current_states.sp; + for (i = 0; i < MAX_SIGHANDLERS; i++) + dst->entry[i] = current_states.entry[i]; +} + +void TG_(set_current_exec_stack)(exec_stack* dst) +{ + Int i; + + current_states.sp = dst->sp; + for (i = 0; i < MAX_SIGHANDLERS; i++) + current_states.entry[i] = dst->entry[i]; +} + +/* Get top context info struct of current thread */ +static exec_state* top_exec_state(void) +{ + Int sp = current_states.sp; + exec_state* es; + + TG_ASSERT((sp >= 0) && (sp < MAX_SIGHANDLERS)); + es = current_states.entry[sp]; + TG_ASSERT(es != 0); + return es; +} + +/* Allocates a free context info structure for a new entered + * signal handler, putting it on the context stack. + * Returns a pointer to the structure. + */ +static exec_state* push_exec_state(int sigNum) +{ + Int sp; + exec_state* es; + + current_states.sp++; + sp = current_states.sp; + + TG_ASSERT((sigNum > 0) && (sigNum <= _VKI_NSIG)); + TG_ASSERT((sp > 0) && (sp < MAX_SIGHANDLERS)); + es = current_states.entry[sp]; + if (!es) { + es = new_exec_state(sigNum); + current_states.entry[sp] = es; + } else + es->sig = sigNum; + + return es; +} + +/* Save current context to top cxtinfo struct */ +static exec_state* exec_state_save(void) +{ + exec_state* es = top_exec_state(); + + es->cxt = TG_(current_state).cxt; + es->collect = TG_(current_state).collect; + es->jmps_passed = TG_(current_state).jmps_passed; + es->bbcc = TG_(current_state).bbcc; + es->nonskipped = TG_(current_state).nonskipped; + TG_ASSERT(es->cost == TG_(current_state).cost); + + TG_DEBUGIF(1) + { + TG_DEBUG(1, " cxtinfo_save(sig %d): collect %s, jmps_passed %d\n", + es->sig, es->collect ? "Yes" : "No", es->jmps_passed); + TG_(print_bbcc)(-9, es->bbcc); + TG_(print_cost)(-9, TG_(sets).full, es->cost); + } + + /* signal number does not need to be saved */ + TG_ASSERT(TG_(current_state).sig == es->sig); + + return es; +} + +static exec_state* exec_state_restore(void) +{ + exec_state* es = top_exec_state(); + + TG_(current_state).cxt = es->cxt; + TG_(current_state).collect = es->collect; + TG_(current_state).jmps_passed = es->jmps_passed; + TG_(current_state).bbcc = es->bbcc; + TG_(current_state).nonskipped = es->nonskipped; + TG_(current_state).cost = es->cost; + TG_(current_state).sig = es->sig; + + TG_DEBUGIF(1) + { + TG_DEBUG(1, " exec_state_restore(sig %d): collect %s, jmps_passed %d\n", + es->sig, es->collect ? 
"Yes" : "No", es->jmps_passed); + TG_(print_bbcc)(-9, es->bbcc); + TG_(print_cxt)(-9, es->cxt, 0); + TG_(print_cost)(-9, TG_(sets).full, es->cost); + } + + return es; +} diff --git a/tracegrind/tracegrind.h b/tracegrind/tracegrind.h new file mode 100644 index 000000000..f600cf2b7 --- /dev/null +++ b/tracegrind/tracegrind.h @@ -0,0 +1,129 @@ + +/* + ---------------------------------------------------------------- + + Notice that the following BSD-style license applies to this one + file (tracegrind.h) only. The rest of Valgrind is licensed under the + terms of the GNU General Public License, version 3, unless + otherwise indicated. See the COPYING file in the source + distribution for details. + + ---------------------------------------------------------------- + + This file is part of tracegrind, a valgrind tool for cache simulation + and streaming CSV trace output. + + Based on callgrind, Copyright (C) 2003-2017 Josef Weidendorfer. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. The origin of this software must not be misrepresented; you must + not claim that you wrote the original software. If you use this + software in a product, an acknowledgment in the product + documentation would be appreciated but is not required. + + 3. Altered source versions must be plainly marked as such, and must + not be misrepresented as being the original software. + + 4. The name of the author may not be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS + OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ---------------------------------------------------------------- + + Notice that the above BSD-style license applies to this one file + (tracegrind.h) only. The entire rest of Valgrind is licensed under + the terms of the GNU General Public License, version 3. See the + COPYING file in the source distribution for details. + + ---------------------------------------------------------------- +*/ + +#ifndef __TRACEGRIND_H +#define __TRACEGRIND_H + +#include "valgrind.h" + +/* !! ABIWARNING !! ABIWARNING !! ABIWARNING !! ABIWARNING !! + This enum comprises an ABI exported by Valgrind to programs + which use client requests. DO NOT CHANGE THE ORDER OF THESE + ENTRIES, NOR DELETE ANY -- add new ones at the end. + + The identification ('C','T') for Tracegrind has historical + reasons: it was called "Calltree" before. Besides, ('C','G') would + clash with cachegrind. We keep ('C','T') for compatibility with + callgrind client request macros. 
+ */ + +typedef enum { + VG_USERREQ__DUMP_STATS = VG_USERREQ_TOOL_BASE('C', 'T'), // ignored + VG_USERREQ__ZERO_STATS, // ignored + VG_USERREQ__TOGGLE_COLLECT, + VG_USERREQ__ADD_MARKER, + VG_USERREQ__START_INSTRUMENTATION, + VG_USERREQ__STOP_INSTRUMENTATION +} Vg_TracegrindClientRequest; + +/* Toggles the collection state. + The collection state specifies whether events should be noted or + ignored. Events are noted by incrementing counters in a cost center. + + Same as CALLGRIND_TOGGLE_COLLECT + */ +#define TRACEGRIND_TOGGLE_COLLECT \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__TOGGLE_COLLECT, 0, 0, 0, 0, 0) + +/* Add a named marker into the trace output. The argument is a string + that will be recorded as a marker label. + + Same as CALLGRIND_DUMP_STATS_AT + */ +#define TRACEGRIND_ADD_MARKER(marker_str) \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__ADD_MARKER, marker_str, 0, 0, \ + 0, 0) + +/* Start full tracegrind instrumentation if not already switched on. + When cache simulation is done, it will flush the simulated cache; + this will lead to an artificial cache warmup phase afterwards with + cache misses which would not have happened in reality. + + Same as CALLGRIND_START_INSTRUMENTATION + */ +#define TRACEGRIND_START_INSTRUMENTATION \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__START_INSTRUMENTATION, 0, 0, 0, \ + 0, 0) + +/* Stop full tracegrind instrumentation if not already switched off. + This flushes Valgrind's translation cache and does no additional + instrumentation afterwards, so the program effectively runs at the same + speed as the "none" tool (i.e. at minimal slowdown). + Use this to bypass Tracegrind aggregation for uninteresting code parts. + To start Tracegrind in this mode and skip the setup phase, use + the option "--instr-atstart=no". + + Same as CALLGRIND_STOP_INSTRUMENTATION + */ +#define TRACEGRIND_STOP_INSTRUMENTATION \ + VALGRIND_DO_CLIENT_REQUEST_STMT(VG_USERREQ__STOP_INSTRUMENTATION, 0, 0, 0, \ + 0, 0) + +#endif /* __TRACEGRIND_H */
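
Usage sketch (illustrative only, not part of the patch): a client program combines these requests the same way the tests earlier in this patch do. The workload function hot_path below is made up for the example; it assumes tracegrind.h and valgrind.h are on the include path, and the requests are harmless no-ops when the binary is not run under Valgrind.

#include "tracegrind.h"

/* Made-up workload so there is something to count. */
static int __attribute__((noinline)) hot_path(int n)
{
    int sum = 0;
    for (int i = 0; i < n; i++)
        sum += i;
    return sum;
}

int main(void)
{
    TRACEGRIND_ADD_MARKER("setup-done");  /* named marker in the trace */
    TRACEGRIND_START_INSTRUMENTATION;     /* pairs with --instr-atstart=no */
    int r = hot_path(1000);
    TRACEGRIND_TOGGLE_COLLECT;            /* events no longer counted */
    r += hot_path(1000);
    TRACEGRIND_TOGGLE_COLLECT;            /* counting resumes */
    TRACEGRIND_STOP_INSTRUMENTATION;
    TRACEGRIND_ADD_MARKER("end");
    return r == 0;                        /* fail only if the workload collapsed */
}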
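
The records in the .msgpack.lz4 output are built with the tg_msgpack.c primitives above; the exact per-event layout is produced elsewhere in the tool, so the snippet below is only a standalone, libc-based illustration of the byte-level rules those primitives implement (fixmap, fixstr, positive fixint), with field names borrowed from the analyzer output for flavour.

#include <stdio.h>
#include <string.h>

static unsigned char buf[64];
static size_t pos;

static void put(unsigned char b) { buf[pos++] = b; }

/* fixstr: 0xa0 | length, for strings up to 31 bytes */
static void put_fixstr(const char* s)
{
    size_t len = strlen(s);
    put((unsigned char)(0xa0 | len));
    memcpy(buf + pos, s, len);
    pos += len;
}

int main(void)
{
    put(0x82);              /* fixmap with 2 key/value pairs */
    put_fixstr("tid");
    put(0x01);              /* positive fixint 1 */
    put_fixstr("event");
    put_fixstr("MARKER");

    for (size_t i = 0; i < pos; i++)
        printf("%02x ", buf[i]);
    printf("\n");  /* 82 a3 74 69 64 01 a5 65 76 65 6e 74 a6 4d 41 52 4b 45 52 */
    return 0;
}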